diff options
Diffstat (limited to 'arch/x86')
236 files changed, 8729 insertions, 6155 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index ad8ec356fb36..0e103236b754 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild | |||
@@ -14,3 +14,4 @@ obj-y += crypto/ | |||
14 | obj-y += vdso/ | 14 | obj-y += vdso/ |
15 | obj-$(CONFIG_IA32_EMULATION) += ia32/ | 15 | obj-$(CONFIG_IA32_EMULATION) += ia32/ |
16 | 16 | ||
17 | obj-y += platform/ | ||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 74ea59d34076..e8327686d3c5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -1,6 +1,3 @@ | |||
1 | # x86 configuration | ||
2 | mainmenu "Linux Kernel Configuration for x86" | ||
3 | |||
4 | # Select 32 or 64 bit | 1 | # Select 32 or 64 bit |
5 | config 64BIT | 2 | config 64BIT |
6 | bool "64-bit kernel" if ARCH = "x86" | 3 | bool "64-bit kernel" if ARCH = "x86" |
@@ -25,14 +22,17 @@ config X86 | |||
25 | select HAVE_IDE | 22 | select HAVE_IDE |
26 | select HAVE_OPROFILE | 23 | select HAVE_OPROFILE |
27 | select HAVE_PERF_EVENTS if (!M386 && !M486) | 24 | select HAVE_PERF_EVENTS if (!M386 && !M486) |
25 | select HAVE_IRQ_WORK | ||
28 | select HAVE_IOREMAP_PROT | 26 | select HAVE_IOREMAP_PROT |
29 | select HAVE_KPROBES | 27 | select HAVE_KPROBES |
28 | select HAVE_MEMBLOCK | ||
30 | select ARCH_WANT_OPTIONAL_GPIOLIB | 29 | select ARCH_WANT_OPTIONAL_GPIOLIB |
31 | select ARCH_WANT_FRAME_POINTERS | 30 | select ARCH_WANT_FRAME_POINTERS |
32 | select HAVE_DMA_ATTRS | 31 | select HAVE_DMA_ATTRS |
33 | select HAVE_KRETPROBES | 32 | select HAVE_KRETPROBES |
34 | select HAVE_OPTPROBES | 33 | select HAVE_OPTPROBES |
35 | select HAVE_FTRACE_MCOUNT_RECORD | 34 | select HAVE_FTRACE_MCOUNT_RECORD |
35 | select HAVE_C_RECORDMCOUNT | ||
36 | select HAVE_DYNAMIC_FTRACE | 36 | select HAVE_DYNAMIC_FTRACE |
37 | select HAVE_FUNCTION_TRACER | 37 | select HAVE_FUNCTION_TRACER |
38 | select HAVE_FUNCTION_GRAPH_TRACER | 38 | select HAVE_FUNCTION_GRAPH_TRACER |
@@ -59,6 +59,8 @@ config X86 | |||
59 | select ANON_INODES | 59 | select ANON_INODES |
60 | select HAVE_ARCH_KMEMCHECK | 60 | select HAVE_ARCH_KMEMCHECK |
61 | select HAVE_USER_RETURN_NOTIFIER | 61 | select HAVE_USER_RETURN_NOTIFIER |
62 | select HAVE_ARCH_JUMP_LABEL | ||
63 | select HAVE_TEXT_POKE_SMP | ||
62 | select HAVE_GENERIC_HARDIRQS | 64 | select HAVE_GENERIC_HARDIRQS |
63 | select HAVE_SPARSE_IRQ | 65 | select HAVE_SPARSE_IRQ |
64 | select GENERIC_IRQ_PROBE | 66 | select GENERIC_IRQ_PROBE |
@@ -197,9 +199,6 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING | |||
197 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC | 199 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC |
198 | def_bool y | 200 | def_bool y |
199 | 201 | ||
200 | config HAVE_EARLY_RES | ||
201 | def_bool y | ||
202 | |||
203 | config HAVE_INTEL_TXT | 202 | config HAVE_INTEL_TXT |
204 | def_bool y | 203 | def_bool y |
205 | depends on EXPERIMENTAL && DMAR && ACPI | 204 | depends on EXPERIMENTAL && DMAR && ACPI |
@@ -345,6 +344,7 @@ endif | |||
345 | 344 | ||
346 | config X86_VSMP | 345 | config X86_VSMP |
347 | bool "ScaleMP vSMP" | 346 | bool "ScaleMP vSMP" |
347 | select PARAVIRT_GUEST | ||
348 | select PARAVIRT | 348 | select PARAVIRT |
349 | depends on X86_64 && PCI | 349 | depends on X86_64 && PCI |
350 | depends on X86_EXTENDED_PLATFORM | 350 | depends on X86_EXTENDED_PLATFORM |
@@ -490,25 +490,6 @@ if PARAVIRT_GUEST | |||
490 | 490 | ||
491 | source "arch/x86/xen/Kconfig" | 491 | source "arch/x86/xen/Kconfig" |
492 | 492 | ||
493 | config VMI | ||
494 | bool "VMI Guest support (DEPRECATED)" | ||
495 | select PARAVIRT | ||
496 | depends on X86_32 | ||
497 | ---help--- | ||
498 | VMI provides a paravirtualized interface to the VMware ESX server | ||
499 | (it could be used by other hypervisors in theory too, but is not | ||
500 | at the moment), by linking the kernel to a GPL-ed ROM module | ||
501 | provided by the hypervisor. | ||
502 | |||
503 | As of September 2009, VMware has started a phased retirement | ||
504 | of this feature from VMware's products. Please see | ||
505 | feature-removal-schedule.txt for details. If you are | ||
506 | planning to enable this option, please note that you cannot | ||
507 | live migrate a VMI enabled VM to a future VMware product, | ||
508 | which doesn't support VMI. So if you expect your kernel to | ||
509 | seamlessly migrate to newer VMware products, keep this | ||
510 | disabled. | ||
511 | |||
512 | config KVM_CLOCK | 493 | config KVM_CLOCK |
513 | bool "KVM paravirtualized clock" | 494 | bool "KVM paravirtualized clock" |
514 | select PARAVIRT | 495 | select PARAVIRT |
@@ -563,16 +544,7 @@ config PARAVIRT_DEBUG | |||
563 | a paravirt_op is missing when it is called. | 544 | a paravirt_op is missing when it is called. |
564 | 545 | ||
565 | config NO_BOOTMEM | 546 | config NO_BOOTMEM |
566 | default y | 547 | def_bool y |
567 | bool "Disable Bootmem code" | ||
568 | ---help--- | ||
569 | Use early_res directly instead of bootmem before slab is ready. | ||
570 | - allocator (buddy) [generic] | ||
571 | - early allocator (bootmem) [generic] | ||
572 | - very early allocator (reserve_early*()) [x86] | ||
573 | - very very early allocator (early brk model) [x86] | ||
574 | So reduce one layer between early allocator to final allocator | ||
575 | |||
576 | 548 | ||
577 | config MEMTEST | 549 | config MEMTEST |
578 | bool "Memtest" | 550 | bool "Memtest" |
@@ -643,7 +615,7 @@ config GART_IOMMU | |||
643 | bool "GART IOMMU support" if EMBEDDED | 615 | bool "GART IOMMU support" if EMBEDDED |
644 | default y | 616 | default y |
645 | select SWIOTLB | 617 | select SWIOTLB |
646 | depends on X86_64 && PCI && K8_NB | 618 | depends on X86_64 && PCI && AMD_NB |
647 | ---help--- | 619 | ---help--- |
648 | Support for full DMA access of devices with 32bit memory access only | 620 | Support for full DMA access of devices with 32bit memory access only |
649 | on systems with more than 3GB. This is usually needed for USB, | 621 | on systems with more than 3GB. This is usually needed for USB, |
@@ -768,6 +740,17 @@ config SCHED_MC | |||
768 | making when dealing with multi-core CPU chips at a cost of slightly | 740 | making when dealing with multi-core CPU chips at a cost of slightly |
769 | increased overhead in some places. If unsure say N here. | 741 | increased overhead in some places. If unsure say N here. |
770 | 742 | ||
743 | config IRQ_TIME_ACCOUNTING | ||
744 | bool "Fine granularity task level IRQ time accounting" | ||
745 | default n | ||
746 | ---help--- | ||
747 | Select this option to enable fine granularity task irq time | ||
748 | accounting. This is done by reading a timestamp on each | ||
749 | transition between softirq and hardirq state, so there can be a | ||
750 | small performance impact. | ||
751 | |||
752 | If in doubt, say N here. | ||
753 | |||
771 | source "kernel/Kconfig.preempt" | 754 | source "kernel/Kconfig.preempt" |
772 | 755 | ||
773 | config X86_UP_APIC | 756 | config X86_UP_APIC |
@@ -1121,6 +1104,9 @@ config X86_PAE | |||
1121 | config ARCH_PHYS_ADDR_T_64BIT | 1104 | config ARCH_PHYS_ADDR_T_64BIT |
1122 | def_bool X86_64 || X86_PAE | 1105 | def_bool X86_64 || X86_PAE |
1123 | 1106 | ||
1107 | config ARCH_DMA_ADDR_T_64BIT | ||
1108 | def_bool X86_64 || HIGHMEM64G | ||
1109 | |||
1124 | config DIRECT_GBPAGES | 1110 | config DIRECT_GBPAGES |
1125 | bool "Enable 1GB pages for kernel pagetables" if EMBEDDED | 1111 | bool "Enable 1GB pages for kernel pagetables" if EMBEDDED |
1126 | default y | 1112 | default y |
@@ -1299,25 +1285,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK | |||
1299 | Set whether the default state of memory_corruption_check is | 1285 | Set whether the default state of memory_corruption_check is |
1300 | on or off. | 1286 | on or off. |
1301 | 1287 | ||
1302 | config X86_RESERVE_LOW_64K | 1288 | config X86_RESERVE_LOW |
1303 | bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" | 1289 | int "Amount of low memory, in kilobytes, to reserve for the BIOS" |
1304 | default y | 1290 | default 64 |
1291 | range 4 640 | ||
1305 | ---help--- | 1292 | ---help--- |
1306 | Reserve the first 64K of physical RAM on BIOSes that are known | 1293 | Specify the amount of low memory to reserve for the BIOS. |
1307 | to potentially corrupt that memory range. A numbers of BIOSes are | 1294 | |
1308 | known to utilize this area during suspend/resume, so it must not | 1295 | The first page contains BIOS data structures that the kernel |
1309 | be used by the kernel. | 1296 | must not use, so that page must always be reserved. |
1310 | 1297 | ||
1311 | Set this to N if you are absolutely sure that you trust the BIOS | 1298 | By default we reserve the first 64K of physical RAM, as a |
1312 | to get all its memory reservations and usages right. | 1299 | number of BIOSes are known to corrupt that memory range |
1300 | during events such as suspend/resume or monitor cable | ||
1301 | insertion, so it must not be used by the kernel. | ||
1313 | 1302 | ||
1314 | If you have doubts about the BIOS (e.g. suspend/resume does not | 1303 | You can set this to 4 if you are absolutely sure that you |
1315 | work or there's kernel crashes after certain hardware hotplug | 1304 | trust the BIOS to get all its memory reservations and usages |
1316 | events) and it's not AMI or Phoenix, then you might want to enable | 1305 | right. If you know your BIOS has problems beyond the |
1317 | X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical | 1306 | default 64K area, you can set this to 640 to avoid using the |
1318 | corruption patterns. | 1307 | entire low memory range. |
1319 | 1308 | ||
1320 | Say Y if unsure. | 1309 | If you have doubts about the BIOS (e.g. suspend/resume does |
1310 | not work or there are kernel crashes after certain hardware | ||
1311 | hotplug events) then you might want to enable | ||
1312 | X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check | ||
1313 | typical corruption patterns. | ||
1314 | |||
1315 | Leave this to the default value of 64 if you are unsure. | ||
1321 | 1316 | ||
1322 | config MATH_EMULATION | 1317 | config MATH_EMULATION |
1323 | bool | 1318 | bool |
@@ -1873,7 +1868,7 @@ config PCI_GODIRECT | |||
1873 | bool "Direct" | 1868 | bool "Direct" |
1874 | 1869 | ||
1875 | config PCI_GOOLPC | 1870 | config PCI_GOOLPC |
1876 | bool "OLPC" | 1871 | bool "OLPC XO-1" |
1877 | depends on OLPC | 1872 | depends on OLPC |
1878 | 1873 | ||
1879 | config PCI_GOANY | 1874 | config PCI_GOANY |
@@ -2039,14 +2034,21 @@ config SCx200HR_TIMER | |||
2039 | config OLPC | 2034 | config OLPC |
2040 | bool "One Laptop Per Child support" | 2035 | bool "One Laptop Per Child support" |
2041 | select GPIOLIB | 2036 | select GPIOLIB |
2037 | select OLPC_OPENFIRMWARE | ||
2042 | ---help--- | 2038 | ---help--- |
2043 | Add support for detecting the unique features of the OLPC | 2039 | Add support for detecting the unique features of the OLPC |
2044 | XO hardware. | 2040 | XO hardware. |
2045 | 2041 | ||
2042 | config OLPC_XO1 | ||
2043 | tristate "OLPC XO-1 support" | ||
2044 | depends on OLPC && PCI | ||
2045 | ---help--- | ||
2046 | Add support for non-essential features of the OLPC XO-1 laptop. | ||
2047 | |||
2046 | config OLPC_OPENFIRMWARE | 2048 | config OLPC_OPENFIRMWARE |
2047 | bool "Support for OLPC's Open Firmware" | 2049 | bool "Support for OLPC's Open Firmware" |
2048 | depends on !X86_64 && !X86_PAE | 2050 | depends on !X86_64 && !X86_PAE |
2049 | default y if OLPC | 2051 | default n |
2050 | help | 2052 | help |
2051 | This option adds support for the implementation of Open Firmware | 2053 | This option adds support for the implementation of Open Firmware |
2052 | that is used on the OLPC XO-1 Children's Machine. | 2054 | that is used on the OLPC XO-1 Children's Machine. |
@@ -2054,7 +2056,7 @@ config OLPC_OPENFIRMWARE | |||
2054 | 2056 | ||
2055 | endif # X86_32 | 2057 | endif # X86_32 |
2056 | 2058 | ||
2057 | config K8_NB | 2059 | config AMD_NB |
2058 | def_bool y | 2060 | def_bool y |
2059 | depends on CPU_SUP_AMD && PCI | 2061 | depends on CPU_SUP_AMD && PCI |
2060 | 2062 | ||
@@ -2103,6 +2105,10 @@ config HAVE_ATOMIC_IOMAP | |||
2103 | def_bool y | 2105 | def_bool y |
2104 | depends on X86_32 | 2106 | depends on X86_32 |
2105 | 2107 | ||
2108 | config HAVE_TEXT_POKE_SMP | ||
2109 | bool | ||
2110 | select STOP_MACHINE if SMP | ||
2111 | |||
2106 | source "net/Kconfig" | 2112 | source "net/Kconfig" |
2107 | 2113 | ||
2108 | source "drivers/Kconfig" | 2114 | source "drivers/Kconfig" |
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 75085080b63e..b59ee765414e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug | |||
@@ -43,6 +43,10 @@ config EARLY_PRINTK | |||
43 | with klogd/syslogd or the X server. You should normally say N here, | 43 | with klogd/syslogd or the X server. You should normally say N here, |
44 | unless you want to debug such a crash. | 44 | unless you want to debug such a crash. |
45 | 45 | ||
46 | config EARLY_PRINTK_MRST | ||
47 | bool "Early printk for MRST platform support" | ||
48 | depends on EARLY_PRINTK && X86_MRST | ||
49 | |||
46 | config EARLY_PRINTK_DBGP | 50 | config EARLY_PRINTK_DBGP |
47 | bool "Early printk via EHCI debug port" | 51 | bool "Early printk via EHCI debug port" |
48 | depends on EARLY_PRINTK && PCI | 52 | depends on EARLY_PRINTK && PCI |
@@ -121,16 +125,6 @@ config DEBUG_NX_TEST | |||
121 | and the software setup of this feature. | 125 | and the software setup of this feature. |
122 | If in doubt, say "N" | 126 | If in doubt, say "N" |
123 | 127 | ||
124 | config 4KSTACKS | ||
125 | bool "Use 4Kb for kernel stacks instead of 8Kb" | ||
126 | depends on X86_32 | ||
127 | ---help--- | ||
128 | If you say Y here the kernel will use a 4Kb stacksize for the | ||
129 | kernel stack attached to each process/thread. This facilitates | ||
130 | running more threads on a system and also reduces the pressure | ||
131 | on the VM subsystem for higher order allocations. This option | ||
132 | will also use IRQ stacks to compensate for the reduced stackspace. | ||
133 | |||
134 | config DOUBLEFAULT | 128 | config DOUBLEFAULT |
135 | default y | 129 | default y |
136 | bool "Enable doublefault exception handler" if EMBEDDED | 130 | bool "Enable doublefault exception handler" if EMBEDDED |
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index e8c8881351b3..b02e509072a7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile | |||
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en | |||
96 | # is .cfi_signal_frame supported too? | 96 | # is .cfi_signal_frame supported too? |
97 | cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) | 97 | cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) |
98 | cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) | 98 | cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) |
99 | KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) | 99 | |
100 | KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) | 100 | # does binutils support specific instructions? |
101 | asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) | ||
102 | |||
103 | KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) | ||
104 | KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) | ||
101 | 105 | ||
102 | LDFLAGS := -m elf_$(UTS_MACHINE) | 106 | LDFLAGS := -m elf_$(UTS_MACHINE) |
103 | 107 | ||
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8f7bef8e9fff..23f315c9f215 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c | |||
@@ -229,18 +229,35 @@ void *memset(void *s, int c, size_t n) | |||
229 | ss[i] = c; | 229 | ss[i] = c; |
230 | return s; | 230 | return s; |
231 | } | 231 | } |
232 | 232 | #ifdef CONFIG_X86_32 | |
233 | void *memcpy(void *dest, const void *src, size_t n) | 233 | void *memcpy(void *dest, const void *src, size_t n) |
234 | { | 234 | { |
235 | int i; | 235 | int d0, d1, d2; |
236 | const char *s = src; | 236 | asm volatile( |
237 | char *d = dest; | 237 | "rep ; movsl\n\t" |
238 | "movl %4,%%ecx\n\t" | ||
239 | "rep ; movsb\n\t" | ||
240 | : "=&c" (d0), "=&D" (d1), "=&S" (d2) | ||
241 | : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) | ||
242 | : "memory"); | ||
238 | 243 | ||
239 | for (i = 0; i < n; i++) | ||
240 | d[i] = s[i]; | ||
241 | return dest; | 244 | return dest; |
242 | } | 245 | } |
246 | #else | ||
247 | void *memcpy(void *dest, const void *src, size_t n) | ||
248 | { | ||
249 | long d0, d1, d2; | ||
250 | asm volatile( | ||
251 | "rep ; movsq\n\t" | ||
252 | "movq %4,%%rcx\n\t" | ||
253 | "rep ; movsb\n\t" | ||
254 | : "=&c" (d0), "=&D" (d1), "=&S" (d2) | ||
255 | : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) | ||
256 | : "memory"); | ||
243 | 257 | ||
258 | return dest; | ||
259 | } | ||
260 | #endif | ||
244 | 261 | ||
245 | static void error(char *x) | 262 | static void error(char *x) |
246 | { | 263 | { |
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bc6abb7bc7ee..76561d20ea2f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/stddef.h> | 5 | #include <linux/stddef.h> |
6 | #include <linux/stringify.h> | 6 | #include <linux/stringify.h> |
7 | #include <linux/jump_label.h> | ||
7 | #include <asm/asm.h> | 8 | #include <asm/asm.h> |
8 | 9 | ||
9 | /* | 10 | /* |
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, | |||
160 | #define __parainstructions_end NULL | 161 | #define __parainstructions_end NULL |
161 | #endif | 162 | #endif |
162 | 163 | ||
164 | extern void *text_poke_early(void *addr, const void *opcode, size_t len); | ||
165 | |||
163 | /* | 166 | /* |
164 | * Clear and restore the kernel write-protection flag on the local CPU. | 167 | * Clear and restore the kernel write-protection flag on the local CPU. |
165 | * Allows the kernel to edit read-only pages. | 168 | * Allows the kernel to edit read-only pages. |
@@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, | |||
180 | extern void *text_poke(void *addr, const void *opcode, size_t len); | 183 | extern void *text_poke(void *addr, const void *opcode, size_t len); |
181 | extern void *text_poke_smp(void *addr, const void *opcode, size_t len); | 184 | extern void *text_poke_smp(void *addr, const void *opcode, size_t len); |
182 | 185 | ||
186 | #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) | ||
187 | #define IDEAL_NOP_SIZE_5 5 | ||
188 | extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; | ||
189 | extern void arch_init_ideal_nop5(void); | ||
190 | #else | ||
191 | static inline void arch_init_ideal_nop5(void) {} | ||
192 | #endif | ||
193 | |||
183 | #endif /* _ASM_X86_ALTERNATIVE_H */ | 194 | #endif /* _ASM_X86_ALTERNATIVE_H */ |
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b5..a6863a2dec1f 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. | 2 | * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. |
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | 3 | * Author: Joerg Roedel <joerg.roedel@amd.com> |
4 | * Leo Duran <leo.duran@amd.com> | 4 | * Leo Duran <leo.duran@amd.com> |
5 | * | 5 | * |
@@ -24,11 +24,11 @@ | |||
24 | 24 | ||
25 | #ifdef CONFIG_AMD_IOMMU | 25 | #ifdef CONFIG_AMD_IOMMU |
26 | 26 | ||
27 | extern void amd_iommu_detect(void); | 27 | extern int amd_iommu_detect(void); |
28 | 28 | ||
29 | #else | 29 | #else |
30 | 30 | ||
31 | static inline void amd_iommu_detect(void) { } | 31 | static inline int amd_iommu_detect(void) { return -ENODEV; } |
32 | 32 | ||
33 | #endif | 33 | #endif |
34 | 34 | ||
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index cb030374b90a..916bc8111a01 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2009 Advanced Micro Devices, Inc. | 2 | * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. |
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | 3 | * Author: Joerg Roedel <joerg.roedel@amd.com> |
4 | * | 4 | * |
5 | * This program is free software; you can redistribute it and/or modify it | 5 | * This program is free software; you can redistribute it and/or modify it |
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 08616180deaf..e3509fc303bf 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. | 2 | * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. |
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | 3 | * Author: Joerg Roedel <joerg.roedel@amd.com> |
4 | * Leo Duran <leo.duran@amd.com> | 4 | * Leo Duran <leo.duran@amd.com> |
5 | * | 5 | * |
@@ -416,13 +416,22 @@ struct amd_iommu { | |||
416 | struct dma_ops_domain *default_dom; | 416 | struct dma_ops_domain *default_dom; |
417 | 417 | ||
418 | /* | 418 | /* |
419 | * This array is required to work around a potential BIOS bug. | 419 | * We can't rely on the BIOS to restore all values on reinit, so we |
420 | * The BIOS may miss to restore parts of the PCI configuration | 420 | * need to stash them |
421 | * space when the system resumes from S3. The result is that the | ||
422 | * IOMMU does not execute commands anymore which leads to system | ||
423 | * failure. | ||
424 | */ | 421 | */ |
425 | u32 cache_cfg[4]; | 422 | |
423 | /* The iommu BAR */ | ||
424 | u32 stored_addr_lo; | ||
425 | u32 stored_addr_hi; | ||
426 | |||
427 | /* | ||
428 | * Each iommu has 6 l1s, each of which is documented as having 0x12 | ||
429 | * registers | ||
430 | */ | ||
431 | u32 stored_l1[6][0x12]; | ||
432 | |||
433 | /* The l2 indirect registers */ | ||
434 | u32 stored_l2[0x83]; | ||
426 | }; | 435 | }; |
427 | 436 | ||
428 | /* | 437 | /* |
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h index af00bd1d2089..c8517f81b21e 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/amd_nb.h | |||
@@ -1,5 +1,5 @@ | |||
1 | #ifndef _ASM_X86_K8_H | 1 | #ifndef _ASM_X86_AMD_NB_H |
2 | #define _ASM_X86_K8_H | 2 | #define _ASM_X86_AMD_NB_H |
3 | 3 | ||
4 | #include <linux/pci.h> | 4 | #include <linux/pci.h> |
5 | 5 | ||
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[]; | |||
7 | struct bootnode; | 7 | struct bootnode; |
8 | 8 | ||
9 | extern int early_is_k8_nb(u32 value); | 9 | extern int early_is_k8_nb(u32 value); |
10 | extern struct pci_dev **k8_northbridges; | ||
11 | extern int num_k8_northbridges; | ||
12 | extern int cache_k8_northbridges(void); | 10 | extern int cache_k8_northbridges(void); |
13 | extern void k8_flush_garts(void); | 11 | extern void k8_flush_garts(void); |
14 | extern int k8_get_nodes(struct bootnode *nodes); | 12 | extern int k8_get_nodes(struct bootnode *nodes); |
15 | extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); | 13 | extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); |
16 | extern int k8_scan_nodes(void); | 14 | extern int k8_scan_nodes(void); |
17 | 15 | ||
18 | #ifdef CONFIG_K8_NB | 16 | struct k8_northbridge_info { |
19 | extern int num_k8_northbridges; | 17 | u16 num; |
18 | u8 gart_supported; | ||
19 | struct pci_dev **nb_misc; | ||
20 | }; | ||
21 | extern struct k8_northbridge_info k8_northbridges; | ||
22 | |||
23 | #ifdef CONFIG_AMD_NB | ||
20 | 24 | ||
21 | static inline struct pci_dev *node_to_k8_nb_misc(int node) | 25 | static inline struct pci_dev *node_to_k8_nb_misc(int node) |
22 | { | 26 | { |
23 | return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; | 27 | return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; |
24 | } | 28 | } |
25 | 29 | ||
26 | #else | 30 | #else |
27 | #define num_k8_northbridges 0 | ||
28 | 31 | ||
29 | static inline struct pci_dev *node_to_k8_nb_misc(int node) | 32 | static inline struct pci_dev *node_to_k8_nb_misc(int node) |
30 | { | 33 | { |
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node) | |||
33 | #endif | 36 | #endif |
34 | 37 | ||
35 | 38 | ||
36 | #endif /* _ASM_X86_K8_H */ | 39 | #endif /* _ASM_X86_AMD_NB_H */ |
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1fa03e04ae44..286de34b0ed6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void) | |||
252 | } | 252 | } |
253 | #endif | 253 | #endif |
254 | 254 | ||
255 | extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); | 255 | extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); |
256 | extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); | ||
257 | |||
258 | 256 | ||
259 | #else /* !CONFIG_X86_LOCAL_APIC */ | 257 | #else /* !CONFIG_X86_LOCAL_APIC */ |
260 | static inline void lapic_shutdown(void) { } | 258 | static inline void lapic_shutdown(void) { } |
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7fe3b3060f08..a859ca461fb0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h | |||
@@ -131,6 +131,7 @@ | |||
131 | #define APIC_EILVTn(n) (0x500 + 0x10 * n) | 131 | #define APIC_EILVTn(n) (0x500 + 0x10 * n) |
132 | #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ | 132 | #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ |
133 | #define APIC_EILVT_NR_AMD_10H 4 | 133 | #define APIC_EILVT_NR_AMD_10H 4 |
134 | #define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H | ||
134 | #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) | 135 | #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) |
135 | #define APIC_EILVT_MSG_FIX 0x0 | 136 | #define APIC_EILVT_MSG_FIX 0x0 |
136 | #define APIC_EILVT_MSG_SMI 0x2 | 137 | #define APIC_EILVT_MSG_SMI 0x2 |
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index bafd80defa43..903683b07e42 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h | |||
@@ -440,6 +440,8 @@ static inline int fls(int x) | |||
440 | 440 | ||
441 | #ifdef __KERNEL__ | 441 | #ifdef __KERNEL__ |
442 | 442 | ||
443 | #include <asm-generic/bitops/find.h> | ||
444 | |||
443 | #include <asm-generic/bitops/sched.h> | 445 | #include <asm-generic/bitops/sched.h> |
444 | 446 | ||
445 | #define ARCH_HAS_FAST_MULTIPLIER 1 | 447 | #define ARCH_HAS_FAST_MULTIPLIER 1 |
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index 0918654305af..0d467b338835 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h | |||
@@ -62,9 +62,9 @@ struct cal_chipset_ops { | |||
62 | extern int use_calgary; | 62 | extern int use_calgary; |
63 | 63 | ||
64 | #ifdef CONFIG_CALGARY_IOMMU | 64 | #ifdef CONFIG_CALGARY_IOMMU |
65 | extern void detect_calgary(void); | 65 | extern int detect_calgary(void); |
66 | #else | 66 | #else |
67 | static inline void detect_calgary(void) { return; } | 67 | static inline int detect_calgary(void) { return -ENODEV; } |
68 | #endif | 68 | #endif |
69 | 69 | ||
70 | #endif /* _ASM_X86_CALGARY_H */ | 70 | #endif /* _ASM_X86_CALGARY_H */ |
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0e63c9a2a8d0..30af5a832163 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h | |||
@@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with | |||
48 | 48 | ||
49 | 49 | ||
50 | /* | 50 | /* |
51 | * 64-bit system call stack frame layout defines and helpers, | 51 | * 64-bit system call stack frame layout defines and helpers, for |
52 | * for assembly code: | 52 | * assembly code (note that the seemingly unnecessary parentheses |
53 | * are to prevent cpp from inserting spaces in expressions that get | ||
54 | * passed to macros): | ||
53 | */ | 55 | */ |
54 | 56 | ||
55 | #define R15 0 | 57 | #define R15 (0) |
56 | #define R14 8 | 58 | #define R14 (8) |
57 | #define R13 16 | 59 | #define R13 (16) |
58 | #define R12 24 | 60 | #define R12 (24) |
59 | #define RBP 32 | 61 | #define RBP (32) |
60 | #define RBX 40 | 62 | #define RBX (40) |
61 | 63 | ||
62 | /* arguments: interrupts/non tracing syscalls only save up to here: */ | 64 | /* arguments: interrupts/non tracing syscalls only save up to here: */ |
63 | #define R11 48 | 65 | #define R11 (48) |
64 | #define R10 56 | 66 | #define R10 (56) |
65 | #define R9 64 | 67 | #define R9 (64) |
66 | #define R8 72 | 68 | #define R8 (72) |
67 | #define RAX 80 | 69 | #define RAX (80) |
68 | #define RCX 88 | 70 | #define RCX (88) |
69 | #define RDX 96 | 71 | #define RDX (96) |
70 | #define RSI 104 | 72 | #define RSI (104) |
71 | #define RDI 112 | 73 | #define RDI (112) |
72 | #define ORIG_RAX 120 /* + error_code */ | 74 | #define ORIG_RAX (120) /* + error_code */ |
73 | /* end of arguments */ | 75 | /* end of arguments */ |
74 | 76 | ||
75 | /* cpu exception frame or undefined in case of fast syscall: */ | 77 | /* cpu exception frame or undefined in case of fast syscall: */ |
76 | #define RIP 128 | 78 | #define RIP (128) |
77 | #define CS 136 | 79 | #define CS (136) |
78 | #define EFLAGS 144 | 80 | #define EFLAGS (144) |
79 | #define RSP 152 | 81 | #define RSP (152) |
80 | #define SS 160 | 82 | #define SS (160) |
81 | 83 | ||
82 | #define ARGOFFSET R11 | 84 | #define ARGOFFSET R11 |
83 | #define SWFRAME ORIG_RAX | 85 | #define SWFRAME ORIG_RAX |
@@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with | |||
111 | .endif | 113 | .endif |
112 | .endm | 114 | .endm |
113 | 115 | ||
114 | #define ARG_SKIP 9*8 | 116 | #define ARG_SKIP (9*8) |
115 | 117 | ||
116 | .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ | 118 | .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ |
117 | skipr8910=0, skiprdx=0 | 119 | skipr8910=0, skiprdx=0 |
@@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with | |||
169 | .endif | 171 | .endif |
170 | .endm | 172 | .endm |
171 | 173 | ||
172 | #define REST_SKIP 6*8 | 174 | #define REST_SKIP (6*8) |
173 | 175 | ||
174 | .macro SAVE_REST | 176 | .macro SAVE_REST |
175 | subq $REST_SKIP, %rsp | 177 | subq $REST_SKIP, %rsp |
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3f76523589af..220e2ea08e80 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -152,10 +152,14 @@ | |||
152 | #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ | 152 | #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ |
153 | #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ | 153 | #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ |
154 | #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ | 154 | #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ |
155 | #define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ | 155 | #define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ |
156 | #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ | 156 | #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ |
157 | #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ | 157 | #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ |
158 | #define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ | ||
159 | #define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ | ||
158 | #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ | 160 | #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ |
161 | #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ | ||
162 | #define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ | ||
159 | 163 | ||
160 | /* | 164 | /* |
161 | * Auxiliary flags: Linux defined - For features scattered in various | 165 | * Auxiliary flags: Linux defined - For features scattered in various |
@@ -180,6 +184,13 @@ | |||
180 | #define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ | 184 | #define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ |
181 | #define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ | 185 | #define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ |
182 | #define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ | 186 | #define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ |
187 | #define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ | ||
188 | #define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ | ||
189 | #define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ | ||
190 | #define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ | ||
191 | #define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ | ||
192 | #define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ | ||
193 | |||
183 | 194 | ||
184 | /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ | 195 | /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ |
185 | #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ | 196 | #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ |
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 733f7e91e7a9..326099199318 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h | |||
@@ -89,6 +89,16 @@ | |||
89 | CFI_ADJUST_CFA_OFFSET -8 | 89 | CFI_ADJUST_CFA_OFFSET -8 |
90 | .endm | 90 | .endm |
91 | 91 | ||
92 | .macro pushfq_cfi | ||
93 | pushfq | ||
94 | CFI_ADJUST_CFA_OFFSET 8 | ||
95 | .endm | ||
96 | |||
97 | .macro popfq_cfi | ||
98 | popfq | ||
99 | CFI_ADJUST_CFA_OFFSET -8 | ||
100 | .endm | ||
101 | |||
92 | .macro movq_cfi reg offset=0 | 102 | .macro movq_cfi reg offset=0 |
93 | movq %\reg, \offset(%rsp) | 103 | movq %\reg, \offset(%rsp) |
94 | CFI_REL_OFFSET \reg, \offset | 104 | CFI_REL_OFFSET \reg, \offset |
@@ -109,6 +119,16 @@ | |||
109 | CFI_ADJUST_CFA_OFFSET -4 | 119 | CFI_ADJUST_CFA_OFFSET -4 |
110 | .endm | 120 | .endm |
111 | 121 | ||
122 | .macro pushfl_cfi | ||
123 | pushfl | ||
124 | CFI_ADJUST_CFA_OFFSET 4 | ||
125 | .endm | ||
126 | |||
127 | .macro popfl_cfi | ||
128 | popfl | ||
129 | CFI_ADJUST_CFA_OFFSET -4 | ||
130 | .endm | ||
131 | |||
112 | .macro movl_cfi reg offset=0 | 132 | .macro movl_cfi reg offset=0 |
113 | movl %\reg, \offset(%esp) | 133 | movl %\reg, \offset(%esp) |
114 | CFI_REL_OFFSET \reg, \offset | 134 | CFI_REL_OFFSET \reg, \offset |
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index ec8a52d14ab1..5be1542fbfaf 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h | |||
@@ -112,23 +112,13 @@ static inline void early_memtest(unsigned long start, unsigned long end) | |||
112 | } | 112 | } |
113 | #endif | 113 | #endif |
114 | 114 | ||
115 | extern unsigned long end_user_pfn; | ||
116 | |||
117 | extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); | ||
118 | extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); | ||
119 | extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); | ||
120 | #include <linux/early_res.h> | ||
121 | |||
122 | extern unsigned long e820_end_of_ram_pfn(void); | 115 | extern unsigned long e820_end_of_ram_pfn(void); |
123 | extern unsigned long e820_end_of_low_ram_pfn(void); | 116 | extern unsigned long e820_end_of_low_ram_pfn(void); |
124 | extern int e820_find_active_region(const struct e820entry *ei, | 117 | extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); |
125 | unsigned long start_pfn, | 118 | |
126 | unsigned long last_pfn, | 119 | void memblock_x86_fill(void); |
127 | unsigned long *ei_startpfn, | 120 | void memblock_find_dma_reserve(void); |
128 | unsigned long *ei_endpfn); | 121 | |
129 | extern void e820_register_active_regions(int nid, unsigned long start_pfn, | ||
130 | unsigned long end_pfn); | ||
131 | extern u64 e820_hole_size(u64 start, u64 end); | ||
132 | extern void finish_e820_parsing(void); | 122 | extern void finish_e820_parsing(void); |
133 | extern void e820_reserve_resources(void); | 123 | extern void e820_reserve_resources(void); |
134 | extern void e820_reserve_resources_late(void); | 124 | extern void e820_reserve_resources_late(void); |
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8406ed7f9926..8e4a16508d4e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h | |||
@@ -90,7 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, | |||
90 | #endif /* CONFIG_X86_32 */ | 90 | #endif /* CONFIG_X86_32 */ |
91 | 91 | ||
92 | extern int add_efi_memmap; | 92 | extern int add_efi_memmap; |
93 | extern void efi_reserve_early(void); | 93 | extern void efi_memblock_x86_reserve_range(void); |
94 | extern void efi_call_phys_prelog(void); | 94 | extern void efi_call_phys_prelog(void); |
95 | extern void efi_call_phys_epilog(void); | 95 | extern void efi_call_phys_epilog(void); |
96 | 96 | ||
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98f..57650ab4a5f5 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h | |||
@@ -16,22 +16,11 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) | |||
16 | BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) | 16 | BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) |
17 | BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) | 17 | BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) |
18 | 18 | ||
19 | BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, | 19 | .irpc idx, "01234567" |
20 | smp_invalidate_interrupt) | 20 | BUILD_INTERRUPT3(invalidate_interrupt\idx, |
21 | BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, | 21 | (INVALIDATE_TLB_VECTOR_START)+\idx, |
22 | smp_invalidate_interrupt) | ||
23 | BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2, | ||
24 | smp_invalidate_interrupt) | ||
25 | BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3, | ||
26 | smp_invalidate_interrupt) | ||
27 | BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4, | ||
28 | smp_invalidate_interrupt) | ||
29 | BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5, | ||
30 | smp_invalidate_interrupt) | ||
31 | BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6, | ||
32 | smp_invalidate_interrupt) | ||
33 | BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, | ||
34 | smp_invalidate_interrupt) | 22 | smp_invalidate_interrupt) |
23 | .endr | ||
35 | #endif | 24 | #endif |
36 | 25 | ||
37 | BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) | 26 | BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) |
@@ -49,8 +38,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) | |||
49 | BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) | 38 | BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) |
50 | BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) | 39 | BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) |
51 | 40 | ||
52 | #ifdef CONFIG_PERF_EVENTS | 41 | #ifdef CONFIG_IRQ_WORK |
53 | BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) | 42 | BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) |
54 | #endif | 43 | #endif |
55 | 44 | ||
56 | #ifdef CONFIG_X86_THERMAL_VECTOR | 45 | #ifdef CONFIG_X86_THERMAL_VECTOR |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d07b44f7d1dc..4d293dced62f 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h | |||
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) | |||
214 | BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); | 214 | BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); |
215 | return __virt_to_fix(vaddr); | 215 | return __virt_to_fix(vaddr); |
216 | } | 216 | } |
217 | |||
218 | /* Return a pointer with the offset calculated */ | ||
219 | static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx, | ||
220 | phys_addr_t phys, pgprot_t flags) | ||
221 | { | ||
222 | __set_fixmap(idx, phys, flags); | ||
223 | return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); | ||
224 | } | ||
225 | |||
226 | #define set_fixmap_offset(idx, phys) \ | ||
227 | __set_fixmap_offset(idx, phys, PAGE_KERNEL) | ||
228 | |||
229 | #define set_fixmap_offset_nocache(idx, phys) \ | ||
230 | __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) | ||
231 | |||
217 | #endif /* !__ASSEMBLY__ */ | 232 | #endif /* !__ASSEMBLY__ */ |
218 | #endif /* _ASM_X86_FIXMAP_H */ | 233 | #endif /* _ASM_X86_FIXMAP_H */ |
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc1..43085bfc99c3 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h | |||
@@ -17,6 +17,7 @@ extern int fix_aperture; | |||
17 | #define GARTEN (1<<0) | 17 | #define GARTEN (1<<0) |
18 | #define DISGARTCPU (1<<4) | 18 | #define DISGARTCPU (1<<4) |
19 | #define DISGARTIO (1<<5) | 19 | #define DISGARTIO (1<<5) |
20 | #define DISTLBWALKPRB (1<<6) | ||
20 | 21 | ||
21 | /* GART cache control register bits. */ | 22 | /* GART cache control register bits. */ |
22 | #define INVGART (1<<0) | 23 | #define INVGART (1<<0) |
@@ -27,7 +28,6 @@ extern int fix_aperture; | |||
27 | #define AMD64_GARTAPERTUREBASE 0x94 | 28 | #define AMD64_GARTAPERTUREBASE 0x94 |
28 | #define AMD64_GARTTABLEBASE 0x98 | 29 | #define AMD64_GARTTABLEBASE 0x98 |
29 | #define AMD64_GARTCACHECTL 0x9c | 30 | #define AMD64_GARTCACHECTL 0x9c |
30 | #define AMD64_GARTEN (1<<0) | ||
31 | 31 | ||
32 | #ifdef CONFIG_GART_IOMMU | 32 | #ifdef CONFIG_GART_IOMMU |
33 | extern int gart_iommu_aperture; | 33 | extern int gart_iommu_aperture; |
@@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled; | |||
37 | extern void early_gart_iommu_check(void); | 37 | extern void early_gart_iommu_check(void); |
38 | extern int gart_iommu_init(void); | 38 | extern int gart_iommu_init(void); |
39 | extern void __init gart_parse_options(char *); | 39 | extern void __init gart_parse_options(char *); |
40 | extern void gart_iommu_hole_init(void); | 40 | extern int gart_iommu_hole_init(void); |
41 | 41 | ||
42 | #else | 42 | #else |
43 | #define gart_iommu_aperture 0 | 43 | #define gart_iommu_aperture 0 |
@@ -50,13 +50,27 @@ static inline void early_gart_iommu_check(void) | |||
50 | static inline void gart_parse_options(char *options) | 50 | static inline void gart_parse_options(char *options) |
51 | { | 51 | { |
52 | } | 52 | } |
53 | static inline void gart_iommu_hole_init(void) | 53 | static inline int gart_iommu_hole_init(void) |
54 | { | 54 | { |
55 | return -ENODEV; | ||
55 | } | 56 | } |
56 | #endif | 57 | #endif |
57 | 58 | ||
58 | extern int agp_amd64_init(void); | 59 | extern int agp_amd64_init(void); |
59 | 60 | ||
61 | static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) | ||
62 | { | ||
63 | u32 ctl; | ||
64 | |||
65 | /* | ||
66 | * Don't enable translation but enable GART IO and CPU accesses. | ||
67 | * Also, set DISTLBWALKPRB since GART tables memory is UC. | ||
68 | */ | ||
69 | ctl = DISTLBWALKPRB | order << 1; | ||
70 | |||
71 | pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); | ||
72 | } | ||
73 | |||
60 | static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) | 74 | static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) |
61 | { | 75 | { |
62 | u32 tmp, ctl; | 76 | u32 tmp, ctl; |
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index aeab29aee617..55e4de613f0e 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h | |||
@@ -14,7 +14,7 @@ typedef struct { | |||
14 | #endif | 14 | #endif |
15 | unsigned int x86_platform_ipis; /* arch dependent */ | 15 | unsigned int x86_platform_ipis; /* arch dependent */ |
16 | unsigned int apic_perf_irqs; | 16 | unsigned int apic_perf_irqs; |
17 | unsigned int apic_pending_irqs; | 17 | unsigned int apic_irq_work_irqs; |
18 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
19 | unsigned int irq_resched_count; | 19 | unsigned int irq_resched_count; |
20 | unsigned int irq_call_count; | 20 | unsigned int irq_call_count; |
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 8caac76ac324..3bd04022fd0c 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h | |||
@@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page); | |||
59 | 59 | ||
60 | void *kmap(struct page *page); | 60 | void *kmap(struct page *page); |
61 | void kunmap(struct page *page); | 61 | void kunmap(struct page *page); |
62 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); | 62 | |
63 | void *kmap_atomic(struct page *page, enum km_type type); | 63 | void *kmap_atomic_prot(struct page *page, pgprot_t prot); |
64 | void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); | 64 | void *__kmap_atomic(struct page *page); |
65 | void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); | 65 | void __kunmap_atomic(void *kvaddr); |
66 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); | 66 | void *kmap_atomic_pfn(unsigned long pfn); |
67 | void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); | ||
67 | struct page *kmap_atomic_to_page(void *ptr); | 68 | struct page *kmap_atomic_to_page(void *ptr); |
68 | 69 | ||
69 | #define flush_cache_kmaps() do { } while (0) | 70 | #define flush_cache_kmaps() do { } while (0) |
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d5905fd8ba41..0274ec5a7e62 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h | |||
@@ -29,7 +29,7 @@ | |||
29 | extern void apic_timer_interrupt(void); | 29 | extern void apic_timer_interrupt(void); |
30 | extern void x86_platform_ipi(void); | 30 | extern void x86_platform_ipi(void); |
31 | extern void error_interrupt(void); | 31 | extern void error_interrupt(void); |
32 | extern void perf_pending_interrupt(void); | 32 | extern void irq_work_interrupt(void); |
33 | 33 | ||
34 | extern void spurious_interrupt(void); | 34 | extern void spurious_interrupt(void); |
35 | extern void thermal_interrupt(void); | 35 | extern void thermal_interrupt(void); |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a73a8d5a5e69..4aa2bb3b242a 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf); | |||
55 | extern int restore_i387_xstate_ia32(void __user *buf); | 55 | extern int restore_i387_xstate_ia32(void __user *buf); |
56 | #endif | 56 | #endif |
57 | 57 | ||
58 | #ifdef CONFIG_MATH_EMULATION | ||
59 | extern void finit_soft_fpu(struct i387_soft_struct *soft); | ||
60 | #else | ||
61 | static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} | ||
62 | #endif | ||
63 | |||
58 | #define X87_FSW_ES (1 << 7) /* Exception Summary */ | 64 | #define X87_FSW_ES (1 << 7) /* Exception Summary */ |
59 | 65 | ||
60 | static __always_inline __pure bool use_xsaveopt(void) | 66 | static __always_inline __pure bool use_xsaveopt(void) |
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void) | |||
67 | return static_cpu_has(X86_FEATURE_XSAVE); | 73 | return static_cpu_has(X86_FEATURE_XSAVE); |
68 | } | 74 | } |
69 | 75 | ||
76 | static __always_inline __pure bool use_fxsr(void) | ||
77 | { | ||
78 | return static_cpu_has(X86_FEATURE_FXSR); | ||
79 | } | ||
80 | |||
70 | extern void __sanitize_i387_state(struct task_struct *); | 81 | extern void __sanitize_i387_state(struct task_struct *); |
71 | 82 | ||
72 | static inline void sanitize_i387_state(struct task_struct *tsk) | 83 | static inline void sanitize_i387_state(struct task_struct *tsk) |
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk) | |||
77 | } | 88 | } |
78 | 89 | ||
79 | #ifdef CONFIG_X86_64 | 90 | #ifdef CONFIG_X86_64 |
80 | |||
81 | /* Ignore delayed exceptions from user space */ | ||
82 | static inline void tolerant_fwait(void) | ||
83 | { | ||
84 | asm volatile("1: fwait\n" | ||
85 | "2:\n" | ||
86 | _ASM_EXTABLE(1b, 2b)); | ||
87 | } | ||
88 | |||
89 | static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | 91 | static inline int fxrstor_checking(struct i387_fxsave_struct *fx) |
90 | { | 92 | { |
91 | int err; | 93 | int err; |
92 | 94 | ||
95 | /* See comment in fxsave() below. */ | ||
93 | asm volatile("1: rex64/fxrstor (%[fx])\n\t" | 96 | asm volatile("1: rex64/fxrstor (%[fx])\n\t" |
94 | "2:\n" | 97 | "2:\n" |
95 | ".section .fixup,\"ax\"\n" | 98 | ".section .fixup,\"ax\"\n" |
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | |||
98 | ".previous\n" | 101 | ".previous\n" |
99 | _ASM_EXTABLE(1b, 3b) | 102 | _ASM_EXTABLE(1b, 3b) |
100 | : [err] "=r" (err) | 103 | : [err] "=r" (err) |
101 | #if 0 /* See comment in fxsave() below. */ | 104 | : [fx] "R" (fx), "m" (*fx), "0" (0)); |
102 | : [fx] "r" (fx), "m" (*fx), "0" (0)); | ||
103 | #else | ||
104 | : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); | ||
105 | #endif | ||
106 | return err; | 105 | return err; |
107 | } | 106 | } |
108 | 107 | ||
109 | /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception | ||
110 | is pending. Clear the x87 state here by setting it to fixed | ||
111 | values. The kernel data segment can be sometimes 0 and sometimes | ||
112 | new user value. Both should be ok. | ||
113 | Use the PDA as safe address because it should be already in L1. */ | ||
114 | static inline void fpu_clear(struct fpu *fpu) | ||
115 | { | ||
116 | struct xsave_struct *xstate = &fpu->state->xsave; | ||
117 | struct i387_fxsave_struct *fx = &fpu->state->fxsave; | ||
118 | |||
119 | /* | ||
120 | * xsave header may indicate the init state of the FP. | ||
121 | */ | ||
122 | if (use_xsave() && | ||
123 | !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) | ||
124 | return; | ||
125 | |||
126 | if (unlikely(fx->swd & X87_FSW_ES)) | ||
127 | asm volatile("fnclex"); | ||
128 | alternative_input(ASM_NOP8 ASM_NOP2, | ||
129 | " emms\n" /* clear stack tags */ | ||
130 | " fildl %%gs:0", /* load to clear state */ | ||
131 | X86_FEATURE_FXSAVE_LEAK); | ||
132 | } | ||
133 | |||
134 | static inline void clear_fpu_state(struct task_struct *tsk) | ||
135 | { | ||
136 | fpu_clear(&tsk->thread.fpu); | ||
137 | } | ||
138 | |||
139 | static inline int fxsave_user(struct i387_fxsave_struct __user *fx) | 108 | static inline int fxsave_user(struct i387_fxsave_struct __user *fx) |
140 | { | 109 | { |
141 | int err; | 110 | int err; |
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) | |||
149 | if (unlikely(err)) | 118 | if (unlikely(err)) |
150 | return -EFAULT; | 119 | return -EFAULT; |
151 | 120 | ||
121 | /* See comment in fxsave() below. */ | ||
152 | asm volatile("1: rex64/fxsave (%[fx])\n\t" | 122 | asm volatile("1: rex64/fxsave (%[fx])\n\t" |
153 | "2:\n" | 123 | "2:\n" |
154 | ".section .fixup,\"ax\"\n" | 124 | ".section .fixup,\"ax\"\n" |
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) | |||
157 | ".previous\n" | 127 | ".previous\n" |
158 | _ASM_EXTABLE(1b, 3b) | 128 | _ASM_EXTABLE(1b, 3b) |
159 | : [err] "=r" (err), "=m" (*fx) | 129 | : [err] "=r" (err), "=m" (*fx) |
160 | #if 0 /* See comment in fxsave() below. */ | 130 | : [fx] "R" (fx), "0" (0)); |
161 | : [fx] "r" (fx), "0" (0)); | ||
162 | #else | ||
163 | : [fx] "cdaSDb" (fx), "0" (0)); | ||
164 | #endif | ||
165 | if (unlikely(err) && | 131 | if (unlikely(err) && |
166 | __clear_user(fx, sizeof(struct i387_fxsave_struct))) | 132 | __clear_user(fx, sizeof(struct i387_fxsave_struct))) |
167 | err = -EFAULT; | 133 | err = -EFAULT; |
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu) | |||
175 | uses any extended registers for addressing, a second REX prefix | 141 | uses any extended registers for addressing, a second REX prefix |
176 | will be generated (to the assembler, rex64 followed by semicolon | 142 | will be generated (to the assembler, rex64 followed by semicolon |
177 | is a separate instruction), and hence the 64-bitness is lost. */ | 143 | is a separate instruction), and hence the 64-bitness is lost. */ |
178 | #if 0 | 144 | |
145 | #ifdef CONFIG_AS_FXSAVEQ | ||
179 | /* Using "fxsaveq %0" would be the ideal choice, but is only supported | 146 | /* Using "fxsaveq %0" would be the ideal choice, but is only supported |
180 | starting with gas 2.16. */ | 147 | starting with gas 2.16. */ |
181 | __asm__ __volatile__("fxsaveq %0" | 148 | __asm__ __volatile__("fxsaveq %0" |
182 | : "=m" (fpu->state->fxsave)); | 149 | : "=m" (fpu->state->fxsave)); |
183 | #elif 0 | 150 | #else |
184 | /* Using, as a workaround, the properly prefixed form below isn't | 151 | /* Using, as a workaround, the properly prefixed form below isn't |
185 | accepted by any binutils version so far released, complaining that | 152 | accepted by any binutils version so far released, complaining that |
186 | the same type of prefix is used twice if an extended register is | 153 | the same type of prefix is used twice if an extended register is |
187 | needed for addressing (fix submitted to mainline 2005-11-21). */ | 154 | needed for addressing (fix submitted to mainline 2005-11-21). |
188 | __asm__ __volatile__("rex64/fxsave %0" | 155 | asm volatile("rex64/fxsave %0" |
189 | : "=m" (fpu->state->fxsave)); | 156 | : "=m" (fpu->state->fxsave)); |
190 | #else | 157 | This, however, we can work around by forcing the compiler to select |
191 | /* This, however, we can work around by forcing the compiler to select | ||
192 | an addressing mode that doesn't require extended registers. */ | 158 | an addressing mode that doesn't require extended registers. */ |
193 | __asm__ __volatile__("rex64/fxsave (%1)" | 159 | asm volatile("rex64/fxsave (%[fx])" |
194 | : "=m" (fpu->state->fxsave) | 160 | : "=m" (fpu->state->fxsave) |
195 | : "cdaSDb" (&fpu->state->fxsave)); | 161 | : [fx] "R" (&fpu->state->fxsave)); |
196 | #endif | 162 | #endif |
197 | } | 163 | } |
198 | 164 | ||
199 | static inline void fpu_save_init(struct fpu *fpu) | ||
200 | { | ||
201 | if (use_xsave()) | ||
202 | fpu_xsave(fpu); | ||
203 | else | ||
204 | fpu_fxsave(fpu); | ||
205 | |||
206 | fpu_clear(fpu); | ||
207 | } | ||
208 | |||
209 | static inline void __save_init_fpu(struct task_struct *tsk) | ||
210 | { | ||
211 | fpu_save_init(&tsk->thread.fpu); | ||
212 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | ||
213 | } | ||
214 | |||
215 | #else /* CONFIG_X86_32 */ | 165 | #else /* CONFIG_X86_32 */ |
216 | 166 | ||
217 | #ifdef CONFIG_MATH_EMULATION | ||
218 | extern void finit_soft_fpu(struct i387_soft_struct *soft); | ||
219 | #else | ||
220 | static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} | ||
221 | #endif | ||
222 | |||
223 | static inline void tolerant_fwait(void) | ||
224 | { | ||
225 | asm volatile("fnclex ; fwait"); | ||
226 | } | ||
227 | |||
228 | /* perform fxrstor iff the processor has extended states, otherwise frstor */ | 167 | /* perform fxrstor iff the processor has extended states, otherwise frstor */ |
229 | static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | 168 | static inline int fxrstor_checking(struct i387_fxsave_struct *fx) |
230 | { | 169 | { |
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | |||
241 | return 0; | 180 | return 0; |
242 | } | 181 | } |
243 | 182 | ||
183 | static inline void fpu_fxsave(struct fpu *fpu) | ||
184 | { | ||
185 | asm volatile("fxsave %[fx]" | ||
186 | : [fx] "=m" (fpu->state->fxsave)); | ||
187 | } | ||
188 | |||
189 | #endif /* CONFIG_X86_64 */ | ||
190 | |||
244 | /* We need a safe address that is cheap to find and that is already | 191 | /* We need a safe address that is cheap to find and that is already |
245 | in L1 during context switch. The best choices are unfortunately | 192 | in L1 during context switch. The best choices are unfortunately |
246 | different for UP and SMP */ | 193 | different for UP and SMP */ |
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | |||
256 | static inline void fpu_save_init(struct fpu *fpu) | 203 | static inline void fpu_save_init(struct fpu *fpu) |
257 | { | 204 | { |
258 | if (use_xsave()) { | 205 | if (use_xsave()) { |
259 | struct xsave_struct *xstate = &fpu->state->xsave; | ||
260 | struct i387_fxsave_struct *fx = &fpu->state->fxsave; | ||
261 | |||
262 | fpu_xsave(fpu); | 206 | fpu_xsave(fpu); |
263 | 207 | ||
264 | /* | 208 | /* |
265 | * xsave header may indicate the init state of the FP. | 209 | * xsave header may indicate the init state of the FP. |
266 | */ | 210 | */ |
267 | if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) | 211 | if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) |
268 | goto end; | 212 | return; |
269 | 213 | } else if (use_fxsr()) { | |
270 | if (unlikely(fx->swd & X87_FSW_ES)) | 214 | fpu_fxsave(fpu); |
271 | asm volatile("fnclex"); | 215 | } else { |
272 | 216 | asm volatile("fsave %[fx]; fwait" | |
273 | /* | 217 | : [fx] "=m" (fpu->state->fsave)); |
274 | * we can do a simple return here or be paranoid :) | 218 | return; |
275 | */ | ||
276 | goto clear_state; | ||
277 | } | 219 | } |
278 | 220 | ||
279 | /* Use more nops than strictly needed in case the compiler | 221 | if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) |
280 | varies code */ | 222 | asm volatile("fnclex"); |
281 | alternative_input( | 223 | |
282 | "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, | ||
283 | "fxsave %[fx]\n" | ||
284 | "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", | ||
285 | X86_FEATURE_FXSR, | ||
286 | [fx] "m" (fpu->state->fxsave), | ||
287 | [fsw] "m" (fpu->state->fxsave.swd) : "memory"); | ||
288 | clear_state: | ||
289 | /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception | 224 | /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception |
290 | is pending. Clear the x87 state here by setting it to fixed | 225 | is pending. Clear the x87 state here by setting it to fixed |
291 | values. safe_address is a random variable that should be in L1 */ | 226 | values. safe_address is a random variable that should be in L1 */ |
292 | alternative_input( | 227 | alternative_input( |
293 | GENERIC_NOP8 GENERIC_NOP2, | 228 | ASM_NOP8 ASM_NOP2, |
294 | "emms\n\t" /* clear stack tags */ | 229 | "emms\n\t" /* clear stack tags */ |
295 | "fildl %[addr]", /* set F?P to defined value */ | 230 | "fildl %P[addr]", /* set F?P to defined value */ |
296 | X86_FEATURE_FXSAVE_LEAK, | 231 | X86_FEATURE_FXSAVE_LEAK, |
297 | [addr] "m" (safe_address)); | 232 | [addr] "m" (safe_address)); |
298 | end: | ||
299 | ; | ||
300 | } | 233 | } |
301 | 234 | ||
302 | static inline void __save_init_fpu(struct task_struct *tsk) | 235 | static inline void __save_init_fpu(struct task_struct *tsk) |
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk) | |||
305 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 238 | task_thread_info(tsk)->status &= ~TS_USEDFPU; |
306 | } | 239 | } |
307 | 240 | ||
308 | |||
309 | #endif /* CONFIG_X86_64 */ | ||
310 | |||
311 | static inline int fpu_fxrstor_checking(struct fpu *fpu) | 241 | static inline int fpu_fxrstor_checking(struct fpu *fpu) |
312 | { | 242 | { |
313 | return fxrstor_checking(&fpu->state->fxsave); | 243 | return fxrstor_checking(&fpu->state->fxsave); |
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk) | |||
344 | static inline void __clear_fpu(struct task_struct *tsk) | 274 | static inline void __clear_fpu(struct task_struct *tsk) |
345 | { | 275 | { |
346 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | 276 | if (task_thread_info(tsk)->status & TS_USEDFPU) { |
347 | tolerant_fwait(); | 277 | /* Ignore delayed exceptions from user space */ |
278 | asm volatile("1: fwait\n" | ||
279 | "2:\n" | ||
280 | _ASM_EXTABLE(1b, 2b)); | ||
348 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | 281 | task_thread_info(tsk)->status &= ~TS_USEDFPU; |
349 | stts(); | 282 | stts(); |
350 | } | 283 | } |
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state) | |||
405 | stts(); | 338 | stts(); |
406 | } | 339 | } |
407 | 340 | ||
408 | #ifdef CONFIG_X86_64 | ||
409 | |||
410 | static inline void save_init_fpu(struct task_struct *tsk) | ||
411 | { | ||
412 | __save_init_fpu(tsk); | ||
413 | stts(); | ||
414 | } | ||
415 | |||
416 | #define unlazy_fpu __unlazy_fpu | ||
417 | #define clear_fpu __clear_fpu | ||
418 | |||
419 | #else /* CONFIG_X86_32 */ | ||
420 | |||
421 | /* | 341 | /* |
422 | * These disable preemption on their own and are safe | 342 | * These disable preemption on their own and are safe |
423 | */ | 343 | */ |
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk) | |||
443 | preempt_enable(); | 363 | preempt_enable(); |
444 | } | 364 | } |
445 | 365 | ||
446 | #endif /* CONFIG_X86_64 */ | ||
447 | |||
448 | /* | 366 | /* |
449 | * i387 state interaction | 367 | * i387 state interaction |
450 | */ | 368 | */ |
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu); | |||
508 | 426 | ||
509 | #endif /* __ASSEMBLY__ */ | 427 | #endif /* __ASSEMBLY__ */ |
510 | 428 | ||
511 | #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 | ||
512 | #define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 | ||
513 | |||
514 | #endif /* _ASM_X86_I387_H */ | 429 | #endif /* _ASM_X86_I387_H */ |
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 0ad29d401565..072273082528 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h | |||
@@ -208,6 +208,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) | |||
208 | 208 | ||
209 | extern void iounmap(volatile void __iomem *addr); | 209 | extern void iounmap(volatile void __iomem *addr); |
210 | 210 | ||
211 | extern void set_iounmap_nonlazy(void); | ||
211 | 212 | ||
212 | #ifdef __KERNEL__ | 213 | #ifdef __KERNEL__ |
213 | 214 | ||
@@ -350,6 +351,7 @@ extern void __iomem *early_memremap(resource_size_t phys_addr, | |||
350 | unsigned long size); | 351 | unsigned long size); |
351 | extern void early_iounmap(void __iomem *addr, unsigned long size); | 352 | extern void early_iounmap(void __iomem *addr, unsigned long size); |
352 | extern void fixup_early_ioremap(void); | 353 | extern void fixup_early_ioremap(void); |
354 | extern bool is_early_ioremap_ptep(pte_t *ptep); | ||
353 | 355 | ||
354 | #ifdef CONFIG_XEN | 356 | #ifdef CONFIG_XEN |
355 | struct bio_vec; | 357 | struct bio_vec; |
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index c4191b3b7056..363e33eb6ec1 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h | |||
@@ -27,10 +27,10 @@ | |||
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | 28 | ||
29 | void __iomem * | 29 | void __iomem * |
30 | iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); | 30 | iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); |
31 | 31 | ||
32 | void | 32 | void |
33 | iounmap_atomic(void __iomem *kvaddr, enum km_type type); | 33 | iounmap_atomic(void __iomem *kvaddr); |
34 | 34 | ||
35 | int | 35 | int |
36 | iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); | 36 | iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); |
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h new file mode 100644 index 000000000000..f229b13a5f30 --- /dev/null +++ b/arch/x86/include/asm/iommu_table.h | |||
@@ -0,0 +1,100 @@ | |||
1 | #ifndef _ASM_X86_IOMMU_TABLE_H | ||
2 | #define _ASM_X86_IOMMU_TABLE_H | ||
3 | |||
4 | #include <asm/swiotlb.h> | ||
5 | |||
6 | /* | ||
7 | * History lesson: | ||
8 | * The execution chain of IOMMUs in 2.6.36 looks as so: | ||
9 | * | ||
10 | * [xen-swiotlb] | ||
11 | * | | ||
12 | * +----[swiotlb *]--+ | ||
13 | * / | \ | ||
14 | * / | \ | ||
15 | * [GART] [Calgary] [Intel VT-d] | ||
16 | * / | ||
17 | * / | ||
18 | * [AMD-Vi] | ||
19 | * | ||
20 | * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip | ||
21 | * over the rest of IOMMUs and unconditionally initialize the SWIOTLB. | ||
22 | * Also it would surreptitiously set swiotlb=1 if there were | ||
23 | * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb | ||
24 | * flag would be turned off by all IOMMUs except the Calgary one. | ||
25 | * | ||
26 | * The IOMMU_INIT* macros allow a similar tree (or more complex if desired) | ||
27 | * to be built by defining who we depend on. | ||
28 | * | ||
29 | * And all that needs to be done is to use one of the macros in the IOMMU | ||
30 | * and the pci-dma.c will take care of the rest. | ||
31 | */ | ||
32 | |||
33 | struct iommu_table_entry { | ||
34 | initcall_t detect; /* Detection routine for this IOMMU. */ | ||
35 | initcall_t depend; /* Detection routine that must be called before ours. */ | ||
36 | void (*early_init)(void); /* No memory allocator available. */ | ||
37 | void (*late_init)(void); /* Memory allocator is available. */ | ||
38 | #define IOMMU_FINISH_IF_DETECTED (1<<0) /* Set by __IOMMU_INIT when _finish is nonzero. */ | ||
39 | #define IOMMU_DETECTED (1<<1) | ||
40 | int flags; | ||
41 | }; | ||
42 | /* | ||
43 | * Macro fills out an entry in the .iommu_table that is equivalent | ||
44 | * to the fields that 'struct iommu_table_entry' has. The entries | ||
45 | * that are put in the .iommu_table section are not put in any order | ||
46 | * hence during boot-time we will have to re-sort them based on | ||
47 | * dependency. A nonzero _finish sets IOMMU_FINISH_IF_DETECTED in flags. */ | ||
48 | |||
49 | |||
50 | #define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ | ||
51 | static const struct iommu_table_entry \ | ||
52 | __iommu_entry_##_detect __used \ | ||
53 | __attribute__ ((unused, __section__(".iommu_table"), \ | ||
54 | aligned((sizeof(void *))))) \ | ||
55 | = {_detect, _depend, _early_init, _late_init, \ | ||
56 | _finish ? IOMMU_FINISH_IF_DETECTED : 0} | ||
57 | /* | ||
58 | * The simplest IOMMU definition. Provide the detection routine | ||
59 | * and it will be run after the SWIOTLB and the other IOMMUs | ||
60 | * that utilize this macro. If the IOMMU is detected (ie, the | ||
61 | * detect routine returns a positive value), the other IOMMUs | ||
62 | * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer | ||
63 | * to stop detecting the other IOMMUs after yours has been detected. | ||
64 | */ | ||
65 | #define IOMMU_INIT_POST(_detect) \ | ||
66 | __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0) | ||
67 | |||
68 | #define IOMMU_INIT_POST_FINISH(_detect) \ | ||
69 | __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1) /* _finish=1: stop detecting other IOMMUs once this one is detected */ | ||
70 | |||
71 | /* | ||
72 | * A more sophisticated version of IOMMU_INIT. This variant requires: | ||
73 | * a). A detection routine function. | ||
74 | * b). The name of the detection routine we depend on to get called | ||
75 | * before us. | ||
76 | * c). The init routine which gets called if the detection routine | ||
77 | * returns a positive value from the pci_iommu_alloc. This means | ||
78 | * no presence of a memory allocator. | ||
79 | * d). Similar to the 'init', except that this gets called from pci_iommu_init | ||
80 | * where we do have a memory allocator. | ||
81 | * | ||
82 | * The standard vs the _FINISH differs in that the standard variant will | ||
83 | * continue detecting other IOMMUs in the call list after | ||
84 | * the detection routine returns a positive number. The _FINISH will | ||
85 | * stop the execution chain. Both will still call the 'init' and | ||
86 | * 'late_init' functions if they are set. | ||
87 | */ | ||
88 | #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ | ||
89 | __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) | ||
90 | |||
91 | #define IOMMU_INIT(_detect, _depend, _init, _late_init) \ | ||
92 | __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) | ||
93 | |||
94 | void sort_iommu_table(struct iommu_table_entry *start, | ||
95 | struct iommu_table_entry *finish); | ||
96 | |||
97 | void check_iommu_entries(struct iommu_table_entry *start, | ||
98 | struct iommu_table_entry *finish); | ||
99 | |||
100 | #endif /* _ASM_X86_IOMMU_TABLE_H */ | ||
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 5458380b6ef8..13b0ebaa512f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h | |||
@@ -19,18 +19,14 @@ static inline int irq_canonicalize(int irq) | |||
19 | # define ARCH_HAS_NMI_WATCHDOG | 19 | # define ARCH_HAS_NMI_WATCHDOG |
20 | #endif | 20 | #endif |
21 | 21 | ||
22 | #ifdef CONFIG_4KSTACKS | 22 | #ifdef CONFIG_X86_32 |
23 | extern void irq_ctx_init(int cpu); | 23 | extern void irq_ctx_init(int cpu); |
24 | extern void irq_ctx_exit(int cpu); | ||
25 | # define __ARCH_HAS_DO_SOFTIRQ | ||
26 | #else | 24 | #else |
27 | # define irq_ctx_init(cpu) do { } while (0) | 25 | # define irq_ctx_init(cpu) do { } while (0) |
28 | # define irq_ctx_exit(cpu) do { } while (0) | ||
29 | # ifdef CONFIG_X86_64 | ||
30 | # define __ARCH_HAS_DO_SOFTIRQ | ||
31 | # endif | ||
32 | #endif | 26 | #endif |
33 | 27 | ||
28 | #define __ARCH_HAS_DO_SOFTIRQ | ||
29 | |||
34 | #ifdef CONFIG_HOTPLUG_CPU | 30 | #ifdef CONFIG_HOTPLUG_CPU |
35 | #include <linux/cpumask.h> | 31 | #include <linux/cpumask.h> |
36 | extern void fixup_irqs(void); | 32 | extern void fixup_irqs(void); |
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca30092557..6af0894dafb4 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h | |||
@@ -114,9 +114,9 @@ | |||
114 | #define X86_PLATFORM_IPI_VECTOR 0xed | 114 | #define X86_PLATFORM_IPI_VECTOR 0xed |
115 | 115 | ||
116 | /* | 116 | /* |
117 | * Performance monitoring pending work vector: | 117 | * IRQ work vector: |
118 | */ | 118 | */ |
119 | #define LOCAL_PENDING_VECTOR 0xec | 119 | #define IRQ_WORK_VECTOR 0xec |
120 | 120 | ||
121 | #define UV_BAU_MESSAGE 0xea | 121 | #define UV_BAU_MESSAGE 0xea |
122 | 122 | ||
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 9e2b952f810a..5745ce8bf108 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h | |||
@@ -61,22 +61,22 @@ static inline void native_halt(void) | |||
61 | #else | 61 | #else |
62 | #ifndef __ASSEMBLY__ | 62 | #ifndef __ASSEMBLY__ |
63 | 63 | ||
64 | static inline unsigned long __raw_local_save_flags(void) | 64 | static inline unsigned long arch_local_save_flags(void) |
65 | { | 65 | { |
66 | return native_save_fl(); | 66 | return native_save_fl(); |
67 | } | 67 | } |
68 | 68 | ||
69 | static inline void raw_local_irq_restore(unsigned long flags) | 69 | static inline void arch_local_irq_restore(unsigned long flags) |
70 | { | 70 | { |
71 | native_restore_fl(flags); | 71 | native_restore_fl(flags); |
72 | } | 72 | } |
73 | 73 | ||
74 | static inline void raw_local_irq_disable(void) | 74 | static inline void arch_local_irq_disable(void) |
75 | { | 75 | { |
76 | native_irq_disable(); | 76 | native_irq_disable(); |
77 | } | 77 | } |
78 | 78 | ||
79 | static inline void raw_local_irq_enable(void) | 79 | static inline void arch_local_irq_enable(void) |
80 | { | 80 | { |
81 | native_irq_enable(); | 81 | native_irq_enable(); |
82 | } | 82 | } |
@@ -85,7 +85,7 @@ static inline void raw_local_irq_enable(void) | |||
85 | * Used in the idle loop; sti takes one instruction cycle | 85 | * Used in the idle loop; sti takes one instruction cycle |
86 | * to complete: | 86 | * to complete: |
87 | */ | 87 | */ |
88 | static inline void raw_safe_halt(void) | 88 | static inline void arch_safe_halt(void) |
89 | { | 89 | { |
90 | native_safe_halt(); | 90 | native_safe_halt(); |
91 | } | 91 | } |
@@ -102,12 +102,10 @@ static inline void halt(void) | |||
102 | /* | 102 | /* |
103 | * For spinlocks, etc: | 103 | * For spinlocks, etc: |
104 | */ | 104 | */ |
105 | static inline unsigned long __raw_local_irq_save(void) | 105 | static inline unsigned long arch_local_irq_save(void) |
106 | { | 106 | { |
107 | unsigned long flags = __raw_local_save_flags(); | 107 | unsigned long flags = arch_local_save_flags(); |
108 | 108 | arch_local_irq_disable(); | |
109 | raw_local_irq_disable(); | ||
110 | |||
111 | return flags; | 109 | return flags; |
112 | } | 110 | } |
113 | #else | 111 | #else |
@@ -153,22 +151,16 @@ static inline unsigned long __raw_local_irq_save(void) | |||
153 | #endif /* CONFIG_PARAVIRT */ | 151 | #endif /* CONFIG_PARAVIRT */ |
154 | 152 | ||
155 | #ifndef __ASSEMBLY__ | 153 | #ifndef __ASSEMBLY__ |
156 | #define raw_local_save_flags(flags) \ | 154 | static inline int arch_irqs_disabled_flags(unsigned long flags) |
157 | do { (flags) = __raw_local_save_flags(); } while (0) | ||
158 | |||
159 | #define raw_local_irq_save(flags) \ | ||
160 | do { (flags) = __raw_local_irq_save(); } while (0) | ||
161 | |||
162 | static inline int raw_irqs_disabled_flags(unsigned long flags) | ||
163 | { | 155 | { |
164 | return !(flags & X86_EFLAGS_IF); | 156 | return !(flags & X86_EFLAGS_IF); |
165 | } | 157 | } |
166 | 158 | ||
167 | static inline int raw_irqs_disabled(void) | 159 | static inline int arch_irqs_disabled(void) |
168 | { | 160 | { |
169 | unsigned long flags = __raw_local_save_flags(); | 161 | unsigned long flags = arch_local_save_flags(); |
170 | 162 | ||
171 | return raw_irqs_disabled_flags(flags); | 163 | return arch_irqs_disabled_flags(flags); |
172 | } | 164 | } |
173 | 165 | ||
174 | #else | 166 | #else |
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h new file mode 100644 index 000000000000..f52d42e80585 --- /dev/null +++ b/arch/x86/include/asm/jump_label.h | |||
@@ -0,0 +1,37 @@ | |||
1 | #ifndef _ASM_X86_JUMP_LABEL_H | ||
2 | #define _ASM_X86_JUMP_LABEL_H | ||
3 | |||
4 | #ifdef __KERNEL__ | ||
5 | |||
6 | #include <linux/types.h> | ||
7 | #include <asm/nops.h> | ||
8 | |||
9 | #define JUMP_LABEL_NOP_SIZE 5 | ||
10 | |||
11 | # define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" | ||
12 | |||
13 | # define JUMP_LABEL(key, label) \ | ||
14 | do { \ | ||
15 | asm goto("1:" \ | ||
16 | JUMP_LABEL_INITIAL_NOP \ | ||
17 | ".pushsection __jump_table, \"a\" \n\t"\ | ||
18 | _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ | ||
19 | ".popsection \n\t" \ | ||
20 | : : "i" (key) : : label); \ | ||
21 | } while (0) | ||
22 | |||
23 | #endif /* __KERNEL__ */ | ||
24 | |||
25 | #ifdef CONFIG_X86_64 | ||
26 | typedef u64 jump_label_t; | ||
27 | #else | ||
28 | typedef u32 jump_label_t; | ||
29 | #endif | ||
30 | |||
31 | struct jump_entry { /* One __jump_table record; mirrors the three _ASM_PTR words emitted by JUMP_LABEL. */ | ||
32 | jump_label_t code; /* Address of the "1:" site holding JUMP_LABEL_INITIAL_NOP. */ | ||
33 | jump_label_t target; /* Address of the C label passed to JUMP_LABEL. */ | ||
34 | jump_label_t key; /* The key operand passed to JUMP_LABEL. */ | ||
35 | }; | ||
36 | |||
37 | #endif | ||
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1f99ecfc48e1..b36c6b3fe144 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
@@ -139,6 +139,7 @@ struct x86_emulate_ops { | |||
139 | void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); | 139 | void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); |
140 | unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); | 140 | unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); |
141 | void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); | 141 | void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); |
142 | void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); | ||
142 | ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); | 143 | ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); |
143 | int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); | 144 | int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); |
144 | int (*cpl)(struct kvm_vcpu *vcpu); | 145 | int (*cpl)(struct kvm_vcpu *vcpu); |
@@ -156,7 +157,10 @@ struct operand { | |||
156 | unsigned long orig_val; | 157 | unsigned long orig_val; |
157 | u64 orig_val64; | 158 | u64 orig_val64; |
158 | }; | 159 | }; |
159 | unsigned long *ptr; | 160 | union { |
161 | unsigned long *reg; | ||
162 | unsigned long mem; | ||
163 | } addr; | ||
160 | union { | 164 | union { |
161 | unsigned long val; | 165 | unsigned long val; |
162 | u64 val64; | 166 | u64 val64; |
@@ -190,6 +194,7 @@ struct decode_cache { | |||
190 | bool has_seg_override; | 194 | bool has_seg_override; |
191 | u8 seg_override; | 195 | u8 seg_override; |
192 | unsigned int d; | 196 | unsigned int d; |
197 | int (*execute)(struct x86_emulate_ctxt *ctxt); | ||
193 | unsigned long regs[NR_VCPU_REGS]; | 198 | unsigned long regs[NR_VCPU_REGS]; |
194 | unsigned long eip; | 199 | unsigned long eip; |
195 | /* modrm */ | 200 | /* modrm */ |
@@ -197,17 +202,16 @@ struct decode_cache { | |||
197 | u8 modrm_mod; | 202 | u8 modrm_mod; |
198 | u8 modrm_reg; | 203 | u8 modrm_reg; |
199 | u8 modrm_rm; | 204 | u8 modrm_rm; |
200 | u8 use_modrm_ea; | 205 | u8 modrm_seg; |
201 | bool rip_relative; | 206 | bool rip_relative; |
202 | unsigned long modrm_ea; | ||
203 | void *modrm_ptr; | ||
204 | unsigned long modrm_val; | ||
205 | struct fetch_cache fetch; | 207 | struct fetch_cache fetch; |
206 | struct read_cache io_read; | 208 | struct read_cache io_read; |
207 | struct read_cache mem_read; | 209 | struct read_cache mem_read; |
208 | }; | 210 | }; |
209 | 211 | ||
210 | struct x86_emulate_ctxt { | 212 | struct x86_emulate_ctxt { |
213 | struct x86_emulate_ops *ops; | ||
214 | |||
211 | /* Register state before/after emulation. */ | 215 | /* Register state before/after emulation. */ |
212 | struct kvm_vcpu *vcpu; | 216 | struct kvm_vcpu *vcpu; |
213 | 217 | ||
@@ -220,12 +224,11 @@ struct x86_emulate_ctxt { | |||
220 | /* interruptibility state, as a result of execution of STI or MOV SS */ | 224 | /* interruptibility state, as a result of execution of STI or MOV SS */ |
221 | int interruptibility; | 225 | int interruptibility; |
222 | 226 | ||
223 | bool restart; /* restart string instruction after writeback */ | 227 | bool perm_ok; /* do not check permissions if true */ |
224 | 228 | ||
225 | int exception; /* exception that happens during emulation or -1 */ | 229 | int exception; /* exception that happens during emulation or -1 */ |
226 | u32 error_code; /* error code for exception */ | 230 | u32 error_code; /* error code for exception */ |
227 | bool error_code_valid; | 231 | bool error_code_valid; |
228 | unsigned long cr2; /* faulted address in case of #PF */ | ||
229 | 232 | ||
230 | /* decode cache */ | 233 | /* decode cache */ |
231 | struct decode_cache decode; | 234 | struct decode_cache decode; |
@@ -249,13 +252,14 @@ struct x86_emulate_ctxt { | |||
249 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | 252 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 |
250 | #endif | 253 | #endif |
251 | 254 | ||
252 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, | 255 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt); |
253 | struct x86_emulate_ops *ops); | 256 | #define EMULATION_FAILED -1 |
254 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, | 257 | #define EMULATION_OK 0 |
255 | struct x86_emulate_ops *ops); | 258 | #define EMULATION_RESTART 1 |
259 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); | ||
256 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | 260 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, |
257 | struct x86_emulate_ops *ops, | ||
258 | u16 tss_selector, int reason, | 261 | u16 tss_selector, int reason, |
259 | bool has_error_code, u32 error_code); | 262 | bool has_error_code, u32 error_code); |
260 | 263 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |
264 | struct x86_emulate_ops *ops, int irq); | ||
261 | #endif /* _ASM_X86_KVM_X86_EMULATE_H */ | 265 | #endif /* _ASM_X86_KVM_X86_EMULATE_H */ |
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 502e53f999cf..9e6fe391094e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -236,10 +236,14 @@ struct kvm_pio_request { | |||
236 | */ | 236 | */ |
237 | struct kvm_mmu { | 237 | struct kvm_mmu { |
238 | void (*new_cr3)(struct kvm_vcpu *vcpu); | 238 | void (*new_cr3)(struct kvm_vcpu *vcpu); |
239 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); | ||
240 | unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); | ||
239 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | 241 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); |
242 | void (*inject_page_fault)(struct kvm_vcpu *vcpu); | ||
240 | void (*free)(struct kvm_vcpu *vcpu); | 243 | void (*free)(struct kvm_vcpu *vcpu); |
241 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, | 244 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, |
242 | u32 *error); | 245 | u32 *error); |
246 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); | ||
243 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | 247 | void (*prefetch_page)(struct kvm_vcpu *vcpu, |
244 | struct kvm_mmu_page *page); | 248 | struct kvm_mmu_page *page); |
245 | int (*sync_page)(struct kvm_vcpu *vcpu, | 249 | int (*sync_page)(struct kvm_vcpu *vcpu, |
@@ -249,13 +253,18 @@ struct kvm_mmu { | |||
249 | int root_level; | 253 | int root_level; |
250 | int shadow_root_level; | 254 | int shadow_root_level; |
251 | union kvm_mmu_page_role base_role; | 255 | union kvm_mmu_page_role base_role; |
256 | bool direct_map; | ||
252 | 257 | ||
253 | u64 *pae_root; | 258 | u64 *pae_root; |
259 | u64 *lm_root; | ||
254 | u64 rsvd_bits_mask[2][4]; | 260 | u64 rsvd_bits_mask[2][4]; |
261 | |||
262 | bool nx; | ||
263 | |||
264 | u64 pdptrs[4]; /* pae */ | ||
255 | }; | 265 | }; |
256 | 266 | ||
257 | struct kvm_vcpu_arch { | 267 | struct kvm_vcpu_arch { |
258 | u64 host_tsc; | ||
259 | /* | 268 | /* |
260 | * rip and regs accesses must go through | 269 | * rip and regs accesses must go through |
261 | * kvm_{register,rip}_{read,write} functions. | 270 | * kvm_{register,rip}_{read,write} functions. |
@@ -272,7 +281,6 @@ struct kvm_vcpu_arch { | |||
272 | unsigned long cr4_guest_owned_bits; | 281 | unsigned long cr4_guest_owned_bits; |
273 | unsigned long cr8; | 282 | unsigned long cr8; |
274 | u32 hflags; | 283 | u32 hflags; |
275 | u64 pdptrs[4]; /* pae */ | ||
276 | u64 efer; | 284 | u64 efer; |
277 | u64 apic_base; | 285 | u64 apic_base; |
278 | struct kvm_lapic *apic; /* kernel irqchip context */ | 286 | struct kvm_lapic *apic; /* kernel irqchip context */ |
@@ -282,7 +290,41 @@ struct kvm_vcpu_arch { | |||
282 | u64 ia32_misc_enable_msr; | 290 | u64 ia32_misc_enable_msr; |
283 | bool tpr_access_reporting; | 291 | bool tpr_access_reporting; |
284 | 292 | ||
293 | /* | ||
294 | * Paging state of the vcpu | ||
295 | * | ||
296 | * If the vcpu runs in guest mode with two level paging this still saves | ||
297 | * the paging mode of the l1 guest. This context is always used to | ||
298 | * handle faults. | ||
299 | */ | ||
285 | struct kvm_mmu mmu; | 300 | struct kvm_mmu mmu; |
301 | |||
302 | /* | ||
303 | * Paging state of an L2 guest (used for nested npt) | ||
304 | * | ||
305 | * This context will save all necessary information to walk page tables | ||
306 | * of an L2 guest. This context is only initialized for page table | ||
307 | * walking and not for faulting since we never handle l2 page faults on | ||
308 | * the host. | ||
309 | */ | ||
310 | struct kvm_mmu nested_mmu; | ||
311 | |||
312 | /* | ||
313 | * Pointer to the mmu context currently used for | ||
314 | * gva_to_gpa translations. | ||
315 | */ | ||
316 | struct kvm_mmu *walk_mmu; | ||
317 | |||
318 | /* | ||
319 | * This struct is filled with the necessary information to propagate a | ||
320 | * page fault into the guest | ||
321 | */ | ||
322 | struct { | ||
323 | u64 address; | ||
324 | unsigned error_code; | ||
325 | bool nested; | ||
326 | } fault; | ||
327 | |||
286 | /* only needed in kvm_pv_mmu_op() path, but it's hot so | 328 | /* only needed in kvm_pv_mmu_op() path, but it's hot so |
287 | * put it here to avoid allocation */ | 329 | * put it here to avoid allocation */ |
288 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; | 330 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; |
@@ -336,9 +378,15 @@ struct kvm_vcpu_arch { | |||
336 | 378 | ||
337 | gpa_t time; | 379 | gpa_t time; |
338 | struct pvclock_vcpu_time_info hv_clock; | 380 | struct pvclock_vcpu_time_info hv_clock; |
339 | unsigned int hv_clock_tsc_khz; | 381 | unsigned int hw_tsc_khz; |
340 | unsigned int time_offset; | 382 | unsigned int time_offset; |
341 | struct page *time_page; | 383 | struct page *time_page; |
384 | u64 last_host_tsc; | ||
385 | u64 last_guest_tsc; | ||
386 | u64 last_kernel_ns; | ||
387 | u64 last_tsc_nsec; | ||
388 | u64 last_tsc_write; | ||
389 | bool tsc_catchup; | ||
342 | 390 | ||
343 | bool nmi_pending; | 391 | bool nmi_pending; |
344 | bool nmi_injected; | 392 | bool nmi_injected; |
@@ -367,9 +415,9 @@ struct kvm_vcpu_arch { | |||
367 | }; | 415 | }; |
368 | 416 | ||
369 | struct kvm_arch { | 417 | struct kvm_arch { |
370 | unsigned int n_free_mmu_pages; | 418 | unsigned int n_used_mmu_pages; |
371 | unsigned int n_requested_mmu_pages; | 419 | unsigned int n_requested_mmu_pages; |
372 | unsigned int n_alloc_mmu_pages; | 420 | unsigned int n_max_mmu_pages; |
373 | atomic_t invlpg_counter; | 421 | atomic_t invlpg_counter; |
374 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | 422 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; |
375 | /* | 423 | /* |
@@ -394,8 +442,14 @@ struct kvm_arch { | |||
394 | gpa_t ept_identity_map_addr; | 442 | gpa_t ept_identity_map_addr; |
395 | 443 | ||
396 | unsigned long irq_sources_bitmap; | 444 | unsigned long irq_sources_bitmap; |
397 | u64 vm_init_tsc; | ||
398 | s64 kvmclock_offset; | 445 | s64 kvmclock_offset; |
446 | spinlock_t tsc_write_lock; | ||
447 | u64 last_tsc_nsec; | ||
448 | u64 last_tsc_offset; | ||
449 | u64 last_tsc_write; | ||
450 | u32 virtual_tsc_khz; | ||
451 | u32 virtual_tsc_mult; | ||
452 | s8 virtual_tsc_shift; | ||
399 | 453 | ||
400 | struct kvm_xen_hvm_config xen_hvm_config; | 454 | struct kvm_xen_hvm_config xen_hvm_config; |
401 | 455 | ||
@@ -505,6 +559,7 @@ struct kvm_x86_ops { | |||
505 | void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, | 559 | void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, |
506 | bool has_error_code, u32 error_code, | 560 | bool has_error_code, u32 error_code, |
507 | bool reinject); | 561 | bool reinject); |
562 | void (*cancel_injection)(struct kvm_vcpu *vcpu); | ||
508 | int (*interrupt_allowed)(struct kvm_vcpu *vcpu); | 563 | int (*interrupt_allowed)(struct kvm_vcpu *vcpu); |
509 | int (*nmi_allowed)(struct kvm_vcpu *vcpu); | 564 | int (*nmi_allowed)(struct kvm_vcpu *vcpu); |
510 | bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); | 565 | bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); |
@@ -517,11 +572,16 @@ struct kvm_x86_ops { | |||
517 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); | 572 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); |
518 | int (*get_lpage_level)(void); | 573 | int (*get_lpage_level)(void); |
519 | bool (*rdtscp_supported)(void); | 574 | bool (*rdtscp_supported)(void); |
575 | void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); | ||
576 | |||
577 | void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
520 | 578 | ||
521 | void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); | 579 | void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); |
522 | 580 | ||
523 | bool (*has_wbinvd_exit)(void); | 581 | bool (*has_wbinvd_exit)(void); |
524 | 582 | ||
583 | void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); | ||
584 | |||
525 | const struct trace_print_flags *exit_reasons_str; | 585 | const struct trace_print_flags *exit_reasons_str; |
526 | }; | 586 | }; |
527 | 587 | ||
@@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm); | |||
544 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | 604 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); |
545 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); | 605 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); |
546 | 606 | ||
547 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); | 607 | int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); |
548 | 608 | ||
549 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 609 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
550 | const void *val, int bytes); | 610 | const void *val, int bytes); |
@@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | |||
608 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 668 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
609 | void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); | 669 | void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); |
610 | void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 670 | void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
611 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, | 671 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu); |
612 | u32 error_code); | 672 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
673 | gfn_t gfn, void *data, int offset, int len, | ||
674 | u32 access); | ||
675 | void kvm_propagate_fault(struct kvm_vcpu *vcpu); | ||
613 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | 676 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); |
614 | 677 | ||
615 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 678 | int kvm_pic_set_irq(void *opaque, int irq, int level); |
@@ -652,20 +715,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | |||
652 | return (struct kvm_mmu_page *)page_private(page); | 715 | return (struct kvm_mmu_page *)page_private(page); |
653 | } | 716 | } |
654 | 717 | ||
655 | static inline u16 kvm_read_fs(void) | ||
656 | { | ||
657 | u16 seg; | ||
658 | asm("mov %%fs, %0" : "=g"(seg)); | ||
659 | return seg; | ||
660 | } | ||
661 | |||
662 | static inline u16 kvm_read_gs(void) | ||
663 | { | ||
664 | u16 seg; | ||
665 | asm("mov %%gs, %0" : "=g"(seg)); | ||
666 | return seg; | ||
667 | } | ||
668 | |||
669 | static inline u16 kvm_read_ldt(void) | 718 | static inline u16 kvm_read_ldt(void) |
670 | { | 719 | { |
671 | u16 ldt; | 720 | u16 ldt; |
@@ -673,16 +722,6 @@ static inline u16 kvm_read_ldt(void) | |||
673 | return ldt; | 722 | return ldt; |
674 | } | 723 | } |
675 | 724 | ||
676 | static inline void kvm_load_fs(u16 sel) | ||
677 | { | ||
678 | asm("mov %0, %%fs" : : "rm"(sel)); | ||
679 | } | ||
680 | |||
681 | static inline void kvm_load_gs(u16 sel) | ||
682 | { | ||
683 | asm("mov %0, %%gs" : : "rm"(sel)); | ||
684 | } | ||
685 | |||
686 | static inline void kvm_load_ldt(u16 sel) | 725 | static inline void kvm_load_ldt(u16 sel) |
687 | { | 726 | { |
688 | asm("lldt %0" : : "rm"(sel)); | 727 | asm("lldt %0" : : "rm"(sel)); |
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 05eba5e9a8e8..7b562b6184bc 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void) | |||
158 | return cpuid_eax(KVM_CPUID_FEATURES); | 158 | return cpuid_eax(KVM_CPUID_FEATURES); |
159 | } | 159 | } |
160 | 160 | ||
161 | #ifdef CONFIG_KVM_GUEST | ||
162 | void __init kvm_guest_init(void); | ||
163 | #else | ||
164 | #define kvm_guest_init() do { } while (0) | ||
161 | #endif | 165 | #endif |
162 | 166 | ||
167 | #endif /* __KERNEL__ */ | ||
168 | |||
163 | #endif /* _ASM_X86_KVM_PARA_H */ | 169 | #endif /* _ASM_X86_KVM_PARA_H */ |
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h new file mode 100644 index 000000000000..19ae14ba6978 --- /dev/null +++ b/arch/x86/include/asm/memblock.h | |||
@@ -0,0 +1,23 @@ | |||
1 | #ifndef _X86_MEMBLOCK_H | ||
2 | #define _X86_MEMBLOCK_H | ||
3 | |||
4 | #define ARCH_DISCARD_MEMBLOCK | ||
5 | |||
6 | u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); | ||
7 | void memblock_x86_to_bootmem(u64 start, u64 end); | ||
8 | |||
9 | void memblock_x86_reserve_range(u64 start, u64 end, char *name); | ||
10 | void memblock_x86_free_range(u64 start, u64 end); | ||
11 | struct range; | ||
12 | int __get_free_all_memory_range(struct range **range, int nodeid, | ||
13 | unsigned long start_pfn, unsigned long end_pfn); | ||
14 | int get_free_all_memory_range(struct range **rangep, int nodeid); | ||
15 | |||
16 | void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, | ||
17 | unsigned long last_pfn); | ||
18 | u64 memblock_x86_hole_size(u64 start, u64 end); | ||
19 | u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); | ||
20 | u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); | ||
21 | u64 memblock_x86_memory_in_range(u64 addr, u64 limit); | ||
22 | |||
23 | #endif | ||
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 3e2ce58a31a3..67763c5d8b4e 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h | |||
@@ -60,12 +60,7 @@ | |||
60 | #endif | 60 | #endif |
61 | 61 | ||
62 | #ifdef CONFIG_X86_32 | 62 | #ifdef CONFIG_X86_32 |
63 | # ifdef CONFIG_4KSTACKS | 63 | # define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY |
64 | # define MODULE_STACKSIZE "4KSTACKS " | ||
65 | # else | ||
66 | # define MODULE_STACKSIZE "" | ||
67 | # endif | ||
68 | # define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE | ||
69 | #endif | 64 | #endif |
70 | 65 | ||
71 | #endif /* _ASM_X86_MODULE_H */ | 66 | #endif /* _ASM_X86_MODULE_H */ |
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 16350740edf6..4a711a684b17 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h | |||
@@ -10,6 +10,9 @@ | |||
10 | */ | 10 | */ |
11 | #ifndef _ASM_X86_MRST_H | 11 | #ifndef _ASM_X86_MRST_H |
12 | #define _ASM_X86_MRST_H | 12 | #define _ASM_X86_MRST_H |
13 | |||
14 | #include <linux/sfi.h> | ||
15 | |||
13 | extern int pci_mrst_init(void); | 16 | extern int pci_mrst_init(void); |
14 | int __init sfi_parse_mrtc(struct sfi_table_header *table); | 17 | int __init sfi_parse_mrtc(struct sfi_table_header *table); |
15 | 18 | ||
@@ -26,7 +29,7 @@ enum mrst_cpu_type { | |||
26 | }; | 29 | }; |
27 | 30 | ||
28 | extern enum mrst_cpu_type __mrst_cpu_chip; | 31 | extern enum mrst_cpu_type __mrst_cpu_chip; |
29 | static enum mrst_cpu_type mrst_identify_cpu(void) | 32 | static inline enum mrst_cpu_type mrst_identify_cpu(void) |
30 | { | 33 | { |
31 | return __mrst_cpu_chip; | 34 | return __mrst_cpu_chip; |
32 | } | 35 | } |
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options; | |||
42 | #define SFI_MTMR_MAX_NUM 8 | 45 | #define SFI_MTMR_MAX_NUM 8 |
43 | #define SFI_MRTC_MAX 8 | 46 | #define SFI_MRTC_MAX 8 |
44 | 47 | ||
48 | extern struct console early_mrst_console; | ||
49 | extern void mrst_early_console_init(void); | ||
50 | |||
51 | extern struct console early_hsu_console; | ||
52 | extern void hsu_early_console_init(void); | ||
45 | #endif /* _ASM_X86_MRST_H */ | 53 | #endif /* _ASM_X86_MRST_H */ |
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 986f7790fdb2..3ea3dc487047 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -121,6 +121,7 @@ | |||
121 | #define MSR_AMD64_IBSDCLINAD 0xc0011038 | 121 | #define MSR_AMD64_IBSDCLINAD 0xc0011038 |
122 | #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 | 122 | #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 |
123 | #define MSR_AMD64_IBSCTL 0xc001103a | 123 | #define MSR_AMD64_IBSCTL 0xc001103a |
124 | #define MSR_AMD64_IBSBRTARGET 0xc001103b | ||
124 | 125 | ||
125 | /* Fam 10h MSRs */ | 126 | /* Fam 10h MSRs */ |
126 | #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 | 127 | #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 |
@@ -198,6 +199,7 @@ | |||
198 | #define MSR_IA32_TSC 0x00000010 | 199 | #define MSR_IA32_TSC 0x00000010 |
199 | #define MSR_IA32_PLATFORM_ID 0x00000017 | 200 | #define MSR_IA32_PLATFORM_ID 0x00000017 |
200 | #define MSR_IA32_EBL_CR_POWERON 0x0000002a | 201 | #define MSR_IA32_EBL_CR_POWERON 0x0000002a |
202 | #define MSR_EBC_FREQUENCY_ID 0x0000002c | ||
201 | #define MSR_IA32_FEATURE_CONTROL 0x0000003a | 203 | #define MSR_IA32_FEATURE_CONTROL 0x0000003a |
202 | 204 | ||
203 | #define FEATURE_CONTROL_LOCKED (1<<0) | 205 | #define FEATURE_CONTROL_LOCKED (1<<0) |
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h new file mode 100644 index 000000000000..bcdff997668c --- /dev/null +++ b/arch/x86/include/asm/mwait.h | |||
@@ -0,0 +1,15 @@ | |||
1 | #ifndef _ASM_X86_MWAIT_H | ||
2 | #define _ASM_X86_MWAIT_H | ||
3 | |||
4 | #define MWAIT_SUBSTATE_MASK 0xf | ||
5 | #define MWAIT_CSTATE_MASK 0xf | ||
6 | #define MWAIT_SUBSTATE_SIZE 4 | ||
7 | #define MWAIT_MAX_NUM_CSTATES 8 | ||
8 | |||
9 | #define CPUID_MWAIT_LEAF 5 | ||
10 | #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 | ||
11 | #define CPUID5_ECX_INTERRUPT_BREAK 0x2 | ||
12 | |||
13 | #define MWAIT_ECX_INTERRUPT_BREAK 0x1 | ||
14 | |||
15 | #endif /* _ASM_X86_MWAIT_H */ | ||
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 101229b0d8ed..42a978c0c1b3 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h | |||
@@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits); | |||
89 | /* EC commands */ | 89 | /* EC commands */ |
90 | 90 | ||
91 | #define EC_FIRMWARE_REV 0x08 | 91 | #define EC_FIRMWARE_REV 0x08 |
92 | #define EC_WLAN_ENTER_RESET 0x35 | ||
93 | #define EC_WLAN_LEAVE_RESET 0x25 | ||
92 | 94 | ||
93 | /* SCI source values */ | 95 | /* SCI source values */ |
94 | 96 | ||
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 08fde475cb3b..2a8478140bb3 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h | |||
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void); | |||
21 | /* install OFW's pde permanently into the kernel's pgtable */ | 21 | /* install OFW's pde permanently into the kernel's pgtable */ |
22 | extern void setup_olpc_ofw_pgd(void); | 22 | extern void setup_olpc_ofw_pgd(void); |
23 | 23 | ||
24 | /* check if OFW was detected during boot */ | ||
25 | extern bool olpc_ofw_present(void); | ||
26 | |||
24 | #else /* !CONFIG_OLPC_OPENFIRMWARE */ | 27 | #else /* !CONFIG_OLPC_OPENFIRMWARE */ |
25 | 28 | ||
26 | static inline void olpc_ofw_detect(void) { } | 29 | static inline void olpc_ofw_detect(void) { } |
27 | static inline void setup_olpc_ofw_pgd(void) { } | 30 | static inline void setup_olpc_ofw_pgd(void) { } |
31 | static inline bool olpc_ofw_present(void) { return false; } | ||
28 | 32 | ||
29 | #endif /* !CONFIG_OLPC_OPENFIRMWARE */ | 33 | #endif /* !CONFIG_OLPC_OPENFIRMWARE */ |
30 | 34 | ||
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 6f1b7331313f..ade619ff9e2a 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h | |||
@@ -15,11 +15,7 @@ | |||
15 | */ | 15 | */ |
16 | #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) | 16 | #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) |
17 | 17 | ||
18 | #ifdef CONFIG_4KSTACKS | ||
19 | #define THREAD_ORDER 0 | ||
20 | #else | ||
21 | #define THREAD_ORDER 1 | 18 | #define THREAD_ORDER 1 |
22 | #endif | ||
23 | #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) | 19 | #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) |
24 | 20 | ||
25 | #define STACKFAULT_STACK 0 | 21 | #define STACKFAULT_STACK 0 |
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index a667f24c7254..1df66211fd1b 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h | |||
@@ -8,7 +8,7 @@ | |||
8 | #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) | 8 | #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) |
9 | #define PAGE_MASK (~(PAGE_SIZE-1)) | 9 | #define PAGE_MASK (~(PAGE_SIZE-1)) |
10 | 10 | ||
11 | #define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) | 11 | #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) |
12 | #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) | 12 | #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) |
13 | 13 | ||
14 | /* Cast PAGE_MASK to a signed type so that it is sign-extended if | 14 | /* Cast PAGE_MASK to a signed type so that it is sign-extended if |
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 5653f43d90e5..18e3b8a8709f 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -105,7 +105,7 @@ static inline void write_cr8(unsigned long x) | |||
105 | } | 105 | } |
106 | #endif | 106 | #endif |
107 | 107 | ||
108 | static inline void raw_safe_halt(void) | 108 | static inline void arch_safe_halt(void) |
109 | { | 109 | { |
110 | PVOP_VCALL0(pv_irq_ops.safe_halt); | 110 | PVOP_VCALL0(pv_irq_ops.safe_halt); |
111 | } | 111 | } |
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) | |||
416 | PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); | 416 | PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); |
417 | } | 417 | } |
418 | 418 | ||
419 | static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, | ||
420 | unsigned long start, unsigned long count) | ||
421 | { | ||
422 | PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count); | ||
423 | } | ||
424 | static inline void paravirt_release_pmd(unsigned long pfn) | 419 | static inline void paravirt_release_pmd(unsigned long pfn) |
425 | { | 420 | { |
426 | PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); | 421 | PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); |
@@ -829,32 +824,32 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) | |||
829 | #define __PV_IS_CALLEE_SAVE(func) \ | 824 | #define __PV_IS_CALLEE_SAVE(func) \ |
830 | ((struct paravirt_callee_save) { func }) | 825 | ((struct paravirt_callee_save) { func }) |
831 | 826 | ||
832 | static inline unsigned long __raw_local_save_flags(void) | 827 | static inline unsigned long arch_local_save_flags(void) |
833 | { | 828 | { |
834 | return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); | 829 | return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); |
835 | } | 830 | } |
836 | 831 | ||
837 | static inline void raw_local_irq_restore(unsigned long f) | 832 | static inline void arch_local_irq_restore(unsigned long f) |
838 | { | 833 | { |
839 | PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); | 834 | PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); |
840 | } | 835 | } |
841 | 836 | ||
842 | static inline void raw_local_irq_disable(void) | 837 | static inline void arch_local_irq_disable(void) |
843 | { | 838 | { |
844 | PVOP_VCALLEE0(pv_irq_ops.irq_disable); | 839 | PVOP_VCALLEE0(pv_irq_ops.irq_disable); |
845 | } | 840 | } |
846 | 841 | ||
847 | static inline void raw_local_irq_enable(void) | 842 | static inline void arch_local_irq_enable(void) |
848 | { | 843 | { |
849 | PVOP_VCALLEE0(pv_irq_ops.irq_enable); | 844 | PVOP_VCALLEE0(pv_irq_ops.irq_enable); |
850 | } | 845 | } |
851 | 846 | ||
852 | static inline unsigned long __raw_local_irq_save(void) | 847 | static inline unsigned long arch_local_irq_save(void) |
853 | { | 848 | { |
854 | unsigned long f; | 849 | unsigned long f; |
855 | 850 | ||
856 | f = __raw_local_save_flags(); | 851 | f = arch_local_save_flags(); |
857 | raw_local_irq_disable(); | 852 | arch_local_irq_disable(); |
858 | return f; | 853 | return f; |
859 | } | 854 | } |
860 | 855 | ||
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index db9ef5532341..b82bac975250 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h | |||
@@ -255,7 +255,6 @@ struct pv_mmu_ops { | |||
255 | */ | 255 | */ |
256 | void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); | 256 | void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); |
257 | void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); | 257 | void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); |
258 | void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); | ||
259 | void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); | 258 | void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); |
260 | void (*release_pte)(unsigned long pfn); | 259 | void (*release_pte)(unsigned long pfn); |
261 | void (*release_pmd)(unsigned long pfn); | 260 | void (*release_pmd)(unsigned long pfn); |
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index cd28f9ad910d..f899e01a8ac9 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h | |||
@@ -47,6 +47,20 @@ | |||
47 | #ifdef CONFIG_SMP | 47 | #ifdef CONFIG_SMP |
48 | #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x | 48 | #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x |
49 | #define __my_cpu_offset percpu_read(this_cpu_off) | 49 | #define __my_cpu_offset percpu_read(this_cpu_off) |
50 | |||
51 | /* | ||
52 | * Compared to the generic __my_cpu_offset version, the following | ||
53 | * saves one instruction and avoids clobbering a temp register. | ||
54 | */ | ||
55 | #define __this_cpu_ptr(ptr) \ | ||
56 | ({ \ | ||
57 | unsigned long tcp_ptr__; \ | ||
58 | __verify_pcpu_ptr(ptr); \ | ||
59 | asm volatile("add " __percpu_arg(1) ", %0" \ | ||
60 | : "=r" (tcp_ptr__) \ | ||
61 | : "m" (this_cpu_off), "0" (ptr)); \ | ||
62 | (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ | ||
63 | }) | ||
50 | #else | 64 | #else |
51 | #define __percpu_arg(x) "%P" #x | 65 | #define __percpu_arg(x) "%P" #x |
52 | #endif | 66 | #endif |
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 6e742cc4251b..550e26b1dbb3 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h | |||
@@ -111,17 +111,18 @@ union cpuid10_edx { | |||
111 | #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) | 111 | #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) |
112 | 112 | ||
113 | /* IbsFetchCtl bits/masks */ | 113 | /* IbsFetchCtl bits/masks */ |
114 | #define IBS_FETCH_RAND_EN (1ULL<<57) | 114 | #define IBS_FETCH_RAND_EN (1ULL<<57) |
115 | #define IBS_FETCH_VAL (1ULL<<49) | 115 | #define IBS_FETCH_VAL (1ULL<<49) |
116 | #define IBS_FETCH_ENABLE (1ULL<<48) | 116 | #define IBS_FETCH_ENABLE (1ULL<<48) |
117 | #define IBS_FETCH_CNT 0xFFFF0000ULL | 117 | #define IBS_FETCH_CNT 0xFFFF0000ULL |
118 | #define IBS_FETCH_MAX_CNT 0x0000FFFFULL | 118 | #define IBS_FETCH_MAX_CNT 0x0000FFFFULL |
119 | 119 | ||
120 | /* IbsOpCtl bits */ | 120 | /* IbsOpCtl bits */ |
121 | #define IBS_OP_CNT_CTL (1ULL<<19) | 121 | #define IBS_OP_CNT_CTL (1ULL<<19) |
122 | #define IBS_OP_VAL (1ULL<<18) | 122 | #define IBS_OP_VAL (1ULL<<18) |
123 | #define IBS_OP_ENABLE (1ULL<<17) | 123 | #define IBS_OP_ENABLE (1ULL<<17) |
124 | #define IBS_OP_MAX_CNT 0x0000FFFFULL | 124 | #define IBS_OP_MAX_CNT 0x0000FFFFULL |
125 | #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ | ||
125 | 126 | ||
126 | #ifdef CONFIG_PERF_EVENTS | 127 | #ifdef CONFIG_PERF_EVENTS |
127 | extern void init_hw_perf_events(void); | 128 | extern void init_hw_perf_events(void); |
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index def500776b16..a70cd216be5d 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h | |||
@@ -36,19 +36,6 @@ | |||
36 | #define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) | 36 | #define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) |
37 | #define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) | 37 | #define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) |
38 | 38 | ||
39 | /* Non HT mask */ | ||
40 | #define P4_ESCR_MASK \ | ||
41 | (P4_ESCR_EVENT_MASK | \ | ||
42 | P4_ESCR_EVENTMASK_MASK | \ | ||
43 | P4_ESCR_TAG_MASK | \ | ||
44 | P4_ESCR_TAG_ENABLE | \ | ||
45 | P4_ESCR_T0_OS | \ | ||
46 | P4_ESCR_T0_USR) | ||
47 | |||
48 | /* HT mask */ | ||
49 | #define P4_ESCR_MASK_HT \ | ||
50 | (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR) | ||
51 | |||
52 | #define P4_CCCR_OVF 0x80000000U | 39 | #define P4_CCCR_OVF 0x80000000U |
53 | #define P4_CCCR_CASCADE 0x40000000U | 40 | #define P4_CCCR_CASCADE 0x40000000U |
54 | #define P4_CCCR_OVF_PMI_T0 0x04000000U | 41 | #define P4_CCCR_OVF_PMI_T0 0x04000000U |
@@ -70,23 +57,6 @@ | |||
70 | #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) | 57 | #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) |
71 | #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) | 58 | #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) |
72 | 59 | ||
73 | /* Non HT mask */ | ||
74 | #define P4_CCCR_MASK \ | ||
75 | (P4_CCCR_OVF | \ | ||
76 | P4_CCCR_CASCADE | \ | ||
77 | P4_CCCR_OVF_PMI_T0 | \ | ||
78 | P4_CCCR_FORCE_OVF | \ | ||
79 | P4_CCCR_EDGE | \ | ||
80 | P4_CCCR_THRESHOLD_MASK | \ | ||
81 | P4_CCCR_COMPLEMENT | \ | ||
82 | P4_CCCR_COMPARE | \ | ||
83 | P4_CCCR_ESCR_SELECT_MASK | \ | ||
84 | P4_CCCR_ENABLE) | ||
85 | |||
86 | /* HT mask */ | ||
87 | #define P4_CCCR_MASK_HT \ | ||
88 | (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY) | ||
89 | |||
90 | #define P4_GEN_ESCR_EMASK(class, name, bit) \ | 60 | #define P4_GEN_ESCR_EMASK(class, name, bit) \ |
91 | class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) | 61 | class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) |
92 | #define P4_ESCR_EMASK_BIT(class, name) class##__##name | 62 | #define P4_ESCR_EMASK_BIT(class, name) class##__##name |
@@ -127,6 +97,28 @@ | |||
127 | #define P4_CONFIG_HT_SHIFT 63 | 97 | #define P4_CONFIG_HT_SHIFT 63 |
128 | #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) | 98 | #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) |
129 | 99 | ||
100 | /* | ||
101 | * The bits we allow to pass for RAW events | ||
102 | */ | ||
103 | #define P4_CONFIG_MASK_ESCR \ | ||
104 | P4_ESCR_EVENT_MASK | \ | ||
105 | P4_ESCR_EVENTMASK_MASK | \ | ||
106 | P4_ESCR_TAG_MASK | \ | ||
107 | P4_ESCR_TAG_ENABLE | ||
108 | |||
109 | #define P4_CONFIG_MASK_CCCR \ | ||
110 | P4_CCCR_EDGE | \ | ||
111 | P4_CCCR_THRESHOLD_MASK | \ | ||
112 | P4_CCCR_COMPLEMENT | \ | ||
113 | P4_CCCR_COMPARE | \ | ||
114 | P4_CCCR_THREAD_ANY | \ | ||
115 | P4_CCCR_RESERVED | ||
116 | |||
117 | /* some dangerous bits are reserved for kernel internals */ | ||
118 | #define P4_CONFIG_MASK \ | ||
119 | (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ | ||
120 | (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) | ||
121 | |||
130 | static inline bool p4_is_event_cascaded(u64 config) | 122 | static inline bool p4_is_event_cascaded(u64 config) |
131 | { | 123 | { |
132 | u32 cccr = p4_config_unpack_cccr(config); | 124 | u32 cccr = p4_config_unpack_cccr(config); |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a34c785c5a63..ada823a13c7c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; | |||
28 | extern spinlock_t pgd_lock; | 28 | extern spinlock_t pgd_lock; |
29 | extern struct list_head pgd_list; | 29 | extern struct list_head pgd_list; |
30 | 30 | ||
31 | extern struct mm_struct *pgd_page_get_mm(struct page *page); | ||
32 | |||
31 | #ifdef CONFIG_PARAVIRT | 33 | #ifdef CONFIG_PARAVIRT |
32 | #include <asm/paravirt.h> | 34 | #include <asm/paravirt.h> |
33 | #else /* !CONFIG_PARAVIRT */ | 35 | #else /* !CONFIG_PARAVIRT */ |
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, | |||
603 | pte_update(mm, addr, ptep); | 605 | pte_update(mm, addr, ptep); |
604 | } | 606 | } |
605 | 607 | ||
608 | #define flush_tlb_fix_spurious_fault(vma, address) | ||
609 | |||
606 | /* | 610 | /* |
607 | * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); | 611 | * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); |
608 | * | 612 | * |
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f686f49e8b7b..0c92113c4cb6 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h | |||
@@ -26,7 +26,7 @@ struct mm_struct; | |||
26 | struct vm_area_struct; | 26 | struct vm_area_struct; |
27 | 27 | ||
28 | extern pgd_t swapper_pg_dir[1024]; | 28 | extern pgd_t swapper_pg_dir[1024]; |
29 | extern pgd_t trampoline_pg_dir[1024]; | 29 | extern pgd_t initial_page_table[1024]; |
30 | 30 | ||
31 | static inline void pgtable_cache_init(void) { } | 31 | static inline void pgtable_cache_init(void) { } |
32 | static inline void check_pgt_cache(void) { } | 32 | static inline void check_pgt_cache(void) { } |
@@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); | |||
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | #if defined(CONFIG_HIGHPTE) | 51 | #if defined(CONFIG_HIGHPTE) |
52 | #define __KM_PTE \ | ||
53 | (in_nmi() ? KM_NMI_PTE : \ | ||
54 | in_irq() ? KM_IRQ_PTE : \ | ||
55 | KM_PTE0) | ||
56 | #define pte_offset_map(dir, address) \ | 52 | #define pte_offset_map(dir, address) \ |
57 | ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ | 53 | ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ |
58 | pte_index((address))) | 54 | pte_index((address))) |
59 | #define pte_offset_map_nested(dir, address) \ | 55 | #define pte_unmap(pte) kunmap_atomic((pte)) |
60 | ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ | ||
61 | pte_index((address))) | ||
62 | #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) | ||
63 | #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) | ||
64 | #else | 56 | #else |
65 | #define pte_offset_map(dir, address) \ | 57 | #define pte_offset_map(dir, address) \ |
66 | ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) | 58 | ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) |
67 | #define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) | ||
68 | #define pte_unmap(pte) do { } while (0) | 59 | #define pte_unmap(pte) do { } while (0) |
69 | #define pte_unmap_nested(pte) do { } while (0) | ||
70 | #endif | 60 | #endif |
71 | 61 | ||
72 | /* Clear a kernel PTE and flush it from the TLB */ | 62 | /* Clear a kernel PTE and flush it from the TLB */ |
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 076052cd62be..f86da20347f2 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd) | |||
102 | native_set_pgd(pgd, native_make_pgd(0)); | 102 | native_set_pgd(pgd, native_make_pgd(0)); |
103 | } | 103 | } |
104 | 104 | ||
105 | extern void sync_global_pgds(unsigned long start, unsigned long end); | ||
106 | |||
105 | /* | 107 | /* |
106 | * Conversion functions: convert a page and protection to a page entry, | 108 | * Conversion functions: convert a page and protection to a page entry, |
107 | * and a page entry and page directory to the page they refer to. | 109 | * and a page entry and page directory to the page they refer to. |
@@ -125,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } | |||
125 | 127 | ||
126 | /* x86-64 always has all page tables mapped. */ | 128 | /* x86-64 always has all page tables mapped. */ |
127 | #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) | 129 | #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) |
128 | #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) | ||
129 | #define pte_unmap(pte) ((void)(pte))/* NOP */ | 130 | #define pte_unmap(pte) ((void)(pte))/* NOP */ |
130 | #define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ | ||
131 | 131 | ||
132 | #define update_mmu_cache(vma, address, ptep) do { } while (0) | 132 | #define update_mmu_cache(vma, address, ptep) do { } while (0) |
133 | 133 | ||
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbebaa..cae9c3cb95cf 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -110,6 +110,8 @@ struct cpuinfo_x86 { | |||
110 | u16 phys_proc_id; | 110 | u16 phys_proc_id; |
111 | /* Core id: */ | 111 | /* Core id: */ |
112 | u16 cpu_core_id; | 112 | u16 cpu_core_id; |
113 | /* Compute unit id */ | ||
114 | u8 compute_unit_id; | ||
113 | /* Index into per_cpu list: */ | 115 | /* Index into per_cpu list: */ |
114 | u16 cpu_index; | 116 | u16 cpu_index; |
115 | #endif | 117 | #endif |
@@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features; | |||
602 | 604 | ||
603 | static inline void set_in_cr4(unsigned long mask) | 605 | static inline void set_in_cr4(unsigned long mask) |
604 | { | 606 | { |
605 | unsigned cr4; | 607 | unsigned long cr4; |
606 | 608 | ||
607 | mmu_cr4_features |= mask; | 609 | mmu_cr4_features |= mask; |
608 | cr4 = read_cr4(); | 610 | cr4 = read_cr4(); |
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask) | |||
612 | 614 | ||
613 | static inline void clear_in_cr4(unsigned long mask) | 615 | static inline void clear_in_cr4(unsigned long mask) |
614 | { | 616 | { |
615 | unsigned cr4; | 617 | unsigned long cr4; |
616 | 618 | ||
617 | mmu_cr4_features &= ~mask; | 619 | mmu_cr4_features &= ~mask; |
618 | cr4 = read_cr4(); | 620 | cr4 = read_cr4(); |
@@ -764,29 +766,6 @@ extern unsigned long idle_halt; | |||
764 | extern unsigned long idle_nomwait; | 766 | extern unsigned long idle_nomwait; |
765 | extern bool c1e_detected; | 767 | extern bool c1e_detected; |
766 | 768 | ||
767 | /* | ||
768 | * on systems with caches, caches must be flashed as the absolute | ||
769 | * last instruction before going into a suspended halt. Otherwise, | ||
770 | * dirty data can linger in the cache and become stale on resume, | ||
771 | * leading to strange errors. | ||
772 | * | ||
773 | * perform a variety of operations to guarantee that the compiler | ||
774 | * will not reorder instructions. wbinvd itself is serializing | ||
775 | * so the processor will not reorder. | ||
776 | * | ||
777 | * Systems without cache can just go into halt. | ||
778 | */ | ||
779 | static inline void wbinvd_halt(void) | ||
780 | { | ||
781 | mb(); | ||
782 | /* check for clflush to determine if wbinvd is legal */ | ||
783 | if (cpu_has_clflush) | ||
784 | asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); | ||
785 | else | ||
786 | while (1) | ||
787 | halt(); | ||
788 | } | ||
789 | |||
790 | extern void enable_sep_cpu(void); | 769 | extern void enable_sep_cpu(void); |
791 | extern int sysenter_setup(void); | 770 | extern int sysenter_setup(void); |
792 | 771 | ||
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index cd02f324aa6b..7f7e577a0e39 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h | |||
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, | |||
12 | struct pvclock_vcpu_time_info *vcpu, | 12 | struct pvclock_vcpu_time_info *vcpu, |
13 | struct timespec *ts); | 13 | struct timespec *ts); |
14 | 14 | ||
15 | /* | ||
16 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | ||
17 | * yielding a 64-bit result. | ||
18 | */ | ||
19 | static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) | ||
20 | { | ||
21 | u64 product; | ||
22 | #ifdef __i386__ | ||
23 | u32 tmp1, tmp2; | ||
24 | #endif | ||
25 | |||
26 | if (shift < 0) | ||
27 | delta >>= -shift; | ||
28 | else | ||
29 | delta <<= shift; | ||
30 | |||
31 | #ifdef __i386__ | ||
32 | __asm__ ( | ||
33 | "mul %5 ; " | ||
34 | "mov %4,%%eax ; " | ||
35 | "mov %%edx,%4 ; " | ||
36 | "mul %5 ; " | ||
37 | "xor %5,%5 ; " | ||
38 | "add %4,%%eax ; " | ||
39 | "adc %5,%%edx ; " | ||
40 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
41 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
42 | #elif defined(__x86_64__) | ||
43 | __asm__ ( | ||
44 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
45 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
46 | #else | ||
47 | #error implement me! | ||
48 | #endif | ||
49 | |||
50 | return product; | ||
51 | } | ||
52 | |||
15 | #endif /* _ASM_X86_PVCLOCK_H */ | 53 | #endif /* _ASM_X86_PVCLOCK_H */ |
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 14e0ed86a6f9..231f1c1d6607 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h | |||
@@ -73,31 +73,31 @@ | |||
73 | 73 | ||
74 | #define GDT_ENTRY_DEFAULT_USER_DS 15 | 74 | #define GDT_ENTRY_DEFAULT_USER_DS 15 |
75 | 75 | ||
76 | #define GDT_ENTRY_KERNEL_BASE 12 | 76 | #define GDT_ENTRY_KERNEL_BASE (12) |
77 | 77 | ||
78 | #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) | 78 | #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) |
79 | 79 | ||
80 | #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) | 80 | #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) |
81 | 81 | ||
82 | #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) | 82 | #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) |
83 | #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) | 83 | #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) |
84 | 84 | ||
85 | #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) | 85 | #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) |
86 | #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) | 86 | #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) |
87 | 87 | ||
88 | #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) | 88 | #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) |
89 | #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) | 89 | #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) |
90 | 90 | ||
91 | #define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) | 91 | #define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) |
92 | #ifdef CONFIG_SMP | 92 | #ifdef CONFIG_SMP |
93 | #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) | 93 | #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) |
94 | #else | 94 | #else |
95 | #define __KERNEL_PERCPU 0 | 95 | #define __KERNEL_PERCPU 0 |
96 | #endif | 96 | #endif |
97 | 97 | ||
98 | #define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) | 98 | #define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) |
99 | #ifdef CONFIG_CC_STACKPROTECTOR | 99 | #ifdef CONFIG_CC_STACKPROTECTOR |
100 | #define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) | 100 | #define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) |
101 | #else | 101 | #else |
102 | #define __KERNEL_STACK_CANARY 0 | 102 | #define __KERNEL_STACK_CANARY 0 |
103 | #endif | 103 | #endif |
@@ -182,10 +182,10 @@ | |||
182 | 182 | ||
183 | #endif | 183 | #endif |
184 | 184 | ||
185 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) | 185 | #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) |
186 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) | 186 | #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) |
187 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) | 187 | #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) |
188 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) | 188 | #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) |
189 | #ifndef CONFIG_PARAVIRT | 189 | #ifndef CONFIG_PARAVIRT |
190 | #define get_kernel_rpl() 0 | 190 | #define get_kernel_rpl() 0 |
191 | #endif | 191 | #endif |
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ef292c792d74..d6763b139a84 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h | |||
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align); | |||
93 | : : "i" (sz)); \ | 93 | : : "i" (sz)); \ |
94 | } | 94 | } |
95 | 95 | ||
96 | /* Helper for reserving space for arrays of things */ | ||
97 | #define RESERVE_BRK_ARRAY(type, name, entries) \ | ||
98 | type *name; \ | ||
99 | RESERVE_BRK(name, sizeof(type) * entries) | ||
100 | |||
96 | #ifdef __i386__ | 101 | #ifdef __i386__ |
97 | 102 | ||
98 | void __init i386_start_kernel(void); | 103 | void __init i386_start_kernel(void); |
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4cfc90824068..4c2f63c7fc1b 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h | |||
@@ -50,7 +50,7 @@ struct smp_ops { | |||
50 | void (*smp_prepare_cpus)(unsigned max_cpus); | 50 | void (*smp_prepare_cpus)(unsigned max_cpus); |
51 | void (*smp_cpus_done)(unsigned max_cpus); | 51 | void (*smp_cpus_done)(unsigned max_cpus); |
52 | 52 | ||
53 | void (*smp_send_stop)(void); | 53 | void (*stop_other_cpus)(int wait); |
54 | void (*smp_send_reschedule)(int cpu); | 54 | void (*smp_send_reschedule)(int cpu); |
55 | 55 | ||
56 | int (*cpu_up)(unsigned cpu); | 56 | int (*cpu_up)(unsigned cpu); |
@@ -73,7 +73,12 @@ extern struct smp_ops smp_ops; | |||
73 | 73 | ||
74 | static inline void smp_send_stop(void) | 74 | static inline void smp_send_stop(void) |
75 | { | 75 | { |
76 | smp_ops.smp_send_stop(); | 76 | smp_ops.stop_other_cpus(0); |
77 | } | ||
78 | |||
79 | static inline void stop_other_cpus(void) | ||
80 | { | ||
81 | smp_ops.stop_other_cpus(1); | ||
77 | } | 82 | } |
78 | 83 | ||
79 | static inline void smp_prepare_boot_cpu(void) | 84 | static inline void smp_prepare_boot_cpu(void) |
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 8085277e1b8b..977f1761a25d 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h | |||
@@ -5,17 +5,26 @@ | |||
5 | 5 | ||
6 | #ifdef CONFIG_SWIOTLB | 6 | #ifdef CONFIG_SWIOTLB |
7 | extern int swiotlb; | 7 | extern int swiotlb; |
8 | extern int __init pci_swiotlb_detect(void); | 8 | extern int __init pci_swiotlb_detect_override(void); |
9 | extern int __init pci_swiotlb_detect_4gb(void); | ||
9 | extern void __init pci_swiotlb_init(void); | 10 | extern void __init pci_swiotlb_init(void); |
11 | extern void __init pci_swiotlb_late_init(void); | ||
10 | #else | 12 | #else |
11 | #define swiotlb 0 | 13 | #define swiotlb 0 |
12 | static inline int pci_swiotlb_detect(void) | 14 | static inline int pci_swiotlb_detect_override(void) |
15 | { | ||
16 | return 0; | ||
17 | } | ||
18 | static inline int pci_swiotlb_detect_4gb(void) | ||
13 | { | 19 | { |
14 | return 0; | 20 | return 0; |
15 | } | 21 | } |
16 | static inline void pci_swiotlb_init(void) | 22 | static inline void pci_swiotlb_init(void) |
17 | { | 23 | { |
18 | } | 24 | } |
25 | static inline void pci_swiotlb_late_init(void) | ||
26 | { | ||
27 | } | ||
19 | #endif | 28 | #endif |
20 | 29 | ||
21 | static inline void dma_mark_clean(void *addr, size_t size) {} | 30 | static inline void dma_mark_clean(void *addr, size_t size) {} |
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7f3eba08e7de..169be8938b96 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h | |||
@@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, | |||
172 | flush_tlb_all(); | 172 | flush_tlb_all(); |
173 | } | 173 | } |
174 | 174 | ||
175 | extern void zap_low_mappings(bool early); | ||
176 | |||
177 | #endif /* _ASM_X86_TLBFLUSH_H */ | 175 | #endif /* _ASM_X86_TLBFLUSH_H */ |
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index 4dde797c0578..f4500fb3b485 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h | |||
@@ -13,16 +13,13 @@ extern unsigned char *trampoline_base; | |||
13 | 13 | ||
14 | extern unsigned long init_rsp; | 14 | extern unsigned long init_rsp; |
15 | extern unsigned long initial_code; | 15 | extern unsigned long initial_code; |
16 | extern unsigned long initial_page_table; | ||
17 | extern unsigned long initial_gs; | 16 | extern unsigned long initial_gs; |
18 | 17 | ||
19 | #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) | 18 | #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) |
20 | 19 | ||
21 | extern unsigned long setup_trampoline(void); | 20 | extern unsigned long setup_trampoline(void); |
22 | extern void __init setup_trampoline_page_table(void); | ||
23 | extern void __init reserve_trampoline_memory(void); | 21 | extern void __init reserve_trampoline_memory(void); |
24 | #else | 22 | #else |
25 | static inline void setup_trampoline_page_table(void) {} | ||
26 | static inline void reserve_trampoline_memory(void) {} | 23 | static inline void reserve_trampoline_memory(void) {} |
27 | #endif /* CONFIG_X86_TRAMPOLINE */ | 24 | #endif /* CONFIG_X86_TRAMPOLINE */ |
28 | 25 | ||
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h deleted file mode 100644 index 61e08c0a2907..000000000000 --- a/arch/x86/include/asm/vmi.h +++ /dev/null | |||
@@ -1,269 +0,0 @@ | |||
1 | /* | ||
2 | * VMI interface definition | ||
3 | * | ||
4 | * Copyright (C) 2005, VMware, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | * Maintained by: Zachary Amsden zach@vmware.com | ||
22 | * | ||
23 | */ | ||
24 | #include <linux/types.h> | ||
25 | |||
26 | /* | ||
27 | *--------------------------------------------------------------------- | ||
28 | * | ||
29 | * VMI Option ROM API | ||
30 | * | ||
31 | *--------------------------------------------------------------------- | ||
32 | */ | ||
33 | #define VMI_SIGNATURE 0x696d5663 /* "cVmi" */ | ||
34 | |||
35 | #define PCI_VENDOR_ID_VMWARE 0x15AD | ||
36 | #define PCI_DEVICE_ID_VMWARE_VMI 0x0801 | ||
37 | |||
38 | /* | ||
39 | * We use two version numbers for compatibility, with the major | ||
40 | * number signifying interface breakages, and the minor number | ||
41 | * interface extensions. | ||
42 | */ | ||
43 | #define VMI_API_REV_MAJOR 3 | ||
44 | #define VMI_API_REV_MINOR 0 | ||
45 | |||
46 | #define VMI_CALL_CPUID 0 | ||
47 | #define VMI_CALL_WRMSR 1 | ||
48 | #define VMI_CALL_RDMSR 2 | ||
49 | #define VMI_CALL_SetGDT 3 | ||
50 | #define VMI_CALL_SetLDT 4 | ||
51 | #define VMI_CALL_SetIDT 5 | ||
52 | #define VMI_CALL_SetTR 6 | ||
53 | #define VMI_CALL_GetGDT 7 | ||
54 | #define VMI_CALL_GetLDT 8 | ||
55 | #define VMI_CALL_GetIDT 9 | ||
56 | #define VMI_CALL_GetTR 10 | ||
57 | #define VMI_CALL_WriteGDTEntry 11 | ||
58 | #define VMI_CALL_WriteLDTEntry 12 | ||
59 | #define VMI_CALL_WriteIDTEntry 13 | ||
60 | #define VMI_CALL_UpdateKernelStack 14 | ||
61 | #define VMI_CALL_SetCR0 15 | ||
62 | #define VMI_CALL_SetCR2 16 | ||
63 | #define VMI_CALL_SetCR3 17 | ||
64 | #define VMI_CALL_SetCR4 18 | ||
65 | #define VMI_CALL_GetCR0 19 | ||
66 | #define VMI_CALL_GetCR2 20 | ||
67 | #define VMI_CALL_GetCR3 21 | ||
68 | #define VMI_CALL_GetCR4 22 | ||
69 | #define VMI_CALL_WBINVD 23 | ||
70 | #define VMI_CALL_SetDR 24 | ||
71 | #define VMI_CALL_GetDR 25 | ||
72 | #define VMI_CALL_RDPMC 26 | ||
73 | #define VMI_CALL_RDTSC 27 | ||
74 | #define VMI_CALL_CLTS 28 | ||
75 | #define VMI_CALL_EnableInterrupts 29 | ||
76 | #define VMI_CALL_DisableInterrupts 30 | ||
77 | #define VMI_CALL_GetInterruptMask 31 | ||
78 | #define VMI_CALL_SetInterruptMask 32 | ||
79 | #define VMI_CALL_IRET 33 | ||
80 | #define VMI_CALL_SYSEXIT 34 | ||
81 | #define VMI_CALL_Halt 35 | ||
82 | #define VMI_CALL_Reboot 36 | ||
83 | #define VMI_CALL_Shutdown 37 | ||
84 | #define VMI_CALL_SetPxE 38 | ||
85 | #define VMI_CALL_SetPxELong 39 | ||
86 | #define VMI_CALL_UpdatePxE 40 | ||
87 | #define VMI_CALL_UpdatePxELong 41 | ||
88 | #define VMI_CALL_MachineToPhysical 42 | ||
89 | #define VMI_CALL_PhysicalToMachine 43 | ||
90 | #define VMI_CALL_AllocatePage 44 | ||
91 | #define VMI_CALL_ReleasePage 45 | ||
92 | #define VMI_CALL_InvalPage 46 | ||
93 | #define VMI_CALL_FlushTLB 47 | ||
94 | #define VMI_CALL_SetLinearMapping 48 | ||
95 | |||
96 | #define VMI_CALL_SetIOPLMask 61 | ||
97 | #define VMI_CALL_SetInitialAPState 62 | ||
98 | #define VMI_CALL_APICWrite 63 | ||
99 | #define VMI_CALL_APICRead 64 | ||
100 | #define VMI_CALL_IODelay 65 | ||
101 | #define VMI_CALL_SetLazyMode 73 | ||
102 | |||
103 | /* | ||
104 | *--------------------------------------------------------------------- | ||
105 | * | ||
106 | * MMU operation flags | ||
107 | * | ||
108 | *--------------------------------------------------------------------- | ||
109 | */ | ||
110 | |||
111 | /* Flags used by VMI_{Allocate|Release}Page call */ | ||
112 | #define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */ | ||
113 | #define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */ | ||
114 | #define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */ | ||
115 | |||
116 | |||
117 | /* Flags shared by Allocate|Release Page and PTE updates */ | ||
118 | #define VMI_PAGE_PT 0x01 | ||
119 | #define VMI_PAGE_PD 0x02 | ||
120 | #define VMI_PAGE_PDP 0x04 | ||
121 | #define VMI_PAGE_PML4 0x08 | ||
122 | |||
123 | #define VMI_PAGE_NORMAL 0x00 /* for debugging */ | ||
124 | |||
125 | /* Flags used by PTE updates */ | ||
126 | #define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */ | ||
127 | #define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */ | ||
128 | #define VMI_PAGE_VA_MASK 0xfffff000 | ||
129 | |||
130 | #ifdef CONFIG_X86_PAE | ||
131 | #define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED) | ||
132 | #define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED) | ||
133 | #else | ||
134 | #define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED) | ||
135 | #define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED) | ||
136 | #endif | ||
137 | |||
138 | /* Flags used by VMI_FlushTLB call */ | ||
139 | #define VMI_FLUSH_TLB 0x01 | ||
140 | #define VMI_FLUSH_GLOBAL 0x02 | ||
141 | |||
142 | /* | ||
143 | *--------------------------------------------------------------------- | ||
144 | * | ||
145 | * VMI relocation definitions for ROM call get_reloc | ||
146 | * | ||
147 | *--------------------------------------------------------------------- | ||
148 | */ | ||
149 | |||
150 | /* VMI Relocation types */ | ||
151 | #define VMI_RELOCATION_NONE 0 | ||
152 | #define VMI_RELOCATION_CALL_REL 1 | ||
153 | #define VMI_RELOCATION_JUMP_REL 2 | ||
154 | #define VMI_RELOCATION_NOP 3 | ||
155 | |||
156 | #ifndef __ASSEMBLY__ | ||
157 | struct vmi_relocation_info { | ||
158 | unsigned char *eip; | ||
159 | unsigned char type; | ||
160 | unsigned char reserved[3]; | ||
161 | }; | ||
162 | #endif | ||
163 | |||
164 | |||
165 | /* | ||
166 | *--------------------------------------------------------------------- | ||
167 | * | ||
168 | * Generic ROM structures and definitions | ||
169 | * | ||
170 | *--------------------------------------------------------------------- | ||
171 | */ | ||
172 | |||
173 | #ifndef __ASSEMBLY__ | ||
174 | |||
175 | struct vrom_header { | ||
176 | u16 rom_signature; /* option ROM signature */ | ||
177 | u8 rom_length; /* ROM length in 512 byte chunks */ | ||
178 | u8 rom_entry[4]; /* 16-bit code entry point */ | ||
179 | u8 rom_pad0; /* 4-byte align pad */ | ||
180 | u32 vrom_signature; /* VROM identification signature */ | ||
181 | u8 api_version_min;/* Minor version of API */ | ||
182 | u8 api_version_maj;/* Major version of API */ | ||
183 | u8 jump_slots; /* Number of jump slots */ | ||
184 | u8 reserved1; /* Reserved for expansion */ | ||
185 | u32 virtual_top; /* Hypervisor virtual address start */ | ||
186 | u16 reserved2; /* Reserved for expansion */ | ||
187 | u16 license_offs; /* Offset to License string */ | ||
188 | u16 pci_header_offs;/* Offset to PCI OPROM header */ | ||
189 | u16 pnp_header_offs;/* Offset to PnP OPROM header */ | ||
190 | u32 rom_pad3; /* PnP reserverd / VMI reserved */ | ||
191 | u8 reserved[96]; /* Reserved for headers */ | ||
192 | char vmi_init[8]; /* VMI_Init jump point */ | ||
193 | char get_reloc[8]; /* VMI_GetRelocationInfo jump point */ | ||
194 | } __attribute__((packed)); | ||
195 | |||
196 | struct pnp_header { | ||
197 | char sig[4]; | ||
198 | char rev; | ||
199 | char size; | ||
200 | short next; | ||
201 | short res; | ||
202 | long devID; | ||
203 | unsigned short manufacturer_offset; | ||
204 | unsigned short product_offset; | ||
205 | } __attribute__((packed)); | ||
206 | |||
207 | struct pci_header { | ||
208 | char sig[4]; | ||
209 | short vendorID; | ||
210 | short deviceID; | ||
211 | short vpdData; | ||
212 | short size; | ||
213 | char rev; | ||
214 | char class; | ||
215 | char subclass; | ||
216 | char interface; | ||
217 | short chunks; | ||
218 | char rom_version_min; | ||
219 | char rom_version_maj; | ||
220 | char codetype; | ||
221 | char lastRom; | ||
222 | short reserved; | ||
223 | } __attribute__((packed)); | ||
224 | |||
225 | /* Function prototypes for bootstrapping */ | ||
226 | #ifdef CONFIG_VMI | ||
227 | extern void vmi_init(void); | ||
228 | extern void vmi_activate(void); | ||
229 | extern void vmi_bringup(void); | ||
230 | #else | ||
231 | static inline void vmi_init(void) {} | ||
232 | static inline void vmi_activate(void) {} | ||
233 | static inline void vmi_bringup(void) {} | ||
234 | #endif | ||
235 | |||
236 | /* State needed to start an application processor in an SMP system. */ | ||
237 | struct vmi_ap_state { | ||
238 | u32 cr0; | ||
239 | u32 cr2; | ||
240 | u32 cr3; | ||
241 | u32 cr4; | ||
242 | |||
243 | u64 efer; | ||
244 | |||
245 | u32 eip; | ||
246 | u32 eflags; | ||
247 | u32 eax; | ||
248 | u32 ebx; | ||
249 | u32 ecx; | ||
250 | u32 edx; | ||
251 | u32 esp; | ||
252 | u32 ebp; | ||
253 | u32 esi; | ||
254 | u32 edi; | ||
255 | u16 cs; | ||
256 | u16 ss; | ||
257 | u16 ds; | ||
258 | u16 es; | ||
259 | u16 fs; | ||
260 | u16 gs; | ||
261 | u16 ldtr; | ||
262 | |||
263 | u16 gdtr_limit; | ||
264 | u32 gdtr_base; | ||
265 | u32 idtr_base; | ||
266 | u16 idtr_limit; | ||
267 | }; | ||
268 | |||
269 | #endif | ||
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h deleted file mode 100644 index c6e0bee93e3c..000000000000 --- a/arch/x86/include/asm/vmi_time.h +++ /dev/null | |||
@@ -1,98 +0,0 @@ | |||
1 | /* | ||
2 | * VMI Time wrappers | ||
3 | * | ||
4 | * Copyright (C) 2006, VMware, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | * Send feedback to dhecht@vmware.com | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifndef _ASM_X86_VMI_TIME_H | ||
26 | #define _ASM_X86_VMI_TIME_H | ||
27 | |||
28 | /* | ||
29 | * Raw VMI call indices for timer functions | ||
30 | */ | ||
31 | #define VMI_CALL_GetCycleFrequency 66 | ||
32 | #define VMI_CALL_GetCycleCounter 67 | ||
33 | #define VMI_CALL_SetAlarm 68 | ||
34 | #define VMI_CALL_CancelAlarm 69 | ||
35 | #define VMI_CALL_GetWallclockTime 70 | ||
36 | #define VMI_CALL_WallclockUpdated 71 | ||
37 | |||
38 | /* Cached VMI timer operations */ | ||
39 | extern struct vmi_timer_ops { | ||
40 | u64 (*get_cycle_frequency)(void); | ||
41 | u64 (*get_cycle_counter)(int); | ||
42 | u64 (*get_wallclock)(void); | ||
43 | int (*wallclock_updated)(void); | ||
44 | void (*set_alarm)(u32 flags, u64 expiry, u64 period); | ||
45 | void (*cancel_alarm)(u32 flags); | ||
46 | } vmi_timer_ops; | ||
47 | |||
48 | /* Prototypes */ | ||
49 | extern void __init vmi_time_init(void); | ||
50 | extern unsigned long vmi_get_wallclock(void); | ||
51 | extern int vmi_set_wallclock(unsigned long now); | ||
52 | extern unsigned long long vmi_sched_clock(void); | ||
53 | extern unsigned long vmi_tsc_khz(void); | ||
54 | |||
55 | #ifdef CONFIG_X86_LOCAL_APIC | ||
56 | extern void __devinit vmi_time_bsp_init(void); | ||
57 | extern void __devinit vmi_time_ap_init(void); | ||
58 | #endif | ||
59 | |||
60 | /* | ||
61 | * When run under a hypervisor, a vcpu is always in one of three states: | ||
62 | * running, halted, or ready. The vcpu is in the 'running' state if it | ||
63 | * is executing. When the vcpu executes the halt interface, the vcpu | ||
64 | * enters the 'halted' state and remains halted until there is some work | ||
65 | * pending for the vcpu (e.g. an alarm expires, host I/O completes on | ||
66 | * behalf of virtual I/O). At this point, the vcpu enters the 'ready' | ||
67 | * state (waiting for the hypervisor to reschedule it). Finally, at any | ||
68 | * time when the vcpu is not in the 'running' state nor the 'halted' | ||
69 | * state, it is in the 'ready' state. | ||
70 | * | ||
71 | * Real time is advances while the vcpu is 'running', 'ready', or | ||
72 | * 'halted'. Stolen time is the time in which the vcpu is in the | ||
73 | * 'ready' state. Available time is the remaining time -- the vcpu is | ||
74 | * either 'running' or 'halted'. | ||
75 | * | ||
76 | * All three views of time are accessible through the VMI cycle | ||
77 | * counters. | ||
78 | */ | ||
79 | |||
80 | /* The cycle counters. */ | ||
81 | #define VMI_CYCLES_REAL 0 | ||
82 | #define VMI_CYCLES_AVAILABLE 1 | ||
83 | #define VMI_CYCLES_STOLEN 2 | ||
84 | |||
85 | /* The alarm interface 'flags' bits */ | ||
86 | #define VMI_ALARM_COUNTERS 2 | ||
87 | |||
88 | #define VMI_ALARM_COUNTER_MASK 0x000000ff | ||
89 | |||
90 | #define VMI_ALARM_WIRED_IRQ0 0x00000000 | ||
91 | #define VMI_ALARM_WIRED_LVTT 0x00010000 | ||
92 | |||
93 | #define VMI_ALARM_IS_ONESHOT 0x00000000 | ||
94 | #define VMI_ALARM_IS_PERIODIC 0x00000100 | ||
95 | |||
96 | #define CONFIG_VMI_ALARM_HZ 100 | ||
97 | |||
98 | #endif /* _ASM_X86_VMI_TIME_H */ | ||
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7fda040a76cd..a3c28ae4025b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h | |||
@@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[]; | |||
200 | (type)__res; \ | 200 | (type)__res; \ |
201 | }) | 201 | }) |
202 | 202 | ||
203 | static inline long | ||
204 | privcmd_call(unsigned call, | ||
205 | unsigned long a1, unsigned long a2, | ||
206 | unsigned long a3, unsigned long a4, | ||
207 | unsigned long a5) | ||
208 | { | ||
209 | __HYPERCALL_DECLS; | ||
210 | __HYPERCALL_5ARG(a1, a2, a3, a4, a5); | ||
211 | |||
212 | asm volatile("call *%[call]" | ||
213 | : __HYPERCALL_5PARAM | ||
214 | : [call] "a" (&hypercall_page[call]) | ||
215 | : __HYPERCALL_CLOBBER5); | ||
216 | |||
217 | return (long)__res; | ||
218 | } | ||
219 | |||
203 | static inline int | 220 | static inline int |
204 | HYPERVISOR_set_trap_table(struct trap_info *table) | 221 | HYPERVISOR_set_trap_table(struct trap_info *table) |
205 | { | 222 | { |
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bf5f7d32bd08..dd8c1414b3d5 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h | |||
@@ -37,14 +37,21 @@ typedef struct xpaddr { | |||
37 | 37 | ||
38 | 38 | ||
39 | extern unsigned long get_phys_to_machine(unsigned long pfn); | 39 | extern unsigned long get_phys_to_machine(unsigned long pfn); |
40 | extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); | 40 | extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); |
41 | 41 | ||
42 | static inline unsigned long pfn_to_mfn(unsigned long pfn) | 42 | static inline unsigned long pfn_to_mfn(unsigned long pfn) |
43 | { | 43 | { |
44 | unsigned long mfn; | ||
45 | |||
44 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 46 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
45 | return pfn; | 47 | return pfn; |
46 | 48 | ||
47 | return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; | 49 | mfn = get_phys_to_machine(pfn); |
50 | |||
51 | if (mfn != INVALID_P2M_ENTRY) | ||
52 | mfn &= ~FOREIGN_FRAME_BIT; | ||
53 | |||
54 | return mfn; | ||
48 | } | 55 | } |
49 | 56 | ||
50 | static inline int phys_to_machine_mapping_valid(unsigned long pfn) | 57 | static inline int phys_to_machine_mapping_valid(unsigned long pfn) |
@@ -159,6 +166,7 @@ static inline pte_t __pte_ma(pteval_t x) | |||
159 | 166 | ||
160 | #define pgd_val_ma(x) ((x).pgd) | 167 | #define pgd_val_ma(x) ((x).pgd) |
161 | 168 | ||
169 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); | ||
162 | 170 | ||
163 | xmaddr_t arbitrary_virt_to_machine(void *address); | 171 | xmaddr_t arbitrary_virt_to_machine(void *address); |
164 | unsigned long arbitrary_virt_to_mfn(void *vaddr); | 172 | unsigned long arbitrary_virt_to_mfn(void *vaddr); |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fedf32a8c3ec..9e13763b6092 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -34,8 +34,8 @@ GCOV_PROFILE_paravirt.o := n | |||
34 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o | 34 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o |
35 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o | 35 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
36 | obj-y += time.o ioport.o ldt.o dumpstack.o | 36 | obj-y += time.o ioport.o ldt.o dumpstack.o |
37 | obj-y += setup.o x86_init.o i8259.o irqinit.o | 37 | obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o |
38 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o | 38 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
39 | obj-$(CONFIG_X86_32) += probe_roms_32.o | 39 | obj-$(CONFIG_X86_32) += probe_roms_32.o |
40 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o | 40 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o |
41 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | 41 | obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o |
@@ -44,6 +44,7 @@ obj-y += bootflag.o e820.o | |||
44 | obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o | 44 | obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o |
45 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o | 45 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o |
46 | obj-y += tsc.o io_delay.o rtc.o | 46 | obj-y += tsc.o io_delay.o rtc.o |
47 | obj-y += pci-iommu_table.o | ||
47 | 48 | ||
48 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o | 49 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o |
49 | obj-y += process.o | 50 | obj-y += process.o |
@@ -56,7 +57,6 @@ obj-$(CONFIG_INTEL_TXT) += tboot.o | |||
56 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 57 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
57 | obj-y += cpu/ | 58 | obj-y += cpu/ |
58 | obj-y += acpi/ | 59 | obj-y += acpi/ |
59 | obj-$(CONFIG_SFI) += sfi.o | ||
60 | obj-y += reboot.o | 60 | obj-y += reboot.o |
61 | obj-$(CONFIG_MCA) += mca_32.o | 61 | obj-$(CONFIG_MCA) += mca_32.o |
62 | obj-$(CONFIG_X86_MSR) += msr.o | 62 | obj-$(CONFIG_X86_MSR) += msr.o |
@@ -80,20 +80,19 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o | |||
80 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o | 80 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o |
81 | obj-$(CONFIG_KPROBES) += kprobes.o | 81 | obj-$(CONFIG_KPROBES) += kprobes.o |
82 | obj-$(CONFIG_MODULES) += module.o | 82 | obj-$(CONFIG_MODULES) += module.o |
83 | obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o | ||
84 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o | 83 | obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o |
85 | obj-$(CONFIG_KGDB) += kgdb.o | 84 | obj-$(CONFIG_KGDB) += kgdb.o |
86 | obj-$(CONFIG_VM86) += vm86_32.o | 85 | obj-$(CONFIG_VM86) += vm86_32.o |
87 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o | 86 | obj-$(CONFIG_EARLY_PRINTK) += early_printk.o |
87 | obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o | ||
88 | 88 | ||
89 | obj-$(CONFIG_HPET_TIMER) += hpet.o | 89 | obj-$(CONFIG_HPET_TIMER) += hpet.o |
90 | obj-$(CONFIG_APB_TIMER) += apb_timer.o | 90 | obj-$(CONFIG_APB_TIMER) += apb_timer.o |
91 | 91 | ||
92 | obj-$(CONFIG_K8_NB) += k8.o | 92 | obj-$(CONFIG_AMD_NB) += amd_nb.o |
93 | obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o | 93 | obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o |
94 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o | 94 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o |
95 | 95 | ||
96 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | ||
97 | obj-$(CONFIG_KVM_GUEST) += kvm.o | 96 | obj-$(CONFIG_KVM_GUEST) += kvm.o |
98 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | 97 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o |
99 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 98 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
@@ -102,13 +101,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | |||
102 | 101 | ||
103 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o | 102 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o |
104 | 103 | ||
105 | obj-$(CONFIG_SCx200) += scx200.o | ||
106 | scx200-y += scx200_32.o | ||
107 | |||
108 | obj-$(CONFIG_OLPC) += olpc.o | ||
109 | obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o | ||
110 | obj-$(CONFIG_X86_MRST) += mrst.o | ||
111 | |||
112 | microcode-y := microcode_core.o | 104 | microcode-y := microcode_core.o |
113 | microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o | 105 | microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o |
114 | microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o | 106 | microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o |
@@ -121,8 +113,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o | |||
121 | ### | 113 | ### |
122 | # 64 bit specific files | 114 | # 64 bit specific files |
123 | ifeq ($(CONFIG_X86_64),y) | 115 | ifeq ($(CONFIG_X86_64),y) |
124 | obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o | ||
125 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o | ||
126 | obj-$(CONFIG_AUDIT) += audit_64.o | 116 | obj-$(CONFIG_AUDIT) += audit_64.o |
127 | 117 | ||
128 | obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o | 118 | obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o |
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index fb16f17e59be..5812404a0d4c 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c | |||
@@ -13,6 +13,7 @@ | |||
13 | 13 | ||
14 | #include <acpi/processor.h> | 14 | #include <acpi/processor.h> |
15 | #include <asm/acpi.h> | 15 | #include <asm/acpi.h> |
16 | #include <asm/mwait.h> | ||
16 | 17 | ||
17 | /* | 18 | /* |
18 | * Initialize bm_flags based on the CPU cache properties | 19 | * Initialize bm_flags based on the CPU cache properties |
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */ | |||
65 | 66 | ||
66 | static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; | 67 | static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; |
67 | 68 | ||
68 | #define MWAIT_SUBSTATE_MASK (0xf) | ||
69 | #define MWAIT_CSTATE_MASK (0xf) | ||
70 | #define MWAIT_SUBSTATE_SIZE (4) | ||
71 | |||
72 | #define CPUID_MWAIT_LEAF (5) | ||
73 | #define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) | ||
74 | #define CPUID5_ECX_INTERRUPT_BREAK (0x2) | ||
75 | |||
76 | #define MWAIT_ECX_INTERRUPT_BREAK (0x1) | ||
77 | |||
78 | #define NATIVE_CSTATE_BEYOND_HALT (2) | 69 | #define NATIVE_CSTATE_BEYOND_HALT (2) |
79 | 70 | ||
80 | static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) | 71 | static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) |
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 33cec152070d..69fd72aa5594 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c | |||
@@ -7,11 +7,16 @@ | |||
7 | 7 | ||
8 | #include <linux/acpi.h> | 8 | #include <linux/acpi.h> |
9 | #include <linux/bootmem.h> | 9 | #include <linux/bootmem.h> |
10 | #include <linux/memblock.h> | ||
10 | #include <linux/dmi.h> | 11 | #include <linux/dmi.h> |
11 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
12 | #include <asm/segment.h> | 13 | #include <asm/segment.h> |
13 | #include <asm/desc.h> | 14 | #include <asm/desc.h> |
14 | 15 | ||
16 | #ifdef CONFIG_X86_32 | ||
17 | #include <asm/pgtable.h> | ||
18 | #endif | ||
19 | |||
15 | #include "realmode/wakeup.h" | 20 | #include "realmode/wakeup.h" |
16 | #include "sleep.h" | 21 | #include "sleep.h" |
17 | 22 | ||
@@ -90,7 +95,7 @@ int acpi_save_state_mem(void) | |||
90 | 95 | ||
91 | #ifndef CONFIG_64BIT | 96 | #ifndef CONFIG_64BIT |
92 | header->pmode_entry = (u32)&wakeup_pmode_return; | 97 | header->pmode_entry = (u32)&wakeup_pmode_return; |
93 | header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); | 98 | header->pmode_cr3 = (u32)__pa(&initial_page_table); |
94 | saved_magic = 0x12345678; | 99 | saved_magic = 0x12345678; |
95 | #else /* CONFIG_64BIT */ | 100 | #else /* CONFIG_64BIT */ |
96 | header->trampoline_segment = setup_trampoline() >> 4; | 101 | header->trampoline_segment = setup_trampoline() >> 4; |
@@ -125,7 +130,7 @@ void acpi_restore_state_mem(void) | |||
125 | */ | 130 | */ |
126 | void __init acpi_reserve_wakeup_memory(void) | 131 | void __init acpi_reserve_wakeup_memory(void) |
127 | { | 132 | { |
128 | unsigned long mem; | 133 | phys_addr_t mem; |
129 | 134 | ||
130 | if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { | 135 | if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { |
131 | printk(KERN_ERR | 136 | printk(KERN_ERR |
@@ -133,15 +138,15 @@ void __init acpi_reserve_wakeup_memory(void) | |||
133 | return; | 138 | return; |
134 | } | 139 | } |
135 | 140 | ||
136 | mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); | 141 | mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); |
137 | 142 | ||
138 | if (mem == -1L) { | 143 | if (mem == MEMBLOCK_ERROR) { |
139 | printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); | 144 | printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); |
140 | return; | 145 | return; |
141 | } | 146 | } |
142 | acpi_realmode = (unsigned long) phys_to_virt(mem); | 147 | acpi_realmode = (unsigned long) phys_to_virt(mem); |
143 | acpi_wakeup_address = mem; | 148 | acpi_wakeup_address = mem; |
144 | reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); | 149 | memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); |
145 | } | 150 | } |
146 | 151 | ||
147 | 152 | ||
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f65ab8b014c4..a36bb90aef53 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) | |||
195 | 195 | ||
196 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | 196 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; |
197 | extern s32 __smp_locks[], __smp_locks_end[]; | 197 | extern s32 __smp_locks[], __smp_locks_end[]; |
198 | static void *text_poke_early(void *addr, const void *opcode, size_t len); | 198 | void *text_poke_early(void *addr, const void *opcode, size_t len); |
199 | 199 | ||
200 | /* Replace instructions with better alternatives for this CPU type. | 200 | /* Replace instructions with better alternatives for this CPU type. |
201 | This runs before SMP is initialized to avoid SMP problems with | 201 | This runs before SMP is initialized to avoid SMP problems with |
@@ -522,7 +522,7 @@ void __init alternative_instructions(void) | |||
522 | * instructions. And on the local CPU you need to be protected again NMI or MCE | 522 | * instructions. And on the local CPU you need to be protected again NMI or MCE |
523 | * handlers seeing an inconsistent instruction while you patch. | 523 | * handlers seeing an inconsistent instruction while you patch. |
524 | */ | 524 | */ |
525 | static void *__init_or_module text_poke_early(void *addr, const void *opcode, | 525 | void *__init_or_module text_poke_early(void *addr, const void *opcode, |
526 | size_t len) | 526 | size_t len) |
527 | { | 527 | { |
528 | unsigned long flags; | 528 | unsigned long flags; |
@@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) | |||
637 | tpp.len = len; | 637 | tpp.len = len; |
638 | atomic_set(&stop_machine_first, 1); | 638 | atomic_set(&stop_machine_first, 1); |
639 | wrote_text = 0; | 639 | wrote_text = 0; |
640 | stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); | 640 | /* Use __stop_machine() because the caller already got online_cpus. */ |
641 | __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); | ||
641 | return addr; | 642 | return addr; |
642 | } | 643 | } |
643 | 644 | ||
645 | #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) | ||
646 | |||
647 | unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; | ||
648 | |||
649 | void __init arch_init_ideal_nop5(void) | ||
650 | { | ||
651 | extern const unsigned char ftrace_test_p6nop[]; | ||
652 | extern const unsigned char ftrace_test_nop5[]; | ||
653 | extern const unsigned char ftrace_test_jmp[]; | ||
654 | int faulted = 0; | ||
655 | |||
656 | /* | ||
657 | * There is no good nop for all x86 archs. | ||
658 | * We will default to using the P6_NOP5, but first we | ||
659 | * will test to make sure that the nop will actually | ||
660 | * work on this CPU. If it faults, we will then | ||
661 | * go to a lesser efficient 5 byte nop. If that fails | ||
662 | * we then just use a jmp as our nop. This isn't the most | ||
663 | * efficient nop, but we can not use a multi part nop | ||
664 | * since we would then risk being preempted in the middle | ||
665 | * of that nop, and if we enabled tracing then, it might | ||
666 | * cause a system crash. | ||
667 | * | ||
668 | * TODO: check the cpuid to determine the best nop. | ||
669 | */ | ||
670 | asm volatile ( | ||
671 | "ftrace_test_jmp:" | ||
672 | "jmp ftrace_test_p6nop\n" | ||
673 | "nop\n" | ||
674 | "nop\n" | ||
675 | "nop\n" /* 2 byte jmp + 3 bytes */ | ||
676 | "ftrace_test_p6nop:" | ||
677 | P6_NOP5 | ||
678 | "jmp 1f\n" | ||
679 | "ftrace_test_nop5:" | ||
680 | ".byte 0x66,0x66,0x66,0x66,0x90\n" | ||
681 | "1:" | ||
682 | ".section .fixup, \"ax\"\n" | ||
683 | "2: movl $1, %0\n" | ||
684 | " jmp ftrace_test_nop5\n" | ||
685 | "3: movl $2, %0\n" | ||
686 | " jmp 1b\n" | ||
687 | ".previous\n" | ||
688 | _ASM_EXTABLE(ftrace_test_p6nop, 2b) | ||
689 | _ASM_EXTABLE(ftrace_test_nop5, 3b) | ||
690 | : "=r"(faulted) : "0" (faulted)); | ||
691 | |||
692 | switch (faulted) { | ||
693 | case 0: | ||
694 | pr_info("converting mcount calls to 0f 1f 44 00 00\n"); | ||
695 | memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); | ||
696 | break; | ||
697 | case 1: | ||
698 | pr_info("converting mcount calls to 66 66 66 66 90\n"); | ||
699 | memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); | ||
700 | break; | ||
701 | case 2: | ||
702 | pr_info("converting mcount calls to jmp . + 5\n"); | ||
703 | memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); | ||
704 | break; | ||
705 | } | ||
706 | |||
707 | } | ||
708 | #endif | ||
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 679b6450382b..d2fdb0826df2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. | 2 | * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. |
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | 3 | * Author: Joerg Roedel <joerg.roedel@amd.com> |
4 | * Leo Duran <leo.duran@amd.com> | 4 | * Leo Duran <leo.duran@amd.com> |
5 | * | 5 | * |
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 5a170cbbbed8..6e11c8134158 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. | 2 | * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. |
3 | * Author: Joerg Roedel <joerg.roedel@amd.com> | 3 | * Author: Joerg Roedel <joerg.roedel@amd.com> |
4 | * Leo Duran <leo.duran@amd.com> | 4 | * Leo Duran <leo.duran@amd.com> |
5 | * | 5 | * |
@@ -31,7 +31,7 @@ | |||
31 | #include <asm/iommu.h> | 31 | #include <asm/iommu.h> |
32 | #include <asm/gart.h> | 32 | #include <asm/gart.h> |
33 | #include <asm/x86_init.h> | 33 | #include <asm/x86_init.h> |
34 | 34 | #include <asm/iommu_table.h> | |
35 | /* | 35 | /* |
36 | * definitions for the ACPI scanning code | 36 | * definitions for the ACPI scanning code |
37 | */ | 37 | */ |
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) | |||
194 | return 1UL << shift; | 194 | return 1UL << shift; |
195 | } | 195 | } |
196 | 196 | ||
197 | /* Access to l1 and l2 indexed register spaces */ | ||
198 | |||
199 | static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) | ||
200 | { | ||
201 | u32 val; | ||
202 | |||
203 | pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); | ||
204 | pci_read_config_dword(iommu->dev, 0xfc, &val); | ||
205 | return val; | ||
206 | } | ||
207 | |||
208 | static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) | ||
209 | { | ||
210 | pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); | ||
211 | pci_write_config_dword(iommu->dev, 0xfc, val); | ||
212 | pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); | ||
213 | } | ||
214 | |||
215 | static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) | ||
216 | { | ||
217 | u32 val; | ||
218 | |||
219 | pci_write_config_dword(iommu->dev, 0xf0, address); | ||
220 | pci_read_config_dword(iommu->dev, 0xf4, &val); | ||
221 | return val; | ||
222 | } | ||
223 | |||
224 | static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) | ||
225 | { | ||
226 | pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); | ||
227 | pci_write_config_dword(iommu->dev, 0xf4, val); | ||
228 | } | ||
229 | |||
197 | /**************************************************************************** | 230 | /**************************************************************************** |
198 | * | 231 | * |
199 | * AMD IOMMU MMIO register space handling functions | 232 | * AMD IOMMU MMIO register space handling functions |
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) | |||
619 | { | 652 | { |
620 | int cap_ptr = iommu->cap_ptr; | 653 | int cap_ptr = iommu->cap_ptr; |
621 | u32 range, misc; | 654 | u32 range, misc; |
655 | int i, j; | ||
622 | 656 | ||
623 | pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, | 657 | pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, |
624 | &iommu->cap); | 658 | &iommu->cap); |
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) | |||
633 | MMIO_GET_LD(range)); | 667 | MMIO_GET_LD(range)); |
634 | iommu->evt_msi_num = MMIO_MSI_NUM(misc); | 668 | iommu->evt_msi_num = MMIO_MSI_NUM(misc); |
635 | 669 | ||
636 | if (is_rd890_iommu(iommu->dev)) { | 670 | if (!is_rd890_iommu(iommu->dev)) |
637 | pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); | 671 | return; |
638 | pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); | 672 | |
639 | pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); | 673 | /* |
640 | pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); | 674 | * Some rd890 systems may not be fully reconfigured by the BIOS, so |
641 | } | 675 | * it's necessary for us to store this information so it can be |
676 | * reprogrammed on resume | ||
677 | */ | ||
678 | |||
679 | pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, | ||
680 | &iommu->stored_addr_lo); | ||
681 | pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, | ||
682 | &iommu->stored_addr_hi); | ||
683 | |||
684 | /* Low bit locks writes to configuration space */ | ||
685 | iommu->stored_addr_lo &= ~1; | ||
686 | |||
687 | for (i = 0; i < 6; i++) | ||
688 | for (j = 0; j < 0x12; j++) | ||
689 | iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); | ||
690 | |||
691 | for (i = 0; i < 0x83; i++) | ||
692 | iommu->stored_l2[i] = iommu_read_l2(iommu, i); | ||
642 | } | 693 | } |
643 | 694 | ||
644 | /* | 695 | /* |
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu) | |||
1127 | iommu_feature_enable(iommu, CONTROL_COHERENT_EN); | 1178 | iommu_feature_enable(iommu, CONTROL_COHERENT_EN); |
1128 | } | 1179 | } |
1129 | 1180 | ||
1130 | static void iommu_apply_quirks(struct amd_iommu *iommu) | 1181 | static void iommu_apply_resume_quirks(struct amd_iommu *iommu) |
1131 | { | 1182 | { |
1132 | if (is_rd890_iommu(iommu->dev)) { | 1183 | int i, j; |
1133 | pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); | 1184 | u32 ioc_feature_control; |
1134 | pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); | 1185 | struct pci_dev *pdev = NULL; |
1135 | pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); | 1186 | |
1136 | pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); | 1187 | /* RD890 BIOSes may not have completely reconfigured the iommu */ |
1137 | } | 1188 | if (!is_rd890_iommu(iommu->dev)) |
1189 | return; | ||
1190 | |||
1191 | /* | ||
1192 | * First, we need to ensure that the iommu is enabled. This is | ||
1193 | * controlled by a register in the northbridge | ||
1194 | */ | ||
1195 | pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); | ||
1196 | |||
1197 | if (!pdev) | ||
1198 | return; | ||
1199 | |||
1200 | /* Select Northbridge indirect register 0x75 and enable writing */ | ||
1201 | pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); | ||
1202 | pci_read_config_dword(pdev, 0x64, &ioc_feature_control); | ||
1203 | |||
1204 | /* Enable the iommu */ | ||
1205 | if (!(ioc_feature_control & 0x1)) | ||
1206 | pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); | ||
1207 | |||
1208 | pci_dev_put(pdev); | ||
1209 | |||
1210 | /* Restore the iommu BAR */ | ||
1211 | pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, | ||
1212 | iommu->stored_addr_lo); | ||
1213 | pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, | ||
1214 | iommu->stored_addr_hi); | ||
1215 | |||
1216 | /* Restore the l1 indirect regs for each of the 6 l1s */ | ||
1217 | for (i = 0; i < 6; i++) | ||
1218 | for (j = 0; j < 0x12; j++) | ||
1219 | iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); | ||
1220 | |||
1221 | /* Restore the l2 indirect regs */ | ||
1222 | for (i = 0; i < 0x83; i++) | ||
1223 | iommu_write_l2(iommu, i, iommu->stored_l2[i]); | ||
1224 | |||
1225 | /* Lock PCI setup registers */ | ||
1226 | pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, | ||
1227 | iommu->stored_addr_lo | 1); | ||
1138 | } | 1228 | } |
1139 | 1229 | ||
1140 | /* | 1230 | /* |
@@ -1147,7 +1237,6 @@ static void enable_iommus(void) | |||
1147 | 1237 | ||
1148 | for_each_iommu(iommu) { | 1238 | for_each_iommu(iommu) { |
1149 | iommu_disable(iommu); | 1239 | iommu_disable(iommu); |
1150 | iommu_apply_quirks(iommu); | ||
1151 | iommu_init_flags(iommu); | 1240 | iommu_init_flags(iommu); |
1152 | iommu_set_device_table(iommu); | 1241 | iommu_set_device_table(iommu); |
1153 | iommu_enable_command_buffer(iommu); | 1242 | iommu_enable_command_buffer(iommu); |
@@ -1173,6 +1262,11 @@ static void disable_iommus(void) | |||
1173 | 1262 | ||
1174 | static int amd_iommu_resume(struct sys_device *dev) | 1263 | static int amd_iommu_resume(struct sys_device *dev) |
1175 | { | 1264 | { |
1265 | struct amd_iommu *iommu; | ||
1266 | |||
1267 | for_each_iommu(iommu) | ||
1268 | iommu_apply_resume_quirks(iommu); | ||
1269 | |||
1176 | /* re-load the hardware */ | 1270 | /* re-load the hardware */ |
1177 | enable_iommus(); | 1271 | enable_iommus(); |
1178 | 1272 | ||
@@ -1405,13 +1499,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) | |||
1405 | return 0; | 1499 | return 0; |
1406 | } | 1500 | } |
1407 | 1501 | ||
1408 | void __init amd_iommu_detect(void) | 1502 | int __init amd_iommu_detect(void) |
1409 | { | 1503 | { |
1410 | if (no_iommu || (iommu_detected && !gart_iommu_aperture)) | 1504 | if (no_iommu || (iommu_detected && !gart_iommu_aperture)) |
1411 | return; | 1505 | return -ENODEV; |
1412 | 1506 | ||
1413 | if (amd_iommu_disabled) | 1507 | if (amd_iommu_disabled) |
1414 | return; | 1508 | return -ENODEV; |
1415 | 1509 | ||
1416 | if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { | 1510 | if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { |
1417 | iommu_detected = 1; | 1511 | iommu_detected = 1; |
@@ -1420,7 +1514,9 @@ void __init amd_iommu_detect(void) | |||
1420 | 1514 | ||
1421 | /* Make sure ACS will be enabled */ | 1515 | /* Make sure ACS will be enabled */ |
1422 | pci_request_acs(); | 1516 | pci_request_acs(); |
1517 | return 1; | ||
1423 | } | 1518 | } |
1519 | return -ENODEV; | ||
1424 | } | 1520 | } |
1425 | 1521 | ||
1426 | /**************************************************************************** | 1522 | /**************************************************************************** |
@@ -1451,3 +1547,8 @@ static int __init parse_amd_iommu_options(char *str) | |||
1451 | 1547 | ||
1452 | __setup("amd_iommu_dump", parse_amd_iommu_dump); | 1548 | __setup("amd_iommu_dump", parse_amd_iommu_dump); |
1453 | __setup("amd_iommu=", parse_amd_iommu_options); | 1549 | __setup("amd_iommu=", parse_amd_iommu_options); |
1550 | |||
1551 | IOMMU_INIT_FINISH(amd_iommu_detect, | ||
1552 | gart_iommu_hole_init, | ||
1553 | 0, | ||
1554 | 0); | ||
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c index 0f7bc20cfcde..8f6463d8ed0d 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/amd_nb.c | |||
@@ -8,21 +8,19 @@ | |||
8 | #include <linux/errno.h> | 8 | #include <linux/errno.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <asm/k8.h> | 11 | #include <asm/amd_nb.h> |
12 | |||
13 | int num_k8_northbridges; | ||
14 | EXPORT_SYMBOL(num_k8_northbridges); | ||
15 | 12 | ||
16 | static u32 *flush_words; | 13 | static u32 *flush_words; |
17 | 14 | ||
18 | struct pci_device_id k8_nb_ids[] = { | 15 | struct pci_device_id k8_nb_ids[] = { |
19 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, | 16 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, |
20 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, | 17 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, |
18 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, | ||
21 | {} | 19 | {} |
22 | }; | 20 | }; |
23 | EXPORT_SYMBOL(k8_nb_ids); | 21 | EXPORT_SYMBOL(k8_nb_ids); |
24 | 22 | ||
25 | struct pci_dev **k8_northbridges; | 23 | struct k8_northbridge_info k8_northbridges; |
26 | EXPORT_SYMBOL(k8_northbridges); | 24 | EXPORT_SYMBOL(k8_northbridges); |
27 | 25 | ||
28 | static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) | 26 | static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) |
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void) | |||
40 | int i; | 38 | int i; |
41 | struct pci_dev *dev; | 39 | struct pci_dev *dev; |
42 | 40 | ||
43 | if (num_k8_northbridges) | 41 | if (k8_northbridges.num) |
44 | return 0; | 42 | return 0; |
45 | 43 | ||
46 | dev = NULL; | 44 | dev = NULL; |
47 | while ((dev = next_k8_northbridge(dev)) != NULL) | 45 | while ((dev = next_k8_northbridge(dev)) != NULL) |
48 | num_k8_northbridges++; | 46 | k8_northbridges.num++; |
47 | |||
48 | /* some CPU families (e.g. family 0x11) do not support GART */ | ||
49 | if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || | ||
50 | boot_cpu_data.x86 == 0x15) | ||
51 | k8_northbridges.gart_supported = 1; | ||
49 | 52 | ||
50 | k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), | 53 | k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * |
51 | GFP_KERNEL); | 54 | sizeof(void *), GFP_KERNEL); |
52 | if (!k8_northbridges) | 55 | if (!k8_northbridges.nb_misc) |
53 | return -ENOMEM; | 56 | return -ENOMEM; |
54 | 57 | ||
55 | if (!num_k8_northbridges) { | 58 | if (!k8_northbridges.num) { |
56 | k8_northbridges[0] = NULL; | 59 | k8_northbridges.nb_misc[0] = NULL; |
57 | return 0; | 60 | return 0; |
58 | } | 61 | } |
59 | 62 | ||
60 | flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); | 63 | if (k8_northbridges.gart_supported) { |
61 | if (!flush_words) { | 64 | flush_words = kmalloc(k8_northbridges.num * sizeof(u32), |
62 | kfree(k8_northbridges); | 65 | GFP_KERNEL); |
63 | return -ENOMEM; | 66 | if (!flush_words) { |
67 | kfree(k8_northbridges.nb_misc); | ||
68 | return -ENOMEM; | ||
69 | } | ||
64 | } | 70 | } |
65 | 71 | ||
66 | dev = NULL; | 72 | dev = NULL; |
67 | i = 0; | 73 | i = 0; |
68 | while ((dev = next_k8_northbridge(dev)) != NULL) { | 74 | while ((dev = next_k8_northbridge(dev)) != NULL) { |
69 | k8_northbridges[i] = dev; | 75 | k8_northbridges.nb_misc[i] = dev; |
70 | pci_read_config_dword(dev, 0x9c, &flush_words[i++]); | 76 | if (k8_northbridges.gart_supported) |
77 | pci_read_config_dword(dev, 0x9c, &flush_words[i++]); | ||
71 | } | 78 | } |
72 | k8_northbridges[i] = NULL; | 79 | k8_northbridges.nb_misc[i] = NULL; |
73 | return 0; | 80 | return 0; |
74 | } | 81 | } |
75 | EXPORT_SYMBOL_GPL(cache_k8_northbridges); | 82 | EXPORT_SYMBOL_GPL(cache_k8_northbridges); |
@@ -93,22 +100,25 @@ void k8_flush_garts(void) | |||
93 | unsigned long flags; | 100 | unsigned long flags; |
94 | static DEFINE_SPINLOCK(gart_lock); | 101 | static DEFINE_SPINLOCK(gart_lock); |
95 | 102 | ||
103 | if (!k8_northbridges.gart_supported) | ||
104 | return; | ||
105 | |||
96 | /* Avoid races between AGP and IOMMU. In theory it's not needed | 106 | /* Avoid races between AGP and IOMMU. In theory it's not needed |
97 | but I'm not sure if the hardware won't lose flush requests | 107 | but I'm not sure if the hardware won't lose flush requests |
98 | when another is pending. This whole thing is so expensive anyways | 108 | when another is pending. This whole thing is so expensive anyways |
99 | that it doesn't matter to serialize more. -AK */ | 109 | that it doesn't matter to serialize more. -AK */ |
100 | spin_lock_irqsave(&gart_lock, flags); | 110 | spin_lock_irqsave(&gart_lock, flags); |
101 | flushed = 0; | 111 | flushed = 0; |
102 | for (i = 0; i < num_k8_northbridges; i++) { | 112 | for (i = 0; i < k8_northbridges.num; i++) { |
103 | pci_write_config_dword(k8_northbridges[i], 0x9c, | 113 | pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, |
104 | flush_words[i]|1); | 114 | flush_words[i]|1); |
105 | flushed++; | 115 | flushed++; |
106 | } | 116 | } |
107 | for (i = 0; i < num_k8_northbridges; i++) { | 117 | for (i = 0; i < k8_northbridges.num; i++) { |
108 | u32 w; | 118 | u32 w; |
109 | /* Make sure the hardware actually executed the flush*/ | 119 | /* Make sure the hardware actually executed the flush*/ |
110 | for (;;) { | 120 | for (;;) { |
111 | pci_read_config_dword(k8_northbridges[i], | 121 | pci_read_config_dword(k8_northbridges.nb_misc[i], |
112 | 0x9c, &w); | 122 | 0x9c, &w); |
113 | if (!(w & 1)) | 123 | if (!(w & 1)) |
114 | break; | 124 | break; |
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 42a70a2accc0..92543c73cf8e 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c | |||
@@ -392,7 +392,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, | |||
392 | } | 392 | } |
393 | break; | 393 | break; |
394 | default: | 394 | default: |
395 | pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); | 395 | pr_debug("APBT notified %lu, no action\n", action); |
396 | } | 396 | } |
397 | return NOTIFY_OK; | 397 | return NOTIFY_OK; |
398 | } | 398 | } |
@@ -546,7 +546,7 @@ bad_count: | |||
546 | pr_debug("APB CS going back %lx:%lx:%lx ", | 546 | pr_debug("APB CS going back %lx:%lx:%lx ", |
547 | t2, last_read, t2 - last_read); | 547 | t2, last_read, t2 - last_read); |
548 | bad_count_x3: | 548 | bad_count_x3: |
549 | pr_debug(KERN_INFO "tripple check enforced\n"); | 549 | pr_debug("triple check enforced\n"); |
550 | t0 = apbt_readl(phy_cs_timer_id, | 550 | t0 = apbt_readl(phy_cs_timer_id, |
551 | APBTMR_N_CURRENT_VALUE); | 551 | APBTMR_N_CURRENT_VALUE); |
552 | udelay(1); | 552 | udelay(1); |
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e17..b3a16e8f0703 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #include <asm/gart.h> | 27 | #include <asm/gart.h> |
28 | #include <asm/pci-direct.h> | 28 | #include <asm/pci-direct.h> |
29 | #include <asm/dma.h> | 29 | #include <asm/dma.h> |
30 | #include <asm/k8.h> | 30 | #include <asm/amd_nb.h> |
31 | #include <asm/x86_init.h> | 31 | #include <asm/x86_init.h> |
32 | 32 | ||
33 | int gart_iommu_aperture; | 33 | int gart_iommu_aperture; |
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) | |||
307 | continue; | 307 | continue; |
308 | 308 | ||
309 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); | 309 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); |
310 | aper_enabled = ctl & AMD64_GARTEN; | 310 | aper_enabled = ctl & GARTEN; |
311 | aper_order = (ctl >> 1) & 7; | 311 | aper_order = (ctl >> 1) & 7; |
312 | aper_size = (32 * 1024 * 1024) << aper_order; | 312 | aper_size = (32 * 1024 * 1024) << aper_order; |
313 | aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; | 313 | aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; |
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) | |||
362 | continue; | 362 | continue; |
363 | 363 | ||
364 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); | 364 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); |
365 | ctl &= ~AMD64_GARTEN; | 365 | ctl &= ~GARTEN; |
366 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); | 366 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); |
367 | } | 367 | } |
368 | } | 368 | } |
@@ -371,7 +371,7 @@ void __init early_gart_iommu_check(void) | |||
371 | 371 | ||
372 | static int __initdata printed_gart_size_msg; | 372 | static int __initdata printed_gart_size_msg; |
373 | 373 | ||
374 | void __init gart_iommu_hole_init(void) | 374 | int __init gart_iommu_hole_init(void) |
375 | { | 375 | { |
376 | u32 agp_aper_base = 0, agp_aper_order = 0; | 376 | u32 agp_aper_base = 0, agp_aper_order = 0; |
377 | u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; | 377 | u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; |
@@ -381,7 +381,7 @@ void __init gart_iommu_hole_init(void) | |||
381 | 381 | ||
382 | if (gart_iommu_aperture_disabled || !fix_aperture || | 382 | if (gart_iommu_aperture_disabled || !fix_aperture || |
383 | !early_pci_allowed()) | 383 | !early_pci_allowed()) |
384 | return; | 384 | return -ENODEV; |
385 | 385 | ||
386 | printk(KERN_INFO "Checking aperture...\n"); | 386 | printk(KERN_INFO "Checking aperture...\n"); |
387 | 387 | ||
@@ -463,8 +463,9 @@ out: | |||
463 | unsigned long n = (32 * 1024 * 1024) << last_aper_order; | 463 | unsigned long n = (32 * 1024 * 1024) << last_aper_order; |
464 | 464 | ||
465 | insert_aperture_resource((u32)last_aper_base, n); | 465 | insert_aperture_resource((u32)last_aper_base, n); |
466 | return 1; | ||
466 | } | 467 | } |
467 | return; | 468 | return 0; |
468 | } | 469 | } |
469 | 470 | ||
470 | if (!fallback_aper_force) { | 471 | if (!fallback_aper_force) { |
@@ -500,13 +501,18 @@ out: | |||
500 | panic("Not enough memory for aperture"); | 501 | panic("Not enough memory for aperture"); |
501 | } | 502 | } |
502 | } else { | 503 | } else { |
503 | return; | 504 | return 0; |
504 | } | 505 | } |
505 | 506 | ||
506 | /* Fix up the north bridges */ | 507 | /* Fix up the north bridges */ |
507 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { | 508 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { |
508 | int bus; | 509 | int bus, dev_base, dev_limit; |
509 | int dev_base, dev_limit; | 510 | |
511 | /* | ||
512 | * Don't enable translation yet but enable GART IO and CPU | ||
513 | * accesses and set DISTLBWALKPRB since GART table memory is UC. | ||
514 | */ | ||
515 | u32 ctl = DISTLBWALKPRB | aper_order << 1; | ||
510 | 516 | ||
511 | bus = bus_dev_ranges[i].bus; | 517 | bus = bus_dev_ranges[i].bus; |
512 | dev_base = bus_dev_ranges[i].dev_base; | 518 | dev_base = bus_dev_ranges[i].dev_base; |
@@ -515,13 +521,12 @@ out: | |||
515 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) | 521 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) |
516 | continue; | 522 | continue; |
517 | 523 | ||
518 | /* Don't enable translation yet. That is done later. | 524 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); |
519 | Assume this BIOS didn't initialise the GART so | ||
520 | just overwrite all previous bits */ | ||
521 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); | ||
522 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); | 525 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); |
523 | } | 526 | } |
524 | } | 527 | } |
525 | 528 | ||
526 | set_up_gart_resume(aper_order, aper_alloc); | 529 | set_up_gart_resume(aper_order, aper_alloc); |
530 | |||
531 | return 1; | ||
527 | } | 532 | } |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8cf86fb3b4e3..850657d1b0ed 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include <asm/mce.h> | 52 | #include <asm/mce.h> |
53 | #include <asm/kvm_para.h> | 53 | #include <asm/kvm_para.h> |
54 | #include <asm/tsc.h> | 54 | #include <asm/tsc.h> |
55 | #include <asm/atomic.h> | ||
55 | 56 | ||
56 | unsigned int num_processors; | 57 | unsigned int num_processors; |
57 | 58 | ||
@@ -370,38 +371,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
370 | } | 371 | } |
371 | 372 | ||
372 | /* | 373 | /* |
373 | * Setup extended LVT, AMD specific (K8, family 10h) | 374 | * Setup extended LVT, AMD specific |
374 | * | 375 | * |
375 | * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and | 376 | * Software should use the LVT offsets the BIOS provides. The offsets |
376 | * MCE interrupts are supported. Thus MCE offset must be set to 0. | 377 | * are determined by the subsystems using it like those for MCE |
378 | * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts | ||
379 | * are supported. Beginning with family 10h at least 4 offsets are | ||
380 | * available. | ||
377 | * | 381 | * |
378 | * If mask=1, the LVT entry does not generate interrupts while mask=0 | 382 | * Since the offsets must be consistent for all cores, we keep track |
379 | * enables the vector. See also the BKDGs. | 383 | * of the LVT offsets in software and reserve the offset for the same |
384 | * vector also to be used on other cores. An offset is freed by | ||
385 | * setting the entry to APIC_EILVT_MASKED. | ||
386 | * | ||
387 | * If the BIOS is right, there should be no conflicts. Otherwise a | ||
388 | * "[Firmware Bug]: ..." error message is generated. However, if | ||
389 | * software does not properly determines the offsets, it is not | ||
390 | * necessarily a BIOS bug. | ||
380 | */ | 391 | */ |
381 | 392 | ||
382 | #define APIC_EILVT_LVTOFF_MCE 0 | 393 | static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; |
383 | #define APIC_EILVT_LVTOFF_IBS 1 | ||
384 | 394 | ||
385 | static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) | 395 | static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) |
386 | { | 396 | { |
387 | unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); | 397 | return (old & APIC_EILVT_MASKED) |
388 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; | 398 | || (new == APIC_EILVT_MASKED) |
389 | 399 | || ((new & ~APIC_EILVT_MASKED) == old); | |
390 | apic_write(reg, v); | ||
391 | } | 400 | } |
392 | 401 | ||
393 | u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) | 402 | static unsigned int reserve_eilvt_offset(int offset, unsigned int new) |
394 | { | 403 | { |
395 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); | 404 | unsigned int rsvd; /* 0: uninitialized */ |
396 | return APIC_EILVT_LVTOFF_MCE; | 405 | |
406 | if (offset >= APIC_EILVT_NR_MAX) | ||
407 | return ~0; | ||
408 | |||
409 | rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; | ||
410 | do { | ||
411 | if (rsvd && | ||
412 | !eilvt_entry_is_changeable(rsvd, new)) | ||
413 | /* may not change if vectors are different */ | ||
414 | return rsvd; | ||
415 | rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); | ||
416 | } while (rsvd != new); | ||
417 | |||
418 | return new; | ||
397 | } | 419 | } |
398 | 420 | ||
399 | u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) | 421 | /* |
422 | * If mask=1, the LVT entry does not generate interrupts while mask=0 | ||
423 | * enables the vector. See also the BKDGs. | ||
424 | */ | ||
425 | |||
426 | int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) | ||
400 | { | 427 | { |
401 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); | 428 | unsigned long reg = APIC_EILVTn(offset); |
402 | return APIC_EILVT_LVTOFF_IBS; | 429 | unsigned int new, old, reserved; |
430 | |||
431 | new = (mask << 16) | (msg_type << 8) | vector; | ||
432 | old = apic_read(reg); | ||
433 | reserved = reserve_eilvt_offset(offset, new); | ||
434 | |||
435 | if (reserved != new) { | ||
436 | pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " | ||
437 | "vector 0x%x was already reserved by another core, " | ||
438 | "APIC%lX=0x%x\n", | ||
439 | smp_processor_id(), new, reserved, reg, old); | ||
440 | return -EINVAL; | ||
441 | } | ||
442 | |||
443 | if (!eilvt_entry_is_changeable(old, new)) { | ||
444 | pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " | ||
445 | "register already in use, APIC%lX=0x%x\n", | ||
446 | smp_processor_id(), new, reg, old); | ||
447 | return -EBUSY; | ||
448 | } | ||
449 | |||
450 | apic_write(reg, new); | ||
451 | |||
452 | return 0; | ||
403 | } | 453 | } |
404 | EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); | 454 | EXPORT_SYMBOL_GPL(setup_APIC_eilvt); |
405 | 455 | ||
406 | /* | 456 | /* |
407 | * Program the next event, relative to now | 457 | * Program the next event, relative to now |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0885a4120737..0929191d83cf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -3109,7 +3109,8 @@ void destroy_irq(unsigned int irq) | |||
3109 | 3109 | ||
3110 | irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); | 3110 | irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); |
3111 | 3111 | ||
3112 | free_irte(irq); | 3112 | if (intr_remapping_enabled) |
3113 | free_irte(irq); | ||
3113 | raw_spin_lock_irqsave(&vector_lock, flags); | 3114 | raw_spin_lock_irqsave(&vector_lock, flags); |
3114 | __clear_irq_vector(irq, cfg); | 3115 | __clear_irq_vector(irq, cfg); |
3115 | raw_spin_unlock_irqrestore(&vector_lock, flags); | 3116 | raw_spin_unlock_irqrestore(&vector_lock, flags); |
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 3e28401f161c..960f26ab5c9f 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/nodemask.h> | 26 | #include <linux/nodemask.h> |
27 | #include <linux/topology.h> | 27 | #include <linux/topology.h> |
28 | #include <linux/bootmem.h> | 28 | #include <linux/bootmem.h> |
29 | #include <linux/memblock.h> | ||
29 | #include <linux/threads.h> | 30 | #include <linux/threads.h> |
30 | #include <linux/cpumask.h> | 31 | #include <linux/cpumask.h> |
31 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
@@ -88,7 +89,7 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) | |||
88 | node_end_pfn[node] = | 89 | node_end_pfn[node] = |
89 | MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); | 90 | MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); |
90 | 91 | ||
91 | e820_register_active_regions(node, node_start_pfn[node], | 92 | memblock_x86_register_active_regions(node, node_start_pfn[node], |
92 | node_end_pfn[node]); | 93 | node_end_pfn[node]); |
93 | 94 | ||
94 | memory_present(node, node_start_pfn[node], node_end_pfn[node]); | 95 | memory_present(node, node_start_pfn[node], node_end_pfn[node]); |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4c9c67bf09b7..0e4f24c2a746 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -189,8 +189,8 @@ | |||
189 | * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. | 189 | * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. |
190 | * | 190 | * |
191 | * [This document is available free from Intel by calling 800.628.8686 (fax | 191 | * [This document is available free from Intel by calling 800.628.8686 (fax |
192 | * 916.356.6100) or 800.548.4725; or via anonymous ftp from | 192 | * 916.356.6100) or 800.548.4725; or from |
193 | * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also | 193 | * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also |
194 | * available from Microsoft by calling 206.882.8080.] | 194 | * available from Microsoft by calling 206.882.8080.] |
195 | * | 195 | * |
196 | * APM 1.2 Reference: | 196 | * APM 1.2 Reference: |
@@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = { | |||
1926 | .unlocked_ioctl = do_ioctl, | 1926 | .unlocked_ioctl = do_ioctl, |
1927 | .open = do_open, | 1927 | .open = do_open, |
1928 | .release = do_release, | 1928 | .release = do_release, |
1929 | .llseek = noop_llseek, | ||
1929 | }; | 1930 | }; |
1930 | 1931 | ||
1931 | static struct miscdevice apm_device = { | 1932 | static struct miscdevice apm_device = { |
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dfdbf6403895..1a4088dda37a 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
@@ -99,9 +99,7 @@ void foo(void) | |||
99 | 99 | ||
100 | DEFINE(PAGE_SIZE_asm, PAGE_SIZE); | 100 | DEFINE(PAGE_SIZE_asm, PAGE_SIZE); |
101 | DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); | 101 | DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); |
102 | DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); | 102 | DEFINE(THREAD_SIZE_asm, THREAD_SIZE); |
103 | DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); | ||
104 | DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); | ||
105 | 103 | ||
106 | OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); | 104 | OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); |
107 | 105 | ||
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index fc999e6fc46a..13a389179514 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c | |||
@@ -2,7 +2,8 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/kthread.h> | 3 | #include <linux/kthread.h> |
4 | #include <linux/workqueue.h> | 4 | #include <linux/workqueue.h> |
5 | #include <asm/e820.h> | 5 | #include <linux/memblock.h> |
6 | |||
6 | #include <asm/proto.h> | 7 | #include <asm/proto.h> |
7 | 8 | ||
8 | /* | 9 | /* |
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1; | |||
18 | static unsigned __read_mostly corruption_check_size = 64*1024; | 19 | static unsigned __read_mostly corruption_check_size = 64*1024; |
19 | static unsigned __read_mostly corruption_check_period = 60; /* seconds */ | 20 | static unsigned __read_mostly corruption_check_period = 60; /* seconds */ |
20 | 21 | ||
21 | static struct e820entry scan_areas[MAX_SCAN_AREAS]; | 22 | static struct scan_area { |
23 | u64 addr; | ||
24 | u64 size; | ||
25 | } scan_areas[MAX_SCAN_AREAS]; | ||
22 | static int num_scan_areas; | 26 | static int num_scan_areas; |
23 | 27 | ||
24 | |||
25 | static __init int set_corruption_check(char *arg) | 28 | static __init int set_corruption_check(char *arg) |
26 | { | 29 | { |
27 | char *end; | 30 | char *end; |
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void) | |||
81 | 84 | ||
82 | while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { | 85 | while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { |
83 | u64 size; | 86 | u64 size; |
84 | addr = find_e820_area_size(addr, &size, PAGE_SIZE); | 87 | addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); |
85 | 88 | ||
86 | if (!(addr + 1)) | 89 | if (addr == MEMBLOCK_ERROR) |
87 | break; | 90 | break; |
88 | 91 | ||
89 | if (addr >= corruption_check_size) | 92 | if (addr >= corruption_check_size) |
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void) | |||
92 | if ((addr + size) > corruption_check_size) | 95 | if ((addr + size) > corruption_check_size) |
93 | size = corruption_check_size - addr; | 96 | size = corruption_check_size - addr; |
94 | 97 | ||
95 | e820_update_range(addr, size, E820_RAM, E820_RESERVED); | 98 | memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); |
96 | scan_areas[num_scan_areas].addr = addr; | 99 | scan_areas[num_scan_areas].addr = addr; |
97 | scan_areas[num_scan_areas].size = size; | 100 | scan_areas[num_scan_areas].size = size; |
98 | num_scan_areas++; | 101 | num_scan_areas++; |
@@ -105,7 +108,6 @@ void __init setup_bios_corruption_check(void) | |||
105 | 108 | ||
106 | printk(KERN_INFO "Scanning %d areas for low memory corruption\n", | 109 | printk(KERN_INFO "Scanning %d areas for low memory corruption\n", |
107 | num_scan_areas); | 110 | num_scan_areas); |
108 | update_e820(); | ||
109 | } | 111 | } |
110 | 112 | ||
111 | 113 | ||
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a8b4d91b8394..9e093f8fe78c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid) | |||
253 | #endif | 253 | #endif |
254 | 254 | ||
255 | /* | 255 | /* |
256 | * Fixup core topology information for AMD multi-node processors. | 256 | * Fixup core topology information for |
257 | * Assumption: Number of cores in each internal node is the same. | 257 | * (1) AMD multi-node processors |
258 | * Assumption: Number of cores in each internal node is the same. | ||
259 | * (2) AMD processors supporting compute units | ||
258 | */ | 260 | */ |
259 | #ifdef CONFIG_X86_HT | 261 | #ifdef CONFIG_X86_HT |
260 | static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) | 262 | static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) |
261 | { | 263 | { |
262 | unsigned long long value; | 264 | u32 nodes; |
263 | u32 nodes, cores_per_node; | 265 | u8 node_id; |
264 | int cpu = smp_processor_id(); | 266 | int cpu = smp_processor_id(); |
265 | 267 | ||
266 | if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) | 268 | /* get information required for multi-node processors */ |
267 | return; | 269 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { |
270 | u32 eax, ebx, ecx, edx; | ||
268 | 271 | ||
269 | /* fixup topology information only once for a core */ | 272 | cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); |
270 | if (cpu_has(c, X86_FEATURE_AMD_DCM)) | 273 | nodes = ((ecx >> 8) & 7) + 1; |
271 | return; | 274 | node_id = ecx & 7; |
272 | 275 | ||
273 | rdmsrl(MSR_FAM10H_NODE_ID, value); | 276 | /* get compute unit information */ |
277 | smp_num_siblings = ((ebx >> 8) & 3) + 1; | ||
278 | c->compute_unit_id = ebx & 0xff; | ||
279 | } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { | ||
280 | u64 value; | ||
274 | 281 | ||
275 | nodes = ((value >> 3) & 7) + 1; | 282 | rdmsrl(MSR_FAM10H_NODE_ID, value); |
276 | if (nodes == 1) | 283 | nodes = ((value >> 3) & 7) + 1; |
284 | node_id = value & 7; | ||
285 | } else | ||
277 | return; | 286 | return; |
278 | 287 | ||
279 | set_cpu_cap(c, X86_FEATURE_AMD_DCM); | 288 | /* fixup multi-node processor information */ |
280 | cores_per_node = c->x86_max_cores / nodes; | 289 | if (nodes > 1) { |
290 | u32 cores_per_node; | ||
291 | |||
292 | set_cpu_cap(c, X86_FEATURE_AMD_DCM); | ||
293 | cores_per_node = c->x86_max_cores / nodes; | ||
281 | 294 | ||
282 | /* store NodeID, use llc_shared_map to store sibling info */ | 295 | /* store NodeID, use llc_shared_map to store sibling info */ |
283 | per_cpu(cpu_llc_id, cpu) = value & 7; | 296 | per_cpu(cpu_llc_id, cpu) = node_id; |
284 | 297 | ||
285 | /* fixup core id to be in range from 0 to (cores_per_node - 1) */ | 298 | /* core id to be in range from 0 to (cores_per_node - 1) */ |
286 | c->cpu_core_id = c->cpu_core_id % cores_per_node; | 299 | c->cpu_core_id = c->cpu_core_id % cores_per_node; |
300 | } | ||
287 | } | 301 | } |
288 | #endif | 302 | #endif |
289 | 303 | ||
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | |||
304 | c->phys_proc_id = c->initial_apicid >> bits; | 318 | c->phys_proc_id = c->initial_apicid >> bits; |
305 | /* use socket ID also for last level cache */ | 319 | /* use socket ID also for last level cache */ |
306 | per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; | 320 | per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; |
307 | /* fixup topology information on multi-node processors */ | 321 | amd_get_topology(c); |
308 | if ((c->x86 == 0x10) && (c->x86_model == 9)) | ||
309 | amd_fixup_dcm(c); | ||
310 | #endif | 322 | #endif |
311 | } | 323 | } |
312 | 324 | ||
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | |||
412 | set_cpu_cap(c, X86_FEATURE_EXTD_APICID); | 424 | set_cpu_cap(c, X86_FEATURE_EXTD_APICID); |
413 | } | 425 | } |
414 | #endif | 426 | #endif |
427 | |||
428 | /* We need to do the following only once */ | ||
429 | if (c != &boot_cpu_data) | ||
430 | return; | ||
431 | |||
432 | if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { | ||
433 | |||
434 | if (c->x86 > 0x10 || | ||
435 | (c->x86 == 0x10 && c->x86_model >= 0x2)) { | ||
436 | u64 val; | ||
437 | |||
438 | rdmsrl(MSR_K7_HWCR, val); | ||
439 | if (!(val & BIT(24))) | ||
440 | printk(KERN_WARNING FW_BUG "TSC doesn't count " | ||
441 | "with P0 frequency!\n"); | ||
442 | } | ||
443 | } | ||
415 | } | 444 | } |
416 | 445 | ||
417 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | 446 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) |
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
523 | #endif | 552 | #endif |
524 | 553 | ||
525 | if (c->extended_cpuid_level >= 0x80000006) { | 554 | if (c->extended_cpuid_level >= 0x80000006) { |
526 | if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) | 555 | if (cpuid_edx(0x80000006) & 0xf000) |
527 | num_cache_leaves = 4; | 556 | num_cache_leaves = 4; |
528 | else | 557 | else |
529 | num_cache_leaves = 3; | 558 | num_cache_leaves = 3; |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 15c671385f59..4b68bda30938 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -704,16 +704,21 @@ void __init early_cpu_init(void) | |||
704 | } | 704 | } |
705 | 705 | ||
706 | /* | 706 | /* |
707 | * The NOPL instruction is supposed to exist on all CPUs with | 707 | * The NOPL instruction is supposed to exist on all CPUs of family >= 6; |
708 | * family >= 6; unfortunately, that's not true in practice because | 708 | * unfortunately, that's not true in practice because of early VIA |
709 | * of early VIA chips and (more importantly) broken virtualizers that | 709 | * chips and (more importantly) broken virtualizers that are not easy |
710 | * are not easy to detect. In the latter case it doesn't even *fail* | 710 | * to detect. In the latter case it doesn't even *fail* reliably, so |
711 | * reliably, so probing for it doesn't even work. Disable it completely | 711 | * probing for it doesn't even work. Disable it completely on 32-bit |
712 | * unless we can find a reliable way to detect all the broken cases. | 712 | * unless we can find a reliable way to detect all the broken cases. |
713 | * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). | ||
713 | */ | 714 | */ |
714 | static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) | 715 | static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) |
715 | { | 716 | { |
717 | #ifdef CONFIG_X86_32 | ||
716 | clear_cpu_cap(c, X86_FEATURE_NOPL); | 718 | clear_cpu_cap(c, X86_FEATURE_NOPL); |
719 | #else | ||
720 | set_cpu_cap(c, X86_FEATURE_NOPL); | ||
721 | #endif | ||
717 | } | 722 | } |
718 | 723 | ||
719 | static void __cpuinit generic_identify(struct cpuinfo_x86 *c) | 724 | static void __cpuinit generic_identify(struct cpuinfo_x86 *c) |
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void) | |||
1264 | clear_all_debug_regs(); | 1269 | clear_all_debug_regs(); |
1265 | dbg_restore_debug_regs(); | 1270 | dbg_restore_debug_regs(); |
1266 | 1271 | ||
1267 | /* | ||
1268 | * Force FPU initialization: | ||
1269 | */ | ||
1270 | current_thread_info()->status = 0; | ||
1271 | clear_used_math(); | ||
1272 | mxcsr_feature_mask_init(); | ||
1273 | |||
1274 | fpu_init(); | 1272 | fpu_init(); |
1275 | xsave_init(); | 1273 | xsave_init(); |
1276 | } | 1274 | } |
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f668bb1f7d43..e765633f210e 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h | |||
@@ -32,6 +32,7 @@ struct cpu_dev { | |||
32 | extern const struct cpu_dev *const __x86_cpu_dev_start[], | 32 | extern const struct cpu_dev *const __x86_cpu_dev_start[], |
33 | *const __x86_cpu_dev_end[]; | 33 | *const __x86_cpu_dev_end[]; |
34 | 34 | ||
35 | extern void get_cpu_cap(struct cpuinfo_x86 *c); | ||
35 | extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); | 36 | extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); |
36 | extern void get_cpu_cap(struct cpuinfo_x86 *c); | 37 | extern void get_cpu_cap(struct cpuinfo_x86 *c); |
37 | 38 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cd8da247dda1..a2baafb2fe6d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) | |||
701 | per_cpu(acfreq_data, policy->cpu) = NULL; | 701 | per_cpu(acfreq_data, policy->cpu) = NULL; |
702 | acpi_processor_unregister_performance(data->acpi_data, | 702 | acpi_processor_unregister_performance(data->acpi_data, |
703 | policy->cpu); | 703 | policy->cpu); |
704 | kfree(data->freq_table); | ||
704 | kfree(data); | 705 | kfree(data); |
705 | } | 706 | } |
706 | 707 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 733093d60436..141abebc4516 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | |||
@@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = { | |||
393 | * Detects nForce2 A2 and C1 stepping | 393 | * Detects nForce2 A2 and C1 stepping |
394 | * | 394 | * |
395 | */ | 395 | */ |
396 | static unsigned int nforce2_detect_chipset(void) | 396 | static int nforce2_detect_chipset(void) |
397 | { | 397 | { |
398 | nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, | 398 | nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, |
399 | PCI_DEVICE_ID_NVIDIA_NFORCE2, | 399 | PCI_DEVICE_ID_NVIDIA_NFORCE2, |
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index fc09f142d94d..d9f51367666b 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c | |||
@@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq; | |||
35 | * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS | 35 | * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS |
36 | * and MSR_TMTA_LONGRUN_CTRL | 36 | * and MSR_TMTA_LONGRUN_CTRL |
37 | */ | 37 | */ |
38 | static void __init longrun_get_policy(struct cpufreq_policy *policy) | 38 | static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) |
39 | { | 39 | { |
40 | u32 msr_lo, msr_hi; | 40 | u32 msr_lo, msr_hi; |
41 | 41 | ||
@@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu) | |||
165 | * TMTA rules: | 165 | * TMTA rules: |
166 | * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) | 166 | * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) |
167 | */ | 167 | */ |
168 | static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, | 168 | static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, |
169 | unsigned int *high_freq) | 169 | unsigned int *high_freq) |
170 | { | 170 | { |
171 | u32 msr_lo, msr_hi; | 171 | u32 msr_lo, msr_hi; |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 695f17731e23..d16c2c53d6bf 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -284,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | |||
284 | /* Don't do the funky fallback heuristics the AMD version employs | 284 | /* Don't do the funky fallback heuristics the AMD version employs |
285 | for now. */ | 285 | for now. */ |
286 | node = apicid_to_node[apicid]; | 286 | node = apicid_to_node[apicid]; |
287 | if (node == NUMA_NO_NODE) | 287 | if (node == NUMA_NO_NODE || !node_online(node)) { |
288 | node = first_node(node_online_map); | ||
289 | else if (!node_online(node)) { | ||
290 | /* reuse the value from init_cpu_to_node() */ | 288 | /* reuse the value from init_cpu_to_node() */ |
291 | node = cpu_to_node(cpu); | 289 | node = cpu_to_node(cpu); |
292 | } | 290 | } |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab88..17ad03366211 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -17,7 +17,7 @@ | |||
17 | 17 | ||
18 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <asm/k8.h> | 20 | #include <asm/amd_nb.h> |
21 | #include <asm/smp.h> | 21 | #include <asm/smp.h> |
22 | 22 | ||
23 | #define LVL_1_INST 1 | 23 | #define LVL_1_INST 1 |
@@ -306,7 +306,7 @@ struct _cache_attr { | |||
306 | ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); | 306 | ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); |
307 | }; | 307 | }; |
308 | 308 | ||
309 | #ifdef CONFIG_CPU_SUP_AMD | 309 | #ifdef CONFIG_AMD_NB |
310 | 310 | ||
311 | /* | 311 | /* |
312 | * L3 cache descriptors | 312 | * L3 cache descriptors |
@@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) | |||
327 | l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); | 327 | l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); |
328 | 328 | ||
329 | l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; | 329 | l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; |
330 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; | ||
330 | } | 331 | } |
331 | 332 | ||
332 | static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) | 333 | static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) |
@@ -369,7 +370,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, | |||
369 | return; | 370 | return; |
370 | 371 | ||
371 | /* not in virtualized environments */ | 372 | /* not in virtualized environments */ |
372 | if (num_k8_northbridges == 0) | 373 | if (k8_northbridges.num == 0) |
373 | return; | 374 | return; |
374 | 375 | ||
375 | /* | 376 | /* |
@@ -377,7 +378,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, | |||
377 | * never freed but this is done only on shutdown so it doesn't matter. | 378 | * never freed but this is done only on shutdown so it doesn't matter. |
378 | */ | 379 | */ |
379 | if (!l3_caches) { | 380 | if (!l3_caches) { |
380 | int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); | 381 | int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); |
381 | 382 | ||
382 | l3_caches = kzalloc(size, GFP_ATOMIC); | 383 | l3_caches = kzalloc(size, GFP_ATOMIC); |
383 | if (!l3_caches) | 384 | if (!l3_caches) |
@@ -556,12 +557,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, | |||
556 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | 557 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, |
557 | show_cache_disable_1, store_cache_disable_1); | 558 | show_cache_disable_1, store_cache_disable_1); |
558 | 559 | ||
559 | #else /* CONFIG_CPU_SUP_AMD */ | 560 | #else /* CONFIG_AMD_NB */ |
560 | static void __cpuinit | 561 | static void __cpuinit |
561 | amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) | 562 | amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) |
562 | { | 563 | { |
563 | }; | 564 | }; |
564 | #endif /* CONFIG_CPU_SUP_AMD */ | 565 | #endif /* CONFIG_AMD_NB */ |
565 | 566 | ||
566 | static int | 567 | static int |
567 | __cpuinit cpuid4_cache_lookup_regs(int index, | 568 | __cpuinit cpuid4_cache_lookup_regs(int index, |
@@ -1000,7 +1001,7 @@ static struct attribute *default_attrs[] = { | |||
1000 | 1001 | ||
1001 | static struct attribute *default_l3_attrs[] = { | 1002 | static struct attribute *default_l3_attrs[] = { |
1002 | DEFAULT_SYSFS_CACHE_ATTRS, | 1003 | DEFAULT_SYSFS_CACHE_ATTRS, |
1003 | #ifdef CONFIG_CPU_SUP_AMD | 1004 | #ifdef CONFIG_AMD_NB |
1004 | &cache_disable_0.attr, | 1005 | &cache_disable_0.attr, |
1005 | &cache_disable_1.attr, | 1006 | &cache_disable_1.attr, |
1006 | #endif | 1007 | #endif |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8a85dd1b1aa1..1e8d66c1336a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = { | |||
192 | .release = seq_release, | 192 | .release = seq_release, |
193 | .read = seq_read, | 193 | .read = seq_read, |
194 | .write = severities_coverage_write, | 194 | .write = severities_coverage_write, |
195 | .llseek = seq_lseek, | ||
195 | }; | 196 | }; |
196 | 197 | ||
197 | static int __init severities_debugfs_init(void) | 198 | static int __init severities_debugfs_init(void) |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ed41562909fe..7a35b72d7c03 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = { | |||
1665 | .read = mce_read, | 1665 | .read = mce_read, |
1666 | .poll = mce_poll, | 1666 | .poll = mce_poll, |
1667 | .unlocked_ioctl = mce_ioctl, | 1667 | .unlocked_ioctl = mce_ioctl, |
1668 | .llseek = no_llseek, | ||
1668 | }; | 1669 | }; |
1669 | EXPORT_SYMBOL_GPL(mce_chrdev_ops); | 1670 | EXPORT_SYMBOL_GPL(mce_chrdev_ops); |
1670 | 1671 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 39aaee5c1ab2..80c482382d5c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
131 | u32 low = 0, high = 0, address = 0; | 131 | u32 low = 0, high = 0, address = 0; |
132 | unsigned int bank, block; | 132 | unsigned int bank, block; |
133 | struct thresh_restart tr; | 133 | struct thresh_restart tr; |
134 | u8 lvt_off; | 134 | int lvt_off = -1; |
135 | u8 offset; | ||
135 | 136 | ||
136 | for (bank = 0; bank < NR_BANKS; ++bank) { | 137 | for (bank = 0; bank < NR_BANKS; ++bank) { |
137 | for (block = 0; block < NR_BLOCKS; ++block) { | 138 | for (block = 0; block < NR_BLOCKS; ++block) { |
@@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
162 | if (shared_bank[bank] && c->cpu_core_id) | 163 | if (shared_bank[bank] && c->cpu_core_id) |
163 | break; | 164 | break; |
164 | #endif | 165 | #endif |
165 | lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, | 166 | offset = (high & MASK_LVTOFF_HI) >> 20; |
166 | APIC_EILVT_MSG_FIX, 0); | 167 | if (lvt_off < 0) { |
168 | if (setup_APIC_eilvt(offset, | ||
169 | THRESHOLD_APIC_VECTOR, | ||
170 | APIC_EILVT_MSG_FIX, 0)) { | ||
171 | pr_err(FW_BUG "cpu %d, failed to " | ||
172 | "setup threshold interrupt " | ||
173 | "for bank %d, block %d " | ||
174 | "(MSR%08X=0x%x%08x)", | ||
175 | smp_processor_id(), bank, block, | ||
176 | address, high, low); | ||
177 | continue; | ||
178 | } | ||
179 | lvt_off = offset; | ||
180 | } else if (lvt_off != offset) { | ||
181 | pr_err(FW_BUG "cpu %d, invalid threshold " | ||
182 | "interrupt offset %d for bank %d," | ||
183 | "block %d (MSR%08X=0x%x%08x)", | ||
184 | smp_processor_id(), lvt_off, bank, | ||
185 | block, address, high, low); | ||
186 | continue; | ||
187 | } | ||
167 | 188 | ||
168 | high &= ~MASK_LVTOFF_HI; | 189 | high &= ~MASK_LVTOFF_HI; |
169 | high |= lvt_off << 20; | 190 | high |= lvt_off << 20; |
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 169d8804a9f8..4b683267eca5 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void) | |||
350 | 350 | ||
351 | static void unexpected_thermal_interrupt(void) | 351 | static void unexpected_thermal_interrupt(void) |
352 | { | 352 | { |
353 | printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", | 353 | printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", |
354 | smp_processor_id()); | 354 | smp_processor_id()); |
355 | add_taint(TAINT_MACHINE_CHECK); | 355 | add_taint(TAINT_MACHINE_CHECK); |
356 | } | 356 | } |
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index c5f59d071425..ac140c7be396 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c | |||
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void) | |||
827 | 827 | ||
828 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) | 828 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) |
829 | return 0; | 829 | return 0; |
830 | if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) | 830 | if (boot_cpu_data.x86 < 0xf) |
831 | return 0; | 831 | return 0; |
832 | /* In case some hypervisor doesn't pass SYSCFG through: */ | 832 | /* In case some hypervisor doesn't pass SYSCFG through: */ |
833 | if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) | 833 | if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) |
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d28d7d03885..9f27228ceffd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void) | |||
64 | } | 64 | } |
65 | } | 65 | } |
66 | 66 | ||
67 | /* Get the size of contiguous MTRR range */ | ||
68 | static u64 get_mtrr_size(u64 mask) | ||
69 | { | ||
70 | u64 size; | ||
71 | |||
72 | mask >>= PAGE_SHIFT; | ||
73 | mask |= size_or_mask; | ||
74 | size = -mask; | ||
75 | size <<= PAGE_SHIFT; | ||
76 | return size; | ||
77 | } | ||
78 | |||
67 | /* | 79 | /* |
68 | * Returns the effective MTRR type for the region | 80 | * Check and return the effective type for MTRR-MTRR type overlap. |
69 | * Error returns: | 81 | * Returns 1 if the effective type is UNCACHEABLE, else returns 0 |
70 | * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR | ||
71 | * - 0xFF - when MTRR is not enabled | ||
72 | */ | 82 | */ |
73 | u8 mtrr_type_lookup(u64 start, u64 end) | 83 | static int check_type_overlap(u8 *prev, u8 *curr) |
84 | { | ||
85 | if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { | ||
86 | *prev = MTRR_TYPE_UNCACHABLE; | ||
87 | *curr = MTRR_TYPE_UNCACHABLE; | ||
88 | return 1; | ||
89 | } | ||
90 | |||
91 | if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || | ||
92 | (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { | ||
93 | *prev = MTRR_TYPE_WRTHROUGH; | ||
94 | *curr = MTRR_TYPE_WRTHROUGH; | ||
95 | } | ||
96 | |||
97 | if (*prev != *curr) { | ||
98 | *prev = MTRR_TYPE_UNCACHABLE; | ||
99 | *curr = MTRR_TYPE_UNCACHABLE; | ||
100 | return 1; | ||
101 | } | ||
102 | |||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | /* | ||
107 | * Error/Semi-error returns: | ||
108 | * 0xFF - when MTRR is not enabled | ||
109 | * *repeat == 1 implies [start:end] spanned across MTRR range and type returned | ||
110 | * corresponds only to [start:*partial_end]. | ||
111 | * Caller has to lookup again for [*partial_end:end]. | ||
112 | */ | ||
113 | static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) | ||
74 | { | 114 | { |
75 | int i; | 115 | int i; |
76 | u64 base, mask; | 116 | u64 base, mask; |
77 | u8 prev_match, curr_match; | 117 | u8 prev_match, curr_match; |
78 | 118 | ||
119 | *repeat = 0; | ||
79 | if (!mtrr_state_set) | 120 | if (!mtrr_state_set) |
80 | return 0xFF; | 121 | return 0xFF; |
81 | 122 | ||
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
126 | 167 | ||
127 | start_state = ((start & mask) == (base & mask)); | 168 | start_state = ((start & mask) == (base & mask)); |
128 | end_state = ((end & mask) == (base & mask)); | 169 | end_state = ((end & mask) == (base & mask)); |
129 | if (start_state != end_state) | 170 | |
130 | return 0xFE; | 171 | if (start_state != end_state) { |
172 | /* | ||
173 | * We have start:end spanning across an MTRR. | ||
174 | * We split the region into | ||
175 | * either | ||
176 | * (start:mtrr_end) (mtrr_end:end) | ||
177 | * or | ||
178 | * (start:mtrr_start) (mtrr_start:end) | ||
179 | * depending on kind of overlap. | ||
180 | * Return the type for first region and a pointer to | ||
181 | * the start of second region so that caller will | ||
182 | * lookup again on the second region. | ||
183 | * Note: This way we handle multiple overlaps as well. | ||
184 | */ | ||
185 | if (start_state) | ||
186 | *partial_end = base + get_mtrr_size(mask); | ||
187 | else | ||
188 | *partial_end = base; | ||
189 | |||
190 | if (unlikely(*partial_end <= start)) { | ||
191 | WARN_ON(1); | ||
192 | *partial_end = start + PAGE_SIZE; | ||
193 | } | ||
194 | |||
195 | end = *partial_end - 1; /* end is inclusive */ | ||
196 | *repeat = 1; | ||
197 | } | ||
131 | 198 | ||
132 | if ((start & mask) != (base & mask)) | 199 | if ((start & mask) != (base & mask)) |
133 | continue; | 200 | continue; |
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
138 | continue; | 205 | continue; |
139 | } | 206 | } |
140 | 207 | ||
141 | if (prev_match == MTRR_TYPE_UNCACHABLE || | 208 | if (check_type_overlap(&prev_match, &curr_match)) |
142 | curr_match == MTRR_TYPE_UNCACHABLE) { | 209 | return curr_match; |
143 | return MTRR_TYPE_UNCACHABLE; | ||
144 | } | ||
145 | |||
146 | if ((prev_match == MTRR_TYPE_WRBACK && | ||
147 | curr_match == MTRR_TYPE_WRTHROUGH) || | ||
148 | (prev_match == MTRR_TYPE_WRTHROUGH && | ||
149 | curr_match == MTRR_TYPE_WRBACK)) { | ||
150 | prev_match = MTRR_TYPE_WRTHROUGH; | ||
151 | curr_match = MTRR_TYPE_WRTHROUGH; | ||
152 | } | ||
153 | |||
154 | if (prev_match != curr_match) | ||
155 | return MTRR_TYPE_UNCACHABLE; | ||
156 | } | 210 | } |
157 | 211 | ||
158 | if (mtrr_tom2) { | 212 | if (mtrr_tom2) { |
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
166 | return mtrr_state.def_type; | 220 | return mtrr_state.def_type; |
167 | } | 221 | } |
168 | 222 | ||
223 | /* | ||
224 | * Returns the effective MTRR type for the region | ||
225 | * Error return: | ||
226 | * 0xFF - when MTRR is not enabled | ||
227 | */ | ||
228 | u8 mtrr_type_lookup(u64 start, u64 end) | ||
229 | { | ||
230 | u8 type, prev_type; | ||
231 | int repeat; | ||
232 | u64 partial_end; | ||
233 | |||
234 | type = __mtrr_type_lookup(start, end, &partial_end, &repeat); | ||
235 | |||
236 | /* | ||
237 | * Common path is with repeat = 0. | ||
238 | * However, we can have cases where [start:end] spans across some | ||
239 | * MTRR range. Do repeated lookups for that case here. | ||
240 | */ | ||
241 | while (repeat) { | ||
242 | prev_type = type; | ||
243 | start = partial_end; | ||
244 | type = __mtrr_type_lookup(start, end, &partial_end, &repeat); | ||
245 | |||
246 | if (check_type_overlap(&prev_type, &type)) | ||
247 | return type; | ||
248 | } | ||
249 | |||
250 | return type; | ||
251 | } | ||
252 | |||
169 | /* Get the MSR pair relating to a var range */ | 253 | /* Get the MSR pair relating to a var range */ |
170 | static void | 254 | static void |
171 | get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) | 255 | get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) |
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 03a5b0385ad6..ed6310183efb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -49,7 +49,6 @@ static unsigned long | |||
49 | copy_from_user_nmi(void *to, const void __user *from, unsigned long n) | 49 | copy_from_user_nmi(void *to, const void __user *from, unsigned long n) |
50 | { | 50 | { |
51 | unsigned long offset, addr = (unsigned long)from; | 51 | unsigned long offset, addr = (unsigned long)from; |
52 | int type = in_nmi() ? KM_NMI : KM_IRQ0; | ||
53 | unsigned long size, len = 0; | 52 | unsigned long size, len = 0; |
54 | struct page *page; | 53 | struct page *page; |
55 | void *map; | 54 | void *map; |
@@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) | |||
63 | offset = addr & (PAGE_SIZE - 1); | 62 | offset = addr & (PAGE_SIZE - 1); |
64 | size = min(PAGE_SIZE - offset, n - len); | 63 | size = min(PAGE_SIZE - offset, n - len); |
65 | 64 | ||
66 | map = kmap_atomic(page, type); | 65 | map = kmap_atomic(page); |
67 | memcpy(to, map+offset, size); | 66 | memcpy(to, map+offset, size); |
68 | kunmap_atomic(map, type); | 67 | kunmap_atomic(map); |
69 | put_page(page); | 68 | put_page(page); |
70 | 69 | ||
71 | len += size; | 70 | len += size; |
@@ -238,6 +237,7 @@ struct x86_pmu { | |||
238 | * Intel DebugStore bits | 237 | * Intel DebugStore bits |
239 | */ | 238 | */ |
240 | int bts, pebs; | 239 | int bts, pebs; |
240 | int bts_active, pebs_active; | ||
241 | int pebs_record_size; | 241 | int pebs_record_size; |
242 | void (*drain_pebs)(struct pt_regs *regs); | 242 | void (*drain_pebs)(struct pt_regs *regs); |
243 | struct event_constraint *pebs_constraints; | 243 | struct event_constraint *pebs_constraints; |
@@ -381,7 +381,7 @@ static void release_pmc_hardware(void) {} | |||
381 | 381 | ||
382 | #endif | 382 | #endif |
383 | 383 | ||
384 | static int reserve_ds_buffers(void); | 384 | static void reserve_ds_buffers(void); |
385 | static void release_ds_buffers(void); | 385 | static void release_ds_buffers(void); |
386 | 386 | ||
387 | static void hw_perf_event_destroy(struct perf_event *event) | 387 | static void hw_perf_event_destroy(struct perf_event *event) |
@@ -478,7 +478,7 @@ static int x86_setup_perfctr(struct perf_event *event) | |||
478 | if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && | 478 | if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && |
479 | (hwc->sample_period == 1)) { | 479 | (hwc->sample_period == 1)) { |
480 | /* BTS is not supported by this architecture. */ | 480 | /* BTS is not supported by this architecture. */ |
481 | if (!x86_pmu.bts) | 481 | if (!x86_pmu.bts_active) |
482 | return -EOPNOTSUPP; | 482 | return -EOPNOTSUPP; |
483 | 483 | ||
484 | /* BTS is currently only allowed for user-mode. */ | 484 | /* BTS is currently only allowed for user-mode. */ |
@@ -497,12 +497,13 @@ static int x86_pmu_hw_config(struct perf_event *event) | |||
497 | int precise = 0; | 497 | int precise = 0; |
498 | 498 | ||
499 | /* Support for constant skid */ | 499 | /* Support for constant skid */ |
500 | if (x86_pmu.pebs) | 500 | if (x86_pmu.pebs_active) { |
501 | precise++; | 501 | precise++; |
502 | 502 | ||
503 | /* Support for IP fixup */ | 503 | /* Support for IP fixup */ |
504 | if (x86_pmu.lbr_nr) | 504 | if (x86_pmu.lbr_nr) |
505 | precise++; | 505 | precise++; |
506 | } | ||
506 | 507 | ||
507 | if (event->attr.precise_ip > precise) | 508 | if (event->attr.precise_ip > precise) |
508 | return -EOPNOTSUPP; | 509 | return -EOPNOTSUPP; |
@@ -531,7 +532,7 @@ static int x86_pmu_hw_config(struct perf_event *event) | |||
531 | /* | 532 | /* |
532 | * Setup the hardware configuration for a given attr_type | 533 | * Setup the hardware configuration for a given attr_type |
533 | */ | 534 | */ |
534 | static int __hw_perf_event_init(struct perf_event *event) | 535 | static int __x86_pmu_event_init(struct perf_event *event) |
535 | { | 536 | { |
536 | int err; | 537 | int err; |
537 | 538 | ||
@@ -544,11 +545,8 @@ static int __hw_perf_event_init(struct perf_event *event) | |||
544 | if (atomic_read(&active_events) == 0) { | 545 | if (atomic_read(&active_events) == 0) { |
545 | if (!reserve_pmc_hardware()) | 546 | if (!reserve_pmc_hardware()) |
546 | err = -EBUSY; | 547 | err = -EBUSY; |
547 | else { | 548 | else |
548 | err = reserve_ds_buffers(); | 549 | reserve_ds_buffers(); |
549 | if (err) | ||
550 | release_pmc_hardware(); | ||
551 | } | ||
552 | } | 550 | } |
553 | if (!err) | 551 | if (!err) |
554 | atomic_inc(&active_events); | 552 | atomic_inc(&active_events); |
@@ -584,7 +582,7 @@ static void x86_pmu_disable_all(void) | |||
584 | } | 582 | } |
585 | } | 583 | } |
586 | 584 | ||
587 | void hw_perf_disable(void) | 585 | static void x86_pmu_disable(struct pmu *pmu) |
588 | { | 586 | { |
589 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 587 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
590 | 588 | ||
@@ -619,7 +617,7 @@ static void x86_pmu_enable_all(int added) | |||
619 | } | 617 | } |
620 | } | 618 | } |
621 | 619 | ||
622 | static const struct pmu pmu; | 620 | static struct pmu pmu; |
623 | 621 | ||
624 | static inline int is_x86_event(struct perf_event *event) | 622 | static inline int is_x86_event(struct perf_event *event) |
625 | { | 623 | { |
@@ -801,10 +799,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, | |||
801 | hwc->last_tag == cpuc->tags[i]; | 799 | hwc->last_tag == cpuc->tags[i]; |
802 | } | 800 | } |
803 | 801 | ||
804 | static int x86_pmu_start(struct perf_event *event); | 802 | static void x86_pmu_start(struct perf_event *event, int flags); |
805 | static void x86_pmu_stop(struct perf_event *event); | 803 | static void x86_pmu_stop(struct perf_event *event, int flags); |
806 | 804 | ||
807 | void hw_perf_enable(void) | 805 | static void x86_pmu_enable(struct pmu *pmu) |
808 | { | 806 | { |
809 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 807 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
810 | struct perf_event *event; | 808 | struct perf_event *event; |
@@ -840,7 +838,14 @@ void hw_perf_enable(void) | |||
840 | match_prev_assignment(hwc, cpuc, i)) | 838 | match_prev_assignment(hwc, cpuc, i)) |
841 | continue; | 839 | continue; |
842 | 840 | ||
843 | x86_pmu_stop(event); | 841 | /* |
842 | * Ensure we don't accidentally enable a stopped | ||
843 | * counter simply because we rescheduled. | ||
844 | */ | ||
845 | if (hwc->state & PERF_HES_STOPPED) | ||
846 | hwc->state |= PERF_HES_ARCH; | ||
847 | |||
848 | x86_pmu_stop(event, PERF_EF_UPDATE); | ||
844 | } | 849 | } |
845 | 850 | ||
846 | for (i = 0; i < cpuc->n_events; i++) { | 851 | for (i = 0; i < cpuc->n_events; i++) { |
@@ -852,7 +857,10 @@ void hw_perf_enable(void) | |||
852 | else if (i < n_running) | 857 | else if (i < n_running) |
853 | continue; | 858 | continue; |
854 | 859 | ||
855 | x86_pmu_start(event); | 860 | if (hwc->state & PERF_HES_ARCH) |
861 | continue; | ||
862 | |||
863 | x86_pmu_start(event, PERF_EF_RELOAD); | ||
856 | } | 864 | } |
857 | cpuc->n_added = 0; | 865 | cpuc->n_added = 0; |
858 | perf_events_lapic_init(); | 866 | perf_events_lapic_init(); |
@@ -953,15 +961,12 @@ static void x86_pmu_enable_event(struct perf_event *event) | |||
953 | } | 961 | } |
954 | 962 | ||
955 | /* | 963 | /* |
956 | * activate a single event | 964 | * Add a single event to the PMU. |
957 | * | 965 | * |
958 | * The event is added to the group of enabled events | 966 | * The event is added to the group of enabled events |
959 | * but only if it can be scehduled with existing events. | 967 | * but only if it can be scehduled with existing events. |
960 | * | ||
961 | * Called with PMU disabled. If successful and return value 1, | ||
962 | * then guaranteed to call perf_enable() and hw_perf_enable() | ||
963 | */ | 968 | */ |
964 | static int x86_pmu_enable(struct perf_event *event) | 969 | static int x86_pmu_add(struct perf_event *event, int flags) |
965 | { | 970 | { |
966 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 971 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
967 | struct hw_perf_event *hwc; | 972 | struct hw_perf_event *hwc; |
@@ -970,58 +975,67 @@ static int x86_pmu_enable(struct perf_event *event) | |||
970 | 975 | ||
971 | hwc = &event->hw; | 976 | hwc = &event->hw; |
972 | 977 | ||
978 | perf_pmu_disable(event->pmu); | ||
973 | n0 = cpuc->n_events; | 979 | n0 = cpuc->n_events; |
974 | n = collect_events(cpuc, event, false); | 980 | ret = n = collect_events(cpuc, event, false); |
975 | if (n < 0) | 981 | if (ret < 0) |
976 | return n; | 982 | goto out; |
983 | |||
984 | hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; | ||
985 | if (!(flags & PERF_EF_START)) | ||
986 | hwc->state |= PERF_HES_ARCH; | ||
977 | 987 | ||
978 | /* | 988 | /* |
979 | * If group events scheduling transaction was started, | 989 | * If group events scheduling transaction was started, |
980 | * skip the schedulability test here, it will be peformed | 990 | * skip the schedulability test here, it will be peformed |
981 | * at commit time(->commit_txn) as a whole | 991 | * at commit time (->commit_txn) as a whole |
982 | */ | 992 | */ |
983 | if (cpuc->group_flag & PERF_EVENT_TXN) | 993 | if (cpuc->group_flag & PERF_EVENT_TXN) |
984 | goto out; | 994 | goto done_collect; |
985 | 995 | ||
986 | ret = x86_pmu.schedule_events(cpuc, n, assign); | 996 | ret = x86_pmu.schedule_events(cpuc, n, assign); |
987 | if (ret) | 997 | if (ret) |
988 | return ret; | 998 | goto out; |
989 | /* | 999 | /* |
990 | * copy new assignment, now we know it is possible | 1000 | * copy new assignment, now we know it is possible |
991 | * will be used by hw_perf_enable() | 1001 | * will be used by hw_perf_enable() |
992 | */ | 1002 | */ |
993 | memcpy(cpuc->assign, assign, n*sizeof(int)); | 1003 | memcpy(cpuc->assign, assign, n*sizeof(int)); |
994 | 1004 | ||
995 | out: | 1005 | done_collect: |
996 | cpuc->n_events = n; | 1006 | cpuc->n_events = n; |
997 | cpuc->n_added += n - n0; | 1007 | cpuc->n_added += n - n0; |
998 | cpuc->n_txn += n - n0; | 1008 | cpuc->n_txn += n - n0; |
999 | 1009 | ||
1000 | return 0; | 1010 | ret = 0; |
1011 | out: | ||
1012 | perf_pmu_enable(event->pmu); | ||
1013 | return ret; | ||
1001 | } | 1014 | } |
1002 | 1015 | ||
1003 | static int x86_pmu_start(struct perf_event *event) | 1016 | static void x86_pmu_start(struct perf_event *event, int flags) |
1004 | { | 1017 | { |
1005 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1018 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1006 | int idx = event->hw.idx; | 1019 | int idx = event->hw.idx; |
1007 | 1020 | ||
1008 | if (idx == -1) | 1021 | if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) |
1009 | return -EAGAIN; | 1022 | return; |
1023 | |||
1024 | if (WARN_ON_ONCE(idx == -1)) | ||
1025 | return; | ||
1026 | |||
1027 | if (flags & PERF_EF_RELOAD) { | ||
1028 | WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); | ||
1029 | x86_perf_event_set_period(event); | ||
1030 | } | ||
1031 | |||
1032 | event->hw.state = 0; | ||
1010 | 1033 | ||
1011 | x86_perf_event_set_period(event); | ||
1012 | cpuc->events[idx] = event; | 1034 | cpuc->events[idx] = event; |
1013 | __set_bit(idx, cpuc->active_mask); | 1035 | __set_bit(idx, cpuc->active_mask); |
1014 | __set_bit(idx, cpuc->running); | 1036 | __set_bit(idx, cpuc->running); |
1015 | x86_pmu.enable(event); | 1037 | x86_pmu.enable(event); |
1016 | perf_event_update_userpage(event); | 1038 | perf_event_update_userpage(event); |
1017 | |||
1018 | return 0; | ||
1019 | } | ||
1020 | |||
1021 | static void x86_pmu_unthrottle(struct perf_event *event) | ||
1022 | { | ||
1023 | int ret = x86_pmu_start(event); | ||
1024 | WARN_ON_ONCE(ret); | ||
1025 | } | 1039 | } |
1026 | 1040 | ||
1027 | void perf_event_print_debug(void) | 1041 | void perf_event_print_debug(void) |
@@ -1078,27 +1092,29 @@ void perf_event_print_debug(void) | |||
1078 | local_irq_restore(flags); | 1092 | local_irq_restore(flags); |
1079 | } | 1093 | } |
1080 | 1094 | ||
1081 | static void x86_pmu_stop(struct perf_event *event) | 1095 | static void x86_pmu_stop(struct perf_event *event, int flags) |
1082 | { | 1096 | { |
1083 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1097 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1084 | struct hw_perf_event *hwc = &event->hw; | 1098 | struct hw_perf_event *hwc = &event->hw; |
1085 | int idx = hwc->idx; | ||
1086 | |||
1087 | if (!__test_and_clear_bit(idx, cpuc->active_mask)) | ||
1088 | return; | ||
1089 | |||
1090 | x86_pmu.disable(event); | ||
1091 | 1099 | ||
1092 | /* | 1100 | if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { |
1093 | * Drain the remaining delta count out of a event | 1101 | x86_pmu.disable(event); |
1094 | * that we are disabling: | 1102 | cpuc->events[hwc->idx] = NULL; |
1095 | */ | 1103 | WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); |
1096 | x86_perf_event_update(event); | 1104 | hwc->state |= PERF_HES_STOPPED; |
1105 | } | ||
1097 | 1106 | ||
1098 | cpuc->events[idx] = NULL; | 1107 | if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { |
1108 | /* | ||
1109 | * Drain the remaining delta count out of a event | ||
1110 | * that we are disabling: | ||
1111 | */ | ||
1112 | x86_perf_event_update(event); | ||
1113 | hwc->state |= PERF_HES_UPTODATE; | ||
1114 | } | ||
1099 | } | 1115 | } |
1100 | 1116 | ||
1101 | static void x86_pmu_disable(struct perf_event *event) | 1117 | static void x86_pmu_del(struct perf_event *event, int flags) |
1102 | { | 1118 | { |
1103 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1119 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1104 | int i; | 1120 | int i; |
@@ -1111,7 +1127,7 @@ static void x86_pmu_disable(struct perf_event *event) | |||
1111 | if (cpuc->group_flag & PERF_EVENT_TXN) | 1127 | if (cpuc->group_flag & PERF_EVENT_TXN) |
1112 | return; | 1128 | return; |
1113 | 1129 | ||
1114 | x86_pmu_stop(event); | 1130 | x86_pmu_stop(event, PERF_EF_UPDATE); |
1115 | 1131 | ||
1116 | for (i = 0; i < cpuc->n_events; i++) { | 1132 | for (i = 0; i < cpuc->n_events; i++) { |
1117 | if (event == cpuc->event_list[i]) { | 1133 | if (event == cpuc->event_list[i]) { |
@@ -1134,7 +1150,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1134 | struct perf_sample_data data; | 1150 | struct perf_sample_data data; |
1135 | struct cpu_hw_events *cpuc; | 1151 | struct cpu_hw_events *cpuc; |
1136 | struct perf_event *event; | 1152 | struct perf_event *event; |
1137 | struct hw_perf_event *hwc; | ||
1138 | int idx, handled = 0; | 1153 | int idx, handled = 0; |
1139 | u64 val; | 1154 | u64 val; |
1140 | 1155 | ||
@@ -1155,7 +1170,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1155 | } | 1170 | } |
1156 | 1171 | ||
1157 | event = cpuc->events[idx]; | 1172 | event = cpuc->events[idx]; |
1158 | hwc = &event->hw; | ||
1159 | 1173 | ||
1160 | val = x86_perf_event_update(event); | 1174 | val = x86_perf_event_update(event); |
1161 | if (val & (1ULL << (x86_pmu.cntval_bits - 1))) | 1175 | if (val & (1ULL << (x86_pmu.cntval_bits - 1))) |
@@ -1171,7 +1185,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1171 | continue; | 1185 | continue; |
1172 | 1186 | ||
1173 | if (perf_event_overflow(event, 1, &data, regs)) | 1187 | if (perf_event_overflow(event, 1, &data, regs)) |
1174 | x86_pmu_stop(event); | 1188 | x86_pmu_stop(event, 0); |
1175 | } | 1189 | } |
1176 | 1190 | ||
1177 | if (handled) | 1191 | if (handled) |
@@ -1180,25 +1194,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) | |||
1180 | return handled; | 1194 | return handled; |
1181 | } | 1195 | } |
1182 | 1196 | ||
1183 | void smp_perf_pending_interrupt(struct pt_regs *regs) | ||
1184 | { | ||
1185 | irq_enter(); | ||
1186 | ack_APIC_irq(); | ||
1187 | inc_irq_stat(apic_pending_irqs); | ||
1188 | perf_event_do_pending(); | ||
1189 | irq_exit(); | ||
1190 | } | ||
1191 | |||
1192 | void set_perf_event_pending(void) | ||
1193 | { | ||
1194 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1195 | if (!x86_pmu.apic || !x86_pmu_initialized()) | ||
1196 | return; | ||
1197 | |||
1198 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); | ||
1199 | #endif | ||
1200 | } | ||
1201 | |||
1202 | void perf_events_lapic_init(void) | 1197 | void perf_events_lapic_init(void) |
1203 | { | 1198 | { |
1204 | if (!x86_pmu.apic || !x86_pmu_initialized()) | 1199 | if (!x86_pmu.apic || !x86_pmu_initialized()) |
@@ -1388,7 +1383,6 @@ void __init init_hw_perf_events(void) | |||
1388 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; | 1383 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; |
1389 | } | 1384 | } |
1390 | x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; | 1385 | x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; |
1391 | perf_max_events = x86_pmu.num_counters; | ||
1392 | 1386 | ||
1393 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { | 1387 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { |
1394 | WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", | 1388 | WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", |
@@ -1424,6 +1418,7 @@ void __init init_hw_perf_events(void) | |||
1424 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); | 1418 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); |
1425 | pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); | 1419 | pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); |
1426 | 1420 | ||
1421 | perf_pmu_register(&pmu); | ||
1427 | perf_cpu_notifier(x86_pmu_notifier); | 1422 | perf_cpu_notifier(x86_pmu_notifier); |
1428 | } | 1423 | } |
1429 | 1424 | ||
@@ -1437,10 +1432,11 @@ static inline void x86_pmu_read(struct perf_event *event) | |||
1437 | * Set the flag to make pmu::enable() not perform the | 1432 | * Set the flag to make pmu::enable() not perform the |
1438 | * schedulability test, it will be performed at commit time | 1433 | * schedulability test, it will be performed at commit time |
1439 | */ | 1434 | */ |
1440 | static void x86_pmu_start_txn(const struct pmu *pmu) | 1435 | static void x86_pmu_start_txn(struct pmu *pmu) |
1441 | { | 1436 | { |
1442 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1437 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1443 | 1438 | ||
1439 | perf_pmu_disable(pmu); | ||
1444 | cpuc->group_flag |= PERF_EVENT_TXN; | 1440 | cpuc->group_flag |= PERF_EVENT_TXN; |
1445 | cpuc->n_txn = 0; | 1441 | cpuc->n_txn = 0; |
1446 | } | 1442 | } |
@@ -1450,7 +1446,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) | |||
1450 | * Clear the flag and pmu::enable() will perform the | 1446 | * Clear the flag and pmu::enable() will perform the |
1451 | * schedulability test. | 1447 | * schedulability test. |
1452 | */ | 1448 | */ |
1453 | static void x86_pmu_cancel_txn(const struct pmu *pmu) | 1449 | static void x86_pmu_cancel_txn(struct pmu *pmu) |
1454 | { | 1450 | { |
1455 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1451 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1456 | 1452 | ||
@@ -1460,6 +1456,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) | |||
1460 | */ | 1456 | */ |
1461 | cpuc->n_added -= cpuc->n_txn; | 1457 | cpuc->n_added -= cpuc->n_txn; |
1462 | cpuc->n_events -= cpuc->n_txn; | 1458 | cpuc->n_events -= cpuc->n_txn; |
1459 | perf_pmu_enable(pmu); | ||
1463 | } | 1460 | } |
1464 | 1461 | ||
1465 | /* | 1462 | /* |
@@ -1467,7 +1464,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) | |||
1467 | * Perform the group schedulability test as a whole | 1464 | * Perform the group schedulability test as a whole |
1468 | * Return 0 if success | 1465 | * Return 0 if success |
1469 | */ | 1466 | */ |
1470 | static int x86_pmu_commit_txn(const struct pmu *pmu) | 1467 | static int x86_pmu_commit_txn(struct pmu *pmu) |
1471 | { | 1468 | { |
1472 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1469 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1473 | int assign[X86_PMC_IDX_MAX]; | 1470 | int assign[X86_PMC_IDX_MAX]; |
@@ -1489,22 +1486,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) | |||
1489 | memcpy(cpuc->assign, assign, n*sizeof(int)); | 1486 | memcpy(cpuc->assign, assign, n*sizeof(int)); |
1490 | 1487 | ||
1491 | cpuc->group_flag &= ~PERF_EVENT_TXN; | 1488 | cpuc->group_flag &= ~PERF_EVENT_TXN; |
1492 | 1489 | perf_pmu_enable(pmu); | |
1493 | return 0; | 1490 | return 0; |
1494 | } | 1491 | } |
1495 | 1492 | ||
1496 | static const struct pmu pmu = { | ||
1497 | .enable = x86_pmu_enable, | ||
1498 | .disable = x86_pmu_disable, | ||
1499 | .start = x86_pmu_start, | ||
1500 | .stop = x86_pmu_stop, | ||
1501 | .read = x86_pmu_read, | ||
1502 | .unthrottle = x86_pmu_unthrottle, | ||
1503 | .start_txn = x86_pmu_start_txn, | ||
1504 | .cancel_txn = x86_pmu_cancel_txn, | ||
1505 | .commit_txn = x86_pmu_commit_txn, | ||
1506 | }; | ||
1507 | |||
1508 | /* | 1493 | /* |
1509 | * validate that we can schedule this event | 1494 | * validate that we can schedule this event |
1510 | */ | 1495 | */ |
@@ -1579,12 +1564,22 @@ out: | |||
1579 | return ret; | 1564 | return ret; |
1580 | } | 1565 | } |
1581 | 1566 | ||
1582 | const struct pmu *hw_perf_event_init(struct perf_event *event) | 1567 | int x86_pmu_event_init(struct perf_event *event) |
1583 | { | 1568 | { |
1584 | const struct pmu *tmp; | 1569 | struct pmu *tmp; |
1585 | int err; | 1570 | int err; |
1586 | 1571 | ||
1587 | err = __hw_perf_event_init(event); | 1572 | switch (event->attr.type) { |
1573 | case PERF_TYPE_RAW: | ||
1574 | case PERF_TYPE_HARDWARE: | ||
1575 | case PERF_TYPE_HW_CACHE: | ||
1576 | break; | ||
1577 | |||
1578 | default: | ||
1579 | return -ENOENT; | ||
1580 | } | ||
1581 | |||
1582 | err = __x86_pmu_event_init(event); | ||
1588 | if (!err) { | 1583 | if (!err) { |
1589 | /* | 1584 | /* |
1590 | * we temporarily connect event to its pmu | 1585 | * we temporarily connect event to its pmu |
@@ -1604,26 +1599,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) | |||
1604 | if (err) { | 1599 | if (err) { |
1605 | if (event->destroy) | 1600 | if (event->destroy) |
1606 | event->destroy(event); | 1601 | event->destroy(event); |
1607 | return ERR_PTR(err); | ||
1608 | } | 1602 | } |
1609 | 1603 | ||
1610 | return &pmu; | 1604 | return err; |
1611 | } | 1605 | } |
1612 | 1606 | ||
1613 | /* | 1607 | static struct pmu pmu = { |
1614 | * callchain support | 1608 | .pmu_enable = x86_pmu_enable, |
1615 | */ | 1609 | .pmu_disable = x86_pmu_disable, |
1616 | 1610 | ||
1617 | static inline | 1611 | .event_init = x86_pmu_event_init, |
1618 | void callchain_store(struct perf_callchain_entry *entry, u64 ip) | ||
1619 | { | ||
1620 | if (entry->nr < PERF_MAX_STACK_DEPTH) | ||
1621 | entry->ip[entry->nr++] = ip; | ||
1622 | } | ||
1623 | 1612 | ||
1624 | static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); | 1613 | .add = x86_pmu_add, |
1625 | static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); | 1614 | .del = x86_pmu_del, |
1615 | .start = x86_pmu_start, | ||
1616 | .stop = x86_pmu_stop, | ||
1617 | .read = x86_pmu_read, | ||
1626 | 1618 | ||
1619 | .start_txn = x86_pmu_start_txn, | ||
1620 | .cancel_txn = x86_pmu_cancel_txn, | ||
1621 | .commit_txn = x86_pmu_commit_txn, | ||
1622 | }; | ||
1623 | |||
1624 | /* | ||
1625 | * callchain support | ||
1626 | */ | ||
1627 | 1627 | ||
1628 | static void | 1628 | static void |
1629 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) | 1629 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) |
@@ -1645,7 +1645,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) | |||
1645 | { | 1645 | { |
1646 | struct perf_callchain_entry *entry = data; | 1646 | struct perf_callchain_entry *entry = data; |
1647 | 1647 | ||
1648 | callchain_store(entry, addr); | 1648 | perf_callchain_store(entry, addr); |
1649 | } | 1649 | } |
1650 | 1650 | ||
1651 | static const struct stacktrace_ops backtrace_ops = { | 1651 | static const struct stacktrace_ops backtrace_ops = { |
@@ -1656,11 +1656,15 @@ static const struct stacktrace_ops backtrace_ops = { | |||
1656 | .walk_stack = print_context_stack_bp, | 1656 | .walk_stack = print_context_stack_bp, |
1657 | }; | 1657 | }; |
1658 | 1658 | ||
1659 | static void | 1659 | void |
1660 | perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1660 | perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) |
1661 | { | 1661 | { |
1662 | callchain_store(entry, PERF_CONTEXT_KERNEL); | 1662 | if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { |
1663 | callchain_store(entry, regs->ip); | 1663 | /* TODO: We don't support guest os callchain now */ |
1664 | return; | ||
1665 | } | ||
1666 | |||
1667 | perf_callchain_store(entry, regs->ip); | ||
1664 | 1668 | ||
1665 | dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); | 1669 | dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); |
1666 | } | 1670 | } |
@@ -1689,7 +1693,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) | |||
1689 | if (fp < compat_ptr(regs->sp)) | 1693 | if (fp < compat_ptr(regs->sp)) |
1690 | break; | 1694 | break; |
1691 | 1695 | ||
1692 | callchain_store(entry, frame.return_address); | 1696 | perf_callchain_store(entry, frame.return_address); |
1693 | fp = compat_ptr(frame.next_frame); | 1697 | fp = compat_ptr(frame.next_frame); |
1694 | } | 1698 | } |
1695 | return 1; | 1699 | return 1; |
@@ -1702,19 +1706,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) | |||
1702 | } | 1706 | } |
1703 | #endif | 1707 | #endif |
1704 | 1708 | ||
1705 | static void | 1709 | void |
1706 | perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) | 1710 | perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) |
1707 | { | 1711 | { |
1708 | struct stack_frame frame; | 1712 | struct stack_frame frame; |
1709 | const void __user *fp; | 1713 | const void __user *fp; |
1710 | 1714 | ||
1711 | if (!user_mode(regs)) | 1715 | if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { |
1712 | regs = task_pt_regs(current); | 1716 | /* TODO: We don't support guest os callchain now */ |
1717 | return; | ||
1718 | } | ||
1713 | 1719 | ||
1714 | fp = (void __user *)regs->bp; | 1720 | fp = (void __user *)regs->bp; |
1715 | 1721 | ||
1716 | callchain_store(entry, PERF_CONTEXT_USER); | 1722 | perf_callchain_store(entry, regs->ip); |
1717 | callchain_store(entry, regs->ip); | ||
1718 | 1723 | ||
1719 | if (perf_callchain_user32(regs, entry)) | 1724 | if (perf_callchain_user32(regs, entry)) |
1720 | return; | 1725 | return; |
@@ -1731,52 +1736,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) | |||
1731 | if ((unsigned long)fp < regs->sp) | 1736 | if ((unsigned long)fp < regs->sp) |
1732 | break; | 1737 | break; |
1733 | 1738 | ||
1734 | callchain_store(entry, frame.return_address); | 1739 | perf_callchain_store(entry, frame.return_address); |
1735 | fp = frame.next_frame; | 1740 | fp = frame.next_frame; |
1736 | } | 1741 | } |
1737 | } | 1742 | } |
1738 | 1743 | ||
1739 | static void | ||
1740 | perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) | ||
1741 | { | ||
1742 | int is_user; | ||
1743 | |||
1744 | if (!regs) | ||
1745 | return; | ||
1746 | |||
1747 | is_user = user_mode(regs); | ||
1748 | |||
1749 | if (is_user && current->state != TASK_RUNNING) | ||
1750 | return; | ||
1751 | |||
1752 | if (!is_user) | ||
1753 | perf_callchain_kernel(regs, entry); | ||
1754 | |||
1755 | if (current->mm) | ||
1756 | perf_callchain_user(regs, entry); | ||
1757 | } | ||
1758 | |||
1759 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
1760 | { | ||
1761 | struct perf_callchain_entry *entry; | ||
1762 | |||
1763 | if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { | ||
1764 | /* TODO: We don't support guest os callchain now */ | ||
1765 | return NULL; | ||
1766 | } | ||
1767 | |||
1768 | if (in_nmi()) | ||
1769 | entry = &__get_cpu_var(pmc_nmi_entry); | ||
1770 | else | ||
1771 | entry = &__get_cpu_var(pmc_irq_entry); | ||
1772 | |||
1773 | entry->nr = 0; | ||
1774 | |||
1775 | perf_do_callchain(regs, entry); | ||
1776 | |||
1777 | return entry; | ||
1778 | } | ||
1779 | |||
1780 | unsigned long perf_instruction_pointer(struct pt_regs *regs) | 1744 | unsigned long perf_instruction_pointer(struct pt_regs *regs) |
1781 | { | 1745 | { |
1782 | unsigned long ip; | 1746 | unsigned long ip; |
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3b..46d58448c3af 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids | |||
52 | [ C(DTLB) ] = { | 52 | [ C(DTLB) ] = { |
53 | [ C(OP_READ) ] = { | 53 | [ C(OP_READ) ] = { |
54 | [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ | 54 | [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ |
55 | [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ | 55 | [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ |
56 | }, | 56 | }, |
57 | [ C(OP_WRITE) ] = { | 57 | [ C(OP_WRITE) ] = { |
58 | [ C(RESULT_ACCESS) ] = 0, | 58 | [ C(RESULT_ACCESS) ] = 0, |
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids | |||
66 | [ C(ITLB) ] = { | 66 | [ C(ITLB) ] = { |
67 | [ C(OP_READ) ] = { | 67 | [ C(OP_READ) ] = { |
68 | [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ | 68 | [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ |
69 | [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ | 69 | [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ |
70 | }, | 70 | }, |
71 | [ C(OP_WRITE) ] = { | 71 | [ C(OP_WRITE) ] = { |
72 | [ C(RESULT_ACCESS) ] = -1, | 72 | [ C(RESULT_ACCESS) ] = -1, |
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ee05c90012d2..c8f5c088cad1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
713 | struct cpu_hw_events *cpuc; | 713 | struct cpu_hw_events *cpuc; |
714 | int bit, loops; | 714 | int bit, loops; |
715 | u64 status; | 715 | u64 status; |
716 | int handled = 0; | 716 | int handled; |
717 | 717 | ||
718 | perf_sample_data_init(&data, 0); | 718 | perf_sample_data_init(&data, 0); |
719 | 719 | ||
720 | cpuc = &__get_cpu_var(cpu_hw_events); | 720 | cpuc = &__get_cpu_var(cpu_hw_events); |
721 | 721 | ||
722 | intel_pmu_disable_all(); | 722 | intel_pmu_disable_all(); |
723 | intel_pmu_drain_bts_buffer(); | 723 | handled = intel_pmu_drain_bts_buffer(); |
724 | status = intel_pmu_get_status(); | 724 | status = intel_pmu_get_status(); |
725 | if (!status) { | 725 | if (!status) { |
726 | intel_pmu_enable_all(0); | 726 | intel_pmu_enable_all(0); |
727 | return 0; | 727 | return handled; |
728 | } | 728 | } |
729 | 729 | ||
730 | loops = 0; | 730 | loops = 0; |
@@ -763,7 +763,7 @@ again: | |||
763 | data.period = event->hw.last_period; | 763 | data.period = event->hw.last_period; |
764 | 764 | ||
765 | if (perf_event_overflow(event, 1, &data, regs)) | 765 | if (perf_event_overflow(event, 1, &data, regs)) |
766 | x86_pmu_stop(event); | 766 | x86_pmu_stop(event, 0); |
767 | } | 767 | } |
768 | 768 | ||
769 | /* | 769 | /* |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 18018d1311cd..b7dcd9f2b8a0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c | |||
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu) | |||
74 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); | 74 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); |
75 | } | 75 | } |
76 | 76 | ||
77 | static int alloc_pebs_buffer(int cpu) | ||
78 | { | ||
79 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
80 | int node = cpu_to_node(cpu); | ||
81 | int max, thresh = 1; /* always use a single PEBS record */ | ||
82 | void *buffer; | ||
83 | |||
84 | if (!x86_pmu.pebs) | ||
85 | return 0; | ||
86 | |||
87 | buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); | ||
88 | if (unlikely(!buffer)) | ||
89 | return -ENOMEM; | ||
90 | |||
91 | max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; | ||
92 | |||
93 | ds->pebs_buffer_base = (u64)(unsigned long)buffer; | ||
94 | ds->pebs_index = ds->pebs_buffer_base; | ||
95 | ds->pebs_absolute_maximum = ds->pebs_buffer_base + | ||
96 | max * x86_pmu.pebs_record_size; | ||
97 | |||
98 | ds->pebs_interrupt_threshold = ds->pebs_buffer_base + | ||
99 | thresh * x86_pmu.pebs_record_size; | ||
100 | |||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void release_pebs_buffer(int cpu) | ||
105 | { | ||
106 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
107 | |||
108 | if (!ds || !x86_pmu.pebs) | ||
109 | return; | ||
110 | |||
111 | kfree((void *)(unsigned long)ds->pebs_buffer_base); | ||
112 | ds->pebs_buffer_base = 0; | ||
113 | } | ||
114 | |||
115 | static int alloc_bts_buffer(int cpu) | ||
116 | { | ||
117 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
118 | int node = cpu_to_node(cpu); | ||
119 | int max, thresh; | ||
120 | void *buffer; | ||
121 | |||
122 | if (!x86_pmu.bts) | ||
123 | return 0; | ||
124 | |||
125 | buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); | ||
126 | if (unlikely(!buffer)) | ||
127 | return -ENOMEM; | ||
128 | |||
129 | max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; | ||
130 | thresh = max / 16; | ||
131 | |||
132 | ds->bts_buffer_base = (u64)(unsigned long)buffer; | ||
133 | ds->bts_index = ds->bts_buffer_base; | ||
134 | ds->bts_absolute_maximum = ds->bts_buffer_base + | ||
135 | max * BTS_RECORD_SIZE; | ||
136 | ds->bts_interrupt_threshold = ds->bts_absolute_maximum - | ||
137 | thresh * BTS_RECORD_SIZE; | ||
138 | |||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | static void release_bts_buffer(int cpu) | ||
143 | { | ||
144 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
145 | |||
146 | if (!ds || !x86_pmu.bts) | ||
147 | return; | ||
148 | |||
149 | kfree((void *)(unsigned long)ds->bts_buffer_base); | ||
150 | ds->bts_buffer_base = 0; | ||
151 | } | ||
152 | |||
153 | static int alloc_ds_buffer(int cpu) | ||
154 | { | ||
155 | int node = cpu_to_node(cpu); | ||
156 | struct debug_store *ds; | ||
157 | |||
158 | ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); | ||
159 | if (unlikely(!ds)) | ||
160 | return -ENOMEM; | ||
161 | |||
162 | per_cpu(cpu_hw_events, cpu).ds = ds; | ||
163 | |||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | static void release_ds_buffer(int cpu) | ||
168 | { | ||
169 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
170 | |||
171 | if (!ds) | ||
172 | return; | ||
173 | |||
174 | per_cpu(cpu_hw_events, cpu).ds = NULL; | ||
175 | kfree(ds); | ||
176 | } | ||
177 | |||
77 | static void release_ds_buffers(void) | 178 | static void release_ds_buffers(void) |
78 | { | 179 | { |
79 | int cpu; | 180 | int cpu; |
@@ -82,93 +183,77 @@ static void release_ds_buffers(void) | |||
82 | return; | 183 | return; |
83 | 184 | ||
84 | get_online_cpus(); | 185 | get_online_cpus(); |
85 | |||
86 | for_each_online_cpu(cpu) | 186 | for_each_online_cpu(cpu) |
87 | fini_debug_store_on_cpu(cpu); | 187 | fini_debug_store_on_cpu(cpu); |
88 | 188 | ||
89 | for_each_possible_cpu(cpu) { | 189 | for_each_possible_cpu(cpu) { |
90 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | 190 | release_pebs_buffer(cpu); |
91 | 191 | release_bts_buffer(cpu); | |
92 | if (!ds) | 192 | release_ds_buffer(cpu); |
93 | continue; | ||
94 | |||
95 | per_cpu(cpu_hw_events, cpu).ds = NULL; | ||
96 | |||
97 | kfree((void *)(unsigned long)ds->pebs_buffer_base); | ||
98 | kfree((void *)(unsigned long)ds->bts_buffer_base); | ||
99 | kfree(ds); | ||
100 | } | 193 | } |
101 | |||
102 | put_online_cpus(); | 194 | put_online_cpus(); |
103 | } | 195 | } |
104 | 196 | ||
105 | static int reserve_ds_buffers(void) | 197 | static void reserve_ds_buffers(void) |
106 | { | 198 | { |
107 | int cpu, err = 0; | 199 | int bts_err = 0, pebs_err = 0; |
200 | int cpu; | ||
201 | |||
202 | x86_pmu.bts_active = 0; | ||
203 | x86_pmu.pebs_active = 0; | ||
108 | 204 | ||
109 | if (!x86_pmu.bts && !x86_pmu.pebs) | 205 | if (!x86_pmu.bts && !x86_pmu.pebs) |
110 | return 0; | 206 | return; |
207 | |||
208 | if (!x86_pmu.bts) | ||
209 | bts_err = 1; | ||
210 | |||
211 | if (!x86_pmu.pebs) | ||
212 | pebs_err = 1; | ||
111 | 213 | ||
112 | get_online_cpus(); | 214 | get_online_cpus(); |
113 | 215 | ||
114 | for_each_possible_cpu(cpu) { | 216 | for_each_possible_cpu(cpu) { |
115 | struct debug_store *ds; | 217 | if (alloc_ds_buffer(cpu)) { |
116 | void *buffer; | 218 | bts_err = 1; |
117 | int max, thresh; | 219 | pebs_err = 1; |
220 | } | ||
221 | |||
222 | if (!bts_err && alloc_bts_buffer(cpu)) | ||
223 | bts_err = 1; | ||
224 | |||
225 | if (!pebs_err && alloc_pebs_buffer(cpu)) | ||
226 | pebs_err = 1; | ||
118 | 227 | ||
119 | err = -ENOMEM; | 228 | if (bts_err && pebs_err) |
120 | ds = kzalloc(sizeof(*ds), GFP_KERNEL); | ||
121 | if (unlikely(!ds)) | ||
122 | break; | 229 | break; |
123 | per_cpu(cpu_hw_events, cpu).ds = ds; | 230 | } |
124 | |||
125 | if (x86_pmu.bts) { | ||
126 | buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); | ||
127 | if (unlikely(!buffer)) | ||
128 | break; | ||
129 | |||
130 | max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; | ||
131 | thresh = max / 16; | ||
132 | |||
133 | ds->bts_buffer_base = (u64)(unsigned long)buffer; | ||
134 | ds->bts_index = ds->bts_buffer_base; | ||
135 | ds->bts_absolute_maximum = ds->bts_buffer_base + | ||
136 | max * BTS_RECORD_SIZE; | ||
137 | ds->bts_interrupt_threshold = ds->bts_absolute_maximum - | ||
138 | thresh * BTS_RECORD_SIZE; | ||
139 | } | ||
140 | 231 | ||
141 | if (x86_pmu.pebs) { | 232 | if (bts_err) { |
142 | buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); | 233 | for_each_possible_cpu(cpu) |
143 | if (unlikely(!buffer)) | 234 | release_bts_buffer(cpu); |
144 | break; | 235 | } |
145 | |||
146 | max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; | ||
147 | |||
148 | ds->pebs_buffer_base = (u64)(unsigned long)buffer; | ||
149 | ds->pebs_index = ds->pebs_buffer_base; | ||
150 | ds->pebs_absolute_maximum = ds->pebs_buffer_base + | ||
151 | max * x86_pmu.pebs_record_size; | ||
152 | /* | ||
153 | * Always use single record PEBS | ||
154 | */ | ||
155 | ds->pebs_interrupt_threshold = ds->pebs_buffer_base + | ||
156 | x86_pmu.pebs_record_size; | ||
157 | } | ||
158 | 236 | ||
159 | err = 0; | 237 | if (pebs_err) { |
238 | for_each_possible_cpu(cpu) | ||
239 | release_pebs_buffer(cpu); | ||
160 | } | 240 | } |
161 | 241 | ||
162 | if (err) | 242 | if (bts_err && pebs_err) { |
163 | release_ds_buffers(); | 243 | for_each_possible_cpu(cpu) |
164 | else { | 244 | release_ds_buffer(cpu); |
245 | } else { | ||
246 | if (x86_pmu.bts && !bts_err) | ||
247 | x86_pmu.bts_active = 1; | ||
248 | |||
249 | if (x86_pmu.pebs && !pebs_err) | ||
250 | x86_pmu.pebs_active = 1; | ||
251 | |||
165 | for_each_online_cpu(cpu) | 252 | for_each_online_cpu(cpu) |
166 | init_debug_store_on_cpu(cpu); | 253 | init_debug_store_on_cpu(cpu); |
167 | } | 254 | } |
168 | 255 | ||
169 | put_online_cpus(); | 256 | put_online_cpus(); |
170 | |||
171 | return err; | ||
172 | } | 257 | } |
173 | 258 | ||
174 | /* | 259 | /* |
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void) | |||
214 | update_debugctlmsr(debugctlmsr); | 299 | update_debugctlmsr(debugctlmsr); |
215 | } | 300 | } |
216 | 301 | ||
217 | static void intel_pmu_drain_bts_buffer(void) | 302 | static int intel_pmu_drain_bts_buffer(void) |
218 | { | 303 | { |
219 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 304 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
220 | struct debug_store *ds = cpuc->ds; | 305 | struct debug_store *ds = cpuc->ds; |
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void) | |||
231 | struct pt_regs regs; | 316 | struct pt_regs regs; |
232 | 317 | ||
233 | if (!event) | 318 | if (!event) |
234 | return; | 319 | return 0; |
235 | 320 | ||
236 | if (!ds) | 321 | if (!x86_pmu.bts_active) |
237 | return; | 322 | return 0; |
238 | 323 | ||
239 | at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; | 324 | at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; |
240 | top = (struct bts_record *)(unsigned long)ds->bts_index; | 325 | top = (struct bts_record *)(unsigned long)ds->bts_index; |
241 | 326 | ||
242 | if (top <= at) | 327 | if (top <= at) |
243 | return; | 328 | return 0; |
244 | 329 | ||
245 | ds->bts_index = ds->bts_buffer_base; | 330 | ds->bts_index = ds->bts_buffer_base; |
246 | 331 | ||
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void) | |||
256 | perf_prepare_sample(&header, &data, event, ®s); | 341 | perf_prepare_sample(&header, &data, event, ®s); |
257 | 342 | ||
258 | if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) | 343 | if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) |
259 | return; | 344 | return 1; |
260 | 345 | ||
261 | for (; at < top; at++) { | 346 | for (; at < top; at++) { |
262 | data.ip = at->from; | 347 | data.ip = at->from; |
@@ -270,6 +355,7 @@ static void intel_pmu_drain_bts_buffer(void) | |||
270 | /* There's new data available. */ | 355 | /* There's new data available. */ |
271 | event->hw.interrupts++; | 356 | event->hw.interrupts++; |
272 | event->pending_kill = POLL_IN; | 357 | event->pending_kill = POLL_IN; |
358 | return 1; | ||
273 | } | 359 | } |
274 | 360 | ||
275 | /* | 361 | /* |
@@ -491,7 +577,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, | |||
491 | regs.flags &= ~PERF_EFLAGS_EXACT; | 577 | regs.flags &= ~PERF_EFLAGS_EXACT; |
492 | 578 | ||
493 | if (perf_event_overflow(event, 1, &data, ®s)) | 579 | if (perf_event_overflow(event, 1, &data, ®s)) |
494 | x86_pmu_stop(event); | 580 | x86_pmu_stop(event, 0); |
495 | } | 581 | } |
496 | 582 | ||
497 | static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) | 583 | static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) |
@@ -502,7 +588,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) | |||
502 | struct pebs_record_core *at, *top; | 588 | struct pebs_record_core *at, *top; |
503 | int n; | 589 | int n; |
504 | 590 | ||
505 | if (!ds || !x86_pmu.pebs) | 591 | if (!x86_pmu.pebs_active) |
506 | return; | 592 | return; |
507 | 593 | ||
508 | at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; | 594 | at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; |
@@ -544,7 +630,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) | |||
544 | u64 status = 0; | 630 | u64 status = 0; |
545 | int bit, n; | 631 | int bit, n; |
546 | 632 | ||
547 | if (!ds || !x86_pmu.pebs) | 633 | if (!x86_pmu.pebs_active) |
548 | return; | 634 | return; |
549 | 635 | ||
550 | at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; | 636 | at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; |
@@ -629,9 +715,8 @@ static void intel_ds_init(void) | |||
629 | 715 | ||
630 | #else /* CONFIG_CPU_SUP_INTEL */ | 716 | #else /* CONFIG_CPU_SUP_INTEL */ |
631 | 717 | ||
632 | static int reserve_ds_buffers(void) | 718 | static void reserve_ds_buffers(void) |
633 | { | 719 | { |
634 | return 0; | ||
635 | } | 720 | } |
636 | 721 | ||
637 | static void release_ds_buffers(void) | 722 | static void release_ds_buffers(void) |
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 249015173992..81400b93e694 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c | |||
@@ -18,6 +18,8 @@ | |||
18 | struct p4_event_bind { | 18 | struct p4_event_bind { |
19 | unsigned int opcode; /* Event code and ESCR selector */ | 19 | unsigned int opcode; /* Event code and ESCR selector */ |
20 | unsigned int escr_msr[2]; /* ESCR MSR for this event */ | 20 | unsigned int escr_msr[2]; /* ESCR MSR for this event */ |
21 | unsigned int escr_emask; /* valid ESCR EventMask bits */ | ||
22 | unsigned int shared; /* event is shared across threads */ | ||
21 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ | 23 | char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ |
22 | }; | 24 | }; |
23 | 25 | ||
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = { | |||
66 | [P4_EVENT_TC_DELIVER_MODE] = { | 68 | [P4_EVENT_TC_DELIVER_MODE] = { |
67 | .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), | 69 | .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), |
68 | .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, | 70 | .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, |
71 | .escr_emask = | ||
72 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | | ||
73 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | | ||
74 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | | ||
75 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | | ||
76 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | | ||
77 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | | ||
78 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), | ||
79 | .shared = 1, | ||
69 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | 80 | .cntr = { {4, 5, -1}, {6, 7, -1} }, |
70 | }, | 81 | }, |
71 | [P4_EVENT_BPU_FETCH_REQUEST] = { | 82 | [P4_EVENT_BPU_FETCH_REQUEST] = { |
72 | .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), | 83 | .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), |
73 | .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, | 84 | .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, |
85 | .escr_emask = | ||
86 | P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), | ||
74 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 87 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
75 | }, | 88 | }, |
76 | [P4_EVENT_ITLB_REFERENCE] = { | 89 | [P4_EVENT_ITLB_REFERENCE] = { |
77 | .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), | 90 | .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), |
78 | .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, | 91 | .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, |
92 | .escr_emask = | ||
93 | P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | | ||
94 | P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | | ||
95 | P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), | ||
79 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 96 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
80 | }, | 97 | }, |
81 | [P4_EVENT_MEMORY_CANCEL] = { | 98 | [P4_EVENT_MEMORY_CANCEL] = { |
82 | .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), | 99 | .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), |
83 | .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, | 100 | .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, |
101 | .escr_emask = | ||
102 | P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | | ||
103 | P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), | ||
84 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 104 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
85 | }, | 105 | }, |
86 | [P4_EVENT_MEMORY_COMPLETE] = { | 106 | [P4_EVENT_MEMORY_COMPLETE] = { |
87 | .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), | 107 | .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), |
88 | .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, | 108 | .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, |
109 | .escr_emask = | ||
110 | P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | | ||
111 | P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), | ||
89 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 112 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
90 | }, | 113 | }, |
91 | [P4_EVENT_LOAD_PORT_REPLAY] = { | 114 | [P4_EVENT_LOAD_PORT_REPLAY] = { |
92 | .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), | 115 | .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), |
93 | .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, | 116 | .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, |
117 | .escr_emask = | ||
118 | P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), | ||
94 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 119 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
95 | }, | 120 | }, |
96 | [P4_EVENT_STORE_PORT_REPLAY] = { | 121 | [P4_EVENT_STORE_PORT_REPLAY] = { |
97 | .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), | 122 | .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), |
98 | .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, | 123 | .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, |
124 | .escr_emask = | ||
125 | P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), | ||
99 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 126 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
100 | }, | 127 | }, |
101 | [P4_EVENT_MOB_LOAD_REPLAY] = { | 128 | [P4_EVENT_MOB_LOAD_REPLAY] = { |
102 | .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), | 129 | .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), |
103 | .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, | 130 | .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, |
131 | .escr_emask = | ||
132 | P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | | ||
133 | P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | | ||
134 | P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | | ||
135 | P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), | ||
104 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 136 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
105 | }, | 137 | }, |
106 | [P4_EVENT_PAGE_WALK_TYPE] = { | 138 | [P4_EVENT_PAGE_WALK_TYPE] = { |
107 | .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), | 139 | .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), |
108 | .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, | 140 | .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, |
141 | .escr_emask = | ||
142 | P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | | ||
143 | P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), | ||
144 | .shared = 1, | ||
109 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 145 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
110 | }, | 146 | }, |
111 | [P4_EVENT_BSQ_CACHE_REFERENCE] = { | 147 | [P4_EVENT_BSQ_CACHE_REFERENCE] = { |
112 | .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), | 148 | .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), |
113 | .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, | 149 | .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, |
150 | .escr_emask = | ||
151 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | | ||
152 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | | ||
153 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | | ||
154 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | | ||
155 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | | ||
156 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | | ||
157 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | | ||
158 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | | ||
159 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), | ||
114 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 160 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
115 | }, | 161 | }, |
116 | [P4_EVENT_IOQ_ALLOCATION] = { | 162 | [P4_EVENT_IOQ_ALLOCATION] = { |
117 | .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), | 163 | .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), |
118 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 164 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
165 | .escr_emask = | ||
166 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | | ||
167 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | | ||
168 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | | ||
169 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | | ||
170 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | | ||
171 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | | ||
172 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | | ||
173 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | | ||
174 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | | ||
175 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | | ||
176 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), | ||
119 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 177 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
120 | }, | 178 | }, |
121 | [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ | 179 | [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ |
122 | .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), | 180 | .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), |
123 | .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, | 181 | .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, |
182 | .escr_emask = | ||
183 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | | ||
184 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | | ||
185 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | | ||
186 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | | ||
187 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | | ||
188 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | | ||
189 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | | ||
190 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | | ||
191 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | | ||
192 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | | ||
193 | P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), | ||
124 | .cntr = { {2, -1, -1}, {3, -1, -1} }, | 194 | .cntr = { {2, -1, -1}, {3, -1, -1} }, |
125 | }, | 195 | }, |
126 | [P4_EVENT_FSB_DATA_ACTIVITY] = { | 196 | [P4_EVENT_FSB_DATA_ACTIVITY] = { |
127 | .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), | 197 | .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), |
128 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 198 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
199 | .escr_emask = | ||
200 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | | ||
201 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | | ||
202 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | | ||
203 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | | ||
204 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | | ||
205 | P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), | ||
206 | .shared = 1, | ||
129 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 207 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
130 | }, | 208 | }, |
131 | [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ | 209 | [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ |
132 | .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), | 210 | .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), |
133 | .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, | 211 | .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, |
212 | .escr_emask = | ||
213 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | | ||
214 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | | ||
215 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | | ||
216 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | | ||
217 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | | ||
218 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | | ||
219 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | | ||
220 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | | ||
221 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | | ||
222 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | | ||
223 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | | ||
224 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | | ||
225 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), | ||
134 | .cntr = { {0, -1, -1}, {1, -1, -1} }, | 226 | .cntr = { {0, -1, -1}, {1, -1, -1} }, |
135 | }, | 227 | }, |
136 | [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ | 228 | [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ |
137 | .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), | 229 | .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), |
138 | .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, | 230 | .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, |
231 | .escr_emask = | ||
232 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | | ||
233 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | | ||
234 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | | ||
235 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | | ||
236 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | | ||
237 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | | ||
238 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | | ||
239 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | | ||
240 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | | ||
241 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | | ||
242 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | | ||
243 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | | ||
244 | P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), | ||
139 | .cntr = { {2, -1, -1}, {3, -1, -1} }, | 245 | .cntr = { {2, -1, -1}, {3, -1, -1} }, |
140 | }, | 246 | }, |
141 | [P4_EVENT_SSE_INPUT_ASSIST] = { | 247 | [P4_EVENT_SSE_INPUT_ASSIST] = { |
142 | .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), | 248 | .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), |
143 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 249 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
250 | .escr_emask = | ||
251 | P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), | ||
252 | .shared = 1, | ||
144 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 253 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
145 | }, | 254 | }, |
146 | [P4_EVENT_PACKED_SP_UOP] = { | 255 | [P4_EVENT_PACKED_SP_UOP] = { |
147 | .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), | 256 | .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), |
148 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 257 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
258 | .escr_emask = | ||
259 | P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), | ||
260 | .shared = 1, | ||
149 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 261 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
150 | }, | 262 | }, |
151 | [P4_EVENT_PACKED_DP_UOP] = { | 263 | [P4_EVENT_PACKED_DP_UOP] = { |
152 | .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), | 264 | .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), |
153 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 265 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
266 | .escr_emask = | ||
267 | P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), | ||
268 | .shared = 1, | ||
154 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 269 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
155 | }, | 270 | }, |
156 | [P4_EVENT_SCALAR_SP_UOP] = { | 271 | [P4_EVENT_SCALAR_SP_UOP] = { |
157 | .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), | 272 | .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), |
158 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 273 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
274 | .escr_emask = | ||
275 | P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), | ||
276 | .shared = 1, | ||
159 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 277 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
160 | }, | 278 | }, |
161 | [P4_EVENT_SCALAR_DP_UOP] = { | 279 | [P4_EVENT_SCALAR_DP_UOP] = { |
162 | .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), | 280 | .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), |
163 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 281 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
282 | .escr_emask = | ||
283 | P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), | ||
284 | .shared = 1, | ||
164 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 285 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
165 | }, | 286 | }, |
166 | [P4_EVENT_64BIT_MMX_UOP] = { | 287 | [P4_EVENT_64BIT_MMX_UOP] = { |
167 | .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), | 288 | .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), |
168 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 289 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
290 | .escr_emask = | ||
291 | P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), | ||
292 | .shared = 1, | ||
169 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 293 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
170 | }, | 294 | }, |
171 | [P4_EVENT_128BIT_MMX_UOP] = { | 295 | [P4_EVENT_128BIT_MMX_UOP] = { |
172 | .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), | 296 | .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), |
173 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 297 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
298 | .escr_emask = | ||
299 | P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), | ||
300 | .shared = 1, | ||
174 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 301 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
175 | }, | 302 | }, |
176 | [P4_EVENT_X87_FP_UOP] = { | 303 | [P4_EVENT_X87_FP_UOP] = { |
177 | .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), | 304 | .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), |
178 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, | 305 | .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, |
306 | .escr_emask = | ||
307 | P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), | ||
308 | .shared = 1, | ||
179 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 309 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
180 | }, | 310 | }, |
181 | [P4_EVENT_TC_MISC] = { | 311 | [P4_EVENT_TC_MISC] = { |
182 | .opcode = P4_OPCODE(P4_EVENT_TC_MISC), | 312 | .opcode = P4_OPCODE(P4_EVENT_TC_MISC), |
183 | .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, | 313 | .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, |
314 | .escr_emask = | ||
315 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), | ||
184 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | 316 | .cntr = { {4, 5, -1}, {6, 7, -1} }, |
185 | }, | 317 | }, |
186 | [P4_EVENT_GLOBAL_POWER_EVENTS] = { | 318 | [P4_EVENT_GLOBAL_POWER_EVENTS] = { |
187 | .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), | 319 | .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), |
188 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 320 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
321 | .escr_emask = | ||
322 | P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), | ||
189 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 323 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
190 | }, | 324 | }, |
191 | [P4_EVENT_TC_MS_XFER] = { | 325 | [P4_EVENT_TC_MS_XFER] = { |
192 | .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), | 326 | .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), |
193 | .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, | 327 | .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, |
328 | .escr_emask = | ||
329 | P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), | ||
194 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | 330 | .cntr = { {4, 5, -1}, {6, 7, -1} }, |
195 | }, | 331 | }, |
196 | [P4_EVENT_UOP_QUEUE_WRITES] = { | 332 | [P4_EVENT_UOP_QUEUE_WRITES] = { |
197 | .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), | 333 | .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), |
198 | .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, | 334 | .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, |
335 | .escr_emask = | ||
336 | P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | | ||
337 | P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | | ||
338 | P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), | ||
199 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | 339 | .cntr = { {4, 5, -1}, {6, 7, -1} }, |
200 | }, | 340 | }, |
201 | [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { | 341 | [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { |
202 | .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), | 342 | .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), |
203 | .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, | 343 | .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, |
344 | .escr_emask = | ||
345 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | | ||
346 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | | ||
347 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | | ||
348 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), | ||
204 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | 349 | .cntr = { {4, 5, -1}, {6, 7, -1} }, |
205 | }, | 350 | }, |
206 | [P4_EVENT_RETIRED_BRANCH_TYPE] = { | 351 | [P4_EVENT_RETIRED_BRANCH_TYPE] = { |
207 | .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), | 352 | .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), |
208 | .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, | 353 | .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, |
354 | .escr_emask = | ||
355 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | | ||
356 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | | ||
357 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | | ||
358 | P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), | ||
209 | .cntr = { {4, 5, -1}, {6, 7, -1} }, | 359 | .cntr = { {4, 5, -1}, {6, 7, -1} }, |
210 | }, | 360 | }, |
211 | [P4_EVENT_RESOURCE_STALL] = { | 361 | [P4_EVENT_RESOURCE_STALL] = { |
212 | .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), | 362 | .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), |
213 | .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, | 363 | .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, |
364 | .escr_emask = | ||
365 | P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), | ||
214 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 366 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
215 | }, | 367 | }, |
216 | [P4_EVENT_WC_BUFFER] = { | 368 | [P4_EVENT_WC_BUFFER] = { |
217 | .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), | 369 | .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), |
218 | .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, | 370 | .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, |
371 | .escr_emask = | ||
372 | P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | | ||
373 | P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), | ||
374 | .shared = 1, | ||
219 | .cntr = { {8, 9, -1}, {10, 11, -1} }, | 375 | .cntr = { {8, 9, -1}, {10, 11, -1} }, |
220 | }, | 376 | }, |
221 | [P4_EVENT_B2B_CYCLES] = { | 377 | [P4_EVENT_B2B_CYCLES] = { |
222 | .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), | 378 | .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), |
223 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 379 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
380 | .escr_emask = 0, | ||
224 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 381 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
225 | }, | 382 | }, |
226 | [P4_EVENT_BNR] = { | 383 | [P4_EVENT_BNR] = { |
227 | .opcode = P4_OPCODE(P4_EVENT_BNR), | 384 | .opcode = P4_OPCODE(P4_EVENT_BNR), |
228 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 385 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
386 | .escr_emask = 0, | ||
229 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 387 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
230 | }, | 388 | }, |
231 | [P4_EVENT_SNOOP] = { | 389 | [P4_EVENT_SNOOP] = { |
232 | .opcode = P4_OPCODE(P4_EVENT_SNOOP), | 390 | .opcode = P4_OPCODE(P4_EVENT_SNOOP), |
233 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 391 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
392 | .escr_emask = 0, | ||
234 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 393 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
235 | }, | 394 | }, |
236 | [P4_EVENT_RESPONSE] = { | 395 | [P4_EVENT_RESPONSE] = { |
237 | .opcode = P4_OPCODE(P4_EVENT_RESPONSE), | 396 | .opcode = P4_OPCODE(P4_EVENT_RESPONSE), |
238 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, | 397 | .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, |
398 | .escr_emask = 0, | ||
239 | .cntr = { {0, -1, -1}, {2, -1, -1} }, | 399 | .cntr = { {0, -1, -1}, {2, -1, -1} }, |
240 | }, | 400 | }, |
241 | [P4_EVENT_FRONT_END_EVENT] = { | 401 | [P4_EVENT_FRONT_END_EVENT] = { |
242 | .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), | 402 | .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), |
243 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | 403 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, |
404 | .escr_emask = | ||
405 | P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | | ||
406 | P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), | ||
244 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 407 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
245 | }, | 408 | }, |
246 | [P4_EVENT_EXECUTION_EVENT] = { | 409 | [P4_EVENT_EXECUTION_EVENT] = { |
247 | .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), | 410 | .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), |
248 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | 411 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, |
412 | .escr_emask = | ||
413 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | | ||
414 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | | ||
415 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | | ||
416 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | | ||
417 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | | ||
418 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | | ||
419 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | | ||
420 | P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), | ||
249 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 421 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
250 | }, | 422 | }, |
251 | [P4_EVENT_REPLAY_EVENT] = { | 423 | [P4_EVENT_REPLAY_EVENT] = { |
252 | .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), | 424 | .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), |
253 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | 425 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, |
426 | .escr_emask = | ||
427 | P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | | ||
428 | P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), | ||
254 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 429 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
255 | }, | 430 | }, |
256 | [P4_EVENT_INSTR_RETIRED] = { | 431 | [P4_EVENT_INSTR_RETIRED] = { |
257 | .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), | 432 | .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), |
258 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | 433 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, |
434 | .escr_emask = | ||
435 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | | ||
436 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | | ||
437 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | | ||
438 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), | ||
259 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 439 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
260 | }, | 440 | }, |
261 | [P4_EVENT_UOPS_RETIRED] = { | 441 | [P4_EVENT_UOPS_RETIRED] = { |
262 | .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), | 442 | .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), |
263 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | 443 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, |
444 | .escr_emask = | ||
445 | P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | | ||
446 | P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), | ||
264 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 447 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
265 | }, | 448 | }, |
266 | [P4_EVENT_UOP_TYPE] = { | 449 | [P4_EVENT_UOP_TYPE] = { |
267 | .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), | 450 | .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), |
268 | .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, | 451 | .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, |
452 | .escr_emask = | ||
453 | P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | | ||
454 | P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), | ||
269 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 455 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
270 | }, | 456 | }, |
271 | [P4_EVENT_BRANCH_RETIRED] = { | 457 | [P4_EVENT_BRANCH_RETIRED] = { |
272 | .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), | 458 | .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), |
273 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | 459 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, |
460 | .escr_emask = | ||
461 | P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | | ||
462 | P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | | ||
463 | P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | | ||
464 | P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), | ||
274 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 465 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
275 | }, | 466 | }, |
276 | [P4_EVENT_MISPRED_BRANCH_RETIRED] = { | 467 | [P4_EVENT_MISPRED_BRANCH_RETIRED] = { |
277 | .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), | 468 | .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), |
278 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | 469 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, |
470 | .escr_emask = | ||
471 | P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), | ||
279 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 472 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
280 | }, | 473 | }, |
281 | [P4_EVENT_X87_ASSIST] = { | 474 | [P4_EVENT_X87_ASSIST] = { |
282 | .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), | 475 | .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), |
283 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | 476 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, |
477 | .escr_emask = | ||
478 | P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | | ||
479 | P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | | ||
480 | P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | | ||
481 | P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | | ||
482 | P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), | ||
284 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 483 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
285 | }, | 484 | }, |
286 | [P4_EVENT_MACHINE_CLEAR] = { | 485 | [P4_EVENT_MACHINE_CLEAR] = { |
287 | .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), | 486 | .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), |
288 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, | 487 | .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, |
488 | .escr_emask = | ||
489 | P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | | ||
490 | P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | | ||
491 | P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), | ||
289 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 492 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
290 | }, | 493 | }, |
291 | [P4_EVENT_INSTR_COMPLETED] = { | 494 | [P4_EVENT_INSTR_COMPLETED] = { |
292 | .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), | 495 | .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), |
293 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, | 496 | .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, |
497 | .escr_emask = | ||
498 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | | ||
499 | P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), | ||
294 | .cntr = { {12, 13, 16}, {14, 15, 17} }, | 500 | .cntr = { {12, 13, 16}, {14, 15, 17} }, |
295 | }, | 501 | }, |
296 | }; | 502 | }; |
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event) | |||
428 | return config; | 634 | return config; |
429 | } | 635 | } |
430 | 636 | ||
637 | /* check cpu model specifics */ | ||
638 | static bool p4_event_match_cpu_model(unsigned int event_idx) | ||
639 | { | ||
640 | /* INSTR_COMPLETED event only exists for models 3, 4, 6 (Prescott) */ | ||
641 | if (event_idx == P4_EVENT_INSTR_COMPLETED) { | ||
642 | if (boot_cpu_data.x86_model != 3 && | ||
643 | boot_cpu_data.x86_model != 4 && | ||
644 | boot_cpu_data.x86_model != 6) | ||
645 | return false; | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * For info | ||
650 | * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 | ||
651 | */ | ||
652 | |||
653 | return true; | ||
654 | } | ||
655 | |||
431 | static int p4_validate_raw_event(struct perf_event *event) | 656 | static int p4_validate_raw_event(struct perf_event *event) |
432 | { | 657 | { |
433 | unsigned int v; | 658 | unsigned int v, emask; |
434 | 659 | ||
435 | /* user data may have out-of-bound event index */ | 660 | /* User data may have out-of-bound event index */ |
436 | v = p4_config_unpack_event(event->attr.config); | 661 | v = p4_config_unpack_event(event->attr.config); |
437 | if (v >= ARRAY_SIZE(p4_event_bind_map)) { | 662 | if (v >= ARRAY_SIZE(p4_event_bind_map)) |
438 | pr_warning("P4 PMU: Unknown event code: %d\n", v); | 663 | return -EINVAL; |
664 | |||
665 | /* It may be unsupported: */ | ||
666 | if (!p4_event_match_cpu_model(v)) | ||
439 | return -EINVAL; | 667 | return -EINVAL; |
668 | |||
669 | /* | ||
670 | * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as | ||
671 | * in Architectural Performance Monitoring, it means not | ||
672 | * on _which_ logical cpu to count but rather _when_, ie it | ||
673 | * depends on logical cpu state -- count event if one cpu active, | ||
674 | * none, both or any, so we just allow user to pass any value | ||
675 | * desired. | ||
676 | * | ||
677 | * In turn we always set Tx_OS/Tx_USR bits bound to logical | ||
678 | * cpu without their propagation to another cpu | ||
679 | */ | ||
680 | |||
681 | /* | ||
682 | * if an event is shared across the logical threads | ||
683 | * the user needs special permissions to be able to use it | ||
684 | */ | ||
685 | if (p4_event_bind_map[v].shared) { | ||
686 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
687 | return -EACCES; | ||
440 | } | 688 | } |
441 | 689 | ||
690 | /* ESCR EventMask bits may be invalid */ | ||
691 | emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; | ||
692 | if (emask & ~p4_event_bind_map[v].escr_emask) | ||
693 | return -EINVAL; | ||
694 | |||
442 | /* | 695 | /* |
443 | * it may have some screwed PEBS bits | 696 | * it may have some invalid PEBS bits |
444 | */ | 697 | */ |
445 | if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { | 698 | if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) |
446 | pr_warning("P4 PMU: PEBS are not supported yet\n"); | ||
447 | return -EINVAL; | 699 | return -EINVAL; |
448 | } | 700 | |
449 | v = p4_config_unpack_metric(event->attr.config); | 701 | v = p4_config_unpack_metric(event->attr.config); |
450 | if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { | 702 | if (v >= ARRAY_SIZE(p4_pebs_bind_map)) |
451 | pr_warning("P4 PMU: Unknown metric code: %d\n", v); | ||
452 | return -EINVAL; | 703 | return -EINVAL; |
453 | } | ||
454 | 704 | ||
455 | return 0; | 705 | return 0; |
456 | } | 706 | } |
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event) | |||
478 | 728 | ||
479 | if (event->attr.type == PERF_TYPE_RAW) { | 729 | if (event->attr.type == PERF_TYPE_RAW) { |
480 | 730 | ||
731 | /* | ||
732 | * Clear bits we reserve to be managed by kernel itself | ||
733 | * and never allowed from a user space | ||
734 | */ | ||
735 | event->attr.config &= P4_CONFIG_MASK; | ||
736 | |||
481 | rc = p4_validate_raw_event(event); | 737 | rc = p4_validate_raw_event(event); |
482 | if (rc) | 738 | if (rc) |
483 | goto out; | 739 | goto out; |
484 | 740 | ||
485 | /* | 741 | /* |
486 | * We don't control raw events so it's up to the caller | ||
487 | * to pass sane values (and we don't count the thread number | ||
488 | * on HT machine but allow HT-compatible specifics to be | ||
489 | * passed on) | ||
490 | * | ||
491 | * Note that for RAW events we allow user to use P4_CCCR_RESERVED | 742 | * Note that for RAW events we allow user to use P4_CCCR_RESERVED |
492 | * bits since we keep additional info here (for cache events and etc) | 743 | * bits since we keep additional info here (for cache events and etc) |
493 | * | ||
494 | * XXX: HT wide things should check perf_paranoid_cpu() && | ||
495 | * CAP_SYS_ADMIN | ||
496 | */ | 744 | */ |
497 | event->hw.config |= event->attr.config & | 745 | event->hw.config |= event->attr.config; |
498 | (p4_config_pack_escr(P4_ESCR_MASK_HT) | | ||
499 | p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); | ||
500 | |||
501 | event->hw.config &= ~P4_CCCR_FORCE_OVF; | ||
502 | } | 746 | } |
503 | 747 | ||
504 | rc = x86_setup_perfctr(event); | 748 | rc = x86_setup_perfctr(event); |
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fb329e9f8494..d9f4ff8fcd69 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void) | |||
700 | { | 700 | { |
701 | switch (boot_cpu_data.x86_vendor) { | 701 | switch (boot_cpu_data.x86_vendor) { |
702 | case X86_VENDOR_AMD: | 702 | case X86_VENDOR_AMD: |
703 | if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && | 703 | if (boot_cpu_data.x86 == 6 || |
704 | boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) | 704 | (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) |
705 | return; | 705 | wd_ops = &k7_wd_ops; |
706 | wd_ops = &k7_wd_ops; | 706 | return; |
707 | break; | ||
708 | case X86_VENDOR_INTEL: | 707 | case X86_VENDOR_INTEL: |
709 | /* Work around where perfctr1 doesn't have a working enable | 708 | /* Work around where perfctr1 doesn't have a working enable |
710 | * bit as described in the following errata: | 709 | * bit as described in the following errata: |
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d49079515122..c7f64e6f537a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c | |||
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | |||
44 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, | 44 | { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, |
45 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, | 45 | { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, |
46 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, | 46 | { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, |
47 | { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, | ||
48 | { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, | ||
49 | { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, | ||
50 | { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, | ||
51 | { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, | ||
52 | { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, | ||
47 | { 0, 0, 0, 0, 0 } | 53 | { 0, 0, 0, 0, 0 } |
48 | }; | 54 | }; |
49 | 55 | ||
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3cc..d5cd13945d5a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c | |||
@@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | |||
61 | if (!is_crashed_pfn_valid(pfn)) | 61 | if (!is_crashed_pfn_valid(pfn)) |
62 | return -EFAULT; | 62 | return -EFAULT; |
63 | 63 | ||
64 | vaddr = kmap_atomic_pfn(pfn, KM_PTE0); | 64 | vaddr = kmap_atomic_pfn(pfn); |
65 | 65 | ||
66 | if (!userbuf) { | 66 | if (!userbuf) { |
67 | memcpy(buf, (vaddr + offset), csize); | 67 | memcpy(buf, (vaddr + offset), csize); |
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045b36cada65..994828899e09 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c | |||
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | |||
34 | if (!csize) | 34 | if (!csize) |
35 | return 0; | 35 | return 0; |
36 | 36 | ||
37 | vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); | 37 | vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); |
38 | if (!vaddr) | 38 | if (!vaddr) |
39 | return -ENOMEM; | 39 | return -ENOMEM; |
40 | 40 | ||
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | |||
46 | } else | 46 | } else |
47 | memcpy(buf, vaddr + offset, csize); | 47 | memcpy(buf, vaddr + offset, csize); |
48 | 48 | ||
49 | set_iounmap_nonlazy(); | ||
49 | iounmap(vaddr); | 50 | iounmap(vaddr); |
50 | return csize; | 51 | return csize; |
51 | } | 52 | } |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0f6376ffa2d9..1bc7f75a5bda 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -82,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
82 | if (kstack_end(stack)) | 82 | if (kstack_end(stack)) |
83 | break; | 83 | break; |
84 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | 84 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) |
85 | printk("\n%s", log_lvl); | 85 | printk(KERN_CONT "\n"); |
86 | printk(" %08lx", *stack++); | 86 | printk(KERN_CONT " %08lx", *stack++); |
87 | touch_nmi_watchdog(); | 87 | touch_nmi_watchdog(); |
88 | } | 88 | } |
89 | printk("\n"); | 89 | printk(KERN_CONT "\n"); |
90 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | 90 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); |
91 | } | 91 | } |
92 | 92 | ||
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57a21f11c791..6a340485249a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -265,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
265 | if (stack >= irq_stack && stack <= irq_stack_end) { | 265 | if (stack >= irq_stack && stack <= irq_stack_end) { |
266 | if (stack == irq_stack_end) { | 266 | if (stack == irq_stack_end) { |
267 | stack = (unsigned long *) (irq_stack_end[-1]); | 267 | stack = (unsigned long *) (irq_stack_end[-1]); |
268 | printk(" <EOI> "); | 268 | printk(KERN_CONT " <EOI> "); |
269 | } | 269 | } |
270 | } else { | 270 | } else { |
271 | if (((long) stack & (THREAD_SIZE-1)) == 0) | 271 | if (((long) stack & (THREAD_SIZE-1)) == 0) |
272 | break; | 272 | break; |
273 | } | 273 | } |
274 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | 274 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) |
275 | printk("\n%s", log_lvl); | 275 | printk(KERN_CONT "\n"); |
276 | printk(" %016lx", *stack++); | 276 | printk(KERN_CONT " %016lx", *stack++); |
277 | touch_nmi_watchdog(); | 277 | touch_nmi_watchdog(); |
278 | } | 278 | } |
279 | preempt_enable(); | 279 | preempt_enable(); |
280 | 280 | ||
281 | printk("\n"); | 281 | printk(KERN_CONT "\n"); |
282 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | 282 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); |
283 | } | 283 | } |
284 | 284 | ||
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0d6fc71bedb1..0c2b7ef7a34d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/pfn.h> | 15 | #include <linux/pfn.h> |
16 | #include <linux/suspend.h> | 16 | #include <linux/suspend.h> |
17 | #include <linux/firmware-map.h> | 17 | #include <linux/firmware-map.h> |
18 | #include <linux/memblock.h> | ||
18 | 19 | ||
19 | #include <asm/e820.h> | 20 | #include <asm/e820.h> |
20 | #include <asm/proto.h> | 21 | #include <asm/proto.h> |
@@ -738,73 +739,7 @@ core_initcall(e820_mark_nvs_memory); | |||
738 | #endif | 739 | #endif |
739 | 740 | ||
740 | /* | 741 | /* |
741 | * Find a free area with specified alignment in a specific range. | 742 | * pre allocated 4k and reserved it in memblock and e820_saved |
742 | */ | ||
743 | u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) | ||
744 | { | ||
745 | int i; | ||
746 | |||
747 | for (i = 0; i < e820.nr_map; i++) { | ||
748 | struct e820entry *ei = &e820.map[i]; | ||
749 | u64 addr; | ||
750 | u64 ei_start, ei_last; | ||
751 | |||
752 | if (ei->type != E820_RAM) | ||
753 | continue; | ||
754 | |||
755 | ei_last = ei->addr + ei->size; | ||
756 | ei_start = ei->addr; | ||
757 | addr = find_early_area(ei_start, ei_last, start, end, | ||
758 | size, align); | ||
759 | |||
760 | if (addr != -1ULL) | ||
761 | return addr; | ||
762 | } | ||
763 | return -1ULL; | ||
764 | } | ||
765 | |||
766 | u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) | ||
767 | { | ||
768 | return find_e820_area(start, end, size, align); | ||
769 | } | ||
770 | |||
771 | u64 __init get_max_mapped(void) | ||
772 | { | ||
773 | u64 end = max_pfn_mapped; | ||
774 | |||
775 | end <<= PAGE_SHIFT; | ||
776 | |||
777 | return end; | ||
778 | } | ||
779 | /* | ||
780 | * Find next free range after *start | ||
781 | */ | ||
782 | u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) | ||
783 | { | ||
784 | int i; | ||
785 | |||
786 | for (i = 0; i < e820.nr_map; i++) { | ||
787 | struct e820entry *ei = &e820.map[i]; | ||
788 | u64 addr; | ||
789 | u64 ei_start, ei_last; | ||
790 | |||
791 | if (ei->type != E820_RAM) | ||
792 | continue; | ||
793 | |||
794 | ei_last = ei->addr + ei->size; | ||
795 | ei_start = ei->addr; | ||
796 | addr = find_early_area_size(ei_start, ei_last, start, | ||
797 | sizep, align); | ||
798 | |||
799 | if (addr != -1ULL) | ||
800 | return addr; | ||
801 | } | ||
802 | |||
803 | return -1ULL; | ||
804 | } | ||
805 | |||
806 | /* | ||
807 | * pre allocated 4k and reserved it in e820 | ||
808 | */ | 743 | */ |
809 | u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) | 744 | u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) |
810 | { | 745 | { |
@@ -813,8 +748,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) | |||
813 | u64 start; | 748 | u64 start; |
814 | 749 | ||
815 | for (start = startt; ; start += size) { | 750 | for (start = startt; ; start += size) { |
816 | start = find_e820_area_size(start, &size, align); | 751 | start = memblock_x86_find_in_range_size(start, &size, align); |
817 | if (!(start + 1)) | 752 | if (start == MEMBLOCK_ERROR) |
818 | return 0; | 753 | return 0; |
819 | if (size >= sizet) | 754 | if (size >= sizet) |
820 | break; | 755 | break; |
@@ -830,10 +765,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) | |||
830 | addr = round_down(start + size - sizet, align); | 765 | addr = round_down(start + size - sizet, align); |
831 | if (addr < start) | 766 | if (addr < start) |
832 | return 0; | 767 | return 0; |
833 | e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); | 768 | memblock_x86_reserve_range(addr, addr + sizet, "new next"); |
834 | e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); | 769 | e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); |
835 | printk(KERN_INFO "update e820 for early_reserve_e820\n"); | 770 | printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); |
836 | update_e820(); | ||
837 | update_e820_saved(); | 771 | update_e820_saved(); |
838 | 772 | ||
839 | return addr; | 773 | return addr; |
@@ -895,74 +829,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void) | |||
895 | { | 829 | { |
896 | return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); | 830 | return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); |
897 | } | 831 | } |
898 | /* | ||
899 | * Finds an active region in the address range from start_pfn to last_pfn and | ||
900 | * returns its range in ei_startpfn and ei_endpfn for the e820 entry. | ||
901 | */ | ||
902 | int __init e820_find_active_region(const struct e820entry *ei, | ||
903 | unsigned long start_pfn, | ||
904 | unsigned long last_pfn, | ||
905 | unsigned long *ei_startpfn, | ||
906 | unsigned long *ei_endpfn) | ||
907 | { | ||
908 | u64 align = PAGE_SIZE; | ||
909 | |||
910 | *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; | ||
911 | *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; | ||
912 | |||
913 | /* Skip map entries smaller than a page */ | ||
914 | if (*ei_startpfn >= *ei_endpfn) | ||
915 | return 0; | ||
916 | |||
917 | /* Skip if map is outside the node */ | ||
918 | if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || | ||
919 | *ei_startpfn >= last_pfn) | ||
920 | return 0; | ||
921 | |||
922 | /* Check for overlaps */ | ||
923 | if (*ei_startpfn < start_pfn) | ||
924 | *ei_startpfn = start_pfn; | ||
925 | if (*ei_endpfn > last_pfn) | ||
926 | *ei_endpfn = last_pfn; | ||
927 | |||
928 | return 1; | ||
929 | } | ||
930 | |||
931 | /* Walk the e820 map and register active regions within a node */ | ||
932 | void __init e820_register_active_regions(int nid, unsigned long start_pfn, | ||
933 | unsigned long last_pfn) | ||
934 | { | ||
935 | unsigned long ei_startpfn; | ||
936 | unsigned long ei_endpfn; | ||
937 | int i; | ||
938 | |||
939 | for (i = 0; i < e820.nr_map; i++) | ||
940 | if (e820_find_active_region(&e820.map[i], | ||
941 | start_pfn, last_pfn, | ||
942 | &ei_startpfn, &ei_endpfn)) | ||
943 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
944 | } | ||
945 | |||
946 | /* | ||
947 | * Find the hole size (in bytes) in the memory range. | ||
948 | * @start: starting address of the memory range to scan | ||
949 | * @end: ending address of the memory range to scan | ||
950 | */ | ||
951 | u64 __init e820_hole_size(u64 start, u64 end) | ||
952 | { | ||
953 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
954 | unsigned long last_pfn = end >> PAGE_SHIFT; | ||
955 | unsigned long ei_startpfn, ei_endpfn, ram = 0; | ||
956 | int i; | ||
957 | |||
958 | for (i = 0; i < e820.nr_map; i++) { | ||
959 | if (e820_find_active_region(&e820.map[i], | ||
960 | start_pfn, last_pfn, | ||
961 | &ei_startpfn, &ei_endpfn)) | ||
962 | ram += ei_endpfn - ei_startpfn; | ||
963 | } | ||
964 | return end - start - ((u64)ram << PAGE_SHIFT); | ||
965 | } | ||
966 | 832 | ||
967 | static void early_panic(char *msg) | 833 | static void early_panic(char *msg) |
968 | { | 834 | { |
@@ -1210,3 +1076,48 @@ void __init setup_memory_map(void) | |||
1210 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | 1076 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); |
1211 | e820_print_map(who); | 1077 | e820_print_map(who); |
1212 | } | 1078 | } |
1079 | |||
1080 | void __init memblock_x86_fill(void) | ||
1081 | { | ||
1082 | int i; | ||
1083 | u64 end; | ||
1084 | |||
1085 | /* | ||
1086 | * EFI may have more than 128 entries | ||
1087 | * We are safe to enable resizing, because memblock_x86_fill() | ||
1088 | * is rather later for x86 | ||
1089 | */ | ||
1090 | memblock_can_resize = 1; | ||
1091 | |||
1092 | for (i = 0; i < e820.nr_map; i++) { | ||
1093 | struct e820entry *ei = &e820.map[i]; | ||
1094 | |||
1095 | end = ei->addr + ei->size; | ||
1096 | if (end != (resource_size_t)end) | ||
1097 | continue; | ||
1098 | |||
1099 | if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) | ||
1100 | continue; | ||
1101 | |||
1102 | memblock_add(ei->addr, ei->size); | ||
1103 | } | ||
1104 | |||
1105 | memblock_analyze(); | ||
1106 | memblock_dump_all(); | ||
1107 | } | ||
1108 | |||
1109 | void __init memblock_find_dma_reserve(void) | ||
1110 | { | ||
1111 | #ifdef CONFIG_X86_64 | ||
1112 | u64 free_size_pfn; | ||
1113 | u64 mem_size_pfn; | ||
1114 | /* | ||
1115 | * need to find out used area below MAX_DMA_PFN | ||
1116 | * need to use memblock to get free size in [0, MAX_DMA_PFN] | ||
1117 | * at first, and assume boot_mem will not take below MAX_DMA_PFN | ||
1118 | */ | ||
1119 | mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; | ||
1120 | free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; | ||
1121 | set_dma_reserve(mem_size_pfn - free_size_pfn); | ||
1122 | #endif | ||
1123 | } | ||
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index fa99bae75ace..4572f25f9325 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <xen/hvc-console.h> | 14 | #include <xen/hvc-console.h> |
15 | #include <asm/pci-direct.h> | 15 | #include <asm/pci-direct.h> |
16 | #include <asm/fixmap.h> | 16 | #include <asm/fixmap.h> |
17 | #include <asm/mrst.h> | ||
17 | #include <asm/pgtable.h> | 18 | #include <asm/pgtable.h> |
18 | #include <linux/usb/ehci_def.h> | 19 | #include <linux/usb/ehci_def.h> |
19 | 20 | ||
@@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf) | |||
239 | if (!strncmp(buf, "xen", 3)) | 240 | if (!strncmp(buf, "xen", 3)) |
240 | early_console_register(&xenboot_console, keep); | 241 | early_console_register(&xenboot_console, keep); |
241 | #endif | 242 | #endif |
243 | #ifdef CONFIG_X86_MRST_EARLY_PRINTK | ||
244 | if (!strncmp(buf, "mrst", 4)) { | ||
245 | mrst_early_console_init(); | ||
246 | early_console_register(&early_mrst_console, keep); | ||
247 | } | ||
248 | |||
249 | if (!strncmp(buf, "hsu", 3)) { | ||
250 | hsu_early_console_init(); | ||
251 | early_console_register(&early_hsu_console, keep); | ||
252 | } | ||
253 | |||
254 | #endif | ||
242 | buf++; | 255 | buf++; |
243 | } | 256 | } |
244 | return 0; | 257 | return 0; |
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c new file mode 100644 index 000000000000..65df603622b2 --- /dev/null +++ b/arch/x86/kernel/early_printk_mrst.c | |||
@@ -0,0 +1,319 @@ | |||
1 | /* | ||
2 | * early_printk_mrst.c - early consoles for Intel MID platforms | ||
3 | * | ||
4 | * Copyright (c) 2008-2010, Intel Corporation | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; version 2 | ||
9 | * of the License. | ||
10 | */ | ||
11 | |||
12 | /* | ||
13 | * This file implements two early consoles named mrst and hsu. | ||
14 | * mrst is based on Maxim3110 spi-uart device, it exists in both | ||
15 | * Moorestown and Medfield platforms, while hsu is based on a High | ||
16 | * Speed UART device which only exists in the Medfield platform | ||
17 | */ | ||
18 | |||
19 | #include <linux/serial_reg.h> | ||
20 | #include <linux/serial_mfd.h> | ||
21 | #include <linux/kmsg_dump.h> | ||
22 | #include <linux/console.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/delay.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/io.h> | ||
27 | |||
28 | #include <asm/fixmap.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | #include <asm/mrst.h> | ||
31 | |||
32 | #define MRST_SPI_TIMEOUT 0x200000 | ||
33 | #define MRST_REGBASE_SPI0 0xff128000 | ||
34 | #define MRST_REGBASE_SPI1 0xff128400 | ||
35 | #define MRST_CLK_SPI0_REG 0xff11d86c | ||
36 | |||
37 | /* Bit fields in CTRLR0 */ | ||
38 | #define SPI_DFS_OFFSET 0 | ||
39 | |||
40 | #define SPI_FRF_OFFSET 4 | ||
41 | #define SPI_FRF_SPI 0x0 | ||
42 | #define SPI_FRF_SSP 0x1 | ||
43 | #define SPI_FRF_MICROWIRE 0x2 | ||
44 | #define SPI_FRF_RESV 0x3 | ||
45 | |||
46 | #define SPI_MODE_OFFSET 6 | ||
47 | #define SPI_SCPH_OFFSET 6 | ||
48 | #define SPI_SCOL_OFFSET 7 | ||
49 | #define SPI_TMOD_OFFSET 8 | ||
50 | #define SPI_TMOD_TR 0x0 /* xmit & recv */ | ||
51 | #define SPI_TMOD_TO 0x1 /* xmit only */ | ||
52 | #define SPI_TMOD_RO 0x2 /* recv only */ | ||
53 | #define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ | ||
54 | |||
55 | #define SPI_SLVOE_OFFSET 10 | ||
56 | #define SPI_SRL_OFFSET 11 | ||
57 | #define SPI_CFS_OFFSET 12 | ||
58 | |||
59 | /* Bit fields in SR, 7 bits */ | ||
60 | #define SR_MASK 0x7f /* cover 7 bits */ | ||
61 | #define SR_BUSY (1 << 0) | ||
62 | #define SR_TF_NOT_FULL (1 << 1) | ||
63 | #define SR_TF_EMPT (1 << 2) | ||
64 | #define SR_RF_NOT_EMPT (1 << 3) | ||
65 | #define SR_RF_FULL (1 << 4) | ||
66 | #define SR_TX_ERR (1 << 5) | ||
67 | #define SR_DCOL (1 << 6) | ||
68 | |||
69 | struct dw_spi_reg { | ||
70 | u32 ctrl0; | ||
71 | u32 ctrl1; | ||
72 | u32 ssienr; | ||
73 | u32 mwcr; | ||
74 | u32 ser; | ||
75 | u32 baudr; | ||
76 | u32 txfltr; | ||
77 | u32 rxfltr; | ||
78 | u32 txflr; | ||
79 | u32 rxflr; | ||
80 | u32 sr; | ||
81 | u32 imr; | ||
82 | u32 isr; | ||
83 | u32 risr; | ||
84 | u32 txoicr; | ||
85 | u32 rxoicr; | ||
86 | u32 rxuicr; | ||
87 | u32 msticr; | ||
88 | u32 icr; | ||
89 | u32 dmacr; | ||
90 | u32 dmatdlr; | ||
91 | u32 dmardlr; | ||
92 | u32 idr; | ||
93 | u32 version; | ||
94 | |||
95 | /* Currently operates as 32 bits, though only the low 16 bits matter */ | ||
96 | u32 dr; | ||
97 | } __packed; | ||
98 | |||
99 | #define dw_readl(dw, name) __raw_readl(&(dw)->name) | ||
100 | #define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) | ||
101 | |||
102 | /* By default use the SPI0 registers for mrst; if Penwell is detected, use SPI1 */ | ||
103 | static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; | ||
104 | |||
105 | static u32 *pclk_spi0; | ||
106 | /* Always contains an accessible address, start with 0 */ | ||
107 | static struct dw_spi_reg *pspi; | ||
108 | |||
109 | static struct kmsg_dumper dw_dumper; | ||
110 | static int dumper_registered; | ||
111 | |||
112 | static void dw_kmsg_dump(struct kmsg_dumper *dumper, | ||
113 | enum kmsg_dump_reason reason, | ||
114 | const char *s1, unsigned long l1, | ||
115 | const char *s2, unsigned long l2) | ||
116 | { | ||
117 | int i; | ||
118 | |||
119 | /* When we get here, we'd better re-init the HW */ | ||
120 | mrst_early_console_init(); | ||
121 | |||
122 | for (i = 0; i < l1; i++) | ||
123 | early_mrst_console.write(&early_mrst_console, s1 + i, 1); | ||
124 | for (i = 0; i < l2; i++) | ||
125 | early_mrst_console.write(&early_mrst_console, s2 + i, 1); | ||
126 | } | ||
127 | |||
128 | /* Set the baud rate to 115200, 8n1, IRQ disabled */ | ||
129 | static void max3110_write_config(void) | ||
130 | { | ||
131 | u16 config; | ||
132 | |||
133 | config = 0xc001; | ||
134 | dw_writel(pspi, dr, config); | ||
135 | } | ||
136 | |||
137 | /* Translate char to an eligible word and send to max3110 */ | ||
138 | static void max3110_write_data(char c) | ||
139 | { | ||
140 | u16 data; | ||
141 | |||
142 | data = 0x8000 | c; | ||
143 | dw_writel(pspi, dr, data); | ||
144 | } | ||
145 | |||
146 | void mrst_early_console_init(void) | ||
147 | { | ||
148 | u32 ctrlr0 = 0; | ||
149 | u32 spi0_cdiv; | ||
150 | u32 freq; /* Frequency info only needs to be searched once */ | ||
151 | |||
152 | /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ | ||
153 | pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, | ||
154 | MRST_CLK_SPI0_REG); | ||
155 | spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; | ||
156 | freq = 100000000 / (spi0_cdiv + 1); | ||
157 | |||
158 | if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) | ||
159 | mrst_spi_paddr = MRST_REGBASE_SPI1; | ||
160 | |||
161 | pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, | ||
162 | mrst_spi_paddr); | ||
163 | |||
164 | /* Disable SPI controller */ | ||
165 | dw_writel(pspi, ssienr, 0); | ||
166 | |||
167 | /* Set control param, 8 bits, transmit only mode */ | ||
168 | ctrlr0 = dw_readl(pspi, ctrl0); | ||
169 | |||
170 | ctrlr0 &= 0xfcc0; | ||
171 | ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) | ||
172 | | (SPI_TMOD_TO << SPI_TMOD_OFFSET); | ||
173 | dw_writel(pspi, ctrl0, ctrlr0); | ||
174 | |||
175 | /* | ||
176 | * Change the spi0 clk to comply with 115200 bps, use 100000 to | ||
177 | * calculate the clk divider to make the clock a little slower | ||
178 | * than the real baud rate. | ||
179 | */ | ||
180 | dw_writel(pspi, baudr, freq/100000); | ||
181 | |||
182 | /* Disable all INT for early phase */ | ||
183 | dw_writel(pspi, imr, 0x0); | ||
184 | |||
185 | /* Set the cs to spi-uart */ | ||
186 | dw_writel(pspi, ser, 0x2); | ||
187 | |||
188 | /* Enable the HW, the last step for HW init */ | ||
189 | dw_writel(pspi, ssienr, 0x1); | ||
190 | |||
191 | /* Set the default configuration */ | ||
192 | max3110_write_config(); | ||
193 | |||
194 | /* Register the kmsg dumper */ | ||
195 | if (!dumper_registered) { | ||
196 | dw_dumper.dump = dw_kmsg_dump; | ||
197 | kmsg_dump_register(&dw_dumper); | ||
198 | dumper_registered = 1; | ||
199 | } | ||
200 | } | ||
201 | |||
202 | /* Slave select should be called in the read/write function */ | ||
203 | static void early_mrst_spi_putc(char c) | ||
204 | { | ||
205 | unsigned int timeout; | ||
206 | u32 sr; | ||
207 | |||
208 | timeout = MRST_SPI_TIMEOUT; | ||
209 | /* Early putc needs to make sure the TX FIFO is not full */ | ||
210 | while (--timeout) { | ||
211 | sr = dw_readl(pspi, sr); | ||
212 | if (!(sr & SR_TF_NOT_FULL)) | ||
213 | cpu_relax(); | ||
214 | else | ||
215 | break; | ||
216 | } | ||
217 | |||
218 | if (!timeout) | ||
219 | pr_warning("MRST earlycon: timed out\n"); | ||
220 | else | ||
221 | max3110_write_data(c); | ||
222 | } | ||
223 | |||
224 | /* Early SPI only uses polling mode */ | ||
225 | static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) | ||
226 | { | ||
227 | int i; | ||
228 | |||
229 | for (i = 0; i < n && *str; i++) { | ||
230 | if (*str == '\n') | ||
231 | early_mrst_spi_putc('\r'); | ||
232 | early_mrst_spi_putc(*str); | ||
233 | str++; | ||
234 | } | ||
235 | } | ||
236 | |||
237 | struct console early_mrst_console = { | ||
238 | .name = "earlymrst", | ||
239 | .write = early_mrst_spi_write, | ||
240 | .flags = CON_PRINTBUFFER, | ||
241 | .index = -1, | ||
242 | }; | ||
243 | |||
244 | /* | ||
245 | * Following is the early console based on Medfield HSU (High | ||
246 | * Speed UART) device. | ||
247 | */ | ||
248 | #define HSU_PORT2_PADDR 0xffa28180 | ||
249 | |||
250 | static void __iomem *phsu; | ||
251 | |||
252 | void hsu_early_console_init(void) | ||
253 | { | ||
254 | u8 lcr; | ||
255 | |||
256 | phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, | ||
257 | HSU_PORT2_PADDR); | ||
258 | |||
259 | /* Disable FIFO */ | ||
260 | writeb(0x0, phsu + UART_FCR); | ||
261 | |||
262 | /* Set to default 115200 bps, 8n1 */ | ||
263 | lcr = readb(phsu + UART_LCR); | ||
264 | writeb((0x80 | lcr), phsu + UART_LCR); | ||
265 | writeb(0x18, phsu + UART_DLL); | ||
266 | writeb(lcr, phsu + UART_LCR); | ||
267 | writel(0x3600, phsu + UART_MUL*4); | ||
268 | |||
269 | writeb(0x8, phsu + UART_MCR); | ||
270 | writeb(0x7, phsu + UART_FCR); | ||
271 | writeb(0x3, phsu + UART_LCR); | ||
272 | |||
273 | /* Clear IRQ status */ | ||
274 | readb(phsu + UART_LSR); | ||
275 | readb(phsu + UART_RX); | ||
276 | readb(phsu + UART_IIR); | ||
277 | readb(phsu + UART_MSR); | ||
278 | |||
279 | /* Enable FIFO */ | ||
280 | writeb(0x7, phsu + UART_FCR); | ||
281 | } | ||
282 | |||
283 | #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) | ||
284 | |||
285 | static void early_hsu_putc(char ch) | ||
286 | { | ||
287 | unsigned int timeout = 10000; /* 10ms */ | ||
288 | u8 status; | ||
289 | |||
290 | while (--timeout) { | ||
291 | status = readb(phsu + UART_LSR); | ||
292 | if (status & BOTH_EMPTY) | ||
293 | break; | ||
294 | udelay(1); | ||
295 | } | ||
296 | |||
297 | /* Only write the char when there was no timeout */ | ||
298 | if (timeout) | ||
299 | writeb(ch, phsu + UART_TX); | ||
300 | } | ||
301 | |||
302 | static void early_hsu_write(struct console *con, const char *str, unsigned n) | ||
303 | { | ||
304 | int i; | ||
305 | |||
306 | for (i = 0; i < n && *str; i++) { | ||
307 | if (*str == '\n') | ||
308 | early_hsu_putc('\r'); | ||
309 | early_hsu_putc(*str); | ||
310 | str++; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | struct console early_hsu_console = { | ||
315 | .name = "earlyhsu", | ||
316 | .write = early_hsu_write, | ||
317 | .flags = CON_PRINTBUFFER, | ||
318 | .index = -1, | ||
319 | }; | ||
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 227d00920d2f..59e175e89599 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -115,8 +115,7 @@ | |||
115 | 115 | ||
116 | /* unfortunately push/pop can't be no-op */ | 116 | /* unfortunately push/pop can't be no-op */ |
117 | .macro PUSH_GS | 117 | .macro PUSH_GS |
118 | pushl $0 | 118 | pushl_cfi $0 |
119 | CFI_ADJUST_CFA_OFFSET 4 | ||
120 | .endm | 119 | .endm |
121 | .macro POP_GS pop=0 | 120 | .macro POP_GS pop=0 |
122 | addl $(4 + \pop), %esp | 121 | addl $(4 + \pop), %esp |
@@ -140,14 +139,12 @@ | |||
140 | #else /* CONFIG_X86_32_LAZY_GS */ | 139 | #else /* CONFIG_X86_32_LAZY_GS */ |
141 | 140 | ||
142 | .macro PUSH_GS | 141 | .macro PUSH_GS |
143 | pushl %gs | 142 | pushl_cfi %gs |
144 | CFI_ADJUST_CFA_OFFSET 4 | ||
145 | /*CFI_REL_OFFSET gs, 0*/ | 143 | /*CFI_REL_OFFSET gs, 0*/ |
146 | .endm | 144 | .endm |
147 | 145 | ||
148 | .macro POP_GS pop=0 | 146 | .macro POP_GS pop=0 |
149 | 98: popl %gs | 147 | 98: popl_cfi %gs |
150 | CFI_ADJUST_CFA_OFFSET -4 | ||
151 | /*CFI_RESTORE gs*/ | 148 | /*CFI_RESTORE gs*/ |
152 | .if \pop <> 0 | 149 | .if \pop <> 0 |
153 | add $\pop, %esp | 150 | add $\pop, %esp |
@@ -195,35 +192,25 @@ | |||
195 | .macro SAVE_ALL | 192 | .macro SAVE_ALL |
196 | cld | 193 | cld |
197 | PUSH_GS | 194 | PUSH_GS |
198 | pushl %fs | 195 | pushl_cfi %fs |
199 | CFI_ADJUST_CFA_OFFSET 4 | ||
200 | /*CFI_REL_OFFSET fs, 0;*/ | 196 | /*CFI_REL_OFFSET fs, 0;*/ |
201 | pushl %es | 197 | pushl_cfi %es |
202 | CFI_ADJUST_CFA_OFFSET 4 | ||
203 | /*CFI_REL_OFFSET es, 0;*/ | 198 | /*CFI_REL_OFFSET es, 0;*/ |
204 | pushl %ds | 199 | pushl_cfi %ds |
205 | CFI_ADJUST_CFA_OFFSET 4 | ||
206 | /*CFI_REL_OFFSET ds, 0;*/ | 200 | /*CFI_REL_OFFSET ds, 0;*/ |
207 | pushl %eax | 201 | pushl_cfi %eax |
208 | CFI_ADJUST_CFA_OFFSET 4 | ||
209 | CFI_REL_OFFSET eax, 0 | 202 | CFI_REL_OFFSET eax, 0 |
210 | pushl %ebp | 203 | pushl_cfi %ebp |
211 | CFI_ADJUST_CFA_OFFSET 4 | ||
212 | CFI_REL_OFFSET ebp, 0 | 204 | CFI_REL_OFFSET ebp, 0 |
213 | pushl %edi | 205 | pushl_cfi %edi |
214 | CFI_ADJUST_CFA_OFFSET 4 | ||
215 | CFI_REL_OFFSET edi, 0 | 206 | CFI_REL_OFFSET edi, 0 |
216 | pushl %esi | 207 | pushl_cfi %esi |
217 | CFI_ADJUST_CFA_OFFSET 4 | ||
218 | CFI_REL_OFFSET esi, 0 | 208 | CFI_REL_OFFSET esi, 0 |
219 | pushl %edx | 209 | pushl_cfi %edx |
220 | CFI_ADJUST_CFA_OFFSET 4 | ||
221 | CFI_REL_OFFSET edx, 0 | 210 | CFI_REL_OFFSET edx, 0 |
222 | pushl %ecx | 211 | pushl_cfi %ecx |
223 | CFI_ADJUST_CFA_OFFSET 4 | ||
224 | CFI_REL_OFFSET ecx, 0 | 212 | CFI_REL_OFFSET ecx, 0 |
225 | pushl %ebx | 213 | pushl_cfi %ebx |
226 | CFI_ADJUST_CFA_OFFSET 4 | ||
227 | CFI_REL_OFFSET ebx, 0 | 214 | CFI_REL_OFFSET ebx, 0 |
228 | movl $(__USER_DS), %edx | 215 | movl $(__USER_DS), %edx |
229 | movl %edx, %ds | 216 | movl %edx, %ds |
@@ -234,39 +221,29 @@ | |||
234 | .endm | 221 | .endm |
235 | 222 | ||
236 | .macro RESTORE_INT_REGS | 223 | .macro RESTORE_INT_REGS |
237 | popl %ebx | 224 | popl_cfi %ebx |
238 | CFI_ADJUST_CFA_OFFSET -4 | ||
239 | CFI_RESTORE ebx | 225 | CFI_RESTORE ebx |
240 | popl %ecx | 226 | popl_cfi %ecx |
241 | CFI_ADJUST_CFA_OFFSET -4 | ||
242 | CFI_RESTORE ecx | 227 | CFI_RESTORE ecx |
243 | popl %edx | 228 | popl_cfi %edx |
244 | CFI_ADJUST_CFA_OFFSET -4 | ||
245 | CFI_RESTORE edx | 229 | CFI_RESTORE edx |
246 | popl %esi | 230 | popl_cfi %esi |
247 | CFI_ADJUST_CFA_OFFSET -4 | ||
248 | CFI_RESTORE esi | 231 | CFI_RESTORE esi |
249 | popl %edi | 232 | popl_cfi %edi |
250 | CFI_ADJUST_CFA_OFFSET -4 | ||
251 | CFI_RESTORE edi | 233 | CFI_RESTORE edi |
252 | popl %ebp | 234 | popl_cfi %ebp |
253 | CFI_ADJUST_CFA_OFFSET -4 | ||
254 | CFI_RESTORE ebp | 235 | CFI_RESTORE ebp |
255 | popl %eax | 236 | popl_cfi %eax |
256 | CFI_ADJUST_CFA_OFFSET -4 | ||
257 | CFI_RESTORE eax | 237 | CFI_RESTORE eax |
258 | .endm | 238 | .endm |
259 | 239 | ||
260 | .macro RESTORE_REGS pop=0 | 240 | .macro RESTORE_REGS pop=0 |
261 | RESTORE_INT_REGS | 241 | RESTORE_INT_REGS |
262 | 1: popl %ds | 242 | 1: popl_cfi %ds |
263 | CFI_ADJUST_CFA_OFFSET -4 | ||
264 | /*CFI_RESTORE ds;*/ | 243 | /*CFI_RESTORE ds;*/ |
265 | 2: popl %es | 244 | 2: popl_cfi %es |
266 | CFI_ADJUST_CFA_OFFSET -4 | ||
267 | /*CFI_RESTORE es;*/ | 245 | /*CFI_RESTORE es;*/ |
268 | 3: popl %fs | 246 | 3: popl_cfi %fs |
269 | CFI_ADJUST_CFA_OFFSET -4 | ||
270 | /*CFI_RESTORE fs;*/ | 247 | /*CFI_RESTORE fs;*/ |
271 | POP_GS \pop | 248 | POP_GS \pop |
272 | .pushsection .fixup, "ax" | 249 | .pushsection .fixup, "ax" |
@@ -320,16 +297,12 @@ | |||
320 | 297 | ||
321 | ENTRY(ret_from_fork) | 298 | ENTRY(ret_from_fork) |
322 | CFI_STARTPROC | 299 | CFI_STARTPROC |
323 | pushl %eax | 300 | pushl_cfi %eax |
324 | CFI_ADJUST_CFA_OFFSET 4 | ||
325 | call schedule_tail | 301 | call schedule_tail |
326 | GET_THREAD_INFO(%ebp) | 302 | GET_THREAD_INFO(%ebp) |
327 | popl %eax | 303 | popl_cfi %eax |
328 | CFI_ADJUST_CFA_OFFSET -4 | 304 | pushl_cfi $0x0202 # Reset kernel eflags |
329 | pushl $0x0202 # Reset kernel eflags | 305 | popfl_cfi |
330 | CFI_ADJUST_CFA_OFFSET 4 | ||
331 | popfl | ||
332 | CFI_ADJUST_CFA_OFFSET -4 | ||
333 | jmp syscall_exit | 306 | jmp syscall_exit |
334 | CFI_ENDPROC | 307 | CFI_ENDPROC |
335 | END(ret_from_fork) | 308 | END(ret_from_fork) |
@@ -409,29 +382,23 @@ sysenter_past_esp: | |||
409 | * enough kernel state to call TRACE_IRQS_OFF can be called - but | 382 | * enough kernel state to call TRACE_IRQS_OFF can be called - but |
410 | * we immediately enable interrupts at that point anyway. | 383 | * we immediately enable interrupts at that point anyway. |
411 | */ | 384 | */ |
412 | pushl $(__USER_DS) | 385 | pushl_cfi $__USER_DS |
413 | CFI_ADJUST_CFA_OFFSET 4 | ||
414 | /*CFI_REL_OFFSET ss, 0*/ | 386 | /*CFI_REL_OFFSET ss, 0*/ |
415 | pushl %ebp | 387 | pushl_cfi %ebp |
416 | CFI_ADJUST_CFA_OFFSET 4 | ||
417 | CFI_REL_OFFSET esp, 0 | 388 | CFI_REL_OFFSET esp, 0 |
418 | pushfl | 389 | pushfl_cfi |
419 | orl $X86_EFLAGS_IF, (%esp) | 390 | orl $X86_EFLAGS_IF, (%esp) |
420 | CFI_ADJUST_CFA_OFFSET 4 | 391 | pushl_cfi $__USER_CS |
421 | pushl $(__USER_CS) | ||
422 | CFI_ADJUST_CFA_OFFSET 4 | ||
423 | /*CFI_REL_OFFSET cs, 0*/ | 392 | /*CFI_REL_OFFSET cs, 0*/ |
424 | /* | 393 | /* |
425 | * Push current_thread_info()->sysenter_return to the stack. | 394 | * Push current_thread_info()->sysenter_return to the stack. |
426 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | 395 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words |
427 | * pushed above; +8 corresponds to copy_thread's esp0 setting. | 396 | * pushed above; +8 corresponds to copy_thread's esp0 setting. |
428 | */ | 397 | */ |
429 | pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) | 398 | pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) |
430 | CFI_ADJUST_CFA_OFFSET 4 | ||
431 | CFI_REL_OFFSET eip, 0 | 399 | CFI_REL_OFFSET eip, 0 |
432 | 400 | ||
433 | pushl %eax | 401 | pushl_cfi %eax |
434 | CFI_ADJUST_CFA_OFFSET 4 | ||
435 | SAVE_ALL | 402 | SAVE_ALL |
436 | ENABLE_INTERRUPTS(CLBR_NONE) | 403 | ENABLE_INTERRUPTS(CLBR_NONE) |
437 | 404 | ||
@@ -486,8 +453,7 @@ sysenter_audit: | |||
486 | movl %eax,%edx /* 2nd arg: syscall number */ | 453 | movl %eax,%edx /* 2nd arg: syscall number */ |
487 | movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ | 454 | movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ |
488 | call audit_syscall_entry | 455 | call audit_syscall_entry |
489 | pushl %ebx | 456 | pushl_cfi %ebx |
490 | CFI_ADJUST_CFA_OFFSET 4 | ||
491 | movl PT_EAX(%esp),%eax /* reload syscall number */ | 457 | movl PT_EAX(%esp),%eax /* reload syscall number */ |
492 | jmp sysenter_do_call | 458 | jmp sysenter_do_call |
493 | 459 | ||
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target) | |||
529 | # system call handler stub | 495 | # system call handler stub |
530 | ENTRY(system_call) | 496 | ENTRY(system_call) |
531 | RING0_INT_FRAME # can't unwind into user space anyway | 497 | RING0_INT_FRAME # can't unwind into user space anyway |
532 | pushl %eax # save orig_eax | 498 | pushl_cfi %eax # save orig_eax |
533 | CFI_ADJUST_CFA_OFFSET 4 | ||
534 | SAVE_ALL | 499 | SAVE_ALL |
535 | GET_THREAD_INFO(%ebp) | 500 | GET_THREAD_INFO(%ebp) |
536 | # system call tracing in operation / emulation | 501 | # system call tracing in operation / emulation |
@@ -566,7 +531,6 @@ restore_all_notrace: | |||
566 | je ldt_ss # returning to user-space with LDT SS | 531 | je ldt_ss # returning to user-space with LDT SS |
567 | restore_nocheck: | 532 | restore_nocheck: |
568 | RESTORE_REGS 4 # skip orig_eax/error_code | 533 | RESTORE_REGS 4 # skip orig_eax/error_code |
569 | CFI_ADJUST_CFA_OFFSET -4 | ||
570 | irq_return: | 534 | irq_return: |
571 | INTERRUPT_RETURN | 535 | INTERRUPT_RETURN |
572 | .section .fixup,"ax" | 536 | .section .fixup,"ax" |
@@ -619,10 +583,8 @@ ldt_ss: | |||
619 | shr $16, %edx | 583 | shr $16, %edx |
620 | mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ | 584 | mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ |
621 | mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ | 585 | mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ |
622 | pushl $__ESPFIX_SS | 586 | pushl_cfi $__ESPFIX_SS |
623 | CFI_ADJUST_CFA_OFFSET 4 | 587 | pushl_cfi %eax /* new kernel esp */ |
624 | push %eax /* new kernel esp */ | ||
625 | CFI_ADJUST_CFA_OFFSET 4 | ||
626 | /* Disable interrupts, but do not irqtrace this section: we | 588 | /* Disable interrupts, but do not irqtrace this section: we |
627 | * will soon execute iret and the tracer was already set to | 589 | * will soon execute iret and the tracer was already set to |
628 | * the irqstate after the iret */ | 590 | * the irqstate after the iret */ |
@@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and | |||
666 | 628 | ||
667 | ALIGN | 629 | ALIGN |
668 | work_notifysig_v86: | 630 | work_notifysig_v86: |
669 | pushl %ecx # save ti_flags for do_notify_resume | 631 | pushl_cfi %ecx # save ti_flags for do_notify_resume |
670 | CFI_ADJUST_CFA_OFFSET 4 | ||
671 | call save_v86_state # %eax contains pt_regs pointer | 632 | call save_v86_state # %eax contains pt_regs pointer |
672 | popl %ecx | 633 | popl_cfi %ecx |
673 | CFI_ADJUST_CFA_OFFSET -4 | ||
674 | movl %eax, %esp | 634 | movl %eax, %esp |
675 | #else | 635 | #else |
676 | movl %esp, %eax | 636 | movl %esp, %eax |
@@ -750,14 +710,18 @@ ptregs_##name: \ | |||
750 | #define PTREGSCALL3(name) \ | 710 | #define PTREGSCALL3(name) \ |
751 | ALIGN; \ | 711 | ALIGN; \ |
752 | ptregs_##name: \ | 712 | ptregs_##name: \ |
713 | CFI_STARTPROC; \ | ||
753 | leal 4(%esp),%eax; \ | 714 | leal 4(%esp),%eax; \ |
754 | pushl %eax; \ | 715 | pushl_cfi %eax; \ |
755 | movl PT_EDX(%eax),%ecx; \ | 716 | movl PT_EDX(%eax),%ecx; \ |
756 | movl PT_ECX(%eax),%edx; \ | 717 | movl PT_ECX(%eax),%edx; \ |
757 | movl PT_EBX(%eax),%eax; \ | 718 | movl PT_EBX(%eax),%eax; \ |
758 | call sys_##name; \ | 719 | call sys_##name; \ |
759 | addl $4,%esp; \ | 720 | addl $4,%esp; \ |
760 | ret | 721 | CFI_ADJUST_CFA_OFFSET -4; \ |
722 | ret; \ | ||
723 | CFI_ENDPROC; \ | ||
724 | ENDPROC(ptregs_##name) | ||
761 | 725 | ||
762 | PTREGSCALL1(iopl) | 726 | PTREGSCALL1(iopl) |
763 | PTREGSCALL0(fork) | 727 | PTREGSCALL0(fork) |
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old) | |||
772 | /* Clone is an oddball. The 4th arg is in %edi */ | 736 | /* Clone is an oddball. The 4th arg is in %edi */ |
773 | ALIGN; | 737 | ALIGN; |
774 | ptregs_clone: | 738 | ptregs_clone: |
739 | CFI_STARTPROC | ||
775 | leal 4(%esp),%eax | 740 | leal 4(%esp),%eax |
776 | pushl %eax | 741 | pushl_cfi %eax |
777 | pushl PT_EDI(%eax) | 742 | pushl_cfi PT_EDI(%eax) |
778 | movl PT_EDX(%eax),%ecx | 743 | movl PT_EDX(%eax),%ecx |
779 | movl PT_ECX(%eax),%edx | 744 | movl PT_ECX(%eax),%edx |
780 | movl PT_EBX(%eax),%eax | 745 | movl PT_EBX(%eax),%eax |
781 | call sys_clone | 746 | call sys_clone |
782 | addl $8,%esp | 747 | addl $8,%esp |
748 | CFI_ADJUST_CFA_OFFSET -8 | ||
783 | ret | 749 | ret |
750 | CFI_ENDPROC | ||
751 | ENDPROC(ptregs_clone) | ||
784 | 752 | ||
785 | .macro FIXUP_ESPFIX_STACK | 753 | .macro FIXUP_ESPFIX_STACK |
786 | /* | 754 | /* |
@@ -795,10 +763,8 @@ ptregs_clone: | |||
795 | mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ | 763 | mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ |
796 | shl $16, %eax | 764 | shl $16, %eax |
797 | addl %esp, %eax /* the adjusted stack pointer */ | 765 | addl %esp, %eax /* the adjusted stack pointer */ |
798 | pushl $__KERNEL_DS | 766 | pushl_cfi $__KERNEL_DS |
799 | CFI_ADJUST_CFA_OFFSET 4 | 767 | pushl_cfi %eax |
800 | pushl %eax | ||
801 | CFI_ADJUST_CFA_OFFSET 4 | ||
802 | lss (%esp), %esp /* switch to the normal stack segment */ | 768 | lss (%esp), %esp /* switch to the normal stack segment */ |
803 | CFI_ADJUST_CFA_OFFSET -8 | 769 | CFI_ADJUST_CFA_OFFSET -8 |
804 | .endm | 770 | .endm |
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR | |||
835 | .if vector <> FIRST_EXTERNAL_VECTOR | 801 | .if vector <> FIRST_EXTERNAL_VECTOR |
836 | CFI_ADJUST_CFA_OFFSET -4 | 802 | CFI_ADJUST_CFA_OFFSET -4 |
837 | .endif | 803 | .endif |
838 | 1: pushl $(~vector+0x80) /* Note: always in signed byte range */ | 804 | 1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
839 | CFI_ADJUST_CFA_OFFSET 4 | ||
840 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | 805 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 |
841 | jmp 2f | 806 | jmp 2f |
842 | .endif | 807 | .endif |
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt) | |||
876 | #define BUILD_INTERRUPT3(name, nr, fn) \ | 841 | #define BUILD_INTERRUPT3(name, nr, fn) \ |
877 | ENTRY(name) \ | 842 | ENTRY(name) \ |
878 | RING0_INT_FRAME; \ | 843 | RING0_INT_FRAME; \ |
879 | pushl $~(nr); \ | 844 | pushl_cfi $~(nr); \ |
880 | CFI_ADJUST_CFA_OFFSET 4; \ | ||
881 | SAVE_ALL; \ | 845 | SAVE_ALL; \ |
882 | TRACE_IRQS_OFF \ | 846 | TRACE_IRQS_OFF \ |
883 | movl %esp,%eax; \ | 847 | movl %esp,%eax; \ |
@@ -893,21 +857,18 @@ ENDPROC(name) | |||
893 | 857 | ||
894 | ENTRY(coprocessor_error) | 858 | ENTRY(coprocessor_error) |
895 | RING0_INT_FRAME | 859 | RING0_INT_FRAME |
896 | pushl $0 | 860 | pushl_cfi $0 |
897 | CFI_ADJUST_CFA_OFFSET 4 | 861 | pushl_cfi $do_coprocessor_error |
898 | pushl $do_coprocessor_error | ||
899 | CFI_ADJUST_CFA_OFFSET 4 | ||
900 | jmp error_code | 862 | jmp error_code |
901 | CFI_ENDPROC | 863 | CFI_ENDPROC |
902 | END(coprocessor_error) | 864 | END(coprocessor_error) |
903 | 865 | ||
904 | ENTRY(simd_coprocessor_error) | 866 | ENTRY(simd_coprocessor_error) |
905 | RING0_INT_FRAME | 867 | RING0_INT_FRAME |
906 | pushl $0 | 868 | pushl_cfi $0 |
907 | CFI_ADJUST_CFA_OFFSET 4 | ||
908 | #ifdef CONFIG_X86_INVD_BUG | 869 | #ifdef CONFIG_X86_INVD_BUG |
909 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ | 870 | /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ |
910 | 661: pushl $do_general_protection | 871 | 661: pushl_cfi $do_general_protection |
911 | 662: | 872 | 662: |
912 | .section .altinstructions,"a" | 873 | .section .altinstructions,"a" |
913 | .balign 4 | 874 | .balign 4 |
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error) | |||
922 | 664: | 883 | 664: |
923 | .previous | 884 | .previous |
924 | #else | 885 | #else |
925 | pushl $do_simd_coprocessor_error | 886 | pushl_cfi $do_simd_coprocessor_error |
926 | #endif | 887 | #endif |
927 | CFI_ADJUST_CFA_OFFSET 4 | ||
928 | jmp error_code | 888 | jmp error_code |
929 | CFI_ENDPROC | 889 | CFI_ENDPROC |
930 | END(simd_coprocessor_error) | 890 | END(simd_coprocessor_error) |
931 | 891 | ||
932 | ENTRY(device_not_available) | 892 | ENTRY(device_not_available) |
933 | RING0_INT_FRAME | 893 | RING0_INT_FRAME |
934 | pushl $-1 # mark this as an int | 894 | pushl_cfi $-1 # mark this as an int |
935 | CFI_ADJUST_CFA_OFFSET 4 | 895 | pushl_cfi $do_device_not_available |
936 | pushl $do_device_not_available | ||
937 | CFI_ADJUST_CFA_OFFSET 4 | ||
938 | jmp error_code | 896 | jmp error_code |
939 | CFI_ENDPROC | 897 | CFI_ENDPROC |
940 | END(device_not_available) | 898 | END(device_not_available) |
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit) | |||
956 | 914 | ||
957 | ENTRY(overflow) | 915 | ENTRY(overflow) |
958 | RING0_INT_FRAME | 916 | RING0_INT_FRAME |
959 | pushl $0 | 917 | pushl_cfi $0 |
960 | CFI_ADJUST_CFA_OFFSET 4 | 918 | pushl_cfi $do_overflow |
961 | pushl $do_overflow | ||
962 | CFI_ADJUST_CFA_OFFSET 4 | ||
963 | jmp error_code | 919 | jmp error_code |
964 | CFI_ENDPROC | 920 | CFI_ENDPROC |
965 | END(overflow) | 921 | END(overflow) |
966 | 922 | ||
967 | ENTRY(bounds) | 923 | ENTRY(bounds) |
968 | RING0_INT_FRAME | 924 | RING0_INT_FRAME |
969 | pushl $0 | 925 | pushl_cfi $0 |
970 | CFI_ADJUST_CFA_OFFSET 4 | 926 | pushl_cfi $do_bounds |
971 | pushl $do_bounds | ||
972 | CFI_ADJUST_CFA_OFFSET 4 | ||
973 | jmp error_code | 927 | jmp error_code |
974 | CFI_ENDPROC | 928 | CFI_ENDPROC |
975 | END(bounds) | 929 | END(bounds) |
976 | 930 | ||
977 | ENTRY(invalid_op) | 931 | ENTRY(invalid_op) |
978 | RING0_INT_FRAME | 932 | RING0_INT_FRAME |
979 | pushl $0 | 933 | pushl_cfi $0 |
980 | CFI_ADJUST_CFA_OFFSET 4 | 934 | pushl_cfi $do_invalid_op |
981 | pushl $do_invalid_op | ||
982 | CFI_ADJUST_CFA_OFFSET 4 | ||
983 | jmp error_code | 935 | jmp error_code |
984 | CFI_ENDPROC | 936 | CFI_ENDPROC |
985 | END(invalid_op) | 937 | END(invalid_op) |
986 | 938 | ||
987 | ENTRY(coprocessor_segment_overrun) | 939 | ENTRY(coprocessor_segment_overrun) |
988 | RING0_INT_FRAME | 940 | RING0_INT_FRAME |
989 | pushl $0 | 941 | pushl_cfi $0 |
990 | CFI_ADJUST_CFA_OFFSET 4 | 942 | pushl_cfi $do_coprocessor_segment_overrun |
991 | pushl $do_coprocessor_segment_overrun | ||
992 | CFI_ADJUST_CFA_OFFSET 4 | ||
993 | jmp error_code | 943 | jmp error_code |
994 | CFI_ENDPROC | 944 | CFI_ENDPROC |
995 | END(coprocessor_segment_overrun) | 945 | END(coprocessor_segment_overrun) |
996 | 946 | ||
997 | ENTRY(invalid_TSS) | 947 | ENTRY(invalid_TSS) |
998 | RING0_EC_FRAME | 948 | RING0_EC_FRAME |
999 | pushl $do_invalid_TSS | 949 | pushl_cfi $do_invalid_TSS |
1000 | CFI_ADJUST_CFA_OFFSET 4 | ||
1001 | jmp error_code | 950 | jmp error_code |
1002 | CFI_ENDPROC | 951 | CFI_ENDPROC |
1003 | END(invalid_TSS) | 952 | END(invalid_TSS) |
1004 | 953 | ||
1005 | ENTRY(segment_not_present) | 954 | ENTRY(segment_not_present) |
1006 | RING0_EC_FRAME | 955 | RING0_EC_FRAME |
1007 | pushl $do_segment_not_present | 956 | pushl_cfi $do_segment_not_present |
1008 | CFI_ADJUST_CFA_OFFSET 4 | ||
1009 | jmp error_code | 957 | jmp error_code |
1010 | CFI_ENDPROC | 958 | CFI_ENDPROC |
1011 | END(segment_not_present) | 959 | END(segment_not_present) |
1012 | 960 | ||
1013 | ENTRY(stack_segment) | 961 | ENTRY(stack_segment) |
1014 | RING0_EC_FRAME | 962 | RING0_EC_FRAME |
1015 | pushl $do_stack_segment | 963 | pushl_cfi $do_stack_segment |
1016 | CFI_ADJUST_CFA_OFFSET 4 | ||
1017 | jmp error_code | 964 | jmp error_code |
1018 | CFI_ENDPROC | 965 | CFI_ENDPROC |
1019 | END(stack_segment) | 966 | END(stack_segment) |
1020 | 967 | ||
1021 | ENTRY(alignment_check) | 968 | ENTRY(alignment_check) |
1022 | RING0_EC_FRAME | 969 | RING0_EC_FRAME |
1023 | pushl $do_alignment_check | 970 | pushl_cfi $do_alignment_check |
1024 | CFI_ADJUST_CFA_OFFSET 4 | ||
1025 | jmp error_code | 971 | jmp error_code |
1026 | CFI_ENDPROC | 972 | CFI_ENDPROC |
1027 | END(alignment_check) | 973 | END(alignment_check) |
1028 | 974 | ||
1029 | ENTRY(divide_error) | 975 | ENTRY(divide_error) |
1030 | RING0_INT_FRAME | 976 | RING0_INT_FRAME |
1031 | pushl $0 # no error code | 977 | pushl_cfi $0 # no error code |
1032 | CFI_ADJUST_CFA_OFFSET 4 | 978 | pushl_cfi $do_divide_error |
1033 | pushl $do_divide_error | ||
1034 | CFI_ADJUST_CFA_OFFSET 4 | ||
1035 | jmp error_code | 979 | jmp error_code |
1036 | CFI_ENDPROC | 980 | CFI_ENDPROC |
1037 | END(divide_error) | 981 | END(divide_error) |
@@ -1039,10 +983,8 @@ END(divide_error) | |||
1039 | #ifdef CONFIG_X86_MCE | 983 | #ifdef CONFIG_X86_MCE |
1040 | ENTRY(machine_check) | 984 | ENTRY(machine_check) |
1041 | RING0_INT_FRAME | 985 | RING0_INT_FRAME |
1042 | pushl $0 | 986 | pushl_cfi $0 |
1043 | CFI_ADJUST_CFA_OFFSET 4 | 987 | pushl_cfi machine_check_vector |
1044 | pushl machine_check_vector | ||
1045 | CFI_ADJUST_CFA_OFFSET 4 | ||
1046 | jmp error_code | 988 | jmp error_code |
1047 | CFI_ENDPROC | 989 | CFI_ENDPROC |
1048 | END(machine_check) | 990 | END(machine_check) |
@@ -1050,10 +992,8 @@ END(machine_check) | |||
1050 | 992 | ||
1051 | ENTRY(spurious_interrupt_bug) | 993 | ENTRY(spurious_interrupt_bug) |
1052 | RING0_INT_FRAME | 994 | RING0_INT_FRAME |
1053 | pushl $0 | 995 | pushl_cfi $0 |
1054 | CFI_ADJUST_CFA_OFFSET 4 | 996 | pushl_cfi $do_spurious_interrupt_bug |
1055 | pushl $do_spurious_interrupt_bug | ||
1056 | CFI_ADJUST_CFA_OFFSET 4 | ||
1057 | jmp error_code | 997 | jmp error_code |
1058 | CFI_ENDPROC | 998 | CFI_ENDPROC |
1059 | END(spurious_interrupt_bug) | 999 | END(spurious_interrupt_bug) |
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target) | |||
1084 | 1024 | ||
1085 | ENTRY(xen_hypervisor_callback) | 1025 | ENTRY(xen_hypervisor_callback) |
1086 | CFI_STARTPROC | 1026 | CFI_STARTPROC |
1087 | pushl $0 | 1027 | pushl_cfi $0 |
1088 | CFI_ADJUST_CFA_OFFSET 4 | ||
1089 | SAVE_ALL | 1028 | SAVE_ALL |
1090 | TRACE_IRQS_OFF | 1029 | TRACE_IRQS_OFF |
1091 | 1030 | ||
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback) | |||
1121 | # We distinguish between categories by maintaining a status value in EAX. | 1060 | # We distinguish between categories by maintaining a status value in EAX. |
1122 | ENTRY(xen_failsafe_callback) | 1061 | ENTRY(xen_failsafe_callback) |
1123 | CFI_STARTPROC | 1062 | CFI_STARTPROC |
1124 | pushl %eax | 1063 | pushl_cfi %eax |
1125 | CFI_ADJUST_CFA_OFFSET 4 | ||
1126 | movl $1,%eax | 1064 | movl $1,%eax |
1127 | 1: mov 4(%esp),%ds | 1065 | 1: mov 4(%esp),%ds |
1128 | 2: mov 8(%esp),%es | 1066 | 2: mov 8(%esp),%es |
1129 | 3: mov 12(%esp),%fs | 1067 | 3: mov 12(%esp),%fs |
1130 | 4: mov 16(%esp),%gs | 1068 | 4: mov 16(%esp),%gs |
1131 | testl %eax,%eax | 1069 | testl %eax,%eax |
1132 | popl %eax | 1070 | popl_cfi %eax |
1133 | CFI_ADJUST_CFA_OFFSET -4 | ||
1134 | lea 16(%esp),%esp | 1071 | lea 16(%esp),%esp |
1135 | CFI_ADJUST_CFA_OFFSET -16 | 1072 | CFI_ADJUST_CFA_OFFSET -16 |
1136 | jz 5f | 1073 | jz 5f |
1137 | addl $16,%esp | 1074 | addl $16,%esp |
1138 | jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) | 1075 | jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) |
1139 | 5: pushl $0 # EAX == 0 => Category 1 (Bad segment) | 1076 | 5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) |
1140 | CFI_ADJUST_CFA_OFFSET 4 | ||
1141 | SAVE_ALL | 1077 | SAVE_ALL |
1142 | jmp ret_from_exception | 1078 | jmp ret_from_exception |
1143 | CFI_ENDPROC | 1079 | CFI_ENDPROC |
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table) | |||
1287 | 1223 | ||
1288 | ENTRY(page_fault) | 1224 | ENTRY(page_fault) |
1289 | RING0_EC_FRAME | 1225 | RING0_EC_FRAME |
1290 | pushl $do_page_fault | 1226 | pushl_cfi $do_page_fault |
1291 | CFI_ADJUST_CFA_OFFSET 4 | ||
1292 | ALIGN | 1227 | ALIGN |
1293 | error_code: | 1228 | error_code: |
1294 | /* the function address is in %gs's slot on the stack */ | 1229 | /* the function address is in %gs's slot on the stack */ |
1295 | pushl %fs | 1230 | pushl_cfi %fs |
1296 | CFI_ADJUST_CFA_OFFSET 4 | ||
1297 | /*CFI_REL_OFFSET fs, 0*/ | 1231 | /*CFI_REL_OFFSET fs, 0*/ |
1298 | pushl %es | 1232 | pushl_cfi %es |
1299 | CFI_ADJUST_CFA_OFFSET 4 | ||
1300 | /*CFI_REL_OFFSET es, 0*/ | 1233 | /*CFI_REL_OFFSET es, 0*/ |
1301 | pushl %ds | 1234 | pushl_cfi %ds |
1302 | CFI_ADJUST_CFA_OFFSET 4 | ||
1303 | /*CFI_REL_OFFSET ds, 0*/ | 1235 | /*CFI_REL_OFFSET ds, 0*/ |
1304 | pushl %eax | 1236 | pushl_cfi %eax |
1305 | CFI_ADJUST_CFA_OFFSET 4 | ||
1306 | CFI_REL_OFFSET eax, 0 | 1237 | CFI_REL_OFFSET eax, 0 |
1307 | pushl %ebp | 1238 | pushl_cfi %ebp |
1308 | CFI_ADJUST_CFA_OFFSET 4 | ||
1309 | CFI_REL_OFFSET ebp, 0 | 1239 | CFI_REL_OFFSET ebp, 0 |
1310 | pushl %edi | 1240 | pushl_cfi %edi |
1311 | CFI_ADJUST_CFA_OFFSET 4 | ||
1312 | CFI_REL_OFFSET edi, 0 | 1241 | CFI_REL_OFFSET edi, 0 |
1313 | pushl %esi | 1242 | pushl_cfi %esi |
1314 | CFI_ADJUST_CFA_OFFSET 4 | ||
1315 | CFI_REL_OFFSET esi, 0 | 1243 | CFI_REL_OFFSET esi, 0 |
1316 | pushl %edx | 1244 | pushl_cfi %edx |
1317 | CFI_ADJUST_CFA_OFFSET 4 | ||
1318 | CFI_REL_OFFSET edx, 0 | 1245 | CFI_REL_OFFSET edx, 0 |
1319 | pushl %ecx | 1246 | pushl_cfi %ecx |
1320 | CFI_ADJUST_CFA_OFFSET 4 | ||
1321 | CFI_REL_OFFSET ecx, 0 | 1247 | CFI_REL_OFFSET ecx, 0 |
1322 | pushl %ebx | 1248 | pushl_cfi %ebx |
1323 | CFI_ADJUST_CFA_OFFSET 4 | ||
1324 | CFI_REL_OFFSET ebx, 0 | 1249 | CFI_REL_OFFSET ebx, 0 |
1325 | cld | 1250 | cld |
1326 | movl $(__KERNEL_PERCPU), %ecx | 1251 | movl $(__KERNEL_PERCPU), %ecx |
@@ -1362,12 +1287,9 @@ END(page_fault) | |||
1362 | movl TSS_sysenter_sp0 + \offset(%esp), %esp | 1287 | movl TSS_sysenter_sp0 + \offset(%esp), %esp |
1363 | CFI_DEF_CFA esp, 0 | 1288 | CFI_DEF_CFA esp, 0 |
1364 | CFI_UNDEFINED eip | 1289 | CFI_UNDEFINED eip |
1365 | pushfl | 1290 | pushfl_cfi |
1366 | CFI_ADJUST_CFA_OFFSET 4 | 1291 | pushl_cfi $__KERNEL_CS |
1367 | pushl $__KERNEL_CS | 1292 | pushl_cfi $sysenter_past_esp |
1368 | CFI_ADJUST_CFA_OFFSET 4 | ||
1369 | pushl $sysenter_past_esp | ||
1370 | CFI_ADJUST_CFA_OFFSET 4 | ||
1371 | CFI_REL_OFFSET eip, 0 | 1293 | CFI_REL_OFFSET eip, 0 |
1372 | .endm | 1294 | .endm |
1373 | 1295 | ||
@@ -1377,8 +1299,7 @@ ENTRY(debug) | |||
1377 | jne debug_stack_correct | 1299 | jne debug_stack_correct |
1378 | FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn | 1300 | FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn |
1379 | debug_stack_correct: | 1301 | debug_stack_correct: |
1380 | pushl $-1 # mark this as an int | 1302 | pushl_cfi $-1 # mark this as an int |
1381 | CFI_ADJUST_CFA_OFFSET 4 | ||
1382 | SAVE_ALL | 1303 | SAVE_ALL |
1383 | TRACE_IRQS_OFF | 1304 | TRACE_IRQS_OFF |
1384 | xorl %edx,%edx # error code 0 | 1305 | xorl %edx,%edx # error code 0 |
@@ -1398,32 +1319,27 @@ END(debug) | |||
1398 | */ | 1319 | */ |
1399 | ENTRY(nmi) | 1320 | ENTRY(nmi) |
1400 | RING0_INT_FRAME | 1321 | RING0_INT_FRAME |
1401 | pushl %eax | 1322 | pushl_cfi %eax |
1402 | CFI_ADJUST_CFA_OFFSET 4 | ||
1403 | movl %ss, %eax | 1323 | movl %ss, %eax |
1404 | cmpw $__ESPFIX_SS, %ax | 1324 | cmpw $__ESPFIX_SS, %ax |
1405 | popl %eax | 1325 | popl_cfi %eax |
1406 | CFI_ADJUST_CFA_OFFSET -4 | ||
1407 | je nmi_espfix_stack | 1326 | je nmi_espfix_stack |
1408 | cmpl $ia32_sysenter_target,(%esp) | 1327 | cmpl $ia32_sysenter_target,(%esp) |
1409 | je nmi_stack_fixup | 1328 | je nmi_stack_fixup |
1410 | pushl %eax | 1329 | pushl_cfi %eax |
1411 | CFI_ADJUST_CFA_OFFSET 4 | ||
1412 | movl %esp,%eax | 1330 | movl %esp,%eax |
1413 | /* Do not access memory above the end of our stack page, | 1331 | /* Do not access memory above the end of our stack page, |
1414 | * it might not exist. | 1332 | * it might not exist. |
1415 | */ | 1333 | */ |
1416 | andl $(THREAD_SIZE-1),%eax | 1334 | andl $(THREAD_SIZE-1),%eax |
1417 | cmpl $(THREAD_SIZE-20),%eax | 1335 | cmpl $(THREAD_SIZE-20),%eax |
1418 | popl %eax | 1336 | popl_cfi %eax |
1419 | CFI_ADJUST_CFA_OFFSET -4 | ||
1420 | jae nmi_stack_correct | 1337 | jae nmi_stack_correct |
1421 | cmpl $ia32_sysenter_target,12(%esp) | 1338 | cmpl $ia32_sysenter_target,12(%esp) |
1422 | je nmi_debug_stack_check | 1339 | je nmi_debug_stack_check |
1423 | nmi_stack_correct: | 1340 | nmi_stack_correct: |
1424 | /* We have a RING0_INT_FRAME here */ | 1341 | /* We have a RING0_INT_FRAME here */ |
1425 | pushl %eax | 1342 | pushl_cfi %eax |
1426 | CFI_ADJUST_CFA_OFFSET 4 | ||
1427 | SAVE_ALL | 1343 | SAVE_ALL |
1428 | xorl %edx,%edx # zero error code | 1344 | xorl %edx,%edx # zero error code |
1429 | movl %esp,%eax # pt_regs pointer | 1345 | movl %esp,%eax # pt_regs pointer |
@@ -1452,18 +1368,14 @@ nmi_espfix_stack: | |||
1452 | * | 1368 | * |
1453 | * create the pointer to lss back | 1369 | * create the pointer to lss back |
1454 | */ | 1370 | */ |
1455 | pushl %ss | 1371 | pushl_cfi %ss |
1456 | CFI_ADJUST_CFA_OFFSET 4 | 1372 | pushl_cfi %esp |
1457 | pushl %esp | ||
1458 | CFI_ADJUST_CFA_OFFSET 4 | ||
1459 | addl $4, (%esp) | 1373 | addl $4, (%esp) |
1460 | /* copy the iret frame of 12 bytes */ | 1374 | /* copy the iret frame of 12 bytes */ |
1461 | .rept 3 | 1375 | .rept 3 |
1462 | pushl 16(%esp) | 1376 | pushl_cfi 16(%esp) |
1463 | CFI_ADJUST_CFA_OFFSET 4 | ||
1464 | .endr | 1377 | .endr |
1465 | pushl %eax | 1378 | pushl_cfi %eax |
1466 | CFI_ADJUST_CFA_OFFSET 4 | ||
1467 | SAVE_ALL | 1379 | SAVE_ALL |
1468 | FIXUP_ESPFIX_STACK # %eax == %esp | 1380 | FIXUP_ESPFIX_STACK # %eax == %esp |
1469 | xorl %edx,%edx # zero error code | 1381 | xorl %edx,%edx # zero error code |
@@ -1477,8 +1389,7 @@ END(nmi) | |||
1477 | 1389 | ||
1478 | ENTRY(int3) | 1390 | ENTRY(int3) |
1479 | RING0_INT_FRAME | 1391 | RING0_INT_FRAME |
1480 | pushl $-1 # mark this as an int | 1392 | pushl_cfi $-1 # mark this as an int |
1481 | CFI_ADJUST_CFA_OFFSET 4 | ||
1482 | SAVE_ALL | 1393 | SAVE_ALL |
1483 | TRACE_IRQS_OFF | 1394 | TRACE_IRQS_OFF |
1484 | xorl %edx,%edx # zero error code | 1395 | xorl %edx,%edx # zero error code |
@@ -1490,8 +1401,7 @@ END(int3) | |||
1490 | 1401 | ||
1491 | ENTRY(general_protection) | 1402 | ENTRY(general_protection) |
1492 | RING0_EC_FRAME | 1403 | RING0_EC_FRAME |
1493 | pushl $do_general_protection | 1404 | pushl_cfi $do_general_protection |
1494 | CFI_ADJUST_CFA_OFFSET 4 | ||
1495 | jmp error_code | 1405 | jmp error_code |
1496 | CFI_ENDPROC | 1406 | CFI_ENDPROC |
1497 | END(general_protection) | 1407 | END(general_protection) |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbba..fe2690d71c0c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64) | |||
213 | .macro FAKE_STACK_FRAME child_rip | 213 | .macro FAKE_STACK_FRAME child_rip |
214 | /* push in order ss, rsp, eflags, cs, rip */ | 214 | /* push in order ss, rsp, eflags, cs, rip */ |
215 | xorl %eax, %eax | 215 | xorl %eax, %eax |
216 | pushq $__KERNEL_DS /* ss */ | 216 | pushq_cfi $__KERNEL_DS /* ss */ |
217 | CFI_ADJUST_CFA_OFFSET 8 | ||
218 | /*CFI_REL_OFFSET ss,0*/ | 217 | /*CFI_REL_OFFSET ss,0*/ |
219 | pushq %rax /* rsp */ | 218 | pushq_cfi %rax /* rsp */ |
220 | CFI_ADJUST_CFA_OFFSET 8 | ||
221 | CFI_REL_OFFSET rsp,0 | 219 | CFI_REL_OFFSET rsp,0 |
222 | pushq $X86_EFLAGS_IF /* eflags - interrupts on */ | 220 | pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ |
223 | CFI_ADJUST_CFA_OFFSET 8 | ||
224 | /*CFI_REL_OFFSET rflags,0*/ | 221 | /*CFI_REL_OFFSET rflags,0*/ |
225 | pushq $__KERNEL_CS /* cs */ | 222 | pushq_cfi $__KERNEL_CS /* cs */ |
226 | CFI_ADJUST_CFA_OFFSET 8 | ||
227 | /*CFI_REL_OFFSET cs,0*/ | 223 | /*CFI_REL_OFFSET cs,0*/ |
228 | pushq \child_rip /* rip */ | 224 | pushq_cfi \child_rip /* rip */ |
229 | CFI_ADJUST_CFA_OFFSET 8 | ||
230 | CFI_REL_OFFSET rip,0 | 225 | CFI_REL_OFFSET rip,0 |
231 | pushq %rax /* orig rax */ | 226 | pushq_cfi %rax /* orig rax */ |
232 | CFI_ADJUST_CFA_OFFSET 8 | ||
233 | .endm | 227 | .endm |
234 | 228 | ||
235 | .macro UNFAKE_STACK_FRAME | 229 | .macro UNFAKE_STACK_FRAME |
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork) | |||
398 | 392 | ||
399 | LOCK ; btr $TIF_FORK,TI_flags(%r8) | 393 | LOCK ; btr $TIF_FORK,TI_flags(%r8) |
400 | 394 | ||
401 | push kernel_eflags(%rip) | 395 | pushq_cfi kernel_eflags(%rip) |
402 | CFI_ADJUST_CFA_OFFSET 8 | 396 | popfq_cfi # reset kernel eflags |
403 | popf # reset kernel eflags | ||
404 | CFI_ADJUST_CFA_OFFSET -8 | ||
405 | 397 | ||
406 | call schedule_tail # rdi: 'prev' task parameter | 398 | call schedule_tail # rdi: 'prev' task parameter |
407 | 399 | ||
@@ -521,11 +513,9 @@ sysret_careful: | |||
521 | jnc sysret_signal | 513 | jnc sysret_signal |
522 | TRACE_IRQS_ON | 514 | TRACE_IRQS_ON |
523 | ENABLE_INTERRUPTS(CLBR_NONE) | 515 | ENABLE_INTERRUPTS(CLBR_NONE) |
524 | pushq %rdi | 516 | pushq_cfi %rdi |
525 | CFI_ADJUST_CFA_OFFSET 8 | ||
526 | call schedule | 517 | call schedule |
527 | popq %rdi | 518 | popq_cfi %rdi |
528 | CFI_ADJUST_CFA_OFFSET -8 | ||
529 | jmp sysret_check | 519 | jmp sysret_check |
530 | 520 | ||
531 | /* Handle a signal */ | 521 | /* Handle a signal */ |
@@ -634,11 +624,9 @@ int_careful: | |||
634 | jnc int_very_careful | 624 | jnc int_very_careful |
635 | TRACE_IRQS_ON | 625 | TRACE_IRQS_ON |
636 | ENABLE_INTERRUPTS(CLBR_NONE) | 626 | ENABLE_INTERRUPTS(CLBR_NONE) |
637 | pushq %rdi | 627 | pushq_cfi %rdi |
638 | CFI_ADJUST_CFA_OFFSET 8 | ||
639 | call schedule | 628 | call schedule |
640 | popq %rdi | 629 | popq_cfi %rdi |
641 | CFI_ADJUST_CFA_OFFSET -8 | ||
642 | DISABLE_INTERRUPTS(CLBR_NONE) | 630 | DISABLE_INTERRUPTS(CLBR_NONE) |
643 | TRACE_IRQS_OFF | 631 | TRACE_IRQS_OFF |
644 | jmp int_with_check | 632 | jmp int_with_check |
@@ -652,12 +640,10 @@ int_check_syscall_exit_work: | |||
652 | /* Check for syscall exit trace */ | 640 | /* Check for syscall exit trace */ |
653 | testl $_TIF_WORK_SYSCALL_EXIT,%edx | 641 | testl $_TIF_WORK_SYSCALL_EXIT,%edx |
654 | jz int_signal | 642 | jz int_signal |
655 | pushq %rdi | 643 | pushq_cfi %rdi |
656 | CFI_ADJUST_CFA_OFFSET 8 | ||
657 | leaq 8(%rsp),%rdi # &ptregs -> arg1 | 644 | leaq 8(%rsp),%rdi # &ptregs -> arg1 |
658 | call syscall_trace_leave | 645 | call syscall_trace_leave |
659 | popq %rdi | 646 | popq_cfi %rdi |
660 | CFI_ADJUST_CFA_OFFSET -8 | ||
661 | andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi | 647 | andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi |
662 | jmp int_restore_rest | 648 | jmp int_restore_rest |
663 | 649 | ||
@@ -714,9 +700,8 @@ END(ptregscall_common) | |||
714 | 700 | ||
715 | ENTRY(stub_execve) | 701 | ENTRY(stub_execve) |
716 | CFI_STARTPROC | 702 | CFI_STARTPROC |
717 | popq %r11 | 703 | addq $8, %rsp |
718 | CFI_ADJUST_CFA_OFFSET -8 | 704 | PARTIAL_FRAME 0 |
719 | CFI_REGISTER rip, r11 | ||
720 | SAVE_REST | 705 | SAVE_REST |
721 | FIXUP_TOP_OF_STACK %r11 | 706 | FIXUP_TOP_OF_STACK %r11 |
722 | movq %rsp, %rcx | 707 | movq %rsp, %rcx |
@@ -735,7 +720,7 @@ END(stub_execve) | |||
735 | ENTRY(stub_rt_sigreturn) | 720 | ENTRY(stub_rt_sigreturn) |
736 | CFI_STARTPROC | 721 | CFI_STARTPROC |
737 | addq $8, %rsp | 722 | addq $8, %rsp |
738 | CFI_ADJUST_CFA_OFFSET -8 | 723 | PARTIAL_FRAME 0 |
739 | SAVE_REST | 724 | SAVE_REST |
740 | movq %rsp,%rdi | 725 | movq %rsp,%rdi |
741 | FIXUP_TOP_OF_STACK %r11 | 726 | FIXUP_TOP_OF_STACK %r11 |
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR | |||
766 | .if vector <> FIRST_EXTERNAL_VECTOR | 751 | .if vector <> FIRST_EXTERNAL_VECTOR |
767 | CFI_ADJUST_CFA_OFFSET -8 | 752 | CFI_ADJUST_CFA_OFFSET -8 |
768 | .endif | 753 | .endif |
769 | 1: pushq $(~vector+0x80) /* Note: always in signed byte range */ | 754 | 1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ |
770 | CFI_ADJUST_CFA_OFFSET 8 | ||
771 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 | 755 | .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 |
772 | jmp 2f | 756 | jmp 2f |
773 | .endif | 757 | .endif |
@@ -796,8 +780,8 @@ END(interrupt) | |||
796 | 780 | ||
797 | /* 0(%rsp): ~(interrupt number) */ | 781 | /* 0(%rsp): ~(interrupt number) */ |
798 | .macro interrupt func | 782 | .macro interrupt func |
799 | subq $10*8, %rsp | 783 | subq $ORIG_RAX-ARGOFFSET+8, %rsp |
800 | CFI_ADJUST_CFA_OFFSET 10*8 | 784 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 |
801 | call save_args | 785 | call save_args |
802 | PARTIAL_FRAME 0 | 786 | PARTIAL_FRAME 0 |
803 | call \func | 787 | call \func |
@@ -822,6 +806,7 @@ ret_from_intr: | |||
822 | TRACE_IRQS_OFF | 806 | TRACE_IRQS_OFF |
823 | decl PER_CPU_VAR(irq_count) | 807 | decl PER_CPU_VAR(irq_count) |
824 | leaveq | 808 | leaveq |
809 | CFI_RESTORE rbp | ||
825 | CFI_DEF_CFA_REGISTER rsp | 810 | CFI_DEF_CFA_REGISTER rsp |
826 | CFI_ADJUST_CFA_OFFSET -8 | 811 | CFI_ADJUST_CFA_OFFSET -8 |
827 | exit_intr: | 812 | exit_intr: |
@@ -903,11 +888,9 @@ retint_careful: | |||
903 | jnc retint_signal | 888 | jnc retint_signal |
904 | TRACE_IRQS_ON | 889 | TRACE_IRQS_ON |
905 | ENABLE_INTERRUPTS(CLBR_NONE) | 890 | ENABLE_INTERRUPTS(CLBR_NONE) |
906 | pushq %rdi | 891 | pushq_cfi %rdi |
907 | CFI_ADJUST_CFA_OFFSET 8 | ||
908 | call schedule | 892 | call schedule |
909 | popq %rdi | 893 | popq_cfi %rdi |
910 | CFI_ADJUST_CFA_OFFSET -8 | ||
911 | GET_THREAD_INFO(%rcx) | 894 | GET_THREAD_INFO(%rcx) |
912 | DISABLE_INTERRUPTS(CLBR_NONE) | 895 | DISABLE_INTERRUPTS(CLBR_NONE) |
913 | TRACE_IRQS_OFF | 896 | TRACE_IRQS_OFF |
@@ -956,8 +939,7 @@ END(common_interrupt) | |||
956 | .macro apicinterrupt num sym do_sym | 939 | .macro apicinterrupt num sym do_sym |
957 | ENTRY(\sym) | 940 | ENTRY(\sym) |
958 | INTR_FRAME | 941 | INTR_FRAME |
959 | pushq $~(\num) | 942 | pushq_cfi $~(\num) |
960 | CFI_ADJUST_CFA_OFFSET 8 | ||
961 | interrupt \do_sym | 943 | interrupt \do_sym |
962 | jmp ret_from_intr | 944 | jmp ret_from_intr |
963 | CFI_ENDPROC | 945 | CFI_ENDPROC |
@@ -981,22 +963,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ | |||
981 | x86_platform_ipi smp_x86_platform_ipi | 963 | x86_platform_ipi smp_x86_platform_ipi |
982 | 964 | ||
983 | #ifdef CONFIG_SMP | 965 | #ifdef CONFIG_SMP |
984 | apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ | 966 | .irpc idx, "01234567" |
985 | invalidate_interrupt0 smp_invalidate_interrupt | 967 | apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ |
986 | apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ | 968 | invalidate_interrupt\idx smp_invalidate_interrupt |
987 | invalidate_interrupt1 smp_invalidate_interrupt | 969 | .endr |
988 | apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ | ||
989 | invalidate_interrupt2 smp_invalidate_interrupt | ||
990 | apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ | ||
991 | invalidate_interrupt3 smp_invalidate_interrupt | ||
992 | apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ | ||
993 | invalidate_interrupt4 smp_invalidate_interrupt | ||
994 | apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ | ||
995 | invalidate_interrupt5 smp_invalidate_interrupt | ||
996 | apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ | ||
997 | invalidate_interrupt6 smp_invalidate_interrupt | ||
998 | apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ | ||
999 | invalidate_interrupt7 smp_invalidate_interrupt | ||
1000 | #endif | 970 | #endif |
1001 | 971 | ||
1002 | apicinterrupt THRESHOLD_APIC_VECTOR \ | 972 | apicinterrupt THRESHOLD_APIC_VECTOR \ |
@@ -1023,9 +993,9 @@ apicinterrupt ERROR_APIC_VECTOR \ | |||
1023 | apicinterrupt SPURIOUS_APIC_VECTOR \ | 993 | apicinterrupt SPURIOUS_APIC_VECTOR \ |
1024 | spurious_interrupt smp_spurious_interrupt | 994 | spurious_interrupt smp_spurious_interrupt |
1025 | 995 | ||
1026 | #ifdef CONFIG_PERF_EVENTS | 996 | #ifdef CONFIG_IRQ_WORK |
1027 | apicinterrupt LOCAL_PENDING_VECTOR \ | 997 | apicinterrupt IRQ_WORK_VECTOR \ |
1028 | perf_pending_interrupt smp_perf_pending_interrupt | 998 | irq_work_interrupt smp_irq_work_interrupt |
1029 | #endif | 999 | #endif |
1030 | 1000 | ||
1031 | /* | 1001 | /* |
@@ -1036,8 +1006,8 @@ ENTRY(\sym) | |||
1036 | INTR_FRAME | 1006 | INTR_FRAME |
1037 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1007 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1038 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | 1008 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1039 | subq $15*8,%rsp | 1009 | subq $ORIG_RAX-R15, %rsp |
1040 | CFI_ADJUST_CFA_OFFSET 15*8 | 1010 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1041 | call error_entry | 1011 | call error_entry |
1042 | DEFAULT_FRAME 0 | 1012 | DEFAULT_FRAME 0 |
1043 | movq %rsp,%rdi /* pt_regs pointer */ | 1013 | movq %rsp,%rdi /* pt_regs pointer */ |
@@ -1052,9 +1022,9 @@ END(\sym) | |||
1052 | ENTRY(\sym) | 1022 | ENTRY(\sym) |
1053 | INTR_FRAME | 1023 | INTR_FRAME |
1054 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1024 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1055 | pushq $-1 /* ORIG_RAX: no syscall to restart */ | 1025 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1056 | CFI_ADJUST_CFA_OFFSET 8 | 1026 | subq $ORIG_RAX-R15, %rsp |
1057 | subq $15*8, %rsp | 1027 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1058 | call save_paranoid | 1028 | call save_paranoid |
1059 | TRACE_IRQS_OFF | 1029 | TRACE_IRQS_OFF |
1060 | movq %rsp,%rdi /* pt_regs pointer */ | 1030 | movq %rsp,%rdi /* pt_regs pointer */ |
@@ -1070,9 +1040,9 @@ END(\sym) | |||
1070 | ENTRY(\sym) | 1040 | ENTRY(\sym) |
1071 | INTR_FRAME | 1041 | INTR_FRAME |
1072 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1042 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1073 | pushq $-1 /* ORIG_RAX: no syscall to restart */ | 1043 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ |
1074 | CFI_ADJUST_CFA_OFFSET 8 | 1044 | subq $ORIG_RAX-R15, %rsp |
1075 | subq $15*8, %rsp | 1045 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1076 | call save_paranoid | 1046 | call save_paranoid |
1077 | TRACE_IRQS_OFF | 1047 | TRACE_IRQS_OFF |
1078 | movq %rsp,%rdi /* pt_regs pointer */ | 1048 | movq %rsp,%rdi /* pt_regs pointer */ |
@@ -1089,8 +1059,8 @@ END(\sym) | |||
1089 | ENTRY(\sym) | 1059 | ENTRY(\sym) |
1090 | XCPT_FRAME | 1060 | XCPT_FRAME |
1091 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1061 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1092 | subq $15*8,%rsp | 1062 | subq $ORIG_RAX-R15, %rsp |
1093 | CFI_ADJUST_CFA_OFFSET 15*8 | 1063 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1094 | call error_entry | 1064 | call error_entry |
1095 | DEFAULT_FRAME 0 | 1065 | DEFAULT_FRAME 0 |
1096 | movq %rsp,%rdi /* pt_regs pointer */ | 1066 | movq %rsp,%rdi /* pt_regs pointer */ |
@@ -1107,8 +1077,8 @@ END(\sym) | |||
1107 | ENTRY(\sym) | 1077 | ENTRY(\sym) |
1108 | XCPT_FRAME | 1078 | XCPT_FRAME |
1109 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1079 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1110 | subq $15*8,%rsp | 1080 | subq $ORIG_RAX-R15, %rsp |
1111 | CFI_ADJUST_CFA_OFFSET 15*8 | 1081 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1112 | call save_paranoid | 1082 | call save_paranoid |
1113 | DEFAULT_FRAME 0 | 1083 | DEFAULT_FRAME 0 |
1114 | TRACE_IRQS_OFF | 1084 | TRACE_IRQS_OFF |
@@ -1139,16 +1109,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error | |||
1139 | /* edi: new selector */ | 1109 | /* edi: new selector */ |
1140 | ENTRY(native_load_gs_index) | 1110 | ENTRY(native_load_gs_index) |
1141 | CFI_STARTPROC | 1111 | CFI_STARTPROC |
1142 | pushf | 1112 | pushfq_cfi |
1143 | CFI_ADJUST_CFA_OFFSET 8 | ||
1144 | DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) | 1113 | DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) |
1145 | SWAPGS | 1114 | SWAPGS |
1146 | gs_change: | 1115 | gs_change: |
1147 | movl %edi,%gs | 1116 | movl %edi,%gs |
1148 | 2: mfence /* workaround */ | 1117 | 2: mfence /* workaround */ |
1149 | SWAPGS | 1118 | SWAPGS |
1150 | popf | 1119 | popfq_cfi |
1151 | CFI_ADJUST_CFA_OFFSET -8 | ||
1152 | ret | 1120 | ret |
1153 | CFI_ENDPROC | 1121 | CFI_ENDPROC |
1154 | END(native_load_gs_index) | 1122 | END(native_load_gs_index) |
@@ -1215,8 +1183,7 @@ END(kernel_execve) | |||
1215 | /* Call softirq on interrupt stack. Interrupts are off. */ | 1183 | /* Call softirq on interrupt stack. Interrupts are off. */ |
1216 | ENTRY(call_softirq) | 1184 | ENTRY(call_softirq) |
1217 | CFI_STARTPROC | 1185 | CFI_STARTPROC |
1218 | push %rbp | 1186 | pushq_cfi %rbp |
1219 | CFI_ADJUST_CFA_OFFSET 8 | ||
1220 | CFI_REL_OFFSET rbp,0 | 1187 | CFI_REL_OFFSET rbp,0 |
1221 | mov %rsp,%rbp | 1188 | mov %rsp,%rbp |
1222 | CFI_DEF_CFA_REGISTER rbp | 1189 | CFI_DEF_CFA_REGISTER rbp |
@@ -1225,6 +1192,7 @@ ENTRY(call_softirq) | |||
1225 | push %rbp # backlink for old unwinder | 1192 | push %rbp # backlink for old unwinder |
1226 | call __do_softirq | 1193 | call __do_softirq |
1227 | leaveq | 1194 | leaveq |
1195 | CFI_RESTORE rbp | ||
1228 | CFI_DEF_CFA_REGISTER rsp | 1196 | CFI_DEF_CFA_REGISTER rsp |
1229 | CFI_ADJUST_CFA_OFFSET -8 | 1197 | CFI_ADJUST_CFA_OFFSET -8 |
1230 | decl PER_CPU_VAR(irq_count) | 1198 | decl PER_CPU_VAR(irq_count) |
@@ -1368,7 +1336,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) | |||
1368 | 1336 | ||
1369 | /* ebx: no swapgs flag */ | 1337 | /* ebx: no swapgs flag */ |
1370 | ENTRY(paranoid_exit) | 1338 | ENTRY(paranoid_exit) |
1371 | INTR_FRAME | 1339 | DEFAULT_FRAME |
1372 | DISABLE_INTERRUPTS(CLBR_NONE) | 1340 | DISABLE_INTERRUPTS(CLBR_NONE) |
1373 | TRACE_IRQS_OFF | 1341 | TRACE_IRQS_OFF |
1374 | testl %ebx,%ebx /* swapgs needed? */ | 1342 | testl %ebx,%ebx /* swapgs needed? */ |
@@ -1445,7 +1413,6 @@ error_swapgs: | |||
1445 | error_sti: | 1413 | error_sti: |
1446 | TRACE_IRQS_OFF | 1414 | TRACE_IRQS_OFF |
1447 | ret | 1415 | ret |
1448 | CFI_ENDPROC | ||
1449 | 1416 | ||
1450 | /* | 1417 | /* |
1451 | * There are two places in the kernel that can potentially fault with | 1418 | * There are two places in the kernel that can potentially fault with |
@@ -1470,6 +1437,7 @@ bstep_iret: | |||
1470 | /* Fix truncated RIP */ | 1437 | /* Fix truncated RIP */ |
1471 | movq %rcx,RIP+8(%rsp) | 1438 | movq %rcx,RIP+8(%rsp) |
1472 | jmp error_swapgs | 1439 | jmp error_swapgs |
1440 | CFI_ENDPROC | ||
1473 | END(error_entry) | 1441 | END(error_entry) |
1474 | 1442 | ||
1475 | 1443 | ||
@@ -1498,8 +1466,8 @@ ENTRY(nmi) | |||
1498 | INTR_FRAME | 1466 | INTR_FRAME |
1499 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1467 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1500 | pushq_cfi $-1 | 1468 | pushq_cfi $-1 |
1501 | subq $15*8, %rsp | 1469 | subq $ORIG_RAX-R15, %rsp |
1502 | CFI_ADJUST_CFA_OFFSET 15*8 | 1470 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1503 | call save_paranoid | 1471 | call save_paranoid |
1504 | DEFAULT_FRAME 0 | 1472 | DEFAULT_FRAME 0 |
1505 | /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ | 1473 | /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cd37469b54ee..3afb33f14d2d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) | |||
257 | return mod_code_status; | 257 | return mod_code_status; |
258 | } | 258 | } |
259 | 259 | ||
260 | |||
261 | |||
262 | |||
263 | static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; | ||
264 | |||
265 | static unsigned char *ftrace_nop_replace(void) | 260 | static unsigned char *ftrace_nop_replace(void) |
266 | { | 261 | { |
267 | return ftrace_nop; | 262 | return ideal_nop5; |
268 | } | 263 | } |
269 | 264 | ||
270 | static int | 265 | static int |
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) | |||
338 | 333 | ||
339 | int __init ftrace_dyn_arch_init(void *data) | 334 | int __init ftrace_dyn_arch_init(void *data) |
340 | { | 335 | { |
341 | extern const unsigned char ftrace_test_p6nop[]; | ||
342 | extern const unsigned char ftrace_test_nop5[]; | ||
343 | extern const unsigned char ftrace_test_jmp[]; | ||
344 | int faulted = 0; | ||
345 | |||
346 | /* | ||
347 | * There is no good nop for all x86 archs. | ||
348 | * We will default to using the P6_NOP5, but first we | ||
349 | * will test to make sure that the nop will actually | ||
350 | * work on this CPU. If it faults, we will then | ||
351 | * go to a lesser efficient 5 byte nop. If that fails | ||
352 | * we then just use a jmp as our nop. This isn't the most | ||
353 | * efficient nop, but we can not use a multi part nop | ||
354 | * since we would then risk being preempted in the middle | ||
355 | * of that nop, and if we enabled tracing then, it might | ||
356 | * cause a system crash. | ||
357 | * | ||
358 | * TODO: check the cpuid to determine the best nop. | ||
359 | */ | ||
360 | asm volatile ( | ||
361 | "ftrace_test_jmp:" | ||
362 | "jmp ftrace_test_p6nop\n" | ||
363 | "nop\n" | ||
364 | "nop\n" | ||
365 | "nop\n" /* 2 byte jmp + 3 bytes */ | ||
366 | "ftrace_test_p6nop:" | ||
367 | P6_NOP5 | ||
368 | "jmp 1f\n" | ||
369 | "ftrace_test_nop5:" | ||
370 | ".byte 0x66,0x66,0x66,0x66,0x90\n" | ||
371 | "1:" | ||
372 | ".section .fixup, \"ax\"\n" | ||
373 | "2: movl $1, %0\n" | ||
374 | " jmp ftrace_test_nop5\n" | ||
375 | "3: movl $2, %0\n" | ||
376 | " jmp 1b\n" | ||
377 | ".previous\n" | ||
378 | _ASM_EXTABLE(ftrace_test_p6nop, 2b) | ||
379 | _ASM_EXTABLE(ftrace_test_nop5, 3b) | ||
380 | : "=r"(faulted) : "0" (faulted)); | ||
381 | |||
382 | switch (faulted) { | ||
383 | case 0: | ||
384 | pr_info("converting mcount calls to 0f 1f 44 00 00\n"); | ||
385 | memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); | ||
386 | break; | ||
387 | case 1: | ||
388 | pr_info("converting mcount calls to 66 66 66 66 90\n"); | ||
389 | memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); | ||
390 | break; | ||
391 | case 2: | ||
392 | pr_info("converting mcount calls to jmp . + 5\n"); | ||
393 | memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); | ||
394 | break; | ||
395 | } | ||
396 | |||
397 | /* The return code is retured via data */ | 336 | /* The return code is retured via data */ |
398 | *(unsigned long *)data = 0; | 337 | *(unsigned long *)data = 0; |
399 | 338 | ||
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9d..af0699ba48cf 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c | |||
@@ -1,5 +1,6 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/init.h> | 2 | #include <linux/init.h> |
3 | #include <linux/memblock.h> | ||
3 | 4 | ||
4 | #include <asm/setup.h> | 5 | #include <asm/setup.h> |
5 | #include <asm/bios_ebda.h> | 6 | #include <asm/bios_ebda.h> |
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void) | |||
51 | lowmem = 0x9f000; | 52 | lowmem = 0x9f000; |
52 | 53 | ||
53 | /* reserve all memory between lowmem and the 1MB mark */ | 54 | /* reserve all memory between lowmem and the 1MB mark */ |
54 | reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); | 55 | memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); |
55 | } | 56 | } |
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 784360c0625c..763310165fa0 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/start_kernel.h> | 9 | #include <linux/start_kernel.h> |
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/memblock.h> | ||
11 | 12 | ||
12 | #include <asm/setup.h> | 13 | #include <asm/setup.h> |
13 | #include <asm/sections.h> | 14 | #include <asm/sections.h> |
@@ -17,6 +18,7 @@ | |||
17 | #include <asm/apic.h> | 18 | #include <asm/apic.h> |
18 | #include <asm/io_apic.h> | 19 | #include <asm/io_apic.h> |
19 | #include <asm/bios_ebda.h> | 20 | #include <asm/bios_ebda.h> |
21 | #include <asm/tlbflush.h> | ||
20 | 22 | ||
21 | static void __init i386_default_early_setup(void) | 23 | static void __init i386_default_early_setup(void) |
22 | { | 24 | { |
@@ -30,17 +32,18 @@ static void __init i386_default_early_setup(void) | |||
30 | 32 | ||
31 | void __init i386_start_kernel(void) | 33 | void __init i386_start_kernel(void) |
32 | { | 34 | { |
35 | memblock_init(); | ||
36 | |||
33 | #ifdef CONFIG_X86_TRAMPOLINE | 37 | #ifdef CONFIG_X86_TRAMPOLINE |
34 | /* | 38 | /* |
35 | * But first pinch a few for the stack/trampoline stuff | 39 | * But first pinch a few for the stack/trampoline stuff |
36 | * FIXME: Don't need the extra page at 4K, but need to fix | 40 | * FIXME: Don't need the extra page at 4K, but need to fix |
37 | * trampoline before removing it. (see the GDT stuff) | 41 | * trampoline before removing it. (see the GDT stuff) |
38 | */ | 42 | */ |
39 | reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, | 43 | memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); |
40 | "EX TRAMPOLINE"); | ||
41 | #endif | 44 | #endif |
42 | 45 | ||
43 | reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); | 46 | memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); |
44 | 47 | ||
45 | #ifdef CONFIG_BLK_DEV_INITRD | 48 | #ifdef CONFIG_BLK_DEV_INITRD |
46 | /* Reserve INITRD */ | 49 | /* Reserve INITRD */ |
@@ -49,7 +52,7 @@ void __init i386_start_kernel(void) | |||
49 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; | 52 | u64 ramdisk_image = boot_params.hdr.ramdisk_image; |
50 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; | 53 | u64 ramdisk_size = boot_params.hdr.ramdisk_size; |
51 | u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); | 54 | u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); |
52 | reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); | 55 | memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); |
53 | } | 56 | } |
54 | #endif | 57 | #endif |
55 | 58 | ||
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7147143fd614..2d2673c28aff 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
13 | #include <linux/start_kernel.h> | 13 | #include <linux/start_kernel.h> |
14 | #include <linux/io.h> | 14 | #include <linux/io.h> |
15 | #include <linux/memblock.h> | ||
15 | 16 | ||
16 | #include <asm/processor.h> | 17 | #include <asm/processor.h> |
17 | #include <asm/proto.h> | 18 | #include <asm/proto.h> |
@@ -79,6 +80,8 @@ void __init x86_64_start_kernel(char * real_mode_data) | |||
79 | /* Cleanup the over mapped high alias */ | 80 | /* Cleanup the over mapped high alias */ |
80 | cleanup_highmap(); | 81 | cleanup_highmap(); |
81 | 82 | ||
83 | max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; | ||
84 | |||
82 | for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { | 85 | for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { |
83 | #ifdef CONFIG_EARLY_PRINTK | 86 | #ifdef CONFIG_EARLY_PRINTK |
84 | set_intr_gate(i, &early_idt_handlers[i]); | 87 | set_intr_gate(i, &early_idt_handlers[i]); |
@@ -98,7 +101,9 @@ void __init x86_64_start_reservations(char *real_mode_data) | |||
98 | { | 101 | { |
99 | copy_bootdata(__va(real_mode_data)); | 102 | copy_bootdata(__va(real_mode_data)); |
100 | 103 | ||
101 | reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); | 104 | memblock_init(); |
105 | |||
106 | memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); | ||
102 | 107 | ||
103 | #ifdef CONFIG_BLK_DEV_INITRD | 108 | #ifdef CONFIG_BLK_DEV_INITRD |
104 | /* Reserve INITRD */ | 109 | /* Reserve INITRD */ |
@@ -107,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data) | |||
107 | unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; | 112 | unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; |
108 | unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; | 113 | unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; |
109 | unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); | 114 | unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); |
110 | reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); | 115 | memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); |
111 | } | 116 | } |
112 | #endif | 117 | #endif |
113 | 118 | ||
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fa8c1b8e09fb..bcece91dd311 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -183,13 +183,12 @@ default_entry: | |||
183 | #ifdef CONFIG_X86_PAE | 183 | #ifdef CONFIG_X86_PAE |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * In PAE mode swapper_pg_dir is statically defined to contain enough | 186 | * In PAE mode initial_page_table is statically defined to contain |
187 | * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 | 187 | * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 |
188 | * entries). The identity mapping is handled by pointing two PGD | 188 | * entries). The identity mapping is handled by pointing two PGD entries |
189 | * entries to the first kernel PMD. | 189 | * to the first kernel PMD. |
190 | * | 190 | * |
191 | * Note the upper half of each PMD or PTE are always zero at | 191 | * Note the upper half of each PMD or PTE are always zero at this stage. |
192 | * this stage. | ||
193 | */ | 192 | */ |
194 | 193 | ||
195 | #define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ | 194 | #define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ |
@@ -197,7 +196,7 @@ default_entry: | |||
197 | xorl %ebx,%ebx /* %ebx is kept at zero */ | 196 | xorl %ebx,%ebx /* %ebx is kept at zero */ |
198 | 197 | ||
199 | movl $pa(__brk_base), %edi | 198 | movl $pa(__brk_base), %edi |
200 | movl $pa(swapper_pg_pmd), %edx | 199 | movl $pa(initial_pg_pmd), %edx |
201 | movl $PTE_IDENT_ATTR, %eax | 200 | movl $PTE_IDENT_ATTR, %eax |
202 | 10: | 201 | 10: |
203 | leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ | 202 | leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ |
@@ -226,14 +225,14 @@ default_entry: | |||
226 | movl %eax, pa(max_pfn_mapped) | 225 | movl %eax, pa(max_pfn_mapped) |
227 | 226 | ||
228 | /* Do early initialization of the fixmap area */ | 227 | /* Do early initialization of the fixmap area */ |
229 | movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax | 228 | movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax |
230 | movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) | 229 | movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) |
231 | #else /* Not PAE */ | 230 | #else /* Not PAE */ |
232 | 231 | ||
233 | page_pde_offset = (__PAGE_OFFSET >> 20); | 232 | page_pde_offset = (__PAGE_OFFSET >> 20); |
234 | 233 | ||
235 | movl $pa(__brk_base), %edi | 234 | movl $pa(__brk_base), %edi |
236 | movl $pa(swapper_pg_dir), %edx | 235 | movl $pa(initial_page_table), %edx |
237 | movl $PTE_IDENT_ATTR, %eax | 236 | movl $PTE_IDENT_ATTR, %eax |
238 | 10: | 237 | 10: |
239 | leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ | 238 | leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ |
@@ -257,8 +256,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
257 | movl %eax, pa(max_pfn_mapped) | 256 | movl %eax, pa(max_pfn_mapped) |
258 | 257 | ||
259 | /* Do early initialization of the fixmap area */ | 258 | /* Do early initialization of the fixmap area */ |
260 | movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax | 259 | movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax |
261 | movl %eax,pa(swapper_pg_dir+0xffc) | 260 | movl %eax,pa(initial_page_table+0xffc) |
262 | #endif | 261 | #endif |
263 | jmp 3f | 262 | jmp 3f |
264 | /* | 263 | /* |
@@ -334,7 +333,7 @@ ENTRY(startup_32_smp) | |||
334 | /* | 333 | /* |
335 | * Enable paging | 334 | * Enable paging |
336 | */ | 335 | */ |
337 | movl pa(initial_page_table), %eax | 336 | movl $pa(initial_page_table), %eax |
338 | movl %eax,%cr3 /* set the page table pointer.. */ | 337 | movl %eax,%cr3 /* set the page table pointer.. */ |
339 | movl %cr0,%eax | 338 | movl %cr0,%eax |
340 | orl $X86_CR0_PG,%eax | 339 | orl $X86_CR0_PG,%eax |
@@ -614,8 +613,6 @@ ignore_int: | |||
614 | .align 4 | 613 | .align 4 |
615 | ENTRY(initial_code) | 614 | ENTRY(initial_code) |
616 | .long i386_start_kernel | 615 | .long i386_start_kernel |
617 | ENTRY(initial_page_table) | ||
618 | .long pa(swapper_pg_dir) | ||
619 | 616 | ||
620 | /* | 617 | /* |
621 | * BSS section | 618 | * BSS section |
@@ -623,20 +620,18 @@ ENTRY(initial_page_table) | |||
623 | __PAGE_ALIGNED_BSS | 620 | __PAGE_ALIGNED_BSS |
624 | .align PAGE_SIZE_asm | 621 | .align PAGE_SIZE_asm |
625 | #ifdef CONFIG_X86_PAE | 622 | #ifdef CONFIG_X86_PAE |
626 | swapper_pg_pmd: | 623 | initial_pg_pmd: |
627 | .fill 1024*KPMDS,4,0 | 624 | .fill 1024*KPMDS,4,0 |
628 | #else | 625 | #else |
629 | ENTRY(swapper_pg_dir) | 626 | ENTRY(initial_page_table) |
630 | .fill 1024,4,0 | 627 | .fill 1024,4,0 |
631 | #endif | 628 | #endif |
632 | swapper_pg_fixmap: | 629 | initial_pg_fixmap: |
633 | .fill 1024,4,0 | 630 | .fill 1024,4,0 |
634 | #ifdef CONFIG_X86_TRAMPOLINE | ||
635 | ENTRY(trampoline_pg_dir) | ||
636 | .fill 1024,4,0 | ||
637 | #endif | ||
638 | ENTRY(empty_zero_page) | 631 | ENTRY(empty_zero_page) |
639 | .fill 4096,1,0 | 632 | .fill 4096,1,0 |
633 | ENTRY(swapper_pg_dir) | ||
634 | .fill 1024,4,0 | ||
640 | 635 | ||
641 | /* | 636 | /* |
642 | * This starts the data section. | 637 | * This starts the data section. |
@@ -645,20 +640,20 @@ ENTRY(empty_zero_page) | |||
645 | __PAGE_ALIGNED_DATA | 640 | __PAGE_ALIGNED_DATA |
646 | /* Page-aligned for the benefit of paravirt? */ | 641 | /* Page-aligned for the benefit of paravirt? */ |
647 | .align PAGE_SIZE_asm | 642 | .align PAGE_SIZE_asm |
648 | ENTRY(swapper_pg_dir) | 643 | ENTRY(initial_page_table) |
649 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ | 644 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ |
650 | # if KPMDS == 3 | 645 | # if KPMDS == 3 |
651 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 | 646 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 |
652 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 | 647 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 |
653 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 | 648 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 |
654 | # elif KPMDS == 2 | 649 | # elif KPMDS == 2 |
655 | .long 0,0 | 650 | .long 0,0 |
656 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 | 651 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 |
657 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 | 652 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 |
658 | # elif KPMDS == 1 | 653 | # elif KPMDS == 1 |
659 | .long 0,0 | 654 | .long 0,0 |
660 | .long 0,0 | 655 | .long 0,0 |
661 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 | 656 | .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 |
662 | # else | 657 | # else |
663 | # error "Kernel PMDs should be 1, 2 or 3" | 658 | # error "Kernel PMDs should be 1, 2 or 3" |
664 | # endif | 659 | # endif |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index efaf906daf93..ae03cab4352e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -380,44 +380,35 @@ static int hpet_next_event(unsigned long delta, | |||
380 | struct clock_event_device *evt, int timer) | 380 | struct clock_event_device *evt, int timer) |
381 | { | 381 | { |
382 | u32 cnt; | 382 | u32 cnt; |
383 | s32 res; | ||
383 | 384 | ||
384 | cnt = hpet_readl(HPET_COUNTER); | 385 | cnt = hpet_readl(HPET_COUNTER); |
385 | cnt += (u32) delta; | 386 | cnt += (u32) delta; |
386 | hpet_writel(cnt, HPET_Tn_CMP(timer)); | 387 | hpet_writel(cnt, HPET_Tn_CMP(timer)); |
387 | 388 | ||
388 | /* | 389 | /* |
389 | * We need to read back the CMP register on certain HPET | 390 | * HPETs are a complete disaster. The compare register is |
390 | * implementations (ATI chipsets) which seem to delay the | 391 | * based on a equal comparison and neither provides a less |
391 | * transfer of the compare register into the internal compare | 392 | * than or equal functionality (which would require to take |
392 | * logic. With small deltas this might actually be too late as | 393 | * the wraparound into account) nor a simple count down event |
393 | * the counter could already be higher than the compare value | 394 | * mode. Further the write to the comparator register is |
394 | * at that point and we would wait for the next hpet interrupt | 395 | * delayed internally up to two HPET clock cycles in certain |
395 | * forever. We found out that reading the CMP register back | 396 | * chipsets (ATI, ICH9,10). We worked around that by reading |
396 | * forces the transfer so we can rely on the comparison with | 397 | * back the compare register, but that required another |
397 | * the counter register below. If the read back from the | 398 | * workaround for ICH9,10 chips where the first readout after |
398 | * compare register does not match the value we programmed | 399 | * write can return the old stale value. We already have a |
399 | * then we might have a real hardware problem. We can not do | 400 | * minimum delta of 5us enforced, but a NMI or SMI hitting |
400 | * much about it here, but at least alert the user/admin with | 401 | * between the counter readout and the comparator write can |
401 | * a prominent warning. | 402 | * move us behind that point easily. Now instead of reading |
402 | * | 403 | * the compare register back several times, we make the ETIME |
403 | * An erratum on some chipsets (ICH9,..), results in | 404 | * decision based on the following: Return ETIME if the |
404 | * comparator read immediately following a write returning old | 405 | * counter value after the write is less than 8 HPET cycles |
405 | * value. Workaround for this is to read this value second | 406 | * away from the event or if the counter is already ahead of |
406 | * time, when first read returns old value. | 407 | * the event. |
407 | * | ||
408 | * In fact the write to the comparator register is delayed up | ||
409 | * to two HPET cycles so the workaround we tried to restrict | ||
410 | * the readback to those known to be borked ATI chipsets | ||
411 | * failed miserably. So we give up on optimizations forever | ||
412 | * and penalize all HPET incarnations unconditionally. | ||
413 | */ | 408 | */ |
414 | if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { | 409 | res = (s32)(cnt - hpet_readl(HPET_COUNTER)); |
415 | if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) | ||
416 | printk_once(KERN_WARNING | ||
417 | "hpet: compare register read back failed.\n"); | ||
418 | } | ||
419 | 410 | ||
420 | return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; | 411 | return res < 8 ? -ETIME : 0; |
421 | } | 412 | } |
422 | 413 | ||
423 | static void hpet_legacy_set_mode(enum clock_event_mode mode, | 414 | static void hpet_legacy_set_mode(enum clock_event_mode mode, |
@@ -722,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, | |||
722 | 713 | ||
723 | switch (action & 0xf) { | 714 | switch (action & 0xf) { |
724 | case CPU_ONLINE: | 715 | case CPU_ONLINE: |
725 | INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); | 716 | INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); |
726 | init_completion(&work.complete); | 717 | init_completion(&work.complete); |
727 | /* FIXME: add schedule_work_on() */ | 718 | /* FIXME: add schedule_work_on() */ |
728 | schedule_delayed_work_on(cpu, &work.work, 0); | 719 | schedule_delayed_work_on(cpu, &work.work, 0); |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index a46cb3522c0c..58bb239a2fd7 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void) | |||
68 | */ | 68 | */ |
69 | 69 | ||
70 | if (!HAVE_HWFP) { | 70 | if (!HAVE_HWFP) { |
71 | /* | ||
72 | * Disable xsave as we do not support it if i387 | ||
73 | * emulation is enabled. | ||
74 | */ | ||
75 | setup_clear_cpu_cap(X86_FEATURE_XSAVE); | ||
76 | setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); | ||
71 | xstate_size = sizeof(struct i387_soft_struct); | 77 | xstate_size = sizeof(struct i387_soft_struct); |
72 | return; | 78 | return; |
73 | } | 79 | } |
74 | 80 | ||
75 | if (cpu_has_fxsr) | 81 | if (cpu_has_fxsr) |
76 | xstate_size = sizeof(struct i387_fxsave_struct); | 82 | xstate_size = sizeof(struct i387_fxsave_struct); |
77 | #ifdef CONFIG_X86_32 | ||
78 | else | 83 | else |
79 | xstate_size = sizeof(struct i387_fsave_struct); | 84 | xstate_size = sizeof(struct i387_fsave_struct); |
80 | #endif | ||
81 | } | 85 | } |
82 | 86 | ||
83 | #ifdef CONFIG_X86_64 | ||
84 | /* | 87 | /* |
85 | * Called at bootup to set up the initial FPU state that is later cloned | 88 | * Called at bootup to set up the initial FPU state that is later cloned |
86 | * into all processes. | 89 | * into all processes. |
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void) | |||
88 | 91 | ||
89 | void __cpuinit fpu_init(void) | 92 | void __cpuinit fpu_init(void) |
90 | { | 93 | { |
91 | unsigned long oldcr0 = read_cr0(); | 94 | unsigned long cr0; |
92 | 95 | unsigned long cr4_mask = 0; | |
93 | set_in_cr4(X86_CR4_OSFXSR); | ||
94 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
95 | 96 | ||
96 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ | 97 | if (cpu_has_fxsr) |
98 | cr4_mask |= X86_CR4_OSFXSR; | ||
99 | if (cpu_has_xmm) | ||
100 | cr4_mask |= X86_CR4_OSXMMEXCPT; | ||
101 | if (cr4_mask) | ||
102 | set_in_cr4(cr4_mask); | ||
103 | |||
104 | cr0 = read_cr0(); | ||
105 | cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ | ||
106 | if (!HAVE_HWFP) | ||
107 | cr0 |= X86_CR0_EM; | ||
108 | write_cr0(cr0); | ||
97 | 109 | ||
98 | if (!smp_processor_id()) | 110 | if (!smp_processor_id()) |
99 | init_thread_xstate(); | 111 | init_thread_xstate(); |
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void) | |||
104 | clear_used_math(); | 116 | clear_used_math(); |
105 | } | 117 | } |
106 | 118 | ||
107 | #else /* CONFIG_X86_64 */ | ||
108 | |||
109 | void __cpuinit fpu_init(void) | ||
110 | { | ||
111 | if (!smp_processor_id()) | ||
112 | init_thread_xstate(); | ||
113 | } | ||
114 | |||
115 | #endif /* CONFIG_X86_32 */ | ||
116 | |||
117 | void fpu_finit(struct fpu *fpu) | 119 | void fpu_finit(struct fpu *fpu) |
118 | { | 120 | { |
119 | #ifdef CONFIG_X86_32 | ||
120 | if (!HAVE_HWFP) { | 121 | if (!HAVE_HWFP) { |
121 | finit_soft_fpu(&fpu->state->soft); | 122 | finit_soft_fpu(&fpu->state->soft); |
122 | return; | 123 | return; |
123 | } | 124 | } |
124 | #endif | ||
125 | 125 | ||
126 | if (cpu_has_fxsr) { | 126 | if (cpu_has_fxsr) { |
127 | struct i387_fxsave_struct *fx = &fpu->state->fxsave; | 127 | struct i387_fxsave_struct *fx = &fpu->state->fxsave; |
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) | |||
386 | #ifdef CONFIG_X86_64 | 386 | #ifdef CONFIG_X86_64 |
387 | env->fip = fxsave->rip; | 387 | env->fip = fxsave->rip; |
388 | env->foo = fxsave->rdp; | 388 | env->foo = fxsave->rdp; |
389 | /* | ||
390 | * should be actually ds/cs at fpu exception time, but | ||
391 | * that information is not available in 64bit mode. | ||
392 | */ | ||
393 | env->fcs = task_pt_regs(tsk)->cs; | ||
389 | if (tsk == current) { | 394 | if (tsk == current) { |
390 | /* | 395 | savesegment(ds, env->fos); |
391 | * should be actually ds/cs at fpu exception time, but | ||
392 | * that information is not available in 64bit mode. | ||
393 | */ | ||
394 | asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos)); | ||
395 | asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs)); | ||
396 | } else { | 396 | } else { |
397 | struct pt_regs *regs = task_pt_regs(tsk); | 397 | env->fos = tsk->thread.ds; |
398 | |||
399 | env->fos = 0xffff0000 | tsk->thread.ds; | ||
400 | env->fcs = regs->cs; | ||
401 | } | 398 | } |
399 | env->fos |= 0xffff0000; | ||
402 | #else | 400 | #else |
403 | env->fip = fxsave->fip; | 401 | env->fip = fxsave->fip; |
404 | env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); | 402 | env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d765bdc48074..83ec0175f986 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) | |||
67 | for_each_online_cpu(j) | 67 | for_each_online_cpu(j) |
68 | seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); | 68 | seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); |
69 | seq_printf(p, " Performance monitoring interrupts\n"); | 69 | seq_printf(p, " Performance monitoring interrupts\n"); |
70 | seq_printf(p, "%*s: ", prec, "PND"); | 70 | seq_printf(p, "%*s: ", prec, "IWI"); |
71 | for_each_online_cpu(j) | 71 | for_each_online_cpu(j) |
72 | seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); | 72 | seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); |
73 | seq_printf(p, " Performance pending work\n"); | 73 | seq_printf(p, " IRQ work interrupts\n"); |
74 | #endif | 74 | #endif |
75 | if (x86_platform_ipi_callback) { | 75 | if (x86_platform_ipi_callback) { |
76 | seq_printf(p, "%*s: ", prec, "PLT"); | 76 | seq_printf(p, "%*s: ", prec, "PLT"); |
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
185 | sum += irq_stats(cpu)->apic_timer_irqs; | 185 | sum += irq_stats(cpu)->apic_timer_irqs; |
186 | sum += irq_stats(cpu)->irq_spurious_count; | 186 | sum += irq_stats(cpu)->irq_spurious_count; |
187 | sum += irq_stats(cpu)->apic_perf_irqs; | 187 | sum += irq_stats(cpu)->apic_perf_irqs; |
188 | sum += irq_stats(cpu)->apic_pending_irqs; | 188 | sum += irq_stats(cpu)->apic_irq_work_irqs; |
189 | #endif | 189 | #endif |
190 | if (x86_platform_ipi_callback) | 190 | if (x86_platform_ipi_callback) |
191 | sum += irq_stats(cpu)->x86_platform_ipis; | 191 | sum += irq_stats(cpu)->x86_platform_ipis; |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 10709f29d166..64668dbf00a4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -49,21 +49,17 @@ static inline int check_stack_overflow(void) { return 0; } | |||
49 | static inline void print_stack_overflow(void) { } | 49 | static inline void print_stack_overflow(void) { } |
50 | #endif | 50 | #endif |
51 | 51 | ||
52 | #ifdef CONFIG_4KSTACKS | ||
53 | /* | 52 | /* |
54 | * per-CPU IRQ handling contexts (thread information and stack) | 53 | * per-CPU IRQ handling contexts (thread information and stack) |
55 | */ | 54 | */ |
56 | union irq_ctx { | 55 | union irq_ctx { |
57 | struct thread_info tinfo; | 56 | struct thread_info tinfo; |
58 | u32 stack[THREAD_SIZE/sizeof(u32)]; | 57 | u32 stack[THREAD_SIZE/sizeof(u32)]; |
59 | } __attribute__((aligned(PAGE_SIZE))); | 58 | } __attribute__((aligned(THREAD_SIZE))); |
60 | 59 | ||
61 | static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); | 60 | static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); |
62 | static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); | 61 | static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); |
63 | 62 | ||
64 | static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); | ||
65 | static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); | ||
66 | |||
67 | static void call_on_stack(void *func, void *stack) | 63 | static void call_on_stack(void *func, void *stack) |
68 | { | 64 | { |
69 | asm volatile("xchgl %%ebx,%%esp \n" | 65 | asm volatile("xchgl %%ebx,%%esp \n" |
@@ -129,7 +125,7 @@ void __cpuinit irq_ctx_init(int cpu) | |||
129 | if (per_cpu(hardirq_ctx, cpu)) | 125 | if (per_cpu(hardirq_ctx, cpu)) |
130 | return; | 126 | return; |
131 | 127 | ||
132 | irqctx = &per_cpu(hardirq_stack, cpu); | 128 | irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); |
133 | irqctx->tinfo.task = NULL; | 129 | irqctx->tinfo.task = NULL; |
134 | irqctx->tinfo.exec_domain = NULL; | 130 | irqctx->tinfo.exec_domain = NULL; |
135 | irqctx->tinfo.cpu = cpu; | 131 | irqctx->tinfo.cpu = cpu; |
@@ -138,7 +134,7 @@ void __cpuinit irq_ctx_init(int cpu) | |||
138 | 134 | ||
139 | per_cpu(hardirq_ctx, cpu) = irqctx; | 135 | per_cpu(hardirq_ctx, cpu) = irqctx; |
140 | 136 | ||
141 | irqctx = &per_cpu(softirq_stack, cpu); | 137 | irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); |
142 | irqctx->tinfo.task = NULL; | 138 | irqctx->tinfo.task = NULL; |
143 | irqctx->tinfo.exec_domain = NULL; | 139 | irqctx->tinfo.exec_domain = NULL; |
144 | irqctx->tinfo.cpu = cpu; | 140 | irqctx->tinfo.cpu = cpu; |
@@ -151,11 +147,6 @@ void __cpuinit irq_ctx_init(int cpu) | |||
151 | cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); | 147 | cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); |
152 | } | 148 | } |
153 | 149 | ||
154 | void irq_ctx_exit(int cpu) | ||
155 | { | ||
156 | per_cpu(hardirq_ctx, cpu) = NULL; | ||
157 | } | ||
158 | |||
159 | asmlinkage void do_softirq(void) | 150 | asmlinkage void do_softirq(void) |
160 | { | 151 | { |
161 | unsigned long flags; | 152 | unsigned long flags; |
@@ -187,11 +178,6 @@ asmlinkage void do_softirq(void) | |||
187 | local_irq_restore(flags); | 178 | local_irq_restore(flags); |
188 | } | 179 | } |
189 | 180 | ||
190 | #else | ||
191 | static inline int | ||
192 | execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } | ||
193 | #endif | ||
194 | |||
195 | bool handle_irq(unsigned irq, struct pt_regs *regs) | 181 | bool handle_irq(unsigned irq, struct pt_regs *regs) |
196 | { | 182 | { |
197 | struct irq_desc *desc; | 183 | struct irq_desc *desc; |
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 000000000000..ca8f703a1e70 --- /dev/null +++ b/arch/x86/kernel/irq_work.c | |||
@@ -0,0 +1,30 @@ | |||
1 | /* | ||
2 | * x86 specific code for irq_work | ||
3 | * | ||
4 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/irq_work.h> | ||
9 | #include <linux/hardirq.h> | ||
10 | #include <asm/apic.h> | ||
11 | |||
12 | void smp_irq_work_interrupt(struct pt_regs *regs) | ||
13 | { | ||
14 | irq_enter(); | ||
15 | ack_APIC_irq(); | ||
16 | inc_irq_stat(apic_irq_work_irqs); | ||
17 | irq_work_run(); | ||
18 | irq_exit(); | ||
19 | } | ||
20 | |||
21 | void arch_irq_work_raise(void) | ||
22 | { | ||
23 | #ifdef CONFIG_X86_LOCAL_APIC | ||
24 | if (!cpu_has_apic) | ||
25 | return; | ||
26 | |||
27 | apic->send_IPI_self(IRQ_WORK_VECTOR); | ||
28 | apic_wait_icr_idle(); | ||
29 | #endif | ||
30 | } | ||
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a91ab503e24f..c752e973958d 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -215,9 +215,9 @@ static void __init apic_intr_init(void) | |||
215 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 215 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
216 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 216 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
217 | 217 | ||
218 | /* Performance monitoring interrupts: */ | 218 | /* IRQ work interrupts: */ |
219 | # ifdef CONFIG_PERF_EVENTS | 219 | # ifdef CONFIG_IRQ_WORK |
220 | alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); | 220 | alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); |
221 | # endif | 221 | # endif |
222 | 222 | ||
223 | #endif | 223 | #endif |
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 000000000000..961b6b30ba90 --- /dev/null +++ b/arch/x86/kernel/jump_label.c | |||
@@ -0,0 +1,50 @@ | |||
1 | /* | ||
2 | * jump label x86 support | ||
3 | * | ||
4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | ||
5 | * | ||
6 | */ | ||
7 | #include <linux/jump_label.h> | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/uaccess.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/jhash.h> | ||
13 | #include <linux/cpu.h> | ||
14 | #include <asm/kprobes.h> | ||
15 | #include <asm/alternative.h> | ||
16 | |||
17 | #ifdef HAVE_JUMP_LABEL | ||
18 | |||
19 | union jump_code_union { | ||
20 | char code[JUMP_LABEL_NOP_SIZE]; | ||
21 | struct { | ||
22 | char jump; | ||
23 | int offset; | ||
24 | } __attribute__((packed)); | ||
25 | }; | ||
26 | |||
27 | void arch_jump_label_transform(struct jump_entry *entry, | ||
28 | enum jump_label_type type) | ||
29 | { | ||
30 | union jump_code_union code; | ||
31 | |||
32 | if (type == JUMP_LABEL_ENABLE) { | ||
33 | code.jump = 0xe9; | ||
34 | code.offset = entry->target - | ||
35 | (entry->code + JUMP_LABEL_NOP_SIZE); | ||
36 | } else | ||
37 | memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); | ||
38 | get_online_cpus(); | ||
39 | mutex_lock(&text_mutex); | ||
40 | text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); | ||
41 | mutex_unlock(&text_mutex); | ||
42 | put_online_cpus(); | ||
43 | } | ||
44 | |||
45 | void arch_jump_label_text_poke_early(jump_label_t addr) | ||
46 | { | ||
47 | text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); | ||
48 | } | ||
49 | |||
50 | #endif | ||
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 8afd9f321f10..90fcf62854bb 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c | |||
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file) | |||
78 | static const struct file_operations fops_setup_data = { | 78 | static const struct file_operations fops_setup_data = { |
79 | .read = setup_data_read, | 79 | .read = setup_data_read, |
80 | .open = setup_data_open, | 80 | .open = setup_data_open, |
81 | .llseek = default_llseek, | ||
81 | }; | 82 | }; |
82 | 83 | ||
83 | static int __init | 84 | static int __init |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 852b81967a37..d81cfebb848f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -477,8 +477,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
477 | raw_smp_processor_id()); | 477 | raw_smp_processor_id()); |
478 | } | 478 | } |
479 | 479 | ||
480 | kgdb_correct_hw_break(); | ||
481 | |||
482 | return 0; | 480 | return 0; |
483 | } | 481 | } |
484 | 482 | ||
@@ -621,7 +619,12 @@ int kgdb_arch_init(void) | |||
621 | static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, | 619 | static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, |
622 | struct perf_sample_data *data, struct pt_regs *regs) | 620 | struct perf_sample_data *data, struct pt_regs *regs) |
623 | { | 621 | { |
624 | kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); | 622 | struct task_struct *tsk = current; |
623 | int i; | ||
624 | |||
625 | for (i = 0; i < 4; i++) | ||
626 | if (breakinfo[i].enabled) | ||
627 | tsk->thread.debugreg6 |= (DR_TRAP0 << i); | ||
625 | } | 628 | } |
626 | 629 | ||
627 | void kgdb_arch_late(void) | 630 | void kgdb_arch_late(void) |
@@ -644,7 +647,7 @@ void kgdb_arch_late(void) | |||
644 | if (breakinfo[i].pev) | 647 | if (breakinfo[i].pev) |
645 | continue; | 648 | continue; |
646 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); | 649 | breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); |
647 | if (IS_ERR(breakinfo[i].pev)) { | 650 | if (IS_ERR((void * __force)breakinfo[i].pev)) { |
648 | printk(KERN_ERR "kgdb: Could not allocate hw" | 651 | printk(KERN_ERR "kgdb: Could not allocate hw" |
649 | "breakpoints\nDisabling the kernel debugger\n"); | 652 | "breakpoints\nDisabling the kernel debugger\n"); |
650 | breakinfo[i].pev = NULL; | 653 | breakinfo[i].pev = NULL; |
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 770ebfb349e9..1cbd54c0df99 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) | |||
230 | return 0; | 230 | return 0; |
231 | } | 231 | } |
232 | 232 | ||
233 | /* Dummy buffers for kallsyms_lookup */ | ||
234 | static char __dummy_buf[KSYM_NAME_LEN]; | ||
235 | |||
236 | /* Check if paddr is at an instruction boundary */ | 233 | /* Check if paddr is at an instruction boundary */ |
237 | static int __kprobes can_probe(unsigned long paddr) | 234 | static int __kprobes can_probe(unsigned long paddr) |
238 | { | 235 | { |
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr) | |||
241 | struct insn insn; | 238 | struct insn insn; |
242 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 239 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
243 | 240 | ||
244 | if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) | 241 | if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) |
245 | return 0; | 242 | return 0; |
246 | 243 | ||
247 | /* Decode instructions */ | 244 | /* Decode instructions */ |
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, | |||
1129 | *(unsigned long *)addr = val; | 1126 | *(unsigned long *)addr = val; |
1130 | } | 1127 | } |
1131 | 1128 | ||
1132 | void __kprobes kprobes_optinsn_template_holder(void) | 1129 | static void __used __kprobes kprobes_optinsn_template_holder(void) |
1133 | { | 1130 | { |
1134 | asm volatile ( | 1131 | asm volatile ( |
1135 | ".global optprobe_template_entry\n" | 1132 | ".global optprobe_template_entry\n" |
@@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) | |||
1221 | } | 1218 | } |
1222 | /* Check whether the address range is reserved */ | 1219 | /* Check whether the address range is reserved */ |
1223 | if (ftrace_text_reserved(src, src + len - 1) || | 1220 | if (ftrace_text_reserved(src, src + len - 1) || |
1224 | alternatives_text_reserved(src, src + len - 1)) | 1221 | alternatives_text_reserved(src, src + len - 1) || |
1222 | jump_label_text_reserved(src, src + len - 1)) | ||
1225 | return -EBUSY; | 1223 | return -EBUSY; |
1226 | 1224 | ||
1227 | return len; | 1225 | return len; |
@@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr) | |||
1269 | unsigned long addr, size = 0, offset = 0; | 1267 | unsigned long addr, size = 0, offset = 0; |
1270 | struct insn insn; | 1268 | struct insn insn; |
1271 | kprobe_opcode_t buf[MAX_INSN_SIZE]; | 1269 | kprobe_opcode_t buf[MAX_INSN_SIZE]; |
1272 | /* Dummy buffers for lookup_symbol_attrs */ | ||
1273 | static char __dummy_buf[KSYM_NAME_LEN]; | ||
1274 | 1270 | ||
1275 | /* Lookup symbol including addr */ | 1271 | /* Lookup symbol including addr */ |
1276 | if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) | 1272 | if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) |
1277 | return 0; | 1273 | return 0; |
1278 | 1274 | ||
1279 | /* Check there is enough space for a relative jump. */ | 1275 | /* Check there is enough space for a relative jump. */ |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index eb9b76c716c2..ca43ce31a19c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = { | |||
128 | static int kvm_register_clock(char *txt) | 128 | static int kvm_register_clock(char *txt) |
129 | { | 129 | { |
130 | int cpu = smp_processor_id(); | 130 | int cpu = smp_processor_id(); |
131 | int low, high; | 131 | int low, high, ret; |
132 | |||
132 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; | 133 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; |
133 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | 134 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); |
135 | ret = native_write_msr_safe(msr_kvm_system_time, low, high); | ||
134 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", | 136 | printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", |
135 | cpu, high, low, txt); | 137 | cpu, high, low, txt); |
136 | 138 | ||
137 | return native_write_msr_safe(msr_kvm_system_time, low, high); | 139 | return ret; |
138 | } | 140 | } |
139 | 141 | ||
140 | #ifdef CONFIG_X86_LOCAL_APIC | 142 | #ifdef CONFIG_X86_LOCAL_APIC |
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fa6551d36c10..1cca374a2bac 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -12,7 +12,7 @@ | |||
12 | * Software Developer's Manual | 12 | * Software Developer's Manual |
13 | * Order Number 253668 or free download from: | 13 | * Order Number 253668 or free download from: |
14 | * | 14 | * |
15 | * http://developer.intel.com/design/pentium4/manuals/253668.htm | 15 | * http://developer.intel.com/Assets/PDF/manual/253668.pdf |
16 | * | 16 | * |
17 | * For more information, go to http://www.urbanmyth.org/microcode | 17 | * For more information, go to http://www.urbanmyth.org/microcode |
18 | * | 18 | * |
@@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = { | |||
232 | .owner = THIS_MODULE, | 232 | .owner = THIS_MODULE, |
233 | .write = microcode_write, | 233 | .write = microcode_write, |
234 | .open = microcode_open, | 234 | .open = microcode_open, |
235 | .llseek = no_llseek, | ||
235 | }; | 236 | }; |
236 | 237 | ||
237 | static struct miscdevice microcode_dev = { | 238 | static struct miscdevice microcode_dev = { |
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 356170262a93..dcb65cc0a053 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c | |||
@@ -12,7 +12,7 @@ | |||
12 | * Software Developer's Manual | 12 | * Software Developer's Manual |
13 | * Order Number 253668 or free download from: | 13 | * Order Number 253668 or free download from: |
14 | * | 14 | * |
15 | * http://developer.intel.com/design/pentium4/manuals/253668.htm | 15 | * http://developer.intel.com/Assets/PDF/manual/253668.pdf |
16 | * | 16 | * |
17 | * For more information, go to http://www.urbanmyth.org/microcode | 17 | * For more information, go to http://www.urbanmyth.org/microcode |
18 | * | 18 | * |
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 1c355c550960..8f2956091735 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c | |||
@@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr, | |||
239 | apply_paravirt(pseg, pseg + para->sh_size); | 239 | apply_paravirt(pseg, pseg + para->sh_size); |
240 | } | 240 | } |
241 | 241 | ||
242 | /* make jump label nops */ | ||
243 | jump_label_apply_nops(me); | ||
244 | |||
242 | return 0; | 245 | return 0; |
243 | } | 246 | } |
244 | 247 | ||
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d7b6f7fb4fec..9af64d9c4b67 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
14 | #include <linux/memblock.h> | ||
14 | #include <linux/kernel_stat.h> | 15 | #include <linux/kernel_stat.h> |
15 | #include <linux/mc146818rtc.h> | 16 | #include <linux/mc146818rtc.h> |
16 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
@@ -657,7 +658,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) | |||
657 | { | 658 | { |
658 | unsigned long size = get_mpc_size(mpf->physptr); | 659 | unsigned long size = get_mpc_size(mpf->physptr); |
659 | 660 | ||
660 | reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); | 661 | memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); |
661 | } | 662 | } |
662 | 663 | ||
663 | static int __init smp_scan_config(unsigned long base, unsigned long length) | 664 | static int __init smp_scan_config(unsigned long base, unsigned long length) |
@@ -686,7 +687,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) | |||
686 | mpf, (u64)virt_to_phys(mpf)); | 687 | mpf, (u64)virt_to_phys(mpf)); |
687 | 688 | ||
688 | mem = virt_to_phys(mpf); | 689 | mem = virt_to_phys(mpf); |
689 | reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); | 690 | memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); |
690 | if (mpf->physptr) | 691 | if (mpf->physptr) |
691 | smp_reserve_memory(mpf); | 692 | smp_reserve_memory(mpf); |
692 | 693 | ||
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1db183ed7c01..c5b250011fd4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
413 | 413 | ||
414 | .alloc_pte = paravirt_nop, | 414 | .alloc_pte = paravirt_nop, |
415 | .alloc_pmd = paravirt_nop, | 415 | .alloc_pmd = paravirt_nop, |
416 | .alloc_pmd_clone = paravirt_nop, | ||
417 | .alloc_pud = paravirt_nop, | 416 | .alloc_pud = paravirt_nop, |
418 | .release_pte = paravirt_nop, | 417 | .release_pte = paravirt_nop, |
419 | .release_pmd = paravirt_nop, | 418 | .release_pmd = paravirt_nop, |
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 078d4ec1a9d9..f56a117cef68 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <asm/rio.h> | 47 | #include <asm/rio.h> |
48 | #include <asm/bios_ebda.h> | 48 | #include <asm/bios_ebda.h> |
49 | #include <asm/x86_init.h> | 49 | #include <asm/x86_init.h> |
50 | #include <asm/iommu_table.h> | ||
50 | 51 | ||
51 | #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT | 52 | #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT |
52 | int use_calgary __read_mostly = 1; | 53 | int use_calgary __read_mostly = 1; |
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void) | |||
1364 | return 0; | 1365 | return 0; |
1365 | } | 1366 | } |
1366 | 1367 | ||
1367 | void __init detect_calgary(void) | 1368 | int __init detect_calgary(void) |
1368 | { | 1369 | { |
1369 | int bus; | 1370 | int bus; |
1370 | void *tbl; | 1371 | void *tbl; |
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void) | |||
1378 | * another HW IOMMU already, bail out. | 1379 | * another HW IOMMU already, bail out. |
1379 | */ | 1380 | */ |
1380 | if (no_iommu || iommu_detected) | 1381 | if (no_iommu || iommu_detected) |
1381 | return; | 1382 | return -ENODEV; |
1382 | 1383 | ||
1383 | if (!use_calgary) | 1384 | if (!use_calgary) |
1384 | return; | 1385 | return -ENODEV; |
1385 | 1386 | ||
1386 | if (!early_pci_allowed()) | 1387 | if (!early_pci_allowed()) |
1387 | return; | 1388 | return -ENODEV; |
1388 | 1389 | ||
1389 | printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); | 1390 | printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); |
1390 | 1391 | ||
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void) | |||
1410 | if (!rio_table_hdr) { | 1411 | if (!rio_table_hdr) { |
1411 | printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " | 1412 | printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " |
1412 | "in EBDA - bailing!\n"); | 1413 | "in EBDA - bailing!\n"); |
1413 | return; | 1414 | return -ENODEV; |
1414 | } | 1415 | } |
1415 | 1416 | ||
1416 | ret = build_detail_arrays(); | 1417 | ret = build_detail_arrays(); |
1417 | if (ret) { | 1418 | if (ret) { |
1418 | printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); | 1419 | printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); |
1419 | return; | 1420 | return -ENOMEM; |
1420 | } | 1421 | } |
1421 | 1422 | ||
1422 | specified_table_size = determine_tce_table_size((is_kdump_kernel() ? | 1423 | specified_table_size = determine_tce_table_size((is_kdump_kernel() ? |
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void) | |||
1464 | 1465 | ||
1465 | x86_init.iommu.iommu_init = calgary_iommu_init; | 1466 | x86_init.iommu.iommu_init = calgary_iommu_init; |
1466 | } | 1467 | } |
1467 | return; | 1468 | return calgary_found; |
1468 | 1469 | ||
1469 | cleanup: | 1470 | cleanup: |
1470 | for (--bus; bus >= 0; --bus) { | 1471 | for (--bus; bus >= 0; --bus) { |
@@ -1473,6 +1474,7 @@ cleanup: | |||
1473 | if (info->tce_space) | 1474 | if (info->tce_space) |
1474 | free_tce_table(info->tce_space); | 1475 | free_tce_table(info->tce_space); |
1475 | } | 1476 | } |
1477 | return -ENOMEM; | ||
1476 | } | 1478 | } |
1477 | 1479 | ||
1478 | static int __init calgary_parse_options(char *p) | 1480 | static int __init calgary_parse_options(char *p) |
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void) | |||
1594 | * and before device_initcall. | 1596 | * and before device_initcall. |
1595 | */ | 1597 | */ |
1596 | rootfs_initcall(calgary_fixup_tce_spaces); | 1598 | rootfs_initcall(calgary_fixup_tce_spaces); |
1599 | |||
1600 | IOMMU_INIT_POST(detect_calgary); | ||
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 9f07cfcbd3a5..9ea999a4dcc1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -11,9 +11,8 @@ | |||
11 | #include <asm/iommu.h> | 11 | #include <asm/iommu.h> |
12 | #include <asm/gart.h> | 12 | #include <asm/gart.h> |
13 | #include <asm/calgary.h> | 13 | #include <asm/calgary.h> |
14 | #include <asm/amd_iommu.h> | ||
15 | #include <asm/x86_init.h> | 14 | #include <asm/x86_init.h> |
16 | #include <asm/xen/swiotlb-xen.h> | 15 | #include <asm/iommu_table.h> |
17 | 16 | ||
18 | static int forbid_dac __read_mostly; | 17 | static int forbid_dac __read_mostly; |
19 | 18 | ||
@@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0; | |||
45 | */ | 44 | */ |
46 | int iommu_pass_through __read_mostly; | 45 | int iommu_pass_through __read_mostly; |
47 | 46 | ||
47 | extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; | ||
48 | |||
48 | /* Dummy device used for NULL arguments (normally ISA). */ | 49 | /* Dummy device used for NULL arguments (normally ISA). */ |
49 | struct device x86_dma_fallback_dev = { | 50 | struct device x86_dma_fallback_dev = { |
50 | .init_name = "fallback device", | 51 | .init_name = "fallback device", |
@@ -130,26 +131,24 @@ static void __init dma32_free_bootmem(void) | |||
130 | 131 | ||
131 | void __init pci_iommu_alloc(void) | 132 | void __init pci_iommu_alloc(void) |
132 | { | 133 | { |
134 | struct iommu_table_entry *p; | ||
135 | |||
133 | /* free the range so iommu could get some range less than 4G */ | 136 | /* free the range so iommu could get some range less than 4G */ |
134 | dma32_free_bootmem(); | 137 | dma32_free_bootmem(); |
135 | 138 | ||
136 | if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) | 139 | sort_iommu_table(__iommu_table, __iommu_table_end); |
137 | goto out; | 140 | check_iommu_entries(__iommu_table, __iommu_table_end); |
138 | |||
139 | gart_iommu_hole_init(); | ||
140 | |||
141 | detect_calgary(); | ||
142 | |||
143 | detect_intel_iommu(); | ||
144 | 141 | ||
145 | /* needs to be called after gart_iommu_hole_init */ | 142 | for (p = __iommu_table; p < __iommu_table_end; p++) { |
146 | amd_iommu_detect(); | 143 | if (p && p->detect && p->detect() > 0) { |
147 | out: | 144 | p->flags |= IOMMU_DETECTED; |
148 | pci_xen_swiotlb_init(); | 145 | if (p->early_init) |
149 | 146 | p->early_init(); | |
150 | pci_swiotlb_init(); | 147 | if (p->flags & IOMMU_FINISH_IF_DETECTED) |
148 | break; | ||
149 | } | ||
150 | } | ||
151 | } | 151 | } |
152 | |||
153 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, | 152 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, |
154 | dma_addr_t *dma_addr, gfp_t flag) | 153 | dma_addr_t *dma_addr, gfp_t flag) |
155 | { | 154 | { |
@@ -292,6 +291,7 @@ EXPORT_SYMBOL(dma_supported); | |||
292 | 291 | ||
293 | static int __init pci_iommu_init(void) | 292 | static int __init pci_iommu_init(void) |
294 | { | 293 | { |
294 | struct iommu_table_entry *p; | ||
295 | dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); | 295 | dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); |
296 | 296 | ||
297 | #ifdef CONFIG_PCI | 297 | #ifdef CONFIG_PCI |
@@ -299,12 +299,10 @@ static int __init pci_iommu_init(void) | |||
299 | #endif | 299 | #endif |
300 | x86_init.iommu.iommu_init(); | 300 | x86_init.iommu.iommu_init(); |
301 | 301 | ||
302 | if (swiotlb || xen_swiotlb) { | 302 | for (p = __iommu_table; p < __iommu_table_end; p++) { |
303 | printk(KERN_INFO "PCI-DMA: " | 303 | if (p && (p->flags & IOMMU_DETECTED) && p->late_init) |
304 | "Using software bounce buffering for IO (SWIOTLB)\n"); | 304 | p->late_init(); |
305 | swiotlb_print_info(); | 305 | } |
306 | } else | ||
307 | swiotlb_free(); | ||
308 | 306 | ||
309 | return 0; | 307 | return 0; |
310 | } | 308 | } |
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..ba0f0ca9f280 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -39,8 +39,9 @@ | |||
39 | #include <asm/cacheflush.h> | 39 | #include <asm/cacheflush.h> |
40 | #include <asm/swiotlb.h> | 40 | #include <asm/swiotlb.h> |
41 | #include <asm/dma.h> | 41 | #include <asm/dma.h> |
42 | #include <asm/k8.h> | 42 | #include <asm/amd_nb.h> |
43 | #include <asm/x86_init.h> | 43 | #include <asm/x86_init.h> |
44 | #include <asm/iommu_table.h> | ||
44 | 45 | ||
45 | static unsigned long iommu_bus_base; /* GART remapping area (physical) */ | 46 | static unsigned long iommu_bus_base; /* GART remapping area (physical) */ |
46 | static unsigned long iommu_size; /* size of remapping area bytes */ | 47 | static unsigned long iommu_size; /* size of remapping area bytes */ |
@@ -560,8 +561,11 @@ static void enable_gart_translations(void) | |||
560 | { | 561 | { |
561 | int i; | 562 | int i; |
562 | 563 | ||
563 | for (i = 0; i < num_k8_northbridges; i++) { | 564 | if (!k8_northbridges.gart_supported) |
564 | struct pci_dev *dev = k8_northbridges[i]; | 565 | return; |
566 | |||
567 | for (i = 0; i < k8_northbridges.num; i++) { | ||
568 | struct pci_dev *dev = k8_northbridges.nb_misc[i]; | ||
565 | 569 | ||
566 | enable_gart_translation(dev, __pa(agp_gatt_table)); | 570 | enable_gart_translation(dev, __pa(agp_gatt_table)); |
567 | } | 571 | } |
@@ -592,16 +596,19 @@ static void gart_fixup_northbridges(struct sys_device *dev) | |||
592 | if (!fix_up_north_bridges) | 596 | if (!fix_up_north_bridges) |
593 | return; | 597 | return; |
594 | 598 | ||
599 | if (!k8_northbridges.gart_supported) | ||
600 | return; | ||
601 | |||
595 | pr_info("PCI-DMA: Restoring GART aperture settings\n"); | 602 | pr_info("PCI-DMA: Restoring GART aperture settings\n"); |
596 | 603 | ||
597 | for (i = 0; i < num_k8_northbridges; i++) { | 604 | for (i = 0; i < k8_northbridges.num; i++) { |
598 | struct pci_dev *dev = k8_northbridges[i]; | 605 | struct pci_dev *dev = k8_northbridges.nb_misc[i]; |
599 | 606 | ||
600 | /* | 607 | /* |
601 | * Don't enable translations just yet. That is the next | 608 | * Don't enable translations just yet. That is the next |
602 | * step. Restore the pre-suspend aperture settings. | 609 | * step. Restore the pre-suspend aperture settings. |
603 | */ | 610 | */ |
604 | pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); | 611 | gart_set_size_and_enable(dev, aperture_order); |
605 | pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); | 612 | pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); |
606 | } | 613 | } |
607 | } | 614 | } |
@@ -649,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
649 | 656 | ||
650 | aper_size = aper_base = info->aper_size = 0; | 657 | aper_size = aper_base = info->aper_size = 0; |
651 | dev = NULL; | 658 | dev = NULL; |
652 | for (i = 0; i < num_k8_northbridges; i++) { | 659 | for (i = 0; i < k8_northbridges.num; i++) { |
653 | dev = k8_northbridges[i]; | 660 | dev = k8_northbridges.nb_misc[i]; |
654 | new_aper_base = read_aperture(dev, &new_aper_size); | 661 | new_aper_base = read_aperture(dev, &new_aper_size); |
655 | if (!new_aper_base) | 662 | if (!new_aper_base) |
656 | goto nommu; | 663 | goto nommu; |
@@ -718,10 +725,13 @@ static void gart_iommu_shutdown(void) | |||
718 | if (!no_agp) | 725 | if (!no_agp) |
719 | return; | 726 | return; |
720 | 727 | ||
721 | for (i = 0; i < num_k8_northbridges; i++) { | 728 | if (!k8_northbridges.gart_supported) |
729 | return; | ||
730 | |||
731 | for (i = 0; i < k8_northbridges.num; i++) { | ||
722 | u32 ctl; | 732 | u32 ctl; |
723 | 733 | ||
724 | dev = k8_northbridges[i]; | 734 | dev = k8_northbridges.nb_misc[i]; |
725 | pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); | 735 | pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); |
726 | 736 | ||
727 | ctl &= ~GARTEN; | 737 | ctl &= ~GARTEN; |
@@ -739,7 +749,7 @@ int __init gart_iommu_init(void) | |||
739 | unsigned long scratch; | 749 | unsigned long scratch; |
740 | long i; | 750 | long i; |
741 | 751 | ||
742 | if (num_k8_northbridges == 0) | 752 | if (!k8_northbridges.gart_supported) |
743 | return 0; | 753 | return 0; |
744 | 754 | ||
745 | #ifndef CONFIG_AGP_AMD64 | 755 | #ifndef CONFIG_AGP_AMD64 |
@@ -896,3 +906,4 @@ void __init gart_parse_options(char *p) | |||
896 | } | 906 | } |
897 | } | 907 | } |
898 | } | 908 | } |
909 | IOMMU_INIT_POST(gart_iommu_hole_init); | ||
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c new file mode 100644 index 000000000000..55d745ec1181 --- /dev/null +++ b/arch/x86/kernel/pci-iommu_table.c | |||
@@ -0,0 +1,89 @@ | |||
1 | #include <linux/dma-mapping.h> | ||
2 | #include <asm/iommu_table.h> | ||
3 | #include <linux/string.h> | ||
4 | #include <linux/kallsyms.h> | ||
5 | |||
6 | |||
7 | #define DEBUG 1 | ||
8 | |||
9 | static struct iommu_table_entry * __init | ||
10 | find_dependents_of(struct iommu_table_entry *start, | ||
11 | struct iommu_table_entry *finish, | ||
12 | struct iommu_table_entry *q) | ||
13 | { | ||
14 | struct iommu_table_entry *p; | ||
15 | |||
16 | if (!q) | ||
17 | return NULL; | ||
18 | |||
19 | for (p = start; p < finish; p++) | ||
20 | if (p->detect == q->depend) | ||
21 | return p; | ||
22 | |||
23 | return NULL; | ||
24 | } | ||
25 | |||
26 | |||
27 | void __init sort_iommu_table(struct iommu_table_entry *start, | ||
28 | struct iommu_table_entry *finish) { | ||
29 | |||
30 | struct iommu_table_entry *p, *q, tmp; | ||
31 | |||
32 | for (p = start; p < finish; p++) { | ||
33 | again: | ||
34 | q = find_dependents_of(start, finish, p); | ||
35 | /* We are bit sneaky here. We use the memory address to figure | ||
36 | * out if the node we depend on is past our point, if so, swap. | ||
37 | */ | ||
38 | if (q > p) { | ||
39 | tmp = *p; | ||
40 | memmove(p, q, sizeof(*p)); | ||
41 | *q = tmp; | ||
42 | goto again; | ||
43 | } | ||
44 | } | ||
45 | |||
46 | } | ||
47 | |||
48 | #ifdef DEBUG | ||
49 | void __init check_iommu_entries(struct iommu_table_entry *start, | ||
50 | struct iommu_table_entry *finish) | ||
51 | { | ||
52 | struct iommu_table_entry *p, *q, *x; | ||
53 | char sym_p[KSYM_SYMBOL_LEN]; | ||
54 | char sym_q[KSYM_SYMBOL_LEN]; | ||
55 | |||
56 | /* Simple cyclic dependency checker. */ | ||
57 | for (p = start; p < finish; p++) { | ||
58 | q = find_dependents_of(start, finish, p); | ||
59 | x = find_dependents_of(start, finish, q); | ||
60 | if (p == x) { | ||
61 | sprint_symbol(sym_p, (unsigned long)p->detect); | ||
62 | sprint_symbol(sym_q, (unsigned long)q->detect); | ||
63 | |||
64 | printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \ | ||
65 | " on %s and vice-versa. BREAKING IT.\n", | ||
66 | sym_p, sym_q); | ||
67 | /* Heavy handed way..*/ | ||
68 | x->depend = 0; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | for (p = start; p < finish; p++) { | ||
73 | q = find_dependents_of(p, finish, p); | ||
74 | if (q && q > p) { | ||
75 | sprint_symbol(sym_p, (unsigned long)p->detect); | ||
76 | sprint_symbol(sym_q, (unsigned long)q->detect); | ||
77 | |||
78 | printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\ | ||
79 | "should be called before %s!\n", | ||
80 | sym_p, sym_q); | ||
81 | } | ||
82 | } | ||
83 | } | ||
84 | #else | ||
85 | inline void check_iommu_entries(struct iommu_table_entry *start, | ||
86 | struct iommu_table_entry *finish) | ||
87 | { | ||
88 | } | ||
89 | #endif | ||
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a5bc528d4328..8f972cbddef0 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c | |||
@@ -10,7 +10,8 @@ | |||
10 | #include <asm/iommu.h> | 10 | #include <asm/iommu.h> |
11 | #include <asm/swiotlb.h> | 11 | #include <asm/swiotlb.h> |
12 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
13 | 13 | #include <asm/xen/swiotlb-xen.h> | |
14 | #include <asm/iommu_table.h> | ||
14 | int swiotlb __read_mostly; | 15 | int swiotlb __read_mostly; |
15 | 16 | ||
16 | static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, | 17 | static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, |
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = { | |||
41 | }; | 42 | }; |
42 | 43 | ||
43 | /* | 44 | /* |
44 | * pci_swiotlb_detect - set swiotlb to 1 if necessary | 45 | * pci_swiotlb_detect_override - set swiotlb to 1 if necessary |
45 | * | 46 | * |
46 | * This returns non-zero if we are forced to use swiotlb (by the boot | 47 | * This returns non-zero if we are forced to use swiotlb (by the boot |
47 | * option). | 48 | * option). |
48 | */ | 49 | */ |
49 | int __init pci_swiotlb_detect(void) | 50 | int __init pci_swiotlb_detect_override(void) |
50 | { | 51 | { |
51 | int use_swiotlb = swiotlb | swiotlb_force; | 52 | int use_swiotlb = swiotlb | swiotlb_force; |
52 | 53 | ||
54 | if (swiotlb_force) | ||
55 | swiotlb = 1; | ||
56 | |||
57 | return use_swiotlb; | ||
58 | } | ||
59 | IOMMU_INIT_FINISH(pci_swiotlb_detect_override, | ||
60 | pci_xen_swiotlb_detect, | ||
61 | pci_swiotlb_init, | ||
62 | pci_swiotlb_late_init); | ||
63 | |||
64 | /* | ||
65 | * if 4GB or more detected (and iommu=off not set) return 1 | ||
66 | * and set swiotlb to 1. | ||
67 | */ | ||
68 | int __init pci_swiotlb_detect_4gb(void) | ||
69 | { | ||
53 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ | 70 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ |
54 | #ifdef CONFIG_X86_64 | 71 | #ifdef CONFIG_X86_64 |
55 | if (!no_iommu && max_pfn > MAX_DMA32_PFN) | 72 | if (!no_iommu && max_pfn > MAX_DMA32_PFN) |
56 | swiotlb = 1; | 73 | swiotlb = 1; |
57 | #endif | 74 | #endif |
58 | if (swiotlb_force) | 75 | return swiotlb; |
59 | swiotlb = 1; | ||
60 | |||
61 | return use_swiotlb; | ||
62 | } | 76 | } |
77 | IOMMU_INIT(pci_swiotlb_detect_4gb, | ||
78 | pci_swiotlb_detect_override, | ||
79 | pci_swiotlb_init, | ||
80 | pci_swiotlb_late_init); | ||
63 | 81 | ||
64 | void __init pci_swiotlb_init(void) | 82 | void __init pci_swiotlb_init(void) |
65 | { | 83 | { |
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void) | |||
68 | dma_ops = &swiotlb_dma_ops; | 86 | dma_ops = &swiotlb_dma_ops; |
69 | } | 87 | } |
70 | } | 88 | } |
89 | |||
90 | void __init pci_swiotlb_late_init(void) | ||
91 | { | ||
92 | /* An IOMMU turned us off. */ | ||
93 | if (!swiotlb) | ||
94 | swiotlb_free(); | ||
95 | else { | ||
96 | printk(KERN_INFO "PCI-DMA: " | ||
97 | "Using software bounce buffering for IO (SWIOTLB)\n"); | ||
98 | swiotlb_print_info(); | ||
99 | } | ||
100 | } | ||
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c deleted file mode 100644 index b112406f1996..000000000000 --- a/arch/x86/kernel/pmtimer_64.c +++ /dev/null | |||
@@ -1,69 +0,0 @@ | |||
1 | /* Ported over from i386 by AK, original copyright was: | ||
2 | * | ||
3 | * (C) Dominik Brodowski <linux@brodo.de> 2003 | ||
4 | * | ||
5 | * Driver to use the Power Management Timer (PMTMR) available in some | ||
6 | * southbridges as primary timing source for the Linux kernel. | ||
7 | * | ||
8 | * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, | ||
9 | * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. | ||
10 | * | ||
11 | * This file is licensed under the GPL v2. | ||
12 | * | ||
13 | * Dropped all the hardware bug workarounds for now. Hopefully they | ||
14 | * are not needed on 64bit chipsets. | ||
15 | */ | ||
16 | |||
17 | #include <linux/jiffies.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/time.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/cpumask.h> | ||
22 | #include <linux/acpi_pmtmr.h> | ||
23 | |||
24 | #include <asm/io.h> | ||
25 | #include <asm/proto.h> | ||
26 | #include <asm/msr.h> | ||
27 | #include <asm/vsyscall.h> | ||
28 | |||
29 | static inline u32 cyc2us(u32 cycles) | ||
30 | { | ||
31 | /* The Power Management Timer ticks at 3.579545 ticks per microsecond. | ||
32 | * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] | ||
33 | * | ||
34 | * Even with HZ = 100, delta is at maximum 35796 ticks, so it can | ||
35 | * easily be multiplied with 286 (=0x11E) without having to fear | ||
36 | * u32 overflows. | ||
37 | */ | ||
38 | cycles *= 286; | ||
39 | return (cycles >> 10); | ||
40 | } | ||
41 | |||
42 | static unsigned pmtimer_wait_tick(void) | ||
43 | { | ||
44 | u32 a, b; | ||
45 | for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; | ||
46 | a == b; | ||
47 | b = inl(pmtmr_ioport) & ACPI_PM_MASK) | ||
48 | cpu_relax(); | ||
49 | return b; | ||
50 | } | ||
51 | |||
52 | /* note: wait time is rounded up to one tick */ | ||
53 | void pmtimer_wait(unsigned us) | ||
54 | { | ||
55 | u32 a, b; | ||
56 | a = pmtimer_wait_tick(); | ||
57 | do { | ||
58 | b = inl(pmtmr_ioport); | ||
59 | cpu_relax(); | ||
60 | } while (cyc2us(b - a) < us); | ||
61 | } | ||
62 | |||
63 | static int __init nopmtimer_setup(char *s) | ||
64 | { | ||
65 | pmtmr_ioport = 0; | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | __setup("nopmtimer", nopmtimer_setup); | ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d9ea531ddd1..b3d7a3a04f38 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
424 | load_TLS(next, cpu); | 424 | load_TLS(next, cpu); |
425 | 425 | ||
426 | /* Must be after DS reload */ | 426 | /* Must be after DS reload */ |
427 | unlazy_fpu(prev_p); | 427 | __unlazy_fpu(prev_p); |
428 | 428 | ||
429 | /* Make sure cpu is ready for new context */ | 429 | /* Make sure cpu is ready for new context */ |
430 | if (preload_fpu) | 430 | if (preload_fpu) |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 70c4872cd8aa..45892dc4b72a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child) | |||
801 | static const struct user_regset_view user_x86_32_view; /* Initialized below. */ | 801 | static const struct user_regset_view user_x86_32_view; /* Initialized below. */ |
802 | #endif | 802 | #endif |
803 | 803 | ||
804 | long arch_ptrace(struct task_struct *child, long request, long addr, long data) | 804 | long arch_ptrace(struct task_struct *child, long request, |
805 | unsigned long addr, unsigned long data) | ||
805 | { | 806 | { |
806 | int ret; | 807 | int ret; |
807 | unsigned long __user *datap = (unsigned long __user *)data; | 808 | unsigned long __user *datap = (unsigned long __user *)data; |
@@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
812 | unsigned long tmp; | 813 | unsigned long tmp; |
813 | 814 | ||
814 | ret = -EIO; | 815 | ret = -EIO; |
815 | if ((addr & (sizeof(data) - 1)) || addr < 0 || | 816 | if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) |
816 | addr >= sizeof(struct user)) | ||
817 | break; | 817 | break; |
818 | 818 | ||
819 | tmp = 0; /* Default return condition */ | 819 | tmp = 0; /* Default return condition */ |
@@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
830 | 830 | ||
831 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ | 831 | case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ |
832 | ret = -EIO; | 832 | ret = -EIO; |
833 | if ((addr & (sizeof(data) - 1)) || addr < 0 || | 833 | if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) |
834 | addr >= sizeof(struct user)) | ||
835 | break; | 834 | break; |
836 | 835 | ||
837 | if (addr < sizeof(struct user_regs_struct)) | 836 | if (addr < sizeof(struct user_regs_struct)) |
@@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
888 | 887 | ||
889 | #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION | 888 | #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION |
890 | case PTRACE_GET_THREAD_AREA: | 889 | case PTRACE_GET_THREAD_AREA: |
891 | if (addr < 0) | 890 | if ((int) addr < 0) |
892 | return -EIO; | 891 | return -EIO; |
893 | ret = do_get_thread_area(child, addr, | 892 | ret = do_get_thread_area(child, addr, |
894 | (struct user_desc __user *) data); | 893 | (struct user_desc __user *)data); |
895 | break; | 894 | break; |
896 | 895 | ||
897 | case PTRACE_SET_THREAD_AREA: | 896 | case PTRACE_SET_THREAD_AREA: |
898 | if (addr < 0) | 897 | if ((int) addr < 0) |
899 | return -EIO; | 898 | return -EIO; |
900 | ret = do_set_thread_area(child, addr, | 899 | ret = do_set_thread_area(child, addr, |
901 | (struct user_desc __user *) data, 0); | 900 | (struct user_desc __user *)data, 0); |
902 | break; | 901 | break; |
903 | #endif | 902 | #endif |
904 | 903 | ||
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427ca02af..bab3b9e6f66d 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | |||
82 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) | 82 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) |
83 | { | 83 | { |
84 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; | 84 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; |
85 | return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); | 85 | return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, |
86 | shadow->tsc_shift); | ||
86 | } | 87 | } |
87 | 88 | ||
88 | /* | 89 | /* |
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 939b9e98245f..8bbe8c56916d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, | |||
344 | vt8237_force_enable_hpet); | 344 | vt8237_force_enable_hpet); |
345 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, | 345 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, |
346 | vt8237_force_enable_hpet); | 346 | vt8237_force_enable_hpet); |
347 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, | ||
348 | vt8237_force_enable_hpet); | ||
347 | 349 | ||
348 | static void ati_force_hpet_resume(void) | 350 | static void ati_force_hpet_resume(void) |
349 | { | 351 | { |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 7a4cf14223ba..c495aa8d4815 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length) | |||
371 | CMOS_WRITE(0x00, 0x8f); | 371 | CMOS_WRITE(0x00, 0x8f); |
372 | spin_unlock(&rtc_lock); | 372 | spin_unlock(&rtc_lock); |
373 | 373 | ||
374 | /* Remap the kernel at virtual address zero, as well as offset zero | ||
375 | from the kernel segment. This assumes the kernel segment starts at | ||
376 | virtual address PAGE_OFFSET. */ | ||
377 | memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
378 | sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); | ||
379 | |||
380 | /* | 374 | /* |
381 | * Use `swapper_pg_dir' as our page directory. | 375 | * Switch back to the initial page table. |
382 | */ | 376 | */ |
383 | load_cr3(swapper_pg_dir); | 377 | load_cr3(initial_page_table); |
384 | 378 | ||
385 | /* Write 0x1234 to absolute memory location 0x472. The BIOS reads | 379 | /* Write 0x1234 to absolute memory location 0x472. The BIOS reads |
386 | this on booting to tell it to "Bypass memory test (also warm | 380 | this on booting to tell it to "Bypass memory test (also warm |
@@ -641,7 +635,7 @@ void native_machine_shutdown(void) | |||
641 | /* O.K Now that I'm on the appropriate processor, | 635 | /* O.K Now that I'm on the appropriate processor, |
642 | * stop all of the others. | 636 | * stop all of the others. |
643 | */ | 637 | */ |
644 | smp_send_stop(); | 638 | stop_other_cpus(); |
645 | #endif | 639 | #endif |
646 | 640 | ||
647 | lapic_shutdown(); | 641 | lapic_shutdown(); |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7d5ee08c982d..21c6746338af 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/apm_bios.h> | 31 | #include <linux/apm_bios.h> |
32 | #include <linux/initrd.h> | 32 | #include <linux/initrd.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/memblock.h> | ||
34 | #include <linux/seq_file.h> | 35 | #include <linux/seq_file.h> |
35 | #include <linux/console.h> | 36 | #include <linux/console.h> |
36 | #include <linux/mca.h> | 37 | #include <linux/mca.h> |
@@ -83,7 +84,6 @@ | |||
83 | #include <asm/dmi.h> | 84 | #include <asm/dmi.h> |
84 | #include <asm/io_apic.h> | 85 | #include <asm/io_apic.h> |
85 | #include <asm/ist.h> | 86 | #include <asm/ist.h> |
86 | #include <asm/vmi.h> | ||
87 | #include <asm/setup_arch.h> | 87 | #include <asm/setup_arch.h> |
88 | #include <asm/bios_ebda.h> | 88 | #include <asm/bios_ebda.h> |
89 | #include <asm/cacheflush.h> | 89 | #include <asm/cacheflush.h> |
@@ -107,11 +107,12 @@ | |||
107 | #include <asm/percpu.h> | 107 | #include <asm/percpu.h> |
108 | #include <asm/topology.h> | 108 | #include <asm/topology.h> |
109 | #include <asm/apicdef.h> | 109 | #include <asm/apicdef.h> |
110 | #include <asm/k8.h> | 110 | #include <asm/amd_nb.h> |
111 | #ifdef CONFIG_X86_64 | 111 | #ifdef CONFIG_X86_64 |
112 | #include <asm/numa_64.h> | 112 | #include <asm/numa_64.h> |
113 | #endif | 113 | #endif |
114 | #include <asm/mce.h> | 114 | #include <asm/mce.h> |
115 | #include <asm/alternative.h> | ||
115 | 116 | ||
116 | /* | 117 | /* |
117 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | 118 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. |
@@ -301,7 +302,7 @@ static inline void init_gbpages(void) | |||
301 | static void __init reserve_brk(void) | 302 | static void __init reserve_brk(void) |
302 | { | 303 | { |
303 | if (_brk_end > _brk_start) | 304 | if (_brk_end > _brk_start) |
304 | reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); | 305 | memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); |
305 | 306 | ||
306 | /* Mark brk area as locked down and no longer taking any | 307 | /* Mark brk area as locked down and no longer taking any |
307 | new allocations */ | 308 | new allocations */ |
@@ -323,17 +324,16 @@ static void __init relocate_initrd(void) | |||
323 | char *p, *q; | 324 | char *p, *q; |
324 | 325 | ||
325 | /* We need to move the initrd down into lowmem */ | 326 | /* We need to move the initrd down into lowmem */ |
326 | ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, | 327 | ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, |
327 | PAGE_SIZE); | 328 | PAGE_SIZE); |
328 | 329 | ||
329 | if (ramdisk_here == -1ULL) | 330 | if (ramdisk_here == MEMBLOCK_ERROR) |
330 | panic("Cannot find place for new RAMDISK of size %lld\n", | 331 | panic("Cannot find place for new RAMDISK of size %lld\n", |
331 | ramdisk_size); | 332 | ramdisk_size); |
332 | 333 | ||
333 | /* Note: this includes all the lowmem currently occupied by | 334 | /* Note: this includes all the lowmem currently occupied by |
334 | the initrd, we rely on that fact to keep the data intact. */ | 335 | the initrd, we rely on that fact to keep the data intact. */ |
335 | reserve_early(ramdisk_here, ramdisk_here + area_size, | 336 | memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); |
336 | "NEW RAMDISK"); | ||
337 | initrd_start = ramdisk_here + PAGE_OFFSET; | 337 | initrd_start = ramdisk_here + PAGE_OFFSET; |
338 | initrd_end = initrd_start + ramdisk_size; | 338 | initrd_end = initrd_start + ramdisk_size; |
339 | printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", | 339 | printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", |
@@ -389,7 +389,7 @@ static void __init reserve_initrd(void) | |||
389 | initrd_start = 0; | 389 | initrd_start = 0; |
390 | 390 | ||
391 | if (ramdisk_size >= (end_of_lowmem>>1)) { | 391 | if (ramdisk_size >= (end_of_lowmem>>1)) { |
392 | free_early(ramdisk_image, ramdisk_end); | 392 | memblock_x86_free_range(ramdisk_image, ramdisk_end); |
393 | printk(KERN_ERR "initrd too large to handle, " | 393 | printk(KERN_ERR "initrd too large to handle, " |
394 | "disabling initrd\n"); | 394 | "disabling initrd\n"); |
395 | return; | 395 | return; |
@@ -412,7 +412,7 @@ static void __init reserve_initrd(void) | |||
412 | 412 | ||
413 | relocate_initrd(); | 413 | relocate_initrd(); |
414 | 414 | ||
415 | free_early(ramdisk_image, ramdisk_end); | 415 | memblock_x86_free_range(ramdisk_image, ramdisk_end); |
416 | } | 416 | } |
417 | #else | 417 | #else |
418 | static void __init reserve_initrd(void) | 418 | static void __init reserve_initrd(void) |
@@ -468,7 +468,7 @@ static void __init e820_reserve_setup_data(void) | |||
468 | e820_print_map("reserve setup_data"); | 468 | e820_print_map("reserve setup_data"); |
469 | } | 469 | } |
470 | 470 | ||
471 | static void __init reserve_early_setup_data(void) | 471 | static void __init memblock_x86_reserve_range_setup_data(void) |
472 | { | 472 | { |
473 | struct setup_data *data; | 473 | struct setup_data *data; |
474 | u64 pa_data; | 474 | u64 pa_data; |
@@ -480,7 +480,7 @@ static void __init reserve_early_setup_data(void) | |||
480 | while (pa_data) { | 480 | while (pa_data) { |
481 | data = early_memremap(pa_data, sizeof(*data)); | 481 | data = early_memremap(pa_data, sizeof(*data)); |
482 | sprintf(buf, "setup data %x", data->type); | 482 | sprintf(buf, "setup data %x", data->type); |
483 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); | 483 | memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); |
484 | pa_data = data->next; | 484 | pa_data = data->next; |
485 | early_iounmap(data, sizeof(*data)); | 485 | early_iounmap(data, sizeof(*data)); |
486 | } | 486 | } |
@@ -501,6 +501,7 @@ static inline unsigned long long get_total_mem(void) | |||
501 | return total << PAGE_SHIFT; | 501 | return total << PAGE_SHIFT; |
502 | } | 502 | } |
503 | 503 | ||
504 | #define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF | ||
504 | static void __init reserve_crashkernel(void) | 505 | static void __init reserve_crashkernel(void) |
505 | { | 506 | { |
506 | unsigned long long total_mem; | 507 | unsigned long long total_mem; |
@@ -518,23 +519,27 @@ static void __init reserve_crashkernel(void) | |||
518 | if (crash_base <= 0) { | 519 | if (crash_base <= 0) { |
519 | const unsigned long long alignment = 16<<20; /* 16M */ | 520 | const unsigned long long alignment = 16<<20; /* 16M */ |
520 | 521 | ||
521 | crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, | 522 | /* |
522 | alignment); | 523 | * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX |
523 | if (crash_base == -1ULL) { | 524 | */ |
525 | crash_base = memblock_find_in_range(alignment, | ||
526 | DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); | ||
527 | |||
528 | if (crash_base == MEMBLOCK_ERROR) { | ||
524 | pr_info("crashkernel reservation failed - No suitable area found.\n"); | 529 | pr_info("crashkernel reservation failed - No suitable area found.\n"); |
525 | return; | 530 | return; |
526 | } | 531 | } |
527 | } else { | 532 | } else { |
528 | unsigned long long start; | 533 | unsigned long long start; |
529 | 534 | ||
530 | start = find_e820_area(crash_base, ULONG_MAX, crash_size, | 535 | start = memblock_find_in_range(crash_base, |
531 | 1<<20); | 536 | crash_base + crash_size, crash_size, 1<<20); |
532 | if (start != crash_base) { | 537 | if (start != crash_base) { |
533 | pr_info("crashkernel reservation failed - memory is in use.\n"); | 538 | pr_info("crashkernel reservation failed - memory is in use.\n"); |
534 | return; | 539 | return; |
535 | } | 540 | } |
536 | } | 541 | } |
537 | reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); | 542 | memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); |
538 | 543 | ||
539 | printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " | 544 | printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " |
540 | "for crashkernel (System RAM: %ldMB)\n", | 545 | "for crashkernel (System RAM: %ldMB)\n", |
@@ -614,82 +619,10 @@ static __init void reserve_ibft_region(void) | |||
614 | addr = find_ibft_region(&size); | 619 | addr = find_ibft_region(&size); |
615 | 620 | ||
616 | if (size) | 621 | if (size) |
617 | reserve_early_overlap_ok(addr, addr + size, "ibft"); | 622 | memblock_x86_reserve_range(addr, addr + size, "* ibft"); |
618 | } | 623 | } |
619 | 624 | ||
620 | #ifdef CONFIG_X86_RESERVE_LOW_64K | 625 | static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; |
621 | static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) | ||
622 | { | ||
623 | printk(KERN_NOTICE | ||
624 | "%s detected: BIOS may corrupt low RAM, working around it.\n", | ||
625 | d->ident); | ||
626 | |||
627 | e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); | ||
628 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
629 | |||
630 | return 0; | ||
631 | } | ||
632 | #endif | ||
633 | |||
634 | /* List of systems that have known low memory corruption BIOS problems */ | ||
635 | static struct dmi_system_id __initdata bad_bios_dmi_table[] = { | ||
636 | #ifdef CONFIG_X86_RESERVE_LOW_64K | ||
637 | { | ||
638 | .callback = dmi_low_memory_corruption, | ||
639 | .ident = "AMI BIOS", | ||
640 | .matches = { | ||
641 | DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), | ||
642 | }, | ||
643 | }, | ||
644 | { | ||
645 | .callback = dmi_low_memory_corruption, | ||
646 | .ident = "Phoenix BIOS", | ||
647 | .matches = { | ||
648 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), | ||
649 | }, | ||
650 | }, | ||
651 | { | ||
652 | .callback = dmi_low_memory_corruption, | ||
653 | .ident = "Phoenix/MSC BIOS", | ||
654 | .matches = { | ||
655 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), | ||
656 | }, | ||
657 | }, | ||
658 | /* | ||
659 | * AMI BIOS with low memory corruption was found on Intel DG45ID and | ||
660 | * DG45FC boards. | ||
661 | * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will | ||
662 | * match only DMI_BOARD_NAME and see if there is more bad products | ||
663 | * with this vendor. | ||
664 | */ | ||
665 | { | ||
666 | .callback = dmi_low_memory_corruption, | ||
667 | .ident = "AMI BIOS", | ||
668 | .matches = { | ||
669 | DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), | ||
670 | }, | ||
671 | }, | ||
672 | { | ||
673 | .callback = dmi_low_memory_corruption, | ||
674 | .ident = "AMI BIOS", | ||
675 | .matches = { | ||
676 | DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), | ||
677 | }, | ||
678 | }, | ||
679 | /* | ||
680 | * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so | ||
681 | * match on the product name. | ||
682 | */ | ||
683 | { | ||
684 | .callback = dmi_low_memory_corruption, | ||
685 | .ident = "Phoenix BIOS", | ||
686 | .matches = { | ||
687 | DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), | ||
688 | }, | ||
689 | }, | ||
690 | #endif | ||
691 | {} | ||
692 | }; | ||
693 | 626 | ||
694 | static void __init trim_bios_range(void) | 627 | static void __init trim_bios_range(void) |
695 | { | 628 | { |
@@ -697,8 +630,14 @@ static void __init trim_bios_range(void) | |||
697 | * A special case is the first 4Kb of memory; | 630 | * A special case is the first 4Kb of memory; |
698 | * This is a BIOS owned area, not kernel ram, but generally | 631 | * This is a BIOS owned area, not kernel ram, but generally |
699 | * not listed as such in the E820 table. | 632 | * not listed as such in the E820 table. |
633 | * | ||
634 | * This typically reserves additional memory (64KiB by default) | ||
635 | * since some BIOSes are known to corrupt low memory. See the | ||
636 | * Kconfig help text for X86_RESERVE_LOW. | ||
700 | */ | 637 | */ |
701 | e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); | 638 | e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), |
639 | E820_RAM, E820_RESERVED); | ||
640 | |||
702 | /* | 641 | /* |
703 | * special case: Some BIOSen report the PC BIOS | 642 | * special case: Some BIOSen report the PC BIOS |
704 | * area (640->1Mb) as ram even though it is not. | 643 | * area (640->1Mb) as ram even though it is not. |
@@ -708,6 +647,37 @@ static void __init trim_bios_range(void) | |||
708 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 647 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
709 | } | 648 | } |
710 | 649 | ||
650 | static int __init parse_reservelow(char *p) | ||
651 | { | ||
652 | unsigned long long size; | ||
653 | |||
654 | if (!p) | ||
655 | return -EINVAL; | ||
656 | |||
657 | size = memparse(p, &p); | ||
658 | |||
659 | if (size < 4096) | ||
660 | size = 4096; | ||
661 | |||
662 | if (size > 640*1024) | ||
663 | size = 640*1024; | ||
664 | |||
665 | reserve_low = size; | ||
666 | |||
667 | return 0; | ||
668 | } | ||
669 | |||
670 | early_param("reservelow", parse_reservelow); | ||
671 | |||
672 | static u64 __init get_max_mapped(void) | ||
673 | { | ||
674 | u64 end = max_pfn_mapped; | ||
675 | |||
676 | end <<= PAGE_SHIFT; | ||
677 | |||
678 | return end; | ||
679 | } | ||
680 | |||
711 | /* | 681 | /* |
712 | * Determine if we were loaded by an EFI loader. If so, then we have also been | 682 | * Determine if we were loaded by an EFI loader. If so, then we have also been |
713 | * passed the efi memmap, systab, etc., so we should use these data structures | 683 | * passed the efi memmap, systab, etc., so we should use these data structures |
@@ -725,18 +695,30 @@ void __init setup_arch(char **cmdline_p) | |||
725 | { | 695 | { |
726 | int acpi = 0; | 696 | int acpi = 0; |
727 | int k8 = 0; | 697 | int k8 = 0; |
698 | unsigned long flags; | ||
728 | 699 | ||
729 | #ifdef CONFIG_X86_32 | 700 | #ifdef CONFIG_X86_32 |
730 | memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); | 701 | memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); |
731 | visws_early_detect(); | 702 | visws_early_detect(); |
703 | |||
704 | /* | ||
705 | * copy kernel address range established so far and switch | ||
706 | * to the proper swapper page table | ||
707 | */ | ||
708 | clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
709 | initial_page_table + KERNEL_PGD_BOUNDARY, | ||
710 | KERNEL_PGD_PTRS); | ||
711 | |||
712 | load_cr3(swapper_pg_dir); | ||
713 | __flush_tlb_all(); | ||
732 | #else | 714 | #else |
733 | printk(KERN_INFO "Command line: %s\n", boot_command_line); | 715 | printk(KERN_INFO "Command line: %s\n", boot_command_line); |
734 | #endif | 716 | #endif |
735 | 717 | ||
736 | /* VMI may relocate the fixmap; do this before touching ioremap area */ | 718 | /* |
737 | vmi_init(); | 719 | * If we have OLPC OFW, we might end up relocating the fixmap due to |
738 | 720 | * reserve_top(), so do this before touching the ioremap area. | |
739 | /* OFW also may relocate the fixmap */ | 721 | */ |
740 | olpc_ofw_detect(); | 722 | olpc_ofw_detect(); |
741 | 723 | ||
742 | early_trap_init(); | 724 | early_trap_init(); |
@@ -781,12 +763,14 @@ void __init setup_arch(char **cmdline_p) | |||
781 | #endif | 763 | #endif |
782 | 4)) { | 764 | 4)) { |
783 | efi_enabled = 1; | 765 | efi_enabled = 1; |
784 | efi_reserve_early(); | 766 | efi_memblock_x86_reserve_range(); |
785 | } | 767 | } |
786 | #endif | 768 | #endif |
787 | 769 | ||
788 | x86_init.oem.arch_setup(); | 770 | x86_init.oem.arch_setup(); |
789 | 771 | ||
772 | resource_alloc_from_bottom = 0; | ||
773 | iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; | ||
790 | setup_memory_map(); | 774 | setup_memory_map(); |
791 | parse_setup_data(); | 775 | parse_setup_data(); |
792 | /* update the e820_saved too */ | 776 | /* update the e820_saved too */ |
@@ -837,11 +821,8 @@ void __init setup_arch(char **cmdline_p) | |||
837 | 821 | ||
838 | x86_report_nx(); | 822 | x86_report_nx(); |
839 | 823 | ||
840 | /* Must be before kernel pagetables are setup */ | ||
841 | vmi_activate(); | ||
842 | |||
843 | /* after early param, so could get panic from serial */ | 824 | /* after early param, so could get panic from serial */ |
844 | reserve_early_setup_data(); | 825 | memblock_x86_reserve_range_setup_data(); |
845 | 826 | ||
846 | if (acpi_mps_check()) { | 827 | if (acpi_mps_check()) { |
847 | #ifdef CONFIG_X86_LOCAL_APIC | 828 | #ifdef CONFIG_X86_LOCAL_APIC |
@@ -862,8 +843,6 @@ void __init setup_arch(char **cmdline_p) | |||
862 | 843 | ||
863 | dmi_scan_machine(); | 844 | dmi_scan_machine(); |
864 | 845 | ||
865 | dmi_check_system(bad_bios_dmi_table); | ||
866 | |||
867 | /* | 846 | /* |
868 | * VMware detection requires dmi to be available, so this | 847 | * VMware detection requires dmi to be available, so this |
869 | * needs to be done after dmi_scan_machine, for the BP. | 848 | * needs to be done after dmi_scan_machine, for the BP. |
@@ -896,8 +875,6 @@ void __init setup_arch(char **cmdline_p) | |||
896 | */ | 875 | */ |
897 | max_pfn = e820_end_of_ram_pfn(); | 876 | max_pfn = e820_end_of_ram_pfn(); |
898 | 877 | ||
899 | /* preallocate 4k for mptable mpc */ | ||
900 | early_reserve_e820_mpc_new(); | ||
901 | /* update e820 for memory not covered by WB MTRRs */ | 878 | /* update e820 for memory not covered by WB MTRRs */ |
902 | mtrr_bp_init(); | 879 | mtrr_bp_init(); |
903 | if (mtrr_trim_uncached_memory(max_pfn)) | 880 | if (mtrr_trim_uncached_memory(max_pfn)) |
@@ -919,18 +896,8 @@ void __init setup_arch(char **cmdline_p) | |||
919 | max_low_pfn = max_pfn; | 896 | max_low_pfn = max_pfn; |
920 | 897 | ||
921 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; | 898 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; |
922 | max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; | ||
923 | #endif | 899 | #endif |
924 | 900 | ||
925 | #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION | ||
926 | setup_bios_corruption_check(); | ||
927 | #endif | ||
928 | |||
929 | printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", | ||
930 | max_pfn_mapped<<PAGE_SHIFT); | ||
931 | |||
932 | reserve_brk(); | ||
933 | |||
934 | /* | 901 | /* |
935 | * Find and reserve possible boot-time SMP configuration: | 902 | * Find and reserve possible boot-time SMP configuration: |
936 | */ | 903 | */ |
@@ -938,6 +905,26 @@ void __init setup_arch(char **cmdline_p) | |||
938 | 905 | ||
939 | reserve_ibft_region(); | 906 | reserve_ibft_region(); |
940 | 907 | ||
908 | /* | ||
909 | * Need to conclude brk, before memblock_x86_fill() | ||
910 | * it could use memblock_find_in_range, could overlap with | ||
911 | * brk area. | ||
912 | */ | ||
913 | reserve_brk(); | ||
914 | |||
915 | memblock.current_limit = get_max_mapped(); | ||
916 | memblock_x86_fill(); | ||
917 | |||
918 | /* preallocate 4k for mptable mpc */ | ||
919 | early_reserve_e820_mpc_new(); | ||
920 | |||
921 | #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION | ||
922 | setup_bios_corruption_check(); | ||
923 | #endif | ||
924 | |||
925 | printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", | ||
926 | max_pfn_mapped<<PAGE_SHIFT); | ||
927 | |||
941 | reserve_trampoline_memory(); | 928 | reserve_trampoline_memory(); |
942 | 929 | ||
943 | #ifdef CONFIG_ACPI_SLEEP | 930 | #ifdef CONFIG_ACPI_SLEEP |
@@ -961,6 +948,7 @@ void __init setup_arch(char **cmdline_p) | |||
961 | max_low_pfn = max_pfn; | 948 | max_low_pfn = max_pfn; |
962 | } | 949 | } |
963 | #endif | 950 | #endif |
951 | memblock.current_limit = get_max_mapped(); | ||
964 | 952 | ||
965 | /* | 953 | /* |
966 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. | 954 | * NOTE: On x86-32, only from this point on, fixmaps are ready for use. |
@@ -999,10 +987,7 @@ void __init setup_arch(char **cmdline_p) | |||
999 | #endif | 987 | #endif |
1000 | 988 | ||
1001 | initmem_init(0, max_pfn, acpi, k8); | 989 | initmem_init(0, max_pfn, acpi, k8); |
1002 | #ifndef CONFIG_NO_BOOTMEM | 990 | memblock_find_dma_reserve(); |
1003 | early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); | ||
1004 | #endif | ||
1005 | |||
1006 | dma32_reserve_bootmem(); | 991 | dma32_reserve_bootmem(); |
1007 | 992 | ||
1008 | #ifdef CONFIG_KVM_CLOCK | 993 | #ifdef CONFIG_KVM_CLOCK |
@@ -1013,7 +998,12 @@ void __init setup_arch(char **cmdline_p) | |||
1013 | paging_init(); | 998 | paging_init(); |
1014 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); | 999 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); |
1015 | 1000 | ||
1016 | setup_trampoline_page_table(); | 1001 | #ifdef CONFIG_X86_32 |
1002 | /* sync back kernel address range */ | ||
1003 | clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, | ||
1004 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
1005 | KERNEL_PGD_PTRS); | ||
1006 | #endif | ||
1017 | 1007 | ||
1018 | tboot_probe(); | 1008 | tboot_probe(); |
1019 | 1009 | ||
@@ -1070,6 +1060,10 @@ void __init setup_arch(char **cmdline_p) | |||
1070 | x86_init.oem.banner(); | 1060 | x86_init.oem.banner(); |
1071 | 1061 | ||
1072 | mcheck_init(); | 1062 | mcheck_init(); |
1063 | |||
1064 | local_irq_save(flags); | ||
1065 | arch_init_ideal_nop5(); | ||
1066 | local_irq_restore(flags); | ||
1073 | } | 1067 | } |
1074 | 1068 | ||
1075 | #ifdef CONFIG_X86_32 | 1069 | #ifdef CONFIG_X86_32 |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2335c15c93a4..002b79685f73 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) | |||
131 | 131 | ||
132 | static void __init pcpu_fc_free(void *ptr, size_t size) | 132 | static void __init pcpu_fc_free(void *ptr, size_t size) |
133 | { | 133 | { |
134 | #ifdef CONFIG_NO_BOOTMEM | ||
135 | u64 start = __pa(ptr); | ||
136 | u64 end = start + size; | ||
137 | free_early_partial(start, end); | ||
138 | #else | ||
139 | free_bootmem(__pa(ptr), size); | 134 | free_bootmem(__pa(ptr), size); |
140 | #endif | ||
141 | } | 135 | } |
142 | 136 | ||
143 | static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) | 137 | static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..513deac7228d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) | |||
159 | irq_exit(); | 159 | irq_exit(); |
160 | } | 160 | } |
161 | 161 | ||
162 | static void native_smp_send_stop(void) | 162 | static void native_stop_other_cpus(int wait) |
163 | { | 163 | { |
164 | unsigned long flags; | 164 | unsigned long flags; |
165 | unsigned long wait; | 165 | unsigned long timeout; |
166 | 166 | ||
167 | if (reboot_force) | 167 | if (reboot_force) |
168 | return; | 168 | return; |
@@ -179,9 +179,12 @@ static void native_smp_send_stop(void) | |||
179 | if (num_online_cpus() > 1) { | 179 | if (num_online_cpus() > 1) { |
180 | apic->send_IPI_allbutself(REBOOT_VECTOR); | 180 | apic->send_IPI_allbutself(REBOOT_VECTOR); |
181 | 181 | ||
182 | /* Don't wait longer than a second */ | 182 | /* |
183 | wait = USEC_PER_SEC; | 183 | * Don't wait longer than a second if the caller |
184 | while (num_online_cpus() > 1 && wait--) | 184 | * didn't ask us to wait. |
185 | */ | ||
186 | timeout = USEC_PER_SEC; | ||
187 | while (num_online_cpus() > 1 && (wait || timeout--)) | ||
185 | udelay(1); | 188 | udelay(1); |
186 | } | 189 | } |
187 | 190 | ||
@@ -227,7 +230,7 @@ struct smp_ops smp_ops = { | |||
227 | .smp_prepare_cpus = native_smp_prepare_cpus, | 230 | .smp_prepare_cpus = native_smp_prepare_cpus, |
228 | .smp_cpus_done = native_smp_cpus_done, | 231 | .smp_cpus_done = native_smp_cpus_done, |
229 | 232 | ||
230 | .smp_send_stop = native_smp_send_stop, | 233 | .stop_other_cpus = native_stop_other_cpus, |
231 | .smp_send_reschedule = native_smp_send_reschedule, | 234 | .smp_send_reschedule = native_smp_send_reschedule, |
232 | 235 | ||
233 | .cpu_up = native_cpu_up, | 236 | .cpu_up = native_cpu_up, |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 864b386f6c0e..083e99d1b7df 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -62,7 +62,7 @@ | |||
62 | #include <asm/pgtable.h> | 62 | #include <asm/pgtable.h> |
63 | #include <asm/tlbflush.h> | 63 | #include <asm/tlbflush.h> |
64 | #include <asm/mtrr.h> | 64 | #include <asm/mtrr.h> |
65 | #include <asm/vmi.h> | 65 | #include <asm/mwait.h> |
66 | #include <asm/apic.h> | 66 | #include <asm/apic.h> |
67 | #include <asm/setup.h> | 67 | #include <asm/setup.h> |
68 | #include <asm/uv/uv.h> | 68 | #include <asm/uv/uv.h> |
@@ -299,23 +299,16 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
299 | * fragile that we want to limit the things done here to the | 299 | * fragile that we want to limit the things done here to the |
300 | * most necessary things. | 300 | * most necessary things. |
301 | */ | 301 | */ |
302 | cpu_init(); | ||
303 | preempt_disable(); | ||
304 | smp_callin(); | ||
302 | 305 | ||
303 | #ifdef CONFIG_X86_32 | 306 | #ifdef CONFIG_X86_32 |
304 | /* | 307 | /* switch away from the initial page table */ |
305 | * Switch away from the trampoline page-table | ||
306 | * | ||
307 | * Do this before cpu_init() because it needs to access per-cpu | ||
308 | * data which may not be mapped in the trampoline page-table. | ||
309 | */ | ||
310 | load_cr3(swapper_pg_dir); | 308 | load_cr3(swapper_pg_dir); |
311 | __flush_tlb_all(); | 309 | __flush_tlb_all(); |
312 | #endif | 310 | #endif |
313 | 311 | ||
314 | vmi_bringup(); | ||
315 | cpu_init(); | ||
316 | preempt_disable(); | ||
317 | smp_callin(); | ||
318 | |||
319 | /* otherwise gcc will move up smp_processor_id before the cpu_init */ | 312 | /* otherwise gcc will move up smp_processor_id before the cpu_init */ |
320 | barrier(); | 313 | barrier(); |
321 | /* | 314 | /* |
@@ -397,6 +390,19 @@ void __cpuinit smp_store_cpu_info(int id) | |||
397 | identify_secondary_cpu(c); | 390 | identify_secondary_cpu(c); |
398 | } | 391 | } |
399 | 392 | ||
393 | static void __cpuinit link_thread_siblings(int cpu1, int cpu2) | ||
394 | { | ||
395 | struct cpuinfo_x86 *c1 = &cpu_data(cpu1); | ||
396 | struct cpuinfo_x86 *c2 = &cpu_data(cpu2); | ||
397 | |||
398 | cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); | ||
399 | cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); | ||
400 | cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); | ||
401 | cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); | ||
402 | cpumask_set_cpu(cpu1, c2->llc_shared_map); | ||
403 | cpumask_set_cpu(cpu2, c1->llc_shared_map); | ||
404 | } | ||
405 | |||
400 | 406 | ||
401 | void __cpuinit set_cpu_sibling_map(int cpu) | 407 | void __cpuinit set_cpu_sibling_map(int cpu) |
402 | { | 408 | { |
@@ -409,14 +415,13 @@ void __cpuinit set_cpu_sibling_map(int cpu) | |||
409 | for_each_cpu(i, cpu_sibling_setup_mask) { | 415 | for_each_cpu(i, cpu_sibling_setup_mask) { |
410 | struct cpuinfo_x86 *o = &cpu_data(i); | 416 | struct cpuinfo_x86 *o = &cpu_data(i); |
411 | 417 | ||
412 | if (c->phys_proc_id == o->phys_proc_id && | 418 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { |
413 | c->cpu_core_id == o->cpu_core_id) { | 419 | if (c->phys_proc_id == o->phys_proc_id && |
414 | cpumask_set_cpu(i, cpu_sibling_mask(cpu)); | 420 | c->compute_unit_id == o->compute_unit_id) |
415 | cpumask_set_cpu(cpu, cpu_sibling_mask(i)); | 421 | link_thread_siblings(cpu, i); |
416 | cpumask_set_cpu(i, cpu_core_mask(cpu)); | 422 | } else if (c->phys_proc_id == o->phys_proc_id && |
417 | cpumask_set_cpu(cpu, cpu_core_mask(i)); | 423 | c->cpu_core_id == o->cpu_core_id) { |
418 | cpumask_set_cpu(i, c->llc_shared_map); | 424 | link_thread_siblings(cpu, i); |
419 | cpumask_set_cpu(cpu, o->llc_shared_map); | ||
420 | } | 425 | } |
421 | } | 426 | } |
422 | } else { | 427 | } else { |
@@ -742,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) | |||
742 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), | 747 | .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), |
743 | }; | 748 | }; |
744 | 749 | ||
745 | INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); | 750 | INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); |
746 | 751 | ||
747 | alternatives_smp_switch(1); | 752 | alternatives_smp_switch(1); |
748 | 753 | ||
@@ -774,7 +779,6 @@ do_rest: | |||
774 | #ifdef CONFIG_X86_32 | 779 | #ifdef CONFIG_X86_32 |
775 | /* Stack for startup_32 can be just as for start_secondary onwards */ | 780 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
776 | irq_ctx_init(cpu); | 781 | irq_ctx_init(cpu); |
777 | initial_page_table = __pa(&trampoline_pg_dir); | ||
778 | #else | 782 | #else |
779 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | 783 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); |
780 | initial_gs = per_cpu_offset(cpu); | 784 | initial_gs = per_cpu_offset(cpu); |
@@ -923,7 +927,6 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
923 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 927 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; |
924 | 928 | ||
925 | err = do_boot_cpu(apicid, cpu); | 929 | err = do_boot_cpu(apicid, cpu); |
926 | |||
927 | if (err) { | 930 | if (err) { |
928 | pr_debug("do_boot_cpu failed %d\n", err); | 931 | pr_debug("do_boot_cpu failed %d\n", err); |
929 | return -EIO; | 932 | return -EIO; |
@@ -1370,7 +1373,6 @@ void play_dead_common(void) | |||
1370 | { | 1373 | { |
1371 | idle_task_exit(); | 1374 | idle_task_exit(); |
1372 | reset_lazy_tlbstate(); | 1375 | reset_lazy_tlbstate(); |
1373 | irq_ctx_exit(raw_smp_processor_id()); | ||
1374 | c1e_remove_cpu(raw_smp_processor_id()); | 1376 | c1e_remove_cpu(raw_smp_processor_id()); |
1375 | 1377 | ||
1376 | mb(); | 1378 | mb(); |
@@ -1383,11 +1385,88 @@ void play_dead_common(void) | |||
1383 | local_irq_disable(); | 1385 | local_irq_disable(); |
1384 | } | 1386 | } |
1385 | 1387 | ||
1388 | /* | ||
1389 | * We need to flush the caches before going to sleep, lest we have | ||
1390 | * dirty data in our caches when we come back up. | ||
1391 | */ | ||
1392 | static inline void mwait_play_dead(void) | ||
1393 | { | ||
1394 | unsigned int eax, ebx, ecx, edx; | ||
1395 | unsigned int highest_cstate = 0; | ||
1396 | unsigned int highest_subcstate = 0; | ||
1397 | int i; | ||
1398 | void *mwait_ptr; | ||
1399 | |||
1400 | if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT)) | ||
1401 | return; | ||
1402 | if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH)) | ||
1403 | return; | ||
1404 | if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) | ||
1405 | return; | ||
1406 | |||
1407 | eax = CPUID_MWAIT_LEAF; | ||
1408 | ecx = 0; | ||
1409 | native_cpuid(&eax, &ebx, &ecx, &edx); | ||
1410 | |||
1411 | /* | ||
1412 | * eax will be 0 if EDX enumeration is not valid. | ||
1413 | * Initialized below to cstate, sub_cstate value when EDX is valid. | ||
1414 | */ | ||
1415 | if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { | ||
1416 | eax = 0; | ||
1417 | } else { | ||
1418 | edx >>= MWAIT_SUBSTATE_SIZE; | ||
1419 | for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { | ||
1420 | if (edx & MWAIT_SUBSTATE_MASK) { | ||
1421 | highest_cstate = i; | ||
1422 | highest_subcstate = edx & MWAIT_SUBSTATE_MASK; | ||
1423 | } | ||
1424 | } | ||
1425 | eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | | ||
1426 | (highest_subcstate - 1); | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * This should be a memory location in a cache line which is | ||
1431 | * unlikely to be touched by other processors. The actual | ||
1432 | * content is immaterial as it is not actually modified in any way. | ||
1433 | */ | ||
1434 | mwait_ptr = &current_thread_info()->flags; | ||
1435 | |||
1436 | wbinvd(); | ||
1437 | |||
1438 | while (1) { | ||
1439 | /* | ||
1440 | * The CLFLUSH is a workaround for erratum AAI65 for | ||
1441 | * the Xeon 7400 series. It's not clear it is actually | ||
1442 | * needed, but it should be harmless in either case. | ||
1443 | * The WBINVD is insufficient due to the spurious-wakeup | ||
1444 | * case where we return around the loop. | ||
1445 | */ | ||
1446 | clflush(mwait_ptr); | ||
1447 | __monitor(mwait_ptr, 0, 0); | ||
1448 | mb(); | ||
1449 | __mwait(eax, 0); | ||
1450 | } | ||
1451 | } | ||
1452 | |||
1453 | static inline void hlt_play_dead(void) | ||
1454 | { | ||
1455 | if (current_cpu_data.x86 >= 4) | ||
1456 | wbinvd(); | ||
1457 | |||
1458 | while (1) { | ||
1459 | native_halt(); | ||
1460 | } | ||
1461 | } | ||
1462 | |||
1386 | void native_play_dead(void) | 1463 | void native_play_dead(void) |
1387 | { | 1464 | { |
1388 | play_dead_common(); | 1465 | play_dead_common(); |
1389 | tboot_shutdown(TB_SHUTDOWN_WFS); | 1466 | tboot_shutdown(TB_SHUTDOWN_WFS); |
1390 | wbinvd_halt(); | 1467 | |
1468 | mwait_play_dead(); /* Only returns on failure */ | ||
1469 | hlt_play_dead(); | ||
1391 | } | 1470 | } |
1392 | 1471 | ||
1393 | #else /* ... !CONFIG_HOTPLUG_CPU */ | 1472 | #else /* ... !CONFIG_HOTPLUG_CPU */ |
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d5e06624e34a..0b0cb5fede19 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c | |||
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename, | |||
33 | const char *const envp[]) | 33 | const char *const envp[]) |
34 | { | 34 | { |
35 | long __res; | 35 | long __res; |
36 | asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" | 36 | asm volatile ("int $0x80" |
37 | : "=a" (__res) | 37 | : "=a" (__res) |
38 | : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); | 38 | : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory"); |
39 | return __res; | 39 | return __res; |
40 | } | 40 | } |
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index e2a595257390..a375616d77f7 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c | |||
@@ -1,8 +1,8 @@ | |||
1 | #include <linux/io.h> | 1 | #include <linux/io.h> |
2 | #include <linux/memblock.h> | ||
2 | 3 | ||
3 | #include <asm/trampoline.h> | 4 | #include <asm/trampoline.h> |
4 | #include <asm/pgtable.h> | 5 | #include <asm/pgtable.h> |
5 | #include <asm/e820.h> | ||
6 | 6 | ||
7 | #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) | 7 | #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) |
8 | #define __trampinit | 8 | #define __trampinit |
@@ -17,15 +17,15 @@ unsigned char *__trampinitdata trampoline_base; | |||
17 | 17 | ||
18 | void __init reserve_trampoline_memory(void) | 18 | void __init reserve_trampoline_memory(void) |
19 | { | 19 | { |
20 | unsigned long mem; | 20 | phys_addr_t mem; |
21 | 21 | ||
22 | /* Has to be in very low memory so we can execute real-mode AP code. */ | 22 | /* Has to be in very low memory so we can execute real-mode AP code. */ |
23 | mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); | 23 | mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); |
24 | if (mem == -1L) | 24 | if (mem == MEMBLOCK_ERROR) |
25 | panic("Cannot allocate trampoline\n"); | 25 | panic("Cannot allocate trampoline\n"); |
26 | 26 | ||
27 | trampoline_base = __va(mem); | 27 | trampoline_base = __va(mem); |
28 | reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); | 28 | memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); |
29 | } | 29 | } |
30 | 30 | ||
31 | /* | 31 | /* |
@@ -38,19 +38,3 @@ unsigned long __trampinit setup_trampoline(void) | |||
38 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); | 38 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); |
39 | return virt_to_phys(trampoline_base); | 39 | return virt_to_phys(trampoline_base); |
40 | } | 40 | } |
41 | |||
42 | void __init setup_trampoline_page_table(void) | ||
43 | { | ||
44 | #ifdef CONFIG_X86_32 | ||
45 | /* Copy kernel address range */ | ||
46 | clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, | ||
47 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
48 | KERNEL_PGD_PTRS); | ||
49 | |||
50 | /* Initialize low mappings */ | ||
51 | clone_pgd_range(trampoline_pg_dir, | ||
52 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
53 | min_t(unsigned long, KERNEL_PGD_PTRS, | ||
54 | KERNEL_PGD_BOUNDARY)); | ||
55 | #endif | ||
56 | } | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 60788dee0f8a..cb838ca42c96 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -575,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
575 | if (regs->flags & X86_VM_MASK) { | 575 | if (regs->flags & X86_VM_MASK) { |
576 | handle_vm86_trap((struct kernel_vm86_regs *) regs, | 576 | handle_vm86_trap((struct kernel_vm86_regs *) regs, |
577 | error_code, 1); | 577 | error_code, 1); |
578 | preempt_conditional_cli(regs); | ||
578 | return; | 579 | return; |
579 | } | 580 | } |
580 | 581 | ||
@@ -776,21 +777,10 @@ asmlinkage void math_state_restore(void) | |||
776 | } | 777 | } |
777 | EXPORT_SYMBOL_GPL(math_state_restore); | 778 | EXPORT_SYMBOL_GPL(math_state_restore); |
778 | 779 | ||
779 | #ifndef CONFIG_MATH_EMULATION | ||
780 | void math_emulate(struct math_emu_info *info) | ||
781 | { | ||
782 | printk(KERN_EMERG | ||
783 | "math-emulation not enabled and no coprocessor found.\n"); | ||
784 | printk(KERN_EMERG "killing %s.\n", current->comm); | ||
785 | force_sig(SIGFPE, current); | ||
786 | schedule(); | ||
787 | } | ||
788 | #endif /* CONFIG_MATH_EMULATION */ | ||
789 | |||
790 | dotraplinkage void __kprobes | 780 | dotraplinkage void __kprobes |
791 | do_device_not_available(struct pt_regs *regs, long error_code) | 781 | do_device_not_available(struct pt_regs *regs, long error_code) |
792 | { | 782 | { |
793 | #ifdef CONFIG_X86_32 | 783 | #ifdef CONFIG_MATH_EMULATION |
794 | if (read_cr0() & X86_CR0_EM) { | 784 | if (read_cr0() & X86_CR0_EM) { |
795 | struct math_emu_info info = { }; | 785 | struct math_emu_info info = { }; |
796 | 786 | ||
@@ -798,12 +788,12 @@ do_device_not_available(struct pt_regs *regs, long error_code) | |||
798 | 788 | ||
799 | info.regs = regs; | 789 | info.regs = regs; |
800 | math_emulate(&info); | 790 | math_emulate(&info); |
801 | } else { | 791 | return; |
802 | math_state_restore(); /* interrupts still off */ | ||
803 | conditional_sti(regs); | ||
804 | } | 792 | } |
805 | #else | 793 | #endif |
806 | math_state_restore(); | 794 | math_state_restore(); /* interrupts still off */ |
795 | #ifdef CONFIG_X86_32 | ||
796 | conditional_sti(regs); | ||
807 | #endif | 797 | #endif |
808 | } | 798 | } |
809 | 799 | ||
@@ -881,18 +871,6 @@ void __init trap_init(void) | |||
881 | #endif | 871 | #endif |
882 | 872 | ||
883 | #ifdef CONFIG_X86_32 | 873 | #ifdef CONFIG_X86_32 |
884 | if (cpu_has_fxsr) { | ||
885 | printk(KERN_INFO "Enabling fast FPU save and restore... "); | ||
886 | set_in_cr4(X86_CR4_OSFXSR); | ||
887 | printk("done.\n"); | ||
888 | } | ||
889 | if (cpu_has_xmm) { | ||
890 | printk(KERN_INFO | ||
891 | "Enabling unmasked SIMD FPU exception support... "); | ||
892 | set_in_cr4(X86_CR4_OSXMMEXCPT); | ||
893 | printk("done.\n"); | ||
894 | } | ||
895 | |||
896 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); | 874 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); |
897 | set_bit(SYSCALL_VECTOR, used_vectors); | 875 | set_bit(SYSCALL_VECTOR, used_vectors); |
898 | #endif | 876 | #endif |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 26a863a9c2a8..0c40d8b72416 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str) | |||
104 | 104 | ||
105 | __setup("notsc", notsc_setup); | 105 | __setup("notsc", notsc_setup); |
106 | 106 | ||
107 | static int no_sched_irq_time; | ||
108 | |||
107 | static int __init tsc_setup(char *str) | 109 | static int __init tsc_setup(char *str) |
108 | { | 110 | { |
109 | if (!strcmp(str, "reliable")) | 111 | if (!strcmp(str, "reliable")) |
110 | tsc_clocksource_reliable = 1; | 112 | tsc_clocksource_reliable = 1; |
113 | if (!strncmp(str, "noirqtime", 9)) | ||
114 | no_sched_irq_time = 1; | ||
111 | return 1; | 115 | return 1; |
112 | } | 116 | } |
113 | 117 | ||
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason) | |||
801 | if (!tsc_unstable) { | 805 | if (!tsc_unstable) { |
802 | tsc_unstable = 1; | 806 | tsc_unstable = 1; |
803 | sched_clock_stable = 0; | 807 | sched_clock_stable = 0; |
808 | disable_sched_clock_irqtime(); | ||
804 | printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); | 809 | printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); |
805 | /* Change only the rating, when not registered */ | 810 | /* Change only the rating, when not registered */ |
806 | if (clocksource_tsc.mult) | 811 | if (clocksource_tsc.mult) |
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void) | |||
892 | clocksource_register_khz(&clocksource_tsc, tsc_khz); | 897 | clocksource_register_khz(&clocksource_tsc, tsc_khz); |
893 | } | 898 | } |
894 | 899 | ||
895 | #ifdef CONFIG_X86_64 | ||
896 | /* | ||
897 | * calibrate_cpu is used on systems with fixed rate TSCs to determine | ||
898 | * processor frequency | ||
899 | */ | ||
900 | #define TICK_COUNT 100000000 | ||
901 | static unsigned long __init calibrate_cpu(void) | ||
902 | { | ||
903 | int tsc_start, tsc_now; | ||
904 | int i, no_ctr_free; | ||
905 | unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; | ||
906 | unsigned long flags; | ||
907 | |||
908 | for (i = 0; i < 4; i++) | ||
909 | if (avail_to_resrv_perfctr_nmi_bit(i)) | ||
910 | break; | ||
911 | no_ctr_free = (i == 4); | ||
912 | if (no_ctr_free) { | ||
913 | WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " | ||
914 | "cpu_khz value may be incorrect.\n"); | ||
915 | i = 3; | ||
916 | rdmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
917 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
918 | rdmsrl(MSR_K7_PERFCTR3, pmc3); | ||
919 | } else { | ||
920 | reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
921 | reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
922 | } | ||
923 | local_irq_save(flags); | ||
924 | /* start measuring cycles, incrementing from 0 */ | ||
925 | wrmsrl(MSR_K7_PERFCTR0 + i, 0); | ||
926 | wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); | ||
927 | rdtscl(tsc_start); | ||
928 | do { | ||
929 | rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); | ||
930 | tsc_now = get_cycles(); | ||
931 | } while ((tsc_now - tsc_start) < TICK_COUNT); | ||
932 | |||
933 | local_irq_restore(flags); | ||
934 | if (no_ctr_free) { | ||
935 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
936 | wrmsrl(MSR_K7_PERFCTR3, pmc3); | ||
937 | wrmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
938 | } else { | ||
939 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
940 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
941 | } | ||
942 | |||
943 | return pmc_now * tsc_khz / (tsc_now - tsc_start); | ||
944 | } | ||
945 | #else | ||
946 | static inline unsigned long calibrate_cpu(void) { return cpu_khz; } | ||
947 | #endif | ||
948 | |||
949 | void __init tsc_init(void) | 900 | void __init tsc_init(void) |
950 | { | 901 | { |
951 | u64 lpj; | 902 | u64 lpj; |
@@ -964,10 +915,6 @@ void __init tsc_init(void) | |||
964 | return; | 915 | return; |
965 | } | 916 | } |
966 | 917 | ||
967 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && | ||
968 | (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) | ||
969 | cpu_khz = calibrate_cpu(); | ||
970 | |||
971 | printk("Detected %lu.%03lu MHz processor.\n", | 918 | printk("Detected %lu.%03lu MHz processor.\n", |
972 | (unsigned long)cpu_khz / 1000, | 919 | (unsigned long)cpu_khz / 1000, |
973 | (unsigned long)cpu_khz % 1000); | 920 | (unsigned long)cpu_khz % 1000); |
@@ -987,6 +934,9 @@ void __init tsc_init(void) | |||
987 | /* now allow native_sched_clock() to use rdtsc */ | 934 | /* now allow native_sched_clock() to use rdtsc */ |
988 | tsc_disabled = 0; | 935 | tsc_disabled = 0; |
989 | 936 | ||
937 | if (!no_sched_irq_time) | ||
938 | enable_sched_clock_irqtime(); | ||
939 | |||
990 | lpj = ((u64)tsc_khz * 1000); | 940 | lpj = ((u64)tsc_khz * 1000); |
991 | do_div(lpj, HZ); | 941 | do_div(lpj, HZ); |
992 | lpj_fine = lpj; | 942 | lpj_fine = lpj; |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5ffb5622f793..61fb98519622 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -551,8 +551,14 @@ cannot_handle: | |||
551 | int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) | 551 | int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) |
552 | { | 552 | { |
553 | if (VMPI.is_vm86pus) { | 553 | if (VMPI.is_vm86pus) { |
554 | if ((trapno == 3) || (trapno == 1)) | 554 | if ((trapno == 3) || (trapno == 1)) { |
555 | return_to_32bit(regs, VM86_TRAP + (trapno << 8)); | 555 | KVM86->regs32->ax = VM86_TRAP + (trapno << 8); |
556 | /* setting this flag forces the code in entry_32.S to | ||
557 | call save_v86_state() and change the stack pointer | ||
558 | to KVM86->regs32 */ | ||
559 | set_thread_flag(TIF_IRET); | ||
560 | return 0; | ||
561 | } | ||
556 | do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); | 562 | do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); |
557 | return 0; | 563 | return 0; |
558 | } | 564 | } |
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c deleted file mode 100644 index ce9fbacb7526..000000000000 --- a/arch/x86/kernel/vmi_32.c +++ /dev/null | |||
@@ -1,893 +0,0 @@ | |||
1 | /* | ||
2 | * VMI specific paravirt-ops implementation | ||
3 | * | ||
4 | * Copyright (C) 2005, VMware, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | * Send feedback to zach@vmware.com | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/cpu.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/gfp.h> | ||
32 | #include <asm/vmi.h> | ||
33 | #include <asm/io.h> | ||
34 | #include <asm/fixmap.h> | ||
35 | #include <asm/apicdef.h> | ||
36 | #include <asm/apic.h> | ||
37 | #include <asm/pgalloc.h> | ||
38 | #include <asm/processor.h> | ||
39 | #include <asm/timer.h> | ||
40 | #include <asm/vmi_time.h> | ||
41 | #include <asm/kmap_types.h> | ||
42 | #include <asm/setup.h> | ||
43 | |||
44 | /* Convenient for calling VMI functions indirectly in the ROM */ | ||
45 | typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); | ||
46 | typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); | ||
47 | |||
48 | #define call_vrom_func(rom,func) \ | ||
49 | (((VROMFUNC *)(rom->func))()) | ||
50 | |||
51 | #define call_vrom_long_func(rom,func,arg) \ | ||
52 | (((VROMLONGFUNC *)(rom->func)) (arg)) | ||
53 | |||
54 | static struct vrom_header *vmi_rom; | ||
55 | static int disable_pge; | ||
56 | static int disable_pse; | ||
57 | static int disable_sep; | ||
58 | static int disable_tsc; | ||
59 | static int disable_mtrr; | ||
60 | static int disable_noidle; | ||
61 | static int disable_vmi_timer; | ||
62 | |||
63 | /* Cached VMI operations */ | ||
64 | static struct { | ||
65 | void (*cpuid)(void /* non-c */); | ||
66 | void (*_set_ldt)(u32 selector); | ||
67 | void (*set_tr)(u32 selector); | ||
68 | void (*write_idt_entry)(struct desc_struct *, int, u32, u32); | ||
69 | void (*write_gdt_entry)(struct desc_struct *, int, u32, u32); | ||
70 | void (*write_ldt_entry)(struct desc_struct *, int, u32, u32); | ||
71 | void (*set_kernel_stack)(u32 selector, u32 sp0); | ||
72 | void (*allocate_page)(u32, u32, u32, u32, u32); | ||
73 | void (*release_page)(u32, u32); | ||
74 | void (*set_pte)(pte_t, pte_t *, unsigned); | ||
75 | void (*update_pte)(pte_t *, unsigned); | ||
76 | void (*set_linear_mapping)(int, void *, u32, u32); | ||
77 | void (*_flush_tlb)(int); | ||
78 | void (*set_initial_ap_state)(int, int); | ||
79 | void (*halt)(void); | ||
80 | void (*set_lazy_mode)(int mode); | ||
81 | } vmi_ops; | ||
82 | |||
83 | /* Cached VMI operations */ | ||
84 | struct vmi_timer_ops vmi_timer_ops; | ||
85 | |||
86 | /* | ||
87 | * VMI patching routines. | ||
88 | */ | ||
89 | #define MNEM_CALL 0xe8 | ||
90 | #define MNEM_JMP 0xe9 | ||
91 | #define MNEM_RET 0xc3 | ||
92 | |||
93 | #define IRQ_PATCH_INT_MASK 0 | ||
94 | #define IRQ_PATCH_DISABLE 5 | ||
95 | |||
96 | static inline void patch_offset(void *insnbuf, | ||
97 | unsigned long ip, unsigned long dest) | ||
98 | { | ||
99 | *(unsigned long *)(insnbuf+1) = dest-ip-5; | ||
100 | } | ||
101 | |||
102 | static unsigned patch_internal(int call, unsigned len, void *insnbuf, | ||
103 | unsigned long ip) | ||
104 | { | ||
105 | u64 reloc; | ||
106 | struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; | ||
107 | reloc = call_vrom_long_func(vmi_rom, get_reloc, call); | ||
108 | switch(rel->type) { | ||
109 | case VMI_RELOCATION_CALL_REL: | ||
110 | BUG_ON(len < 5); | ||
111 | *(char *)insnbuf = MNEM_CALL; | ||
112 | patch_offset(insnbuf, ip, (unsigned long)rel->eip); | ||
113 | return 5; | ||
114 | |||
115 | case VMI_RELOCATION_JUMP_REL: | ||
116 | BUG_ON(len < 5); | ||
117 | *(char *)insnbuf = MNEM_JMP; | ||
118 | patch_offset(insnbuf, ip, (unsigned long)rel->eip); | ||
119 | return 5; | ||
120 | |||
121 | case VMI_RELOCATION_NOP: | ||
122 | /* obliterate the whole thing */ | ||
123 | return 0; | ||
124 | |||
125 | case VMI_RELOCATION_NONE: | ||
126 | /* leave native code in place */ | ||
127 | break; | ||
128 | |||
129 | default: | ||
130 | BUG(); | ||
131 | } | ||
132 | return len; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Apply patch if appropriate, return length of new instruction | ||
137 | * sequence. The callee does nop padding for us. | ||
138 | */ | ||
139 | static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, | ||
140 | unsigned long ip, unsigned len) | ||
141 | { | ||
142 | switch (type) { | ||
143 | case PARAVIRT_PATCH(pv_irq_ops.irq_disable): | ||
144 | return patch_internal(VMI_CALL_DisableInterrupts, len, | ||
145 | insns, ip); | ||
146 | case PARAVIRT_PATCH(pv_irq_ops.irq_enable): | ||
147 | return patch_internal(VMI_CALL_EnableInterrupts, len, | ||
148 | insns, ip); | ||
149 | case PARAVIRT_PATCH(pv_irq_ops.restore_fl): | ||
150 | return patch_internal(VMI_CALL_SetInterruptMask, len, | ||
151 | insns, ip); | ||
152 | case PARAVIRT_PATCH(pv_irq_ops.save_fl): | ||
153 | return patch_internal(VMI_CALL_GetInterruptMask, len, | ||
154 | insns, ip); | ||
155 | case PARAVIRT_PATCH(pv_cpu_ops.iret): | ||
156 | return patch_internal(VMI_CALL_IRET, len, insns, ip); | ||
157 | case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): | ||
158 | return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); | ||
159 | default: | ||
160 | break; | ||
161 | } | ||
162 | return len; | ||
163 | } | ||
164 | |||
165 | /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ | ||
166 | static void vmi_cpuid(unsigned int *ax, unsigned int *bx, | ||
167 | unsigned int *cx, unsigned int *dx) | ||
168 | { | ||
169 | int override = 0; | ||
170 | if (*ax == 1) | ||
171 | override = 1; | ||
172 | asm volatile ("call *%6" | ||
173 | : "=a" (*ax), | ||
174 | "=b" (*bx), | ||
175 | "=c" (*cx), | ||
176 | "=d" (*dx) | ||
177 | : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); | ||
178 | if (override) { | ||
179 | if (disable_pse) | ||
180 | *dx &= ~X86_FEATURE_PSE; | ||
181 | if (disable_pge) | ||
182 | *dx &= ~X86_FEATURE_PGE; | ||
183 | if (disable_sep) | ||
184 | *dx &= ~X86_FEATURE_SEP; | ||
185 | if (disable_tsc) | ||
186 | *dx &= ~X86_FEATURE_TSC; | ||
187 | if (disable_mtrr) | ||
188 | *dx &= ~X86_FEATURE_MTRR; | ||
189 | } | ||
190 | } | ||
191 | |||
192 | static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) | ||
193 | { | ||
194 | if (gdt[nr].a != new->a || gdt[nr].b != new->b) | ||
195 | write_gdt_entry(gdt, nr, new, 0); | ||
196 | } | ||
197 | |||
198 | static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) | ||
199 | { | ||
200 | struct desc_struct *gdt = get_cpu_gdt_table(cpu); | ||
201 | vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); | ||
202 | vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); | ||
203 | vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); | ||
204 | } | ||
205 | |||
206 | static void vmi_set_ldt(const void *addr, unsigned entries) | ||
207 | { | ||
208 | unsigned cpu = smp_processor_id(); | ||
209 | struct desc_struct desc; | ||
210 | |||
211 | pack_descriptor(&desc, (unsigned long)addr, | ||
212 | entries * sizeof(struct desc_struct) - 1, | ||
213 | DESC_LDT, 0); | ||
214 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT); | ||
215 | vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); | ||
216 | } | ||
217 | |||
218 | static void vmi_set_tr(void) | ||
219 | { | ||
220 | vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); | ||
221 | } | ||
222 | |||
223 | static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) | ||
224 | { | ||
225 | u32 *idt_entry = (u32 *)g; | ||
226 | vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]); | ||
227 | } | ||
228 | |||
229 | static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, | ||
230 | const void *desc, int type) | ||
231 | { | ||
232 | u32 *gdt_entry = (u32 *)desc; | ||
233 | vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]); | ||
234 | } | ||
235 | |||
236 | static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, | ||
237 | const void *desc) | ||
238 | { | ||
239 | u32 *ldt_entry = (u32 *)desc; | ||
240 | vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); | ||
241 | } | ||
242 | |||
243 | static void vmi_load_sp0(struct tss_struct *tss, | ||
244 | struct thread_struct *thread) | ||
245 | { | ||
246 | tss->x86_tss.sp0 = thread->sp0; | ||
247 | |||
248 | /* This can only happen when SEP is enabled, no need to test "SEP"arately */ | ||
249 | if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { | ||
250 | tss->x86_tss.ss1 = thread->sysenter_cs; | ||
251 | wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); | ||
252 | } | ||
253 | vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0); | ||
254 | } | ||
255 | |||
256 | static void vmi_flush_tlb_user(void) | ||
257 | { | ||
258 | vmi_ops._flush_tlb(VMI_FLUSH_TLB); | ||
259 | } | ||
260 | |||
261 | static void vmi_flush_tlb_kernel(void) | ||
262 | { | ||
263 | vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); | ||
264 | } | ||
265 | |||
266 | /* Stub to do nothing at all; used for delays and unimplemented calls */ | ||
267 | static void vmi_nop(void) | ||
268 | { | ||
269 | } | ||
270 | |||
271 | static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) | ||
272 | { | ||
273 | vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); | ||
274 | } | ||
275 | |||
276 | static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) | ||
277 | { | ||
278 | /* | ||
279 | * This call comes in very early, before mem_map is setup. | ||
280 | * It is called only for swapper_pg_dir, which already has | ||
281 | * data on it. | ||
282 | */ | ||
283 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); | ||
284 | } | ||
285 | |||
286 | static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) | ||
287 | { | ||
288 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); | ||
289 | } | ||
290 | |||
291 | static void vmi_release_pte(unsigned long pfn) | ||
292 | { | ||
293 | vmi_ops.release_page(pfn, VMI_PAGE_L1); | ||
294 | } | ||
295 | |||
296 | static void vmi_release_pmd(unsigned long pfn) | ||
297 | { | ||
298 | vmi_ops.release_page(pfn, VMI_PAGE_L2); | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * We use the pgd_free hook for releasing the pgd page: | ||
303 | */ | ||
304 | static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd) | ||
305 | { | ||
306 | unsigned long pfn = __pa(pgd) >> PAGE_SHIFT; | ||
307 | |||
308 | vmi_ops.release_page(pfn, VMI_PAGE_L2); | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * Helper macros for MMU update flags. We can defer updates until a flush | ||
313 | * or page invalidation only if the update is to the current address space | ||
314 | * (otherwise, there is no flush). We must check against init_mm, since | ||
315 | * this could be a kernel update, which usually passes init_mm, although | ||
316 | * sometimes this check can be skipped if we know the particular function | ||
317 | * is only called on user mode PTEs. We could change the kernel to pass | ||
318 | * current->active_mm here, but in particular, I was unsure if changing | ||
319 | * mm/highmem.c to do this would still be correct on other architectures. | ||
320 | */ | ||
321 | #define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \ | ||
322 | (!mustbeuser && (mm) == &init_mm)) | ||
323 | #define vmi_flags_addr(mm, addr, level, user) \ | ||
324 | ((level) | (is_current_as(mm, user) ? \ | ||
325 | (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) | ||
326 | #define vmi_flags_addr_defer(mm, addr, level, user) \ | ||
327 | ((level) | (is_current_as(mm, user) ? \ | ||
328 | (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) | ||
329 | |||
330 | static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
331 | { | ||
332 | vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); | ||
333 | } | ||
334 | |||
335 | static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
336 | { | ||
337 | vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); | ||
338 | } | ||
339 | |||
340 | static void vmi_set_pte(pte_t *ptep, pte_t pte) | ||
341 | { | ||
342 | /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ | ||
343 | vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); | ||
344 | } | ||
345 | |||
346 | static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) | ||
347 | { | ||
348 | vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); | ||
349 | } | ||
350 | |||
351 | static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) | ||
352 | { | ||
353 | #ifdef CONFIG_X86_PAE | ||
354 | const pte_t pte = { .pte = pmdval.pmd }; | ||
355 | #else | ||
356 | const pte_t pte = { pmdval.pud.pgd.pgd }; | ||
357 | #endif | ||
358 | vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); | ||
359 | } | ||
360 | |||
361 | #ifdef CONFIG_X86_PAE | ||
362 | |||
363 | static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) | ||
364 | { | ||
365 | /* | ||
366 | * XXX This is called from set_pmd_pte, but at both PT | ||
367 | * and PD layers so the VMI_PAGE_PT flag is wrong. But | ||
368 | * it is only called for large page mapping changes, | ||
369 | * the Xen backend, doesn't support large pages, and the | ||
370 | * ESX backend doesn't depend on the flag. | ||
371 | */ | ||
372 | set_64bit((unsigned long long *)ptep,pte_val(pteval)); | ||
373 | vmi_ops.update_pte(ptep, VMI_PAGE_PT); | ||
374 | } | ||
375 | |||
376 | static void vmi_set_pud(pud_t *pudp, pud_t pudval) | ||
377 | { | ||
378 | /* Um, eww */ | ||
379 | const pte_t pte = { .pte = pudval.pgd.pgd }; | ||
380 | vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); | ||
381 | } | ||
382 | |||
383 | static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
384 | { | ||
385 | const pte_t pte = { .pte = 0 }; | ||
386 | vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); | ||
387 | } | ||
388 | |||
389 | static void vmi_pmd_clear(pmd_t *pmd) | ||
390 | { | ||
391 | const pte_t pte = { .pte = 0 }; | ||
392 | vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); | ||
393 | } | ||
394 | #endif | ||
395 | |||
396 | #ifdef CONFIG_SMP | ||
397 | static void __devinit | ||
398 | vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, | ||
399 | unsigned long start_esp) | ||
400 | { | ||
401 | struct vmi_ap_state ap; | ||
402 | |||
403 | /* Default everything to zero. This is fine for most GPRs. */ | ||
404 | memset(&ap, 0, sizeof(struct vmi_ap_state)); | ||
405 | |||
406 | ap.gdtr_limit = GDT_SIZE - 1; | ||
407 | ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid); | ||
408 | |||
409 | ap.idtr_limit = IDT_ENTRIES * 8 - 1; | ||
410 | ap.idtr_base = (unsigned long) idt_table; | ||
411 | |||
412 | ap.ldtr = 0; | ||
413 | |||
414 | ap.cs = __KERNEL_CS; | ||
415 | ap.eip = (unsigned long) start_eip; | ||
416 | ap.ss = __KERNEL_DS; | ||
417 | ap.esp = (unsigned long) start_esp; | ||
418 | |||
419 | ap.ds = __USER_DS; | ||
420 | ap.es = __USER_DS; | ||
421 | ap.fs = __KERNEL_PERCPU; | ||
422 | ap.gs = __KERNEL_STACK_CANARY; | ||
423 | |||
424 | ap.eflags = 0; | ||
425 | |||
426 | #ifdef CONFIG_X86_PAE | ||
427 | /* efer should match BSP efer. */ | ||
428 | if (cpu_has_nx) { | ||
429 | unsigned l, h; | ||
430 | rdmsr(MSR_EFER, l, h); | ||
431 | ap.efer = (unsigned long long) h << 32 | l; | ||
432 | } | ||
433 | #endif | ||
434 | |||
435 | ap.cr3 = __pa(swapper_pg_dir); | ||
436 | /* Protected mode, paging, AM, WP, NE, MP. */ | ||
437 | ap.cr0 = 0x80050023; | ||
438 | ap.cr4 = mmu_cr4_features; | ||
439 | vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid); | ||
440 | } | ||
441 | #endif | ||
442 | |||
443 | static void vmi_start_context_switch(struct task_struct *prev) | ||
444 | { | ||
445 | paravirt_start_context_switch(prev); | ||
446 | vmi_ops.set_lazy_mode(2); | ||
447 | } | ||
448 | |||
449 | static void vmi_end_context_switch(struct task_struct *next) | ||
450 | { | ||
451 | vmi_ops.set_lazy_mode(0); | ||
452 | paravirt_end_context_switch(next); | ||
453 | } | ||
454 | |||
455 | static void vmi_enter_lazy_mmu(void) | ||
456 | { | ||
457 | paravirt_enter_lazy_mmu(); | ||
458 | vmi_ops.set_lazy_mode(1); | ||
459 | } | ||
460 | |||
461 | static void vmi_leave_lazy_mmu(void) | ||
462 | { | ||
463 | vmi_ops.set_lazy_mode(0); | ||
464 | paravirt_leave_lazy_mmu(); | ||
465 | } | ||
466 | |||
467 | static inline int __init check_vmi_rom(struct vrom_header *rom) | ||
468 | { | ||
469 | struct pci_header *pci; | ||
470 | struct pnp_header *pnp; | ||
471 | const char *manufacturer = "UNKNOWN"; | ||
472 | const char *product = "UNKNOWN"; | ||
473 | const char *license = "unspecified"; | ||
474 | |||
475 | if (rom->rom_signature != 0xaa55) | ||
476 | return 0; | ||
477 | if (rom->vrom_signature != VMI_SIGNATURE) | ||
478 | return 0; | ||
479 | if (rom->api_version_maj != VMI_API_REV_MAJOR || | ||
480 | rom->api_version_min+1 < VMI_API_REV_MINOR+1) { | ||
481 | printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", | ||
482 | rom->api_version_maj, | ||
483 | rom->api_version_min); | ||
484 | return 0; | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * Relying on the VMI_SIGNATURE field is not 100% safe, so check | ||
489 | * the PCI header and device type to make sure this is really a | ||
490 | * VMI device. | ||
491 | */ | ||
492 | if (!rom->pci_header_offs) { | ||
493 | printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); | ||
494 | return 0; | ||
495 | } | ||
496 | |||
497 | pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); | ||
498 | if (pci->vendorID != PCI_VENDOR_ID_VMWARE || | ||
499 | pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { | ||
500 | /* Allow it to run... anyways, but warn */ | ||
501 | printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); | ||
502 | } | ||
503 | |||
504 | if (rom->pnp_header_offs) { | ||
505 | pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); | ||
506 | if (pnp->manufacturer_offset) | ||
507 | manufacturer = (const char *)rom+pnp->manufacturer_offset; | ||
508 | if (pnp->product_offset) | ||
509 | product = (const char *)rom+pnp->product_offset; | ||
510 | } | ||
511 | |||
512 | if (rom->license_offs) | ||
513 | license = (char *)rom+rom->license_offs; | ||
514 | |||
515 | printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", | ||
516 | manufacturer, product, | ||
517 | rom->api_version_maj, rom->api_version_min, | ||
518 | pci->rom_version_maj, pci->rom_version_min); | ||
519 | |||
520 | /* Don't allow BSD/MIT here for now because we don't want to end up | ||
521 | with any binary only shim layers */ | ||
522 | if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) { | ||
523 | printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n", | ||
524 | license); | ||
525 | return 0; | ||
526 | } | ||
527 | |||
528 | return 1; | ||
529 | } | ||
530 | |||
531 | /* | ||
532 | * Probe for the VMI option ROM | ||
533 | */ | ||
534 | static inline int __init probe_vmi_rom(void) | ||
535 | { | ||
536 | unsigned long base; | ||
537 | |||
538 | /* VMI ROM is in option ROM area, check signature */ | ||
539 | for (base = 0xC0000; base < 0xE0000; base += 2048) { | ||
540 | struct vrom_header *romstart; | ||
541 | romstart = (struct vrom_header *)isa_bus_to_virt(base); | ||
542 | if (check_vmi_rom(romstart)) { | ||
543 | vmi_rom = romstart; | ||
544 | return 1; | ||
545 | } | ||
546 | } | ||
547 | return 0; | ||
548 | } | ||
549 | |||
550 | /* | ||
551 | * VMI setup common to all processors | ||
552 | */ | ||
553 | void vmi_bringup(void) | ||
554 | { | ||
555 | /* We must establish the lowmem mapping for MMU ops to work */ | ||
556 | if (vmi_ops.set_linear_mapping) | ||
557 | vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Return a pointer to a VMI function or NULL if unimplemented | ||
562 | */ | ||
563 | static void *vmi_get_function(int vmicall) | ||
564 | { | ||
565 | u64 reloc; | ||
566 | const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; | ||
567 | reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); | ||
568 | BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); | ||
569 | if (rel->type == VMI_RELOCATION_CALL_REL) | ||
570 | return (void *)rel->eip; | ||
571 | else | ||
572 | return NULL; | ||
573 | } | ||
574 | |||
575 | /* | ||
576 | * Helper macro for making the VMI paravirt-ops fill code readable. | ||
577 | * For unimplemented operations, fall back to default, unless nop | ||
578 | * is returned by the ROM. | ||
579 | */ | ||
580 | #define para_fill(opname, vmicall) \ | ||
581 | do { \ | ||
582 | reloc = call_vrom_long_func(vmi_rom, get_reloc, \ | ||
583 | VMI_CALL_##vmicall); \ | ||
584 | if (rel->type == VMI_RELOCATION_CALL_REL) \ | ||
585 | opname = (void *)rel->eip; \ | ||
586 | else if (rel->type == VMI_RELOCATION_NOP) \ | ||
587 | opname = (void *)vmi_nop; \ | ||
588 | else if (rel->type != VMI_RELOCATION_NONE) \ | ||
589 | printk(KERN_WARNING "VMI: Unknown relocation " \ | ||
590 | "type %d for " #vmicall"\n",\ | ||
591 | rel->type); \ | ||
592 | } while (0) | ||
593 | |||
594 | /* | ||
595 | * Helper macro for making the VMI paravirt-ops fill code readable. | ||
596 | * For cached operations which do not match the VMI ROM ABI and must | ||
597 | * go through a tranlation stub. Ignore NOPs, since it is not clear | ||
598 | * a NOP * VMI function corresponds to a NOP paravirt-op when the | ||
599 | * functions are not in 1-1 correspondence. | ||
600 | */ | ||
601 | #define para_wrap(opname, wrapper, cache, vmicall) \ | ||
602 | do { \ | ||
603 | reloc = call_vrom_long_func(vmi_rom, get_reloc, \ | ||
604 | VMI_CALL_##vmicall); \ | ||
605 | BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ | ||
606 | if (rel->type == VMI_RELOCATION_CALL_REL) { \ | ||
607 | opname = wrapper; \ | ||
608 | vmi_ops.cache = (void *)rel->eip; \ | ||
609 | } \ | ||
610 | } while (0) | ||
611 | |||
612 | /* | ||
613 | * Activate the VMI interface and switch into paravirtualized mode | ||
614 | */ | ||
615 | static inline int __init activate_vmi(void) | ||
616 | { | ||
617 | short kernel_cs; | ||
618 | u64 reloc; | ||
619 | const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; | ||
620 | |||
621 | /* | ||
622 | * Prevent page tables from being allocated in highmem, even if | ||
623 | * CONFIG_HIGHPTE is enabled. | ||
624 | */ | ||
625 | __userpte_alloc_gfp &= ~__GFP_HIGHMEM; | ||
626 | |||
627 | if (call_vrom_func(vmi_rom, vmi_init) != 0) { | ||
628 | printk(KERN_ERR "VMI ROM failed to initialize!"); | ||
629 | return 0; | ||
630 | } | ||
631 | savesegment(cs, kernel_cs); | ||
632 | |||
633 | pv_info.paravirt_enabled = 1; | ||
634 | pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; | ||
635 | pv_info.name = "vmi [deprecated]"; | ||
636 | |||
637 | pv_init_ops.patch = vmi_patch; | ||
638 | |||
639 | /* | ||
640 | * Many of these operations are ABI compatible with VMI. | ||
641 | * This means we can fill in the paravirt-ops with direct | ||
642 | * pointers into the VMI ROM. If the calling convention for | ||
643 | * these operations changes, this code needs to be updated. | ||
644 | * | ||
645 | * Exceptions | ||
646 | * CPUID paravirt-op uses pointers, not the native ISA | ||
647 | * halt has no VMI equivalent; all VMI halts are "safe" | ||
648 | * no MSR support yet - just trap and emulate. VMI uses the | ||
649 | * same ABI as the native ISA, but Linux wants exceptions | ||
650 | * from bogus MSR read / write handled | ||
651 | * rdpmc is not yet used in Linux | ||
652 | */ | ||
653 | |||
654 | /* CPUID is special, so very special it gets wrapped like a present */ | ||
655 | para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); | ||
656 | |||
657 | para_fill(pv_cpu_ops.clts, CLTS); | ||
658 | para_fill(pv_cpu_ops.get_debugreg, GetDR); | ||
659 | para_fill(pv_cpu_ops.set_debugreg, SetDR); | ||
660 | para_fill(pv_cpu_ops.read_cr0, GetCR0); | ||
661 | para_fill(pv_mmu_ops.read_cr2, GetCR2); | ||
662 | para_fill(pv_mmu_ops.read_cr3, GetCR3); | ||
663 | para_fill(pv_cpu_ops.read_cr4, GetCR4); | ||
664 | para_fill(pv_cpu_ops.write_cr0, SetCR0); | ||
665 | para_fill(pv_mmu_ops.write_cr2, SetCR2); | ||
666 | para_fill(pv_mmu_ops.write_cr3, SetCR3); | ||
667 | para_fill(pv_cpu_ops.write_cr4, SetCR4); | ||
668 | |||
669 | para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); | ||
670 | para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); | ||
671 | para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); | ||
672 | para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts); | ||
673 | |||
674 | para_fill(pv_cpu_ops.wbinvd, WBINVD); | ||
675 | para_fill(pv_cpu_ops.read_tsc, RDTSC); | ||
676 | |||
677 | /* The following we emulate with trap and emulate for now */ | ||
678 | /* paravirt_ops.read_msr = vmi_rdmsr */ | ||
679 | /* paravirt_ops.write_msr = vmi_wrmsr */ | ||
680 | /* paravirt_ops.rdpmc = vmi_rdpmc */ | ||
681 | |||
682 | /* TR interface doesn't pass TR value, wrap */ | ||
683 | para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); | ||
684 | |||
685 | /* LDT is special, too */ | ||
686 | para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); | ||
687 | |||
688 | para_fill(pv_cpu_ops.load_gdt, SetGDT); | ||
689 | para_fill(pv_cpu_ops.load_idt, SetIDT); | ||
690 | para_fill(pv_cpu_ops.store_gdt, GetGDT); | ||
691 | para_fill(pv_cpu_ops.store_idt, GetIDT); | ||
692 | para_fill(pv_cpu_ops.store_tr, GetTR); | ||
693 | pv_cpu_ops.load_tls = vmi_load_tls; | ||
694 | para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry, | ||
695 | write_ldt_entry, WriteLDTEntry); | ||
696 | para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry, | ||
697 | write_gdt_entry, WriteGDTEntry); | ||
698 | para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry, | ||
699 | write_idt_entry, WriteIDTEntry); | ||
700 | para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack); | ||
701 | para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); | ||
702 | para_fill(pv_cpu_ops.io_delay, IODelay); | ||
703 | |||
704 | para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, | ||
705 | set_lazy_mode, SetLazyMode); | ||
706 | para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, | ||
707 | set_lazy_mode, SetLazyMode); | ||
708 | |||
709 | para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, | ||
710 | set_lazy_mode, SetLazyMode); | ||
711 | para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, | ||
712 | set_lazy_mode, SetLazyMode); | ||
713 | |||
714 | /* user and kernel flush are just handled with different flags to FlushTLB */ | ||
715 | para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); | ||
716 | para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); | ||
717 | para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); | ||
718 | |||
719 | /* | ||
720 | * Until a standard flag format can be agreed on, we need to | ||
721 | * implement these as wrappers in Linux. Get the VMI ROM | ||
722 | * function pointers for the two backend calls. | ||
723 | */ | ||
724 | #ifdef CONFIG_X86_PAE | ||
725 | vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); | ||
726 | vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); | ||
727 | #else | ||
728 | vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); | ||
729 | vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); | ||
730 | #endif | ||
731 | |||
732 | if (vmi_ops.set_pte) { | ||
733 | pv_mmu_ops.set_pte = vmi_set_pte; | ||
734 | pv_mmu_ops.set_pte_at = vmi_set_pte_at; | ||
735 | pv_mmu_ops.set_pmd = vmi_set_pmd; | ||
736 | #ifdef CONFIG_X86_PAE | ||
737 | pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; | ||
738 | pv_mmu_ops.set_pud = vmi_set_pud; | ||
739 | pv_mmu_ops.pte_clear = vmi_pte_clear; | ||
740 | pv_mmu_ops.pmd_clear = vmi_pmd_clear; | ||
741 | #endif | ||
742 | } | ||
743 | |||
744 | if (vmi_ops.update_pte) { | ||
745 | pv_mmu_ops.pte_update = vmi_update_pte; | ||
746 | pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; | ||
747 | } | ||
748 | |||
749 | vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); | ||
750 | if (vmi_ops.allocate_page) { | ||
751 | pv_mmu_ops.alloc_pte = vmi_allocate_pte; | ||
752 | pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; | ||
753 | pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; | ||
754 | } | ||
755 | |||
756 | vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); | ||
757 | if (vmi_ops.release_page) { | ||
758 | pv_mmu_ops.release_pte = vmi_release_pte; | ||
759 | pv_mmu_ops.release_pmd = vmi_release_pmd; | ||
760 | pv_mmu_ops.pgd_free = vmi_pgd_free; | ||
761 | } | ||
762 | |||
763 | /* Set linear is needed in all cases */ | ||
764 | vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); | ||
765 | |||
766 | /* | ||
767 | * These MUST always be patched. Don't support indirect jumps | ||
768 | * through these operations, as the VMI interface may use either | ||
769 | * a jump or a call to get to these operations, depending on | ||
770 | * the backend. They are performance critical anyway, so requiring | ||
771 | * a patch is not a big problem. | ||
772 | */ | ||
773 | pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; | ||
774 | pv_cpu_ops.iret = (void *)0xbadbab0; | ||
775 | |||
776 | #ifdef CONFIG_SMP | ||
777 | para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); | ||
778 | #endif | ||
779 | |||
780 | #ifdef CONFIG_X86_LOCAL_APIC | ||
781 | para_fill(apic->read, APICRead); | ||
782 | para_fill(apic->write, APICWrite); | ||
783 | #endif | ||
784 | |||
785 | /* | ||
786 | * Check for VMI timer functionality by probing for a cycle frequency method | ||
787 | */ | ||
788 | reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); | ||
789 | if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) { | ||
790 | vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; | ||
791 | vmi_timer_ops.get_cycle_counter = | ||
792 | vmi_get_function(VMI_CALL_GetCycleCounter); | ||
793 | vmi_timer_ops.get_wallclock = | ||
794 | vmi_get_function(VMI_CALL_GetWallclockTime); | ||
795 | vmi_timer_ops.wallclock_updated = | ||
796 | vmi_get_function(VMI_CALL_WallclockUpdated); | ||
797 | vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); | ||
798 | vmi_timer_ops.cancel_alarm = | ||
799 | vmi_get_function(VMI_CALL_CancelAlarm); | ||
800 | x86_init.timers.timer_init = vmi_time_init; | ||
801 | #ifdef CONFIG_X86_LOCAL_APIC | ||
802 | x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; | ||
803 | x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; | ||
804 | #endif | ||
805 | pv_time_ops.sched_clock = vmi_sched_clock; | ||
806 | x86_platform.calibrate_tsc = vmi_tsc_khz; | ||
807 | x86_platform.get_wallclock = vmi_get_wallclock; | ||
808 | x86_platform.set_wallclock = vmi_set_wallclock; | ||
809 | |||
810 | /* We have true wallclock functions; disable CMOS clock sync */ | ||
811 | no_sync_cmos_clock = 1; | ||
812 | } else { | ||
813 | disable_noidle = 1; | ||
814 | disable_vmi_timer = 1; | ||
815 | } | ||
816 | |||
817 | para_fill(pv_irq_ops.safe_halt, Halt); | ||
818 | |||
819 | /* | ||
820 | * Alternative instruction rewriting doesn't happen soon enough | ||
821 | * to convert VMI_IRET to a call instead of a jump; so we have | ||
822 | * to do this before IRQs get reenabled. Fortunately, it is | ||
823 | * idempotent. | ||
824 | */ | ||
825 | apply_paravirt(__parainstructions, __parainstructions_end); | ||
826 | |||
827 | vmi_bringup(); | ||
828 | |||
829 | return 1; | ||
830 | } | ||
831 | |||
832 | #undef para_fill | ||
833 | |||
834 | void __init vmi_init(void) | ||
835 | { | ||
836 | if (!vmi_rom) | ||
837 | probe_vmi_rom(); | ||
838 | else | ||
839 | check_vmi_rom(vmi_rom); | ||
840 | |||
841 | /* In case probing for or validating the ROM failed, basil */ | ||
842 | if (!vmi_rom) | ||
843 | return; | ||
844 | |||
845 | reserve_top_address(-vmi_rom->virtual_top); | ||
846 | |||
847 | #ifdef CONFIG_X86_IO_APIC | ||
848 | /* This is virtual hardware; timer routing is wired correctly */ | ||
849 | no_timer_check = 1; | ||
850 | #endif | ||
851 | } | ||
852 | |||
853 | void __init vmi_activate(void) | ||
854 | { | ||
855 | unsigned long flags; | ||
856 | |||
857 | if (!vmi_rom) | ||
858 | return; | ||
859 | |||
860 | local_irq_save(flags); | ||
861 | activate_vmi(); | ||
862 | local_irq_restore(flags & X86_EFLAGS_IF); | ||
863 | } | ||
864 | |||
865 | static int __init parse_vmi(char *arg) | ||
866 | { | ||
867 | if (!arg) | ||
868 | return -EINVAL; | ||
869 | |||
870 | if (!strcmp(arg, "disable_pge")) { | ||
871 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); | ||
872 | disable_pge = 1; | ||
873 | } else if (!strcmp(arg, "disable_pse")) { | ||
874 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE); | ||
875 | disable_pse = 1; | ||
876 | } else if (!strcmp(arg, "disable_sep")) { | ||
877 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); | ||
878 | disable_sep = 1; | ||
879 | } else if (!strcmp(arg, "disable_tsc")) { | ||
880 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC); | ||
881 | disable_tsc = 1; | ||
882 | } else if (!strcmp(arg, "disable_mtrr")) { | ||
883 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR); | ||
884 | disable_mtrr = 1; | ||
885 | } else if (!strcmp(arg, "disable_timer")) { | ||
886 | disable_vmi_timer = 1; | ||
887 | disable_noidle = 1; | ||
888 | } else if (!strcmp(arg, "disable_noidle")) | ||
889 | disable_noidle = 1; | ||
890 | return 0; | ||
891 | } | ||
892 | |||
893 | early_param("vmi", parse_vmi); | ||
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c deleted file mode 100644 index 5e1ff66ecd73..000000000000 --- a/arch/x86/kernel/vmiclock_32.c +++ /dev/null | |||
@@ -1,317 +0,0 @@ | |||
1 | /* | ||
2 | * VMI paravirtual timer support routines. | ||
3 | * | ||
4 | * Copyright (C) 2007, VMware, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
14 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #include <linux/smp.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/cpumask.h> | ||
26 | #include <linux/clocksource.h> | ||
27 | #include <linux/clockchips.h> | ||
28 | |||
29 | #include <asm/vmi.h> | ||
30 | #include <asm/vmi_time.h> | ||
31 | #include <asm/apicdef.h> | ||
32 | #include <asm/apic.h> | ||
33 | #include <asm/timer.h> | ||
34 | #include <asm/i8253.h> | ||
35 | #include <asm/irq_vectors.h> | ||
36 | |||
37 | #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) | ||
38 | #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) | ||
39 | |||
40 | static DEFINE_PER_CPU(struct clock_event_device, local_events); | ||
41 | |||
42 | static inline u32 vmi_counter(u32 flags) | ||
43 | { | ||
44 | /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding | ||
45 | * cycle counter. */ | ||
46 | return flags & VMI_ALARM_COUNTER_MASK; | ||
47 | } | ||
48 | |||
49 | /* paravirt_ops.get_wallclock = vmi_get_wallclock */ | ||
50 | unsigned long vmi_get_wallclock(void) | ||
51 | { | ||
52 | unsigned long long wallclock; | ||
53 | wallclock = vmi_timer_ops.get_wallclock(); // nsec | ||
54 | (void)do_div(wallclock, 1000000000); // sec | ||
55 | |||
56 | return wallclock; | ||
57 | } | ||
58 | |||
59 | /* paravirt_ops.set_wallclock = vmi_set_wallclock */ | ||
60 | int vmi_set_wallclock(unsigned long now) | ||
61 | { | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | /* paravirt_ops.sched_clock = vmi_sched_clock */ | ||
66 | unsigned long long vmi_sched_clock(void) | ||
67 | { | ||
68 | return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); | ||
69 | } | ||
70 | |||
71 | /* x86_platform.calibrate_tsc = vmi_tsc_khz */ | ||
72 | unsigned long vmi_tsc_khz(void) | ||
73 | { | ||
74 | unsigned long long khz; | ||
75 | khz = vmi_timer_ops.get_cycle_frequency(); | ||
76 | (void)do_div(khz, 1000); | ||
77 | return khz; | ||
78 | } | ||
79 | |||
80 | static inline unsigned int vmi_get_timer_vector(void) | ||
81 | { | ||
82 | return IRQ0_VECTOR; | ||
83 | } | ||
84 | |||
85 | /** vmi clockchip */ | ||
86 | #ifdef CONFIG_X86_LOCAL_APIC | ||
87 | static unsigned int startup_timer_irq(unsigned int irq) | ||
88 | { | ||
89 | unsigned long val = apic_read(APIC_LVTT); | ||
90 | apic_write(APIC_LVTT, vmi_get_timer_vector()); | ||
91 | |||
92 | return (val & APIC_SEND_PENDING); | ||
93 | } | ||
94 | |||
95 | static void mask_timer_irq(unsigned int irq) | ||
96 | { | ||
97 | unsigned long val = apic_read(APIC_LVTT); | ||
98 | apic_write(APIC_LVTT, val | APIC_LVT_MASKED); | ||
99 | } | ||
100 | |||
101 | static void unmask_timer_irq(unsigned int irq) | ||
102 | { | ||
103 | unsigned long val = apic_read(APIC_LVTT); | ||
104 | apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); | ||
105 | } | ||
106 | |||
107 | static void ack_timer_irq(unsigned int irq) | ||
108 | { | ||
109 | ack_APIC_irq(); | ||
110 | } | ||
111 | |||
112 | static struct irq_chip vmi_chip __read_mostly = { | ||
113 | .name = "VMI-LOCAL", | ||
114 | .startup = startup_timer_irq, | ||
115 | .mask = mask_timer_irq, | ||
116 | .unmask = unmask_timer_irq, | ||
117 | .ack = ack_timer_irq | ||
118 | }; | ||
119 | #endif | ||
120 | |||
121 | /** vmi clockevent */ | ||
122 | #define VMI_ALARM_WIRED_IRQ0 0x00000000 | ||
123 | #define VMI_ALARM_WIRED_LVTT 0x00010000 | ||
124 | static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; | ||
125 | |||
126 | static inline int vmi_get_alarm_wiring(void) | ||
127 | { | ||
128 | return vmi_wiring; | ||
129 | } | ||
130 | |||
131 | static void vmi_timer_set_mode(enum clock_event_mode mode, | ||
132 | struct clock_event_device *evt) | ||
133 | { | ||
134 | cycle_t now, cycles_per_hz; | ||
135 | BUG_ON(!irqs_disabled()); | ||
136 | |||
137 | switch (mode) { | ||
138 | case CLOCK_EVT_MODE_ONESHOT: | ||
139 | case CLOCK_EVT_MODE_RESUME: | ||
140 | break; | ||
141 | case CLOCK_EVT_MODE_PERIODIC: | ||
142 | cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); | ||
143 | (void)do_div(cycles_per_hz, HZ); | ||
144 | now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); | ||
145 | vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); | ||
146 | break; | ||
147 | case CLOCK_EVT_MODE_UNUSED: | ||
148 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
149 | switch (evt->mode) { | ||
150 | case CLOCK_EVT_MODE_ONESHOT: | ||
151 | vmi_timer_ops.cancel_alarm(VMI_ONESHOT); | ||
152 | break; | ||
153 | case CLOCK_EVT_MODE_PERIODIC: | ||
154 | vmi_timer_ops.cancel_alarm(VMI_PERIODIC); | ||
155 | break; | ||
156 | default: | ||
157 | break; | ||
158 | } | ||
159 | break; | ||
160 | default: | ||
161 | break; | ||
162 | } | ||
163 | } | ||
164 | |||
165 | static int vmi_timer_next_event(unsigned long delta, | ||
166 | struct clock_event_device *evt) | ||
167 | { | ||
168 | /* Unfortunately, set_next_event interface only passes relative | ||
169 | * expiry, but we want absolute expiry. It'd be better if were | ||
170 | * were passed an absolute expiry, since a bunch of time may | ||
171 | * have been stolen between the time the delta is computed and | ||
172 | * when we set the alarm below. */ | ||
173 | cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); | ||
174 | |||
175 | BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); | ||
176 | vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); | ||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | static struct clock_event_device vmi_clockevent = { | ||
181 | .name = "vmi-timer", | ||
182 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, | ||
183 | .shift = 22, | ||
184 | .set_mode = vmi_timer_set_mode, | ||
185 | .set_next_event = vmi_timer_next_event, | ||
186 | .rating = 1000, | ||
187 | .irq = 0, | ||
188 | }; | ||
189 | |||
190 | static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) | ||
191 | { | ||
192 | struct clock_event_device *evt = &__get_cpu_var(local_events); | ||
193 | evt->event_handler(evt); | ||
194 | return IRQ_HANDLED; | ||
195 | } | ||
196 | |||
197 | static struct irqaction vmi_clock_action = { | ||
198 | .name = "vmi-timer", | ||
199 | .handler = vmi_timer_interrupt, | ||
200 | .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, | ||
201 | }; | ||
202 | |||
203 | static void __devinit vmi_time_init_clockevent(void) | ||
204 | { | ||
205 | cycle_t cycles_per_msec; | ||
206 | struct clock_event_device *evt; | ||
207 | |||
208 | int cpu = smp_processor_id(); | ||
209 | evt = &__get_cpu_var(local_events); | ||
210 | |||
211 | /* Use cycles_per_msec since div_sc params are 32-bits. */ | ||
212 | cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); | ||
213 | (void)do_div(cycles_per_msec, 1000); | ||
214 | |||
215 | memcpy(evt, &vmi_clockevent, sizeof(*evt)); | ||
216 | /* Must pick .shift such that .mult fits in 32-bits. Choosing | ||
217 | * .shift to be 22 allows 2^(32-22) cycles per nano-seconds | ||
218 | * before overflow. */ | ||
219 | evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); | ||
220 | /* Upper bound is clockevent's use of ulong for cycle deltas. */ | ||
221 | evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); | ||
222 | evt->min_delta_ns = clockevent_delta2ns(1, evt); | ||
223 | evt->cpumask = cpumask_of(cpu); | ||
224 | |||
225 | printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", | ||
226 | evt->name, evt->mult, evt->shift); | ||
227 | clockevents_register_device(evt); | ||
228 | } | ||
229 | |||
230 | void __init vmi_time_init(void) | ||
231 | { | ||
232 | unsigned int cpu; | ||
233 | /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ | ||
234 | outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ | ||
235 | |||
236 | vmi_time_init_clockevent(); | ||
237 | setup_irq(0, &vmi_clock_action); | ||
238 | for_each_possible_cpu(cpu) | ||
239 | per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; | ||
240 | } | ||
241 | |||
242 | #ifdef CONFIG_X86_LOCAL_APIC | ||
243 | void __devinit vmi_time_bsp_init(void) | ||
244 | { | ||
245 | /* | ||
246 | * On APIC systems, we want local timers to fire on each cpu. We do | ||
247 | * this by programming LVTT to deliver timer events to the IRQ handler | ||
248 | * for IRQ-0, since we can't re-use the APIC local timer handler | ||
249 | * without interfering with that code. | ||
250 | */ | ||
251 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | ||
252 | local_irq_disable(); | ||
253 | #ifdef CONFIG_SMP | ||
254 | /* | ||
255 | * XXX handle_percpu_irq only defined for SMP; we need to switch over | ||
256 | * to using it, since this is a local interrupt, which each CPU must | ||
257 | * handle individually without locking out or dropping simultaneous | ||
258 | * local timers on other CPUs. We also don't want to trigger the | ||
259 | * quirk workaround code for interrupts which gets invoked from | ||
260 | * handle_percpu_irq via eoi, so we use our own IRQ chip. | ||
261 | */ | ||
262 | set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); | ||
263 | #else | ||
264 | set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); | ||
265 | #endif | ||
266 | vmi_wiring = VMI_ALARM_WIRED_LVTT; | ||
267 | apic_write(APIC_LVTT, vmi_get_timer_vector()); | ||
268 | local_irq_enable(); | ||
269 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | ||
270 | } | ||
271 | |||
272 | void __devinit vmi_time_ap_init(void) | ||
273 | { | ||
274 | vmi_time_init_clockevent(); | ||
275 | apic_write(APIC_LVTT, vmi_get_timer_vector()); | ||
276 | } | ||
277 | #endif | ||
278 | |||
279 | /** vmi clocksource */ | ||
280 | static struct clocksource clocksource_vmi; | ||
281 | |||
282 | static cycle_t read_real_cycles(struct clocksource *cs) | ||
283 | { | ||
284 | cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); | ||
285 | return max(ret, clocksource_vmi.cycle_last); | ||
286 | } | ||
287 | |||
288 | static struct clocksource clocksource_vmi = { | ||
289 | .name = "vmi-timer", | ||
290 | .rating = 450, | ||
291 | .read = read_real_cycles, | ||
292 | .mask = CLOCKSOURCE_MASK(64), | ||
293 | .mult = 0, /* to be set */ | ||
294 | .shift = 22, | ||
295 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
296 | }; | ||
297 | |||
298 | static int __init init_vmi_clocksource(void) | ||
299 | { | ||
300 | cycle_t cycles_per_msec; | ||
301 | |||
302 | if (!vmi_timer_ops.get_cycle_frequency) | ||
303 | return 0; | ||
304 | /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ | ||
305 | cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); | ||
306 | (void)do_div(cycles_per_msec, 1000); | ||
307 | |||
308 | /* Note that clocksource.{mult, shift} converts in the opposite direction | ||
309 | * as clockevents. */ | ||
310 | clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, | ||
311 | clocksource_vmi.shift); | ||
312 | |||
313 | printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); | ||
314 | return clocksource_register(&clocksource_vmi); | ||
315 | |||
316 | } | ||
317 | module_init(init_vmi_clocksource); | ||
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d0bb52296fa3..e03530aebfd0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -242,6 +242,12 @@ SECTIONS | |||
242 | __x86_cpu_dev_end = .; | 242 | __x86_cpu_dev_end = .; |
243 | } | 243 | } |
244 | 244 | ||
245 | /* | ||
246 | * start address and size of operations which during runtime | ||
247 | * can be patched with virtualization friendly instructions or | ||
248 | * baremetal native ones. Think page table operations. | ||
249 | * Details in paravirt_types.h | ||
250 | */ | ||
245 | . = ALIGN(8); | 251 | . = ALIGN(8); |
246 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { | 252 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { |
247 | __parainstructions = .; | 253 | __parainstructions = .; |
@@ -249,6 +255,11 @@ SECTIONS | |||
249 | __parainstructions_end = .; | 255 | __parainstructions_end = .; |
250 | } | 256 | } |
251 | 257 | ||
258 | /* | ||
259 | * struct alt_inst entries. From the header (alternative.h): | ||
260 | * "Alternative instructions for different CPU types or capabilities" | ||
261 | * Think locking instructions on spinlocks. | ||
262 | */ | ||
252 | . = ALIGN(8); | 263 | . = ALIGN(8); |
253 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { | 264 | .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { |
254 | __alt_instructions = .; | 265 | __alt_instructions = .; |
@@ -256,11 +267,28 @@ SECTIONS | |||
256 | __alt_instructions_end = .; | 267 | __alt_instructions_end = .; |
257 | } | 268 | } |
258 | 269 | ||
270 | /* | ||
271 | * And here are the replacement instructions. The linker sticks | ||
272 | * them as binary blobs. The .altinstructions has enough data to | ||
273 | * get the address and the length of them to patch the kernel safely. | ||
274 | */ | ||
259 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { | 275 | .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { |
260 | *(.altinstr_replacement) | 276 | *(.altinstr_replacement) |
261 | } | 277 | } |
262 | 278 | ||
263 | /* | 279 | /* |
280 | * struct iommu_table_entry entries are injected in this section. | ||
281 | * It is an array of IOMMUs which during run time gets sorted depending | ||
282 | * on its dependency order. After rootfs_initcall is complete | ||
283 | * this section can be safely removed. | ||
284 | */ | ||
285 | .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { | ||
286 | __iommu_table = .; | ||
287 | *(.iommu_table) | ||
288 | __iommu_table_end = .; | ||
289 | } | ||
290 | . = ALIGN(8); | ||
291 | /* | ||
264 | * .exit.text is discarded at runtime, not link time, to deal with | 292 | * .exit.text is discarded at runtime, not link time, to deal with |
265 | * references from .altinstructions and .eh_frame | 293 | * references from .altinstructions and .eh_frame |
266 | */ | 294 | */ |
@@ -273,7 +301,7 @@ SECTIONS | |||
273 | } | 301 | } |
274 | 302 | ||
275 | #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) | 303 | #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) |
276 | PERCPU(PAGE_SIZE) | 304 | PERCPU(THREAD_SIZE) |
277 | #endif | 305 | #endif |
278 | 306 | ||
279 | . = ALIGN(PAGE_SIZE); | 307 | . = ALIGN(PAGE_SIZE); |
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 970bbd479516..ddc131ff438f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -64,6 +64,13 @@ config KVM_AMD | |||
64 | To compile this as a module, choose M here: the module | 64 | To compile this as a module, choose M here: the module |
65 | will be called kvm-amd. | 65 | will be called kvm-amd. |
66 | 66 | ||
67 | config KVM_MMU_AUDIT | ||
68 | bool "Audit KVM MMU" | ||
69 | depends on KVM && TRACEPOINTS | ||
70 | ---help--- | ||
71 | This option adds a R/W KVM module parameter 'mmu_audit', which allows | ||
72 | auditing the KVM MMU at runtime. | ||
73 | |||
67 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | 74 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under |
68 | # the virtualization menu. | 75 | # the virtualization menu. |
69 | source drivers/vhost/Kconfig | 76 | source drivers/vhost/Kconfig |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 66ca98aafdd6..38b6e8dafaff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * privileged instructions: | 9 | * privileged instructions: |
10 | * | 10 | * |
11 | * Copyright (C) 2006 Qumranet | 11 | * Copyright (C) 2006 Qumranet |
12 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 12 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
13 | * | 13 | * |
14 | * Avi Kivity <avi@qumranet.com> | 14 | * Avi Kivity <avi@qumranet.com> |
15 | * Yaniv Kamay <yaniv@qumranet.com> | 15 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -51,13 +51,13 @@ | |||
51 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | 51 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ |
52 | #define DstReg (2<<1) /* Register operand. */ | 52 | #define DstReg (2<<1) /* Register operand. */ |
53 | #define DstMem (3<<1) /* Memory operand. */ | 53 | #define DstMem (3<<1) /* Memory operand. */ |
54 | #define DstAcc (4<<1) /* Destination Accumulator */ | 54 | #define DstAcc (4<<1) /* Destination Accumulator */ |
55 | #define DstDI (5<<1) /* Destination is in ES:(E)DI */ | 55 | #define DstDI (5<<1) /* Destination is in ES:(E)DI */ |
56 | #define DstMem64 (6<<1) /* 64bit memory operand */ | 56 | #define DstMem64 (6<<1) /* 64bit memory operand */ |
57 | #define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ | ||
57 | #define DstMask (7<<1) | 58 | #define DstMask (7<<1) |
58 | /* Source operand type. */ | 59 | /* Source operand type. */ |
59 | #define SrcNone (0<<4) /* No source operand. */ | 60 | #define SrcNone (0<<4) /* No source operand. */ |
60 | #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ | ||
61 | #define SrcReg (1<<4) /* Register operand. */ | 61 | #define SrcReg (1<<4) /* Register operand. */ |
62 | #define SrcMem (2<<4) /* Memory operand. */ | 62 | #define SrcMem (2<<4) /* Memory operand. */ |
63 | #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ | 63 | #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ |
@@ -71,6 +71,7 @@ | |||
71 | #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ | 71 | #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ |
72 | #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ | 72 | #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ |
73 | #define SrcAcc (0xd<<4) /* Source Accumulator */ | 73 | #define SrcAcc (0xd<<4) /* Source Accumulator */ |
74 | #define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ | ||
74 | #define SrcMask (0xf<<4) | 75 | #define SrcMask (0xf<<4) |
75 | /* Generic ModRM decode. */ | 76 | /* Generic ModRM decode. */ |
76 | #define ModRM (1<<8) | 77 | #define ModRM (1<<8) |
@@ -82,8 +83,10 @@ | |||
82 | #define Stack (1<<13) /* Stack instruction (push/pop) */ | 83 | #define Stack (1<<13) /* Stack instruction (push/pop) */ |
83 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | 84 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ |
84 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ | 85 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ |
85 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ | ||
86 | /* Misc flags */ | 86 | /* Misc flags */ |
87 | #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ | ||
88 | #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ | ||
89 | #define Undefined (1<<25) /* No Such Instruction */ | ||
87 | #define Lock (1<<26) /* lock prefix is allowed for the instruction */ | 90 | #define Lock (1<<26) /* lock prefix is allowed for the instruction */ |
88 | #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ | 91 | #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ |
89 | #define No64 (1<<28) | 92 | #define No64 (1<<28) |
@@ -92,285 +95,30 @@ | |||
92 | #define Src2CL (1<<29) | 95 | #define Src2CL (1<<29) |
93 | #define Src2ImmByte (2<<29) | 96 | #define Src2ImmByte (2<<29) |
94 | #define Src2One (3<<29) | 97 | #define Src2One (3<<29) |
98 | #define Src2Imm (4<<29) | ||
95 | #define Src2Mask (7<<29) | 99 | #define Src2Mask (7<<29) |
96 | 100 | ||
97 | enum { | 101 | #define X2(x...) x, x |
98 | Group1_80, Group1_81, Group1_82, Group1_83, | 102 | #define X3(x...) X2(x), x |
99 | Group1A, Group3_Byte, Group3, Group4, Group5, Group7, | 103 | #define X4(x...) X2(x), X2(x) |
100 | Group8, Group9, | 104 | #define X5(x...) X4(x), x |
105 | #define X6(x...) X4(x), X2(x) | ||
106 | #define X7(x...) X4(x), X3(x) | ||
107 | #define X8(x...) X4(x), X4(x) | ||
108 | #define X16(x...) X8(x), X8(x) | ||
109 | |||
110 | struct opcode { | ||
111 | u32 flags; | ||
112 | union { | ||
113 | int (*execute)(struct x86_emulate_ctxt *ctxt); | ||
114 | struct opcode *group; | ||
115 | struct group_dual *gdual; | ||
116 | } u; | ||
101 | }; | 117 | }; |
102 | 118 | ||
103 | static u32 opcode_table[256] = { | 119 | struct group_dual { |
104 | /* 0x00 - 0x07 */ | 120 | struct opcode mod012[8]; |
105 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | 121 | struct opcode mod3[8]; |
106 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
107 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
108 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
109 | /* 0x08 - 0x0F */ | ||
110 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
111 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
112 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
113 | ImplicitOps | Stack | No64, 0, | ||
114 | /* 0x10 - 0x17 */ | ||
115 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
116 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
117 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
118 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
119 | /* 0x18 - 0x1F */ | ||
120 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
121 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
122 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
123 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
124 | /* 0x20 - 0x27 */ | ||
125 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
126 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
127 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | ||
128 | /* 0x28 - 0x2F */ | ||
129 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
130 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
131 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | ||
132 | /* 0x30 - 0x37 */ | ||
133 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
134 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
135 | ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, | ||
136 | /* 0x38 - 0x3F */ | ||
137 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
138 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
139 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, | ||
140 | 0, 0, | ||
141 | /* 0x40 - 0x47 */ | ||
142 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
143 | /* 0x48 - 0x4F */ | ||
144 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
145 | /* 0x50 - 0x57 */ | ||
146 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
147 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
148 | /* 0x58 - 0x5F */ | ||
149 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
150 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
151 | /* 0x60 - 0x67 */ | ||
152 | ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, | ||
153 | 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
154 | 0, 0, 0, 0, | ||
155 | /* 0x68 - 0x6F */ | ||
156 | SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, | ||
157 | DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ | ||
158 | SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ | ||
159 | /* 0x70 - 0x77 */ | ||
160 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
161 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
162 | /* 0x78 - 0x7F */ | ||
163 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
164 | SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, | ||
165 | /* 0x80 - 0x87 */ | ||
166 | Group | Group1_80, Group | Group1_81, | ||
167 | Group | Group1_82, Group | Group1_83, | ||
168 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
169 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
170 | /* 0x88 - 0x8F */ | ||
171 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
172 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
173 | DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, | ||
174 | ImplicitOps | SrcMem16 | ModRM, Group | Group1A, | ||
175 | /* 0x90 - 0x97 */ | ||
176 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
177 | /* 0x98 - 0x9F */ | ||
178 | 0, 0, SrcImmFAddr | No64, 0, | ||
179 | ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
180 | /* 0xA0 - 0xA7 */ | ||
181 | ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, | ||
182 | ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, | ||
183 | ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, | ||
184 | ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, | ||
185 | /* 0xA8 - 0xAF */ | ||
186 | DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, | ||
187 | ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, | ||
188 | ByteOp | DstDI | String, DstDI | String, | ||
189 | /* 0xB0 - 0xB7 */ | ||
190 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
191 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
192 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
193 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
194 | /* 0xB8 - 0xBF */ | ||
195 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
196 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
197 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
198 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
199 | /* 0xC0 - 0xC7 */ | ||
200 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
201 | 0, ImplicitOps | Stack, 0, 0, | ||
202 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
203 | /* 0xC8 - 0xCF */ | ||
204 | 0, 0, 0, ImplicitOps | Stack, | ||
205 | ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, | ||
206 | /* 0xD0 - 0xD7 */ | ||
207 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
208 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
209 | 0, 0, 0, 0, | ||
210 | /* 0xD8 - 0xDF */ | ||
211 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
212 | /* 0xE0 - 0xE7 */ | ||
213 | 0, 0, 0, 0, | ||
214 | ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, | ||
215 | ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, | ||
216 | /* 0xE8 - 0xEF */ | ||
217 | SrcImm | Stack, SrcImm | ImplicitOps, | ||
218 | SrcImmFAddr | No64, SrcImmByte | ImplicitOps, | ||
219 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, | ||
220 | SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, | ||
221 | /* 0xF0 - 0xF7 */ | ||
222 | 0, 0, 0, 0, | ||
223 | ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, | ||
224 | /* 0xF8 - 0xFF */ | ||
225 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | ||
226 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, | ||
227 | }; | ||
228 | |||
229 | static u32 twobyte_table[256] = { | ||
230 | /* 0x00 - 0x0F */ | ||
231 | 0, Group | GroupDual | Group7, 0, 0, | ||
232 | 0, ImplicitOps, ImplicitOps | Priv, 0, | ||
233 | ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, | ||
234 | 0, ImplicitOps | ModRM, 0, 0, | ||
235 | /* 0x10 - 0x1F */ | ||
236 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
237 | /* 0x20 - 0x2F */ | ||
238 | ModRM | ImplicitOps | Priv, ModRM | Priv, | ||
239 | ModRM | ImplicitOps | Priv, ModRM | Priv, | ||
240 | 0, 0, 0, 0, | ||
241 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
242 | /* 0x30 - 0x3F */ | ||
243 | ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, | ||
244 | ImplicitOps, ImplicitOps | Priv, 0, 0, | ||
245 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
246 | /* 0x40 - 0x47 */ | ||
247 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
248 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
249 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
250 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
251 | /* 0x48 - 0x4F */ | ||
252 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
253 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
254 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
255 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
256 | /* 0x50 - 0x5F */ | ||
257 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
258 | /* 0x60 - 0x6F */ | ||
259 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
260 | /* 0x70 - 0x7F */ | ||
261 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
262 | /* 0x80 - 0x8F */ | ||
263 | SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, | ||
264 | SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, | ||
265 | /* 0x90 - 0x9F */ | ||
266 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
267 | /* 0xA0 - 0xA7 */ | ||
268 | ImplicitOps | Stack, ImplicitOps | Stack, | ||
269 | 0, DstMem | SrcReg | ModRM | BitOp, | ||
270 | DstMem | SrcReg | Src2ImmByte | ModRM, | ||
271 | DstMem | SrcReg | Src2CL | ModRM, 0, 0, | ||
272 | /* 0xA8 - 0xAF */ | ||
273 | ImplicitOps | Stack, ImplicitOps | Stack, | ||
274 | 0, DstMem | SrcReg | ModRM | BitOp | Lock, | ||
275 | DstMem | SrcReg | Src2ImmByte | ModRM, | ||
276 | DstMem | SrcReg | Src2CL | ModRM, | ||
277 | ModRM, 0, | ||
278 | /* 0xB0 - 0xB7 */ | ||
279 | ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, | ||
280 | 0, DstMem | SrcReg | ModRM | BitOp | Lock, | ||
281 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
282 | DstReg | SrcMem16 | ModRM | Mov, | ||
283 | /* 0xB8 - 0xBF */ | ||
284 | 0, 0, | ||
285 | Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, | ||
286 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
287 | DstReg | SrcMem16 | ModRM | Mov, | ||
288 | /* 0xC0 - 0xCF */ | ||
289 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, | ||
290 | 0, 0, 0, Group | GroupDual | Group9, | ||
291 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
292 | /* 0xD0 - 0xDF */ | ||
293 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
294 | /* 0xE0 - 0xEF */ | ||
295 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
296 | /* 0xF0 - 0xFF */ | ||
297 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
298 | }; | ||
299 | |||
300 | static u32 group_table[] = { | ||
301 | [Group1_80*8] = | ||
302 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
303 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
304 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
305 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
306 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
307 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
308 | ByteOp | DstMem | SrcImm | ModRM | Lock, | ||
309 | ByteOp | DstMem | SrcImm | ModRM, | ||
310 | [Group1_81*8] = | ||
311 | DstMem | SrcImm | ModRM | Lock, | ||
312 | DstMem | SrcImm | ModRM | Lock, | ||
313 | DstMem | SrcImm | ModRM | Lock, | ||
314 | DstMem | SrcImm | ModRM | Lock, | ||
315 | DstMem | SrcImm | ModRM | Lock, | ||
316 | DstMem | SrcImm | ModRM | Lock, | ||
317 | DstMem | SrcImm | ModRM | Lock, | ||
318 | DstMem | SrcImm | ModRM, | ||
319 | [Group1_82*8] = | ||
320 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
321 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
322 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
323 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
324 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
325 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
326 | ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, | ||
327 | ByteOp | DstMem | SrcImm | ModRM | No64, | ||
328 | [Group1_83*8] = | ||
329 | DstMem | SrcImmByte | ModRM | Lock, | ||
330 | DstMem | SrcImmByte | ModRM | Lock, | ||
331 | DstMem | SrcImmByte | ModRM | Lock, | ||
332 | DstMem | SrcImmByte | ModRM | Lock, | ||
333 | DstMem | SrcImmByte | ModRM | Lock, | ||
334 | DstMem | SrcImmByte | ModRM | Lock, | ||
335 | DstMem | SrcImmByte | ModRM | Lock, | ||
336 | DstMem | SrcImmByte | ModRM, | ||
337 | [Group1A*8] = | ||
338 | DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, | ||
339 | [Group3_Byte*8] = | ||
340 | ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, | ||
341 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | ||
342 | 0, 0, 0, 0, | ||
343 | [Group3*8] = | ||
344 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
345 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
346 | 0, 0, 0, 0, | ||
347 | [Group4*8] = | ||
348 | ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, | ||
349 | 0, 0, 0, 0, 0, 0, | ||
350 | [Group5*8] = | ||
351 | DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, | ||
352 | SrcMem | ModRM | Stack, 0, | ||
353 | SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, | ||
354 | SrcMem | ModRM | Stack, 0, | ||
355 | [Group7*8] = | ||
356 | 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, | ||
357 | SrcNone | ModRM | DstMem | Mov, 0, | ||
358 | SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, | ||
359 | [Group8*8] = | ||
360 | 0, 0, 0, 0, | ||
361 | DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, | ||
362 | DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, | ||
363 | [Group9*8] = | ||
364 | 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, | ||
365 | }; | ||
366 | |||
367 | static u32 group2_table[] = { | ||
368 | [Group7*8] = | ||
369 | SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, | ||
370 | SrcNone | ModRM | DstMem | Mov, 0, | ||
371 | SrcMem16 | ModRM | Mov | Priv, 0, | ||
372 | [Group9*8] = | ||
373 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
374 | }; | 122 | }; |
375 | 123 | ||
376 | /* EFLAGS bit definitions. */ | 124 | /* EFLAGS bit definitions. */ |
@@ -392,6 +140,9 @@ static u32 group2_table[] = { | |||
392 | #define EFLG_PF (1<<2) | 140 | #define EFLG_PF (1<<2) |
393 | #define EFLG_CF (1<<0) | 141 | #define EFLG_CF (1<<0) |
394 | 142 | ||
143 | #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a | ||
144 | #define EFLG_RESERVED_ONE_MASK 2 | ||
145 | |||
395 | /* | 146 | /* |
396 | * Instruction emulation: | 147 | * Instruction emulation: |
397 | * Most instructions are emulated directly via a fragment of inline assembly | 148 | * Most instructions are emulated directly via a fragment of inline assembly |
@@ -444,13 +195,13 @@ static u32 group2_table[] = { | |||
444 | #define ON64(x) | 195 | #define ON64(x) |
445 | #endif | 196 | #endif |
446 | 197 | ||
447 | #define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ | 198 | #define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ |
448 | do { \ | 199 | do { \ |
449 | __asm__ __volatile__ ( \ | 200 | __asm__ __volatile__ ( \ |
450 | _PRE_EFLAGS("0", "4", "2") \ | 201 | _PRE_EFLAGS("0", "4", "2") \ |
451 | _op _suffix " %"_x"3,%1; " \ | 202 | _op _suffix " %"_x"3,%1; " \ |
452 | _POST_EFLAGS("0", "4", "2") \ | 203 | _POST_EFLAGS("0", "4", "2") \ |
453 | : "=m" (_eflags), "=m" ((_dst).val), \ | 204 | : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ |
454 | "=&r" (_tmp) \ | 205 | "=&r" (_tmp) \ |
455 | : _y ((_src).val), "i" (EFLAGS_MASK)); \ | 206 | : _y ((_src).val), "i" (EFLAGS_MASK)); \ |
456 | } while (0) | 207 | } while (0) |
@@ -463,13 +214,13 @@ static u32 group2_table[] = { | |||
463 | \ | 214 | \ |
464 | switch ((_dst).bytes) { \ | 215 | switch ((_dst).bytes) { \ |
465 | case 2: \ | 216 | case 2: \ |
466 | ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ | 217 | ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ |
467 | break; \ | 218 | break; \ |
468 | case 4: \ | 219 | case 4: \ |
469 | ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ | 220 | ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ |
470 | break; \ | 221 | break; \ |
471 | case 8: \ | 222 | case 8: \ |
472 | ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ | 223 | ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ |
473 | break; \ | 224 | break; \ |
474 | } \ | 225 | } \ |
475 | } while (0) | 226 | } while (0) |
@@ -479,7 +230,7 @@ static u32 group2_table[] = { | |||
479 | unsigned long _tmp; \ | 230 | unsigned long _tmp; \ |
480 | switch ((_dst).bytes) { \ | 231 | switch ((_dst).bytes) { \ |
481 | case 1: \ | 232 | case 1: \ |
482 | ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ | 233 | ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ |
483 | break; \ | 234 | break; \ |
484 | default: \ | 235 | default: \ |
485 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | 236 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ |
@@ -566,6 +317,74 @@ static u32 group2_table[] = { | |||
566 | } \ | 317 | } \ |
567 | } while (0) | 318 | } while (0) |
568 | 319 | ||
320 | #define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ | ||
321 | do { \ | ||
322 | unsigned long _tmp; \ | ||
323 | \ | ||
324 | __asm__ __volatile__ ( \ | ||
325 | _PRE_EFLAGS("0", "4", "1") \ | ||
326 | _op _suffix " %5; " \ | ||
327 | _POST_EFLAGS("0", "4", "1") \ | ||
328 | : "=m" (_eflags), "=&r" (_tmp), \ | ||
329 | "+a" (_rax), "+d" (_rdx) \ | ||
330 | : "i" (EFLAGS_MASK), "m" ((_src).val), \ | ||
331 | "a" (_rax), "d" (_rdx)); \ | ||
332 | } while (0) | ||
333 | |||
334 | #define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ | ||
335 | do { \ | ||
336 | unsigned long _tmp; \ | ||
337 | \ | ||
338 | __asm__ __volatile__ ( \ | ||
339 | _PRE_EFLAGS("0", "5", "1") \ | ||
340 | "1: \n\t" \ | ||
341 | _op _suffix " %6; " \ | ||
342 | "2: \n\t" \ | ||
343 | _POST_EFLAGS("0", "5", "1") \ | ||
344 | ".pushsection .fixup,\"ax\" \n\t" \ | ||
345 | "3: movb $1, %4 \n\t" \ | ||
346 | "jmp 2b \n\t" \ | ||
347 | ".popsection \n\t" \ | ||
348 | _ASM_EXTABLE(1b, 3b) \ | ||
349 | : "=m" (_eflags), "=&r" (_tmp), \ | ||
350 | "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ | ||
351 | : "i" (EFLAGS_MASK), "m" ((_src).val), \ | ||
352 | "a" (_rax), "d" (_rdx)); \ | ||
353 | } while (0) | ||
354 | |||
355 | /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ | ||
356 | #define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ | ||
357 | do { \ | ||
358 | switch((_src).bytes) { \ | ||
359 | case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ | ||
360 | case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ | ||
361 | case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ | ||
362 | case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ | ||
363 | } \ | ||
364 | } while (0) | ||
365 | |||
366 | #define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ | ||
367 | do { \ | ||
368 | switch((_src).bytes) { \ | ||
369 | case 1: \ | ||
370 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
371 | _eflags, "b", _ex); \ | ||
372 | break; \ | ||
373 | case 2: \ | ||
374 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
375 | _eflags, "w", _ex); \ | ||
376 | break; \ | ||
377 | case 4: \ | ||
378 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
379 | _eflags, "l", _ex); \ | ||
380 | break; \ | ||
381 | case 8: ON64( \ | ||
382 | __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ | ||
383 | _eflags, "q", _ex)); \ | ||
384 | break; \ | ||
385 | } \ | ||
386 | } while (0) | ||
387 | |||
569 | /* Fetch next part of the instruction being emulated. */ | 388 | /* Fetch next part of the instruction being emulated. */ |
570 | #define insn_fetch(_type, _size, _eip) \ | 389 | #define insn_fetch(_type, _size, _eip) \ |
571 | ({ unsigned long _x; \ | 390 | ({ unsigned long _x; \ |
@@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, | |||
661 | ctxt->exception = vec; | 480 | ctxt->exception = vec; |
662 | ctxt->error_code = error; | 481 | ctxt->error_code = error; |
663 | ctxt->error_code_valid = valid; | 482 | ctxt->error_code_valid = valid; |
664 | ctxt->restart = false; | ||
665 | } | 483 | } |
666 | 484 | ||
667 | static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | 485 | static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) |
@@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | |||
669 | emulate_exception(ctxt, GP_VECTOR, err, true); | 487 | emulate_exception(ctxt, GP_VECTOR, err, true); |
670 | } | 488 | } |
671 | 489 | ||
672 | static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, | 490 | static void emulate_pf(struct x86_emulate_ctxt *ctxt) |
673 | int err) | ||
674 | { | 491 | { |
675 | ctxt->cr2 = addr; | 492 | emulate_exception(ctxt, PF_VECTOR, 0, true); |
676 | emulate_exception(ctxt, PF_VECTOR, err, true); | ||
677 | } | 493 | } |
678 | 494 | ||
679 | static void emulate_ud(struct x86_emulate_ctxt *ctxt) | 495 | static void emulate_ud(struct x86_emulate_ctxt *ctxt) |
@@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) | |||
686 | emulate_exception(ctxt, TS_VECTOR, err, true); | 502 | emulate_exception(ctxt, TS_VECTOR, err, true); |
687 | } | 503 | } |
688 | 504 | ||
505 | static int emulate_de(struct x86_emulate_ctxt *ctxt) | ||
506 | { | ||
507 | emulate_exception(ctxt, DE_VECTOR, 0, false); | ||
508 | return X86EMUL_PROPAGATE_FAULT; | ||
509 | } | ||
510 | |||
689 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 511 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, |
690 | struct x86_emulate_ops *ops, | 512 | struct x86_emulate_ops *ops, |
691 | unsigned long eip, u8 *dest) | 513 | unsigned long eip, u8 *dest) |
@@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, | |||
742 | 564 | ||
743 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | 565 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, |
744 | struct x86_emulate_ops *ops, | 566 | struct x86_emulate_ops *ops, |
745 | void *ptr, | 567 | ulong addr, |
746 | u16 *size, unsigned long *address, int op_bytes) | 568 | u16 *size, unsigned long *address, int op_bytes) |
747 | { | 569 | { |
748 | int rc; | 570 | int rc; |
@@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, | |||
750 | if (op_bytes == 2) | 572 | if (op_bytes == 2) |
751 | op_bytes = 3; | 573 | op_bytes = 3; |
752 | *address = 0; | 574 | *address = 0; |
753 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | 575 | rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); |
754 | ctxt->vcpu, NULL); | ||
755 | if (rc != X86EMUL_CONTINUE) | 576 | if (rc != X86EMUL_CONTINUE) |
756 | return rc; | 577 | return rc; |
757 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | 578 | rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); |
758 | ctxt->vcpu, NULL); | ||
759 | return rc; | 579 | return rc; |
760 | } | 580 | } |
761 | 581 | ||
@@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags) | |||
794 | return (!!rc ^ (condition & 1)); | 614 | return (!!rc ^ (condition & 1)); |
795 | } | 615 | } |
796 | 616 | ||
617 | static void fetch_register_operand(struct operand *op) | ||
618 | { | ||
619 | switch (op->bytes) { | ||
620 | case 1: | ||
621 | op->val = *(u8 *)op->addr.reg; | ||
622 | break; | ||
623 | case 2: | ||
624 | op->val = *(u16 *)op->addr.reg; | ||
625 | break; | ||
626 | case 4: | ||
627 | op->val = *(u32 *)op->addr.reg; | ||
628 | break; | ||
629 | case 8: | ||
630 | op->val = *(u64 *)op->addr.reg; | ||
631 | break; | ||
632 | } | ||
633 | } | ||
634 | |||
797 | static void decode_register_operand(struct operand *op, | 635 | static void decode_register_operand(struct operand *op, |
798 | struct decode_cache *c, | 636 | struct decode_cache *c, |
799 | int inhibit_bytereg) | 637 | int inhibit_bytereg) |
@@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op, | |||
805 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | 643 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); |
806 | op->type = OP_REG; | 644 | op->type = OP_REG; |
807 | if ((c->d & ByteOp) && !inhibit_bytereg) { | 645 | if ((c->d & ByteOp) && !inhibit_bytereg) { |
808 | op->ptr = decode_register(reg, c->regs, highbyte_regs); | 646 | op->addr.reg = decode_register(reg, c->regs, highbyte_regs); |
809 | op->val = *(u8 *)op->ptr; | ||
810 | op->bytes = 1; | 647 | op->bytes = 1; |
811 | } else { | 648 | } else { |
812 | op->ptr = decode_register(reg, c->regs, 0); | 649 | op->addr.reg = decode_register(reg, c->regs, 0); |
813 | op->bytes = c->op_bytes; | 650 | op->bytes = c->op_bytes; |
814 | switch (op->bytes) { | ||
815 | case 2: | ||
816 | op->val = *(u16 *)op->ptr; | ||
817 | break; | ||
818 | case 4: | ||
819 | op->val = *(u32 *)op->ptr; | ||
820 | break; | ||
821 | case 8: | ||
822 | op->val = *(u64 *) op->ptr; | ||
823 | break; | ||
824 | } | ||
825 | } | 651 | } |
652 | fetch_register_operand(op); | ||
826 | op->orig_val = op->val; | 653 | op->orig_val = op->val; |
827 | } | 654 | } |
828 | 655 | ||
829 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | 656 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, |
830 | struct x86_emulate_ops *ops) | 657 | struct x86_emulate_ops *ops, |
658 | struct operand *op) | ||
831 | { | 659 | { |
832 | struct decode_cache *c = &ctxt->decode; | 660 | struct decode_cache *c = &ctxt->decode; |
833 | u8 sib; | 661 | u8 sib; |
834 | int index_reg = 0, base_reg = 0, scale; | 662 | int index_reg = 0, base_reg = 0, scale; |
835 | int rc = X86EMUL_CONTINUE; | 663 | int rc = X86EMUL_CONTINUE; |
664 | ulong modrm_ea = 0; | ||
836 | 665 | ||
837 | if (c->rex_prefix) { | 666 | if (c->rex_prefix) { |
838 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ | 667 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ |
@@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
844 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; | 673 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; |
845 | c->modrm_reg |= (c->modrm & 0x38) >> 3; | 674 | c->modrm_reg |= (c->modrm & 0x38) >> 3; |
846 | c->modrm_rm |= (c->modrm & 0x07); | 675 | c->modrm_rm |= (c->modrm & 0x07); |
847 | c->modrm_ea = 0; | 676 | c->modrm_seg = VCPU_SREG_DS; |
848 | c->use_modrm_ea = 1; | ||
849 | 677 | ||
850 | if (c->modrm_mod == 3) { | 678 | if (c->modrm_mod == 3) { |
851 | c->modrm_ptr = decode_register(c->modrm_rm, | 679 | op->type = OP_REG; |
680 | op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
681 | op->addr.reg = decode_register(c->modrm_rm, | ||
852 | c->regs, c->d & ByteOp); | 682 | c->regs, c->d & ByteOp); |
853 | c->modrm_val = *(unsigned long *)c->modrm_ptr; | 683 | fetch_register_operand(op); |
854 | return rc; | 684 | return rc; |
855 | } | 685 | } |
856 | 686 | ||
687 | op->type = OP_MEM; | ||
688 | |||
857 | if (c->ad_bytes == 2) { | 689 | if (c->ad_bytes == 2) { |
858 | unsigned bx = c->regs[VCPU_REGS_RBX]; | 690 | unsigned bx = c->regs[VCPU_REGS_RBX]; |
859 | unsigned bp = c->regs[VCPU_REGS_RBP]; | 691 | unsigned bp = c->regs[VCPU_REGS_RBP]; |
@@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
864 | switch (c->modrm_mod) { | 696 | switch (c->modrm_mod) { |
865 | case 0: | 697 | case 0: |
866 | if (c->modrm_rm == 6) | 698 | if (c->modrm_rm == 6) |
867 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | 699 | modrm_ea += insn_fetch(u16, 2, c->eip); |
868 | break; | 700 | break; |
869 | case 1: | 701 | case 1: |
870 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | 702 | modrm_ea += insn_fetch(s8, 1, c->eip); |
871 | break; | 703 | break; |
872 | case 2: | 704 | case 2: |
873 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | 705 | modrm_ea += insn_fetch(u16, 2, c->eip); |
874 | break; | 706 | break; |
875 | } | 707 | } |
876 | switch (c->modrm_rm) { | 708 | switch (c->modrm_rm) { |
877 | case 0: | 709 | case 0: |
878 | c->modrm_ea += bx + si; | 710 | modrm_ea += bx + si; |
879 | break; | 711 | break; |
880 | case 1: | 712 | case 1: |
881 | c->modrm_ea += bx + di; | 713 | modrm_ea += bx + di; |
882 | break; | 714 | break; |
883 | case 2: | 715 | case 2: |
884 | c->modrm_ea += bp + si; | 716 | modrm_ea += bp + si; |
885 | break; | 717 | break; |
886 | case 3: | 718 | case 3: |
887 | c->modrm_ea += bp + di; | 719 | modrm_ea += bp + di; |
888 | break; | 720 | break; |
889 | case 4: | 721 | case 4: |
890 | c->modrm_ea += si; | 722 | modrm_ea += si; |
891 | break; | 723 | break; |
892 | case 5: | 724 | case 5: |
893 | c->modrm_ea += di; | 725 | modrm_ea += di; |
894 | break; | 726 | break; |
895 | case 6: | 727 | case 6: |
896 | if (c->modrm_mod != 0) | 728 | if (c->modrm_mod != 0) |
897 | c->modrm_ea += bp; | 729 | modrm_ea += bp; |
898 | break; | 730 | break; |
899 | case 7: | 731 | case 7: |
900 | c->modrm_ea += bx; | 732 | modrm_ea += bx; |
901 | break; | 733 | break; |
902 | } | 734 | } |
903 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || | 735 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || |
904 | (c->modrm_rm == 6 && c->modrm_mod != 0)) | 736 | (c->modrm_rm == 6 && c->modrm_mod != 0)) |
905 | if (!c->has_seg_override) | 737 | c->modrm_seg = VCPU_SREG_SS; |
906 | set_seg_override(c, VCPU_SREG_SS); | 738 | modrm_ea = (u16)modrm_ea; |
907 | c->modrm_ea = (u16)c->modrm_ea; | ||
908 | } else { | 739 | } else { |
909 | /* 32/64-bit ModR/M decode. */ | 740 | /* 32/64-bit ModR/M decode. */ |
910 | if ((c->modrm_rm & 7) == 4) { | 741 | if ((c->modrm_rm & 7) == 4) { |
@@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
914 | scale = sib >> 6; | 745 | scale = sib >> 6; |
915 | 746 | ||
916 | if ((base_reg & 7) == 5 && c->modrm_mod == 0) | 747 | if ((base_reg & 7) == 5 && c->modrm_mod == 0) |
917 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | 748 | modrm_ea += insn_fetch(s32, 4, c->eip); |
918 | else | 749 | else |
919 | c->modrm_ea += c->regs[base_reg]; | 750 | modrm_ea += c->regs[base_reg]; |
920 | if (index_reg != 4) | 751 | if (index_reg != 4) |
921 | c->modrm_ea += c->regs[index_reg] << scale; | 752 | modrm_ea += c->regs[index_reg] << scale; |
922 | } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { | 753 | } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { |
923 | if (ctxt->mode == X86EMUL_MODE_PROT64) | 754 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
924 | c->rip_relative = 1; | 755 | c->rip_relative = 1; |
925 | } else | 756 | } else |
926 | c->modrm_ea += c->regs[c->modrm_rm]; | 757 | modrm_ea += c->regs[c->modrm_rm]; |
927 | switch (c->modrm_mod) { | 758 | switch (c->modrm_mod) { |
928 | case 0: | 759 | case 0: |
929 | if (c->modrm_rm == 5) | 760 | if (c->modrm_rm == 5) |
930 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | 761 | modrm_ea += insn_fetch(s32, 4, c->eip); |
931 | break; | 762 | break; |
932 | case 1: | 763 | case 1: |
933 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | 764 | modrm_ea += insn_fetch(s8, 1, c->eip); |
934 | break; | 765 | break; |
935 | case 2: | 766 | case 2: |
936 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | 767 | modrm_ea += insn_fetch(s32, 4, c->eip); |
937 | break; | 768 | break; |
938 | } | 769 | } |
939 | } | 770 | } |
771 | op->addr.mem = modrm_ea; | ||
940 | done: | 772 | done: |
941 | return rc; | 773 | return rc; |
942 | } | 774 | } |
943 | 775 | ||
944 | static int decode_abs(struct x86_emulate_ctxt *ctxt, | 776 | static int decode_abs(struct x86_emulate_ctxt *ctxt, |
945 | struct x86_emulate_ops *ops) | 777 | struct x86_emulate_ops *ops, |
778 | struct operand *op) | ||
946 | { | 779 | { |
947 | struct decode_cache *c = &ctxt->decode; | 780 | struct decode_cache *c = &ctxt->decode; |
948 | int rc = X86EMUL_CONTINUE; | 781 | int rc = X86EMUL_CONTINUE; |
949 | 782 | ||
783 | op->type = OP_MEM; | ||
950 | switch (c->ad_bytes) { | 784 | switch (c->ad_bytes) { |
951 | case 2: | 785 | case 2: |
952 | c->modrm_ea = insn_fetch(u16, 2, c->eip); | 786 | op->addr.mem = insn_fetch(u16, 2, c->eip); |
953 | break; | 787 | break; |
954 | case 4: | 788 | case 4: |
955 | c->modrm_ea = insn_fetch(u32, 4, c->eip); | 789 | op->addr.mem = insn_fetch(u32, 4, c->eip); |
956 | break; | 790 | break; |
957 | case 8: | 791 | case 8: |
958 | c->modrm_ea = insn_fetch(u64, 8, c->eip); | 792 | op->addr.mem = insn_fetch(u64, 8, c->eip); |
959 | break; | 793 | break; |
960 | } | 794 | } |
961 | done: | 795 | done: |
962 | return rc; | 796 | return rc; |
963 | } | 797 | } |
964 | 798 | ||
965 | int | 799 | static void fetch_bit_operand(struct decode_cache *c) |
966 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
967 | { | 800 | { |
968 | struct decode_cache *c = &ctxt->decode; | 801 | long sv = 0, mask; |
969 | int rc = X86EMUL_CONTINUE; | ||
970 | int mode = ctxt->mode; | ||
971 | int def_op_bytes, def_ad_bytes, group; | ||
972 | |||
973 | |||
974 | /* we cannot decode insn before we complete previous rep insn */ | ||
975 | WARN_ON(ctxt->restart); | ||
976 | |||
977 | c->eip = ctxt->eip; | ||
978 | c->fetch.start = c->fetch.end = c->eip; | ||
979 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); | ||
980 | |||
981 | switch (mode) { | ||
982 | case X86EMUL_MODE_REAL: | ||
983 | case X86EMUL_MODE_VM86: | ||
984 | case X86EMUL_MODE_PROT16: | ||
985 | def_op_bytes = def_ad_bytes = 2; | ||
986 | break; | ||
987 | case X86EMUL_MODE_PROT32: | ||
988 | def_op_bytes = def_ad_bytes = 4; | ||
989 | break; | ||
990 | #ifdef CONFIG_X86_64 | ||
991 | case X86EMUL_MODE_PROT64: | ||
992 | def_op_bytes = 4; | ||
993 | def_ad_bytes = 8; | ||
994 | break; | ||
995 | #endif | ||
996 | default: | ||
997 | return -1; | ||
998 | } | ||
999 | |||
1000 | c->op_bytes = def_op_bytes; | ||
1001 | c->ad_bytes = def_ad_bytes; | ||
1002 | |||
1003 | /* Legacy prefixes. */ | ||
1004 | for (;;) { | ||
1005 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
1006 | case 0x66: /* operand-size override */ | ||
1007 | /* switch between 2/4 bytes */ | ||
1008 | c->op_bytes = def_op_bytes ^ 6; | ||
1009 | break; | ||
1010 | case 0x67: /* address-size override */ | ||
1011 | if (mode == X86EMUL_MODE_PROT64) | ||
1012 | /* switch between 4/8 bytes */ | ||
1013 | c->ad_bytes = def_ad_bytes ^ 12; | ||
1014 | else | ||
1015 | /* switch between 2/4 bytes */ | ||
1016 | c->ad_bytes = def_ad_bytes ^ 6; | ||
1017 | break; | ||
1018 | case 0x26: /* ES override */ | ||
1019 | case 0x2e: /* CS override */ | ||
1020 | case 0x36: /* SS override */ | ||
1021 | case 0x3e: /* DS override */ | ||
1022 | set_seg_override(c, (c->b >> 3) & 3); | ||
1023 | break; | ||
1024 | case 0x64: /* FS override */ | ||
1025 | case 0x65: /* GS override */ | ||
1026 | set_seg_override(c, c->b & 7); | ||
1027 | break; | ||
1028 | case 0x40 ... 0x4f: /* REX */ | ||
1029 | if (mode != X86EMUL_MODE_PROT64) | ||
1030 | goto done_prefixes; | ||
1031 | c->rex_prefix = c->b; | ||
1032 | continue; | ||
1033 | case 0xf0: /* LOCK */ | ||
1034 | c->lock_prefix = 1; | ||
1035 | break; | ||
1036 | case 0xf2: /* REPNE/REPNZ */ | ||
1037 | c->rep_prefix = REPNE_PREFIX; | ||
1038 | break; | ||
1039 | case 0xf3: /* REP/REPE/REPZ */ | ||
1040 | c->rep_prefix = REPE_PREFIX; | ||
1041 | break; | ||
1042 | default: | ||
1043 | goto done_prefixes; | ||
1044 | } | ||
1045 | |||
1046 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
1047 | |||
1048 | c->rex_prefix = 0; | ||
1049 | } | ||
1050 | |||
1051 | done_prefixes: | ||
1052 | |||
1053 | /* REX prefix. */ | ||
1054 | if (c->rex_prefix) | ||
1055 | if (c->rex_prefix & 8) | ||
1056 | c->op_bytes = 8; /* REX.W */ | ||
1057 | |||
1058 | /* Opcode byte(s). */ | ||
1059 | c->d = opcode_table[c->b]; | ||
1060 | if (c->d == 0) { | ||
1061 | /* Two-byte opcode? */ | ||
1062 | if (c->b == 0x0f) { | ||
1063 | c->twobyte = 1; | ||
1064 | c->b = insn_fetch(u8, 1, c->eip); | ||
1065 | c->d = twobyte_table[c->b]; | ||
1066 | } | ||
1067 | } | ||
1068 | |||
1069 | if (c->d & Group) { | ||
1070 | group = c->d & GroupMask; | ||
1071 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
1072 | --c->eip; | ||
1073 | |||
1074 | group = (group << 3) + ((c->modrm >> 3) & 7); | ||
1075 | if ((c->d & GroupDual) && (c->modrm >> 6) == 3) | ||
1076 | c->d = group2_table[group]; | ||
1077 | else | ||
1078 | c->d = group_table[group]; | ||
1079 | } | ||
1080 | |||
1081 | /* Unrecognised? */ | ||
1082 | if (c->d == 0) { | ||
1083 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1084 | return -1; | ||
1085 | } | ||
1086 | |||
1087 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | ||
1088 | c->op_bytes = 8; | ||
1089 | |||
1090 | /* ModRM and SIB bytes. */ | ||
1091 | if (c->d & ModRM) | ||
1092 | rc = decode_modrm(ctxt, ops); | ||
1093 | else if (c->d & MemAbs) | ||
1094 | rc = decode_abs(ctxt, ops); | ||
1095 | if (rc != X86EMUL_CONTINUE) | ||
1096 | goto done; | ||
1097 | |||
1098 | if (!c->has_seg_override) | ||
1099 | set_seg_override(c, VCPU_SREG_DS); | ||
1100 | |||
1101 | if (!(!c->twobyte && c->b == 0x8d)) | ||
1102 | c->modrm_ea += seg_override_base(ctxt, ops, c); | ||
1103 | |||
1104 | if (c->ad_bytes != 8) | ||
1105 | c->modrm_ea = (u32)c->modrm_ea; | ||
1106 | |||
1107 | if (c->rip_relative) | ||
1108 | c->modrm_ea += c->eip; | ||
1109 | |||
1110 | /* | ||
1111 | * Decode and fetch the source operand: register, memory | ||
1112 | * or immediate. | ||
1113 | */ | ||
1114 | switch (c->d & SrcMask) { | ||
1115 | case SrcNone: | ||
1116 | break; | ||
1117 | case SrcReg: | ||
1118 | decode_register_operand(&c->src, c, 0); | ||
1119 | break; | ||
1120 | case SrcMem16: | ||
1121 | c->src.bytes = 2; | ||
1122 | goto srcmem_common; | ||
1123 | case SrcMem32: | ||
1124 | c->src.bytes = 4; | ||
1125 | goto srcmem_common; | ||
1126 | case SrcMem: | ||
1127 | c->src.bytes = (c->d & ByteOp) ? 1 : | ||
1128 | c->op_bytes; | ||
1129 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
1130 | if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) | ||
1131 | break; | ||
1132 | srcmem_common: | ||
1133 | /* | ||
1134 | * For instructions with a ModR/M byte, switch to register | ||
1135 | * access if Mod = 3. | ||
1136 | */ | ||
1137 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
1138 | c->src.type = OP_REG; | ||
1139 | c->src.val = c->modrm_val; | ||
1140 | c->src.ptr = c->modrm_ptr; | ||
1141 | break; | ||
1142 | } | ||
1143 | c->src.type = OP_MEM; | ||
1144 | c->src.ptr = (unsigned long *)c->modrm_ea; | ||
1145 | c->src.val = 0; | ||
1146 | break; | ||
1147 | case SrcImm: | ||
1148 | case SrcImmU: | ||
1149 | c->src.type = OP_IMM; | ||
1150 | c->src.ptr = (unsigned long *)c->eip; | ||
1151 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1152 | if (c->src.bytes == 8) | ||
1153 | c->src.bytes = 4; | ||
1154 | /* NB. Immediates are sign-extended as necessary. */ | ||
1155 | switch (c->src.bytes) { | ||
1156 | case 1: | ||
1157 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1158 | break; | ||
1159 | case 2: | ||
1160 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
1161 | break; | ||
1162 | case 4: | ||
1163 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
1164 | break; | ||
1165 | } | ||
1166 | if ((c->d & SrcMask) == SrcImmU) { | ||
1167 | switch (c->src.bytes) { | ||
1168 | case 1: | ||
1169 | c->src.val &= 0xff; | ||
1170 | break; | ||
1171 | case 2: | ||
1172 | c->src.val &= 0xffff; | ||
1173 | break; | ||
1174 | case 4: | ||
1175 | c->src.val &= 0xffffffff; | ||
1176 | break; | ||
1177 | } | ||
1178 | } | ||
1179 | break; | ||
1180 | case SrcImmByte: | ||
1181 | case SrcImmUByte: | ||
1182 | c->src.type = OP_IMM; | ||
1183 | c->src.ptr = (unsigned long *)c->eip; | ||
1184 | c->src.bytes = 1; | ||
1185 | if ((c->d & SrcMask) == SrcImmByte) | ||
1186 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1187 | else | ||
1188 | c->src.val = insn_fetch(u8, 1, c->eip); | ||
1189 | break; | ||
1190 | case SrcAcc: | ||
1191 | c->src.type = OP_REG; | ||
1192 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1193 | c->src.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1194 | switch (c->src.bytes) { | ||
1195 | case 1: | ||
1196 | c->src.val = *(u8 *)c->src.ptr; | ||
1197 | break; | ||
1198 | case 2: | ||
1199 | c->src.val = *(u16 *)c->src.ptr; | ||
1200 | break; | ||
1201 | case 4: | ||
1202 | c->src.val = *(u32 *)c->src.ptr; | ||
1203 | break; | ||
1204 | case 8: | ||
1205 | c->src.val = *(u64 *)c->src.ptr; | ||
1206 | break; | ||
1207 | } | ||
1208 | break; | ||
1209 | case SrcOne: | ||
1210 | c->src.bytes = 1; | ||
1211 | c->src.val = 1; | ||
1212 | break; | ||
1213 | case SrcSI: | ||
1214 | c->src.type = OP_MEM; | ||
1215 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1216 | c->src.ptr = (unsigned long *) | ||
1217 | register_address(c, seg_override_base(ctxt, ops, c), | ||
1218 | c->regs[VCPU_REGS_RSI]); | ||
1219 | c->src.val = 0; | ||
1220 | break; | ||
1221 | case SrcImmFAddr: | ||
1222 | c->src.type = OP_IMM; | ||
1223 | c->src.ptr = (unsigned long *)c->eip; | ||
1224 | c->src.bytes = c->op_bytes + 2; | ||
1225 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | ||
1226 | break; | ||
1227 | case SrcMemFAddr: | ||
1228 | c->src.type = OP_MEM; | ||
1229 | c->src.ptr = (unsigned long *)c->modrm_ea; | ||
1230 | c->src.bytes = c->op_bytes + 2; | ||
1231 | break; | ||
1232 | } | ||
1233 | 802 | ||
1234 | /* | 803 | if (c->dst.type == OP_MEM && c->src.type == OP_REG) { |
1235 | * Decode and fetch the second source operand: register, memory | 804 | mask = ~(c->dst.bytes * 8 - 1); |
1236 | * or immediate. | ||
1237 | */ | ||
1238 | switch (c->d & Src2Mask) { | ||
1239 | case Src2None: | ||
1240 | break; | ||
1241 | case Src2CL: | ||
1242 | c->src2.bytes = 1; | ||
1243 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | ||
1244 | break; | ||
1245 | case Src2ImmByte: | ||
1246 | c->src2.type = OP_IMM; | ||
1247 | c->src2.ptr = (unsigned long *)c->eip; | ||
1248 | c->src2.bytes = 1; | ||
1249 | c->src2.val = insn_fetch(u8, 1, c->eip); | ||
1250 | break; | ||
1251 | case Src2One: | ||
1252 | c->src2.bytes = 1; | ||
1253 | c->src2.val = 1; | ||
1254 | break; | ||
1255 | } | ||
1256 | 805 | ||
1257 | /* Decode and fetch the destination operand: register or memory. */ | 806 | if (c->src.bytes == 2) |
1258 | switch (c->d & DstMask) { | 807 | sv = (s16)c->src.val & (s16)mask; |
1259 | case ImplicitOps: | 808 | else if (c->src.bytes == 4) |
1260 | /* Special instructions do their own operand decoding. */ | 809 | sv = (s32)c->src.val & (s32)mask; |
1261 | return 0; | ||
1262 | case DstReg: | ||
1263 | decode_register_operand(&c->dst, c, | ||
1264 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
1265 | break; | ||
1266 | case DstMem: | ||
1267 | case DstMem64: | ||
1268 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
1269 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1270 | c->dst.type = OP_REG; | ||
1271 | c->dst.val = c->dst.orig_val = c->modrm_val; | ||
1272 | c->dst.ptr = c->modrm_ptr; | ||
1273 | break; | ||
1274 | } | ||
1275 | c->dst.type = OP_MEM; | ||
1276 | c->dst.ptr = (unsigned long *)c->modrm_ea; | ||
1277 | if ((c->d & DstMask) == DstMem64) | ||
1278 | c->dst.bytes = 8; | ||
1279 | else | ||
1280 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1281 | c->dst.val = 0; | ||
1282 | if (c->d & BitOp) { | ||
1283 | unsigned long mask = ~(c->dst.bytes * 8 - 1); | ||
1284 | 810 | ||
1285 | c->dst.ptr = (void *)c->dst.ptr + | 811 | c->dst.addr.mem += (sv >> 3); |
1286 | (c->src.val & mask) / 8; | ||
1287 | } | ||
1288 | break; | ||
1289 | case DstAcc: | ||
1290 | c->dst.type = OP_REG; | ||
1291 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1292 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1293 | switch (c->dst.bytes) { | ||
1294 | case 1: | ||
1295 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1296 | break; | ||
1297 | case 2: | ||
1298 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1299 | break; | ||
1300 | case 4: | ||
1301 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1302 | break; | ||
1303 | case 8: | ||
1304 | c->dst.val = *(u64 *)c->dst.ptr; | ||
1305 | break; | ||
1306 | } | ||
1307 | c->dst.orig_val = c->dst.val; | ||
1308 | break; | ||
1309 | case DstDI: | ||
1310 | c->dst.type = OP_MEM; | ||
1311 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1312 | c->dst.ptr = (unsigned long *) | ||
1313 | register_address(c, es_base(ctxt, ops), | ||
1314 | c->regs[VCPU_REGS_RDI]); | ||
1315 | c->dst.val = 0; | ||
1316 | break; | ||
1317 | } | 812 | } |
1318 | 813 | ||
1319 | done: | 814 | /* only subword offset */ |
1320 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 815 | c->src.val &= (c->dst.bytes << 3) - 1; |
1321 | } | 816 | } |
1322 | 817 | ||
1323 | static int read_emulated(struct x86_emulate_ctxt *ctxt, | 818 | static int read_emulated(struct x86_emulate_ctxt *ctxt, |
@@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
1337 | rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, | 832 | rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, |
1338 | ctxt->vcpu); | 833 | ctxt->vcpu); |
1339 | if (rc == X86EMUL_PROPAGATE_FAULT) | 834 | if (rc == X86EMUL_PROPAGATE_FAULT) |
1340 | emulate_pf(ctxt, addr, err); | 835 | emulate_pf(ctxt); |
1341 | if (rc != X86EMUL_CONTINUE) | 836 | if (rc != X86EMUL_CONTINUE) |
1342 | return rc; | 837 | return rc; |
1343 | mc->end += n; | 838 | mc->end += n; |
@@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1424 | addr = dt.address + index * 8; | 919 | addr = dt.address + index * 8; |
1425 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 920 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); |
1426 | if (ret == X86EMUL_PROPAGATE_FAULT) | 921 | if (ret == X86EMUL_PROPAGATE_FAULT) |
1427 | emulate_pf(ctxt, addr, err); | 922 | emulate_pf(ctxt); |
1428 | 923 | ||
1429 | return ret; | 924 | return ret; |
1430 | } | 925 | } |
@@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
1450 | addr = dt.address + index * 8; | 945 | addr = dt.address + index * 8; |
1451 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 946 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); |
1452 | if (ret == X86EMUL_PROPAGATE_FAULT) | 947 | if (ret == X86EMUL_PROPAGATE_FAULT) |
1453 | emulate_pf(ctxt, addr, err); | 948 | emulate_pf(ctxt); |
1454 | 949 | ||
1455 | return ret; | 950 | return ret; |
1456 | } | 951 | } |
@@ -1573,6 +1068,25 @@ exception: | |||
1573 | return X86EMUL_PROPAGATE_FAULT; | 1068 | return X86EMUL_PROPAGATE_FAULT; |
1574 | } | 1069 | } |
1575 | 1070 | ||
1071 | static void write_register_operand(struct operand *op) | ||
1072 | { | ||
1073 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
1074 | switch (op->bytes) { | ||
1075 | case 1: | ||
1076 | *(u8 *)op->addr.reg = (u8)op->val; | ||
1077 | break; | ||
1078 | case 2: | ||
1079 | *(u16 *)op->addr.reg = (u16)op->val; | ||
1080 | break; | ||
1081 | case 4: | ||
1082 | *op->addr.reg = (u32)op->val; | ||
1083 | break; /* 64b: zero-extend */ | ||
1084 | case 8: | ||
1085 | *op->addr.reg = op->val; | ||
1086 | break; | ||
1087 | } | ||
1088 | } | ||
1089 | |||
1576 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | 1090 | static inline int writeback(struct x86_emulate_ctxt *ctxt, |
1577 | struct x86_emulate_ops *ops) | 1091 | struct x86_emulate_ops *ops) |
1578 | { | 1092 | { |
@@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1582 | 1096 | ||
1583 | switch (c->dst.type) { | 1097 | switch (c->dst.type) { |
1584 | case OP_REG: | 1098 | case OP_REG: |
1585 | /* The 4-byte case *is* correct: | 1099 | write_register_operand(&c->dst); |
1586 | * in 64-bit mode we zero-extend. | ||
1587 | */ | ||
1588 | switch (c->dst.bytes) { | ||
1589 | case 1: | ||
1590 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1591 | break; | ||
1592 | case 2: | ||
1593 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1594 | break; | ||
1595 | case 4: | ||
1596 | *c->dst.ptr = (u32)c->dst.val; | ||
1597 | break; /* 64b: zero-ext */ | ||
1598 | case 8: | ||
1599 | *c->dst.ptr = c->dst.val; | ||
1600 | break; | ||
1601 | } | ||
1602 | break; | 1100 | break; |
1603 | case OP_MEM: | 1101 | case OP_MEM: |
1604 | if (c->lock_prefix) | 1102 | if (c->lock_prefix) |
1605 | rc = ops->cmpxchg_emulated( | 1103 | rc = ops->cmpxchg_emulated( |
1606 | (unsigned long)c->dst.ptr, | 1104 | c->dst.addr.mem, |
1607 | &c->dst.orig_val, | 1105 | &c->dst.orig_val, |
1608 | &c->dst.val, | 1106 | &c->dst.val, |
1609 | c->dst.bytes, | 1107 | c->dst.bytes, |
@@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1611 | ctxt->vcpu); | 1109 | ctxt->vcpu); |
1612 | else | 1110 | else |
1613 | rc = ops->write_emulated( | 1111 | rc = ops->write_emulated( |
1614 | (unsigned long)c->dst.ptr, | 1112 | c->dst.addr.mem, |
1615 | &c->dst.val, | 1113 | &c->dst.val, |
1616 | c->dst.bytes, | 1114 | c->dst.bytes, |
1617 | &err, | 1115 | &err, |
1618 | ctxt->vcpu); | 1116 | ctxt->vcpu); |
1619 | if (rc == X86EMUL_PROPAGATE_FAULT) | 1117 | if (rc == X86EMUL_PROPAGATE_FAULT) |
1620 | emulate_pf(ctxt, | 1118 | emulate_pf(ctxt); |
1621 | (unsigned long)c->dst.ptr, err); | ||
1622 | if (rc != X86EMUL_CONTINUE) | 1119 | if (rc != X86EMUL_CONTINUE) |
1623 | return rc; | 1120 | return rc; |
1624 | break; | 1121 | break; |
@@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, | |||
1640 | c->dst.bytes = c->op_bytes; | 1137 | c->dst.bytes = c->op_bytes; |
1641 | c->dst.val = c->src.val; | 1138 | c->dst.val = c->src.val; |
1642 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1139 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); |
1643 | c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), | 1140 | c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), |
1644 | c->regs[VCPU_REGS_RSP]); | 1141 | c->regs[VCPU_REGS_RSP]); |
1645 | } | 1142 | } |
1646 | 1143 | ||
1647 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1144 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
@@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1701 | *(unsigned long *)dest = | 1198 | *(unsigned long *)dest = |
1702 | (ctxt->eflags & ~change_mask) | (val & change_mask); | 1199 | (ctxt->eflags & ~change_mask) | (val & change_mask); |
1703 | 1200 | ||
1201 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1202 | emulate_pf(ctxt); | ||
1203 | |||
1704 | return rc; | 1204 | return rc; |
1705 | } | 1205 | } |
1706 | 1206 | ||
@@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, | |||
1778 | return rc; | 1278 | return rc; |
1779 | } | 1279 | } |
1780 | 1280 | ||
1281 | int emulate_int_real(struct x86_emulate_ctxt *ctxt, | ||
1282 | struct x86_emulate_ops *ops, int irq) | ||
1283 | { | ||
1284 | struct decode_cache *c = &ctxt->decode; | ||
1285 | int rc; | ||
1286 | struct desc_ptr dt; | ||
1287 | gva_t cs_addr; | ||
1288 | gva_t eip_addr; | ||
1289 | u16 cs, eip; | ||
1290 | u32 err; | ||
1291 | |||
1292 | /* TODO: Add limit checks */ | ||
1293 | c->src.val = ctxt->eflags; | ||
1294 | emulate_push(ctxt, ops); | ||
1295 | rc = writeback(ctxt, ops); | ||
1296 | if (rc != X86EMUL_CONTINUE) | ||
1297 | return rc; | ||
1298 | |||
1299 | ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); | ||
1300 | |||
1301 | c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); | ||
1302 | emulate_push(ctxt, ops); | ||
1303 | rc = writeback(ctxt, ops); | ||
1304 | if (rc != X86EMUL_CONTINUE) | ||
1305 | return rc; | ||
1306 | |||
1307 | c->src.val = c->eip; | ||
1308 | emulate_push(ctxt, ops); | ||
1309 | rc = writeback(ctxt, ops); | ||
1310 | if (rc != X86EMUL_CONTINUE) | ||
1311 | return rc; | ||
1312 | |||
1313 | c->dst.type = OP_NONE; | ||
1314 | |||
1315 | ops->get_idt(&dt, ctxt->vcpu); | ||
1316 | |||
1317 | eip_addr = dt.address + (irq << 2); | ||
1318 | cs_addr = dt.address + (irq << 2) + 2; | ||
1319 | |||
1320 | rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); | ||
1321 | if (rc != X86EMUL_CONTINUE) | ||
1322 | return rc; | ||
1323 | |||
1324 | rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); | ||
1325 | if (rc != X86EMUL_CONTINUE) | ||
1326 | return rc; | ||
1327 | |||
1328 | rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); | ||
1329 | if (rc != X86EMUL_CONTINUE) | ||
1330 | return rc; | ||
1331 | |||
1332 | c->eip = eip; | ||
1333 | |||
1334 | return rc; | ||
1335 | } | ||
1336 | |||
1337 | static int emulate_int(struct x86_emulate_ctxt *ctxt, | ||
1338 | struct x86_emulate_ops *ops, int irq) | ||
1339 | { | ||
1340 | switch(ctxt->mode) { | ||
1341 | case X86EMUL_MODE_REAL: | ||
1342 | return emulate_int_real(ctxt, ops, irq); | ||
1343 | case X86EMUL_MODE_VM86: | ||
1344 | case X86EMUL_MODE_PROT16: | ||
1345 | case X86EMUL_MODE_PROT32: | ||
1346 | case X86EMUL_MODE_PROT64: | ||
1347 | default: | ||
1348 | /* Protected mode interrupts unimplemented yet */ | ||
1349 | return X86EMUL_UNHANDLEABLE; | ||
1350 | } | ||
1351 | } | ||
1352 | |||
1353 | static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | ||
1354 | struct x86_emulate_ops *ops) | ||
1355 | { | ||
1356 | struct decode_cache *c = &ctxt->decode; | ||
1357 | int rc = X86EMUL_CONTINUE; | ||
1358 | unsigned long temp_eip = 0; | ||
1359 | unsigned long temp_eflags = 0; | ||
1360 | unsigned long cs = 0; | ||
1361 | unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | | ||
1362 | EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | | ||
1363 | EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ | ||
1364 | unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; | ||
1365 | |||
1366 | /* TODO: Add stack limit check */ | ||
1367 | |||
1368 | rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); | ||
1369 | |||
1370 | if (rc != X86EMUL_CONTINUE) | ||
1371 | return rc; | ||
1372 | |||
1373 | if (temp_eip & ~0xffff) { | ||
1374 | emulate_gp(ctxt, 0); | ||
1375 | return X86EMUL_PROPAGATE_FAULT; | ||
1376 | } | ||
1377 | |||
1378 | rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); | ||
1379 | |||
1380 | if (rc != X86EMUL_CONTINUE) | ||
1381 | return rc; | ||
1382 | |||
1383 | rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); | ||
1384 | |||
1385 | if (rc != X86EMUL_CONTINUE) | ||
1386 | return rc; | ||
1387 | |||
1388 | rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); | ||
1389 | |||
1390 | if (rc != X86EMUL_CONTINUE) | ||
1391 | return rc; | ||
1392 | |||
1393 | c->eip = temp_eip; | ||
1394 | |||
1395 | |||
1396 | if (c->op_bytes == 4) | ||
1397 | ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); | ||
1398 | else if (c->op_bytes == 2) { | ||
1399 | ctxt->eflags &= ~0xffff; | ||
1400 | ctxt->eflags |= temp_eflags; | ||
1401 | } | ||
1402 | |||
1403 | ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ | ||
1404 | ctxt->eflags |= EFLG_RESERVED_ONE_MASK; | ||
1405 | |||
1406 | return rc; | ||
1407 | } | ||
1408 | |||
1409 | static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, | ||
1410 | struct x86_emulate_ops* ops) | ||
1411 | { | ||
1412 | switch(ctxt->mode) { | ||
1413 | case X86EMUL_MODE_REAL: | ||
1414 | return emulate_iret_real(ctxt, ops); | ||
1415 | case X86EMUL_MODE_VM86: | ||
1416 | case X86EMUL_MODE_PROT16: | ||
1417 | case X86EMUL_MODE_PROT32: | ||
1418 | case X86EMUL_MODE_PROT64: | ||
1419 | default: | ||
1420 | /* iret from protected mode unimplemented yet */ | ||
1421 | return X86EMUL_UNHANDLEABLE; | ||
1422 | } | ||
1423 | } | ||
1424 | |||
1781 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | 1425 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, |
1782 | struct x86_emulate_ops *ops) | 1426 | struct x86_emulate_ops *ops) |
1783 | { | 1427 | { |
@@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | |||
1819 | struct x86_emulate_ops *ops) | 1463 | struct x86_emulate_ops *ops) |
1820 | { | 1464 | { |
1821 | struct decode_cache *c = &ctxt->decode; | 1465 | struct decode_cache *c = &ctxt->decode; |
1466 | unsigned long *rax = &c->regs[VCPU_REGS_RAX]; | ||
1467 | unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; | ||
1468 | u8 de = 0; | ||
1822 | 1469 | ||
1823 | switch (c->modrm_reg) { | 1470 | switch (c->modrm_reg) { |
1824 | case 0 ... 1: /* test */ | 1471 | case 0 ... 1: /* test */ |
@@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | |||
1830 | case 3: /* neg */ | 1477 | case 3: /* neg */ |
1831 | emulate_1op("neg", c->dst, ctxt->eflags); | 1478 | emulate_1op("neg", c->dst, ctxt->eflags); |
1832 | break; | 1479 | break; |
1480 | case 4: /* mul */ | ||
1481 | emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); | ||
1482 | break; | ||
1483 | case 5: /* imul */ | ||
1484 | emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); | ||
1485 | break; | ||
1486 | case 6: /* div */ | ||
1487 | emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, | ||
1488 | ctxt->eflags, de); | ||
1489 | break; | ||
1490 | case 7: /* idiv */ | ||
1491 | emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, | ||
1492 | ctxt->eflags, de); | ||
1493 | break; | ||
1833 | default: | 1494 | default: |
1834 | return 0; | 1495 | return X86EMUL_UNHANDLEABLE; |
1835 | } | 1496 | } |
1836 | return 1; | 1497 | if (de) |
1498 | return emulate_de(ctxt); | ||
1499 | return X86EMUL_CONTINUE; | ||
1837 | } | 1500 | } |
1838 | 1501 | ||
1839 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | 1502 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, |
@@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, | |||
1905 | return rc; | 1568 | return rc; |
1906 | } | 1569 | } |
1907 | 1570 | ||
1571 | static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, | ||
1572 | struct x86_emulate_ops *ops, int seg) | ||
1573 | { | ||
1574 | struct decode_cache *c = &ctxt->decode; | ||
1575 | unsigned short sel; | ||
1576 | int rc; | ||
1577 | |||
1578 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | ||
1579 | |||
1580 | rc = load_segment_descriptor(ctxt, ops, sel, seg); | ||
1581 | if (rc != X86EMUL_CONTINUE) | ||
1582 | return rc; | ||
1583 | |||
1584 | c->dst.val = c->src.val; | ||
1585 | return rc; | ||
1586 | } | ||
1587 | |||
1908 | static inline void | 1588 | static inline void |
1909 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | 1589 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, |
1910 | struct x86_emulate_ops *ops, struct desc_struct *cs, | 1590 | struct x86_emulate_ops *ops, struct desc_struct *cs, |
@@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, | |||
2160 | struct x86_emulate_ops *ops, | 1840 | struct x86_emulate_ops *ops, |
2161 | u16 port, u16 len) | 1841 | u16 port, u16 len) |
2162 | { | 1842 | { |
1843 | if (ctxt->perm_ok) | ||
1844 | return true; | ||
1845 | |||
2163 | if (emulator_bad_iopl(ctxt, ops)) | 1846 | if (emulator_bad_iopl(ctxt, ops)) |
2164 | if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) | 1847 | if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) |
2165 | return false; | 1848 | return false; |
1849 | |||
1850 | ctxt->perm_ok = true; | ||
1851 | |||
2166 | return true; | 1852 | return true; |
2167 | } | 1853 | } |
2168 | 1854 | ||
@@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2254 | &err); | 1940 | &err); |
2255 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1941 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2256 | /* FIXME: need to provide precise fault address */ | 1942 | /* FIXME: need to provide precise fault address */ |
2257 | emulate_pf(ctxt, old_tss_base, err); | 1943 | emulate_pf(ctxt); |
2258 | return ret; | 1944 | return ret; |
2259 | } | 1945 | } |
2260 | 1946 | ||
@@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2264 | &err); | 1950 | &err); |
2265 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1951 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2266 | /* FIXME: need to provide precise fault address */ | 1952 | /* FIXME: need to provide precise fault address */ |
2267 | emulate_pf(ctxt, old_tss_base, err); | 1953 | emulate_pf(ctxt); |
2268 | return ret; | 1954 | return ret; |
2269 | } | 1955 | } |
2270 | 1956 | ||
@@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2272 | &err); | 1958 | &err); |
2273 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1959 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2274 | /* FIXME: need to provide precise fault address */ | 1960 | /* FIXME: need to provide precise fault address */ |
2275 | emulate_pf(ctxt, new_tss_base, err); | 1961 | emulate_pf(ctxt); |
2276 | return ret; | 1962 | return ret; |
2277 | } | 1963 | } |
2278 | 1964 | ||
@@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
2285 | ctxt->vcpu, &err); | 1971 | ctxt->vcpu, &err); |
2286 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1972 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2287 | /* FIXME: need to provide precise fault address */ | 1973 | /* FIXME: need to provide precise fault address */ |
2288 | emulate_pf(ctxt, new_tss_base, err); | 1974 | emulate_pf(ctxt); |
2289 | return ret; | 1975 | return ret; |
2290 | } | 1976 | } |
2291 | } | 1977 | } |
@@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2396 | &err); | 2082 | &err); |
2397 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2083 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2398 | /* FIXME: need to provide precise fault address */ | 2084 | /* FIXME: need to provide precise fault address */ |
2399 | emulate_pf(ctxt, old_tss_base, err); | 2085 | emulate_pf(ctxt); |
2400 | return ret; | 2086 | return ret; |
2401 | } | 2087 | } |
2402 | 2088 | ||
@@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2406 | &err); | 2092 | &err); |
2407 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2093 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2408 | /* FIXME: need to provide precise fault address */ | 2094 | /* FIXME: need to provide precise fault address */ |
2409 | emulate_pf(ctxt, old_tss_base, err); | 2095 | emulate_pf(ctxt); |
2410 | return ret; | 2096 | return ret; |
2411 | } | 2097 | } |
2412 | 2098 | ||
@@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2414 | &err); | 2100 | &err); |
2415 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2101 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2416 | /* FIXME: need to provide precise fault address */ | 2102 | /* FIXME: need to provide precise fault address */ |
2417 | emulate_pf(ctxt, new_tss_base, err); | 2103 | emulate_pf(ctxt); |
2418 | return ret; | 2104 | return ret; |
2419 | } | 2105 | } |
2420 | 2106 | ||
@@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2427 | ctxt->vcpu, &err); | 2113 | ctxt->vcpu, &err); |
2428 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2114 | if (ret == X86EMUL_PROPAGATE_FAULT) { |
2429 | /* FIXME: need to provide precise fault address */ | 2115 | /* FIXME: need to provide precise fault address */ |
2430 | emulate_pf(ctxt, new_tss_base, err); | 2116 | emulate_pf(ctxt); |
2431 | return ret; | 2117 | return ret; |
2432 | } | 2118 | } |
2433 | } | 2119 | } |
@@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2523 | } | 2209 | } |
2524 | 2210 | ||
2525 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | 2211 | int emulator_task_switch(struct x86_emulate_ctxt *ctxt, |
2526 | struct x86_emulate_ops *ops, | ||
2527 | u16 tss_selector, int reason, | 2212 | u16 tss_selector, int reason, |
2528 | bool has_error_code, u32 error_code) | 2213 | bool has_error_code, u32 error_code) |
2529 | { | 2214 | { |
2215 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2530 | struct decode_cache *c = &ctxt->decode; | 2216 | struct decode_cache *c = &ctxt->decode; |
2531 | int rc; | 2217 | int rc; |
2532 | 2218 | ||
@@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, | |||
2552 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2238 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; |
2553 | 2239 | ||
2554 | register_address_increment(c, &c->regs[reg], df * op->bytes); | 2240 | register_address_increment(c, &c->regs[reg], df * op->bytes); |
2555 | op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); | 2241 | op->addr.mem = register_address(c, base, c->regs[reg]); |
2242 | } | ||
2243 | |||
2244 | static int em_push(struct x86_emulate_ctxt *ctxt) | ||
2245 | { | ||
2246 | emulate_push(ctxt, ctxt->ops); | ||
2247 | return X86EMUL_CONTINUE; | ||
2248 | } | ||
2249 | |||
2250 | static int em_das(struct x86_emulate_ctxt *ctxt) | ||
2251 | { | ||
2252 | struct decode_cache *c = &ctxt->decode; | ||
2253 | u8 al, old_al; | ||
2254 | bool af, cf, old_cf; | ||
2255 | |||
2256 | cf = ctxt->eflags & X86_EFLAGS_CF; | ||
2257 | al = c->dst.val; | ||
2258 | |||
2259 | old_al = al; | ||
2260 | old_cf = cf; | ||
2261 | cf = false; | ||
2262 | af = ctxt->eflags & X86_EFLAGS_AF; | ||
2263 | if ((al & 0x0f) > 9 || af) { | ||
2264 | al -= 6; | ||
2265 | cf = old_cf | (al >= 250); | ||
2266 | af = true; | ||
2267 | } else { | ||
2268 | af = false; | ||
2269 | } | ||
2270 | if (old_al > 0x99 || old_cf) { | ||
2271 | al -= 0x60; | ||
2272 | cf = true; | ||
2273 | } | ||
2274 | |||
2275 | c->dst.val = al; | ||
2276 | /* Set PF, ZF, SF */ | ||
2277 | c->src.type = OP_IMM; | ||
2278 | c->src.val = 0; | ||
2279 | c->src.bytes = 1; | ||
2280 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
2281 | ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); | ||
2282 | if (cf) | ||
2283 | ctxt->eflags |= X86_EFLAGS_CF; | ||
2284 | if (af) | ||
2285 | ctxt->eflags |= X86_EFLAGS_AF; | ||
2286 | return X86EMUL_CONTINUE; | ||
2287 | } | ||
2288 | |||
2289 | static int em_call_far(struct x86_emulate_ctxt *ctxt) | ||
2290 | { | ||
2291 | struct decode_cache *c = &ctxt->decode; | ||
2292 | u16 sel, old_cs; | ||
2293 | ulong old_eip; | ||
2294 | int rc; | ||
2295 | |||
2296 | old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); | ||
2297 | old_eip = c->eip; | ||
2298 | |||
2299 | memcpy(&sel, c->src.valptr + c->op_bytes, 2); | ||
2300 | if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) | ||
2301 | return X86EMUL_CONTINUE; | ||
2302 | |||
2303 | c->eip = 0; | ||
2304 | memcpy(&c->eip, c->src.valptr, c->op_bytes); | ||
2305 | |||
2306 | c->src.val = old_cs; | ||
2307 | emulate_push(ctxt, ctxt->ops); | ||
2308 | rc = writeback(ctxt, ctxt->ops); | ||
2309 | if (rc != X86EMUL_CONTINUE) | ||
2310 | return rc; | ||
2311 | |||
2312 | c->src.val = old_eip; | ||
2313 | emulate_push(ctxt, ctxt->ops); | ||
2314 | rc = writeback(ctxt, ctxt->ops); | ||
2315 | if (rc != X86EMUL_CONTINUE) | ||
2316 | return rc; | ||
2317 | |||
2318 | c->dst.type = OP_NONE; | ||
2319 | |||
2320 | return X86EMUL_CONTINUE; | ||
2321 | } | ||
2322 | |||
2323 | static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) | ||
2324 | { | ||
2325 | struct decode_cache *c = &ctxt->decode; | ||
2326 | int rc; | ||
2327 | |||
2328 | c->dst.type = OP_REG; | ||
2329 | c->dst.addr.reg = &c->eip; | ||
2330 | c->dst.bytes = c->op_bytes; | ||
2331 | rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); | ||
2332 | if (rc != X86EMUL_CONTINUE) | ||
2333 | return rc; | ||
2334 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); | ||
2335 | return X86EMUL_CONTINUE; | ||
2336 | } | ||
2337 | |||
2338 | static int em_imul(struct x86_emulate_ctxt *ctxt) | ||
2339 | { | ||
2340 | struct decode_cache *c = &ctxt->decode; | ||
2341 | |||
2342 | emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); | ||
2343 | return X86EMUL_CONTINUE; | ||
2344 | } | ||
2345 | |||
2346 | static int em_imul_3op(struct x86_emulate_ctxt *ctxt) | ||
2347 | { | ||
2348 | struct decode_cache *c = &ctxt->decode; | ||
2349 | |||
2350 | c->dst.val = c->src2.val; | ||
2351 | return em_imul(ctxt); | ||
2352 | } | ||
2353 | |||
2354 | static int em_cwd(struct x86_emulate_ctxt *ctxt) | ||
2355 | { | ||
2356 | struct decode_cache *c = &ctxt->decode; | ||
2357 | |||
2358 | c->dst.type = OP_REG; | ||
2359 | c->dst.bytes = c->src.bytes; | ||
2360 | c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; | ||
2361 | c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); | ||
2362 | |||
2363 | return X86EMUL_CONTINUE; | ||
2364 | } | ||
2365 | |||
2366 | static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | ||
2367 | { | ||
2368 | unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); | ||
2369 | struct decode_cache *c = &ctxt->decode; | ||
2370 | u64 tsc = 0; | ||
2371 | |||
2372 | if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { | ||
2373 | emulate_gp(ctxt, 0); | ||
2374 | return X86EMUL_PROPAGATE_FAULT; | ||
2375 | } | ||
2376 | ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); | ||
2377 | c->regs[VCPU_REGS_RAX] = (u32)tsc; | ||
2378 | c->regs[VCPU_REGS_RDX] = tsc >> 32; | ||
2379 | return X86EMUL_CONTINUE; | ||
2380 | } | ||
2381 | |||
2382 | static int em_mov(struct x86_emulate_ctxt *ctxt) | ||
2383 | { | ||
2384 | struct decode_cache *c = &ctxt->decode; | ||
2385 | c->dst.val = c->src.val; | ||
2386 | return X86EMUL_CONTINUE; | ||
2387 | } | ||
2388 | |||
/*
 * Shorthand initializers for struct opcode entries in the tables below:
 *
 *   D(flags)     - decode flags only.
 *   N            - D(0): an entry that contributes no flags.
 *   G(flags, g)  - flags plus Group; .u.group is set to g.
 *   GD(flags, g) - flags plus Group | GroupDual; .u.gdual is set to g.
 *   I(flags, fn) - flags plus an .u.execute callback.
 *   D2bv(flags)  - two consecutive D() entries: first with ByteOp added,
 *                  then with the flags as given.
 *   I2bv(flags, fn) - same pairing as D2bv, built from I().
 *   D6ALU(flags) - six entries, as three D2bv pairs: DstMem/SrcReg/ModRM,
 *                  DstReg/SrcMem/ModRM and DstAcc/SrcImm.  Lock is masked
 *                  off for the second and third pairs.
 */
2389 | #define D(_y) { .flags = (_y) } | ||
2390 | #define N D(0) | ||
2391 | #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } | ||
2392 | #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } | ||
2393 | #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } | ||
2394 | |||
2395 | #define D2bv(_f) D((_f) | ByteOp), D(_f) | ||
2396 | #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) | ||
2397 | |||
2398 | #define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ | ||
2399 | D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ | ||
2400 | D2bv(((_f) & ~Lock) | DstAcc | SrcImm) | ||
2401 | |||
2402 | |||
/*
 * Group 1: referenced from opcode_table entries 0x80 - 0x83.  The decoder
 * indexes it with bits 5:3 of the ModRM byte and ORs the selected flags
 * into those of the referring entry.  The X7() entries add Lock
 * (presumably seven of them - X7 is defined outside this chunk); the
 * final entry adds nothing.
 */
2403 | static struct opcode group1[] = { | ||
2404 | X7(D(Lock)), N | ||
2405 | }; | ||
2406 | |||
/*
 * Group 1A: referenced from opcode_table entry 0x8F.  Only index 0
 * contributes flags (a Mov to a memory destination that uses the stack);
 * the other seven entries contribute none.
 */
2407 | static struct opcode group1A[] = { | ||
2408 | D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, | ||
2409 | }; | ||
2410 | |||
/*
 * Group 3: referenced from opcode_table entries 0xF6 (with ByteOp) and
 * 0xF7.  Indices 0-1 take an immediate source (test in emulate_grp3()),
 * indices 2-3 are lockable single-operand forms (3 is neg), and indices
 * 4-7 read a memory/register source (mul, imul, div, idiv in
 * emulate_grp3()).
 */
2411 | static struct opcode group3[] = { | ||
2412 | D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), | ||
2413 | D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), | ||
2414 | X4(D(SrcMem | ModRM)), | ||
2415 | }; | ||
2416 | |||
/*
 * Group 4: referenced from opcode_table entry 0xFE.  Indices 0 and 1 are
 * lockable byte-sized single-operand forms; indices 2-7 contribute no
 * flags.
 */
2417 | static struct opcode group4[] = { | ||
2418 | D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), | ||
2419 | N, N, N, N, N, N, | ||
2420 | }; | ||
2421 | |||
/*
 * Group 5: referenced from opcode_table entry 0xFF.  Index 3 is the only
 * entry with an execute callback (em_call_far, reading a far pointer from
 * memory); index 7 contributes no flags.
 */
2422 | static struct opcode group5[] = { | ||
2423 | D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), | ||
2424 | D(SrcMem | ModRM | Stack), | ||
2425 | I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), | ||
2426 | D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), | ||
2427 | D(SrcMem | ModRM | Stack), N, | ||
2428 | }; | ||
2429 | |||
/*
 * Group 7 (dual): referenced from twobyte_table entry 0x01.  A dual group
 * holds two 8-entry tables; the decoder picks .mod3 when ModRM.mod == 3
 * and .mod012 otherwise.  NOTE(review): the first initializer list is
 * presumably .mod012 and the second .mod3 - confirm against the
 * struct group_dual declaration.
 */
2430 | static struct group_dual group7 = { { | ||
2431 | N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), | ||
2432 | D(SrcNone | ModRM | DstMem | Mov), N, | ||
2433 | D(SrcMem16 | ModRM | Mov | Priv), | ||
2434 | D(SrcMem | ModRM | ByteOp | Priv | NoAccess), | ||
2435 | }, { | ||
2436 | D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), | ||
2437 | D(SrcNone | ModRM | DstMem | Mov), N, | ||
2438 | D(SrcMem16 | ModRM | Mov | Priv), N, | ||
2439 | } }; | ||
2440 | |||
/*
 * Group 8: referenced from twobyte_table entry 0xBA (with BitOp).
 * Indices 0-3 contribute no flags; indices 4-7 take a byte immediate
 * source, with Lock allowed on 5-7 only.
 */
2441 | static struct opcode group8[] = { | ||
2442 | N, N, N, N, | ||
2443 | D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), | ||
2444 | D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), | ||
2445 | }; | ||
2446 | |||
/*
 * Group 9 (dual): referenced from twobyte_table entry 0xC7.  Only index 1
 * of the first table contributes flags (a lockable 64-bit memory
 * destination); every entry of the second table is empty.
 * NOTE(review): first list presumably .mod012, second .mod3 - confirm
 * against the struct group_dual declaration.
 */
2447 | static struct group_dual group9 = { { | ||
2448 | N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, | ||
2449 | }, { | ||
2450 | N, N, N, N, N, N, N, N, | ||
2451 | } }; | ||
2452 | |||
/*
 * Group 11: referenced from opcode_table entries 0xC6 (with ByteOp) and
 * 0xC7.  Index 0 is a mov of an immediate to memory handled by em_mov;
 * the remaining entries carry Undefined, which the decoder rejects.
 */
2453 | static struct opcode group11[] = { | ||
2454 | I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), | ||
2455 | }; | ||
2456 | |||
/*
 * Decode table for one-byte opcodes, indexed by the opcode byte
 * (x86_decode_insn() reads opcode_table[c->b]).  Each entry supplies the
 * decode flags and, optionally, a group table or an execute callback.
 * Entries whose final flags are 0 (N with no group contribution) or that
 * carry Undefined are rejected by the decoder as unrecognised.  The
 * range comments give the opcode bytes covered by the entries that
 * follow them.
 */
2457 | static struct opcode opcode_table[256] = { | ||
2458 | /* 0x00 - 0x07 */ | ||
2459 | D6ALU(Lock), | ||
2460 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
2461 | /* 0x08 - 0x0F */ | ||
2462 | D6ALU(Lock), | ||
2463 | D(ImplicitOps | Stack | No64), N, | ||
2464 | /* 0x10 - 0x17 */ | ||
2465 | D6ALU(Lock), | ||
2466 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
2467 | /* 0x18 - 0x1F */ | ||
2468 | D6ALU(Lock), | ||
2469 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
2470 | /* 0x20 - 0x27 */ | ||
2471 | D6ALU(Lock), N, N, | ||
2472 | /* 0x28 - 0x2F */ | ||
2473 | D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), | ||
2474 | /* 0x30 - 0x37 */ | ||
2475 | D6ALU(Lock), N, N, | ||
2476 | /* 0x38 - 0x3F */ | ||
2477 | D6ALU(0), N, N, | ||
2478 | /* 0x40 - 0x4F */ | ||
2479 | X16(D(DstReg)), | ||
2480 | /* 0x50 - 0x57 */ | ||
2481 | X8(I(SrcReg | Stack, em_push)), | ||
2482 | /* 0x58 - 0x5F */ | ||
2483 | X8(D(DstReg | Stack)), | ||
2484 | /* 0x60 - 0x67 */ | ||
2485 | D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), | ||
2486 | N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , | ||
2487 | N, N, N, N, | ||
2488 | /* 0x68 - 0x6F */ | ||
2489 | I(SrcImm | Mov | Stack, em_push), | ||
2490 | I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), | ||
2491 | I(SrcImmByte | Mov | Stack, em_push), | ||
2492 | I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), | ||
2493 | D2bv(DstDI | Mov | String), /* insb, insw/insd */ | ||
2494 | D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ | ||
2495 | /* 0x70 - 0x7F */ | ||
2496 | X16(D(SrcImmByte)), | ||
2497 | /* 0x80 - 0x87 */ | ||
2498 | G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), | ||
2499 | G(DstMem | SrcImm | ModRM | Group, group1), | ||
2500 | G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), | ||
2501 | G(DstMem | SrcImmByte | ModRM | Group, group1), | ||
2502 | D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), | ||
2503 | /* 0x88 - 0x8F */ | ||
2504 | I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), | ||
2505 | I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), | ||
2506 | D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), | ||
2507 | D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), | ||
2508 | /* 0x90 - 0x97 */ | ||
2509 | X8(D(SrcAcc | DstReg)), | ||
2510 | /* 0x98 - 0x9F */ | ||
2511 | D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), | ||
2512 | I(SrcImmFAddr | No64, em_call_far), N, | ||
2513 | D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, | ||
2514 | /* 0xA0 - 0xA7 */ | ||
2515 | I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), | ||
2516 | I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), | ||
2517 | I2bv(SrcSI | DstDI | Mov | String, em_mov), | ||
2518 | D2bv(SrcSI | DstDI | String), | ||
2519 | /* 0xA8 - 0xAF */ | ||
2520 | D2bv(DstAcc | SrcImm), | ||
2521 | I2bv(SrcAcc | DstDI | Mov | String, em_mov), | ||
2522 | I2bv(SrcSI | DstAcc | Mov | String, em_mov), | ||
2523 | D2bv(SrcAcc | DstDI | String), | ||
2524 | /* 0xB0 - 0xB7 */ | ||
2525 | X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), | ||
2526 | /* 0xB8 - 0xBF */ | ||
2527 | X8(I(DstReg | SrcImm | Mov, em_mov)), | ||
2528 | /* 0xC0 - 0xC7 */ | ||
2529 | D2bv(DstMem | SrcImmByte | ModRM), | ||
2530 | I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), | ||
2531 | D(ImplicitOps | Stack), | ||
2532 | D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), | ||
2533 | G(ByteOp, group11), G(0, group11), | ||
2534 | /* 0xC8 - 0xCF */ | ||
2535 | N, N, N, D(ImplicitOps | Stack), | ||
2536 | D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), | ||
2537 | /* 0xD0 - 0xD7 */ | ||
2538 | D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), | ||
2539 | N, N, N, N, | ||
2540 | /* 0xD8 - 0xDF */ | ||
2541 | N, N, N, N, N, N, N, N, | ||
2542 | /* 0xE0 - 0xE7 */ | ||
2543 | X4(D(SrcImmByte)), | ||
2544 | D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), | ||
2545 | /* 0xE8 - 0xEF */ | ||
2546 | D(SrcImm | Stack), D(SrcImm | ImplicitOps), | ||
2547 | D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), | ||
2548 | D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), | ||
2549 | /* 0xF0 - 0xF7 */ | ||
2550 | N, N, N, N, | ||
2551 | D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), | ||
2552 | /* 0xF8 - 0xFF */ | ||
2553 | D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), | ||
2554 | D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), | ||
2555 | }; | ||
2556 | |||
/*
 * Decode table for two-byte opcodes, indexed by the byte that follows the
 * 0x0f escape (x86_decode_insn() reads twobyte_table[c->b] after fetching
 * that byte).  Entry layout and the handling of empty (N) entries are the
 * same as for the one-byte table.
 */
2557 | static struct opcode twobyte_table[256] = { | ||
2558 | /* 0x00 - 0x0F */ | ||
2559 | N, GD(0, &group7), N, N, | ||
2560 | N, D(ImplicitOps), D(ImplicitOps | Priv), N, | ||
2561 | D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, | ||
2562 | N, D(ImplicitOps | ModRM), N, N, | ||
2563 | /* 0x10 - 0x1F */ | ||
2564 | N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, | ||
2565 | /* 0x20 - 0x2F */ | ||
2566 | D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), | ||
2567 | D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), | ||
2568 | N, N, N, N, | ||
2569 | N, N, N, N, N, N, N, N, | ||
2570 | /* 0x30 - 0x3F */ | ||
2571 | D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), | ||
2572 | D(ImplicitOps | Priv), N, | ||
2573 | D(ImplicitOps), D(ImplicitOps | Priv), N, N, | ||
2574 | N, N, N, N, N, N, N, N, | ||
2575 | /* 0x40 - 0x4F */ | ||
2576 | X16(D(DstReg | SrcMem | ModRM | Mov)), | ||
2577 | /* 0x50 - 0x5F */ | ||
2578 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
2579 | /* 0x60 - 0x6F */ | ||
2580 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
2581 | /* 0x70 - 0x7F */ | ||
2582 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
2583 | /* 0x80 - 0x8F */ | ||
2584 | X16(D(SrcImm)), | ||
2585 | /* 0x90 - 0x9F */ | ||
2586 | X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), | ||
2587 | /* 0xA0 - 0xA7 */ | ||
2588 | D(ImplicitOps | Stack), D(ImplicitOps | Stack), | ||
2589 | N, D(DstMem | SrcReg | ModRM | BitOp), | ||
2590 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | ||
2591 | D(DstMem | SrcReg | Src2CL | ModRM), N, N, | ||
2592 | /* 0xA8 - 0xAF */ | ||
2593 | D(ImplicitOps | Stack), D(ImplicitOps | Stack), | ||
2594 | N, D(DstMem | SrcReg | ModRM | BitOp | Lock), | ||
2595 | D(DstMem | SrcReg | Src2ImmByte | ModRM), | ||
2596 | D(DstMem | SrcReg | Src2CL | ModRM), | ||
2597 | D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), | ||
2598 | /* 0xB0 - 0xB7 */ | ||
2599 | D2bv(DstMem | SrcReg | ModRM | Lock), | ||
2600 | D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), | ||
2601 | D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), | ||
2602 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | ||
2603 | /* 0xB8 - 0xBF */ | ||
2604 | N, N, | ||
2605 | G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), | ||
2606 | D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), | ||
2607 | D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), | ||
2608 | /* 0xC0 - 0xCF */ | ||
2609 | D2bv(DstMem | SrcReg | ModRM | Lock), | ||
2610 | N, D(DstMem | SrcReg | ModRM | Mov), | ||
2611 | N, N, N, GD(0, &group9), | ||
2612 | N, N, N, N, N, N, N, N, | ||
2613 | /* 0xD0 - 0xDF */ | ||
2614 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
2615 | /* 0xE0 - 0xEF */ | ||
2616 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, | ||
2617 | /* 0xF0 - 0xFF */ | ||
2618 | N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N | ||
2619 | }; | ||
2620 | |||
2621 | #undef D | ||
2622 | #undef N | ||
2623 | #undef G | ||
2624 | #undef GD | ||
2625 | #undef I | ||
2626 | |||
2627 | #undef D2bv | ||
2628 | #undef I2bv | ||
2629 | #undef D6ALU | ||
2630 | |||
2631 | static unsigned imm_size(struct decode_cache *c) | ||
2632 | { | ||
2633 | unsigned size; | ||
2634 | |||
2635 | size = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
2636 | if (size == 8) | ||
2637 | size = 4; | ||
2638 | return size; | ||
2639 | } | ||
2640 | |||
2641 | static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | ||
2642 | unsigned size, bool sign_extension) | ||
2643 | { | ||
2644 | struct decode_cache *c = &ctxt->decode; | ||
2645 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2646 | int rc = X86EMUL_CONTINUE; | ||
2647 | |||
2648 | op->type = OP_IMM; | ||
2649 | op->bytes = size; | ||
2650 | op->addr.mem = c->eip; | ||
2651 | /* NB. Immediates are sign-extended as necessary. */ | ||
2652 | switch (op->bytes) { | ||
2653 | case 1: | ||
2654 | op->val = insn_fetch(s8, 1, c->eip); | ||
2655 | break; | ||
2656 | case 2: | ||
2657 | op->val = insn_fetch(s16, 2, c->eip); | ||
2658 | break; | ||
2659 | case 4: | ||
2660 | op->val = insn_fetch(s32, 4, c->eip); | ||
2661 | break; | ||
2662 | } | ||
2663 | if (!sign_extension) { | ||
2664 | switch (op->bytes) { | ||
2665 | case 1: | ||
2666 | op->val &= 0xff; | ||
2667 | break; | ||
2668 | case 2: | ||
2669 | op->val &= 0xffff; | ||
2670 | break; | ||
2671 | case 4: | ||
2672 | op->val &= 0xffffffff; | ||
2673 | break; | ||
2674 | } | ||
2675 | } | ||
2676 | done: | ||
2677 | return rc; | ||
2556 | } | 2678 | } |
2557 | 2679 | ||
2558 | int | 2680 | int |
2559 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | 2681 | x86_decode_insn(struct x86_emulate_ctxt *ctxt) |
2560 | { | 2682 | { |
2683 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2684 | struct decode_cache *c = &ctxt->decode; | ||
2685 | int rc = X86EMUL_CONTINUE; | ||
2686 | int mode = ctxt->mode; | ||
2687 | int def_op_bytes, def_ad_bytes, dual, goffset; | ||
2688 | struct opcode opcode, *g_mod012, *g_mod3; | ||
2689 | struct operand memop = { .type = OP_NONE }; | ||
2690 | |||
2691 | c->eip = ctxt->eip; | ||
2692 | c->fetch.start = c->fetch.end = c->eip; | ||
2693 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); | ||
2694 | |||
2695 | switch (mode) { | ||
2696 | case X86EMUL_MODE_REAL: | ||
2697 | case X86EMUL_MODE_VM86: | ||
2698 | case X86EMUL_MODE_PROT16: | ||
2699 | def_op_bytes = def_ad_bytes = 2; | ||
2700 | break; | ||
2701 | case X86EMUL_MODE_PROT32: | ||
2702 | def_op_bytes = def_ad_bytes = 4; | ||
2703 | break; | ||
2704 | #ifdef CONFIG_X86_64 | ||
2705 | case X86EMUL_MODE_PROT64: | ||
2706 | def_op_bytes = 4; | ||
2707 | def_ad_bytes = 8; | ||
2708 | break; | ||
2709 | #endif | ||
2710 | default: | ||
2711 | return -1; | ||
2712 | } | ||
2713 | |||
2714 | c->op_bytes = def_op_bytes; | ||
2715 | c->ad_bytes = def_ad_bytes; | ||
2716 | |||
2717 | /* Legacy prefixes. */ | ||
2718 | for (;;) { | ||
2719 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
2720 | case 0x66: /* operand-size override */ | ||
2721 | /* switch between 2/4 bytes */ | ||
2722 | c->op_bytes = def_op_bytes ^ 6; | ||
2723 | break; | ||
2724 | case 0x67: /* address-size override */ | ||
2725 | if (mode == X86EMUL_MODE_PROT64) | ||
2726 | /* switch between 4/8 bytes */ | ||
2727 | c->ad_bytes = def_ad_bytes ^ 12; | ||
2728 | else | ||
2729 | /* switch between 2/4 bytes */ | ||
2730 | c->ad_bytes = def_ad_bytes ^ 6; | ||
2731 | break; | ||
2732 | case 0x26: /* ES override */ | ||
2733 | case 0x2e: /* CS override */ | ||
2734 | case 0x36: /* SS override */ | ||
2735 | case 0x3e: /* DS override */ | ||
2736 | set_seg_override(c, (c->b >> 3) & 3); | ||
2737 | break; | ||
2738 | case 0x64: /* FS override */ | ||
2739 | case 0x65: /* GS override */ | ||
2740 | set_seg_override(c, c->b & 7); | ||
2741 | break; | ||
2742 | case 0x40 ... 0x4f: /* REX */ | ||
2743 | if (mode != X86EMUL_MODE_PROT64) | ||
2744 | goto done_prefixes; | ||
2745 | c->rex_prefix = c->b; | ||
2746 | continue; | ||
2747 | case 0xf0: /* LOCK */ | ||
2748 | c->lock_prefix = 1; | ||
2749 | break; | ||
2750 | case 0xf2: /* REPNE/REPNZ */ | ||
2751 | c->rep_prefix = REPNE_PREFIX; | ||
2752 | break; | ||
2753 | case 0xf3: /* REP/REPE/REPZ */ | ||
2754 | c->rep_prefix = REPE_PREFIX; | ||
2755 | break; | ||
2756 | default: | ||
2757 | goto done_prefixes; | ||
2758 | } | ||
2759 | |||
2760 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
2761 | |||
2762 | c->rex_prefix = 0; | ||
2763 | } | ||
2764 | |||
2765 | done_prefixes: | ||
2766 | |||
2767 | /* REX prefix. */ | ||
2768 | if (c->rex_prefix & 8) | ||
2769 | c->op_bytes = 8; /* REX.W */ | ||
2770 | |||
2771 | /* Opcode byte(s). */ | ||
2772 | opcode = opcode_table[c->b]; | ||
2773 | /* Two-byte opcode? */ | ||
2774 | if (c->b == 0x0f) { | ||
2775 | c->twobyte = 1; | ||
2776 | c->b = insn_fetch(u8, 1, c->eip); | ||
2777 | opcode = twobyte_table[c->b]; | ||
2778 | } | ||
2779 | c->d = opcode.flags; | ||
2780 | |||
2781 | if (c->d & Group) { | ||
2782 | dual = c->d & GroupDual; | ||
2783 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
2784 | --c->eip; | ||
2785 | |||
2786 | if (c->d & GroupDual) { | ||
2787 | g_mod012 = opcode.u.gdual->mod012; | ||
2788 | g_mod3 = opcode.u.gdual->mod3; | ||
2789 | } else | ||
2790 | g_mod012 = g_mod3 = opcode.u.group; | ||
2791 | |||
2792 | c->d &= ~(Group | GroupDual); | ||
2793 | |||
2794 | goffset = (c->modrm >> 3) & 7; | ||
2795 | |||
2796 | if ((c->modrm >> 6) == 3) | ||
2797 | opcode = g_mod3[goffset]; | ||
2798 | else | ||
2799 | opcode = g_mod012[goffset]; | ||
2800 | c->d |= opcode.flags; | ||
2801 | } | ||
2802 | |||
2803 | c->execute = opcode.u.execute; | ||
2804 | |||
2805 | /* Unrecognised? */ | ||
2806 | if (c->d == 0 || (c->d & Undefined)) { | ||
2807 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
2808 | return -1; | ||
2809 | } | ||
2810 | |||
2811 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | ||
2812 | c->op_bytes = 8; | ||
2813 | |||
2814 | if (c->d & Op3264) { | ||
2815 | if (mode == X86EMUL_MODE_PROT64) | ||
2816 | c->op_bytes = 8; | ||
2817 | else | ||
2818 | c->op_bytes = 4; | ||
2819 | } | ||
2820 | |||
2821 | /* ModRM and SIB bytes. */ | ||
2822 | if (c->d & ModRM) { | ||
2823 | rc = decode_modrm(ctxt, ops, &memop); | ||
2824 | if (!c->has_seg_override) | ||
2825 | set_seg_override(c, c->modrm_seg); | ||
2826 | } else if (c->d & MemAbs) | ||
2827 | rc = decode_abs(ctxt, ops, &memop); | ||
2828 | if (rc != X86EMUL_CONTINUE) | ||
2829 | goto done; | ||
2830 | |||
2831 | if (!c->has_seg_override) | ||
2832 | set_seg_override(c, VCPU_SREG_DS); | ||
2833 | |||
2834 | if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) | ||
2835 | memop.addr.mem += seg_override_base(ctxt, ops, c); | ||
2836 | |||
2837 | if (memop.type == OP_MEM && c->ad_bytes != 8) | ||
2838 | memop.addr.mem = (u32)memop.addr.mem; | ||
2839 | |||
2840 | if (memop.type == OP_MEM && c->rip_relative) | ||
2841 | memop.addr.mem += c->eip; | ||
2842 | |||
2843 | /* | ||
2844 | * Decode and fetch the source operand: register, memory | ||
2845 | * or immediate. | ||
2846 | */ | ||
2847 | switch (c->d & SrcMask) { | ||
2848 | case SrcNone: | ||
2849 | break; | ||
2850 | case SrcReg: | ||
2851 | decode_register_operand(&c->src, c, 0); | ||
2852 | break; | ||
2853 | case SrcMem16: | ||
2854 | memop.bytes = 2; | ||
2855 | goto srcmem_common; | ||
2856 | case SrcMem32: | ||
2857 | memop.bytes = 4; | ||
2858 | goto srcmem_common; | ||
2859 | case SrcMem: | ||
2860 | memop.bytes = (c->d & ByteOp) ? 1 : | ||
2861 | c->op_bytes; | ||
2862 | srcmem_common: | ||
2863 | c->src = memop; | ||
2864 | break; | ||
2865 | case SrcImmU16: | ||
2866 | rc = decode_imm(ctxt, &c->src, 2, false); | ||
2867 | break; | ||
2868 | case SrcImm: | ||
2869 | rc = decode_imm(ctxt, &c->src, imm_size(c), true); | ||
2870 | break; | ||
2871 | case SrcImmU: | ||
2872 | rc = decode_imm(ctxt, &c->src, imm_size(c), false); | ||
2873 | break; | ||
2874 | case SrcImmByte: | ||
2875 | rc = decode_imm(ctxt, &c->src, 1, true); | ||
2876 | break; | ||
2877 | case SrcImmUByte: | ||
2878 | rc = decode_imm(ctxt, &c->src, 1, false); | ||
2879 | break; | ||
2880 | case SrcAcc: | ||
2881 | c->src.type = OP_REG; | ||
2882 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
2883 | c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; | ||
2884 | fetch_register_operand(&c->src); | ||
2885 | break; | ||
2886 | case SrcOne: | ||
2887 | c->src.bytes = 1; | ||
2888 | c->src.val = 1; | ||
2889 | break; | ||
2890 | case SrcSI: | ||
2891 | c->src.type = OP_MEM; | ||
2892 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
2893 | c->src.addr.mem = | ||
2894 | register_address(c, seg_override_base(ctxt, ops, c), | ||
2895 | c->regs[VCPU_REGS_RSI]); | ||
2896 | c->src.val = 0; | ||
2897 | break; | ||
2898 | case SrcImmFAddr: | ||
2899 | c->src.type = OP_IMM; | ||
2900 | c->src.addr.mem = c->eip; | ||
2901 | c->src.bytes = c->op_bytes + 2; | ||
2902 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | ||
2903 | break; | ||
2904 | case SrcMemFAddr: | ||
2905 | memop.bytes = c->op_bytes + 2; | ||
2906 | goto srcmem_common; | ||
2907 | break; | ||
2908 | } | ||
2909 | |||
2910 | if (rc != X86EMUL_CONTINUE) | ||
2911 | goto done; | ||
2912 | |||
2913 | /* | ||
2914 | * Decode and fetch the second source operand: register, memory | ||
2915 | * or immediate. | ||
2916 | */ | ||
2917 | switch (c->d & Src2Mask) { | ||
2918 | case Src2None: | ||
2919 | break; | ||
2920 | case Src2CL: | ||
2921 | c->src2.bytes = 1; | ||
2922 | c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; | ||
2923 | break; | ||
2924 | case Src2ImmByte: | ||
2925 | rc = decode_imm(ctxt, &c->src2, 1, true); | ||
2926 | break; | ||
2927 | case Src2One: | ||
2928 | c->src2.bytes = 1; | ||
2929 | c->src2.val = 1; | ||
2930 | break; | ||
2931 | case Src2Imm: | ||
2932 | rc = decode_imm(ctxt, &c->src2, imm_size(c), true); | ||
2933 | break; | ||
2934 | } | ||
2935 | |||
2936 | if (rc != X86EMUL_CONTINUE) | ||
2937 | goto done; | ||
2938 | |||
2939 | /* Decode and fetch the destination operand: register or memory. */ | ||
2940 | switch (c->d & DstMask) { | ||
2941 | case DstReg: | ||
2942 | decode_register_operand(&c->dst, c, | ||
2943 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
2944 | break; | ||
2945 | case DstImmUByte: | ||
2946 | c->dst.type = OP_IMM; | ||
2947 | c->dst.addr.mem = c->eip; | ||
2948 | c->dst.bytes = 1; | ||
2949 | c->dst.val = insn_fetch(u8, 1, c->eip); | ||
2950 | break; | ||
2951 | case DstMem: | ||
2952 | case DstMem64: | ||
2953 | c->dst = memop; | ||
2954 | if ((c->d & DstMask) == DstMem64) | ||
2955 | c->dst.bytes = 8; | ||
2956 | else | ||
2957 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
2958 | if (c->d & BitOp) | ||
2959 | fetch_bit_operand(c); | ||
2960 | c->dst.orig_val = c->dst.val; | ||
2961 | break; | ||
2962 | case DstAcc: | ||
2963 | c->dst.type = OP_REG; | ||
2964 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
2965 | c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; | ||
2966 | fetch_register_operand(&c->dst); | ||
2967 | c->dst.orig_val = c->dst.val; | ||
2968 | break; | ||
2969 | case DstDI: | ||
2970 | c->dst.type = OP_MEM; | ||
2971 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
2972 | c->dst.addr.mem = | ||
2973 | register_address(c, es_base(ctxt, ops), | ||
2974 | c->regs[VCPU_REGS_RDI]); | ||
2975 | c->dst.val = 0; | ||
2976 | break; | ||
2977 | case ImplicitOps: | ||
2978 | /* Special instructions do their own operand decoding. */ | ||
2979 | default: | ||
2980 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
2981 | return 0; | ||
2982 | } | ||
2983 | |||
2984 | done: | ||
2985 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
2986 | } | ||
2987 | |||
2988 | static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) | ||
2989 | { | ||
2990 | struct decode_cache *c = &ctxt->decode; | ||
2991 | |||
2992 | /* The second termination condition only applies for REPE | ||
2993 | * and REPNE. Test if the repeat string operation prefix is | ||
2994 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | ||
2995 | * corresponding termination condition according to: | ||
2996 | * - if REPE/REPZ and ZF = 0 then done | ||
2997 | * - if REPNE/REPNZ and ZF = 1 then done | ||
2998 | */ | ||
2999 | if (((c->b == 0xa6) || (c->b == 0xa7) || | ||
3000 | (c->b == 0xae) || (c->b == 0xaf)) | ||
3001 | && (((c->rep_prefix == REPE_PREFIX) && | ||
3002 | ((ctxt->eflags & EFLG_ZF) == 0)) | ||
3003 | || ((c->rep_prefix == REPNE_PREFIX) && | ||
3004 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) | ||
3005 | return true; | ||
3006 | |||
3007 | return false; | ||
3008 | } | ||
3009 | |||
3010 | int | ||
3011 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | ||
3012 | { | ||
3013 | struct x86_emulate_ops *ops = ctxt->ops; | ||
2561 | u64 msr_data; | 3014 | u64 msr_data; |
2562 | struct decode_cache *c = &ctxt->decode; | 3015 | struct decode_cache *c = &ctxt->decode; |
2563 | int rc = X86EMUL_CONTINUE; | 3016 | int rc = X86EMUL_CONTINUE; |
2564 | int saved_dst_type = c->dst.type; | 3017 | int saved_dst_type = c->dst.type; |
3018 | int irq; /* Used for int 3, int, and into */ | ||
2565 | 3019 | ||
2566 | ctxt->decode.mem_read.pos = 0; | 3020 | ctxt->decode.mem_read.pos = 0; |
2567 | 3021 | ||
@@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2576 | goto done; | 3030 | goto done; |
2577 | } | 3031 | } |
2578 | 3032 | ||
3033 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { | ||
3034 | emulate_ud(ctxt); | ||
3035 | goto done; | ||
3036 | } | ||
3037 | |||
2579 | /* Privileged instruction can be executed only in CPL=0 */ | 3038 | /* Privileged instruction can be executed only in CPL=0 */ |
2580 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { | 3039 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { |
2581 | emulate_gp(ctxt, 0); | 3040 | emulate_gp(ctxt, 0); |
@@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2583 | } | 3042 | } |
2584 | 3043 | ||
2585 | if (c->rep_prefix && (c->d & String)) { | 3044 | if (c->rep_prefix && (c->d & String)) { |
2586 | ctxt->restart = true; | ||
2587 | /* All REP prefixes have the same first termination condition */ | 3045 | /* All REP prefixes have the same first termination condition */ |
2588 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { | 3046 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { |
2589 | string_done: | ||
2590 | ctxt->restart = false; | ||
2591 | ctxt->eip = c->eip; | 3047 | ctxt->eip = c->eip; |
2592 | goto done; | 3048 | goto done; |
2593 | } | 3049 | } |
2594 | /* The second termination condition only applies for REPE | ||
2595 | * and REPNE. Test if the repeat string operation prefix is | ||
2596 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | ||
2597 | * corresponding termination condition according to: | ||
2598 | * - if REPE/REPZ and ZF = 0 then done | ||
2599 | * - if REPNE/REPNZ and ZF = 1 then done | ||
2600 | */ | ||
2601 | if ((c->b == 0xa6) || (c->b == 0xa7) || | ||
2602 | (c->b == 0xae) || (c->b == 0xaf)) { | ||
2603 | if ((c->rep_prefix == REPE_PREFIX) && | ||
2604 | ((ctxt->eflags & EFLG_ZF) == 0)) | ||
2605 | goto string_done; | ||
2606 | if ((c->rep_prefix == REPNE_PREFIX) && | ||
2607 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) | ||
2608 | goto string_done; | ||
2609 | } | ||
2610 | c->eip = ctxt->eip; | ||
2611 | } | 3050 | } |
2612 | 3051 | ||
2613 | if (c->src.type == OP_MEM) { | 3052 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { |
2614 | rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, | 3053 | rc = read_emulated(ctxt, ops, c->src.addr.mem, |
2615 | c->src.valptr, c->src.bytes); | 3054 | c->src.valptr, c->src.bytes); |
2616 | if (rc != X86EMUL_CONTINUE) | 3055 | if (rc != X86EMUL_CONTINUE) |
2617 | goto done; | 3056 | goto done; |
@@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2619 | } | 3058 | } |
2620 | 3059 | ||
2621 | if (c->src2.type == OP_MEM) { | 3060 | if (c->src2.type == OP_MEM) { |
2622 | rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, | 3061 | rc = read_emulated(ctxt, ops, c->src2.addr.mem, |
2623 | &c->src2.val, c->src2.bytes); | 3062 | &c->src2.val, c->src2.bytes); |
2624 | if (rc != X86EMUL_CONTINUE) | 3063 | if (rc != X86EMUL_CONTINUE) |
2625 | goto done; | 3064 | goto done; |
@@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2631 | 3070 | ||
2632 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 3071 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { |
2633 | /* optimisation - avoid slow emulated read if Mov */ | 3072 | /* optimisation - avoid slow emulated read if Mov */ |
2634 | rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, | 3073 | rc = read_emulated(ctxt, ops, c->dst.addr.mem, |
2635 | &c->dst.val, c->dst.bytes); | 3074 | &c->dst.val, c->dst.bytes); |
2636 | if (rc != X86EMUL_CONTINUE) | 3075 | if (rc != X86EMUL_CONTINUE) |
2637 | goto done; | 3076 | goto done; |
@@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
2640 | 3079 | ||
2641 | special_insn: | 3080 | special_insn: |
2642 | 3081 | ||
3082 | if (c->execute) { | ||
3083 | rc = c->execute(ctxt); | ||
3084 | if (rc != X86EMUL_CONTINUE) | ||
3085 | goto done; | ||
3086 | goto writeback; | ||
3087 | } | ||
3088 | |||
2643 | if (c->twobyte) | 3089 | if (c->twobyte) |
2644 | goto twobyte_insn; | 3090 | goto twobyte_insn; |
2645 | 3091 | ||
@@ -2653,8 +3099,6 @@ special_insn: | |||
2653 | break; | 3099 | break; |
2654 | case 0x07: /* pop es */ | 3100 | case 0x07: /* pop es */ |
2655 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); | 3101 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); |
2656 | if (rc != X86EMUL_CONTINUE) | ||
2657 | goto done; | ||
2658 | break; | 3102 | break; |
2659 | case 0x08 ... 0x0d: | 3103 | case 0x08 ... 0x0d: |
2660 | or: /* or */ | 3104 | or: /* or */ |
@@ -2672,8 +3116,6 @@ special_insn: | |||
2672 | break; | 3116 | break; |
2673 | case 0x17: /* pop ss */ | 3117 | case 0x17: /* pop ss */ |
2674 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); | 3118 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); |
2675 | if (rc != X86EMUL_CONTINUE) | ||
2676 | goto done; | ||
2677 | break; | 3119 | break; |
2678 | case 0x18 ... 0x1d: | 3120 | case 0x18 ... 0x1d: |
2679 | sbb: /* sbb */ | 3121 | sbb: /* sbb */ |
@@ -2684,8 +3126,6 @@ special_insn: | |||
2684 | break; | 3126 | break; |
2685 | case 0x1f: /* pop ds */ | 3127 | case 0x1f: /* pop ds */ |
2686 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); | 3128 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); |
2687 | if (rc != X86EMUL_CONTINUE) | ||
2688 | goto done; | ||
2689 | break; | 3129 | break; |
2690 | case 0x20 ... 0x25: | 3130 | case 0x20 ... 0x25: |
2691 | and: /* and */ | 3131 | and: /* and */ |
@@ -2709,58 +3149,29 @@ special_insn: | |||
2709 | case 0x48 ... 0x4f: /* dec r16/r32 */ | 3149 | case 0x48 ... 0x4f: /* dec r16/r32 */ |
2710 | emulate_1op("dec", c->dst, ctxt->eflags); | 3150 | emulate_1op("dec", c->dst, ctxt->eflags); |
2711 | break; | 3151 | break; |
2712 | case 0x50 ... 0x57: /* push reg */ | ||
2713 | emulate_push(ctxt, ops); | ||
2714 | break; | ||
2715 | case 0x58 ... 0x5f: /* pop reg */ | 3152 | case 0x58 ... 0x5f: /* pop reg */ |
2716 | pop_instruction: | 3153 | pop_instruction: |
2717 | rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); | 3154 | rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); |
2718 | if (rc != X86EMUL_CONTINUE) | ||
2719 | goto done; | ||
2720 | break; | 3155 | break; |
2721 | case 0x60: /* pusha */ | 3156 | case 0x60: /* pusha */ |
2722 | rc = emulate_pusha(ctxt, ops); | 3157 | rc = emulate_pusha(ctxt, ops); |
2723 | if (rc != X86EMUL_CONTINUE) | ||
2724 | goto done; | ||
2725 | break; | 3158 | break; |
2726 | case 0x61: /* popa */ | 3159 | case 0x61: /* popa */ |
2727 | rc = emulate_popa(ctxt, ops); | 3160 | rc = emulate_popa(ctxt, ops); |
2728 | if (rc != X86EMUL_CONTINUE) | ||
2729 | goto done; | ||
2730 | break; | 3161 | break; |
2731 | case 0x63: /* movsxd */ | 3162 | case 0x63: /* movsxd */ |
2732 | if (ctxt->mode != X86EMUL_MODE_PROT64) | 3163 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
2733 | goto cannot_emulate; | 3164 | goto cannot_emulate; |
2734 | c->dst.val = (s32) c->src.val; | 3165 | c->dst.val = (s32) c->src.val; |
2735 | break; | 3166 | break; |
2736 | case 0x68: /* push imm */ | ||
2737 | case 0x6a: /* push imm8 */ | ||
2738 | emulate_push(ctxt, ops); | ||
2739 | break; | ||
2740 | case 0x6c: /* insb */ | 3167 | case 0x6c: /* insb */ |
2741 | case 0x6d: /* insw/insd */ | 3168 | case 0x6d: /* insw/insd */ |
2742 | c->dst.bytes = min(c->dst.bytes, 4u); | 3169 | c->src.val = c->regs[VCPU_REGS_RDX]; |
2743 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], | 3170 | goto do_io_in; |
2744 | c->dst.bytes)) { | ||
2745 | emulate_gp(ctxt, 0); | ||
2746 | goto done; | ||
2747 | } | ||
2748 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, | ||
2749 | c->regs[VCPU_REGS_RDX], &c->dst.val)) | ||
2750 | goto done; /* IO is needed, skip writeback */ | ||
2751 | break; | ||
2752 | case 0x6e: /* outsb */ | 3171 | case 0x6e: /* outsb */ |
2753 | case 0x6f: /* outsw/outsd */ | 3172 | case 0x6f: /* outsw/outsd */ |
2754 | c->src.bytes = min(c->src.bytes, 4u); | 3173 | c->dst.val = c->regs[VCPU_REGS_RDX]; |
2755 | if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], | 3174 | goto do_io_out; |
2756 | c->src.bytes)) { | ||
2757 | emulate_gp(ctxt, 0); | ||
2758 | goto done; | ||
2759 | } | ||
2760 | ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], | ||
2761 | &c->src.val, 1, ctxt->vcpu); | ||
2762 | |||
2763 | c->dst.type = OP_NONE; /* nothing to writeback */ | ||
2764 | break; | 3175 | break; |
2765 | case 0x70 ... 0x7f: /* jcc (short) */ | 3176 | case 0x70 ... 0x7f: /* jcc (short) */ |
2766 | if (test_cc(c->b, ctxt->eflags)) | 3177 | if (test_cc(c->b, ctxt->eflags)) |
@@ -2793,29 +3204,15 @@ special_insn: | |||
2793 | case 0x86 ... 0x87: /* xchg */ | 3204 | case 0x86 ... 0x87: /* xchg */ |
2794 | xchg: | 3205 | xchg: |
2795 | /* Write back the register source. */ | 3206 | /* Write back the register source. */ |
2796 | switch (c->dst.bytes) { | 3207 | c->src.val = c->dst.val; |
2797 | case 1: | 3208 | write_register_operand(&c->src); |
2798 | *(u8 *) c->src.ptr = (u8) c->dst.val; | ||
2799 | break; | ||
2800 | case 2: | ||
2801 | *(u16 *) c->src.ptr = (u16) c->dst.val; | ||
2802 | break; | ||
2803 | case 4: | ||
2804 | *c->src.ptr = (u32) c->dst.val; | ||
2805 | break; /* 64b reg: zero-extend */ | ||
2806 | case 8: | ||
2807 | *c->src.ptr = c->dst.val; | ||
2808 | break; | ||
2809 | } | ||
2810 | /* | 3209 | /* |
2811 | * Write back the memory destination with implicit LOCK | 3210 | * Write back the memory destination with implicit LOCK |
2812 | * prefix. | 3211 | * prefix. |
2813 | */ | 3212 | */ |
2814 | c->dst.val = c->src.val; | 3213 | c->dst.val = c->src.orig_val; |
2815 | c->lock_prefix = 1; | 3214 | c->lock_prefix = 1; |
2816 | break; | 3215 | break; |
2817 | case 0x88 ... 0x8b: /* mov */ | ||
2818 | goto mov; | ||
2819 | case 0x8c: /* mov r/m, sreg */ | 3216 | case 0x8c: /* mov r/m, sreg */ |
2820 | if (c->modrm_reg > VCPU_SREG_GS) { | 3217 | if (c->modrm_reg > VCPU_SREG_GS) { |
2821 | emulate_ud(ctxt); | 3218 | emulate_ud(ctxt); |
@@ -2824,7 +3221,7 @@ special_insn: | |||
2824 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); | 3221 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); |
2825 | break; | 3222 | break; |
2826 | case 0x8d: /* lea r16/r32, m */ | 3223 | case 0x8d: /* lea r16/r32, m */ |
2827 | c->dst.val = c->modrm_ea; | 3224 | c->dst.val = c->src.addr.mem; |
2828 | break; | 3225 | break; |
2829 | case 0x8e: { /* mov seg, r/m16 */ | 3226 | case 0x8e: { /* mov seg, r/m16 */ |
2830 | uint16_t sel; | 3227 | uint16_t sel; |
@@ -2847,76 +3244,87 @@ special_insn: | |||
2847 | } | 3244 | } |
2848 | case 0x8f: /* pop (sole member of Grp1a) */ | 3245 | case 0x8f: /* pop (sole member of Grp1a) */ |
2849 | rc = emulate_grp1a(ctxt, ops); | 3246 | rc = emulate_grp1a(ctxt, ops); |
2850 | if (rc != X86EMUL_CONTINUE) | ||
2851 | goto done; | ||
2852 | break; | 3247 | break; |
2853 | case 0x90: /* nop / xchg r8,rax */ | 3248 | case 0x90 ... 0x97: /* nop / xchg reg, rax */ |
2854 | if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { | 3249 | if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) |
2855 | c->dst.type = OP_NONE; /* nop */ | ||
2856 | break; | 3250 | break; |
2857 | } | ||
2858 | case 0x91 ... 0x97: /* xchg reg,rax */ | ||
2859 | c->src.type = OP_REG; | ||
2860 | c->src.bytes = c->op_bytes; | ||
2861 | c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; | ||
2862 | c->src.val = *(c->src.ptr); | ||
2863 | goto xchg; | 3251 | goto xchg; |
3252 | case 0x98: /* cbw/cwde/cdqe */ | ||
3253 | switch (c->op_bytes) { | ||
3254 | case 2: c->dst.val = (s8)c->dst.val; break; | ||
3255 | case 4: c->dst.val = (s16)c->dst.val; break; | ||
3256 | case 8: c->dst.val = (s32)c->dst.val; break; | ||
3257 | } | ||
3258 | break; | ||
2864 | case 0x9c: /* pushf */ | 3259 | case 0x9c: /* pushf */ |
2865 | c->src.val = (unsigned long) ctxt->eflags; | 3260 | c->src.val = (unsigned long) ctxt->eflags; |
2866 | emulate_push(ctxt, ops); | 3261 | emulate_push(ctxt, ops); |
2867 | break; | 3262 | break; |
2868 | case 0x9d: /* popf */ | 3263 | case 0x9d: /* popf */ |
2869 | c->dst.type = OP_REG; | 3264 | c->dst.type = OP_REG; |
2870 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | 3265 | c->dst.addr.reg = &ctxt->eflags; |
2871 | c->dst.bytes = c->op_bytes; | 3266 | c->dst.bytes = c->op_bytes; |
2872 | rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); | 3267 | rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); |
2873 | if (rc != X86EMUL_CONTINUE) | ||
2874 | goto done; | ||
2875 | break; | 3268 | break; |
2876 | case 0xa0 ... 0xa3: /* mov */ | ||
2877 | case 0xa4 ... 0xa5: /* movs */ | ||
2878 | goto mov; | ||
2879 | case 0xa6 ... 0xa7: /* cmps */ | 3269 | case 0xa6 ... 0xa7: /* cmps */ |
2880 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3270 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2881 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | 3271 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); |
2882 | goto cmp; | 3272 | goto cmp; |
2883 | case 0xa8 ... 0xa9: /* test ax, imm */ | 3273 | case 0xa8 ... 0xa9: /* test ax, imm */ |
2884 | goto test; | 3274 | goto test; |
2885 | case 0xaa ... 0xab: /* stos */ | ||
2886 | c->dst.val = c->regs[VCPU_REGS_RAX]; | ||
2887 | break; | ||
2888 | case 0xac ... 0xad: /* lods */ | ||
2889 | goto mov; | ||
2890 | case 0xae ... 0xaf: /* scas */ | 3275 | case 0xae ... 0xaf: /* scas */ |
2891 | DPRINTF("Urk! I don't handle SCAS.\n"); | 3276 | goto cmp; |
2892 | goto cannot_emulate; | ||
2893 | case 0xb0 ... 0xbf: /* mov r, imm */ | ||
2894 | goto mov; | ||
2895 | case 0xc0 ... 0xc1: | 3277 | case 0xc0 ... 0xc1: |
2896 | emulate_grp2(ctxt); | 3278 | emulate_grp2(ctxt); |
2897 | break; | 3279 | break; |
2898 | case 0xc3: /* ret */ | 3280 | case 0xc3: /* ret */ |
2899 | c->dst.type = OP_REG; | 3281 | c->dst.type = OP_REG; |
2900 | c->dst.ptr = &c->eip; | 3282 | c->dst.addr.reg = &c->eip; |
2901 | c->dst.bytes = c->op_bytes; | 3283 | c->dst.bytes = c->op_bytes; |
2902 | goto pop_instruction; | 3284 | goto pop_instruction; |
2903 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | 3285 | case 0xc4: /* les */ |
2904 | mov: | 3286 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); |
2905 | c->dst.val = c->src.val; | 3287 | break; |
3288 | case 0xc5: /* lds */ | ||
3289 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); | ||
2906 | break; | 3290 | break; |
2907 | case 0xcb: /* ret far */ | 3291 | case 0xcb: /* ret far */ |
2908 | rc = emulate_ret_far(ctxt, ops); | 3292 | rc = emulate_ret_far(ctxt, ops); |
2909 | if (rc != X86EMUL_CONTINUE) | 3293 | break; |
2910 | goto done; | 3294 | case 0xcc: /* int3 */ |
3295 | irq = 3; | ||
3296 | goto do_interrupt; | ||
3297 | case 0xcd: /* int n */ | ||
3298 | irq = c->src.val; | ||
3299 | do_interrupt: | ||
3300 | rc = emulate_int(ctxt, ops, irq); | ||
3301 | break; | ||
3302 | case 0xce: /* into */ | ||
3303 | if (ctxt->eflags & EFLG_OF) { | ||
3304 | irq = 4; | ||
3305 | goto do_interrupt; | ||
3306 | } | ||
3307 | break; | ||
3308 | case 0xcf: /* iret */ | ||
3309 | rc = emulate_iret(ctxt, ops); | ||
2911 | break; | 3310 | break; |
2912 | case 0xd0 ... 0xd1: /* Grp2 */ | 3311 | case 0xd0 ... 0xd1: /* Grp2 */ |
2913 | c->src.val = 1; | ||
2914 | emulate_grp2(ctxt); | 3312 | emulate_grp2(ctxt); |
2915 | break; | 3313 | break; |
2916 | case 0xd2 ... 0xd3: /* Grp2 */ | 3314 | case 0xd2 ... 0xd3: /* Grp2 */ |
2917 | c->src.val = c->regs[VCPU_REGS_RCX]; | 3315 | c->src.val = c->regs[VCPU_REGS_RCX]; |
2918 | emulate_grp2(ctxt); | 3316 | emulate_grp2(ctxt); |
2919 | break; | 3317 | break; |
3318 | case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ | ||
3319 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | ||
3320 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && | ||
3321 | (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) | ||
3322 | jmp_rel(c, c->src.val); | ||
3323 | break; | ||
3324 | case 0xe3: /* jcxz/jecxz/jrcxz */ | ||
3325 | if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) | ||
3326 | jmp_rel(c, c->src.val); | ||
3327 | break; | ||
2920 | case 0xe4: /* inb */ | 3328 | case 0xe4: /* inb */ |
2921 | case 0xe5: /* in */ | 3329 | case 0xe5: /* in */ |
2922 | goto do_io_in; | 3330 | goto do_io_in; |
@@ -2964,15 +3372,16 @@ special_insn: | |||
2964 | break; | 3372 | break; |
2965 | case 0xee: /* out dx,al */ | 3373 | case 0xee: /* out dx,al */ |
2966 | case 0xef: /* out dx,(e/r)ax */ | 3374 | case 0xef: /* out dx,(e/r)ax */ |
2967 | c->src.val = c->regs[VCPU_REGS_RDX]; | 3375 | c->dst.val = c->regs[VCPU_REGS_RDX]; |
2968 | do_io_out: | 3376 | do_io_out: |
2969 | c->dst.bytes = min(c->dst.bytes, 4u); | 3377 | c->src.bytes = min(c->src.bytes, 4u); |
2970 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | 3378 | if (!emulator_io_permited(ctxt, ops, c->dst.val, |
3379 | c->src.bytes)) { | ||
2971 | emulate_gp(ctxt, 0); | 3380 | emulate_gp(ctxt, 0); |
2972 | goto done; | 3381 | goto done; |
2973 | } | 3382 | } |
2974 | ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, | 3383 | ops->pio_out_emulated(c->src.bytes, c->dst.val, |
2975 | ctxt->vcpu); | 3384 | &c->src.val, 1, ctxt->vcpu); |
2976 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3385 | c->dst.type = OP_NONE; /* Disable writeback. */ |
2977 | break; | 3386 | break; |
2978 | case 0xf4: /* hlt */ | 3387 | case 0xf4: /* hlt */ |
@@ -2981,24 +3390,22 @@ special_insn: | |||
2981 | case 0xf5: /* cmc */ | 3390 | case 0xf5: /* cmc */ |
2982 | /* complement carry flag from eflags reg */ | 3391 | /* complement carry flag from eflags reg */ |
2983 | ctxt->eflags ^= EFLG_CF; | 3392 | ctxt->eflags ^= EFLG_CF; |
2984 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
2985 | break; | 3393 | break; |
2986 | case 0xf6 ... 0xf7: /* Grp3 */ | 3394 | case 0xf6 ... 0xf7: /* Grp3 */ |
2987 | if (!emulate_grp3(ctxt, ops)) | 3395 | rc = emulate_grp3(ctxt, ops); |
2988 | goto cannot_emulate; | ||
2989 | break; | 3396 | break; |
2990 | case 0xf8: /* clc */ | 3397 | case 0xf8: /* clc */ |
2991 | ctxt->eflags &= ~EFLG_CF; | 3398 | ctxt->eflags &= ~EFLG_CF; |
2992 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3399 | break; |
3400 | case 0xf9: /* stc */ | ||
3401 | ctxt->eflags |= EFLG_CF; | ||
2993 | break; | 3402 | break; |
2994 | case 0xfa: /* cli */ | 3403 | case 0xfa: /* cli */ |
2995 | if (emulator_bad_iopl(ctxt, ops)) { | 3404 | if (emulator_bad_iopl(ctxt, ops)) { |
2996 | emulate_gp(ctxt, 0); | 3405 | emulate_gp(ctxt, 0); |
2997 | goto done; | 3406 | goto done; |
2998 | } else { | 3407 | } else |
2999 | ctxt->eflags &= ~X86_EFLAGS_IF; | 3408 | ctxt->eflags &= ~X86_EFLAGS_IF; |
3000 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3001 | } | ||
3002 | break; | 3409 | break; |
3003 | case 0xfb: /* sti */ | 3410 | case 0xfb: /* sti */ |
3004 | if (emulator_bad_iopl(ctxt, ops)) { | 3411 | if (emulator_bad_iopl(ctxt, ops)) { |
@@ -3007,29 +3414,29 @@ special_insn: | |||
3007 | } else { | 3414 | } else { |
3008 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | 3415 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; |
3009 | ctxt->eflags |= X86_EFLAGS_IF; | 3416 | ctxt->eflags |= X86_EFLAGS_IF; |
3010 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3011 | } | 3417 | } |
3012 | break; | 3418 | break; |
3013 | case 0xfc: /* cld */ | 3419 | case 0xfc: /* cld */ |
3014 | ctxt->eflags &= ~EFLG_DF; | 3420 | ctxt->eflags &= ~EFLG_DF; |
3015 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3016 | break; | 3421 | break; |
3017 | case 0xfd: /* std */ | 3422 | case 0xfd: /* std */ |
3018 | ctxt->eflags |= EFLG_DF; | 3423 | ctxt->eflags |= EFLG_DF; |
3019 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3020 | break; | 3424 | break; |
3021 | case 0xfe: /* Grp4 */ | 3425 | case 0xfe: /* Grp4 */ |
3022 | grp45: | 3426 | grp45: |
3023 | rc = emulate_grp45(ctxt, ops); | 3427 | rc = emulate_grp45(ctxt, ops); |
3024 | if (rc != X86EMUL_CONTINUE) | ||
3025 | goto done; | ||
3026 | break; | 3428 | break; |
3027 | case 0xff: /* Grp5 */ | 3429 | case 0xff: /* Grp5 */ |
3028 | if (c->modrm_reg == 5) | 3430 | if (c->modrm_reg == 5) |
3029 | goto jump_far; | 3431 | goto jump_far; |
3030 | goto grp45; | 3432 | goto grp45; |
3433 | default: | ||
3434 | goto cannot_emulate; | ||
3031 | } | 3435 | } |
3032 | 3436 | ||
3437 | if (rc != X86EMUL_CONTINUE) | ||
3438 | goto done; | ||
3439 | |||
3033 | writeback: | 3440 | writeback: |
3034 | rc = writeback(ctxt, ops); | 3441 | rc = writeback(ctxt, ops); |
3035 | if (rc != X86EMUL_CONTINUE) | 3442 | if (rc != X86EMUL_CONTINUE) |
@@ -3050,25 +3457,32 @@ writeback: | |||
3050 | &c->dst); | 3457 | &c->dst); |
3051 | 3458 | ||
3052 | if (c->rep_prefix && (c->d & String)) { | 3459 | if (c->rep_prefix && (c->d & String)) { |
3053 | struct read_cache *rc = &ctxt->decode.io_read; | 3460 | struct read_cache *r = &ctxt->decode.io_read; |
3054 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); | 3461 | register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); |
3055 | /* | 3462 | |
3056 | * Re-enter guest when pio read ahead buffer is empty or, | 3463 | if (!string_insn_completed(ctxt)) { |
3057 | * if it is not used, after each 1024 iteration. | 3464 | /* |
3058 | */ | 3465 | * Re-enter guest when pio read ahead buffer is empty |
3059 | if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || | 3466 | * or, if it is not used, after each 1024 iteration. |
3060 | (rc->end != 0 && rc->end == rc->pos)) | 3467 | */ |
3061 | ctxt->restart = false; | 3468 | if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && |
3469 | (r->end == 0 || r->end != r->pos)) { | ||
3470 | /* | ||
3471 | * Reset read cache. Usually happens before | ||
3472 | * decode, but since instruction is restarted | ||
3473 | * we have to do it here. | ||
3474 | */ | ||
3475 | ctxt->decode.mem_read.end = 0; | ||
3476 | return EMULATION_RESTART; | ||
3477 | } | ||
3478 | goto done; /* skip rip writeback */ | ||
3479 | } | ||
3062 | } | 3480 | } |
3063 | /* | 3481 | |
3064 | * reset read cache here in case string instruction is restared | ||
3065 | * without decoding | ||
3066 | */ | ||
3067 | ctxt->decode.mem_read.end = 0; | ||
3068 | ctxt->eip = c->eip; | 3482 | ctxt->eip = c->eip; |
3069 | 3483 | ||
3070 | done: | 3484 | done: |
3071 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 3485 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
3072 | 3486 | ||
3073 | twobyte_insn: | 3487 | twobyte_insn: |
3074 | switch (c->b) { | 3488 | switch (c->b) { |
@@ -3091,7 +3505,7 @@ twobyte_insn: | |||
3091 | c->dst.type = OP_NONE; | 3505 | c->dst.type = OP_NONE; |
3092 | break; | 3506 | break; |
3093 | case 2: /* lgdt */ | 3507 | case 2: /* lgdt */ |
3094 | rc = read_descriptor(ctxt, ops, c->src.ptr, | 3508 | rc = read_descriptor(ctxt, ops, c->src.addr.mem, |
3095 | &size, &address, c->op_bytes); | 3509 | &size, &address, c->op_bytes); |
3096 | if (rc != X86EMUL_CONTINUE) | 3510 | if (rc != X86EMUL_CONTINUE) |
3097 | goto done; | 3511 | goto done; |
@@ -3104,14 +3518,12 @@ twobyte_insn: | |||
3104 | switch (c->modrm_rm) { | 3518 | switch (c->modrm_rm) { |
3105 | case 1: | 3519 | case 1: |
3106 | rc = kvm_fix_hypercall(ctxt->vcpu); | 3520 | rc = kvm_fix_hypercall(ctxt->vcpu); |
3107 | if (rc != X86EMUL_CONTINUE) | ||
3108 | goto done; | ||
3109 | break; | 3521 | break; |
3110 | default: | 3522 | default: |
3111 | goto cannot_emulate; | 3523 | goto cannot_emulate; |
3112 | } | 3524 | } |
3113 | } else { | 3525 | } else { |
3114 | rc = read_descriptor(ctxt, ops, c->src.ptr, | 3526 | rc = read_descriptor(ctxt, ops, c->src.addr.mem, |
3115 | &size, &address, | 3527 | &size, &address, |
3116 | c->op_bytes); | 3528 | c->op_bytes); |
3117 | if (rc != X86EMUL_CONTINUE) | 3529 | if (rc != X86EMUL_CONTINUE) |
@@ -3126,7 +3538,7 @@ twobyte_insn: | |||
3126 | c->dst.val = ops->get_cr(0, ctxt->vcpu); | 3538 | c->dst.val = ops->get_cr(0, ctxt->vcpu); |
3127 | break; | 3539 | break; |
3128 | case 6: /* lmsw */ | 3540 | case 6: /* lmsw */ |
3129 | ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | | 3541 | ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | |
3130 | (c->src.val & 0x0f), ctxt->vcpu); | 3542 | (c->src.val & 0x0f), ctxt->vcpu); |
3131 | c->dst.type = OP_NONE; | 3543 | c->dst.type = OP_NONE; |
3132 | break; | 3544 | break; |
@@ -3134,7 +3546,7 @@ twobyte_insn: | |||
3134 | emulate_ud(ctxt); | 3546 | emulate_ud(ctxt); |
3135 | goto done; | 3547 | goto done; |
3136 | case 7: /* invlpg*/ | 3548 | case 7: /* invlpg*/ |
3137 | emulate_invlpg(ctxt->vcpu, c->modrm_ea); | 3549 | emulate_invlpg(ctxt->vcpu, c->src.addr.mem); |
3138 | /* Disable writeback. */ | 3550 | /* Disable writeback. */ |
3139 | c->dst.type = OP_NONE; | 3551 | c->dst.type = OP_NONE; |
3140 | break; | 3552 | break; |
@@ -3144,23 +3556,16 @@ twobyte_insn: | |||
3144 | break; | 3556 | break; |
3145 | case 0x05: /* syscall */ | 3557 | case 0x05: /* syscall */ |
3146 | rc = emulate_syscall(ctxt, ops); | 3558 | rc = emulate_syscall(ctxt, ops); |
3147 | if (rc != X86EMUL_CONTINUE) | ||
3148 | goto done; | ||
3149 | else | ||
3150 | goto writeback; | ||
3151 | break; | 3559 | break; |
3152 | case 0x06: | 3560 | case 0x06: |
3153 | emulate_clts(ctxt->vcpu); | 3561 | emulate_clts(ctxt->vcpu); |
3154 | c->dst.type = OP_NONE; | ||
3155 | break; | 3562 | break; |
3156 | case 0x09: /* wbinvd */ | 3563 | case 0x09: /* wbinvd */ |
3157 | kvm_emulate_wbinvd(ctxt->vcpu); | 3564 | kvm_emulate_wbinvd(ctxt->vcpu); |
3158 | c->dst.type = OP_NONE; | ||
3159 | break; | 3565 | break; |
3160 | case 0x08: /* invd */ | 3566 | case 0x08: /* invd */ |
3161 | case 0x0d: /* GrpP (prefetch) */ | 3567 | case 0x0d: /* GrpP (prefetch) */ |
3162 | case 0x18: /* Grp16 (prefetch/nop) */ | 3568 | case 0x18: /* Grp16 (prefetch/nop) */ |
3163 | c->dst.type = OP_NONE; | ||
3164 | break; | 3569 | break; |
3165 | case 0x20: /* mov cr, reg */ | 3570 | case 0x20: /* mov cr, reg */ |
3166 | switch (c->modrm_reg) { | 3571 | switch (c->modrm_reg) { |
@@ -3170,8 +3575,7 @@ twobyte_insn: | |||
3170 | emulate_ud(ctxt); | 3575 | emulate_ud(ctxt); |
3171 | goto done; | 3576 | goto done; |
3172 | } | 3577 | } |
3173 | c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); | 3578 | c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); |
3174 | c->dst.type = OP_NONE; /* no writeback */ | ||
3175 | break; | 3579 | break; |
3176 | case 0x21: /* mov from dr to reg */ | 3580 | case 0x21: /* mov from dr to reg */ |
3177 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3581 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
@@ -3179,11 +3583,10 @@ twobyte_insn: | |||
3179 | emulate_ud(ctxt); | 3583 | emulate_ud(ctxt); |
3180 | goto done; | 3584 | goto done; |
3181 | } | 3585 | } |
3182 | ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); | 3586 | ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); |
3183 | c->dst.type = OP_NONE; /* no writeback */ | ||
3184 | break; | 3587 | break; |
3185 | case 0x22: /* mov reg, cr */ | 3588 | case 0x22: /* mov reg, cr */ |
3186 | if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { | 3589 | if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { |
3187 | emulate_gp(ctxt, 0); | 3590 | emulate_gp(ctxt, 0); |
3188 | goto done; | 3591 | goto done; |
3189 | } | 3592 | } |
@@ -3196,7 +3599,7 @@ twobyte_insn: | |||
3196 | goto done; | 3599 | goto done; |
3197 | } | 3600 | } |
3198 | 3601 | ||
3199 | if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & | 3602 | if (ops->set_dr(c->modrm_reg, c->src.val & |
3200 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? | 3603 | ((ctxt->mode == X86EMUL_MODE_PROT64) ? |
3201 | ~0ULL : ~0U), ctxt->vcpu) < 0) { | 3604 | ~0ULL : ~0U), ctxt->vcpu) < 0) { |
3202 | /* #UD condition is already handled by the code above */ | 3605 | /* #UD condition is already handled by the code above */ |
@@ -3215,7 +3618,6 @@ twobyte_insn: | |||
3215 | goto done; | 3618 | goto done; |
3216 | } | 3619 | } |
3217 | rc = X86EMUL_CONTINUE; | 3620 | rc = X86EMUL_CONTINUE; |
3218 | c->dst.type = OP_NONE; | ||
3219 | break; | 3621 | break; |
3220 | case 0x32: | 3622 | case 0x32: |
3221 | /* rdmsr */ | 3623 | /* rdmsr */ |
@@ -3227,21 +3629,12 @@ twobyte_insn: | |||
3227 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 3629 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; |
3228 | } | 3630 | } |
3229 | rc = X86EMUL_CONTINUE; | 3631 | rc = X86EMUL_CONTINUE; |
3230 | c->dst.type = OP_NONE; | ||
3231 | break; | 3632 | break; |
3232 | case 0x34: /* sysenter */ | 3633 | case 0x34: /* sysenter */ |
3233 | rc = emulate_sysenter(ctxt, ops); | 3634 | rc = emulate_sysenter(ctxt, ops); |
3234 | if (rc != X86EMUL_CONTINUE) | ||
3235 | goto done; | ||
3236 | else | ||
3237 | goto writeback; | ||
3238 | break; | 3635 | break; |
3239 | case 0x35: /* sysexit */ | 3636 | case 0x35: /* sysexit */ |
3240 | rc = emulate_sysexit(ctxt, ops); | 3637 | rc = emulate_sysexit(ctxt, ops); |
3241 | if (rc != X86EMUL_CONTINUE) | ||
3242 | goto done; | ||
3243 | else | ||
3244 | goto writeback; | ||
3245 | break; | 3638 | break; |
3246 | case 0x40 ... 0x4f: /* cmov */ | 3639 | case 0x40 ... 0x4f: /* cmov */ |
3247 | c->dst.val = c->dst.orig_val = c->src.val; | 3640 | c->dst.val = c->dst.orig_val = c->src.val; |
@@ -3251,15 +3644,15 @@ twobyte_insn: | |||
3251 | case 0x80 ... 0x8f: /* jnz rel, etc*/ | 3644 | case 0x80 ... 0x8f: /* jnz rel, etc*/ |
3252 | if (test_cc(c->b, ctxt->eflags)) | 3645 | if (test_cc(c->b, ctxt->eflags)) |
3253 | jmp_rel(c, c->src.val); | 3646 | jmp_rel(c, c->src.val); |
3254 | c->dst.type = OP_NONE; | 3647 | break; |
3648 | case 0x90 ... 0x9f: /* setcc r/m8 */ | ||
3649 | c->dst.val = test_cc(c->b, ctxt->eflags); | ||
3255 | break; | 3650 | break; |
3256 | case 0xa0: /* push fs */ | 3651 | case 0xa0: /* push fs */ |
3257 | emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); | 3652 | emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); |
3258 | break; | 3653 | break; |
3259 | case 0xa1: /* pop fs */ | 3654 | case 0xa1: /* pop fs */ |
3260 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); | 3655 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); |
3261 | if (rc != X86EMUL_CONTINUE) | ||
3262 | goto done; | ||
3263 | break; | 3656 | break; |
3264 | case 0xa3: | 3657 | case 0xa3: |
3265 | bt: /* bt */ | 3658 | bt: /* bt */ |
@@ -3277,13 +3670,9 @@ twobyte_insn: | |||
3277 | break; | 3670 | break; |
3278 | case 0xa9: /* pop gs */ | 3671 | case 0xa9: /* pop gs */ |
3279 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); | 3672 | rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); |
3280 | if (rc != X86EMUL_CONTINUE) | ||
3281 | goto done; | ||
3282 | break; | 3673 | break; |
3283 | case 0xab: | 3674 | case 0xab: |
3284 | bts: /* bts */ | 3675 | bts: /* bts */ |
3285 | /* only subword offset */ | ||
3286 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
3287 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | 3676 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); |
3288 | break; | 3677 | break; |
3289 | case 0xac: /* shrd imm8, r, r/m */ | 3678 | case 0xac: /* shrd imm8, r, r/m */ |
@@ -3306,15 +3695,22 @@ twobyte_insn: | |||
3306 | } else { | 3695 | } else { |
3307 | /* Failure: write the value we saw to EAX. */ | 3696 | /* Failure: write the value we saw to EAX. */ |
3308 | c->dst.type = OP_REG; | 3697 | c->dst.type = OP_REG; |
3309 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | 3698 | c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; |
3310 | } | 3699 | } |
3311 | break; | 3700 | break; |
3701 | case 0xb2: /* lss */ | ||
3702 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); | ||
3703 | break; | ||
3312 | case 0xb3: | 3704 | case 0xb3: |
3313 | btr: /* btr */ | 3705 | btr: /* btr */ |
3314 | /* only subword offset */ | ||
3315 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
3316 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | 3706 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); |
3317 | break; | 3707 | break; |
3708 | case 0xb4: /* lfs */ | ||
3709 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); | ||
3710 | break; | ||
3711 | case 0xb5: /* lgs */ | ||
3712 | rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); | ||
3713 | break; | ||
3318 | case 0xb6 ... 0xb7: /* movzx */ | 3714 | case 0xb6 ... 0xb7: /* movzx */ |
3319 | c->dst.bytes = c->op_bytes; | 3715 | c->dst.bytes = c->op_bytes; |
3320 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | 3716 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val |
@@ -3334,15 +3730,43 @@ twobyte_insn: | |||
3334 | break; | 3730 | break; |
3335 | case 0xbb: | 3731 | case 0xbb: |
3336 | btc: /* btc */ | 3732 | btc: /* btc */ |
3337 | /* only subword offset */ | ||
3338 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
3339 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | 3733 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); |
3340 | break; | 3734 | break; |
3735 | case 0xbc: { /* bsf */ | ||
3736 | u8 zf; | ||
3737 | __asm__ ("bsf %2, %0; setz %1" | ||
3738 | : "=r"(c->dst.val), "=q"(zf) | ||
3739 | : "r"(c->src.val)); | ||
3740 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
3741 | if (zf) { | ||
3742 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
3743 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3744 | } | ||
3745 | break; | ||
3746 | } | ||
3747 | case 0xbd: { /* bsr */ | ||
3748 | u8 zf; | ||
3749 | __asm__ ("bsr %2, %0; setz %1" | ||
3750 | : "=r"(c->dst.val), "=q"(zf) | ||
3751 | : "r"(c->src.val)); | ||
3752 | ctxt->eflags &= ~X86_EFLAGS_ZF; | ||
3753 | if (zf) { | ||
3754 | ctxt->eflags |= X86_EFLAGS_ZF; | ||
3755 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
3756 | } | ||
3757 | break; | ||
3758 | } | ||
3341 | case 0xbe ... 0xbf: /* movsx */ | 3759 | case 0xbe ... 0xbf: /* movsx */ |
3342 | c->dst.bytes = c->op_bytes; | 3760 | c->dst.bytes = c->op_bytes; |
3343 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | 3761 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : |
3344 | (s16) c->src.val; | 3762 | (s16) c->src.val; |
3345 | break; | 3763 | break; |
3764 | case 0xc0 ... 0xc1: /* xadd */ | ||
3765 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
3766 | /* Write back the register source. */ | ||
3767 | c->src.val = c->dst.orig_val; | ||
3768 | write_register_operand(&c->src); | ||
3769 | break; | ||
3346 | case 0xc3: /* movnti */ | 3770 | case 0xc3: /* movnti */ |
3347 | c->dst.bytes = c->op_bytes; | 3771 | c->dst.bytes = c->op_bytes; |
3348 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | 3772 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : |
@@ -3350,10 +3774,14 @@ twobyte_insn: | |||
3350 | break; | 3774 | break; |
3351 | case 0xc7: /* Grp9 (cmpxchg8b) */ | 3775 | case 0xc7: /* Grp9 (cmpxchg8b) */ |
3352 | rc = emulate_grp9(ctxt, ops); | 3776 | rc = emulate_grp9(ctxt, ops); |
3353 | if (rc != X86EMUL_CONTINUE) | ||
3354 | goto done; | ||
3355 | break; | 3777 | break; |
3778 | default: | ||
3779 | goto cannot_emulate; | ||
3356 | } | 3780 | } |
3781 | |||
3782 | if (rc != X86EMUL_CONTINUE) | ||
3783 | goto done; | ||
3784 | |||
3357 | goto writeback; | 3785 | goto writeback; |
3358 | 3786 | ||
3359 | cannot_emulate: | 3787 | cannot_emulate: |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ddeb2314b522..efad72385058 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * Copyright (c) 2006 Intel Corporation | 5 | * Copyright (c) 2006 Intel Corporation |
6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc | 6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc |
7 | * Copyright (c) 2008 Intel Corporation | 7 | * Copyright (c) 2008 Intel Corporation |
8 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 8 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
9 | * | 9 | * |
10 | * Permission is hereby granted, free of charge, to any person obtaining a copy | 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
11 | * of this software and associated documentation files (the "Software"), to deal | 11 | * of this software and associated documentation files (the "Software"), to deal |
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) | |||
232 | } | 232 | } |
233 | } | 233 | } |
234 | 234 | ||
235 | int pit_has_pending_timer(struct kvm_vcpu *vcpu) | ||
236 | { | ||
237 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | ||
238 | |||
239 | if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) | ||
240 | return atomic_read(&pit->pit_state.pit_timer.pending); | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | 235 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) |
245 | { | 236 | { |
246 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, | 237 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4b7b73ce2098..f628234fbeca 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright (c) 2003-2004 Fabrice Bellard | 4 | * Copyright (c) 2003-2004 Fabrice Bellard |
5 | * Copyright (c) 2007 Intel Corporation | 5 | * Copyright (c) 2007 Intel Corporation |
6 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 6 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
7 | * | 7 | * |
8 | * Permission is hereby granted, free of charge, to any person obtaining a copy | 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | * of this software and associated documentation files (the "Software"), to deal | 9 | * of this software and associated documentation files (the "Software"), to deal |
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level); | |||
39 | static void pic_lock(struct kvm_pic *s) | 39 | static void pic_lock(struct kvm_pic *s) |
40 | __acquires(&s->lock) | 40 | __acquires(&s->lock) |
41 | { | 41 | { |
42 | raw_spin_lock(&s->lock); | 42 | spin_lock(&s->lock); |
43 | } | 43 | } |
44 | 44 | ||
45 | static void pic_unlock(struct kvm_pic *s) | 45 | static void pic_unlock(struct kvm_pic *s) |
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s) | |||
51 | 51 | ||
52 | s->wakeup_needed = false; | 52 | s->wakeup_needed = false; |
53 | 53 | ||
54 | raw_spin_unlock(&s->lock); | 54 | spin_unlock(&s->lock); |
55 | 55 | ||
56 | if (wakeup) { | 56 | if (wakeup) { |
57 | kvm_for_each_vcpu(i, vcpu, s->kvm) { | 57 | kvm_for_each_vcpu(i, vcpu, s->kvm) { |
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s) | |||
67 | if (!found) | 67 | if (!found) |
68 | return; | 68 | return; |
69 | 69 | ||
70 | kvm_make_request(KVM_REQ_EVENT, found); | ||
70 | kvm_vcpu_kick(found); | 71 | kvm_vcpu_kick(found); |
71 | } | 72 | } |
72 | } | 73 | } |
@@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
308 | addr &= 1; | 309 | addr &= 1; |
309 | if (addr == 0) { | 310 | if (addr == 0) { |
310 | if (val & 0x10) { | 311 | if (val & 0x10) { |
311 | kvm_pic_reset(s); /* init */ | ||
312 | /* | ||
313 | * deassert a pending interrupt | ||
314 | */ | ||
315 | pic_irq_request(s->pics_state->kvm, 0); | ||
316 | s->init_state = 1; | ||
317 | s->init4 = val & 1; | 312 | s->init4 = val & 1; |
313 | s->last_irr = 0; | ||
314 | s->imr = 0; | ||
315 | s->priority_add = 0; | ||
316 | s->special_mask = 0; | ||
317 | s->read_reg_select = 0; | ||
318 | if (!s->init4) { | ||
319 | s->special_fully_nested_mode = 0; | ||
320 | s->auto_eoi = 0; | ||
321 | } | ||
322 | s->init_state = 1; | ||
318 | if (val & 0x02) | 323 | if (val & 0x02) |
319 | printk(KERN_ERR "single mode not supported"); | 324 | printk(KERN_ERR "single mode not supported"); |
320 | if (val & 0x08) | 325 | if (val & 0x08) |
@@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
564 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | 569 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); |
565 | if (!s) | 570 | if (!s) |
566 | return NULL; | 571 | return NULL; |
567 | raw_spin_lock_init(&s->lock); | 572 | spin_lock_init(&s->lock); |
568 | s->kvm = kvm; | 573 | s->kvm = kvm; |
569 | s->pics[0].elcr_mask = 0xf8; | 574 | s->pics[0].elcr_mask = 0xf8; |
570 | s->pics[1].elcr_mask = 0xde; | 575 | s->pics[1].elcr_mask = 0xde; |
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 2095a049835e..7e06ba1618bd 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * irq.c: API for in kernel interrupt controller | 2 | * irq.c: API for in kernel interrupt controller |
3 | * Copyright (c) 2007, Intel Corporation. | 3 | * Copyright (c) 2007, Intel Corporation. |
4 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 4 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | 6 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms and conditions of the GNU General Public License, | 7 | * under the terms and conditions of the GNU General Public License, |
@@ -33,12 +33,7 @@ | |||
33 | */ | 33 | */ |
34 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) | 34 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) |
35 | { | 35 | { |
36 | int ret; | 36 | return apic_has_pending_timer(vcpu); |
37 | |||
38 | ret = pit_has_pending_timer(vcpu); | ||
39 | ret |= apic_has_pending_timer(vcpu); | ||
40 | |||
41 | return ret; | ||
42 | } | 37 | } |
43 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); | 38 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); |
44 | 39 | ||
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 63c314502993..ba910d149410 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -60,7 +60,7 @@ struct kvm_kpic_state { | |||
60 | }; | 60 | }; |
61 | 61 | ||
62 | struct kvm_pic { | 62 | struct kvm_pic { |
63 | raw_spinlock_t lock; | 63 | spinlock_t lock; |
64 | bool wakeup_needed; | 64 | bool wakeup_needed; |
65 | unsigned pending_acks; | 65 | unsigned pending_acks; |
66 | struct kvm *kvm; | 66 | struct kvm *kvm; |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6491ac8e755b..975bb45329a1 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) | |||
42 | (unsigned long *)&vcpu->arch.regs_avail)) | 42 | (unsigned long *)&vcpu->arch.regs_avail)) |
43 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); | 43 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); |
44 | 44 | ||
45 | return vcpu->arch.pdptrs[index]; | 45 | return vcpu->arch.walk_mmu->pdptrs[index]; |
46 | } | ||
47 | |||
48 | static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) | ||
49 | { | ||
50 | load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); | ||
51 | |||
52 | return mmu->pdptrs[index]; | ||
46 | } | 53 | } |
47 | 54 | ||
48 | static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) | 55 | static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 22b06f7660f4..413f8973a855 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * Copyright (C) 2006 Qumranet, Inc. | 5 | * Copyright (C) 2006 Qumranet, Inc. |
6 | * Copyright (C) 2007 Novell | 6 | * Copyright (C) 2007 Novell |
7 | * Copyright (C) 2007 Intel | 7 | * Copyright (C) 2007 Intel |
8 | * Copyright 2009 Red Hat, Inc. and/or its affilates. | 8 | * Copyright 2009 Red Hat, Inc. and/or its affiliates. |
9 | * | 9 | * |
10 | * Authors: | 10 | * Authors: |
11 | * Dor Laor <dor.laor@qumranet.com> | 11 | * Dor Laor <dor.laor@qumranet.com> |
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) | |||
259 | 259 | ||
260 | static void apic_update_ppr(struct kvm_lapic *apic) | 260 | static void apic_update_ppr(struct kvm_lapic *apic) |
261 | { | 261 | { |
262 | u32 tpr, isrv, ppr; | 262 | u32 tpr, isrv, ppr, old_ppr; |
263 | int isr; | 263 | int isr; |
264 | 264 | ||
265 | old_ppr = apic_get_reg(apic, APIC_PROCPRI); | ||
265 | tpr = apic_get_reg(apic, APIC_TASKPRI); | 266 | tpr = apic_get_reg(apic, APIC_TASKPRI); |
266 | isr = apic_find_highest_isr(apic); | 267 | isr = apic_find_highest_isr(apic); |
267 | isrv = (isr != -1) ? isr : 0; | 268 | isrv = (isr != -1) ? isr : 0; |
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic) | |||
274 | apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", | 275 | apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", |
275 | apic, ppr, isr, isrv); | 276 | apic, ppr, isr, isrv); |
276 | 277 | ||
277 | apic_set_reg(apic, APIC_PROCPRI, ppr); | 278 | if (old_ppr != ppr) { |
279 | apic_set_reg(apic, APIC_PROCPRI, ppr); | ||
280 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
281 | } | ||
278 | } | 282 | } |
279 | 283 | ||
280 | static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | 284 | static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) |
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
391 | break; | 395 | break; |
392 | } | 396 | } |
393 | 397 | ||
398 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
394 | kvm_vcpu_kick(vcpu); | 399 | kvm_vcpu_kick(vcpu); |
395 | break; | 400 | break; |
396 | 401 | ||
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
416 | "INIT on a runnable vcpu %d\n", | 421 | "INIT on a runnable vcpu %d\n", |
417 | vcpu->vcpu_id); | 422 | vcpu->vcpu_id); |
418 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; | 423 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
424 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
419 | kvm_vcpu_kick(vcpu); | 425 | kvm_vcpu_kick(vcpu); |
420 | } else { | 426 | } else { |
421 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", | 427 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", |
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
430 | result = 1; | 436 | result = 1; |
431 | vcpu->arch.sipi_vector = vector; | 437 | vcpu->arch.sipi_vector = vector; |
432 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; | 438 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
439 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
433 | kvm_vcpu_kick(vcpu); | 440 | kvm_vcpu_kick(vcpu); |
434 | } | 441 | } |
435 | break; | 442 | break; |
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
475 | trigger_mode = IOAPIC_EDGE_TRIG; | 482 | trigger_mode = IOAPIC_EDGE_TRIG; |
476 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) | 483 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) |
477 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | 484 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); |
485 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
478 | } | 486 | } |
479 | 487 | ||
480 | static void apic_send_ipi(struct kvm_lapic *apic) | 488 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
1151 | update_divide_count(apic); | 1159 | update_divide_count(apic); |
1152 | start_apic_timer(apic); | 1160 | start_apic_timer(apic); |
1153 | apic->irr_pending = true; | 1161 | apic->irr_pending = true; |
1162 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
1154 | } | 1163 | } |
1155 | 1164 | ||
1156 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | 1165 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 311f6dad8951..908ea5464a51 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 10 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
11 | * | 11 | * |
12 | * Authors: | 12 | * Authors: |
13 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -49,15 +49,25 @@ | |||
49 | */ | 49 | */ |
50 | bool tdp_enabled = false; | 50 | bool tdp_enabled = false; |
51 | 51 | ||
52 | #undef MMU_DEBUG | 52 | enum { |
53 | AUDIT_PRE_PAGE_FAULT, | ||
54 | AUDIT_POST_PAGE_FAULT, | ||
55 | AUDIT_PRE_PTE_WRITE, | ||
56 | AUDIT_POST_PTE_WRITE, | ||
57 | AUDIT_PRE_SYNC, | ||
58 | AUDIT_POST_SYNC | ||
59 | }; | ||
53 | 60 | ||
54 | #undef AUDIT | 61 | char *audit_point_name[] = { |
62 | "pre page fault", | ||
63 | "post page fault", | ||
64 | "pre pte write", | ||
65 | "post pte write", | ||
66 | "pre sync", | ||
67 | "post sync" | ||
68 | }; | ||
55 | 69 | ||
56 | #ifdef AUDIT | 70 | #undef MMU_DEBUG |
57 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
58 | #else | ||
59 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
60 | #endif | ||
61 | 71 | ||
62 | #ifdef MMU_DEBUG | 72 | #ifdef MMU_DEBUG |
63 | 73 | ||
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | |||
71 | 81 | ||
72 | #endif | 82 | #endif |
73 | 83 | ||
74 | #if defined(MMU_DEBUG) || defined(AUDIT) | 84 | #ifdef MMU_DEBUG |
75 | static int dbg = 0; | 85 | static int dbg = 0; |
76 | module_param(dbg, bool, 0644); | 86 | module_param(dbg, bool, 0644); |
77 | #endif | 87 | #endif |
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644); | |||
89 | } | 99 | } |
90 | #endif | 100 | #endif |
91 | 101 | ||
102 | #define PTE_PREFETCH_NUM 8 | ||
103 | |||
92 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 104 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 |
93 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 105 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
94 | 106 | ||
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); | |||
178 | static struct kmem_cache *pte_chain_cache; | 190 | static struct kmem_cache *pte_chain_cache; |
179 | static struct kmem_cache *rmap_desc_cache; | 191 | static struct kmem_cache *rmap_desc_cache; |
180 | static struct kmem_cache *mmu_page_header_cache; | 192 | static struct kmem_cache *mmu_page_header_cache; |
193 | static struct percpu_counter kvm_total_used_mmu_pages; | ||
181 | 194 | ||
182 | static u64 __read_mostly shadow_trap_nonpresent_pte; | 195 | static u64 __read_mostly shadow_trap_nonpresent_pte; |
183 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | 196 | static u64 __read_mostly shadow_notrap_nonpresent_pte; |
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) | |||
299 | #endif | 312 | #endif |
300 | } | 313 | } |
301 | 314 | ||
315 | static bool spte_has_volatile_bits(u64 spte) | ||
316 | { | ||
317 | if (!shadow_accessed_mask) | ||
318 | return false; | ||
319 | |||
320 | if (!is_shadow_present_pte(spte)) | ||
321 | return false; | ||
322 | |||
323 | if ((spte & shadow_accessed_mask) && | ||
324 | (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) | ||
325 | return false; | ||
326 | |||
327 | return true; | ||
328 | } | ||
329 | |||
330 | static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) | ||
331 | { | ||
332 | return (old_spte & bit_mask) && !(new_spte & bit_mask); | ||
333 | } | ||
334 | |||
302 | static void update_spte(u64 *sptep, u64 new_spte) | 335 | static void update_spte(u64 *sptep, u64 new_spte) |
303 | { | 336 | { |
304 | u64 old_spte; | 337 | u64 mask, old_spte = *sptep; |
338 | |||
339 | WARN_ON(!is_rmap_spte(new_spte)); | ||
340 | |||
341 | new_spte |= old_spte & shadow_dirty_mask; | ||
305 | 342 | ||
306 | if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || | 343 | mask = shadow_accessed_mask; |
307 | !is_rmap_spte(*sptep)) | 344 | if (is_writable_pte(old_spte)) |
345 | mask |= shadow_dirty_mask; | ||
346 | |||
347 | if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) | ||
308 | __set_spte(sptep, new_spte); | 348 | __set_spte(sptep, new_spte); |
309 | else { | 349 | else |
310 | old_spte = __xchg_spte(sptep, new_spte); | 350 | old_spte = __xchg_spte(sptep, new_spte); |
311 | if (old_spte & shadow_accessed_mask) | 351 | |
312 | mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); | 352 | if (!shadow_accessed_mask) |
313 | } | 353 | return; |
354 | |||
355 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) | ||
356 | kvm_set_pfn_accessed(spte_to_pfn(old_spte)); | ||
357 | if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) | ||
358 | kvm_set_pfn_dirty(spte_to_pfn(old_spte)); | ||
314 | } | 359 | } |
315 | 360 | ||
316 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | 361 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, |
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | |||
367 | if (r) | 412 | if (r) |
368 | goto out; | 413 | goto out; |
369 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | 414 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, |
370 | rmap_desc_cache, 4); | 415 | rmap_desc_cache, 4 + PTE_PREFETCH_NUM); |
371 | if (r) | 416 | if (r) |
372 | goto out; | 417 | goto out; |
373 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | 418 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); |
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
591 | desc->sptes[0] = (u64 *)*rmapp; | 636 | desc->sptes[0] = (u64 *)*rmapp; |
592 | desc->sptes[1] = spte; | 637 | desc->sptes[1] = spte; |
593 | *rmapp = (unsigned long)desc | 1; | 638 | *rmapp = (unsigned long)desc | 1; |
639 | ++count; | ||
594 | } else { | 640 | } else { |
595 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 641 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
596 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 642 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
603 | desc = desc->more; | 649 | desc = desc->more; |
604 | } | 650 | } |
605 | for (i = 0; desc->sptes[i]; ++i) | 651 | for (i = 0; desc->sptes[i]; ++i) |
606 | ; | 652 | ++count; |
607 | desc->sptes[i] = spte; | 653 | desc->sptes[i] = spte; |
608 | } | 654 | } |
609 | return count; | 655 | return count; |
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
645 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); | 691 | gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); |
646 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); | 692 | rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); |
647 | if (!*rmapp) { | 693 | if (!*rmapp) { |
648 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 694 | printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); |
649 | BUG(); | 695 | BUG(); |
650 | } else if (!(*rmapp & 1)) { | 696 | } else if (!(*rmapp & 1)) { |
651 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | 697 | rmap_printk("rmap_remove: %p 1->0\n", spte); |
652 | if ((u64 *)*rmapp != spte) { | 698 | if ((u64 *)*rmapp != spte) { |
653 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | 699 | printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); |
654 | spte, *spte); | ||
655 | BUG(); | 700 | BUG(); |
656 | } | 701 | } |
657 | *rmapp = 0; | 702 | *rmapp = 0; |
658 | } else { | 703 | } else { |
659 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | 704 | rmap_printk("rmap_remove: %p many->many\n", spte); |
660 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 705 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
661 | prev_desc = NULL; | 706 | prev_desc = NULL; |
662 | while (desc) { | 707 | while (desc) { |
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
670 | prev_desc = desc; | 715 | prev_desc = desc; |
671 | desc = desc->more; | 716 | desc = desc->more; |
672 | } | 717 | } |
673 | pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); | 718 | pr_err("rmap_remove: %p many->many\n", spte); |
674 | BUG(); | 719 | BUG(); |
675 | } | 720 | } |
676 | } | 721 | } |
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) | |||
680 | pfn_t pfn; | 725 | pfn_t pfn; |
681 | u64 old_spte = *sptep; | 726 | u64 old_spte = *sptep; |
682 | 727 | ||
683 | if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || | 728 | if (!spte_has_volatile_bits(old_spte)) |
684 | old_spte & shadow_accessed_mask) { | ||
685 | __set_spte(sptep, new_spte); | 729 | __set_spte(sptep, new_spte); |
686 | } else | 730 | else |
687 | old_spte = __xchg_spte(sptep, new_spte); | 731 | old_spte = __xchg_spte(sptep, new_spte); |
688 | 732 | ||
689 | if (!is_rmap_spte(old_spte)) | 733 | if (!is_rmap_spte(old_spte)) |
690 | return; | 734 | return; |
735 | |||
691 | pfn = spte_to_pfn(old_spte); | 736 | pfn = spte_to_pfn(old_spte); |
692 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) | 737 | if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) |
693 | kvm_set_pfn_accessed(pfn); | 738 | kvm_set_pfn_accessed(pfn); |
694 | if (is_writable_pte(old_spte)) | 739 | if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) |
695 | kvm_set_pfn_dirty(pfn); | 740 | kvm_set_pfn_dirty(pfn); |
696 | } | 741 | } |
697 | 742 | ||
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
746 | } | 791 | } |
747 | spte = rmap_next(kvm, rmapp, spte); | 792 | spte = rmap_next(kvm, rmapp, spte); |
748 | } | 793 | } |
749 | if (write_protected) { | ||
750 | pfn_t pfn; | ||
751 | |||
752 | spte = rmap_next(kvm, rmapp, NULL); | ||
753 | pfn = spte_to_pfn(*spte); | ||
754 | kvm_set_pfn_dirty(pfn); | ||
755 | } | ||
756 | 794 | ||
757 | /* check for huge page mappings */ | 795 | /* check for huge page mappings */ |
758 | for (i = PT_DIRECTORY_LEVEL; | 796 | for (i = PT_DIRECTORY_LEVEL; |
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt) | |||
947 | } | 985 | } |
948 | #endif | 986 | #endif |
949 | 987 | ||
988 | /* | ||
989 | * This value is the sum of all of the kvm instances's | ||
990 | * kvm->arch.n_used_mmu_pages values. We need a global, | ||
991 | * aggregate version in order to make the slab shrinker | ||
992 | * faster | ||
993 | */ | ||
994 | static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) | ||
995 | { | ||
996 | kvm->arch.n_used_mmu_pages += nr; | ||
997 | percpu_counter_add(&kvm_total_used_mmu_pages, nr); | ||
998 | } | ||
999 | |||
950 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1000 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
951 | { | 1001 | { |
952 | ASSERT(is_empty_shadow_page(sp->spt)); | 1002 | ASSERT(is_empty_shadow_page(sp->spt)); |
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
956 | if (!sp->role.direct) | 1006 | if (!sp->role.direct) |
957 | __free_page(virt_to_page(sp->gfns)); | 1007 | __free_page(virt_to_page(sp->gfns)); |
958 | kmem_cache_free(mmu_page_header_cache, sp); | 1008 | kmem_cache_free(mmu_page_header_cache, sp); |
959 | ++kvm->arch.n_free_mmu_pages; | 1009 | kvm_mod_used_mmu_pages(kvm, -1); |
960 | } | 1010 | } |
961 | 1011 | ||
962 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | 1012 | static unsigned kvm_page_table_hashfn(gfn_t gfn) |
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
979 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); | 1029 | bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); |
980 | sp->multimapped = 0; | 1030 | sp->multimapped = 0; |
981 | sp->parent_pte = parent_pte; | 1031 | sp->parent_pte = parent_pte; |
982 | --vcpu->kvm->arch.n_free_mmu_pages; | 1032 | kvm_mod_used_mmu_pages(vcpu->kvm, +1); |
983 | return sp; | 1033 | return sp; |
984 | } | 1034 | } |
985 | 1035 | ||
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1403 | if (role.direct) | 1453 | if (role.direct) |
1404 | role.cr4_pae = 0; | 1454 | role.cr4_pae = 0; |
1405 | role.access = access; | 1455 | role.access = access; |
1406 | if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | 1456 | if (!vcpu->arch.mmu.direct_map |
1457 | && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | ||
1407 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | 1458 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); |
1408 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 1459 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
1409 | role.quadrant = quadrant; | 1460 | role.quadrant = quadrant; |
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, | |||
1458 | iterator->addr = addr; | 1509 | iterator->addr = addr; |
1459 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; | 1510 | iterator->shadow_addr = vcpu->arch.mmu.root_hpa; |
1460 | iterator->level = vcpu->arch.mmu.shadow_root_level; | 1511 | iterator->level = vcpu->arch.mmu.shadow_root_level; |
1512 | |||
1513 | if (iterator->level == PT64_ROOT_LEVEL && | ||
1514 | vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && | ||
1515 | !vcpu->arch.mmu.direct_map) | ||
1516 | --iterator->level; | ||
1517 | |||
1461 | if (iterator->level == PT32E_ROOT_LEVEL) { | 1518 | if (iterator->level == PT32E_ROOT_LEVEL) { |
1462 | iterator->shadow_addr | 1519 | iterator->shadow_addr |
1463 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 1520 | = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; |
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, | |||
1665 | 1722 | ||
1666 | /* | 1723 | /* |
1667 | * Changing the number of mmu pages allocated to the vm | 1724 | * Changing the number of mmu pages allocated to the vm |
1668 | * Note: if kvm_nr_mmu_pages is too small, you will get dead lock | 1725 | * Note: if goal_nr_mmu_pages is too small, you will get dead lock |
1669 | */ | 1726 | */ |
1670 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | 1727 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) |
1671 | { | 1728 | { |
1672 | int used_pages; | ||
1673 | LIST_HEAD(invalid_list); | 1729 | LIST_HEAD(invalid_list); |
1674 | |||
1675 | used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; | ||
1676 | used_pages = max(0, used_pages); | ||
1677 | |||
1678 | /* | 1730 | /* |
1679 | * If we set the number of mmu pages to be smaller be than the | 1731 | * If we set the number of mmu pages to be smaller be than the |
1680 | * number of actived pages , we must to free some mmu pages before we | 1732 | * number of actived pages , we must to free some mmu pages before we |
1681 | * change the value | 1733 | * change the value |
1682 | */ | 1734 | */ |
1683 | 1735 | ||
1684 | if (used_pages > kvm_nr_mmu_pages) { | 1736 | if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { |
1685 | while (used_pages > kvm_nr_mmu_pages && | 1737 | while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && |
1686 | !list_empty(&kvm->arch.active_mmu_pages)) { | 1738 | !list_empty(&kvm->arch.active_mmu_pages)) { |
1687 | struct kvm_mmu_page *page; | 1739 | struct kvm_mmu_page *page; |
1688 | 1740 | ||
1689 | page = container_of(kvm->arch.active_mmu_pages.prev, | 1741 | page = container_of(kvm->arch.active_mmu_pages.prev, |
1690 | struct kvm_mmu_page, link); | 1742 | struct kvm_mmu_page, link); |
1691 | used_pages -= kvm_mmu_prepare_zap_page(kvm, page, | 1743 | kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); |
1692 | &invalid_list); | 1744 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
1693 | } | 1745 | } |
1694 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 1746 | goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; |
1695 | kvm_nr_mmu_pages = used_pages; | ||
1696 | kvm->arch.n_free_mmu_pages = 0; | ||
1697 | } | 1747 | } |
1698 | else | ||
1699 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
1700 | - kvm->arch.n_alloc_mmu_pages; | ||
1701 | 1748 | ||
1702 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | 1749 | kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; |
1703 | } | 1750 | } |
1704 | 1751 | ||
1705 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | 1752 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) |
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1709 | LIST_HEAD(invalid_list); | 1756 | LIST_HEAD(invalid_list); |
1710 | int r; | 1757 | int r; |
1711 | 1758 | ||
1712 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); | 1759 | pgprintk("%s: looking for gfn %llx\n", __func__, gfn); |
1713 | r = 0; | 1760 | r = 0; |
1714 | 1761 | ||
1715 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 1762 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1716 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1763 | pgprintk("%s: gfn %llx role %x\n", __func__, gfn, |
1717 | sp->role.word); | 1764 | sp->role.word); |
1718 | r = 1; | 1765 | r = 1; |
1719 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 1766 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | |||
1729 | LIST_HEAD(invalid_list); | 1776 | LIST_HEAD(invalid_list); |
1730 | 1777 | ||
1731 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { | 1778 | for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { |
1732 | pgprintk("%s: zap %lx %x\n", | 1779 | pgprintk("%s: zap %llx %x\n", |
1733 | __func__, gfn, sp->role.word); | 1780 | __func__, gfn, sp->role.word); |
1734 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | 1781 | kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); |
1735 | } | 1782 | } |
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1925 | * whether the guest actually used the pte (in order to detect | 1972 | * whether the guest actually used the pte (in order to detect |
1926 | * demand paging). | 1973 | * demand paging). |
1927 | */ | 1974 | */ |
1928 | spte = shadow_base_present_pte | shadow_dirty_mask; | 1975 | spte = shadow_base_present_pte; |
1929 | if (!speculative) | 1976 | if (!speculative) |
1930 | spte |= shadow_accessed_mask; | 1977 | spte |= shadow_accessed_mask; |
1931 | if (!dirty) | 1978 | if (!dirty) |
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1948 | spte |= (u64)pfn << PAGE_SHIFT; | 1995 | spte |= (u64)pfn << PAGE_SHIFT; |
1949 | 1996 | ||
1950 | if ((pte_access & ACC_WRITE_MASK) | 1997 | if ((pte_access & ACC_WRITE_MASK) |
1951 | || (!tdp_enabled && write_fault && !is_write_protection(vcpu) | 1998 | || (!vcpu->arch.mmu.direct_map && write_fault |
1952 | && !user_fault)) { | 1999 | && !is_write_protection(vcpu) && !user_fault)) { |
1953 | 2000 | ||
1954 | if (level > PT_PAGE_TABLE_LEVEL && | 2001 | if (level > PT_PAGE_TABLE_LEVEL && |
1955 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | 2002 | has_wrprotected_page(vcpu->kvm, gfn, level)) { |
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1960 | 2007 | ||
1961 | spte |= PT_WRITABLE_MASK; | 2008 | spte |= PT_WRITABLE_MASK; |
1962 | 2009 | ||
1963 | if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) | 2010 | if (!vcpu->arch.mmu.direct_map |
2011 | && !(pte_access & ACC_WRITE_MASK)) | ||
1964 | spte &= ~PT_USER_MASK; | 2012 | spte &= ~PT_USER_MASK; |
1965 | 2013 | ||
1966 | /* | 2014 | /* |
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1973 | goto set_pte; | 2021 | goto set_pte; |
1974 | 2022 | ||
1975 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | 2023 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1976 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 2024 | pgprintk("%s: found shadow page for %llx, marking ro\n", |
1977 | __func__, gfn); | 2025 | __func__, gfn); |
1978 | ret = 1; | 2026 | ret = 1; |
1979 | pte_access &= ~ACC_WRITE_MASK; | 2027 | pte_access &= ~ACC_WRITE_MASK; |
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1986 | mark_page_dirty(vcpu->kvm, gfn); | 2034 | mark_page_dirty(vcpu->kvm, gfn); |
1987 | 2035 | ||
1988 | set_pte: | 2036 | set_pte: |
1989 | if (is_writable_pte(*sptep) && !is_writable_pte(spte)) | ||
1990 | kvm_set_pfn_dirty(pfn); | ||
1991 | update_spte(sptep, spte); | 2037 | update_spte(sptep, spte); |
1992 | done: | 2038 | done: |
1993 | return ret; | 2039 | return ret; |
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2004 | int rmap_count; | 2050 | int rmap_count; |
2005 | 2051 | ||
2006 | pgprintk("%s: spte %llx access %x write_fault %d" | 2052 | pgprintk("%s: spte %llx access %x write_fault %d" |
2007 | " user_fault %d gfn %lx\n", | 2053 | " user_fault %d gfn %llx\n", |
2008 | __func__, *sptep, pt_access, | 2054 | __func__, *sptep, pt_access, |
2009 | write_fault, user_fault, gfn); | 2055 | write_fault, user_fault, gfn); |
2010 | 2056 | ||
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2023 | __set_spte(sptep, shadow_trap_nonpresent_pte); | 2069 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
2024 | kvm_flush_remote_tlbs(vcpu->kvm); | 2070 | kvm_flush_remote_tlbs(vcpu->kvm); |
2025 | } else if (pfn != spte_to_pfn(*sptep)) { | 2071 | } else if (pfn != spte_to_pfn(*sptep)) { |
2026 | pgprintk("hfn old %lx new %lx\n", | 2072 | pgprintk("hfn old %llx new %llx\n", |
2027 | spte_to_pfn(*sptep), pfn); | 2073 | spte_to_pfn(*sptep), pfn); |
2028 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); | 2074 | drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); |
2029 | kvm_flush_remote_tlbs(vcpu->kvm); | 2075 | kvm_flush_remote_tlbs(vcpu->kvm); |
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2040 | } | 2086 | } |
2041 | 2087 | ||
2042 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | 2088 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
2043 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | 2089 | pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", |
2044 | is_large_pte(*sptep)? "2MB" : "4kB", | 2090 | is_large_pte(*sptep)? "2MB" : "4kB", |
2045 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, | 2091 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, |
2046 | *sptep, sptep); | 2092 | *sptep, sptep); |
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
2064 | { | 2110 | { |
2065 | } | 2111 | } |
2066 | 2112 | ||
2113 | static struct kvm_memory_slot * | ||
2114 | pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) | ||
2115 | { | ||
2116 | struct kvm_memory_slot *slot; | ||
2117 | |||
2118 | slot = gfn_to_memslot(vcpu->kvm, gfn); | ||
2119 | if (!slot || slot->flags & KVM_MEMSLOT_INVALID || | ||
2120 | (no_dirty_log && slot->dirty_bitmap)) | ||
2121 | slot = NULL; | ||
2122 | |||
2123 | return slot; | ||
2124 | } | ||
2125 | |||
2126 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
2127 | bool no_dirty_log) | ||
2128 | { | ||
2129 | struct kvm_memory_slot *slot; | ||
2130 | unsigned long hva; | ||
2131 | |||
2132 | slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); | ||
2133 | if (!slot) { | ||
2134 | get_page(bad_page); | ||
2135 | return page_to_pfn(bad_page); | ||
2136 | } | ||
2137 | |||
2138 | hva = gfn_to_hva_memslot(slot, gfn); | ||
2139 | |||
2140 | return hva_to_pfn_atomic(vcpu->kvm, hva); | ||
2141 | } | ||
2142 | |||
2143 | static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | ||
2144 | struct kvm_mmu_page *sp, | ||
2145 | u64 *start, u64 *end) | ||
2146 | { | ||
2147 | struct page *pages[PTE_PREFETCH_NUM]; | ||
2148 | unsigned access = sp->role.access; | ||
2149 | int i, ret; | ||
2150 | gfn_t gfn; | ||
2151 | |||
2152 | gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); | ||
2153 | if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) | ||
2154 | return -1; | ||
2155 | |||
2156 | ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); | ||
2157 | if (ret <= 0) | ||
2158 | return -1; | ||
2159 | |||
2160 | for (i = 0; i < ret; i++, gfn++, start++) | ||
2161 | mmu_set_spte(vcpu, start, ACC_ALL, | ||
2162 | access, 0, 0, 1, NULL, | ||
2163 | sp->role.level, gfn, | ||
2164 | page_to_pfn(pages[i]), true, true); | ||
2165 | |||
2166 | return 0; | ||
2167 | } | ||
2168 | |||
2169 | static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, | ||
2170 | struct kvm_mmu_page *sp, u64 *sptep) | ||
2171 | { | ||
2172 | u64 *spte, *start = NULL; | ||
2173 | int i; | ||
2174 | |||
2175 | WARN_ON(!sp->role.direct); | ||
2176 | |||
2177 | i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); | ||
2178 | spte = sp->spt + i; | ||
2179 | |||
2180 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | ||
2181 | if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { | ||
2182 | if (!start) | ||
2183 | continue; | ||
2184 | if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) | ||
2185 | break; | ||
2186 | start = NULL; | ||
2187 | } else if (!start) | ||
2188 | start = spte; | ||
2189 | } | ||
2190 | } | ||
2191 | |||
2192 | static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | ||
2193 | { | ||
2194 | struct kvm_mmu_page *sp; | ||
2195 | |||
2196 | /* | ||
2197 | * Since it's no accessed bit on EPT, it's no way to | ||
2198 | * distinguish between actually accessed translations | ||
2199 | * and prefetched, so disable pte prefetch if EPT is | ||
2200 | * enabled. | ||
2201 | */ | ||
2202 | if (!shadow_accessed_mask) | ||
2203 | return; | ||
2204 | |||
2205 | sp = page_header(__pa(sptep)); | ||
2206 | if (sp->role.level > PT_PAGE_TABLE_LEVEL) | ||
2207 | return; | ||
2208 | |||
2209 | __direct_pte_prefetch(vcpu, sp, sptep); | ||
2210 | } | ||
2211 | |||
2067 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 2212 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
2068 | int level, gfn_t gfn, pfn_t pfn) | 2213 | int level, gfn_t gfn, pfn_t pfn) |
2069 | { | 2214 | { |
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2077 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 2222 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, |
2078 | 0, write, 1, &pt_write, | 2223 | 0, write, 1, &pt_write, |
2079 | level, gfn, pfn, false, true); | 2224 | level, gfn, pfn, false, true); |
2225 | direct_pte_prefetch(vcpu, iterator.sptep); | ||
2080 | ++vcpu->stat.pf_fixed; | 2226 | ++vcpu->stat.pf_fixed; |
2081 | break; | 2227 | break; |
2082 | } | 2228 | } |
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2098 | __set_spte(iterator.sptep, | 2244 | __set_spte(iterator.sptep, |
2099 | __pa(sp->spt) | 2245 | __pa(sp->spt) |
2100 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2246 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
2101 | | shadow_user_mask | shadow_x_mask); | 2247 | | shadow_user_mask | shadow_x_mask |
2248 | | shadow_accessed_mask); | ||
2102 | } | 2249 | } |
2103 | } | 2250 | } |
2104 | return pt_write; | 2251 | return pt_write; |
2105 | } | 2252 | } |
2106 | 2253 | ||
2107 | static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) | 2254 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) |
2108 | { | 2255 | { |
2109 | char buf[1]; | 2256 | siginfo_t info; |
2110 | void __user *hva; | 2257 | |
2111 | int r; | 2258 | info.si_signo = SIGBUS; |
2259 | info.si_errno = 0; | ||
2260 | info.si_code = BUS_MCEERR_AR; | ||
2261 | info.si_addr = (void __user *)address; | ||
2262 | info.si_addr_lsb = PAGE_SHIFT; | ||
2112 | 2263 | ||
2113 | /* Touch the page, so send SIGBUS */ | 2264 | send_sig_info(SIGBUS, &info, tsk); |
2114 | hva = (void __user *)gfn_to_hva(kvm, gfn); | ||
2115 | r = copy_from_user(buf, hva, 1); | ||
2116 | } | 2265 | } |
2117 | 2266 | ||
2118 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | 2267 | static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) |
2119 | { | 2268 | { |
2120 | kvm_release_pfn_clean(pfn); | 2269 | kvm_release_pfn_clean(pfn); |
2121 | if (is_hwpoison_pfn(pfn)) { | 2270 | if (is_hwpoison_pfn(pfn)) { |
2122 | kvm_send_hwpoison_signal(kvm, gfn); | 2271 | kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); |
2123 | return 0; | 2272 | return 0; |
2124 | } else if (is_fault_pfn(pfn)) | 2273 | } else if (is_fault_pfn(pfn)) |
2125 | return -EFAULT; | 2274 | return -EFAULT; |
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2179 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2328 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2180 | return; | 2329 | return; |
2181 | spin_lock(&vcpu->kvm->mmu_lock); | 2330 | spin_lock(&vcpu->kvm->mmu_lock); |
2182 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2331 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && |
2332 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || | ||
2333 | vcpu->arch.mmu.direct_map)) { | ||
2183 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2334 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2184 | 2335 | ||
2185 | sp = page_header(root); | 2336 | sp = page_header(root); |
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) | |||
2222 | return ret; | 2373 | return ret; |
2223 | } | 2374 | } |
2224 | 2375 | ||
2225 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | 2376 | static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) |
2226 | { | 2377 | { |
2227 | int i; | ||
2228 | gfn_t root_gfn; | ||
2229 | struct kvm_mmu_page *sp; | 2378 | struct kvm_mmu_page *sp; |
2230 | int direct = 0; | 2379 | unsigned i; |
2231 | u64 pdptr; | ||
2232 | |||
2233 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | ||
2234 | 2380 | ||
2235 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2381 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { |
2382 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2383 | kvm_mmu_free_some_pages(vcpu); | ||
2384 | sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, | ||
2385 | 1, ACC_ALL, NULL); | ||
2386 | ++sp->root_count; | ||
2387 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2388 | vcpu->arch.mmu.root_hpa = __pa(sp->spt); | ||
2389 | } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { | ||
2390 | for (i = 0; i < 4; ++i) { | ||
2391 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
2392 | |||
2393 | ASSERT(!VALID_PAGE(root)); | ||
2394 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2395 | kvm_mmu_free_some_pages(vcpu); | ||
2396 | sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, | ||
2397 | PT32_ROOT_LEVEL, 1, ACC_ALL, | ||
2398 | NULL); | ||
2399 | root = __pa(sp->spt); | ||
2400 | ++sp->root_count; | ||
2401 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2402 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
2403 | } | ||
2404 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | ||
2405 | } else | ||
2406 | BUG(); | ||
2407 | |||
2408 | return 0; | ||
2409 | } | ||
2410 | |||
2411 | static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) | ||
2412 | { | ||
2413 | struct kvm_mmu_page *sp; | ||
2414 | u64 pdptr, pm_mask; | ||
2415 | gfn_t root_gfn; | ||
2416 | int i; | ||
2417 | |||
2418 | root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; | ||
2419 | |||
2420 | if (mmu_check_root(vcpu, root_gfn)) | ||
2421 | return 1; | ||
2422 | |||
2423 | /* | ||
2424 | * Do we shadow a long mode page table? If so we need to | ||
2425 | * write-protect the guests page table root. | ||
2426 | */ | ||
2427 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
2236 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2428 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2237 | 2429 | ||
2238 | ASSERT(!VALID_PAGE(root)); | 2430 | ASSERT(!VALID_PAGE(root)); |
2239 | if (mmu_check_root(vcpu, root_gfn)) | 2431 | |
2240 | return 1; | ||
2241 | if (tdp_enabled) { | ||
2242 | direct = 1; | ||
2243 | root_gfn = 0; | ||
2244 | } | ||
2245 | spin_lock(&vcpu->kvm->mmu_lock); | 2432 | spin_lock(&vcpu->kvm->mmu_lock); |
2246 | kvm_mmu_free_some_pages(vcpu); | 2433 | kvm_mmu_free_some_pages(vcpu); |
2247 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | 2434 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, |
2248 | PT64_ROOT_LEVEL, direct, | 2435 | 0, ACC_ALL, NULL); |
2249 | ACC_ALL, NULL); | ||
2250 | root = __pa(sp->spt); | 2436 | root = __pa(sp->spt); |
2251 | ++sp->root_count; | 2437 | ++sp->root_count; |
2252 | spin_unlock(&vcpu->kvm->mmu_lock); | 2438 | spin_unlock(&vcpu->kvm->mmu_lock); |
2253 | vcpu->arch.mmu.root_hpa = root; | 2439 | vcpu->arch.mmu.root_hpa = root; |
2254 | return 0; | 2440 | return 0; |
2255 | } | 2441 | } |
2256 | direct = !is_paging(vcpu); | 2442 | |
2443 | /* | ||
2444 | * We shadow a 32 bit page table. This may be a legacy 2-level | ||
2445 | * or a PAE 3-level page table. In either case we need to be aware that | ||
2446 | * the shadow page table may be a PAE or a long mode page table. | ||
2447 | */ | ||
2448 | pm_mask = PT_PRESENT_MASK; | ||
2449 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) | ||
2450 | pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; | ||
2451 | |||
2257 | for (i = 0; i < 4; ++i) { | 2452 | for (i = 0; i < 4; ++i) { |
2258 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 2453 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
2259 | 2454 | ||
2260 | ASSERT(!VALID_PAGE(root)); | 2455 | ASSERT(!VALID_PAGE(root)); |
2261 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | 2456 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { |
2262 | pdptr = kvm_pdptr_read(vcpu, i); | 2457 | pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); |
2263 | if (!is_present_gpte(pdptr)) { | 2458 | if (!is_present_gpte(pdptr)) { |
2264 | vcpu->arch.mmu.pae_root[i] = 0; | 2459 | vcpu->arch.mmu.pae_root[i] = 0; |
2265 | continue; | 2460 | continue; |
2266 | } | 2461 | } |
2267 | root_gfn = pdptr >> PAGE_SHIFT; | 2462 | root_gfn = pdptr >> PAGE_SHIFT; |
2268 | } else if (vcpu->arch.mmu.root_level == 0) | 2463 | if (mmu_check_root(vcpu, root_gfn)) |
2269 | root_gfn = 0; | 2464 | return 1; |
2270 | if (mmu_check_root(vcpu, root_gfn)) | ||
2271 | return 1; | ||
2272 | if (tdp_enabled) { | ||
2273 | direct = 1; | ||
2274 | root_gfn = i << 30; | ||
2275 | } | 2465 | } |
2276 | spin_lock(&vcpu->kvm->mmu_lock); | 2466 | spin_lock(&vcpu->kvm->mmu_lock); |
2277 | kvm_mmu_free_some_pages(vcpu); | 2467 | kvm_mmu_free_some_pages(vcpu); |
2278 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | 2468 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, |
2279 | PT32_ROOT_LEVEL, direct, | 2469 | PT32_ROOT_LEVEL, 0, |
2280 | ACC_ALL, NULL); | 2470 | ACC_ALL, NULL); |
2281 | root = __pa(sp->spt); | 2471 | root = __pa(sp->spt); |
2282 | ++sp->root_count; | 2472 | ++sp->root_count; |
2283 | spin_unlock(&vcpu->kvm->mmu_lock); | 2473 | spin_unlock(&vcpu->kvm->mmu_lock); |
2284 | 2474 | ||
2285 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | 2475 | vcpu->arch.mmu.pae_root[i] = root | pm_mask; |
2286 | } | 2476 | } |
2287 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 2477 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
2478 | |||
2479 | /* | ||
2480 | * If we shadow a 32 bit page table with a long mode page | ||
2481 | * table we enter this path. | ||
2482 | */ | ||
2483 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
2484 | if (vcpu->arch.mmu.lm_root == NULL) { | ||
2485 | /* | ||
2486 | * The additional page necessary for this is only | ||
2487 | * allocated on demand. | ||
2488 | */ | ||
2489 | |||
2490 | u64 *lm_root; | ||
2491 | |||
2492 | lm_root = (void*)get_zeroed_page(GFP_KERNEL); | ||
2493 | if (lm_root == NULL) | ||
2494 | return 1; | ||
2495 | |||
2496 | lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; | ||
2497 | |||
2498 | vcpu->arch.mmu.lm_root = lm_root; | ||
2499 | } | ||
2500 | |||
2501 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); | ||
2502 | } | ||
2503 | |||
2288 | return 0; | 2504 | return 0; |
2289 | } | 2505 | } |
2290 | 2506 | ||
2507 | static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
2508 | { | ||
2509 | if (vcpu->arch.mmu.direct_map) | ||
2510 | return mmu_alloc_direct_roots(vcpu); | ||
2511 | else | ||
2512 | return mmu_alloc_shadow_roots(vcpu); | ||
2513 | } | ||
2514 | |||
2291 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | 2515 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) |
2292 | { | 2516 | { |
2293 | int i; | 2517 | int i; |
2294 | struct kvm_mmu_page *sp; | 2518 | struct kvm_mmu_page *sp; |
2295 | 2519 | ||
2520 | if (vcpu->arch.mmu.direct_map) | ||
2521 | return; | ||
2522 | |||
2296 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2523 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2297 | return; | 2524 | return; |
2298 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | 2525 | |
2526 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); | ||
2527 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
2299 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2528 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2300 | sp = page_header(root); | 2529 | sp = page_header(root); |
2301 | mmu_sync_children(vcpu, sp); | 2530 | mmu_sync_children(vcpu, sp); |
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2310 | mmu_sync_children(vcpu, sp); | 2539 | mmu_sync_children(vcpu, sp); |
2311 | } | 2540 | } |
2312 | } | 2541 | } |
2542 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2313 | } | 2543 | } |
2314 | 2544 | ||
2315 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | 2545 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) |
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
2327 | return vaddr; | 2557 | return vaddr; |
2328 | } | 2558 | } |
2329 | 2559 | ||
2560 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | ||
2561 | u32 access, u32 *error) | ||
2562 | { | ||
2563 | if (error) | ||
2564 | *error = 0; | ||
2565 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | ||
2566 | } | ||
2567 | |||
2330 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2568 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2331 | u32 error_code) | 2569 | u32 error_code) |
2332 | { | 2570 | { |
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) | |||
2393 | mmu_free_roots(vcpu); | 2631 | mmu_free_roots(vcpu); |
2394 | } | 2632 | } |
2395 | 2633 | ||
2396 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | 2634 | static int nonpaging_init_context(struct kvm_vcpu *vcpu, |
2635 | struct kvm_mmu *context) | ||
2397 | { | 2636 | { |
2398 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2399 | |||
2400 | context->new_cr3 = nonpaging_new_cr3; | 2637 | context->new_cr3 = nonpaging_new_cr3; |
2401 | context->page_fault = nonpaging_page_fault; | 2638 | context->page_fault = nonpaging_page_fault; |
2402 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 2639 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
2407 | context->root_level = 0; | 2644 | context->root_level = 0; |
2408 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 2645 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
2409 | context->root_hpa = INVALID_PAGE; | 2646 | context->root_hpa = INVALID_PAGE; |
2647 | context->direct_map = true; | ||
2648 | context->nx = false; | ||
2410 | return 0; | 2649 | return 0; |
2411 | } | 2650 | } |
2412 | 2651 | ||
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) | |||
2422 | mmu_free_roots(vcpu); | 2661 | mmu_free_roots(vcpu); |
2423 | } | 2662 | } |
2424 | 2663 | ||
2425 | static void inject_page_fault(struct kvm_vcpu *vcpu, | 2664 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) |
2426 | u64 addr, | 2665 | { |
2427 | u32 err_code) | 2666 | return vcpu->arch.cr3; |
2667 | } | ||
2668 | |||
2669 | static void inject_page_fault(struct kvm_vcpu *vcpu) | ||
2428 | { | 2670 | { |
2429 | kvm_inject_page_fault(vcpu, addr, err_code); | 2671 | vcpu->arch.mmu.inject_page_fault(vcpu); |
2430 | } | 2672 | } |
2431 | 2673 | ||
2432 | static void paging_free(struct kvm_vcpu *vcpu) | 2674 | static void paging_free(struct kvm_vcpu *vcpu) |
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu) | |||
2434 | nonpaging_free(vcpu); | 2676 | nonpaging_free(vcpu); |
2435 | } | 2677 | } |
2436 | 2678 | ||
2437 | static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | 2679 | static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) |
2438 | { | 2680 | { |
2439 | int bit7; | 2681 | int bit7; |
2440 | 2682 | ||
2441 | bit7 = (gpte >> 7) & 1; | 2683 | bit7 = (gpte >> 7) & 1; |
2442 | return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; | 2684 | return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; |
2443 | } | 2685 | } |
2444 | 2686 | ||
2445 | #define PTTYPE 64 | 2687 | #define PTTYPE 64 |
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) | |||
2450 | #include "paging_tmpl.h" | 2692 | #include "paging_tmpl.h" |
2451 | #undef PTTYPE | 2693 | #undef PTTYPE |
2452 | 2694 | ||
2453 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | 2695 | static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, |
2696 | struct kvm_mmu *context, | ||
2697 | int level) | ||
2454 | { | 2698 | { |
2455 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
2456 | int maxphyaddr = cpuid_maxphyaddr(vcpu); | 2699 | int maxphyaddr = cpuid_maxphyaddr(vcpu); |
2457 | u64 exb_bit_rsvd = 0; | 2700 | u64 exb_bit_rsvd = 0; |
2458 | 2701 | ||
2459 | if (!is_nx(vcpu)) | 2702 | if (!context->nx) |
2460 | exb_bit_rsvd = rsvd_bits(63, 63); | 2703 | exb_bit_rsvd = rsvd_bits(63, 63); |
2461 | switch (level) { | 2704 | switch (level) { |
2462 | case PT32_ROOT_LEVEL: | 2705 | case PT32_ROOT_LEVEL: |
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2511 | } | 2754 | } |
2512 | } | 2755 | } |
2513 | 2756 | ||
2514 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | 2757 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, |
2758 | struct kvm_mmu *context, | ||
2759 | int level) | ||
2515 | { | 2760 | { |
2516 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2761 | context->nx = is_nx(vcpu); |
2762 | |||
2763 | reset_rsvds_bits_mask(vcpu, context, level); | ||
2517 | 2764 | ||
2518 | ASSERT(is_pae(vcpu)); | 2765 | ASSERT(is_pae(vcpu)); |
2519 | context->new_cr3 = paging_new_cr3; | 2766 | context->new_cr3 = paging_new_cr3; |
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
2526 | context->root_level = level; | 2773 | context->root_level = level; |
2527 | context->shadow_root_level = level; | 2774 | context->shadow_root_level = level; |
2528 | context->root_hpa = INVALID_PAGE; | 2775 | context->root_hpa = INVALID_PAGE; |
2776 | context->direct_map = false; | ||
2529 | return 0; | 2777 | return 0; |
2530 | } | 2778 | } |
2531 | 2779 | ||
2532 | static int paging64_init_context(struct kvm_vcpu *vcpu) | 2780 | static int paging64_init_context(struct kvm_vcpu *vcpu, |
2781 | struct kvm_mmu *context) | ||
2533 | { | 2782 | { |
2534 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | 2783 | return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); |
2535 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
2536 | } | 2784 | } |
2537 | 2785 | ||
2538 | static int paging32_init_context(struct kvm_vcpu *vcpu) | 2786 | static int paging32_init_context(struct kvm_vcpu *vcpu, |
2787 | struct kvm_mmu *context) | ||
2539 | { | 2788 | { |
2540 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2789 | context->nx = false; |
2790 | |||
2791 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | ||
2541 | 2792 | ||
2542 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | ||
2543 | context->new_cr3 = paging_new_cr3; | 2793 | context->new_cr3 = paging_new_cr3; |
2544 | context->page_fault = paging32_page_fault; | 2794 | context->page_fault = paging32_page_fault; |
2545 | context->gva_to_gpa = paging32_gva_to_gpa; | 2795 | context->gva_to_gpa = paging32_gva_to_gpa; |
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
2550 | context->root_level = PT32_ROOT_LEVEL; | 2800 | context->root_level = PT32_ROOT_LEVEL; |
2551 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 2801 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
2552 | context->root_hpa = INVALID_PAGE; | 2802 | context->root_hpa = INVALID_PAGE; |
2803 | context->direct_map = false; | ||
2553 | return 0; | 2804 | return 0; |
2554 | } | 2805 | } |
2555 | 2806 | ||
2556 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | 2807 | static int paging32E_init_context(struct kvm_vcpu *vcpu, |
2808 | struct kvm_mmu *context) | ||
2557 | { | 2809 | { |
2558 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | 2810 | return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); |
2559 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
2560 | } | 2811 | } |
2561 | 2812 | ||
2562 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | 2813 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) |
2563 | { | 2814 | { |
2564 | struct kvm_mmu *context = &vcpu->arch.mmu; | 2815 | struct kvm_mmu *context = vcpu->arch.walk_mmu; |
2565 | 2816 | ||
2566 | context->new_cr3 = nonpaging_new_cr3; | 2817 | context->new_cr3 = nonpaging_new_cr3; |
2567 | context->page_fault = tdp_page_fault; | 2818 | context->page_fault = tdp_page_fault; |
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2571 | context->invlpg = nonpaging_invlpg; | 2822 | context->invlpg = nonpaging_invlpg; |
2572 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 2823 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
2573 | context->root_hpa = INVALID_PAGE; | 2824 | context->root_hpa = INVALID_PAGE; |
2825 | context->direct_map = true; | ||
2826 | context->set_cr3 = kvm_x86_ops->set_tdp_cr3; | ||
2827 | context->get_cr3 = get_cr3; | ||
2828 | context->inject_page_fault = kvm_inject_page_fault; | ||
2829 | context->nx = is_nx(vcpu); | ||
2574 | 2830 | ||
2575 | if (!is_paging(vcpu)) { | 2831 | if (!is_paging(vcpu)) { |
2832 | context->nx = false; | ||
2576 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 2833 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
2577 | context->root_level = 0; | 2834 | context->root_level = 0; |
2578 | } else if (is_long_mode(vcpu)) { | 2835 | } else if (is_long_mode(vcpu)) { |
2579 | reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); | 2836 | context->nx = is_nx(vcpu); |
2837 | reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); | ||
2580 | context->gva_to_gpa = paging64_gva_to_gpa; | 2838 | context->gva_to_gpa = paging64_gva_to_gpa; |
2581 | context->root_level = PT64_ROOT_LEVEL; | 2839 | context->root_level = PT64_ROOT_LEVEL; |
2582 | } else if (is_pae(vcpu)) { | 2840 | } else if (is_pae(vcpu)) { |
2583 | reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); | 2841 | context->nx = is_nx(vcpu); |
2842 | reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); | ||
2584 | context->gva_to_gpa = paging64_gva_to_gpa; | 2843 | context->gva_to_gpa = paging64_gva_to_gpa; |
2585 | context->root_level = PT32E_ROOT_LEVEL; | 2844 | context->root_level = PT32E_ROOT_LEVEL; |
2586 | } else { | 2845 | } else { |
2587 | reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); | 2846 | context->nx = false; |
2847 | reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); | ||
2588 | context->gva_to_gpa = paging32_gva_to_gpa; | 2848 | context->gva_to_gpa = paging32_gva_to_gpa; |
2589 | context->root_level = PT32_ROOT_LEVEL; | 2849 | context->root_level = PT32_ROOT_LEVEL; |
2590 | } | 2850 | } |
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2592 | return 0; | 2852 | return 0; |
2593 | } | 2853 | } |
2594 | 2854 | ||
2595 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | 2855 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) |
2596 | { | 2856 | { |
2597 | int r; | 2857 | int r; |
2598 | |||
2599 | ASSERT(vcpu); | 2858 | ASSERT(vcpu); |
2600 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 2859 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
2601 | 2860 | ||
2602 | if (!is_paging(vcpu)) | 2861 | if (!is_paging(vcpu)) |
2603 | r = nonpaging_init_context(vcpu); | 2862 | r = nonpaging_init_context(vcpu, context); |
2604 | else if (is_long_mode(vcpu)) | 2863 | else if (is_long_mode(vcpu)) |
2605 | r = paging64_init_context(vcpu); | 2864 | r = paging64_init_context(vcpu, context); |
2606 | else if (is_pae(vcpu)) | 2865 | else if (is_pae(vcpu)) |
2607 | r = paging32E_init_context(vcpu); | 2866 | r = paging32E_init_context(vcpu, context); |
2608 | else | 2867 | else |
2609 | r = paging32_init_context(vcpu); | 2868 | r = paging32_init_context(vcpu, context); |
2610 | 2869 | ||
2611 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); | 2870 | vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); |
2612 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); | 2871 | vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); |
2613 | 2872 | ||
2614 | return r; | 2873 | return r; |
2615 | } | 2874 | } |
2875 | EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); | ||
2876 | |||
2877 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | ||
2878 | { | ||
2879 | int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); | ||
2880 | |||
2881 | vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; | ||
2882 | vcpu->arch.walk_mmu->get_cr3 = get_cr3; | ||
2883 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | ||
2884 | |||
2885 | return r; | ||
2886 | } | ||
2887 | |||
2888 | static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) | ||
2889 | { | ||
2890 | struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; | ||
2891 | |||
2892 | g_context->get_cr3 = get_cr3; | ||
2893 | g_context->inject_page_fault = kvm_inject_page_fault; | ||
2894 | |||
2895 | /* | ||
2896 | * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The | ||
2897 | * translation of l2_gpa to l1_gpa addresses is done using the | ||
2898 | * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa | ||
2899 | * functions between mmu and nested_mmu are swapped. | ||
2900 | */ | ||
2901 | if (!is_paging(vcpu)) { | ||
2902 | g_context->nx = false; | ||
2903 | g_context->root_level = 0; | ||
2904 | g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; | ||
2905 | } else if (is_long_mode(vcpu)) { | ||
2906 | g_context->nx = is_nx(vcpu); | ||
2907 | reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); | ||
2908 | g_context->root_level = PT64_ROOT_LEVEL; | ||
2909 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | ||
2910 | } else if (is_pae(vcpu)) { | ||
2911 | g_context->nx = is_nx(vcpu); | ||
2912 | reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); | ||
2913 | g_context->root_level = PT32E_ROOT_LEVEL; | ||
2914 | g_context->gva_to_gpa = paging64_gva_to_gpa_nested; | ||
2915 | } else { | ||
2916 | g_context->nx = false; | ||
2917 | reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); | ||
2918 | g_context->root_level = PT32_ROOT_LEVEL; | ||
2919 | g_context->gva_to_gpa = paging32_gva_to_gpa_nested; | ||
2920 | } | ||
2921 | |||
2922 | return 0; | ||
2923 | } | ||
2616 | 2924 | ||
2617 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | 2925 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) |
2618 | { | 2926 | { |
2619 | vcpu->arch.update_pte.pfn = bad_pfn; | 2927 | vcpu->arch.update_pte.pfn = bad_pfn; |
2620 | 2928 | ||
2621 | if (tdp_enabled) | 2929 | if (mmu_is_nested(vcpu)) |
2930 | return init_kvm_nested_mmu(vcpu); | ||
2931 | else if (tdp_enabled) | ||
2622 | return init_kvm_tdp_mmu(vcpu); | 2932 | return init_kvm_tdp_mmu(vcpu); |
2623 | else | 2933 | else |
2624 | return init_kvm_softmmu(vcpu); | 2934 | return init_kvm_softmmu(vcpu); |
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
2653 | if (r) | 2963 | if (r) |
2654 | goto out; | 2964 | goto out; |
2655 | /* set_cr3() should ensure TLB has been flushed */ | 2965 | /* set_cr3() should ensure TLB has been flushed */ |
2656 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2966 | vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
2657 | out: | 2967 | out: |
2658 | return r; | 2968 | return r; |
2659 | } | 2969 | } |
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) | |||
2663 | { | 2973 | { |
2664 | mmu_free_roots(vcpu); | 2974 | mmu_free_roots(vcpu); |
2665 | } | 2975 | } |
2976 | EXPORT_SYMBOL_GPL(kvm_mmu_unload); | ||
2666 | 2977 | ||
2667 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | 2978 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, |
2668 | struct kvm_mmu_page *sp, | 2979 | struct kvm_mmu_page *sp, |
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
2695 | return; | 3006 | return; |
2696 | } | 3007 | } |
2697 | 3008 | ||
2698 | if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) | 3009 | if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) |
2699 | return; | 3010 | return; |
2700 | 3011 | ||
2701 | ++vcpu->kvm->stat.mmu_pte_updated; | 3012 | ++vcpu->kvm->stat.mmu_pte_updated; |
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2837 | kvm_mmu_access_page(vcpu, gfn); | 3148 | kvm_mmu_access_page(vcpu, gfn); |
2838 | kvm_mmu_free_some_pages(vcpu); | 3149 | kvm_mmu_free_some_pages(vcpu); |
2839 | ++vcpu->kvm->stat.mmu_pte_write; | 3150 | ++vcpu->kvm->stat.mmu_pte_write; |
2840 | kvm_mmu_audit(vcpu, "pre pte write"); | 3151 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); |
2841 | if (guest_initiated) { | 3152 | if (guest_initiated) { |
2842 | if (gfn == vcpu->arch.last_pt_write_gfn | 3153 | if (gfn == vcpu->arch.last_pt_write_gfn |
2843 | && !last_updated_pte_accessed(vcpu)) { | 3154 | && !last_updated_pte_accessed(vcpu)) { |
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2910 | } | 3221 | } |
2911 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); | 3222 | mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); |
2912 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 3223 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2913 | kvm_mmu_audit(vcpu, "post pte write"); | 3224 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); |
2914 | spin_unlock(&vcpu->kvm->mmu_lock); | 3225 | spin_unlock(&vcpu->kvm->mmu_lock); |
2915 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { | 3226 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { |
2916 | kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); | 3227 | kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); |
@@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
2923 | gpa_t gpa; | 3234 | gpa_t gpa; |
2924 | int r; | 3235 | int r; |
2925 | 3236 | ||
2926 | if (tdp_enabled) | 3237 | if (vcpu->arch.mmu.direct_map) |
2927 | return 0; | 3238 | return 0; |
2928 | 3239 | ||
2929 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); | 3240 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); |
@@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |||
2937 | 3248 | ||
2938 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 3249 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
2939 | { | 3250 | { |
2940 | int free_pages; | ||
2941 | LIST_HEAD(invalid_list); | 3251 | LIST_HEAD(invalid_list); |
2942 | 3252 | ||
2943 | free_pages = vcpu->kvm->arch.n_free_mmu_pages; | 3253 | while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && |
2944 | while (free_pages < KVM_REFILL_PAGES && | ||
2945 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | 3254 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { |
2946 | struct kvm_mmu_page *sp; | 3255 | struct kvm_mmu_page *sp; |
2947 | 3256 | ||
2948 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 3257 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
2949 | struct kvm_mmu_page, link); | 3258 | struct kvm_mmu_page, link); |
2950 | free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, | 3259 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
2951 | &invalid_list); | 3260 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2952 | ++vcpu->kvm->stat.mmu_recycled; | 3261 | ++vcpu->kvm->stat.mmu_recycled; |
2953 | } | 3262 | } |
2954 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | ||
2955 | } | 3263 | } |
2956 | 3264 | ||
2957 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | 3265 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) |
@@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); | |||
3013 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | 3321 | static void free_mmu_pages(struct kvm_vcpu *vcpu) |
3014 | { | 3322 | { |
3015 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | 3323 | free_page((unsigned long)vcpu->arch.mmu.pae_root); |
3324 | if (vcpu->arch.mmu.lm_root != NULL) | ||
3325 | free_page((unsigned long)vcpu->arch.mmu.lm_root); | ||
3016 | } | 3326 | } |
3017 | 3327 | ||
3018 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | 3328 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) |
@@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) | |||
3054 | return init_kvm_mmu(vcpu); | 3364 | return init_kvm_mmu(vcpu); |
3055 | } | 3365 | } |
3056 | 3366 | ||
3057 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
3058 | { | ||
3059 | ASSERT(vcpu); | ||
3060 | |||
3061 | destroy_kvm_mmu(vcpu); | ||
3062 | free_mmu_pages(vcpu); | ||
3063 | mmu_free_memory_caches(vcpu); | ||
3064 | } | ||
3065 | |||
3066 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | 3367 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) |
3067 | { | 3368 | { |
3068 | struct kvm_mmu_page *sp; | 3369 | struct kvm_mmu_page *sp; |
@@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | |||
3112 | { | 3413 | { |
3113 | struct kvm *kvm; | 3414 | struct kvm *kvm; |
3114 | struct kvm *kvm_freed = NULL; | 3415 | struct kvm *kvm_freed = NULL; |
3115 | int cache_count = 0; | 3416 | |
3417 | if (nr_to_scan == 0) | ||
3418 | goto out; | ||
3116 | 3419 | ||
3117 | spin_lock(&kvm_lock); | 3420 | spin_lock(&kvm_lock); |
3118 | 3421 | ||
3119 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3422 | list_for_each_entry(kvm, &vm_list, vm_list) { |
3120 | int npages, idx, freed_pages; | 3423 | int idx, freed_pages; |
3121 | LIST_HEAD(invalid_list); | 3424 | LIST_HEAD(invalid_list); |
3122 | 3425 | ||
3123 | idx = srcu_read_lock(&kvm->srcu); | 3426 | idx = srcu_read_lock(&kvm->srcu); |
3124 | spin_lock(&kvm->mmu_lock); | 3427 | spin_lock(&kvm->mmu_lock); |
3125 | npages = kvm->arch.n_alloc_mmu_pages - | 3428 | if (!kvm_freed && nr_to_scan > 0 && |
3126 | kvm->arch.n_free_mmu_pages; | 3429 | kvm->arch.n_used_mmu_pages > 0) { |
3127 | cache_count += npages; | ||
3128 | if (!kvm_freed && nr_to_scan > 0 && npages > 0) { | ||
3129 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, | 3430 | freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, |
3130 | &invalid_list); | 3431 | &invalid_list); |
3131 | cache_count -= freed_pages; | ||
3132 | kvm_freed = kvm; | 3432 | kvm_freed = kvm; |
3133 | } | 3433 | } |
3134 | nr_to_scan--; | 3434 | nr_to_scan--; |
@@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) | |||
3142 | 3442 | ||
3143 | spin_unlock(&kvm_lock); | 3443 | spin_unlock(&kvm_lock); |
3144 | 3444 | ||
3145 | return cache_count; | 3445 | out: |
3446 | return percpu_counter_read_positive(&kvm_total_used_mmu_pages); | ||
3146 | } | 3447 | } |
3147 | 3448 | ||
3148 | static struct shrinker mmu_shrinker = { | 3449 | static struct shrinker mmu_shrinker = { |
@@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void) | |||
3163 | void kvm_mmu_module_exit(void) | 3464 | void kvm_mmu_module_exit(void) |
3164 | { | 3465 | { |
3165 | mmu_destroy_caches(); | 3466 | mmu_destroy_caches(); |
3467 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | ||
3166 | unregister_shrinker(&mmu_shrinker); | 3468 | unregister_shrinker(&mmu_shrinker); |
3167 | } | 3469 | } |
3168 | 3470 | ||
@@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void) | |||
3185 | if (!mmu_page_header_cache) | 3487 | if (!mmu_page_header_cache) |
3186 | goto nomem; | 3488 | goto nomem; |
3187 | 3489 | ||
3490 | if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) | ||
3491 | goto nomem; | ||
3492 | |||
3188 | register_shrinker(&mmu_shrinker); | 3493 | register_shrinker(&mmu_shrinker); |
3189 | 3494 | ||
3190 | return 0; | 3495 | return 0; |
@@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | |||
3355 | } | 3660 | } |
3356 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | 3661 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); |
3357 | 3662 | ||
3358 | #ifdef AUDIT | 3663 | #ifdef CONFIG_KVM_MMU_AUDIT |
3359 | 3664 | #include "mmu_audit.c" | |
3360 | static const char *audit_msg; | 3665 | #else |
3361 | 3666 | static void mmu_audit_disable(void) { } | |
3362 | static gva_t canonicalize(gva_t gva) | ||
3363 | { | ||
3364 | #ifdef CONFIG_X86_64 | ||
3365 | gva = (long long)(gva << 16) >> 16; | ||
3366 | #endif | 3667 | #endif |
3367 | return gva; | ||
3368 | } | ||
3369 | |||
3370 | |||
3371 | typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); | ||
3372 | |||
3373 | static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
3374 | inspect_spte_fn fn) | ||
3375 | { | ||
3376 | int i; | ||
3377 | |||
3378 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3379 | u64 ent = sp->spt[i]; | ||
3380 | |||
3381 | if (is_shadow_present_pte(ent)) { | ||
3382 | if (!is_last_spte(ent, sp->role.level)) { | ||
3383 | struct kvm_mmu_page *child; | ||
3384 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
3385 | __mmu_spte_walk(kvm, child, fn); | ||
3386 | } else | ||
3387 | fn(kvm, &sp->spt[i]); | ||
3388 | } | ||
3389 | } | ||
3390 | } | ||
3391 | |||
3392 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
3393 | { | ||
3394 | int i; | ||
3395 | struct kvm_mmu_page *sp; | ||
3396 | |||
3397 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
3398 | return; | ||
3399 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
3400 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
3401 | sp = page_header(root); | ||
3402 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3403 | return; | ||
3404 | } | ||
3405 | for (i = 0; i < 4; ++i) { | ||
3406 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
3407 | |||
3408 | if (root && VALID_PAGE(root)) { | ||
3409 | root &= PT64_BASE_ADDR_MASK; | ||
3410 | sp = page_header(root); | ||
3411 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3412 | } | ||
3413 | } | ||
3414 | return; | ||
3415 | } | ||
3416 | |||
3417 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
3418 | gva_t va, int level) | ||
3419 | { | ||
3420 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
3421 | int i; | ||
3422 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
3423 | |||
3424 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
3425 | u64 ent = pt[i]; | ||
3426 | |||
3427 | if (ent == shadow_trap_nonpresent_pte) | ||
3428 | continue; | ||
3429 | |||
3430 | va = canonicalize(va); | ||
3431 | if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) | ||
3432 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
3433 | else { | ||
3434 | gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); | ||
3435 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
3436 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
3437 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; | ||
3438 | 3668 | ||
3439 | if (is_error_pfn(pfn)) { | 3669 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) |
3440 | kvm_release_pfn_clean(pfn); | ||
3441 | continue; | ||
3442 | } | ||
3443 | |||
3444 | if (is_shadow_present_pte(ent) | ||
3445 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
3446 | printk(KERN_ERR "xx audit error: (%s) levels %d" | ||
3447 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", | ||
3448 | audit_msg, vcpu->arch.mmu.root_level, | ||
3449 | va, gpa, hpa, ent, | ||
3450 | is_shadow_present_pte(ent)); | ||
3451 | else if (ent == shadow_notrap_nonpresent_pte | ||
3452 | && !is_error_hpa(hpa)) | ||
3453 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
3454 | " valid guest gva %lx\n", audit_msg, va); | ||
3455 | kvm_release_pfn_clean(pfn); | ||
3456 | |||
3457 | } | ||
3458 | } | ||
3459 | } | ||
3460 | |||
3461 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
3462 | { | ||
3463 | unsigned i; | ||
3464 | |||
3465 | if (vcpu->arch.mmu.root_level == 4) | ||
3466 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
3467 | else | ||
3468 | for (i = 0; i < 4; ++i) | ||
3469 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
3470 | audit_mappings_page(vcpu, | ||
3471 | vcpu->arch.mmu.pae_root[i], | ||
3472 | i << 30, | ||
3473 | 2); | ||
3474 | } | ||
3475 | |||
3476 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
3477 | { | ||
3478 | struct kvm *kvm = vcpu->kvm; | ||
3479 | struct kvm_memslots *slots; | ||
3480 | int nmaps = 0; | ||
3481 | int i, j, k, idx; | ||
3482 | |||
3483 | idx = srcu_read_lock(&kvm->srcu); | ||
3484 | slots = kvm_memslots(kvm); | ||
3485 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
3486 | struct kvm_memory_slot *m = &slots->memslots[i]; | ||
3487 | struct kvm_rmap_desc *d; | ||
3488 | |||
3489 | for (j = 0; j < m->npages; ++j) { | ||
3490 | unsigned long *rmapp = &m->rmap[j]; | ||
3491 | |||
3492 | if (!*rmapp) | ||
3493 | continue; | ||
3494 | if (!(*rmapp & 1)) { | ||
3495 | ++nmaps; | ||
3496 | continue; | ||
3497 | } | ||
3498 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
3499 | while (d) { | ||
3500 | for (k = 0; k < RMAP_EXT; ++k) | ||
3501 | if (d->sptes[k]) | ||
3502 | ++nmaps; | ||
3503 | else | ||
3504 | break; | ||
3505 | d = d->more; | ||
3506 | } | ||
3507 | } | ||
3508 | } | ||
3509 | srcu_read_unlock(&kvm->srcu, idx); | ||
3510 | return nmaps; | ||
3511 | } | ||
3512 | |||
3513 | void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | ||
3514 | { | ||
3515 | unsigned long *rmapp; | ||
3516 | struct kvm_mmu_page *rev_sp; | ||
3517 | gfn_t gfn; | ||
3518 | |||
3519 | if (is_writable_pte(*sptep)) { | ||
3520 | rev_sp = page_header(__pa(sptep)); | ||
3521 | gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); | ||
3522 | |||
3523 | if (!gfn_to_memslot(kvm, gfn)) { | ||
3524 | if (!printk_ratelimit()) | ||
3525 | return; | ||
3526 | printk(KERN_ERR "%s: no memslot for gfn %ld\n", | ||
3527 | audit_msg, gfn); | ||
3528 | printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", | ||
3529 | audit_msg, (long int)(sptep - rev_sp->spt), | ||
3530 | rev_sp->gfn); | ||
3531 | dump_stack(); | ||
3532 | return; | ||
3533 | } | ||
3534 | |||
3535 | rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); | ||
3536 | if (!*rmapp) { | ||
3537 | if (!printk_ratelimit()) | ||
3538 | return; | ||
3539 | printk(KERN_ERR "%s: no rmap for writable spte %llx\n", | ||
3540 | audit_msg, *sptep); | ||
3541 | dump_stack(); | ||
3542 | } | ||
3543 | } | ||
3544 | |||
3545 | } | ||
3546 | |||
3547 | void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) | ||
3548 | { | ||
3549 | mmu_spte_walk(vcpu, inspect_spte_has_rmap); | ||
3550 | } | ||
3551 | |||
3552 | static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) | ||
3553 | { | ||
3554 | struct kvm_mmu_page *sp; | ||
3555 | int i; | ||
3556 | |||
3557 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
3558 | u64 *pt = sp->spt; | ||
3559 | |||
3560 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
3561 | continue; | ||
3562 | |||
3563 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3564 | u64 ent = pt[i]; | ||
3565 | |||
3566 | if (!(ent & PT_PRESENT_MASK)) | ||
3567 | continue; | ||
3568 | if (!is_writable_pte(ent)) | ||
3569 | continue; | ||
3570 | inspect_spte_has_rmap(vcpu->kvm, &pt[i]); | ||
3571 | } | ||
3572 | } | ||
3573 | return; | ||
3574 | } | ||
3575 | |||
3576 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
3577 | { | ||
3578 | check_writable_mappings_rmap(vcpu); | ||
3579 | count_rmaps(vcpu); | ||
3580 | } | ||
3581 | |||
3582 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
3583 | { | ||
3584 | struct kvm_mmu_page *sp; | ||
3585 | struct kvm_memory_slot *slot; | ||
3586 | unsigned long *rmapp; | ||
3587 | u64 *spte; | ||
3588 | gfn_t gfn; | ||
3589 | |||
3590 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
3591 | if (sp->role.direct) | ||
3592 | continue; | ||
3593 | if (sp->unsync) | ||
3594 | continue; | ||
3595 | |||
3596 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
3597 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | ||
3598 | |||
3599 | spte = rmap_next(vcpu->kvm, rmapp, NULL); | ||
3600 | while (spte) { | ||
3601 | if (is_writable_pte(*spte)) | ||
3602 | printk(KERN_ERR "%s: (%s) shadow page has " | ||
3603 | "writable mappings: gfn %lx role %x\n", | ||
3604 | __func__, audit_msg, sp->gfn, | ||
3605 | sp->role.word); | ||
3606 | spte = rmap_next(vcpu->kvm, rmapp, spte); | ||
3607 | } | ||
3608 | } | ||
3609 | } | ||
3610 | |||
3611 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
3612 | { | 3670 | { |
3613 | int olddbg = dbg; | 3671 | ASSERT(vcpu); |
3614 | 3672 | ||
3615 | dbg = 0; | 3673 | destroy_kvm_mmu(vcpu); |
3616 | audit_msg = msg; | 3674 | free_mmu_pages(vcpu); |
3617 | audit_rmap(vcpu); | 3675 | mmu_free_memory_caches(vcpu); |
3618 | audit_write_protection(vcpu); | 3676 | mmu_audit_disable(); |
3619 | if (strcmp("pre pte write", audit_msg) != 0) | ||
3620 | audit_mappings(vcpu); | ||
3621 | audit_writable_sptes_have_rmaps(vcpu); | ||
3622 | dbg = olddbg; | ||
3623 | } | 3677 | } |
3624 | |||
3625 | #endif | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index be66759321a5..7086ca85d3e7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -49,10 +49,17 @@ | |||
49 | #define PFERR_FETCH_MASK (1U << 4) | 49 | #define PFERR_FETCH_MASK (1U << 4) |
50 | 50 | ||
51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | 51 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); |
52 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | ||
53 | |||
54 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) | ||
55 | { | ||
56 | return kvm->arch.n_max_mmu_pages - | ||
57 | kvm->arch.n_used_mmu_pages; | ||
58 | } | ||
52 | 59 | ||
53 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 60 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
54 | { | 61 | { |
55 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | 62 | if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) |
56 | __kvm_mmu_free_some_pages(vcpu); | 63 | __kvm_mmu_free_some_pages(vcpu); |
57 | } | 64 | } |
58 | 65 | ||
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c new file mode 100644 index 000000000000..ba2bcdde6221 --- /dev/null +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -0,0 +1,299 @@ | |||
1 | /* | ||
2 | * mmu_audit.c: | ||
3 | * | ||
4 | * Audit code for KVM MMU | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Yaniv Kamay <yaniv@qumranet.com> | ||
11 | * Avi Kivity <avi@qumranet.com> | ||
12 | * Marcelo Tosatti <mtosatti@redhat.com> | ||
13 | * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include <linux/ratelimit.h> | ||
21 | |||
22 | static int audit_point; | ||
23 | |||
24 | #define audit_printk(fmt, args...) \ | ||
25 | printk(KERN_ERR "audit: (%s) error: " \ | ||
26 | fmt, audit_point_name[audit_point], ##args) | ||
27 | |||
28 | typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); | ||
29 | |||
30 | static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
31 | inspect_spte_fn fn, int level) | ||
32 | { | ||
33 | int i; | ||
34 | |||
35 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
36 | u64 *ent = sp->spt; | ||
37 | |||
38 | fn(vcpu, ent + i, level); | ||
39 | |||
40 | if (is_shadow_present_pte(ent[i]) && | ||
41 | !is_last_spte(ent[i], level)) { | ||
42 | struct kvm_mmu_page *child; | ||
43 | |||
44 | child = page_header(ent[i] & PT64_BASE_ADDR_MASK); | ||
45 | __mmu_spte_walk(vcpu, child, fn, level - 1); | ||
46 | } | ||
47 | } | ||
48 | } | ||
49 | |||
50 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
51 | { | ||
52 | int i; | ||
53 | struct kvm_mmu_page *sp; | ||
54 | |||
55 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
56 | return; | ||
57 | |||
58 | if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { | ||
59 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
60 | |||
61 | sp = page_header(root); | ||
62 | __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); | ||
63 | return; | ||
64 | } | ||
65 | |||
66 | for (i = 0; i < 4; ++i) { | ||
67 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
68 | |||
69 | if (root && VALID_PAGE(root)) { | ||
70 | root &= PT64_BASE_ADDR_MASK; | ||
71 | sp = page_header(root); | ||
72 | __mmu_spte_walk(vcpu, sp, fn, 2); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | return; | ||
77 | } | ||
78 | |||
79 | typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); | ||
80 | |||
81 | static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) | ||
82 | { | ||
83 | struct kvm_mmu_page *sp; | ||
84 | |||
85 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) | ||
86 | fn(kvm, sp); | ||
87 | } | ||
88 | |||
89 | static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
90 | { | ||
91 | struct kvm_mmu_page *sp; | ||
92 | gfn_t gfn; | ||
93 | pfn_t pfn; | ||
94 | hpa_t hpa; | ||
95 | |||
96 | sp = page_header(__pa(sptep)); | ||
97 | |||
98 | if (sp->unsync) { | ||
99 | if (level != PT_PAGE_TABLE_LEVEL) { | ||
100 | audit_printk("unsync sp: %p level = %d\n", sp, level); | ||
101 | return; | ||
102 | } | ||
103 | |||
104 | if (*sptep == shadow_notrap_nonpresent_pte) { | ||
105 | audit_printk("notrap spte in unsync sp: %p\n", sp); | ||
106 | return; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { | ||
111 | audit_printk("notrap spte in direct sp: %p\n", sp); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) | ||
116 | return; | ||
117 | |||
118 | gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); | ||
119 | pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); | ||
120 | |||
121 | if (is_error_pfn(pfn)) { | ||
122 | kvm_release_pfn_clean(pfn); | ||
123 | return; | ||
124 | } | ||
125 | |||
126 | hpa = pfn << PAGE_SHIFT; | ||
127 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) | ||
128 | audit_printk("levels %d pfn %llx hpa %llx ent %llxn", | ||
129 | vcpu->arch.mmu.root_level, pfn, hpa, *sptep); | ||
130 | } | ||
131 | |||
132 | static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | ||
133 | { | ||
134 | unsigned long *rmapp; | ||
135 | struct kvm_mmu_page *rev_sp; | ||
136 | gfn_t gfn; | ||
137 | |||
138 | |||
139 | rev_sp = page_header(__pa(sptep)); | ||
140 | gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); | ||
141 | |||
142 | if (!gfn_to_memslot(kvm, gfn)) { | ||
143 | if (!printk_ratelimit()) | ||
144 | return; | ||
145 | audit_printk("no memslot for gfn %llx\n", gfn); | ||
146 | audit_printk("index %ld of sp (gfn=%llx)\n", | ||
147 | (long int)(sptep - rev_sp->spt), rev_sp->gfn); | ||
148 | dump_stack(); | ||
149 | return; | ||
150 | } | ||
151 | |||
152 | rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); | ||
153 | if (!*rmapp) { | ||
154 | if (!printk_ratelimit()) | ||
155 | return; | ||
156 | audit_printk("no rmap for writable spte %llx\n", *sptep); | ||
157 | dump_stack(); | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
162 | { | ||
163 | if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) | ||
164 | inspect_spte_has_rmap(vcpu->kvm, sptep); | ||
165 | } | ||
166 | |||
167 | static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
168 | { | ||
169 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | ||
170 | |||
171 | if (audit_point == AUDIT_POST_SYNC && sp->unsync) | ||
172 | audit_printk("meet unsync sp(%p) after sync root.\n", sp); | ||
173 | } | ||
174 | |||
175 | static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
176 | { | ||
177 | int i; | ||
178 | |||
179 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
180 | return; | ||
181 | |||
182 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
183 | if (!is_rmap_spte(sp->spt[i])) | ||
184 | continue; | ||
185 | |||
186 | inspect_spte_has_rmap(kvm, sp->spt + i); | ||
187 | } | ||
188 | } | ||
189 | |||
190 | static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
191 | { | ||
192 | struct kvm_memory_slot *slot; | ||
193 | unsigned long *rmapp; | ||
194 | u64 *spte; | ||
195 | |||
196 | if (sp->role.direct || sp->unsync || sp->role.invalid) | ||
197 | return; | ||
198 | |||
199 | slot = gfn_to_memslot(kvm, sp->gfn); | ||
200 | rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; | ||
201 | |||
202 | spte = rmap_next(kvm, rmapp, NULL); | ||
203 | while (spte) { | ||
204 | if (is_writable_pte(*spte)) | ||
205 | audit_printk("shadow page has writable mappings: gfn " | ||
206 | "%llx role %x\n", sp->gfn, sp->role.word); | ||
207 | spte = rmap_next(kvm, rmapp, spte); | ||
208 | } | ||
209 | } | ||
210 | |||
/*
 * Per-shadow-page audit: rmap presence check first, then the
 * write-protection check.
 */
static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	check_mappings_rmap(kvm, sp);

	audit_write_protection(kvm, sp);
}
216 | |||
217 | static void audit_all_active_sps(struct kvm *kvm) | ||
218 | { | ||
219 | walk_all_active_sps(kvm, audit_sp); | ||
220 | } | ||
221 | |||
222 | static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) | ||
223 | { | ||
224 | audit_sptes_have_rmaps(vcpu, sptep, level); | ||
225 | audit_mappings(vcpu, sptep, level); | ||
226 | audit_spte_after_sync(vcpu, sptep, level); | ||
227 | } | ||
228 | |||
229 | static void audit_vcpu_spte(struct kvm_vcpu *vcpu) | ||
230 | { | ||
231 | mmu_spte_walk(vcpu, audit_spte); | ||
232 | } | ||
233 | |||
234 | static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) | ||
235 | { | ||
236 | static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); | ||
237 | |||
238 | if (!__ratelimit(&ratelimit_state)) | ||
239 | return; | ||
240 | |||
241 | audit_point = point; | ||
242 | audit_all_active_sps(vcpu->kvm); | ||
243 | audit_vcpu_spte(vcpu); | ||
244 | } | ||
245 | |||
246 | static bool mmu_audit; | ||
247 | |||
248 | static void mmu_audit_enable(void) | ||
249 | { | ||
250 | int ret; | ||
251 | |||
252 | if (mmu_audit) | ||
253 | return; | ||
254 | |||
255 | ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); | ||
256 | WARN_ON(ret); | ||
257 | |||
258 | mmu_audit = true; | ||
259 | } | ||
260 | |||
261 | static void mmu_audit_disable(void) | ||
262 | { | ||
263 | if (!mmu_audit) | ||
264 | return; | ||
265 | |||
266 | unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); | ||
267 | tracepoint_synchronize_unregister(); | ||
268 | mmu_audit = false; | ||
269 | } | ||
270 | |||
271 | static int mmu_audit_set(const char *val, const struct kernel_param *kp) | ||
272 | { | ||
273 | int ret; | ||
274 | unsigned long enable; | ||
275 | |||
276 | ret = strict_strtoul(val, 10, &enable); | ||
277 | if (ret < 0) | ||
278 | return -EINVAL; | ||
279 | |||
280 | switch (enable) { | ||
281 | case 0: | ||
282 | mmu_audit_disable(); | ||
283 | break; | ||
284 | case 1: | ||
285 | mmu_audit_enable(); | ||
286 | break; | ||
287 | default: | ||
288 | return -EINVAL; | ||
289 | } | ||
290 | |||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | static struct kernel_param_ops audit_param_ops = { | ||
295 | .set = mmu_audit_set, | ||
296 | .get = param_get_bool, | ||
297 | }; | ||
298 | |||
299 | module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); | ||
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3aab0f0930ef..b60b4fdb3eda 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, | |||
195 | 195 | ||
196 | TP_ARGS(sp) | 196 | TP_ARGS(sp) |
197 | ); | 197 | ); |
198 | |||
199 | TRACE_EVENT( | ||
200 | kvm_mmu_audit, | ||
201 | TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), | ||
202 | TP_ARGS(vcpu, audit_point), | ||
203 | |||
204 | TP_STRUCT__entry( | ||
205 | __field(struct kvm_vcpu *, vcpu) | ||
206 | __field(int, audit_point) | ||
207 | ), | ||
208 | |||
209 | TP_fast_assign( | ||
210 | __entry->vcpu = vcpu; | ||
211 | __entry->audit_point = audit_point; | ||
212 | ), | ||
213 | |||
214 | TP_printk("vcpu:%d %s", __entry->vcpu->cpu, | ||
215 | audit_point_name[__entry->audit_point]) | ||
216 | ); | ||
198 | #endif /* _TRACE_KVMMMU_H */ | 217 | #endif /* _TRACE_KVMMMU_H */ |
199 | 218 | ||
200 | #undef TRACE_INCLUDE_PATH | 219 | #undef TRACE_INCLUDE_PATH |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51ef9097960d..cd7a833a3b52 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -7,7 +7,7 @@ | |||
7 | * MMU support | 7 | * MMU support |
8 | * | 8 | * |
9 | * Copyright (C) 2006 Qumranet, Inc. | 9 | * Copyright (C) 2006 Qumranet, Inc. |
10 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 10 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
11 | * | 11 | * |
12 | * Authors: | 12 | * Authors: |
13 | * Yaniv Kamay <yaniv@qumranet.com> | 13 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -67,6 +67,7 @@ struct guest_walker { | |||
67 | int level; | 67 | int level; |
68 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | 68 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; |
69 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | 69 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; |
70 | pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; | ||
70 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | 71 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; |
71 | unsigned pt_access; | 72 | unsigned pt_access; |
72 | unsigned pte_access; | 73 | unsigned pte_access; |
@@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | |||
104 | 105 | ||
105 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | 106 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; |
106 | #if PTTYPE == 64 | 107 | #if PTTYPE == 64 |
107 | if (is_nx(vcpu)) | 108 | if (vcpu->arch.mmu.nx) |
108 | access &= ~(gpte >> PT64_NX_SHIFT); | 109 | access &= ~(gpte >> PT64_NX_SHIFT); |
109 | #endif | 110 | #endif |
110 | return access; | 111 | return access; |
@@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | |||
113 | /* | 114 | /* |
114 | * Fetch a guest pte for a guest virtual address | 115 | * Fetch a guest pte for a guest virtual address |
115 | */ | 116 | */ |
116 | static int FNAME(walk_addr)(struct guest_walker *walker, | 117 | static int FNAME(walk_addr_generic)(struct guest_walker *walker, |
117 | struct kvm_vcpu *vcpu, gva_t addr, | 118 | struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
118 | int write_fault, int user_fault, int fetch_fault) | 119 | gva_t addr, u32 access) |
119 | { | 120 | { |
120 | pt_element_t pte; | 121 | pt_element_t pte; |
121 | gfn_t table_gfn; | 122 | gfn_t table_gfn; |
122 | unsigned index, pt_access, uninitialized_var(pte_access); | 123 | unsigned index, pt_access, uninitialized_var(pte_access); |
123 | gpa_t pte_gpa; | 124 | gpa_t pte_gpa; |
124 | bool eperm, present, rsvd_fault; | 125 | bool eperm, present, rsvd_fault; |
126 | int offset, write_fault, user_fault, fetch_fault; | ||
127 | |||
128 | write_fault = access & PFERR_WRITE_MASK; | ||
129 | user_fault = access & PFERR_USER_MASK; | ||
130 | fetch_fault = access & PFERR_FETCH_MASK; | ||
125 | 131 | ||
126 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, | 132 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
127 | fetch_fault); | 133 | fetch_fault); |
128 | walk: | 134 | walk: |
129 | present = true; | 135 | present = true; |
130 | eperm = rsvd_fault = false; | 136 | eperm = rsvd_fault = false; |
131 | walker->level = vcpu->arch.mmu.root_level; | 137 | walker->level = mmu->root_level; |
132 | pte = vcpu->arch.cr3; | 138 | pte = mmu->get_cr3(vcpu); |
139 | |||
133 | #if PTTYPE == 64 | 140 | #if PTTYPE == 64 |
134 | if (!is_long_mode(vcpu)) { | 141 | if (walker->level == PT32E_ROOT_LEVEL) { |
135 | pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); | 142 | pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); |
136 | trace_kvm_mmu_paging_element(pte, walker->level); | 143 | trace_kvm_mmu_paging_element(pte, walker->level); |
137 | if (!is_present_gpte(pte)) { | 144 | if (!is_present_gpte(pte)) { |
138 | present = false; | 145 | present = false; |
@@ -142,7 +149,7 @@ walk: | |||
142 | } | 149 | } |
143 | #endif | 150 | #endif |
144 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | 151 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || |
145 | (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | 152 | (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); |
146 | 153 | ||
147 | pt_access = ACC_ALL; | 154 | pt_access = ACC_ALL; |
148 | 155 | ||
@@ -150,12 +157,14 @@ walk: | |||
150 | index = PT_INDEX(addr, walker->level); | 157 | index = PT_INDEX(addr, walker->level); |
151 | 158 | ||
152 | table_gfn = gpte_to_gfn(pte); | 159 | table_gfn = gpte_to_gfn(pte); |
153 | pte_gpa = gfn_to_gpa(table_gfn); | 160 | offset = index * sizeof(pt_element_t); |
154 | pte_gpa += index * sizeof(pt_element_t); | 161 | pte_gpa = gfn_to_gpa(table_gfn) + offset; |
155 | walker->table_gfn[walker->level - 1] = table_gfn; | 162 | walker->table_gfn[walker->level - 1] = table_gfn; |
156 | walker->pte_gpa[walker->level - 1] = pte_gpa; | 163 | walker->pte_gpa[walker->level - 1] = pte_gpa; |
157 | 164 | ||
158 | if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { | 165 | if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, |
166 | offset, sizeof(pte), | ||
167 | PFERR_USER_MASK|PFERR_WRITE_MASK)) { | ||
159 | present = false; | 168 | present = false; |
160 | break; | 169 | break; |
161 | } | 170 | } |
@@ -167,7 +176,7 @@ walk: | |||
167 | break; | 176 | break; |
168 | } | 177 | } |
169 | 178 | ||
170 | if (is_rsvd_bits_set(vcpu, pte, walker->level)) { | 179 | if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { |
171 | rsvd_fault = true; | 180 | rsvd_fault = true; |
172 | break; | 181 | break; |
173 | } | 182 | } |
@@ -204,17 +213,28 @@ walk: | |||
204 | (PTTYPE == 64 || is_pse(vcpu))) || | 213 | (PTTYPE == 64 || is_pse(vcpu))) || |
205 | ((walker->level == PT_PDPE_LEVEL) && | 214 | ((walker->level == PT_PDPE_LEVEL) && |
206 | is_large_pte(pte) && | 215 | is_large_pte(pte) && |
207 | is_long_mode(vcpu))) { | 216 | mmu->root_level == PT64_ROOT_LEVEL)) { |
208 | int lvl = walker->level; | 217 | int lvl = walker->level; |
218 | gpa_t real_gpa; | ||
219 | gfn_t gfn; | ||
220 | u32 ac; | ||
209 | 221 | ||
210 | walker->gfn = gpte_to_gfn_lvl(pte, lvl); | 222 | gfn = gpte_to_gfn_lvl(pte, lvl); |
211 | walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) | 223 | gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; |
212 | >> PAGE_SHIFT; | ||
213 | 224 | ||
214 | if (PTTYPE == 32 && | 225 | if (PTTYPE == 32 && |
215 | walker->level == PT_DIRECTORY_LEVEL && | 226 | walker->level == PT_DIRECTORY_LEVEL && |
216 | is_cpuid_PSE36()) | 227 | is_cpuid_PSE36()) |
217 | walker->gfn += pse36_gfn_delta(pte); | 228 | gfn += pse36_gfn_delta(pte); |
229 | |||
230 | ac = write_fault | fetch_fault | user_fault; | ||
231 | |||
232 | real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), | ||
233 | ac); | ||
234 | if (real_gpa == UNMAPPED_GVA) | ||
235 | return 0; | ||
236 | |||
237 | walker->gfn = real_gpa >> PAGE_SHIFT; | ||
218 | 238 | ||
219 | break; | 239 | break; |
220 | } | 240 | } |
@@ -249,18 +269,36 @@ error: | |||
249 | walker->error_code = 0; | 269 | walker->error_code = 0; |
250 | if (present) | 270 | if (present) |
251 | walker->error_code |= PFERR_PRESENT_MASK; | 271 | walker->error_code |= PFERR_PRESENT_MASK; |
252 | if (write_fault) | 272 | |
253 | walker->error_code |= PFERR_WRITE_MASK; | 273 | walker->error_code |= write_fault | user_fault; |
254 | if (user_fault) | 274 | |
255 | walker->error_code |= PFERR_USER_MASK; | 275 | if (fetch_fault && mmu->nx) |
256 | if (fetch_fault && is_nx(vcpu)) | ||
257 | walker->error_code |= PFERR_FETCH_MASK; | 276 | walker->error_code |= PFERR_FETCH_MASK; |
258 | if (rsvd_fault) | 277 | if (rsvd_fault) |
259 | walker->error_code |= PFERR_RSVD_MASK; | 278 | walker->error_code |= PFERR_RSVD_MASK; |
279 | |||
280 | vcpu->arch.fault.address = addr; | ||
281 | vcpu->arch.fault.error_code = walker->error_code; | ||
282 | |||
260 | trace_kvm_mmu_walker_error(walker->error_code); | 283 | trace_kvm_mmu_walker_error(walker->error_code); |
261 | return 0; | 284 | return 0; |
262 | } | 285 | } |
263 | 286 | ||
287 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
288 | struct kvm_vcpu *vcpu, gva_t addr, u32 access) | ||
289 | { | ||
290 | return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, | ||
291 | access); | ||
292 | } | ||
293 | |||
294 | static int FNAME(walk_addr_nested)(struct guest_walker *walker, | ||
295 | struct kvm_vcpu *vcpu, gva_t addr, | ||
296 | u32 access) | ||
297 | { | ||
298 | return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, | ||
299 | addr, access); | ||
300 | } | ||
301 | |||
264 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 302 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
265 | u64 *spte, const void *pte) | 303 | u64 *spte, const void *pte) |
266 | { | 304 | { |
@@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
302 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, | 340 | static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, |
303 | struct guest_walker *gw, int level) | 341 | struct guest_walker *gw, int level) |
304 | { | 342 | { |
305 | int r; | ||
306 | pt_element_t curr_pte; | 343 | pt_element_t curr_pte; |
307 | 344 | gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; | |
308 | r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], | 345 | u64 mask; |
346 | int r, index; | ||
347 | |||
348 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
349 | mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; | ||
350 | base_gpa = pte_gpa & ~mask; | ||
351 | index = (pte_gpa - base_gpa) / sizeof(pt_element_t); | ||
352 | |||
353 | r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, | ||
354 | gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); | ||
355 | curr_pte = gw->prefetch_ptes[index]; | ||
356 | } else | ||
357 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, | ||
309 | &curr_pte, sizeof(curr_pte)); | 358 | &curr_pte, sizeof(curr_pte)); |
359 | |||
310 | return r || curr_pte != gw->ptes[level - 1]; | 360 | return r || curr_pte != gw->ptes[level - 1]; |
311 | } | 361 | } |
312 | 362 | ||
363 | static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | ||
364 | u64 *sptep) | ||
365 | { | ||
366 | struct kvm_mmu_page *sp; | ||
367 | struct kvm_mmu *mmu = &vcpu->arch.mmu; | ||
368 | pt_element_t *gptep = gw->prefetch_ptes; | ||
369 | u64 *spte; | ||
370 | int i; | ||
371 | |||
372 | sp = page_header(__pa(sptep)); | ||
373 | |||
374 | if (sp->role.level > PT_PAGE_TABLE_LEVEL) | ||
375 | return; | ||
376 | |||
377 | if (sp->role.direct) | ||
378 | return __direct_pte_prefetch(vcpu, sp, sptep); | ||
379 | |||
380 | i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); | ||
381 | spte = sp->spt + i; | ||
382 | |||
383 | for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { | ||
384 | pt_element_t gpte; | ||
385 | unsigned pte_access; | ||
386 | gfn_t gfn; | ||
387 | pfn_t pfn; | ||
388 | bool dirty; | ||
389 | |||
390 | if (spte == sptep) | ||
391 | continue; | ||
392 | |||
393 | if (*spte != shadow_trap_nonpresent_pte) | ||
394 | continue; | ||
395 | |||
396 | gpte = gptep[i]; | ||
397 | |||
398 | if (!is_present_gpte(gpte) || | ||
399 | is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { | ||
400 | if (!sp->unsync) | ||
401 | __set_spte(spte, shadow_notrap_nonpresent_pte); | ||
402 | continue; | ||
403 | } | ||
404 | |||
405 | if (!(gpte & PT_ACCESSED_MASK)) | ||
406 | continue; | ||
407 | |||
408 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
409 | gfn = gpte_to_gfn(gpte); | ||
410 | dirty = is_dirty_gpte(gpte); | ||
411 | pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, | ||
412 | (pte_access & ACC_WRITE_MASK) && dirty); | ||
413 | if (is_error_pfn(pfn)) { | ||
414 | kvm_release_pfn_clean(pfn); | ||
415 | break; | ||
416 | } | ||
417 | |||
418 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | ||
419 | dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, | ||
420 | pfn, true, true); | ||
421 | } | ||
422 | } | ||
423 | |||
313 | /* | 424 | /* |
314 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 425 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
315 | */ | 426 | */ |
@@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
391 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | 502 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, |
392 | user_fault, write_fault, dirty, ptwrite, it.level, | 503 | user_fault, write_fault, dirty, ptwrite, it.level, |
393 | gw->gfn, pfn, false, true); | 504 | gw->gfn, pfn, false, true); |
505 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | ||
394 | 506 | ||
395 | return it.sptep; | 507 | return it.sptep; |
396 | 508 | ||
@@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
420 | { | 532 | { |
421 | int write_fault = error_code & PFERR_WRITE_MASK; | 533 | int write_fault = error_code & PFERR_WRITE_MASK; |
422 | int user_fault = error_code & PFERR_USER_MASK; | 534 | int user_fault = error_code & PFERR_USER_MASK; |
423 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
424 | struct guest_walker walker; | 535 | struct guest_walker walker; |
425 | u64 *sptep; | 536 | u64 *sptep; |
426 | int write_pt = 0; | 537 | int write_pt = 0; |
@@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
430 | unsigned long mmu_seq; | 541 | unsigned long mmu_seq; |
431 | 542 | ||
432 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 543 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
433 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
434 | 544 | ||
435 | r = mmu_topup_memory_caches(vcpu); | 545 | r = mmu_topup_memory_caches(vcpu); |
436 | if (r) | 546 | if (r) |
@@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
439 | /* | 549 | /* |
440 | * Look up the guest pte for the faulting address. | 550 | * Look up the guest pte for the faulting address. |
441 | */ | 551 | */ |
442 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | 552 | r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); |
443 | fetch_fault); | ||
444 | 553 | ||
445 | /* | 554 | /* |
446 | * The page is not mapped by the guest. Let the guest handle it. | 555 | * The page is not mapped by the guest. Let the guest handle it. |
447 | */ | 556 | */ |
448 | if (!r) { | 557 | if (!r) { |
449 | pgprintk("%s: guest page fault\n", __func__); | 558 | pgprintk("%s: guest page fault\n", __func__); |
450 | inject_page_fault(vcpu, addr, walker.error_code); | 559 | inject_page_fault(vcpu); |
451 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 560 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
452 | return 0; | 561 | return 0; |
453 | } | 562 | } |
@@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
468 | spin_lock(&vcpu->kvm->mmu_lock); | 577 | spin_lock(&vcpu->kvm->mmu_lock); |
469 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 578 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
470 | goto out_unlock; | 579 | goto out_unlock; |
580 | |||
581 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | ||
471 | kvm_mmu_free_some_pages(vcpu); | 582 | kvm_mmu_free_some_pages(vcpu); |
472 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 583 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
473 | level, &write_pt, pfn); | 584 | level, &write_pt, pfn); |
@@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
479 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 590 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
480 | 591 | ||
481 | ++vcpu->stat.pf_fixed; | 592 | ++vcpu->stat.pf_fixed; |
482 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | 593 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); |
483 | spin_unlock(&vcpu->kvm->mmu_lock); | 594 | spin_unlock(&vcpu->kvm->mmu_lock); |
484 | 595 | ||
485 | return write_pt; | 596 | return write_pt; |
@@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | |||
556 | gpa_t gpa = UNMAPPED_GVA; | 667 | gpa_t gpa = UNMAPPED_GVA; |
557 | int r; | 668 | int r; |
558 | 669 | ||
559 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, | 670 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); |
560 | !!(access & PFERR_WRITE_MASK), | 671 | |
561 | !!(access & PFERR_USER_MASK), | 672 | if (r) { |
562 | !!(access & PFERR_FETCH_MASK)); | 673 | gpa = gfn_to_gpa(walker.gfn); |
674 | gpa |= vaddr & ~PAGE_MASK; | ||
675 | } else if (error) | ||
676 | *error = walker.error_code; | ||
677 | |||
678 | return gpa; | ||
679 | } | ||
680 | |||
681 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | ||
682 | u32 access, u32 *error) | ||
683 | { | ||
684 | struct guest_walker walker; | ||
685 | gpa_t gpa = UNMAPPED_GVA; | ||
686 | int r; | ||
687 | |||
688 | r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); | ||
563 | 689 | ||
564 | if (r) { | 690 | if (r) { |
565 | gpa = gfn_to_gpa(walker.gfn); | 691 | gpa = gfn_to_gpa(walker.gfn); |
@@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
638 | return -EINVAL; | 764 | return -EINVAL; |
639 | 765 | ||
640 | gfn = gpte_to_gfn(gpte); | 766 | gfn = gpte_to_gfn(gpte); |
641 | if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) | 767 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) |
642 | || gfn != sp->gfns[i] || !is_present_gpte(gpte) | 768 | || gfn != sp->gfns[i] || !is_present_gpte(gpte) |
643 | || !(gpte & PT_ACCESSED_MASK)) { | 769 | || !(gpte & PT_ACCESSED_MASK)) { |
644 | u64 nonpresent; | 770 | u64 nonpresent; |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 81ed28cb36e6..82e144a4e514 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * AMD SVM support | 4 | * AMD SVM support |
5 | * | 5 | * |
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 7 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
8 | * | 8 | * |
9 | * Authors: | 9 | * Authors: |
10 | * Yaniv Kamay <yaniv@qumranet.com> | 10 | * Yaniv Kamay <yaniv@qumranet.com> |
@@ -88,6 +88,14 @@ struct nested_state { | |||
88 | /* A VMEXIT is required but not yet emulated */ | 88 | /* A VMEXIT is required but not yet emulated */ |
89 | bool exit_required; | 89 | bool exit_required; |
90 | 90 | ||
91 | /* | ||
92 | * If we vmexit during an instruction emulation we need this to restore | ||
93 | * the l1 guest rip after the emulation | ||
94 | */ | ||
95 | unsigned long vmexit_rip; | ||
96 | unsigned long vmexit_rsp; | ||
97 | unsigned long vmexit_rax; | ||
98 | |||
91 | /* cache for intercepts of the guest */ | 99 | /* cache for intercepts of the guest */ |
92 | u16 intercept_cr_read; | 100 | u16 intercept_cr_read; |
93 | u16 intercept_cr_write; | 101 | u16 intercept_cr_write; |
@@ -96,6 +104,8 @@ struct nested_state { | |||
96 | u32 intercept_exceptions; | 104 | u32 intercept_exceptions; |
97 | u64 intercept; | 105 | u64 intercept; |
98 | 106 | ||
107 | /* Nested Paging related state */ | ||
108 | u64 nested_cr3; | ||
99 | }; | 109 | }; |
100 | 110 | ||
101 | #define MSRPM_OFFSETS 16 | 111 | #define MSRPM_OFFSETS 16 |
@@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | |||
284 | force_new_asid(vcpu); | 294 | force_new_asid(vcpu); |
285 | } | 295 | } |
286 | 296 | ||
297 | static int get_npt_level(void) | ||
298 | { | ||
299 | #ifdef CONFIG_X86_64 | ||
300 | return PT64_ROOT_LEVEL; | ||
301 | #else | ||
302 | return PT32E_ROOT_LEVEL; | ||
303 | #endif | ||
304 | } | ||
305 | |||
287 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 306 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
288 | { | 307 | { |
289 | vcpu->arch.efer = efer; | 308 | vcpu->arch.efer = efer; |
@@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | |||
701 | seg->base = 0; | 720 | seg->base = 0; |
702 | } | 721 | } |
703 | 722 | ||
723 | static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | ||
724 | { | ||
725 | struct vcpu_svm *svm = to_svm(vcpu); | ||
726 | u64 g_tsc_offset = 0; | ||
727 | |||
728 | if (is_nested(svm)) { | ||
729 | g_tsc_offset = svm->vmcb->control.tsc_offset - | ||
730 | svm->nested.hsave->control.tsc_offset; | ||
731 | svm->nested.hsave->control.tsc_offset = offset; | ||
732 | } | ||
733 | |||
734 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; | ||
735 | } | ||
736 | |||
737 | static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | ||
738 | { | ||
739 | struct vcpu_svm *svm = to_svm(vcpu); | ||
740 | |||
741 | svm->vmcb->control.tsc_offset += adjustment; | ||
742 | if (is_nested(svm)) | ||
743 | svm->nested.hsave->control.tsc_offset += adjustment; | ||
744 | } | ||
745 | |||
704 | static void init_vmcb(struct vcpu_svm *svm) | 746 | static void init_vmcb(struct vcpu_svm *svm) |
705 | { | 747 | { |
706 | struct vmcb_control_area *control = &svm->vmcb->control; | 748 | struct vmcb_control_area *control = &svm->vmcb->control; |
@@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
793 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); | 835 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); |
794 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | 836 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); |
795 | 837 | ||
796 | save->efer = EFER_SVME; | 838 | svm_set_efer(&svm->vcpu, 0); |
797 | save->dr6 = 0xffff0ff0; | 839 | save->dr6 = 0xffff0ff0; |
798 | save->dr7 = 0x400; | 840 | save->dr7 = 0x400; |
799 | save->rflags = 2; | 841 | save->rflags = 2; |
@@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
804 | * This is the guest-visible cr0 value. | 846 | * This is the guest-visible cr0 value. |
805 | * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. | 847 | * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. |
806 | */ | 848 | */ |
807 | svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; | 849 | svm->vcpu.arch.cr0 = 0; |
808 | (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); | 850 | (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); |
809 | 851 | ||
810 | save->cr4 = X86_CR4_PAE; | 852 | save->cr4 = X86_CR4_PAE; |
811 | /* rdx = ?? */ | 853 | /* rdx = ?? */ |
@@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
901 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | 943 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; |
902 | svm->asid_generation = 0; | 944 | svm->asid_generation = 0; |
903 | init_vmcb(svm); | 945 | init_vmcb(svm); |
904 | svm->vmcb->control.tsc_offset = 0-native_read_tsc(); | 946 | kvm_write_tsc(&svm->vcpu, 0); |
905 | 947 | ||
906 | err = fx_init(&svm->vcpu); | 948 | err = fx_init(&svm->vcpu); |
907 | if (err) | 949 | if (err) |
@@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
947 | int i; | 989 | int i; |
948 | 990 | ||
949 | if (unlikely(cpu != vcpu->cpu)) { | 991 | if (unlikely(cpu != vcpu->cpu)) { |
950 | u64 delta; | ||
951 | |||
952 | if (check_tsc_unstable()) { | ||
953 | /* | ||
954 | * Make sure that the guest sees a monotonically | ||
955 | * increasing TSC. | ||
956 | */ | ||
957 | delta = vcpu->arch.host_tsc - native_read_tsc(); | ||
958 | svm->vmcb->control.tsc_offset += delta; | ||
959 | if (is_nested(svm)) | ||
960 | svm->nested.hsave->control.tsc_offset += delta; | ||
961 | } | ||
962 | vcpu->cpu = cpu; | ||
963 | kvm_migrate_timers(vcpu); | ||
964 | svm->asid_generation = 0; | 992 | svm->asid_generation = 0; |
965 | } | 993 | } |
966 | 994 | ||
@@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
976 | ++vcpu->stat.host_state_reload; | 1004 | ++vcpu->stat.host_state_reload; |
977 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1005 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
978 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1006 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
979 | |||
980 | vcpu->arch.host_tsc = native_read_tsc(); | ||
981 | } | 1007 | } |
982 | 1008 | ||
983 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 1009 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
@@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
995 | switch (reg) { | 1021 | switch (reg) { |
996 | case VCPU_EXREG_PDPTR: | 1022 | case VCPU_EXREG_PDPTR: |
997 | BUG_ON(!npt_enabled); | 1023 | BUG_ON(!npt_enabled); |
998 | load_pdptrs(vcpu, vcpu->arch.cr3); | 1024 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); |
999 | break; | 1025 | break; |
1000 | default: | 1026 | default: |
1001 | BUG(); | 1027 | BUG(); |
@@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1206 | if (old == new) { | 1232 | if (old == new) { |
1207 | /* cr0 write with ts and mp unchanged */ | 1233 | /* cr0 write with ts and mp unchanged */ |
1208 | svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; | 1234 | svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; |
1209 | if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) | 1235 | if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { |
1236 | svm->nested.vmexit_rip = kvm_rip_read(vcpu); | ||
1237 | svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
1238 | svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
1210 | return; | 1239 | return; |
1240 | } | ||
1211 | } | 1241 | } |
1212 | } | 1242 | } |
1213 | 1243 | ||
@@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm) | |||
1581 | return 1; | 1611 | return 1; |
1582 | } | 1612 | } |
1583 | 1613 | ||
1614 | static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) | ||
1615 | { | ||
1616 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1617 | |||
1618 | return svm->nested.nested_cr3; | ||
1619 | } | ||
1620 | |||
1621 | static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, | ||
1622 | unsigned long root) | ||
1623 | { | ||
1624 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1625 | |||
1626 | svm->vmcb->control.nested_cr3 = root; | ||
1627 | force_new_asid(vcpu); | ||
1628 | } | ||
1629 | |||
1630 | static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) | ||
1631 | { | ||
1632 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1633 | |||
1634 | svm->vmcb->control.exit_code = SVM_EXIT_NPF; | ||
1635 | svm->vmcb->control.exit_code_hi = 0; | ||
1636 | svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; | ||
1637 | svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; | ||
1638 | |||
1639 | nested_svm_vmexit(svm); | ||
1640 | } | ||
1641 | |||
1642 | static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) | ||
1643 | { | ||
1644 | int r; | ||
1645 | |||
1646 | r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); | ||
1647 | |||
1648 | vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; | ||
1649 | vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; | ||
1650 | vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; | ||
1651 | vcpu->arch.mmu.shadow_root_level = get_npt_level(); | ||
1652 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | ||
1653 | |||
1654 | return r; | ||
1655 | } | ||
1656 | |||
1657 | static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) | ||
1658 | { | ||
1659 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
1660 | } | ||
1661 | |||
1584 | static int nested_svm_check_permissions(struct vcpu_svm *svm) | 1662 | static int nested_svm_check_permissions(struct vcpu_svm *svm) |
1585 | { | 1663 | { |
1586 | if (!(svm->vcpu.arch.efer & EFER_SVME) | 1664 | if (!(svm->vcpu.arch.efer & EFER_SVME) |
@@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) | |||
1629 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) | 1707 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) |
1630 | return false; | 1708 | return false; |
1631 | 1709 | ||
1710 | /* | ||
1711 | * if vmexit was already requested (by intercepted exception | ||
1712 | * for instance) do not overwrite it with "external interrupt" | ||
1713 | * vmexit. | ||
1714 | */ | ||
1715 | if (svm->nested.exit_required) | ||
1716 | return false; | ||
1717 | |||
1632 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; | 1718 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; |
1633 | svm->vmcb->control.exit_info_1 = 0; | 1719 | svm->vmcb->control.exit_info_1 = 0; |
1634 | svm->vmcb->control.exit_info_2 = 0; | 1720 | svm->vmcb->control.exit_info_2 = 0; |
@@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1896 | nested_vmcb->save.ds = vmcb->save.ds; | 1982 | nested_vmcb->save.ds = vmcb->save.ds; |
1897 | nested_vmcb->save.gdtr = vmcb->save.gdtr; | 1983 | nested_vmcb->save.gdtr = vmcb->save.gdtr; |
1898 | nested_vmcb->save.idtr = vmcb->save.idtr; | 1984 | nested_vmcb->save.idtr = vmcb->save.idtr; |
1985 | nested_vmcb->save.efer = svm->vcpu.arch.efer; | ||
1899 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); | 1986 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); |
1900 | nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; | 1987 | nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; |
1901 | nested_vmcb->save.cr2 = vmcb->save.cr2; | 1988 | nested_vmcb->save.cr2 = vmcb->save.cr2; |
@@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1917 | nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; | 2004 | nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; |
1918 | nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; | 2005 | nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; |
1919 | nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; | 2006 | nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; |
2007 | nested_vmcb->control.next_rip = vmcb->control.next_rip; | ||
1920 | 2008 | ||
1921 | /* | 2009 | /* |
1922 | * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have | 2010 | * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have |
@@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1947 | kvm_clear_exception_queue(&svm->vcpu); | 2035 | kvm_clear_exception_queue(&svm->vcpu); |
1948 | kvm_clear_interrupt_queue(&svm->vcpu); | 2036 | kvm_clear_interrupt_queue(&svm->vcpu); |
1949 | 2037 | ||
2038 | svm->nested.nested_cr3 = 0; | ||
2039 | |||
1950 | /* Restore selected save entries */ | 2040 | /* Restore selected save entries */ |
1951 | svm->vmcb->save.es = hsave->save.es; | 2041 | svm->vmcb->save.es = hsave->save.es; |
1952 | svm->vmcb->save.cs = hsave->save.cs; | 2042 | svm->vmcb->save.cs = hsave->save.cs; |
@@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1973 | 2063 | ||
1974 | nested_svm_unmap(page); | 2064 | nested_svm_unmap(page); |
1975 | 2065 | ||
2066 | nested_svm_uninit_mmu_context(&svm->vcpu); | ||
1976 | kvm_mmu_reset_context(&svm->vcpu); | 2067 | kvm_mmu_reset_context(&svm->vcpu); |
1977 | kvm_mmu_load(&svm->vcpu); | 2068 | kvm_mmu_load(&svm->vcpu); |
1978 | 2069 | ||
@@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) | |||
2012 | return true; | 2103 | return true; |
2013 | } | 2104 | } |
2014 | 2105 | ||
2106 | static bool nested_vmcb_checks(struct vmcb *vmcb) | ||
2107 | { | ||
2108 | if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) | ||
2109 | return false; | ||
2110 | |||
2111 | if (vmcb->control.asid == 0) | ||
2112 | return false; | ||
2113 | |||
2114 | if (vmcb->control.nested_ctl && !npt_enabled) | ||
2115 | return false; | ||
2116 | |||
2117 | return true; | ||
2118 | } | ||
2119 | |||
2015 | static bool nested_svm_vmrun(struct vcpu_svm *svm) | 2120 | static bool nested_svm_vmrun(struct vcpu_svm *svm) |
2016 | { | 2121 | { |
2017 | struct vmcb *nested_vmcb; | 2122 | struct vmcb *nested_vmcb; |
@@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2026 | if (!nested_vmcb) | 2131 | if (!nested_vmcb) |
2027 | return false; | 2132 | return false; |
2028 | 2133 | ||
2029 | trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, | 2134 | if (!nested_vmcb_checks(nested_vmcb)) { |
2135 | nested_vmcb->control.exit_code = SVM_EXIT_ERR; | ||
2136 | nested_vmcb->control.exit_code_hi = 0; | ||
2137 | nested_vmcb->control.exit_info_1 = 0; | ||
2138 | nested_vmcb->control.exit_info_2 = 0; | ||
2139 | |||
2140 | nested_svm_unmap(page); | ||
2141 | |||
2142 | return false; | ||
2143 | } | ||
2144 | |||
2145 | trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, | ||
2030 | nested_vmcb->save.rip, | 2146 | nested_vmcb->save.rip, |
2031 | nested_vmcb->control.int_ctl, | 2147 | nested_vmcb->control.int_ctl, |
2032 | nested_vmcb->control.event_inj, | 2148 | nested_vmcb->control.event_inj, |
@@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2055 | hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); | 2171 | hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); |
2056 | hsave->save.cr4 = svm->vcpu.arch.cr4; | 2172 | hsave->save.cr4 = svm->vcpu.arch.cr4; |
2057 | hsave->save.rflags = vmcb->save.rflags; | 2173 | hsave->save.rflags = vmcb->save.rflags; |
2058 | hsave->save.rip = svm->next_rip; | 2174 | hsave->save.rip = kvm_rip_read(&svm->vcpu); |
2059 | hsave->save.rsp = vmcb->save.rsp; | 2175 | hsave->save.rsp = vmcb->save.rsp; |
2060 | hsave->save.rax = vmcb->save.rax; | 2176 | hsave->save.rax = vmcb->save.rax; |
2061 | if (npt_enabled) | 2177 | if (npt_enabled) |
@@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2070 | else | 2186 | else |
2071 | svm->vcpu.arch.hflags &= ~HF_HIF_MASK; | 2187 | svm->vcpu.arch.hflags &= ~HF_HIF_MASK; |
2072 | 2188 | ||
2189 | if (nested_vmcb->control.nested_ctl) { | ||
2190 | kvm_mmu_unload(&svm->vcpu); | ||
2191 | svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; | ||
2192 | nested_svm_init_mmu_context(&svm->vcpu); | ||
2193 | } | ||
2194 | |||
2073 | /* Load the nested guest state */ | 2195 | /* Load the nested guest state */ |
2074 | svm->vmcb->save.es = nested_vmcb->save.es; | 2196 | svm->vmcb->save.es = nested_vmcb->save.es; |
2075 | svm->vmcb->save.cs = nested_vmcb->save.cs; | 2197 | svm->vmcb->save.cs = nested_vmcb->save.cs; |
@@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm) | |||
2227 | if (nested_svm_check_permissions(svm)) | 2349 | if (nested_svm_check_permissions(svm)) |
2228 | return 1; | 2350 | return 1; |
2229 | 2351 | ||
2230 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 2352 | /* Save rip after vmrun instruction */ |
2231 | skip_emulated_instruction(&svm->vcpu); | 2353 | kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); |
2232 | 2354 | ||
2233 | if (!nested_svm_vmrun(svm)) | 2355 | if (!nested_svm_vmrun(svm)) |
2234 | return 1; | 2356 | return 1; |
@@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm) | |||
2257 | 2379 | ||
2258 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 2380 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
2259 | skip_emulated_instruction(&svm->vcpu); | 2381 | skip_emulated_instruction(&svm->vcpu); |
2382 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
2260 | 2383 | ||
2261 | enable_gif(svm); | 2384 | enable_gif(svm); |
2262 | 2385 | ||
@@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm) | |||
2399 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2522 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; |
2400 | } | 2523 | } |
2401 | 2524 | ||
2525 | static int cr0_write_interception(struct vcpu_svm *svm) | ||
2526 | { | ||
2527 | struct kvm_vcpu *vcpu = &svm->vcpu; | ||
2528 | int r; | ||
2529 | |||
2530 | r = emulate_instruction(&svm->vcpu, 0, 0, 0); | ||
2531 | |||
2532 | if (svm->nested.vmexit_rip) { | ||
2533 | kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); | ||
2534 | kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); | ||
2535 | kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); | ||
2536 | svm->nested.vmexit_rip = 0; | ||
2537 | } | ||
2538 | |||
2539 | return r == EMULATE_DONE; | ||
2540 | } | ||
2541 | |||
2402 | static int cr8_write_interception(struct vcpu_svm *svm) | 2542 | static int cr8_write_interception(struct vcpu_svm *svm) |
2403 | { | 2543 | { |
2404 | struct kvm_run *kvm_run = svm->vcpu.run; | 2544 | struct kvm_run *kvm_run = svm->vcpu.run; |
@@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2542 | struct vcpu_svm *svm = to_svm(vcpu); | 2682 | struct vcpu_svm *svm = to_svm(vcpu); |
2543 | 2683 | ||
2544 | switch (ecx) { | 2684 | switch (ecx) { |
2545 | case MSR_IA32_TSC: { | 2685 | case MSR_IA32_TSC: |
2546 | u64 tsc_offset = data - native_read_tsc(); | 2686 | kvm_write_tsc(vcpu, data); |
2547 | u64 g_tsc_offset = 0; | ||
2548 | |||
2549 | if (is_nested(svm)) { | ||
2550 | g_tsc_offset = svm->vmcb->control.tsc_offset - | ||
2551 | svm->nested.hsave->control.tsc_offset; | ||
2552 | svm->nested.hsave->control.tsc_offset = tsc_offset; | ||
2553 | } | ||
2554 | |||
2555 | svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; | ||
2556 | |||
2557 | break; | 2687 | break; |
2558 | } | ||
2559 | case MSR_STAR: | 2688 | case MSR_STAR: |
2560 | svm->vmcb->save.star = data; | 2689 | svm->vmcb->save.star = data; |
2561 | break; | 2690 | break; |
@@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) | |||
2643 | { | 2772 | { |
2644 | struct kvm_run *kvm_run = svm->vcpu.run; | 2773 | struct kvm_run *kvm_run = svm->vcpu.run; |
2645 | 2774 | ||
2775 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
2646 | svm_clear_vintr(svm); | 2776 | svm_clear_vintr(svm); |
2647 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 2777 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
2648 | /* | 2778 | /* |
@@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
2672 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | 2802 | [SVM_EXIT_READ_CR4] = emulate_on_interception, |
2673 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | 2803 | [SVM_EXIT_READ_CR8] = emulate_on_interception, |
2674 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, | 2804 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, |
2675 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | 2805 | [SVM_EXIT_WRITE_CR0] = cr0_write_interception, |
2676 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | 2806 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, |
2677 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | 2807 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, |
2678 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | 2808 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, |
@@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2871 | 3001 | ||
2872 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && | 3002 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && |
2873 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && | 3003 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && |
2874 | exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) | 3004 | exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && |
3005 | exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) | ||
2875 | printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " | 3006 | printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " |
2876 | "exit_code 0x%x\n", | 3007 | "exit_code 0x%x\n", |
2877 | __func__, svm->vmcb->control.exit_int_info, | 3008 | __func__, svm->vmcb->control.exit_int_info, |
@@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3088 | 3219 | ||
3089 | svm->int3_injected = 0; | 3220 | svm->int3_injected = 0; |
3090 | 3221 | ||
3091 | if (svm->vcpu.arch.hflags & HF_IRET_MASK) | 3222 | if (svm->vcpu.arch.hflags & HF_IRET_MASK) { |
3092 | svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); | 3223 | svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); |
3224 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
3225 | } | ||
3093 | 3226 | ||
3094 | svm->vcpu.arch.nmi_injected = false; | 3227 | svm->vcpu.arch.nmi_injected = false; |
3095 | kvm_clear_exception_queue(&svm->vcpu); | 3228 | kvm_clear_exception_queue(&svm->vcpu); |
@@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3098 | if (!(exitintinfo & SVM_EXITINTINFO_VALID)) | 3231 | if (!(exitintinfo & SVM_EXITINTINFO_VALID)) |
3099 | return; | 3232 | return; |
3100 | 3233 | ||
3234 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | ||
3235 | |||
3101 | vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; | 3236 | vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; |
3102 | type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; | 3237 | type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; |
3103 | 3238 | ||
@@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
3134 | } | 3269 | } |
3135 | } | 3270 | } |
3136 | 3271 | ||
3272 | static void svm_cancel_injection(struct kvm_vcpu *vcpu) | ||
3273 | { | ||
3274 | struct vcpu_svm *svm = to_svm(vcpu); | ||
3275 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
3276 | |||
3277 | control->exit_int_info = control->event_inj; | ||
3278 | control->exit_int_info_err = control->event_inj_err; | ||
3279 | control->event_inj = 0; | ||
3280 | svm_complete_interrupts(svm); | ||
3281 | } | ||
3282 | |||
3137 | #ifdef CONFIG_X86_64 | 3283 | #ifdef CONFIG_X86_64 |
3138 | #define R "r" | 3284 | #define R "r" |
3139 | #else | 3285 | #else |
@@ -3163,13 +3309,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3163 | sync_lapic_to_cr8(vcpu); | 3309 | sync_lapic_to_cr8(vcpu); |
3164 | 3310 | ||
3165 | save_host_msrs(vcpu); | 3311 | save_host_msrs(vcpu); |
3166 | fs_selector = kvm_read_fs(); | 3312 | savesegment(fs, fs_selector); |
3167 | gs_selector = kvm_read_gs(); | 3313 | savesegment(gs, gs_selector); |
3168 | ldt_selector = kvm_read_ldt(); | 3314 | ldt_selector = kvm_read_ldt(); |
3169 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | 3315 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
3170 | /* required for live migration with NPT */ | ||
3171 | if (npt_enabled) | ||
3172 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | ||
3173 | 3316 | ||
3174 | clgi(); | 3317 | clgi(); |
3175 | 3318 | ||
@@ -3251,10 +3394,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3251 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | 3394 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; |
3252 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | 3395 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; |
3253 | 3396 | ||
3254 | kvm_load_fs(fs_selector); | ||
3255 | kvm_load_gs(gs_selector); | ||
3256 | kvm_load_ldt(ldt_selector); | ||
3257 | load_host_msrs(vcpu); | 3397 | load_host_msrs(vcpu); |
3398 | loadsegment(fs, fs_selector); | ||
3399 | #ifdef CONFIG_X86_64 | ||
3400 | load_gs_index(gs_selector); | ||
3401 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
3402 | #else | ||
3403 | loadsegment(gs, gs_selector); | ||
3404 | #endif | ||
3405 | kvm_load_ldt(ldt_selector); | ||
3258 | 3406 | ||
3259 | reload_tss(vcpu); | 3407 | reload_tss(vcpu); |
3260 | 3408 | ||
@@ -3286,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
3286 | { | 3434 | { |
3287 | struct vcpu_svm *svm = to_svm(vcpu); | 3435 | struct vcpu_svm *svm = to_svm(vcpu); |
3288 | 3436 | ||
3289 | if (npt_enabled) { | ||
3290 | svm->vmcb->control.nested_cr3 = root; | ||
3291 | force_new_asid(vcpu); | ||
3292 | return; | ||
3293 | } | ||
3294 | |||
3295 | svm->vmcb->save.cr3 = root; | 3437 | svm->vmcb->save.cr3 = root; |
3296 | force_new_asid(vcpu); | 3438 | force_new_asid(vcpu); |
3297 | } | 3439 | } |
3298 | 3440 | ||
3441 | static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) | ||
3442 | { | ||
3443 | struct vcpu_svm *svm = to_svm(vcpu); | ||
3444 | |||
3445 | svm->vmcb->control.nested_cr3 = root; | ||
3446 | |||
3447 | /* Also sync guest cr3 here in case we live migrate */ | ||
3448 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | ||
3449 | |||
3450 | force_new_asid(vcpu); | ||
3451 | } | ||
3452 | |||
3299 | static int is_disabled(void) | 3453 | static int is_disabled(void) |
3300 | { | 3454 | { |
3301 | u64 vm_cr; | 3455 | u64 vm_cr; |
@@ -3328,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void) | |||
3328 | return false; | 3482 | return false; |
3329 | } | 3483 | } |
3330 | 3484 | ||
3331 | static int get_npt_level(void) | ||
3332 | { | ||
3333 | #ifdef CONFIG_X86_64 | ||
3334 | return PT64_ROOT_LEVEL; | ||
3335 | #else | ||
3336 | return PT32E_ROOT_LEVEL; | ||
3337 | #endif | ||
3338 | } | ||
3339 | |||
3340 | static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | 3485 | static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) |
3341 | { | 3486 | { |
3342 | return 0; | 3487 | return 0; |
@@ -3349,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) | |||
3349 | static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | 3494 | static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) |
3350 | { | 3495 | { |
3351 | switch (func) { | 3496 | switch (func) { |
3497 | case 0x80000001: | ||
3498 | if (nested) | ||
3499 | entry->ecx |= (1 << 2); /* Set SVM bit */ | ||
3500 | break; | ||
3352 | case 0x8000000A: | 3501 | case 0x8000000A: |
3353 | entry->eax = 1; /* SVM revision 1 */ | 3502 | entry->eax = 1; /* SVM revision 1 */ |
3354 | entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper | 3503 | entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper |
3355 | ASID emulation to nested SVM */ | 3504 | ASID emulation to nested SVM */ |
3356 | entry->ecx = 0; /* Reserved */ | 3505 | entry->ecx = 0; /* Reserved */ |
3357 | entry->edx = 0; /* Do not support any additional features */ | 3506 | entry->edx = 0; /* Per default do not support any |
3507 | additional features */ | ||
3508 | |||
3509 | /* Support next_rip if host supports it */ | ||
3510 | if (svm_has(SVM_FEATURE_NRIP)) | ||
3511 | entry->edx |= SVM_FEATURE_NRIP; | ||
3512 | |||
3513 | /* Support NPT for the guest if enabled */ | ||
3514 | if (npt_enabled) | ||
3515 | entry->edx |= SVM_FEATURE_NPT; | ||
3358 | 3516 | ||
3359 | break; | 3517 | break; |
3360 | } | 3518 | } |
@@ -3492,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3492 | .set_irq = svm_set_irq, | 3650 | .set_irq = svm_set_irq, |
3493 | .set_nmi = svm_inject_nmi, | 3651 | .set_nmi = svm_inject_nmi, |
3494 | .queue_exception = svm_queue_exception, | 3652 | .queue_exception = svm_queue_exception, |
3653 | .cancel_injection = svm_cancel_injection, | ||
3495 | .interrupt_allowed = svm_interrupt_allowed, | 3654 | .interrupt_allowed = svm_interrupt_allowed, |
3496 | .nmi_allowed = svm_nmi_allowed, | 3655 | .nmi_allowed = svm_nmi_allowed, |
3497 | .get_nmi_mask = svm_get_nmi_mask, | 3656 | .get_nmi_mask = svm_get_nmi_mask, |
@@ -3514,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3514 | .set_supported_cpuid = svm_set_supported_cpuid, | 3673 | .set_supported_cpuid = svm_set_supported_cpuid, |
3515 | 3674 | ||
3516 | .has_wbinvd_exit = svm_has_wbinvd_exit, | 3675 | .has_wbinvd_exit = svm_has_wbinvd_exit, |
3676 | |||
3677 | .write_tsc_offset = svm_write_tsc_offset, | ||
3678 | .adjust_tsc_offset = svm_adjust_tsc_offset, | ||
3679 | |||
3680 | .set_tdp_cr3 = set_tdp_cr3, | ||
3517 | }; | 3681 | }; |
3518 | 3682 | ||
3519 | static int __init svm_init(void) | 3683 | static int __init svm_init(void) |
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index e16a0dbe74d8..fc7a101c4a35 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * timer support | 7 | * timer support |
8 | * | 8 | * |
9 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
10 | * | 10 | * |
11 | * This work is licensed under the terms of the GNU GPL, version 2. See | 11 | * This work is licensed under the terms of the GNU GPL, version 2. See |
12 | * the COPYING file in the top-level directory. | 12 | * the COPYING file in the top-level directory. |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49b25eee25ac..8da0e45ff7c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * machines without emulation or binary translation. | 5 | * machines without emulation or binary translation. |
6 | * | 6 | * |
7 | * Copyright (C) 2006 Qumranet, Inc. | 7 | * Copyright (C) 2006 Qumranet, Inc. |
8 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 8 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
9 | * | 9 | * |
10 | * Authors: | 10 | * Authors: |
11 | * Avi Kivity <avi@qumranet.com> | 11 | * Avi Kivity <avi@qumranet.com> |
@@ -125,6 +125,7 @@ struct vcpu_vmx { | |||
125 | unsigned long host_rsp; | 125 | unsigned long host_rsp; |
126 | int launched; | 126 | int launched; |
127 | u8 fail; | 127 | u8 fail; |
128 | u32 exit_intr_info; | ||
128 | u32 idt_vectoring_info; | 129 | u32 idt_vectoring_info; |
129 | struct shared_msr_entry *guest_msrs; | 130 | struct shared_msr_entry *guest_msrs; |
130 | int nmsrs; | 131 | int nmsrs; |
@@ -154,11 +155,6 @@ struct vcpu_vmx { | |||
154 | u32 limit; | 155 | u32 limit; |
155 | u32 ar; | 156 | u32 ar; |
156 | } tr, es, ds, fs, gs; | 157 | } tr, es, ds, fs, gs; |
157 | struct { | ||
158 | bool pending; | ||
159 | u8 vector; | ||
160 | unsigned rip; | ||
161 | } irq; | ||
162 | } rmode; | 158 | } rmode; |
163 | int vpid; | 159 | int vpid; |
164 | bool emulation_required; | 160 | bool emulation_required; |
@@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg) | |||
505 | vmcs_clear(vmx->vmcs); | 501 | vmcs_clear(vmx->vmcs); |
506 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 502 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) |
507 | per_cpu(current_vmcs, cpu) = NULL; | 503 | per_cpu(current_vmcs, cpu) = NULL; |
508 | rdtscll(vmx->vcpu.arch.host_tsc); | ||
509 | list_del(&vmx->local_vcpus_link); | 504 | list_del(&vmx->local_vcpus_link); |
510 | vmx->vcpu.cpu = -1; | 505 | vmx->vcpu.cpu = -1; |
511 | vmx->launched = 0; | 506 | vmx->launched = 0; |
@@ -706,11 +701,10 @@ static void reload_tss(void) | |||
706 | /* | 701 | /* |
707 | * VT restores TR but not its size. Useless. | 702 | * VT restores TR but not its size. Useless. |
708 | */ | 703 | */ |
709 | struct desc_ptr gdt; | 704 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
710 | struct desc_struct *descs; | 705 | struct desc_struct *descs; |
711 | 706 | ||
712 | native_store_gdt(&gdt); | 707 | descs = (void *)gdt->address; |
713 | descs = (void *)gdt.address; | ||
714 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ | 708 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ |
715 | load_TR_desc(); | 709 | load_TR_desc(); |
716 | } | 710 | } |
@@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | |||
753 | 747 | ||
754 | static unsigned long segment_base(u16 selector) | 748 | static unsigned long segment_base(u16 selector) |
755 | { | 749 | { |
756 | struct desc_ptr gdt; | 750 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
757 | struct desc_struct *d; | 751 | struct desc_struct *d; |
758 | unsigned long table_base; | 752 | unsigned long table_base; |
759 | unsigned long v; | 753 | unsigned long v; |
@@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector) | |||
761 | if (!(selector & ~3)) | 755 | if (!(selector & ~3)) |
762 | return 0; | 756 | return 0; |
763 | 757 | ||
764 | native_store_gdt(&gdt); | 758 | table_base = gdt->address; |
765 | table_base = gdt.address; | ||
766 | 759 | ||
767 | if (selector & 4) { /* from ldt */ | 760 | if (selector & 4) { /* from ldt */ |
768 | u16 ldt_selector = kvm_read_ldt(); | 761 | u16 ldt_selector = kvm_read_ldt(); |
@@ -803,7 +796,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
803 | */ | 796 | */ |
804 | vmx->host_state.ldt_sel = kvm_read_ldt(); | 797 | vmx->host_state.ldt_sel = kvm_read_ldt(); |
805 | vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; | 798 | vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; |
806 | vmx->host_state.fs_sel = kvm_read_fs(); | 799 | savesegment(fs, vmx->host_state.fs_sel); |
807 | if (!(vmx->host_state.fs_sel & 7)) { | 800 | if (!(vmx->host_state.fs_sel & 7)) { |
808 | vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); | 801 | vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); |
809 | vmx->host_state.fs_reload_needed = 0; | 802 | vmx->host_state.fs_reload_needed = 0; |
@@ -811,7 +804,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
811 | vmcs_write16(HOST_FS_SELECTOR, 0); | 804 | vmcs_write16(HOST_FS_SELECTOR, 0); |
812 | vmx->host_state.fs_reload_needed = 1; | 805 | vmx->host_state.fs_reload_needed = 1; |
813 | } | 806 | } |
814 | vmx->host_state.gs_sel = kvm_read_gs(); | 807 | savesegment(gs, vmx->host_state.gs_sel); |
815 | if (!(vmx->host_state.gs_sel & 7)) | 808 | if (!(vmx->host_state.gs_sel & 7)) |
816 | vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); | 809 | vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); |
817 | else { | 810 | else { |
@@ -841,27 +834,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
841 | 834 | ||
842 | static void __vmx_load_host_state(struct vcpu_vmx *vmx) | 835 | static void __vmx_load_host_state(struct vcpu_vmx *vmx) |
843 | { | 836 | { |
844 | unsigned long flags; | ||
845 | |||
846 | if (!vmx->host_state.loaded) | 837 | if (!vmx->host_state.loaded) |
847 | return; | 838 | return; |
848 | 839 | ||
849 | ++vmx->vcpu.stat.host_state_reload; | 840 | ++vmx->vcpu.stat.host_state_reload; |
850 | vmx->host_state.loaded = 0; | 841 | vmx->host_state.loaded = 0; |
851 | if (vmx->host_state.fs_reload_needed) | 842 | if (vmx->host_state.fs_reload_needed) |
852 | kvm_load_fs(vmx->host_state.fs_sel); | 843 | loadsegment(fs, vmx->host_state.fs_sel); |
853 | if (vmx->host_state.gs_ldt_reload_needed) { | 844 | if (vmx->host_state.gs_ldt_reload_needed) { |
854 | kvm_load_ldt(vmx->host_state.ldt_sel); | 845 | kvm_load_ldt(vmx->host_state.ldt_sel); |
855 | /* | ||
856 | * If we have to reload gs, we must take care to | ||
857 | * preserve our gs base. | ||
858 | */ | ||
859 | local_irq_save(flags); | ||
860 | kvm_load_gs(vmx->host_state.gs_sel); | ||
861 | #ifdef CONFIG_X86_64 | 846 | #ifdef CONFIG_X86_64 |
862 | wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); | 847 | load_gs_index(vmx->host_state.gs_sel); |
848 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
849 | #else | ||
850 | loadsegment(gs, vmx->host_state.gs_sel); | ||
863 | #endif | 851 | #endif |
864 | local_irq_restore(flags); | ||
865 | } | 852 | } |
866 | reload_tss(); | 853 | reload_tss(); |
867 | #ifdef CONFIG_X86_64 | 854 | #ifdef CONFIG_X86_64 |
@@ -889,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
889 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 876 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
890 | { | 877 | { |
891 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 878 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
892 | u64 tsc_this, delta, new_offset; | ||
893 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | 879 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); |
894 | 880 | ||
895 | if (!vmm_exclusive) | 881 | if (!vmm_exclusive) |
@@ -903,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
903 | } | 889 | } |
904 | 890 | ||
905 | if (vcpu->cpu != cpu) { | 891 | if (vcpu->cpu != cpu) { |
906 | struct desc_ptr dt; | 892 | struct desc_ptr *gdt = &__get_cpu_var(host_gdt); |
907 | unsigned long sysenter_esp; | 893 | unsigned long sysenter_esp; |
908 | 894 | ||
909 | kvm_migrate_timers(vcpu); | ||
910 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | 895 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); |
911 | local_irq_disable(); | 896 | local_irq_disable(); |
912 | list_add(&vmx->local_vcpus_link, | 897 | list_add(&vmx->local_vcpus_link, |
913 | &per_cpu(vcpus_on_cpu, cpu)); | 898 | &per_cpu(vcpus_on_cpu, cpu)); |
914 | local_irq_enable(); | 899 | local_irq_enable(); |
915 | 900 | ||
916 | vcpu->cpu = cpu; | ||
917 | /* | 901 | /* |
918 | * Linux uses per-cpu TSS and GDT, so set these when switching | 902 | * Linux uses per-cpu TSS and GDT, so set these when switching |
919 | * processors. | 903 | * processors. |
920 | */ | 904 | */ |
921 | vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ | 905 | vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ |
922 | native_store_gdt(&dt); | 906 | vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ |
923 | vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ | ||
924 | 907 | ||
925 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | 908 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); |
926 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | 909 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ |
927 | |||
928 | /* | ||
929 | * Make sure the time stamp counter is monotonous. | ||
930 | */ | ||
931 | rdtscll(tsc_this); | ||
932 | if (tsc_this < vcpu->arch.host_tsc) { | ||
933 | delta = vcpu->arch.host_tsc - tsc_this; | ||
934 | new_offset = vmcs_read64(TSC_OFFSET) + delta; | ||
935 | vmcs_write64(TSC_OFFSET, new_offset); | ||
936 | } | ||
937 | } | 910 | } |
938 | } | 911 | } |
939 | 912 | ||
@@ -1050,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1050 | } | 1023 | } |
1051 | 1024 | ||
1052 | if (vmx->rmode.vm86_active) { | 1025 | if (vmx->rmode.vm86_active) { |
1053 | vmx->rmode.irq.pending = true; | 1026 | if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) |
1054 | vmx->rmode.irq.vector = nr; | 1027 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
1055 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
1056 | if (kvm_exception_is_soft(nr)) | ||
1057 | vmx->rmode.irq.rip += | ||
1058 | vmx->vcpu.arch.event_exit_inst_len; | ||
1059 | intr_info |= INTR_TYPE_SOFT_INTR; | ||
1060 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | ||
1061 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
1062 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
1063 | return; | 1028 | return; |
1064 | } | 1029 | } |
1065 | 1030 | ||
@@ -1155,12 +1120,17 @@ static u64 guest_read_tsc(void) | |||
1155 | } | 1120 | } |
1156 | 1121 | ||
1157 | /* | 1122 | /* |
1158 | * writes 'guest_tsc' into guest's timestamp counter "register" | 1123 | * writes 'offset' into guest's timestamp counter offset register |
1159 | * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc | ||
1160 | */ | 1124 | */ |
1161 | static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) | 1125 | static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
1162 | { | 1126 | { |
1163 | vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); | 1127 | vmcs_write64(TSC_OFFSET, offset); |
1128 | } | ||
1129 | |||
1130 | static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | ||
1131 | { | ||
1132 | u64 offset = vmcs_read64(TSC_OFFSET); | ||
1133 | vmcs_write64(TSC_OFFSET, offset + adjustment); | ||
1164 | } | 1134 | } |
1165 | 1135 | ||
1166 | /* | 1136 | /* |
@@ -1233,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1233 | { | 1203 | { |
1234 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1204 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1235 | struct shared_msr_entry *msr; | 1205 | struct shared_msr_entry *msr; |
1236 | u64 host_tsc; | ||
1237 | int ret = 0; | 1206 | int ret = 0; |
1238 | 1207 | ||
1239 | switch (msr_index) { | 1208 | switch (msr_index) { |
@@ -1263,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1263 | vmcs_writel(GUEST_SYSENTER_ESP, data); | 1232 | vmcs_writel(GUEST_SYSENTER_ESP, data); |
1264 | break; | 1233 | break; |
1265 | case MSR_IA32_TSC: | 1234 | case MSR_IA32_TSC: |
1266 | rdtscll(host_tsc); | 1235 | kvm_write_tsc(vcpu, data); |
1267 | guest_write_tsc(data, host_tsc); | ||
1268 | break; | 1236 | break; |
1269 | case MSR_IA32_CR_PAT: | 1237 | case MSR_IA32_CR_PAT: |
1270 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 1238 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
@@ -1862,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | |||
1862 | return; | 1830 | return; |
1863 | 1831 | ||
1864 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | 1832 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { |
1865 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); | 1833 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); |
1866 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); | 1834 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); |
1867 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); | 1835 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); |
1868 | vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); | 1836 | vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); |
1869 | } | 1837 | } |
1870 | } | 1838 | } |
1871 | 1839 | ||
1872 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | 1840 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) |
1873 | { | 1841 | { |
1874 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | 1842 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { |
1875 | vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); | 1843 | vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); |
1876 | vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); | 1844 | vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); |
1877 | vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); | 1845 | vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); |
1878 | vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); | 1846 | vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); |
1879 | } | 1847 | } |
1880 | 1848 | ||
1881 | __set_bit(VCPU_EXREG_PDPTR, | 1849 | __set_bit(VCPU_EXREG_PDPTR, |
@@ -2521,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2521 | { | 2489 | { |
2522 | u32 host_sysenter_cs, msr_low, msr_high; | 2490 | u32 host_sysenter_cs, msr_low, msr_high; |
2523 | u32 junk; | 2491 | u32 junk; |
2524 | u64 host_pat, tsc_this, tsc_base; | 2492 | u64 host_pat; |
2525 | unsigned long a; | 2493 | unsigned long a; |
2526 | struct desc_ptr dt; | 2494 | struct desc_ptr dt; |
2527 | int i; | 2495 | int i; |
@@ -2589,8 +2557,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2589 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | 2557 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ |
2590 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 2558 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
2591 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 2559 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
2592 | vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ | 2560 | vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ |
2593 | vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ | 2561 | vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ |
2594 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | 2562 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ |
2595 | #ifdef CONFIG_X86_64 | 2563 | #ifdef CONFIG_X86_64 |
2596 | rdmsrl(MSR_FS_BASE, a); | 2564 | rdmsrl(MSR_FS_BASE, a); |
@@ -2662,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2662 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; | 2630 | vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; |
2663 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); | 2631 | vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); |
2664 | 2632 | ||
2665 | tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; | 2633 | kvm_write_tsc(&vmx->vcpu, 0); |
2666 | rdtscll(tsc_this); | ||
2667 | if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) | ||
2668 | tsc_base = tsc_this; | ||
2669 | |||
2670 | guest_write_tsc(0, tsc_base); | ||
2671 | 2634 | ||
2672 | return 0; | 2635 | return 0; |
2673 | } | 2636 | } |
@@ -2840,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
2840 | 2803 | ||
2841 | ++vcpu->stat.irq_injections; | 2804 | ++vcpu->stat.irq_injections; |
2842 | if (vmx->rmode.vm86_active) { | 2805 | if (vmx->rmode.vm86_active) { |
2843 | vmx->rmode.irq.pending = true; | 2806 | if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) |
2844 | vmx->rmode.irq.vector = irq; | 2807 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2845 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
2846 | if (vcpu->arch.interrupt.soft) | ||
2847 | vmx->rmode.irq.rip += | ||
2848 | vmx->vcpu.arch.event_exit_inst_len; | ||
2849 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2850 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | ||
2851 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
2852 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
2853 | return; | 2808 | return; |
2854 | } | 2809 | } |
2855 | intr = irq | INTR_INFO_VALID_MASK; | 2810 | intr = irq | INTR_INFO_VALID_MASK; |
@@ -2881,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2881 | 2836 | ||
2882 | ++vcpu->stat.nmi_injections; | 2837 | ++vcpu->stat.nmi_injections; |
2883 | if (vmx->rmode.vm86_active) { | 2838 | if (vmx->rmode.vm86_active) { |
2884 | vmx->rmode.irq.pending = true; | 2839 | if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) |
2885 | vmx->rmode.irq.vector = NMI_VECTOR; | 2840 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2886 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
2887 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2888 | NMI_VECTOR | INTR_TYPE_SOFT_INTR | | ||
2889 | INTR_INFO_VALID_MASK); | ||
2890 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
2891 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
2892 | return; | 2841 | return; |
2893 | } | 2842 | } |
2894 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2843 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
@@ -3352,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) | |||
3352 | 3301 | ||
3353 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) | 3302 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) |
3354 | { | 3303 | { |
3304 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
3355 | return 1; | 3305 | return 1; |
3356 | } | 3306 | } |
3357 | 3307 | ||
@@ -3364,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) | |||
3364 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | 3314 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; |
3365 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3315 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
3366 | 3316 | ||
3317 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
3318 | |||
3367 | ++vcpu->stat.irq_window_exits; | 3319 | ++vcpu->stat.irq_window_exits; |
3368 | 3320 | ||
3369 | /* | 3321 | /* |
@@ -3620,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) | |||
3620 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; | 3572 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; |
3621 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3573 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
3622 | ++vcpu->stat.nmi_window_exits; | 3574 | ++vcpu->stat.nmi_window_exits; |
3575 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
3623 | 3576 | ||
3624 | return 1; | 3577 | return 1; |
3625 | } | 3578 | } |
@@ -3629,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
3629 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3582 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3630 | enum emulation_result err = EMULATE_DONE; | 3583 | enum emulation_result err = EMULATE_DONE; |
3631 | int ret = 1; | 3584 | int ret = 1; |
3585 | u32 cpu_exec_ctrl; | ||
3586 | bool intr_window_requested; | ||
3587 | |||
3588 | cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
3589 | intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; | ||
3632 | 3590 | ||
3633 | while (!guest_state_valid(vcpu)) { | 3591 | while (!guest_state_valid(vcpu)) { |
3592 | if (intr_window_requested | ||
3593 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) | ||
3594 | return handle_interrupt_window(&vmx->vcpu); | ||
3595 | |||
3634 | err = emulate_instruction(vcpu, 0, 0, 0); | 3596 | err = emulate_instruction(vcpu, 0, 0, 0); |
3635 | 3597 | ||
3636 | if (err == EMULATE_DO_MMIO) { | 3598 | if (err == EMULATE_DO_MMIO) { |
@@ -3796,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
3796 | vmcs_write32(TPR_THRESHOLD, irr); | 3758 | vmcs_write32(TPR_THRESHOLD, irr); |
3797 | } | 3759 | } |
3798 | 3760 | ||
3799 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | 3761 | static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) |
3800 | { | 3762 | { |
3801 | u32 exit_intr_info; | 3763 | u32 exit_intr_info = vmx->exit_intr_info; |
3802 | u32 idt_vectoring_info = vmx->idt_vectoring_info; | ||
3803 | bool unblock_nmi; | ||
3804 | u8 vector; | ||
3805 | int type; | ||
3806 | bool idtv_info_valid; | ||
3807 | |||
3808 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
3809 | |||
3810 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
3811 | 3764 | ||
3812 | /* Handle machine checks before interrupts are enabled */ | 3765 | /* Handle machine checks before interrupts are enabled */ |
3813 | if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) | 3766 | if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) |
@@ -3822,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3822 | asm("int $2"); | 3775 | asm("int $2"); |
3823 | kvm_after_handle_nmi(&vmx->vcpu); | 3776 | kvm_after_handle_nmi(&vmx->vcpu); |
3824 | } | 3777 | } |
3778 | } | ||
3825 | 3779 | ||
3826 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | 3780 | static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) |
3781 | { | ||
3782 | u32 exit_intr_info = vmx->exit_intr_info; | ||
3783 | bool unblock_nmi; | ||
3784 | u8 vector; | ||
3785 | bool idtv_info_valid; | ||
3786 | |||
3787 | idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3827 | 3788 | ||
3828 | if (cpu_has_virtual_nmis()) { | 3789 | if (cpu_has_virtual_nmis()) { |
3829 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; | 3790 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; |
@@ -3845,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3845 | } else if (unlikely(vmx->soft_vnmi_blocked)) | 3806 | } else if (unlikely(vmx->soft_vnmi_blocked)) |
3846 | vmx->vnmi_blocked_time += | 3807 | vmx->vnmi_blocked_time += |
3847 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); | 3808 | ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); |
3809 | } | ||
3810 | |||
3811 | static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, | ||
3812 | u32 idt_vectoring_info, | ||
3813 | int instr_len_field, | ||
3814 | int error_code_field) | ||
3815 | { | ||
3816 | u8 vector; | ||
3817 | int type; | ||
3818 | bool idtv_info_valid; | ||
3819 | |||
3820 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3848 | 3821 | ||
3849 | vmx->vcpu.arch.nmi_injected = false; | 3822 | vmx->vcpu.arch.nmi_injected = false; |
3850 | kvm_clear_exception_queue(&vmx->vcpu); | 3823 | kvm_clear_exception_queue(&vmx->vcpu); |
@@ -3853,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3853 | if (!idtv_info_valid) | 3826 | if (!idtv_info_valid) |
3854 | return; | 3827 | return; |
3855 | 3828 | ||
3829 | kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); | ||
3830 | |||
3856 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | 3831 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; |
3857 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | 3832 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; |
3858 | 3833 | ||
@@ -3869,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3869 | break; | 3844 | break; |
3870 | case INTR_TYPE_SOFT_EXCEPTION: | 3845 | case INTR_TYPE_SOFT_EXCEPTION: |
3871 | vmx->vcpu.arch.event_exit_inst_len = | 3846 | vmx->vcpu.arch.event_exit_inst_len = |
3872 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 3847 | vmcs_read32(instr_len_field); |
3873 | /* fall through */ | 3848 | /* fall through */ |
3874 | case INTR_TYPE_HARD_EXCEPTION: | 3849 | case INTR_TYPE_HARD_EXCEPTION: |
3875 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { | 3850 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
3876 | u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); | 3851 | u32 err = vmcs_read32(error_code_field); |
3877 | kvm_queue_exception_e(&vmx->vcpu, vector, err); | 3852 | kvm_queue_exception_e(&vmx->vcpu, vector, err); |
3878 | } else | 3853 | } else |
3879 | kvm_queue_exception(&vmx->vcpu, vector); | 3854 | kvm_queue_exception(&vmx->vcpu, vector); |
3880 | break; | 3855 | break; |
3881 | case INTR_TYPE_SOFT_INTR: | 3856 | case INTR_TYPE_SOFT_INTR: |
3882 | vmx->vcpu.arch.event_exit_inst_len = | 3857 | vmx->vcpu.arch.event_exit_inst_len = |
3883 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 3858 | vmcs_read32(instr_len_field); |
3884 | /* fall through */ | 3859 | /* fall through */ |
3885 | case INTR_TYPE_EXT_INTR: | 3860 | case INTR_TYPE_EXT_INTR: |
3886 | kvm_queue_interrupt(&vmx->vcpu, vector, | 3861 | kvm_queue_interrupt(&vmx->vcpu, vector, |
@@ -3891,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3891 | } | 3866 | } |
3892 | } | 3867 | } |
3893 | 3868 | ||
3894 | /* | 3869 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
3895 | * Failure to inject an interrupt should give us the information | ||
3896 | * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs | ||
3897 | * when fetching the interrupt redirection bitmap in the real-mode | ||
3898 | * tss, this doesn't happen. So we do it ourselves. | ||
3899 | */ | ||
3900 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | ||
3901 | { | 3870 | { |
3902 | vmx->rmode.irq.pending = 0; | 3871 | __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, |
3903 | if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) | 3872 | VM_EXIT_INSTRUCTION_LEN, |
3904 | return; | 3873 | IDT_VECTORING_ERROR_CODE); |
3905 | kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); | 3874 | } |
3906 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | 3875 | |
3907 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | 3876 | static void vmx_cancel_injection(struct kvm_vcpu *vcpu) |
3908 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | 3877 | { |
3909 | return; | 3878 | __vmx_complete_interrupts(to_vmx(vcpu), |
3910 | } | 3879 | vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), |
3911 | vmx->idt_vectoring_info = | 3880 | VM_ENTRY_INSTRUCTION_LEN, |
3912 | VECTORING_INFO_VALID_MASK | 3881 | VM_ENTRY_EXCEPTION_ERROR_CODE); |
3913 | | INTR_TYPE_EXT_INTR | 3882 | |
3914 | | vmx->rmode.irq.vector; | 3883 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); |
3915 | } | 3884 | } |
3916 | 3885 | ||
3917 | #ifdef CONFIG_X86_64 | 3886 | #ifdef CONFIG_X86_64 |
@@ -4038,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4038 | #endif | 4007 | #endif |
4039 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) | 4008 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) |
4040 | : "cc", "memory" | 4009 | : "cc", "memory" |
4041 | , R"bx", R"di", R"si" | 4010 | , R"ax", R"bx", R"di", R"si" |
4042 | #ifdef CONFIG_X86_64 | 4011 | #ifdef CONFIG_X86_64 |
4043 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 4012 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
4044 | #endif | 4013 | #endif |
@@ -4049,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4049 | vcpu->arch.regs_dirty = 0; | 4018 | vcpu->arch.regs_dirty = 0; |
4050 | 4019 | ||
4051 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 4020 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
4052 | if (vmx->rmode.irq.pending) | ||
4053 | fixup_rmode_irq(vmx); | ||
4054 | 4021 | ||
4055 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 4022 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
4056 | vmx->launched = 1; | 4023 | vmx->launched = 1; |
4057 | 4024 | ||
4025 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
4026 | vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
4027 | |||
4028 | vmx_complete_atomic_exit(vmx); | ||
4029 | vmx_recover_nmi_blocking(vmx); | ||
4058 | vmx_complete_interrupts(vmx); | 4030 | vmx_complete_interrupts(vmx); |
4059 | } | 4031 | } |
4060 | 4032 | ||
@@ -4125,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
4125 | 4097 | ||
4126 | cpu = get_cpu(); | 4098 | cpu = get_cpu(); |
4127 | vmx_vcpu_load(&vmx->vcpu, cpu); | 4099 | vmx_vcpu_load(&vmx->vcpu, cpu); |
4100 | vmx->vcpu.cpu = cpu; | ||
4128 | err = vmx_vcpu_setup(vmx); | 4101 | err = vmx_vcpu_setup(vmx); |
4129 | vmx_vcpu_put(&vmx->vcpu); | 4102 | vmx_vcpu_put(&vmx->vcpu); |
4130 | put_cpu(); | 4103 | put_cpu(); |
@@ -4340,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4340 | .set_irq = vmx_inject_irq, | 4313 | .set_irq = vmx_inject_irq, |
4341 | .set_nmi = vmx_inject_nmi, | 4314 | .set_nmi = vmx_inject_nmi, |
4342 | .queue_exception = vmx_queue_exception, | 4315 | .queue_exception = vmx_queue_exception, |
4316 | .cancel_injection = vmx_cancel_injection, | ||
4343 | .interrupt_allowed = vmx_interrupt_allowed, | 4317 | .interrupt_allowed = vmx_interrupt_allowed, |
4344 | .nmi_allowed = vmx_nmi_allowed, | 4318 | .nmi_allowed = vmx_nmi_allowed, |
4345 | .get_nmi_mask = vmx_get_nmi_mask, | 4319 | .get_nmi_mask = vmx_get_nmi_mask, |
@@ -4362,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4362 | .set_supported_cpuid = vmx_set_supported_cpuid, | 4336 | .set_supported_cpuid = vmx_set_supported_cpuid, |
4363 | 4337 | ||
4364 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | 4338 | .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, |
4339 | |||
4340 | .write_tsc_offset = vmx_write_tsc_offset, | ||
4341 | .adjust_tsc_offset = vmx_adjust_tsc_offset, | ||
4342 | |||
4343 | .set_tdp_cr3 = vmx_set_cr3, | ||
4365 | }; | 4344 | }; |
4366 | 4345 | ||
4367 | static int __init vmx_init(void) | 4346 | static int __init vmx_init(void) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d526..2288ad829b32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright (C) 2008 Qumranet, Inc. | 7 | * Copyright (C) 2008 Qumranet, Inc. |
8 | * Copyright IBM Corporation, 2008 | 8 | * Copyright IBM Corporation, 2008 |
9 | * Copyright 2010 Red Hat, Inc. and/or its affilates. | 9 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
10 | * | 10 | * |
11 | * Authors: | 11 | * Authors: |
12 | * Avi Kivity <avi@qumranet.com> | 12 | * Avi Kivity <avi@qumranet.com> |
@@ -55,6 +55,8 @@ | |||
55 | #include <asm/mce.h> | 55 | #include <asm/mce.h> |
56 | #include <asm/i387.h> | 56 | #include <asm/i387.h> |
57 | #include <asm/xcr.h> | 57 | #include <asm/xcr.h> |
58 | #include <asm/pvclock.h> | ||
59 | #include <asm/div64.h> | ||
58 | 60 | ||
59 | #define MAX_IO_MSRS 256 | 61 | #define MAX_IO_MSRS 256 |
60 | #define CR0_RESERVED_BITS \ | 62 | #define CR0_RESERVED_BITS \ |
@@ -71,7 +73,7 @@ | |||
71 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 73 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
72 | 74 | ||
73 | #define KVM_MAX_MCE_BANKS 32 | 75 | #define KVM_MAX_MCE_BANKS 32 |
74 | #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P | 76 | #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) |
75 | 77 | ||
76 | /* EFER defaults: | 78 | /* EFER defaults: |
77 | * - enable syscall per default because its emulated by KVM | 79 | * - enable syscall per default because its emulated by KVM |
@@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, | |||
282 | u32 prev_nr; | 284 | u32 prev_nr; |
283 | int class1, class2; | 285 | int class1, class2; |
284 | 286 | ||
287 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
288 | |||
285 | if (!vcpu->arch.exception.pending) { | 289 | if (!vcpu->arch.exception.pending) { |
286 | queue: | 290 | queue: |
287 | vcpu->arch.exception.pending = true; | 291 | vcpu->arch.exception.pending = true; |
@@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) | |||
327 | } | 331 | } |
328 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); | 332 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); |
329 | 333 | ||
330 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, | 334 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu) |
331 | u32 error_code) | ||
332 | { | 335 | { |
336 | unsigned error_code = vcpu->arch.fault.error_code; | ||
337 | |||
333 | ++vcpu->stat.pf_guest; | 338 | ++vcpu->stat.pf_guest; |
334 | vcpu->arch.cr2 = addr; | 339 | vcpu->arch.cr2 = vcpu->arch.fault.address; |
335 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | 340 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); |
336 | } | 341 | } |
337 | 342 | ||
343 | void kvm_propagate_fault(struct kvm_vcpu *vcpu) | ||
344 | { | ||
345 | if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) | ||
346 | vcpu->arch.nested_mmu.inject_page_fault(vcpu); | ||
347 | else | ||
348 | vcpu->arch.mmu.inject_page_fault(vcpu); | ||
349 | |||
350 | vcpu->arch.fault.nested = false; | ||
351 | } | ||
352 | |||
338 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) | 353 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) |
339 | { | 354 | { |
355 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
340 | vcpu->arch.nmi_pending = 1; | 356 | vcpu->arch.nmi_pending = 1; |
341 | } | 357 | } |
342 | EXPORT_SYMBOL_GPL(kvm_inject_nmi); | 358 | EXPORT_SYMBOL_GPL(kvm_inject_nmi); |
@@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) | |||
367 | EXPORT_SYMBOL_GPL(kvm_require_cpl); | 383 | EXPORT_SYMBOL_GPL(kvm_require_cpl); |
368 | 384 | ||
369 | /* | 385 | /* |
386 | * This function will be used to read from the physical memory of the currently | ||
387 | * running guest. The difference to kvm_read_guest_page is that this function | ||
388 | * can read from guest physical or from the guest's guest physical memory. | ||
389 | */ | ||
390 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | ||
391 | gfn_t ngfn, void *data, int offset, int len, | ||
392 | u32 access) | ||
393 | { | ||
394 | gfn_t real_gfn; | ||
395 | gpa_t ngpa; | ||
396 | |||
397 | ngpa = gfn_to_gpa(ngfn); | ||
398 | real_gfn = mmu->translate_gpa(vcpu, ngpa, access); | ||
399 | if (real_gfn == UNMAPPED_GVA) | ||
400 | return -EFAULT; | ||
401 | |||
402 | real_gfn = gpa_to_gfn(real_gfn); | ||
403 | |||
404 | return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); | ||
405 | } | ||
406 | EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); | ||
407 | |||
408 | int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
409 | void *data, int offset, int len, u32 access) | ||
410 | { | ||
411 | return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, | ||
412 | data, offset, len, access); | ||
413 | } | ||
414 | |||
415 | /* | ||
370 | * Load the pae pdptrs. Return true is they are all valid. | 416 | * Load the pae pdptrs. Return true is they are all valid. |
371 | */ | 417 | */ |
372 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | 418 | int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) |
373 | { | 419 | { |
374 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | 420 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
375 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; | 421 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; |
376 | int i; | 422 | int i; |
377 | int ret; | 423 | int ret; |
378 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | 424 | u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; |
379 | 425 | ||
380 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, | 426 | ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, |
381 | offset * sizeof(u64), sizeof(pdpte)); | 427 | offset * sizeof(u64), sizeof(pdpte), |
428 | PFERR_USER_MASK|PFERR_WRITE_MASK); | ||
382 | if (ret < 0) { | 429 | if (ret < 0) { |
383 | ret = 0; | 430 | ret = 0; |
384 | goto out; | 431 | goto out; |
@@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
392 | } | 439 | } |
393 | ret = 1; | 440 | ret = 1; |
394 | 441 | ||
395 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); | 442 | memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); |
396 | __set_bit(VCPU_EXREG_PDPTR, | 443 | __set_bit(VCPU_EXREG_PDPTR, |
397 | (unsigned long *)&vcpu->arch.regs_avail); | 444 | (unsigned long *)&vcpu->arch.regs_avail); |
398 | __set_bit(VCPU_EXREG_PDPTR, | 445 | __set_bit(VCPU_EXREG_PDPTR, |
@@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs); | |||
405 | 452 | ||
406 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) | 453 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) |
407 | { | 454 | { |
408 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | 455 | u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; |
409 | bool changed = true; | 456 | bool changed = true; |
457 | int offset; | ||
458 | gfn_t gfn; | ||
410 | int r; | 459 | int r; |
411 | 460 | ||
412 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | 461 | if (is_long_mode(vcpu) || !is_pae(vcpu)) |
@@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
416 | (unsigned long *)&vcpu->arch.regs_avail)) | 465 | (unsigned long *)&vcpu->arch.regs_avail)) |
417 | return true; | 466 | return true; |
418 | 467 | ||
419 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | 468 | gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; |
469 | offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); | ||
470 | r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), | ||
471 | PFERR_USER_MASK | PFERR_WRITE_MASK); | ||
420 | if (r < 0) | 472 | if (r < 0) |
421 | goto out; | 473 | goto out; |
422 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; | 474 | changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; |
423 | out: | 475 | out: |
424 | 476 | ||
425 | return changed; | 477 | return changed; |
@@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
458 | return 1; | 510 | return 1; |
459 | } else | 511 | } else |
460 | #endif | 512 | #endif |
461 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) | 513 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
514 | vcpu->arch.cr3)) | ||
462 | return 1; | 515 | return 1; |
463 | } | 516 | } |
464 | 517 | ||
@@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
547 | return 1; | 600 | return 1; |
548 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) | 601 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
549 | && ((cr4 ^ old_cr4) & pdptr_bits) | 602 | && ((cr4 ^ old_cr4) & pdptr_bits) |
550 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) | 603 | && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) |
551 | return 1; | 604 | return 1; |
552 | 605 | ||
553 | if (cr4 & X86_CR4_VMXE) | 606 | if (cr4 & X86_CR4_VMXE) |
@@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
580 | if (is_pae(vcpu)) { | 633 | if (is_pae(vcpu)) { |
581 | if (cr3 & CR3_PAE_RESERVED_BITS) | 634 | if (cr3 & CR3_PAE_RESERVED_BITS) |
582 | return 1; | 635 | return 1; |
583 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) | 636 | if (is_paging(vcpu) && |
637 | !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) | ||
584 | return 1; | 638 | return 1; |
585 | } | 639 | } |
586 | /* | 640 | /* |
@@ -737,7 +791,7 @@ static u32 msrs_to_save[] = { | |||
737 | #ifdef CONFIG_X86_64 | 791 | #ifdef CONFIG_X86_64 |
738 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 792 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
739 | #endif | 793 | #endif |
740 | MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA | 794 | MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA |
741 | }; | 795 | }; |
742 | 796 | ||
743 | static unsigned num_msrs_to_save; | 797 | static unsigned num_msrs_to_save; |
@@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | |||
838 | 892 | ||
839 | /* | 893 | /* |
840 | * The guest calculates current wall clock time by adding | 894 | * The guest calculates current wall clock time by adding |
841 | * system time (updated by kvm_write_guest_time below) to the | 895 | * system time (updated by kvm_guest_time_update below) to the |
842 | * wall clock specified here. guest system time equals host | 896 | * wall clock specified here. guest system time equals host |
843 | * system time for us, thus we must fill in host boot time here. | 897 | * system time for us, thus we must fill in host boot time here. |
844 | */ | 898 | */ |
@@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) | |||
866 | return quotient; | 920 | return quotient; |
867 | } | 921 | } |
868 | 922 | ||
869 | static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) | 923 | static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, |
924 | s8 *pshift, u32 *pmultiplier) | ||
870 | { | 925 | { |
871 | uint64_t nsecs = 1000000000LL; | 926 | uint64_t scaled64; |
872 | int32_t shift = 0; | 927 | int32_t shift = 0; |
873 | uint64_t tps64; | 928 | uint64_t tps64; |
874 | uint32_t tps32; | 929 | uint32_t tps32; |
875 | 930 | ||
876 | tps64 = tsc_khz * 1000LL; | 931 | tps64 = base_khz * 1000LL; |
877 | while (tps64 > nsecs*2) { | 932 | scaled64 = scaled_khz * 1000LL; |
933 | while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { | ||
878 | tps64 >>= 1; | 934 | tps64 >>= 1; |
879 | shift--; | 935 | shift--; |
880 | } | 936 | } |
881 | 937 | ||
882 | tps32 = (uint32_t)tps64; | 938 | tps32 = (uint32_t)tps64; |
883 | while (tps32 <= (uint32_t)nsecs) { | 939 | while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { |
884 | tps32 <<= 1; | 940 | if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) |
941 | scaled64 >>= 1; | ||
942 | else | ||
943 | tps32 <<= 1; | ||
885 | shift++; | 944 | shift++; |
886 | } | 945 | } |
887 | 946 | ||
888 | hv_clock->tsc_shift = shift; | 947 | *pshift = shift; |
889 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); | 948 | *pmultiplier = div_frac(scaled64, tps32); |
890 | 949 | ||
891 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", | 950 | pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", |
892 | __func__, tsc_khz, hv_clock->tsc_shift, | 951 | __func__, base_khz, scaled_khz, shift, *pmultiplier); |
893 | hv_clock->tsc_to_system_mul); | 952 | } |
953 | |||
954 | static inline u64 get_kernel_ns(void) | ||
955 | { | ||
956 | struct timespec ts; | ||
957 | |||
958 | WARN_ON(preemptible()); | ||
959 | ktime_get_ts(&ts); | ||
960 | monotonic_to_bootbased(&ts); | ||
961 | return timespec_to_ns(&ts); | ||
894 | } | 962 | } |
895 | 963 | ||
896 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); | 964 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
965 | unsigned long max_tsc_khz; | ||
897 | 966 | ||
898 | static void kvm_write_guest_time(struct kvm_vcpu *v) | 967 | static inline int kvm_tsc_changes_freq(void) |
968 | { | ||
969 | int cpu = get_cpu(); | ||
970 | int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && | ||
971 | cpufreq_quick_get(cpu) != 0; | ||
972 | put_cpu(); | ||
973 | return ret; | ||
974 | } | ||
975 | |||
976 | static inline u64 nsec_to_cycles(u64 nsec) | ||
977 | { | ||
978 | u64 ret; | ||
979 | |||
980 | WARN_ON(preemptible()); | ||
981 | if (kvm_tsc_changes_freq()) | ||
982 | printk_once(KERN_WARNING | ||
983 | "kvm: unreliable cycle conversion on adjustable rate TSC\n"); | ||
984 | ret = nsec * __get_cpu_var(cpu_tsc_khz); | ||
985 | do_div(ret, USEC_PER_SEC); | ||
986 | return ret; | ||
987 | } | ||
988 | |||
989 | static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) | ||
990 | { | ||
991 | /* Compute a scale to convert nanoseconds in TSC cycles */ | ||
992 | kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, | ||
993 | &kvm->arch.virtual_tsc_shift, | ||
994 | &kvm->arch.virtual_tsc_mult); | ||
995 | kvm->arch.virtual_tsc_khz = this_tsc_khz; | ||
996 | } | ||
997 | |||
998 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) | ||
999 | { | ||
1000 | u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, | ||
1001 | vcpu->kvm->arch.virtual_tsc_mult, | ||
1002 | vcpu->kvm->arch.virtual_tsc_shift); | ||
1003 | tsc += vcpu->arch.last_tsc_write; | ||
1004 | return tsc; | ||
1005 | } | ||
1006 | |||
1007 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) | ||
1008 | { | ||
1009 | struct kvm *kvm = vcpu->kvm; | ||
1010 | u64 offset, ns, elapsed; | ||
1011 | unsigned long flags; | ||
1012 | s64 sdiff; | ||
1013 | |||
1014 | spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); | ||
1015 | offset = data - native_read_tsc(); | ||
1016 | ns = get_kernel_ns(); | ||
1017 | elapsed = ns - kvm->arch.last_tsc_nsec; | ||
1018 | sdiff = data - kvm->arch.last_tsc_write; | ||
1019 | if (sdiff < 0) | ||
1020 | sdiff = -sdiff; | ||
1021 | |||
1022 | /* | ||
1023 | * Special case: close write to TSC within 5 seconds of | ||
1024 | * another CPU is interpreted as an attempt to synchronize | ||
1025 | * The 5 seconds is to accommodate host load / swapping as | ||
1026 | * well as any reset of TSC during the boot process. | ||
1027 | * | ||
1028 | * In that case, for a reliable TSC, we can match TSC offsets, | ||
1029 | * or make a best guess using elapsed value. | ||
1030 | */ | ||
1031 | if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && | ||
1032 | elapsed < 5ULL * NSEC_PER_SEC) { | ||
1033 | if (!check_tsc_unstable()) { | ||
1034 | offset = kvm->arch.last_tsc_offset; | ||
1035 | pr_debug("kvm: matched tsc offset for %llu\n", data); | ||
1036 | } else { | ||
1037 | u64 delta = nsec_to_cycles(elapsed); | ||
1038 | offset += delta; | ||
1039 | pr_debug("kvm: adjusted tsc offset by %llu\n", delta); | ||
1040 | } | ||
1041 | ns = kvm->arch.last_tsc_nsec; | ||
1042 | } | ||
1043 | kvm->arch.last_tsc_nsec = ns; | ||
1044 | kvm->arch.last_tsc_write = data; | ||
1045 | kvm->arch.last_tsc_offset = offset; | ||
1046 | kvm_x86_ops->write_tsc_offset(vcpu, offset); | ||
1047 | spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); | ||
1048 | |||
1049 | /* Reset of TSC must disable overshoot protection below */ | ||
1050 | vcpu->arch.hv_clock.tsc_timestamp = 0; | ||
1051 | vcpu->arch.last_tsc_write = data; | ||
1052 | vcpu->arch.last_tsc_nsec = ns; | ||
1053 | } | ||
1054 | EXPORT_SYMBOL_GPL(kvm_write_tsc); | ||
1055 | |||
1056 | static int kvm_guest_time_update(struct kvm_vcpu *v) | ||
899 | { | 1057 | { |
900 | struct timespec ts; | ||
901 | unsigned long flags; | 1058 | unsigned long flags; |
902 | struct kvm_vcpu_arch *vcpu = &v->arch; | 1059 | struct kvm_vcpu_arch *vcpu = &v->arch; |
903 | void *shared_kaddr; | 1060 | void *shared_kaddr; |
904 | unsigned long this_tsc_khz; | 1061 | unsigned long this_tsc_khz; |
1062 | s64 kernel_ns, max_kernel_ns; | ||
1063 | u64 tsc_timestamp; | ||
905 | 1064 | ||
906 | if ((!vcpu->time_page)) | 1065 | /* Keep irq disabled to prevent changes to the clock */ |
907 | return; | 1066 | local_irq_save(flags); |
1067 | kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); | ||
1068 | kernel_ns = get_kernel_ns(); | ||
1069 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); | ||
908 | 1070 | ||
909 | this_tsc_khz = get_cpu_var(cpu_tsc_khz); | 1071 | if (unlikely(this_tsc_khz == 0)) { |
910 | if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { | 1072 | local_irq_restore(flags); |
911 | kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); | 1073 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); |
912 | vcpu->hv_clock_tsc_khz = this_tsc_khz; | 1074 | return 1; |
1075 | } | ||
1076 | |||
1077 | /* | ||
1078 | * We may have to catch up the TSC to match elapsed wall clock | ||
1079 | * time for two reasons, even if kvmclock is used. | ||
1080 | * 1) CPU could have been running below the maximum TSC rate | ||
1081 | * 2) Broken TSC compensation resets the base at each VCPU | ||
1082 | * entry to avoid unknown leaps of TSC even when running | ||
1083 | * again on the same CPU. This may cause apparent elapsed | ||
1084 | * time to disappear, and the guest to stand still or run | ||
1085 | * very slowly. | ||
1086 | */ | ||
1087 | if (vcpu->tsc_catchup) { | ||
1088 | u64 tsc = compute_guest_tsc(v, kernel_ns); | ||
1089 | if (tsc > tsc_timestamp) { | ||
1090 | kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); | ||
1091 | tsc_timestamp = tsc; | ||
1092 | } | ||
913 | } | 1093 | } |
914 | put_cpu_var(cpu_tsc_khz); | ||
915 | 1094 | ||
916 | /* Keep irq disabled to prevent changes to the clock */ | ||
917 | local_irq_save(flags); | ||
918 | kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); | ||
919 | ktime_get_ts(&ts); | ||
920 | monotonic_to_bootbased(&ts); | ||
921 | local_irq_restore(flags); | 1095 | local_irq_restore(flags); |
922 | 1096 | ||
923 | /* With all the info we got, fill in the values */ | 1097 | if (!vcpu->time_page) |
1098 | return 0; | ||
924 | 1099 | ||
925 | vcpu->hv_clock.system_time = ts.tv_nsec + | 1100 | /* |
926 | (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; | 1101 | * Time as measured by the TSC may go backwards when resetting the base |
1102 | * tsc_timestamp. The reason for this is that the TSC resolution is | ||
1103 | * higher than the resolution of the other clock scales. Thus, many | ||
1104 | * possible measurements of the TSC correspond to one measurement of any | ||
1105 | * other clock, and so a spread of values is possible. This is not a | ||
1106 | * problem for the computation of the nanosecond clock; with TSC rates | ||
1107 | * around 1GHZ, there can only be a few cycles which correspond to one | ||
1108 | * nanosecond value, and any path through this code will inevitably | ||
1109 | * take longer than that. However, with the kernel_ns value itself, | ||
1110 | * the precision may be much lower, down to HZ granularity. If the | ||
1111 | * first sampling of TSC against kernel_ns ends in the low part of the | ||
1112 | * range, and the second in the high end of the range, we can get: | ||
1113 | * | ||
1114 | * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new | ||
1115 | * | ||
1116 | * As the sampling errors potentially range in the thousands of cycles, | ||
1117 | * it is possible such a time value has already been observed by the | ||
1118 | * guest. To protect against this, we must compute the system time as | ||
1119 | * observed by the guest and ensure the new system time is greater. | ||
1120 | */ | ||
1121 | max_kernel_ns = 0; | ||
1122 | if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { | ||
1123 | max_kernel_ns = vcpu->last_guest_tsc - | ||
1124 | vcpu->hv_clock.tsc_timestamp; | ||
1125 | max_kernel_ns = pvclock_scale_delta(max_kernel_ns, | ||
1126 | vcpu->hv_clock.tsc_to_system_mul, | ||
1127 | vcpu->hv_clock.tsc_shift); | ||
1128 | max_kernel_ns += vcpu->last_kernel_ns; | ||
1129 | } | ||
927 | 1130 | ||
1131 | if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { | ||
1132 | kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, | ||
1133 | &vcpu->hv_clock.tsc_shift, | ||
1134 | &vcpu->hv_clock.tsc_to_system_mul); | ||
1135 | vcpu->hw_tsc_khz = this_tsc_khz; | ||
1136 | } | ||
1137 | |||
1138 | if (max_kernel_ns > kernel_ns) | ||
1139 | kernel_ns = max_kernel_ns; | ||
1140 | |||
1141 | /* With all the info we got, fill in the values */ | ||
1142 | vcpu->hv_clock.tsc_timestamp = tsc_timestamp; | ||
1143 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; | ||
1144 | vcpu->last_kernel_ns = kernel_ns; | ||
1145 | vcpu->last_guest_tsc = tsc_timestamp; | ||
928 | vcpu->hv_clock.flags = 0; | 1146 | vcpu->hv_clock.flags = 0; |
929 | 1147 | ||
930 | /* | 1148 | /* |
@@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
942 | kunmap_atomic(shared_kaddr, KM_USER0); | 1160 | kunmap_atomic(shared_kaddr, KM_USER0); |
943 | 1161 | ||
944 | mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); | 1162 | mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); |
945 | } | 1163 | return 0; |
946 | |||
947 | static int kvm_request_guest_time_update(struct kvm_vcpu *v) | ||
948 | { | ||
949 | struct kvm_vcpu_arch *vcpu = &v->arch; | ||
950 | |||
951 | if (!vcpu->time_page) | ||
952 | return 0; | ||
953 | kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); | ||
954 | return 1; | ||
955 | } | 1164 | } |
956 | 1165 | ||
957 | static bool msr_mtrr_valid(unsigned msr) | 1166 | static bool msr_mtrr_valid(unsigned msr) |
@@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1277 | } | 1486 | } |
1278 | 1487 | ||
1279 | vcpu->arch.time = data; | 1488 | vcpu->arch.time = data; |
1489 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
1280 | 1490 | ||
1281 | /* we verify if the enable bit is set... */ | 1491 | /* we verify if the enable bit is set... */ |
1282 | if (!(data & 1)) | 1492 | if (!(data & 1)) |
@@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1292 | kvm_release_page_clean(vcpu->arch.time_page); | 1502 | kvm_release_page_clean(vcpu->arch.time_page); |
1293 | vcpu->arch.time_page = NULL; | 1503 | vcpu->arch.time_page = NULL; |
1294 | } | 1504 | } |
1295 | |||
1296 | kvm_request_guest_time_update(vcpu); | ||
1297 | break; | 1505 | break; |
1298 | } | 1506 | } |
1299 | case MSR_IA32_MCG_CTL: | 1507 | case MSR_IA32_MCG_CTL: |
@@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1330 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | 1538 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " |
1331 | "0x%x data 0x%llx\n", msr, data); | 1539 | "0x%x data 0x%llx\n", msr, data); |
1332 | break; | 1540 | break; |
1541 | case MSR_K7_CLK_CTL: | ||
1542 | /* | ||
1543 | * Ignore all writes to this no longer documented MSR. | ||
1544 | * Writes are only relevant for old K7 processors, | ||
1545 | * all pre-dating SVM, but a recommended workaround from | ||
1546 | * AMD for these chips. It is possible to specify the | ||
1547 | * affected processor models on the command line, hence | ||
1548 | * the need to ignore the workaround. | ||
1549 | */ | ||
1550 | break; | ||
1333 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: | 1551 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: |
1334 | if (kvm_hv_msr_partition_wide(msr)) { | 1552 | if (kvm_hv_msr_partition_wide(msr)) { |
1335 | int r; | 1553 | int r; |
@@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1522 | case 0xcd: /* fsb frequency */ | 1740 | case 0xcd: /* fsb frequency */ |
1523 | data = 3; | 1741 | data = 3; |
1524 | break; | 1742 | break; |
1743 | /* | ||
1744 | * MSR_EBC_FREQUENCY_ID | ||
1745 | * Conservative value valid for even the basic CPU models. | ||
1746 | * Models 0,1: 000 in bits 23:21 indicating a bus speed of | ||
1747 | * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, | ||
1748 | * and 266MHz for model 3, or 4. Set Core Clock | ||
1749 | * Frequency to System Bus Frequency Ratio to 1 (bits | ||
1750 | * 31:24) even though these are only valid for CPU | ||
1751 | * models > 2, however guests may end up dividing or | ||
1752 | * multiplying by zero otherwise. | ||
1753 | */ | ||
1754 | case MSR_EBC_FREQUENCY_ID: | ||
1755 | data = 1 << 24; | ||
1756 | break; | ||
1525 | case MSR_IA32_APICBASE: | 1757 | case MSR_IA32_APICBASE: |
1526 | data = kvm_get_apic_base(vcpu); | 1758 | data = kvm_get_apic_base(vcpu); |
1527 | break; | 1759 | break; |
@@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1555 | case MSR_IA32_MCG_STATUS: | 1787 | case MSR_IA32_MCG_STATUS: |
1556 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1788 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
1557 | return get_msr_mce(vcpu, msr, pdata); | 1789 | return get_msr_mce(vcpu, msr, pdata); |
1790 | case MSR_K7_CLK_CTL: | ||
1791 | /* | ||
1792 | * Provide expected ramp-up count for K7. All other | ||
1793 | * are set to zero, indicating minimum divisors for | ||
1794 | * every field. | ||
1795 | * | ||
1796 | * This prevents guest kernels on AMD host with CPU | ||
1797 | * type 6, model 8 and higher from exploding due to | ||
1798 | * the rdmsr failing. | ||
1799 | */ | ||
1800 | data = 0x20000000; | ||
1801 | break; | ||
1558 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: | 1802 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: |
1559 | if (kvm_hv_msr_partition_wide(msr)) { | 1803 | if (kvm_hv_msr_partition_wide(msr)) { |
1560 | int r; | 1804 | int r; |
@@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
1808 | } | 2052 | } |
1809 | 2053 | ||
1810 | kvm_x86_ops->vcpu_load(vcpu, cpu); | 2054 | kvm_x86_ops->vcpu_load(vcpu, cpu); |
1811 | if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { | 2055 | if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { |
1812 | unsigned long khz = cpufreq_quick_get(cpu); | 2056 | /* Make sure TSC doesn't go backwards */ |
1813 | if (!khz) | 2057 | s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : |
1814 | khz = tsc_khz; | 2058 | native_read_tsc() - vcpu->arch.last_host_tsc; |
1815 | per_cpu(cpu_tsc_khz, cpu) = khz; | 2059 | if (tsc_delta < 0) |
2060 | mark_tsc_unstable("KVM discovered backwards TSC"); | ||
2061 | if (check_tsc_unstable()) { | ||
2062 | kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); | ||
2063 | vcpu->arch.tsc_catchup = 1; | ||
2064 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
2065 | } | ||
2066 | if (vcpu->cpu != cpu) | ||
2067 | kvm_migrate_timers(vcpu); | ||
2068 | vcpu->cpu = cpu; | ||
1816 | } | 2069 | } |
1817 | kvm_request_guest_time_update(vcpu); | ||
1818 | } | 2070 | } |
1819 | 2071 | ||
1820 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 2072 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
1821 | { | 2073 | { |
1822 | kvm_x86_ops->vcpu_put(vcpu); | 2074 | kvm_x86_ops->vcpu_put(vcpu); |
1823 | kvm_put_guest_fpu(vcpu); | 2075 | kvm_put_guest_fpu(vcpu); |
2076 | vcpu->arch.last_host_tsc = native_read_tsc(); | ||
1824 | } | 2077 | } |
1825 | 2078 | ||
1826 | static int is_efer_nx(void) | 2079 | static int is_efer_nx(void) |
@@ -1991,13 +2244,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1991 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | 2244 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | |
1992 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 2245 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
1993 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | | 2246 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
1994 | 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); | 2247 | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | |
2248 | F(F16C); | ||
1995 | /* cpuid 0x80000001.ecx */ | 2249 | /* cpuid 0x80000001.ecx */ |
1996 | const u32 kvm_supported_word6_x86_features = | 2250 | const u32 kvm_supported_word6_x86_features = |
1997 | F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | | 2251 | F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | |
1998 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | | 2252 | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | |
1999 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | | 2253 | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | |
2000 | 0 /* SKINIT */ | 0 /* WDT */; | 2254 | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); |
2001 | 2255 | ||
2002 | /* all calls to cpuid_count() should be made on the same cpu */ | 2256 | /* all calls to cpuid_count() should be made on the same cpu */ |
2003 | get_cpu(); | 2257 | get_cpu(); |
@@ -2203,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
2203 | return -ENXIO; | 2457 | return -ENXIO; |
2204 | 2458 | ||
2205 | kvm_queue_interrupt(vcpu, irq->irq, false); | 2459 | kvm_queue_interrupt(vcpu, irq->irq, false); |
2460 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
2206 | 2461 | ||
2207 | return 0; | 2462 | return 0; |
2208 | } | 2463 | } |
@@ -2356,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | |||
2356 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) | 2611 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) |
2357 | vcpu->arch.sipi_vector = events->sipi_vector; | 2612 | vcpu->arch.sipi_vector = events->sipi_vector; |
2358 | 2613 | ||
2614 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
2615 | |||
2359 | return 0; | 2616 | return 0; |
2360 | } | 2617 | } |
2361 | 2618 | ||
@@ -2759,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | |||
2759 | 3016 | ||
2760 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | 3017 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) |
2761 | { | 3018 | { |
2762 | return kvm->arch.n_alloc_mmu_pages; | 3019 | return kvm->arch.n_max_mmu_pages; |
2763 | } | 3020 | } |
2764 | 3021 | ||
2765 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | 3022 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) |
@@ -2795,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
2795 | r = 0; | 3052 | r = 0; |
2796 | switch (chip->chip_id) { | 3053 | switch (chip->chip_id) { |
2797 | case KVM_IRQCHIP_PIC_MASTER: | 3054 | case KVM_IRQCHIP_PIC_MASTER: |
2798 | raw_spin_lock(&pic_irqchip(kvm)->lock); | 3055 | spin_lock(&pic_irqchip(kvm)->lock); |
2799 | memcpy(&pic_irqchip(kvm)->pics[0], | 3056 | memcpy(&pic_irqchip(kvm)->pics[0], |
2800 | &chip->chip.pic, | 3057 | &chip->chip.pic, |
2801 | sizeof(struct kvm_pic_state)); | 3058 | sizeof(struct kvm_pic_state)); |
2802 | raw_spin_unlock(&pic_irqchip(kvm)->lock); | 3059 | spin_unlock(&pic_irqchip(kvm)->lock); |
2803 | break; | 3060 | break; |
2804 | case KVM_IRQCHIP_PIC_SLAVE: | 3061 | case KVM_IRQCHIP_PIC_SLAVE: |
2805 | raw_spin_lock(&pic_irqchip(kvm)->lock); | 3062 | spin_lock(&pic_irqchip(kvm)->lock); |
2806 | memcpy(&pic_irqchip(kvm)->pics[1], | 3063 | memcpy(&pic_irqchip(kvm)->pics[1], |
2807 | &chip->chip.pic, | 3064 | &chip->chip.pic, |
2808 | sizeof(struct kvm_pic_state)); | 3065 | sizeof(struct kvm_pic_state)); |
2809 | raw_spin_unlock(&pic_irqchip(kvm)->lock); | 3066 | spin_unlock(&pic_irqchip(kvm)->lock); |
2810 | break; | 3067 | break; |
2811 | case KVM_IRQCHIP_IOAPIC: | 3068 | case KVM_IRQCHIP_IOAPIC: |
2812 | r = kvm_set_ioapic(kvm, &chip->chip.ioapic); | 3069 | r = kvm_set_ioapic(kvm, &chip->chip.ioapic); |
@@ -3200,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3200 | break; | 3457 | break; |
3201 | } | 3458 | } |
3202 | case KVM_SET_CLOCK: { | 3459 | case KVM_SET_CLOCK: { |
3203 | struct timespec now; | ||
3204 | struct kvm_clock_data user_ns; | 3460 | struct kvm_clock_data user_ns; |
3205 | u64 now_ns; | 3461 | u64 now_ns; |
3206 | s64 delta; | 3462 | s64 delta; |
@@ -3214,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3214 | goto out; | 3470 | goto out; |
3215 | 3471 | ||
3216 | r = 0; | 3472 | r = 0; |
3217 | ktime_get_ts(&now); | 3473 | local_irq_disable(); |
3218 | now_ns = timespec_to_ns(&now); | 3474 | now_ns = get_kernel_ns(); |
3219 | delta = user_ns.clock - now_ns; | 3475 | delta = user_ns.clock - now_ns; |
3476 | local_irq_enable(); | ||
3220 | kvm->arch.kvmclock_offset = delta; | 3477 | kvm->arch.kvmclock_offset = delta; |
3221 | break; | 3478 | break; |
3222 | } | 3479 | } |
3223 | case KVM_GET_CLOCK: { | 3480 | case KVM_GET_CLOCK: { |
3224 | struct timespec now; | ||
3225 | struct kvm_clock_data user_ns; | 3481 | struct kvm_clock_data user_ns; |
3226 | u64 now_ns; | 3482 | u64 now_ns; |
3227 | 3483 | ||
3228 | ktime_get_ts(&now); | 3484 | local_irq_disable(); |
3229 | now_ns = timespec_to_ns(&now); | 3485 | now_ns = get_kernel_ns(); |
3230 | user_ns.clock = kvm->arch.kvmclock_offset + now_ns; | 3486 | user_ns.clock = kvm->arch.kvmclock_offset + now_ns; |
3487 | local_irq_enable(); | ||
3231 | user_ns.flags = 0; | 3488 | user_ns.flags = 0; |
3232 | 3489 | ||
3233 | r = -EFAULT; | 3490 | r = -EFAULT; |
@@ -3291,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, | |||
3291 | kvm_x86_ops->get_segment(vcpu, var, seg); | 3548 | kvm_x86_ops->get_segment(vcpu, var, seg); |
3292 | } | 3549 | } |
3293 | 3550 | ||
3551 | static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | ||
3552 | { | ||
3553 | return gpa; | ||
3554 | } | ||
3555 | |||
3556 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | ||
3557 | { | ||
3558 | gpa_t t_gpa; | ||
3559 | u32 error; | ||
3560 | |||
3561 | BUG_ON(!mmu_is_nested(vcpu)); | ||
3562 | |||
3563 | /* NPT walks are always user-walks */ | ||
3564 | access |= PFERR_USER_MASK; | ||
3565 | t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); | ||
3566 | if (t_gpa == UNMAPPED_GVA) | ||
3567 | vcpu->arch.fault.nested = true; | ||
3568 | |||
3569 | return t_gpa; | ||
3570 | } | ||
3571 | |||
3294 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3572 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) |
3295 | { | 3573 | { |
3296 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3574 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3297 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); | 3575 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); |
3298 | } | 3576 | } |
3299 | 3577 | ||
3300 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3578 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) |
3301 | { | 3579 | { |
3302 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3580 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3303 | access |= PFERR_FETCH_MASK; | 3581 | access |= PFERR_FETCH_MASK; |
3304 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); | 3582 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); |
3305 | } | 3583 | } |
3306 | 3584 | ||
3307 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3585 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) |
3308 | { | 3586 | { |
3309 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3587 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3310 | access |= PFERR_WRITE_MASK; | 3588 | access |= PFERR_WRITE_MASK; |
3311 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); | 3589 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); |
3312 | } | 3590 | } |
3313 | 3591 | ||
3314 | /* uses this to access any guest's mapped memory without checking CPL */ | 3592 | /* uses this to access any guest's mapped memory without checking CPL */ |
3315 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3593 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) |
3316 | { | 3594 | { |
3317 | return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); | 3595 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); |
3318 | } | 3596 | } |
3319 | 3597 | ||
3320 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, | 3598 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, |
@@ -3325,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, | |||
3325 | int r = X86EMUL_CONTINUE; | 3603 | int r = X86EMUL_CONTINUE; |
3326 | 3604 | ||
3327 | while (bytes) { | 3605 | while (bytes) { |
3328 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); | 3606 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, |
3607 | error); | ||
3329 | unsigned offset = addr & (PAGE_SIZE-1); | 3608 | unsigned offset = addr & (PAGE_SIZE-1); |
3330 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); | 3609 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); |
3331 | int ret; | 3610 | int ret; |
@@ -3380,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, | |||
3380 | int r = X86EMUL_CONTINUE; | 3659 | int r = X86EMUL_CONTINUE; |
3381 | 3660 | ||
3382 | while (bytes) { | 3661 | while (bytes) { |
3383 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, | 3662 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, |
3384 | PFERR_WRITE_MASK, error); | 3663 | PFERR_WRITE_MASK, |
3664 | error); | ||
3385 | unsigned offset = addr & (PAGE_SIZE-1); | 3665 | unsigned offset = addr & (PAGE_SIZE-1); |
3386 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); | 3666 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); |
3387 | int ret; | 3667 | int ret; |
@@ -3623,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, | |||
3623 | if (vcpu->arch.pio.count) | 3903 | if (vcpu->arch.pio.count) |
3624 | goto data_avail; | 3904 | goto data_avail; |
3625 | 3905 | ||
3626 | trace_kvm_pio(1, port, size, 1); | 3906 | trace_kvm_pio(0, port, size, 1); |
3627 | 3907 | ||
3628 | vcpu->arch.pio.port = port; | 3908 | vcpu->arch.pio.port = port; |
3629 | vcpu->arch.pio.in = 1; | 3909 | vcpu->arch.pio.in = 1; |
@@ -3651,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, | |||
3651 | const void *val, unsigned int count, | 3931 | const void *val, unsigned int count, |
3652 | struct kvm_vcpu *vcpu) | 3932 | struct kvm_vcpu *vcpu) |
3653 | { | 3933 | { |
3654 | trace_kvm_pio(0, port, size, 1); | 3934 | trace_kvm_pio(1, port, size, 1); |
3655 | 3935 | ||
3656 | vcpu->arch.pio.port = port; | 3936 | vcpu->arch.pio.port = port; |
3657 | vcpu->arch.pio.in = 0; | 3937 | vcpu->arch.pio.in = 0; |
@@ -3790,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) | |||
3790 | kvm_x86_ops->get_gdt(vcpu, dt); | 4070 | kvm_x86_ops->get_gdt(vcpu, dt); |
3791 | } | 4071 | } |
3792 | 4072 | ||
4073 | static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) | ||
4074 | { | ||
4075 | kvm_x86_ops->get_idt(vcpu, dt); | ||
4076 | } | ||
4077 | |||
3793 | static unsigned long emulator_get_cached_segment_base(int seg, | 4078 | static unsigned long emulator_get_cached_segment_base(int seg, |
3794 | struct kvm_vcpu *vcpu) | 4079 | struct kvm_vcpu *vcpu) |
3795 | { | 4080 | { |
@@ -3883,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = { | |||
3883 | .set_segment_selector = emulator_set_segment_selector, | 4168 | .set_segment_selector = emulator_set_segment_selector, |
3884 | .get_cached_segment_base = emulator_get_cached_segment_base, | 4169 | .get_cached_segment_base = emulator_get_cached_segment_base, |
3885 | .get_gdt = emulator_get_gdt, | 4170 | .get_gdt = emulator_get_gdt, |
4171 | .get_idt = emulator_get_idt, | ||
3886 | .get_cr = emulator_get_cr, | 4172 | .get_cr = emulator_get_cr, |
3887 | .set_cr = emulator_set_cr, | 4173 | .set_cr = emulator_set_cr, |
3888 | .cpl = emulator_get_cpl, | 4174 | .cpl = emulator_get_cpl, |
@@ -3918,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) | |||
3918 | { | 4204 | { |
3919 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 4205 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
3920 | if (ctxt->exception == PF_VECTOR) | 4206 | if (ctxt->exception == PF_VECTOR) |
3921 | kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); | 4207 | kvm_propagate_fault(vcpu); |
3922 | else if (ctxt->error_code_valid) | 4208 | else if (ctxt->error_code_valid) |
3923 | kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); | 4209 | kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); |
3924 | else | 4210 | else |
3925 | kvm_queue_exception(vcpu, ctxt->exception); | 4211 | kvm_queue_exception(vcpu, ctxt->exception); |
3926 | } | 4212 | } |
3927 | 4213 | ||
4214 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | ||
4215 | { | ||
4216 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | ||
4217 | int cs_db, cs_l; | ||
4218 | |||
4219 | cache_all_regs(vcpu); | ||
4220 | |||
4221 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
4222 | |||
4223 | vcpu->arch.emulate_ctxt.vcpu = vcpu; | ||
4224 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
4225 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | ||
4226 | vcpu->arch.emulate_ctxt.mode = | ||
4227 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | ||
4228 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
4229 | ? X86EMUL_MODE_VM86 : cs_l | ||
4230 | ? X86EMUL_MODE_PROT64 : cs_db | ||
4231 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
4232 | memset(c, 0, sizeof(struct decode_cache)); | ||
4233 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
4234 | } | ||
4235 | |||
4236 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) | ||
4237 | { | ||
4238 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | ||
4239 | int ret; | ||
4240 | |||
4241 | init_emulate_ctxt(vcpu); | ||
4242 | |||
4243 | vcpu->arch.emulate_ctxt.decode.op_bytes = 2; | ||
4244 | vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; | ||
4245 | vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; | ||
4246 | ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); | ||
4247 | |||
4248 | if (ret != X86EMUL_CONTINUE) | ||
4249 | return EMULATE_FAIL; | ||
4250 | |||
4251 | vcpu->arch.emulate_ctxt.eip = c->eip; | ||
4252 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
4253 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4254 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4255 | |||
4256 | if (irq == NMI_VECTOR) | ||
4257 | vcpu->arch.nmi_pending = false; | ||
4258 | else | ||
4259 | vcpu->arch.interrupt.pending = false; | ||
4260 | |||
4261 | return EMULATE_DONE; | ||
4262 | } | ||
4263 | EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); | ||
4264 | |||
3928 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) | 4265 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) |
3929 | { | 4266 | { |
3930 | ++vcpu->stat.insn_emulation_fail; | 4267 | ++vcpu->stat.insn_emulation_fail; |
@@ -3981,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
3981 | cache_all_regs(vcpu); | 4318 | cache_all_regs(vcpu); |
3982 | 4319 | ||
3983 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4320 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
3984 | int cs_db, cs_l; | 4321 | init_emulate_ctxt(vcpu); |
3985 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
3986 | |||
3987 | vcpu->arch.emulate_ctxt.vcpu = vcpu; | ||
3988 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
3989 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | ||
3990 | vcpu->arch.emulate_ctxt.mode = | ||
3991 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | ||
3992 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
3993 | ? X86EMUL_MODE_VM86 : cs_l | ||
3994 | ? X86EMUL_MODE_PROT64 : cs_db | ||
3995 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
3996 | memset(c, 0, sizeof(struct decode_cache)); | ||
3997 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
3998 | vcpu->arch.emulate_ctxt.interruptibility = 0; | 4322 | vcpu->arch.emulate_ctxt.interruptibility = 0; |
3999 | vcpu->arch.emulate_ctxt.exception = -1; | 4323 | vcpu->arch.emulate_ctxt.exception = -1; |
4324 | vcpu->arch.emulate_ctxt.perm_ok = false; | ||
4325 | |||
4326 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt); | ||
4327 | if (r == X86EMUL_PROPAGATE_FAULT) | ||
4328 | goto done; | ||
4000 | 4329 | ||
4001 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | ||
4002 | trace_kvm_emulate_insn_start(vcpu); | 4330 | trace_kvm_emulate_insn_start(vcpu); |
4003 | 4331 | ||
4004 | /* Only allow emulation of specific instructions on #UD | 4332 | /* Only allow emulation of specific instructions on #UD |
@@ -4048,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
4048 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | 4376 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); |
4049 | 4377 | ||
4050 | restart: | 4378 | restart: |
4051 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 4379 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); |
4052 | 4380 | ||
4053 | if (r) { /* emulation failed */ | 4381 | if (r == EMULATION_FAILED) { |
4054 | if (reexecute_instruction(vcpu, cr2)) | 4382 | if (reexecute_instruction(vcpu, cr2)) |
4055 | return EMULATE_DONE; | 4383 | return EMULATE_DONE; |
4056 | 4384 | ||
4057 | return handle_emulation_failure(vcpu); | 4385 | return handle_emulation_failure(vcpu); |
4058 | } | 4386 | } |
4059 | 4387 | ||
4060 | toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); | 4388 | done: |
4061 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4062 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
4063 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4064 | |||
4065 | if (vcpu->arch.emulate_ctxt.exception >= 0) { | 4389 | if (vcpu->arch.emulate_ctxt.exception >= 0) { |
4066 | inject_emulated_exception(vcpu); | 4390 | inject_emulated_exception(vcpu); |
4067 | return EMULATE_DONE; | 4391 | r = EMULATE_DONE; |
4068 | } | 4392 | } else if (vcpu->arch.pio.count) { |
4069 | |||
4070 | if (vcpu->arch.pio.count) { | ||
4071 | if (!vcpu->arch.pio.in) | 4393 | if (!vcpu->arch.pio.in) |
4072 | vcpu->arch.pio.count = 0; | 4394 | vcpu->arch.pio.count = 0; |
4073 | return EMULATE_DO_MMIO; | 4395 | r = EMULATE_DO_MMIO; |
4074 | } | 4396 | } else if (vcpu->mmio_needed) { |
4075 | |||
4076 | if (vcpu->mmio_needed) { | ||
4077 | if (vcpu->mmio_is_write) | 4397 | if (vcpu->mmio_is_write) |
4078 | vcpu->mmio_needed = 0; | 4398 | vcpu->mmio_needed = 0; |
4079 | return EMULATE_DO_MMIO; | 4399 | r = EMULATE_DO_MMIO; |
4080 | } | 4400 | } else if (r == EMULATION_RESTART) |
4081 | |||
4082 | if (vcpu->arch.emulate_ctxt.restart) | ||
4083 | goto restart; | 4401 | goto restart; |
4402 | else | ||
4403 | r = EMULATE_DONE; | ||
4084 | 4404 | ||
4085 | return EMULATE_DONE; | 4405 | toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); |
4406 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
4407 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
4408 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | ||
4409 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | ||
4410 | |||
4411 | return r; | ||
4086 | } | 4412 | } |
4087 | EXPORT_SYMBOL_GPL(emulate_instruction); | 4413 | EXPORT_SYMBOL_GPL(emulate_instruction); |
4088 | 4414 | ||
@@ -4096,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) | |||
4096 | } | 4422 | } |
4097 | EXPORT_SYMBOL_GPL(kvm_fast_pio_out); | 4423 | EXPORT_SYMBOL_GPL(kvm_fast_pio_out); |
4098 | 4424 | ||
4099 | static void bounce_off(void *info) | 4425 | static void tsc_bad(void *info) |
4426 | { | ||
4427 | __get_cpu_var(cpu_tsc_khz) = 0; | ||
4428 | } | ||
4429 | |||
4430 | static void tsc_khz_changed(void *data) | ||
4100 | { | 4431 | { |
4101 | /* nothing */ | 4432 | struct cpufreq_freqs *freq = data; |
4433 | unsigned long khz = 0; | ||
4434 | |||
4435 | if (data) | ||
4436 | khz = freq->new; | ||
4437 | else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | ||
4438 | khz = cpufreq_quick_get(raw_smp_processor_id()); | ||
4439 | if (!khz) | ||
4440 | khz = tsc_khz; | ||
4441 | __get_cpu_var(cpu_tsc_khz) = khz; | ||
4102 | } | 4442 | } |
4103 | 4443 | ||
4104 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | 4444 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, |
@@ -4109,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
4109 | struct kvm_vcpu *vcpu; | 4449 | struct kvm_vcpu *vcpu; |
4110 | int i, send_ipi = 0; | 4450 | int i, send_ipi = 0; |
4111 | 4451 | ||
4452 | /* | ||
4453 | * We allow guests to temporarily run on slowing clocks, | ||
4454 | * provided we notify them after, or to run on accelerating | ||
4455 | * clocks, provided we notify them before. Thus time never | ||
4456 | * goes backwards. | ||
4457 | * | ||
4458 | * However, we have a problem. We can't atomically update | ||
4459 | * the frequency of a given CPU from this function; it is | ||
4460 | * merely a notifier, which can be called from any CPU. | ||
4461 | * Changing the TSC frequency at arbitrary points in time | ||
4462 | * requires a recomputation of local variables related to | ||
4463 | * the TSC for each VCPU. We must flag these local variables | ||
4464 | * to be updated and be sure the update takes place with the | ||
4465 | * new frequency before any guests proceed. | ||
4466 | * | ||
4467 | * Unfortunately, the combination of hotplug CPU and frequency | ||
4468 | * change creates an intractable locking scenario; the order | ||
4469 | * of when these callouts happen is undefined with respect to | ||
4470 | * CPU hotplug, and they can race with each other. As such, | ||
4471 | * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is | ||
4472 | * undefined; you can actually have a CPU frequency change take | ||
4473 | * place in between the computation of X and the setting of the | ||
4474 | * variable. To protect against this problem, all updates of | ||
4475 | * the per_cpu tsc_khz variable are done in an interrupt | ||
4476 | * protected IPI, and all callers wishing to update the value | ||
4477 | * must wait for a synchronous IPI to complete (which is trivial | ||
4478 | * if the caller is on the CPU already). This establishes the | ||
4479 | * necessary total order on variable updates. | ||
4480 | * | ||
4481 | * Note that because a guest time update may take place | ||
4482 | * anytime after the setting of the VCPU's request bit, the | ||
4483 | * correct TSC value must be set before the request. However, | ||
4484 | * to ensure the update actually makes it to any guest which | ||
4485 | * starts running in hardware virtualization between the set | ||
4486 | * and the acquisition of the spinlock, we must also ping the | ||
4487 | * CPU after setting the request bit. | ||
4488 | * | ||
4489 | */ | ||
4490 | |||
4112 | if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) | 4491 | if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) |
4113 | return 0; | 4492 | return 0; |
4114 | if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) | 4493 | if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) |
4115 | return 0; | 4494 | return 0; |
4116 | per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; | 4495 | |
4496 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); | ||
4117 | 4497 | ||
4118 | spin_lock(&kvm_lock); | 4498 | spin_lock(&kvm_lock); |
4119 | list_for_each_entry(kvm, &vm_list, vm_list) { | 4499 | list_for_each_entry(kvm, &vm_list, vm_list) { |
4120 | kvm_for_each_vcpu(i, vcpu, kvm) { | 4500 | kvm_for_each_vcpu(i, vcpu, kvm) { |
4121 | if (vcpu->cpu != freq->cpu) | 4501 | if (vcpu->cpu != freq->cpu) |
4122 | continue; | 4502 | continue; |
4123 | if (!kvm_request_guest_time_update(vcpu)) | 4503 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
4124 | continue; | ||
4125 | if (vcpu->cpu != smp_processor_id()) | 4504 | if (vcpu->cpu != smp_processor_id()) |
4126 | send_ipi++; | 4505 | send_ipi = 1; |
4127 | } | 4506 | } |
4128 | } | 4507 | } |
4129 | spin_unlock(&kvm_lock); | 4508 | spin_unlock(&kvm_lock); |
@@ -4141,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
4141 | * guest context is entered kvmclock will be updated, | 4520 | * guest context is entered kvmclock will be updated, |
4142 | * so the guest will not see stale values. | 4521 | * so the guest will not see stale values. |
4143 | */ | 4522 | */ |
4144 | smp_call_function_single(freq->cpu, bounce_off, NULL, 1); | 4523 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); |
4145 | } | 4524 | } |
4146 | return 0; | 4525 | return 0; |
4147 | } | 4526 | } |
4148 | 4527 | ||
4149 | static struct notifier_block kvmclock_cpufreq_notifier_block = { | 4528 | static struct notifier_block kvmclock_cpufreq_notifier_block = { |
4150 | .notifier_call = kvmclock_cpufreq_notifier | 4529 | .notifier_call = kvmclock_cpufreq_notifier |
4530 | }; | ||
4531 | |||
4532 | static int kvmclock_cpu_notifier(struct notifier_block *nfb, | ||
4533 | unsigned long action, void *hcpu) | ||
4534 | { | ||
4535 | unsigned int cpu = (unsigned long)hcpu; | ||
4536 | |||
4537 | switch (action) { | ||
4538 | case CPU_ONLINE: | ||
4539 | case CPU_DOWN_FAILED: | ||
4540 | smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); | ||
4541 | break; | ||
4542 | case CPU_DOWN_PREPARE: | ||
4543 | smp_call_function_single(cpu, tsc_bad, NULL, 1); | ||
4544 | break; | ||
4545 | } | ||
4546 | return NOTIFY_OK; | ||
4547 | } | ||
4548 | |||
4549 | static struct notifier_block kvmclock_cpu_notifier_block = { | ||
4550 | .notifier_call = kvmclock_cpu_notifier, | ||
4551 | .priority = -INT_MAX | ||
4151 | }; | 4552 | }; |
4152 | 4553 | ||
4153 | static void kvm_timer_init(void) | 4554 | static void kvm_timer_init(void) |
4154 | { | 4555 | { |
4155 | int cpu; | 4556 | int cpu; |
4156 | 4557 | ||
4558 | max_tsc_khz = tsc_khz; | ||
4559 | register_hotcpu_notifier(&kvmclock_cpu_notifier_block); | ||
4157 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { | 4560 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
4561 | #ifdef CONFIG_CPU_FREQ | ||
4562 | struct cpufreq_policy policy; | ||
4563 | memset(&policy, 0, sizeof(policy)); | ||
4564 | cpufreq_get_policy(&policy, get_cpu()); | ||
4565 | if (policy.cpuinfo.max_freq) | ||
4566 | max_tsc_khz = policy.cpuinfo.max_freq; | ||
4567 | #endif | ||
4158 | cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, | 4568 | cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, |
4159 | CPUFREQ_TRANSITION_NOTIFIER); | 4569 | CPUFREQ_TRANSITION_NOTIFIER); |
4160 | for_each_online_cpu(cpu) { | ||
4161 | unsigned long khz = cpufreq_get(cpu); | ||
4162 | if (!khz) | ||
4163 | khz = tsc_khz; | ||
4164 | per_cpu(cpu_tsc_khz, cpu) = khz; | ||
4165 | } | ||
4166 | } else { | ||
4167 | for_each_possible_cpu(cpu) | ||
4168 | per_cpu(cpu_tsc_khz, cpu) = tsc_khz; | ||
4169 | } | 4570 | } |
4571 | pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); | ||
4572 | for_each_online_cpu(cpu) | ||
4573 | smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); | ||
4170 | } | 4574 | } |
4171 | 4575 | ||
4172 | static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); | 4576 | static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); |
@@ -4268,6 +4672,7 @@ void kvm_arch_exit(void) | |||
4268 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | 4672 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) |
4269 | cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, | 4673 | cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, |
4270 | CPUFREQ_TRANSITION_NOTIFIER); | 4674 | CPUFREQ_TRANSITION_NOTIFIER); |
4675 | unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); | ||
4271 | kvm_x86_ops = NULL; | 4676 | kvm_x86_ops = NULL; |
4272 | kvm_mmu_module_exit(); | 4677 | kvm_mmu_module_exit(); |
4273 | } | 4678 | } |
@@ -4683,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4683 | kvm_mmu_unload(vcpu); | 5088 | kvm_mmu_unload(vcpu); |
4684 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) | 5089 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) |
4685 | __kvm_migrate_timers(vcpu); | 5090 | __kvm_migrate_timers(vcpu); |
4686 | if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) | 5091 | if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { |
4687 | kvm_write_guest_time(vcpu); | 5092 | r = kvm_guest_time_update(vcpu); |
5093 | if (unlikely(r)) | ||
5094 | goto out; | ||
5095 | } | ||
4688 | if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) | 5096 | if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) |
4689 | kvm_mmu_sync_roots(vcpu); | 5097 | kvm_mmu_sync_roots(vcpu); |
4690 | if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) | 5098 | if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) |
@@ -4709,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4709 | if (unlikely(r)) | 5117 | if (unlikely(r)) |
4710 | goto out; | 5118 | goto out; |
4711 | 5119 | ||
5120 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { | ||
5121 | inject_pending_event(vcpu); | ||
5122 | |||
5123 | /* enable NMI/IRQ window open exits if needed */ | ||
5124 | if (vcpu->arch.nmi_pending) | ||
5125 | kvm_x86_ops->enable_nmi_window(vcpu); | ||
5126 | else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) | ||
5127 | kvm_x86_ops->enable_irq_window(vcpu); | ||
5128 | |||
5129 | if (kvm_lapic_enabled(vcpu)) { | ||
5130 | update_cr8_intercept(vcpu); | ||
5131 | kvm_lapic_sync_to_vapic(vcpu); | ||
5132 | } | ||
5133 | } | ||
5134 | |||
4712 | preempt_disable(); | 5135 | preempt_disable(); |
4713 | 5136 | ||
4714 | kvm_x86_ops->prepare_guest_switch(vcpu); | 5137 | kvm_x86_ops->prepare_guest_switch(vcpu); |
@@ -4727,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4727 | smp_wmb(); | 5150 | smp_wmb(); |
4728 | local_irq_enable(); | 5151 | local_irq_enable(); |
4729 | preempt_enable(); | 5152 | preempt_enable(); |
5153 | kvm_x86_ops->cancel_injection(vcpu); | ||
4730 | r = 1; | 5154 | r = 1; |
4731 | goto out; | 5155 | goto out; |
4732 | } | 5156 | } |
4733 | 5157 | ||
4734 | inject_pending_event(vcpu); | ||
4735 | |||
4736 | /* enable NMI/IRQ window open exits if needed */ | ||
4737 | if (vcpu->arch.nmi_pending) | ||
4738 | kvm_x86_ops->enable_nmi_window(vcpu); | ||
4739 | else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) | ||
4740 | kvm_x86_ops->enable_irq_window(vcpu); | ||
4741 | |||
4742 | if (kvm_lapic_enabled(vcpu)) { | ||
4743 | update_cr8_intercept(vcpu); | ||
4744 | kvm_lapic_sync_to_vapic(vcpu); | ||
4745 | } | ||
4746 | |||
4747 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5158 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
4748 | 5159 | ||
4749 | kvm_guest_enter(); | 5160 | kvm_guest_enter(); |
@@ -4769,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
4769 | if (hw_breakpoint_active()) | 5180 | if (hw_breakpoint_active()) |
4770 | hw_breakpoint_restore(); | 5181 | hw_breakpoint_restore(); |
4771 | 5182 | ||
5183 | kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); | ||
5184 | |||
4772 | atomic_set(&vcpu->guest_mode, 0); | 5185 | atomic_set(&vcpu->guest_mode, 0); |
4773 | smp_wmb(); | 5186 | smp_wmb(); |
4774 | local_irq_enable(); | 5187 | local_irq_enable(); |
@@ -4898,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
4898 | if (!irqchip_in_kernel(vcpu->kvm)) | 5311 | if (!irqchip_in_kernel(vcpu->kvm)) |
4899 | kvm_set_cr8(vcpu, kvm_run->cr8); | 5312 | kvm_set_cr8(vcpu, kvm_run->cr8); |
4900 | 5313 | ||
4901 | if (vcpu->arch.pio.count || vcpu->mmio_needed || | 5314 | if (vcpu->arch.pio.count || vcpu->mmio_needed) { |
4902 | vcpu->arch.emulate_ctxt.restart) { | ||
4903 | if (vcpu->mmio_needed) { | 5315 | if (vcpu->mmio_needed) { |
4904 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | 5316 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); |
4905 | vcpu->mmio_read_completed = 1; | 5317 | vcpu->mmio_read_completed = 1; |
@@ -4980,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
4980 | 5392 | ||
4981 | vcpu->arch.exception.pending = false; | 5393 | vcpu->arch.exception.pending = false; |
4982 | 5394 | ||
5395 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5396 | |||
4983 | return 0; | 5397 | return 0; |
4984 | } | 5398 | } |
4985 | 5399 | ||
@@ -5043,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | |||
5043 | struct kvm_mp_state *mp_state) | 5457 | struct kvm_mp_state *mp_state) |
5044 | { | 5458 | { |
5045 | vcpu->arch.mp_state = mp_state->mp_state; | 5459 | vcpu->arch.mp_state = mp_state->mp_state; |
5460 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5046 | return 0; | 5461 | return 0; |
5047 | } | 5462 | } |
5048 | 5463 | ||
@@ -5050,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
5050 | bool has_error_code, u32 error_code) | 5465 | bool has_error_code, u32 error_code) |
5051 | { | 5466 | { |
5052 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 5467 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; |
5053 | int cs_db, cs_l, ret; | 5468 | int ret; |
5054 | cache_all_regs(vcpu); | ||
5055 | |||
5056 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
5057 | 5469 | ||
5058 | vcpu->arch.emulate_ctxt.vcpu = vcpu; | 5470 | init_emulate_ctxt(vcpu); |
5059 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
5060 | vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); | ||
5061 | vcpu->arch.emulate_ctxt.mode = | ||
5062 | (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : | ||
5063 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
5064 | ? X86EMUL_MODE_VM86 : cs_l | ||
5065 | ? X86EMUL_MODE_PROT64 : cs_db | ||
5066 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
5067 | memset(c, 0, sizeof(struct decode_cache)); | ||
5068 | memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); | ||
5069 | 5471 | ||
5070 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, | 5472 | ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, |
5071 | tss_selector, reason, has_error_code, | 5473 | tss_selector, reason, has_error_code, |
5072 | error_code); | 5474 | error_code); |
5073 | 5475 | ||
@@ -5077,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
5077 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); | 5479 | memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); |
5078 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); | 5480 | kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); |
5079 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 5481 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
5482 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5080 | return EMULATE_DONE; | 5483 | return EMULATE_DONE; |
5081 | } | 5484 | } |
5082 | EXPORT_SYMBOL_GPL(kvm_task_switch); | 5485 | EXPORT_SYMBOL_GPL(kvm_task_switch); |
@@ -5112,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5112 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; | 5515 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; |
5113 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | 5516 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); |
5114 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { | 5517 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { |
5115 | load_pdptrs(vcpu, vcpu->arch.cr3); | 5518 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); |
5116 | mmu_reset_needed = 1; | 5519 | mmu_reset_needed = 1; |
5117 | } | 5520 | } |
5118 | 5521 | ||
@@ -5147,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5147 | !is_protmode(vcpu)) | 5550 | !is_protmode(vcpu)) |
5148 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 5551 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
5149 | 5552 | ||
5553 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5554 | |||
5150 | return 0; | 5555 | return 0; |
5151 | } | 5556 | } |
5152 | 5557 | ||
@@ -5333,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) | |||
5333 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | 5738 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, |
5334 | unsigned int id) | 5739 | unsigned int id) |
5335 | { | 5740 | { |
5741 | if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) | ||
5742 | printk_once(KERN_WARNING | ||
5743 | "kvm: SMP vm created on host with unstable TSC; " | ||
5744 | "guest TSC will not be reliable\n"); | ||
5336 | return kvm_x86_ops->vcpu_create(kvm, id); | 5745 | return kvm_x86_ops->vcpu_create(kvm, id); |
5337 | } | 5746 | } |
5338 | 5747 | ||
@@ -5375,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
5375 | vcpu->arch.dr6 = DR6_FIXED_1; | 5784 | vcpu->arch.dr6 = DR6_FIXED_1; |
5376 | vcpu->arch.dr7 = DR7_FIXED_1; | 5785 | vcpu->arch.dr7 = DR7_FIXED_1; |
5377 | 5786 | ||
5787 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5788 | |||
5378 | return kvm_x86_ops->vcpu_reset(vcpu); | 5789 | return kvm_x86_ops->vcpu_reset(vcpu); |
5379 | } | 5790 | } |
5380 | 5791 | ||
5381 | int kvm_arch_hardware_enable(void *garbage) | 5792 | int kvm_arch_hardware_enable(void *garbage) |
5382 | { | 5793 | { |
5383 | /* | 5794 | struct kvm *kvm; |
5384 | * Since this may be called from a hotplug notifcation, | 5795 | struct kvm_vcpu *vcpu; |
5385 | * we can't get the CPU frequency directly. | 5796 | int i; |
5386 | */ | ||
5387 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { | ||
5388 | int cpu = raw_smp_processor_id(); | ||
5389 | per_cpu(cpu_tsc_khz, cpu) = 0; | ||
5390 | } | ||
5391 | 5797 | ||
5392 | kvm_shared_msr_cpu_online(); | 5798 | kvm_shared_msr_cpu_online(); |
5393 | 5799 | list_for_each_entry(kvm, &vm_list, vm_list) | |
5800 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
5801 | if (vcpu->cpu == smp_processor_id()) | ||
5802 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
5394 | return kvm_x86_ops->hardware_enable(garbage); | 5803 | return kvm_x86_ops->hardware_enable(garbage); |
5395 | } | 5804 | } |
5396 | 5805 | ||
@@ -5424,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5424 | BUG_ON(vcpu->kvm == NULL); | 5833 | BUG_ON(vcpu->kvm == NULL); |
5425 | kvm = vcpu->kvm; | 5834 | kvm = vcpu->kvm; |
5426 | 5835 | ||
5836 | vcpu->arch.emulate_ctxt.ops = &emulate_ops; | ||
5837 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | ||
5427 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 5838 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
5839 | vcpu->arch.mmu.translate_gpa = translate_gpa; | ||
5840 | vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; | ||
5428 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) | 5841 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) |
5429 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 5842 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
5430 | else | 5843 | else |
@@ -5437,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5437 | } | 5850 | } |
5438 | vcpu->arch.pio_data = page_address(page); | 5851 | vcpu->arch.pio_data = page_address(page); |
5439 | 5852 | ||
5853 | if (!kvm->arch.virtual_tsc_khz) | ||
5854 | kvm_arch_set_tsc_khz(kvm, max_tsc_khz); | ||
5855 | |||
5440 | r = kvm_mmu_create(vcpu); | 5856 | r = kvm_mmu_create(vcpu); |
5441 | if (r < 0) | 5857 | if (r < 0) |
5442 | goto fail_free_pio_data; | 5858 | goto fail_free_pio_data; |
@@ -5496,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
5496 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 5912 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
5497 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); | 5913 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); |
5498 | 5914 | ||
5499 | rdtscll(kvm->arch.vm_init_tsc); | 5915 | spin_lock_init(&kvm->arch.tsc_write_lock); |
5500 | 5916 | ||
5501 | return kvm; | 5917 | return kvm; |
5502 | } | 5918 | } |
@@ -5683,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | |||
5683 | kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) | 6099 | kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) |
5684 | rflags |= X86_EFLAGS_TF; | 6100 | rflags |= X86_EFLAGS_TF; |
5685 | kvm_x86_ops->set_rflags(vcpu, rflags); | 6101 | kvm_x86_ops->set_rflags(vcpu, rflags); |
6102 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
5686 | } | 6103 | } |
5687 | EXPORT_SYMBOL_GPL(kvm_set_rflags); | 6104 | EXPORT_SYMBOL_GPL(kvm_set_rflags); |
5688 | 6105 | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a404722d2b..2cea414489f3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) | |||
50 | #endif | 50 | #endif |
51 | } | 51 | } |
52 | 52 | ||
53 | static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) | ||
54 | { | ||
55 | return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; | ||
56 | } | ||
57 | |||
53 | static inline int is_pae(struct kvm_vcpu *vcpu) | 58 | static inline int is_pae(struct kvm_vcpu *vcpu) |
54 | { | 59 | { |
55 | return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); | 60 | return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); |
@@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
67 | 72 | ||
68 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 73 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
69 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 74 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
75 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); | ||
76 | |||
77 | void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); | ||
70 | 78 | ||
71 | #endif | 79 | #endif |
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 5415a9d06f53..b908a59eccf5 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c | |||
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset); | |||
22 | 22 | ||
23 | void *memmove(void *dest, const void *src, size_t n) | 23 | void *memmove(void *dest, const void *src, size_t n) |
24 | { | 24 | { |
25 | int d0, d1, d2; | 25 | int d0,d1,d2,d3,d4,d5; |
26 | 26 | char *ret = dest; | |
27 | if (dest < src) { | 27 | |
28 | memcpy(dest, src, n); | 28 | __asm__ __volatile__( |
29 | } else { | 29 | /* Handle more 16bytes in loop */ |
30 | __asm__ __volatile__( | 30 | "cmp $0x10, %0\n\t" |
31 | "std\n\t" | 31 | "jb 1f\n\t" |
32 | "rep\n\t" | 32 | |
33 | "movsb\n\t" | 33 | /* Decide forward/backward copy mode */ |
34 | "cld" | 34 | "cmp %2, %1\n\t" |
35 | : "=&c" (d0), "=&S" (d1), "=&D" (d2) | 35 | "jb 2f\n\t" |
36 | :"0" (n), | 36 | |
37 | "1" (n-1+src), | 37 | /* |
38 | "2" (n-1+dest) | 38 | * movs instruction have many startup latency |
39 | :"memory"); | 39 | * so we handle small size by general register. |
40 | } | 40 | */ |
41 | return dest; | 41 | "cmp $680, %0\n\t" |
42 | "jb 3f\n\t" | ||
43 | /* | ||
44 | * movs instruction is only good for aligned case. | ||
45 | */ | ||
46 | "mov %1, %3\n\t" | ||
47 | "xor %2, %3\n\t" | ||
48 | "and $0xff, %3\n\t" | ||
49 | "jz 4f\n\t" | ||
50 | "3:\n\t" | ||
51 | "sub $0x10, %0\n\t" | ||
52 | |||
53 | /* | ||
54 | * We gobble 16byts forward in each loop. | ||
55 | */ | ||
56 | "3:\n\t" | ||
57 | "sub $0x10, %0\n\t" | ||
58 | "mov 0*4(%1), %3\n\t" | ||
59 | "mov 1*4(%1), %4\n\t" | ||
60 | "mov %3, 0*4(%2)\n\t" | ||
61 | "mov %4, 1*4(%2)\n\t" | ||
62 | "mov 2*4(%1), %3\n\t" | ||
63 | "mov 3*4(%1), %4\n\t" | ||
64 | "mov %3, 2*4(%2)\n\t" | ||
65 | "mov %4, 3*4(%2)\n\t" | ||
66 | "lea 0x10(%1), %1\n\t" | ||
67 | "lea 0x10(%2), %2\n\t" | ||
68 | "jae 3b\n\t" | ||
69 | "add $0x10, %0\n\t" | ||
70 | "jmp 1f\n\t" | ||
71 | |||
72 | /* | ||
73 | * Handle data forward by movs. | ||
74 | */ | ||
75 | ".p2align 4\n\t" | ||
76 | "4:\n\t" | ||
77 | "mov -4(%1, %0), %3\n\t" | ||
78 | "lea -4(%2, %0), %4\n\t" | ||
79 | "shr $2, %0\n\t" | ||
80 | "rep movsl\n\t" | ||
81 | "mov %3, (%4)\n\t" | ||
82 | "jmp 11f\n\t" | ||
83 | /* | ||
84 | * Handle data backward by movs. | ||
85 | */ | ||
86 | ".p2align 4\n\t" | ||
87 | "6:\n\t" | ||
88 | "mov (%1), %3\n\t" | ||
89 | "mov %2, %4\n\t" | ||
90 | "lea -4(%1, %0), %1\n\t" | ||
91 | "lea -4(%2, %0), %2\n\t" | ||
92 | "shr $2, %0\n\t" | ||
93 | "std\n\t" | ||
94 | "rep movsl\n\t" | ||
95 | "mov %3,(%4)\n\t" | ||
96 | "cld\n\t" | ||
97 | "jmp 11f\n\t" | ||
98 | |||
99 | /* | ||
100 | * Start to prepare for backward copy. | ||
101 | */ | ||
102 | ".p2align 4\n\t" | ||
103 | "2:\n\t" | ||
104 | "cmp $680, %0\n\t" | ||
105 | "jb 5f\n\t" | ||
106 | "mov %1, %3\n\t" | ||
107 | "xor %2, %3\n\t" | ||
108 | "and $0xff, %3\n\t" | ||
109 | "jz 6b\n\t" | ||
110 | |||
111 | /* | ||
112 | * Calculate copy position to tail. | ||
113 | */ | ||
114 | "5:\n\t" | ||
115 | "add %0, %1\n\t" | ||
116 | "add %0, %2\n\t" | ||
117 | "sub $0x10, %0\n\t" | ||
118 | |||
119 | /* | ||
120 | * We gobble 16byts backward in each loop. | ||
121 | */ | ||
122 | "7:\n\t" | ||
123 | "sub $0x10, %0\n\t" | ||
124 | |||
125 | "mov -1*4(%1), %3\n\t" | ||
126 | "mov -2*4(%1), %4\n\t" | ||
127 | "mov %3, -1*4(%2)\n\t" | ||
128 | "mov %4, -2*4(%2)\n\t" | ||
129 | "mov -3*4(%1), %3\n\t" | ||
130 | "mov -4*4(%1), %4\n\t" | ||
131 | "mov %3, -3*4(%2)\n\t" | ||
132 | "mov %4, -4*4(%2)\n\t" | ||
133 | "lea -0x10(%1), %1\n\t" | ||
134 | "lea -0x10(%2), %2\n\t" | ||
135 | "jae 7b\n\t" | ||
136 | /* | ||
137 | * Calculate copy position to head. | ||
138 | */ | ||
139 | "add $0x10, %0\n\t" | ||
140 | "sub %0, %1\n\t" | ||
141 | "sub %0, %2\n\t" | ||
142 | |||
143 | /* | ||
144 | * Move data from 8 bytes to 15 bytes. | ||
145 | */ | ||
146 | ".p2align 4\n\t" | ||
147 | "1:\n\t" | ||
148 | "cmp $8, %0\n\t" | ||
149 | "jb 8f\n\t" | ||
150 | "mov 0*4(%1), %3\n\t" | ||
151 | "mov 1*4(%1), %4\n\t" | ||
152 | "mov -2*4(%1, %0), %5\n\t" | ||
153 | "mov -1*4(%1, %0), %1\n\t" | ||
154 | |||
155 | "mov %3, 0*4(%2)\n\t" | ||
156 | "mov %4, 1*4(%2)\n\t" | ||
157 | "mov %5, -2*4(%2, %0)\n\t" | ||
158 | "mov %1, -1*4(%2, %0)\n\t" | ||
159 | "jmp 11f\n\t" | ||
160 | |||
161 | /* | ||
162 | * Move data from 4 bytes to 7 bytes. | ||
163 | */ | ||
164 | ".p2align 4\n\t" | ||
165 | "8:\n\t" | ||
166 | "cmp $4, %0\n\t" | ||
167 | "jb 9f\n\t" | ||
168 | "mov 0*4(%1), %3\n\t" | ||
169 | "mov -1*4(%1, %0), %4\n\t" | ||
170 | "mov %3, 0*4(%2)\n\t" | ||
171 | "mov %4, -1*4(%2, %0)\n\t" | ||
172 | "jmp 11f\n\t" | ||
173 | |||
174 | /* | ||
175 | * Move data from 2 bytes to 3 bytes. | ||
176 | */ | ||
177 | ".p2align 4\n\t" | ||
178 | "9:\n\t" | ||
179 | "cmp $2, %0\n\t" | ||
180 | "jb 10f\n\t" | ||
181 | "movw 0*2(%1), %%dx\n\t" | ||
182 | "movw -1*2(%1, %0), %%bx\n\t" | ||
183 | "movw %%dx, 0*2(%2)\n\t" | ||
184 | "movw %%bx, -1*2(%2, %0)\n\t" | ||
185 | "jmp 11f\n\t" | ||
186 | |||
187 | /* | ||
188 | * Move data for 1 byte. | ||
189 | */ | ||
190 | ".p2align 4\n\t" | ||
191 | "10:\n\t" | ||
192 | "cmp $1, %0\n\t" | ||
193 | "jb 11f\n\t" | ||
194 | "movb (%1), %%cl\n\t" | ||
195 | "movb %%cl, (%2)\n\t" | ||
196 | ".p2align 4\n\t" | ||
197 | "11:" | ||
198 | : "=&c" (d0), "=&S" (d1), "=&D" (d2), | ||
199 | "=r" (d3),"=r" (d4), "=r"(d5) | ||
200 | :"0" (n), | ||
201 | "1" (src), | ||
202 | "2" (dest) | ||
203 | :"memory"); | ||
204 | |||
205 | return ret; | ||
206 | |||
42 | } | 207 | } |
43 | EXPORT_SYMBOL(memmove); | 208 | EXPORT_SYMBOL(memmove); |
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bcbcd1e0f7d5..75ef61e35e38 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S | |||
@@ -40,84 +40,132 @@ | |||
40 | ENTRY(__memcpy) | 40 | ENTRY(__memcpy) |
41 | ENTRY(memcpy) | 41 | ENTRY(memcpy) |
42 | CFI_STARTPROC | 42 | CFI_STARTPROC |
43 | movq %rdi, %rax | ||
43 | 44 | ||
44 | /* | 45 | /* |
45 | * Put the number of full 64-byte blocks into %ecx. | 46 | * Use 32bit CMP here to avoid long NOP padding. |
46 | * Tail portion is handled at the end: | ||
47 | */ | 47 | */ |
48 | movq %rdi, %rax | 48 | cmp $0x20, %edx |
49 | movl %edx, %ecx | 49 | jb .Lhandle_tail |
50 | shrl $6, %ecx | ||
51 | jz .Lhandle_tail | ||
52 | 50 | ||
53 | .p2align 4 | ||
54 | .Lloop_64: | ||
55 | /* | 51 | /* |
56 | * We decrement the loop index here - and the zero-flag is | 52 | * We check whether memory false dependece could occur, |
57 | * checked at the end of the loop (instructions inbetween do | 53 | * then jump to corresponding copy mode. |
58 | * not change the zero flag): | ||
59 | */ | 54 | */ |
60 | decl %ecx | 55 | cmp %dil, %sil |
56 | jl .Lcopy_backward | ||
57 | subl $0x20, %edx | ||
58 | .Lcopy_forward_loop: | ||
59 | subq $0x20, %rdx | ||
61 | 60 | ||
62 | /* | 61 | /* |
63 | * Move in blocks of 4x16 bytes: | 62 | * Move in blocks of 4x8 bytes: |
64 | */ | 63 | */ |
65 | movq 0*8(%rsi), %r11 | 64 | movq 0*8(%rsi), %r8 |
66 | movq 1*8(%rsi), %r8 | 65 | movq 1*8(%rsi), %r9 |
67 | movq %r11, 0*8(%rdi) | 66 | movq 2*8(%rsi), %r10 |
68 | movq %r8, 1*8(%rdi) | 67 | movq 3*8(%rsi), %r11 |
69 | 68 | leaq 4*8(%rsi), %rsi | |
70 | movq 2*8(%rsi), %r9 | 69 | |
71 | movq 3*8(%rsi), %r10 | 70 | movq %r8, 0*8(%rdi) |
72 | movq %r9, 2*8(%rdi) | 71 | movq %r9, 1*8(%rdi) |
73 | movq %r10, 3*8(%rdi) | 72 | movq %r10, 2*8(%rdi) |
74 | 73 | movq %r11, 3*8(%rdi) | |
75 | movq 4*8(%rsi), %r11 | 74 | leaq 4*8(%rdi), %rdi |
76 | movq 5*8(%rsi), %r8 | 75 | jae .Lcopy_forward_loop |
77 | movq %r11, 4*8(%rdi) | 76 | addq $0x20, %rdx |
78 | movq %r8, 5*8(%rdi) | 77 | jmp .Lhandle_tail |
79 | 78 | ||
80 | movq 6*8(%rsi), %r9 | 79 | .Lcopy_backward: |
81 | movq 7*8(%rsi), %r10 | 80 | /* |
82 | movq %r9, 6*8(%rdi) | 81 | * Calculate copy position to tail. |
83 | movq %r10, 7*8(%rdi) | 82 | */ |
84 | 83 | addq %rdx, %rsi | |
85 | leaq 64(%rsi), %rsi | 84 | addq %rdx, %rdi |
86 | leaq 64(%rdi), %rdi | 85 | subq $0x20, %rdx |
87 | 86 | /* | |
88 | jnz .Lloop_64 | 87 | * At most 3 ALU operations in one cycle, |
88 | * so append NOPS in the same 16bytes trunk. | ||
89 | */ | ||
90 | .p2align 4 | ||
91 | .Lcopy_backward_loop: | ||
92 | subq $0x20, %rdx | ||
93 | movq -1*8(%rsi), %r8 | ||
94 | movq -2*8(%rsi), %r9 | ||
95 | movq -3*8(%rsi), %r10 | ||
96 | movq -4*8(%rsi), %r11 | ||
97 | leaq -4*8(%rsi), %rsi | ||
98 | movq %r8, -1*8(%rdi) | ||
99 | movq %r9, -2*8(%rdi) | ||
100 | movq %r10, -3*8(%rdi) | ||
101 | movq %r11, -4*8(%rdi) | ||
102 | leaq -4*8(%rdi), %rdi | ||
103 | jae .Lcopy_backward_loop | ||
89 | 104 | ||
105 | /* | ||
106 | * Calculate copy position to head. | ||
107 | */ | ||
108 | addq $0x20, %rdx | ||
109 | subq %rdx, %rsi | ||
110 | subq %rdx, %rdi | ||
90 | .Lhandle_tail: | 111 | .Lhandle_tail: |
91 | movl %edx, %ecx | 112 | cmpq $16, %rdx |
92 | andl $63, %ecx | 113 | jb .Lless_16bytes |
93 | shrl $3, %ecx | ||
94 | jz .Lhandle_7 | ||
95 | 114 | ||
115 | /* | ||
116 | * Move data from 16 bytes to 31 bytes. | ||
117 | */ | ||
118 | movq 0*8(%rsi), %r8 | ||
119 | movq 1*8(%rsi), %r9 | ||
120 | movq -2*8(%rsi, %rdx), %r10 | ||
121 | movq -1*8(%rsi, %rdx), %r11 | ||
122 | movq %r8, 0*8(%rdi) | ||
123 | movq %r9, 1*8(%rdi) | ||
124 | movq %r10, -2*8(%rdi, %rdx) | ||
125 | movq %r11, -1*8(%rdi, %rdx) | ||
126 | retq | ||
96 | .p2align 4 | 127 | .p2align 4 |
97 | .Lloop_8: | 128 | .Lless_16bytes: |
98 | decl %ecx | 129 | cmpq $8, %rdx |
99 | movq (%rsi), %r8 | 130 | jb .Lless_8bytes |
100 | movq %r8, (%rdi) | 131 | /* |
101 | leaq 8(%rdi), %rdi | 132 | * Move data from 8 bytes to 15 bytes. |
102 | leaq 8(%rsi), %rsi | 133 | */ |
103 | jnz .Lloop_8 | 134 | movq 0*8(%rsi), %r8 |
104 | 135 | movq -1*8(%rsi, %rdx), %r9 | |
105 | .Lhandle_7: | 136 | movq %r8, 0*8(%rdi) |
106 | movl %edx, %ecx | 137 | movq %r9, -1*8(%rdi, %rdx) |
107 | andl $7, %ecx | 138 | retq |
108 | jz .Lend | 139 | .p2align 4 |
140 | .Lless_8bytes: | ||
141 | cmpq $4, %rdx | ||
142 | jb .Lless_3bytes | ||
109 | 143 | ||
144 | /* | ||
145 | * Move data from 4 bytes to 7 bytes. | ||
146 | */ | ||
147 | movl (%rsi), %ecx | ||
148 | movl -4(%rsi, %rdx), %r8d | ||
149 | movl %ecx, (%rdi) | ||
150 | movl %r8d, -4(%rdi, %rdx) | ||
151 | retq | ||
110 | .p2align 4 | 152 | .p2align 4 |
153 | .Lless_3bytes: | ||
154 | cmpl $0, %edx | ||
155 | je .Lend | ||
156 | /* | ||
157 | * Move data from 1 bytes to 3 bytes. | ||
158 | */ | ||
111 | .Lloop_1: | 159 | .Lloop_1: |
112 | movb (%rsi), %r8b | 160 | movb (%rsi), %r8b |
113 | movb %r8b, (%rdi) | 161 | movb %r8b, (%rdi) |
114 | incq %rdi | 162 | incq %rdi |
115 | incq %rsi | 163 | incq %rsi |
116 | decl %ecx | 164 | decl %edx |
117 | jnz .Lloop_1 | 165 | jnz .Lloop_1 |
118 | 166 | ||
119 | .Lend: | 167 | .Lend: |
120 | ret | 168 | retq |
121 | CFI_ENDPROC | 169 | CFI_ENDPROC |
122 | ENDPROC(memcpy) | 170 | ENDPROC(memcpy) |
123 | ENDPROC(__memcpy) | 171 | ENDPROC(__memcpy) |
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index 0a33909bf122..6d0f0ec41b34 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c | |||
@@ -8,14 +8,185 @@ | |||
8 | #undef memmove | 8 | #undef memmove |
9 | void *memmove(void *dest, const void *src, size_t count) | 9 | void *memmove(void *dest, const void *src, size_t count) |
10 | { | 10 | { |
11 | if (dest < src) { | 11 | unsigned long d0,d1,d2,d3,d4,d5,d6,d7; |
12 | return memcpy(dest, src, count); | 12 | char *ret; |
13 | } else { | 13 | |
14 | char *p = dest + count; | 14 | __asm__ __volatile__( |
15 | const char *s = src + count; | 15 | /* Handle more 32bytes in loop */ |
16 | while (count--) | 16 | "mov %2, %3\n\t" |
17 | *--p = *--s; | 17 | "cmp $0x20, %0\n\t" |
18 | } | 18 | "jb 1f\n\t" |
19 | return dest; | 19 | |
20 | /* Decide forward/backward copy mode */ | ||
21 | "cmp %2, %1\n\t" | ||
22 | "jb 2f\n\t" | ||
23 | |||
24 | /* | ||
25 | * movsq instruction have many startup latency | ||
26 | * so we handle small size by general register. | ||
27 | */ | ||
28 | "cmp $680, %0\n\t" | ||
29 | "jb 3f\n\t" | ||
30 | /* | ||
31 | * movsq instruction is only good for aligned case. | ||
32 | */ | ||
33 | "cmpb %%dil, %%sil\n\t" | ||
34 | "je 4f\n\t" | ||
35 | "3:\n\t" | ||
36 | "sub $0x20, %0\n\t" | ||
37 | /* | ||
38 | * We gobble 32byts forward in each loop. | ||
39 | */ | ||
40 | "5:\n\t" | ||
41 | "sub $0x20, %0\n\t" | ||
42 | "movq 0*8(%1), %4\n\t" | ||
43 | "movq 1*8(%1), %5\n\t" | ||
44 | "movq 2*8(%1), %6\n\t" | ||
45 | "movq 3*8(%1), %7\n\t" | ||
46 | "leaq 4*8(%1), %1\n\t" | ||
47 | |||
48 | "movq %4, 0*8(%2)\n\t" | ||
49 | "movq %5, 1*8(%2)\n\t" | ||
50 | "movq %6, 2*8(%2)\n\t" | ||
51 | "movq %7, 3*8(%2)\n\t" | ||
52 | "leaq 4*8(%2), %2\n\t" | ||
53 | "jae 5b\n\t" | ||
54 | "addq $0x20, %0\n\t" | ||
55 | "jmp 1f\n\t" | ||
56 | /* | ||
57 | * Handle data forward by movsq. | ||
58 | */ | ||
59 | ".p2align 4\n\t" | ||
60 | "4:\n\t" | ||
61 | "movq %0, %8\n\t" | ||
62 | "movq -8(%1, %0), %4\n\t" | ||
63 | "lea -8(%2, %0), %5\n\t" | ||
64 | "shrq $3, %8\n\t" | ||
65 | "rep movsq\n\t" | ||
66 | "movq %4, (%5)\n\t" | ||
67 | "jmp 13f\n\t" | ||
68 | /* | ||
69 | * Handle data backward by movsq. | ||
70 | */ | ||
71 | ".p2align 4\n\t" | ||
72 | "7:\n\t" | ||
73 | "movq %0, %8\n\t" | ||
74 | "movq (%1), %4\n\t" | ||
75 | "movq %2, %5\n\t" | ||
76 | "leaq -8(%1, %0), %1\n\t" | ||
77 | "leaq -8(%2, %0), %2\n\t" | ||
78 | "shrq $3, %8\n\t" | ||
79 | "std\n\t" | ||
80 | "rep movsq\n\t" | ||
81 | "cld\n\t" | ||
82 | "movq %4, (%5)\n\t" | ||
83 | "jmp 13f\n\t" | ||
84 | |||
85 | /* | ||
86 | * Start to prepare for backward copy. | ||
87 | */ | ||
88 | ".p2align 4\n\t" | ||
89 | "2:\n\t" | ||
90 | "cmp $680, %0\n\t" | ||
91 | "jb 6f \n\t" | ||
92 | "cmp %%dil, %%sil\n\t" | ||
93 | "je 7b \n\t" | ||
94 | "6:\n\t" | ||
95 | /* | ||
96 | * Calculate copy position to tail. | ||
97 | */ | ||
98 | "addq %0, %1\n\t" | ||
99 | "addq %0, %2\n\t" | ||
100 | "subq $0x20, %0\n\t" | ||
101 | /* | ||
102 | * We gobble 32byts backward in each loop. | ||
103 | */ | ||
104 | "8:\n\t" | ||
105 | "subq $0x20, %0\n\t" | ||
106 | "movq -1*8(%1), %4\n\t" | ||
107 | "movq -2*8(%1), %5\n\t" | ||
108 | "movq -3*8(%1), %6\n\t" | ||
109 | "movq -4*8(%1), %7\n\t" | ||
110 | "leaq -4*8(%1), %1\n\t" | ||
111 | |||
112 | "movq %4, -1*8(%2)\n\t" | ||
113 | "movq %5, -2*8(%2)\n\t" | ||
114 | "movq %6, -3*8(%2)\n\t" | ||
115 | "movq %7, -4*8(%2)\n\t" | ||
116 | "leaq -4*8(%2), %2\n\t" | ||
117 | "jae 8b\n\t" | ||
118 | /* | ||
119 | * Calculate copy position to head. | ||
120 | */ | ||
121 | "addq $0x20, %0\n\t" | ||
122 | "subq %0, %1\n\t" | ||
123 | "subq %0, %2\n\t" | ||
124 | "1:\n\t" | ||
125 | "cmpq $16, %0\n\t" | ||
126 | "jb 9f\n\t" | ||
127 | /* | ||
128 | * Move data from 16 bytes to 31 bytes. | ||
129 | */ | ||
130 | "movq 0*8(%1), %4\n\t" | ||
131 | "movq 1*8(%1), %5\n\t" | ||
132 | "movq -2*8(%1, %0), %6\n\t" | ||
133 | "movq -1*8(%1, %0), %7\n\t" | ||
134 | "movq %4, 0*8(%2)\n\t" | ||
135 | "movq %5, 1*8(%2)\n\t" | ||
136 | "movq %6, -2*8(%2, %0)\n\t" | ||
137 | "movq %7, -1*8(%2, %0)\n\t" | ||
138 | "jmp 13f\n\t" | ||
139 | ".p2align 4\n\t" | ||
140 | "9:\n\t" | ||
141 | "cmpq $8, %0\n\t" | ||
142 | "jb 10f\n\t" | ||
143 | /* | ||
144 | * Move data from 8 bytes to 15 bytes. | ||
145 | */ | ||
146 | "movq 0*8(%1), %4\n\t" | ||
147 | "movq -1*8(%1, %0), %5\n\t" | ||
148 | "movq %4, 0*8(%2)\n\t" | ||
149 | "movq %5, -1*8(%2, %0)\n\t" | ||
150 | "jmp 13f\n\t" | ||
151 | "10:\n\t" | ||
152 | "cmpq $4, %0\n\t" | ||
153 | "jb 11f\n\t" | ||
154 | /* | ||
155 | * Move data from 4 bytes to 7 bytes. | ||
156 | */ | ||
157 | "movl (%1), %4d\n\t" | ||
158 | "movl -4(%1, %0), %5d\n\t" | ||
159 | "movl %4d, (%2)\n\t" | ||
160 | "movl %5d, -4(%2, %0)\n\t" | ||
161 | "jmp 13f\n\t" | ||
162 | "11:\n\t" | ||
163 | "cmp $2, %0\n\t" | ||
164 | "jb 12f\n\t" | ||
165 | /* | ||
166 | * Move data from 2 bytes to 3 bytes. | ||
167 | */ | ||
168 | "movw (%1), %4w\n\t" | ||
169 | "movw -2(%1, %0), %5w\n\t" | ||
170 | "movw %4w, (%2)\n\t" | ||
171 | "movw %5w, -2(%2, %0)\n\t" | ||
172 | "jmp 13f\n\t" | ||
173 | "12:\n\t" | ||
174 | "cmp $1, %0\n\t" | ||
175 | "jb 13f\n\t" | ||
176 | /* | ||
177 | * Move data for 1 byte. | ||
178 | */ | ||
179 | "movb (%1), %4b\n\t" | ||
180 | "movb %4b, (%2)\n\t" | ||
181 | "13:\n\t" | ||
182 | : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) , | ||
183 | "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7) | ||
184 | :"0" (count), | ||
185 | "1" (src), | ||
186 | "2" (dest) | ||
187 | :"memory"); | ||
188 | |||
189 | return ret; | ||
190 | |||
20 | } | 191 | } |
21 | EXPORT_SYMBOL(memmove); | 192 | EXPORT_SYMBOL(memmove); |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index a4c768397baa..55543397a8a7 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -26,4 +26,6 @@ obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o | |||
26 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o | 26 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o |
27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o |
28 | 28 | ||
29 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | ||
30 | |||
29 | obj-$(CONFIG_MEMTEST) += memtest.o | 31 | obj-$(CONFIG_MEMTEST) += memtest.o |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a204..7d90ceb882a4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/kprobes.h> /* __kprobes, ... */ | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
13 | #include <linux/perf_event.h> /* perf_sw_event */ | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
14 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ | ||
14 | 15 | ||
15 | #include <asm/traps.h> /* dotraplinkage, ... */ | 16 | #include <asm/traps.h> /* dotraplinkage, ... */ |
16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 17 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | |||
160 | 161 | ||
161 | static void | 162 | static void |
162 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, | 163 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, |
163 | struct task_struct *tsk) | 164 | struct task_struct *tsk, int fault) |
164 | { | 165 | { |
166 | unsigned lsb = 0; | ||
165 | siginfo_t info; | 167 | siginfo_t info; |
166 | 168 | ||
167 | info.si_signo = si_signo; | 169 | info.si_signo = si_signo; |
168 | info.si_errno = 0; | 170 | info.si_errno = 0; |
169 | info.si_code = si_code; | 171 | info.si_code = si_code; |
170 | info.si_addr = (void __user *)address; | 172 | info.si_addr = (void __user *)address; |
171 | info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; | 173 | if (fault & VM_FAULT_HWPOISON_LARGE) |
174 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); | ||
175 | if (fault & VM_FAULT_HWPOISON) | ||
176 | lsb = PAGE_SHIFT; | ||
177 | info.si_addr_lsb = lsb; | ||
172 | 178 | ||
173 | force_sig_info(si_signo, &info, tsk); | 179 | force_sig_info(si_signo, &info, tsk); |
174 | } | 180 | } |
@@ -229,7 +235,16 @@ void vmalloc_sync_all(void) | |||
229 | 235 | ||
230 | spin_lock_irqsave(&pgd_lock, flags); | 236 | spin_lock_irqsave(&pgd_lock, flags); |
231 | list_for_each_entry(page, &pgd_list, lru) { | 237 | list_for_each_entry(page, &pgd_list, lru) { |
232 | if (!vmalloc_sync_one(page_address(page), address)) | 238 | spinlock_t *pgt_lock; |
239 | pmd_t *ret; | ||
240 | |||
241 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | ||
242 | |||
243 | spin_lock(pgt_lock); | ||
244 | ret = vmalloc_sync_one(page_address(page), address); | ||
245 | spin_unlock(pgt_lock); | ||
246 | |||
247 | if (!ret) | ||
233 | break; | 248 | break; |
234 | } | 249 | } |
235 | spin_unlock_irqrestore(&pgd_lock, flags); | 250 | spin_unlock_irqrestore(&pgd_lock, flags); |
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) | |||
251 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 266 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
252 | return -1; | 267 | return -1; |
253 | 268 | ||
269 | WARN_ON_ONCE(in_nmi()); | ||
270 | |||
254 | /* | 271 | /* |
255 | * Synchronize this task's top level page-table | 272 | * Synchronize this task's top level page-table |
256 | * with the 'reference' page table. | 273 | * with the 'reference' page table. |
@@ -326,29 +343,7 @@ out: | |||
326 | 343 | ||
327 | void vmalloc_sync_all(void) | 344 | void vmalloc_sync_all(void) |
328 | { | 345 | { |
329 | unsigned long address; | 346 | sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); |
330 | |||
331 | for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; | ||
332 | address += PGDIR_SIZE) { | ||
333 | |||
334 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
335 | unsigned long flags; | ||
336 | struct page *page; | ||
337 | |||
338 | if (pgd_none(*pgd_ref)) | ||
339 | continue; | ||
340 | |||
341 | spin_lock_irqsave(&pgd_lock, flags); | ||
342 | list_for_each_entry(page, &pgd_list, lru) { | ||
343 | pgd_t *pgd; | ||
344 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
345 | if (pgd_none(*pgd)) | ||
346 | set_pgd(pgd, *pgd_ref); | ||
347 | else | ||
348 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
349 | } | ||
350 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
351 | } | ||
352 | } | 347 | } |
353 | 348 | ||
354 | /* | 349 | /* |
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) | |||
369 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 364 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
370 | return -1; | 365 | return -1; |
371 | 366 | ||
367 | WARN_ON_ONCE(in_nmi()); | ||
368 | |||
372 | /* | 369 | /* |
373 | * Copy kernel mappings over when needed. This can also | 370 | * Copy kernel mappings over when needed. This can also |
374 | * happen within a race in page table update. In the later | 371 | * happen within a race in page table update. In the later |
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | |||
731 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | 728 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); |
732 | tsk->thread.trap_no = 14; | 729 | tsk->thread.trap_no = 14; |
733 | 730 | ||
734 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | 731 | force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); |
735 | 732 | ||
736 | return; | 733 | return; |
737 | } | 734 | } |
@@ -816,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | |||
816 | tsk->thread.trap_no = 14; | 813 | tsk->thread.trap_no = 14; |
817 | 814 | ||
818 | #ifdef CONFIG_MEMORY_FAILURE | 815 | #ifdef CONFIG_MEMORY_FAILURE |
819 | if (fault & VM_FAULT_HWPOISON) { | 816 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
820 | printk(KERN_ERR | 817 | printk(KERN_ERR |
821 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", | 818 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
822 | tsk->comm, tsk->pid, address); | 819 | tsk->comm, tsk->pid, address); |
823 | code = BUS_MCEERR_AR; | 820 | code = BUS_MCEERR_AR; |
824 | } | 821 | } |
825 | #endif | 822 | #endif |
826 | force_sig_info_fault(SIGBUS, code, address, tsk); | 823 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); |
827 | } | 824 | } |
828 | 825 | ||
829 | static noinline void | 826 | static noinline void |
@@ -833,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
833 | if (fault & VM_FAULT_OOM) { | 830 | if (fault & VM_FAULT_OOM) { |
834 | out_of_memory(regs, error_code, address); | 831 | out_of_memory(regs, error_code, address); |
835 | } else { | 832 | } else { |
836 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) | 833 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
834 | VM_FAULT_HWPOISON_LARGE)) | ||
837 | do_sigbus(regs, error_code, address, fault); | 835 | do_sigbus(regs, error_code, address, fault); |
838 | else | 836 | else |
839 | BUG(); | 837 | BUG(); |
@@ -894,8 +892,14 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
894 | if (pmd_large(*pmd)) | 892 | if (pmd_large(*pmd)) |
895 | return spurious_fault_check(error_code, (pte_t *) pmd); | 893 | return spurious_fault_check(error_code, (pte_t *) pmd); |
896 | 894 | ||
895 | /* | ||
896 | * Note: don't use pte_present() here, since it returns true | ||
897 | * if the _PAGE_PROTNONE bit is set. However, this aliases the | ||
898 | * _PAGE_GLOBAL bit, which for kernel pages give false positives | ||
899 | * when CONFIG_DEBUG_PAGEALLOC is used. | ||
900 | */ | ||
897 | pte = pte_offset_kernel(pmd, address); | 901 | pte = pte_offset_kernel(pmd, address); |
898 | if (!pte_present(*pte)) | 902 | if (!(pte_flags(*pte) & _PAGE_PRESENT)) |
899 | return 0; | 903 | return 0; |
900 | 904 | ||
901 | ret = spurious_fault_check(error_code, pte); | 905 | ret = spurious_fault_check(error_code, pte); |
@@ -915,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
915 | int show_unhandled_signals = 1; | 919 | int show_unhandled_signals = 1; |
916 | 920 | ||
917 | static inline int | 921 | static inline int |
918 | access_error(unsigned long error_code, int write, struct vm_area_struct *vma) | 922 | access_error(unsigned long error_code, struct vm_area_struct *vma) |
919 | { | 923 | { |
920 | if (write) { | 924 | if (error_code & PF_WRITE) { |
921 | /* write, present and write, not present: */ | 925 | /* write, present and write, not present: */ |
922 | if (unlikely(!(vma->vm_flags & VM_WRITE))) | 926 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
923 | return 1; | 927 | return 1; |
@@ -952,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
952 | struct task_struct *tsk; | 956 | struct task_struct *tsk; |
953 | unsigned long address; | 957 | unsigned long address; |
954 | struct mm_struct *mm; | 958 | struct mm_struct *mm; |
955 | int write; | ||
956 | int fault; | 959 | int fault; |
960 | int write = error_code & PF_WRITE; | ||
961 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | | ||
962 | (write ? FAULT_FLAG_WRITE : 0); | ||
957 | 963 | ||
958 | tsk = current; | 964 | tsk = current; |
959 | mm = tsk->mm; | 965 | mm = tsk->mm; |
@@ -1064,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1064 | bad_area_nosemaphore(regs, error_code, address); | 1070 | bad_area_nosemaphore(regs, error_code, address); |
1065 | return; | 1071 | return; |
1066 | } | 1072 | } |
1073 | retry: | ||
1067 | down_read(&mm->mmap_sem); | 1074 | down_read(&mm->mmap_sem); |
1068 | } else { | 1075 | } else { |
1069 | /* | 1076 | /* |
@@ -1107,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1107 | * we can handle it.. | 1114 | * we can handle it.. |
1108 | */ | 1115 | */ |
1109 | good_area: | 1116 | good_area: |
1110 | write = error_code & PF_WRITE; | 1117 | if (unlikely(access_error(error_code, vma))) { |
1111 | |||
1112 | if (unlikely(access_error(error_code, write, vma))) { | ||
1113 | bad_area_access_error(regs, error_code, address); | 1118 | bad_area_access_error(regs, error_code, address); |
1114 | return; | 1119 | return; |
1115 | } | 1120 | } |
@@ -1119,21 +1124,34 @@ good_area: | |||
1119 | * make sure we exit gracefully rather than endlessly redo | 1124 | * make sure we exit gracefully rather than endlessly redo |
1120 | * the fault: | 1125 | * the fault: |
1121 | */ | 1126 | */ |
1122 | fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); | 1127 | fault = handle_mm_fault(mm, vma, address, flags); |
1123 | 1128 | ||
1124 | if (unlikely(fault & VM_FAULT_ERROR)) { | 1129 | if (unlikely(fault & VM_FAULT_ERROR)) { |
1125 | mm_fault_error(regs, error_code, address, fault); | 1130 | mm_fault_error(regs, error_code, address, fault); |
1126 | return; | 1131 | return; |
1127 | } | 1132 | } |
1128 | 1133 | ||
1129 | if (fault & VM_FAULT_MAJOR) { | 1134 | /* |
1130 | tsk->maj_flt++; | 1135 | * Major/minor page fault accounting is only done on the |
1131 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | 1136 | * initial attempt. If we go through a retry, it is extremely |
1132 | regs, address); | 1137 | * likely that the page will be found in page cache at that point. |
1133 | } else { | 1138 | */ |
1134 | tsk->min_flt++; | 1139 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
1135 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | 1140 | if (fault & VM_FAULT_MAJOR) { |
1136 | regs, address); | 1141 | tsk->maj_flt++; |
1142 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | ||
1143 | regs, address); | ||
1144 | } else { | ||
1145 | tsk->min_flt++; | ||
1146 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | ||
1147 | regs, address); | ||
1148 | } | ||
1149 | if (fault & VM_FAULT_RETRY) { | ||
1150 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | ||
1151 | * of starvation. */ | ||
1152 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | ||
1153 | goto retry; | ||
1154 | } | ||
1137 | } | 1155 | } |
1138 | 1156 | ||
1139 | check_v8086_mode(regs, address, tsk); | 1157 | check_v8086_mode(regs, address, tsk); |
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5e8fa12ef861..b49962662101 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -9,6 +9,7 @@ void *kmap(struct page *page) | |||
9 | return page_address(page); | 9 | return page_address(page); |
10 | return kmap_high(page); | 10 | return kmap_high(page); |
11 | } | 11 | } |
12 | EXPORT_SYMBOL(kmap); | ||
12 | 13 | ||
13 | void kunmap(struct page *page) | 14 | void kunmap(struct page *page) |
14 | { | 15 | { |
@@ -18,6 +19,7 @@ void kunmap(struct page *page) | |||
18 | return; | 19 | return; |
19 | kunmap_high(page); | 20 | kunmap_high(page); |
20 | } | 21 | } |
22 | EXPORT_SYMBOL(kunmap); | ||
21 | 23 | ||
22 | /* | 24 | /* |
23 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because | 25 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because |
@@ -27,10 +29,10 @@ void kunmap(struct page *page) | |||
27 | * However when holding an atomic kmap it is not legal to sleep, so atomic | 29 | * However when holding an atomic kmap it is not legal to sleep, so atomic |
28 | * kmaps are appropriate for short, tight code paths only. | 30 | * kmaps are appropriate for short, tight code paths only. |
29 | */ | 31 | */ |
30 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | 32 | void *kmap_atomic_prot(struct page *page, pgprot_t prot) |
31 | { | 33 | { |
32 | enum fixed_addresses idx; | ||
33 | unsigned long vaddr; | 34 | unsigned long vaddr; |
35 | int idx, type; | ||
34 | 36 | ||
35 | /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ | 37 | /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ |
36 | pagefault_disable(); | 38 | pagefault_disable(); |
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | |||
38 | if (!PageHighMem(page)) | 40 | if (!PageHighMem(page)) |
39 | return page_address(page); | 41 | return page_address(page); |
40 | 42 | ||
41 | debug_kmap_atomic(type); | 43 | type = kmap_atomic_idx_push(); |
42 | |||
43 | idx = type + KM_TYPE_NR*smp_processor_id(); | 44 | idx = type + KM_TYPE_NR*smp_processor_id(); |
44 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | 45 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); |
45 | BUG_ON(!pte_none(*(kmap_pte-idx))); | 46 | BUG_ON(!pte_none(*(kmap_pte-idx))); |
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | |||
47 | 48 | ||
48 | return (void *)vaddr; | 49 | return (void *)vaddr; |
49 | } | 50 | } |
51 | EXPORT_SYMBOL(kmap_atomic_prot); | ||
52 | |||
53 | void *__kmap_atomic(struct page *page) | ||
54 | { | ||
55 | return kmap_atomic_prot(page, kmap_prot); | ||
56 | } | ||
57 | EXPORT_SYMBOL(__kmap_atomic); | ||
50 | 58 | ||
51 | void *kmap_atomic(struct page *page, enum km_type type) | 59 | /* |
60 | * This is the same as kmap_atomic() but can map memory that doesn't | ||
61 | * have a struct page associated with it. | ||
62 | */ | ||
63 | void *kmap_atomic_pfn(unsigned long pfn) | ||
52 | { | 64 | { |
53 | return kmap_atomic_prot(page, type, kmap_prot); | 65 | return kmap_atomic_prot_pfn(pfn, kmap_prot); |
54 | } | 66 | } |
67 | EXPORT_SYMBOL_GPL(kmap_atomic_pfn); | ||
55 | 68 | ||
56 | void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) | 69 | void __kunmap_atomic(void *kvaddr) |
57 | { | 70 | { |
58 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | 71 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; |
59 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | 72 | |
60 | 73 | if (vaddr >= __fix_to_virt(FIX_KMAP_END) && | |
61 | /* | 74 | vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { |
62 | * Force other mappings to Oops if they'll try to access this pte | 75 | int idx, type; |
63 | * without first remap it. Keeping stale mappings around is a bad idea | 76 | |
64 | * also, in case the page changes cacheability attributes or becomes | 77 | type = kmap_atomic_idx(); |
65 | * a protected page in a hypervisor. | 78 | idx = type + KM_TYPE_NR * smp_processor_id(); |
66 | */ | 79 | |
67 | if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) | 80 | #ifdef CONFIG_DEBUG_HIGHMEM |
81 | WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); | ||
82 | #endif | ||
83 | /* | ||
84 | * Force other mappings to Oops if they'll try to access this | ||
85 | * pte without first remap it. Keeping stale mappings around | ||
86 | * is a bad idea also, in case the page changes cacheability | ||
87 | * attributes or becomes a protected page in a hypervisor. | ||
88 | */ | ||
68 | kpte_clear_flush(kmap_pte-idx, vaddr); | 89 | kpte_clear_flush(kmap_pte-idx, vaddr); |
69 | else { | 90 | kmap_atomic_idx_pop(); |
91 | } | ||
70 | #ifdef CONFIG_DEBUG_HIGHMEM | 92 | #ifdef CONFIG_DEBUG_HIGHMEM |
93 | else { | ||
71 | BUG_ON(vaddr < PAGE_OFFSET); | 94 | BUG_ON(vaddr < PAGE_OFFSET); |
72 | BUG_ON(vaddr >= (unsigned long)high_memory); | 95 | BUG_ON(vaddr >= (unsigned long)high_memory); |
73 | #endif | ||
74 | } | 96 | } |
97 | #endif | ||
75 | 98 | ||
76 | pagefault_enable(); | 99 | pagefault_enable(); |
77 | } | 100 | } |
78 | 101 | EXPORT_SYMBOL(__kunmap_atomic); | |
79 | /* | ||
80 | * This is the same as kmap_atomic() but can map memory that doesn't | ||
81 | * have a struct page associated with it. | ||
82 | */ | ||
83 | void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) | ||
84 | { | ||
85 | return kmap_atomic_prot_pfn(pfn, type, kmap_prot); | ||
86 | } | ||
87 | EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ | ||
88 | 102 | ||
89 | struct page *kmap_atomic_to_page(void *ptr) | 103 | struct page *kmap_atomic_to_page(void *ptr) |
90 | { | 104 | { |
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr) | |||
98 | pte = kmap_pte - (idx - FIX_KMAP_BEGIN); | 112 | pte = kmap_pte - (idx - FIX_KMAP_BEGIN); |
99 | return pte_page(*pte); | 113 | return pte_page(*pte); |
100 | } | 114 | } |
101 | |||
102 | EXPORT_SYMBOL(kmap); | ||
103 | EXPORT_SYMBOL(kunmap); | ||
104 | EXPORT_SYMBOL(kmap_atomic); | ||
105 | EXPORT_SYMBOL(kunmap_atomic_notypecheck); | ||
106 | EXPORT_SYMBOL(kmap_atomic_prot); | ||
107 | EXPORT_SYMBOL(kmap_atomic_to_page); | 115 | EXPORT_SYMBOL(kmap_atomic_to_page); |
108 | 116 | ||
109 | void __init set_highmem_pages_init(void) | 117 | void __init set_highmem_pages_init(void) |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index b278535b14aa..c0e28a13de7d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/initrd.h> | 2 | #include <linux/initrd.h> |
3 | #include <linux/ioport.h> | 3 | #include <linux/ioport.h> |
4 | #include <linux/swap.h> | 4 | #include <linux/swap.h> |
5 | #include <linux/memblock.h> | ||
5 | 6 | ||
6 | #include <asm/cacheflush.h> | 7 | #include <asm/cacheflush.h> |
7 | #include <asm/e820.h> | 8 | #include <asm/e820.h> |
@@ -33,6 +34,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, | |||
33 | int use_gbpages) | 34 | int use_gbpages) |
34 | { | 35 | { |
35 | unsigned long puds, pmds, ptes, tables, start; | 36 | unsigned long puds, pmds, ptes, tables, start; |
37 | phys_addr_t base; | ||
36 | 38 | ||
37 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 39 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
38 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | 40 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); |
@@ -75,12 +77,12 @@ static void __init find_early_table_space(unsigned long end, int use_pse, | |||
75 | #else | 77 | #else |
76 | start = 0x8000; | 78 | start = 0x8000; |
77 | #endif | 79 | #endif |
78 | e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | 80 | base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, |
79 | tables, PAGE_SIZE); | 81 | tables, PAGE_SIZE); |
80 | if (e820_table_start == -1UL) | 82 | if (base == MEMBLOCK_ERROR) |
81 | panic("Cannot find space for the kernel page tables"); | 83 | panic("Cannot find space for the kernel page tables"); |
82 | 84 | ||
83 | e820_table_start >>= PAGE_SHIFT; | 85 | e820_table_start = base >> PAGE_SHIFT; |
84 | e820_table_end = e820_table_start; | 86 | e820_table_end = e820_table_start; |
85 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); | 87 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); |
86 | 88 | ||
@@ -299,7 +301,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
299 | __flush_tlb_all(); | 301 | __flush_tlb_all(); |
300 | 302 | ||
301 | if (!after_bootmem && e820_table_end > e820_table_start) | 303 | if (!after_bootmem && e820_table_end > e820_table_start) |
302 | reserve_early(e820_table_start << PAGE_SHIFT, | 304 | memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, |
303 | e820_table_end << PAGE_SHIFT, "PGTABLE"); | 305 | e820_table_end << PAGE_SHIFT, "PGTABLE"); |
304 | 306 | ||
305 | if (!after_bootmem) | 307 | if (!after_bootmem) |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 558f2d332076..0e969f9f401b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/pfn.h> | 25 | #include <linux/pfn.h> |
26 | #include <linux/poison.h> | 26 | #include <linux/poison.h> |
27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
28 | #include <linux/memblock.h> | ||
28 | #include <linux/proc_fs.h> | 29 | #include <linux/proc_fs.h> |
29 | #include <linux/memory_hotplug.h> | 30 | #include <linux/memory_hotplug.h> |
30 | #include <linux/initrd.h> | 31 | #include <linux/initrd.h> |
@@ -422,49 +423,28 @@ static void __init add_one_highpage_init(struct page *page) | |||
422 | totalhigh_pages++; | 423 | totalhigh_pages++; |
423 | } | 424 | } |
424 | 425 | ||
425 | struct add_highpages_data { | 426 | void __init add_highpages_with_active_regions(int nid, |
426 | unsigned long start_pfn; | 427 | unsigned long start_pfn, unsigned long end_pfn) |
427 | unsigned long end_pfn; | ||
428 | }; | ||
429 | |||
430 | static int __init add_highpages_work_fn(unsigned long start_pfn, | ||
431 | unsigned long end_pfn, void *datax) | ||
432 | { | 428 | { |
433 | int node_pfn; | 429 | struct range *range; |
434 | struct page *page; | 430 | int nr_range; |
435 | unsigned long final_start_pfn, final_end_pfn; | 431 | int i; |
436 | struct add_highpages_data *data; | ||
437 | 432 | ||
438 | data = (struct add_highpages_data *)datax; | 433 | nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); |
439 | 434 | ||
440 | final_start_pfn = max(start_pfn, data->start_pfn); | 435 | for (i = 0; i < nr_range; i++) { |
441 | final_end_pfn = min(end_pfn, data->end_pfn); | 436 | struct page *page; |
442 | if (final_start_pfn >= final_end_pfn) | 437 | int node_pfn; |
443 | return 0; | ||
444 | 438 | ||
445 | for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; | 439 | for (node_pfn = range[i].start; node_pfn < range[i].end; |
446 | node_pfn++) { | 440 | node_pfn++) { |
447 | if (!pfn_valid(node_pfn)) | 441 | if (!pfn_valid(node_pfn)) |
448 | continue; | 442 | continue; |
449 | page = pfn_to_page(node_pfn); | 443 | page = pfn_to_page(node_pfn); |
450 | add_one_highpage_init(page); | 444 | add_one_highpage_init(page); |
445 | } | ||
451 | } | 446 | } |
452 | |||
453 | return 0; | ||
454 | |||
455 | } | 447 | } |
456 | |||
457 | void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, | ||
458 | unsigned long end_pfn) | ||
459 | { | ||
460 | struct add_highpages_data data; | ||
461 | |||
462 | data.start_pfn = start_pfn; | ||
463 | data.end_pfn = end_pfn; | ||
464 | |||
465 | work_with_active_regions(nid, add_highpages_work_fn, &data); | ||
466 | } | ||
467 | |||
468 | #else | 448 | #else |
469 | static inline void permanent_kmaps_init(pgd_t *pgd_base) | 449 | static inline void permanent_kmaps_init(pgd_t *pgd_base) |
470 | { | 450 | { |
@@ -548,48 +528,6 @@ static void __init pagetable_init(void) | |||
548 | permanent_kmaps_init(pgd_base); | 528 | permanent_kmaps_init(pgd_base); |
549 | } | 529 | } |
550 | 530 | ||
551 | #ifdef CONFIG_ACPI_SLEEP | ||
552 | /* | ||
553 | * ACPI suspend needs this for resume, because things like the intel-agp | ||
554 | * driver might have split up a kernel 4MB mapping. | ||
555 | */ | ||
556 | char swsusp_pg_dir[PAGE_SIZE] | ||
557 | __attribute__ ((aligned(PAGE_SIZE))); | ||
558 | |||
559 | static inline void save_pg_dir(void) | ||
560 | { | ||
561 | copy_page(swsusp_pg_dir, swapper_pg_dir); | ||
562 | } | ||
563 | #else /* !CONFIG_ACPI_SLEEP */ | ||
564 | static inline void save_pg_dir(void) | ||
565 | { | ||
566 | } | ||
567 | #endif /* !CONFIG_ACPI_SLEEP */ | ||
568 | |||
569 | void zap_low_mappings(bool early) | ||
570 | { | ||
571 | int i; | ||
572 | |||
573 | /* | ||
574 | * Zap initial low-memory mappings. | ||
575 | * | ||
576 | * Note that "pgd_clear()" doesn't do it for | ||
577 | * us, because pgd_clear() is a no-op on i386. | ||
578 | */ | ||
579 | for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { | ||
580 | #ifdef CONFIG_X86_PAE | ||
581 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | ||
582 | #else | ||
583 | set_pgd(swapper_pg_dir+i, __pgd(0)); | ||
584 | #endif | ||
585 | } | ||
586 | |||
587 | if (early) | ||
588 | __flush_tlb(); | ||
589 | else | ||
590 | flush_tlb_all(); | ||
591 | } | ||
592 | |||
593 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); | 531 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); |
594 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 532 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
595 | 533 | ||
@@ -712,14 +650,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
712 | highstart_pfn = highend_pfn = max_pfn; | 650 | highstart_pfn = highend_pfn = max_pfn; |
713 | if (max_pfn > max_low_pfn) | 651 | if (max_pfn > max_low_pfn) |
714 | highstart_pfn = max_low_pfn; | 652 | highstart_pfn = max_low_pfn; |
715 | e820_register_active_regions(0, 0, highend_pfn); | 653 | memblock_x86_register_active_regions(0, 0, highend_pfn); |
716 | sparse_memory_present_with_active_regions(0); | 654 | sparse_memory_present_with_active_regions(0); |
717 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | 655 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", |
718 | pages_to_mb(highend_pfn - highstart_pfn)); | 656 | pages_to_mb(highend_pfn - highstart_pfn)); |
719 | num_physpages = highend_pfn; | 657 | num_physpages = highend_pfn; |
720 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | 658 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; |
721 | #else | 659 | #else |
722 | e820_register_active_regions(0, 0, max_low_pfn); | 660 | memblock_x86_register_active_regions(0, 0, max_low_pfn); |
723 | sparse_memory_present_with_active_regions(0); | 661 | sparse_memory_present_with_active_regions(0); |
724 | num_physpages = max_low_pfn; | 662 | num_physpages = max_low_pfn; |
725 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | 663 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; |
@@ -750,68 +688,12 @@ static void __init zone_sizes_init(void) | |||
750 | free_area_init_nodes(max_zone_pfns); | 688 | free_area_init_nodes(max_zone_pfns); |
751 | } | 689 | } |
752 | 690 | ||
753 | #ifndef CONFIG_NO_BOOTMEM | ||
754 | static unsigned long __init setup_node_bootmem(int nodeid, | ||
755 | unsigned long start_pfn, | ||
756 | unsigned long end_pfn, | ||
757 | unsigned long bootmap) | ||
758 | { | ||
759 | unsigned long bootmap_size; | ||
760 | |||
761 | /* don't touch min_low_pfn */ | ||
762 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
763 | bootmap >> PAGE_SHIFT, | ||
764 | start_pfn, end_pfn); | ||
765 | printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", | ||
766 | nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
767 | printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", | ||
768 | nodeid, bootmap, bootmap + bootmap_size); | ||
769 | free_bootmem_with_active_regions(nodeid, end_pfn); | ||
770 | |||
771 | return bootmap + bootmap_size; | ||
772 | } | ||
773 | #endif | ||
774 | |||
775 | void __init setup_bootmem_allocator(void) | 691 | void __init setup_bootmem_allocator(void) |
776 | { | 692 | { |
777 | #ifndef CONFIG_NO_BOOTMEM | ||
778 | int nodeid; | ||
779 | unsigned long bootmap_size, bootmap; | ||
780 | /* | ||
781 | * Initialize the boot-time allocator (with low memory only): | ||
782 | */ | ||
783 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; | ||
784 | bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size, | ||
785 | PAGE_SIZE); | ||
786 | if (bootmap == -1L) | ||
787 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
788 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | ||
789 | #endif | ||
790 | |||
791 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", | 693 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", |
792 | max_pfn_mapped<<PAGE_SHIFT); | 694 | max_pfn_mapped<<PAGE_SHIFT); |
793 | printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); | 695 | printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); |
794 | 696 | ||
795 | #ifndef CONFIG_NO_BOOTMEM | ||
796 | for_each_online_node(nodeid) { | ||
797 | unsigned long start_pfn, end_pfn; | ||
798 | |||
799 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
800 | start_pfn = node_start_pfn[nodeid]; | ||
801 | end_pfn = node_end_pfn[nodeid]; | ||
802 | if (start_pfn > max_low_pfn) | ||
803 | continue; | ||
804 | if (end_pfn > max_low_pfn) | ||
805 | end_pfn = max_low_pfn; | ||
806 | #else | ||
807 | start_pfn = 0; | ||
808 | end_pfn = max_low_pfn; | ||
809 | #endif | ||
810 | bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, | ||
811 | bootmap); | ||
812 | } | ||
813 | #endif | ||
814 | |||
815 | after_bootmem = 1; | 697 | after_bootmem = 1; |
816 | } | 698 | } |
817 | 699 | ||
@@ -958,9 +840,6 @@ void __init mem_init(void) | |||
958 | 840 | ||
959 | if (boot_cpu_data.wp_works_ok < 0) | 841 | if (boot_cpu_data.wp_works_ok < 0) |
960 | test_wp_bit(); | 842 | test_wp_bit(); |
961 | |||
962 | save_pg_dir(); | ||
963 | zap_low_mappings(true); | ||
964 | } | 843 | } |
965 | 844 | ||
966 | #ifdef CONFIG_MEMORY_HOTPLUG | 845 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -1070,8 +949,3 @@ void mark_rodata_ro(void) | |||
1070 | } | 949 | } |
1071 | #endif | 950 | #endif |
1072 | 951 | ||
1073 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
1074 | int flags) | ||
1075 | { | ||
1076 | return reserve_bootmem(phys, len, flags); | ||
1077 | } | ||
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c48ad4faca3..71a59296af80 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/initrd.h> | 21 | #include <linux/initrd.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | ||
24 | #include <linux/proc_fs.h> | 25 | #include <linux/proc_fs.h> |
25 | #include <linux/pci.h> | 26 | #include <linux/pci.h> |
26 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
@@ -50,9 +51,6 @@ | |||
50 | #include <asm/numa.h> | 51 | #include <asm/numa.h> |
51 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
52 | #include <asm/init.h> | 53 | #include <asm/init.h> |
53 | #include <linux/bootmem.h> | ||
54 | |||
55 | static unsigned long dma_reserve __initdata; | ||
56 | 54 | ||
57 | static int __init parse_direct_gbpages_off(char *arg) | 55 | static int __init parse_direct_gbpages_off(char *arg) |
58 | { | 56 | { |
@@ -98,6 +96,43 @@ static int __init nonx32_setup(char *str) | |||
98 | __setup("noexec32=", nonx32_setup); | 96 | __setup("noexec32=", nonx32_setup); |
99 | 97 | ||
100 | /* | 98 | /* |
99 | * When memory was added/removed make sure all the processes MM have | ||
100 | * suitable PGD entries in the local PGD level page. | ||
101 | */ | ||
102 | void sync_global_pgds(unsigned long start, unsigned long end) | ||
103 | { | ||
104 | unsigned long address; | ||
105 | |||
106 | for (address = start; address <= end; address += PGDIR_SIZE) { | ||
107 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
108 | unsigned long flags; | ||
109 | struct page *page; | ||
110 | |||
111 | if (pgd_none(*pgd_ref)) | ||
112 | continue; | ||
113 | |||
114 | spin_lock_irqsave(&pgd_lock, flags); | ||
115 | list_for_each_entry(page, &pgd_list, lru) { | ||
116 | pgd_t *pgd; | ||
117 | spinlock_t *pgt_lock; | ||
118 | |||
119 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
120 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | ||
121 | spin_lock(pgt_lock); | ||
122 | |||
123 | if (pgd_none(*pgd)) | ||
124 | set_pgd(pgd, *pgd_ref); | ||
125 | else | ||
126 | BUG_ON(pgd_page_vaddr(*pgd) | ||
127 | != pgd_page_vaddr(*pgd_ref)); | ||
128 | |||
129 | spin_unlock(pgt_lock); | ||
130 | } | ||
131 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
132 | } | ||
133 | } | ||
134 | |||
135 | /* | ||
101 | * NOTE: This function is marked __ref because it calls __init function | 136 | * NOTE: This function is marked __ref because it calls __init function |
102 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. | 137 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. |
103 | */ | 138 | */ |
@@ -534,11 +569,13 @@ kernel_physical_mapping_init(unsigned long start, | |||
534 | unsigned long end, | 569 | unsigned long end, |
535 | unsigned long page_size_mask) | 570 | unsigned long page_size_mask) |
536 | { | 571 | { |
537 | 572 | bool pgd_changed = false; | |
538 | unsigned long next, last_map_addr = end; | 573 | unsigned long next, last_map_addr = end; |
574 | unsigned long addr; | ||
539 | 575 | ||
540 | start = (unsigned long)__va(start); | 576 | start = (unsigned long)__va(start); |
541 | end = (unsigned long)__va(end); | 577 | end = (unsigned long)__va(end); |
578 | addr = start; | ||
542 | 579 | ||
543 | for (; start < end; start = next) { | 580 | for (; start < end; start = next) { |
544 | pgd_t *pgd = pgd_offset_k(start); | 581 | pgd_t *pgd = pgd_offset_k(start); |
@@ -563,7 +600,12 @@ kernel_physical_mapping_init(unsigned long start, | |||
563 | spin_lock(&init_mm.page_table_lock); | 600 | spin_lock(&init_mm.page_table_lock); |
564 | pgd_populate(&init_mm, pgd, __va(pud_phys)); | 601 | pgd_populate(&init_mm, pgd, __va(pud_phys)); |
565 | spin_unlock(&init_mm.page_table_lock); | 602 | spin_unlock(&init_mm.page_table_lock); |
603 | pgd_changed = true; | ||
566 | } | 604 | } |
605 | |||
606 | if (pgd_changed) | ||
607 | sync_global_pgds(addr, end); | ||
608 | |||
567 | __flush_tlb_all(); | 609 | __flush_tlb_all(); |
568 | 610 | ||
569 | return last_map_addr; | 611 | return last_map_addr; |
@@ -573,23 +615,7 @@ kernel_physical_mapping_init(unsigned long start, | |||
573 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 615 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, |
574 | int acpi, int k8) | 616 | int acpi, int k8) |
575 | { | 617 | { |
576 | #ifndef CONFIG_NO_BOOTMEM | 618 | memblock_x86_register_active_regions(0, start_pfn, end_pfn); |
577 | unsigned long bootmap_size, bootmap; | ||
578 | |||
579 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
580 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, | ||
581 | PAGE_SIZE); | ||
582 | if (bootmap == -1L) | ||
583 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
584 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | ||
585 | /* don't touch min_low_pfn */ | ||
586 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, | ||
587 | 0, end_pfn); | ||
588 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
589 | free_bootmem_with_active_regions(0, end_pfn); | ||
590 | #else | ||
591 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
592 | #endif | ||
593 | } | 619 | } |
594 | #endif | 620 | #endif |
595 | 621 | ||
@@ -799,52 +825,6 @@ void mark_rodata_ro(void) | |||
799 | 825 | ||
800 | #endif | 826 | #endif |
801 | 827 | ||
802 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
803 | int flags) | ||
804 | { | ||
805 | #ifdef CONFIG_NUMA | ||
806 | int nid, next_nid; | ||
807 | int ret; | ||
808 | #endif | ||
809 | unsigned long pfn = phys >> PAGE_SHIFT; | ||
810 | |||
811 | if (pfn >= max_pfn) { | ||
812 | /* | ||
813 | * This can happen with kdump kernels when accessing | ||
814 | * firmware tables: | ||
815 | */ | ||
816 | if (pfn < max_pfn_mapped) | ||
817 | return -EFAULT; | ||
818 | |||
819 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", | ||
820 | phys, len); | ||
821 | return -EFAULT; | ||
822 | } | ||
823 | |||
824 | /* Should check here against the e820 map to avoid double free */ | ||
825 | #ifdef CONFIG_NUMA | ||
826 | nid = phys_to_nid(phys); | ||
827 | next_nid = phys_to_nid(phys + len - 1); | ||
828 | if (nid == next_nid) | ||
829 | ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); | ||
830 | else | ||
831 | ret = reserve_bootmem(phys, len, flags); | ||
832 | |||
833 | if (ret != 0) | ||
834 | return ret; | ||
835 | |||
836 | #else | ||
837 | reserve_bootmem(phys, len, flags); | ||
838 | #endif | ||
839 | |||
840 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | ||
841 | dma_reserve += len / PAGE_SIZE; | ||
842 | set_dma_reserve(dma_reserve); | ||
843 | } | ||
844 | |||
845 | return 0; | ||
846 | } | ||
847 | |||
848 | int kern_addr_valid(unsigned long addr) | 828 | int kern_addr_valid(unsigned long addr) |
849 | { | 829 | { |
850 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | 830 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; |
@@ -1003,6 +983,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) | |||
1003 | } | 983 | } |
1004 | 984 | ||
1005 | } | 985 | } |
986 | sync_global_pgds((unsigned long)start_page, end); | ||
1006 | return 0; | 987 | return 0; |
1007 | } | 988 | } |
1008 | 989 | ||
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf6184..7b179b499fa3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c | |||
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) | |||
48 | } | 48 | } |
49 | EXPORT_SYMBOL_GPL(iomap_create_wc); | 49 | EXPORT_SYMBOL_GPL(iomap_create_wc); |
50 | 50 | ||
51 | void | 51 | void iomap_free(resource_size_t base, unsigned long size) |
52 | iomap_free(resource_size_t base, unsigned long size) | ||
53 | { | 52 | { |
54 | io_free_memtype(base, base + size); | 53 | io_free_memtype(base, base + size); |
55 | } | 54 | } |
56 | EXPORT_SYMBOL_GPL(iomap_free); | 55 | EXPORT_SYMBOL_GPL(iomap_free); |
57 | 56 | ||
58 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | 57 | void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) |
59 | { | 58 | { |
60 | enum fixed_addresses idx; | ||
61 | unsigned long vaddr; | 59 | unsigned long vaddr; |
60 | int idx, type; | ||
62 | 61 | ||
63 | pagefault_disable(); | 62 | pagefault_disable(); |
64 | 63 | ||
65 | debug_kmap_atomic(type); | 64 | type = kmap_atomic_idx_push(); |
66 | idx = type + KM_TYPE_NR * smp_processor_id(); | 65 | idx = type + KM_TYPE_NR * smp_processor_id(); |
67 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | 66 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); |
68 | set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); | 67 | set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); |
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | |||
72 | } | 71 | } |
73 | 72 | ||
74 | /* | 73 | /* |
75 | * Map 'pfn' using fixed map 'type' and protections 'prot' | 74 | * Map 'pfn' using protections 'prot' |
76 | */ | 75 | */ |
77 | void __iomem * | 76 | void __iomem * |
78 | iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | 77 | iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) |
79 | { | 78 | { |
80 | /* | 79 | /* |
81 | * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. | 80 | * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. |
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | |||
86 | if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) | 85 | if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) |
87 | prot = PAGE_KERNEL_UC_MINUS; | 86 | prot = PAGE_KERNEL_UC_MINUS; |
88 | 87 | ||
89 | return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); | 88 | return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); |
90 | } | 89 | } |
91 | EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); | 90 | EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); |
92 | 91 | ||
93 | void | 92 | void |
94 | iounmap_atomic(void __iomem *kvaddr, enum km_type type) | 93 | iounmap_atomic(void __iomem *kvaddr) |
95 | { | 94 | { |
96 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | 95 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; |
97 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | ||
98 | 96 | ||
99 | /* | 97 | if (vaddr >= __fix_to_virt(FIX_KMAP_END) && |
100 | * Force other mappings to Oops if they'll try to access this pte | 98 | vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { |
101 | * without first remap it. Keeping stale mappings around is a bad idea | 99 | int idx, type; |
102 | * also, in case the page changes cacheability attributes or becomes | 100 | |
103 | * a protected page in a hypervisor. | 101 | type = kmap_atomic_idx(); |
104 | */ | 102 | idx = type + KM_TYPE_NR * smp_processor_id(); |
105 | if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) | 103 | |
104 | #ifdef CONFIG_DEBUG_HIGHMEM | ||
105 | WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); | ||
106 | #endif | ||
107 | /* | ||
108 | * Force other mappings to Oops if they'll try to access this | ||
109 | * pte without first remap it. Keeping stale mappings around | ||
110 | * is a bad idea also, in case the page changes cacheability | ||
111 | * attributes or becomes a protected page in a hypervisor. | ||
112 | */ | ||
106 | kpte_clear_flush(kmap_pte-idx, vaddr); | 113 | kpte_clear_flush(kmap_pte-idx, vaddr); |
114 | kmap_atomic_idx_pop(); | ||
115 | } | ||
107 | 116 | ||
108 | pagefault_enable(); | 117 | pagefault_enable(); |
109 | } | 118 | } |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3ba6e0608c55..0369843511dc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) | |||
362 | return &bm_pte[pte_index(addr)]; | 362 | return &bm_pte[pte_index(addr)]; |
363 | } | 363 | } |
364 | 364 | ||
365 | bool __init is_early_ioremap_ptep(pte_t *ptep) | ||
366 | { | ||
367 | return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; | ||
368 | } | ||
369 | |||
365 | static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; | 370 | static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; |
366 | 371 | ||
367 | void __init early_ioremap_init(void) | 372 | void __init early_ioremap_init(void) |
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 240f86462a83..804a3b6c6e14 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c | |||
@@ -11,6 +11,8 @@ | |||
11 | #include <linux/string.h> | 11 | #include <linux/string.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/memblock.h> | ||
15 | |||
14 | #include <asm/io.h> | 16 | #include <asm/io.h> |
15 | #include <linux/pci_ids.h> | 17 | #include <linux/pci_ids.h> |
16 | #include <linux/acpi.h> | 18 | #include <linux/acpi.h> |
@@ -22,7 +24,7 @@ | |||
22 | #include <asm/numa.h> | 24 | #include <asm/numa.h> |
23 | #include <asm/mpspec.h> | 25 | #include <asm/mpspec.h> |
24 | #include <asm/apic.h> | 26 | #include <asm/apic.h> |
25 | #include <asm/k8.h> | 27 | #include <asm/amd_nb.h> |
26 | 28 | ||
27 | static struct bootnode __initdata nodes[8]; | 29 | static struct bootnode __initdata nodes[8]; |
28 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; | 30 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; |
@@ -222,7 +224,7 @@ int __init k8_scan_nodes(void) | |||
222 | for_each_node_mask(i, node_possible_map) { | 224 | for_each_node_mask(i, node_possible_map) { |
223 | int j; | 225 | int j; |
224 | 226 | ||
225 | e820_register_active_regions(i, | 227 | memblock_x86_register_active_regions(i, |
226 | nodes[i].start >> PAGE_SHIFT, | 228 | nodes[i].start >> PAGE_SHIFT, |
227 | nodes[i].end >> PAGE_SHIFT); | 229 | nodes[i].end >> PAGE_SHIFT); |
228 | for (j = apicid_base; j < cores + apicid_base; j++) | 230 | for (j = apicid_base; j < cores + apicid_base; j++) |
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e5..d87dd6d042d6 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c | |||
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, | |||
631 | if (!pte) | 631 | if (!pte) |
632 | return false; | 632 | return false; |
633 | 633 | ||
634 | WARN_ON_ONCE(in_nmi()); | ||
635 | |||
634 | if (error_code & 2) | 636 | if (error_code & 2) |
635 | kmemcheck_access(regs, address, KMEMCHECK_WRITE); | 637 | kmemcheck_access(regs, address, KMEMCHECK_WRITE); |
636 | else | 638 | else |
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6f..324aa3f07237 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c | |||
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b) | |||
9 | b == 0xf0 || b == 0xf2 || b == 0xf3 | 9 | b == 0xf0 || b == 0xf2 || b == 0xf3 |
10 | /* Group 2 */ | 10 | /* Group 2 */ |
11 | || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 | 11 | || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 |
12 | || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e | 12 | || b == 0x64 || b == 0x65 |
13 | /* Group 3 */ | 13 | /* Group 3 */ |
14 | || b == 0x66 | 14 | || b == 0x66 |
15 | /* Group 4 */ | 15 | /* Group 4 */ |
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c new file mode 100644 index 000000000000..aa1169392b83 --- /dev/null +++ b/arch/x86/mm/memblock.c | |||
@@ -0,0 +1,348 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/bitops.h> | ||
5 | #include <linux/memblock.h> | ||
6 | #include <linux/bootmem.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/range.h> | ||
9 | |||
10 | /* Check for already reserved areas */ | ||
11 | static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align) | ||
12 | { | ||
13 | struct memblock_region *r; | ||
14 | u64 addr = *addrp, last; | ||
15 | u64 size = *sizep; | ||
16 | bool changed = false; | ||
17 | |||
18 | again: | ||
19 | last = addr + size; | ||
20 | for_each_memblock(reserved, r) { | ||
21 | if (last > r->base && addr < r->base) { | ||
22 | size = r->base - addr; | ||
23 | changed = true; | ||
24 | goto again; | ||
25 | } | ||
26 | if (last > (r->base + r->size) && addr < (r->base + r->size)) { | ||
27 | addr = round_up(r->base + r->size, align); | ||
28 | size = last - addr; | ||
29 | changed = true; | ||
30 | goto again; | ||
31 | } | ||
32 | if (last <= (r->base + r->size) && addr >= r->base) { | ||
33 | *sizep = 0; | ||
34 | return false; | ||
35 | } | ||
36 | } | ||
37 | if (changed) { | ||
38 | *addrp = addr; | ||
39 | *sizep = size; | ||
40 | } | ||
41 | return changed; | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * Find the next free range at or after start; its size is returned in *sizep | ||
46 | */ | ||
47 | u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) | ||
48 | { | ||
49 | struct memblock_region *r; | ||
50 | |||
51 | for_each_memblock(memory, r) { | ||
52 | u64 ei_start = r->base; | ||
53 | u64 ei_last = ei_start + r->size; | ||
54 | u64 addr; | ||
55 | |||
56 | addr = round_up(ei_start, align); | ||
57 | if (addr < start) | ||
58 | addr = round_up(start, align); | ||
59 | if (addr >= ei_last) | ||
60 | continue; | ||
61 | *sizep = ei_last - addr; | ||
62 | while (check_with_memblock_reserved_size(&addr, sizep, align)) | ||
63 | ; | ||
64 | |||
65 | if (*sizep) | ||
66 | return addr; | ||
67 | } | ||
68 | |||
69 | return MEMBLOCK_ERROR; | ||
70 | } | ||
71 | |||
72 | static __init struct range *find_range_array(int count) | ||
73 | { | ||
74 | u64 end, size, mem; | ||
75 | struct range *range; | ||
76 | |||
77 | size = sizeof(struct range) * count; | ||
78 | end = memblock.current_limit; | ||
79 | |||
80 | mem = memblock_find_in_range(0, end, size, sizeof(struct range)); | ||
81 | if (mem == MEMBLOCK_ERROR) | ||
82 | panic("can not find more space for range array"); | ||
83 | |||
84 | /* | ||
85 | * This range is temporary, so don't reserve it; it will not be | ||
86 | * overlapped because we will not allocate a new buffer before | ||
87 | * we discard this one. | ||
88 | */ | ||
89 | range = __va(mem); | ||
90 | memset(range, 0, size); | ||
91 | |||
92 | return range; | ||
93 | } | ||
94 | |||
95 | static void __init memblock_x86_subtract_reserved(struct range *range, int az) | ||
96 | { | ||
97 | u64 final_start, final_end; | ||
98 | struct memblock_region *r; | ||
99 | |||
100 | /* Take out the region array itself first */ | ||
101 | memblock_free_reserved_regions(); | ||
102 | |||
103 | memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); | ||
104 | |||
105 | for_each_memblock(reserved, r) { | ||
106 | memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); | ||
107 | final_start = PFN_DOWN(r->base); | ||
108 | final_end = PFN_UP(r->base + r->size); | ||
109 | if (final_start >= final_end) | ||
110 | continue; | ||
111 | subtract_range(range, az, final_start, final_end); | ||
112 | } | ||
113 | |||
114 | /* Put region array back ? */ | ||
115 | memblock_reserve_reserved_regions(); | ||
116 | } | ||
117 | |||
118 | struct count_data { | ||
119 | int nr; | ||
120 | }; | ||
121 | |||
122 | static int __init count_work_fn(unsigned long start_pfn, | ||
123 | unsigned long end_pfn, void *datax) | ||
124 | { | ||
125 | struct count_data *data = datax; | ||
126 | |||
127 | data->nr++; | ||
128 | |||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | static int __init count_early_node_map(int nodeid) | ||
133 | { | ||
134 | struct count_data data; | ||
135 | |||
136 | data.nr = 0; | ||
137 | work_with_active_regions(nodeid, count_work_fn, &data); | ||
138 | |||
139 | return data.nr; | ||
140 | } | ||
141 | |||
142 | int __init __get_free_all_memory_range(struct range **rangep, int nodeid, | ||
143 | unsigned long start_pfn, unsigned long end_pfn) | ||
144 | { | ||
145 | int count; | ||
146 | struct range *range; | ||
147 | int nr_range; | ||
148 | |||
149 | count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; | ||
150 | |||
151 | range = find_range_array(count); | ||
152 | nr_range = 0; | ||
153 | |||
154 | /* | ||
155 | * Use early_node_map[] and memblock.reserved.region to get range array | ||
156 | * at first | ||
157 | */ | ||
158 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
159 | subtract_range(range, count, 0, start_pfn); | ||
160 | subtract_range(range, count, end_pfn, -1ULL); | ||
161 | |||
162 | memblock_x86_subtract_reserved(range, count); | ||
163 | nr_range = clean_sort_range(range, count); | ||
164 | |||
165 | *rangep = range; | ||
166 | return nr_range; | ||
167 | } | ||
168 | |||
169 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
170 | { | ||
171 | unsigned long end_pfn = -1UL; | ||
172 | |||
173 | #ifdef CONFIG_X86_32 | ||
174 | end_pfn = max_low_pfn; | ||
175 | #endif | ||
176 | return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); | ||
177 | } | ||
178 | |||
179 | static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) | ||
180 | { | ||
181 | int i, count; | ||
182 | struct range *range; | ||
183 | int nr_range; | ||
184 | u64 final_start, final_end; | ||
185 | u64 free_size; | ||
186 | struct memblock_region *r; | ||
187 | |||
188 | count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; | ||
189 | |||
190 | range = find_range_array(count); | ||
191 | nr_range = 0; | ||
192 | |||
193 | addr = PFN_UP(addr); | ||
194 | limit = PFN_DOWN(limit); | ||
195 | |||
196 | for_each_memblock(memory, r) { | ||
197 | final_start = PFN_UP(r->base); | ||
198 | final_end = PFN_DOWN(r->base + r->size); | ||
199 | if (final_start >= final_end) | ||
200 | continue; | ||
201 | if (final_start >= limit || final_end <= addr) | ||
202 | continue; | ||
203 | |||
204 | nr_range = add_range(range, count, nr_range, final_start, final_end); | ||
205 | } | ||
206 | subtract_range(range, count, 0, addr); | ||
207 | subtract_range(range, count, limit, -1ULL); | ||
208 | |||
209 | /* Subtract memblock.reserved.region in range ? */ | ||
210 | if (!get_free) | ||
211 | goto sort_and_count_them; | ||
212 | for_each_memblock(reserved, r) { | ||
213 | final_start = PFN_DOWN(r->base); | ||
214 | final_end = PFN_UP(r->base + r->size); | ||
215 | if (final_start >= final_end) | ||
216 | continue; | ||
217 | if (final_start >= limit || final_end <= addr) | ||
218 | continue; | ||
219 | |||
220 | subtract_range(range, count, final_start, final_end); | ||
221 | } | ||
222 | |||
223 | sort_and_count_them: | ||
224 | nr_range = clean_sort_range(range, count); | ||
225 | |||
226 | free_size = 0; | ||
227 | for (i = 0; i < nr_range; i++) | ||
228 | free_size += range[i].end - range[i].start; | ||
229 | |||
230 | return free_size << PAGE_SHIFT; | ||
231 | } | ||
232 | |||
233 | u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) | ||
234 | { | ||
235 | return __memblock_x86_memory_in_range(addr, limit, true); | ||
236 | } | ||
237 | |||
238 | u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) | ||
239 | { | ||
240 | return __memblock_x86_memory_in_range(addr, limit, false); | ||
241 | } | ||
242 | |||
243 | void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) | ||
244 | { | ||
245 | if (start == end) | ||
246 | return; | ||
247 | |||
248 | if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) | ||
249 | return; | ||
250 | |||
251 | memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); | ||
252 | |||
253 | memblock_reserve(start, end - start); | ||
254 | } | ||
255 | |||
256 | void __init memblock_x86_free_range(u64 start, u64 end) | ||
257 | { | ||
258 | if (start == end) | ||
259 | return; | ||
260 | |||
261 | if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) | ||
262 | return; | ||
263 | |||
264 | memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); | ||
265 | |||
266 | memblock_free(start, end - start); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Need to call this function after memblock_x86_register_active_regions, | ||
271 | * so early_node_map[] is filled already. | ||
272 | */ | ||
273 | u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) | ||
274 | { | ||
275 | u64 addr; | ||
276 | addr = find_memory_core_early(nid, size, align, start, end); | ||
277 | if (addr != MEMBLOCK_ERROR) | ||
278 | return addr; | ||
279 | |||
280 | /* Fallback; start and end should already be within the node range */ | ||
281 | return memblock_find_in_range(start, end, size, align); | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Finds an active region in the address range from start_pfn to last_pfn and | ||
286 | * returns its range in ei_startpfn and ei_endpfn for the memblock entry. | ||
287 | */ | ||
288 | static int __init memblock_x86_find_active_region(const struct memblock_region *ei, | ||
289 | unsigned long start_pfn, | ||
290 | unsigned long last_pfn, | ||
291 | unsigned long *ei_startpfn, | ||
292 | unsigned long *ei_endpfn) | ||
293 | { | ||
294 | u64 align = PAGE_SIZE; | ||
295 | |||
296 | *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; | ||
297 | *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; | ||
298 | |||
299 | /* Skip map entries smaller than a page */ | ||
300 | if (*ei_startpfn >= *ei_endpfn) | ||
301 | return 0; | ||
302 | |||
303 | /* Skip if map is outside the node */ | ||
304 | if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) | ||
305 | return 0; | ||
306 | |||
307 | /* Check for overlaps */ | ||
308 | if (*ei_startpfn < start_pfn) | ||
309 | *ei_startpfn = start_pfn; | ||
310 | if (*ei_endpfn > last_pfn) | ||
311 | *ei_endpfn = last_pfn; | ||
312 | |||
313 | return 1; | ||
314 | } | ||
315 | |||
316 | /* Walk the memblock.memory map and register active regions within a node */ | ||
317 | void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, | ||
318 | unsigned long last_pfn) | ||
319 | { | ||
320 | unsigned long ei_startpfn; | ||
321 | unsigned long ei_endpfn; | ||
322 | struct memblock_region *r; | ||
323 | |||
324 | for_each_memblock(memory, r) | ||
325 | if (memblock_x86_find_active_region(r, start_pfn, last_pfn, | ||
326 | &ei_startpfn, &ei_endpfn)) | ||
327 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * Find the hole size (in bytes) in the memory range. | ||
332 | * @start: starting address of the memory range to scan | ||
333 | * @end: ending address of the memory range to scan | ||
334 | */ | ||
335 | u64 __init memblock_x86_hole_size(u64 start, u64 end) | ||
336 | { | ||
337 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
338 | unsigned long last_pfn = end >> PAGE_SHIFT; | ||
339 | unsigned long ei_startpfn, ei_endpfn, ram = 0; | ||
340 | struct memblock_region *r; | ||
341 | |||
342 | for_each_memblock(memory, r) | ||
343 | if (memblock_x86_find_active_region(r, start_pfn, last_pfn, | ||
344 | &ei_startpfn, &ei_endpfn)) | ||
345 | ram += ei_endpfn - ei_startpfn; | ||
346 | |||
347 | return end - start - ((u64)ram << PAGE_SHIFT); | ||
348 | } | ||
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 18d244f70205..92faf3a1c53e 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c | |||
@@ -6,8 +6,7 @@ | |||
6 | #include <linux/smp.h> | 6 | #include <linux/smp.h> |
7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/pfn.h> | 8 | #include <linux/pfn.h> |
9 | 9 | #include <linux/memblock.h> | |
10 | #include <asm/e820.h> | ||
11 | 10 | ||
12 | static u64 patterns[] __initdata = { | 11 | static u64 patterns[] __initdata = { |
13 | 0, | 12 | 0, |
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) | |||
35 | (unsigned long long) pattern, | 34 | (unsigned long long) pattern, |
36 | (unsigned long long) start_bad, | 35 | (unsigned long long) start_bad, |
37 | (unsigned long long) end_bad); | 36 | (unsigned long long) end_bad); |
38 | reserve_early(start_bad, end_bad, "BAD RAM"); | 37 | memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); |
39 | } | 38 | } |
40 | 39 | ||
41 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) | 40 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) |
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) | |||
74 | u64 size = 0; | 73 | u64 size = 0; |
75 | 74 | ||
76 | while (start < end) { | 75 | while (start < end) { |
77 | start = find_e820_area_size(start, &size, 1); | 76 | start = memblock_x86_find_in_range_size(start, &size, 1); |
78 | 77 | ||
79 | /* done ? */ | 78 | /* done ? */ |
80 | if (start >= end) | 79 | if (start >= end) |
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 809baaaf48b1..84a3e4c9f277 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
@@ -24,6 +24,7 @@ | |||
24 | 24 | ||
25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
26 | #include <linux/bootmem.h> | 26 | #include <linux/bootmem.h> |
27 | #include <linux/memblock.h> | ||
27 | #include <linux/mmzone.h> | 28 | #include <linux/mmzone.h> |
28 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
29 | #include <linux/initrd.h> | 30 | #include <linux/initrd.h> |
@@ -120,7 +121,7 @@ int __init get_memcfg_numa_flat(void) | |||
120 | 121 | ||
121 | node_start_pfn[0] = 0; | 122 | node_start_pfn[0] = 0; |
122 | node_end_pfn[0] = max_pfn; | 123 | node_end_pfn[0] = max_pfn; |
123 | e820_register_active_regions(0, 0, max_pfn); | 124 | memblock_x86_register_active_regions(0, 0, max_pfn); |
124 | memory_present(0, 0, max_pfn); | 125 | memory_present(0, 0, max_pfn); |
125 | node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); | 126 | node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); |
126 | 127 | ||
@@ -161,14 +162,14 @@ static void __init allocate_pgdat(int nid) | |||
161 | NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; | 162 | NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; |
162 | else { | 163 | else { |
163 | unsigned long pgdat_phys; | 164 | unsigned long pgdat_phys; |
164 | pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, | 165 | pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, |
165 | max_pfn_mapped<<PAGE_SHIFT, | 166 | max_pfn_mapped<<PAGE_SHIFT, |
166 | sizeof(pg_data_t), | 167 | sizeof(pg_data_t), |
167 | PAGE_SIZE); | 168 | PAGE_SIZE); |
168 | NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); | 169 | NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); |
169 | memset(buf, 0, sizeof(buf)); | 170 | memset(buf, 0, sizeof(buf)); |
170 | sprintf(buf, "NODE_DATA %d", nid); | 171 | sprintf(buf, "NODE_DATA %d", nid); |
171 | reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); | 172 | memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); |
172 | } | 173 | } |
173 | printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", | 174 | printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", |
174 | nid, (unsigned long)NODE_DATA(nid)); | 175 | nid, (unsigned long)NODE_DATA(nid)); |
@@ -291,15 +292,15 @@ static __init unsigned long calculate_numa_remap_pages(void) | |||
291 | PTRS_PER_PTE); | 292 | PTRS_PER_PTE); |
292 | node_kva_target <<= PAGE_SHIFT; | 293 | node_kva_target <<= PAGE_SHIFT; |
293 | do { | 294 | do { |
294 | node_kva_final = find_e820_area(node_kva_target, | 295 | node_kva_final = memblock_find_in_range(node_kva_target, |
295 | ((u64)node_end_pfn[nid])<<PAGE_SHIFT, | 296 | ((u64)node_end_pfn[nid])<<PAGE_SHIFT, |
296 | ((u64)size)<<PAGE_SHIFT, | 297 | ((u64)size)<<PAGE_SHIFT, |
297 | LARGE_PAGE_BYTES); | 298 | LARGE_PAGE_BYTES); |
298 | node_kva_target -= LARGE_PAGE_BYTES; | 299 | node_kva_target -= LARGE_PAGE_BYTES; |
299 | } while (node_kva_final == -1ULL && | 300 | } while (node_kva_final == MEMBLOCK_ERROR && |
300 | (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); | 301 | (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); |
301 | 302 | ||
302 | if (node_kva_final == -1ULL) | 303 | if (node_kva_final == MEMBLOCK_ERROR) |
303 | panic("Can not get kva ram\n"); | 304 | panic("Can not get kva ram\n"); |
304 | 305 | ||
305 | node_remap_size[nid] = size; | 306 | node_remap_size[nid] = size; |
@@ -318,15 +319,13 @@ static __init unsigned long calculate_numa_remap_pages(void) | |||
318 | * but we could have some hole in high memory, and it will only | 319 | * but we could have some hole in high memory, and it will only |
319 | * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide | 320 | * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide |
320 | * to use it as free. | 321 | * to use it as free. |
321 | * So reserve_early here, hope we don't run out of that array | 322 | * So memblock_x86_reserve_range here, hope we don't run out of that array |
322 | */ | 323 | */ |
323 | reserve_early(node_kva_final, | 324 | memblock_x86_reserve_range(node_kva_final, |
324 | node_kva_final+(((u64)size)<<PAGE_SHIFT), | 325 | node_kva_final+(((u64)size)<<PAGE_SHIFT), |
325 | "KVA RAM"); | 326 | "KVA RAM"); |
326 | 327 | ||
327 | node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; | 328 | node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; |
328 | remove_active_range(nid, node_remap_start_pfn[nid], | ||
329 | node_remap_start_pfn[nid] + size); | ||
330 | } | 329 | } |
331 | printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", | 330 | printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", |
332 | reserve_pages); | 331 | reserve_pages); |
@@ -367,14 +366,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
367 | 366 | ||
368 | kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); | 367 | kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); |
369 | do { | 368 | do { |
370 | kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, | 369 | kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT, |
371 | max_low_pfn<<PAGE_SHIFT, | 370 | max_low_pfn<<PAGE_SHIFT, |
372 | kva_pages<<PAGE_SHIFT, | 371 | kva_pages<<PAGE_SHIFT, |
373 | PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; | 372 | PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; |
374 | kva_target_pfn -= PTRS_PER_PTE; | 373 | kva_target_pfn -= PTRS_PER_PTE; |
375 | } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); | 374 | } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); |
376 | 375 | ||
377 | if (kva_start_pfn == -1UL) | 376 | if (kva_start_pfn == MEMBLOCK_ERROR) |
378 | panic("Can not get kva space\n"); | 377 | panic("Can not get kva space\n"); |
379 | 378 | ||
380 | printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", | 379 | printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", |
@@ -382,7 +381,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
382 | printk(KERN_INFO "max_pfn = %lx\n", max_pfn); | 381 | printk(KERN_INFO "max_pfn = %lx\n", max_pfn); |
383 | 382 | ||
384 | /* avoid clash with initrd */ | 383 | /* avoid clash with initrd */ |
385 | reserve_early(kva_start_pfn<<PAGE_SHIFT, | 384 | memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT, |
386 | (kva_start_pfn + kva_pages)<<PAGE_SHIFT, | 385 | (kva_start_pfn + kva_pages)<<PAGE_SHIFT, |
387 | "KVA PG"); | 386 | "KVA PG"); |
388 | #ifdef CONFIG_HIGHMEM | 387 | #ifdef CONFIG_HIGHMEM |
@@ -419,9 +418,6 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
419 | for_each_online_node(nid) { | 418 | for_each_online_node(nid) { |
420 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | 419 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); |
421 | NODE_DATA(nid)->node_id = nid; | 420 | NODE_DATA(nid)->node_id = nid; |
422 | #ifndef CONFIG_NO_BOOTMEM | ||
423 | NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; | ||
424 | #endif | ||
425 | } | 421 | } |
426 | 422 | ||
427 | setup_bootmem_allocator(); | 423 | setup_bootmem_allocator(); |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96c..60f498511dd6 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/string.h> | 7 | #include <linux/string.h> |
8 | #include <linux/init.h> | 8 | #include <linux/init.h> |
9 | #include <linux/bootmem.h> | 9 | #include <linux/bootmem.h> |
10 | #include <linux/memblock.h> | ||
10 | #include <linux/mmzone.h> | 11 | #include <linux/mmzone.h> |
11 | #include <linux/ctype.h> | 12 | #include <linux/ctype.h> |
12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
@@ -18,7 +19,7 @@ | |||
18 | #include <asm/dma.h> | 19 | #include <asm/dma.h> |
19 | #include <asm/numa.h> | 20 | #include <asm/numa.h> |
20 | #include <asm/acpi.h> | 21 | #include <asm/acpi.h> |
21 | #include <asm/k8.h> | 22 | #include <asm/amd_nb.h> |
22 | 23 | ||
23 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 24 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
24 | EXPORT_SYMBOL(node_data); | 25 | EXPORT_SYMBOL(node_data); |
@@ -86,16 +87,16 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
86 | 87 | ||
87 | addr = 0x8000; | 88 | addr = 0x8000; |
88 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); | 89 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
89 | nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, | 90 | nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, |
90 | nodemap_size, L1_CACHE_BYTES); | 91 | nodemap_size, L1_CACHE_BYTES); |
91 | if (nodemap_addr == -1UL) { | 92 | if (nodemap_addr == MEMBLOCK_ERROR) { |
92 | printk(KERN_ERR | 93 | printk(KERN_ERR |
93 | "NUMA: Unable to allocate Memory to Node hash map\n"); | 94 | "NUMA: Unable to allocate Memory to Node hash map\n"); |
94 | nodemap_addr = nodemap_size = 0; | 95 | nodemap_addr = nodemap_size = 0; |
95 | return -1; | 96 | return -1; |
96 | } | 97 | } |
97 | memnodemap = phys_to_virt(nodemap_addr); | 98 | memnodemap = phys_to_virt(nodemap_addr); |
98 | reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); | 99 | memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); |
99 | 100 | ||
100 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | 101 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", |
101 | nodemap_addr, nodemap_addr + nodemap_size); | 102 | nodemap_addr, nodemap_addr + nodemap_size); |
@@ -171,8 +172,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start, | |||
171 | if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && | 172 | if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && |
172 | end > (MAX_DMA32_PFN<<PAGE_SHIFT)) | 173 | end > (MAX_DMA32_PFN<<PAGE_SHIFT)) |
173 | start = MAX_DMA32_PFN<<PAGE_SHIFT; | 174 | start = MAX_DMA32_PFN<<PAGE_SHIFT; |
174 | mem = find_e820_area(start, end, size, align); | 175 | mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align); |
175 | if (mem != -1L) | 176 | if (mem != MEMBLOCK_ERROR) |
176 | return __va(mem); | 177 | return __va(mem); |
177 | 178 | ||
178 | /* extend the search scope */ | 179 | /* extend the search scope */ |
@@ -181,8 +182,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start, | |||
181 | start = MAX_DMA32_PFN<<PAGE_SHIFT; | 182 | start = MAX_DMA32_PFN<<PAGE_SHIFT; |
182 | else | 183 | else |
183 | start = MAX_DMA_PFN<<PAGE_SHIFT; | 184 | start = MAX_DMA_PFN<<PAGE_SHIFT; |
184 | mem = find_e820_area(start, end, size, align); | 185 | mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align); |
185 | if (mem != -1L) | 186 | if (mem != MEMBLOCK_ERROR) |
186 | return __va(mem); | 187 | return __va(mem); |
187 | 188 | ||
188 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | 189 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", |
@@ -198,10 +199,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | |||
198 | unsigned long start_pfn, last_pfn, nodedata_phys; | 199 | unsigned long start_pfn, last_pfn, nodedata_phys; |
199 | const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); | 200 | const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); |
200 | int nid; | 201 | int nid; |
201 | #ifndef CONFIG_NO_BOOTMEM | ||
202 | unsigned long bootmap_start, bootmap_pages, bootmap_size; | ||
203 | void *bootmap; | ||
204 | #endif | ||
205 | 202 | ||
206 | if (!end) | 203 | if (!end) |
207 | return; | 204 | return; |
@@ -226,7 +223,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | |||
226 | if (node_data[nodeid] == NULL) | 223 | if (node_data[nodeid] == NULL) |
227 | return; | 224 | return; |
228 | nodedata_phys = __pa(node_data[nodeid]); | 225 | nodedata_phys = __pa(node_data[nodeid]); |
229 | reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); | 226 | memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); |
230 | printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, | 227 | printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, |
231 | nodedata_phys + pgdat_size - 1); | 228 | nodedata_phys + pgdat_size - 1); |
232 | nid = phys_to_nid(nodedata_phys); | 229 | nid = phys_to_nid(nodedata_phys); |
@@ -238,47 +235,6 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | |||
238 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | 235 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; |
239 | NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; | 236 | NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; |
240 | 237 | ||
241 | #ifndef CONFIG_NO_BOOTMEM | ||
242 | NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; | ||
243 | |||
244 | /* | ||
245 | * Find a place for the bootmem map | ||
246 | * nodedata_phys could be on other nodes by alloc_bootmem, | ||
247 | * so need to sure bootmap_start not to be small, otherwise | ||
248 | * early_node_mem will get that with find_e820_area instead | ||
249 | * of alloc_bootmem, that could clash with reserved range | ||
250 | */ | ||
251 | bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); | ||
252 | bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
253 | /* | ||
254 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node like | ||
255 | * to use that to align to PAGE_SIZE | ||
256 | */ | ||
257 | bootmap = early_node_mem(nodeid, bootmap_start, end, | ||
258 | bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); | ||
259 | if (bootmap == NULL) { | ||
260 | free_early(nodedata_phys, nodedata_phys + pgdat_size); | ||
261 | node_data[nodeid] = NULL; | ||
262 | return; | ||
263 | } | ||
264 | bootmap_start = __pa(bootmap); | ||
265 | reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT), | ||
266 | "BOOTMAP"); | ||
267 | |||
268 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
269 | bootmap_start >> PAGE_SHIFT, | ||
270 | start_pfn, last_pfn); | ||
271 | |||
272 | printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", | ||
273 | bootmap_start, bootmap_start + bootmap_size - 1, | ||
274 | bootmap_pages); | ||
275 | nid = phys_to_nid(bootmap_start); | ||
276 | if (nid != nodeid) | ||
277 | printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); | ||
278 | |||
279 | free_bootmem_with_active_regions(nodeid, end); | ||
280 | #endif | ||
281 | |||
282 | node_set_online(nodeid); | 238 | node_set_online(nodeid); |
283 | } | 239 | } |
284 | 240 | ||
@@ -416,7 +372,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
416 | nr_nodes = MAX_NUMNODES; | 372 | nr_nodes = MAX_NUMNODES; |
417 | } | 373 | } |
418 | 374 | ||
419 | size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; | 375 | size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; |
420 | /* | 376 | /* |
421 | * Calculate the number of big nodes that can be allocated as a result | 377 | * Calculate the number of big nodes that can be allocated as a result |
422 | * of consolidating the remainder. | 378 | * of consolidating the remainder. |
@@ -452,7 +408,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
452 | * non-reserved memory is less than the per-node size. | 408 | * non-reserved memory is less than the per-node size. |
453 | */ | 409 | */ |
454 | while (end - physnodes[i].start - | 410 | while (end - physnodes[i].start - |
455 | e820_hole_size(physnodes[i].start, end) < size) { | 411 | memblock_x86_hole_size(physnodes[i].start, end) < size) { |
456 | end += FAKE_NODE_MIN_SIZE; | 412 | end += FAKE_NODE_MIN_SIZE; |
457 | if (end > physnodes[i].end) { | 413 | if (end > physnodes[i].end) { |
458 | end = physnodes[i].end; | 414 | end = physnodes[i].end; |
@@ -466,7 +422,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
466 | * this one must extend to the boundary. | 422 | * this one must extend to the boundary. |
467 | */ | 423 | */ |
468 | if (end < dma32_end && dma32_end - end - | 424 | if (end < dma32_end && dma32_end - end - |
469 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | 425 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
470 | end = dma32_end; | 426 | end = dma32_end; |
471 | 427 | ||
472 | /* | 428 | /* |
@@ -475,7 +431,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
475 | * physical node. | 431 | * physical node. |
476 | */ | 432 | */ |
477 | if (physnodes[i].end - end - | 433 | if (physnodes[i].end - end - |
478 | e820_hole_size(end, physnodes[i].end) < size) | 434 | memblock_x86_hole_size(end, physnodes[i].end) < size) |
479 | end = physnodes[i].end; | 435 | end = physnodes[i].end; |
480 | 436 | ||
481 | /* | 437 | /* |
@@ -503,7 +459,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | |||
503 | { | 459 | { |
504 | u64 end = start + size; | 460 | u64 end = start + size; |
505 | 461 | ||
506 | while (end - start - e820_hole_size(start, end) < size) { | 462 | while (end - start - memblock_x86_hole_size(start, end) < size) { |
507 | end += FAKE_NODE_MIN_SIZE; | 463 | end += FAKE_NODE_MIN_SIZE; |
508 | if (end > max_addr) { | 464 | if (end > max_addr) { |
509 | end = max_addr; | 465 | end = max_addr; |
@@ -532,7 +488,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | |||
532 | * creates a uniform distribution of node sizes across the entire | 488 | * creates a uniform distribution of node sizes across the entire |
533 | * machine (but not necessarily over physical nodes). | 489 | * machine (but not necessarily over physical nodes). |
534 | */ | 490 | */ |
535 | min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / | 491 | min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / |
536 | MAX_NUMNODES; | 492 | MAX_NUMNODES; |
537 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | 493 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); |
538 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | 494 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) |
@@ -565,7 +521,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | |||
565 | * this one must extend to the boundary. | 521 | * this one must extend to the boundary. |
566 | */ | 522 | */ |
567 | if (end < dma32_end && dma32_end - end - | 523 | if (end < dma32_end && dma32_end - end - |
568 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | 524 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) |
569 | end = dma32_end; | 525 | end = dma32_end; |
570 | 526 | ||
571 | /* | 527 | /* |
@@ -574,7 +530,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | |||
574 | * physical node. | 530 | * physical node. |
575 | */ | 531 | */ |
576 | if (physnodes[i].end - end - | 532 | if (physnodes[i].end - end - |
577 | e820_hole_size(end, physnodes[i].end) < size) | 533 | memblock_x86_hole_size(end, physnodes[i].end) < size) |
578 | end = physnodes[i].end; | 534 | end = physnodes[i].end; |
579 | 535 | ||
580 | /* | 536 | /* |
@@ -638,7 +594,7 @@ static int __init numa_emulation(unsigned long start_pfn, | |||
638 | */ | 594 | */ |
639 | remove_all_active_ranges(); | 595 | remove_all_active_ranges(); |
640 | for_each_node_mask(i, node_possible_map) { | 596 | for_each_node_mask(i, node_possible_map) { |
641 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | 597 | memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, |
642 | nodes[i].end >> PAGE_SHIFT); | 598 | nodes[i].end >> PAGE_SHIFT); |
643 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | 599 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); |
644 | } | 600 | } |
@@ -691,7 +647,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | |||
691 | node_set(0, node_possible_map); | 647 | node_set(0, node_possible_map); |
692 | for (i = 0; i < nr_cpu_ids; i++) | 648 | for (i = 0; i < nr_cpu_ids; i++) |
693 | numa_set_node(i, 0); | 649 | numa_set_node(i, 0); |
694 | e820_register_active_regions(0, start_pfn, last_pfn); | 650 | memblock_x86_register_active_regions(0, start_pfn, last_pfn); |
695 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); | 651 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); |
696 | } | 652 | } |
697 | 653 | ||
@@ -703,9 +659,7 @@ unsigned long __init numa_free_all_bootmem(void) | |||
703 | for_each_online_node(i) | 659 | for_each_online_node(i) |
704 | pages += free_all_bootmem_node(NODE_DATA(i)); | 660 | pages += free_all_bootmem_node(NODE_DATA(i)); |
705 | 661 | ||
706 | #ifdef CONFIG_NO_BOOTMEM | ||
707 | pages += free_all_memory_core_early(MAX_NUMNODES); | 662 | pages += free_all_memory_core_early(MAX_NUMNODES); |
708 | #endif | ||
709 | 663 | ||
710 | return pages; | 664 | return pages; |
711 | } | 665 | } |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..8be8c7d7bc89 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) | |||
87 | #define UNSHARED_PTRS_PER_PGD \ | 87 | #define UNSHARED_PTRS_PER_PGD \ |
88 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) | 88 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) |
89 | 89 | ||
90 | static void pgd_ctor(pgd_t *pgd) | 90 | |
91 | static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) | ||
92 | { | ||
93 | BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); | ||
94 | virt_to_page(pgd)->index = (pgoff_t)mm; | ||
95 | } | ||
96 | |||
97 | struct mm_struct *pgd_page_get_mm(struct page *page) | ||
98 | { | ||
99 | return (struct mm_struct *)page->index; | ||
100 | } | ||
101 | |||
102 | static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) | ||
91 | { | 103 | { |
92 | /* If the pgd points to a shared pagetable level (either the | 104 | /* If the pgd points to a shared pagetable level (either the |
93 | ptes in non-PAE, or shared PMD in PAE), then just copy the | 105 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd) | |||
98 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, | 110 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, |
99 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | 111 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
100 | KERNEL_PGD_PTRS); | 112 | KERNEL_PGD_PTRS); |
101 | paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, | ||
102 | __pa(swapper_pg_dir) >> PAGE_SHIFT, | ||
103 | KERNEL_PGD_BOUNDARY, | ||
104 | KERNEL_PGD_PTRS); | ||
105 | } | 113 | } |
106 | 114 | ||
107 | /* list required to sync kernel mapping updates */ | 115 | /* list required to sync kernel mapping updates */ |
108 | if (!SHARED_KERNEL_PMD) | 116 | if (!SHARED_KERNEL_PMD) { |
117 | pgd_set_mm(pgd, mm); | ||
109 | pgd_list_add(pgd); | 118 | pgd_list_add(pgd); |
119 | } | ||
110 | } | 120 | } |
111 | 121 | ||
112 | static void pgd_dtor(pgd_t *pgd) | 122 | static void pgd_dtor(pgd_t *pgd) |
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
272 | */ | 282 | */ |
273 | spin_lock_irqsave(&pgd_lock, flags); | 283 | spin_lock_irqsave(&pgd_lock, flags); |
274 | 284 | ||
275 | pgd_ctor(pgd); | 285 | pgd_ctor(mm, pgd); |
276 | pgd_prepopulate_pmd(mm, pgd, pmds); | 286 | pgd_prepopulate_pmd(mm, pgd, pmds); |
277 | 287 | ||
278 | spin_unlock_irqrestore(&pgd_lock, flags); | 288 | spin_unlock_irqrestore(&pgd_lock, flags); |
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 9324f13492d5..a17dffd136c1 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c | |||
@@ -25,6 +25,7 @@ | |||
25 | */ | 25 | */ |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
28 | #include <linux/memblock.h> | ||
28 | #include <linux/mmzone.h> | 29 | #include <linux/mmzone.h> |
29 | #include <linux/acpi.h> | 30 | #include <linux/acpi.h> |
30 | #include <linux/nodemask.h> | 31 | #include <linux/nodemask.h> |
@@ -264,7 +265,7 @@ int __init get_memcfg_from_srat(void) | |||
264 | if (node_read_chunk(chunk->nid, chunk)) | 265 | if (node_read_chunk(chunk->nid, chunk)) |
265 | continue; | 266 | continue; |
266 | 267 | ||
267 | e820_register_active_regions(chunk->nid, chunk->start_pfn, | 268 | memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, |
268 | min(chunk->end_pfn, max_pfn)); | 269 | min(chunk->end_pfn, max_pfn)); |
269 | } | 270 | } |
270 | /* for out of order entries in SRAT */ | 271 | /* for out of order entries in SRAT */ |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 9c0d0d399c30..a35cb9d8b060 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/topology.h> | 17 | #include <linux/topology.h> |
18 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
19 | #include <linux/memblock.h> | ||
19 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
20 | #include <asm/proto.h> | 21 | #include <asm/proto.h> |
21 | #include <asm/numa.h> | 22 | #include <asm/numa.h> |
@@ -98,15 +99,15 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | |||
98 | unsigned long phys; | 99 | unsigned long phys; |
99 | 100 | ||
100 | length = slit->header.length; | 101 | length = slit->header.length; |
101 | phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length, | 102 | phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, |
102 | PAGE_SIZE); | 103 | PAGE_SIZE); |
103 | 104 | ||
104 | if (phys == -1L) | 105 | if (phys == MEMBLOCK_ERROR) |
105 | panic(" Can not save slit!\n"); | 106 | panic(" Can not save slit!\n"); |
106 | 107 | ||
107 | acpi_slit = __va(phys); | 108 | acpi_slit = __va(phys); |
108 | memcpy(acpi_slit, slit, length); | 109 | memcpy(acpi_slit, slit, length); |
109 | reserve_early(phys, phys + length, "ACPI SLIT"); | 110 | memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); |
110 | } | 111 | } |
111 | 112 | ||
112 | /* Callback for Proximity Domain -> x2APIC mapping */ | 113 | /* Callback for Proximity Domain -> x2APIC mapping */ |
@@ -324,7 +325,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) | |||
324 | pxmram = 0; | 325 | pxmram = 0; |
325 | } | 326 | } |
326 | 327 | ||
327 | e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); | 328 | e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); |
328 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | 329 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ |
329 | if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { | 330 | if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { |
330 | printk(KERN_ERR | 331 | printk(KERN_ERR |
@@ -421,7 +422,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) | |||
421 | } | 422 | } |
422 | 423 | ||
423 | for (i = 0; i < num_node_memblks; i++) | 424 | for (i = 0; i < num_node_memblks; i++) |
424 | e820_register_active_regions(memblk_nodeid[i], | 425 | memblock_x86_register_active_regions(memblk_nodeid[i], |
425 | node_memblk_range[i].start >> PAGE_SHIFT, | 426 | node_memblk_range[i].start >> PAGE_SHIFT, |
426 | node_memblk_range[i].end >> PAGE_SHIFT); | 427 | node_memblk_range[i].end >> PAGE_SHIFT); |
427 | 428 | ||
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab6667..49358481c733 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/smp.h> | 5 | #include <linux/smp.h> |
6 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
8 | #include <linux/cpu.h> | ||
8 | 9 | ||
9 | #include <asm/tlbflush.h> | 10 | #include <asm/tlbflush.h> |
10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
@@ -52,6 +53,8 @@ union smp_flush_state { | |||
52 | want false sharing in the per cpu data segment. */ | 53 | want false sharing in the per cpu data segment. */ |
53 | static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; | 54 | static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; |
54 | 55 | ||
56 | static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); | ||
57 | |||
55 | /* | 58 | /* |
56 | * We cannot call mmdrop() because we are in interrupt context, | 59 | * We cannot call mmdrop() because we are in interrupt context, |
57 | * instead update mm->cpu_vm_mask. | 60 | * instead update mm->cpu_vm_mask. |
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
173 | union smp_flush_state *f; | 176 | union smp_flush_state *f; |
174 | 177 | ||
175 | /* Caller has disabled preemption */ | 178 | /* Caller has disabled preemption */ |
176 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | 179 | sender = this_cpu_read(tlb_vector_offset); |
177 | f = &flush_state[sender]; | 180 | f = &flush_state[sender]; |
178 | 181 | ||
179 | /* | 182 | /* |
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
218 | flush_tlb_others_ipi(cpumask, mm, va); | 221 | flush_tlb_others_ipi(cpumask, mm, va); |
219 | } | 222 | } |
220 | 223 | ||
224 | static void __cpuinit calculate_tlb_offset(void) | ||
225 | { | ||
226 | int cpu, node, nr_node_vecs; | ||
227 | /* | ||
228 | * we are changing tlb_vector_offset for each CPU in runtime, but this | ||
229 | * will not cause inconsistency, as the write is atomic under X86. we | ||
230 | * might see more lock contentions in a short time, but after all CPU's | ||
231 | * tlb_vector_offset are changed, everything should go normal | ||
232 | * | ||
233 | * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might | ||
234 | * waste some vectors. | ||
235 | **/ | ||
236 | if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) | ||
237 | nr_node_vecs = 1; | ||
238 | else | ||
239 | nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; | ||
240 | |||
241 | for_each_online_node(node) { | ||
242 | int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * | ||
243 | nr_node_vecs; | ||
244 | int cpu_offset = 0; | ||
245 | for_each_cpu(cpu, cpumask_of_node(node)) { | ||
246 | per_cpu(tlb_vector_offset, cpu) = node_offset + | ||
247 | cpu_offset; | ||
248 | cpu_offset++; | ||
249 | cpu_offset = cpu_offset % nr_node_vecs; | ||
250 | } | ||
251 | } | ||
252 | } | ||
253 | |||
254 | static int tlb_cpuhp_notify(struct notifier_block *n, | ||
255 | unsigned long action, void *hcpu) | ||
256 | { | ||
257 | switch (action & 0xf) { | ||
258 | case CPU_ONLINE: | ||
259 | case CPU_DEAD: | ||
260 | calculate_tlb_offset(); | ||
261 | } | ||
262 | return NOTIFY_OK; | ||
263 | } | ||
264 | |||
221 | static int __cpuinit init_smp_flush(void) | 265 | static int __cpuinit init_smp_flush(void) |
222 | { | 266 | { |
223 | int i; | 267 | int i; |
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void) | |||
225 | for (i = 0; i < ARRAY_SIZE(flush_state); i++) | 269 | for (i = 0; i < ARRAY_SIZE(flush_state); i++) |
226 | raw_spin_lock_init(&flush_state[i].tlbstate_lock); | 270 | raw_spin_lock_init(&flush_state[i].tlbstate_lock); |
227 | 271 | ||
272 | calculate_tlb_offset(); | ||
273 | hotcpu_notifier(tlb_cpuhp_notify, 0); | ||
228 | return 0; | 274 | return 0; |
229 | } | 275 | } |
230 | core_initcall(init_smp_flush); | 276 | core_initcall(init_smp_flush); |
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 3855096c59b8..2d49d4e19a36 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <asm/ptrace.h> | 14 | #include <asm/ptrace.h> |
15 | #include <asm/uaccess.h> | 15 | #include <asm/uaccess.h> |
16 | #include <asm/stacktrace.h> | 16 | #include <asm/stacktrace.h> |
17 | #include <linux/compat.h> | ||
17 | 18 | ||
18 | static void backtrace_warning_symbol(void *data, char *msg, | 19 | static void backtrace_warning_symbol(void *data, char *msg, |
19 | unsigned long symbol) | 20 | unsigned long symbol) |
@@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = { | |||
48 | .walk_stack = print_context_stack, | 49 | .walk_stack = print_context_stack, |
49 | }; | 50 | }; |
50 | 51 | ||
51 | struct frame_head { | 52 | #ifdef CONFIG_COMPAT |
52 | struct frame_head *bp; | 53 | static struct stack_frame_ia32 * |
53 | unsigned long ret; | 54 | dump_user_backtrace_32(struct stack_frame_ia32 *head) |
54 | } __attribute__((packed)); | ||
55 | |||
56 | static struct frame_head *dump_user_backtrace(struct frame_head *head) | ||
57 | { | 55 | { |
58 | struct frame_head bufhead[2]; | 56 | struct stack_frame_ia32 bufhead[2]; |
57 | struct stack_frame_ia32 *fp; | ||
59 | 58 | ||
60 | /* Also check accessibility of one struct frame_head beyond */ | 59 | /* Also check accessibility of one struct frame_head beyond */ |
61 | if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) | 60 | if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) |
@@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head) | |||
63 | if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) | 62 | if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) |
64 | return NULL; | 63 | return NULL; |
65 | 64 | ||
66 | oprofile_add_trace(bufhead[0].ret); | 65 | fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); |
66 | |||
67 | oprofile_add_trace(bufhead[0].return_address); | ||
68 | |||
69 | /* frame pointers should strictly progress back up the stack | ||
70 | * (towards higher addresses) */ | ||
71 | if (head >= fp) | ||
72 | return NULL; | ||
73 | |||
74 | return fp; | ||
75 | } | ||
76 | |||
77 | static inline int | ||
78 | x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) | ||
79 | { | ||
80 | struct stack_frame_ia32 *head; | ||
81 | |||
82 | /* User process is 32-bit */ | ||
83 | if (!current || !test_thread_flag(TIF_IA32)) | ||
84 | return 0; | ||
85 | |||
86 | head = (struct stack_frame_ia32 *) regs->bp; | ||
87 | while (depth-- && head) | ||
88 | head = dump_user_backtrace_32(head); | ||
89 | |||
90 | return 1; | ||
91 | } | ||
92 | |||
93 | #else | ||
94 | static inline int | ||
95 | x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) | ||
96 | { | ||
97 | return 0; | ||
98 | } | ||
99 | #endif /* CONFIG_COMPAT */ | ||
100 | |||
101 | static struct stack_frame *dump_user_backtrace(struct stack_frame *head) | ||
102 | { | ||
103 | struct stack_frame bufhead[2]; | ||
104 | |||
105 | /* Also check accessibility of one struct stack_frame beyond */ | ||
106 | if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) | ||
107 | return NULL; | ||
108 | if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) | ||
109 | return NULL; | ||
110 | |||
111 | oprofile_add_trace(bufhead[0].return_address); | ||
67 | 112 | ||
68 | /* frame pointers should strictly progress back up the stack | 113 | /* frame pointers should strictly progress back up the stack |
69 | * (towards higher addresses) */ | 114 | * (towards higher addresses) */ |
70 | if (head >= bufhead[0].bp) | 115 | if (head >= bufhead[0].next_frame) |
71 | return NULL; | 116 | return NULL; |
72 | 117 | ||
73 | return bufhead[0].bp; | 118 | return bufhead[0].next_frame; |
74 | } | 119 | } |
75 | 120 | ||
76 | void | 121 | void |
77 | x86_backtrace(struct pt_regs * const regs, unsigned int depth) | 122 | x86_backtrace(struct pt_regs * const regs, unsigned int depth) |
78 | { | 123 | { |
79 | struct frame_head *head = (struct frame_head *)frame_pointer(regs); | 124 | struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); |
80 | 125 | ||
81 | if (!user_mode_vm(regs)) { | 126 | if (!user_mode_vm(regs)) { |
82 | unsigned long stack = kernel_stack_pointer(regs); | 127 | unsigned long stack = kernel_stack_pointer(regs); |
@@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) | |||
86 | return; | 131 | return; |
87 | } | 132 | } |
88 | 133 | ||
134 | if (x86_backtrace_32(regs, depth)) | ||
135 | return; | ||
136 | |||
89 | while (depth-- && head) | 137 | while (depth-- && head) |
90 | head = dump_user_backtrace(head); | 138 | head = dump_user_backtrace(head); |
91 | } | 139 | } |
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f1575c9a2572..4e8baad36d37 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type) | |||
695 | return 1; | 695 | return 1; |
696 | } | 696 | } |
697 | 697 | ||
698 | /* in order to get sysfs right */ | ||
699 | static int using_nmi; | ||
700 | |||
701 | int __init op_nmi_init(struct oprofile_operations *ops) | 698 | int __init op_nmi_init(struct oprofile_operations *ops) |
702 | { | 699 | { |
703 | __u8 vendor = boot_cpu_data.x86_vendor; | 700 | __u8 vendor = boot_cpu_data.x86_vendor; |
@@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
705 | char *cpu_type = NULL; | 702 | char *cpu_type = NULL; |
706 | int ret = 0; | 703 | int ret = 0; |
707 | 704 | ||
708 | using_nmi = 0; | ||
709 | |||
710 | if (!cpu_has_apic) | 705 | if (!cpu_has_apic) |
711 | return -ENODEV; | 706 | return -ENODEV; |
712 | 707 | ||
@@ -731,6 +726,12 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
731 | case 0x11: | 726 | case 0x11: |
732 | cpu_type = "x86-64/family11h"; | 727 | cpu_type = "x86-64/family11h"; |
733 | break; | 728 | break; |
729 | case 0x12: | ||
730 | cpu_type = "x86-64/family12h"; | ||
731 | break; | ||
732 | case 0x14: | ||
733 | cpu_type = "x86-64/family14h"; | ||
734 | break; | ||
734 | default: | 735 | default: |
735 | return -ENODEV; | 736 | return -ENODEV; |
736 | } | 737 | } |
@@ -790,13 +791,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
790 | if (ret) | 791 | if (ret) |
791 | return ret; | 792 | return ret; |
792 | 793 | ||
793 | using_nmi = 1; | ||
794 | printk(KERN_INFO "oprofile: using NMI interrupt.\n"); | 794 | printk(KERN_INFO "oprofile: using NMI interrupt.\n"); |
795 | return 0; | 795 | return 0; |
796 | } | 796 | } |
797 | 797 | ||
798 | void op_nmi_exit(void) | 798 | void op_nmi_exit(void) |
799 | { | 799 | { |
800 | if (using_nmi) | 800 | exit_sysfs(); |
801 | exit_sysfs(); | ||
802 | } | 801 | } |
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b67a6b5aa8d4..a011bcc0f943 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c | |||
@@ -48,31 +48,53 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; | |||
48 | 48 | ||
49 | static u32 ibs_caps; | 49 | static u32 ibs_caps; |
50 | 50 | ||
51 | struct op_ibs_config { | 51 | struct ibs_config { |
52 | unsigned long op_enabled; | 52 | unsigned long op_enabled; |
53 | unsigned long fetch_enabled; | 53 | unsigned long fetch_enabled; |
54 | unsigned long max_cnt_fetch; | 54 | unsigned long max_cnt_fetch; |
55 | unsigned long max_cnt_op; | 55 | unsigned long max_cnt_op; |
56 | unsigned long rand_en; | 56 | unsigned long rand_en; |
57 | unsigned long dispatched_ops; | 57 | unsigned long dispatched_ops; |
58 | unsigned long branch_target; | ||
58 | }; | 59 | }; |
59 | 60 | ||
60 | static struct op_ibs_config ibs_config; | 61 | struct ibs_state { |
61 | static u64 ibs_op_ctl; | 62 | u64 ibs_op_ctl; |
63 | int branch_target; | ||
64 | unsigned long sample_size; | ||
65 | }; | ||
66 | |||
67 | static struct ibs_config ibs_config; | ||
68 | static struct ibs_state ibs_state; | ||
62 | 69 | ||
63 | /* | 70 | /* |
64 | * IBS cpuid feature detection | 71 | * IBS cpuid feature detection |
65 | */ | 72 | */ |
66 | 73 | ||
67 | #define IBS_CPUID_FEATURES 0x8000001b | 74 | #define IBS_CPUID_FEATURES 0x8000001b |
68 | 75 | ||
69 | /* | 76 | /* |
70 | * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but | 77 | * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but |
71 | * bit 0 is used to indicate the existence of IBS. | 78 | * bit 0 is used to indicate the existence of IBS. |
72 | */ | 79 | */ |
73 | #define IBS_CAPS_AVAIL (1LL<<0) | 80 | #define IBS_CAPS_AVAIL (1U<<0) |
74 | #define IBS_CAPS_RDWROPCNT (1LL<<3) | 81 | #define IBS_CAPS_FETCHSAM (1U<<1) |
75 | #define IBS_CAPS_OPCNT (1LL<<4) | 82 | #define IBS_CAPS_OPSAM (1U<<2) |
83 | #define IBS_CAPS_RDWROPCNT (1U<<3) | ||
84 | #define IBS_CAPS_OPCNT (1U<<4) | ||
85 | #define IBS_CAPS_BRNTRGT (1U<<5) | ||
86 | #define IBS_CAPS_OPCNTEXT (1U<<6) | ||
87 | |||
88 | #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | ||
89 | | IBS_CAPS_FETCHSAM \ | ||
90 | | IBS_CAPS_OPSAM) | ||
91 | |||
92 | /* | ||
93 | * IBS APIC setup | ||
94 | */ | ||
95 | #define IBSCTL 0x1cc | ||
96 | #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) | ||
97 | #define IBSCTL_LVT_OFFSET_MASK 0x0F | ||
76 | 98 | ||
77 | /* | 99 | /* |
78 | * IBS randomization macros | 100 | * IBS randomization macros |
@@ -92,12 +114,12 @@ static u32 get_ibs_caps(void) | |||
92 | /* check IBS cpuid feature flags */ | 114 | /* check IBS cpuid feature flags */ |
93 | max_level = cpuid_eax(0x80000000); | 115 | max_level = cpuid_eax(0x80000000); |
94 | if (max_level < IBS_CPUID_FEATURES) | 116 | if (max_level < IBS_CPUID_FEATURES) |
95 | return IBS_CAPS_AVAIL; | 117 | return IBS_CAPS_DEFAULT; |
96 | 118 | ||
97 | ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); | 119 | ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); |
98 | if (!(ibs_caps & IBS_CAPS_AVAIL)) | 120 | if (!(ibs_caps & IBS_CAPS_AVAIL)) |
99 | /* cpuid flags not valid */ | 121 | /* cpuid flags not valid */ |
100 | return IBS_CAPS_AVAIL; | 122 | return IBS_CAPS_DEFAULT; |
101 | 123 | ||
102 | return ibs_caps; | 124 | return ibs_caps; |
103 | } | 125 | } |
@@ -190,8 +212,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, | |||
190 | rdmsrl(MSR_AMD64_IBSOPCTL, ctl); | 212 | rdmsrl(MSR_AMD64_IBSOPCTL, ctl); |
191 | if (ctl & IBS_OP_VAL) { | 213 | if (ctl & IBS_OP_VAL) { |
192 | rdmsrl(MSR_AMD64_IBSOPRIP, val); | 214 | rdmsrl(MSR_AMD64_IBSOPRIP, val); |
193 | oprofile_write_reserve(&entry, regs, val, | 215 | oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, |
194 | IBS_OP_CODE, IBS_OP_SIZE); | 216 | ibs_state.sample_size); |
195 | oprofile_add_data64(&entry, val); | 217 | oprofile_add_data64(&entry, val); |
196 | rdmsrl(MSR_AMD64_IBSOPDATA, val); | 218 | rdmsrl(MSR_AMD64_IBSOPDATA, val); |
197 | oprofile_add_data64(&entry, val); | 219 | oprofile_add_data64(&entry, val); |
@@ -203,10 +225,14 @@ op_amd_handle_ibs(struct pt_regs * const regs, | |||
203 | oprofile_add_data64(&entry, val); | 225 | oprofile_add_data64(&entry, val); |
204 | rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); | 226 | rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); |
205 | oprofile_add_data64(&entry, val); | 227 | oprofile_add_data64(&entry, val); |
228 | if (ibs_state.branch_target) { | ||
229 | rdmsrl(MSR_AMD64_IBSBRTARGET, val); | ||
230 | oprofile_add_data(&entry, (unsigned long)val); | ||
231 | } | ||
206 | oprofile_write_commit(&entry); | 232 | oprofile_write_commit(&entry); |
207 | 233 | ||
208 | /* reenable the IRQ */ | 234 | /* reenable the IRQ */ |
209 | ctl = op_amd_randomize_ibs_op(ibs_op_ctl); | 235 | ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); |
210 | wrmsrl(MSR_AMD64_IBSOPCTL, ctl); | 236 | wrmsrl(MSR_AMD64_IBSOPCTL, ctl); |
211 | } | 237 | } |
212 | } | 238 | } |
@@ -219,21 +245,32 @@ static inline void op_amd_start_ibs(void) | |||
219 | if (!ibs_caps) | 245 | if (!ibs_caps) |
220 | return; | 246 | return; |
221 | 247 | ||
248 | memset(&ibs_state, 0, sizeof(ibs_state)); | ||
249 | |||
250 | /* | ||
251 | * Note: Since the max count settings may out of range we | ||
252 | * write back the actual used values so that userland can read | ||
253 | * it. | ||
254 | */ | ||
255 | |||
222 | if (ibs_config.fetch_enabled) { | 256 | if (ibs_config.fetch_enabled) { |
223 | val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; | 257 | val = ibs_config.max_cnt_fetch >> 4; |
258 | val = min(val, IBS_FETCH_MAX_CNT); | ||
259 | ibs_config.max_cnt_fetch = val << 4; | ||
224 | val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; | 260 | val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; |
225 | val |= IBS_FETCH_ENABLE; | 261 | val |= IBS_FETCH_ENABLE; |
226 | wrmsrl(MSR_AMD64_IBSFETCHCTL, val); | 262 | wrmsrl(MSR_AMD64_IBSFETCHCTL, val); |
227 | } | 263 | } |
228 | 264 | ||
229 | if (ibs_config.op_enabled) { | 265 | if (ibs_config.op_enabled) { |
230 | ibs_op_ctl = ibs_config.max_cnt_op >> 4; | 266 | val = ibs_config.max_cnt_op >> 4; |
231 | if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { | 267 | if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { |
232 | /* | 268 | /* |
233 | * IbsOpCurCnt not supported. See | 269 | * IbsOpCurCnt not supported. See |
234 | * op_amd_randomize_ibs_op() for details. | 270 | * op_amd_randomize_ibs_op() for details. |
235 | */ | 271 | */ |
236 | ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); | 272 | val = clamp(val, 0x0081ULL, 0xFF80ULL); |
273 | ibs_config.max_cnt_op = val << 4; | ||
237 | } else { | 274 | } else { |
238 | /* | 275 | /* |
239 | * The start value is randomized with a | 276 | * The start value is randomized with a |
@@ -241,13 +278,24 @@ static inline void op_amd_start_ibs(void) | |||
241 | * with the half of the randomized range. Also | 278 | * with the half of the randomized range. Also |
242 | * avoid underflows. | 279 | * avoid underflows. |
243 | */ | 280 | */ |
244 | ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, | 281 | val += IBS_RANDOM_MAXCNT_OFFSET; |
245 | IBS_OP_MAX_CNT); | 282 | if (ibs_caps & IBS_CAPS_OPCNTEXT) |
283 | val = min(val, IBS_OP_MAX_CNT_EXT); | ||
284 | else | ||
285 | val = min(val, IBS_OP_MAX_CNT); | ||
286 | ibs_config.max_cnt_op = | ||
287 | (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; | ||
288 | } | ||
289 | val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); | ||
290 | val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; | ||
291 | val |= IBS_OP_ENABLE; | ||
292 | ibs_state.ibs_op_ctl = val; | ||
293 | ibs_state.sample_size = IBS_OP_SIZE; | ||
294 | if (ibs_config.branch_target) { | ||
295 | ibs_state.branch_target = 1; | ||
296 | ibs_state.sample_size++; | ||
246 | } | 297 | } |
247 | if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) | 298 | val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); |
248 | ibs_op_ctl |= IBS_OP_CNT_CTL; | ||
249 | ibs_op_ctl |= IBS_OP_ENABLE; | ||
250 | val = op_amd_randomize_ibs_op(ibs_op_ctl); | ||
251 | wrmsrl(MSR_AMD64_IBSOPCTL, val); | 299 | wrmsrl(MSR_AMD64_IBSOPCTL, val); |
252 | } | 300 | } |
253 | } | 301 | } |
@@ -266,6 +314,70 @@ static void op_amd_stop_ibs(void) | |||
266 | wrmsrl(MSR_AMD64_IBSOPCTL, 0); | 314 | wrmsrl(MSR_AMD64_IBSOPCTL, 0); |
267 | } | 315 | } |
268 | 316 | ||
317 | static inline int eilvt_is_available(int offset) | ||
318 | { | ||
319 | /* check if we may assign a vector */ | ||
320 | return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); | ||
321 | } | ||
322 | |||
323 | static inline int ibs_eilvt_valid(void) | ||
324 | { | ||
325 | int offset; | ||
326 | u64 val; | ||
327 | |||
328 | rdmsrl(MSR_AMD64_IBSCTL, val); | ||
329 | offset = val & IBSCTL_LVT_OFFSET_MASK; | ||
330 | |||
331 | if (!(val & IBSCTL_LVT_OFFSET_VALID)) { | ||
332 | pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", | ||
333 | smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); | ||
334 | return 0; | ||
335 | } | ||
336 | |||
337 | if (!eilvt_is_available(offset)) { | ||
338 | pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", | ||
339 | smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | return 1; | ||
344 | } | ||
345 | |||
346 | static inline int get_ibs_offset(void) | ||
347 | { | ||
348 | u64 val; | ||
349 | |||
350 | rdmsrl(MSR_AMD64_IBSCTL, val); | ||
351 | if (!(val & IBSCTL_LVT_OFFSET_VALID)) | ||
352 | return -EINVAL; | ||
353 | |||
354 | return val & IBSCTL_LVT_OFFSET_MASK; | ||
355 | } | ||
356 | |||
357 | static void setup_APIC_ibs(void) | ||
358 | { | ||
359 | int offset; | ||
360 | |||
361 | offset = get_ibs_offset(); | ||
362 | if (offset < 0) | ||
363 | goto failed; | ||
364 | |||
365 | if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) | ||
366 | return; | ||
367 | failed: | ||
368 | pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n", | ||
369 | smp_processor_id()); | ||
370 | } | ||
371 | |||
372 | static void clear_APIC_ibs(void) | ||
373 | { | ||
374 | int offset; | ||
375 | |||
376 | offset = get_ibs_offset(); | ||
377 | if (offset >= 0) | ||
378 | setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); | ||
379 | } | ||
380 | |||
269 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | 381 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX |
270 | 382 | ||
271 | static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, | 383 | static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, |
@@ -376,13 +488,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, | |||
376 | } | 488 | } |
377 | 489 | ||
378 | if (ibs_caps) | 490 | if (ibs_caps) |
379 | setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); | 491 | setup_APIC_ibs(); |
380 | } | 492 | } |
381 | 493 | ||
382 | static void op_amd_cpu_shutdown(void) | 494 | static void op_amd_cpu_shutdown(void) |
383 | { | 495 | { |
384 | if (ibs_caps) | 496 | if (ibs_caps) |
385 | setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); | 497 | clear_APIC_ibs(); |
386 | } | 498 | } |
387 | 499 | ||
388 | static int op_amd_check_ctrs(struct pt_regs * const regs, | 500 | static int op_amd_check_ctrs(struct pt_regs * const regs, |
@@ -445,16 +557,11 @@ static void op_amd_stop(struct op_msrs const * const msrs) | |||
445 | op_amd_stop_ibs(); | 557 | op_amd_stop_ibs(); |
446 | } | 558 | } |
447 | 559 | ||
448 | static int __init_ibs_nmi(void) | 560 | static int setup_ibs_ctl(int ibs_eilvt_off) |
449 | { | 561 | { |
450 | #define IBSCTL_LVTOFFSETVAL (1 << 8) | ||
451 | #define IBSCTL 0x1cc | ||
452 | struct pci_dev *cpu_cfg; | 562 | struct pci_dev *cpu_cfg; |
453 | int nodes; | 563 | int nodes; |
454 | u32 value = 0; | 564 | u32 value = 0; |
455 | u8 ibs_eilvt_off; | ||
456 | |||
457 | ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); | ||
458 | 565 | ||
459 | nodes = 0; | 566 | nodes = 0; |
460 | cpu_cfg = NULL; | 567 | cpu_cfg = NULL; |
@@ -466,24 +573,63 @@ static int __init_ibs_nmi(void) | |||
466 | break; | 573 | break; |
467 | ++nodes; | 574 | ++nodes; |
468 | pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off | 575 | pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off |
469 | | IBSCTL_LVTOFFSETVAL); | 576 | | IBSCTL_LVT_OFFSET_VALID); |
470 | pci_read_config_dword(cpu_cfg, IBSCTL, &value); | 577 | pci_read_config_dword(cpu_cfg, IBSCTL, &value); |
471 | if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { | 578 | if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { |
472 | pci_dev_put(cpu_cfg); | 579 | pci_dev_put(cpu_cfg); |
473 | printk(KERN_DEBUG "Failed to setup IBS LVT offset, " | 580 | printk(KERN_DEBUG "Failed to setup IBS LVT offset, " |
474 | "IBSCTL = 0x%08x", value); | 581 | "IBSCTL = 0x%08x\n", value); |
475 | return 1; | 582 | return -EINVAL; |
476 | } | 583 | } |
477 | } while (1); | 584 | } while (1); |
478 | 585 | ||
479 | if (!nodes) { | 586 | if (!nodes) { |
480 | printk(KERN_DEBUG "No CPU node configured for IBS"); | 587 | printk(KERN_DEBUG "No CPU node configured for IBS\n"); |
481 | return 1; | 588 | return -ENODEV; |
482 | } | 589 | } |
483 | 590 | ||
484 | return 0; | 591 | return 0; |
485 | } | 592 | } |
486 | 593 | ||
594 | static int force_ibs_eilvt_setup(void) | ||
595 | { | ||
596 | int i; | ||
597 | int ret; | ||
598 | |||
599 | /* find the next free available EILVT entry */ | ||
600 | for (i = 1; i < 4; i++) { | ||
601 | if (!eilvt_is_available(i)) | ||
602 | continue; | ||
603 | ret = setup_ibs_ctl(i); | ||
604 | if (ret) | ||
605 | return ret; | ||
606 | return 0; | ||
607 | } | ||
608 | |||
609 | printk(KERN_DEBUG "No EILVT entry available\n"); | ||
610 | |||
611 | return -EBUSY; | ||
612 | } | ||
613 | |||
614 | static int __init_ibs_nmi(void) | ||
615 | { | ||
616 | int ret; | ||
617 | |||
618 | if (ibs_eilvt_valid()) | ||
619 | return 0; | ||
620 | |||
621 | ret = force_ibs_eilvt_setup(); | ||
622 | if (ret) | ||
623 | return ret; | ||
624 | |||
625 | if (!ibs_eilvt_valid()) | ||
626 | return -EFAULT; | ||
627 | |||
628 | pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); | ||
629 | |||
630 | return 0; | ||
631 | } | ||
632 | |||
487 | /* initialize the APIC for the IBS interrupts if available */ | 633 | /* initialize the APIC for the IBS interrupts if available */ |
488 | static void init_ibs(void) | 634 | static void init_ibs(void) |
489 | { | 635 | { |
@@ -521,28 +667,33 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) | |||
521 | /* model specific files */ | 667 | /* model specific files */ |
522 | 668 | ||
523 | /* setup some reasonable defaults */ | 669 | /* setup some reasonable defaults */ |
670 | memset(&ibs_config, 0, sizeof(ibs_config)); | ||
524 | ibs_config.max_cnt_fetch = 250000; | 671 | ibs_config.max_cnt_fetch = 250000; |
525 | ibs_config.fetch_enabled = 0; | ||
526 | ibs_config.max_cnt_op = 250000; | 672 | ibs_config.max_cnt_op = 250000; |
527 | ibs_config.op_enabled = 0; | 673 | |
528 | ibs_config.dispatched_ops = 0; | 674 | if (ibs_caps & IBS_CAPS_FETCHSAM) { |
529 | 675 | dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); | |
530 | dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); | 676 | oprofilefs_create_ulong(sb, dir, "enable", |
531 | oprofilefs_create_ulong(sb, dir, "enable", | 677 | &ibs_config.fetch_enabled); |
532 | &ibs_config.fetch_enabled); | 678 | oprofilefs_create_ulong(sb, dir, "max_count", |
533 | oprofilefs_create_ulong(sb, dir, "max_count", | 679 | &ibs_config.max_cnt_fetch); |
534 | &ibs_config.max_cnt_fetch); | 680 | oprofilefs_create_ulong(sb, dir, "rand_enable", |
535 | oprofilefs_create_ulong(sb, dir, "rand_enable", | 681 | &ibs_config.rand_en); |
536 | &ibs_config.rand_en); | 682 | } |
537 | 683 | ||
538 | dir = oprofilefs_mkdir(sb, root, "ibs_op"); | 684 | if (ibs_caps & IBS_CAPS_OPSAM) { |
539 | oprofilefs_create_ulong(sb, dir, "enable", | 685 | dir = oprofilefs_mkdir(sb, root, "ibs_op"); |
540 | &ibs_config.op_enabled); | 686 | oprofilefs_create_ulong(sb, dir, "enable", |
541 | oprofilefs_create_ulong(sb, dir, "max_count", | 687 | &ibs_config.op_enabled); |
542 | &ibs_config.max_cnt_op); | 688 | oprofilefs_create_ulong(sb, dir, "max_count", |
543 | if (ibs_caps & IBS_CAPS_OPCNT) | 689 | &ibs_config.max_cnt_op); |
544 | oprofilefs_create_ulong(sb, dir, "dispatched_ops", | 690 | if (ibs_caps & IBS_CAPS_OPCNT) |
545 | &ibs_config.dispatched_ops); | 691 | oprofilefs_create_ulong(sb, dir, "dispatched_ops", |
692 | &ibs_config.dispatched_ops); | ||
693 | if (ibs_caps & IBS_CAPS_BRNTRGT) | ||
694 | oprofilefs_create_ulong(sb, dir, "branch_target", | ||
695 | &ibs_config.branch_target); | ||
696 | } | ||
546 | 697 | ||
547 | return 0; | 698 | return 0; |
548 | } | 699 | } |
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 8379c2c3d076..c4bb261c106e 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c | |||
@@ -65,16 +65,21 @@ pcibios_align_resource(void *data, const struct resource *res, | |||
65 | resource_size_t size, resource_size_t align) | 65 | resource_size_t size, resource_size_t align) |
66 | { | 66 | { |
67 | struct pci_dev *dev = data; | 67 | struct pci_dev *dev = data; |
68 | resource_size_t start = res->start; | 68 | resource_size_t start = round_down(res->end - size + 1, align); |
69 | 69 | ||
70 | if (res->flags & IORESOURCE_IO) { | 70 | if (res->flags & IORESOURCE_IO) { |
71 | if (skip_isa_ioresource_align(dev)) | 71 | |
72 | return start; | 72 | /* |
73 | if (start & 0x300) | 73 | * If we're avoiding ISA aliases, the largest contiguous I/O |
74 | start = (start + 0x3ff) & ~0x3ff; | 74 | * port space is 256 bytes. Clearing bits 9 and 10 preserves |
75 | * all 256-byte and smaller alignments, so the result will | ||
76 | * still be correctly aligned. | ||
77 | */ | ||
78 | if (!skip_isa_ioresource_align(dev)) | ||
79 | start &= ~0x300; | ||
75 | } else if (res->flags & IORESOURCE_MEM) { | 80 | } else if (res->flags & IORESOURCE_MEM) { |
76 | if (start < BIOS_END) | 81 | if (start < BIOS_END) |
77 | start = BIOS_END; | 82 | start = res->end; /* fail; no space */ |
78 | } | 83 | } |
79 | return start; | 84 | return start; |
80 | } | 85 | } |
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index f547ee05f715..9f9bfb705cf9 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c | |||
@@ -584,27 +584,28 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route | |||
584 | case PCI_DEVICE_ID_INTEL_ICH9_3: | 584 | case PCI_DEVICE_ID_INTEL_ICH9_3: |
585 | case PCI_DEVICE_ID_INTEL_ICH9_4: | 585 | case PCI_DEVICE_ID_INTEL_ICH9_4: |
586 | case PCI_DEVICE_ID_INTEL_ICH9_5: | 586 | case PCI_DEVICE_ID_INTEL_ICH9_5: |
587 | case PCI_DEVICE_ID_INTEL_TOLAPAI_0: | 587 | case PCI_DEVICE_ID_INTEL_EP80579_0: |
588 | case PCI_DEVICE_ID_INTEL_ICH10_0: | 588 | case PCI_DEVICE_ID_INTEL_ICH10_0: |
589 | case PCI_DEVICE_ID_INTEL_ICH10_1: | 589 | case PCI_DEVICE_ID_INTEL_ICH10_1: |
590 | case PCI_DEVICE_ID_INTEL_ICH10_2: | 590 | case PCI_DEVICE_ID_INTEL_ICH10_2: |
591 | case PCI_DEVICE_ID_INTEL_ICH10_3: | 591 | case PCI_DEVICE_ID_INTEL_ICH10_3: |
592 | case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: | ||
592 | r->name = "PIIX/ICH"; | 593 | r->name = "PIIX/ICH"; |
593 | r->get = pirq_piix_get; | 594 | r->get = pirq_piix_get; |
594 | r->set = pirq_piix_set; | 595 | r->set = pirq_piix_set; |
595 | return 1; | 596 | return 1; |
596 | } | 597 | } |
597 | 598 | ||
598 | if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && | 599 | if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && |
599 | (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { | 600 | (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { |
600 | r->name = "PIIX/ICH"; | 601 | r->name = "PIIX/ICH"; |
601 | r->get = pirq_piix_get; | 602 | r->get = pirq_piix_get; |
602 | r->set = pirq_piix_set; | 603 | r->set = pirq_piix_set; |
603 | return 1; | 604 | return 1; |
604 | } | 605 | } |
605 | 606 | ||
606 | if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && | 607 | if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && |
607 | (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { | 608 | (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { |
608 | r->name = "PIIX/ICH"; | 609 | r->name = "PIIX/ICH"; |
609 | r->get = pirq_piix_get; | 610 | r->get = pirq_piix_get; |
610 | r->set = pirq_piix_set; | 611 | r->set = pirq_piix_set; |
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index a918553ebc75..e282886616a0 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c | |||
@@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, | |||
65 | int end, u64 addr) | 65 | int end, u64 addr) |
66 | { | 66 | { |
67 | struct pci_mmcfg_region *new; | 67 | struct pci_mmcfg_region *new; |
68 | int num_buses; | ||
69 | struct resource *res; | 68 | struct resource *res; |
70 | 69 | ||
71 | if (addr == 0) | 70 | if (addr == 0) |
@@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, | |||
82 | 81 | ||
83 | list_add_sorted(new); | 82 | list_add_sorted(new); |
84 | 83 | ||
85 | num_buses = end - start + 1; | ||
86 | res = &new->res; | 84 | res = &new->res; |
87 | res->start = addr + PCI_MMCFG_BUS_OFFSET(start); | 85 | res->start = addr + PCI_MMCFG_BUS_OFFSET(start); |
88 | res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; | 86 | res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; |
89 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 87 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
90 | snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, | 88 | snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, |
91 | "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); | 89 | "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); |
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b34815408f58..13700ec8e2e4 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c | |||
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = { | |||
304 | 304 | ||
305 | int __init pci_olpc_init(void) | 305 | int __init pci_olpc_init(void) |
306 | { | 306 | { |
307 | printk(KERN_INFO "PCI: Using configuration type OLPC\n"); | 307 | printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n"); |
308 | raw_pci_ops = &pci_olpc_conf; | 308 | raw_pci_ops = &pci_olpc_conf; |
309 | is_lx = is_geode_lx(); | 309 | is_lx = is_geode_lx(); |
310 | return 0; | 310 | return 0; |
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile new file mode 100644 index 000000000000..7bf70b812fa2 --- /dev/null +++ b/arch/x86/platform/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # Platform specific code goes here | ||
2 | obj-y += efi/ | ||
3 | obj-y += mrst/ | ||
4 | obj-y += olpc/ | ||
5 | obj-y += scx200/ | ||
6 | obj-y += sfi/ | ||
7 | obj-y += visws/ | ||
8 | obj-y += uv/ | ||
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile new file mode 100644 index 000000000000..73b8be0f3675 --- /dev/null +++ b/arch/x86/platform/efi/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o | |||
diff --git a/arch/x86/kernel/efi.c b/arch/x86/platform/efi/efi.c index c2fa9b8b497e..0fe27d7c6258 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/platform/efi/efi.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/init.h> | 30 | #include <linux/init.h> |
31 | #include <linux/efi.h> | 31 | #include <linux/efi.h> |
32 | #include <linux/bootmem.h> | 32 | #include <linux/bootmem.h> |
33 | #include <linux/memblock.h> | ||
33 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
34 | #include <linux/uaccess.h> | 35 | #include <linux/uaccess.h> |
35 | #include <linux/time.h> | 36 | #include <linux/time.h> |
@@ -275,7 +276,7 @@ static void __init do_add_efi_memmap(void) | |||
275 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 276 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
276 | } | 277 | } |
277 | 278 | ||
278 | void __init efi_reserve_early(void) | 279 | void __init efi_memblock_x86_reserve_range(void) |
279 | { | 280 | { |
280 | unsigned long pmap; | 281 | unsigned long pmap; |
281 | 282 | ||
@@ -290,7 +291,7 @@ void __init efi_reserve_early(void) | |||
290 | boot_params.efi_info.efi_memdesc_size; | 291 | boot_params.efi_info.efi_memdesc_size; |
291 | memmap.desc_version = boot_params.efi_info.efi_memdesc_version; | 292 | memmap.desc_version = boot_params.efi_info.efi_memdesc_version; |
292 | memmap.desc_size = boot_params.efi_info.efi_memdesc_size; | 293 | memmap.desc_size = boot_params.efi_info.efi_memdesc_size; |
293 | reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, | 294 | memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, |
294 | "EFI memmap"); | 295 | "EFI memmap"); |
295 | } | 296 | } |
296 | 297 | ||
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/platform/efi/efi_32.c index 5cab48ee61a4..5cab48ee61a4 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c | |||
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac0621a7ac3d..ac0621a7ac3d 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c | |||
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index fbe66e626c09..fbe66e626c09 100644 --- a/arch/x86/kernel/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S | |||
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 4c07ccab8146..4c07ccab8146 100644 --- a/arch/x86/kernel/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S | |||
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile new file mode 100644 index 000000000000..efbbc552fa95 --- /dev/null +++ b/arch/x86/platform/mrst/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-$(CONFIG_X86_MRST) += mrst.o | |||
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/platform/mrst/mrst.c index 79ae68154e87..79ae68154e87 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/platform/mrst/mrst.c | |||
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile new file mode 100644 index 000000000000..c31b8fcb5a86 --- /dev/null +++ b/arch/x86/platform/olpc/Makefile | |||
@@ -0,0 +1,3 @@ | |||
1 | obj-$(CONFIG_OLPC) += olpc.o | ||
2 | obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o | ||
3 | obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o | ||
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c new file mode 100644 index 000000000000..f5442c03abc3 --- /dev/null +++ b/arch/x86/platform/olpc/olpc-xo1.c | |||
@@ -0,0 +1,140 @@ | |||
1 | /* | ||
2 | * Support for features of the OLPC XO-1 laptop | ||
3 | * | ||
4 | * Copyright (C) 2010 One Laptop per Child | ||
5 | * Copyright (C) 2006 Red Hat, Inc. | ||
6 | * Copyright (C) 2006 Advanced Micro Devices, Inc. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | */ | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/pci.h> | ||
16 | #include <linux/pci_ids.h> | ||
17 | #include <linux/platform_device.h> | ||
18 | #include <linux/pm.h> | ||
19 | |||
20 | #include <asm/io.h> | ||
21 | #include <asm/olpc.h> | ||
22 | |||
23 | #define DRV_NAME "olpc-xo1" | ||
24 | |||
25 | #define PMS_BAR 4 | ||
26 | #define ACPI_BAR 5 | ||
27 | |||
28 | /* PMC registers (PMS block) */ | ||
29 | #define PM_SCLK 0x10 | ||
30 | #define PM_IN_SLPCTL 0x20 | ||
31 | #define PM_WKXD 0x34 | ||
32 | #define PM_WKD 0x30 | ||
33 | #define PM_SSC 0x54 | ||
34 | |||
35 | /* PM registers (ACPI block) */ | ||
36 | #define PM1_CNT 0x08 | ||
37 | #define PM_GPE0_STS 0x18 | ||
38 | |||
39 | static unsigned long acpi_base; | ||
40 | static unsigned long pms_base; | ||
41 | |||
42 | static void xo1_power_off(void) | ||
43 | { | ||
44 | printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); | ||
45 | |||
46 | /* Enable all of these controls with 0 delay */ | ||
47 | outl(0x40000000, pms_base + PM_SCLK); | ||
48 | outl(0x40000000, pms_base + PM_IN_SLPCTL); | ||
49 | outl(0x40000000, pms_base + PM_WKXD); | ||
50 | outl(0x40000000, pms_base + PM_WKD); | ||
51 | |||
52 | /* Clear status bits (possibly unnecessary) */ | ||
53 | outl(0x0002ffff, pms_base + PM_SSC); | ||
54 | outl(0xffffffff, acpi_base + PM_GPE0_STS); | ||
55 | |||
56 | /* Write SLP_EN bit to start the machinery */ | ||
57 | outl(0x00002000, acpi_base + PM1_CNT); | ||
58 | } | ||
59 | |||
60 | /* Read the base addresses from the PCI BAR info */ | ||
61 | static int __devinit setup_bases(struct pci_dev *pdev) | ||
62 | { | ||
63 | int r; | ||
64 | |||
65 | r = pci_enable_device_io(pdev); | ||
66 | if (r) { | ||
67 | dev_err(&pdev->dev, "can't enable device IO\n"); | ||
68 | return r; | ||
69 | } | ||
70 | |||
71 | r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); | ||
72 | if (r) { | ||
73 | dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); | ||
74 | return r; | ||
75 | } | ||
76 | |||
77 | r = pci_request_region(pdev, PMS_BAR, DRV_NAME); | ||
78 | if (r) { | ||
79 | dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); | ||
80 | pci_release_region(pdev, ACPI_BAR); | ||
81 | return r; | ||
82 | } | ||
83 | |||
84 | acpi_base = pci_resource_start(pdev, ACPI_BAR); | ||
85 | pms_base = pci_resource_start(pdev, PMS_BAR); | ||
86 | |||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | static int __devinit olpc_xo1_probe(struct platform_device *pdev) | ||
91 | { | ||
92 | struct pci_dev *pcidev; | ||
93 | int r; | ||
94 | |||
95 | pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, | ||
96 | NULL); | ||
97 | if (!pdev) | ||
98 | return -ENODEV; | ||
99 | |||
100 | r = setup_bases(pcidev); | ||
101 | if (r) | ||
102 | return r; | ||
103 | |||
104 | pm_power_off = xo1_power_off; | ||
105 | |||
106 | printk(KERN_INFO "OLPC XO-1 support registered\n"); | ||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | static int __devexit olpc_xo1_remove(struct platform_device *pdev) | ||
111 | { | ||
112 | pm_power_off = NULL; | ||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static struct platform_driver olpc_xo1_driver = { | ||
117 | .driver = { | ||
118 | .name = DRV_NAME, | ||
119 | .owner = THIS_MODULE, | ||
120 | }, | ||
121 | .probe = olpc_xo1_probe, | ||
122 | .remove = __devexit_p(olpc_xo1_remove), | ||
123 | }; | ||
124 | |||
125 | static int __init olpc_xo1_init(void) | ||
126 | { | ||
127 | return platform_driver_register(&olpc_xo1_driver); | ||
128 | } | ||
129 | |||
130 | static void __exit olpc_xo1_exit(void) | ||
131 | { | ||
132 | platform_driver_unregister(&olpc_xo1_driver); | ||
133 | } | ||
134 | |||
135 | MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); | ||
136 | MODULE_LICENSE("GPL"); | ||
137 | MODULE_ALIAS("platform:olpc-xo1"); | ||
138 | |||
139 | module_init(olpc_xo1_init); | ||
140 | module_exit(olpc_xo1_exit); | ||
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/platform/olpc/olpc.c index 0e0cdde519be..edaf3fe8dc5e 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/platform/olpc/olpc.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
18 | #include <linux/io.h> | 18 | #include <linux/io.h> |
19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
20 | #include <linux/platform_device.h> | ||
20 | 21 | ||
21 | #include <asm/geode.h> | 22 | #include <asm/geode.h> |
22 | #include <asm/setup.h> | 23 | #include <asm/setup.h> |
@@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, | |||
114 | unsigned long flags; | 115 | unsigned long flags; |
115 | int ret = -EIO; | 116 | int ret = -EIO; |
116 | int i; | 117 | int i; |
118 | int restarts = 0; | ||
117 | 119 | ||
118 | spin_lock_irqsave(&ec_lock, flags); | 120 | spin_lock_irqsave(&ec_lock, flags); |
119 | 121 | ||
@@ -169,7 +171,9 @@ restart: | |||
169 | if (wait_on_obf(0x6c, 1)) { | 171 | if (wait_on_obf(0x6c, 1)) { |
170 | printk(KERN_ERR "olpc-ec: timeout waiting for" | 172 | printk(KERN_ERR "olpc-ec: timeout waiting for" |
171 | " EC to provide data!\n"); | 173 | " EC to provide data!\n"); |
172 | goto restart; | 174 | if (restarts++ < 10) |
175 | goto restart; | ||
176 | goto err; | ||
173 | } | 177 | } |
174 | outbuf[i] = inb(0x68); | 178 | outbuf[i] = inb(0x68); |
175 | pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); | 179 | pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); |
@@ -183,8 +187,21 @@ err: | |||
183 | } | 187 | } |
184 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); | 188 | EXPORT_SYMBOL_GPL(olpc_ec_cmd); |
185 | 189 | ||
186 | #ifdef CONFIG_OLPC_OPENFIRMWARE | 190 | static bool __init check_ofw_architecture(void) |
187 | static void __init platform_detect(void) | 191 | { |
192 | size_t propsize; | ||
193 | char olpc_arch[5]; | ||
194 | const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; | ||
195 | void *res[] = { &propsize }; | ||
196 | |||
197 | if (olpc_ofw("getprop", args, res)) { | ||
198 | printk(KERN_ERR "ofw: getprop call failed!\n"); | ||
199 | return false; | ||
200 | } | ||
201 | return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; | ||
202 | } | ||
203 | |||
204 | static u32 __init get_board_revision(void) | ||
188 | { | 205 | { |
189 | size_t propsize; | 206 | size_t propsize; |
190 | __be32 rev; | 207 | __be32 rev; |
@@ -193,45 +210,43 @@ static void __init platform_detect(void) | |||
193 | 210 | ||
194 | if (olpc_ofw("getprop", args, res) || propsize != 4) { | 211 | if (olpc_ofw("getprop", args, res) || propsize != 4) { |
195 | printk(KERN_ERR "ofw: getprop call failed!\n"); | 212 | printk(KERN_ERR "ofw: getprop call failed!\n"); |
196 | rev = cpu_to_be32(0); | 213 | return cpu_to_be32(0); |
197 | } | 214 | } |
198 | olpc_platform_info.boardrev = be32_to_cpu(rev); | 215 | return be32_to_cpu(rev); |
199 | } | 216 | } |
200 | #else | 217 | |
201 | static void __init platform_detect(void) | 218 | static bool __init platform_detect(void) |
202 | { | 219 | { |
203 | /* stopgap until OFW support is added to the kernel */ | 220 | if (!check_ofw_architecture()) |
204 | olpc_platform_info.boardrev = olpc_board(0xc2); | 221 | return false; |
222 | olpc_platform_info.flags |= OLPC_F_PRESENT; | ||
223 | olpc_platform_info.boardrev = get_board_revision(); | ||
224 | return true; | ||
205 | } | 225 | } |
206 | #endif | ||
207 | 226 | ||
208 | static int __init olpc_init(void) | 227 | static int __init add_xo1_platform_devices(void) |
209 | { | 228 | { |
210 | unsigned char *romsig; | 229 | struct platform_device *pdev; |
211 | 230 | ||
212 | /* The ioremap check is dangerous; limit what we run it on */ | 231 | pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); |
213 | if (!is_geode() || cs5535_has_vsa2()) | 232 | if (IS_ERR(pdev)) |
214 | return 0; | 233 | return PTR_ERR(pdev); |
215 | 234 | ||
216 | spin_lock_init(&ec_lock); | 235 | pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); |
236 | if (IS_ERR(pdev)) | ||
237 | return PTR_ERR(pdev); | ||
217 | 238 | ||
218 | romsig = ioremap(0xffffffc0, 16); | 239 | return 0; |
219 | if (!romsig) | 240 | } |
220 | return 0; | ||
221 | 241 | ||
222 | if (strncmp(romsig, "CL1 Q", 7)) | 242 | static int __init olpc_init(void) |
223 | goto unmap; | 243 | { |
224 | if (strncmp(romsig+6, romsig+13, 3)) { | 244 | int r = 0; |
225 | printk(KERN_INFO "OLPC BIOS signature looks invalid. " | ||
226 | "Assuming not OLPC\n"); | ||
227 | goto unmap; | ||
228 | } | ||
229 | 245 | ||
230 | printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); | 246 | if (!olpc_ofw_present() || !platform_detect()) |
231 | olpc_platform_info.flags |= OLPC_F_PRESENT; | 247 | return 0; |
232 | 248 | ||
233 | /* get the platform revision */ | 249 | spin_lock_init(&ec_lock); |
234 | platform_detect(); | ||
235 | 250 | ||
236 | /* assume B1 and above models always have a DCON */ | 251 | /* assume B1 and above models always have a DCON */ |
237 | if (olpc_board_at_least(olpc_board(0xb1))) | 252 | if (olpc_board_at_least(olpc_board(0xb1))) |
@@ -242,8 +257,10 @@ static int __init olpc_init(void) | |||
242 | (unsigned char *) &olpc_platform_info.ecver, 1); | 257 | (unsigned char *) &olpc_platform_info.ecver, 1); |
243 | 258 | ||
244 | #ifdef CONFIG_PCI_OLPC | 259 | #ifdef CONFIG_PCI_OLPC |
245 | /* If the VSA exists let it emulate PCI, if not emulate in kernel */ | 260 | /* If the VSA exists let it emulate PCI, if not emulate in kernel. |
246 | if (!cs5535_has_vsa2()) | 261 | * XO-1 only. */ |
262 | if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && | ||
263 | !cs5535_has_vsa2()) | ||
247 | x86_init.pci.arch_init = pci_olpc_init; | 264 | x86_init.pci.arch_init = pci_olpc_init; |
248 | #endif | 265 | #endif |
249 | 266 | ||
@@ -252,8 +269,12 @@ static int __init olpc_init(void) | |||
252 | olpc_platform_info.boardrev >> 4, | 269 | olpc_platform_info.boardrev >> 4, |
253 | olpc_platform_info.ecver); | 270 | olpc_platform_info.ecver); |
254 | 271 | ||
255 | unmap: | 272 | if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ |
256 | iounmap(romsig); | 273 | r = add_xo1_platform_devices(); |
274 | if (r) | ||
275 | return r; | ||
276 | } | ||
277 | |||
257 | return 0; | 278 | return 0; |
258 | } | 279 | } |
259 | 280 | ||
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c index 3218aa71ab5e..787320464379 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/platform/olpc/olpc_ofw.c | |||
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, | |||
74 | } | 74 | } |
75 | EXPORT_SYMBOL_GPL(__olpc_ofw); | 75 | EXPORT_SYMBOL_GPL(__olpc_ofw); |
76 | 76 | ||
77 | bool olpc_ofw_present(void) | ||
78 | { | ||
79 | return olpc_ofw_cif != NULL; | ||
80 | } | ||
81 | EXPORT_SYMBOL_GPL(olpc_ofw_present); | ||
82 | |||
77 | /* OFW cif _should_ be above this address */ | 83 | /* OFW cif _should_ be above this address */ |
78 | #define OFW_MIN 0xff000000 | 84 | #define OFW_MIN 0xff000000 |
79 | 85 | ||
diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile new file mode 100644 index 000000000000..762b4c7f4314 --- /dev/null +++ b/arch/x86/platform/scx200/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | obj-$(CONFIG_SCx200) += scx200.o | ||
2 | scx200-y += scx200_32.o | ||
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c index 7e004acbe526..7e004acbe526 100644 --- a/arch/x86/kernel/scx200_32.c +++ b/arch/x86/platform/scx200/scx200_32.c | |||
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile new file mode 100644 index 000000000000..cc5db1168a5e --- /dev/null +++ b/arch/x86/platform/sfi/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-$(CONFIG_SFI) += sfi.o | |||
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/platform/sfi/sfi.c index cb22acf3ed09..dd4c281ffe57 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/platform/sfi/sfi.c | |||
@@ -34,7 +34,7 @@ | |||
34 | #ifdef CONFIG_X86_LOCAL_APIC | 34 | #ifdef CONFIG_X86_LOCAL_APIC |
35 | static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; | 35 | static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; |
36 | 36 | ||
37 | void __init mp_sfi_register_lapic_address(unsigned long address) | 37 | static void __init mp_sfi_register_lapic_address(unsigned long address) |
38 | { | 38 | { |
39 | mp_lapic_addr = address; | 39 | mp_lapic_addr = address; |
40 | 40 | ||
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address) | |||
46 | } | 46 | } |
47 | 47 | ||
48 | /* All CPUs enumerated by SFI must be present and enabled */ | 48 | /* All CPUs enumerated by SFI must be present and enabled */ |
49 | void __cpuinit mp_sfi_register_lapic(u8 id) | 49 | static void __cpuinit mp_sfi_register_lapic(u8 id) |
50 | { | 50 | { |
51 | if (MAX_APICS - id <= 0) { | 51 | if (MAX_APICS - id <= 0) { |
52 | pr_warning("Processor #%d invalid (max %d)\n", | 52 | pr_warning("Processor #%d invalid (max %d)\n", |
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile new file mode 100644 index 000000000000..6c40995fefb8 --- /dev/null +++ b/arch/x86/platform/uv/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o | |||
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 8bc57baaa9ad..8bc57baaa9ad 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c | |||
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 312ef0292815..20ea20a39e2a 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c | |||
@@ -1001,10 +1001,10 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) | |||
1001 | static ssize_t tunables_read(struct file *file, char __user *userbuf, | 1001 | static ssize_t tunables_read(struct file *file, char __user *userbuf, |
1002 | size_t count, loff_t *ppos) | 1002 | size_t count, loff_t *ppos) |
1003 | { | 1003 | { |
1004 | char buf[300]; | 1004 | char *buf; |
1005 | int ret; | 1005 | int ret; |
1006 | 1006 | ||
1007 | ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", | 1007 | buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", |
1008 | "max_bau_concurrent plugged_delay plugsb4reset", | 1008 | "max_bau_concurrent plugged_delay plugsb4reset", |
1009 | "timeoutsb4reset ipi_reset_limit complete_threshold", | 1009 | "timeoutsb4reset ipi_reset_limit complete_threshold", |
1010 | "congested_response_us congested_reps congested_period", | 1010 | "congested_response_us congested_reps congested_period", |
@@ -1012,7 +1012,12 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, | |||
1012 | timeoutsb4reset, ipi_reset_limit, complete_threshold, | 1012 | timeoutsb4reset, ipi_reset_limit, complete_threshold, |
1013 | congested_response_us, congested_reps, congested_period); | 1013 | congested_response_us, congested_reps, congested_period); |
1014 | 1014 | ||
1015 | return simple_read_from_buffer(userbuf, count, ppos, buf, ret); | 1015 | if (!buf) |
1016 | return -ENOMEM; | ||
1017 | |||
1018 | ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); | ||
1019 | kfree(buf); | ||
1020 | return ret; | ||
1016 | } | 1021 | } |
1017 | 1022 | ||
1018 | /* | 1023 | /* |
@@ -1285,6 +1290,7 @@ static const struct file_operations tunables_fops = { | |||
1285 | .open = tunables_open, | 1290 | .open = tunables_open, |
1286 | .read = tunables_read, | 1291 | .read = tunables_read, |
1287 | .write = tunables_write, | 1292 | .write = tunables_write, |
1293 | .llseek = default_llseek, | ||
1288 | }; | 1294 | }; |
1289 | 1295 | ||
1290 | static int __init uv_ptc_init(void) | 1296 | static int __init uv_ptc_init(void) |
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 7b24460917d5..7b24460917d5 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c | |||
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 309c70fb7759..309c70fb7759 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c | |||
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/platform/uv/uv_time.c index 56e421bc379b..56e421bc379b 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c | |||
diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile new file mode 100644 index 000000000000..91bc17ab2fd5 --- /dev/null +++ b/arch/x86/platform/visws/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-$(CONFIG_X86_VISWS) += visws_quirks.o | |||
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 3371bd053b89..3371bd053b89 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c | |||
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index a234b9a71ab4..5b54892e4bc3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig | |||
@@ -29,15 +29,12 @@ config XEN_PVHVM | |||
29 | depends on X86_LOCAL_APIC | 29 | depends on X86_LOCAL_APIC |
30 | 30 | ||
31 | config XEN_MAX_DOMAIN_MEMORY | 31 | config XEN_MAX_DOMAIN_MEMORY |
32 | int "Maximum allowed size of a domain in gigabytes" | 32 | int |
33 | default 8 if X86_32 | 33 | default 128 |
34 | default 32 if X86_64 | ||
35 | depends on XEN | 34 | depends on XEN |
36 | help | 35 | help |
37 | The pseudo-physical to machine address array is sized | 36 | This only affects the sizing of some bss arrays, the unused |
38 | according to the maximum possible memory size of a Xen | 37 | portions of which are freed. |
39 | domain. This array uses 1 page per gigabyte, so there's no | ||
40 | need to be too stingy here. | ||
41 | 38 | ||
42 | config XEN_SAVE_RESTORE | 39 | config XEN_SAVE_RESTORE |
43 | bool | 40 | bool |
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 1304bcec8ee5..7c0fedd98ea0 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c | |||
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = { | |||
106 | .open = u32_array_open, | 106 | .open = u32_array_open, |
107 | .release= xen_array_release, | 107 | .release= xen_array_release, |
108 | .read = u32_array_read, | 108 | .read = u32_array_read, |
109 | .llseek = no_llseek, | ||
109 | }; | 110 | }; |
110 | 111 | ||
111 | struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, | 112 | struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d48a32b10a3c..235c0f4d3861 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/console.h> | 30 | #include <linux/console.h> |
31 | #include <linux/pci.h> | 31 | #include <linux/pci.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/memblock.h> | ||
33 | 34 | ||
34 | #include <xen/xen.h> | 35 | #include <xen/xen.h> |
35 | #include <xen/interface/xen.h> | 36 | #include <xen/interface/xen.h> |
@@ -59,7 +60,6 @@ | |||
59 | #include <asm/pgtable.h> | 60 | #include <asm/pgtable.h> |
60 | #include <asm/tlbflush.h> | 61 | #include <asm/tlbflush.h> |
61 | #include <asm/reboot.h> | 62 | #include <asm/reboot.h> |
62 | #include <asm/setup.h> | ||
63 | #include <asm/stackprotector.h> | 63 | #include <asm/stackprotector.h> |
64 | #include <asm/hypervisor.h> | 64 | #include <asm/hypervisor.h> |
65 | 65 | ||
@@ -136,9 +136,6 @@ static void xen_vcpu_setup(int cpu) | |||
136 | info.mfn = arbitrary_virt_to_mfn(vcpup); | 136 | info.mfn = arbitrary_virt_to_mfn(vcpup); |
137 | info.offset = offset_in_page(vcpup); | 137 | info.offset = offset_in_page(vcpup); |
138 | 138 | ||
139 | printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", | ||
140 | cpu, vcpup, info.mfn, info.offset); | ||
141 | |||
142 | /* Check to see if the hypervisor will put the vcpu_info | 139 | /* Check to see if the hypervisor will put the vcpu_info |
143 | structure where we want it, which allows direct access via | 140 | structure where we want it, which allows direct access via |
144 | a percpu-variable. */ | 141 | a percpu-variable. */ |
@@ -152,9 +149,6 @@ static void xen_vcpu_setup(int cpu) | |||
152 | /* This cpu is using the registered vcpu info, even if | 149 | /* This cpu is using the registered vcpu info, even if |
153 | later ones fail to. */ | 150 | later ones fail to. */ |
154 | per_cpu(xen_vcpu, cpu) = vcpup; | 151 | per_cpu(xen_vcpu, cpu) = vcpup; |
155 | |||
156 | printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", | ||
157 | cpu, vcpup); | ||
158 | } | 152 | } |
159 | } | 153 | } |
160 | 154 | ||
@@ -837,6 +831,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) | |||
837 | Xen console noise. */ | 831 | Xen console noise. */ |
838 | break; | 832 | break; |
839 | 833 | ||
834 | case MSR_IA32_CR_PAT: | ||
835 | if (smp_processor_id() == 0) | ||
836 | xen_set_pat(((u64)high << 32) | low); | ||
837 | break; | ||
838 | |||
840 | default: | 839 | default: |
841 | ret = native_write_msr_safe(msr, low, high); | 840 | ret = native_write_msr_safe(msr, low, high); |
842 | } | 841 | } |
@@ -875,8 +874,6 @@ void xen_setup_vcpu_info_placement(void) | |||
875 | /* xen_vcpu_setup managed to place the vcpu_info within the | 874 | /* xen_vcpu_setup managed to place the vcpu_info within the |
876 | percpu area for all cpus, so make use of it */ | 875 | percpu area for all cpus, so make use of it */ |
877 | if (have_vcpu_info_placement) { | 876 | if (have_vcpu_info_placement) { |
878 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | ||
879 | |||
880 | pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); | 877 | pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); |
881 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); | 878 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); |
882 | pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); | 879 | pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); |
@@ -1020,7 +1017,7 @@ static void xen_reboot(int reason) | |||
1020 | struct sched_shutdown r = { .reason = reason }; | 1017 | struct sched_shutdown r = { .reason = reason }; |
1021 | 1018 | ||
1022 | #ifdef CONFIG_SMP | 1019 | #ifdef CONFIG_SMP |
1023 | smp_send_stop(); | 1020 | stop_other_cpus(); |
1024 | #endif | 1021 | #endif |
1025 | 1022 | ||
1026 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) | 1023 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) |
@@ -1185,10 +1182,15 @@ asmlinkage void __init xen_start_kernel(void) | |||
1185 | local_irq_disable(); | 1182 | local_irq_disable(); |
1186 | early_boot_irqs_off(); | 1183 | early_boot_irqs_off(); |
1187 | 1184 | ||
1185 | memblock_init(); | ||
1186 | |||
1188 | xen_raw_console_write("mapping kernel into physical memory\n"); | 1187 | xen_raw_console_write("mapping kernel into physical memory\n"); |
1189 | pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); | 1188 | pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); |
1190 | xen_ident_map_ISA(); | 1189 | xen_ident_map_ISA(); |
1191 | 1190 | ||
1191 | /* Allocate and initialize top and mid mfn levels for p2m structure */ | ||
1192 | xen_build_mfn_list_list(); | ||
1193 | |||
1192 | init_mm.pgd = pgd; | 1194 | init_mm.pgd = pgd; |
1193 | 1195 | ||
1194 | /* keep using Xen gdt for now; no urgent need to change it */ | 1196 | /* keep using Xen gdt for now; no urgent need to change it */ |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eed9c7cee4b7..c237b810b03f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/vmalloc.h> | 45 | #include <linux/vmalloc.h> |
46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
47 | #include <linux/gfp.h> | 47 | #include <linux/gfp.h> |
48 | #include <linux/memblock.h> | ||
48 | 49 | ||
49 | #include <asm/pgtable.h> | 50 | #include <asm/pgtable.h> |
50 | #include <asm/tlbflush.h> | 51 | #include <asm/tlbflush.h> |
@@ -55,6 +56,8 @@ | |||
55 | #include <asm/e820.h> | 56 | #include <asm/e820.h> |
56 | #include <asm/linkage.h> | 57 | #include <asm/linkage.h> |
57 | #include <asm/page.h> | 58 | #include <asm/page.h> |
59 | #include <asm/init.h> | ||
60 | #include <asm/pat.h> | ||
58 | 61 | ||
59 | #include <asm/xen/hypercall.h> | 62 | #include <asm/xen/hypercall.h> |
60 | #include <asm/xen/hypervisor.h> | 63 | #include <asm/xen/hypervisor.h> |
@@ -138,7 +141,8 @@ static inline void check_zero(void) | |||
138 | * large enough to allocate page table pages to allocate the rest. | 141 | * large enough to allocate page table pages to allocate the rest. |
139 | * Each page can map 2MB. | 142 | * Each page can map 2MB. |
140 | */ | 143 | */ |
141 | static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; | 144 | #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) |
145 | static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); | ||
142 | 146 | ||
143 | #ifdef CONFIG_X86_64 | 147 | #ifdef CONFIG_X86_64 |
144 | /* l3 pud for userspace vsyscall mapping */ | 148 | /* l3 pud for userspace vsyscall mapping */ |
@@ -169,49 +173,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ | |||
169 | */ | 173 | */ |
170 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) | 174 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) |
171 | 175 | ||
176 | /* | ||
177 | * Xen leaves the responsibility for maintaining p2m mappings to the | ||
178 | * guests themselves, but it must also access and update the p2m array | ||
179 | * during suspend/resume when all the pages are reallocated. | ||
180 | * | ||
181 | * The p2m table is logically a flat array, but we implement it as a | ||
182 | * three-level tree to allow the address space to be sparse. | ||
183 | * | ||
184 | * Xen | ||
185 | * | | ||
186 | * p2m_top p2m_top_mfn | ||
187 | * / \ / \ | ||
188 | * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn | ||
189 | * / \ / \ / / | ||
190 | * p2m p2m p2m p2m p2m p2m p2m ... | ||
191 | * | ||
192 | * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. | ||
193 | * | ||
194 | * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the | ||
195 | * maximum representable pseudo-physical address space is: | ||
196 | * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages | ||
197 | * | ||
198 | * P2M_PER_PAGE depends on the architecture, as a mfn is always | ||
199 | * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to | ||
200 | * 512 and 1024 entries respectively. | ||
201 | */ | ||
202 | |||
203 | unsigned long xen_max_p2m_pfn __read_mostly; | ||
172 | 204 | ||
173 | #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) | 205 | #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) |
174 | #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) | 206 | #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) |
207 | #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) | ||
175 | 208 | ||
176 | /* Placeholder for holes in the address space */ | 209 | #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) |
177 | static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = | ||
178 | { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; | ||
179 | 210 | ||
180 | /* Array of pointers to pages containing p2m entries */ | 211 | /* Placeholders for holes in the address space */ |
181 | static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = | 212 | static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); |
182 | { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; | 213 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); |
214 | static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); | ||
183 | 215 | ||
184 | /* Arrays of p2m arrays expressed in mfns used for save/restore */ | 216 | static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); |
185 | static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; | 217 | static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); |
218 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); | ||
186 | 219 | ||
187 | static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] | 220 | RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); |
188 | __page_aligned_bss; | 221 | RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); |
189 | 222 | ||
190 | static inline unsigned p2m_top_index(unsigned long pfn) | 223 | static inline unsigned p2m_top_index(unsigned long pfn) |
191 | { | 224 | { |
192 | BUG_ON(pfn >= MAX_DOMAIN_PAGES); | 225 | BUG_ON(pfn >= MAX_P2M_PFN); |
193 | return pfn / P2M_ENTRIES_PER_PAGE; | 226 | return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); |
227 | } | ||
228 | |||
229 | static inline unsigned p2m_mid_index(unsigned long pfn) | ||
230 | { | ||
231 | return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; | ||
194 | } | 232 | } |
195 | 233 | ||
196 | static inline unsigned p2m_index(unsigned long pfn) | 234 | static inline unsigned p2m_index(unsigned long pfn) |
197 | { | 235 | { |
198 | return pfn % P2M_ENTRIES_PER_PAGE; | 236 | return pfn % P2M_PER_PAGE; |
199 | } | 237 | } |
200 | 238 | ||
201 | /* Build the parallel p2m_top_mfn structures */ | 239 | static void p2m_top_init(unsigned long ***top) |
240 | { | ||
241 | unsigned i; | ||
242 | |||
243 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
244 | top[i] = p2m_mid_missing; | ||
245 | } | ||
246 | |||
247 | static void p2m_top_mfn_init(unsigned long *top) | ||
248 | { | ||
249 | unsigned i; | ||
250 | |||
251 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
252 | top[i] = virt_to_mfn(p2m_mid_missing_mfn); | ||
253 | } | ||
254 | |||
255 | static void p2m_top_mfn_p_init(unsigned long **top) | ||
256 | { | ||
257 | unsigned i; | ||
258 | |||
259 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
260 | top[i] = p2m_mid_missing_mfn; | ||
261 | } | ||
262 | |||
263 | static void p2m_mid_init(unsigned long **mid) | ||
264 | { | ||
265 | unsigned i; | ||
266 | |||
267 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
268 | mid[i] = p2m_missing; | ||
269 | } | ||
270 | |||
271 | static void p2m_mid_mfn_init(unsigned long *mid) | ||
272 | { | ||
273 | unsigned i; | ||
274 | |||
275 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
276 | mid[i] = virt_to_mfn(p2m_missing); | ||
277 | } | ||
278 | |||
279 | static void p2m_init(unsigned long *p2m) | ||
280 | { | ||
281 | unsigned i; | ||
282 | |||
283 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
284 | p2m[i] = INVALID_P2M_ENTRY; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * Build the parallel p2m_top_mfn and p2m_mid_mfn structures | ||
289 | * | ||
290 | * This is called both at boot time, and after resuming from suspend: | ||
291 | * - At boot time we're called very early, and must use extend_brk() | ||
292 | * to allocate memory. | ||
293 | * | ||
294 | * - After resume we're called from within stop_machine, but the mfn | ||
295 | * tree should already be completely allocated. | ||
296 | */ | ||
202 | void xen_build_mfn_list_list(void) | 297 | void xen_build_mfn_list_list(void) |
203 | { | 298 | { |
204 | unsigned pfn, idx; | 299 | unsigned long pfn; |
205 | 300 | ||
206 | for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { | 301 | /* Pre-initialize p2m_top_mfn to be completely missing */ |
207 | unsigned topidx = p2m_top_index(pfn); | 302 | if (p2m_top_mfn == NULL) { |
303 | p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
304 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
305 | |||
306 | p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
307 | p2m_top_mfn_p_init(p2m_top_mfn_p); | ||
208 | 308 | ||
209 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); | 309 | p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); |
310 | p2m_top_mfn_init(p2m_top_mfn); | ||
311 | } else { | ||
312 | /* Reinitialise, mfn's all change after migration */ | ||
313 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
210 | } | 314 | } |
211 | 315 | ||
212 | for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { | 316 | for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { |
213 | unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; | 317 | unsigned topidx = p2m_top_index(pfn); |
214 | p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); | 318 | unsigned mididx = p2m_mid_index(pfn); |
319 | unsigned long **mid; | ||
320 | unsigned long *mid_mfn_p; | ||
321 | |||
322 | mid = p2m_top[topidx]; | ||
323 | mid_mfn_p = p2m_top_mfn_p[topidx]; | ||
324 | |||
325 | /* Don't bother allocating any mfn mid levels if | ||
326 | * they're just missing, just update the stored mfn, | ||
327 | * since all could have changed over a migrate. | ||
328 | */ | ||
329 | if (mid == p2m_mid_missing) { | ||
330 | BUG_ON(mididx); | ||
331 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); | ||
332 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); | ||
333 | pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; | ||
334 | continue; | ||
335 | } | ||
336 | |||
337 | if (mid_mfn_p == p2m_mid_missing_mfn) { | ||
338 | /* | ||
339 | * XXX boot-time only! We should never find | ||
340 | * missing parts of the mfn tree after | ||
341 | * runtime. extend_brk() will BUG if we call | ||
342 | * it too late. | ||
343 | */ | ||
344 | mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
345 | p2m_mid_mfn_init(mid_mfn_p); | ||
346 | |||
347 | p2m_top_mfn_p[topidx] = mid_mfn_p; | ||
348 | } | ||
349 | |||
350 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); | ||
351 | mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); | ||
215 | } | 352 | } |
216 | } | 353 | } |
217 | 354 | ||
@@ -220,8 +357,8 @@ void xen_setup_mfn_list_list(void) | |||
220 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | 357 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); |
221 | 358 | ||
222 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | 359 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = |
223 | virt_to_mfn(p2m_top_mfn_list); | 360 | virt_to_mfn(p2m_top_mfn); |
224 | HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; | 361 | HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; |
225 | } | 362 | } |
226 | 363 | ||
227 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ | 364 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ |
@@ -229,98 +366,176 @@ void __init xen_build_dynamic_phys_to_machine(void) | |||
229 | { | 366 | { |
230 | unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; | 367 | unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; |
231 | unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); | 368 | unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); |
232 | unsigned pfn; | 369 | unsigned long pfn; |
370 | |||
371 | xen_max_p2m_pfn = max_pfn; | ||
233 | 372 | ||
234 | for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { | 373 | p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); |
374 | p2m_init(p2m_missing); | ||
375 | |||
376 | p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
377 | p2m_mid_init(p2m_mid_missing); | ||
378 | |||
379 | p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
380 | p2m_top_init(p2m_top); | ||
381 | |||
382 | /* | ||
383 | * The domain builder gives us a pre-constructed p2m array in | ||
384 | * mfn_list for all the pages initially given to us, so we just | ||
385 | * need to graft that into our tree structure. | ||
386 | */ | ||
387 | for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { | ||
235 | unsigned topidx = p2m_top_index(pfn); | 388 | unsigned topidx = p2m_top_index(pfn); |
389 | unsigned mididx = p2m_mid_index(pfn); | ||
236 | 390 | ||
237 | p2m_top[topidx] = &mfn_list[pfn]; | 391 | if (p2m_top[topidx] == p2m_mid_missing) { |
238 | } | 392 | unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); |
393 | p2m_mid_init(mid); | ||
394 | |||
395 | p2m_top[topidx] = mid; | ||
396 | } | ||
239 | 397 | ||
240 | xen_build_mfn_list_list(); | 398 | p2m_top[topidx][mididx] = &mfn_list[pfn]; |
399 | } | ||
241 | } | 400 | } |
242 | 401 | ||
243 | unsigned long get_phys_to_machine(unsigned long pfn) | 402 | unsigned long get_phys_to_machine(unsigned long pfn) |
244 | { | 403 | { |
245 | unsigned topidx, idx; | 404 | unsigned topidx, mididx, idx; |
246 | 405 | ||
247 | if (unlikely(pfn >= MAX_DOMAIN_PAGES)) | 406 | if (unlikely(pfn >= MAX_P2M_PFN)) |
248 | return INVALID_P2M_ENTRY; | 407 | return INVALID_P2M_ENTRY; |
249 | 408 | ||
250 | topidx = p2m_top_index(pfn); | 409 | topidx = p2m_top_index(pfn); |
410 | mididx = p2m_mid_index(pfn); | ||
251 | idx = p2m_index(pfn); | 411 | idx = p2m_index(pfn); |
252 | return p2m_top[topidx][idx]; | 412 | |
413 | return p2m_top[topidx][mididx][idx]; | ||
253 | } | 414 | } |
254 | EXPORT_SYMBOL_GPL(get_phys_to_machine); | 415 | EXPORT_SYMBOL_GPL(get_phys_to_machine); |
255 | 416 | ||
256 | /* install a new p2m_top page */ | 417 | static void *alloc_p2m_page(void) |
257 | bool install_p2mtop_page(unsigned long pfn, unsigned long *p) | ||
258 | { | 418 | { |
259 | unsigned topidx = p2m_top_index(pfn); | 419 | return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); |
260 | unsigned long **pfnp, *mfnp; | 420 | } |
261 | unsigned i; | ||
262 | 421 | ||
263 | pfnp = &p2m_top[topidx]; | 422 | static void free_p2m_page(void *p) |
264 | mfnp = &p2m_top_mfn[topidx]; | 423 | { |
424 | free_page((unsigned long)p); | ||
425 | } | ||
265 | 426 | ||
266 | for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) | 427 | /* |
267 | p[i] = INVALID_P2M_ENTRY; | 428 | * Fully allocate the p2m structure for a given pfn. We need to check |
429 | * that both the top and mid levels are allocated, and make sure the | ||
430 | * parallel mfn tree is kept in sync. We may race with other cpus, so | ||
431 | * the new pages are installed with cmpxchg; if we lose the race then | ||
432 | * simply free the page we allocated and use the one that's there. | ||
433 | */ | ||
434 | static bool alloc_p2m(unsigned long pfn) | ||
435 | { | ||
436 | unsigned topidx, mididx; | ||
437 | unsigned long ***top_p, **mid; | ||
438 | unsigned long *top_mfn_p, *mid_mfn; | ||
268 | 439 | ||
269 | if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { | 440 | topidx = p2m_top_index(pfn); |
270 | *mfnp = virt_to_mfn(p); | 441 | mididx = p2m_mid_index(pfn); |
271 | return true; | 442 | |
443 | top_p = &p2m_top[topidx]; | ||
444 | mid = *top_p; | ||
445 | |||
446 | if (mid == p2m_mid_missing) { | ||
447 | /* Mid level is missing, allocate a new one */ | ||
448 | mid = alloc_p2m_page(); | ||
449 | if (!mid) | ||
450 | return false; | ||
451 | |||
452 | p2m_mid_init(mid); | ||
453 | |||
454 | if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) | ||
455 | free_p2m_page(mid); | ||
272 | } | 456 | } |
273 | 457 | ||
274 | return false; | 458 | top_mfn_p = &p2m_top_mfn[topidx]; |
275 | } | 459 | mid_mfn = p2m_top_mfn_p[topidx]; |
276 | 460 | ||
277 | static void alloc_p2m(unsigned long pfn) | 461 | BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); |
278 | { | 462 | |
279 | unsigned long *p; | 463 | if (mid_mfn == p2m_mid_missing_mfn) { |
464 | /* Separately check the mid mfn level */ | ||
465 | unsigned long missing_mfn; | ||
466 | unsigned long mid_mfn_mfn; | ||
467 | |||
468 | mid_mfn = alloc_p2m_page(); | ||
469 | if (!mid_mfn) | ||
470 | return false; | ||
280 | 471 | ||
281 | p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); | 472 | p2m_mid_mfn_init(mid_mfn); |
282 | BUG_ON(p == NULL); | ||
283 | 473 | ||
284 | if (!install_p2mtop_page(pfn, p)) | 474 | missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); |
285 | free_page((unsigned long)p); | 475 | mid_mfn_mfn = virt_to_mfn(mid_mfn); |
476 | if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) | ||
477 | free_p2m_page(mid_mfn); | ||
478 | else | ||
479 | p2m_top_mfn_p[topidx] = mid_mfn; | ||
480 | } | ||
481 | |||
482 | if (p2m_top[topidx][mididx] == p2m_missing) { | ||
483 | /* p2m leaf page is missing */ | ||
484 | unsigned long *p2m; | ||
485 | |||
486 | p2m = alloc_p2m_page(); | ||
487 | if (!p2m) | ||
488 | return false; | ||
489 | |||
490 | p2m_init(p2m); | ||
491 | |||
492 | if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) | ||
493 | free_p2m_page(p2m); | ||
494 | else | ||
495 | mid_mfn[mididx] = virt_to_mfn(p2m); | ||
496 | } | ||
497 | |||
498 | return true; | ||
286 | } | 499 | } |
287 | 500 | ||
288 | /* Try to install p2m mapping; fail if intermediate bits missing */ | 501 | /* Try to install p2m mapping; fail if intermediate bits missing */ |
289 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) | 502 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) |
290 | { | 503 | { |
291 | unsigned topidx, idx; | 504 | unsigned topidx, mididx, idx; |
292 | 505 | ||
293 | if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { | 506 | if (unlikely(pfn >= MAX_P2M_PFN)) { |
294 | BUG_ON(mfn != INVALID_P2M_ENTRY); | 507 | BUG_ON(mfn != INVALID_P2M_ENTRY); |
295 | return true; | 508 | return true; |
296 | } | 509 | } |
297 | 510 | ||
298 | topidx = p2m_top_index(pfn); | 511 | topidx = p2m_top_index(pfn); |
299 | if (p2m_top[topidx] == p2m_missing) { | 512 | mididx = p2m_mid_index(pfn); |
300 | if (mfn == INVALID_P2M_ENTRY) | ||
301 | return true; | ||
302 | return false; | ||
303 | } | ||
304 | |||
305 | idx = p2m_index(pfn); | 513 | idx = p2m_index(pfn); |
306 | p2m_top[topidx][idx] = mfn; | 514 | |
515 | if (p2m_top[topidx][mididx] == p2m_missing) | ||
516 | return mfn == INVALID_P2M_ENTRY; | ||
517 | |||
518 | p2m_top[topidx][mididx][idx] = mfn; | ||
307 | 519 | ||
308 | return true; | 520 | return true; |
309 | } | 521 | } |
310 | 522 | ||
311 | void set_phys_to_machine(unsigned long pfn, unsigned long mfn) | 523 | bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) |
312 | { | 524 | { |
313 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { | 525 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { |
314 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | 526 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); |
315 | return; | 527 | return true; |
316 | } | 528 | } |
317 | 529 | ||
318 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | 530 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { |
319 | alloc_p2m(pfn); | 531 | if (!alloc_p2m(pfn)) |
532 | return false; | ||
320 | 533 | ||
321 | if (!__set_phys_to_machine(pfn, mfn)) | 534 | if (!__set_phys_to_machine(pfn, mfn)) |
322 | BUG(); | 535 | return false; |
323 | } | 536 | } |
537 | |||
538 | return true; | ||
324 | } | 539 | } |
325 | 540 | ||
326 | unsigned long arbitrary_virt_to_mfn(void *vaddr) | 541 | unsigned long arbitrary_virt_to_mfn(void *vaddr) |
@@ -359,7 +574,8 @@ void make_lowmem_page_readonly(void *vaddr) | |||
359 | unsigned int level; | 574 | unsigned int level; |
360 | 575 | ||
361 | pte = lookup_address(address, &level); | 576 | pte = lookup_address(address, &level); |
362 | BUG_ON(pte == NULL); | 577 | if (pte == NULL) |
578 | return; /* vaddr missing */ | ||
363 | 579 | ||
364 | ptev = pte_wrprotect(*pte); | 580 | ptev = pte_wrprotect(*pte); |
365 | 581 | ||
@@ -374,7 +590,8 @@ void make_lowmem_page_readwrite(void *vaddr) | |||
374 | unsigned int level; | 590 | unsigned int level; |
375 | 591 | ||
376 | pte = lookup_address(address, &level); | 592 | pte = lookup_address(address, &level); |
377 | BUG_ON(pte == NULL); | 593 | if (pte == NULL) |
594 | return; /* vaddr missing */ | ||
378 | 595 | ||
379 | ptev = pte_mkwrite(*pte); | 596 | ptev = pte_mkwrite(*pte); |
380 | 597 | ||
@@ -395,7 +612,7 @@ static bool xen_iomap_pte(pte_t pte) | |||
395 | return pte_flags(pte) & _PAGE_IOMAP; | 612 | return pte_flags(pte) & _PAGE_IOMAP; |
396 | } | 613 | } |
397 | 614 | ||
398 | static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) | 615 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) |
399 | { | 616 | { |
400 | struct multicall_space mcs; | 617 | struct multicall_space mcs; |
401 | struct mmu_update *u; | 618 | struct mmu_update *u; |
@@ -407,10 +624,16 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) | |||
407 | u->ptr = arbitrary_virt_to_machine(ptep).maddr; | 624 | u->ptr = arbitrary_virt_to_machine(ptep).maddr; |
408 | u->val = pte_val_ma(pteval); | 625 | u->val = pte_val_ma(pteval); |
409 | 626 | ||
410 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); | 627 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); |
411 | 628 | ||
412 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 629 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
413 | } | 630 | } |
631 | EXPORT_SYMBOL_GPL(xen_set_domain_pte); | ||
632 | |||
633 | static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) | ||
634 | { | ||
635 | xen_set_domain_pte(ptep, pteval, DOMID_IO); | ||
636 | } | ||
414 | 637 | ||
415 | static void xen_extend_mmu_update(const struct mmu_update *update) | 638 | static void xen_extend_mmu_update(const struct mmu_update *update) |
416 | { | 639 | { |
@@ -557,7 +780,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) | |||
557 | if (val & _PAGE_PRESENT) { | 780 | if (val & _PAGE_PRESENT) { |
558 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; | 781 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; |
559 | pteval_t flags = val & PTE_FLAGS_MASK; | 782 | pteval_t flags = val & PTE_FLAGS_MASK; |
560 | val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; | 783 | unsigned long mfn = pfn_to_mfn(pfn); |
784 | |||
785 | /* | ||
786 | * If there's no mfn for the pfn, then just create an | ||
787 | * empty non-present pte. Unfortunately this loses | ||
788 | * information about the original pfn, so | ||
789 | * pte_mfn_to_pfn is asymmetric. | ||
790 | */ | ||
791 | if (unlikely(mfn == INVALID_P2M_ENTRY)) { | ||
792 | mfn = 0; | ||
793 | flags = 0; | ||
794 | } | ||
795 | |||
796 | val = ((pteval_t)mfn << PAGE_SHIFT) | flags; | ||
561 | } | 797 | } |
562 | 798 | ||
563 | return val; | 799 | return val; |
@@ -579,10 +815,18 @@ static pteval_t iomap_pte(pteval_t val) | |||
579 | 815 | ||
580 | pteval_t xen_pte_val(pte_t pte) | 816 | pteval_t xen_pte_val(pte_t pte) |
581 | { | 817 | { |
582 | if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) | 818 | pteval_t pteval = pte.pte; |
583 | return pte.pte; | ||
584 | 819 | ||
585 | return pte_mfn_to_pfn(pte.pte); | 820 | /* If this is a WC pte, convert back from Xen WC to Linux WC */ |
821 | if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { | ||
822 | WARN_ON(!pat_enabled); | ||
823 | pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; | ||
824 | } | ||
825 | |||
826 | if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) | ||
827 | return pteval; | ||
828 | |||
829 | return pte_mfn_to_pfn(pteval); | ||
586 | } | 830 | } |
587 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); | 831 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); |
588 | 832 | ||
@@ -592,10 +836,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) | |||
592 | } | 836 | } |
593 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); | 837 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); |
594 | 838 | ||
839 | /* | ||
840 | * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 | ||
841 | * are reserved for now, to correspond to the Intel-reserved PAT | ||
842 | * types. | ||
843 | * | ||
844 | * We expect Linux's PAT set as follows: | ||
845 | * | ||
846 | *  Idx  PTE flags        Linux    Xen    Default | ||
847 | *  0                     WB       WB     WB | ||
848 | *  1            PWT      WC       WT     WT | ||
849 | *  2        PCD          UC-      UC-    UC- | ||
850 | *  3        PCD PWT      UC       UC     UC | ||
851 | *  4    PAT              WB       WC     WB | ||
852 | *  5    PAT     PWT      WC       WP     WT | ||
853 | *  6    PAT PCD          UC-      UC     UC- | ||
854 | *  7    PAT PCD PWT      UC       UC     UC | ||
855 | */ | ||
856 | |||
857 | void xen_set_pat(u64 pat) | ||
858 | { | ||
859 | /* We expect Linux to use a PAT setting of | ||
860 | * UC UC- WC WB (ignoring the PAT flag) */ | ||
861 | WARN_ON(pat != 0x0007010600070106ull); | ||
862 | } | ||
863 | |||
595 | pte_t xen_make_pte(pteval_t pte) | 864 | pte_t xen_make_pte(pteval_t pte) |
596 | { | 865 | { |
597 | phys_addr_t addr = (pte & PTE_PFN_MASK); | 866 | phys_addr_t addr = (pte & PTE_PFN_MASK); |
598 | 867 | ||
868 | /* If Linux is trying to set a WC pte, then map to the Xen WC. | ||
869 | * If _PAGE_PAT is set, then it probably means it is really | ||
870 | * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope | ||
871 | * things work out OK... | ||
872 | * | ||
873 | * (We should never see kernel mappings with _PAGE_PSE set, | ||
874 | * but we could see hugetlbfs mappings, I think.) | ||
875 | */ | ||
876 | if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { | ||
877 | if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) | ||
878 | pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; | ||
879 | } | ||
880 | |||
599 | /* | 881 | /* |
600 | * Unprivileged domains are allowed to do IOMAPpings for | 882 | * Unprivileged domains are allowed to do IOMAPpings for |
601 | * PCI passthrough, but not map ISA space. The ISA | 883 | * PCI passthrough, but not map ISA space. The ISA |
@@ -1508,13 +1790,25 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
1508 | #endif | 1790 | #endif |
1509 | } | 1791 | } |
1510 | 1792 | ||
1511 | #ifdef CONFIG_X86_32 | ||
1512 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | 1793 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) |
1513 | { | 1794 | { |
1795 | unsigned long pfn = pte_pfn(pte); | ||
1796 | |||
1797 | #ifdef CONFIG_X86_32 | ||
1514 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ | 1798 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ |
1515 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) | 1799 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) |
1516 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & | 1800 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & |
1517 | pte_val_ma(pte)); | 1801 | pte_val_ma(pte)); |
1802 | #endif | ||
1803 | |||
1804 | /* | ||
1805 | * If the new pfn is within the range of the newly allocated | ||
1806 | * kernel pagetable, and it isn't being mapped into an | ||
1807 | * early_ioremap fixmap slot, make sure it is RO. | ||
1808 | */ | ||
1809 | if (!is_early_ioremap_ptep(ptep) && | ||
1810 | pfn >= e820_table_start && pfn < e820_table_end) | ||
1811 | pte = pte_wrprotect(pte); | ||
1518 | 1812 | ||
1519 | return pte; | 1813 | return pte; |
1520 | } | 1814 | } |
@@ -1527,7 +1821,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) | |||
1527 | 1821 | ||
1528 | xen_set_pte(ptep, pte); | 1822 | xen_set_pte(ptep, pte); |
1529 | } | 1823 | } |
1530 | #endif | ||
1531 | 1824 | ||
1532 | static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) | 1825 | static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) |
1533 | { | 1826 | { |
@@ -1698,6 +1991,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | |||
1698 | unsigned ident_pte; | 1991 | unsigned ident_pte; |
1699 | unsigned long pfn; | 1992 | unsigned long pfn; |
1700 | 1993 | ||
1994 | level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, | ||
1995 | PAGE_SIZE); | ||
1996 | |||
1701 | ident_pte = 0; | 1997 | ident_pte = 0; |
1702 | pfn = 0; | 1998 | pfn = 0; |
1703 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { | 1999 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { |
@@ -1708,7 +2004,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | |||
1708 | pte_page = m2v(pmd[pmdidx].pmd); | 2004 | pte_page = m2v(pmd[pmdidx].pmd); |
1709 | else { | 2005 | else { |
1710 | /* Check for free pte pages */ | 2006 | /* Check for free pte pages */ |
1711 | if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) | 2007 | if (ident_pte == LEVEL1_IDENT_ENTRIES) |
1712 | break; | 2008 | break; |
1713 | 2009 | ||
1714 | pte_page = &level1_ident_pgt[ident_pte]; | 2010 | pte_page = &level1_ident_pgt[ident_pte]; |
@@ -1815,7 +2111,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | |||
1815 | __xen_write_cr3(true, __pa(pgd)); | 2111 | __xen_write_cr3(true, __pa(pgd)); |
1816 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 2112 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
1817 | 2113 | ||
1818 | reserve_early(__pa(xen_start_info->pt_base), | 2114 | memblock_x86_reserve_range(__pa(xen_start_info->pt_base), |
1819 | __pa(xen_start_info->pt_base + | 2115 | __pa(xen_start_info->pt_base + |
1820 | xen_start_info->nr_pt_frames * PAGE_SIZE), | 2116 | xen_start_info->nr_pt_frames * PAGE_SIZE), |
1821 | "XEN PAGETABLES"); | 2117 | "XEN PAGETABLES"); |
@@ -1823,13 +2119,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | |||
1823 | return pgd; | 2119 | return pgd; |
1824 | } | 2120 | } |
1825 | #else /* !CONFIG_X86_64 */ | 2121 | #else /* !CONFIG_X86_64 */ |
1826 | static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; | 2122 | static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); |
1827 | 2123 | ||
1828 | __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | 2124 | __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, |
1829 | unsigned long max_pfn) | 2125 | unsigned long max_pfn) |
1830 | { | 2126 | { |
1831 | pmd_t *kernel_pmd; | 2127 | pmd_t *kernel_pmd; |
1832 | 2128 | ||
2129 | level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); | ||
2130 | |||
1833 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + | 2131 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + |
1834 | xen_start_info->nr_pt_frames * PAGE_SIZE + | 2132 | xen_start_info->nr_pt_frames * PAGE_SIZE + |
1835 | 512*1024); | 2133 | 512*1024); |
@@ -1853,7 +2151,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | |||
1853 | 2151 | ||
1854 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); | 2152 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); |
1855 | 2153 | ||
1856 | reserve_early(__pa(xen_start_info->pt_base), | 2154 | memblock_x86_reserve_range(__pa(xen_start_info->pt_base), |
1857 | __pa(xen_start_info->pt_base + | 2155 | __pa(xen_start_info->pt_base + |
1858 | xen_start_info->nr_pt_frames * PAGE_SIZE), | 2156 | xen_start_info->nr_pt_frames * PAGE_SIZE), |
1859 | "XEN PAGETABLES"); | 2157 | "XEN PAGETABLES"); |
@@ -2008,14 +2306,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
2008 | .alloc_pte = xen_alloc_pte_init, | 2306 | .alloc_pte = xen_alloc_pte_init, |
2009 | .release_pte = xen_release_pte_init, | 2307 | .release_pte = xen_release_pte_init, |
2010 | .alloc_pmd = xen_alloc_pmd_init, | 2308 | .alloc_pmd = xen_alloc_pmd_init, |
2011 | .alloc_pmd_clone = paravirt_nop, | ||
2012 | .release_pmd = xen_release_pmd_init, | 2309 | .release_pmd = xen_release_pmd_init, |
2013 | 2310 | ||
2014 | #ifdef CONFIG_X86_64 | ||
2015 | .set_pte = xen_set_pte, | ||
2016 | #else | ||
2017 | .set_pte = xen_set_pte_init, | 2311 | .set_pte = xen_set_pte_init, |
2018 | #endif | ||
2019 | .set_pte_at = xen_set_pte_at, | 2312 | .set_pte_at = xen_set_pte_at, |
2020 | .set_pmd = xen_set_pmd_hyper, | 2313 | .set_pmd = xen_set_pmd_hyper, |
2021 | 2314 | ||
@@ -2300,6 +2593,72 @@ void __init xen_hvm_init_mmu_ops(void) | |||
2300 | } | 2593 | } |
2301 | #endif | 2594 | #endif |
2302 | 2595 | ||
2596 | #define REMAP_BATCH_SIZE 16 | ||
2597 | |||
2598 | struct remap_data { | ||
2599 | unsigned long mfn; | ||
2600 | pgprot_t prot; | ||
2601 | struct mmu_update *mmu_update; | ||
2602 | }; | ||
2603 | |||
2604 | static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, | ||
2605 | unsigned long addr, void *data) | ||
2606 | { | ||
2607 | struct remap_data *rmd = data; | ||
2608 | pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); | ||
2609 | |||
2610 | rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; | ||
2611 | rmd->mmu_update->val = pte_val_ma(pte); | ||
2612 | rmd->mmu_update++; | ||
2613 | |||
2614 | return 0; | ||
2615 | } | ||
2616 | |||
2617 | int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | ||
2618 | unsigned long addr, | ||
2619 | unsigned long mfn, int nr, | ||
2620 | pgprot_t prot, unsigned domid) | ||
2621 | { | ||
2622 | struct remap_data rmd; | ||
2623 | struct mmu_update mmu_update[REMAP_BATCH_SIZE]; | ||
2624 | int batch; | ||
2625 | unsigned long range; | ||
2626 | int err = 0; | ||
2627 | |||
2628 | prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); | ||
2629 | |||
2630 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
2631 | |||
2632 | rmd.mfn = mfn; | ||
2633 | rmd.prot = prot; | ||
2634 | |||
2635 | while (nr) { | ||
2636 | batch = min(REMAP_BATCH_SIZE, nr); | ||
2637 | range = (unsigned long)batch << PAGE_SHIFT; | ||
2638 | |||
2639 | rmd.mmu_update = mmu_update; | ||
2640 | err = apply_to_page_range(vma->vm_mm, addr, range, | ||
2641 | remap_area_mfn_pte_fn, &rmd); | ||
2642 | if (err) | ||
2643 | goto out; | ||
2644 | |||
2645 | err = -EFAULT; | ||
2646 | if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) | ||
2647 | goto out; | ||
2648 | |||
2649 | nr -= batch; | ||
2650 | addr += range; | ||
2651 | } | ||
2652 | |||
2653 | err = 0; | ||
2654 | out: | ||
2655 | |||
2656 | flush_tlb_all(); | ||
2657 | |||
2658 | return err; | ||
2659 | } | ||
2660 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); | ||
2661 | |||
2303 | #ifdef CONFIG_XEN_DEBUG_FS | 2662 | #ifdef CONFIG_XEN_DEBUG_FS |
2304 | 2663 | ||
2305 | static struct dentry *d_mmu_debug; | 2664 | static struct dentry *d_mmu_debug; |
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f7..537bb9aab777 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -12,7 +12,6 @@ enum pt_level { | |||
12 | 12 | ||
13 | 13 | ||
14 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); | 14 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); |
15 | bool install_p2mtop_page(unsigned long pfn, unsigned long *p); | ||
16 | 15 | ||
17 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | 16 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); |
18 | 17 | ||
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index be4d80a6fae9..bfd0632fe65e 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c | |||
@@ -6,6 +6,7 @@ | |||
6 | 6 | ||
7 | #include <asm/xen/hypervisor.h> | 7 | #include <asm/xen/hypervisor.h> |
8 | #include <xen/xen.h> | 8 | #include <xen/xen.h> |
9 | #include <asm/iommu_table.h> | ||
9 | 10 | ||
10 | int xen_swiotlb __read_mostly; | 11 | int xen_swiotlb __read_mostly; |
11 | 12 | ||
@@ -60,3 +61,7 @@ void __init pci_xen_swiotlb_init(void) | |||
60 | pci_request_acs(); | 61 | pci_request_acs(); |
61 | } | 62 | } |
62 | } | 63 | } |
64 | IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, | ||
65 | 0, | ||
66 | pci_xen_swiotlb_init, | ||
67 | 0); | ||
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 62ceb7864017..b1dbdaa23ecc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/pm.h> | 10 | #include <linux/pm.h> |
11 | #include <linux/memblock.h> | ||
11 | 12 | ||
12 | #include <asm/elf.h> | 13 | #include <asm/elf.h> |
13 | #include <asm/vdso.h> | 14 | #include <asm/vdso.h> |
@@ -17,8 +18,10 @@ | |||
17 | #include <asm/xen/hypervisor.h> | 18 | #include <asm/xen/hypervisor.h> |
18 | #include <asm/xen/hypercall.h> | 19 | #include <asm/xen/hypercall.h> |
19 | 20 | ||
21 | #include <xen/xen.h> | ||
20 | #include <xen/page.h> | 22 | #include <xen/page.h> |
21 | #include <xen/interface/callback.h> | 23 | #include <xen/interface/callback.h> |
24 | #include <xen/interface/memory.h> | ||
22 | #include <xen/interface/physdev.h> | 25 | #include <xen/interface/physdev.h> |
23 | #include <xen/interface/memory.h> | 26 | #include <xen/interface/memory.h> |
24 | #include <xen/features.h> | 27 | #include <xen/features.h> |
@@ -33,6 +36,39 @@ extern void xen_sysenter_target(void); | |||
33 | extern void xen_syscall_target(void); | 36 | extern void xen_syscall_target(void); |
34 | extern void xen_syscall32_target(void); | 37 | extern void xen_syscall32_target(void); |
35 | 38 | ||
39 | /* Amount of extra memory space we add to the e820 ranges */ | ||
40 | phys_addr_t xen_extra_mem_start, xen_extra_mem_size; | ||
41 | |||
42 | /* | ||
43 | * The maximum amount of extra memory compared to the base size. The | ||
44 | * main scaling factor is the size of struct page. At extreme ratios | ||
45 | * of base:extra, all the base memory can be filled with page | ||
46 | * structures for the extra memory, leaving no space for anything | ||
47 | * else. | ||
48 | * | ||
49 | * 10x seems like a reasonable balance between scaling flexibility and | ||
50 | * leaving a practically usable system. | ||
51 | */ | ||
52 | #define EXTRA_MEM_RATIO (10) | ||
53 | |||
54 | static __init void xen_add_extra_mem(unsigned long pages) | ||
55 | { | ||
56 | u64 size = (u64)pages * PAGE_SIZE; | ||
57 | u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; | ||
58 | |||
59 | if (!pages) | ||
60 | return; | ||
61 | |||
62 | e820_add_region(extra_start, size, E820_RAM); | ||
63 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
64 | |||
65 | memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); | ||
66 | |||
67 | xen_extra_mem_size += size; | ||
68 | |||
69 | xen_max_p2m_pfn = PFN_DOWN(extra_start + size); | ||
70 | } | ||
71 | |||
36 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, | 72 | static unsigned long __init xen_release_chunk(phys_addr_t start_addr, |
37 | phys_addr_t end_addr) | 73 | phys_addr_t end_addr) |
38 | { | 74 | { |
@@ -104,16 +140,65 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, | |||
104 | /** | 140 | /** |
105 | * machine_specific_memory_setup - Hook for machine specific memory setup. | 141 | * machine_specific_memory_setup - Hook for machine specific memory setup. |
106 | **/ | 142 | **/ |
107 | |||
108 | char * __init xen_memory_setup(void) | 143 | char * __init xen_memory_setup(void) |
109 | { | 144 | { |
145 | static struct e820entry map[E820MAX] __initdata; | ||
146 | |||
110 | unsigned long max_pfn = xen_start_info->nr_pages; | 147 | unsigned long max_pfn = xen_start_info->nr_pages; |
148 | unsigned long long mem_end; | ||
149 | int rc; | ||
150 | struct xen_memory_map memmap; | ||
151 | unsigned long extra_pages = 0; | ||
152 | unsigned long extra_limit; | ||
153 | int i; | ||
154 | int op; | ||
111 | 155 | ||
112 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); | 156 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); |
157 | mem_end = PFN_PHYS(max_pfn); | ||
158 | |||
159 | memmap.nr_entries = E820MAX; | ||
160 | set_xen_guest_handle(memmap.buffer, map); | ||
161 | |||
162 | op = xen_initial_domain() ? | ||
163 | XENMEM_machine_memory_map : | ||
164 | XENMEM_memory_map; | ||
165 | rc = HYPERVISOR_memory_op(op, &memmap); | ||
166 | if (rc == -ENOSYS) { | ||
167 | memmap.nr_entries = 1; | ||
168 | map[0].addr = 0ULL; | ||
169 | map[0].size = mem_end; | ||
170 | /* 8MB slack (to balance backend allocations). */ | ||
171 | map[0].size += 8ULL << 20; | ||
172 | map[0].type = E820_RAM; | ||
173 | rc = 0; | ||
174 | } | ||
175 | BUG_ON(rc); | ||
113 | 176 | ||
114 | e820.nr_map = 0; | 177 | e820.nr_map = 0; |
178 | xen_extra_mem_start = mem_end; | ||
179 | for (i = 0; i < memmap.nr_entries; i++) { | ||
180 | unsigned long long end = map[i].addr + map[i].size; | ||
181 | |||
182 | if (map[i].type == E820_RAM) { | ||
183 | if (map[i].addr < mem_end && end > mem_end) { | ||
184 | /* Truncate region to mem_end. */ | ||
185 | u64 delta = end - mem_end; | ||
115 | 186 | ||
116 | e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); | 187 | map[i].size -= delta; |
188 | extra_pages += PFN_DOWN(delta); | ||
189 | |||
190 | end = mem_end; | ||
191 | } | ||
192 | } | ||
193 | |||
194 | if (end > xen_extra_mem_start) | ||
195 | xen_extra_mem_start = end; | ||
196 | |||
197 | /* If region is non-RAM or below mem_end, add what remains */ | ||
198 | if ((map[i].type != E820_RAM || map[i].addr < mem_end) && | ||
199 | map[i].size > 0) | ||
200 | e820_add_region(map[i].addr, map[i].size, map[i].type); | ||
201 | } | ||
117 | 202 | ||
118 | /* | 203 | /* |
119 | * Even though this is normal, usable memory under Xen, reserve | 204 | * Even though this is normal, usable memory under Xen, reserve |
@@ -132,13 +217,35 @@ char * __init xen_memory_setup(void) | |||
132 | * - xen_start_info | 217 | * - xen_start_info |
133 | * See comment above "struct start_info" in <xen/interface/xen.h> | 218 | * See comment above "struct start_info" in <xen/interface/xen.h> |
134 | */ | 219 | */ |
135 | reserve_early(__pa(xen_start_info->mfn_list), | 220 | memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), |
136 | __pa(xen_start_info->pt_base), | 221 | __pa(xen_start_info->pt_base), |
137 | "XEN START INFO"); | 222 | "XEN START INFO"); |
138 | 223 | ||
139 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 224 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
140 | 225 | ||
141 | xen_return_unused_memory(xen_start_info->nr_pages, &e820); | 226 | extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); |
227 | |||
228 | /* | ||
229 | * Clamp the amount of extra memory to an EXTRA_MEM_RATIO | ||
230 | * factor of the base size. On non-highmem systems, the base | ||
231 | * size is the full initial memory allocation; on highmem it | ||
232 | * is limited to the max size of lowmem, so that it doesn't | ||
233 | * get completely filled. | ||
234 | * | ||
235 | * In principle there could be a problem in lowmem systems if | ||
236 | * the initial memory is also very large with respect to | ||
237 | * lowmem, but we won't try to deal with that here. | ||
238 | */ | ||
239 | extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), | ||
240 | max_pfn + extra_pages); | ||
241 | |||
242 | if (extra_limit >= max_pfn) | ||
243 | extra_pages = extra_limit - max_pfn; | ||
244 | else | ||
245 | extra_pages = 0; | ||
246 | |||
247 | if (!xen_initial_domain()) | ||
248 | xen_add_extra_mem(extra_pages); | ||
142 | 249 | ||
143 | return "Xen"; | 250 | return "Xen"; |
144 | } | 251 | } |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 834dfeb54e31..72a4c7959045 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -426,9 +426,9 @@ static void stop_self(void *v) | |||
426 | BUG(); | 426 | BUG(); |
427 | } | 427 | } |
428 | 428 | ||
429 | static void xen_smp_send_stop(void) | 429 | static void xen_stop_other_cpus(int wait) |
430 | { | 430 | { |
431 | smp_call_function(stop_self, NULL, 0); | 431 | smp_call_function(stop_self, NULL, wait); |
432 | } | 432 | } |
433 | 433 | ||
434 | static void xen_smp_send_reschedule(int cpu) | 434 | static void xen_smp_send_reschedule(int cpu) |
@@ -496,7 +496,7 @@ static const struct smp_ops xen_smp_ops __initdata = { | |||
496 | .cpu_disable = xen_cpu_disable, | 496 | .cpu_disable = xen_cpu_disable, |
497 | .play_dead = xen_play_dead, | 497 | .play_dead = xen_play_dead, |
498 | 498 | ||
499 | .smp_send_stop = xen_smp_send_stop, | 499 | .stop_other_cpus = xen_stop_other_cpus, |
500 | .smp_send_reschedule = xen_smp_send_reschedule, | 500 | .smp_send_reschedule = xen_smp_send_reschedule, |
501 | 501 | ||
502 | .send_call_func_ipi = xen_smp_send_call_function_ipi, | 502 | .send_call_func_ipi = xen_smp_send_call_function_ipi, |
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e0500646585d..23e061b9327b 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c | |||
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab | |||
224 | goto out; | 224 | goto out; |
225 | } | 225 | } |
226 | 226 | ||
227 | flags = __raw_local_save_flags(); | 227 | flags = arch_local_save_flags(); |
228 | if (irq_enable) { | 228 | if (irq_enable) { |
229 | ADD_STATS(taken_slow_irqenable, 1); | 229 | ADD_STATS(taken_slow_irqenable, 1); |
230 | raw_local_irq_enable(); | 230 | raw_local_irq_enable(); |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e9..64044747348e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void); | |||
30 | pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); | 30 | pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); |
31 | void xen_ident_map_ISA(void); | 31 | void xen_ident_map_ISA(void); |
32 | void xen_reserve_top(void); | 32 | void xen_reserve_top(void); |
33 | extern unsigned long xen_max_p2m_pfn; | ||
34 | |||
35 | void xen_set_pat(u64); | ||
33 | 36 | ||
34 | char * __init xen_memory_setup(void); | 37 | char * __init xen_memory_setup(void); |
35 | void __init xen_arch_setup(void); | 38 | void __init xen_arch_setup(void); |