Diffstat (limited to 'arch/x86')
214 files changed, 8533 insertions, 4571 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6ab63107eeaf..d5ed94d30aad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,7 @@ config X86 | |||
21 | select HAVE_UNSTABLE_SCHED_CLOCK | 21 | select HAVE_UNSTABLE_SCHED_CLOCK |
22 | select HAVE_IDE | 22 | select HAVE_IDE |
23 | select HAVE_OPROFILE | 23 | select HAVE_OPROFILE |
24 | select HAVE_PERF_EVENTS if (!M386 && !M486) | 24 | select HAVE_PERF_EVENTS |
25 | select HAVE_IRQ_WORK | 25 | select HAVE_IRQ_WORK |
26 | select HAVE_IOREMAP_PROT | 26 | select HAVE_IOREMAP_PROT |
27 | select HAVE_KPROBES | 27 | select HAVE_KPROBES |
@@ -51,6 +51,7 @@ config X86 | |||
51 | select HAVE_KERNEL_GZIP | 51 | select HAVE_KERNEL_GZIP |
52 | select HAVE_KERNEL_BZIP2 | 52 | select HAVE_KERNEL_BZIP2 |
53 | select HAVE_KERNEL_LZMA | 53 | select HAVE_KERNEL_LZMA |
54 | select HAVE_KERNEL_XZ | ||
54 | select HAVE_KERNEL_LZO | 55 | select HAVE_KERNEL_LZO |
55 | select HAVE_HW_BREAKPOINT | 56 | select HAVE_HW_BREAKPOINT |
56 | select HAVE_MIXED_BREAKPOINTS_REGS | 57 | select HAVE_MIXED_BREAKPOINTS_REGS |
@@ -65,6 +66,7 @@ config X86 | |||
65 | select HAVE_SPARSE_IRQ | 66 | select HAVE_SPARSE_IRQ |
66 | select GENERIC_IRQ_PROBE | 67 | select GENERIC_IRQ_PROBE |
67 | select GENERIC_PENDING_IRQ if SMP | 68 | select GENERIC_PENDING_IRQ if SMP |
69 | select USE_GENERIC_SMP_HELPERS if SMP | ||
68 | 70 | ||
69 | config INSTRUCTION_DECODER | 71 | config INSTRUCTION_DECODER |
70 | def_bool (KPROBES || PERF_EVENTS) | 72 | def_bool (KPROBES || PERF_EVENTS) |
@@ -203,10 +205,6 @@ config HAVE_INTEL_TXT | |||
203 | def_bool y | 205 | def_bool y |
204 | depends on EXPERIMENTAL && DMAR && ACPI | 206 | depends on EXPERIMENTAL && DMAR && ACPI |
205 | 207 | ||
206 | config USE_GENERIC_SMP_HELPERS | ||
207 | def_bool y | ||
208 | depends on SMP | ||
209 | |||
210 | config X86_32_SMP | 208 | config X86_32_SMP |
211 | def_bool y | 209 | def_bool y |
212 | depends on X86_32 && SMP | 210 | depends on X86_32 && SMP |
@@ -629,11 +627,11 @@ config APB_TIMER | |||
629 | as it is off-chip. APB timers are always running regardless of CPU | 627 | as it is off-chip. APB timers are always running regardless of CPU |
630 | C states, they are used as per CPU clockevent device when possible. | 628 | C states, they are used as per CPU clockevent device when possible. |
631 | 629 | ||
632 | # Mark as embedded because too many people got it wrong. | 630 | # Mark as expert because too many people got it wrong. |
633 | # The code disables itself when not needed. | 631 | # The code disables itself when not needed. |
634 | config DMI | 632 | config DMI |
635 | default y | 633 | default y |
636 | bool "Enable DMI scanning" if EMBEDDED | 634 | bool "Enable DMI scanning" if EXPERT |
637 | ---help--- | 635 | ---help--- |
638 | Enabled scanning of DMI to identify machine quirks. Say Y | 636 | Enabled scanning of DMI to identify machine quirks. Say Y |
639 | here unless you have verified that your setup is not | 637 | here unless you have verified that your setup is not |
@@ -641,7 +639,7 @@ config DMI | |||
641 | BIOS code. | 639 | BIOS code. |
642 | 640 | ||
643 | config GART_IOMMU | 641 | config GART_IOMMU |
644 | bool "GART IOMMU support" if EMBEDDED | 642 | bool "GART IOMMU support" if EXPERT |
645 | default y | 643 | default y |
646 | select SWIOTLB | 644 | select SWIOTLB |
647 | depends on X86_64 && PCI && AMD_NB | 645 | depends on X86_64 && PCI && AMD_NB |
@@ -891,7 +889,7 @@ config X86_THERMAL_VECTOR | |||
891 | depends on X86_MCE_INTEL | 889 | depends on X86_MCE_INTEL |
892 | 890 | ||
893 | config VM86 | 891 | config VM86 |
894 | bool "Enable VM86 support" if EMBEDDED | 892 | bool "Enable VM86 support" if EXPERT |
895 | default y | 893 | default y |
896 | depends on X86_32 | 894 | depends on X86_32 |
897 | ---help--- | 895 | ---help--- |
@@ -1075,7 +1073,7 @@ endchoice | |||
1075 | 1073 | ||
1076 | choice | 1074 | choice |
1077 | depends on EXPERIMENTAL | 1075 | depends on EXPERIMENTAL |
1078 | prompt "Memory split" if EMBEDDED | 1076 | prompt "Memory split" if EXPERT |
1079 | default VMSPLIT_3G | 1077 | default VMSPLIT_3G |
1080 | depends on X86_32 | 1078 | depends on X86_32 |
1081 | ---help--- | 1079 | ---help--- |
@@ -1137,7 +1135,7 @@ config ARCH_DMA_ADDR_T_64BIT | |||
1137 | def_bool X86_64 || HIGHMEM64G | 1135 | def_bool X86_64 || HIGHMEM64G |
1138 | 1136 | ||
1139 | config DIRECT_GBPAGES | 1137 | config DIRECT_GBPAGES |
1140 | bool "Enable 1GB pages for kernel pagetables" if EMBEDDED | 1138 | bool "Enable 1GB pages for kernel pagetables" if EXPERT |
1141 | default y | 1139 | default y |
1142 | depends on X86_64 | 1140 | depends on X86_64 |
1143 | ---help--- | 1141 | ---help--- |
@@ -1170,16 +1168,16 @@ config NUMA | |||
1170 | comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" | 1168 | comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" |
1171 | depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) | 1169 | depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) |
1172 | 1170 | ||
1173 | config K8_NUMA | 1171 | config AMD_NUMA |
1174 | def_bool y | 1172 | def_bool y |
1175 | prompt "Old style AMD Opteron NUMA detection" | 1173 | prompt "Old style AMD Opteron NUMA detection" |
1176 | depends on X86_64 && NUMA && PCI | 1174 | depends on X86_64 && NUMA && PCI |
1177 | ---help--- | 1175 | ---help--- |
1178 | Enable K8 NUMA node topology detection. You should say Y here if | 1176 | Enable AMD NUMA node topology detection. You should say Y here if |
1179 | you have a multi processor AMD K8 system. This uses an old | 1177 | you have a multi processor AMD system. This uses an old method to |
1180 | method to read the NUMA configuration directly from the builtin | 1178 | read the NUMA configuration directly from the builtin Northbridge |
1181 | Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA | 1179 | of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, |
1182 | instead, which also takes priority if both are compiled in. | 1180 | which also takes priority if both are compiled in. |
1183 | 1181 | ||
1184 | config X86_64_ACPI_NUMA | 1182 | config X86_64_ACPI_NUMA |
1185 | def_bool y | 1183 | def_bool y |
@@ -1371,7 +1369,7 @@ config MATH_EMULATION | |||
1371 | 1369 | ||
1372 | config MTRR | 1370 | config MTRR |
1373 | def_bool y | 1371 | def_bool y |
1374 | prompt "MTRR (Memory Type Range Register) support" if EMBEDDED | 1372 | prompt "MTRR (Memory Type Range Register) support" if EXPERT |
1375 | ---help--- | 1373 | ---help--- |
1376 | On Intel P6 family processors (Pentium Pro, Pentium II and later) | 1374 | On Intel P6 family processors (Pentium Pro, Pentium II and later) |
1377 | the Memory Type Range Registers (MTRRs) may be used to control | 1375 | the Memory Type Range Registers (MTRRs) may be used to control |
@@ -1437,7 +1435,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT | |||
1437 | 1435 | ||
1438 | config X86_PAT | 1436 | config X86_PAT |
1439 | def_bool y | 1437 | def_bool y |
1440 | prompt "x86 PAT support" if EMBEDDED | 1438 | prompt "x86 PAT support" if EXPERT |
1441 | depends on MTRR | 1439 | depends on MTRR |
1442 | ---help--- | 1440 | ---help--- |
1443 | Use PAT attributes to setup page level cache control. | 1441 | Use PAT attributes to setup page level cache control. |
@@ -1541,7 +1539,7 @@ config KEXEC_JUMP | |||
1541 | code in physical address mode via KEXEC | 1539 | code in physical address mode via KEXEC |
1542 | 1540 | ||
1543 | config PHYSICAL_START | 1541 | config PHYSICAL_START |
1544 | hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) | 1542 | hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) |
1545 | default "0x1000000" | 1543 | default "0x1000000" |
1546 | ---help--- | 1544 | ---help--- |
1547 | This gives the physical address where the kernel is loaded. | 1545 | This gives the physical address where the kernel is loaded. |
@@ -1936,13 +1934,19 @@ config PCI_MMCONFIG | |||
1936 | depends on X86_64 && PCI && ACPI | 1934 | depends on X86_64 && PCI && ACPI |
1937 | 1935 | ||
1938 | config PCI_CNB20LE_QUIRK | 1936 | config PCI_CNB20LE_QUIRK |
1939 | bool "Read CNB20LE Host Bridge Windows" | 1937 | bool "Read CNB20LE Host Bridge Windows" if EXPERT |
1940 | depends on PCI | 1938 | default n |
1939 | depends on PCI && EXPERIMENTAL | ||
1941 | help | 1940 | help |
1942 | Read the PCI windows out of the CNB20LE host bridge. This allows | 1941 | Read the PCI windows out of the CNB20LE host bridge. This allows |
1943 | PCI hotplug to work on systems with the CNB20LE chipset which do | 1942 | PCI hotplug to work on systems with the CNB20LE chipset which do |
1944 | not have ACPI. | 1943 | not have ACPI. |
1945 | 1944 | ||
1945 | There's no public spec for this chipset, and this functionality | ||
1946 | is known to be incomplete. | ||
1947 | |||
1948 | You should say N unless you know you need this. | ||
1949 | |||
1946 | config DMAR | 1950 | config DMAR |
1947 | bool "Support for DMA Remapping Devices (EXPERIMENTAL)" | 1951 | bool "Support for DMA Remapping Devices (EXPERIMENTAL)" |
1948 | depends on PCI_MSI && ACPI && EXPERIMENTAL | 1952 | depends on PCI_MSI && ACPI && EXPERIMENTAL |
@@ -2064,13 +2068,14 @@ config OLPC | |||
2064 | bool "One Laptop Per Child support" | 2068 | bool "One Laptop Per Child support" |
2065 | select GPIOLIB | 2069 | select GPIOLIB |
2066 | select OLPC_OPENFIRMWARE | 2070 | select OLPC_OPENFIRMWARE |
2071 | depends on !X86_64 && !X86_PAE | ||
2067 | ---help--- | 2072 | ---help--- |
2068 | Add support for detecting the unique features of the OLPC | 2073 | Add support for detecting the unique features of the OLPC |
2069 | XO hardware. | 2074 | XO hardware. |
2070 | 2075 | ||
2071 | config OLPC_XO1 | 2076 | config OLPC_XO1 |
2072 | tristate "OLPC XO-1 support" | 2077 | tristate "OLPC XO-1 support" |
2073 | depends on OLPC && PCI | 2078 | depends on OLPC && MFD_CS5535 |
2074 | ---help--- | 2079 | ---help--- |
2075 | Add support for non-essential features of the OLPC XO-1 laptop. | 2080 | Add support for non-essential features of the OLPC XO-1 laptop. |
2076 | 2081 | ||
@@ -2078,11 +2083,17 @@ config OLPC_OPENFIRMWARE | |||
2078 | bool "Support for OLPC's Open Firmware" | 2083 | bool "Support for OLPC's Open Firmware" |
2079 | depends on !X86_64 && !X86_PAE | 2084 | depends on !X86_64 && !X86_PAE |
2080 | default n | 2085 | default n |
2086 | select OF | ||
2081 | help | 2087 | help |
2082 | This option adds support for the implementation of Open Firmware | 2088 | This option adds support for the implementation of Open Firmware |
2083 | that is used on the OLPC XO-1 Children's Machine. | 2089 | that is used on the OLPC XO-1 Children's Machine. |
2084 | If unsure, say N here. | 2090 | If unsure, say N here. |
2085 | 2091 | ||
2092 | config OLPC_OPENFIRMWARE_DT | ||
2093 | bool | ||
2094 | default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE | ||
2095 | select OF_PROMTREE | ||
2096 | |||
2086 | endif # X86_32 | 2097 | endif # X86_32 |
2087 | 2098 | ||
2088 | config AMD_NB | 2099 | config AMD_NB |
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ac9069890cd..283c5a6a03a6 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT | |||
310 | config X86_CMPXCHG | 310 | config X86_CMPXCHG |
311 | def_bool X86_64 || (X86_32 && !M386) | 311 | def_bool X86_64 || (X86_32 && !M386) |
312 | 312 | ||
313 | config CMPXCHG_LOCAL | ||
314 | def_bool X86_64 || (X86_32 && !M386) | ||
315 | |||
313 | config X86_L1_CACHE_SHIFT | 316 | config X86_L1_CACHE_SHIFT |
314 | int | 317 | int |
315 | default "7" if MPENTIUM4 || MPSC | 318 | default "7" if MPENTIUM4 || MPSC |
@@ -421,7 +424,7 @@ config X86_DEBUGCTLMSR | |||
421 | depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML | 424 | depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML |
422 | 425 | ||
423 | menuconfig PROCESSOR_SELECT | 426 | menuconfig PROCESSOR_SELECT |
424 | bool "Supported processor vendors" if EMBEDDED | 427 | bool "Supported processor vendors" if EXPERT |
425 | ---help--- | 428 | ---help--- |
426 | This lets you choose what x86 vendor support code your kernel | 429 | This lets you choose what x86 vendor support code your kernel |
427 | will include. | 430 | will include. |
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index b59ee765414e..615e18810f48 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP | |||
31 | see errors. Disable this if you want silent bootup. | 31 | see errors. Disable this if you want silent bootup. |
32 | 32 | ||
33 | config EARLY_PRINTK | 33 | config EARLY_PRINTK |
34 | bool "Early printk" if EMBEDDED | 34 | bool "Early printk" if EXPERT |
35 | default y | 35 | default y |
36 | ---help--- | 36 | ---help--- |
37 | Write kernel log output directly into the VGA buffer or to a serial | 37 | Write kernel log output directly into the VGA buffer or to a serial |
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST | |||
117 | feature as well as for the change_page_attr() infrastructure. | 117 | feature as well as for the change_page_attr() infrastructure. |
118 | If in doubt, say "N" | 118 | If in doubt, say "N" |
119 | 119 | ||
120 | config DEBUG_SET_MODULE_RONX | ||
121 | bool "Set loadable kernel module data as NX and text as RO" | ||
122 | depends on MODULES | ||
123 | ---help--- | ||
124 | This option helps catch unintended modifications to loadable | ||
125 | kernel module's text and read-only data. It also prevents execution | ||
126 | of module data. Such protection may interfere with run-time code | ||
127 | patching and dynamic kernel tracing - and they might also protect | ||
128 | against certain classes of kernel exploits. | ||
129 | If in doubt, say "N". | ||
130 | |||
120 | config DEBUG_NX_TEST | 131 | config DEBUG_NX_TEST |
121 | tristate "Testcase for the NX non-executable stack feature" | 132 | tristate "Testcase for the NX non-executable stack feature" |
122 | depends on DEBUG_KERNEL && m | 133 | depends on DEBUG_KERNEL && m |
@@ -127,7 +138,7 @@ config DEBUG_NX_TEST | |||
127 | 138 | ||
128 | config DOUBLEFAULT | 139 | config DOUBLEFAULT |
129 | default y | 140 | default y |
130 | bool "Enable doublefault exception handler" if EMBEDDED | 141 | bool "Enable doublefault exception handler" if EXPERT |
131 | depends on X86_32 | 142 | depends on X86_32 |
132 | ---help--- | 143 | ---help--- |
133 | This option allows trapping of rare doublefault exceptions that | 144 | This option allows trapping of rare doublefault exceptions that |
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@ | |||
4 | # create a compressed vmlinux image from the original vmlinux | 4 | # create a compressed vmlinux image from the original vmlinux |
5 | # | 5 | # |
6 | 6 | ||
7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o | 7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o |
8 | 8 | ||
9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 | 9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 |
10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC | 10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC |
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE | |||
49 | $(call if_changed,bzip2) | 49 | $(call if_changed,bzip2) |
50 | $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE | 50 | $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE |
51 | $(call if_changed,lzma) | 51 | $(call if_changed,lzma) |
52 | $(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE | ||
53 | $(call if_changed,xzkern) | ||
52 | $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE | 54 | $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE |
53 | $(call if_changed,lzo) | 55 | $(call if_changed,lzo) |
54 | 56 | ||
55 | suffix-$(CONFIG_KERNEL_GZIP) := gz | 57 | suffix-$(CONFIG_KERNEL_GZIP) := gz |
56 | suffix-$(CONFIG_KERNEL_BZIP2) := bz2 | 58 | suffix-$(CONFIG_KERNEL_BZIP2) := bz2 |
57 | suffix-$(CONFIG_KERNEL_LZMA) := lzma | 59 | suffix-$(CONFIG_KERNEL_LZMA) := lzma |
60 | suffix-$(CONFIG_KERNEL_XZ) := xz | ||
58 | suffix-$(CONFIG_KERNEL_LZO) := lzo | 61 | suffix-$(CONFIG_KERNEL_LZO) := lzo |
59 | 62 | ||
60 | quiet_cmd_mkpiggy = MKPIGGY $@ | 63 | quiet_cmd_mkpiggy = MKPIGGY $@ |
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode: | |||
182 | hlt | 182 | hlt |
183 | jmp 1b | 183 | jmp 1b |
184 | 184 | ||
185 | #include "../../kernel/verify_cpu_64.S" | 185 | #include "../../kernel/verify_cpu.S" |
186 | 186 | ||
187 | /* | 187 | /* |
188 | * Be careful here startup_64 needs to be at a predictable | 188 | * Be careful here startup_64 needs to be at a predictable |
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 23f315c9f215..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols; | |||
139 | #include "../../../../lib/decompress_unlzma.c" | 139 | #include "../../../../lib/decompress_unlzma.c" |
140 | #endif | 140 | #endif |
141 | 141 | ||
142 | #ifdef CONFIG_KERNEL_XZ | ||
143 | #include "../../../../lib/decompress_unxz.c" | ||
144 | #endif | ||
145 | |||
142 | #ifdef CONFIG_KERNEL_LZO | 146 | #ifdef CONFIG_KERNEL_LZO |
143 | #include "../../../../lib/decompress_unlzo.c" | 147 | #include "../../../../lib/decompress_unlzo.c" |
144 | #endif | 148 | #endif |
@@ -355,7 +359,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, | |||
355 | if (heap > 0x3fffffffffffUL) | 359 | if (heap > 0x3fffffffffffUL) |
356 | error("Destination address too large"); | 360 | error("Destination address too large"); |
357 | #else | 361 | #else |
358 | if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) | 362 | if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) |
359 | error("Destination address too large"); | 363 | error("Destination address too large"); |
360 | #endif | 364 | #endif |
361 | #ifndef CONFIG_RELOCATABLE | 365 | #ifndef CONFIG_RELOCATABLE |
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..646aa78ba5fd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) | |||
74 | 74 | ||
75 | offs = (olen > ilen) ? olen - ilen : 0; | 75 | offs = (olen > ilen) ? olen - ilen : 0; |
76 | offs += olen >> 12; /* Add 8 bytes for each 32K block */ | 76 | offs += olen >> 12; /* Add 8 bytes for each 32K block */ |
77 | offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ | 77 | offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ |
78 | offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ | 78 | offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ |
79 | 79 | ||
80 | printf(".section \".rodata..compressed\",\"a\",@progbits\n"); | 80 | printf(".section \".rodata..compressed\",\"a\",@progbits\n"); |
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@ | |||
9 | * Vinodh Gopal <vinodh.gopal@intel.com> | 9 | * Vinodh Gopal <vinodh.gopal@intel.com> |
10 | * Kahraman Akdemir | 10 | * Kahraman Akdemir |
11 | * | 11 | * |
12 | * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD | ||
13 | * interface for 64-bit kernels. | ||
14 | * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) | ||
15 | * Aidan O'Mahony (aidan.o.mahony@intel.com) | ||
16 | * Adrian Hoban <adrian.hoban@intel.com> | ||
17 | * James Guilford (james.guilford@intel.com) | ||
18 | * Gabriele Paoloni <gabriele.paoloni@intel.com> | ||
19 | * Tadeusz Struk (tadeusz.struk@intel.com) | ||
20 | * Wajdi Feghali (wajdi.k.feghali@intel.com) | ||
21 | * Copyright (c) 2010, Intel Corporation. | ||
22 | * | ||
23 | * Ported x86_64 version to x86: | ||
24 | * Author: Mathias Krause <minipli@googlemail.com> | ||
25 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | 26 | * This program is free software; you can redistribute it and/or modify |
13 | * it under the terms of the GNU General Public License as published by | 27 | * it under the terms of the GNU General Public License as published by |
14 | * the Free Software Foundation; either version 2 of the License, or | 28 | * the Free Software Foundation; either version 2 of the License, or |
@@ -18,8 +32,62 @@ | |||
18 | #include <linux/linkage.h> | 32 | #include <linux/linkage.h> |
19 | #include <asm/inst.h> | 33 | #include <asm/inst.h> |
20 | 34 | ||
35 | #ifdef __x86_64__ | ||
36 | .data | ||
37 | POLY: .octa 0xC2000000000000000000000000000001 | ||
38 | TWOONE: .octa 0x00000001000000000000000000000001 | ||
39 | |||
40 | # order of these constants should not change. | ||
41 | # more specifically, ALL_F should follow SHIFT_MASK, | ||
42 | # and ZERO should follow ALL_F | ||
43 | |||
44 | SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F | ||
45 | MASK1: .octa 0x0000000000000000ffffffffffffffff | ||
46 | MASK2: .octa 0xffffffffffffffff0000000000000000 | ||
47 | SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 | ||
48 | ALL_F: .octa 0xffffffffffffffffffffffffffffffff | ||
49 | ZERO: .octa 0x00000000000000000000000000000000 | ||
50 | ONE: .octa 0x00000000000000000000000000000001 | ||
51 | F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 | ||
52 | dec: .octa 0x1 | ||
53 | enc: .octa 0x2 | ||
54 | |||
55 | |||
21 | .text | 56 | .text |
22 | 57 | ||
58 | |||
59 | #define STACK_OFFSET 8*3 | ||
60 | #define HashKey 16*0 // store HashKey <<1 mod poly here | ||
61 | #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here | ||
62 | #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here | ||
63 | #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here | ||
64 | #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 | ||
65 | // bits of HashKey <<1 mod poly here | ||
66 | //(for Karatsuba purposes) | ||
67 | #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 | ||
68 | // bits of HashKey^2 <<1 mod poly here | ||
69 | // (for Karatsuba purposes) | ||
70 | #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 | ||
71 | // bits of HashKey^3 <<1 mod poly here | ||
72 | // (for Karatsuba purposes) | ||
73 | #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 | ||
74 | // bits of HashKey^4 <<1 mod poly here | ||
75 | // (for Karatsuba purposes) | ||
76 | #define VARIABLE_OFFSET 16*8 | ||
77 | |||
78 | #define arg1 rdi | ||
79 | #define arg2 rsi | ||
80 | #define arg3 rdx | ||
81 | #define arg4 rcx | ||
82 | #define arg5 r8 | ||
83 | #define arg6 r9 | ||
84 | #define arg7 STACK_OFFSET+8(%r14) | ||
85 | #define arg8 STACK_OFFSET+16(%r14) | ||
86 | #define arg9 STACK_OFFSET+24(%r14) | ||
87 | #define arg10 STACK_OFFSET+32(%r14) | ||
88 | #endif | ||
89 | |||
90 | |||
23 | #define STATE1 %xmm0 | 91 | #define STATE1 %xmm0 |
24 | #define STATE2 %xmm4 | 92 | #define STATE2 %xmm4 |
25 | #define STATE3 %xmm5 | 93 | #define STATE3 %xmm5 |
@@ -32,12 +100,16 @@ | |||
32 | #define IN IN1 | 100 | #define IN IN1 |
33 | #define KEY %xmm2 | 101 | #define KEY %xmm2 |
34 | #define IV %xmm3 | 102 | #define IV %xmm3 |
103 | |||
35 | #define BSWAP_MASK %xmm10 | 104 | #define BSWAP_MASK %xmm10 |
36 | #define CTR %xmm11 | 105 | #define CTR %xmm11 |
37 | #define INC %xmm12 | 106 | #define INC %xmm12 |
38 | 107 | ||
108 | #ifdef __x86_64__ | ||
109 | #define AREG %rax | ||
39 | #define KEYP %rdi | 110 | #define KEYP %rdi |
40 | #define OUTP %rsi | 111 | #define OUTP %rsi |
112 | #define UKEYP OUTP | ||
41 | #define INP %rdx | 113 | #define INP %rdx |
42 | #define LEN %rcx | 114 | #define LEN %rcx |
43 | #define IVP %r8 | 115 | #define IVP %r8 |
@@ -46,6 +118,1588 @@ | |||
46 | #define TKEYP T1 | 118 | #define TKEYP T1 |
47 | #define T2 %r11 | 119 | #define T2 %r11 |
48 | #define TCTR_LOW T2 | 120 | #define TCTR_LOW T2 |
121 | #else | ||
122 | #define AREG %eax | ||
123 | #define KEYP %edi | ||
124 | #define OUTP AREG | ||
125 | #define UKEYP OUTP | ||
126 | #define INP %edx | ||
127 | #define LEN %esi | ||
128 | #define IVP %ebp | ||
129 | #define KLEN %ebx | ||
130 | #define T1 %ecx | ||
131 | #define TKEYP T1 | ||
132 | #endif | ||
133 | |||
134 | |||
135 | #ifdef __x86_64__ | ||
136 | /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | ||
137 | * | ||
138 | * | ||
139 | * Input: A and B (128-bits each, bit-reflected) | ||
140 | * Output: C = A*B*x mod poly, (i.e. >>1 ) | ||
141 | * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | ||
142 | * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | ||
143 | * | ||
144 | */ | ||
145 | .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 | ||
146 | movdqa \GH, \TMP1 | ||
147 | pshufd $78, \GH, \TMP2 | ||
148 | pshufd $78, \HK, \TMP3 | ||
149 | pxor \GH, \TMP2 # TMP2 = a1+a0 | ||
150 | pxor \HK, \TMP3 # TMP3 = b1+b0 | ||
151 | PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 | ||
152 | PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 | ||
153 | PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) | ||
154 | pxor \GH, \TMP2 | ||
155 | pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) | ||
156 | movdqa \TMP2, \TMP3 | ||
157 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
158 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
159 | pxor \TMP3, \GH | ||
160 | pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK | ||
161 | |||
162 | # first phase of the reduction | ||
163 | |||
164 | movdqa \GH, \TMP2 | ||
165 | movdqa \GH, \TMP3 | ||
166 | movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 | ||
167 | # in in order to perform | ||
168 | # independent shifts | ||
169 | pslld $31, \TMP2 # packed right shift <<31 | ||
170 | pslld $30, \TMP3 # packed right shift <<30 | ||
171 | pslld $25, \TMP4 # packed right shift <<25 | ||
172 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
173 | pxor \TMP4, \TMP2 | ||
174 | movdqa \TMP2, \TMP5 | ||
175 | psrldq $4, \TMP5 # right shift TMP5 1 DW | ||
176 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
177 | pxor \TMP2, \GH | ||
178 | |||
179 | # second phase of the reduction | ||
180 | |||
181 | movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 | ||
182 | # in in order to perform | ||
183 | # independent shifts | ||
184 | movdqa \GH,\TMP3 | ||
185 | movdqa \GH,\TMP4 | ||
186 | psrld $1,\TMP2 # packed left shift >>1 | ||
187 | psrld $2,\TMP3 # packed left shift >>2 | ||
188 | psrld $7,\TMP4 # packed left shift >>7 | ||
189 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
190 | pxor \TMP4,\TMP2 | ||
191 | pxor \TMP5, \TMP2 | ||
192 | pxor \TMP2, \GH | ||
193 | pxor \TMP1, \GH # result is in TMP1 | ||
194 | .endm | ||
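For orientation: GHASH_MUL above is a Karatsuba PCLMULQDQ multiply in GF(2^128) on operands held in GCM's bit-reflected, <<1-premultiplied representation (hence "C = A*B*x mod poly, i.e. >>1"), followed by a two-phase reduction modulo x^128 + x^7 + x^2 + x + 1. A bit-serial reference for the same field multiplication, in the plain byte-oriented convention of NIST SP 800-38D rather than the kernel's reflected form, is sketched below purely for comparison; it is not the code being added here:

    #include <stdint.h>
    #include <string.h>

    /*
     * Bit-serial GF(2^128) multiply in the GHASH convention: bit 0 of byte 0
     * is the highest-order coefficient, and reduction uses the polynomial
     * x^128 + x^7 + x^2 + x + 1 (folded back in via the 0xE1 constant).
     */
    static void ghash_gfmul(const uint8_t x[16], const uint8_t y[16], uint8_t out[16])
    {
            uint8_t z[16] = { 0 };
            uint8_t v[16];
            int i, j, carry;

            memcpy(v, y, 16);
            for (i = 0; i < 128; i++) {
                    if (x[i / 8] & (0x80 >> (i % 8)))       /* coefficient i of x set? */
                            for (j = 0; j < 16; j++)
                                    z[j] ^= v[j];           /* z = z + v in GF(2)      */

                    carry = v[15] & 1;                      /* v = v * x, then reduce  */
                    for (j = 15; j > 0; j--)
                            v[j] = (v[j] >> 1) | (v[j - 1] << 7);
                    v[0] >>= 1;
                    if (carry)
                            v[0] ^= 0xE1;
            }
            memcpy(out, z, 16);
    }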
195 | |||
196 | /* | ||
197 | * if a = number of total plaintext bytes | ||
198 | * b = floor(a/16) | ||
199 | * num_initial_blocks = b mod 4 | ||
200 | * encrypt the initial num_initial_blocks blocks and apply ghash on | ||
201 | * the ciphertext | ||
202 | * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers | ||
203 | * are clobbered | ||
204 | * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified | ||
205 | */ | ||
206 | |||
207 | |||
208 | .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | ||
209 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | ||
210 | mov arg7, %r10 # %r10 = AAD | ||
211 | mov arg8, %r12 # %r12 = aadLen | ||
212 | mov %r12, %r11 | ||
213 | pxor %xmm\i, %xmm\i | ||
214 | _get_AAD_loop\num_initial_blocks\operation: | ||
215 | movd (%r10), \TMP1 | ||
216 | pslldq $12, \TMP1 | ||
217 | psrldq $4, %xmm\i | ||
218 | pxor \TMP1, %xmm\i | ||
219 | add $4, %r10 | ||
220 | sub $4, %r12 | ||
221 | jne _get_AAD_loop\num_initial_blocks\operation | ||
222 | cmp $16, %r11 | ||
223 | je _get_AAD_loop2_done\num_initial_blocks\operation | ||
224 | mov $16, %r12 | ||
225 | _get_AAD_loop2\num_initial_blocks\operation: | ||
226 | psrldq $4, %xmm\i | ||
227 | sub $4, %r12 | ||
228 | cmp %r11, %r12 | ||
229 | jne _get_AAD_loop2\num_initial_blocks\operation | ||
230 | _get_AAD_loop2_done\num_initial_blocks\operation: | ||
231 | movdqa SHUF_MASK(%rip), %xmm14 | ||
232 | PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data | ||
233 | |||
234 | xor %r11, %r11 # initialise the data pointer offset as zero | ||
235 | |||
236 | # start AES for num_initial_blocks blocks | ||
237 | |||
238 | mov %arg5, %rax # %rax = *Y0 | ||
239 | movdqu (%rax), \XMM0 # XMM0 = Y0 | ||
240 | movdqa SHUF_MASK(%rip), %xmm14 | ||
241 | PSHUFB_XMM %xmm14, \XMM0 | ||
242 | |||
243 | .if (\i == 5) || (\i == 6) || (\i == 7) | ||
244 | .irpc index, \i_seq | ||
245 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
246 | movdqa \XMM0, %xmm\index | ||
247 | movdqa SHUF_MASK(%rip), %xmm14 | ||
248 | PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap | ||
249 | |||
250 | .endr | ||
251 | .irpc index, \i_seq | ||
252 | pxor 16*0(%arg1), %xmm\index | ||
253 | .endr | ||
254 | .irpc index, \i_seq | ||
255 | movaps 0x10(%rdi), \TMP1 | ||
256 | AESENC \TMP1, %xmm\index # Round 1 | ||
257 | .endr | ||
258 | .irpc index, \i_seq | ||
259 | movaps 0x20(%arg1), \TMP1 | ||
260 | AESENC \TMP1, %xmm\index # Round 2 | ||
261 | .endr | ||
262 | .irpc index, \i_seq | ||
263 | movaps 0x30(%arg1), \TMP1 | ||
264 | AESENC \TMP1, %xmm\index # Round 2 | ||
265 | .endr | ||
266 | .irpc index, \i_seq | ||
267 | movaps 0x40(%arg1), \TMP1 | ||
268 | AESENC \TMP1, %xmm\index # Round 2 | ||
269 | .endr | ||
270 | .irpc index, \i_seq | ||
271 | movaps 0x50(%arg1), \TMP1 | ||
272 | AESENC \TMP1, %xmm\index # Round 2 | ||
273 | .endr | ||
274 | .irpc index, \i_seq | ||
275 | movaps 0x60(%arg1), \TMP1 | ||
276 | AESENC \TMP1, %xmm\index # Round 2 | ||
277 | .endr | ||
278 | .irpc index, \i_seq | ||
279 | movaps 0x70(%arg1), \TMP1 | ||
280 | AESENC \TMP1, %xmm\index # Round 2 | ||
281 | .endr | ||
282 | .irpc index, \i_seq | ||
283 | movaps 0x80(%arg1), \TMP1 | ||
284 | AESENC \TMP1, %xmm\index # Round 2 | ||
285 | .endr | ||
286 | .irpc index, \i_seq | ||
287 | movaps 0x90(%arg1), \TMP1 | ||
288 | AESENC \TMP1, %xmm\index # Round 2 | ||
289 | .endr | ||
290 | .irpc index, \i_seq | ||
291 | movaps 0xa0(%arg1), \TMP1 | ||
292 | AESENCLAST \TMP1, %xmm\index # Round 10 | ||
293 | .endr | ||
294 | .irpc index, \i_seq | ||
295 | movdqu (%arg3 , %r11, 1), \TMP1 | ||
296 | pxor \TMP1, %xmm\index | ||
297 | movdqu %xmm\index, (%arg2 , %r11, 1) | ||
298 | # write back plaintext/ciphertext for num_initial_blocks | ||
299 | add $16, %r11 | ||
300 | |||
301 | movdqa \TMP1, %xmm\index | ||
302 | movdqa SHUF_MASK(%rip), %xmm14 | ||
303 | PSHUFB_XMM %xmm14, %xmm\index | ||
304 | |||
305 | # prepare plaintext/ciphertext for GHASH computation | ||
306 | .endr | ||
307 | .endif | ||
308 | GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
309 | # apply GHASH on num_initial_blocks blocks | ||
310 | |||
311 | .if \i == 5 | ||
312 | pxor %xmm5, %xmm6 | ||
313 | GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
314 | pxor %xmm6, %xmm7 | ||
315 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
316 | pxor %xmm7, %xmm8 | ||
317 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
318 | .elseif \i == 6 | ||
319 | pxor %xmm6, %xmm7 | ||
320 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
321 | pxor %xmm7, %xmm8 | ||
322 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
323 | .elseif \i == 7 | ||
324 | pxor %xmm7, %xmm8 | ||
325 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
326 | .endif | ||
327 | cmp $64, %r13 | ||
328 | jl _initial_blocks_done\num_initial_blocks\operation | ||
329 | # no need for precomputed values | ||
330 | /* | ||
331 | * | ||
332 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | ||
333 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | ||
334 | */ | ||
335 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
336 | movdqa \XMM0, \XMM1 | ||
337 | movdqa SHUF_MASK(%rip), %xmm14 | ||
338 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | ||
339 | |||
340 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
341 | movdqa \XMM0, \XMM2 | ||
342 | movdqa SHUF_MASK(%rip), %xmm14 | ||
343 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | ||
344 | |||
345 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
346 | movdqa \XMM0, \XMM3 | ||
347 | movdqa SHUF_MASK(%rip), %xmm14 | ||
348 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | ||
349 | |||
350 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
351 | movdqa \XMM0, \XMM4 | ||
352 | movdqa SHUF_MASK(%rip), %xmm14 | ||
353 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | ||
354 | |||
355 | pxor 16*0(%arg1), \XMM1 | ||
356 | pxor 16*0(%arg1), \XMM2 | ||
357 | pxor 16*0(%arg1), \XMM3 | ||
358 | pxor 16*0(%arg1), \XMM4 | ||
359 | movdqa \TMP3, \TMP5 | ||
360 | pshufd $78, \TMP3, \TMP1 | ||
361 | pxor \TMP3, \TMP1 | ||
362 | movdqa \TMP1, HashKey_k(%rsp) | ||
363 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
364 | # TMP5 = HashKey^2<<1 (mod poly) | ||
365 | movdqa \TMP5, HashKey_2(%rsp) | ||
366 | # HashKey_2 = HashKey^2<<1 (mod poly) | ||
367 | pshufd $78, \TMP5, \TMP1 | ||
368 | pxor \TMP5, \TMP1 | ||
369 | movdqa \TMP1, HashKey_2_k(%rsp) | ||
370 | .irpc index, 1234 # do 4 rounds | ||
371 | movaps 0x10*\index(%arg1), \TMP1 | ||
372 | AESENC \TMP1, \XMM1 | ||
373 | AESENC \TMP1, \XMM2 | ||
374 | AESENC \TMP1, \XMM3 | ||
375 | AESENC \TMP1, \XMM4 | ||
376 | .endr | ||
377 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
378 | # TMP5 = HashKey^3<<1 (mod poly) | ||
379 | movdqa \TMP5, HashKey_3(%rsp) | ||
380 | pshufd $78, \TMP5, \TMP1 | ||
381 | pxor \TMP5, \TMP1 | ||
382 | movdqa \TMP1, HashKey_3_k(%rsp) | ||
383 | .irpc index, 56789 # do next 5 rounds | ||
384 | movaps 0x10*\index(%arg1), \TMP1 | ||
385 | AESENC \TMP1, \XMM1 | ||
386 | AESENC \TMP1, \XMM2 | ||
387 | AESENC \TMP1, \XMM3 | ||
388 | AESENC \TMP1, \XMM4 | ||
389 | .endr | ||
390 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
391 | # TMP5 = HashKey^3<<1 (mod poly) | ||
392 | movdqa \TMP5, HashKey_4(%rsp) | ||
393 | pshufd $78, \TMP5, \TMP1 | ||
394 | pxor \TMP5, \TMP1 | ||
395 | movdqa \TMP1, HashKey_4_k(%rsp) | ||
396 | movaps 0xa0(%arg1), \TMP2 | ||
397 | AESENCLAST \TMP2, \XMM1 | ||
398 | AESENCLAST \TMP2, \XMM2 | ||
399 | AESENCLAST \TMP2, \XMM3 | ||
400 | AESENCLAST \TMP2, \XMM4 | ||
401 | movdqu 16*0(%arg3 , %r11 , 1), \TMP1 | ||
402 | pxor \TMP1, \XMM1 | ||
403 | movdqu \XMM1, 16*0(%arg2 , %r11 , 1) | ||
404 | movdqa \TMP1, \XMM1 | ||
405 | movdqu 16*1(%arg3 , %r11 , 1), \TMP1 | ||
406 | pxor \TMP1, \XMM2 | ||
407 | movdqu \XMM2, 16*1(%arg2 , %r11 , 1) | ||
408 | movdqa \TMP1, \XMM2 | ||
409 | movdqu 16*2(%arg3 , %r11 , 1), \TMP1 | ||
410 | pxor \TMP1, \XMM3 | ||
411 | movdqu \XMM3, 16*2(%arg2 , %r11 , 1) | ||
412 | movdqa \TMP1, \XMM3 | ||
413 | movdqu 16*3(%arg3 , %r11 , 1), \TMP1 | ||
414 | pxor \TMP1, \XMM4 | ||
415 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | ||
416 | movdqa \TMP1, \XMM4 | ||
417 | add $64, %r11 | ||
418 | movdqa SHUF_MASK(%rip), %xmm14 | ||
419 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | ||
420 | pxor \XMMDst, \XMM1 | ||
421 | # combine GHASHed value with the corresponding ciphertext | ||
422 | movdqa SHUF_MASK(%rip), %xmm14 | ||
423 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | ||
424 | movdqa SHUF_MASK(%rip), %xmm14 | ||
425 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | ||
426 | movdqa SHUF_MASK(%rip), %xmm14 | ||
427 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | ||
428 | |||
429 | _initial_blocks_done\num_initial_blocks\operation: | ||
430 | |||
431 | .endm | ||
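INITIAL_BLOCKS_DEC above (and INITIAL_BLOCKS_ENC below) consume the leading "b mod 4" blocks described in the header comment so that the main loop can then run on aligned groups of four. A throwaway C illustration of that block-count arithmetic, with a hypothetical message length:

    #include <stdio.h>

    int main(void)
    {
            unsigned long nbytes = 100;        /* hypothetical plaintext length        */
            unsigned long b = nbytes / 16;     /* b = floor(a/16)              -> 6    */
            unsigned long initial = b % 4;     /* num_initial_blocks = b mod 4 -> 2    */

            printf("%lu initial block(s), then %lu group(s) of four\n",
                   initial, (b - initial) / 4);
            return 0;
    }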
432 | |||
433 | |||
434 | /* | ||
435 | * if a = number of total plaintext bytes | ||
436 | * b = floor(a/16) | ||
437 | * num_initial_blocks = b mod 4 | ||
438 | * encrypt the initial num_initial_blocks blocks and apply ghash on | ||
439 | * the ciphertext | ||
440 | * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers | ||
441 | * are clobbered | ||
442 | * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified | ||
443 | */ | ||
444 | |||
445 | |||
446 | .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | ||
447 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | ||
448 | mov arg7, %r10 # %r10 = AAD | ||
449 | mov arg8, %r12 # %r12 = aadLen | ||
450 | mov %r12, %r11 | ||
451 | pxor %xmm\i, %xmm\i | ||
452 | _get_AAD_loop\num_initial_blocks\operation: | ||
453 | movd (%r10), \TMP1 | ||
454 | pslldq $12, \TMP1 | ||
455 | psrldq $4, %xmm\i | ||
456 | pxor \TMP1, %xmm\i | ||
457 | add $4, %r10 | ||
458 | sub $4, %r12 | ||
459 | jne _get_AAD_loop\num_initial_blocks\operation | ||
460 | cmp $16, %r11 | ||
461 | je _get_AAD_loop2_done\num_initial_blocks\operation | ||
462 | mov $16, %r12 | ||
463 | _get_AAD_loop2\num_initial_blocks\operation: | ||
464 | psrldq $4, %xmm\i | ||
465 | sub $4, %r12 | ||
466 | cmp %r11, %r12 | ||
467 | jne _get_AAD_loop2\num_initial_blocks\operation | ||
468 | _get_AAD_loop2_done\num_initial_blocks\operation: | ||
469 | movdqa SHUF_MASK(%rip), %xmm14 | ||
470 | PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data | ||
471 | |||
472 | xor %r11, %r11 # initialise the data pointer offset as zero | ||
473 | |||
474 | # start AES for num_initial_blocks blocks | ||
475 | |||
476 | mov %arg5, %rax # %rax = *Y0 | ||
477 | movdqu (%rax), \XMM0 # XMM0 = Y0 | ||
478 | movdqa SHUF_MASK(%rip), %xmm14 | ||
479 | PSHUFB_XMM %xmm14, \XMM0 | ||
480 | |||
481 | .if (\i == 5) || (\i == 6) || (\i == 7) | ||
482 | .irpc index, \i_seq | ||
483 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
484 | movdqa \XMM0, %xmm\index | ||
485 | movdqa SHUF_MASK(%rip), %xmm14 | ||
486 | PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap | ||
487 | |||
488 | .endr | ||
489 | .irpc index, \i_seq | ||
490 | pxor 16*0(%arg1), %xmm\index | ||
491 | .endr | ||
492 | .irpc index, \i_seq | ||
493 | movaps 0x10(%rdi), \TMP1 | ||
494 | AESENC \TMP1, %xmm\index # Round 1 | ||
495 | .endr | ||
496 | .irpc index, \i_seq | ||
497 | movaps 0x20(%arg1), \TMP1 | ||
498 | AESENC \TMP1, %xmm\index # Round 2 | ||
499 | .endr | ||
500 | .irpc index, \i_seq | ||
501 | movaps 0x30(%arg1), \TMP1 | ||
502 | AESENC \TMP1, %xmm\index # Round 2 | ||
503 | .endr | ||
504 | .irpc index, \i_seq | ||
505 | movaps 0x40(%arg1), \TMP1 | ||
506 | AESENC \TMP1, %xmm\index # Round 2 | ||
507 | .endr | ||
508 | .irpc index, \i_seq | ||
509 | movaps 0x50(%arg1), \TMP1 | ||
510 | AESENC \TMP1, %xmm\index # Round 2 | ||
511 | .endr | ||
512 | .irpc index, \i_seq | ||
513 | movaps 0x60(%arg1), \TMP1 | ||
514 | AESENC \TMP1, %xmm\index # Round 2 | ||
515 | .endr | ||
516 | .irpc index, \i_seq | ||
517 | movaps 0x70(%arg1), \TMP1 | ||
518 | AESENC \TMP1, %xmm\index # Round 2 | ||
519 | .endr | ||
520 | .irpc index, \i_seq | ||
521 | movaps 0x80(%arg1), \TMP1 | ||
522 | AESENC \TMP1, %xmm\index # Round 2 | ||
523 | .endr | ||
524 | .irpc index, \i_seq | ||
525 | movaps 0x90(%arg1), \TMP1 | ||
526 | AESENC \TMP1, %xmm\index # Round 2 | ||
527 | .endr | ||
528 | .irpc index, \i_seq | ||
529 | movaps 0xa0(%arg1), \TMP1 | ||
530 | AESENCLAST \TMP1, %xmm\index # Round 10 | ||
531 | .endr | ||
532 | .irpc index, \i_seq | ||
533 | movdqu (%arg3 , %r11, 1), \TMP1 | ||
534 | pxor \TMP1, %xmm\index | ||
535 | movdqu %xmm\index, (%arg2 , %r11, 1) | ||
536 | # write back plaintext/ciphertext for num_initial_blocks | ||
537 | add $16, %r11 | ||
538 | |||
539 | movdqa SHUF_MASK(%rip), %xmm14 | ||
540 | PSHUFB_XMM %xmm14, %xmm\index | ||
541 | |||
542 | # prepare plaintext/ciphertext for GHASH computation | ||
543 | .endr | ||
544 | .endif | ||
545 | GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
546 | # apply GHASH on num_initial_blocks blocks | ||
547 | |||
548 | .if \i == 5 | ||
549 | pxor %xmm5, %xmm6 | ||
550 | GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
551 | pxor %xmm6, %xmm7 | ||
552 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
553 | pxor %xmm7, %xmm8 | ||
554 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
555 | .elseif \i == 6 | ||
556 | pxor %xmm6, %xmm7 | ||
557 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
558 | pxor %xmm7, %xmm8 | ||
559 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
560 | .elseif \i == 7 | ||
561 | pxor %xmm7, %xmm8 | ||
562 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
563 | .endif | ||
564 | cmp $64, %r13 | ||
565 | jl _initial_blocks_done\num_initial_blocks\operation | ||
566 | # no need for precomputed values | ||
567 | /* | ||
568 | * | ||
569 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | ||
570 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | ||
571 | */ | ||
572 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
573 | movdqa \XMM0, \XMM1 | ||
574 | movdqa SHUF_MASK(%rip), %xmm14 | ||
575 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | ||
576 | |||
577 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
578 | movdqa \XMM0, \XMM2 | ||
579 | movdqa SHUF_MASK(%rip), %xmm14 | ||
580 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | ||
581 | |||
582 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
583 | movdqa \XMM0, \XMM3 | ||
584 | movdqa SHUF_MASK(%rip), %xmm14 | ||
585 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | ||
586 | |||
587 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
588 | movdqa \XMM0, \XMM4 | ||
589 | movdqa SHUF_MASK(%rip), %xmm14 | ||
590 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | ||
591 | |||
592 | pxor 16*0(%arg1), \XMM1 | ||
593 | pxor 16*0(%arg1), \XMM2 | ||
594 | pxor 16*0(%arg1), \XMM3 | ||
595 | pxor 16*0(%arg1), \XMM4 | ||
596 | movdqa \TMP3, \TMP5 | ||
597 | pshufd $78, \TMP3, \TMP1 | ||
598 | pxor \TMP3, \TMP1 | ||
599 | movdqa \TMP1, HashKey_k(%rsp) | ||
600 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
601 | # TMP5 = HashKey^2<<1 (mod poly) | ||
602 | movdqa \TMP5, HashKey_2(%rsp) | ||
603 | # HashKey_2 = HashKey^2<<1 (mod poly) | ||
604 | pshufd $78, \TMP5, \TMP1 | ||
605 | pxor \TMP5, \TMP1 | ||
606 | movdqa \TMP1, HashKey_2_k(%rsp) | ||
607 | .irpc index, 1234 # do 4 rounds | ||
608 | movaps 0x10*\index(%arg1), \TMP1 | ||
609 | AESENC \TMP1, \XMM1 | ||
610 | AESENC \TMP1, \XMM2 | ||
611 | AESENC \TMP1, \XMM3 | ||
612 | AESENC \TMP1, \XMM4 | ||
613 | .endr | ||
614 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
615 | # TMP5 = HashKey^3<<1 (mod poly) | ||
616 | movdqa \TMP5, HashKey_3(%rsp) | ||
617 | pshufd $78, \TMP5, \TMP1 | ||
618 | pxor \TMP5, \TMP1 | ||
619 | movdqa \TMP1, HashKey_3_k(%rsp) | ||
620 | .irpc index, 56789 # do next 5 rounds | ||
621 | movaps 0x10*\index(%arg1), \TMP1 | ||
622 | AESENC \TMP1, \XMM1 | ||
623 | AESENC \TMP1, \XMM2 | ||
624 | AESENC \TMP1, \XMM3 | ||
625 | AESENC \TMP1, \XMM4 | ||
626 | .endr | ||
627 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
628 | # TMP5 = HashKey^3<<1 (mod poly) | ||
629 | movdqa \TMP5, HashKey_4(%rsp) | ||
630 | pshufd $78, \TMP5, \TMP1 | ||
631 | pxor \TMP5, \TMP1 | ||
632 | movdqa \TMP1, HashKey_4_k(%rsp) | ||
633 | movaps 0xa0(%arg1), \TMP2 | ||
634 | AESENCLAST \TMP2, \XMM1 | ||
635 | AESENCLAST \TMP2, \XMM2 | ||
636 | AESENCLAST \TMP2, \XMM3 | ||
637 | AESENCLAST \TMP2, \XMM4 | ||
638 | movdqu 16*0(%arg3 , %r11 , 1), \TMP1 | ||
639 | pxor \TMP1, \XMM1 | ||
640 | movdqu 16*1(%arg3 , %r11 , 1), \TMP1 | ||
641 | pxor \TMP1, \XMM2 | ||
642 | movdqu 16*2(%arg3 , %r11 , 1), \TMP1 | ||
643 | pxor \TMP1, \XMM3 | ||
644 | movdqu 16*3(%arg3 , %r11 , 1), \TMP1 | ||
645 | pxor \TMP1, \XMM4 | ||
646 | movdqu \XMM1, 16*0(%arg2 , %r11 , 1) | ||
647 | movdqu \XMM2, 16*1(%arg2 , %r11 , 1) | ||
648 | movdqu \XMM3, 16*2(%arg2 , %r11 , 1) | ||
649 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | ||
650 | |||
651 | add $64, %r11 | ||
652 | movdqa SHUF_MASK(%rip), %xmm14 | ||
653 | PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap | ||
654 | pxor \XMMDst, \XMM1 | ||
655 | # combine GHASHed value with the corresponding ciphertext | ||
656 | movdqa SHUF_MASK(%rip), %xmm14 | ||
657 | PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap | ||
658 | movdqa SHUF_MASK(%rip), %xmm14 | ||
659 | PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap | ||
660 | movdqa SHUF_MASK(%rip), %xmm14 | ||
661 | PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap | ||
662 | |||
663 | _initial_blocks_done\num_initial_blocks\operation: | ||
664 | |||
665 | .endm | ||
666 | |||
667 | /* | ||
668 | * encrypt 4 blocks at a time | ||
669 | * ghash the 4 previously encrypted ciphertext blocks | ||
670 | * arg1, %arg2, %arg3 are used as pointers only, not modified | ||
671 | * %r11 is the data offset value | ||
672 | */ | ||
673 | .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ | ||
674 | TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation | ||
675 | |||
676 | movdqa \XMM1, \XMM5 | ||
677 | movdqa \XMM2, \XMM6 | ||
678 | movdqa \XMM3, \XMM7 | ||
679 | movdqa \XMM4, \XMM8 | ||
680 | |||
681 | movdqa SHUF_MASK(%rip), %xmm15 | ||
682 | # multiply TMP5 * HashKey using karatsuba | ||
683 | |||
684 | movdqa \XMM5, \TMP4 | ||
685 | pshufd $78, \XMM5, \TMP6 | ||
686 | pxor \XMM5, \TMP6 | ||
687 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
688 | movdqa HashKey_4(%rsp), \TMP5 | ||
689 | PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 | ||
690 | movdqa \XMM0, \XMM1 | ||
691 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
692 | movdqa \XMM0, \XMM2 | ||
693 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
694 | movdqa \XMM0, \XMM3 | ||
695 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
696 | movdqa \XMM0, \XMM4 | ||
697 | PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap | ||
698 | PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 | ||
699 | PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap | ||
700 | PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap | ||
701 | PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap | ||
702 | |||
703 | pxor (%arg1), \XMM1 | ||
704 | pxor (%arg1), \XMM2 | ||
705 | pxor (%arg1), \XMM3 | ||
706 | pxor (%arg1), \XMM4 | ||
707 | movdqa HashKey_4_k(%rsp), \TMP5 | ||
708 | PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) | ||
709 | movaps 0x10(%arg1), \TMP1 | ||
710 | AESENC \TMP1, \XMM1 # Round 1 | ||
711 | AESENC \TMP1, \XMM2 | ||
712 | AESENC \TMP1, \XMM3 | ||
713 | AESENC \TMP1, \XMM4 | ||
714 | movaps 0x20(%arg1), \TMP1 | ||
715 | AESENC \TMP1, \XMM1 # Round 2 | ||
716 | AESENC \TMP1, \XMM2 | ||
717 | AESENC \TMP1, \XMM3 | ||
718 | AESENC \TMP1, \XMM4 | ||
719 | movdqa \XMM6, \TMP1 | ||
720 | pshufd $78, \XMM6, \TMP2 | ||
721 | pxor \XMM6, \TMP2 | ||
722 | movdqa HashKey_3(%rsp), \TMP5 | ||
723 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 | ||
724 | movaps 0x30(%arg1), \TMP3 | ||
725 | AESENC \TMP3, \XMM1 # Round 3 | ||
726 | AESENC \TMP3, \XMM2 | ||
727 | AESENC \TMP3, \XMM3 | ||
728 | AESENC \TMP3, \XMM4 | ||
729 | PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 | ||
730 | movaps 0x40(%arg1), \TMP3 | ||
731 | AESENC \TMP3, \XMM1 # Round 4 | ||
732 | AESENC \TMP3, \XMM2 | ||
733 | AESENC \TMP3, \XMM3 | ||
734 | AESENC \TMP3, \XMM4 | ||
735 | movdqa HashKey_3_k(%rsp), \TMP5 | ||
736 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
737 | movaps 0x50(%arg1), \TMP3 | ||
738 | AESENC \TMP3, \XMM1 # Round 5 | ||
739 | AESENC \TMP3, \XMM2 | ||
740 | AESENC \TMP3, \XMM3 | ||
741 | AESENC \TMP3, \XMM4 | ||
742 | pxor \TMP1, \TMP4 | ||
743 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
744 | pxor \XMM6, \XMM5 | ||
745 | pxor \TMP2, \TMP6 | ||
746 | movdqa \XMM7, \TMP1 | ||
747 | pshufd $78, \XMM7, \TMP2 | ||
748 | pxor \XMM7, \TMP2 | ||
749 | movdqa HashKey_2(%rsp ), \TMP5 | ||
750 | |||
751 | # Multiply TMP5 * HashKey using karatsuba | ||
752 | |||
753 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
754 | movaps 0x60(%arg1), \TMP3 | ||
755 | AESENC \TMP3, \XMM1 # Round 6 | ||
756 | AESENC \TMP3, \XMM2 | ||
757 | AESENC \TMP3, \XMM3 | ||
758 | AESENC \TMP3, \XMM4 | ||
759 | PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 | ||
760 | movaps 0x70(%arg1), \TMP3 | ||
761 | AESENC \TMP3, \XMM1 # Round 7 | ||
762 | AESENC \TMP3, \XMM2 | ||
763 | AESENC \TMP3, \XMM3 | ||
764 | AESENC \TMP3, \XMM4 | ||
765 | movdqa HashKey_2_k(%rsp), \TMP5 | ||
766 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
767 | movaps 0x80(%arg1), \TMP3 | ||
768 | AESENC \TMP3, \XMM1 # Round 8 | ||
769 | AESENC \TMP3, \XMM2 | ||
770 | AESENC \TMP3, \XMM3 | ||
771 | AESENC \TMP3, \XMM4 | ||
772 | pxor \TMP1, \TMP4 | ||
773 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
774 | pxor \XMM7, \XMM5 | ||
775 | pxor \TMP2, \TMP6 | ||
776 | |||
777 | # Multiply XMM8 * HashKey | ||
778 | # XMM8 and TMP5 hold the values for the two operands | ||
779 | |||
780 | movdqa \XMM8, \TMP1 | ||
781 | pshufd $78, \XMM8, \TMP2 | ||
782 | pxor \XMM8, \TMP2 | ||
783 | movdqa HashKey(%rsp), \TMP5 | ||
784 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
785 | movaps 0x90(%arg1), \TMP3 | ||
786 | AESENC \TMP3, \XMM1 # Round 9 | ||
787 | AESENC \TMP3, \XMM2 | ||
788 | AESENC \TMP3, \XMM3 | ||
789 | AESENC \TMP3, \XMM4 | ||
790 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 | ||
791 | movaps 0xa0(%arg1), \TMP3 | ||
792 | AESENCLAST \TMP3, \XMM1 # Round 10 | ||
793 | AESENCLAST \TMP3, \XMM2 | ||
794 | AESENCLAST \TMP3, \XMM3 | ||
795 | AESENCLAST \TMP3, \XMM4 | ||
796 | movdqa HashKey_k(%rsp), \TMP5 | ||
797 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
798 | movdqu (%arg3,%r11,1), \TMP3 | ||
799 | pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK | ||
800 | movdqu 16(%arg3,%r11,1), \TMP3 | ||
801 | pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK | ||
802 | movdqu 32(%arg3,%r11,1), \TMP3 | ||
803 | pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK | ||
804 | movdqu 48(%arg3,%r11,1), \TMP3 | ||
805 | pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK | ||
806 | movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer | ||
807 | movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer | ||
808 | movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer | ||
809 | movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer | ||
810 | PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap | ||
811 | PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap | ||
812 | PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap | ||
813 | PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap | ||
814 | |||
815 | pxor \TMP4, \TMP1 | ||
816 | pxor \XMM8, \XMM5 | ||
817 | pxor \TMP6, \TMP2 | ||
818 | pxor \TMP1, \TMP2 | ||
819 | pxor \XMM5, \TMP2 | ||
820 | movdqa \TMP2, \TMP3 | ||
821 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
822 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
823 | pxor \TMP3, \XMM5 | ||
824 | pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 | ||
825 | |||
826 | # first phase of reduction | ||
827 | |||
828 | movdqa \XMM5, \TMP2 | ||
829 | movdqa \XMM5, \TMP3 | ||
830 | movdqa \XMM5, \TMP4 | ||
831 | # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently | ||
832 | pslld $31, \TMP2 # packed right shift << 31 | ||
833 | pslld $30, \TMP3 # packed right shift << 30 | ||
834 | pslld $25, \TMP4 # packed right shift << 25 | ||
835 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
836 | pxor \TMP4, \TMP2 | ||
837 | movdqa \TMP2, \TMP5 | ||
838 | psrldq $4, \TMP5 # right shift T5 1 DW | ||
839 | pslldq $12, \TMP2 # left shift T2 3 DWs | ||
840 | pxor \TMP2, \XMM5 | ||
841 | |||
842 | # second phase of reduction | ||
843 | |||
844 | movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 | ||
845 | movdqa \XMM5,\TMP3 | ||
846 | movdqa \XMM5,\TMP4 | ||
847 | psrld $1, \TMP2 # packed left shift >>1 | ||
848 | psrld $2, \TMP3 # packed left shift >>2 | ||
849 | psrld $7, \TMP4 # packed left shift >>7 | ||
850 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
851 | pxor \TMP4,\TMP2 | ||
852 | pxor \TMP5, \TMP2 | ||
853 | pxor \TMP2, \XMM5 | ||
854 | pxor \TMP1, \XMM5 # result is in TMP1 | ||
855 | |||
856 | pxor \XMM5, \XMM1 | ||
857 | .endm | ||
858 | |||
859 | /* | ||
860 | * decrypt 4 blocks at a time | ||
861 | * ghash the 4 previously decrypted ciphertext blocks | ||
862 | * arg1, %arg2, %arg3 are used as pointers only, not modified | ||
863 | * %r11 is the data offset value | ||
864 | */ | ||
865 | .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ | ||
866 | TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation | ||
867 | |||
868 | movdqa \XMM1, \XMM5 | ||
869 | movdqa \XMM2, \XMM6 | ||
870 | movdqa \XMM3, \XMM7 | ||
871 | movdqa \XMM4, \XMM8 | ||
872 | |||
873 | movdqa SHUF_MASK(%rip), %xmm15 | ||
874 | # multiply TMP5 * HashKey using karatsuba | ||
875 | |||
876 | movdqa \XMM5, \TMP4 | ||
877 | pshufd $78, \XMM5, \TMP6 | ||
878 | pxor \XMM5, \TMP6 | ||
879 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
880 | movdqa HashKey_4(%rsp), \TMP5 | ||
881 | PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 | ||
882 | movdqa \XMM0, \XMM1 | ||
883 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
884 | movdqa \XMM0, \XMM2 | ||
885 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
886 | movdqa \XMM0, \XMM3 | ||
887 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
888 | movdqa \XMM0, \XMM4 | ||
889 | PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap | ||
890 | PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 | ||
891 | PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap | ||
892 | PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap | ||
893 | PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap | ||
894 | |||
895 | pxor (%arg1), \XMM1 | ||
896 | pxor (%arg1), \XMM2 | ||
897 | pxor (%arg1), \XMM3 | ||
898 | pxor (%arg1), \XMM4 | ||
899 | movdqa HashKey_4_k(%rsp), \TMP5 | ||
900 | PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) | ||
901 | movaps 0x10(%arg1), \TMP1 | ||
902 | AESENC \TMP1, \XMM1 # Round 1 | ||
903 | AESENC \TMP1, \XMM2 | ||
904 | AESENC \TMP1, \XMM3 | ||
905 | AESENC \TMP1, \XMM4 | ||
906 | movaps 0x20(%arg1), \TMP1 | ||
907 | AESENC \TMP1, \XMM1 # Round 2 | ||
908 | AESENC \TMP1, \XMM2 | ||
909 | AESENC \TMP1, \XMM3 | ||
910 | AESENC \TMP1, \XMM4 | ||
911 | movdqa \XMM6, \TMP1 | ||
912 | pshufd $78, \XMM6, \TMP2 | ||
913 | pxor \XMM6, \TMP2 | ||
914 | movdqa HashKey_3(%rsp), \TMP5 | ||
915 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 | ||
916 | movaps 0x30(%arg1), \TMP3 | ||
917 | AESENC \TMP3, \XMM1 # Round 3 | ||
918 | AESENC \TMP3, \XMM2 | ||
919 | AESENC \TMP3, \XMM3 | ||
920 | AESENC \TMP3, \XMM4 | ||
921 | PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 | ||
922 | movaps 0x40(%arg1), \TMP3 | ||
923 | AESENC \TMP3, \XMM1 # Round 4 | ||
924 | AESENC \TMP3, \XMM2 | ||
925 | AESENC \TMP3, \XMM3 | ||
926 | AESENC \TMP3, \XMM4 | ||
927 | movdqa HashKey_3_k(%rsp), \TMP5 | ||
928 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
929 | movaps 0x50(%arg1), \TMP3 | ||
930 | AESENC \TMP3, \XMM1 # Round 5 | ||
931 | AESENC \TMP3, \XMM2 | ||
932 | AESENC \TMP3, \XMM3 | ||
933 | AESENC \TMP3, \XMM4 | ||
934 | pxor \TMP1, \TMP4 | ||
935 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
936 | pxor \XMM6, \XMM5 | ||
937 | pxor \TMP2, \TMP6 | ||
938 | movdqa \XMM7, \TMP1 | ||
939 | pshufd $78, \XMM7, \TMP2 | ||
940 | pxor \XMM7, \TMP2 | ||
941 | movdqa HashKey_2(%rsp ), \TMP5 | ||
942 | |||
943 | # Multiply TMP5 * HashKey using karatsuba | ||
944 | |||
945 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
946 | movaps 0x60(%arg1), \TMP3 | ||
947 | AESENC \TMP3, \XMM1 # Round 6 | ||
948 | AESENC \TMP3, \XMM2 | ||
949 | AESENC \TMP3, \XMM3 | ||
950 | AESENC \TMP3, \XMM4 | ||
951 | PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 | ||
952 | movaps 0x70(%arg1), \TMP3 | ||
953 | AESENC \TMP3, \XMM1 # Round 7 | ||
954 | AESENC \TMP3, \XMM2 | ||
955 | AESENC \TMP3, \XMM3 | ||
956 | AESENC \TMP3, \XMM4 | ||
957 | movdqa HashKey_2_k(%rsp), \TMP5 | ||
958 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
959 | movaps 0x80(%arg1), \TMP3 | ||
960 | AESENC \TMP3, \XMM1 # Round 8 | ||
961 | AESENC \TMP3, \XMM2 | ||
962 | AESENC \TMP3, \XMM3 | ||
963 | AESENC \TMP3, \XMM4 | ||
964 | pxor \TMP1, \TMP4 | ||
965 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
966 | pxor \XMM7, \XMM5 | ||
967 | pxor \TMP2, \TMP6 | ||
968 | |||
969 | # Multiply XMM8 * HashKey | ||
970 | # XMM8 and TMP5 hold the values for the two operands | ||
971 | |||
972 | movdqa \XMM8, \TMP1 | ||
973 | pshufd $78, \XMM8, \TMP2 | ||
974 | pxor \XMM8, \TMP2 | ||
975 | movdqa HashKey(%rsp), \TMP5 | ||
976 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
977 | movaps 0x90(%arg1), \TMP3 | ||
978 | AESENC \TMP3, \XMM1 # Round 9 | ||
979 | AESENC \TMP3, \XMM2 | ||
980 | AESENC \TMP3, \XMM3 | ||
981 | AESENC \TMP3, \XMM4 | ||
982 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 | ||
983 | movaps 0xa0(%arg1), \TMP3 | ||
984 | AESENCLAST \TMP3, \XMM1 # Round 10 | ||
985 | AESENCLAST \TMP3, \XMM2 | ||
986 | AESENCLAST \TMP3, \XMM3 | ||
987 | AESENCLAST \TMP3, \XMM4 | ||
988 | movdqa HashKey_k(%rsp), \TMP5 | ||
989 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
990 | movdqu (%arg3,%r11,1), \TMP3 | ||
991 | pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK | ||
992 | movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer | ||
993 | movdqa \TMP3, \XMM1 | ||
994 | movdqu 16(%arg3,%r11,1), \TMP3 | ||
995 | pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK | ||
996 | movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer | ||
997 | movdqa \TMP3, \XMM2 | ||
998 | movdqu 32(%arg3,%r11,1), \TMP3 | ||
999 | pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK | ||
1000 | movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer | ||
1001 | movdqa \TMP3, \XMM3 | ||
1002 | movdqu 48(%arg3,%r11,1), \TMP3 | ||
1003 | pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK | ||
1004 | movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer | ||
1005 | movdqa \TMP3, \XMM4 | ||
1006 | PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap | ||
1007 | PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap | ||
1008 | PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap | ||
1009 | PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap | ||
1010 | |||
1011 | pxor \TMP4, \TMP1 | ||
1012 | pxor \XMM8, \XMM5 | ||
1013 | pxor \TMP6, \TMP2 | ||
1014 | pxor \TMP1, \TMP2 | ||
1015 | pxor \XMM5, \TMP2 | ||
1016 | movdqa \TMP2, \TMP3 | ||
1017 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
1018 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
1019 | pxor \TMP3, \XMM5 | ||
1020 | pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 | ||
1021 | |||
1022 | # first phase of reduction | ||
1023 | |||
1024 | movdqa \XMM5, \TMP2 | ||
1025 | movdqa \XMM5, \TMP3 | ||
1026 | movdqa \XMM5, \TMP4 | ||
1027 | # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently | ||
1028 | pslld $31, \TMP2 # packed left shift << 31 | ||
1029 | pslld $30, \TMP3 # packed left shift << 30 | ||
1030 | pslld $25, \TMP4 # packed left shift << 25 | ||
1031 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
1032 | pxor \TMP4, \TMP2 | ||
1033 | movdqa \TMP2, \TMP5 | ||
1034 | psrldq $4, \TMP5 # right shift T5 1 DW | ||
1035 | pslldq $12, \TMP2 # left shift T2 3 DWs | ||
1036 | pxor \TMP2, \XMM5 | ||
1037 | |||
1038 | # second phase of reduction | ||
1039 | |||
1040 | movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 | ||
1041 | movdqa \XMM5,\TMP3 | ||
1042 | movdqa \XMM5,\TMP4 | ||
1043 | psrld $1, \TMP2 # packed right shift >>1 | ||
1044 | psrld $2, \TMP3 # packed right shift >>2 | ||
1045 | psrld $7, \TMP4 # packed right shift >>7 | ||
1046 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
1047 | pxor \TMP4,\TMP2 | ||
1048 | pxor \TMP5, \TMP2 | ||
1049 | pxor \TMP2, \XMM5 | ||
1050 | pxor \TMP1, \XMM5 # result is in XMM5 | ||
1051 | |||
1052 | pxor \XMM5, \XMM1 | ||
1053 | .endm | ||
1054 | |||
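Note on the reduction above: the products accumulated in TMP1:XMM5 form a 256-bit carry-less product that the two shift/XOR phases fold back to 128 bits modulo the GHASH polynomial (the poly quoted in the function headers below, x^128 + x^127 + x^126 + x^121 + 1, is the bit-reflected form of the GCM polynomial x^128 + x^7 + x^2 + x + 1). As a plain reference for that multiply-and-reduce, written in the bit order of the GCM spec rather than the byte-swapped form this assembly operates on, a bitwise GF(2^128) multiplication might look like the sketch below; the function name is made up and this is illustrative only, not part of the patch.

    #include <stdint.h>
    #include <string.h>

    /*
     * Reference GHASH multiplication in GF(2^128) (GCM convention: the MSB
     * of byte 0 is the coefficient of x^0, R = 0xe1 || 0^120).
     */
    static void gf128_mul_ref(uint8_t z[16], const uint8_t x[16],
                              const uint8_t y[16])
    {
            uint8_t acc[16] = { 0 };        /* running sum Z */
            uint8_t v[16];                  /* V starts as X */
            int i, j, lsb;

            memcpy(v, x, 16);
            for (i = 0; i < 128; i++) {
                    if (y[i / 8] & (0x80 >> (i % 8))) {     /* bit i of Y */
                            for (j = 0; j < 16; j++)
                                    acc[j] ^= v[j];
                    }
                    /* V = V * x (mod poly): shift right one bit, fold in R on carry */
                    lsb = v[15] & 1;
                    for (j = 15; j > 0; j--)
                            v[j] = (v[j] >> 1) | (v[j - 1] << 7);
                    v[0] >>= 1;
                    if (lsb)
                            v[0] ^= 0xe1;   /* R = 11100001 || 0^120 */
            }
            memcpy(z, acc, 16);
    }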
1055 | /* GHASH the last 4 ciphertext blocks. */ | ||
1056 | .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ | ||
1057 | TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst | ||
1058 | |||
1059 | # Multiply TMP6 * HashKey (using Karatsuba) | ||
1060 | |||
1061 | movdqa \XMM1, \TMP6 | ||
1062 | pshufd $78, \XMM1, \TMP2 | ||
1063 | pxor \XMM1, \TMP2 | ||
1064 | movdqa HashKey_4(%rsp), \TMP5 | ||
1065 | PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 | ||
1066 | PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 | ||
1067 | movdqa HashKey_4_k(%rsp), \TMP4 | ||
1068 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
1069 | movdqa \XMM1, \XMMDst | ||
1070 | movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 | ||
1071 | |||
1072 | # Multiply TMP1 * HashKey (using Karatsuba) | ||
1073 | |||
1074 | movdqa \XMM2, \TMP1 | ||
1075 | pshufd $78, \XMM2, \TMP2 | ||
1076 | pxor \XMM2, \TMP2 | ||
1077 | movdqa HashKey_3(%rsp), \TMP5 | ||
1078 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
1079 | PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 | ||
1080 | movdqa HashKey_3_k(%rsp), \TMP4 | ||
1081 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
1082 | pxor \TMP1, \TMP6 | ||
1083 | pxor \XMM2, \XMMDst | ||
1084 | pxor \TMP2, \XMM1 | ||
1085 | # results accumulated in TMP6, XMMDst, XMM1 | ||
1086 | |||
1087 | # Multiply TMP1 * HashKey (using Karatsuba) | ||
1088 | |||
1089 | movdqa \XMM3, \TMP1 | ||
1090 | pshufd $78, \XMM3, \TMP2 | ||
1091 | pxor \XMM3, \TMP2 | ||
1092 | movdqa HashKey_2(%rsp), \TMP5 | ||
1093 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
1094 | PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 | ||
1095 | movdqa HashKey_2_k(%rsp), \TMP4 | ||
1096 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
1097 | pxor \TMP1, \TMP6 | ||
1098 | pxor \XMM3, \XMMDst | ||
1099 | pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 | ||
1100 | |||
1101 | # Multiply TMP1 * HashKey (using Karatsuba) | ||
1102 | movdqa \XMM4, \TMP1 | ||
1103 | pshufd $78, \XMM4, \TMP2 | ||
1104 | pxor \XMM4, \TMP2 | ||
1105 | movdqa HashKey(%rsp), \TMP5 | ||
1106 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
1107 | PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 | ||
1108 | movdqa HashKey_k(%rsp), \TMP4 | ||
1109 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
1110 | pxor \TMP1, \TMP6 | ||
1111 | pxor \XMM4, \XMMDst | ||
1112 | pxor \XMM1, \TMP2 | ||
1113 | pxor \TMP6, \TMP2 | ||
1114 | pxor \XMMDst, \TMP2 | ||
1115 | # middle section of the temp results combined as in karatsuba algorithm | ||
1116 | movdqa \TMP2, \TMP4 | ||
1117 | pslldq $8, \TMP4 # left shift TMP4 2 DWs | ||
1118 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
1119 | pxor \TMP4, \XMMDst | ||
1120 | pxor \TMP2, \TMP6 | ||
1121 | # TMP6:XMMDst holds the result of the accumulated carry-less multiplications | ||
1122 | # first phase of the reduction | ||
1123 | movdqa \XMMDst, \TMP2 | ||
1124 | movdqa \XMMDst, \TMP3 | ||
1125 | movdqa \XMMDst, \TMP4 | ||
1126 | # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently | ||
1127 | pslld $31, \TMP2 # packed left shifting << 31 | ||
1128 | pslld $30, \TMP3 # packed left shifting << 30 | ||
1129 | pslld $25, \TMP4 # packed left shifting << 25 | ||
1130 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
1131 | pxor \TMP4, \TMP2 | ||
1132 | movdqa \TMP2, \TMP7 | ||
1133 | psrldq $4, \TMP7 # right shift TMP7 1 DW | ||
1134 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
1135 | pxor \TMP2, \XMMDst | ||
1136 | |||
1137 | # second phase of the reduction | ||
1138 | movdqa \XMMDst, \TMP2 | ||
1139 | # make 3 copies of XMMDst for doing 3 shift operations | ||
1140 | movdqa \XMMDst, \TMP3 | ||
1141 | movdqa \XMMDst, \TMP4 | ||
1142 | psrld $1, \TMP2 # packed right shift >> 1 | ||
1143 | psrld $2, \TMP3 # packed right shift >> 2 | ||
1144 | psrld $7, \TMP4 # packed right shift >> 7 | ||
1145 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
1146 | pxor \TMP4, \TMP2 | ||
1147 | pxor \TMP7, \TMP2 | ||
1148 | pxor \TMP2, \XMMDst | ||
1149 | pxor \TMP6, \XMMDst # reduced result is in XMMDst | ||
1150 | .endm | ||
1151 | |||
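For reference, the three-PCLMULQDQ-per-block pattern in this macro (and in the parallel encrypt/decrypt macro above) is Karatsuba multiplication on the 64-bit halves a = a1*x^64 + a0 and b = b1*x^64 + b0; over GF(2) every '+' below is an XOR:

    a*b = (a1*b1)*x^128
        + [ (a1+a0)*(b1+b0) + a1*b1 + a0*b0 ]*x^64
        + (a0*b0)

This is why three running sums are kept (the high products in TMP6, the low products in XMMDst, and the (a1+a0)*(b1+b0) terms in XMM1) and why the combined middle sum is split with pslldq/psrldq across the two 128-bit halves before the reduction.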
1152 | /* Encrypt a single block */ | ||
1153 | .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 | ||
1154 | |||
1155 | pxor (%arg1), \XMM0 | ||
1156 | movaps 16(%arg1), \TMP1 | ||
1157 | AESENC \TMP1, \XMM0 | ||
1158 | movaps 32(%arg1), \TMP1 | ||
1159 | AESENC \TMP1, \XMM0 | ||
1160 | movaps 48(%arg1), \TMP1 | ||
1161 | AESENC \TMP1, \XMM0 | ||
1162 | movaps 64(%arg1), \TMP1 | ||
1163 | AESENC \TMP1, \XMM0 | ||
1164 | movaps 80(%arg1), \TMP1 | ||
1165 | AESENC \TMP1, \XMM0 | ||
1166 | movaps 96(%arg1), \TMP1 | ||
1167 | AESENC \TMP1, \XMM0 | ||
1168 | movaps 112(%arg1), \TMP1 | ||
1169 | AESENC \TMP1, \XMM0 | ||
1170 | movaps 128(%arg1), \TMP1 | ||
1171 | AESENC \TMP1, \XMM0 | ||
1172 | movaps 144(%arg1), \TMP1 | ||
1173 | AESENC \TMP1, \XMM0 | ||
1174 | movaps 160(%arg1), \TMP1 | ||
1175 | AESENCLAST \TMP1, \XMM0 | ||
1176 | .endm | ||
1177 | |||
1178 | |||
1179 | /***************************************************************************** | ||
1180 | * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | ||
1181 | * u8 *out, // Plaintext output. Decrypt in-place is allowed. | ||
1182 | * const u8 *in, // Ciphertext input | ||
1183 | * u64 plaintext_len, // Length of data in bytes for decryption. | ||
1184 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | ||
1185 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | ||
1186 | * // concatenated with 0x00000001. 16-byte aligned pointer. | ||
1187 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | ||
1188 | * const u8 *aad, // Additional Authentication Data (AAD) | ||
1189 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | ||
1190 | * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the | ||
1191 | * // given authentication tag and only return the plaintext if they match. | ||
1192 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 | ||
1193 | * // (most likely), 12 or 8. | ||
1194 | * | ||
1195 | * Assumptions: | ||
1196 | * | ||
1197 | * keys: | ||
1198 | * keys are pre-expanded and aligned to 16 bytes. we are using the first | ||
1199 | * set of 11 keys in the data structure void *aes_ctx | ||
1200 | * | ||
1201 | * iv: | ||
1202 | * 0 1 2 3 | ||
1203 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1204 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1205 | * | Salt (From the SA) | | ||
1206 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1207 | * | Initialization Vector | | ||
1208 | * | (This is the sequence number from IPSec header) | | ||
1209 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1210 | * | 0x1 | | ||
1211 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1212 | * | ||
1213 | * | ||
1214 | * | ||
1215 | * AAD: | ||
1216 | * AAD padded to 128 bits with 0 | ||
1217 | * for example, assume AAD is a u32 vector | ||
1218 | * | ||
1219 | * if AAD is 8 bytes: | ||
1220 | * AAD[3] = {A0, A1}; | ||
1221 | * padded AAD in xmm register = {A1 A0 0 0} | ||
1222 | * | ||
1223 | * 0 1 2 3 | ||
1224 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1225 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1226 | * | SPI (A1) | | ||
1227 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1228 | * | 32-bit Sequence Number (A0) | | ||
1229 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1230 | * | 0x0 | | ||
1231 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1232 | * | ||
1233 | * AAD Format with 32-bit Sequence Number | ||
1234 | * | ||
1235 | * if AAD is 12 bytes: | ||
1236 | * AAD[3] = {A0, A1, A2}; | ||
1237 | * padded AAD in xmm register = {A2 A1 A0 0} | ||
1238 | * | ||
1239 | * 0 1 2 3 | ||
1240 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1241 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1244 | * | SPI (A2) | | ||
1245 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1246 | * | 64-bit Extended Sequence Number {A1,A0} | | ||
1247 | * | | | ||
1248 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1249 | * | 0x0 | | ||
1250 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1251 | * | ||
1252 | * AAD Format with 64-bit Extended Sequence Number | ||
1253 | * | ||
1254 | * aadLen: | ||
1255 | * from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
1256 | * The code also supports an aadLen of 16; any other size will fail. | ||
1257 | * | ||
1258 | * TLen: | ||
1259 | * from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
1260 | * For other sizes, the code will fail. | ||
1261 | * | ||
1262 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
1263 | * | ||
1264 | *****************************************************************************/ | ||
1265 | |||
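As an illustration of the iv layout documented above (not part of this patch: the helper name is hypothetical, and real callers must also make sure the buffer handed to the asm routine is 16-byte aligned), the pre-counter block can be assembled like this:

    #include <linux/types.h>
    #include <linux/string.h>

    /* Build j0 = salt (4 bytes) || ESP IV (8 bytes) || 0x00000001 (big endian). */
    static void example_build_gcm_j0(u8 j0[16], const u8 salt[4], const u8 esp_iv[8])
    {
            memcpy(j0, salt, 4);            /* bytes  0..3  : salt from the SA */
            memcpy(j0 + 4, esp_iv, 8);      /* bytes  4..11 : per-packet IV    */
            j0[12] = 0;                     /* bytes 12..15 : counter = 1      */
            j0[13] = 0;
            j0[14] = 0;
            j0[15] = 1;
    }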
1266 | ENTRY(aesni_gcm_dec) | ||
1267 | push %r12 | ||
1268 | push %r13 | ||
1269 | push %r14 | ||
1270 | mov %rsp, %r14 | ||
1271 | /* | ||
1272 | * states of %xmm registers %xmm6:%xmm15 not saved | ||
1273 | * all %xmm registers are clobbered | ||
1274 | */ | ||
1275 | sub $VARIABLE_OFFSET, %rsp | ||
1276 | and $~63, %rsp # align rsp to 64 bytes | ||
1277 | mov %arg6, %r12 | ||
1278 | movdqu (%r12), %xmm13 # %xmm13 = HashKey | ||
1279 | movdqa SHUF_MASK(%rip), %xmm2 | ||
1280 | PSHUFB_XMM %xmm2, %xmm13 | ||
1281 | |||
1282 | |||
1283 | # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) | ||
1284 | |||
1285 | movdqa %xmm13, %xmm2 | ||
1286 | psllq $1, %xmm13 | ||
1287 | psrlq $63, %xmm2 | ||
1288 | movdqa %xmm2, %xmm1 | ||
1289 | pslldq $8, %xmm2 | ||
1290 | psrldq $8, %xmm1 | ||
1291 | por %xmm2, %xmm13 | ||
1292 | |||
1293 | # Reduction | ||
1294 | |||
1295 | pshufd $0x24, %xmm1, %xmm2 | ||
1296 | pcmpeqd TWOONE(%rip), %xmm2 | ||
1297 | pand POLY(%rip), %xmm2 | ||
1298 | pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) | ||
1299 | |||
1300 | |||
1301 | # Decrypt first few blocks | ||
1302 | |||
1303 | movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) | ||
1304 | mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext | ||
1305 | and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) | ||
1306 | mov %r13, %r12 | ||
1307 | and $(3<<4), %r12 | ||
1308 | jz _initial_num_blocks_is_0_decrypt | ||
1309 | cmp $(2<<4), %r12 | ||
1310 | jb _initial_num_blocks_is_1_decrypt | ||
1311 | je _initial_num_blocks_is_2_decrypt | ||
1312 | _initial_num_blocks_is_3_decrypt: | ||
1313 | INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1314 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec | ||
1315 | sub $48, %r13 | ||
1316 | jmp _initial_blocks_decrypted | ||
1317 | _initial_num_blocks_is_2_decrypt: | ||
1318 | INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1319 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec | ||
1320 | sub $32, %r13 | ||
1321 | jmp _initial_blocks_decrypted | ||
1322 | _initial_num_blocks_is_1_decrypt: | ||
1323 | INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1324 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec | ||
1325 | sub $16, %r13 | ||
1326 | jmp _initial_blocks_decrypted | ||
1327 | _initial_num_blocks_is_0_decrypt: | ||
1328 | INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1329 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec | ||
1330 | _initial_blocks_decrypted: | ||
1331 | cmp $0, %r13 | ||
1332 | je _zero_cipher_left_decrypt | ||
1333 | sub $64, %r13 | ||
1334 | je _four_cipher_left_decrypt | ||
1335 | _decrypt_by_4: | ||
1336 | GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | ||
1337 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec | ||
1338 | add $64, %r11 | ||
1339 | sub $64, %r13 | ||
1340 | jne _decrypt_by_4 | ||
1341 | _four_cipher_left_decrypt: | ||
1342 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | ||
1343 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | ||
1344 | _zero_cipher_left_decrypt: | ||
1345 | mov %arg4, %r13 | ||
1346 | and $15, %r13 # %r13 = arg4 (mod 16) | ||
1347 | je _multiple_of_16_bytes_decrypt | ||
1348 | |||
1349 | # Handle the last <16 byte block separately | ||
1350 | |||
1351 | paddd ONE(%rip), %xmm0 # increment CNT to get Yn | ||
1352 | movdqa SHUF_MASK(%rip), %xmm10 | ||
1353 | PSHUFB_XMM %xmm10, %xmm0 | ||
1354 | |||
1355 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) | ||
1356 | sub $16, %r11 | ||
1357 | add %r13, %r11 | ||
1358 | movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block | ||
1359 | lea SHIFT_MASK+16(%rip), %r12 | ||
1360 | sub %r13, %r12 | ||
1361 | # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes | ||
1362 | # (%r13 is the number of bytes in plaintext mod 16) | ||
1363 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
1364 | PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes | ||
1365 | |||
1366 | movdqa %xmm1, %xmm2 | ||
1367 | pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) | ||
1368 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | ||
1369 | # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 | ||
1370 | pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 | ||
1371 | pand %xmm1, %xmm2 | ||
1372 | movdqa SHUF_MASK(%rip), %xmm10 | ||
1373 | PSHUFB_XMM %xmm10 ,%xmm2 | ||
1374 | |||
1375 | pxor %xmm2, %xmm8 | ||
1376 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1377 | # GHASH computation for the last <16 byte block | ||
1378 | sub %r13, %r11 | ||
1379 | add $16, %r11 | ||
1380 | |||
1381 | # output %r13 bytes | ||
1382 | MOVQ_R64_XMM %xmm0, %rax | ||
1383 | cmp $8, %r13 | ||
1384 | jle _less_than_8_bytes_left_decrypt | ||
1385 | mov %rax, (%arg2 , %r11, 1) | ||
1386 | add $8, %r11 | ||
1387 | psrldq $8, %xmm0 | ||
1388 | MOVQ_R64_XMM %xmm0, %rax | ||
1389 | sub $8, %r13 | ||
1390 | _less_than_8_bytes_left_decrypt: | ||
1391 | mov %al, (%arg2, %r11, 1) | ||
1392 | add $1, %r11 | ||
1393 | shr $8, %rax | ||
1394 | sub $1, %r13 | ||
1395 | jne _less_than_8_bytes_left_decrypt | ||
1396 | _multiple_of_16_bytes_decrypt: | ||
1397 | mov arg8, %r12 # %r12 = aadLen (number of bytes) | ||
1398 | shl $3, %r12 # convert into number of bits | ||
1399 | movd %r12d, %xmm15 # len(A) in %xmm15 | ||
1400 | shl $3, %arg4 # len(C) in bits (*8) | ||
1401 | MOVQ_R64_XMM %arg4, %xmm1 | ||
1402 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 | ||
1403 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | ||
1404 | pxor %xmm15, %xmm8 | ||
1405 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1406 | # final GHASH computation | ||
1407 | movdqa SHUF_MASK(%rip), %xmm10 | ||
1408 | PSHUFB_XMM %xmm10, %xmm8 | ||
1409 | |||
1410 | mov %arg5, %rax # %rax = *Y0 | ||
1411 | movdqu (%rax), %xmm0 # %xmm0 = Y0 | ||
1412 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) | ||
1413 | pxor %xmm8, %xmm0 | ||
1414 | _return_T_decrypt: | ||
1415 | mov arg9, %r10 # %r10 = authTag | ||
1416 | mov arg10, %r11 # %r11 = auth_tag_len | ||
1417 | cmp $16, %r11 | ||
1418 | je _T_16_decrypt | ||
1419 | cmp $12, %r11 | ||
1420 | je _T_12_decrypt | ||
1421 | _T_8_decrypt: | ||
1422 | MOVQ_R64_XMM %xmm0, %rax | ||
1423 | mov %rax, (%r10) | ||
1424 | jmp _return_T_done_decrypt | ||
1425 | _T_12_decrypt: | ||
1426 | MOVQ_R64_XMM %xmm0, %rax | ||
1427 | mov %rax, (%r10) | ||
1428 | psrldq $8, %xmm0 | ||
1429 | movd %xmm0, %eax | ||
1430 | mov %eax, 8(%r10) | ||
1431 | jmp _return_T_done_decrypt | ||
1432 | _T_16_decrypt: | ||
1433 | movdqu %xmm0, (%r10) | ||
1434 | _return_T_done_decrypt: | ||
1435 | mov %r14, %rsp | ||
1436 | pop %r14 | ||
1437 | pop %r13 | ||
1438 | pop %r12 | ||
1439 | ret | ||
1440 | |||
1441 | |||
1442 | /***************************************************************************** | ||
1443 | * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | ||
1444 | * u8 *out, // Ciphertext output. Encrypt in-place is allowed. | ||
1445 | * const u8 *in, // Plaintext input | ||
1446 | * u64 plaintext_len, // Length of data in bytes for encryption. | ||
1447 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | ||
1448 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | ||
1449 | * // concatenated with 0x00000001. 16-byte aligned pointer. | ||
1450 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | ||
1451 | * const u8 *aad, // Additional Authentication Data (AAD) | ||
1452 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | ||
1453 | * u8 *auth_tag, // Authenticated Tag output. | ||
1454 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | ||
1455 | * // 12 or 8. | ||
1456 | * | ||
1457 | * Assumptions: | ||
1458 | * | ||
1459 | * keys: | ||
1460 | * keys are pre-expanded and aligned to 16 bytes. we are using the | ||
1461 | * first set of 11 keys in the data structure void *aes_ctx | ||
1462 | * | ||
1463 | * | ||
1464 | * iv: | ||
1465 | * 0 1 2 3 | ||
1466 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1467 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1468 | * | Salt (From the SA) | | ||
1469 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1470 | * | Initialization Vector | | ||
1471 | * | (This is the sequence number from IPSec header) | | ||
1472 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1473 | * | 0x1 | | ||
1474 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1475 | * | ||
1476 | * | ||
1477 | * | ||
1478 | * AAD: | ||
1479 | * AAD padded to 128 bits with 0 | ||
1480 | * for example, assume AAD is a u32 vector | ||
1481 | * | ||
1482 | * if AAD is 8 bytes: | ||
1483 | * AAD[3] = {A0, A1}; | ||
1484 | * padded AAD in xmm register = {A1 A0 0 0} | ||
1485 | * | ||
1486 | * 0 1 2 3 | ||
1487 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1488 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1489 | * | SPI (A1) | | ||
1490 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1491 | * | 32-bit Sequence Number (A0) | | ||
1492 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1493 | * | 0x0 | | ||
1494 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1495 | * | ||
1496 | * AAD Format with 32-bit Sequence Number | ||
1497 | * | ||
1498 | * if AAD is 12 bytes: | ||
1499 | * AAD[3] = {A0, A1, A2}; | ||
1500 | * padded AAD in xmm register = {A2 A1 A0 0} | ||
1501 | * | ||
1502 | * 0 1 2 3 | ||
1503 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1504 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1505 | * | SPI (A2) | | ||
1506 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1507 | * | 64-bit Extended Sequence Number {A1,A0} | | ||
1508 | * | | | ||
1509 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1510 | * | 0x0 | | ||
1511 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1512 | * | ||
1513 | * AAD Format with 64-bit Extended Sequence Number | ||
1514 | * | ||
1515 | * aadLen: | ||
1516 | * from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
1517 | * The code also supports an aadLen of 16; any other size will fail. | ||
1518 | * | ||
1519 | * TLen: | ||
1520 | * from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
1521 | * For other sizes, the code will fail. | ||
1522 | * | ||
1523 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
1524 | ***************************************************************************/ | ||
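A small illustration of the AAD handling described above (hypothetical helper, not part of this patch): the 8- or 12-byte AAD is simply copied into a zeroed 16-byte block before it is fed to GHASH.

    #include <linux/types.h>
    #include <linux/string.h>

    /* Pad an 8- or 12-byte AAD with zeroes to a full 16-byte GHASH block. */
    static void example_pad_aad(u8 block[16], const u8 *aad, unsigned long aad_len)
    {
            memset(block, 0, 16);
            memcpy(block, aad, aad_len);    /* aad_len is 8 or 12 per RFC4106 */
    }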
1525 | ENTRY(aesni_gcm_enc) | ||
1526 | push %r12 | ||
1527 | push %r13 | ||
1528 | push %r14 | ||
1529 | mov %rsp, %r14 | ||
1530 | # | ||
1531 | # states of %xmm registers %xmm6:%xmm15 not saved | ||
1532 | # all %xmm registers are clobbered | ||
1533 | # | ||
1534 | sub $VARIABLE_OFFSET, %rsp | ||
1535 | and $~63, %rsp | ||
1536 | mov %arg6, %r12 | ||
1537 | movdqu (%r12), %xmm13 | ||
1538 | movdqa SHUF_MASK(%rip), %xmm2 | ||
1539 | PSHUFB_XMM %xmm2, %xmm13 | ||
1540 | |||
1541 | |||
1542 | # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) | ||
1543 | |||
1544 | movdqa %xmm13, %xmm2 | ||
1545 | psllq $1, %xmm13 | ||
1546 | psrlq $63, %xmm2 | ||
1547 | movdqa %xmm2, %xmm1 | ||
1548 | pslldq $8, %xmm2 | ||
1549 | psrldq $8, %xmm1 | ||
1550 | por %xmm2, %xmm13 | ||
1551 | |||
1552 | # reduce HashKey<<1 | ||
1553 | |||
1554 | pshufd $0x24, %xmm1, %xmm2 | ||
1555 | pcmpeqd TWOONE(%rip), %xmm2 | ||
1556 | pand POLY(%rip), %xmm2 | ||
1557 | pxor %xmm2, %xmm13 | ||
1558 | movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) | ||
1559 | mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext | ||
1560 | and $-16, %r13 | ||
1561 | mov %r13, %r12 | ||
1562 | |||
1563 | # Encrypt first few blocks | ||
1564 | |||
1565 | and $(3<<4), %r12 | ||
1566 | jz _initial_num_blocks_is_0_encrypt | ||
1567 | cmp $(2<<4), %r12 | ||
1568 | jb _initial_num_blocks_is_1_encrypt | ||
1569 | je _initial_num_blocks_is_2_encrypt | ||
1570 | _initial_num_blocks_is_3_encrypt: | ||
1571 | INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1572 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc | ||
1573 | sub $48, %r13 | ||
1574 | jmp _initial_blocks_encrypted | ||
1575 | _initial_num_blocks_is_2_encrypt: | ||
1576 | INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1577 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc | ||
1578 | sub $32, %r13 | ||
1579 | jmp _initial_blocks_encrypted | ||
1580 | _initial_num_blocks_is_1_encrypt: | ||
1581 | INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1582 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc | ||
1583 | sub $16, %r13 | ||
1584 | jmp _initial_blocks_encrypted | ||
1585 | _initial_num_blocks_is_0_encrypt: | ||
1586 | INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1587 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc | ||
1588 | _initial_blocks_encrypted: | ||
1589 | |||
1590 | # Main loop - Encrypt remaining blocks | ||
1591 | |||
1592 | cmp $0, %r13 | ||
1593 | je _zero_cipher_left_encrypt | ||
1594 | sub $64, %r13 | ||
1595 | je _four_cipher_left_encrypt | ||
1596 | _encrypt_by_4_encrypt: | ||
1597 | GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | ||
1598 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc | ||
1599 | add $64, %r11 | ||
1600 | sub $64, %r13 | ||
1601 | jne _encrypt_by_4_encrypt | ||
1602 | _four_cipher_left_encrypt: | ||
1603 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | ||
1604 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | ||
1605 | _zero_cipher_left_encrypt: | ||
1606 | mov %arg4, %r13 | ||
1607 | and $15, %r13 # %r13 = arg4 (mod 16) | ||
1608 | je _multiple_of_16_bytes_encrypt | ||
1609 | |||
1610 | # Handle the last <16 byte block separately | ||
1611 | paddd ONE(%rip), %xmm0 # INCR CNT to get Yn | ||
1612 | movdqa SHUF_MASK(%rip), %xmm10 | ||
1613 | PSHUFB_XMM %xmm10, %xmm0 | ||
1614 | |||
1615 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) | ||
1616 | sub $16, %r11 | ||
1617 | add %r13, %r11 | ||
1618 | movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block | ||
1619 | lea SHIFT_MASK+16(%rip), %r12 | ||
1620 | sub %r13, %r12 | ||
1621 | # adjust the shuffle mask pointer to be able to shift 16-r13 bytes | ||
1622 | # (%r13 is the number of bytes in plaintext mod 16) | ||
1623 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
1624 | PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes | ||
1625 | pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) | ||
1626 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | ||
1627 | # get the appropriate mask to mask out top 16-r13 bytes of xmm0 | ||
1628 | pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 | ||
1629 | movdqa SHUF_MASK(%rip), %xmm10 | ||
1630 | PSHUFB_XMM %xmm10,%xmm0 | ||
1631 | |||
1632 | pxor %xmm0, %xmm8 | ||
1633 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1634 | # GHASH computation for the last <16 byte block | ||
1635 | sub %r13, %r11 | ||
1636 | add $16, %r11 | ||
1637 | PSHUFB_XMM %xmm10, %xmm1 | ||
1638 | |||
1639 | # shuffle xmm0 back to output as ciphertext | ||
1640 | |||
1641 | # Output %r13 bytes | ||
1642 | MOVQ_R64_XMM %xmm0, %rax | ||
1643 | cmp $8, %r13 | ||
1644 | jle _less_than_8_bytes_left_encrypt | ||
1645 | mov %rax, (%arg2 , %r11, 1) | ||
1646 | add $8, %r11 | ||
1647 | psrldq $8, %xmm0 | ||
1648 | MOVQ_R64_XMM %xmm0, %rax | ||
1649 | sub $8, %r13 | ||
1650 | _less_than_8_bytes_left_encrypt: | ||
1651 | mov %al, (%arg2, %r11, 1) | ||
1652 | add $1, %r11 | ||
1653 | shr $8, %rax | ||
1654 | sub $1, %r13 | ||
1655 | jne _less_than_8_bytes_left_encrypt | ||
1656 | _multiple_of_16_bytes_encrypt: | ||
1657 | mov arg8, %r12 # %r12 = aadLen (number of bytes) | ||
1658 | shl $3, %r12 | ||
1659 | movd %r12d, %xmm15 # len(A) in %xmm15 | ||
1660 | shl $3, %arg4 # len(C) in bits (*8) | ||
1661 | MOVQ_R64_XMM %arg4, %xmm1 | ||
1662 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 | ||
1663 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | ||
1664 | pxor %xmm15, %xmm8 | ||
1665 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1666 | # final GHASH computation | ||
1667 | movdqa SHUF_MASK(%rip), %xmm10 | ||
1668 | PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap | ||
1669 | |||
1670 | mov %arg5, %rax # %rax = *Y0 | ||
1671 | movdqu (%rax), %xmm0 # %xmm0 = Y0 | ||
1672 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) | ||
1673 | pxor %xmm8, %xmm0 | ||
1674 | _return_T_encrypt: | ||
1675 | mov arg9, %r10 # %r10 = authTag | ||
1676 | mov arg10, %r11 # %r11 = auth_tag_len | ||
1677 | cmp $16, %r11 | ||
1678 | je _T_16_encrypt | ||
1679 | cmp $12, %r11 | ||
1680 | je _T_12_encrypt | ||
1681 | _T_8_encrypt: | ||
1682 | MOVQ_R64_XMM %xmm0, %rax | ||
1683 | mov %rax, (%r10) | ||
1684 | jmp _return_T_done_encrypt | ||
1685 | _T_12_encrypt: | ||
1686 | MOVQ_R64_XMM %xmm0, %rax | ||
1687 | mov %rax, (%r10) | ||
1688 | psrldq $8, %xmm0 | ||
1689 | movd %xmm0, %eax | ||
1690 | mov %eax, 8(%r10) | ||
1691 | jmp _return_T_done_encrypt | ||
1692 | _T_16_encrypt: | ||
1693 | movdqu %xmm0, (%r10) | ||
1694 | _return_T_done_encrypt: | ||
1695 | mov %r14, %rsp | ||
1696 | pop %r14 | ||
1697 | pop %r13 | ||
1698 | pop %r12 | ||
1699 | ret | ||
1700 | |||
1701 | #endif | ||
1702 | |||
49 | 1703 | ||
50 | _key_expansion_128: | 1704 | _key_expansion_128: |
51 | _key_expansion_256a: | 1705 | _key_expansion_256a: |
@@ -55,10 +1709,11 @@ _key_expansion_256a: | |||
55 | shufps $0b10001100, %xmm0, %xmm4 | 1709 | shufps $0b10001100, %xmm0, %xmm4 |
56 | pxor %xmm4, %xmm0 | 1710 | pxor %xmm4, %xmm0 |
57 | pxor %xmm1, %xmm0 | 1711 | pxor %xmm1, %xmm0 |
58 | movaps %xmm0, (%rcx) | 1712 | movaps %xmm0, (TKEYP) |
59 | add $0x10, %rcx | 1713 | add $0x10, TKEYP |
60 | ret | 1714 | ret |
61 | 1715 | ||
1716 | .align 4 | ||
62 | _key_expansion_192a: | 1717 | _key_expansion_192a: |
63 | pshufd $0b01010101, %xmm1, %xmm1 | 1718 | pshufd $0b01010101, %xmm1, %xmm1 |
64 | shufps $0b00010000, %xmm0, %xmm4 | 1719 | shufps $0b00010000, %xmm0, %xmm4 |
@@ -76,12 +1731,13 @@ _key_expansion_192a: | |||
76 | 1731 | ||
77 | movaps %xmm0, %xmm1 | 1732 | movaps %xmm0, %xmm1 |
78 | shufps $0b01000100, %xmm0, %xmm6 | 1733 | shufps $0b01000100, %xmm0, %xmm6 |
79 | movaps %xmm6, (%rcx) | 1734 | movaps %xmm6, (TKEYP) |
80 | shufps $0b01001110, %xmm2, %xmm1 | 1735 | shufps $0b01001110, %xmm2, %xmm1 |
81 | movaps %xmm1, 16(%rcx) | 1736 | movaps %xmm1, 0x10(TKEYP) |
82 | add $0x20, %rcx | 1737 | add $0x20, TKEYP |
83 | ret | 1738 | ret |
84 | 1739 | ||
1740 | .align 4 | ||
85 | _key_expansion_192b: | 1741 | _key_expansion_192b: |
86 | pshufd $0b01010101, %xmm1, %xmm1 | 1742 | pshufd $0b01010101, %xmm1, %xmm1 |
87 | shufps $0b00010000, %xmm0, %xmm4 | 1743 | shufps $0b00010000, %xmm0, %xmm4 |
@@ -96,10 +1752,11 @@ _key_expansion_192b: | |||
96 | pxor %xmm3, %xmm2 | 1752 | pxor %xmm3, %xmm2 |
97 | pxor %xmm5, %xmm2 | 1753 | pxor %xmm5, %xmm2 |
98 | 1754 | ||
99 | movaps %xmm0, (%rcx) | 1755 | movaps %xmm0, (TKEYP) |
100 | add $0x10, %rcx | 1756 | add $0x10, TKEYP |
101 | ret | 1757 | ret |
102 | 1758 | ||
1759 | .align 4 | ||
103 | _key_expansion_256b: | 1760 | _key_expansion_256b: |
104 | pshufd $0b10101010, %xmm1, %xmm1 | 1761 | pshufd $0b10101010, %xmm1, %xmm1 |
105 | shufps $0b00010000, %xmm2, %xmm4 | 1762 | shufps $0b00010000, %xmm2, %xmm4 |
@@ -107,8 +1764,8 @@ _key_expansion_256b: | |||
107 | shufps $0b10001100, %xmm2, %xmm4 | 1764 | shufps $0b10001100, %xmm2, %xmm4 |
108 | pxor %xmm4, %xmm2 | 1765 | pxor %xmm4, %xmm2 |
109 | pxor %xmm1, %xmm2 | 1766 | pxor %xmm1, %xmm2 |
110 | movaps %xmm2, (%rcx) | 1767 | movaps %xmm2, (TKEYP) |
111 | add $0x10, %rcx | 1768 | add $0x10, TKEYP |
112 | ret | 1769 | ret |
113 | 1770 | ||
114 | /* | 1771 | /* |
@@ -116,17 +1773,23 @@ _key_expansion_256b: | |||
116 | * unsigned int key_len) | 1773 | * unsigned int key_len) |
117 | */ | 1774 | */ |
118 | ENTRY(aesni_set_key) | 1775 | ENTRY(aesni_set_key) |
119 | movups (%rsi), %xmm0 # user key (first 16 bytes) | 1776 | #ifndef __x86_64__ |
120 | movaps %xmm0, (%rdi) | 1777 | pushl KEYP |
121 | lea 0x10(%rdi), %rcx # key addr | 1778 | movl 8(%esp), KEYP # ctx |
122 | movl %edx, 480(%rdi) | 1779 | movl 12(%esp), UKEYP # in_key |
1780 | movl 16(%esp), %edx # key_len | ||
1781 | #endif | ||
1782 | movups (UKEYP), %xmm0 # user key (first 16 bytes) | ||
1783 | movaps %xmm0, (KEYP) | ||
1784 | lea 0x10(KEYP), TKEYP # key addr | ||
1785 | movl %edx, 480(KEYP) | ||
123 | pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x | 1786 | pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x |
124 | cmp $24, %dl | 1787 | cmp $24, %dl |
125 | jb .Lenc_key128 | 1788 | jb .Lenc_key128 |
126 | je .Lenc_key192 | 1789 | je .Lenc_key192 |
127 | movups 0x10(%rsi), %xmm2 # other user key | 1790 | movups 0x10(UKEYP), %xmm2 # other user key |
128 | movaps %xmm2, (%rcx) | 1791 | movaps %xmm2, (TKEYP) |
129 | add $0x10, %rcx | 1792 | add $0x10, TKEYP |
130 | AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 | 1793 | AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 |
131 | call _key_expansion_256a | 1794 | call _key_expansion_256a |
132 | AESKEYGENASSIST 0x1 %xmm0 %xmm1 | 1795 | AESKEYGENASSIST 0x1 %xmm0 %xmm1 |
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key) | |||
155 | call _key_expansion_256a | 1818 | call _key_expansion_256a |
156 | jmp .Ldec_key | 1819 | jmp .Ldec_key |
157 | .Lenc_key192: | 1820 | .Lenc_key192: |
158 | movq 0x10(%rsi), %xmm2 # other user key | 1821 | movq 0x10(UKEYP), %xmm2 # other user key |
159 | AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 | 1822 | AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 |
160 | call _key_expansion_192a | 1823 | call _key_expansion_192a |
161 | AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 | 1824 | AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 |
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key) | |||
195 | AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 | 1858 | AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 |
196 | call _key_expansion_128 | 1859 | call _key_expansion_128 |
197 | .Ldec_key: | 1860 | .Ldec_key: |
198 | sub $0x10, %rcx | 1861 | sub $0x10, TKEYP |
199 | movaps (%rdi), %xmm0 | 1862 | movaps (KEYP), %xmm0 |
200 | movaps (%rcx), %xmm1 | 1863 | movaps (TKEYP), %xmm1 |
201 | movaps %xmm0, 240(%rcx) | 1864 | movaps %xmm0, 240(TKEYP) |
202 | movaps %xmm1, 240(%rdi) | 1865 | movaps %xmm1, 240(KEYP) |
203 | add $0x10, %rdi | 1866 | add $0x10, KEYP |
204 | lea 240-16(%rcx), %rsi | 1867 | lea 240-16(TKEYP), UKEYP |
205 | .align 4 | 1868 | .align 4 |
206 | .Ldec_key_loop: | 1869 | .Ldec_key_loop: |
207 | movaps (%rdi), %xmm0 | 1870 | movaps (KEYP), %xmm0 |
208 | AESIMC %xmm0 %xmm1 | 1871 | AESIMC %xmm0 %xmm1 |
209 | movaps %xmm1, (%rsi) | 1872 | movaps %xmm1, (UKEYP) |
210 | add $0x10, %rdi | 1873 | add $0x10, KEYP |
211 | sub $0x10, %rsi | 1874 | sub $0x10, UKEYP |
212 | cmp %rcx, %rdi | 1875 | cmp TKEYP, KEYP |
213 | jb .Ldec_key_loop | 1876 | jb .Ldec_key_loop |
214 | xor %rax, %rax | 1877 | xor AREG, AREG |
1878 | #ifndef __x86_64__ | ||
1879 | popl KEYP | ||
1880 | #endif | ||
215 | ret | 1881 | ret |
216 | 1882 | ||
217 | /* | 1883 | /* |
218 | * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) | 1884 | * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) |
219 | */ | 1885 | */ |
220 | ENTRY(aesni_enc) | 1886 | ENTRY(aesni_enc) |
1887 | #ifndef __x86_64__ | ||
1888 | pushl KEYP | ||
1889 | pushl KLEN | ||
1890 | movl 12(%esp), KEYP | ||
1891 | movl 16(%esp), OUTP | ||
1892 | movl 20(%esp), INP | ||
1893 | #endif | ||
221 | movl 480(KEYP), KLEN # key length | 1894 | movl 480(KEYP), KLEN # key length |
222 | movups (INP), STATE # input | 1895 | movups (INP), STATE # input |
223 | call _aesni_enc1 | 1896 | call _aesni_enc1 |
224 | movups STATE, (OUTP) # output | 1897 | movups STATE, (OUTP) # output |
1898 | #ifndef __x86_64__ | ||
1899 | popl KLEN | ||
1900 | popl KEYP | ||
1901 | #endif | ||
225 | ret | 1902 | ret |
226 | 1903 | ||
227 | /* | 1904 | /* |
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc) | |||
236 | * KEY | 1913 | * KEY |
237 | * TKEYP (T1) | 1914 | * TKEYP (T1) |
238 | */ | 1915 | */ |
1916 | .align 4 | ||
239 | _aesni_enc1: | 1917 | _aesni_enc1: |
240 | movaps (KEYP), KEY # key | 1918 | movaps (KEYP), KEY # key |
241 | mov KEYP, TKEYP | 1919 | mov KEYP, TKEYP |
@@ -298,6 +1976,7 @@ _aesni_enc1: | |||
298 | * KEY | 1976 | * KEY |
299 | * TKEYP (T1) | 1977 | * TKEYP (T1) |
300 | */ | 1978 | */ |
1979 | .align 4 | ||
301 | _aesni_enc4: | 1980 | _aesni_enc4: |
302 | movaps (KEYP), KEY # key | 1981 | movaps (KEYP), KEY # key |
303 | mov KEYP, TKEYP | 1982 | mov KEYP, TKEYP |
@@ -391,11 +2070,22 @@ _aesni_enc4: | |||
391 | * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) | 2070 | * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) |
392 | */ | 2071 | */ |
393 | ENTRY(aesni_dec) | 2072 | ENTRY(aesni_dec) |
2073 | #ifndef __x86_64__ | ||
2074 | pushl KEYP | ||
2075 | pushl KLEN | ||
2076 | movl 12(%esp), KEYP | ||
2077 | movl 16(%esp), OUTP | ||
2078 | movl 20(%esp), INP | ||
2079 | #endif | ||
394 | mov 480(KEYP), KLEN # key length | 2080 | mov 480(KEYP), KLEN # key length |
395 | add $240, KEYP | 2081 | add $240, KEYP |
396 | movups (INP), STATE # input | 2082 | movups (INP), STATE # input |
397 | call _aesni_dec1 | 2083 | call _aesni_dec1 |
398 | movups STATE, (OUTP) #output | 2084 | movups STATE, (OUTP) #output |
2085 | #ifndef __x86_64__ | ||
2086 | popl KLEN | ||
2087 | popl KEYP | ||
2088 | #endif | ||
399 | ret | 2089 | ret |
400 | 2090 | ||
401 | /* | 2091 | /* |
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec) | |||
410 | * KEY | 2100 | * KEY |
411 | * TKEYP (T1) | 2101 | * TKEYP (T1) |
412 | */ | 2102 | */ |
2103 | .align 4 | ||
413 | _aesni_dec1: | 2104 | _aesni_dec1: |
414 | movaps (KEYP), KEY # key | 2105 | movaps (KEYP), KEY # key |
415 | mov KEYP, TKEYP | 2106 | mov KEYP, TKEYP |
@@ -472,6 +2163,7 @@ _aesni_dec1: | |||
472 | * KEY | 2163 | * KEY |
473 | * TKEYP (T1) | 2164 | * TKEYP (T1) |
474 | */ | 2165 | */ |
2166 | .align 4 | ||
475 | _aesni_dec4: | 2167 | _aesni_dec4: |
476 | movaps (KEYP), KEY # key | 2168 | movaps (KEYP), KEY # key |
477 | mov KEYP, TKEYP | 2169 | mov KEYP, TKEYP |
@@ -566,6 +2258,15 @@ _aesni_dec4: | |||
566 | * size_t len) | 2258 | * size_t len) |
567 | */ | 2259 | */ |
568 | ENTRY(aesni_ecb_enc) | 2260 | ENTRY(aesni_ecb_enc) |
2261 | #ifndef __x86_64__ | ||
2262 | pushl LEN | ||
2263 | pushl KEYP | ||
2264 | pushl KLEN | ||
2265 | movl 16(%esp), KEYP | ||
2266 | movl 20(%esp), OUTP | ||
2267 | movl 24(%esp), INP | ||
2268 | movl 28(%esp), LEN | ||
2269 | #endif | ||
569 | test LEN, LEN # check length | 2270 | test LEN, LEN # check length |
570 | jz .Lecb_enc_ret | 2271 | jz .Lecb_enc_ret |
571 | mov 480(KEYP), KLEN | 2272 | mov 480(KEYP), KLEN |
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc) | |||
602 | cmp $16, LEN | 2303 | cmp $16, LEN |
603 | jge .Lecb_enc_loop1 | 2304 | jge .Lecb_enc_loop1 |
604 | .Lecb_enc_ret: | 2305 | .Lecb_enc_ret: |
2306 | #ifndef __x86_64__ | ||
2307 | popl KLEN | ||
2308 | popl KEYP | ||
2309 | popl LEN | ||
2310 | #endif | ||
605 | ret | 2311 | ret |
606 | 2312 | ||
607 | /* | 2313 | /* |
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc) | |||
609 | * size_t len); | 2315 | * size_t len); |
610 | */ | 2316 | */ |
611 | ENTRY(aesni_ecb_dec) | 2317 | ENTRY(aesni_ecb_dec) |
2318 | #ifndef __x86_64__ | ||
2319 | pushl LEN | ||
2320 | pushl KEYP | ||
2321 | pushl KLEN | ||
2322 | movl 16(%esp), KEYP | ||
2323 | movl 20(%esp), OUTP | ||
2324 | movl 24(%esp), INP | ||
2325 | movl 28(%esp), LEN | ||
2326 | #endif | ||
612 | test LEN, LEN | 2327 | test LEN, LEN |
613 | jz .Lecb_dec_ret | 2328 | jz .Lecb_dec_ret |
614 | mov 480(KEYP), KLEN | 2329 | mov 480(KEYP), KLEN |
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec) | |||
646 | cmp $16, LEN | 2361 | cmp $16, LEN |
647 | jge .Lecb_dec_loop1 | 2362 | jge .Lecb_dec_loop1 |
648 | .Lecb_dec_ret: | 2363 | .Lecb_dec_ret: |
2364 | #ifndef __x86_64__ | ||
2365 | popl KLEN | ||
2366 | popl KEYP | ||
2367 | popl LEN | ||
2368 | #endif | ||
649 | ret | 2369 | ret |
650 | 2370 | ||
651 | /* | 2371 | /* |
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec) | |||
653 | * size_t len, u8 *iv) | 2373 | * size_t len, u8 *iv) |
654 | */ | 2374 | */ |
655 | ENTRY(aesni_cbc_enc) | 2375 | ENTRY(aesni_cbc_enc) |
2376 | #ifndef __x86_64__ | ||
2377 | pushl IVP | ||
2378 | pushl LEN | ||
2379 | pushl KEYP | ||
2380 | pushl KLEN | ||
2381 | movl 20(%esp), KEYP | ||
2382 | movl 24(%esp), OUTP | ||
2383 | movl 28(%esp), INP | ||
2384 | movl 32(%esp), LEN | ||
2385 | movl 36(%esp), IVP | ||
2386 | #endif | ||
656 | cmp $16, LEN | 2387 | cmp $16, LEN |
657 | jb .Lcbc_enc_ret | 2388 | jb .Lcbc_enc_ret |
658 | mov 480(KEYP), KLEN | 2389 | mov 480(KEYP), KLEN |
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc) | |||
670 | jge .Lcbc_enc_loop | 2401 | jge .Lcbc_enc_loop |
671 | movups STATE, (IVP) | 2402 | movups STATE, (IVP) |
672 | .Lcbc_enc_ret: | 2403 | .Lcbc_enc_ret: |
2404 | #ifndef __x86_64__ | ||
2405 | popl KLEN | ||
2406 | popl KEYP | ||
2407 | popl LEN | ||
2408 | popl IVP | ||
2409 | #endif | ||
673 | ret | 2410 | ret |
674 | 2411 | ||
675 | /* | 2412 | /* |
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc) | |||
677 | * size_t len, u8 *iv) | 2414 | * size_t len, u8 *iv) |
678 | */ | 2415 | */ |
679 | ENTRY(aesni_cbc_dec) | 2416 | ENTRY(aesni_cbc_dec) |
2417 | #ifndef __x86_64__ | ||
2418 | pushl IVP | ||
2419 | pushl LEN | ||
2420 | pushl KEYP | ||
2421 | pushl KLEN | ||
2422 | movl 20(%esp), KEYP | ||
2423 | movl 24(%esp), OUTP | ||
2424 | movl 28(%esp), INP | ||
2425 | movl 32(%esp), LEN | ||
2426 | movl 36(%esp), IVP | ||
2427 | #endif | ||
680 | cmp $16, LEN | 2428 | cmp $16, LEN |
681 | jb .Lcbc_dec_just_ret | 2429 | jb .Lcbc_dec_just_ret |
682 | mov 480(KEYP), KLEN | 2430 | mov 480(KEYP), KLEN |
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec) | |||
690 | movaps IN1, STATE1 | 2438 | movaps IN1, STATE1 |
691 | movups 0x10(INP), IN2 | 2439 | movups 0x10(INP), IN2 |
692 | movaps IN2, STATE2 | 2440 | movaps IN2, STATE2 |
2441 | #ifdef __x86_64__ | ||
693 | movups 0x20(INP), IN3 | 2442 | movups 0x20(INP), IN3 |
694 | movaps IN3, STATE3 | 2443 | movaps IN3, STATE3 |
695 | movups 0x30(INP), IN4 | 2444 | movups 0x30(INP), IN4 |
696 | movaps IN4, STATE4 | 2445 | movaps IN4, STATE4 |
2446 | #else | ||
2447 | movups 0x20(INP), IN1 | ||
2448 | movaps IN1, STATE3 | ||
2449 | movups 0x30(INP), IN2 | ||
2450 | movaps IN2, STATE4 | ||
2451 | #endif | ||
697 | call _aesni_dec4 | 2452 | call _aesni_dec4 |
698 | pxor IV, STATE1 | 2453 | pxor IV, STATE1 |
2454 | #ifdef __x86_64__ | ||
699 | pxor IN1, STATE2 | 2455 | pxor IN1, STATE2 |
700 | pxor IN2, STATE3 | 2456 | pxor IN2, STATE3 |
701 | pxor IN3, STATE4 | 2457 | pxor IN3, STATE4 |
702 | movaps IN4, IV | 2458 | movaps IN4, IV |
2459 | #else | ||
2460 | pxor (INP), STATE2 | ||
2461 | pxor 0x10(INP), STATE3 | ||
2462 | pxor IN1, STATE4 | ||
2463 | movaps IN2, IV | ||
2464 | #endif | ||
703 | movups STATE1, (OUTP) | 2465 | movups STATE1, (OUTP) |
704 | movups STATE2, 0x10(OUTP) | 2466 | movups STATE2, 0x10(OUTP) |
705 | movups STATE3, 0x20(OUTP) | 2467 | movups STATE3, 0x20(OUTP) |
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec) | |||
727 | .Lcbc_dec_ret: | 2489 | .Lcbc_dec_ret: |
728 | movups IV, (IVP) | 2490 | movups IV, (IVP) |
729 | .Lcbc_dec_just_ret: | 2491 | .Lcbc_dec_just_ret: |
2492 | #ifndef __x86_64__ | ||
2493 | popl KLEN | ||
2494 | popl KEYP | ||
2495 | popl LEN | ||
2496 | popl IVP | ||
2497 | #endif | ||
730 | ret | 2498 | ret |
731 | 2499 | ||
2500 | #ifdef __x86_64__ | ||
732 | .align 16 | 2501 | .align 16 |
733 | .Lbswap_mask: | 2502 | .Lbswap_mask: |
734 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 2503 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec) | |||
744 | * INC: == 1, in little endian | 2513 | * INC: == 1, in little endian |
745 | * BSWAP_MASK == endian swapping mask | 2514 | * BSWAP_MASK == endian swapping mask |
746 | */ | 2515 | */ |
2516 | .align 4 | ||
747 | _aesni_inc_init: | 2517 | _aesni_inc_init: |
748 | movaps .Lbswap_mask, BSWAP_MASK | 2518 | movaps .Lbswap_mask, BSWAP_MASK |
749 | movaps IV, CTR | 2519 | movaps IV, CTR |
@@ -768,6 +2538,7 @@ _aesni_inc_init: | |||
768 | * CTR: == output IV, in little endian | 2538 | * CTR: == output IV, in little endian |
769 | * TCTR_LOW: == lower qword of CTR | 2539 | * TCTR_LOW: == lower qword of CTR |
770 | */ | 2540 | */ |
2541 | .align 4 | ||
771 | _aesni_inc: | 2542 | _aesni_inc: |
772 | paddq INC, CTR | 2543 | paddq INC, CTR |
773 | add $1, TCTR_LOW | 2544 | add $1, TCTR_LOW |
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc) | |||
839 | movups IV, (IVP) | 2610 | movups IV, (IVP) |
840 | .Lctr_enc_just_ret: | 2611 | .Lctr_enc_just_ret: |
841 | ret | 2612 | ret |
2613 | #endif | ||
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2cb3dcc4490a..e1e60c7d5813 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c | |||
@@ -5,6 +5,14 @@ | |||
5 | * Copyright (C) 2008, Intel Corp. | 5 | * Copyright (C) 2008, Intel Corp. |
6 | * Author: Huang Ying <ying.huang@intel.com> | 6 | * Author: Huang Ying <ying.huang@intel.com> |
7 | * | 7 | * |
8 | * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD | ||
9 | * interface for 64-bit kernels. | ||
10 | * Authors: Adrian Hoban <adrian.hoban@intel.com> | ||
11 | * Gabriele Paoloni <gabriele.paoloni@intel.com> | ||
12 | * Tadeusz Struk (tadeusz.struk@intel.com) | ||
13 | * Aidan O'Mahony (aidan.o.mahony@intel.com) | ||
14 | * Copyright (c) 2010, Intel Corporation. | ||
15 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | 16 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by | 17 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or | 18 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +29,10 @@ | |||
21 | #include <crypto/ctr.h> | 29 | #include <crypto/ctr.h> |
22 | #include <asm/i387.h> | 30 | #include <asm/i387.h> |
23 | #include <asm/aes.h> | 31 | #include <asm/aes.h> |
32 | #include <crypto/scatterwalk.h> | ||
33 | #include <crypto/internal/aead.h> | ||
34 | #include <linux/workqueue.h> | ||
35 | #include <linux/spinlock.h> | ||
24 | 36 | ||
25 | #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) | 37 | #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) |
26 | #define HAS_CTR | 38 | #define HAS_CTR |
@@ -42,8 +54,31 @@ struct async_aes_ctx { | |||
42 | struct cryptd_ablkcipher *cryptd_tfm; | 54 | struct cryptd_ablkcipher *cryptd_tfm; |
43 | }; | 55 | }; |
44 | 56 | ||
45 | #define AESNI_ALIGN 16 | 57 | /* This data is stored at the end of the crypto_tfm struct. |
58 | * It's a type of per "session" data storage location. | ||
59 | * This needs to be 16 byte aligned. | ||
60 | */ | ||
61 | struct aesni_rfc4106_gcm_ctx { | ||
62 | u8 hash_subkey[16]; | ||
63 | struct crypto_aes_ctx aes_key_expanded; | ||
64 | u8 nonce[4]; | ||
65 | struct cryptd_aead *cryptd_tfm; | ||
66 | }; | ||
67 | |||
68 | struct aesni_gcm_set_hash_subkey_result { | ||
69 | int err; | ||
70 | struct completion completion; | ||
71 | }; | ||
72 | |||
73 | struct aesni_hash_subkey_req_data { | ||
74 | u8 iv[16]; | ||
75 | struct aesni_gcm_set_hash_subkey_result result; | ||
76 | struct scatterlist sg; | ||
77 | }; | ||
78 | |||
79 | #define AESNI_ALIGN (16) | ||
46 | #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) | 80 | #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) |
81 | #define RFC4106_HASH_SUBKEY_SIZE 16 | ||
47 | 82 | ||
48 | asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, | 83 | asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, |
49 | unsigned int key_len); | 84 | unsigned int key_len); |
@@ -59,9 +94,62 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, | |||
59 | const u8 *in, unsigned int len, u8 *iv); | 94 | const u8 *in, unsigned int len, u8 *iv); |
60 | asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, | 95 | asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, |
61 | const u8 *in, unsigned int len, u8 *iv); | 96 | const u8 *in, unsigned int len, u8 *iv); |
97 | #ifdef CONFIG_X86_64 | ||
62 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, | 98 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, |
63 | const u8 *in, unsigned int len, u8 *iv); | 99 | const u8 *in, unsigned int len, u8 *iv); |
64 | 100 | ||
101 | /* asmlinkage void aesni_gcm_enc() | ||
102 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. | ||
103 | * u8 *out, Ciphertext output. Encrypt in-place is allowed. | ||
104 | * const u8 *in, Plaintext input | ||
105 | * unsigned long plaintext_len, Length of data in bytes for encryption. | ||
106 | * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) | ||
107 | * concatenated with 8 byte Initialisation Vector (from IPSec ESP | ||
108 | * Payload) concatenated with 0x00000001. 16-byte aligned pointer. | ||
109 | * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. | ||
110 | * const u8 *aad, Additional Authentication Data (AAD) | ||
111 | * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this | ||
112 | * is going to be 8 or 12 bytes | ||
113 | * u8 *auth_tag, Authenticated Tag output. | ||
114 | * unsigned long auth_tag_len), Authenticated Tag Length in bytes. | ||
115 | * Valid values are 16 (most likely), 12 or 8. | ||
116 | */ | ||
117 | asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, | ||
118 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
119 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
120 | u8 *auth_tag, unsigned long auth_tag_len); | ||
121 | |||
122 | /* asmlinkage void aesni_gcm_dec() | ||
123 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. | ||
124 | * u8 *out, Plaintext output. Decrypt in-place is allowed. | ||
125 | * const u8 *in, Ciphertext input | ||
126 | * unsigned long ciphertext_len, Length of data in bytes for decryption. | ||
127 | * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) | ||
128 | * concatenated with 8 byte Initialisation Vector (from IPSec ESP | ||
129 | * Payload) concatenated with 0x00000001. 16-byte aligned pointer. | ||
130 | * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. | ||
131 | * const u8 *aad, Additional Authentication Data (AAD) | ||
132 | * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going | ||
133 | * to be 8 or 12 bytes | ||
134 | * u8 *auth_tag, Authenticated Tag output. | ||
135 | * unsigned long auth_tag_len) Authenticated Tag Length in bytes. | ||
136 | * Valid values are 16 (most likely), 12 or 8. | ||
137 | */ | ||
138 | asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, | ||
139 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
140 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
141 | u8 *auth_tag, unsigned long auth_tag_len); | ||
142 | |||
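A minimal sketch of how these two entry points are meant to be driven from C; the wrapper and its argument names below are hypothetical, and the real callers live elsewhere in this file. The assembly clobbers XMM state, so every call has to be bracketed by kernel_fpu_begin()/kernel_fpu_end() (available here via <asm/i387.h>):

    static void example_rfc4106_gcm_encrypt(void *aes_ctx, u8 *dst, const u8 *src,
                                            unsigned long len, u8 *iv,
                                            u8 *hash_subkey, const u8 *assoc,
                                            unsigned long assoc_len,
                                            u8 *auth_tag, unsigned long tag_len)
    {
            kernel_fpu_begin();     /* the asm clobbers %xmm0..%xmm15 */
            aesni_gcm_enc(aes_ctx, dst, src, len, iv, hash_subkey,
                          assoc, assoc_len, auth_tag, tag_len);
            kernel_fpu_end();
    }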
143 | static inline struct | ||
144 | aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) | ||
145 | { | ||
146 | return | ||
147 | (struct aesni_rfc4106_gcm_ctx *) | ||
148 | PTR_ALIGN((u8 *) | ||
149 | crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); | ||
150 | } | ||
151 | #endif | ||
152 | |||
65 | static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) | 153 | static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) |
66 | { | 154 | { |
67 | unsigned long addr = (unsigned long)raw_ctx; | 155 | unsigned long addr = (unsigned long)raw_ctx; |
@@ -324,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = { | |||
324 | }, | 412 | }, |
325 | }; | 413 | }; |
326 | 414 | ||
415 | #ifdef CONFIG_X86_64 | ||
327 | static void ctr_crypt_final(struct crypto_aes_ctx *ctx, | 416 | static void ctr_crypt_final(struct crypto_aes_ctx *ctx, |
328 | struct blkcipher_walk *walk) | 417 | struct blkcipher_walk *walk) |
329 | { | 418 | { |
@@ -389,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = { | |||
389 | }, | 478 | }, |
390 | }, | 479 | }, |
391 | }; | 480 | }; |
481 | #endif | ||
392 | 482 | ||
393 | static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, | 483 | static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, |
394 | unsigned int key_len) | 484 | unsigned int key_len) |
@@ -536,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = { | |||
536 | }, | 626 | }, |
537 | }; | 627 | }; |
538 | 628 | ||
629 | #ifdef CONFIG_X86_64 | ||
539 | static int ablk_ctr_init(struct crypto_tfm *tfm) | 630 | static int ablk_ctr_init(struct crypto_tfm *tfm) |
540 | { | 631 | { |
541 | struct cryptd_ablkcipher *cryptd_tfm; | 632 | struct cryptd_ablkcipher *cryptd_tfm; |
@@ -612,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = { | |||
612 | }, | 703 | }, |
613 | }; | 704 | }; |
614 | #endif | 705 | #endif |
706 | #endif | ||
615 | 707 | ||
616 | #ifdef HAS_LRW | 708 | #ifdef HAS_LRW |
617 | static int ablk_lrw_init(struct crypto_tfm *tfm) | 709 | static int ablk_lrw_init(struct crypto_tfm *tfm) |
@@ -730,6 +822,424 @@ static struct crypto_alg ablk_xts_alg = { | |||
730 | }; | 822 | }; |
731 | #endif | 823 | #endif |
732 | 824 | ||
825 | #ifdef CONFIG_X86_64 | ||
826 | static int rfc4106_init(struct crypto_tfm *tfm) | ||
827 | { | ||
828 | struct cryptd_aead *cryptd_tfm; | ||
829 | struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) | ||
830 | PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); | ||
831 | cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); | ||
832 | if (IS_ERR(cryptd_tfm)) | ||
833 | return PTR_ERR(cryptd_tfm); | ||
834 | ctx->cryptd_tfm = cryptd_tfm; | ||
835 | tfm->crt_aead.reqsize = sizeof(struct aead_request) | ||
836 | + crypto_aead_reqsize(&cryptd_tfm->base); | ||
837 | return 0; | ||
838 | } | ||
839 | |||
840 | static void rfc4106_exit(struct crypto_tfm *tfm) | ||
841 | { | ||
842 | struct aesni_rfc4106_gcm_ctx *ctx = | ||
843 | (struct aesni_rfc4106_gcm_ctx *) | ||
844 | PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); | ||
845 | if (!IS_ERR(ctx->cryptd_tfm)) | ||
846 | cryptd_free_aead(ctx->cryptd_tfm); | ||
847 | return; | ||
848 | } | ||
849 | |||
850 | static void | ||
851 | rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err) | ||
852 | { | ||
853 | struct aesni_gcm_set_hash_subkey_result *result = req->data; | ||
854 | |||
855 | if (err == -EINPROGRESS) | ||
856 | return; | ||
857 | result->err = err; | ||
858 | complete(&result->completion); | ||
859 | } | ||
860 | |||
861 | static int | ||
862 | rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) | ||
863 | { | ||
864 | struct crypto_ablkcipher *ctr_tfm; | ||
865 | struct ablkcipher_request *req; | ||
866 | int ret = -EINVAL; | ||
867 | struct aesni_hash_subkey_req_data *req_data; | ||
868 | |||
869 | ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0); | ||
870 | if (IS_ERR(ctr_tfm)) | ||
871 | return PTR_ERR(ctr_tfm); | ||
872 | |||
873 | crypto_ablkcipher_clear_flags(ctr_tfm, ~0); | ||
874 | |||
875 | ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); | ||
876 | if (ret) { | ||
877 | crypto_free_ablkcipher(ctr_tfm); | ||
878 | return ret; | ||
879 | } | ||
880 | |||
881 | req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); | ||
882 | if (!req) { | ||
883 | crypto_free_ablkcipher(ctr_tfm); | ||
884 | return -EINVAL; | ||
885 | } | ||
886 | |||
887 | req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); | ||
888 | if (!req_data) { | ||
889 | crypto_free_ablkcipher(ctr_tfm); | ||
890 | return -ENOMEM; | ||
891 | } | ||
892 | memset(req_data->iv, 0, sizeof(req_data->iv)); | ||
893 | |||
894 | /* Clear the data in the hash sub key container to zero.*/ | ||
895 | /* We want to cipher all zeros to create the hash sub key. */ | ||
896 | memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); | ||
897 | |||
898 | init_completion(&req_data->result.completion); | ||
899 | sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE); | ||
900 | ablkcipher_request_set_tfm(req, ctr_tfm); | ||
901 | ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | | ||
902 | CRYPTO_TFM_REQ_MAY_BACKLOG, | ||
903 | rfc4106_set_hash_subkey_done, | ||
904 | &req_data->result); | ||
905 | |||
906 | ablkcipher_request_set_crypt(req, &req_data->sg, | ||
907 | &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv); | ||
908 | |||
909 | ret = crypto_ablkcipher_encrypt(req); | ||
910 | if (ret == -EINPROGRESS || ret == -EBUSY) { | ||
911 | ret = wait_for_completion_interruptible | ||
912 | (&req_data->result.completion); | ||
913 | if (!ret) | ||
914 | ret = req_data->result.err; | ||
915 | } | ||
916 | ablkcipher_request_free(req); | ||
917 | kfree(req_data); | ||
918 | crypto_free_ablkcipher(ctr_tfm); | ||
919 | return ret; | ||
920 | } | ||
921 | |||
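For reference, the value derived above is just the GCM hash subkey H = AES_K(0^128): the routine ciphers a block of zeros under the AES key, going through the asynchronous "ctr(aes)" transform so an accelerated implementation can be used. A minimal sketch with a plain synchronous cipher (hypothetical helper, not part of this patch) would be:

/* Illustrative only: H = E_K(0^128), derived with a synchronous cipher. */
static int example_derive_hash_subkey(u8 *hash_subkey, const u8 *key,
				      unsigned int key_len)
{
	struct crypto_cipher *aes;
	int ret;

	aes = crypto_alloc_cipher("aes", 0, 0);
	if (IS_ERR(aes))
		return PTR_ERR(aes);

	ret = crypto_cipher_setkey(aes, key, key_len);
	if (!ret) {
		memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
		crypto_cipher_encrypt_one(aes, hash_subkey, hash_subkey);
	}
	crypto_free_cipher(aes);
	return ret;
}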
922 | static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, | ||
923 | unsigned int key_len) | ||
924 | { | ||
925 | int ret = 0; | ||
926 | struct crypto_tfm *tfm = crypto_aead_tfm(parent); | ||
927 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | ||
928 | u8 *new_key_mem = NULL; | ||
929 | |||
930 | if (key_len < 4) { | ||
931 | crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
932 | return -EINVAL; | ||
933 | } | ||
934 | /*Account for 4 byte nonce at the end.*/ | ||
935 | key_len -= 4; | ||
936 | if (key_len != AES_KEYSIZE_128) { | ||
937 | crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
938 | return -EINVAL; | ||
939 | } | ||
940 | |||
941 | memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); | ||
942 | /*This must be on a 16 byte boundary!*/ | ||
943 | if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) | ||
944 | return -EINVAL; | ||
945 | |||
946 | if ((unsigned long)key % AESNI_ALIGN) { | ||
947 | /*key is not aligned: use an auxiliary aligned pointer*/ | ||
948 | new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); | ||
949 | if (!new_key_mem) | ||
950 | return -ENOMEM; | ||
951 | |||
952 | new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); | ||
953 | memcpy(new_key_mem, key, key_len); | ||
954 | key = new_key_mem; | ||
955 | } | ||
956 | |||
957 | if (!irq_fpu_usable()) | ||
958 | ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), | ||
959 | key, key_len); | ||
960 | else { | ||
961 | kernel_fpu_begin(); | ||
962 | ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); | ||
963 | kernel_fpu_end(); | ||
964 | } | ||
965 | /*This must be on a 16 byte boundary!*/ | ||
966 | if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { | ||
967 | ret = -EINVAL; | ||
968 | goto exit; | ||
969 | } | ||
970 | ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); | ||
971 | exit: | ||
972 | kfree(new_key_mem); | ||
973 | return ret; | ||
974 | } | ||
975 | |||
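Note the key blob accepted by this setkey handler is the AES key with a 4-byte salt appended (the RFC 4106 layout), so a caller passes key_len = 20 for AES-128; the salt ends up in ctx->nonce and the remaining bytes are expanded. Purely as an illustration of that layout (not a structure used by the patch):

/* Hypothetical view of the rfc4106 key blob expected by the setkey above. */
struct example_rfc4106_key_blob {
	u8 aes_key[16];		/* expanded via aesni_set_key()/crypto_aes_expand_key() */
	u8 salt[4];		/* copied into ctx->nonce, becomes IV bytes 0-3 */
};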
976 | /* This is the Integrity Check Value (aka the authentication tag) length. It can | ||
977 | * be 8, 12 or 16 bytes long. */ | ||
978 | static int rfc4106_set_authsize(struct crypto_aead *parent, | ||
979 | unsigned int authsize) | ||
980 | { | ||
981 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | ||
982 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
983 | |||
984 | switch (authsize) { | ||
985 | case 8: | ||
986 | case 12: | ||
987 | case 16: | ||
988 | break; | ||
989 | default: | ||
990 | return -EINVAL; | ||
991 | } | ||
992 | crypto_aead_crt(parent)->authsize = authsize; | ||
993 | crypto_aead_crt(cryptd_child)->authsize = authsize; | ||
994 | return 0; | ||
995 | } | ||
996 | |||
997 | static int rfc4106_encrypt(struct aead_request *req) | ||
998 | { | ||
999 | int ret; | ||
1000 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1001 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1002 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
1003 | |||
1004 | if (!irq_fpu_usable()) { | ||
1005 | struct aead_request *cryptd_req = | ||
1006 | (struct aead_request *) aead_request_ctx(req); | ||
1007 | memcpy(cryptd_req, req, sizeof(*req)); | ||
1008 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1009 | return crypto_aead_encrypt(cryptd_req); | ||
1010 | } else { | ||
1011 | kernel_fpu_begin(); | ||
1012 | ret = cryptd_child->base.crt_aead.encrypt(req); | ||
1013 | kernel_fpu_end(); | ||
1014 | return ret; | ||
1015 | } | ||
1016 | } | ||
1017 | |||
1018 | static int rfc4106_decrypt(struct aead_request *req) | ||
1019 | { | ||
1020 | int ret; | ||
1021 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1022 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1023 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
1024 | |||
1025 | if (!irq_fpu_usable()) { | ||
1026 | struct aead_request *cryptd_req = | ||
1027 | (struct aead_request *) aead_request_ctx(req); | ||
1028 | memcpy(cryptd_req, req, sizeof(*req)); | ||
1029 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1030 | return crypto_aead_decrypt(cryptd_req); | ||
1031 | } else { | ||
1032 | kernel_fpu_begin(); | ||
1033 | ret = cryptd_child->base.crt_aead.decrypt(req); | ||
1034 | kernel_fpu_end(); | ||
1035 | return ret; | ||
1036 | } | ||
1037 | } | ||
1038 | |||
1039 | static struct crypto_alg rfc4106_alg = { | ||
1040 | .cra_name = "rfc4106(gcm(aes))", | ||
1041 | .cra_driver_name = "rfc4106-gcm-aesni", | ||
1042 | .cra_priority = 400, | ||
1043 | .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, | ||
1044 | .cra_blocksize = 1, | ||
1045 | .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, | ||
1046 | .cra_alignmask = 0, | ||
1047 | .cra_type = &crypto_nivaead_type, | ||
1048 | .cra_module = THIS_MODULE, | ||
1049 | .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list), | ||
1050 | .cra_init = rfc4106_init, | ||
1051 | .cra_exit = rfc4106_exit, | ||
1052 | .cra_u = { | ||
1053 | .aead = { | ||
1054 | .setkey = rfc4106_set_key, | ||
1055 | .setauthsize = rfc4106_set_authsize, | ||
1056 | .encrypt = rfc4106_encrypt, | ||
1057 | .decrypt = rfc4106_decrypt, | ||
1058 | .geniv = "seqiv", | ||
1059 | .ivsize = 8, | ||
1060 | .maxauthsize = 16, | ||
1061 | }, | ||
1062 | }, | ||
1063 | }; | ||
1064 | |||
1065 | static int __driver_rfc4106_encrypt(struct aead_request *req) | ||
1066 | { | ||
1067 | u8 one_entry_in_sg = 0; | ||
1068 | u8 *src, *dst, *assoc; | ||
1069 | __be32 counter = cpu_to_be32(1); | ||
1070 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1071 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1072 | void *aes_ctx = &(ctx->aes_key_expanded); | ||
1073 | unsigned long auth_tag_len = crypto_aead_authsize(tfm); | ||
1074 | u8 iv_tab[16+AESNI_ALIGN]; | ||
1075 | u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); | ||
1076 | struct scatter_walk src_sg_walk; | ||
1077 | struct scatter_walk assoc_sg_walk; | ||
1078 | struct scatter_walk dst_sg_walk; | ||
1079 | unsigned int i; | ||
1080 | |||
1081 | /* Assuming we are supporting rfc4106 64-bit extended */ | ||
1082 | /* sequence numbers, we need to have the AAD length equal */ | ||
1083 | /* to 8 or 12 bytes. */ | ||
1084 | if (unlikely(req->assoclen != 8 && req->assoclen != 12)) | ||
1085 | return -EINVAL; | ||
1086 | /* Build the IV below. */ | ||
1087 | for (i = 0; i < 4; i++) | ||
1088 | *(iv+i) = ctx->nonce[i]; | ||
1089 | for (i = 0; i < 8; i++) | ||
1090 | *(iv+4+i) = req->iv[i]; | ||
1091 | *((__be32 *)(iv+12)) = counter; | ||
1092 | |||
1093 | if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { | ||
1094 | one_entry_in_sg = 1; | ||
1095 | scatterwalk_start(&src_sg_walk, req->src); | ||
1096 | scatterwalk_start(&assoc_sg_walk, req->assoc); | ||
1097 | src = scatterwalk_map(&src_sg_walk, 0); | ||
1098 | assoc = scatterwalk_map(&assoc_sg_walk, 0); | ||
1099 | dst = src; | ||
1100 | if (unlikely(req->src != req->dst)) { | ||
1101 | scatterwalk_start(&dst_sg_walk, req->dst); | ||
1102 | dst = scatterwalk_map(&dst_sg_walk, 0); | ||
1103 | } | ||
1104 | |||
1105 | } else { | ||
1106 | /* Allocate memory for src, dst, assoc */ | ||
1107 | src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, | ||
1108 | GFP_ATOMIC); | ||
1109 | if (unlikely(!src)) | ||
1110 | return -ENOMEM; | ||
1111 | assoc = (src + req->cryptlen + auth_tag_len); | ||
1112 | scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); | ||
1113 | scatterwalk_map_and_copy(assoc, req->assoc, 0, | ||
1114 | req->assoclen, 0); | ||
1115 | dst = src; | ||
1116 | } | ||
1117 | |||
1118 | aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, | ||
1119 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst | ||
1120 | + ((unsigned long)req->cryptlen), auth_tag_len); | ||
1121 | |||
1122 | /* The authTag (aka the Integrity Check Value) needs to be written | ||
1123 | * back to the packet. */ | ||
1124 | if (one_entry_in_sg) { | ||
1125 | if (unlikely(req->src != req->dst)) { | ||
1126 | scatterwalk_unmap(dst, 0); | ||
1127 | scatterwalk_done(&dst_sg_walk, 0, 0); | ||
1128 | } | ||
1129 | scatterwalk_unmap(src, 0); | ||
1130 | scatterwalk_unmap(assoc, 0); | ||
1131 | scatterwalk_done(&src_sg_walk, 0, 0); | ||
1132 | scatterwalk_done(&assoc_sg_walk, 0, 0); | ||
1133 | } else { | ||
1134 | scatterwalk_map_and_copy(dst, req->dst, 0, | ||
1135 | req->cryptlen + auth_tag_len, 1); | ||
1136 | kfree(src); | ||
1137 | } | ||
1138 | return 0; | ||
1139 | } | ||
1140 | |||
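The two copy loops near the top of this function assemble the initial counter block as salt || explicit IV || 0x00000001. An equivalent, purely illustrative helper (not in the patch) makes the layout explicit:

/* Hypothetical equivalent of the inline IV construction above. */
static void example_build_ctr_block(u8 *iv, const u8 *salt, const u8 *seq_iv)
{
	memcpy(iv, salt, 4);			/* bytes 0-3:  salt from the key */
	memcpy(iv + 4, seq_iv, 8);		/* bytes 4-11: per-request IV */
	*(__be32 *)(iv + 12) = cpu_to_be32(1);	/* bytes 12-15: block counter */
}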
1141 | static int __driver_rfc4106_decrypt(struct aead_request *req) | ||
1142 | { | ||
1143 | u8 one_entry_in_sg = 0; | ||
1144 | u8 *src, *dst, *assoc; | ||
1145 | unsigned long tempCipherLen = 0; | ||
1146 | __be32 counter = cpu_to_be32(1); | ||
1147 | int retval = 0; | ||
1148 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1149 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1150 | void *aes_ctx = &(ctx->aes_key_expanded); | ||
1151 | unsigned long auth_tag_len = crypto_aead_authsize(tfm); | ||
1152 | u8 iv_and_authTag[32+AESNI_ALIGN]; | ||
1153 | u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); | ||
1154 | u8 *authTag = iv + 16; | ||
1155 | struct scatter_walk src_sg_walk; | ||
1156 | struct scatter_walk assoc_sg_walk; | ||
1157 | struct scatter_walk dst_sg_walk; | ||
1158 | unsigned int i; | ||
1159 | |||
1160 | if (unlikely((req->cryptlen < auth_tag_len) || | ||
1161 | (req->assoclen != 8 && req->assoclen != 12))) | ||
1162 | return -EINVAL; | ||
1163 | /* Assuming we are supporting rfc4106 64-bit extended */ | ||
1164 | /* sequence numbers, we need to have the AAD length */ | ||
1165 | /* equal to 8 or 12 bytes. */ | ||
1166 | |||
1167 | tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); | ||
1168 | /* Build the IV below. */ | ||
1169 | for (i = 0; i < 4; i++) | ||
1170 | *(iv+i) = ctx->nonce[i]; | ||
1171 | for (i = 0; i < 8; i++) | ||
1172 | *(iv+4+i) = req->iv[i]; | ||
1173 | *((__be32 *)(iv+12)) = counter; | ||
1174 | |||
1175 | if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { | ||
1176 | one_entry_in_sg = 1; | ||
1177 | scatterwalk_start(&src_sg_walk, req->src); | ||
1178 | scatterwalk_start(&assoc_sg_walk, req->assoc); | ||
1179 | src = scatterwalk_map(&src_sg_walk, 0); | ||
1180 | assoc = scatterwalk_map(&assoc_sg_walk, 0); | ||
1181 | dst = src; | ||
1182 | if (unlikely(req->src != req->dst)) { | ||
1183 | scatterwalk_start(&dst_sg_walk, req->dst); | ||
1184 | dst = scatterwalk_map(&dst_sg_walk, 0); | ||
1185 | } | ||
1186 | |||
1187 | } else { | ||
1188 | /* Allocate memory for src, dst, assoc */ | ||
1189 | src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); | ||
1190 | if (!src) | ||
1191 | return -ENOMEM; | ||
1192 | assoc = (src + req->cryptlen + auth_tag_len); | ||
1193 | scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); | ||
1194 | scatterwalk_map_and_copy(assoc, req->assoc, 0, | ||
1195 | req->assoclen, 0); | ||
1196 | dst = src; | ||
1197 | } | ||
1198 | |||
1199 | aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, | ||
1200 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, | ||
1201 | authTag, auth_tag_len); | ||
1202 | |||
1203 | /* Compare generated tag with passed in tag. */ | ||
1204 | retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? | ||
1205 | -EBADMSG : 0; | ||
1206 | |||
1207 | if (one_entry_in_sg) { | ||
1208 | if (unlikely(req->src != req->dst)) { | ||
1209 | scatterwalk_unmap(dst, 0); | ||
1210 | scatterwalk_done(&dst_sg_walk, 0, 0); | ||
1211 | } | ||
1212 | scatterwalk_unmap(src, 0); | ||
1213 | scatterwalk_unmap(assoc, 0); | ||
1214 | scatterwalk_done(&src_sg_walk, 0, 0); | ||
1215 | scatterwalk_done(&assoc_sg_walk, 0, 0); | ||
1216 | } else { | ||
1217 | scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1); | ||
1218 | kfree(src); | ||
1219 | } | ||
1220 | return retval; | ||
1221 | } | ||
1222 | |||
1223 | static struct crypto_alg __rfc4106_alg = { | ||
1224 | .cra_name = "__gcm-aes-aesni", | ||
1225 | .cra_driver_name = "__driver-gcm-aes-aesni", | ||
1226 | .cra_priority = 0, | ||
1227 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | ||
1228 | .cra_blocksize = 1, | ||
1229 | .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, | ||
1230 | .cra_alignmask = 0, | ||
1231 | .cra_type = &crypto_aead_type, | ||
1232 | .cra_module = THIS_MODULE, | ||
1233 | .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list), | ||
1234 | .cra_u = { | ||
1235 | .aead = { | ||
1236 | .encrypt = __driver_rfc4106_encrypt, | ||
1237 | .decrypt = __driver_rfc4106_decrypt, | ||
1238 | }, | ||
1239 | }, | ||
1240 | }; | ||
1241 | #endif | ||
1242 | |||
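Once both algorithms are registered, in-kernel users (IPsec ESP being the main one) reach the accelerated GCM by asking the crypto API for "rfc4106(gcm(aes))" by name. A hedged sketch of such a consumer, using only generic AEAD calls:

/* Minimal, illustrative consumer of the algorithm registered above. */
static struct crypto_aead *example_get_rfc4106(const u8 *key_with_salt,
					       unsigned int keylen)
{
	struct crypto_aead *aead;

	aead = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
	if (IS_ERR(aead))
		return aead;

	/* key = AES key + 4 byte salt; 16 byte authentication tag */
	if (crypto_aead_setkey(aead, key_with_salt, keylen) ||
	    crypto_aead_setauthsize(aead, 16)) {
		crypto_free_aead(aead);
		return ERR_PTR(-EINVAL);
	}
	return aead;
}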
733 | static int __init aesni_init(void) | 1243 | static int __init aesni_init(void) |
734 | { | 1244 | { |
735 | int err; | 1245 | int err; |
@@ -738,6 +1248,7 @@ static int __init aesni_init(void) | |||
738 | printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); | 1248 | printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); |
739 | return -ENODEV; | 1249 | return -ENODEV; |
740 | } | 1250 | } |
1251 | |||
741 | if ((err = crypto_register_alg(&aesni_alg))) | 1252 | if ((err = crypto_register_alg(&aesni_alg))) |
742 | goto aes_err; | 1253 | goto aes_err; |
743 | if ((err = crypto_register_alg(&__aesni_alg))) | 1254 | if ((err = crypto_register_alg(&__aesni_alg))) |
@@ -746,18 +1257,24 @@ static int __init aesni_init(void) | |||
746 | goto blk_ecb_err; | 1257 | goto blk_ecb_err; |
747 | if ((err = crypto_register_alg(&blk_cbc_alg))) | 1258 | if ((err = crypto_register_alg(&blk_cbc_alg))) |
748 | goto blk_cbc_err; | 1259 | goto blk_cbc_err; |
749 | if ((err = crypto_register_alg(&blk_ctr_alg))) | ||
750 | goto blk_ctr_err; | ||
751 | if ((err = crypto_register_alg(&ablk_ecb_alg))) | 1260 | if ((err = crypto_register_alg(&ablk_ecb_alg))) |
752 | goto ablk_ecb_err; | 1261 | goto ablk_ecb_err; |
753 | if ((err = crypto_register_alg(&ablk_cbc_alg))) | 1262 | if ((err = crypto_register_alg(&ablk_cbc_alg))) |
754 | goto ablk_cbc_err; | 1263 | goto ablk_cbc_err; |
1264 | #ifdef CONFIG_X86_64 | ||
1265 | if ((err = crypto_register_alg(&blk_ctr_alg))) | ||
1266 | goto blk_ctr_err; | ||
755 | if ((err = crypto_register_alg(&ablk_ctr_alg))) | 1267 | if ((err = crypto_register_alg(&ablk_ctr_alg))) |
756 | goto ablk_ctr_err; | 1268 | goto ablk_ctr_err; |
1269 | if ((err = crypto_register_alg(&__rfc4106_alg))) | ||
1270 | goto __aead_gcm_err; | ||
1271 | if ((err = crypto_register_alg(&rfc4106_alg))) | ||
1272 | goto aead_gcm_err; | ||
757 | #ifdef HAS_CTR | 1273 | #ifdef HAS_CTR |
758 | if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) | 1274 | if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) |
759 | goto ablk_rfc3686_ctr_err; | 1275 | goto ablk_rfc3686_ctr_err; |
760 | #endif | 1276 | #endif |
1277 | #endif | ||
761 | #ifdef HAS_LRW | 1278 | #ifdef HAS_LRW |
762 | if ((err = crypto_register_alg(&ablk_lrw_alg))) | 1279 | if ((err = crypto_register_alg(&ablk_lrw_alg))) |
763 | goto ablk_lrw_err; | 1280 | goto ablk_lrw_err; |
@@ -770,7 +1287,6 @@ static int __init aesni_init(void) | |||
770 | if ((err = crypto_register_alg(&ablk_xts_alg))) | 1287 | if ((err = crypto_register_alg(&ablk_xts_alg))) |
771 | goto ablk_xts_err; | 1288 | goto ablk_xts_err; |
772 | #endif | 1289 | #endif |
773 | |||
774 | return err; | 1290 | return err; |
775 | 1291 | ||
776 | #ifdef HAS_XTS | 1292 | #ifdef HAS_XTS |
@@ -784,18 +1300,24 @@ ablk_pcbc_err: | |||
784 | crypto_unregister_alg(&ablk_lrw_alg); | 1300 | crypto_unregister_alg(&ablk_lrw_alg); |
785 | ablk_lrw_err: | 1301 | ablk_lrw_err: |
786 | #endif | 1302 | #endif |
1303 | #ifdef CONFIG_X86_64 | ||
787 | #ifdef HAS_CTR | 1304 | #ifdef HAS_CTR |
788 | crypto_unregister_alg(&ablk_rfc3686_ctr_alg); | 1305 | crypto_unregister_alg(&ablk_rfc3686_ctr_alg); |
789 | ablk_rfc3686_ctr_err: | 1306 | ablk_rfc3686_ctr_err: |
790 | #endif | 1307 | #endif |
1308 | crypto_unregister_alg(&rfc4106_alg); | ||
1309 | aead_gcm_err: | ||
1310 | crypto_unregister_alg(&__rfc4106_alg); | ||
1311 | __aead_gcm_err: | ||
791 | crypto_unregister_alg(&ablk_ctr_alg); | 1312 | crypto_unregister_alg(&ablk_ctr_alg); |
792 | ablk_ctr_err: | 1313 | ablk_ctr_err: |
1314 | crypto_unregister_alg(&blk_ctr_alg); | ||
1315 | blk_ctr_err: | ||
1316 | #endif | ||
793 | crypto_unregister_alg(&ablk_cbc_alg); | 1317 | crypto_unregister_alg(&ablk_cbc_alg); |
794 | ablk_cbc_err: | 1318 | ablk_cbc_err: |
795 | crypto_unregister_alg(&ablk_ecb_alg); | 1319 | crypto_unregister_alg(&ablk_ecb_alg); |
796 | ablk_ecb_err: | 1320 | ablk_ecb_err: |
797 | crypto_unregister_alg(&blk_ctr_alg); | ||
798 | blk_ctr_err: | ||
799 | crypto_unregister_alg(&blk_cbc_alg); | 1321 | crypto_unregister_alg(&blk_cbc_alg); |
800 | blk_cbc_err: | 1322 | blk_cbc_err: |
801 | crypto_unregister_alg(&blk_ecb_alg); | 1323 | crypto_unregister_alg(&blk_ecb_alg); |
@@ -818,13 +1340,17 @@ static void __exit aesni_exit(void) | |||
818 | #ifdef HAS_LRW | 1340 | #ifdef HAS_LRW |
819 | crypto_unregister_alg(&ablk_lrw_alg); | 1341 | crypto_unregister_alg(&ablk_lrw_alg); |
820 | #endif | 1342 | #endif |
1343 | #ifdef CONFIG_X86_64 | ||
821 | #ifdef HAS_CTR | 1344 | #ifdef HAS_CTR |
822 | crypto_unregister_alg(&ablk_rfc3686_ctr_alg); | 1345 | crypto_unregister_alg(&ablk_rfc3686_ctr_alg); |
823 | #endif | 1346 | #endif |
1347 | crypto_unregister_alg(&rfc4106_alg); | ||
1348 | crypto_unregister_alg(&__rfc4106_alg); | ||
824 | crypto_unregister_alg(&ablk_ctr_alg); | 1349 | crypto_unregister_alg(&ablk_ctr_alg); |
1350 | crypto_unregister_alg(&blk_ctr_alg); | ||
1351 | #endif | ||
825 | crypto_unregister_alg(&ablk_cbc_alg); | 1352 | crypto_unregister_alg(&ablk_cbc_alg); |
826 | crypto_unregister_alg(&ablk_ecb_alg); | 1353 | crypto_unregister_alg(&ablk_ecb_alg); |
827 | crypto_unregister_alg(&blk_ctr_alg); | ||
828 | crypto_unregister_alg(&blk_cbc_alg); | 1354 | crypto_unregister_alg(&blk_cbc_alg); |
829 | crypto_unregister_alg(&blk_ecb_alg); | 1355 | crypto_unregister_alg(&blk_ecb_alg); |
830 | crypto_unregister_alg(&__aesni_alg); | 1356 | crypto_unregister_alg(&__aesni_alg); |
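The reshuffled init/exit paths keep the usual register-in-order, unwind-in-reverse goto cascade, now with the CTR and GCM registrations fenced by CONFIG_X86_64. For readers unfamiliar with the pattern, a condensed sketch with made-up algorithm names:

/* Illustrative only: the goto-cascade unwind pattern used by aesni_init(). */
static int example_register_all(void)
{
	int err;

	if ((err = crypto_register_alg(&alg_a)))	/* hypothetical algs */
		goto out;
	if ((err = crypto_register_alg(&alg_b)))
		goto unreg_a;
	if ((err = crypto_register_alg(&alg_c)))
		goto unreg_b;
	return 0;

unreg_b:
	crypto_unregister_alg(&alg_b);
unreg_a:
	crypto_unregister_alg(&alg_a);
out:
	return err;
}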
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index cbcc8d8ea93a..7a6e68e4f748 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c | |||
@@ -10,6 +10,7 @@ | |||
10 | * by the Free Software Foundation. | 10 | * by the Free Software Foundation. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/err.h> | ||
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
14 | #include <linux/init.h> | 15 | #include <linux/init.h> |
15 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 849813f398e7..5852519b2d0f 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c | |||
@@ -28,7 +28,6 @@ | |||
28 | #include <linux/syscalls.h> | 28 | #include <linux/syscalls.h> |
29 | #include <linux/times.h> | 29 | #include <linux/times.h> |
30 | #include <linux/utsname.h> | 30 | #include <linux/utsname.h> |
31 | #include <linux/smp_lock.h> | ||
32 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
33 | #include <linux/uio.h> | 32 | #include <linux/uio.h> |
34 | #include <linux/poll.h> | 33 | #include <linux/poll.h> |
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 55d106b5e31b..211ca3f7fd16 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h | |||
@@ -185,17 +185,16 @@ struct bootnode; | |||
185 | 185 | ||
186 | #ifdef CONFIG_ACPI_NUMA | 186 | #ifdef CONFIG_ACPI_NUMA |
187 | extern int acpi_numa; | 187 | extern int acpi_numa; |
188 | extern int acpi_get_nodes(struct bootnode *physnodes); | 188 | extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start, |
189 | unsigned long end); | ||
189 | extern int acpi_scan_nodes(unsigned long start, unsigned long end); | 190 | extern int acpi_scan_nodes(unsigned long start, unsigned long end); |
190 | #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) | 191 | #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) |
192 | |||
193 | #ifdef CONFIG_NUMA_EMU | ||
191 | extern void acpi_fake_nodes(const struct bootnode *fake_nodes, | 194 | extern void acpi_fake_nodes(const struct bootnode *fake_nodes, |
192 | int num_nodes); | 195 | int num_nodes); |
193 | #else | ||
194 | static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, | ||
195 | int num_nodes) | ||
196 | { | ||
197 | } | ||
198 | #endif | 196 | #endif |
197 | #endif /* CONFIG_ACPI_NUMA */ | ||
199 | 198 | ||
200 | #define acpi_unlazy_tlb(x) leave_mm(x) | 199 | #define acpi_unlazy_tlb(x) leave_mm(x) |
201 | 200 | ||
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 76561d20ea2f..13009d1af99a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name, | |||
66 | extern void alternatives_smp_module_del(struct module *mod); | 66 | extern void alternatives_smp_module_del(struct module *mod); |
67 | extern void alternatives_smp_switch(int smp); | 67 | extern void alternatives_smp_switch(int smp); |
68 | extern int alternatives_text_reserved(void *start, void *end); | 68 | extern int alternatives_text_reserved(void *start, void *end); |
69 | extern bool skip_smp_alternatives; | ||
69 | #else | 70 | #else |
70 | static inline void alternatives_smp_module_add(struct module *mod, char *name, | 71 | static inline void alternatives_smp_module_add(struct module *mod, char *name, |
71 | void *locks, void *locks_end, | 72 | void *locks, void *locks_end, |
@@ -180,8 +181,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); | |||
180 | * On the local CPU you need to be protected against NMI or MCE handlers seeing an | 181 | * On the local CPU you need to be protected against NMI or MCE handlers seeing an |
181 | * inconsistent instruction while you patch. | 182 | * inconsistent instruction while you patch. |
182 | */ | 183 | */ |
184 | struct text_poke_param { | ||
185 | void *addr; | ||
186 | const void *opcode; | ||
187 | size_t len; | ||
188 | }; | ||
189 | |||
183 | extern void *text_poke(void *addr, const void *opcode, size_t len); | 190 | extern void *text_poke(void *addr, const void *opcode, size_t len); |
184 | extern void *text_poke_smp(void *addr, const void *opcode, size_t len); | 191 | extern void *text_poke_smp(void *addr, const void *opcode, size_t len); |
192 | extern void text_poke_smp_batch(struct text_poke_param *params, int n); | ||
185 | 193 | ||
186 | #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) | 194 | #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) |
187 | #define IDEAL_NOP_SIZE_5 5 | 195 | #define IDEAL_NOP_SIZE_5 5 |
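The new text_poke_smp_batch() lets a caller patch several instruction sites with a single stop_machine() round instead of one per text_poke_smp() call; the kprobes jump-optimization code is the intended user. A hedged usage sketch (addresses and opcodes are placeholder inputs):

/* Illustrative batch patching; addr1/addr2/nop5 are hypothetical inputs. */
static struct text_poke_param example_params[2];

static void example_patch_two_sites(void *addr1, void *addr2,
				    const void *nop5, size_t len)
{
	example_params[0].addr = addr1;
	example_params[0].opcode = nop5;
	example_params[0].len = len;
	example_params[1].addr = addr2;
	example_params[1].opcode = nop5;
	example_params[1].len = len;

	text_poke_smp_batch(example_params, 2);	/* one stop_machine() for both */
}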
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index c8517f81b21e..64dc82ee19f0 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h | |||
@@ -3,36 +3,64 @@ | |||
3 | 3 | ||
4 | #include <linux/pci.h> | 4 | #include <linux/pci.h> |
5 | 5 | ||
6 | extern struct pci_device_id k8_nb_ids[]; | 6 | struct amd_nb_bus_dev_range { |
7 | u8 bus; | ||
8 | u8 dev_base; | ||
9 | u8 dev_limit; | ||
10 | }; | ||
11 | |||
12 | extern struct pci_device_id amd_nb_misc_ids[]; | ||
13 | extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; | ||
7 | struct bootnode; | 14 | struct bootnode; |
8 | 15 | ||
9 | extern int early_is_k8_nb(u32 value); | 16 | extern int early_is_amd_nb(u32 value); |
10 | extern int cache_k8_northbridges(void); | 17 | extern int amd_cache_northbridges(void); |
11 | extern void k8_flush_garts(void); | 18 | extern void amd_flush_garts(void); |
12 | extern int k8_get_nodes(struct bootnode *nodes); | 19 | extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn); |
13 | extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); | 20 | extern int amd_scan_nodes(void); |
14 | extern int k8_scan_nodes(void); | 21 | |
22 | #ifdef CONFIG_NUMA_EMU | ||
23 | extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); | ||
24 | extern void amd_get_nodes(struct bootnode *nodes); | ||
25 | #endif | ||
15 | 26 | ||
16 | struct k8_northbridge_info { | 27 | struct amd_northbridge { |
28 | struct pci_dev *misc; | ||
29 | }; | ||
30 | |||
31 | struct amd_northbridge_info { | ||
17 | u16 num; | 32 | u16 num; |
18 | u8 gart_supported; | 33 | u64 flags; |
19 | struct pci_dev **nb_misc; | 34 | struct amd_northbridge *nb; |
20 | }; | 35 | }; |
21 | extern struct k8_northbridge_info k8_northbridges; | 36 | extern struct amd_northbridge_info amd_northbridges; |
37 | |||
38 | #define AMD_NB_GART 0x1 | ||
39 | #define AMD_NB_L3_INDEX_DISABLE 0x2 | ||
22 | 40 | ||
23 | #ifdef CONFIG_AMD_NB | 41 | #ifdef CONFIG_AMD_NB |
24 | 42 | ||
25 | static inline struct pci_dev *node_to_k8_nb_misc(int node) | 43 | static inline int amd_nb_num(void) |
26 | { | 44 | { |
27 | return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; | 45 | return amd_northbridges.num; |
28 | } | 46 | } |
29 | 47 | ||
30 | #else | 48 | static inline int amd_nb_has_feature(int feature) |
49 | { | ||
50 | return ((amd_northbridges.flags & feature) == feature); | ||
51 | } | ||
31 | 52 | ||
32 | static inline struct pci_dev *node_to_k8_nb_misc(int node) | 53 | static inline struct amd_northbridge *node_to_amd_nb(int node) |
33 | { | 54 | { |
34 | return NULL; | 55 | return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; |
35 | } | 56 | } |
57 | |||
58 | #else | ||
59 | |||
60 | #define amd_nb_num(x) 0 | ||
61 | #define amd_nb_has_feature(x) false | ||
62 | #define node_to_amd_nb(x) NULL | ||
63 | |||
36 | #endif | 64 | #endif |
37 | 65 | ||
38 | 66 | ||
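The renamed helpers give callers a flag-based view of the detected northbridges instead of poking at the old k8_northbridges arrays. A sketch of typical (hypothetical) usage:

/* Illustrative walk over the detected AMD northbridges. */
static void example_walk_northbridges(void)
{
	int i;

	if (!amd_nb_has_feature(AMD_NB_GART))
		return;

	for (i = 0; i < amd_nb_num(); i++) {
		struct amd_northbridge *nb = node_to_amd_nb(i);

		if (nb && nb->misc)
			dev_info(&nb->misc->dev, "GART-capable northbridge\n");
	}
}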
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 286de34b0ed6..5e3969c36d7f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
@@ -141,13 +141,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v) | |||
141 | 141 | ||
142 | static inline u32 native_apic_msr_read(u32 reg) | 142 | static inline u32 native_apic_msr_read(u32 reg) |
143 | { | 143 | { |
144 | u32 low, high; | 144 | u64 msr; |
145 | 145 | ||
146 | if (reg == APIC_DFR) | 146 | if (reg == APIC_DFR) |
147 | return -1; | 147 | return -1; |
148 | 148 | ||
149 | rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); | 149 | rdmsrl(APIC_BASE_MSR + (reg >> 4), msr); |
150 | return low; | 150 | return (u32)msr; |
151 | } | 151 | } |
152 | 152 | ||
153 | static inline void native_x2apic_wait_icr_idle(void) | 153 | static inline void native_x2apic_wait_icr_idle(void) |
@@ -181,12 +181,12 @@ extern void enable_x2apic(void); | |||
181 | extern void x2apic_icr_write(u32 low, u32 id); | 181 | extern void x2apic_icr_write(u32 low, u32 id); |
182 | static inline int x2apic_enabled(void) | 182 | static inline int x2apic_enabled(void) |
183 | { | 183 | { |
184 | int msr, msr2; | 184 | u64 msr; |
185 | 185 | ||
186 | if (!cpu_has_x2apic) | 186 | if (!cpu_has_x2apic) |
187 | return 0; | 187 | return 0; |
188 | 188 | ||
189 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | 189 | rdmsrl(MSR_IA32_APICBASE, msr); |
190 | if (msr & X2APIC_ENABLE) | 190 | if (msr & X2APIC_ENABLE) |
191 | return 1; | 191 | return 1; |
192 | return 0; | 192 | return 0; |
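Both hunks replace the two-u32 rdmsr() form with rdmsrl(), which returns the whole MSR as a single u64 and avoids declaring the halves as signed int. For reference, a trivial sketch of the preferred form (hypothetical helper):

/* Illustrative only: full 64-bit MSR read in a single call. */
static bool example_x2apic_bit_set(void)
{
	u64 msr;

	rdmsrl(MSR_IA32_APICBASE, msr);
	return msr & X2APIC_ENABLE;
}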
@@ -234,16 +234,17 @@ extern void init_bsp_APIC(void); | |||
234 | extern void setup_local_APIC(void); | 234 | extern void setup_local_APIC(void); |
235 | extern void end_local_APIC_setup(void); | 235 | extern void end_local_APIC_setup(void); |
236 | extern void init_apic_mappings(void); | 236 | extern void init_apic_mappings(void); |
237 | void register_lapic_address(unsigned long address); | ||
237 | extern void setup_boot_APIC_clock(void); | 238 | extern void setup_boot_APIC_clock(void); |
238 | extern void setup_secondary_APIC_clock(void); | 239 | extern void setup_secondary_APIC_clock(void); |
239 | extern int APIC_init_uniprocessor(void); | 240 | extern int APIC_init_uniprocessor(void); |
240 | extern void enable_NMI_through_LVT0(void); | 241 | extern void enable_NMI_through_LVT0(void); |
242 | extern int apic_force_enable(void); | ||
241 | 243 | ||
242 | /* | 244 | /* |
243 | * On 32bit this is mach-xxx local | 245 | * On 32bit this is mach-xxx local |
244 | */ | 246 | */ |
245 | #ifdef CONFIG_X86_64 | 247 | #ifdef CONFIG_X86_64 |
246 | extern void early_init_lapic_mapping(void); | ||
247 | extern int apic_is_clustered_box(void); | 248 | extern int apic_is_clustered_box(void); |
248 | #else | 249 | #else |
249 | static inline int apic_is_clustered_box(void) | 250 | static inline int apic_is_clustered_box(void) |
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index a859ca461fb0..47a30ff8e517 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h | |||
@@ -145,6 +145,7 @@ | |||
145 | 145 | ||
146 | #ifdef CONFIG_X86_32 | 146 | #ifdef CONFIG_X86_32 |
147 | # define MAX_IO_APICS 64 | 147 | # define MAX_IO_APICS 64 |
148 | # define MAX_LOCAL_APIC 256 | ||
148 | #else | 149 | #else |
149 | # define MAX_IO_APICS 128 | 150 | # define MAX_IO_APICS 128 |
150 | # define MAX_LOCAL_APIC 32768 | 151 | # define MAX_LOCAL_APIC 32768 |
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 3b62ab56c7a0..5e1a2eef3e7c 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h | |||
@@ -32,11 +32,7 @@ | |||
32 | #define BOOT_HEAP_SIZE 0x400000 | 32 | #define BOOT_HEAP_SIZE 0x400000 |
33 | #else /* !CONFIG_KERNEL_BZIP2 */ | 33 | #else /* !CONFIG_KERNEL_BZIP2 */ |
34 | 34 | ||
35 | #ifdef CONFIG_X86_64 | 35 | #define BOOT_HEAP_SIZE 0x8000 |
36 | #define BOOT_HEAP_SIZE 0x7000 | ||
37 | #else | ||
38 | #define BOOT_HEAP_SIZE 0x4000 | ||
39 | #endif | ||
40 | 36 | ||
41 | #endif /* !CONFIG_KERNEL_BZIP2 */ | 37 | #endif /* !CONFIG_KERNEL_BZIP2 */ |
42 | 38 | ||
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63e35ec9075c..62f084478f7e 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h | |||
@@ -1,48 +1,8 @@ | |||
1 | #ifndef _ASM_X86_CACHEFLUSH_H | 1 | #ifndef _ASM_X86_CACHEFLUSH_H |
2 | #define _ASM_X86_CACHEFLUSH_H | 2 | #define _ASM_X86_CACHEFLUSH_H |
3 | 3 | ||
4 | /* Keep includes the same across arches. */ | ||
5 | #include <linux/mm.h> | ||
6 | |||
7 | /* Caches aren't brain-dead on the intel. */ | 4 | /* Caches aren't brain-dead on the intel. */ |
8 | static inline void flush_cache_all(void) { } | 5 | #include <asm-generic/cacheflush.h> |
9 | static inline void flush_cache_mm(struct mm_struct *mm) { } | ||
10 | static inline void flush_cache_dup_mm(struct mm_struct *mm) { } | ||
11 | static inline void flush_cache_range(struct vm_area_struct *vma, | ||
12 | unsigned long start, unsigned long end) { } | ||
13 | static inline void flush_cache_page(struct vm_area_struct *vma, | ||
14 | unsigned long vmaddr, unsigned long pfn) { } | ||
15 | #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 | ||
16 | static inline void flush_dcache_page(struct page *page) { } | ||
17 | static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } | ||
18 | static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } | ||
19 | static inline void flush_icache_range(unsigned long start, | ||
20 | unsigned long end) { } | ||
21 | static inline void flush_icache_page(struct vm_area_struct *vma, | ||
22 | struct page *page) { } | ||
23 | static inline void flush_icache_user_range(struct vm_area_struct *vma, | ||
24 | struct page *page, | ||
25 | unsigned long addr, | ||
26 | unsigned long len) { } | ||
27 | static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } | ||
28 | static inline void flush_cache_vunmap(unsigned long start, | ||
29 | unsigned long end) { } | ||
30 | |||
31 | static inline void copy_to_user_page(struct vm_area_struct *vma, | ||
32 | struct page *page, unsigned long vaddr, | ||
33 | void *dst, const void *src, | ||
34 | unsigned long len) | ||
35 | { | ||
36 | memcpy(dst, src, len); | ||
37 | } | ||
38 | |||
39 | static inline void copy_from_user_page(struct vm_area_struct *vma, | ||
40 | struct page *page, unsigned long vaddr, | ||
41 | void *dst, const void *src, | ||
42 | unsigned long len) | ||
43 | { | ||
44 | memcpy(dst, src, len); | ||
45 | } | ||
46 | 6 | ||
47 | #ifdef CONFIG_X86_PAT | 7 | #ifdef CONFIG_X86_PAT |
48 | /* | 8 | /* |
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 4fab24de26b1..6e6e7558e702 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h | |||
@@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int); | |||
32 | 32 | ||
33 | DECLARE_PER_CPU(int, cpu_state); | 33 | DECLARE_PER_CPU(int, cpu_state); |
34 | 34 | ||
35 | int __cpuinit mwait_usable(const struct cpuinfo_x86 *); | ||
35 | 36 | ||
36 | #endif /* _ASM_X86_CPU_H */ | 37 | #endif /* _ASM_X86_CPU_H */ |
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b81002f23614..078ad0caefc6 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h | |||
@@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable(void) | |||
94 | 94 | ||
95 | static inline int hw_breakpoint_active(void) | 95 | static inline int hw_breakpoint_active(void) |
96 | { | 96 | { |
97 | return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; | 97 | return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; |
98 | } | 98 | } |
99 | 99 | ||
100 | extern void aout_dump_debugregs(struct user *dump); | 100 | extern void aout_dump_debugregs(struct user *dump); |
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 5be1542fbfaf..e99d55d74df5 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h | |||
@@ -72,6 +72,9 @@ struct e820map { | |||
72 | #define BIOS_BEGIN 0x000a0000 | 72 | #define BIOS_BEGIN 0x000a0000 |
73 | #define BIOS_END 0x00100000 | 73 | #define BIOS_END 0x00100000 |
74 | 74 | ||
75 | #define BIOS_ROM_BASE 0xffe00000 | ||
76 | #define BIOS_ROM_END 0xffffffff | ||
77 | |||
75 | #ifdef __KERNEL__ | 78 | #ifdef __KERNEL__ |
76 | /* see comment in arch/x86/kernel/e820.c */ | 79 | /* see comment in arch/x86/kernel/e820.c */ |
77 | extern struct e820map e820; | 80 | extern struct e820map e820; |
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 139591a933f6..4729b2b63117 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h | |||
@@ -116,11 +116,11 @@ enum fixed_addresses { | |||
116 | #endif | 116 | #endif |
117 | FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ | 117 | FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ |
118 | FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ | 118 | FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ |
119 | __end_of_permanent_fixed_addresses, | ||
120 | |||
121 | #ifdef CONFIG_X86_MRST | 119 | #ifdef CONFIG_X86_MRST |
122 | FIX_LNW_VRTC, | 120 | FIX_LNW_VRTC, |
123 | #endif | 121 | #endif |
122 | __end_of_permanent_fixed_addresses, | ||
123 | |||
124 | /* | 124 | /* |
125 | * 256 temporary boot-time mappings, used by early_ioremap(), | 125 | * 256 temporary boot-time mappings, used by early_ioremap(), |
126 | * before ioremap() is functional. | 126 | * before ioremap() is functional. |
@@ -220,8 +220,8 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* Return a pointer with the offset calculated */ | 222 | /* Return a pointer with the offset calculated */ |
223 | static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx, | 223 | static __always_inline unsigned long |
224 | phys_addr_t phys, pgprot_t flags) | 224 | __set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) |
225 | { | 225 | { |
226 | __set_fixmap(idx, phys, flags); | 226 | __set_fixmap(idx, phys, flags); |
227 | return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); | 227 | return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); |
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h index 49dbfdfa50f9..91d915a65259 100644 --- a/arch/x86/include/asm/gpio.h +++ b/arch/x86/include/asm/gpio.h | |||
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio) | |||
38 | return __gpio_cansleep(gpio); | 38 | return __gpio_cansleep(gpio); |
39 | } | 39 | } |
40 | 40 | ||
41 | /* | ||
42 | * Not implemented, yet. | ||
43 | */ | ||
44 | static inline int gpio_to_irq(unsigned int gpio) | 41 | static inline int gpio_to_irq(unsigned int gpio) |
45 | { | 42 | { |
46 | return -ENOSYS; | 43 | return __gpio_to_irq(gpio); |
47 | } | 44 | } |
48 | 45 | ||
49 | static inline int irq_to_gpio(unsigned int irq) | 46 | static inline int irq_to_gpio(unsigned int irq) |
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index ff2546ce7178..7a15153c675d 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h | |||
@@ -20,6 +20,9 @@ | |||
20 | #ifndef _ASM_X86_HYPERVISOR_H | 20 | #ifndef _ASM_X86_HYPERVISOR_H |
21 | #define _ASM_X86_HYPERVISOR_H | 21 | #define _ASM_X86_HYPERVISOR_H |
22 | 22 | ||
23 | #include <asm/kvm_para.h> | ||
24 | #include <asm/xen/hypervisor.h> | ||
25 | |||
23 | extern void init_hypervisor(struct cpuinfo_x86 *c); | 26 | extern void init_hypervisor(struct cpuinfo_x86 *c); |
24 | extern void init_hypervisor_platform(void); | 27 | extern void init_hypervisor_platform(void); |
25 | 28 | ||
@@ -47,4 +50,13 @@ extern const struct hypervisor_x86 x86_hyper_vmware; | |||
47 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; | 50 | extern const struct hypervisor_x86 x86_hyper_ms_hyperv; |
48 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; | 51 | extern const struct hypervisor_x86 x86_hyper_xen_hvm; |
49 | 52 | ||
53 | static inline bool hypervisor_x2apic_available(void) | ||
54 | { | ||
55 | if (kvm_para_available()) | ||
56 | return true; | ||
57 | if (xen_x2apic_para_available()) | ||
58 | return true; | ||
59 | return false; | ||
60 | } | ||
61 | |||
50 | #endif | 62 | #endif |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 4aa2bb3b242a..ef328901c802 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | |||
93 | int err; | 93 | int err; |
94 | 94 | ||
95 | /* See comment in fxsave() below. */ | 95 | /* See comment in fxsave() below. */ |
96 | #ifdef CONFIG_AS_FXSAVEQ | ||
97 | asm volatile("1: fxrstorq %[fx]\n\t" | ||
98 | "2:\n" | ||
99 | ".section .fixup,\"ax\"\n" | ||
100 | "3: movl $-1,%[err]\n" | ||
101 | " jmp 2b\n" | ||
102 | ".previous\n" | ||
103 | _ASM_EXTABLE(1b, 3b) | ||
104 | : [err] "=r" (err) | ||
105 | : [fx] "m" (*fx), "0" (0)); | ||
106 | #else | ||
96 | asm volatile("1: rex64/fxrstor (%[fx])\n\t" | 107 | asm volatile("1: rex64/fxrstor (%[fx])\n\t" |
97 | "2:\n" | 108 | "2:\n" |
98 | ".section .fixup,\"ax\"\n" | 109 | ".section .fixup,\"ax\"\n" |
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) | |||
102 | _ASM_EXTABLE(1b, 3b) | 113 | _ASM_EXTABLE(1b, 3b) |
103 | : [err] "=r" (err) | 114 | : [err] "=r" (err) |
104 | : [fx] "R" (fx), "m" (*fx), "0" (0)); | 115 | : [fx] "R" (fx), "m" (*fx), "0" (0)); |
116 | #endif | ||
105 | return err; | 117 | return err; |
106 | } | 118 | } |
107 | 119 | ||
@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) | |||
119 | return -EFAULT; | 131 | return -EFAULT; |
120 | 132 | ||
121 | /* See comment in fxsave() below. */ | 133 | /* See comment in fxsave() below. */ |
134 | #ifdef CONFIG_AS_FXSAVEQ | ||
135 | asm volatile("1: fxsaveq %[fx]\n\t" | ||
136 | "2:\n" | ||
137 | ".section .fixup,\"ax\"\n" | ||
138 | "3: movl $-1,%[err]\n" | ||
139 | " jmp 2b\n" | ||
140 | ".previous\n" | ||
141 | _ASM_EXTABLE(1b, 3b) | ||
142 | : [err] "=r" (err), [fx] "=m" (*fx) | ||
143 | : "0" (0)); | ||
144 | #else | ||
122 | asm volatile("1: rex64/fxsave (%[fx])\n\t" | 145 | asm volatile("1: rex64/fxsave (%[fx])\n\t" |
123 | "2:\n" | 146 | "2:\n" |
124 | ".section .fixup,\"ax\"\n" | 147 | ".section .fixup,\"ax\"\n" |
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) | |||
128 | _ASM_EXTABLE(1b, 3b) | 151 | _ASM_EXTABLE(1b, 3b) |
129 | : [err] "=r" (err), "=m" (*fx) | 152 | : [err] "=r" (err), "=m" (*fx) |
130 | : [fx] "R" (fx), "0" (0)); | 153 | : [fx] "R" (fx), "0" (0)); |
154 | #endif | ||
131 | if (unlikely(err) && | 155 | if (unlikely(err) && |
132 | __clear_user(fx, sizeof(struct i387_fxsave_struct))) | 156 | __clear_user(fx, sizeof(struct i387_fxsave_struct))) |
133 | err = -EFAULT; | 157 | err = -EFAULT; |
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a6b28d017c2f..f327d386d6cc 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h | |||
@@ -159,7 +159,7 @@ struct io_apic_irq_attr; | |||
159 | extern int io_apic_set_pci_routing(struct device *dev, int irq, | 159 | extern int io_apic_set_pci_routing(struct device *dev, int irq, |
160 | struct io_apic_irq_attr *irq_attr); | 160 | struct io_apic_irq_attr *irq_attr); |
161 | void setup_IO_APIC_irq_extra(u32 gsi); | 161 | void setup_IO_APIC_irq_extra(u32 gsi); |
162 | extern void ioapic_init_mappings(void); | 162 | extern void ioapic_and_gsi_init(void); |
163 | extern void ioapic_insert_resources(void); | 163 | extern void ioapic_insert_resources(void); |
164 | 164 | ||
165 | extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); | 165 | extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); |
@@ -168,10 +168,10 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); | |||
168 | extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); | 168 | extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); |
169 | extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); | 169 | extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); |
170 | 170 | ||
171 | extern void probe_nr_irqs_gsi(void); | ||
172 | extern int get_nr_irqs_gsi(void); | 171 | extern int get_nr_irqs_gsi(void); |
173 | 172 | ||
174 | extern void setup_ioapic_ids_from_mpc(void); | 173 | extern void setup_ioapic_ids_from_mpc(void); |
174 | extern void setup_ioapic_ids_from_mpc_nocheck(void); | ||
175 | 175 | ||
176 | struct mp_ioapic_gsi{ | 176 | struct mp_ioapic_gsi{ |
177 | u32 gsi_base; | 177 | u32 gsi_base; |
@@ -184,14 +184,15 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi); | |||
184 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); | 184 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); |
185 | extern void __init pre_init_apic_IRQ0(void); | 185 | extern void __init pre_init_apic_IRQ0(void); |
186 | 186 | ||
187 | extern void mp_save_irq(struct mpc_intsrc *m); | ||
188 | |||
187 | #else /* !CONFIG_X86_IO_APIC */ | 189 | #else /* !CONFIG_X86_IO_APIC */ |
188 | 190 | ||
189 | #define io_apic_assign_pci_irqs 0 | 191 | #define io_apic_assign_pci_irqs 0 |
190 | #define setup_ioapic_ids_from_mpc x86_init_noop | 192 | #define setup_ioapic_ids_from_mpc x86_init_noop |
191 | static const int timer_through_8259 = 0; | 193 | static const int timer_through_8259 = 0; |
192 | static inline void ioapic_init_mappings(void) { } | 194 | static inline void ioapic_and_gsi_init(void) { } |
193 | static inline void ioapic_insert_resources(void) { } | 195 | static inline void ioapic_insert_resources(void) { } |
194 | static inline void probe_nr_irqs_gsi(void) { } | ||
195 | #define gsi_top (NR_IRQS_LEGACY) | 196 | #define gsi_top (NR_IRQS_LEGACY) |
196 | static inline int mp_find_ioapic(u32 gsi) { return 0; } | 197 | static inline int mp_find_ioapic(u32 gsi) { return 0; } |
197 | 198 | ||
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 13b0ebaa512f..c704b38c57a2 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h | |||
@@ -10,15 +10,14 @@ | |||
10 | #include <asm/apicdef.h> | 10 | #include <asm/apicdef.h> |
11 | #include <asm/irq_vectors.h> | 11 | #include <asm/irq_vectors.h> |
12 | 12 | ||
13 | /* Even though we don't support this, supply it to appease OF */ | ||
14 | static inline void irq_dispose_mapping(unsigned int virq) { } | ||
15 | |||
13 | static inline int irq_canonicalize(int irq) | 16 | static inline int irq_canonicalize(int irq) |
14 | { | 17 | { |
15 | return ((irq == 2) ? 9 : irq); | 18 | return ((irq == 2) ? 9 : irq); |
16 | } | 19 | } |
17 | 20 | ||
18 | #ifdef CONFIG_X86_LOCAL_APIC | ||
19 | # define ARCH_HAS_NMI_WATCHDOG | ||
20 | #endif | ||
21 | |||
22 | #ifdef CONFIG_X86_32 | 21 | #ifdef CONFIG_X86_32 |
23 | extern void irq_ctx_init(int cpu); | 22 | extern void irq_ctx_init(int cpu); |
24 | #else | 23 | #else |
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index f52d42e80585..574dbc22893a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h | |||
@@ -14,7 +14,7 @@ | |||
14 | do { \ | 14 | do { \ |
15 | asm goto("1:" \ | 15 | asm goto("1:" \ |
16 | JUMP_LABEL_INITIAL_NOP \ | 16 | JUMP_LABEL_INITIAL_NOP \ |
17 | ".pushsection __jump_table, \"a\" \n\t"\ | 17 | ".pushsection __jump_table, \"aw\" \n\t"\ |
18 | _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ | 18 | _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ |
19 | ".popsection \n\t" \ | 19 | ".popsection \n\t" \ |
20 | : : "i" (key) : : label); \ | 20 | : : "i" (key) : : label); \ |
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 5bdfca86581b..ca242d35e873 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h | |||
@@ -18,7 +18,6 @@ enum die_val { | |||
18 | DIE_TRAP, | 18 | DIE_TRAP, |
19 | DIE_GPF, | 19 | DIE_GPF, |
20 | DIE_CALL, | 20 | DIE_CALL, |
21 | DIE_NMI_IPI, | ||
22 | DIE_PAGE_FAULT, | 21 | DIE_PAGE_FAULT, |
23 | DIE_NMIUNKNOWN, | 22 | DIE_NMIUNKNOWN, |
24 | }; | 23 | }; |
@@ -28,7 +27,7 @@ extern void die(const char *, struct pt_regs *,long); | |||
28 | extern int __must_check __die(const char *, struct pt_regs *, long); | 27 | extern int __must_check __die(const char *, struct pt_regs *, long); |
29 | extern void show_registers(struct pt_regs *regs); | 28 | extern void show_registers(struct pt_regs *regs); |
30 | extern void show_trace(struct task_struct *t, struct pt_regs *regs, | 29 | extern void show_trace(struct task_struct *t, struct pt_regs *regs, |
31 | unsigned long *sp, unsigned long bp); | 30 | unsigned long *sp); |
32 | extern void __show_regs(struct pt_regs *regs, int all); | 31 | extern void __show_regs(struct pt_regs *regs, int all); |
33 | extern void show_regs(struct pt_regs *regs); | 32 | extern void show_regs(struct pt_regs *regs); |
34 | extern unsigned long oops_begin(void); | 33 | extern unsigned long oops_begin(void); |
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b36c6b3fe144..8e37deb1eb38 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
@@ -15,6 +15,14 @@ | |||
15 | 15 | ||
16 | struct x86_emulate_ctxt; | 16 | struct x86_emulate_ctxt; |
17 | 17 | ||
18 | struct x86_exception { | ||
19 | u8 vector; | ||
20 | bool error_code_valid; | ||
21 | u16 error_code; | ||
22 | bool nested_page_fault; | ||
23 | u64 address; /* cr2 or nested page fault gpa */ | ||
24 | }; | ||
25 | |||
18 | /* | 26 | /* |
19 | * x86_emulate_ops: | 27 | * x86_emulate_ops: |
20 | * | 28 | * |
@@ -64,7 +72,8 @@ struct x86_emulate_ops { | |||
64 | * @bytes: [IN ] Number of bytes to read from memory. | 72 | * @bytes: [IN ] Number of bytes to read from memory. |
65 | */ | 73 | */ |
66 | int (*read_std)(unsigned long addr, void *val, | 74 | int (*read_std)(unsigned long addr, void *val, |
67 | unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); | 75 | unsigned int bytes, struct kvm_vcpu *vcpu, |
76 | struct x86_exception *fault); | ||
68 | 77 | ||
69 | /* | 78 | /* |
70 | * write_std: Write bytes of standard (non-emulated/special) memory. | 79 | * write_std: Write bytes of standard (non-emulated/special) memory. |
@@ -74,7 +83,8 @@ struct x86_emulate_ops { | |||
74 | * @bytes: [IN ] Number of bytes to write to memory. | 83 | * @bytes: [IN ] Number of bytes to write to memory. |
75 | */ | 84 | */ |
76 | int (*write_std)(unsigned long addr, void *val, | 85 | int (*write_std)(unsigned long addr, void *val, |
77 | unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); | 86 | unsigned int bytes, struct kvm_vcpu *vcpu, |
87 | struct x86_exception *fault); | ||
78 | /* | 88 | /* |
79 | * fetch: Read bytes of standard (non-emulated/special) memory. | 89 | * fetch: Read bytes of standard (non-emulated/special) memory. |
80 | * Used for instruction fetch. | 90 | * Used for instruction fetch. |
@@ -83,7 +93,8 @@ struct x86_emulate_ops { | |||
83 | * @bytes: [IN ] Number of bytes to read from memory. | 93 | * @bytes: [IN ] Number of bytes to read from memory. |
84 | */ | 94 | */ |
85 | int (*fetch)(unsigned long addr, void *val, | 95 | int (*fetch)(unsigned long addr, void *val, |
86 | unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); | 96 | unsigned int bytes, struct kvm_vcpu *vcpu, |
97 | struct x86_exception *fault); | ||
87 | 98 | ||
88 | /* | 99 | /* |
89 | * read_emulated: Read bytes from emulated/special memory area. | 100 | * read_emulated: Read bytes from emulated/special memory area. |
@@ -94,7 +105,7 @@ struct x86_emulate_ops { | |||
94 | int (*read_emulated)(unsigned long addr, | 105 | int (*read_emulated)(unsigned long addr, |
95 | void *val, | 106 | void *val, |
96 | unsigned int bytes, | 107 | unsigned int bytes, |
97 | unsigned int *error, | 108 | struct x86_exception *fault, |
98 | struct kvm_vcpu *vcpu); | 109 | struct kvm_vcpu *vcpu); |
99 | 110 | ||
100 | /* | 111 | /* |
@@ -107,7 +118,7 @@ struct x86_emulate_ops { | |||
107 | int (*write_emulated)(unsigned long addr, | 118 | int (*write_emulated)(unsigned long addr, |
108 | const void *val, | 119 | const void *val, |
109 | unsigned int bytes, | 120 | unsigned int bytes, |
110 | unsigned int *error, | 121 | struct x86_exception *fault, |
111 | struct kvm_vcpu *vcpu); | 122 | struct kvm_vcpu *vcpu); |
112 | 123 | ||
113 | /* | 124 | /* |
@@ -122,7 +133,7 @@ struct x86_emulate_ops { | |||
122 | const void *old, | 133 | const void *old, |
123 | const void *new, | 134 | const void *new, |
124 | unsigned int bytes, | 135 | unsigned int bytes, |
125 | unsigned int *error, | 136 | struct x86_exception *fault, |
126 | struct kvm_vcpu *vcpu); | 137 | struct kvm_vcpu *vcpu); |
127 | 138 | ||
128 | int (*pio_in_emulated)(int size, unsigned short port, void *val, | 139 | int (*pio_in_emulated)(int size, unsigned short port, void *val, |
@@ -159,7 +170,10 @@ struct operand { | |||
159 | }; | 170 | }; |
160 | union { | 171 | union { |
161 | unsigned long *reg; | 172 | unsigned long *reg; |
162 | unsigned long mem; | 173 | struct segmented_address { |
174 | ulong ea; | ||
175 | unsigned seg; | ||
176 | } mem; | ||
163 | } addr; | 177 | } addr; |
164 | union { | 178 | union { |
165 | unsigned long val; | 179 | unsigned long val; |
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt { | |||
226 | 240 | ||
227 | bool perm_ok; /* do not check permissions if true */ | 241 | bool perm_ok; /* do not check permissions if true */ |
228 | 242 | ||
229 | int exception; /* exception that happens during emulation or -1 */ | 243 | bool have_exception; |
230 | u32 error_code; /* error code for exception */ | 244 | struct x86_exception exception; |
231 | bool error_code_valid; | ||
232 | 245 | ||
233 | /* decode cache */ | 246 | /* decode cache */ |
234 | struct decode_cache decode; | 247 | struct decode_cache decode; |
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt { | |||
252 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | 265 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 |
253 | #endif | 266 | #endif |
254 | 267 | ||
255 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt); | 268 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); |
256 | #define EMULATION_FAILED -1 | 269 | #define EMULATION_FAILED -1 |
257 | #define EMULATION_OK 0 | 270 | #define EMULATION_OK 0 |
258 | #define EMULATION_RESTART 1 | 271 | #define EMULATION_RESTART 1 |
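The new struct x86_exception gathers the vector, error code and faulting address that the emulator previously tracked in loose fields, so page-table walkers and memory helpers can hand a fault descriptor back uniformly. A hedged sketch of how a page fault might be described with it (values are illustrative):

/* Illustrative only: describing a #PF with the new exception struct. */
static void example_report_page_fault(struct x86_exception *fault,
				      u64 gpa, u16 error_code, bool nested)
{
	fault->vector = 14;		/* PF_VECTOR */
	fault->error_code_valid = true;
	fault->error_code = error_code;
	fault->nested_page_fault = nested;
	fault->address = gpa;		/* cr2 or nested page fault gpa */
}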
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e6fe391094e..ffd7f8d29187 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -79,15 +79,18 @@ | |||
79 | #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) | 79 | #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) |
80 | #define KVM_MIN_FREE_MMU_PAGES 5 | 80 | #define KVM_MIN_FREE_MMU_PAGES 5 |
81 | #define KVM_REFILL_PAGES 25 | 81 | #define KVM_REFILL_PAGES 25 |
82 | #define KVM_MAX_CPUID_ENTRIES 40 | 82 | #define KVM_MAX_CPUID_ENTRIES 80 |
83 | #define KVM_NR_FIXED_MTRR_REGION 88 | 83 | #define KVM_NR_FIXED_MTRR_REGION 88 |
84 | #define KVM_NR_VAR_MTRR 8 | 84 | #define KVM_NR_VAR_MTRR 8 |
85 | 85 | ||
86 | #define ASYNC_PF_PER_VCPU 64 | ||
87 | |||
86 | extern spinlock_t kvm_lock; | 88 | extern spinlock_t kvm_lock; |
87 | extern struct list_head vm_list; | 89 | extern struct list_head vm_list; |
88 | 90 | ||
89 | struct kvm_vcpu; | 91 | struct kvm_vcpu; |
90 | struct kvm; | 92 | struct kvm; |
93 | struct kvm_async_pf; | ||
91 | 94 | ||
92 | enum kvm_reg { | 95 | enum kvm_reg { |
93 | VCPU_REGS_RAX = 0, | 96 | VCPU_REGS_RAX = 0, |
@@ -114,6 +117,7 @@ enum kvm_reg { | |||
114 | 117 | ||
115 | enum kvm_reg_ex { | 118 | enum kvm_reg_ex { |
116 | VCPU_EXREG_PDPTR = NR_VCPU_REGS, | 119 | VCPU_EXREG_PDPTR = NR_VCPU_REGS, |
120 | VCPU_EXREG_CR3, | ||
117 | }; | 121 | }; |
118 | 122 | ||
119 | enum { | 123 | enum { |
@@ -238,16 +242,18 @@ struct kvm_mmu { | |||
238 | void (*new_cr3)(struct kvm_vcpu *vcpu); | 242 | void (*new_cr3)(struct kvm_vcpu *vcpu); |
239 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); | 243 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); |
240 | unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); | 244 | unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); |
241 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | 245 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, |
242 | void (*inject_page_fault)(struct kvm_vcpu *vcpu); | 246 | bool prefault); |
247 | void (*inject_page_fault)(struct kvm_vcpu *vcpu, | ||
248 | struct x86_exception *fault); | ||
243 | void (*free)(struct kvm_vcpu *vcpu); | 249 | void (*free)(struct kvm_vcpu *vcpu); |
244 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, | 250 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, |
245 | u32 *error); | 251 | struct x86_exception *exception); |
246 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); | 252 | gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); |
247 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | 253 | void (*prefetch_page)(struct kvm_vcpu *vcpu, |
248 | struct kvm_mmu_page *page); | 254 | struct kvm_mmu_page *page); |
249 | int (*sync_page)(struct kvm_vcpu *vcpu, | 255 | int (*sync_page)(struct kvm_vcpu *vcpu, |
250 | struct kvm_mmu_page *sp, bool clear_unsync); | 256 | struct kvm_mmu_page *sp); |
251 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); | 257 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); |
252 | hpa_t root_hpa; | 258 | hpa_t root_hpa; |
253 | int root_level; | 259 | int root_level; |
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch { | |||
315 | */ | 321 | */ |
316 | struct kvm_mmu *walk_mmu; | 322 | struct kvm_mmu *walk_mmu; |
317 | 323 | ||
318 | /* | ||
319 | * This struct is filled with the necessary information to propagate a | ||
320 | * page fault into the guest | ||
321 | */ | ||
322 | struct { | ||
323 | u64 address; | ||
324 | unsigned error_code; | ||
325 | bool nested; | ||
326 | } fault; | ||
327 | |||
328 | /* only needed in kvm_pv_mmu_op() path, but it's hot so | 324 | /* only needed in kvm_pv_mmu_op() path, but it's hot so |
329 | * put it here to avoid allocation */ | 325 | * put it here to avoid allocation */ |
330 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; | 326 | struct kvm_pv_mmu_op_buffer mmu_op_buffer; |
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch { | |||
412 | u64 hv_vapic; | 408 | u64 hv_vapic; |
413 | 409 | ||
414 | cpumask_var_t wbinvd_dirty_mask; | 410 | cpumask_var_t wbinvd_dirty_mask; |
411 | |||
412 | struct { | ||
413 | bool halted; | ||
414 | gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; | ||
415 | struct gfn_to_hva_cache data; | ||
416 | u64 msr_val; | ||
417 | u32 id; | ||
418 | bool send_user_only; | ||
419 | } apf; | ||
415 | }; | 420 | }; |
416 | 421 | ||
417 | struct kvm_arch { | 422 | struct kvm_arch { |
@@ -456,6 +461,10 @@ struct kvm_arch { | |||
456 | /* fields used by HYPER-V emulation */ | 461 | /* fields used by HYPER-V emulation */ |
457 | u64 hv_guest_os_id; | 462 | u64 hv_guest_os_id; |
458 | u64 hv_hypercall; | 463 | u64 hv_hypercall; |
464 | |||
465 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
466 | int audit_point; | ||
467 | #endif | ||
459 | }; | 468 | }; |
460 | 469 | ||
461 | struct kvm_vm_stat { | 470 | struct kvm_vm_stat { |
@@ -529,6 +538,7 @@ struct kvm_x86_ops { | |||
529 | struct kvm_segment *var, int seg); | 538 | struct kvm_segment *var, int seg); |
530 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); | 539 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); |
531 | void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); | 540 | void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); |
541 | void (*decache_cr3)(struct kvm_vcpu *vcpu); | ||
532 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); | 542 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); |
533 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); | 543 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); |
534 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | 544 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); |
@@ -582,9 +592,17 @@ struct kvm_x86_ops { | |||
582 | 592 | ||
583 | void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); | 593 | void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); |
584 | 594 | ||
595 | void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); | ||
585 | const struct trace_print_flags *exit_reasons_str; | 596 | const struct trace_print_flags *exit_reasons_str; |
586 | }; | 597 | }; |
587 | 598 | ||
599 | struct kvm_arch_async_pf { | ||
600 | u32 token; | ||
601 | gfn_t gfn; | ||
602 | unsigned long cr3; | ||
603 | bool direct_map; | ||
604 | }; | ||
605 | |||
588 | extern struct kvm_x86_ops *kvm_x86_ops; | 606 | extern struct kvm_x86_ops *kvm_x86_ops; |
589 | 607 | ||
590 | int kvm_mmu_module_init(void); | 608 | int kvm_mmu_module_init(void); |
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | |||
594 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | 612 | int kvm_mmu_create(struct kvm_vcpu *vcpu); |
595 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | 613 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); |
596 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | 614 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); |
597 | void kvm_mmu_set_base_ptes(u64 base_pte); | ||
598 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 615 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
599 | u64 dirty_mask, u64 nx_mask, u64 x_mask); | 616 | u64 dirty_mask, u64 nx_mask, u64 x_mask); |
600 | 617 | ||
@@ -623,8 +640,15 @@ enum emulation_result { | |||
623 | #define EMULTYPE_NO_DECODE (1 << 0) | 640 | #define EMULTYPE_NO_DECODE (1 << 0) |
624 | #define EMULTYPE_TRAP_UD (1 << 1) | 641 | #define EMULTYPE_TRAP_UD (1 << 1) |
625 | #define EMULTYPE_SKIP (1 << 2) | 642 | #define EMULTYPE_SKIP (1 << 2) |
626 | int emulate_instruction(struct kvm_vcpu *vcpu, | 643 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, |
627 | unsigned long cr2, u16 error_code, int emulation_type); | 644 | int emulation_type, void *insn, int insn_len); |
645 | |||
646 | static inline int emulate_instruction(struct kvm_vcpu *vcpu, | ||
647 | int emulation_type) | ||
648 | { | ||
649 | return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); | ||
650 | } | ||
651 | |||
628 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 652 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
629 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 653 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
630 | 654 | ||
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, | |||
650 | int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | 674 | int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); |
651 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | 675 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); |
652 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 676 | int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
653 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); | 677 | int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); |
654 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); | 678 | int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); |
655 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); | 679 | int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); |
656 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); | 680 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); |
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | |||
668 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 692 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
669 | void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); | 693 | void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); |
670 | void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 694 | void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
671 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu); | 695 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); |
672 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | 696 | int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
673 | gfn_t gfn, void *data, int offset, int len, | 697 | gfn_t gfn, void *data, int offset, int len, |
674 | u32 access); | 698 | u32 access); |
675 | void kvm_propagate_fault(struct kvm_vcpu *vcpu); | 699 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); |
676 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | 700 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); |
677 | 701 | ||
678 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 702 | int kvm_pic_set_irq(void *opaque, int irq, int level); |
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | |||
690 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | 714 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
691 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | 715 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); |
692 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); | 716 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); |
693 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 717 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, |
694 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 718 | struct x86_exception *exception); |
695 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 719 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, |
696 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); | 720 | struct x86_exception *exception); |
721 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, | ||
722 | struct x86_exception *exception); | ||
723 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, | ||
724 | struct x86_exception *exception); | ||
697 | 725 | ||
698 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); | 726 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); |
699 | 727 | ||
700 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); | 728 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); |
701 | 729 | ||
702 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); | 730 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, |
731 | void *insn, int insn_len); | ||
703 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); | 732 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); |
704 | 733 | ||
705 | void kvm_enable_tdp(void); | 734 | void kvm_enable_tdp(void); |
@@ -766,20 +795,25 @@ enum { | |||
766 | #define HF_VINTR_MASK (1 << 2) | 795 | #define HF_VINTR_MASK (1 << 2) |
767 | #define HF_NMI_MASK (1 << 3) | 796 | #define HF_NMI_MASK (1 << 3) |
768 | #define HF_IRET_MASK (1 << 4) | 797 | #define HF_IRET_MASK (1 << 4) |
798 | #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ | ||
769 | 799 | ||
770 | /* | 800 | /* |
771 | * Hardware virtualization extension instructions may fault if a | 801 | * Hardware virtualization extension instructions may fault if a |
772 | * reboot turns off virtualization while processes are running. | 802 | * reboot turns off virtualization while processes are running. |
773 | * Trap the fault and ignore the instruction if that happens. | 803 | * Trap the fault and ignore the instruction if that happens. |
774 | */ | 804 | */ |
775 | asmlinkage void kvm_handle_fault_on_reboot(void); | 805 | asmlinkage void kvm_spurious_fault(void); |
806 | extern bool kvm_rebooting; | ||
776 | 807 | ||
777 | #define __kvm_handle_fault_on_reboot(insn) \ | 808 | #define __kvm_handle_fault_on_reboot(insn) \ |
778 | "666: " insn "\n\t" \ | 809 | "666: " insn "\n\t" \ |
810 | "668: \n\t" \ | ||
779 | ".pushsection .fixup, \"ax\" \n" \ | 811 | ".pushsection .fixup, \"ax\" \n" \ |
780 | "667: \n\t" \ | 812 | "667: \n\t" \ |
813 | "cmpb $0, kvm_rebooting \n\t" \ | ||
814 | "jne 668b \n\t" \ | ||
781 | __ASM_SIZE(push) " $666b \n\t" \ | 815 | __ASM_SIZE(push) " $666b \n\t" \ |
782 | "jmp kvm_handle_fault_on_reboot \n\t" \ | 816 | "call kvm_spurious_fault \n\t" \ |
783 | ".popsection \n\t" \ | 817 | ".popsection \n\t" \ |
784 | ".pushsection __ex_table, \"a\" \n\t" \ | 818 | ".pushsection __ex_table, \"a\" \n\t" \ |
785 | _ASM_PTR " 666b, 667b \n\t" \ | 819 | _ASM_PTR " 666b, 667b \n\t" \ |
@@ -788,6 +822,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void); | |||
788 | #define KVM_ARCH_WANT_MMU_NOTIFIER | 822 | #define KVM_ARCH_WANT_MMU_NOTIFIER |
789 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | 823 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); |
790 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | 824 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); |
825 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); | ||
791 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); | 826 | void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); |
792 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); | 827 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); |
793 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); | 828 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); |
@@ -799,4 +834,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); | |||
799 | 834 | ||
800 | bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); | 835 | bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); |
801 | 836 | ||
837 | void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, | ||
838 | struct kvm_async_pf *work); | ||
839 | void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, | ||
840 | struct kvm_async_pf *work); | ||
841 | void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, | ||
842 | struct kvm_async_pf *work); | ||
843 | bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); | ||
844 | extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); | ||
845 | |||
846 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); | ||
847 | |||
802 | #endif /* _ASM_X86_KVM_HOST_H */ | 848 | #endif /* _ASM_X86_KVM_HOST_H */ |
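The kvm_host.h hunk above reworks the reboot fixup: the fault handler now checks kvm_rebooting and, if set, resumes after the faulting instruction (label 668) instead of unconditionally invoking the old handler; otherwise it calls kvm_spurious_fault(). A minimal sketch of how a VMX backend might wrap a hardware-virtualization instruction with this macro (the __ex() wrapper name is an assumption for illustration, not taken from this patch):

#define __ex(insn) __kvm_handle_fault_on_reboot(insn)

static inline void example_vmxoff(void)
{
	/* If this faults after kvm_rebooting is set, the fixup skips the
	 * instruction; otherwise kvm_spurious_fault() is called. */
	asm volatile(__ex("vmxoff") : : : "cc");
}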
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 7b562b6184bc..a427bf77a93d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -20,6 +20,7 @@ | |||
20 | * are available. The use of 0x11 and 0x12 is deprecated | 20 | * are available. The use of 0x11 and 0x12 is deprecated |
21 | */ | 21 | */ |
22 | #define KVM_FEATURE_CLOCKSOURCE2 3 | 22 | #define KVM_FEATURE_CLOCKSOURCE2 3 |
23 | #define KVM_FEATURE_ASYNC_PF 4 | ||
23 | 24 | ||
24 | /* The last 8 bits are used to indicate how to interpret the flags field | 25 | /* The last 8 bits are used to indicate how to interpret the flags field |
25 | * in pvclock structure. If no bits are set, all flags are ignored. | 26 | * in pvclock structure. If no bits are set, all flags are ignored. |
@@ -32,9 +33,13 @@ | |||
32 | /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ | 33 | /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ |
33 | #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 | 34 | #define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 |
34 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 | 35 | #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 |
36 | #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 | ||
35 | 37 | ||
36 | #define KVM_MAX_MMU_OP_BATCH 32 | 38 | #define KVM_MAX_MMU_OP_BATCH 32 |
37 | 39 | ||
40 | #define KVM_ASYNC_PF_ENABLED (1 << 0) | ||
41 | #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) | ||
42 | |||
38 | /* Operations for KVM_HC_MMU_OP */ | 43 | /* Operations for KVM_HC_MMU_OP */ |
39 | #define KVM_MMU_OP_WRITE_PTE 1 | 44 | #define KVM_MMU_OP_WRITE_PTE 1 |
40 | #define KVM_MMU_OP_FLUSH_TLB 2 | 45 | #define KVM_MMU_OP_FLUSH_TLB 2 |
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt { | |||
61 | __u64 pt_phys; | 66 | __u64 pt_phys; |
62 | }; | 67 | }; |
63 | 68 | ||
69 | #define KVM_PV_REASON_PAGE_NOT_PRESENT 1 | ||
70 | #define KVM_PV_REASON_PAGE_READY 2 | ||
71 | |||
72 | struct kvm_vcpu_pv_apf_data { | ||
73 | __u32 reason; | ||
74 | __u8 pad[60]; | ||
75 | __u32 enabled; | ||
76 | }; | ||
77 | |||
64 | #ifdef __KERNEL__ | 78 | #ifdef __KERNEL__ |
65 | #include <asm/processor.h> | 79 | #include <asm/processor.h> |
66 | 80 | ||
67 | extern void kvmclock_init(void); | 81 | extern void kvmclock_init(void); |
82 | extern int kvm_register_clock(char *txt); | ||
68 | 83 | ||
69 | 84 | ||
70 | /* This instruction is vmcall. On non-VT architectures, it will generate a | 85 | /* This instruction is vmcall. On non-VT architectures, it will generate a |
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void) | |||
160 | 175 | ||
161 | #ifdef CONFIG_KVM_GUEST | 176 | #ifdef CONFIG_KVM_GUEST |
162 | void __init kvm_guest_init(void); | 177 | void __init kvm_guest_init(void); |
178 | void kvm_async_pf_task_wait(u32 token); | ||
179 | void kvm_async_pf_task_wake(u32 token); | ||
180 | u32 kvm_read_and_reset_pf_reason(void); | ||
163 | #else | 181 | #else |
164 | #define kvm_guest_init() do { } while (0) | 182 | #define kvm_guest_init() do { } while (0) |
183 | #define kvm_async_pf_task_wait(T) do {} while(0) | ||
184 | #define kvm_async_pf_task_wake(T) do {} while(0) | ||
185 | static inline u32 kvm_read_and_reset_pf_reason(void) | ||
186 | { | ||
187 | return 0; | ||
188 | } | ||
165 | #endif | 189 | #endif |
166 | 190 | ||
167 | #endif /* __KERNEL__ */ | 191 | #endif /* __KERNEL__ */ |
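The kvm_para.h additions define the guest side of asynchronous page faults: a per-vCPU data area (struct kvm_vcpu_pv_apf_data) whose physical address is written to MSR_KVM_ASYNC_PF_EN together with the enable flags. A minimal sketch, assuming the usual kvm_para_has_feature()/wrmsrl() helpers; the variable and function names are illustrative only:

#include <asm/kvm_para.h>
#include <asm/msr.h>

/* Hypothetical data area; real guest code keeps one per CPU. */
static struct kvm_vcpu_pv_apf_data apf_reason __aligned(64);

static void example_enable_async_pf(void)
{
	u64 pa = __pa(&apf_reason);

	if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		return;

	/* Tell the host where to post PAGE_NOT_PRESENT/PAGE_READY
	 * reasons, and switch the feature on. */
	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
}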
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index f7920601e472..72a8b52e7dfd 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h | |||
@@ -7,9 +7,19 @@ | |||
7 | 7 | ||
8 | #include <asm/mc146818rtc.h> | 8 | #include <asm/mc146818rtc.h> |
9 | 9 | ||
10 | #define NMI_REASON_PORT 0x61 | ||
11 | |||
12 | #define NMI_REASON_SERR 0x80 | ||
13 | #define NMI_REASON_IOCHK 0x40 | ||
14 | #define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK) | ||
15 | |||
16 | #define NMI_REASON_CLEAR_SERR 0x04 | ||
17 | #define NMI_REASON_CLEAR_IOCHK 0x08 | ||
18 | #define NMI_REASON_CLEAR_MASK 0x0f | ||
19 | |||
10 | static inline unsigned char get_nmi_reason(void) | 20 | static inline unsigned char get_nmi_reason(void) |
11 | { | 21 | { |
12 | return inb(0x61); | 22 | return inb(NMI_REASON_PORT); |
13 | } | 23 | } |
14 | 24 | ||
15 | static inline void reassert_nmi(void) | 25 | static inline void reassert_nmi(void) |
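The mach_traps.h hunk gives the legacy NMI status port (0x61) and its status/clear bits symbolic names. A minimal sketch of decoding the reason byte with these constants (the handler and messages are illustrative only):

static void example_handle_nmi_reason(void)
{
	unsigned char reason = get_nmi_reason();

	if (!(reason & NMI_REASON_MASK))
		return;				/* not a SERR/IOCHK NMI */
	if (reason & NMI_REASON_SERR)
		pr_emerg("NMI: PCI system error (SERR)\n");
	if (reason & NMI_REASON_IOCHK)
		pr_emerg("NMI: I/O check error\n");
}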
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c62c13cb9788..eb16e94ae04f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c); | |||
223 | 223 | ||
224 | void mce_log_therm_throt_event(__u64 status); | 224 | void mce_log_therm_throt_event(__u64 status); |
225 | 225 | ||
226 | /* Interrupt Handler for core thermal thresholds */ | ||
227 | extern int (*platform_thermal_notify)(__u64 msr_val); | ||
228 | |||
226 | #ifdef CONFIG_X86_THERMAL_VECTOR | 229 | #ifdef CONFIG_X86_THERMAL_VECTOR |
227 | extern void mcheck_intel_therm_init(void); | 230 | extern void mcheck_intel_therm_init(void); |
228 | #else | 231 | #else |
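The new platform_thermal_notify hook lets a platform driver receive core thermal-threshold interrupts. A sketch of how such a driver might hook it (the callback body is illustrative; msr_val is assumed to carry IA32_THERM_STATUS at interrupt time, per the declaration above):

static int example_thermal_notify(__u64 msr_val)
{
	/* inspect the threshold status/log bits in msr_val here */
	return 0;
}

/* in driver init: platform_thermal_notify = example_thermal_notify; */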
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ef51b501e22a..24215072d0e1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h | |||
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void) | |||
48 | 48 | ||
49 | #ifdef CONFIG_MICROCODE_AMD | 49 | #ifdef CONFIG_MICROCODE_AMD |
50 | extern struct microcode_ops * __init init_amd_microcode(void); | 50 | extern struct microcode_ops * __init init_amd_microcode(void); |
51 | |||
52 | static inline void get_ucode_data(void *to, const u8 *from, size_t n) | ||
53 | { | ||
54 | memcpy(to, from, n); | ||
55 | } | ||
56 | |||
51 | #else | 57 | #else |
52 | static inline struct microcode_ops * __init init_amd_microcode(void) | 58 | static inline struct microcode_ops * __init init_amd_microcode(void) |
53 | { | 59 | { |
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index c82868e9f905..0c90dd9f0505 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h | |||
@@ -5,8 +5,9 @@ | |||
5 | 5 | ||
6 | #include <asm/mpspec_def.h> | 6 | #include <asm/mpspec_def.h> |
7 | #include <asm/x86_init.h> | 7 | #include <asm/x86_init.h> |
8 | #include <asm/apicdef.h> | ||
8 | 9 | ||
9 | extern int apic_version[MAX_APICS]; | 10 | extern int apic_version[]; |
10 | extern int pic_mode; | 11 | extern int pic_mode; |
11 | 12 | ||
12 | #ifdef CONFIG_X86_32 | 13 | #ifdef CONFIG_X86_32 |
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, | |||
107 | int active_high_low); | 108 | int active_high_low); |
108 | #endif /* CONFIG_ACPI */ | 109 | #endif /* CONFIG_ACPI */ |
109 | 110 | ||
110 | #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) | 111 | #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) |
111 | 112 | ||
112 | struct physid_mask { | 113 | struct physid_mask { |
113 | unsigned long mask[PHYSID_ARRAY_SIZE]; | 114 | unsigned long mask[PHYSID_ARRAY_SIZE]; |
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t; | |||
122 | test_and_set_bit(physid, (map).mask) | 123 | test_and_set_bit(physid, (map).mask) |
123 | 124 | ||
124 | #define physids_and(dst, src1, src2) \ | 125 | #define physids_and(dst, src1, src2) \ |
125 | bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) | 126 | bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) |
126 | 127 | ||
127 | #define physids_or(dst, src1, src2) \ | 128 | #define physids_or(dst, src1, src2) \ |
128 | bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) | 129 | bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) |
129 | 130 | ||
130 | #define physids_clear(map) \ | 131 | #define physids_clear(map) \ |
131 | bitmap_zero((map).mask, MAX_APICS) | 132 | bitmap_zero((map).mask, MAX_LOCAL_APIC) |
132 | 133 | ||
133 | #define physids_complement(dst, src) \ | 134 | #define physids_complement(dst, src) \ |
134 | bitmap_complement((dst).mask, (src).mask, MAX_APICS) | 135 | bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) |
135 | 136 | ||
136 | #define physids_empty(map) \ | 137 | #define physids_empty(map) \ |
137 | bitmap_empty((map).mask, MAX_APICS) | 138 | bitmap_empty((map).mask, MAX_LOCAL_APIC) |
138 | 139 | ||
139 | #define physids_equal(map1, map2) \ | 140 | #define physids_equal(map1, map2) \ |
140 | bitmap_equal((map1).mask, (map2).mask, MAX_APICS) | 141 | bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) |
141 | 142 | ||
142 | #define physids_weight(map) \ | 143 | #define physids_weight(map) \ |
143 | bitmap_weight((map).mask, MAX_APICS) | 144 | bitmap_weight((map).mask, MAX_LOCAL_APIC) |
144 | 145 | ||
145 | #define physids_shift_right(d, s, n) \ | 146 | #define physids_shift_right(d, s, n) \ |
146 | bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) | 147 | bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) |
147 | 148 | ||
148 | #define physids_shift_left(d, s, n) \ | 149 | #define physids_shift_left(d, s, n) \ |
149 | bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) | 150 | bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) |
150 | 151 | ||
151 | static inline unsigned long physids_coerce(physid_mask_t *map) | 152 | static inline unsigned long physids_coerce(physid_mask_t *map) |
152 | { | 153 | { |
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map) | |||
159 | map->mask[0] = physids; | 160 | map->mask[0] = physids; |
160 | } | 161 | } |
161 | 162 | ||
162 | /* Note: will create very large stack frames if physid_mask_t is big */ | ||
163 | #define physid_mask_of_physid(physid) \ | ||
164 | ({ \ | ||
165 | physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ | ||
166 | physid_set(physid, __physid_mask); \ | ||
167 | __physid_mask; \ | ||
168 | }) | ||
169 | |||
170 | static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) | 163 | static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) |
171 | { | 164 | { |
172 | physids_clear(*map); | 165 | physids_clear(*map); |
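With physid_mask_of_physid() removed, callers build the mask in place via physid_set_mask_of_physid() instead of returning a potentially huge physid_mask_t by value. A minimal sketch of the replacement pattern:

static void example_one_apic_mask(int apicid)
{
	physid_mask_t mask;

	/* was: mask = physid_mask_of_physid(apicid); */
	physid_set_mask_of_physid(apicid, &mask);
	/* ... use 'mask' ... */
}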
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index 4a7f96d7c188..c0a955a9a087 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h | |||
@@ -15,13 +15,6 @@ | |||
15 | 15 | ||
16 | #ifdef CONFIG_X86_32 | 16 | #ifdef CONFIG_X86_32 |
17 | # define MAX_MPC_ENTRY 1024 | 17 | # define MAX_MPC_ENTRY 1024 |
18 | # define MAX_APICS 256 | ||
19 | #else | ||
20 | # if NR_CPUS <= 255 | ||
21 | # define MAX_APICS 255 | ||
22 | # else | ||
23 | # define MAX_APICS 32768 | ||
24 | # endif | ||
25 | #endif | 18 | #endif |
26 | 19 | ||
27 | /* Intel MP Floating Pointer Structure */ | 20 | /* Intel MP Floating Pointer Structure */ |
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3ea3dc487047..4d0dfa0d998e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -123,12 +123,16 @@ | |||
123 | #define MSR_AMD64_IBSCTL 0xc001103a | 123 | #define MSR_AMD64_IBSCTL 0xc001103a |
124 | #define MSR_AMD64_IBSBRTARGET 0xc001103b | 124 | #define MSR_AMD64_IBSBRTARGET 0xc001103b |
125 | 125 | ||
126 | /* Fam 15h MSRs */ | ||
127 | #define MSR_F15H_PERF_CTL 0xc0010200 | ||
128 | #define MSR_F15H_PERF_CTR 0xc0010201 | ||
129 | |||
126 | /* Fam 10h MSRs */ | 130 | /* Fam 10h MSRs */ |
127 | #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 | 131 | #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 |
128 | #define FAM10H_MMIO_CONF_ENABLE (1<<0) | 132 | #define FAM10H_MMIO_CONF_ENABLE (1<<0) |
129 | #define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf | 133 | #define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf |
130 | #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 | 134 | #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 |
131 | #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff | 135 | #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL |
132 | #define FAM10H_MMIO_CONF_BASE_SHIFT 20 | 136 | #define FAM10H_MMIO_CONF_BASE_SHIFT 20 |
133 | #define MSR_FAM10H_NODE_ID 0xc001100c | 137 | #define MSR_FAM10H_NODE_ID 0xc001100c |
134 | 138 | ||
@@ -253,6 +257,18 @@ | |||
253 | #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) | 257 | #define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) |
254 | #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) | 258 | #define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) |
255 | 259 | ||
260 | /* Thermal Thresholds Support */ | ||
261 | #define THERM_INT_THRESHOLD0_ENABLE (1 << 15) | ||
262 | #define THERM_SHIFT_THRESHOLD0 8 | ||
263 | #define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0) | ||
264 | #define THERM_INT_THRESHOLD1_ENABLE (1 << 23) | ||
265 | #define THERM_SHIFT_THRESHOLD1 16 | ||
266 | #define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1) | ||
267 | #define THERM_STATUS_THRESHOLD0 (1 << 6) | ||
268 | #define THERM_LOG_THRESHOLD0 (1 << 7) | ||
269 | #define THERM_STATUS_THRESHOLD1 (1 << 8) | ||
270 | #define THERM_LOG_THRESHOLD1 (1 << 9) | ||
271 | |||
256 | /* MISC_ENABLE bits: architectural */ | 272 | /* MISC_ENABLE bits: architectural */ |
257 | #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) | 273 | #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) |
258 | #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) | 274 | #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) |
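The new THERM_*_THRESHOLD* definitions describe the two programmable thermal thresholds in the thermal interrupt MSR. A sketch of programming threshold 0, assuming the existing MSR_IA32_THERM_INTERRUPT register and rdmsrl()/wrmsrl() accessors; the helper name is made up:

#include <asm/msr.h>

static void example_set_therm_threshold0(unsigned int offset)
{
	u64 val;

	rdmsrl(MSR_IA32_THERM_INTERRUPT, val);
	val &= ~THERM_MASK_THRESHOLD0;
	val |= (offset << THERM_SHIFT_THRESHOLD0) & THERM_MASK_THRESHOLD0;
	val |= THERM_INT_THRESHOLD0_ENABLE;
	wrmsrl(MSR_IA32_THERM_INTERRUPT, val);
}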
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 932f0f86b4b7..c76f5b92b840 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h | |||
@@ -5,41 +5,15 @@ | |||
5 | #include <asm/irq.h> | 5 | #include <asm/irq.h> |
6 | #include <asm/io.h> | 6 | #include <asm/io.h> |
7 | 7 | ||
8 | #ifdef ARCH_HAS_NMI_WATCHDOG | 8 | #ifdef CONFIG_X86_LOCAL_APIC |
9 | |||
10 | /** | ||
11 | * do_nmi_callback | ||
12 | * | ||
13 | * Check to see if a callback exists and execute it. Return 1 | ||
14 | * if the handler exists and was handled successfully. | ||
15 | */ | ||
16 | int do_nmi_callback(struct pt_regs *regs, int cpu); | ||
17 | 9 | ||
18 | extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); | 10 | extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); |
19 | extern int check_nmi_watchdog(void); | ||
20 | #if !defined(CONFIG_LOCKUP_DETECTOR) | ||
21 | extern int nmi_watchdog_enabled; | ||
22 | #endif | ||
23 | extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); | 11 | extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); |
24 | extern int reserve_perfctr_nmi(unsigned int); | 12 | extern int reserve_perfctr_nmi(unsigned int); |
25 | extern void release_perfctr_nmi(unsigned int); | 13 | extern void release_perfctr_nmi(unsigned int); |
26 | extern int reserve_evntsel_nmi(unsigned int); | 14 | extern int reserve_evntsel_nmi(unsigned int); |
27 | extern void release_evntsel_nmi(unsigned int); | 15 | extern void release_evntsel_nmi(unsigned int); |
28 | 16 | ||
29 | extern void setup_apic_nmi_watchdog(void *); | ||
30 | extern void stop_apic_nmi_watchdog(void *); | ||
31 | extern void disable_timer_nmi_watchdog(void); | ||
32 | extern void enable_timer_nmi_watchdog(void); | ||
33 | extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason); | ||
34 | extern void cpu_nmi_set_wd_enabled(void); | ||
35 | |||
36 | extern atomic_t nmi_active; | ||
37 | extern unsigned int nmi_watchdog; | ||
38 | #define NMI_NONE 0 | ||
39 | #define NMI_IO_APIC 1 | ||
40 | #define NMI_LOCAL_APIC 2 | ||
41 | #define NMI_INVALID 3 | ||
42 | |||
43 | struct ctl_table; | 17 | struct ctl_table; |
44 | extern int proc_nmi_enabled(struct ctl_table *, int , | 18 | extern int proc_nmi_enabled(struct ctl_table *, int , |
45 | void __user *, size_t *, loff_t *); | 19 | void __user *, size_t *, loff_t *); |
@@ -47,33 +21,28 @@ extern int unknown_nmi_panic; | |||
47 | 21 | ||
48 | void arch_trigger_all_cpu_backtrace(void); | 22 | void arch_trigger_all_cpu_backtrace(void); |
49 | #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace | 23 | #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace |
24 | #endif | ||
50 | 25 | ||
51 | static inline void localise_nmi_watchdog(void) | 26 | /* |
52 | { | 27 | * Define some priorities for the nmi notifier call chain. |
53 | if (nmi_watchdog == NMI_IO_APIC) | 28 | * |
54 | nmi_watchdog = NMI_LOCAL_APIC; | 29 | * Create a local nmi bit that has a higher priority than |
55 | } | 30 | * external nmis, because the local ones are more frequent. |
31 | * | ||
32 | * Also set up some default high/normal/low settings for | ||
33 | * subsystems to register with. Using 4 bits to separate | ||
34 | * the priorities. This can go a lot higher if need be. | ||
35 | */ | ||
56 | 36 | ||
57 | /* check if nmi_watchdog is active (ie was specified at boot) */ | 37 | #define NMI_LOCAL_SHIFT 16 /* randomly picked */ |
58 | static inline int nmi_watchdog_active(void) | 38 | #define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT) |
59 | { | 39 | #define NMI_HIGH_PRIOR (1ULL << 8) |
60 | /* | 40 | #define NMI_NORMAL_PRIOR (1ULL << 4) |
61 | * actually it should be: | 41 | #define NMI_LOW_PRIOR (1ULL << 0) |
62 | * return (nmi_watchdog == NMI_LOCAL_APIC || | 42 | #define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR) |
63 | * nmi_watchdog == NMI_IO_APIC) | 43 | #define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR) |
64 | * but since they are power of two we could use a | 44 | #define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR) |
65 | * cheaper way --cvg | ||
66 | */ | ||
67 | return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); | ||
68 | } | ||
69 | #endif | ||
70 | 45 | ||
71 | void lapic_watchdog_stop(void); | ||
72 | int lapic_watchdog_init(unsigned nmi_hz); | ||
73 | int lapic_wd_event(unsigned nmi_hz); | ||
74 | unsigned lapic_adjust_nmi_hz(unsigned hz); | ||
75 | void disable_lapic_nmi_watchdog(void); | ||
76 | void enable_lapic_nmi_watchdog(void); | ||
77 | void stop_nmi(void); | 46 | void stop_nmi(void); |
78 | void restart_nmi(void); | 47 | void restart_nmi(void); |
79 | 48 | ||
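The new NMI_*_PRIOR values are notifier priorities for NMI consumers. A minimal sketch of a subsystem registering on the die-notifier chain with one of them, assuming the usual DIE_NMI path of this era; the callback body is illustrative:

#include <linux/kdebug.h>
#include <linux/notifier.h>

static int example_nmi_notify(struct notifier_block *nb,
			      unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;
	/* ... inspect struct die_args in 'data' ... */
	return NOTIFY_STOP;		/* claim the NMI */
}

static struct notifier_block example_nmi_nb = {
	.notifier_call	= example_nmi_notify,
	.priority	= NMI_LOCAL_NORMAL_PRIOR,	/* local source, default prio */
};

/* register_die_notifier(&example_nmi_nb); */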
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index a37229011b56..b0ef2b449a9d 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _ASM_X86_NUMA_32_H | 1 | #ifndef _ASM_X86_NUMA_32_H |
2 | #define _ASM_X86_NUMA_32_H | 2 | #define _ASM_X86_NUMA_32_H |
3 | 3 | ||
4 | extern int numa_off; | ||
5 | |||
4 | extern int pxm_to_nid(int pxm); | 6 | extern int pxm_to_nid(int pxm); |
5 | extern void numa_remove_cpu(int cpu); | 7 | extern void numa_remove_cpu(int cpu); |
6 | 8 | ||
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 823e070e7c26..0493be39607c 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h | |||
@@ -38,8 +38,9 @@ extern void __cpuinit numa_add_cpu(int cpu); | |||
38 | extern void __cpuinit numa_remove_cpu(int cpu); | 38 | extern void __cpuinit numa_remove_cpu(int cpu); |
39 | 39 | ||
40 | #ifdef CONFIG_NUMA_EMU | 40 | #ifdef CONFIG_NUMA_EMU |
41 | #define FAKE_NODE_MIN_SIZE ((u64)64 << 20) | 41 | #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) |
42 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) | 42 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) |
43 | void numa_emu_cmdline(char *); | ||
43 | #endif /* CONFIG_NUMA_EMU */ | 44 | #endif /* CONFIG_NUMA_EMU */ |
44 | #else | 45 | #else |
45 | static inline void init_cpu_to_node(void) { } | 46 | static inline void init_cpu_to_node(void) { } |
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 42a978c0c1b3..f482010350fb 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h | |||
@@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits); | |||
107 | /* GPIO assignments */ | 107 | /* GPIO assignments */ |
108 | 108 | ||
109 | #define OLPC_GPIO_MIC_AC 1 | 109 | #define OLPC_GPIO_MIC_AC 1 |
110 | #define OLPC_GPIO_DCON_IRQ geode_gpio(7) | 110 | #define OLPC_GPIO_DCON_STAT0 5 |
111 | #define OLPC_GPIO_DCON_STAT1 6 | ||
112 | #define OLPC_GPIO_DCON_IRQ 7 | ||
111 | #define OLPC_GPIO_THRM_ALRM geode_gpio(10) | 113 | #define OLPC_GPIO_THRM_ALRM geode_gpio(10) |
112 | #define OLPC_GPIO_SMB_CLK geode_gpio(14) | 114 | #define OLPC_GPIO_DCON_LOAD 11 |
113 | #define OLPC_GPIO_SMB_DATA geode_gpio(15) | 115 | #define OLPC_GPIO_DCON_BLANK 12 |
116 | #define OLPC_GPIO_SMB_CLK 14 | ||
117 | #define OLPC_GPIO_SMB_DATA 15 | ||
114 | #define OLPC_GPIO_WORKAUX geode_gpio(24) | 118 | #define OLPC_GPIO_WORKAUX geode_gpio(24) |
115 | #define OLPC_GPIO_LID geode_gpio(26) | 119 | #define OLPC_GPIO_LID geode_gpio(26) |
116 | #define OLPC_GPIO_ECSCI geode_gpio(27) | 120 | #define OLPC_GPIO_ECSCI geode_gpio(27) |
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 2a8478140bb3..641988efe063 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h | |||
@@ -8,6 +8,8 @@ | |||
8 | 8 | ||
9 | #ifdef CONFIG_OLPC_OPENFIRMWARE | 9 | #ifdef CONFIG_OLPC_OPENFIRMWARE |
10 | 10 | ||
11 | extern bool olpc_ofw_is_installed(void); | ||
12 | |||
11 | /* run an OFW command by calling into the firmware */ | 13 | /* run an OFW command by calling into the firmware */ |
12 | #define olpc_ofw(name, args, res) \ | 14 | #define olpc_ofw(name, args, res) \ |
13 | __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) | 15 | __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) |
@@ -26,10 +28,17 @@ extern bool olpc_ofw_present(void); | |||
26 | 28 | ||
27 | #else /* !CONFIG_OLPC_OPENFIRMWARE */ | 29 | #else /* !CONFIG_OLPC_OPENFIRMWARE */ |
28 | 30 | ||
31 | static inline bool olpc_ofw_is_installed(void) { return false; } | ||
29 | static inline void olpc_ofw_detect(void) { } | 32 | static inline void olpc_ofw_detect(void) { } |
30 | static inline void setup_olpc_ofw_pgd(void) { } | 33 | static inline void setup_olpc_ofw_pgd(void) { } |
31 | static inline bool olpc_ofw_present(void) { return false; } | 34 | static inline bool olpc_ofw_present(void) { return false; } |
32 | 35 | ||
33 | #endif /* !CONFIG_OLPC_OPENFIRMWARE */ | 36 | #endif /* !CONFIG_OLPC_OPENFIRMWARE */ |
34 | 37 | ||
38 | #ifdef CONFIG_OLPC_OPENFIRMWARE_DT | ||
39 | extern void olpc_dt_build_devicetree(void); | ||
40 | #else | ||
41 | static inline void olpc_dt_build_devicetree(void) { } | ||
42 | #endif /* CONFIG_OLPC_OPENFIRMWARE_DT */ | ||
43 | |||
35 | #endif /* _ASM_X86_OLPC_OFW_H */ | 44 | #endif /* _ASM_X86_OLPC_OFW_H */ |
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 18e3b8a8709f..ebbc4d8ab170 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void) | |||
112 | 112 | ||
113 | static inline void halt(void) | 113 | static inline void halt(void) |
114 | { | 114 | { |
115 | PVOP_VCALL0(pv_irq_ops.safe_halt); | 115 | PVOP_VCALL0(pv_irq_ops.halt); |
116 | } | 116 | } |
117 | 117 | ||
118 | static inline void wbinvd(void) | 118 | static inline void wbinvd(void) |
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, | |||
435 | { | 435 | { |
436 | PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); | 436 | PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); |
437 | } | 437 | } |
438 | static inline void pmd_update(struct mm_struct *mm, unsigned long addr, | ||
439 | pmd_t *pmdp) | ||
440 | { | ||
441 | PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); | ||
442 | } | ||
438 | 443 | ||
439 | static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, | 444 | static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, |
440 | pte_t *ptep) | 445 | pte_t *ptep) |
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, | |||
442 | PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); | 447 | PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); |
443 | } | 448 | } |
444 | 449 | ||
450 | static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, | ||
451 | pmd_t *pmdp) | ||
452 | { | ||
453 | PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); | ||
454 | } | ||
455 | |||
445 | static inline pte_t __pte(pteval_t val) | 456 | static inline pte_t __pte(pteval_t val) |
446 | { | 457 | { |
447 | pteval_t ret; | 458 | pteval_t ret; |
@@ -543,6 +554,19 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
543 | PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); | 554 | PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); |
544 | } | 555 | } |
545 | 556 | ||
557 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
558 | static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, | ||
559 | pmd_t *pmdp, pmd_t pmd) | ||
560 | { | ||
561 | if (sizeof(pmdval_t) > sizeof(long)) | ||
562 | /* 5 arg words */ | ||
563 | pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); | ||
564 | else | ||
565 | PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, | ||
566 | native_pmd_val(pmd)); | ||
567 | } | ||
568 | #endif | ||
569 | |||
546 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) | 570 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) |
547 | { | 571 | { |
548 | pmdval_t val = native_pmd_val(pmd); | 572 | pmdval_t val = native_pmd_val(pmd); |
@@ -824,27 +848,27 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) | |||
824 | #define __PV_IS_CALLEE_SAVE(func) \ | 848 | #define __PV_IS_CALLEE_SAVE(func) \ |
825 | ((struct paravirt_callee_save) { func }) | 849 | ((struct paravirt_callee_save) { func }) |
826 | 850 | ||
827 | static inline unsigned long arch_local_save_flags(void) | 851 | static inline notrace unsigned long arch_local_save_flags(void) |
828 | { | 852 | { |
829 | return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); | 853 | return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); |
830 | } | 854 | } |
831 | 855 | ||
832 | static inline void arch_local_irq_restore(unsigned long f) | 856 | static inline notrace void arch_local_irq_restore(unsigned long f) |
833 | { | 857 | { |
834 | PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); | 858 | PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); |
835 | } | 859 | } |
836 | 860 | ||
837 | static inline void arch_local_irq_disable(void) | 861 | static inline notrace void arch_local_irq_disable(void) |
838 | { | 862 | { |
839 | PVOP_VCALLEE0(pv_irq_ops.irq_disable); | 863 | PVOP_VCALLEE0(pv_irq_ops.irq_disable); |
840 | } | 864 | } |
841 | 865 | ||
842 | static inline void arch_local_irq_enable(void) | 866 | static inline notrace void arch_local_irq_enable(void) |
843 | { | 867 | { |
844 | PVOP_VCALLEE0(pv_irq_ops.irq_enable); | 868 | PVOP_VCALLEE0(pv_irq_ops.irq_enable); |
845 | } | 869 | } |
846 | 870 | ||
847 | static inline unsigned long arch_local_irq_save(void) | 871 | static inline notrace unsigned long arch_local_irq_save(void) |
848 | { | 872 | { |
849 | unsigned long f; | 873 | unsigned long f; |
850 | 874 | ||
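The set_pmd_at() wrapper added above routes huge-pmd installation through pv_mmu_ops for transparent hugepages. A sketch of a caller, not the real huge_memory.c code; the function and parameter names are illustrative:

static void example_install_huge_pmd(struct mm_struct *mm,
				     unsigned long haddr, pmd_t *pmdp,
				     struct page *page, pgprot_t prot)
{
	pmd_t entry = mk_pmd(page, prot);

	entry = pmd_mkhuge(entry);
	/* expands to a PVOP_VCALL4 under CONFIG_PARAVIRT */
	set_pmd_at(mm, haddr, pmdp, entry);
}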
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b82bac975250..82885099c869 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h | |||
@@ -265,10 +265,16 @@ struct pv_mmu_ops { | |||
265 | void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, | 265 | void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, |
266 | pte_t *ptep, pte_t pteval); | 266 | pte_t *ptep, pte_t pteval); |
267 | void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); | 267 | void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); |
268 | void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, | ||
269 | pmd_t *pmdp, pmd_t pmdval); | ||
268 | void (*pte_update)(struct mm_struct *mm, unsigned long addr, | 270 | void (*pte_update)(struct mm_struct *mm, unsigned long addr, |
269 | pte_t *ptep); | 271 | pte_t *ptep); |
270 | void (*pte_update_defer)(struct mm_struct *mm, | 272 | void (*pte_update_defer)(struct mm_struct *mm, |
271 | unsigned long addr, pte_t *ptep); | 273 | unsigned long addr, pte_t *ptep); |
274 | void (*pmd_update)(struct mm_struct *mm, unsigned long addr, | ||
275 | pmd_t *pmdp); | ||
276 | void (*pmd_update_defer)(struct mm_struct *mm, | ||
277 | unsigned long addr, pmd_t *pmdp); | ||
272 | 278 | ||
273 | pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, | 279 | pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, |
274 | pte_t *ptep); | 280 | pte_t *ptep); |
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ca0437c714b2..676129229630 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h | |||
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start; | |||
65 | 65 | ||
66 | #define PCIBIOS_MIN_CARDBUS_IO 0x4000 | 66 | #define PCIBIOS_MIN_CARDBUS_IO 0x4000 |
67 | 67 | ||
68 | extern int pcibios_enabled; | ||
68 | void pcibios_config_init(void); | 69 | void pcibios_config_init(void); |
69 | struct pci_bus *pcibios_scan_root(int bus); | 70 | struct pci_bus *pcibios_scan_root(int bus); |
70 | 71 | ||
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index f899e01a8ac9..7e172955ee57 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h | |||
@@ -230,6 +230,125 @@ do { \ | |||
230 | }) | 230 | }) |
231 | 231 | ||
232 | /* | 232 | /* |
233 | * Add return operation | ||
234 | */ | ||
235 | #define percpu_add_return_op(var, val) \ | ||
236 | ({ \ | ||
237 | typeof(var) paro_ret__ = val; \ | ||
238 | switch (sizeof(var)) { \ | ||
239 | case 1: \ | ||
240 | asm("xaddb %0, "__percpu_arg(1) \ | ||
241 | : "+q" (paro_ret__), "+m" (var) \ | ||
242 | : : "memory"); \ | ||
243 | break; \ | ||
244 | case 2: \ | ||
245 | asm("xaddw %0, "__percpu_arg(1) \ | ||
246 | : "+r" (paro_ret__), "+m" (var) \ | ||
247 | : : "memory"); \ | ||
248 | break; \ | ||
249 | case 4: \ | ||
250 | asm("xaddl %0, "__percpu_arg(1) \ | ||
251 | : "+r" (paro_ret__), "+m" (var) \ | ||
252 | : : "memory"); \ | ||
253 | break; \ | ||
254 | case 8: \ | ||
255 | asm("xaddq %0, "__percpu_arg(1) \ | ||
256 | : "+re" (paro_ret__), "+m" (var) \ | ||
257 | : : "memory"); \ | ||
258 | break; \ | ||
259 | default: __bad_percpu_size(); \ | ||
260 | } \ | ||
261 | paro_ret__ += val; \ | ||
262 | paro_ret__; \ | ||
263 | }) | ||
264 | |||
265 | /* | ||
266 | * xchg is implemented using cmpxchg without a lock prefix. xchg is | ||
267 | * expensive due to the implied lock prefix. The processor cannot prefetch | ||
268 | * cachelines if xchg is used. | ||
269 | */ | ||
270 | #define percpu_xchg_op(var, nval) \ | ||
271 | ({ \ | ||
272 | typeof(var) pxo_ret__; \ | ||
273 | typeof(var) pxo_new__ = (nval); \ | ||
274 | switch (sizeof(var)) { \ | ||
275 | case 1: \ | ||
276 | asm("\n\tmov "__percpu_arg(1)",%%al" \ | ||
277 | "\n1:\tcmpxchgb %2, "__percpu_arg(1) \ | ||
278 | "\n\tjnz 1b" \ | ||
279 | : "=&a" (pxo_ret__), "+m" (var) \ | ||
280 | : "q" (pxo_new__) \ | ||
281 | : "memory"); \ | ||
282 | break; \ | ||
283 | case 2: \ | ||
284 | asm("\n\tmov "__percpu_arg(1)",%%ax" \ | ||
285 | "\n1:\tcmpxchgw %2, "__percpu_arg(1) \ | ||
286 | "\n\tjnz 1b" \ | ||
287 | : "=&a" (pxo_ret__), "+m" (var) \ | ||
288 | : "r" (pxo_new__) \ | ||
289 | : "memory"); \ | ||
290 | break; \ | ||
291 | case 4: \ | ||
292 | asm("\n\tmov "__percpu_arg(1)",%%eax" \ | ||
293 | "\n1:\tcmpxchgl %2, "__percpu_arg(1) \ | ||
294 | "\n\tjnz 1b" \ | ||
295 | : "=&a" (pxo_ret__), "+m" (var) \ | ||
296 | : "r" (pxo_new__) \ | ||
297 | : "memory"); \ | ||
298 | break; \ | ||
299 | case 8: \ | ||
300 | asm("\n\tmov "__percpu_arg(1)",%%rax" \ | ||
301 | "\n1:\tcmpxchgq %2, "__percpu_arg(1) \ | ||
302 | "\n\tjnz 1b" \ | ||
303 | : "=&a" (pxo_ret__), "+m" (var) \ | ||
304 | : "r" (pxo_new__) \ | ||
305 | : "memory"); \ | ||
306 | break; \ | ||
307 | default: __bad_percpu_size(); \ | ||
308 | } \ | ||
309 | pxo_ret__; \ | ||
310 | }) | ||
311 | |||
312 | /* | ||
313 | * cmpxchg has no such implied lock semantics as a result it is much | ||
314 | * more efficient for cpu local operations. | ||
315 | */ | ||
316 | #define percpu_cmpxchg_op(var, oval, nval) \ | ||
317 | ({ \ | ||
318 | typeof(var) pco_ret__; \ | ||
319 | typeof(var) pco_old__ = (oval); \ | ||
320 | typeof(var) pco_new__ = (nval); \ | ||
321 | switch (sizeof(var)) { \ | ||
322 | case 1: \ | ||
323 | asm("cmpxchgb %2, "__percpu_arg(1) \ | ||
324 | : "=a" (pco_ret__), "+m" (var) \ | ||
325 | : "q" (pco_new__), "0" (pco_old__) \ | ||
326 | : "memory"); \ | ||
327 | break; \ | ||
328 | case 2: \ | ||
329 | asm("cmpxchgw %2, "__percpu_arg(1) \ | ||
330 | : "=a" (pco_ret__), "+m" (var) \ | ||
331 | : "r" (pco_new__), "0" (pco_old__) \ | ||
332 | : "memory"); \ | ||
333 | break; \ | ||
334 | case 4: \ | ||
335 | asm("cmpxchgl %2, "__percpu_arg(1) \ | ||
336 | : "=a" (pco_ret__), "+m" (var) \ | ||
337 | : "r" (pco_new__), "0" (pco_old__) \ | ||
338 | : "memory"); \ | ||
339 | break; \ | ||
340 | case 8: \ | ||
341 | asm("cmpxchgq %2, "__percpu_arg(1) \ | ||
342 | : "=a" (pco_ret__), "+m" (var) \ | ||
343 | : "r" (pco_new__), "0" (pco_old__) \ | ||
344 | : "memory"); \ | ||
345 | break; \ | ||
346 | default: __bad_percpu_size(); \ | ||
347 | } \ | ||
348 | pco_ret__; \ | ||
349 | }) | ||
350 | |||
351 | /* | ||
233 | * percpu_read() makes gcc load the percpu variable every time it is | 352 | * percpu_read() makes gcc load the percpu variable every time it is |
234 | * accessed while percpu_read_stable() allows the value to be cached. | 353 | * accessed while percpu_read_stable() allows the value to be cached. |
235 | * percpu_read_stable() is more efficient and can be used if its value | 354 | * percpu_read_stable() is more efficient and can be used if its value |
@@ -267,6 +386,12 @@ do { \ | |||
267 | #define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) | 386 | #define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) |
268 | #define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) | 387 | #define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) |
269 | #define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) | 388 | #define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) |
389 | /* | ||
390 | * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much | ||
391 | * faster than an xchg with forced lock semantics. | ||
392 | */ | ||
393 | #define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) | ||
394 | #define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
270 | 395 | ||
271 | #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) | 396 | #define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) |
272 | #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) | 397 | #define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) |
@@ -286,6 +411,9 @@ do { \ | |||
286 | #define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) | 411 | #define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) |
287 | #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) | 412 | #define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) |
288 | #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) | 413 | #define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) |
414 | #define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) | ||
415 | #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) | ||
416 | #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) | ||
289 | 417 | ||
290 | #define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) | 418 | #define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) |
291 | #define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) | 419 | #define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) |
@@ -299,6 +427,29 @@ do { \ | |||
299 | #define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) | 427 | #define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) |
300 | #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) | 428 | #define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) |
301 | #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) | 429 | #define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) |
430 | #define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) | ||
431 | #define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) | ||
432 | #define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) | ||
433 | |||
434 | #ifndef CONFIG_M386 | ||
435 | #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) | ||
436 | #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) | ||
437 | #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) | ||
438 | #define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
439 | #define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
440 | #define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
441 | |||
442 | #define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) | ||
443 | #define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) | ||
444 | #define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) | ||
445 | #define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
446 | #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
447 | #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
448 | |||
449 | #define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
450 | #define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
451 | #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
452 | #endif /* !CONFIG_M386 */ | ||
302 | 453 | ||
303 | /* | 454 | /* |
304 | * Per cpu atomic 64 bit operations are only available under 64 bit. | 455 | * Per cpu atomic 64 bit operations are only available under 64 bit. |
@@ -311,6 +462,7 @@ do { \ | |||
311 | #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) | 462 | #define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) |
312 | #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) | 463 | #define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) |
313 | #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) | 464 | #define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) |
465 | #define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) | ||
314 | 466 | ||
315 | #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) | 467 | #define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) |
316 | #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) | 468 | #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) |
@@ -318,12 +470,16 @@ do { \ | |||
318 | #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) | 470 | #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) |
319 | #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) | 471 | #define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) |
320 | #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) | 472 | #define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) |
473 | #define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) | ||
474 | #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) | ||
475 | #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
321 | 476 | ||
322 | #define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) | 477 | #define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) |
323 | #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) | 478 | #define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) |
324 | #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) | 479 | #define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) |
325 | #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) | 480 | #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) |
326 | 481 | #define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) | |
482 | #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) | ||
327 | #endif | 483 | #endif |
328 | 484 | ||
329 | /* This is not atomic against other CPUs -- CPU preemption needs to be off */ | 485 | /* This is not atomic against other CPUs -- CPU preemption needs to be off */ |
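The percpu.h additions implement xadd- and cmpxchg-based per-cpu primitives (add_return, xchg, cmpxchg) without a LOCK prefix, since only the local CPU touches the variable. A minimal usage sketch, assuming the generic this_cpu_add_return()/this_cpu_cmpxchg() wrappers that these x86 cases back; the counter name is illustrative:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_count);

static int example_bump(void)
{
	/* xadd-based: returns the post-increment value for this CPU */
	return this_cpu_add_return(example_count, 1);
}

static void example_reset_if(int expected)
{
	/* cmpxchg without LOCK: cheap because only this CPU uses it */
	this_cpu_cmpxchg(example_count, expected, 0);
}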
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 550e26b1dbb3..d9d4dae305f6 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h | |||
@@ -125,7 +125,6 @@ union cpuid10_edx { | |||
125 | #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ | 125 | #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ |
126 | 126 | ||
127 | #ifdef CONFIG_PERF_EVENTS | 127 | #ifdef CONFIG_PERF_EVENTS |
128 | extern void init_hw_perf_events(void); | ||
129 | extern void perf_events_lapic_init(void); | 128 | extern void perf_events_lapic_init(void); |
130 | 129 | ||
131 | #define PERF_EVENT_INDEX_OFFSET 0 | 130 | #define PERF_EVENT_INDEX_OFFSET 0 |
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); | |||
156 | } | 155 | } |
157 | 156 | ||
158 | #else | 157 | #else |
159 | static inline void init_hw_perf_events(void) { } | ||
160 | static inline void perf_events_lapic_init(void) { } | 158 | static inline void perf_events_lapic_init(void) { } |
161 | #endif | 159 | #endif |
162 | 160 | ||
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index a70cd216be5d..e2f6a99f14ab 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h | |||
@@ -20,6 +20,9 @@ | |||
20 | #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) | 20 | #define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) |
21 | #define ARCH_P4_MAX_CCCR (18) | 21 | #define ARCH_P4_MAX_CCCR (18) |
22 | 22 | ||
23 | #define ARCH_P4_CNTRVAL_BITS (40) | ||
24 | #define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1) | ||
25 | |||
23 | #define P4_ESCR_EVENT_MASK 0x7e000000U | 26 | #define P4_ESCR_EVENT_MASK 0x7e000000U |
24 | #define P4_ESCR_EVENT_SHIFT 25 | 27 | #define P4_ESCR_EVENT_SHIFT 25 |
25 | #define P4_ESCR_EVENTMASK_MASK 0x01fffe00U | 28 | #define P4_ESCR_EVENTMASK_MASK 0x01fffe00U |
@@ -744,14 +747,6 @@ enum P4_ESCR_EMASKS { | |||
744 | }; | 747 | }; |
745 | 748 | ||
746 | /* | 749 | /* |
747 | * P4 PEBS specifics (Replay Event only) | ||
748 | * | ||
749 | * Format (bits): | ||
750 | * 0-6: metric from P4_PEBS_METRIC enum | ||
751 | * 7 : reserved | ||
752 | * 8 : reserved | ||
753 | * 9-11 : reserved | ||
754 | * | ||
755 | * Note we have UOP and PEBS bits reserved for now | 750 | * Note we have UOP and PEBS bits reserved for now |
756 | * just in case we need them later | 751 | * just in case we need them later |
757 | */ | 752 | */ |
@@ -788,5 +783,60 @@ enum P4_PEBS_METRIC { | |||
788 | P4_PEBS_METRIC__max | 783 | P4_PEBS_METRIC__max |
789 | }; | 784 | }; |
790 | 785 | ||
786 | /* | ||
787 | * Notes on internal configuration of ESCR+CCCR tuples | ||
788 | * | ||
789 | * Since P4 has quite a different performance register ||
790 | * architecture compared with the "architectural" ones, ||
791 | * and we have only 64 bits to keep the configuration ||
792 | * of a performance event, the following trick is used. ||
793 | * | ||
794 | * 1) Since both the ESCR and CCCR registers have only their ||
795 | * low 32 bits meaningful, we pack them into a single 64-bit ||
796 | * configuration. Low 32 bits of such config correspond | ||
797 | * to low 32 bits of CCCR register and high 32 bits | ||
798 | * correspond to low 32 bits of ESCR register. | ||
799 | * | ||
800 | * 2) The meaning of every bit of such a config field can ||
801 | * be found in the Intel SDM, but it should be noted that ||
802 | * we "borrow" some reserved bits for our own use and ||
803 | * clear them or set them to a proper value when we do ||
804 | * a real write to hardware registers. | ||
805 | * | ||
806 | * 3) The bits of the config have the following format ||
807 | * and should be either 0 or set to one of the predefined ||
808 | * values: ||
809 | * | ||
810 | * Low 32 bits | ||
811 | * ----------- | ||
812 | * 0-6: P4_PEBS_METRIC enum | ||
813 | * 7-11: reserved | ||
814 | * 12: reserved (Enable) | ||
815 | * 13-15: reserved (ESCR select) | ||
816 | * 16-17: Active Thread | ||
817 | * 18: Compare | ||
818 | * 19: Complement | ||
819 | * 20-23: Threshold | ||
820 | * 24: Edge | ||
821 | * 25: reserved (FORCE_OVF) | ||
822 | * 26: reserved (OVF_PMI_T0) | ||
823 | * 27: reserved (OVF_PMI_T1) | ||
824 | * 28-29: reserved | ||
825 | * 30: reserved (Cascade) | ||
826 | * 31: reserved (OVF) | ||
827 | * | ||
828 | * High 32 bits | ||
829 | * ------------ | ||
830 | * 0: reserved (T1_USR) | ||
831 | * 1: reserved (T1_OS) | ||
832 | * 2: reserved (T0_USR) | ||
833 | * 3: reserved (T0_OS) | ||
834 | * 4: Tag Enable | ||
835 | * 5-8: Tag Value | ||
836 | * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper) | ||
837 | * 25-30: enum P4_EVENTS | ||
838 | * 31: reserved (HT thread) | ||
839 | */ | ||
840 | |||
791 | #endif /* PERF_EVENT_P4_H */ | 841 | #endif /* PERF_EVENT_P4_H */ |
792 | 842 | ||
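To make the packing described in the new comment block concrete, here is a minimal standalone sketch; the helper names are invented for illustration. The low 32 bits of the event config carry the low half of the CCCR, the high 32 bits carry the low half of the ESCR:

	#include <stdint.h>

	/* pack the low halves of CCCR and ESCR into one 64-bit event config */
	static uint64_t p4_pack_config(uint32_t cccr_lo, uint32_t escr_lo)
	{
		return ((uint64_t)escr_lo << 32) | cccr_lo;
	}

	static uint32_t p4_config_cccr(uint64_t cfg) { return (uint32_t)cfg; }
	static uint32_t p4_config_escr(uint64_t cfg) { return (uint32_t)(cfg >> 32); }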
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 271de94c3810..b4389a468fb6 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h | |||
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |||
92 | extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); | 92 | extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); |
93 | 93 | ||
94 | static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, | 94 | static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, |
95 | unsigned long adddress) | 95 | unsigned long address) |
96 | { | 96 | { |
97 | ___pmd_free_tlb(tlb, pmd); | 97 | ___pmd_free_tlb(tlb, pmd); |
98 | } | 98 | } |
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982b339e..98391db840c6 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h | |||
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) | |||
46 | #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) | 46 | #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | #ifdef CONFIG_SMP | ||
50 | static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) | ||
51 | { | ||
52 | return __pmd(xchg((pmdval_t *)xp, 0)); | ||
53 | } | ||
54 | #else | ||
55 | #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) | ||
56 | #endif | ||
57 | |||
49 | /* | 58 | /* |
50 | * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, | 59 | * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, |
51 | * split up the 29 bits of offset into this range: | 60 | * split up the 29 bits of offset into this range: |
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b0165ea01..94b979d1b58d 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h | |||
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) | |||
104 | #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) | 104 | #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) |
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | #ifdef CONFIG_SMP | ||
108 | union split_pmd { | ||
109 | struct { | ||
110 | u32 pmd_low; | ||
111 | u32 pmd_high; | ||
112 | }; | ||
113 | pmd_t pmd; | ||
114 | }; | ||
115 | static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) | ||
116 | { | ||
117 | union split_pmd res, *orig = (union split_pmd *)pmdp; | ||
118 | |||
119 | /* xchg acts as a barrier before setting of the high bits */ | ||
120 | res.pmd_low = xchg(&orig->pmd_low, 0); | ||
121 | res.pmd_high = orig->pmd_high; | ||
122 | orig->pmd_high = 0; | ||
123 | |||
124 | return res.pmd; | ||
125 | } | ||
126 | #else | ||
127 | #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) | ||
128 | #endif | ||
129 | |||
107 | /* | 130 | /* |
108 | * Bits 0, 6 and 7 are taken in the low part of the pte, | 131 | * Bits 0, 6 and 7 are taken in the low part of the pte, |
109 | * put the 32 bits of offset into the high part. | 132 | * put the 32 bits of offset into the high part. |
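A user-space analogue of the split PMD clear above may help: on PAE the present bit sits in the low 32-bit half, so exchanging the low word first guarantees a concurrent hardware walk never sees a half-cleared but still-present entry. Illustrative sketch only, with invented names:

	#include <stdint.h>

	union split64 {
		struct { uint32_t lo, hi; };	/* the "present" bit lives in lo */
		uint64_t whole;
	};

	static uint64_t get_and_clear(union split64 *p)
	{
		union split64 res;

		/* clear the low word (and its present bit) atomically first */
		res.lo = __atomic_exchange_n(&p->lo, 0, __ATOMIC_SEQ_CST);
		res.hi = p->hi;
		p->hi = 0;
		return res.whole;
	}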
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada823a13c7c..18601c86fab1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); | |||
35 | #else /* !CONFIG_PARAVIRT */ | 35 | #else /* !CONFIG_PARAVIRT */ |
36 | #define set_pte(ptep, pte) native_set_pte(ptep, pte) | 36 | #define set_pte(ptep, pte) native_set_pte(ptep, pte) |
37 | #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) | 37 | #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) |
38 | #define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) | ||
38 | 39 | ||
39 | #define set_pte_atomic(ptep, pte) \ | 40 | #define set_pte_atomic(ptep, pte) \ |
40 | native_set_pte_atomic(ptep, pte) | 41 | native_set_pte_atomic(ptep, pte) |
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); | |||
59 | 60 | ||
60 | #define pte_update(mm, addr, ptep) do { } while (0) | 61 | #define pte_update(mm, addr, ptep) do { } while (0) |
61 | #define pte_update_defer(mm, addr, ptep) do { } while (0) | 62 | #define pte_update_defer(mm, addr, ptep) do { } while (0) |
63 | #define pmd_update(mm, addr, ptep) do { } while (0) | ||
64 | #define pmd_update_defer(mm, addr, ptep) do { } while (0) | ||
62 | 65 | ||
63 | #define pgd_val(x) native_pgd_val(x) | 66 | #define pgd_val(x) native_pgd_val(x) |
64 | #define __pgd(x) native_make_pgd(x) | 67 | #define __pgd(x) native_make_pgd(x) |
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte) | |||
94 | return pte_flags(pte) & _PAGE_ACCESSED; | 97 | return pte_flags(pte) & _PAGE_ACCESSED; |
95 | } | 98 | } |
96 | 99 | ||
100 | static inline int pmd_young(pmd_t pmd) | ||
101 | { | ||
102 | return pmd_flags(pmd) & _PAGE_ACCESSED; | ||
103 | } | ||
104 | |||
97 | static inline int pte_write(pte_t pte) | 105 | static inline int pte_write(pte_t pte) |
98 | { | 106 | { |
99 | return pte_flags(pte) & _PAGE_RW; | 107 | return pte_flags(pte) & _PAGE_RW; |
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte) | |||
142 | (_PAGE_PSE | _PAGE_PRESENT); | 150 | (_PAGE_PSE | _PAGE_PRESENT); |
143 | } | 151 | } |
144 | 152 | ||
153 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
154 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
155 | { | ||
156 | return pmd_val(pmd) & _PAGE_SPLITTING; | ||
157 | } | ||
158 | |||
159 | static inline int pmd_trans_huge(pmd_t pmd) | ||
160 | { | ||
161 | return pmd_val(pmd) & _PAGE_PSE; | ||
162 | } | ||
163 | |||
164 | static inline int has_transparent_hugepage(void) | ||
165 | { | ||
166 | return cpu_has_pse; | ||
167 | } | ||
168 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
169 | |||
145 | static inline pte_t pte_set_flags(pte_t pte, pteval_t set) | 170 | static inline pte_t pte_set_flags(pte_t pte, pteval_t set) |
146 | { | 171 | { |
147 | pteval_t v = native_pte_val(pte); | 172 | pteval_t v = native_pte_val(pte); |
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte) | |||
216 | return pte_set_flags(pte, _PAGE_SPECIAL); | 241 | return pte_set_flags(pte, _PAGE_SPECIAL); |
217 | } | 242 | } |
218 | 243 | ||
244 | static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) | ||
245 | { | ||
246 | pmdval_t v = native_pmd_val(pmd); | ||
247 | |||
248 | return __pmd(v | set); | ||
249 | } | ||
250 | |||
251 | static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) | ||
252 | { | ||
253 | pmdval_t v = native_pmd_val(pmd); | ||
254 | |||
255 | return __pmd(v & ~clear); | ||
256 | } | ||
257 | |||
258 | static inline pmd_t pmd_mkold(pmd_t pmd) | ||
259 | { | ||
260 | return pmd_clear_flags(pmd, _PAGE_ACCESSED); | ||
261 | } | ||
262 | |||
263 | static inline pmd_t pmd_wrprotect(pmd_t pmd) | ||
264 | { | ||
265 | return pmd_clear_flags(pmd, _PAGE_RW); | ||
266 | } | ||
267 | |||
268 | static inline pmd_t pmd_mkdirty(pmd_t pmd) | ||
269 | { | ||
270 | return pmd_set_flags(pmd, _PAGE_DIRTY); | ||
271 | } | ||
272 | |||
273 | static inline pmd_t pmd_mkhuge(pmd_t pmd) | ||
274 | { | ||
275 | return pmd_set_flags(pmd, _PAGE_PSE); | ||
276 | } | ||
277 | |||
278 | static inline pmd_t pmd_mkyoung(pmd_t pmd) | ||
279 | { | ||
280 | return pmd_set_flags(pmd, _PAGE_ACCESSED); | ||
281 | } | ||
282 | |||
283 | static inline pmd_t pmd_mkwrite(pmd_t pmd) | ||
284 | { | ||
285 | return pmd_set_flags(pmd, _PAGE_RW); | ||
286 | } | ||
287 | |||
288 | static inline pmd_t pmd_mknotpresent(pmd_t pmd) | ||
289 | { | ||
290 | return pmd_clear_flags(pmd, _PAGE_PRESENT); | ||
291 | } | ||
292 | |||
219 | /* | 293 | /* |
220 | * Mask out unsupported bits in a present pgprot. Non-present pgprots | 294 | * Mask out unsupported bits in a present pgprot. Non-present pgprots |
221 | * can use those bits for other purposes, so leave them be. | 295 | * can use those bits for other purposes, so leave them be. |
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) | |||
256 | return __pte(val); | 330 | return __pte(val); |
257 | } | 331 | } |
258 | 332 | ||
333 | static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) | ||
334 | { | ||
335 | pmdval_t val = pmd_val(pmd); | ||
336 | |||
337 | val &= _HPAGE_CHG_MASK; | ||
338 | val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; | ||
339 | |||
340 | return __pmd(val); | ||
341 | } | ||
342 | |||
259 | /* mprotect needs to preserve PAT bits when updating vm_page_prot */ | 343 | /* mprotect needs to preserve PAT bits when updating vm_page_prot */ |
260 | #define pgprot_modify pgprot_modify | 344 | #define pgprot_modify pgprot_modify |
261 | static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | 345 | static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) |
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) | |||
350 | * Currently stuck as a macro due to indirect forward reference to | 434 | * Currently stuck as a macro due to indirect forward reference to |
351 | * linux/mmzone.h's __section_mem_map_addr() definition: | 435 | * linux/mmzone.h's __section_mem_map_addr() definition: |
352 | */ | 436 | */ |
353 | #define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) | 437 | #define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) |
354 | 438 | ||
355 | /* | 439 | /* |
356 | * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] | 440 | * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] |
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) | |||
524 | return res; | 608 | return res; |
525 | } | 609 | } |
526 | 610 | ||
611 | static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) | ||
612 | { | ||
613 | pmd_t res = *pmdp; | ||
614 | |||
615 | native_pmd_clear(pmdp); | ||
616 | return res; | ||
617 | } | ||
618 | |||
527 | static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, | 619 | static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, |
528 | pte_t *ptep , pte_t pte) | 620 | pte_t *ptep , pte_t pte) |
529 | { | 621 | { |
530 | native_set_pte(ptep, pte); | 622 | native_set_pte(ptep, pte); |
531 | } | 623 | } |
532 | 624 | ||
625 | static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, | ||
626 | pmd_t *pmdp , pmd_t pmd) | ||
627 | { | ||
628 | native_set_pmd(pmdp, pmd); | ||
629 | } | ||
630 | |||
533 | #ifndef CONFIG_PARAVIRT | 631 | #ifndef CONFIG_PARAVIRT |
534 | /* | 632 | /* |
535 | * Rules for using pte_update - it must be called after any PTE update which | 633 | * Rules for using pte_update - it must be called after any PTE update which |
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, | |||
607 | 705 | ||
608 | #define flush_tlb_fix_spurious_fault(vma, address) | 706 | #define flush_tlb_fix_spurious_fault(vma, address) |
609 | 707 | ||
708 | #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) | ||
709 | |||
710 | #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS | ||
711 | extern int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
712 | unsigned long address, pmd_t *pmdp, | ||
713 | pmd_t entry, int dirty); | ||
714 | |||
715 | #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG | ||
716 | extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, | ||
717 | unsigned long addr, pmd_t *pmdp); | ||
718 | |||
719 | #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH | ||
720 | extern int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
721 | unsigned long address, pmd_t *pmdp); | ||
722 | |||
723 | |||
724 | #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
725 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
726 | unsigned long addr, pmd_t *pmdp); | ||
727 | |||
728 | #define __HAVE_ARCH_PMD_WRITE | ||
729 | static inline int pmd_write(pmd_t pmd) | ||
730 | { | ||
731 | return pmd_flags(pmd) & _PAGE_RW; | ||
732 | } | ||
733 | |||
734 | #define __HAVE_ARCH_PMDP_GET_AND_CLEAR | ||
735 | static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, | ||
736 | pmd_t *pmdp) | ||
737 | { | ||
738 | pmd_t pmd = native_pmdp_get_and_clear(pmdp); | ||
739 | pmd_update(mm, addr, pmdp); | ||
740 | return pmd; | ||
741 | } | ||
742 | |||
743 | #define __HAVE_ARCH_PMDP_SET_WRPROTECT | ||
744 | static inline void pmdp_set_wrprotect(struct mm_struct *mm, | ||
745 | unsigned long addr, pmd_t *pmdp) | ||
746 | { | ||
747 | clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); | ||
748 | pmd_update(mm, addr, pmdp); | ||
749 | } | ||
750 | |||
610 | /* | 751 | /* |
611 | * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); | 752 | * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); |
612 | * | 753 | * |
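The pmd_mk*/pmd_modify helpers added above mirror the existing pte_* ones; the generic transparent-hugepage code is expected to compose them roughly as in this hedged sketch (variable names invented, not a quote of mm/huge_memory.c):

	pmd_t entry;

	entry = mk_pmd(page, vma->vm_page_prot);
	entry = pmd_mkhuge(pmd_mkdirty(pmd_mkyoung(entry)));
	set_pmd_at(mm, haddr, pmdp, entry);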
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f86da20347f2..975f709e09ae 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) | |||
59 | native_set_pte(ptep, pte); | 59 | native_set_pte(ptep, pte); |
60 | } | 60 | } |
61 | 61 | ||
62 | static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) | ||
63 | { | ||
64 | *pmdp = pmd; | ||
65 | } | ||
66 | |||
67 | static inline void native_pmd_clear(pmd_t *pmd) | ||
68 | { | ||
69 | native_set_pmd(pmd, native_make_pmd(0)); | ||
70 | } | ||
71 | |||
62 | static inline pte_t native_ptep_get_and_clear(pte_t *xp) | 72 | static inline pte_t native_ptep_get_and_clear(pte_t *xp) |
63 | { | 73 | { |
64 | #ifdef CONFIG_SMP | 74 | #ifdef CONFIG_SMP |
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) | |||
72 | #endif | 82 | #endif |
73 | } | 83 | } |
74 | 84 | ||
75 | static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) | 85 | static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) |
76 | { | 86 | { |
77 | *pmdp = pmd; | 87 | #ifdef CONFIG_SMP |
78 | } | 88 | return native_make_pmd(xchg(&xp->pmd, 0)); |
79 | 89 | #else | |
80 | static inline void native_pmd_clear(pmd_t *pmd) | 90 | /* native_local_pmdp_get_and_clear, |
81 | { | 91 | but duplicated because of cyclic dependency */ |
82 | native_set_pmd(pmd, native_make_pmd(0)); | 92 | pmd_t ret = *xp; |
93 | native_pmd_clear(xp); | ||
94 | return ret; | ||
95 | #endif | ||
83 | } | 96 | } |
84 | 97 | ||
85 | static inline void native_set_pud(pud_t *pudp, pud_t pud) | 98 | static inline void native_set_pud(pud_t *pudp, pud_t pud) |
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void); | |||
168 | #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) | 181 | #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) |
169 | 182 | ||
170 | #define __HAVE_ARCH_PTE_SAME | 183 | #define __HAVE_ARCH_PTE_SAME |
184 | |||
171 | #endif /* !__ASSEMBLY__ */ | 185 | #endif /* !__ASSEMBLY__ */ |
172 | 186 | ||
173 | #endif /* _ASM_X86_PGTABLE_64_H */ | 187 | #endif /* _ASM_X86_PGTABLE_64_H */ |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d1f4a760be23..7db7723d1f32 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -22,6 +22,7 @@ | |||
22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | 22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ |
23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 | 23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 |
24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 | 24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 |
25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ | ||
25 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | 26 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ |
26 | 27 | ||
27 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ | 28 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ |
@@ -45,6 +46,7 @@ | |||
45 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) | 46 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) |
46 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) | 47 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) |
47 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) | 48 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) |
49 | #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) | ||
48 | #define __HAVE_ARCH_PTE_SPECIAL | 50 | #define __HAVE_ARCH_PTE_SPECIAL |
49 | 51 | ||
50 | #ifdef CONFIG_KMEMCHECK | 52 | #ifdef CONFIG_KMEMCHECK |
@@ -70,6 +72,7 @@ | |||
70 | /* Set of bits not changed in pte_modify */ | 72 | /* Set of bits not changed in pte_modify */ |
71 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ | 73 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ |
72 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) | 74 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) |
75 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) | ||
73 | 76 | ||
74 | #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) | 77 | #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) |
75 | #define _PAGE_CACHE_WB (0) | 78 | #define _PAGE_CACHE_WB (0) |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cae9c3cb95cf..45636cefa186 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -141,10 +141,9 @@ extern __u32 cpu_caps_set[NCAPINTS]; | |||
141 | #ifdef CONFIG_SMP | 141 | #ifdef CONFIG_SMP |
142 | DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); | 142 | DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); |
143 | #define cpu_data(cpu) per_cpu(cpu_info, cpu) | 143 | #define cpu_data(cpu) per_cpu(cpu_info, cpu) |
144 | #define current_cpu_data __get_cpu_var(cpu_info) | ||
145 | #else | 144 | #else |
145 | #define cpu_info boot_cpu_data | ||
146 | #define cpu_data(cpu) boot_cpu_data | 146 | #define cpu_data(cpu) boot_cpu_data |
147 | #define current_cpu_data boot_cpu_data | ||
148 | #endif | 147 | #endif |
149 | 148 | ||
150 | extern const struct seq_operations cpuinfo_op; | 149 | extern const struct seq_operations cpuinfo_op; |
@@ -762,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c); | |||
762 | extern void init_c1e_mask(void); | 761 | extern void init_c1e_mask(void); |
763 | 762 | ||
764 | extern unsigned long boot_option_idle_override; | 763 | extern unsigned long boot_option_idle_override; |
765 | extern unsigned long idle_halt; | ||
766 | extern unsigned long idle_nomwait; | ||
767 | extern bool c1e_detected; | 764 | extern bool c1e_detected; |
768 | 765 | ||
766 | enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, | ||
767 | IDLE_POLL, IDLE_FORCE_MWAIT}; | ||
768 | |||
769 | extern void enable_sep_cpu(void); | 769 | extern void enable_sep_cpu(void); |
770 | extern int sysenter_setup(void); | 770 | extern int sysenter_setup(void); |
771 | 771 | ||
@@ -902,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); | |||
902 | /* | 902 | /* |
903 | * The below -8 is to reserve 8 bytes on top of the ring0 stack. | 903 | * The below -8 is to reserve 8 bytes on top of the ring0 stack. |
904 | * This is necessary to guarantee that the entire "struct pt_regs" | 904 | * This is necessary to guarantee that the entire "struct pt_regs" |
905 | * is accessable even if the CPU haven't stored the SS/ESP registers | 905 | * is accessible even if the CPU hasn't stored the SS/ESP registers |
906 | * on the stack (interrupt gate does not save these registers | 906 | * on the stack (interrupt gate does not save these registers |
907 | * when switching to the same priv ring). | 907 | * when switching to the same priv ring). |
908 | * Therefore beware: accessing the ss/esp fields of the | 908 | * Therefore beware: accessing the ss/esp fields of the |
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h new file mode 100644 index 000000000000..b4ec95f07518 --- /dev/null +++ b/arch/x86/include/asm/prom.h | |||
@@ -0,0 +1 @@ | |||
/* dummy prom.h; here to make linux/of.h's #includes happy */ | |||
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7f7e577a0e39..31d84acc1512 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h | |||
@@ -11,6 +11,7 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); | |||
11 | void pvclock_read_wallclock(struct pvclock_wall_clock *wall, | 11 | void pvclock_read_wallclock(struct pvclock_wall_clock *wall, |
12 | struct pvclock_vcpu_time_info *vcpu, | 12 | struct pvclock_vcpu_time_info *vcpu, |
13 | struct timespec *ts); | 13 | struct timespec *ts); |
14 | void pvclock_resume(void); | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | 17 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, |
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 1def60114906..6c22bf353f26 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h | |||
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void) | |||
48 | setup_IO_APIC(); | 48 | setup_IO_APIC(); |
49 | else { | 49 | else { |
50 | nr_ioapics = 0; | 50 | nr_ioapics = 0; |
51 | localise_nmi_watchdog(); | ||
52 | } | 51 | } |
53 | #endif | 52 | #endif |
54 | } | 53 | } |
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 2b16a2ad23dc..52b5c7ed3608 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h | |||
@@ -7,6 +7,7 @@ | |||
7 | #define _ASM_X86_STACKTRACE_H | 7 | #define _ASM_X86_STACKTRACE_H |
8 | 8 | ||
9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
10 | #include <linux/ptrace.h> | ||
10 | 11 | ||
11 | extern int kstack_depth_to_print; | 12 | extern int kstack_depth_to_print; |
12 | 13 | ||
@@ -46,7 +47,7 @@ struct stacktrace_ops { | |||
46 | }; | 47 | }; |
47 | 48 | ||
48 | void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | 49 | void dump_trace(struct task_struct *tsk, struct pt_regs *regs, |
49 | unsigned long *stack, unsigned long bp, | 50 | unsigned long *stack, |
50 | const struct stacktrace_ops *ops, void *data); | 51 | const struct stacktrace_ops *ops, void *data); |
51 | 52 | ||
52 | #ifdef CONFIG_X86_32 | 53 | #ifdef CONFIG_X86_32 |
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, | |||
57 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | 58 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) |
58 | #endif | 59 | #endif |
59 | 60 | ||
61 | #ifdef CONFIG_FRAME_POINTER | ||
62 | static inline unsigned long | ||
63 | stack_frame(struct task_struct *task, struct pt_regs *regs) | ||
64 | { | ||
65 | unsigned long bp; | ||
66 | |||
67 | if (regs) | ||
68 | return regs->bp; | ||
69 | |||
70 | if (task == current) { | ||
71 | /* Grab bp right from our regs */ | ||
72 | get_bp(bp); | ||
73 | return bp; | ||
74 | } | ||
75 | |||
76 | /* bp is the last reg pushed by switch_to */ | ||
77 | return *(unsigned long *)task->thread.sp; | ||
78 | } | ||
79 | #else | ||
80 | static inline unsigned long | ||
81 | stack_frame(struct task_struct *task, struct pt_regs *regs) | ||
82 | { | ||
83 | return 0; | ||
84 | } | ||
85 | #endif | ||
86 | |||
60 | extern void | 87 | extern void |
61 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | 88 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, |
62 | unsigned long *stack, unsigned long bp, char *log_lvl); | 89 | unsigned long *stack, char *log_lvl); |
63 | 90 | ||
64 | extern void | 91 | extern void |
65 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | 92 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, |
66 | unsigned long *sp, unsigned long bp, char *log_lvl); | 93 | unsigned long *sp, char *log_lvl); |
67 | 94 | ||
68 | extern unsigned int code_bytes; | 95 | extern unsigned int code_bytes; |
69 | 96 | ||
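Interface note: dump_trace() and show_{trace,stack}_log_lvl() drop their bp parameter, and frame-pointer discovery moves into the new stack_frame() helper. A caller that used to pass bp would change roughly like this (sketch only):

	/* before: dump_trace(task, regs, stack, bp, &ops, data); */
	dump_trace(task, regs, stack, &ops, data);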
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0e831059ac5a..f2b83bc7d784 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h | |||
@@ -47,14 +47,13 @@ enum { | |||
47 | INTERCEPT_MONITOR, | 47 | INTERCEPT_MONITOR, |
48 | INTERCEPT_MWAIT, | 48 | INTERCEPT_MWAIT, |
49 | INTERCEPT_MWAIT_COND, | 49 | INTERCEPT_MWAIT_COND, |
50 | INTERCEPT_XSETBV, | ||
50 | }; | 51 | }; |
51 | 52 | ||
52 | 53 | ||
53 | struct __attribute__ ((__packed__)) vmcb_control_area { | 54 | struct __attribute__ ((__packed__)) vmcb_control_area { |
54 | u16 intercept_cr_read; | 55 | u32 intercept_cr; |
55 | u16 intercept_cr_write; | 56 | u32 intercept_dr; |
56 | u16 intercept_dr_read; | ||
57 | u16 intercept_dr_write; | ||
58 | u32 intercept_exceptions; | 57 | u32 intercept_exceptions; |
59 | u64 intercept; | 58 | u64 intercept; |
60 | u8 reserved_1[42]; | 59 | u8 reserved_1[42]; |
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area { | |||
81 | u32 event_inj_err; | 80 | u32 event_inj_err; |
82 | u64 nested_cr3; | 81 | u64 nested_cr3; |
83 | u64 lbr_ctl; | 82 | u64 lbr_ctl; |
84 | u64 reserved_5; | 83 | u32 clean; |
84 | u32 reserved_5; | ||
85 | u64 next_rip; | 85 | u64 next_rip; |
86 | u8 reserved_6[816]; | 86 | u8 insn_len; |
87 | u8 insn_bytes[15]; | ||
88 | u8 reserved_6[800]; | ||
87 | }; | 89 | }; |
88 | 90 | ||
89 | 91 | ||
90 | #define TLB_CONTROL_DO_NOTHING 0 | 92 | #define TLB_CONTROL_DO_NOTHING 0 |
91 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 | 93 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 |
94 | #define TLB_CONTROL_FLUSH_ASID 3 | ||
95 | #define TLB_CONTROL_FLUSH_ASID_LOCAL 7 | ||
92 | 96 | ||
93 | #define V_TPR_MASK 0x0f | 97 | #define V_TPR_MASK 0x0f |
94 | 98 | ||
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb { | |||
204 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK | 208 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK |
205 | #define SVM_SELECTOR_CODE_MASK (1 << 3) | 209 | #define SVM_SELECTOR_CODE_MASK (1 << 3) |
206 | 210 | ||
207 | #define INTERCEPT_CR0_MASK 1 | 211 | #define INTERCEPT_CR0_READ 0 |
208 | #define INTERCEPT_CR3_MASK (1 << 3) | 212 | #define INTERCEPT_CR3_READ 3 |
209 | #define INTERCEPT_CR4_MASK (1 << 4) | 213 | #define INTERCEPT_CR4_READ 4 |
210 | #define INTERCEPT_CR8_MASK (1 << 8) | 214 | #define INTERCEPT_CR8_READ 8 |
211 | 215 | #define INTERCEPT_CR0_WRITE (16 + 0) | |
212 | #define INTERCEPT_DR0_MASK 1 | 216 | #define INTERCEPT_CR3_WRITE (16 + 3) |
213 | #define INTERCEPT_DR1_MASK (1 << 1) | 217 | #define INTERCEPT_CR4_WRITE (16 + 4) |
214 | #define INTERCEPT_DR2_MASK (1 << 2) | 218 | #define INTERCEPT_CR8_WRITE (16 + 8) |
215 | #define INTERCEPT_DR3_MASK (1 << 3) | 219 | |
216 | #define INTERCEPT_DR4_MASK (1 << 4) | 220 | #define INTERCEPT_DR0_READ 0 |
217 | #define INTERCEPT_DR5_MASK (1 << 5) | 221 | #define INTERCEPT_DR1_READ 1 |
218 | #define INTERCEPT_DR6_MASK (1 << 6) | 222 | #define INTERCEPT_DR2_READ 2 |
219 | #define INTERCEPT_DR7_MASK (1 << 7) | 223 | #define INTERCEPT_DR3_READ 3 |
224 | #define INTERCEPT_DR4_READ 4 | ||
225 | #define INTERCEPT_DR5_READ 5 | ||
226 | #define INTERCEPT_DR6_READ 6 | ||
227 | #define INTERCEPT_DR7_READ 7 | ||
228 | #define INTERCEPT_DR0_WRITE (16 + 0) | ||
229 | #define INTERCEPT_DR1_WRITE (16 + 1) | ||
230 | #define INTERCEPT_DR2_WRITE (16 + 2) | ||
231 | #define INTERCEPT_DR3_WRITE (16 + 3) | ||
232 | #define INTERCEPT_DR4_WRITE (16 + 4) | ||
233 | #define INTERCEPT_DR5_WRITE (16 + 5) | ||
234 | #define INTERCEPT_DR6_WRITE (16 + 6) | ||
235 | #define INTERCEPT_DR7_WRITE (16 + 7) | ||
220 | 236 | ||
221 | #define SVM_EVTINJ_VEC_MASK 0xff | 237 | #define SVM_EVTINJ_VEC_MASK 0xff |
222 | 238 | ||
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb { | |||
246 | #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 | 262 | #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 |
247 | #define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 | 263 | #define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 |
248 | 264 | ||
265 | #define SVM_EXITINFO_REG_MASK 0x0F | ||
266 | |||
249 | #define SVM_EXIT_READ_CR0 0x000 | 267 | #define SVM_EXIT_READ_CR0 0x000 |
250 | #define SVM_EXIT_READ_CR3 0x003 | 268 | #define SVM_EXIT_READ_CR3 0x003 |
251 | #define SVM_EXIT_READ_CR4 0x004 | 269 | #define SVM_EXIT_READ_CR4 0x004 |
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
316 | #define SVM_EXIT_MONITOR 0x08a | 334 | #define SVM_EXIT_MONITOR 0x08a |
317 | #define SVM_EXIT_MWAIT 0x08b | 335 | #define SVM_EXIT_MWAIT 0x08b |
318 | #define SVM_EXIT_MWAIT_COND 0x08c | 336 | #define SVM_EXIT_MWAIT_COND 0x08c |
337 | #define SVM_EXIT_XSETBV 0x08d | ||
319 | #define SVM_EXIT_NPF 0x400 | 338 | #define SVM_EXIT_NPF 0x400 |
320 | 339 | ||
321 | #define SVM_EXIT_ERR -1 | 340 | #define SVM_EXIT_ERR -1 |
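With intercept_cr_read/intercept_cr_write merged into a single 32-bit intercept_cr word (and likewise for DR), read intercepts occupy bits 0-15 and write intercepts bits 16-31. As an illustrative snippet, not taken from this hunk, trapping guest CR3 writes becomes:

	vmcb->control.intercept_cr |= 1U << INTERCEPT_CR3_WRITE;	/* bit 16 + 3 */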
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h deleted file mode 100644 index 1159e091ad09..000000000000 --- a/arch/x86/include/asm/system_64.h +++ /dev/null | |||
@@ -1,22 +0,0 @@ | |||
1 | #ifndef _ASM_X86_SYSTEM_64_H | ||
2 | #define _ASM_X86_SYSTEM_64_H | ||
3 | |||
4 | #include <asm/segment.h> | ||
5 | #include <asm/cmpxchg.h> | ||
6 | |||
7 | |||
8 | static inline unsigned long read_cr8(void) | ||
9 | { | ||
10 | unsigned long cr8; | ||
11 | asm volatile("movq %%cr8,%0" : "=r" (cr8)); | ||
12 | return cr8; | ||
13 | } | ||
14 | |||
15 | static inline void write_cr8(unsigned long val) | ||
16 | { | ||
17 | asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); | ||
18 | } | ||
19 | |||
20 | #include <linux/irqflags.h> | ||
21 | |||
22 | #endif /* _ASM_X86_SYSTEM_64_H */ | ||
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 5469630b27f5..fa7b9176b76c 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h | |||
@@ -10,12 +10,6 @@ | |||
10 | unsigned long long native_sched_clock(void); | 10 | unsigned long long native_sched_clock(void); |
11 | extern int recalibrate_cpu_khz(void); | 11 | extern int recalibrate_cpu_khz(void); |
12 | 12 | ||
13 | #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) | ||
14 | extern int timer_ack; | ||
15 | #else | ||
16 | # define timer_ack (0) | ||
17 | #endif | ||
18 | |||
19 | extern int no_timer_check; | 13 | extern int no_timer_check; |
20 | 14 | ||
21 | /* Accelerators for sched_clock() | 15 | /* Accelerators for sched_clock() |
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index f66cda56781d..0310da67307f 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h | |||
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void); | |||
30 | asmlinkage void stack_segment(void); | 30 | asmlinkage void stack_segment(void); |
31 | asmlinkage void general_protection(void); | 31 | asmlinkage void general_protection(void); |
32 | asmlinkage void page_fault(void); | 32 | asmlinkage void page_fault(void); |
33 | asmlinkage void async_page_fault(void); | ||
33 | asmlinkage void spurious_interrupt_bug(void); | 34 | asmlinkage void spurious_interrupt_bug(void); |
34 | asmlinkage void coprocessor_error(void); | 35 | asmlinkage void coprocessor_error(void); |
35 | asmlinkage void alignment_check(void); | 36 | asmlinkage void alignment_check(void); |
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 42d412fd8b02..ce1d54c8a433 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h | |||
@@ -26,20 +26,22 @@ | |||
26 | * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, | 26 | * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, |
27 | * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. | 27 | * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. |
28 | * | 28 | * |
29 | * We will use 31 sets, one for sending BAU messages from each of the 32 | 29 | * We will use one set for sending BAU messages from each of the |
30 | * cpus on the uvhub. | 30 | * cpus on the uvhub. |
31 | * | 31 | * |
32 | * TLB shootdown will use the first of the 8 descriptors of each set. | 32 | * TLB shootdown will use the first of the 8 descriptors of each set. |
33 | * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). | 33 | * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). |
34 | */ | 34 | */ |
35 | 35 | ||
36 | #define MAX_CPUS_PER_UVHUB 64 | ||
37 | #define MAX_CPUS_PER_SOCKET 32 | ||
38 | #define UV_ADP_SIZE 64 /* hardware-provided max. */ | ||
39 | #define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */ | ||
36 | #define UV_ITEMS_PER_DESCRIPTOR 8 | 40 | #define UV_ITEMS_PER_DESCRIPTOR 8 |
37 | /* the 'throttle' to prevent the hardware stay-busy bug */ | 41 | /* the 'throttle' to prevent the hardware stay-busy bug */ |
38 | #define MAX_BAU_CONCURRENT 3 | 42 | #define MAX_BAU_CONCURRENT 3 |
39 | #define UV_CPUS_PER_ACT_STATUS 32 | ||
40 | #define UV_ACT_STATUS_MASK 0x3 | 43 | #define UV_ACT_STATUS_MASK 0x3 |
41 | #define UV_ACT_STATUS_SIZE 2 | 44 | #define UV_ACT_STATUS_SIZE 2 |
42 | #define UV_ADP_SIZE 32 | ||
43 | #define UV_DISTRIBUTION_SIZE 256 | 45 | #define UV_DISTRIBUTION_SIZE 256 |
44 | #define UV_SW_ACK_NPENDING 8 | 46 | #define UV_SW_ACK_NPENDING 8 |
45 | #define UV_NET_ENDPOINT_INTD 0x38 | 47 | #define UV_NET_ENDPOINT_INTD 0x38 |
@@ -100,7 +102,6 @@ | |||
100 | * number of destination side software ack resources | 102 | * number of destination side software ack resources |
101 | */ | 103 | */ |
102 | #define DEST_NUM_RESOURCES 8 | 104 | #define DEST_NUM_RESOURCES 8 |
103 | #define MAX_CPUS_PER_NODE 32 | ||
104 | /* | 105 | /* |
105 | * completion statuses for sending a TLB flush message | 106 | * completion statuses for sending a TLB flush message |
106 | */ | 107 | */ |
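For reference, the descriptor layout restated in the comment above reduces to simple address arithmetic; the helper name below is invented for illustration:

	/* descriptor d (0-7, 64 bytes each) within activation-descriptor set s (512 bytes each) */
	static unsigned long bau_desc_addr(unsigned long base, int s, int d)
	{
		return base + s * 512 + d * 64;
	}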
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index e969f691cbfd..a501741c2335 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h | |||
@@ -199,6 +199,8 @@ union uvh_apicid { | |||
199 | #define UVH_APICID 0x002D0E00L | 199 | #define UVH_APICID 0x002D0E00L |
200 | #define UV_APIC_PNODE_SHIFT 6 | 200 | #define UV_APIC_PNODE_SHIFT 6 |
201 | 201 | ||
202 | #define UV_APICID_HIBIT_MASK 0xffff0000 | ||
203 | |||
202 | /* Local Bus from cpu's perspective */ | 204 | /* Local Bus from cpu's perspective */ |
203 | #define LOCAL_BUS_BASE 0x1c00000 | 205 | #define LOCAL_BUS_BASE 0x1c00000 |
204 | #define LOCAL_BUS_SIZE (4 * 1024 * 1024) | 206 | #define LOCAL_BUS_SIZE (4 * 1024 * 1024) |
@@ -491,8 +493,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) | |||
491 | } | 493 | } |
492 | } | 494 | } |
493 | 495 | ||
496 | extern unsigned int uv_apicid_hibits; | ||
494 | static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) | 497 | static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) |
495 | { | 498 | { |
499 | apicid |= uv_apicid_hibits; | ||
496 | return (1UL << UVH_IPI_INT_SEND_SHFT) | | 500 | return (1UL << UVH_IPI_INT_SEND_SHFT) | |
497 | ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | | 501 | ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | |
498 | (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | | 502 | (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | |
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index b2f2d2e05cec..20cafeac7455 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h | |||
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * SGI UV MMR definitions | 6 | * SGI UV MMR definitions |
7 | * | 7 | * |
8 | * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. | 8 | * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #ifndef _ASM_X86_UV_UV_MMRS_H | 11 | #ifndef _ASM_X86_UV_UV_MMRS_H |
@@ -754,6 +754,23 @@ union uvh_lb_bau_sb_descriptor_base_u { | |||
754 | }; | 754 | }; |
755 | 755 | ||
756 | /* ========================================================================= */ | 756 | /* ========================================================================= */ |
757 | /* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */ | ||
758 | /* ========================================================================= */ | ||
759 | #define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL | ||
760 | #define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0 | ||
761 | |||
762 | #define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 | ||
763 | #define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL | ||
764 | |||
765 | union uvh_lb_target_physical_apic_id_mask_u { | ||
766 | unsigned long v; | ||
767 | struct uvh_lb_target_physical_apic_id_mask_s { | ||
768 | unsigned long bit_enables : 32; /* RW */ | ||
769 | unsigned long rsvd_32_63 : 32; /* */ | ||
770 | } s; | ||
771 | }; | ||
772 | |||
773 | /* ========================================================================= */ | ||
757 | /* UVH_NODE_ID */ | 774 | /* UVH_NODE_ID */ |
758 | /* ========================================================================= */ | 775 | /* ========================================================================= */ |
759 | #define UVH_NODE_ID 0x0UL | 776 | #define UVH_NODE_ID 0x0UL |
@@ -806,6 +823,78 @@ union uvh_node_present_table_u { | |||
806 | }; | 823 | }; |
807 | 824 | ||
808 | /* ========================================================================= */ | 825 | /* ========================================================================= */ |
826 | /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ | ||
827 | /* ========================================================================= */ | ||
828 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL | ||
829 | |||
830 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 | ||
831 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL | ||
832 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 | ||
833 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL | ||
834 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 | ||
835 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL | ||
836 | |||
837 | union uvh_rh_gam_alias210_overlay_config_0_mmr_u { | ||
838 | unsigned long v; | ||
839 | struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { | ||
840 | unsigned long rsvd_0_23: 24; /* */ | ||
841 | unsigned long base : 8; /* RW */ | ||
842 | unsigned long rsvd_32_47: 16; /* */ | ||
843 | unsigned long m_alias : 5; /* RW */ | ||
844 | unsigned long rsvd_53_62: 10; /* */ | ||
845 | unsigned long enable : 1; /* RW */ | ||
846 | } s; | ||
847 | }; | ||
848 | |||
849 | /* ========================================================================= */ | ||
850 | /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ | ||
851 | /* ========================================================================= */ | ||
852 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL | ||
853 | |||
854 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 | ||
855 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL | ||
856 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 | ||
857 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL | ||
858 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 | ||
859 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL | ||
860 | |||
861 | union uvh_rh_gam_alias210_overlay_config_1_mmr_u { | ||
862 | unsigned long v; | ||
863 | struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { | ||
864 | unsigned long rsvd_0_23: 24; /* */ | ||
865 | unsigned long base : 8; /* RW */ | ||
866 | unsigned long rsvd_32_47: 16; /* */ | ||
867 | unsigned long m_alias : 5; /* RW */ | ||
868 | unsigned long rsvd_53_62: 10; /* */ | ||
869 | unsigned long enable : 1; /* RW */ | ||
870 | } s; | ||
871 | }; | ||
872 | |||
873 | /* ========================================================================= */ | ||
874 | /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ | ||
875 | /* ========================================================================= */ | ||
876 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL | ||
877 | |||
878 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 | ||
879 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL | ||
880 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 | ||
881 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL | ||
882 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 | ||
883 | #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL | ||
884 | |||
885 | union uvh_rh_gam_alias210_overlay_config_2_mmr_u { | ||
886 | unsigned long v; | ||
887 | struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { | ||
888 | unsigned long rsvd_0_23: 24; /* */ | ||
889 | unsigned long base : 8; /* RW */ | ||
890 | unsigned long rsvd_32_47: 16; /* */ | ||
891 | unsigned long m_alias : 5; /* RW */ | ||
892 | unsigned long rsvd_53_62: 10; /* */ | ||
893 | unsigned long enable : 1; /* RW */ | ||
894 | } s; | ||
895 | }; | ||
896 | |||
897 | /* ========================================================================= */ | ||
809 | /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ | 898 | /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ |
810 | /* ========================================================================= */ | 899 | /* ========================================================================= */ |
811 | #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL | 900 | #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL |
@@ -857,6 +946,29 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { | |||
857 | }; | 946 | }; |
858 | 947 | ||
859 | /* ========================================================================= */ | 948 | /* ========================================================================= */ |
949 | /* UVH_RH_GAM_CONFIG_MMR */ | ||
950 | /* ========================================================================= */ | ||
951 | #define UVH_RH_GAM_CONFIG_MMR 0x1600000UL | ||
952 | |||
953 | #define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 | ||
954 | #define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL | ||
955 | #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 | ||
956 | #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL | ||
957 | #define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 | ||
958 | #define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL | ||
959 | |||
960 | union uvh_rh_gam_config_mmr_u { | ||
961 | unsigned long v; | ||
962 | struct uvh_rh_gam_config_mmr_s { | ||
963 | unsigned long m_skt : 6; /* RW */ | ||
964 | unsigned long n_skt : 4; /* RW */ | ||
965 | unsigned long rsvd_10_11: 2; /* */ | ||
966 | unsigned long mmiol_cfg : 1; /* RW */ | ||
967 | unsigned long rsvd_13_63: 51; /* */ | ||
968 | } s; | ||
969 | }; | ||
970 | |||
971 | /* ========================================================================= */ | ||
860 | /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ | 972 | /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ |
861 | /* ========================================================================= */ | 973 | /* ========================================================================= */ |
862 | #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL | 974 | #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL |
@@ -987,97 +1099,5 @@ union uvh_rtc1_int_config_u { | |||
987 | } s; | 1099 | } s; |
988 | }; | 1100 | }; |
989 | 1101 | ||
990 | /* ========================================================================= */ | ||
991 | /* UVH_SI_ADDR_MAP_CONFIG */ | ||
992 | /* ========================================================================= */ | ||
993 | #define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL | ||
994 | |||
995 | #define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0 | ||
996 | #define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL | ||
997 | #define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8 | ||
998 | #define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL | ||
999 | |||
1000 | union uvh_si_addr_map_config_u { | ||
1001 | unsigned long v; | ||
1002 | struct uvh_si_addr_map_config_s { | ||
1003 | unsigned long m_skt : 6; /* RW */ | ||
1004 | unsigned long rsvd_6_7: 2; /* */ | ||
1005 | unsigned long n_skt : 4; /* RW */ | ||
1006 | unsigned long rsvd_12_63: 52; /* */ | ||
1007 | } s; | ||
1008 | }; | ||
1009 | |||
1010 | /* ========================================================================= */ | ||
1011 | /* UVH_SI_ALIAS0_OVERLAY_CONFIG */ | ||
1012 | /* ========================================================================= */ | ||
1013 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL | ||
1014 | |||
1015 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 | ||
1016 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL | ||
1017 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 | ||
1018 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL | ||
1019 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 | ||
1020 | #define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL | ||
1021 | |||
1022 | union uvh_si_alias0_overlay_config_u { | ||
1023 | unsigned long v; | ||
1024 | struct uvh_si_alias0_overlay_config_s { | ||
1025 | unsigned long rsvd_0_23: 24; /* */ | ||
1026 | unsigned long base : 8; /* RW */ | ||
1027 | unsigned long rsvd_32_47: 16; /* */ | ||
1028 | unsigned long m_alias : 5; /* RW */ | ||
1029 | unsigned long rsvd_53_62: 10; /* */ | ||
1030 | unsigned long enable : 1; /* RW */ | ||
1031 | } s; | ||
1032 | }; | ||
1033 | |||
1034 | /* ========================================================================= */ | ||
1035 | /* UVH_SI_ALIAS1_OVERLAY_CONFIG */ | ||
1036 | /* ========================================================================= */ | ||
1037 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL | ||
1038 | |||
1039 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 | ||
1040 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL | ||
1041 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 | ||
1042 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL | ||
1043 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 | ||
1044 | #define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL | ||
1045 | |||
1046 | union uvh_si_alias1_overlay_config_u { | ||
1047 | unsigned long v; | ||
1048 | struct uvh_si_alias1_overlay_config_s { | ||
1049 | unsigned long rsvd_0_23: 24; /* */ | ||
1050 | unsigned long base : 8; /* RW */ | ||
1051 | unsigned long rsvd_32_47: 16; /* */ | ||
1052 | unsigned long m_alias : 5; /* RW */ | ||
1053 | unsigned long rsvd_53_62: 10; /* */ | ||
1054 | unsigned long enable : 1; /* RW */ | ||
1055 | } s; | ||
1056 | }; | ||
1057 | |||
1058 | /* ========================================================================= */ | ||
1059 | /* UVH_SI_ALIAS2_OVERLAY_CONFIG */ | ||
1060 | /* ========================================================================= */ | ||
1061 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL | ||
1062 | |||
1063 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 | ||
1064 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL | ||
1065 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 | ||
1066 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL | ||
1067 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63 | ||
1068 | #define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL | ||
1069 | |||
1070 | union uvh_si_alias2_overlay_config_u { | ||
1071 | unsigned long v; | ||
1072 | struct uvh_si_alias2_overlay_config_s { | ||
1073 | unsigned long rsvd_0_23: 24; /* */ | ||
1074 | unsigned long base : 8; /* RW */ | ||
1075 | unsigned long rsvd_32_47: 16; /* */ | ||
1076 | unsigned long m_alias : 5; /* RW */ | ||
1077 | unsigned long rsvd_53_62: 10; /* */ | ||
1078 | unsigned long enable : 1; /* RW */ | ||
1079 | } s; | ||
1080 | }; | ||
1081 | |||
1082 | 1102 | ||
1083 | #endif /* _ASM_X86_UV_UV_MMRS_H */ | 1103 | #endif /* __ASM_UV_MMRS_X86_H__ */ |
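The new register unions follow the established uv_mmrs.h pattern: read the raw 64-bit MMR into .v, then decode it through the .s bitfield view. A hedged usage sketch, assuming the existing uv_read_local_mmr() accessor from uv_hub.h:

	union uvh_rh_gam_config_mmr_u cfg;

	cfg.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
	pr_info("UV: m_skt=%lu n_skt=%lu\n",
		(unsigned long)cfg.s.m_skt, (unsigned long)cfg.s.n_skt);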
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9f0cbd987d50..84471b810460 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -66,15 +66,23 @@ | |||
66 | #define PIN_BASED_NMI_EXITING 0x00000008 | 66 | #define PIN_BASED_NMI_EXITING 0x00000008 |
67 | #define PIN_BASED_VIRTUAL_NMIS 0x00000020 | 67 | #define PIN_BASED_VIRTUAL_NMIS 0x00000020 |
68 | 68 | ||
69 | #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 | ||
69 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 | 70 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 |
71 | #define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 | ||
70 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 | 72 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 |
71 | #define VM_EXIT_SAVE_IA32_PAT 0x00040000 | 73 | #define VM_EXIT_SAVE_IA32_PAT 0x00040000 |
72 | #define VM_EXIT_LOAD_IA32_PAT 0x00080000 | 74 | #define VM_EXIT_LOAD_IA32_PAT 0x00080000 |
75 | #define VM_EXIT_SAVE_IA32_EFER 0x00100000 | ||
76 | #define VM_EXIT_LOAD_IA32_EFER 0x00200000 | ||
77 | #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 | ||
73 | 78 | ||
79 | #define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 | ||
74 | #define VM_ENTRY_IA32E_MODE 0x00000200 | 80 | #define VM_ENTRY_IA32E_MODE 0x00000200 |
75 | #define VM_ENTRY_SMM 0x00000400 | 81 | #define VM_ENTRY_SMM 0x00000400 |
76 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | 82 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 |
83 | #define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 | ||
77 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 | 84 | #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 |
85 | #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 | ||
78 | 86 | ||
79 | /* VMCS Encodings */ | 87 | /* VMCS Encodings */ |
80 | enum vmcs_field { | 88 | enum vmcs_field { |
@@ -239,6 +247,7 @@ enum vmcs_field { | |||
239 | #define EXIT_REASON_TASK_SWITCH 9 | 247 | #define EXIT_REASON_TASK_SWITCH 9 |
240 | #define EXIT_REASON_CPUID 10 | 248 | #define EXIT_REASON_CPUID 10 |
241 | #define EXIT_REASON_HLT 12 | 249 | #define EXIT_REASON_HLT 12 |
250 | #define EXIT_REASON_INVD 13 | ||
242 | #define EXIT_REASON_INVLPG 14 | 251 | #define EXIT_REASON_INVLPG 14 |
243 | #define EXIT_REASON_RDPMC 15 | 252 | #define EXIT_REASON_RDPMC 15 |
244 | #define EXIT_REASON_RDTSC 16 | 253 | #define EXIT_REASON_RDTSC 16 |
@@ -296,6 +305,12 @@ enum vmcs_field { | |||
296 | #define GUEST_INTR_STATE_SMI 0x00000004 | 305 | #define GUEST_INTR_STATE_SMI 0x00000004 |
297 | #define GUEST_INTR_STATE_NMI 0x00000008 | 306 | #define GUEST_INTR_STATE_NMI 0x00000008 |
298 | 307 | ||
308 | /* GUEST_ACTIVITY_STATE flags */ | ||
309 | #define GUEST_ACTIVITY_ACTIVE 0 | ||
310 | #define GUEST_ACTIVITY_HLT 1 | ||
311 | #define GUEST_ACTIVITY_SHUTDOWN 2 | ||
312 | #define GUEST_ACTIVITY_WAIT_SIPI 3 | ||
313 | |||
299 | /* | 314 | /* |
300 | * Exit Qualifications for MOV for Control Register Access | 315 | * Exit Qualifications for MOV for Control Register Access |
301 | */ | 316 | */ |
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 396ff4cc8ed4..66d0fff1ee84 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h | |||
@@ -37,4 +37,39 @@ | |||
37 | extern struct shared_info *HYPERVISOR_shared_info; | 37 | extern struct shared_info *HYPERVISOR_shared_info; |
38 | extern struct start_info *xen_start_info; | 38 | extern struct start_info *xen_start_info; |
39 | 39 | ||
40 | #include <asm/processor.h> | ||
41 | |||
42 | static inline uint32_t xen_cpuid_base(void) | ||
43 | { | ||
44 | uint32_t base, eax, ebx, ecx, edx; | ||
45 | char signature[13]; | ||
46 | |||
47 | for (base = 0x40000000; base < 0x40010000; base += 0x100) { | ||
48 | cpuid(base, &eax, &ebx, &ecx, &edx); | ||
49 | *(uint32_t *)(signature + 0) = ebx; | ||
50 | *(uint32_t *)(signature + 4) = ecx; | ||
51 | *(uint32_t *)(signature + 8) = edx; | ||
52 | signature[12] = 0; | ||
53 | |||
54 | if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) | ||
55 | return base; | ||
56 | } | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | #ifdef CONFIG_XEN | ||
62 | extern bool xen_hvm_need_lapic(void); | ||
63 | |||
64 | static inline bool xen_x2apic_para_available(void) | ||
65 | { | ||
66 | return xen_hvm_need_lapic(); | ||
67 | } | ||
68 | #else | ||
69 | static inline bool xen_x2apic_para_available(void) | ||
70 | { | ||
71 | return (xen_cpuid_base() != 0); | ||
72 | } | ||
73 | #endif | ||
74 | |||
40 | #endif /* _ASM_X86_XEN_HYPERVISOR_H */ | 75 | #endif /* _ASM_X86_XEN_HYPERVISOR_H */ |
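The CPUID scan in xen_cpuid_base() walks the hypervisor leaf range 0x40000000-0x4000ff00 in 0x100 steps, looking for the "XenVMMXenVMM" signature spelled out in EBX/ECX/EDX; a return value of 0 means the kernel is not running under Xen. Typical use is a simple presence check (sketch):

	if (!xen_cpuid_base())
		return;		/* not a Xen guest */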
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index e8506c1f0c55..1c10c88ee4e1 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h | |||
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void); | |||
61 | #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) | 61 | #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) |
62 | #endif | 62 | #endif |
63 | 63 | ||
64 | #ifndef machine_to_phys_mapping | 64 | #define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) |
65 | #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) | 65 | #define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) |
66 | #endif | 66 | #define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT) |
67 | 67 | ||
68 | /* Maximum number of virtual CPUs in multi-processor guests. */ | 68 | /* Maximum number of virtual CPUs in multi-processor guests. */ |
69 | #define MAX_VIRT_CPUS 32 | 69 | #define MAX_VIRT_CPUS 32 |
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h index 42a7e004ae5c..8413688b2571 100644 --- a/arch/x86/include/asm/xen/interface_32.h +++ b/arch/x86/include/asm/xen/interface_32.h | |||
@@ -32,6 +32,11 @@ | |||
32 | /* And the trap vector is... */ | 32 | /* And the trap vector is... */ |
33 | #define TRAP_INSTR "int $0x82" | 33 | #define TRAP_INSTR "int $0x82" |
34 | 34 | ||
35 | #define __MACH2PHYS_VIRT_START 0xF5800000 | ||
36 | #define __MACH2PHYS_VIRT_END 0xF6800000 | ||
37 | |||
38 | #define __MACH2PHYS_SHIFT 2 | ||
39 | |||
35 | /* | 40 | /* |
36 | * Virtual addresses beyond this are not modifiable by guest OSes. The | 41 | * Virtual addresses beyond this are not modifiable by guest OSes. The |
37 | * machine->physical mapping table starts at this address, read-only. | 42 | * machine->physical mapping table starts at this address, read-only. |
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index 100d2662b97c..839a4811cf98 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ b/arch/x86/include/asm/xen/interface_64.h | |||
@@ -39,18 +39,7 @@ | |||
39 | #define __HYPERVISOR_VIRT_END 0xFFFF880000000000 | 39 | #define __HYPERVISOR_VIRT_END 0xFFFF880000000000 |
40 | #define __MACH2PHYS_VIRT_START 0xFFFF800000000000 | 40 | #define __MACH2PHYS_VIRT_START 0xFFFF800000000000 |
41 | #define __MACH2PHYS_VIRT_END 0xFFFF804000000000 | 41 | #define __MACH2PHYS_VIRT_END 0xFFFF804000000000 |
42 | 42 | #define __MACH2PHYS_SHIFT 3 | |
43 | #ifndef HYPERVISOR_VIRT_START | ||
44 | #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) | ||
45 | #define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) | ||
46 | #endif | ||
47 | |||
48 | #define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) | ||
49 | #define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) | ||
50 | #define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) | ||
51 | #ifndef machine_to_phys_mapping | ||
52 | #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) | ||
53 | #endif | ||
54 | 43 | ||
55 | /* | 44 | /* |
56 | * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) | 45 | * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) |
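With the duplicated definitions removed, interface.h is the one place that derives MACH2PHYS_NR_ENTRIES from the per-arch window and __MACH2PHYS_SHIFT (2 on 32-bit for 4-byte entries, 3 on 64-bit for 8-byte entries). A quick userspace sanity check of that arithmetic, using the constants from the two headers above:

	#include <stdio.h>

	int main(void)
	{
		unsigned long start32 = 0xF5800000UL, end32 = 0xF6800000UL;
		unsigned long long start64 = 0xFFFF800000000000ULL;
		unsigned long long end64   = 0xFFFF804000000000ULL;

		/* entries = window size >> log2(sizeof(entry)) */
		printf("32-bit M2P entries: 0x%lx\n", (end32 - start32) >> 2);
		printf("64-bit M2P entries: 0x%llx\n", (end64 - start64) >> 3);
		return 0;
	}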
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index dd8c1414b3d5..f25bdf238a33 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
7 | #include <linux/pfn.h> | 7 | #include <linux/pfn.h> |
8 | #include <linux/mm.h> | ||
8 | 9 | ||
9 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
10 | #include <asm/page.h> | 11 | #include <asm/page.h> |
@@ -35,10 +36,17 @@ typedef struct xpaddr { | |||
35 | #define MAX_DOMAIN_PAGES \ | 36 | #define MAX_DOMAIN_PAGES \ |
36 | ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) | 37 | ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) |
37 | 38 | ||
39 | extern unsigned long *machine_to_phys_mapping; | ||
40 | extern unsigned int machine_to_phys_order; | ||
38 | 41 | ||
39 | extern unsigned long get_phys_to_machine(unsigned long pfn); | 42 | extern unsigned long get_phys_to_machine(unsigned long pfn); |
40 | extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); | 43 | extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); |
41 | 44 | ||
45 | extern int m2p_add_override(unsigned long mfn, struct page *page); | ||
46 | extern int m2p_remove_override(struct page *page); | ||
47 | extern struct page *m2p_find_override(unsigned long mfn); | ||
48 | extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); | ||
49 | |||
42 | static inline unsigned long pfn_to_mfn(unsigned long pfn) | 50 | static inline unsigned long pfn_to_mfn(unsigned long pfn) |
43 | { | 51 | { |
44 | unsigned long mfn; | 52 | unsigned long mfn; |
@@ -69,11 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) | |||
69 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 77 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
70 | return mfn; | 78 | return mfn; |
71 | 79 | ||
72 | #if 0 | ||
73 | if (unlikely((mfn >> machine_to_phys_order) != 0)) | ||
74 | return max_mapnr; | ||
75 | #endif | ||
76 | |||
77 | pfn = 0; | 80 | pfn = 0; |
78 | /* | 81 | /* |
79 | * The array access can fail (e.g., device space beyond end of RAM). | 82 | * The array access can fail (e.g., device space beyond end of RAM). |
@@ -82,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) | |||
82 | */ | 85 | */ |
83 | __get_user(pfn, &machine_to_phys_mapping[mfn]); | 86 | __get_user(pfn, &machine_to_phys_mapping[mfn]); |
84 | 87 | ||
88 | /* | ||
89 | * If this appears to be a foreign mfn (because the pfn | ||
90 | * doesn't map back to the mfn), then check the local override | ||
91 | * table to see if there's a better pfn to use. | ||
92 | */ | ||
93 | if (get_phys_to_machine(pfn) != mfn) | ||
94 | pfn = m2p_find_override_pfn(mfn, pfn); | ||
95 | |||
85 | return pfn; | 96 | return pfn; |
86 | } | 97 | } |
87 | 98 | ||
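The new m2p_add_override()/m2p_remove_override() hooks back the foreign-mfn check added to mfn_to_pfn(): when the M2P table returns a pfn that does not map back to the mfn, the override table supplies the local page instead. A hedged sketch of the intended caller pattern (the wrapper is hypothetical and the grant-mapping details are elided):

	#include <asm/xen/page.h>

	static int use_foreign_frame_example(unsigned long foreign_mfn,
					     struct page *local_page)
	{
		int ret;

		/* After mapping the foreign frame into local_page, record the
		 * mfn -> page relation so mfn_to_pfn(foreign_mfn) resolves to
		 * the local pfn rather than a bogus value. */
		ret = m2p_add_override(foreign_mfn, local_page);
		if (ret)
			return ret;

		/* ... hand the page to the rest of the kernel ... */

		/* Drop the override before the mapping is torn down. */
		return m2p_remove_override(local_page);
	}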
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f60153d5de57..34244b2cd880 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -45,6 +45,7 @@ obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o | |||
45 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o | 45 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o |
46 | obj-y += tsc.o io_delay.o rtc.o | 46 | obj-y += tsc.o io_delay.o rtc.o |
47 | obj-y += pci-iommu_table.o | 47 | obj-y += pci-iommu_table.o |
48 | obj-y += resource.o | ||
48 | 49 | ||
49 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o | 50 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o |
50 | obj-y += process.o | 51 | obj-y += process.o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 71232b941b6c..b3a71137983a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) | |||
198 | { | 198 | { |
199 | unsigned int ver = 0; | 199 | unsigned int ver = 0; |
200 | 200 | ||
201 | if (id >= (MAX_LOCAL_APIC-1)) { | ||
202 | printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); | ||
203 | return; | ||
204 | } | ||
205 | |||
201 | if (!enabled) { | 206 | if (!enabled) { |
202 | ++disabled_cpus; | 207 | ++disabled_cpus; |
203 | return; | 208 | return; |
@@ -504,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) | |||
504 | 509 | ||
505 | return 0; | 510 | return 0; |
506 | } | 511 | } |
512 | EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); | ||
507 | 513 | ||
508 | int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) | 514 | int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) |
509 | { | 515 | { |
@@ -847,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) | |||
847 | * returns 0 on success, < 0 on error | 853 | * returns 0 on success, < 0 on error |
848 | */ | 854 | */ |
849 | 855 | ||
850 | static void __init acpi_register_lapic_address(unsigned long address) | ||
851 | { | ||
852 | mp_lapic_addr = address; | ||
853 | |||
854 | set_fixmap_nocache(FIX_APIC_BASE, address); | ||
855 | if (boot_cpu_physical_apicid == -1U) { | ||
856 | boot_cpu_physical_apicid = read_apic_id(); | ||
857 | apic_version[boot_cpu_physical_apicid] = | ||
858 | GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
859 | } | ||
860 | } | ||
861 | |||
862 | static int __init early_acpi_parse_madt_lapic_addr_ovr(void) | 856 | static int __init early_acpi_parse_madt_lapic_addr_ovr(void) |
863 | { | 857 | { |
864 | int count; | 858 | int count; |
@@ -880,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) | |||
880 | return count; | 874 | return count; |
881 | } | 875 | } |
882 | 876 | ||
883 | acpi_register_lapic_address(acpi_lapic_addr); | 877 | register_lapic_address(acpi_lapic_addr); |
884 | 878 | ||
885 | return count; | 879 | return count; |
886 | } | 880 | } |
@@ -907,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void) | |||
907 | return count; | 901 | return count; |
908 | } | 902 | } |
909 | 903 | ||
910 | acpi_register_lapic_address(acpi_lapic_addr); | 904 | register_lapic_address(acpi_lapic_addr); |
911 | 905 | ||
912 | count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, | 906 | count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, |
913 | acpi_parse_sapic, MAX_APICS); | 907 | acpi_parse_sapic, MAX_LOCAL_APIC); |
914 | 908 | ||
915 | if (!count) { | 909 | if (!count) { |
916 | x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, | 910 | x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, |
917 | acpi_parse_x2apic, MAX_APICS); | 911 | acpi_parse_x2apic, MAX_LOCAL_APIC); |
918 | count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, | 912 | count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, |
919 | acpi_parse_lapic, MAX_APICS); | 913 | acpi_parse_lapic, MAX_LOCAL_APIC); |
920 | } | 914 | } |
921 | if (!count && !x2count) { | 915 | if (!count && !x2count) { |
922 | printk(KERN_ERR PREFIX "No LAPIC entries present\n"); | 916 | printk(KERN_ERR PREFIX "No LAPIC entries present\n"); |
@@ -949,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void) | |||
949 | extern int es7000_plat; | 943 | extern int es7000_plat; |
950 | #endif | 944 | #endif |
951 | 945 | ||
952 | static void assign_to_mp_irq(struct mpc_intsrc *m, | ||
953 | struct mpc_intsrc *mp_irq) | ||
954 | { | ||
955 | memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); | ||
956 | } | ||
957 | |||
958 | static int mp_irq_cmp(struct mpc_intsrc *mp_irq, | ||
959 | struct mpc_intsrc *m) | ||
960 | { | ||
961 | return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); | ||
962 | } | ||
963 | |||
964 | static void save_mp_irq(struct mpc_intsrc *m) | ||
965 | { | ||
966 | int i; | ||
967 | |||
968 | for (i = 0; i < mp_irq_entries; i++) { | ||
969 | if (!mp_irq_cmp(&mp_irqs[i], m)) | ||
970 | return; | ||
971 | } | ||
972 | |||
973 | assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); | ||
974 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
975 | panic("Max # of irq sources exceeded!!\n"); | ||
976 | } | ||
977 | |||
978 | void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) | 946 | void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) |
979 | { | 947 | { |
980 | int ioapic; | 948 | int ioapic; |
@@ -1005,7 +973,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) | |||
1005 | mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ | 973 | mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ |
1006 | mp_irq.dstirq = pin; /* INTIN# */ | 974 | mp_irq.dstirq = pin; /* INTIN# */ |
1007 | 975 | ||
1008 | save_mp_irq(&mp_irq); | 976 | mp_save_irq(&mp_irq); |
1009 | 977 | ||
1010 | isa_irq_to_gsi[bus_irq] = gsi; | 978 | isa_irq_to_gsi[bus_irq] = gsi; |
1011 | } | 979 | } |
@@ -1080,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void) | |||
1080 | mp_irq.srcbusirq = i; /* Identity mapped */ | 1048 | mp_irq.srcbusirq = i; /* Identity mapped */ |
1081 | mp_irq.dstirq = pin; | 1049 | mp_irq.dstirq = pin; |
1082 | 1050 | ||
1083 | save_mp_irq(&mp_irq); | 1051 | mp_save_irq(&mp_irq); |
1084 | } | 1052 | } |
1085 | } | 1053 | } |
1086 | 1054 | ||
@@ -1117,7 +1085,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, | |||
1117 | mp_irq.dstapic = mp_ioapics[ioapic].apicid; | 1085 | mp_irq.dstapic = mp_ioapics[ioapic].apicid; |
1118 | mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); | 1086 | mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); |
1119 | 1087 | ||
1120 | save_mp_irq(&mp_irq); | 1088 | mp_save_irq(&mp_irq); |
1121 | #endif | 1089 | #endif |
1122 | return 0; | 1090 | return 0; |
1123 | } | 1091 | } |
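Besides switching the MADT parsers to MAX_LOCAL_APIC and to the shared register_lapic_address()/mp_save_irq() helpers, this hunk exports acpi_gsi_to_irq(), so modular code can translate a GSI taken from an ACPI table into a Linux IRQ. A hedged sketch of that call pattern (the driver scaffolding is hypothetical; the function is assumed to return 0 on success, as in the hunk above):

	#include <linux/acpi.h>
	#include <linux/interrupt.h>

	static int request_gsi_example(u32 gsi, irq_handler_t handler, void *dev)
	{
		unsigned int irq;

		if (acpi_gsi_to_irq(gsi, &irq))
			return -ENODEV;		/* no mapping for this GSI */

		return request_irq(irq, handler, IRQF_SHARED, "gsi-example", dev);
	}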
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5079f24c955a..123608531c8f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) | |||
353 | mutex_unlock(&smp_alt); | 353 | mutex_unlock(&smp_alt); |
354 | } | 354 | } |
355 | 355 | ||
356 | bool skip_smp_alternatives; | ||
356 | void alternatives_smp_switch(int smp) | 357 | void alternatives_smp_switch(int smp) |
357 | { | 358 | { |
358 | struct smp_alt_module *mod; | 359 | struct smp_alt_module *mod; |
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp) | |||
368 | printk("lockdep: fixing up alternatives.\n"); | 369 | printk("lockdep: fixing up alternatives.\n"); |
369 | #endif | 370 | #endif |
370 | 371 | ||
371 | if (noreplace_smp || smp_alt_once) | 372 | if (noreplace_smp || smp_alt_once || skip_smp_alternatives) |
372 | return; | 373 | return; |
373 | BUG_ON(!smp && (num_online_cpus() > 1)); | 374 | BUG_ON(!smp && (num_online_cpus() > 1)); |
374 | 375 | ||
@@ -591,17 +592,21 @@ static atomic_t stop_machine_first; | |||
591 | static int wrote_text; | 592 | static int wrote_text; |
592 | 593 | ||
593 | struct text_poke_params { | 594 | struct text_poke_params { |
594 | void *addr; | 595 | struct text_poke_param *params; |
595 | const void *opcode; | 596 | int nparams; |
596 | size_t len; | ||
597 | }; | 597 | }; |
598 | 598 | ||
599 | static int __kprobes stop_machine_text_poke(void *data) | 599 | static int __kprobes stop_machine_text_poke(void *data) |
600 | { | 600 | { |
601 | struct text_poke_params *tpp = data; | 601 | struct text_poke_params *tpp = data; |
602 | struct text_poke_param *p; | ||
603 | int i; | ||
602 | 604 | ||
603 | if (atomic_dec_and_test(&stop_machine_first)) { | 605 | if (atomic_dec_and_test(&stop_machine_first)) { |
604 | text_poke(tpp->addr, tpp->opcode, tpp->len); | 606 | for (i = 0; i < tpp->nparams; i++) { |
607 | p = &tpp->params[i]; | ||
608 | text_poke(p->addr, p->opcode, p->len); | ||
609 | } | ||
605 | smp_wmb(); /* Make sure other cpus see that this has run */ | 610 | smp_wmb(); /* Make sure other cpus see that this has run */ |
606 | wrote_text = 1; | 611 | wrote_text = 1; |
607 | } else { | 612 | } else { |
@@ -610,8 +615,12 @@ static int __kprobes stop_machine_text_poke(void *data) | |||
610 | smp_mb(); /* Load wrote_text before following execution */ | 615 | smp_mb(); /* Load wrote_text before following execution */ |
611 | } | 616 | } |
612 | 617 | ||
613 | flush_icache_range((unsigned long)tpp->addr, | 618 | for (i = 0; i < tpp->nparams; i++) { |
614 | (unsigned long)tpp->addr + tpp->len); | 619 | p = &tpp->params[i]; |
620 | flush_icache_range((unsigned long)p->addr, | ||
621 | (unsigned long)p->addr + p->len); | ||
622 | } | ||
623 | |||
615 | return 0; | 624 | return 0; |
616 | } | 625 | } |
617 | 626 | ||
@@ -631,10 +640,13 @@ static int __kprobes stop_machine_text_poke(void *data) | |||
631 | void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) | 640 | void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) |
632 | { | 641 | { |
633 | struct text_poke_params tpp; | 642 | struct text_poke_params tpp; |
643 | struct text_poke_param p; | ||
634 | 644 | ||
635 | tpp.addr = addr; | 645 | p.addr = addr; |
636 | tpp.opcode = opcode; | 646 | p.opcode = opcode; |
637 | tpp.len = len; | 647 | p.len = len; |
648 | tpp.params = &p; | ||
649 | tpp.nparams = 1; | ||
638 | atomic_set(&stop_machine_first, 1); | 650 | atomic_set(&stop_machine_first, 1); |
639 | wrote_text = 0; | 651 | wrote_text = 0; |
640 | /* Use __stop_machine() because the caller already got online_cpus. */ | 652 | /* Use __stop_machine() because the caller already got online_cpus. */ |
@@ -642,6 +654,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) | |||
642 | return addr; | 654 | return addr; |
643 | } | 655 | } |
644 | 656 | ||
657 | /** | ||
658 | * text_poke_smp_batch - Update instructions on a live kernel on SMP | ||
659 | * @params: an array of text_poke parameters | ||
660 | * @n: the number of elements in params. | ||
661 | * | ||
662 | * Modify multi-byte instruction by using stop_machine() on SMP. Since the | ||
663 | * stop_machine() is heavy task, it is better to aggregate text_poke requests | ||
664 | * and do it once if possible. | ||
665 | * | ||
666 | * Note: Must be called under get_online_cpus() and text_mutex. | ||
667 | */ | ||
668 | void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) | ||
669 | { | ||
670 | struct text_poke_params tpp = {.params = params, .nparams = n}; | ||
671 | |||
672 | atomic_set(&stop_machine_first, 1); | ||
673 | wrote_text = 0; | ||
674 | stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); | ||
675 | } | ||
676 | |||
645 | #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) | 677 | #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) |
646 | 678 | ||
647 | #ifdef CONFIG_X86_64 | 679 | #ifdef CONFIG_X86_64 |
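The batch interface exists so callers with many patch sites (the kprobes jump optimizer is the motivating user) pay for stop_machine() once rather than once per site. A minimal usage sketch, assuming the caller already holds the locks the kernel-doc requires; the addresses and opcode buffers are placeholders:

	#include <linux/kernel.h>
	#include <linux/cpu.h>
	#include <linux/memory.h>
	#include <asm/alternative.h>

	static void patch_two_sites_example(void *a1, const void *op1, size_t l1,
					    void *a2, const void *op2, size_t l2)
	{
		struct text_poke_param params[] = {
			{ .addr = a1, .opcode = op1, .len = l1 },
			{ .addr = a2, .opcode = op2, .len = l2 },
		};

		get_online_cpus();
		mutex_lock(&text_mutex);
		/* One stop_machine() pass covers both patch sites. */
		text_poke_smp_batch(params, ARRAY_SIZE(params));
		mutex_unlock(&text_mutex);
		put_online_cpus();
	}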
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d2fdb0826df2..57ca77787220 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, | |||
1086 | 1086 | ||
1087 | dma_dom->aperture_size += APERTURE_RANGE_SIZE; | 1087 | dma_dom->aperture_size += APERTURE_RANGE_SIZE; |
1088 | 1088 | ||
1089 | /* Intialize the exclusion range if necessary */ | 1089 | /* Initialize the exclusion range if necessary */ |
1090 | for_each_iommu(iommu) { | 1090 | for_each_iommu(iommu) { |
1091 | if (iommu->exclusion_start && | 1091 | if (iommu->exclusion_start && |
1092 | iommu->exclusion_start >= dma_dom->aperture[index]->offset | 1092 | iommu->exclusion_start >= dma_dom->aperture[index]->offset |
@@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) | |||
1353 | 1353 | ||
1354 | /* | 1354 | /* |
1355 | * Allocates a new protection domain usable for the dma_ops functions. | 1355 | * Allocates a new protection domain usable for the dma_ops functions. |
1356 | * It also intializes the page table and the address allocator data | 1356 | * It also initializes the page table and the address allocator data |
1357 | * structures required for the dma_ops interface | 1357 | * structures required for the dma_ops interface |
1358 | */ | 1358 | */ |
1359 | static struct dma_ops_domain *dma_ops_domain_alloc(void) | 1359 | static struct dma_ops_domain *dma_ops_domain_alloc(void) |
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d8ed0d..0a99f7198bc3 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c | |||
@@ -12,95 +12,123 @@ | |||
12 | 12 | ||
13 | static u32 *flush_words; | 13 | static u32 *flush_words; |
14 | 14 | ||
15 | struct pci_device_id k8_nb_ids[] = { | 15 | struct pci_device_id amd_nb_misc_ids[] = { |
16 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, | 16 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, |
17 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, | 17 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, |
18 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, | 18 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, |
19 | {} | 19 | {} |
20 | }; | 20 | }; |
21 | EXPORT_SYMBOL(k8_nb_ids); | 21 | EXPORT_SYMBOL(amd_nb_misc_ids); |
22 | 22 | ||
23 | struct k8_northbridge_info k8_northbridges; | 23 | const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { |
24 | EXPORT_SYMBOL(k8_northbridges); | 24 | { 0x00, 0x18, 0x20 }, |
25 | { 0xff, 0x00, 0x20 }, | ||
26 | { 0xfe, 0x00, 0x20 }, | ||
27 | { } | ||
28 | }; | ||
29 | |||
30 | struct amd_northbridge_info amd_northbridges; | ||
31 | EXPORT_SYMBOL(amd_northbridges); | ||
25 | 32 | ||
26 | static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) | 33 | static struct pci_dev *next_northbridge(struct pci_dev *dev, |
34 | struct pci_device_id *ids) | ||
27 | { | 35 | { |
28 | do { | 36 | do { |
29 | dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); | 37 | dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); |
30 | if (!dev) | 38 | if (!dev) |
31 | break; | 39 | break; |
32 | } while (!pci_match_id(&k8_nb_ids[0], dev)); | 40 | } while (!pci_match_id(ids, dev)); |
33 | return dev; | 41 | return dev; |
34 | } | 42 | } |
35 | 43 | ||
36 | int cache_k8_northbridges(void) | 44 | int amd_cache_northbridges(void) |
37 | { | 45 | { |
38 | int i; | 46 | int i = 0; |
39 | struct pci_dev *dev; | 47 | struct amd_northbridge *nb; |
48 | struct pci_dev *misc; | ||
40 | 49 | ||
41 | if (k8_northbridges.num) | 50 | if (amd_nb_num()) |
42 | return 0; | 51 | return 0; |
43 | 52 | ||
44 | dev = NULL; | 53 | misc = NULL; |
45 | while ((dev = next_k8_northbridge(dev)) != NULL) | 54 | while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) |
46 | k8_northbridges.num++; | 55 | i++; |
47 | 56 | ||
48 | /* some CPU families (e.g. family 0x11) do not support GART */ | 57 | if (i == 0) |
49 | if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || | 58 | return 0; |
50 | boot_cpu_data.x86 == 0x15) | ||
51 | k8_northbridges.gart_supported = 1; | ||
52 | 59 | ||
53 | k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * | 60 | nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); |
54 | sizeof(void *), GFP_KERNEL); | 61 | if (!nb) |
55 | if (!k8_northbridges.nb_misc) | ||
56 | return -ENOMEM; | 62 | return -ENOMEM; |
57 | 63 | ||
58 | if (!k8_northbridges.num) { | 64 | amd_northbridges.nb = nb; |
59 | k8_northbridges.nb_misc[0] = NULL; | 65 | amd_northbridges.num = i; |
60 | return 0; | ||
61 | } | ||
62 | 66 | ||
63 | if (k8_northbridges.gart_supported) { | 67 | misc = NULL; |
64 | flush_words = kmalloc(k8_northbridges.num * sizeof(u32), | 68 | for (i = 0; i != amd_nb_num(); i++) { |
65 | GFP_KERNEL); | 69 | node_to_amd_nb(i)->misc = misc = |
66 | if (!flush_words) { | 70 | next_northbridge(misc, amd_nb_misc_ids); |
67 | kfree(k8_northbridges.nb_misc); | 71 | } |
68 | return -ENOMEM; | 72 | |
69 | } | 73 | /* some CPU families (e.g. family 0x11) do not support GART */ |
70 | } | 74 | if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || |
75 | boot_cpu_data.x86 == 0x15) | ||
76 | amd_northbridges.flags |= AMD_NB_GART; | ||
77 | |||
78 | /* | ||
79 | * Some CPU families support L3 Cache Index Disable. There are some | ||
80 | * limitations because of E382 and E388 on family 0x10. | ||
81 | */ | ||
82 | if (boot_cpu_data.x86 == 0x10 && | ||
83 | boot_cpu_data.x86_model >= 0x8 && | ||
84 | (boot_cpu_data.x86_model > 0x9 || | ||
85 | boot_cpu_data.x86_mask >= 0x1)) | ||
86 | amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; | ||
71 | 87 | ||
72 | dev = NULL; | ||
73 | i = 0; | ||
74 | while ((dev = next_k8_northbridge(dev)) != NULL) { | ||
75 | k8_northbridges.nb_misc[i] = dev; | ||
76 | if (k8_northbridges.gart_supported) | ||
77 | pci_read_config_dword(dev, 0x9c, &flush_words[i++]); | ||
78 | } | ||
79 | k8_northbridges.nb_misc[i] = NULL; | ||
80 | return 0; | 88 | return 0; |
81 | } | 89 | } |
82 | EXPORT_SYMBOL_GPL(cache_k8_northbridges); | 90 | EXPORT_SYMBOL_GPL(amd_cache_northbridges); |
83 | 91 | ||
84 | /* Ignores subdevice/subvendor but as far as I can figure out | 92 | /* Ignores subdevice/subvendor but as far as I can figure out |
85 | they're useless anyways */ | 93 | they're useless anyways */ |
86 | int __init early_is_k8_nb(u32 device) | 94 | int __init early_is_amd_nb(u32 device) |
87 | { | 95 | { |
88 | struct pci_device_id *id; | 96 | struct pci_device_id *id; |
89 | u32 vendor = device & 0xffff; | 97 | u32 vendor = device & 0xffff; |
90 | device >>= 16; | 98 | device >>= 16; |
91 | for (id = k8_nb_ids; id->vendor; id++) | 99 | for (id = amd_nb_misc_ids; id->vendor; id++) |
92 | if (vendor == id->vendor && device == id->device) | 100 | if (vendor == id->vendor && device == id->device) |
93 | return 1; | 101 | return 1; |
94 | return 0; | 102 | return 0; |
95 | } | 103 | } |
96 | 104 | ||
97 | void k8_flush_garts(void) | 105 | int amd_cache_gart(void) |
106 | { | ||
107 | int i; | ||
108 | |||
109 | if (!amd_nb_has_feature(AMD_NB_GART)) | ||
110 | return 0; | ||
111 | |||
112 | flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); | ||
113 | if (!flush_words) { | ||
114 | amd_northbridges.flags &= ~AMD_NB_GART; | ||
115 | return -ENOMEM; | ||
116 | } | ||
117 | |||
118 | for (i = 0; i != amd_nb_num(); i++) | ||
119 | pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, | ||
120 | &flush_words[i]); | ||
121 | |||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | void amd_flush_garts(void) | ||
98 | { | 126 | { |
99 | int flushed, i; | 127 | int flushed, i; |
100 | unsigned long flags; | 128 | unsigned long flags; |
101 | static DEFINE_SPINLOCK(gart_lock); | 129 | static DEFINE_SPINLOCK(gart_lock); |
102 | 130 | ||
103 | if (!k8_northbridges.gart_supported) | 131 | if (!amd_nb_has_feature(AMD_NB_GART)) |
104 | return; | 132 | return; |
105 | 133 | ||
106 | /* Avoid races between AGP and IOMMU. In theory it's not needed | 134 | /* Avoid races between AGP and IOMMU. In theory it's not needed |
@@ -109,16 +137,16 @@ void k8_flush_garts(void) | |||
109 | that it doesn't matter to serialize more. -AK */ | 137 | that it doesn't matter to serialize more. -AK */ |
110 | spin_lock_irqsave(&gart_lock, flags); | 138 | spin_lock_irqsave(&gart_lock, flags); |
111 | flushed = 0; | 139 | flushed = 0; |
112 | for (i = 0; i < k8_northbridges.num; i++) { | 140 | for (i = 0; i < amd_nb_num(); i++) { |
113 | pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, | 141 | pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, |
114 | flush_words[i]|1); | 142 | flush_words[i] | 1); |
115 | flushed++; | 143 | flushed++; |
116 | } | 144 | } |
117 | for (i = 0; i < k8_northbridges.num; i++) { | 145 | for (i = 0; i < amd_nb_num(); i++) { |
118 | u32 w; | 146 | u32 w; |
119 | /* Make sure the hardware actually executed the flush*/ | 147 | /* Make sure the hardware actually executed the flush*/ |
120 | for (;;) { | 148 | for (;;) { |
121 | pci_read_config_dword(k8_northbridges.nb_misc[i], | 149 | pci_read_config_dword(node_to_amd_nb(i)->misc, |
122 | 0x9c, &w); | 150 | 0x9c, &w); |
123 | if (!(w & 1)) | 151 | if (!(w & 1)) |
124 | break; | 152 | break; |
@@ -129,19 +157,23 @@ void k8_flush_garts(void) | |||
129 | if (!flushed) | 157 | if (!flushed) |
130 | printk("nothing to flush?\n"); | 158 | printk("nothing to flush?\n"); |
131 | } | 159 | } |
132 | EXPORT_SYMBOL_GPL(k8_flush_garts); | 160 | EXPORT_SYMBOL_GPL(amd_flush_garts); |
133 | 161 | ||
134 | static __init int init_k8_nbs(void) | 162 | static __init int init_amd_nbs(void) |
135 | { | 163 | { |
136 | int err = 0; | 164 | int err = 0; |
137 | 165 | ||
138 | err = cache_k8_northbridges(); | 166 | err = amd_cache_northbridges(); |
139 | 167 | ||
140 | if (err < 0) | 168 | if (err < 0) |
141 | printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); | 169 | printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); |
170 | |||
171 | if (amd_cache_gart() < 0) | ||
172 | printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " | ||
173 | "GART support disabled.\n"); | ||
142 | 174 | ||
143 | return err; | 175 | return err; |
144 | } | 176 | } |
145 | 177 | ||
146 | /* This has to go after the PCI subsystem */ | 178 | /* This has to go after the PCI subsystem */ |
147 | fs_initcall(init_k8_nbs); | 179 | fs_initcall(init_amd_nbs); |
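After the rename, consumers no longer walk k8_northbridges.nb_misc by hand; they go through amd_cache_northbridges(), amd_nb_num(), node_to_amd_nb() and amd_nb_has_feature(). A hedged sketch of that accessor style (the dump function itself is hypothetical; register 0x9c is the GART flush word used in this file):

	#include <linux/kernel.h>
	#include <linux/pci.h>
	#include <asm/amd_nb.h>

	static void dump_gart_flush_words_example(void)
	{
		int i;
		u32 w;

		if (amd_cache_northbridges() < 0 || !amd_nb_has_feature(AMD_NB_GART))
			return;

		for (i = 0; i < amd_nb_num(); i++) {
			/* misc is the function-3 PCI device of northbridge i */
			pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w);
			pr_info("NB %d: GART flush word 0x%08x\n", i, w);
		}
	}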
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index afc406498c9d..671d5aad7a0c 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c | |||
@@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev) | |||
313 | if (adev->irq == 0) | 313 | if (adev->irq == 0) |
314 | return; | 314 | return; |
315 | 315 | ||
316 | irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); | ||
317 | irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); | ||
318 | /* APB timer irqs are set up as mp_irqs, timer is edge type */ | ||
319 | __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); | ||
320 | |||
316 | if (system_state == SYSTEM_BOOTING) { | 321 | if (system_state == SYSTEM_BOOTING) { |
317 | irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); | ||
318 | irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); | ||
319 | /* APB timer irqs are set up as mp_irqs, timer is edge type */ | ||
320 | __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); | ||
321 | if (request_irq(adev->irq, apbt_interrupt_handler, | 322 | if (request_irq(adev->irq, apbt_interrupt_handler, |
322 | IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, | 323 | IRQF_TIMER | IRQF_DISABLED | |
323 | adev->name, adev)) { | 324 | IRQF_NOBALANCING, |
325 | adev->name, adev)) { | ||
324 | printk(KERN_ERR "Failed request IRQ for APBT%d\n", | 326 | printk(KERN_ERR "Failed request IRQ for APBT%d\n", |
325 | adev->num); | 327 | adev->num); |
326 | } | 328 | } |
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8f0703..5955a7800a96 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c | |||
@@ -39,18 +39,6 @@ int fallback_aper_force __initdata; | |||
39 | 39 | ||
40 | int fix_aperture __initdata = 1; | 40 | int fix_aperture __initdata = 1; |
41 | 41 | ||
42 | struct bus_dev_range { | ||
43 | int bus; | ||
44 | int dev_base; | ||
45 | int dev_limit; | ||
46 | }; | ||
47 | |||
48 | static struct bus_dev_range bus_dev_ranges[] __initdata = { | ||
49 | { 0x00, 0x18, 0x20}, | ||
50 | { 0xff, 0x00, 0x20}, | ||
51 | { 0xfe, 0x00, 0x20} | ||
52 | }; | ||
53 | |||
54 | static struct resource gart_resource = { | 42 | static struct resource gart_resource = { |
55 | .name = "GART", | 43 | .name = "GART", |
56 | .flags = IORESOURCE_MEM, | 44 | .flags = IORESOURCE_MEM, |
@@ -206,7 +194,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) | |||
206 | * Do an PCI bus scan by hand because we're running before the PCI | 194 | * Do an PCI bus scan by hand because we're running before the PCI |
207 | * subsystem. | 195 | * subsystem. |
208 | * | 196 | * |
209 | * All K8 AGP bridges are AGPv3 compliant, so we can do this scan | 197 | * All AMD AGP bridges are AGPv3 compliant, so we can do this scan |
210 | * generically. It's probably overkill to always scan all slots because | 198 | * generically. It's probably overkill to always scan all slots because |
211 | * the AGP bridges should be always an own bus on the HT hierarchy, | 199 | * the AGP bridges should be always an own bus on the HT hierarchy, |
212 | * but do it here for future safety. | 200 | * but do it here for future safety. |
@@ -294,16 +282,16 @@ void __init early_gart_iommu_check(void) | |||
294 | search_agp_bridge(&agp_aper_order, &valid_agp); | 282 | search_agp_bridge(&agp_aper_order, &valid_agp); |
295 | 283 | ||
296 | fix = 0; | 284 | fix = 0; |
297 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { | 285 | for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { |
298 | int bus; | 286 | int bus; |
299 | int dev_base, dev_limit; | 287 | int dev_base, dev_limit; |
300 | 288 | ||
301 | bus = bus_dev_ranges[i].bus; | 289 | bus = amd_nb_bus_dev_ranges[i].bus; |
302 | dev_base = bus_dev_ranges[i].dev_base; | 290 | dev_base = amd_nb_bus_dev_ranges[i].dev_base; |
303 | dev_limit = bus_dev_ranges[i].dev_limit; | 291 | dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; |
304 | 292 | ||
305 | for (slot = dev_base; slot < dev_limit; slot++) { | 293 | for (slot = dev_base; slot < dev_limit; slot++) { |
306 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) | 294 | if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) |
307 | continue; | 295 | continue; |
308 | 296 | ||
309 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); | 297 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); |
@@ -349,16 +337,16 @@ void __init early_gart_iommu_check(void) | |||
349 | return; | 337 | return; |
350 | 338 | ||
351 | /* disable them all at first */ | 339 | /* disable them all at first */ |
352 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { | 340 | for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { |
353 | int bus; | 341 | int bus; |
354 | int dev_base, dev_limit; | 342 | int dev_base, dev_limit; |
355 | 343 | ||
356 | bus = bus_dev_ranges[i].bus; | 344 | bus = amd_nb_bus_dev_ranges[i].bus; |
357 | dev_base = bus_dev_ranges[i].dev_base; | 345 | dev_base = amd_nb_bus_dev_ranges[i].dev_base; |
358 | dev_limit = bus_dev_ranges[i].dev_limit; | 346 | dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; |
359 | 347 | ||
360 | for (slot = dev_base; slot < dev_limit; slot++) { | 348 | for (slot = dev_base; slot < dev_limit; slot++) { |
361 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) | 349 | if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) |
362 | continue; | 350 | continue; |
363 | 351 | ||
364 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); | 352 | ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); |
@@ -390,17 +378,17 @@ int __init gart_iommu_hole_init(void) | |||
390 | 378 | ||
391 | fix = 0; | 379 | fix = 0; |
392 | node = 0; | 380 | node = 0; |
393 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { | 381 | for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { |
394 | int bus; | 382 | int bus; |
395 | int dev_base, dev_limit; | 383 | int dev_base, dev_limit; |
396 | u32 ctl; | 384 | u32 ctl; |
397 | 385 | ||
398 | bus = bus_dev_ranges[i].bus; | 386 | bus = amd_nb_bus_dev_ranges[i].bus; |
399 | dev_base = bus_dev_ranges[i].dev_base; | 387 | dev_base = amd_nb_bus_dev_ranges[i].dev_base; |
400 | dev_limit = bus_dev_ranges[i].dev_limit; | 388 | dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; |
401 | 389 | ||
402 | for (slot = dev_base; slot < dev_limit; slot++) { | 390 | for (slot = dev_base; slot < dev_limit; slot++) { |
403 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) | 391 | if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) |
404 | continue; | 392 | continue; |
405 | 393 | ||
406 | iommu_detected = 1; | 394 | iommu_detected = 1; |
@@ -505,7 +493,7 @@ out: | |||
505 | } | 493 | } |
506 | 494 | ||
507 | /* Fix up the north bridges */ | 495 | /* Fix up the north bridges */ |
508 | for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { | 496 | for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { |
509 | int bus, dev_base, dev_limit; | 497 | int bus, dev_base, dev_limit; |
510 | 498 | ||
511 | /* | 499 | /* |
@@ -514,11 +502,11 @@ out: | |||
514 | */ | 502 | */ |
515 | u32 ctl = DISTLBWALKPRB | aper_order << 1; | 503 | u32 ctl = DISTLBWALKPRB | aper_order << 1; |
516 | 504 | ||
517 | bus = bus_dev_ranges[i].bus; | 505 | bus = amd_nb_bus_dev_ranges[i].bus; |
518 | dev_base = bus_dev_ranges[i].dev_base; | 506 | dev_base = amd_nb_bus_dev_ranges[i].dev_base; |
519 | dev_limit = bus_dev_ranges[i].dev_limit; | 507 | dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; |
520 | for (slot = dev_base; slot < dev_limit; slot++) { | 508 | for (slot = dev_base; slot < dev_limit; slot++) { |
521 | if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) | 509 | if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) |
522 | continue; | 510 | continue; |
523 | 511 | ||
524 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); | 512 | write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); |
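aperture_64.c now shares the bus/device ranges exported from amd_nb.c; the table ends with an all-zero sentinel entry, so the loops test dev_limit of the current element instead of ARRAY_SIZE(). A hedged sketch of that walk (the per-northbridge fixup body is elided):

	#include <asm/amd_nb.h>
	#include <asm/pci-direct.h>

	static void __init scan_amd_nb_slots_example(void)
	{
		int i, slot;

		for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
			int bus       = amd_nb_bus_dev_ranges[i].bus;
			int dev_base  = amd_nb_bus_dev_ranges[i].dev_base;
			int dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;

			for (slot = dev_base; slot < dev_limit; slot++) {
				if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
					continue;
				/* ... per-northbridge fixup would go here ... */
			}
		}
	}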
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 910f20b457c4..3966b564ea47 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile | |||
@@ -3,10 +3,7 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o | 5 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o |
6 | ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) | 6 | obj-y += hw_nmi.o |
7 | obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o | ||
8 | endif | ||
9 | obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o | ||
10 | 7 | ||
11 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o | 8 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o |
12 | obj-$(CONFIG_SMP) += ipi.o | 9 | obj-$(CONFIG_SMP) += ipi.o |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 850657d1b0ed..06c196d7e59c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/init.h> | 31 | #include <linux/init.h> |
32 | #include <linux/cpu.h> | 32 | #include <linux/cpu.h> |
33 | #include <linux/dmi.h> | 33 | #include <linux/dmi.h> |
34 | #include <linux/nmi.h> | ||
35 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
36 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
37 | 36 | ||
@@ -50,9 +49,8 @@ | |||
50 | #include <asm/mtrr.h> | 49 | #include <asm/mtrr.h> |
51 | #include <asm/smp.h> | 50 | #include <asm/smp.h> |
52 | #include <asm/mce.h> | 51 | #include <asm/mce.h> |
53 | #include <asm/kvm_para.h> | ||
54 | #include <asm/tsc.h> | 52 | #include <asm/tsc.h> |
55 | #include <asm/atomic.h> | 53 | #include <asm/hypervisor.h> |
56 | 54 | ||
57 | unsigned int num_processors; | 55 | unsigned int num_processors; |
58 | 56 | ||
@@ -433,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) | |||
433 | reserved = reserve_eilvt_offset(offset, new); | 431 | reserved = reserve_eilvt_offset(offset, new); |
434 | 432 | ||
435 | if (reserved != new) { | 433 | if (reserved != new) { |
436 | pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " | 434 | pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " |
437 | "vector 0x%x was already reserved by another core, " | 435 | "vector 0x%x, but the register is already in use for " |
438 | "APIC%lX=0x%x\n", | 436 | "vector 0x%x on another cpu\n", |
439 | smp_processor_id(), new, reserved, reg, old); | 437 | smp_processor_id(), reg, offset, new, reserved); |
440 | return -EINVAL; | 438 | return -EINVAL; |
441 | } | 439 | } |
442 | 440 | ||
443 | if (!eilvt_entry_is_changeable(old, new)) { | 441 | if (!eilvt_entry_is_changeable(old, new)) { |
444 | pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " | 442 | pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " |
445 | "register already in use, APIC%lX=0x%x\n", | 443 | "vector 0x%x, but the register is already in use for " |
446 | smp_processor_id(), new, reg, old); | 444 | "vector 0x%x on this cpu\n", |
445 | smp_processor_id(), reg, offset, new, old); | ||
447 | return -EBUSY; | 446 | return -EBUSY; |
448 | } | 447 | } |
449 | 448 | ||
@@ -517,7 +516,7 @@ static void __cpuinit setup_APIC_timer(void) | |||
517 | { | 516 | { |
518 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | 517 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); |
519 | 518 | ||
520 | if (cpu_has(¤t_cpu_data, X86_FEATURE_ARAT)) { | 519 | if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { |
521 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; | 520 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; |
522 | /* Make LAPIC timer preferrable over percpu HPET */ | 521 | /* Make LAPIC timer preferrable over percpu HPET */ |
523 | lapic_clockevent.rating = 150; | 522 | lapic_clockevent.rating = 150; |
@@ -685,7 +684,7 @@ static int __init calibrate_APIC_clock(void) | |||
685 | lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, | 684 | lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, |
686 | lapic_clockevent.shift); | 685 | lapic_clockevent.shift); |
687 | lapic_clockevent.max_delta_ns = | 686 | lapic_clockevent.max_delta_ns = |
688 | clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); | 687 | clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); |
689 | lapic_clockevent.min_delta_ns = | 688 | lapic_clockevent.min_delta_ns = |
690 | clockevent_delta2ns(0xF, &lapic_clockevent); | 689 | clockevent_delta2ns(0xF, &lapic_clockevent); |
691 | 690 | ||
@@ -800,11 +799,7 @@ void __init setup_boot_APIC_clock(void) | |||
800 | * PIT/HPET going. Otherwise register lapic as a dummy | 799 | * PIT/HPET going. Otherwise register lapic as a dummy |
801 | * device. | 800 | * device. |
802 | */ | 801 | */ |
803 | if (nmi_watchdog != NMI_IO_APIC) | 802 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; |
804 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; | ||
805 | else | ||
806 | pr_warning("APIC timer registered as dummy," | ||
807 | " due to nmi_watchdog=%d!\n", nmi_watchdog); | ||
808 | 803 | ||
809 | /* Setup the lapic or request the broadcast */ | 804 | /* Setup the lapic or request the broadcast */ |
810 | setup_APIC_timer(); | 805 | setup_APIC_timer(); |
@@ -1196,12 +1191,15 @@ static void __cpuinit lapic_setup_esr(void) | |||
1196 | oldvalue, value); | 1191 | oldvalue, value); |
1197 | } | 1192 | } |
1198 | 1193 | ||
1199 | |||
1200 | /** | 1194 | /** |
1201 | * setup_local_APIC - setup the local APIC | 1195 | * setup_local_APIC - setup the local APIC |
1196 | * | ||
1197 | * Used to setup local APIC while initializing BSP or bringin up APs. | ||
1198 | * Always called with preemption disabled. | ||
1202 | */ | 1199 | */ |
1203 | void __cpuinit setup_local_APIC(void) | 1200 | void __cpuinit setup_local_APIC(void) |
1204 | { | 1201 | { |
1202 | int cpu = smp_processor_id(); | ||
1205 | unsigned int value, queued; | 1203 | unsigned int value, queued; |
1206 | int i, j, acked = 0; | 1204 | int i, j, acked = 0; |
1207 | unsigned long long tsc = 0, ntsc; | 1205 | unsigned long long tsc = 0, ntsc; |
@@ -1226,8 +1224,6 @@ void __cpuinit setup_local_APIC(void) | |||
1226 | #endif | 1224 | #endif |
1227 | perf_events_lapic_init(); | 1225 | perf_events_lapic_init(); |
1228 | 1226 | ||
1229 | preempt_disable(); | ||
1230 | |||
1231 | /* | 1227 | /* |
1232 | * Double-check whether this APIC is really registered. | 1228 | * Double-check whether this APIC is really registered. |
1233 | * This is meaningless in clustered apic mode, so we skip it. | 1229 | * This is meaningless in clustered apic mode, so we skip it. |
@@ -1343,21 +1339,19 @@ void __cpuinit setup_local_APIC(void) | |||
1343 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | 1339 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro |
1344 | */ | 1340 | */ |
1345 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | 1341 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; |
1346 | if (!smp_processor_id() && (pic_mode || !value)) { | 1342 | if (!cpu && (pic_mode || !value)) { |
1347 | value = APIC_DM_EXTINT; | 1343 | value = APIC_DM_EXTINT; |
1348 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", | 1344 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); |
1349 | smp_processor_id()); | ||
1350 | } else { | 1345 | } else { |
1351 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | 1346 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; |
1352 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", | 1347 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); |
1353 | smp_processor_id()); | ||
1354 | } | 1348 | } |
1355 | apic_write(APIC_LVT0, value); | 1349 | apic_write(APIC_LVT0, value); |
1356 | 1350 | ||
1357 | /* | 1351 | /* |
1358 | * only the BP should see the LINT1 NMI signal, obviously. | 1352 | * only the BP should see the LINT1 NMI signal, obviously. |
1359 | */ | 1353 | */ |
1360 | if (!smp_processor_id()) | 1354 | if (!cpu) |
1361 | value = APIC_DM_NMI; | 1355 | value = APIC_DM_NMI; |
1362 | else | 1356 | else |
1363 | value = APIC_DM_NMI | APIC_LVT_MASKED; | 1357 | value = APIC_DM_NMI | APIC_LVT_MASKED; |
@@ -1365,11 +1359,9 @@ void __cpuinit setup_local_APIC(void) | |||
1365 | value |= APIC_LVT_LEVEL_TRIGGER; | 1359 | value |= APIC_LVT_LEVEL_TRIGGER; |
1366 | apic_write(APIC_LVT1, value); | 1360 | apic_write(APIC_LVT1, value); |
1367 | 1361 | ||
1368 | preempt_enable(); | ||
1369 | |||
1370 | #ifdef CONFIG_X86_MCE_INTEL | 1362 | #ifdef CONFIG_X86_MCE_INTEL |
1371 | /* Recheck CMCI information after local APIC is up on CPU #0 */ | 1363 | /* Recheck CMCI information after local APIC is up on CPU #0 */ |
1372 | if (smp_processor_id() == 0) | 1364 | if (!cpu) |
1373 | cmci_recheck(); | 1365 | cmci_recheck(); |
1374 | #endif | 1366 | #endif |
1375 | } | 1367 | } |
@@ -1388,8 +1380,15 @@ void __cpuinit end_local_APIC_setup(void) | |||
1388 | } | 1380 | } |
1389 | #endif | 1381 | #endif |
1390 | 1382 | ||
1391 | setup_apic_nmi_watchdog(NULL); | ||
1392 | apic_pm_activate(); | 1383 | apic_pm_activate(); |
1384 | |||
1385 | /* | ||
1386 | * Now that local APIC setup is completed for BP, configure the fault | ||
1387 | * handling for interrupt remapping. | ||
1388 | */ | ||
1389 | if (!smp_processor_id() && intr_remapping_enabled) | ||
1390 | enable_drhd_fault_handling(); | ||
1391 | |||
1393 | } | 1392 | } |
1394 | 1393 | ||
1395 | #ifdef CONFIG_X86_X2APIC | 1394 | #ifdef CONFIG_X86_X2APIC |
@@ -1477,7 +1476,8 @@ void __init enable_IR_x2apic(void) | |||
1477 | /* IR is required if there is APIC ID > 255 even when running | 1476 | /* IR is required if there is APIC ID > 255 even when running |
1478 | * under KVM | 1477 | * under KVM |
1479 | */ | 1478 | */ |
1480 | if (max_physical_apicid > 255 || !kvm_para_available()) | 1479 | if (max_physical_apicid > 255 || |
1480 | !hypervisor_x2apic_available()) | ||
1481 | goto nox2apic; | 1481 | goto nox2apic; |
1482 | /* | 1482 | /* |
1483 | * without IR all CPUs can be addressed by IOAPIC/MSI | 1483 | * without IR all CPUs can be addressed by IOAPIC/MSI |
@@ -1531,13 +1531,60 @@ static int __init detect_init_APIC(void) | |||
1531 | return 0; | 1531 | return 0; |
1532 | } | 1532 | } |
1533 | #else | 1533 | #else |
1534 | |||
1535 | static int apic_verify(void) | ||
1536 | { | ||
1537 | u32 features, h, l; | ||
1538 | |||
1539 | /* | ||
1540 | * The APIC feature bit should now be enabled | ||
1541 | * in `cpuid' | ||
1542 | */ | ||
1543 | features = cpuid_edx(1); | ||
1544 | if (!(features & (1 << X86_FEATURE_APIC))) { | ||
1545 | pr_warning("Could not enable APIC!\n"); | ||
1546 | return -1; | ||
1547 | } | ||
1548 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | ||
1549 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
1550 | |||
1551 | /* The BIOS may have set up the APIC at some other address */ | ||
1552 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1553 | if (l & MSR_IA32_APICBASE_ENABLE) | ||
1554 | mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; | ||
1555 | |||
1556 | pr_info("Found and enabled local APIC!\n"); | ||
1557 | return 0; | ||
1558 | } | ||
1559 | |||
1560 | int apic_force_enable(void) | ||
1561 | { | ||
1562 | u32 h, l; | ||
1563 | |||
1564 | if (disable_apic) | ||
1565 | return -1; | ||
1566 | |||
1567 | /* | ||
1568 | * Some BIOSes disable the local APIC in the APIC_BASE | ||
1569 | * MSR. This can only be done in software for Intel P6 or later | ||
1570 | * and AMD K7 (Model > 1) or later. | ||
1571 | */ | ||
1572 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1573 | if (!(l & MSR_IA32_APICBASE_ENABLE)) { | ||
1574 | pr_info("Local APIC disabled by BIOS -- reenabling.\n"); | ||
1575 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1576 | l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; | ||
1577 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1578 | enabled_via_apicbase = 1; | ||
1579 | } | ||
1580 | return apic_verify(); | ||
1581 | } | ||
1582 | |||
1534 | /* | 1583 | /* |
1535 | * Detect and initialize APIC | 1584 | * Detect and initialize APIC |
1536 | */ | 1585 | */ |
1537 | static int __init detect_init_APIC(void) | 1586 | static int __init detect_init_APIC(void) |
1538 | { | 1587 | { |
1539 | u32 h, l, features; | ||
1540 | |||
1541 | /* Disabled by kernel option? */ | 1588 | /* Disabled by kernel option? */ |
1542 | if (disable_apic) | 1589 | if (disable_apic) |
1543 | return -1; | 1590 | return -1; |
@@ -1567,38 +1614,12 @@ static int __init detect_init_APIC(void) | |||
1567 | "you can enable it with \"lapic\"\n"); | 1614 | "you can enable it with \"lapic\"\n"); |
1568 | return -1; | 1615 | return -1; |
1569 | } | 1616 | } |
1570 | /* | 1617 | if (apic_force_enable()) |
1571 | * Some BIOSes disable the local APIC in the APIC_BASE | 1618 | return -1; |
1572 | * MSR. This can only be done in software for Intel P6 or later | 1619 | } else { |
1573 | * and AMD K7 (Model > 1) or later. | 1620 | if (apic_verify()) |
1574 | */ | 1621 | return -1; |
1575 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1576 | if (!(l & MSR_IA32_APICBASE_ENABLE)) { | ||
1577 | pr_info("Local APIC disabled by BIOS -- reenabling.\n"); | ||
1578 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1579 | l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; | ||
1580 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1581 | enabled_via_apicbase = 1; | ||
1582 | } | ||
1583 | } | ||
1584 | /* | ||
1585 | * The APIC feature bit should now be enabled | ||
1586 | * in `cpuid' | ||
1587 | */ | ||
1588 | features = cpuid_edx(1); | ||
1589 | if (!(features & (1 << X86_FEATURE_APIC))) { | ||
1590 | pr_warning("Could not enable APIC!\n"); | ||
1591 | return -1; | ||
1592 | } | 1622 | } |
1593 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | ||
1594 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
1595 | |||
1596 | /* The BIOS may have set up the APIC at some other address */ | ||
1597 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1598 | if (l & MSR_IA32_APICBASE_ENABLE) | ||
1599 | mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; | ||
1600 | |||
1601 | pr_info("Found and enabled local APIC!\n"); | ||
1602 | 1623 | ||
1603 | apic_pm_activate(); | 1624 | apic_pm_activate(); |
1604 | 1625 | ||
@@ -1610,28 +1631,6 @@ no_apic: | |||
1610 | } | 1631 | } |
1611 | #endif | 1632 | #endif |
1612 | 1633 | ||
1613 | #ifdef CONFIG_X86_64 | ||
1614 | void __init early_init_lapic_mapping(void) | ||
1615 | { | ||
1616 | /* | ||
1617 | * If no local APIC can be found then go out | ||
1618 | * : it means there is no mpatable and MADT | ||
1619 | */ | ||
1620 | if (!smp_found_config) | ||
1621 | return; | ||
1622 | |||
1623 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
1624 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | ||
1625 | APIC_BASE, mp_lapic_addr); | ||
1626 | |||
1627 | /* | ||
1628 | * Fetch the APIC ID of the BSP in case we have a | ||
1629 | * default configuration (or the MP table is broken). | ||
1630 | */ | ||
1631 | boot_cpu_physical_apicid = read_apic_id(); | ||
1632 | } | ||
1633 | #endif | ||
1634 | |||
1635 | /** | 1634 | /** |
1636 | * init_apic_mappings - initialize APIC mappings | 1635 | * init_apic_mappings - initialize APIC mappings |
1637 | */ | 1636 | */ |
@@ -1657,10 +1656,7 @@ void __init init_apic_mappings(void) | |||
1657 | * acpi_register_lapic_address() | 1656 | * acpi_register_lapic_address() |
1658 | */ | 1657 | */ |
1659 | if (!acpi_lapic && !smp_found_config) | 1658 | if (!acpi_lapic && !smp_found_config) |
1660 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | 1659 | register_lapic_address(apic_phys); |
1661 | |||
1662 | apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", | ||
1663 | APIC_BASE, apic_phys); | ||
1664 | } | 1660 | } |
1665 | 1661 | ||
1666 | /* | 1662 | /* |
@@ -1682,11 +1678,27 @@ void __init init_apic_mappings(void) | |||
1682 | } | 1678 | } |
1683 | } | 1679 | } |
1684 | 1680 | ||
1681 | void __init register_lapic_address(unsigned long address) | ||
1682 | { | ||
1683 | mp_lapic_addr = address; | ||
1684 | |||
1685 | if (!x2apic_mode) { | ||
1686 | set_fixmap_nocache(FIX_APIC_BASE, address); | ||
1687 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | ||
1688 | APIC_BASE, mp_lapic_addr); | ||
1689 | } | ||
1690 | if (boot_cpu_physical_apicid == -1U) { | ||
1691 | boot_cpu_physical_apicid = read_apic_id(); | ||
1692 | apic_version[boot_cpu_physical_apicid] = | ||
1693 | GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
1694 | } | ||
1695 | } | ||
1696 | |||
1685 | /* | 1697 | /* |
1686 | * This initializes the IO-APIC and APIC hardware if this is | 1698 | * This initializes the IO-APIC and APIC hardware if this is |
1687 | * a UP kernel. | 1699 | * a UP kernel. |
1688 | */ | 1700 | */ |
1689 | int apic_version[MAX_APICS]; | 1701 | int apic_version[MAX_LOCAL_APIC]; |
1690 | 1702 | ||
1691 | int __init APIC_init_uniprocessor(void) | 1703 | int __init APIC_init_uniprocessor(void) |
1692 | { | 1704 | { |
@@ -1751,17 +1763,10 @@ int __init APIC_init_uniprocessor(void) | |||
1751 | setup_IO_APIC(); | 1763 | setup_IO_APIC(); |
1752 | else { | 1764 | else { |
1753 | nr_ioapics = 0; | 1765 | nr_ioapics = 0; |
1754 | localise_nmi_watchdog(); | ||
1755 | } | 1766 | } |
1756 | #else | ||
1757 | localise_nmi_watchdog(); | ||
1758 | #endif | 1767 | #endif |
1759 | 1768 | ||
1760 | x86_init.timers.setup_percpu_clockev(); | 1769 | x86_init.timers.setup_percpu_clockev(); |
1761 | #ifdef CONFIG_X86_64 | ||
1762 | check_nmi_watchdog(); | ||
1763 | #endif | ||
1764 | |||
1765 | return 0; | 1770 | return 0; |
1766 | } | 1771 | } |
1767 | 1772 | ||
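Among the apic.c changes, calibrate_APIC_clock() now allows the full 31-bit initial count (0x7FFFFFFF) when computing max_delta_ns instead of 23 bits. A back-of-the-envelope check of what that buys, assuming a hypothetical 12.5 MHz LAPIC timer clock (the real frequency depends on the bus clock and divider):

	#include <stdio.h>

	int main(void)
	{
		double hz = 12.5e6;	/* assumed LAPIC timer clock */

		printf("old max one-shot delta: %.2f s\n", (double)0x7FFFFF / hz);
		printf("new max one-shot delta: %.2f s\n", (double)0x7FFFFFFF / hz);
		return 0;	/* roughly 0.67 s versus 172 s */
	}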
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index cefd6942f0e9..79fd43ca6f96 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c | |||
@@ -17,19 +17,31 @@ | |||
17 | #include <linux/nmi.h> | 17 | #include <linux/nmi.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | 19 | ||
20 | /* For reliability, we're prepared to waste bits here. */ | 20 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
21 | static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; | ||
22 | |||
23 | u64 hw_nmi_get_sample_period(void) | 21 | u64 hw_nmi_get_sample_period(void) |
24 | { | 22 | { |
25 | return (u64)(cpu_khz) * 1000 * 60; | 23 | return (u64)(cpu_khz) * 1000 * 60; |
26 | } | 24 | } |
25 | #endif | ||
26 | |||
27 | #ifdef arch_trigger_all_cpu_backtrace | ||
28 | /* For reliability, we're prepared to waste bits here. */ | ||
29 | static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; | ||
30 | |||
31 | /* "in progress" flag of arch_trigger_all_cpu_backtrace */ | ||
32 | static unsigned long backtrace_flag; | ||
27 | 33 | ||
28 | #ifdef ARCH_HAS_NMI_WATCHDOG | ||
29 | void arch_trigger_all_cpu_backtrace(void) | 34 | void arch_trigger_all_cpu_backtrace(void) |
30 | { | 35 | { |
31 | int i; | 36 | int i; |
32 | 37 | ||
38 | if (test_and_set_bit(0, &backtrace_flag)) | ||
39 | /* | ||
40 | * If there is already a trigger_all_cpu_backtrace() in progress | ||
41 | * (backtrace_flag == 1), don't output double cpu dump infos. | ||
42 | */ | ||
43 | return; | ||
44 | |||
33 | cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); | 45 | cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); |
34 | 46 | ||
35 | printk(KERN_INFO "sending NMI to all CPUs:\n"); | 47 | printk(KERN_INFO "sending NMI to all CPUs:\n"); |
@@ -41,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void) | |||
41 | break; | 53 | break; |
42 | mdelay(1); | 54 | mdelay(1); |
43 | } | 55 | } |
56 | |||
57 | clear_bit(0, &backtrace_flag); | ||
58 | smp_mb__after_clear_bit(); | ||
44 | } | 59 | } |
45 | 60 | ||
46 | static int __kprobes | 61 | static int __kprobes |
@@ -49,11 +64,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, | |||
49 | { | 64 | { |
50 | struct die_args *args = __args; | 65 | struct die_args *args = __args; |
51 | struct pt_regs *regs; | 66 | struct pt_regs *regs; |
52 | int cpu = smp_processor_id(); | 67 | int cpu; |
53 | 68 | ||
54 | switch (cmd) { | 69 | switch (cmd) { |
55 | case DIE_NMI: | 70 | case DIE_NMI: |
56 | case DIE_NMI_IPI: | ||
57 | break; | 71 | break; |
58 | 72 | ||
59 | default: | 73 | default: |
@@ -61,6 +75,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, | |||
61 | } | 75 | } |
62 | 76 | ||
63 | regs = args->regs; | 77 | regs = args->regs; |
78 | cpu = smp_processor_id(); | ||
64 | 79 | ||
65 | if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { | 80 | if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { |
66 | static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; | 81 | static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; |
@@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, | |||
80 | static __read_mostly struct notifier_block backtrace_notifier = { | 95 | static __read_mostly struct notifier_block backtrace_notifier = { |
81 | .notifier_call = arch_trigger_all_cpu_backtrace_handler, | 96 | .notifier_call = arch_trigger_all_cpu_backtrace_handler, |
82 | .next = NULL, | 97 | .next = NULL, |
83 | .priority = 1 | 98 | .priority = NMI_LOCAL_LOW_PRIOR, |
84 | }; | 99 | }; |
85 | 100 | ||
86 | static int __init register_trigger_all_cpu_backtrace(void) | 101 | static int __init register_trigger_all_cpu_backtrace(void) |
@@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void) | |||
90 | } | 105 | } |
91 | early_initcall(register_trigger_all_cpu_backtrace); | 106 | early_initcall(register_trigger_all_cpu_backtrace); |
92 | #endif | 107 | #endif |
93 | |||
94 | /* STUB calls to mimic old nmi_watchdog behaviour */ | ||
95 | #if defined(CONFIG_X86_LOCAL_APIC) | ||
96 | unsigned int nmi_watchdog = NMI_NONE; | ||
97 | EXPORT_SYMBOL(nmi_watchdog); | ||
98 | void acpi_nmi_enable(void) { return; } | ||
99 | void acpi_nmi_disable(void) { return; } | ||
100 | #endif | ||
101 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
102 | EXPORT_SYMBOL(nmi_active); | ||
103 | int unknown_nmi_panic; | ||
104 | void cpu_nmi_set_wd_enabled(void) { return; } | ||
105 | void stop_apic_nmi_watchdog(void *unused) { return; } | ||
106 | void setup_apic_nmi_watchdog(void *unused) { return; } | ||
107 | int __init check_nmi_watchdog(void) { return 0; } | ||
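A minimal standalone sketch (plain C11, not part of the patch) of the one-shot "in progress" guard the hw_nmi.c hunk above adds around arch_trigger_all_cpu_backtrace(): the first caller claims the flag with an atomic test-and-set, concurrent callers bail out early, and the flag is cleared once the dump is done.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag backtrace_in_progress = ATOMIC_FLAG_INIT;

/* analogue of test_and_set_bit(0, &backtrace_flag) in the hunk above */
static void trigger_backtrace(void)
{
	if (atomic_flag_test_and_set(&backtrace_in_progress)) {
		/* a dump is already being collected; don't print it twice */
		return;
	}

	printf("collecting backtraces from all CPUs...\n");

	/* analogue of clear_bit() + smp_mb__after_clear_bit() */
	atomic_flag_clear(&backtrace_in_progress);
}

int main(void)
{
	trigger_backtrace();
	trigger_backtrace();	/* would be skipped if the first call were still running */
	return 0;
}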
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7cc0a721f628..697dc34b7b87 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -54,7 +54,6 @@ | |||
54 | #include <asm/dma.h> | 54 | #include <asm/dma.h> |
55 | #include <asm/timer.h> | 55 | #include <asm/timer.h> |
56 | #include <asm/i8259.h> | 56 | #include <asm/i8259.h> |
57 | #include <asm/nmi.h> | ||
58 | #include <asm/msidef.h> | 57 | #include <asm/msidef.h> |
59 | #include <asm/hypertransport.h> | 58 | #include <asm/hypertransport.h> |
60 | #include <asm/setup.h> | 59 | #include <asm/setup.h> |
@@ -126,6 +125,26 @@ static int __init parse_noapic(char *str) | |||
126 | } | 125 | } |
127 | early_param("noapic", parse_noapic); | 126 | early_param("noapic", parse_noapic); |
128 | 127 | ||
128 | /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ | ||
129 | void mp_save_irq(struct mpc_intsrc *m) | ||
130 | { | ||
131 | int i; | ||
132 | |||
133 | apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," | ||
134 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
135 | m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, | ||
136 | m->srcbusirq, m->dstapic, m->dstirq); | ||
137 | |||
138 | for (i = 0; i < mp_irq_entries; i++) { | ||
139 | if (!memcmp(&mp_irqs[i], m, sizeof(*m))) | ||
140 | return; | ||
141 | } | ||
142 | |||
143 | memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); | ||
144 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
145 | panic("Max # of irq sources exceeded!!\n"); | ||
146 | } | ||
147 | |||
129 | struct irq_pin_list { | 148 | struct irq_pin_list { |
130 | int apic, pin; | 149 | int apic, pin; |
131 | struct irq_pin_list *next; | 150 | struct irq_pin_list *next; |
@@ -136,6 +155,7 @@ static struct irq_pin_list *alloc_irq_pin_list(int node) | |||
136 | return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); | 155 | return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); |
137 | } | 156 | } |
138 | 157 | ||
158 | |||
139 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | 159 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ |
140 | #ifdef CONFIG_SPARSE_IRQ | 160 | #ifdef CONFIG_SPARSE_IRQ |
141 | static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; | 161 | static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; |
@@ -1934,8 +1954,7 @@ void disable_IO_APIC(void) | |||
1934 | * | 1954 | * |
1935 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | 1955 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 |
1936 | */ | 1956 | */ |
1937 | 1957 | void __init setup_ioapic_ids_from_mpc_nocheck(void) | |
1938 | void __init setup_ioapic_ids_from_mpc(void) | ||
1939 | { | 1958 | { |
1940 | union IO_APIC_reg_00 reg_00; | 1959 | union IO_APIC_reg_00 reg_00; |
1941 | physid_mask_t phys_id_present_map; | 1960 | physid_mask_t phys_id_present_map; |
@@ -1944,15 +1963,6 @@ void __init setup_ioapic_ids_from_mpc(void) | |||
1944 | unsigned char old_id; | 1963 | unsigned char old_id; |
1945 | unsigned long flags; | 1964 | unsigned long flags; |
1946 | 1965 | ||
1947 | if (acpi_ioapic) | ||
1948 | return; | ||
1949 | /* | ||
1950 | * Don't check I/O APIC IDs for xAPIC systems. They have | ||
1951 | * no meaning without the serial APIC bus. | ||
1952 | */ | ||
1953 | if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
1954 | || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
1955 | return; | ||
1956 | /* | 1966 | /* |
1957 | * This is broken; anything with a real cpu count has to | 1967 | * This is broken; anything with a real cpu count has to |
1958 | * circumvent this idiocy regardless. | 1968 | * circumvent this idiocy regardless. |
@@ -2006,7 +2016,6 @@ void __init setup_ioapic_ids_from_mpc(void) | |||
2006 | physids_or(phys_id_present_map, phys_id_present_map, tmp); | 2016 | physids_or(phys_id_present_map, phys_id_present_map, tmp); |
2007 | } | 2017 | } |
2008 | 2018 | ||
2009 | |||
2010 | /* | 2019 | /* |
2011 | * We need to adjust the IRQ routing table | 2020 | * We need to adjust the IRQ routing table |
2012 | * if the ID changed. | 2021 | * if the ID changed. |
@@ -2018,9 +2027,12 @@ void __init setup_ioapic_ids_from_mpc(void) | |||
2018 | = mp_ioapics[apic_id].apicid; | 2027 | = mp_ioapics[apic_id].apicid; |
2019 | 2028 | ||
2020 | /* | 2029 | /* |
2021 | * Read the right value from the MPC table and | 2030 | * Update the ID register according to the right value |
2022 | * write it into the ID register. | 2031 | * from the MPC table if they are different. |
2023 | */ | 2032 | */ |
2033 | if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) | ||
2034 | continue; | ||
2035 | |||
2024 | apic_printk(APIC_VERBOSE, KERN_INFO | 2036 | apic_printk(APIC_VERBOSE, KERN_INFO |
2025 | "...changing IO-APIC physical APIC ID to %d ...", | 2037 | "...changing IO-APIC physical APIC ID to %d ...", |
2026 | mp_ioapics[apic_id].apicid); | 2038 | mp_ioapics[apic_id].apicid); |
@@ -2042,6 +2054,21 @@ void __init setup_ioapic_ids_from_mpc(void) | |||
2042 | apic_printk(APIC_VERBOSE, " ok.\n"); | 2054 | apic_printk(APIC_VERBOSE, " ok.\n"); |
2043 | } | 2055 | } |
2044 | } | 2056 | } |
2057 | |||
2058 | void __init setup_ioapic_ids_from_mpc(void) | ||
2059 | { | ||
2060 | |||
2061 | if (acpi_ioapic) | ||
2062 | return; | ||
2063 | /* | ||
2064 | * Don't check I/O APIC IDs for xAPIC systems. They have | ||
2065 | * no meaning without the serial APIC bus. | ||
2066 | */ | ||
2067 | if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
2068 | || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
2069 | return; | ||
2070 | setup_ioapic_ids_from_mpc_nocheck(); | ||
2071 | } | ||
2045 | #endif | 2072 | #endif |
2046 | 2073 | ||
2047 | int no_timer_check __initdata; | 2074 | int no_timer_check __initdata; |
@@ -2302,7 +2329,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) | |||
2302 | unsigned int irr; | 2329 | unsigned int irr; |
2303 | struct irq_desc *desc; | 2330 | struct irq_desc *desc; |
2304 | struct irq_cfg *cfg; | 2331 | struct irq_cfg *cfg; |
2305 | irq = __get_cpu_var(vector_irq)[vector]; | 2332 | irq = __this_cpu_read(vector_irq[vector]); |
2306 | 2333 | ||
2307 | if (irq == -1) | 2334 | if (irq == -1) |
2308 | continue; | 2335 | continue; |
@@ -2336,7 +2363,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) | |||
2336 | apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); | 2363 | apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); |
2337 | goto unlock; | 2364 | goto unlock; |
2338 | } | 2365 | } |
2339 | __get_cpu_var(vector_irq)[vector] = -1; | 2366 | __this_cpu_write(vector_irq[vector], -1); |
2340 | unlock: | 2367 | unlock: |
2341 | raw_spin_unlock(&desc->lock); | 2368 | raw_spin_unlock(&desc->lock); |
2342 | } | 2369 | } |
@@ -2430,13 +2457,12 @@ static void ack_apic_level(struct irq_data *data) | |||
2430 | { | 2457 | { |
2431 | struct irq_cfg *cfg = data->chip_data; | 2458 | struct irq_cfg *cfg = data->chip_data; |
2432 | int i, do_unmask_irq = 0, irq = data->irq; | 2459 | int i, do_unmask_irq = 0, irq = data->irq; |
2433 | struct irq_desc *desc = irq_to_desc(irq); | ||
2434 | unsigned long v; | 2460 | unsigned long v; |
2435 | 2461 | ||
2436 | irq_complete_move(cfg); | 2462 | irq_complete_move(cfg); |
2437 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 2463 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
2438 | /* If we are moving the irq we need to mask it */ | 2464 | /* If we are moving the irq we need to mask it */ |
2439 | if (unlikely(desc->status & IRQ_MOVE_PENDING)) { | 2465 | if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { |
2440 | do_unmask_irq = 1; | 2466 | do_unmask_irq = 1; |
2441 | mask_ioapic(cfg); | 2467 | mask_ioapic(cfg); |
2442 | } | 2468 | } |
@@ -2643,24 +2669,6 @@ static void lapic_register_intr(int irq) | |||
2643 | "edge"); | 2669 | "edge"); |
2644 | } | 2670 | } |
2645 | 2671 | ||
2646 | static void __init setup_nmi(void) | ||
2647 | { | ||
2648 | /* | ||
2649 | * Dirty trick to enable the NMI watchdog ... | ||
2650 | * We put the 8259A master into AEOI mode and | ||
2651 | * unmask on all local APICs LVT0 as NMI. | ||
2652 | * | ||
2653 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | ||
2654 | * is from Maciej W. Rozycki - so we do not have to EOI from | ||
2655 | * the NMI handler or the timer interrupt. | ||
2656 | */ | ||
2657 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); | ||
2658 | |||
2659 | enable_NMI_through_LVT0(); | ||
2660 | |||
2661 | apic_printk(APIC_VERBOSE, " done.\n"); | ||
2662 | } | ||
2663 | |||
2664 | /* | 2672 | /* |
2665 | * This looks a bit hackish but it's about the only one way of sending | 2673 | * This looks a bit hackish but it's about the only one way of sending |
2666 | * a few INTA cycles to 8259As and any associated glue logic. ICR does | 2674 | * a few INTA cycles to 8259As and any associated glue logic. ICR does |
@@ -2766,15 +2774,6 @@ static inline void __init check_timer(void) | |||
2766 | */ | 2774 | */ |
2767 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | 2775 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); |
2768 | legacy_pic->init(1); | 2776 | legacy_pic->init(1); |
2769 | #ifdef CONFIG_X86_32 | ||
2770 | { | ||
2771 | unsigned int ver; | ||
2772 | |||
2773 | ver = apic_read(APIC_LVR); | ||
2774 | ver = GET_APIC_VERSION(ver); | ||
2775 | timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); | ||
2776 | } | ||
2777 | #endif | ||
2778 | 2777 | ||
2779 | pin1 = find_isa_irq_pin(0, mp_INT); | 2778 | pin1 = find_isa_irq_pin(0, mp_INT); |
2780 | apic1 = find_isa_irq_apic(0, mp_INT); | 2779 | apic1 = find_isa_irq_apic(0, mp_INT); |
@@ -2822,10 +2821,6 @@ static inline void __init check_timer(void) | |||
2822 | unmask_ioapic(cfg); | 2821 | unmask_ioapic(cfg); |
2823 | } | 2822 | } |
2824 | if (timer_irq_works()) { | 2823 | if (timer_irq_works()) { |
2825 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2826 | setup_nmi(); | ||
2827 | legacy_pic->unmask(0); | ||
2828 | } | ||
2829 | if (disable_timer_pin_1 > 0) | 2824 | if (disable_timer_pin_1 > 0) |
2830 | clear_IO_APIC_pin(0, pin1); | 2825 | clear_IO_APIC_pin(0, pin1); |
2831 | goto out; | 2826 | goto out; |
@@ -2851,11 +2846,6 @@ static inline void __init check_timer(void) | |||
2851 | if (timer_irq_works()) { | 2846 | if (timer_irq_works()) { |
2852 | apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); | 2847 | apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); |
2853 | timer_through_8259 = 1; | 2848 | timer_through_8259 = 1; |
2854 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2855 | legacy_pic->mask(0); | ||
2856 | setup_nmi(); | ||
2857 | legacy_pic->unmask(0); | ||
2858 | } | ||
2859 | goto out; | 2849 | goto out; |
2860 | } | 2850 | } |
2861 | /* | 2851 | /* |
@@ -2867,15 +2857,6 @@ static inline void __init check_timer(void) | |||
2867 | apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); | 2857 | apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); |
2868 | } | 2858 | } |
2869 | 2859 | ||
2870 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2871 | apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " | ||
2872 | "through the IO-APIC - disabling NMI Watchdog!\n"); | ||
2873 | nmi_watchdog = NMI_NONE; | ||
2874 | } | ||
2875 | #ifdef CONFIG_X86_32 | ||
2876 | timer_ack = 0; | ||
2877 | #endif | ||
2878 | |||
2879 | apic_printk(APIC_QUIET, KERN_INFO | 2860 | apic_printk(APIC_QUIET, KERN_INFO |
2880 | "...trying to set up timer as Virtual Wire IRQ...\n"); | 2861 | "...trying to set up timer as Virtual Wire IRQ...\n"); |
2881 | 2862 | ||
@@ -3413,6 +3394,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
3413 | msg.data |= MSI_DATA_VECTOR(cfg->vector); | 3394 | msg.data |= MSI_DATA_VECTOR(cfg->vector); |
3414 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; | 3395 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; |
3415 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | 3396 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); |
3397 | msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); | ||
3416 | 3398 | ||
3417 | dmar_msi_write(irq, &msg); | 3399 | dmar_msi_write(irq, &msg); |
3418 | 3400 | ||
@@ -3639,7 +3621,7 @@ int __init io_apic_get_redir_entries (int ioapic) | |||
3639 | return reg_01.bits.entries + 1; | 3621 | return reg_01.bits.entries + 1; |
3640 | } | 3622 | } |
3641 | 3623 | ||
3642 | void __init probe_nr_irqs_gsi(void) | 3624 | static void __init probe_nr_irqs_gsi(void) |
3643 | { | 3625 | { |
3644 | int nr; | 3626 | int nr; |
3645 | 3627 | ||
@@ -3956,7 +3938,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) | |||
3956 | return res; | 3938 | return res; |
3957 | } | 3939 | } |
3958 | 3940 | ||
3959 | void __init ioapic_init_mappings(void) | 3941 | void __init ioapic_and_gsi_init(void) |
3960 | { | 3942 | { |
3961 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | 3943 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; |
3962 | struct resource *ioapic_res; | 3944 | struct resource *ioapic_res; |
@@ -3994,6 +3976,8 @@ fake_ioapic_page: | |||
3994 | ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; | 3976 | ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; |
3995 | ioapic_res++; | 3977 | ioapic_res++; |
3996 | } | 3978 | } |
3979 | |||
3980 | probe_nr_irqs_gsi(); | ||
3997 | } | 3981 | } |
3998 | 3982 | ||
3999 | void __init ioapic_insert_resources(void) | 3983 | void __init ioapic_insert_resources(void) |
@@ -4103,7 +4087,8 @@ void __init pre_init_apic_IRQ0(void) | |||
4103 | 4087 | ||
4104 | printk(KERN_INFO "Early APIC setup for system timer0\n"); | 4088 | printk(KERN_INFO "Early APIC setup for system timer0\n"); |
4105 | #ifndef CONFIG_SMP | 4089 | #ifndef CONFIG_SMP |
4106 | phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); | 4090 | physid_set_mask_of_physid(boot_cpu_physical_apicid, |
4091 | &phys_cpu_present_map); | ||
4107 | #endif | 4092 | #endif |
4108 | /* Make sure the irq descriptor is set up */ | 4093 | /* Make sure the irq descriptor is set up */ |
4109 | cfg = alloc_irq_and_cfg_at(0, 0); | 4094 | cfg = alloc_irq_and_cfg_at(0, 0); |
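mp_save_irq() in the io_apic.c hunk above appends an interrupt-source record to a fixed-size table only when no identical entry already exists, using memcmp() for the comparison and panicking on overflow. A standalone sketch of that save-with-dedup pattern (hypothetical record layout, not the kernel's struct mpc_intsrc):

#include <string.h>
#include <stdio.h>

#define MAX_RECORDS 16

struct record { int bus, irq, apic; };

static struct record records[MAX_RECORDS];
static int nr_records;

static void save_record(const struct record *r)
{
	int i;

	/* skip exact duplicates, mirroring the memcmp() loop in mp_save_irq() */
	for (i = 0; i < nr_records; i++)
		if (!memcmp(&records[i], r, sizeof(*r)))
			return;

	if (nr_records == MAX_RECORDS) {
		fprintf(stderr, "record table full\n");
		return;
	}
	memcpy(&records[nr_records++], r, sizeof(*r));
}

int main(void)
{
	struct record r = { .bus = 0, .irq = 9, .apic = 2 };

	save_record(&r);
	save_record(&r);	/* ignored: identical to the first entry */
	printf("%d record(s) stored\n", nr_records);	/* prints 1 */
	return 0;
}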
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c deleted file mode 100644 index c90041ccb742..000000000000 --- a/arch/x86/kernel/apic/nmi.c +++ /dev/null | |||
@@ -1,567 +0,0 @@ | |||
1 | /* | ||
2 | * NMI watchdog support on APIC systems | ||
3 | * | ||
4 | * Started by Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes: | ||
7 | * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. | ||
8 | * Mikael Pettersson : Power Management for local APIC NMI watchdog. | ||
9 | * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. | ||
10 | * Pavel Machek and | ||
11 | * Mikael Pettersson : PM converted to driver model. Disable/enable API. | ||
12 | */ | ||
13 | |||
14 | #include <asm/apic.h> | ||
15 | |||
16 | #include <linux/nmi.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/delay.h> | ||
19 | #include <linux/interrupt.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/sysdev.h> | ||
23 | #include <linux/sysctl.h> | ||
24 | #include <linux/percpu.h> | ||
25 | #include <linux/kprobes.h> | ||
26 | #include <linux/cpumask.h> | ||
27 | #include <linux/kernel_stat.h> | ||
28 | #include <linux/kdebug.h> | ||
29 | #include <linux/smp.h> | ||
30 | |||
31 | #include <asm/i8259.h> | ||
32 | #include <asm/io_apic.h> | ||
33 | #include <asm/proto.h> | ||
34 | #include <asm/timer.h> | ||
35 | |||
36 | #include <asm/mce.h> | ||
37 | |||
38 | #include <asm/mach_traps.h> | ||
39 | |||
40 | int unknown_nmi_panic; | ||
41 | int nmi_watchdog_enabled; | ||
42 | |||
43 | /* For reliability, we're prepared to waste bits here. */ | ||
44 | static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; | ||
45 | |||
46 | /* nmi_active: | ||
47 | * >0: the lapic NMI watchdog is active, but can be disabled | ||
48 | * <0: the lapic NMI watchdog has not been set up, and cannot | ||
49 | * be enabled | ||
50 | * 0: the lapic NMI watchdog is disabled, but can be enabled | ||
51 | */ | ||
52 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | ||
53 | EXPORT_SYMBOL(nmi_active); | ||
54 | |||
55 | unsigned int nmi_watchdog = NMI_NONE; | ||
56 | EXPORT_SYMBOL(nmi_watchdog); | ||
57 | |||
58 | static int panic_on_timeout; | ||
59 | |||
60 | static unsigned int nmi_hz = HZ; | ||
61 | static DEFINE_PER_CPU(short, wd_enabled); | ||
62 | static int endflag __initdata; | ||
63 | |||
64 | static inline unsigned int get_nmi_count(int cpu) | ||
65 | { | ||
66 | return per_cpu(irq_stat, cpu).__nmi_count; | ||
67 | } | ||
68 | |||
69 | static inline int mce_in_progress(void) | ||
70 | { | ||
71 | #if defined(CONFIG_X86_MCE) | ||
72 | return atomic_read(&mce_entry) > 0; | ||
73 | #endif | ||
74 | return 0; | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Take the local apic timer and PIT/HPET into account. We don't | ||
79 | * know which one is active, when we have highres/dyntick on | ||
80 | */ | ||
81 | static inline unsigned int get_timer_irqs(int cpu) | ||
82 | { | ||
83 | return per_cpu(irq_stat, cpu).apic_timer_irqs + | ||
84 | per_cpu(irq_stat, cpu).irq0_irqs; | ||
85 | } | ||
86 | |||
87 | #ifdef CONFIG_SMP | ||
88 | /* | ||
89 | * The performance counters used by NMI_LOCAL_APIC don't trigger when | ||
90 | * the CPU is idle. To make sure the NMI watchdog really ticks on all | ||
91 | * CPUs during the test make them busy. | ||
92 | */ | ||
93 | static __init void nmi_cpu_busy(void *data) | ||
94 | { | ||
95 | local_irq_enable_in_hardirq(); | ||
96 | /* | ||
97 | * Intentionally don't use cpu_relax here. This is | ||
98 | * to make sure that the performance counter really ticks, | ||
99 | * even if there is a simulator or similar that catches the | ||
100 | * pause instruction. On a real HT machine this is fine because | ||
101 | * all other CPUs are busy with "useless" delay loops and don't | ||
102 | * care if they get somewhat less cycles. | ||
103 | */ | ||
104 | while (endflag == 0) | ||
105 | mb(); | ||
106 | } | ||
107 | #endif | ||
108 | |||
109 | static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) | ||
110 | { | ||
111 | printk(KERN_CONT "\n"); | ||
112 | |||
113 | printk(KERN_WARNING | ||
114 | "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", | ||
115 | cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); | ||
116 | |||
117 | printk(KERN_WARNING | ||
118 | "Please report this to bugzilla.kernel.org,\n"); | ||
119 | printk(KERN_WARNING | ||
120 | "and attach the output of the 'dmesg' command.\n"); | ||
121 | |||
122 | per_cpu(wd_enabled, cpu) = 0; | ||
123 | atomic_dec(&nmi_active); | ||
124 | } | ||
125 | |||
126 | static void __acpi_nmi_disable(void *__unused) | ||
127 | { | ||
128 | apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); | ||
129 | } | ||
130 | |||
131 | int __init check_nmi_watchdog(void) | ||
132 | { | ||
133 | unsigned int *prev_nmi_count; | ||
134 | int cpu; | ||
135 | |||
136 | if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) | ||
137 | return 0; | ||
138 | |||
139 | prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); | ||
140 | if (!prev_nmi_count) | ||
141 | goto error; | ||
142 | |||
143 | printk(KERN_INFO "Testing NMI watchdog ... "); | ||
144 | |||
145 | #ifdef CONFIG_SMP | ||
146 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
147 | smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); | ||
148 | #endif | ||
149 | |||
150 | for_each_possible_cpu(cpu) | ||
151 | prev_nmi_count[cpu] = get_nmi_count(cpu); | ||
152 | local_irq_enable(); | ||
153 | mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ | ||
154 | |||
155 | for_each_online_cpu(cpu) { | ||
156 | if (!per_cpu(wd_enabled, cpu)) | ||
157 | continue; | ||
158 | if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) | ||
159 | report_broken_nmi(cpu, prev_nmi_count); | ||
160 | } | ||
161 | endflag = 1; | ||
162 | if (!atomic_read(&nmi_active)) { | ||
163 | kfree(prev_nmi_count); | ||
164 | atomic_set(&nmi_active, -1); | ||
165 | goto error; | ||
166 | } | ||
167 | printk("OK.\n"); | ||
168 | |||
169 | /* | ||
170 | * now that we know it works we can reduce NMI frequency to | ||
171 | * something more reasonable; makes a difference in some configs | ||
172 | */ | ||
173 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
174 | nmi_hz = lapic_adjust_nmi_hz(1); | ||
175 | |||
176 | kfree(prev_nmi_count); | ||
177 | return 0; | ||
178 | error: | ||
179 | if (nmi_watchdog == NMI_IO_APIC) { | ||
180 | if (!timer_through_8259) | ||
181 | legacy_pic->mask(0); | ||
182 | on_each_cpu(__acpi_nmi_disable, NULL, 1); | ||
183 | } | ||
184 | |||
185 | #ifdef CONFIG_X86_32 | ||
186 | timer_ack = 0; | ||
187 | #endif | ||
188 | return -1; | ||
189 | } | ||
190 | |||
191 | static int __init setup_nmi_watchdog(char *str) | ||
192 | { | ||
193 | unsigned int nmi; | ||
194 | |||
195 | if (!strncmp(str, "panic", 5)) { | ||
196 | panic_on_timeout = 1; | ||
197 | str = strchr(str, ','); | ||
198 | if (!str) | ||
199 | return 1; | ||
200 | ++str; | ||
201 | } | ||
202 | |||
203 | if (!strncmp(str, "lapic", 5)) | ||
204 | nmi_watchdog = NMI_LOCAL_APIC; | ||
205 | else if (!strncmp(str, "ioapic", 6)) | ||
206 | nmi_watchdog = NMI_IO_APIC; | ||
207 | else { | ||
208 | get_option(&str, &nmi); | ||
209 | if (nmi >= NMI_INVALID) | ||
210 | return 0; | ||
211 | nmi_watchdog = nmi; | ||
212 | } | ||
213 | |||
214 | return 1; | ||
215 | } | ||
216 | __setup("nmi_watchdog=", setup_nmi_watchdog); | ||
217 | |||
218 | /* | ||
219 | * Suspend/resume support | ||
220 | */ | ||
221 | #ifdef CONFIG_PM | ||
222 | |||
223 | static int nmi_pm_active; /* nmi_active before suspend */ | ||
224 | |||
225 | static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) | ||
226 | { | ||
227 | /* only CPU0 goes here, other CPUs should be offline */ | ||
228 | nmi_pm_active = atomic_read(&nmi_active); | ||
229 | stop_apic_nmi_watchdog(NULL); | ||
230 | BUG_ON(atomic_read(&nmi_active) != 0); | ||
231 | return 0; | ||
232 | } | ||
233 | |||
234 | static int lapic_nmi_resume(struct sys_device *dev) | ||
235 | { | ||
236 | /* only CPU0 goes here, other CPUs should be offline */ | ||
237 | if (nmi_pm_active > 0) { | ||
238 | setup_apic_nmi_watchdog(NULL); | ||
239 | touch_nmi_watchdog(); | ||
240 | } | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static struct sysdev_class nmi_sysclass = { | ||
245 | .name = "lapic_nmi", | ||
246 | .resume = lapic_nmi_resume, | ||
247 | .suspend = lapic_nmi_suspend, | ||
248 | }; | ||
249 | |||
250 | static struct sys_device device_lapic_nmi = { | ||
251 | .id = 0, | ||
252 | .cls = &nmi_sysclass, | ||
253 | }; | ||
254 | |||
255 | static int __init init_lapic_nmi_sysfs(void) | ||
256 | { | ||
257 | int error; | ||
258 | |||
259 | /* | ||
260 | * should really be a BUG_ON but b/c this is an | ||
261 | * init call, it just doesn't work. -dcz | ||
262 | */ | ||
263 | if (nmi_watchdog != NMI_LOCAL_APIC) | ||
264 | return 0; | ||
265 | |||
266 | if (atomic_read(&nmi_active) < 0) | ||
267 | return 0; | ||
268 | |||
269 | error = sysdev_class_register(&nmi_sysclass); | ||
270 | if (!error) | ||
271 | error = sysdev_register(&device_lapic_nmi); | ||
272 | return error; | ||
273 | } | ||
274 | |||
275 | /* must come after the local APIC's device_initcall() */ | ||
276 | late_initcall(init_lapic_nmi_sysfs); | ||
277 | |||
278 | #endif /* CONFIG_PM */ | ||
279 | |||
280 | static void __acpi_nmi_enable(void *__unused) | ||
281 | { | ||
282 | apic_write(APIC_LVT0, APIC_DM_NMI); | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * Enable timer based NMIs on all CPUs: | ||
287 | */ | ||
288 | void acpi_nmi_enable(void) | ||
289 | { | ||
290 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
291 | on_each_cpu(__acpi_nmi_enable, NULL, 1); | ||
292 | } | ||
293 | |||
294 | /* | ||
295 | * Disable timer based NMIs on all CPUs: | ||
296 | */ | ||
297 | void acpi_nmi_disable(void) | ||
298 | { | ||
299 | if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) | ||
300 | on_each_cpu(__acpi_nmi_disable, NULL, 1); | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * This function is called as soon the LAPIC NMI watchdog driver has everything | ||
305 | * in place and it's ready to check if the NMIs belong to the NMI watchdog | ||
306 | */ | ||
307 | void cpu_nmi_set_wd_enabled(void) | ||
308 | { | ||
309 | __get_cpu_var(wd_enabled) = 1; | ||
310 | } | ||
311 | |||
312 | void setup_apic_nmi_watchdog(void *unused) | ||
313 | { | ||
314 | if (__get_cpu_var(wd_enabled)) | ||
315 | return; | ||
316 | |||
317 | /* cheap hack to support suspend/resume */ | ||
318 | /* if cpu0 is not active neither should the other cpus */ | ||
319 | if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) | ||
320 | return; | ||
321 | |||
322 | switch (nmi_watchdog) { | ||
323 | case NMI_LOCAL_APIC: | ||
324 | if (lapic_watchdog_init(nmi_hz) < 0) { | ||
325 | __get_cpu_var(wd_enabled) = 0; | ||
326 | return; | ||
327 | } | ||
328 | /* FALL THROUGH */ | ||
329 | case NMI_IO_APIC: | ||
330 | __get_cpu_var(wd_enabled) = 1; | ||
331 | atomic_inc(&nmi_active); | ||
332 | } | ||
333 | } | ||
334 | |||
335 | void stop_apic_nmi_watchdog(void *unused) | ||
336 | { | ||
337 | /* only support LOCAL and IO APICs for now */ | ||
338 | if (!nmi_watchdog_active()) | ||
339 | return; | ||
340 | if (__get_cpu_var(wd_enabled) == 0) | ||
341 | return; | ||
342 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
343 | lapic_watchdog_stop(); | ||
344 | else | ||
345 | __acpi_nmi_disable(NULL); | ||
346 | __get_cpu_var(wd_enabled) = 0; | ||
347 | atomic_dec(&nmi_active); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * the best way to detect whether a CPU has a 'hard lockup' problem | ||
352 | * is to check its local APIC timer IRQ counts. If they are not | ||
353 | * changing then that CPU has some problem. | ||
354 | * | ||
355 | * as these watchdog NMI IRQs are generated on every CPU, we only | ||
356 | * have to check the current processor. | ||
357 | * | ||
358 | * since NMIs don't listen to _any_ locks, we have to be extremely | ||
359 | * careful not to rely on unsafe variables. The printk might lock | ||
360 | * up though, so we have to break up any console locks first ... | ||
361 | * [when there will be more tty-related locks, break them up here too!] | ||
362 | */ | ||
363 | |||
364 | static DEFINE_PER_CPU(unsigned, last_irq_sum); | ||
365 | static DEFINE_PER_CPU(long, alert_counter); | ||
366 | static DEFINE_PER_CPU(int, nmi_touch); | ||
367 | |||
368 | void touch_nmi_watchdog(void) | ||
369 | { | ||
370 | if (nmi_watchdog_active()) { | ||
371 | unsigned cpu; | ||
372 | |||
373 | /* | ||
374 | * Tell other CPUs to reset their alert counters. We cannot | ||
375 | * do it ourselves because the alert count increase is not | ||
376 | * atomic. | ||
377 | */ | ||
378 | for_each_present_cpu(cpu) { | ||
379 | if (per_cpu(nmi_touch, cpu) != 1) | ||
380 | per_cpu(nmi_touch, cpu) = 1; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Tickle the softlockup detector too: | ||
386 | */ | ||
387 | touch_softlockup_watchdog(); | ||
388 | } | ||
389 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
390 | |||
391 | notrace __kprobes int | ||
392 | nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | ||
393 | { | ||
394 | /* | ||
395 | * Since current_thread_info()-> is always on the stack, and we | ||
396 | * always switch the stack NMI-atomically, it's safe to use | ||
397 | * smp_processor_id(). | ||
398 | */ | ||
399 | unsigned int sum; | ||
400 | int touched = 0; | ||
401 | int cpu = smp_processor_id(); | ||
402 | int rc = 0; | ||
403 | |||
404 | sum = get_timer_irqs(cpu); | ||
405 | |||
406 | if (__get_cpu_var(nmi_touch)) { | ||
407 | __get_cpu_var(nmi_touch) = 0; | ||
408 | touched = 1; | ||
409 | } | ||
410 | |||
411 | /* We can be called before check_nmi_watchdog, hence NULL check. */ | ||
412 | if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { | ||
413 | static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ | ||
414 | |||
415 | raw_spin_lock(&lock); | ||
416 | printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); | ||
417 | show_regs(regs); | ||
418 | dump_stack(); | ||
419 | raw_spin_unlock(&lock); | ||
420 | cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); | ||
421 | |||
422 | rc = 1; | ||
423 | } | ||
424 | |||
425 | /* Could check oops_in_progress here too, but it's safer not to */ | ||
426 | if (mce_in_progress()) | ||
427 | touched = 1; | ||
428 | |||
429 | /* if none of the timers is firing, this cpu isn't doing much */ | ||
430 | if (!touched && __get_cpu_var(last_irq_sum) == sum) { | ||
431 | /* | ||
432 | * Ayiee, looks like this CPU is stuck ... | ||
433 | * wait a few IRQs (5 seconds) before doing the oops ... | ||
434 | */ | ||
435 | __this_cpu_inc(alert_counter); | ||
436 | if (__this_cpu_read(alert_counter) == 5 * nmi_hz) | ||
437 | /* | ||
438 | * die_nmi will return ONLY if NOTIFY_STOP happens.. | ||
439 | */ | ||
440 | die_nmi("BUG: NMI Watchdog detected LOCKUP", | ||
441 | regs, panic_on_timeout); | ||
442 | } else { | ||
443 | __get_cpu_var(last_irq_sum) = sum; | ||
444 | __this_cpu_write(alert_counter, 0); | ||
445 | } | ||
446 | |||
447 | /* see if the nmi watchdog went off */ | ||
448 | if (!__get_cpu_var(wd_enabled)) | ||
449 | return rc; | ||
450 | switch (nmi_watchdog) { | ||
451 | case NMI_LOCAL_APIC: | ||
452 | rc |= lapic_wd_event(nmi_hz); | ||
453 | break; | ||
454 | case NMI_IO_APIC: | ||
455 | /* | ||
456 | * don't know how to accurately check for this. | ||
457 | * just assume it was a watchdog timer interrupt | ||
458 | * This matches the old behaviour. | ||
459 | */ | ||
460 | rc = 1; | ||
461 | break; | ||
462 | } | ||
463 | return rc; | ||
464 | } | ||
465 | |||
466 | #ifdef CONFIG_SYSCTL | ||
467 | |||
468 | static void enable_ioapic_nmi_watchdog_single(void *unused) | ||
469 | { | ||
470 | __get_cpu_var(wd_enabled) = 1; | ||
471 | atomic_inc(&nmi_active); | ||
472 | __acpi_nmi_enable(NULL); | ||
473 | } | ||
474 | |||
475 | static void enable_ioapic_nmi_watchdog(void) | ||
476 | { | ||
477 | on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); | ||
478 | touch_nmi_watchdog(); | ||
479 | } | ||
480 | |||
481 | static void disable_ioapic_nmi_watchdog(void) | ||
482 | { | ||
483 | on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); | ||
484 | } | ||
485 | |||
486 | static int __init setup_unknown_nmi_panic(char *str) | ||
487 | { | ||
488 | unknown_nmi_panic = 1; | ||
489 | return 1; | ||
490 | } | ||
491 | __setup("unknown_nmi_panic", setup_unknown_nmi_panic); | ||
492 | |||
493 | static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | ||
494 | { | ||
495 | unsigned char reason = get_nmi_reason(); | ||
496 | char buf[64]; | ||
497 | |||
498 | sprintf(buf, "NMI received for unknown reason %02x\n", reason); | ||
499 | die_nmi(buf, regs, 1); /* Always panic here */ | ||
500 | return 0; | ||
501 | } | ||
502 | |||
503 | /* | ||
504 | * proc handler for /proc/sys/kernel/nmi | ||
505 | */ | ||
506 | int proc_nmi_enabled(struct ctl_table *table, int write, | ||
507 | void __user *buffer, size_t *length, loff_t *ppos) | ||
508 | { | ||
509 | int old_state; | ||
510 | |||
511 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; | ||
512 | old_state = nmi_watchdog_enabled; | ||
513 | proc_dointvec(table, write, buffer, length, ppos); | ||
514 | if (!!old_state == !!nmi_watchdog_enabled) | ||
515 | return 0; | ||
516 | |||
517 | if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { | ||
518 | printk(KERN_WARNING | ||
519 | "NMI watchdog is permanently disabled\n"); | ||
520 | return -EIO; | ||
521 | } | ||
522 | |||
523 | if (nmi_watchdog == NMI_LOCAL_APIC) { | ||
524 | if (nmi_watchdog_enabled) | ||
525 | enable_lapic_nmi_watchdog(); | ||
526 | else | ||
527 | disable_lapic_nmi_watchdog(); | ||
528 | } else if (nmi_watchdog == NMI_IO_APIC) { | ||
529 | if (nmi_watchdog_enabled) | ||
530 | enable_ioapic_nmi_watchdog(); | ||
531 | else | ||
532 | disable_ioapic_nmi_watchdog(); | ||
533 | } else { | ||
534 | printk(KERN_WARNING | ||
535 | "NMI watchdog doesn't know what hardware to touch\n"); | ||
536 | return -EIO; | ||
537 | } | ||
538 | return 0; | ||
539 | } | ||
540 | |||
541 | #endif /* CONFIG_SYSCTL */ | ||
542 | |||
543 | int do_nmi_callback(struct pt_regs *regs, int cpu) | ||
544 | { | ||
545 | #ifdef CONFIG_SYSCTL | ||
546 | if (unknown_nmi_panic) | ||
547 | return unknown_nmi_panic_callback(regs, cpu); | ||
548 | #endif | ||
549 | return 0; | ||
550 | } | ||
551 | |||
552 | void arch_trigger_all_cpu_backtrace(void) | ||
553 | { | ||
554 | int i; | ||
555 | |||
556 | cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); | ||
557 | |||
558 | printk(KERN_INFO "sending NMI to all CPUs:\n"); | ||
559 | apic->send_IPI_all(NMI_VECTOR); | ||
560 | |||
561 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | ||
562 | for (i = 0; i < 10 * 1000; i++) { | ||
563 | if (cpumask_empty(to_cpumask(backtrace_mask))) | ||
564 | break; | ||
565 | mdelay(1); | ||
566 | } | ||
567 | } | ||
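The deleted nmi_watchdog_tick() above detected hard lockups by sampling the per-CPU timer interrupt count on each watchdog NMI: if the count stops changing for several consecutive ticks, the CPU is assumed to be stuck. A simplified single-CPU sketch of that heuristic (standalone C, using a hypothetical threshold of 5 ticks instead of the kernel's 5 * nmi_hz):

#include <stdio.h>

#define LOCKUP_TICKS 5

static unsigned last_sum;
static unsigned alert_counter;

/* returns 1 when this "CPU" looks hard-locked */
static int watchdog_tick(unsigned timer_irq_sum)
{
	if (timer_irq_sum == last_sum) {
		if (++alert_counter >= LOCKUP_TICKS)
			return 1;
	} else {
		/* progress was made: remember the new count, reset the alert */
		last_sum = timer_irq_sum;
		alert_counter = 0;
	}
	return 0;
}

int main(void)
{
	unsigned sum = 100;	/* timer count never advances in this example */
	int tick;

	for (tick = 0; tick < 8; tick++)
		if (watchdog_tick(sum))
			printf("lockup detected at tick %d\n", tick);	/* from tick 5 onward */
	return 0;
}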
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index f9e4e6a54073..d8c4a6feb286 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c | |||
@@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void) | |||
79 | /* need to update phys_pkg_id */ | 79 | /* need to update phys_pkg_id */ |
80 | apic->phys_pkg_id = apicid_phys_pkg_id; | 80 | apic->phys_pkg_id = apicid_phys_pkg_id; |
81 | } | 81 | } |
82 | |||
83 | /* | ||
84 | * Now that apic routing model is selected, configure the | ||
85 | * fault handling for intr remapping. | ||
86 | */ | ||
87 | if (intr_remapping_enabled) | ||
88 | enable_drhd_fault_handling(); | ||
89 | } | 82 | } |
90 | 83 | ||
91 | /* Same for both flat and physical. */ | 84 | /* Same for both flat and physical. */ |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ed4118de249e..bd16b58b8850 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -44,8 +44,20 @@ static u64 gru_start_paddr, gru_end_paddr; | |||
44 | static union uvh_apicid uvh_apicid; | 44 | static union uvh_apicid uvh_apicid; |
45 | int uv_min_hub_revision_id; | 45 | int uv_min_hub_revision_id; |
46 | EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); | 46 | EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); |
47 | unsigned int uv_apicid_hibits; | ||
48 | EXPORT_SYMBOL_GPL(uv_apicid_hibits); | ||
47 | static DEFINE_SPINLOCK(uv_nmi_lock); | 49 | static DEFINE_SPINLOCK(uv_nmi_lock); |
48 | 50 | ||
51 | static unsigned long __init uv_early_read_mmr(unsigned long addr) | ||
52 | { | ||
53 | unsigned long val, *mmr; | ||
54 | |||
55 | mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); | ||
56 | val = *mmr; | ||
57 | early_iounmap(mmr, sizeof(*mmr)); | ||
58 | return val; | ||
59 | } | ||
60 | |||
49 | static inline bool is_GRU_range(u64 start, u64 end) | 61 | static inline bool is_GRU_range(u64 start, u64 end) |
50 | { | 62 | { |
51 | return start >= gru_start_paddr && end <= gru_end_paddr; | 63 | return start >= gru_start_paddr && end <= gru_end_paddr; |
@@ -56,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) | |||
56 | return is_ISA_range(start, end) || is_GRU_range(start, end); | 68 | return is_ISA_range(start, end) || is_GRU_range(start, end); |
57 | } | 69 | } |
58 | 70 | ||
59 | static int early_get_nodeid(void) | 71 | static int __init early_get_pnodeid(void) |
60 | { | 72 | { |
61 | union uvh_node_id_u node_id; | 73 | union uvh_node_id_u node_id; |
62 | unsigned long *mmr; | 74 | union uvh_rh_gam_config_mmr_u m_n_config; |
63 | 75 | int pnode; | |
64 | mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); | ||
65 | node_id.v = *mmr; | ||
66 | early_iounmap(mmr, sizeof(*mmr)); | ||
67 | 76 | ||
68 | /* Currently, all blades have same revision number */ | 77 | /* Currently, all blades have same revision number */ |
78 | node_id.v = uv_early_read_mmr(UVH_NODE_ID); | ||
79 | m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); | ||
69 | uv_min_hub_revision_id = node_id.s.revision; | 80 | uv_min_hub_revision_id = node_id.s.revision; |
70 | 81 | ||
71 | return node_id.s.node_id; | 82 | pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); |
83 | return pnode; | ||
72 | } | 84 | } |
73 | 85 | ||
74 | static void __init early_get_apic_pnode_shift(void) | 86 | static void __init early_get_apic_pnode_shift(void) |
75 | { | 87 | { |
76 | unsigned long *mmr; | 88 | uvh_apicid.v = uv_early_read_mmr(UVH_APICID); |
77 | |||
78 | mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); | ||
79 | uvh_apicid.v = *mmr; | ||
80 | early_iounmap(mmr, sizeof(*mmr)); | ||
81 | if (!uvh_apicid.v) | 89 | if (!uvh_apicid.v) |
82 | /* | 90 | /* |
83 | * Old bios, use default value | 91 | * Old bios, use default value |
@@ -85,12 +93,25 @@ static void __init early_get_apic_pnode_shift(void) | |||
85 | uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; | 93 | uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; |
86 | } | 94 | } |
87 | 95 | ||
96 | /* | ||
97 | * Add an extra bit as dictated by bios to the destination apicid of | ||
98 | * interrupts potentially passing through the UV HUB. This prevents | ||
99 | * a deadlock between interrupts and IO port operations. | ||
100 | */ | ||
101 | static void __init uv_set_apicid_hibit(void) | ||
102 | { | ||
103 | union uvh_lb_target_physical_apic_id_mask_u apicid_mask; | ||
104 | |||
105 | apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); | ||
106 | uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; | ||
107 | } | ||
108 | |||
88 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 109 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
89 | { | 110 | { |
90 | int nodeid; | 111 | int pnodeid; |
91 | 112 | ||
92 | if (!strcmp(oem_id, "SGI")) { | 113 | if (!strcmp(oem_id, "SGI")) { |
93 | nodeid = early_get_nodeid(); | 114 | pnodeid = early_get_pnodeid(); |
94 | early_get_apic_pnode_shift(); | 115 | early_get_apic_pnode_shift(); |
95 | x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; | 116 | x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; |
96 | x86_platform.nmi_init = uv_nmi_init; | 117 | x86_platform.nmi_init = uv_nmi_init; |
@@ -99,9 +120,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
99 | else if (!strcmp(oem_table_id, "UVX")) | 120 | else if (!strcmp(oem_table_id, "UVX")) |
100 | uv_system_type = UV_X2APIC; | 121 | uv_system_type = UV_X2APIC; |
101 | else if (!strcmp(oem_table_id, "UVH")) { | 122 | else if (!strcmp(oem_table_id, "UVH")) { |
102 | __get_cpu_var(x2apic_extra_bits) = | 123 | __this_cpu_write(x2apic_extra_bits, |
103 | nodeid << (uvh_apicid.s.pnode_shift - 1); | 124 | pnodeid << uvh_apicid.s.pnode_shift); |
104 | uv_system_type = UV_NON_UNIQUE_APIC; | 125 | uv_system_type = UV_NON_UNIQUE_APIC; |
126 | uv_set_apicid_hibit(); | ||
105 | return 1; | 127 | return 1; |
106 | } | 128 | } |
107 | } | 129 | } |
@@ -155,6 +177,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri | |||
155 | int pnode; | 177 | int pnode; |
156 | 178 | ||
157 | pnode = uv_apicid_to_pnode(phys_apicid); | 179 | pnode = uv_apicid_to_pnode(phys_apicid); |
180 | phys_apicid |= uv_apicid_hibits; | ||
158 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | | 181 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | |
159 | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | | 182 | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | |
160 | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | | 183 | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | |
@@ -236,7 +259,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) | |||
236 | int cpu = cpumask_first(cpumask); | 259 | int cpu = cpumask_first(cpumask); |
237 | 260 | ||
238 | if ((unsigned)cpu < nr_cpu_ids) | 261 | if ((unsigned)cpu < nr_cpu_ids) |
239 | return per_cpu(x86_cpu_to_apicid, cpu); | 262 | return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; |
240 | else | 263 | else |
241 | return BAD_APICID; | 264 | return BAD_APICID; |
242 | } | 265 | } |
@@ -255,7 +278,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, | |||
255 | if (cpumask_test_cpu(cpu, cpu_online_mask)) | 278 | if (cpumask_test_cpu(cpu, cpu_online_mask)) |
256 | break; | 279 | break; |
257 | } | 280 | } |
258 | return per_cpu(x86_cpu_to_apicid, cpu); | 281 | return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; |
259 | } | 282 | } |
260 | 283 | ||
261 | static unsigned int x2apic_get_apic_id(unsigned long x) | 284 | static unsigned int x2apic_get_apic_id(unsigned long x) |
@@ -263,7 +286,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x) | |||
263 | unsigned int id; | 286 | unsigned int id; |
264 | 287 | ||
265 | WARN_ON(preemptible() && num_online_cpus() > 1); | 288 | WARN_ON(preemptible() && num_online_cpus() > 1); |
266 | id = x | __get_cpu_var(x2apic_extra_bits); | 289 | id = x | __this_cpu_read(x2apic_extra_bits); |
267 | 290 | ||
268 | return id; | 291 | return id; |
269 | } | 292 | } |
@@ -355,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = { | |||
355 | 378 | ||
356 | static __cpuinit void set_x2apic_extra_bits(int pnode) | 379 | static __cpuinit void set_x2apic_extra_bits(int pnode) |
357 | { | 380 | { |
358 | __get_cpu_var(x2apic_extra_bits) = (pnode << 6); | 381 | __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); |
359 | } | 382 | } |
360 | 383 | ||
361 | /* | 384 | /* |
@@ -379,14 +402,14 @@ struct redir_addr { | |||
379 | #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT | 402 | #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT |
380 | 403 | ||
381 | static __initdata struct redir_addr redir_addrs[] = { | 404 | static __initdata struct redir_addr redir_addrs[] = { |
382 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, | 405 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, |
383 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, | 406 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, |
384 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, | 407 | {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, |
385 | }; | 408 | }; |
386 | 409 | ||
387 | static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) | 410 | static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) |
388 | { | 411 | { |
389 | union uvh_si_alias0_overlay_config_u alias; | 412 | union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; |
390 | union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; | 413 | union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; |
391 | int i; | 414 | int i; |
392 | 415 | ||
@@ -618,7 +641,7 @@ void __cpuinit uv_cpu_init(void) | |||
618 | */ | 641 | */ |
619 | int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) | 642 | int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) |
620 | { | 643 | { |
621 | if (reason != DIE_NMI_IPI) | 644 | if (reason != DIE_NMIUNKNOWN) |
622 | return NOTIFY_OK; | 645 | return NOTIFY_OK; |
623 | 646 | ||
624 | if (in_crash_kexec) | 647 | if (in_crash_kexec) |
@@ -660,28 +683,33 @@ void uv_nmi_init(void) | |||
660 | 683 | ||
661 | void __init uv_system_init(void) | 684 | void __init uv_system_init(void) |
662 | { | 685 | { |
663 | union uvh_si_addr_map_config_u m_n_config; | 686 | union uvh_rh_gam_config_mmr_u m_n_config; |
687 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; | ||
664 | union uvh_node_id_u node_id; | 688 | union uvh_node_id_u node_id; |
665 | unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; | 689 | unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; |
666 | int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; | 690 | int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; |
667 | int gnode_extra, max_pnode = 0; | 691 | int gnode_extra, max_pnode = 0; |
668 | unsigned long mmr_base, present, paddr; | 692 | unsigned long mmr_base, present, paddr; |
669 | unsigned short pnode_mask; | 693 | unsigned short pnode_mask, pnode_io_mask; |
670 | 694 | ||
671 | map_low_mmrs(); | 695 | map_low_mmrs(); |
672 | 696 | ||
673 | m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); | 697 | m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); |
674 | m_val = m_n_config.s.m_skt; | 698 | m_val = m_n_config.s.m_skt; |
675 | n_val = m_n_config.s.n_skt; | 699 | n_val = m_n_config.s.n_skt; |
700 | mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); | ||
701 | n_io = mmioh.s.n_io; | ||
676 | mmr_base = | 702 | mmr_base = |
677 | uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & | 703 | uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & |
678 | ~UV_MMR_ENABLE; | 704 | ~UV_MMR_ENABLE; |
679 | pnode_mask = (1 << n_val) - 1; | 705 | pnode_mask = (1 << n_val) - 1; |
706 | pnode_io_mask = (1 << n_io) - 1; | ||
707 | |||
680 | node_id.v = uv_read_local_mmr(UVH_NODE_ID); | 708 | node_id.v = uv_read_local_mmr(UVH_NODE_ID); |
681 | gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; | 709 | gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; |
682 | gnode_upper = ((unsigned long)gnode_extra << m_val); | 710 | gnode_upper = ((unsigned long)gnode_extra << m_val); |
683 | printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", | 711 | printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", |
684 | n_val, m_val, gnode_upper, gnode_extra); | 712 | n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); |
685 | 713 | ||
686 | printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); | 714 | printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); |
687 | 715 | ||
@@ -714,7 +742,7 @@ void __init uv_system_init(void) | |||
714 | for (j = 0; j < 64; j++) { | 742 | for (j = 0; j < 64; j++) { |
715 | if (!test_bit(j, &present)) | 743 | if (!test_bit(j, &present)) |
716 | continue; | 744 | continue; |
717 | pnode = (i * 64 + j); | 745 | pnode = (i * 64 + j) & pnode_mask; |
718 | uv_blade_info[blade].pnode = pnode; | 746 | uv_blade_info[blade].pnode = pnode; |
719 | uv_blade_info[blade].nr_possible_cpus = 0; | 747 | uv_blade_info[blade].nr_possible_cpus = 0; |
720 | uv_blade_info[blade].nr_online_cpus = 0; | 748 | uv_blade_info[blade].nr_online_cpus = 0; |
@@ -735,6 +763,7 @@ void __init uv_system_init(void) | |||
735 | /* | 763 | /* |
736 | * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); | 764 | * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); |
737 | */ | 765 | */ |
766 | uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; | ||
738 | uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; | 767 | uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; |
739 | pnode = uv_apicid_to_pnode(apicid); | 768 | pnode = uv_apicid_to_pnode(apicid); |
740 | blade = boot_pnode_to_blade(pnode); | 769 | blade = boot_pnode_to_blade(pnode); |
@@ -751,7 +780,6 @@ void __init uv_system_init(void) | |||
751 | uv_cpu_hub_info(cpu)->numa_blade_id = blade; | 780 | uv_cpu_hub_info(cpu)->numa_blade_id = blade; |
752 | uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; | 781 | uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; |
753 | uv_cpu_hub_info(cpu)->pnode = pnode; | 782 | uv_cpu_hub_info(cpu)->pnode = pnode; |
754 | uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; | ||
755 | uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; | 783 | uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; |
756 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; | 784 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; |
757 | uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; | 785 | uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; |
@@ -775,7 +803,7 @@ void __init uv_system_init(void) | |||
775 | 803 | ||
776 | map_gru_high(max_pnode); | 804 | map_gru_high(max_pnode); |
777 | map_mmr_high(max_pnode); | 805 | map_mmr_high(max_pnode); |
778 | map_mmioh_high(max_pnode); | 806 | map_mmioh_high(max_pnode & pnode_io_mask); |
779 | 807 | ||
780 | uv_cpu_init(); | 808 | uv_cpu_init(); |
781 | uv_scir_register_cpu_notifier(); | 809 | uv_scir_register_cpu_notifier(); |
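early_get_pnodeid() in the x2apic_uv_x.c diff above derives the pnode by shifting the hub's node id right by one and masking it with (1 << n_skt) - 1, and uv_system_init() builds pnode_mask the same way from n_val. A small arithmetic sketch with hypothetical register values (not read from real UV hardware):

#include <stdio.h>

int main(void)
{
	unsigned long node_id = 0x2a;	/* hypothetical UVH_NODE_ID node_id field */
	unsigned int n_skt = 4;		/* hypothetical socket-id width in bits */
	unsigned int pnode_mask = (1 << n_skt) - 1;
	unsigned int pnode = (node_id >> 1) & pnode_mask;

	printf("pnode_mask=0x%x pnode=0x%x\n", pnode_mask, pnode);	/* 0xf, 0x5 */
	return 0;
}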
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9e093f8fe78c..7c7bedb83c5a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -668,7 +668,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383); | |||
668 | 668 | ||
669 | bool cpu_has_amd_erratum(const int *erratum) | 669 | bool cpu_has_amd_erratum(const int *erratum) |
670 | { | 670 | { |
671 | struct cpuinfo_x86 *cpu = ¤t_cpu_data; | 671 | struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); |
672 | int osvw_id = *erratum++; | 672 | int osvw_id = *erratum++; |
673 | u32 range; | 673 | u32 range; |
674 | u32 ms; | 674 | u32 ms; |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b68bda30938..1d59834396bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void) | |||
894 | #else | 894 | #else |
895 | vgetcpu_set_mode(); | 895 | vgetcpu_set_mode(); |
896 | #endif | 896 | #endif |
897 | init_hw_perf_events(); | ||
898 | } | 897 | } |
899 | 898 | ||
900 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | 899 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 491977baf6c0..35c7e65e59be 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -521,7 +521,7 @@ static void check_supported_cpu(void *_rc) | |||
521 | 521 | ||
522 | *rc = -ENODEV; | 522 | *rc = -ENODEV; |
523 | 523 | ||
524 | if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) | 524 | if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) |
525 | return; | 525 | return; |
526 | 526 | ||
527 | eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); | 527 | eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); |
@@ -1377,7 +1377,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) | |||
1377 | static void query_values_on_cpu(void *_err) | 1377 | static void query_values_on_cpu(void *_err) |
1378 | { | 1378 | { |
1379 | int *err = _err; | 1379 | int *err = _err; |
1380 | struct powernow_k8_data *data = __get_cpu_var(powernow_data); | 1380 | struct powernow_k8_data *data = __this_cpu_read(powernow_data); |
1381 | 1381 | ||
1382 | *err = query_current_values_with_pending_wait(data); | 1382 | *err = query_current_values_with_pending_wait(data); |
1383 | } | 1383 | } |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad03366211..ec2c19a7b8ef 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = | |||
45 | { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ | 45 | { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ |
46 | { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ | 46 | { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ |
47 | { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ | 47 | { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ |
48 | { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ | ||
48 | { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ | 49 | { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ |
49 | { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ | 50 | { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ |
50 | { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ | 51 | { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ |
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = | |||
66 | { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ | 67 | { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ |
67 | { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ | 68 | { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ |
68 | { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ | 69 | { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ |
70 | { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ | ||
69 | { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ | 71 | { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ |
70 | { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ | 72 | { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ |
71 | { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ | 73 | { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ |
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = | |||
87 | { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ | 89 | { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ |
88 | { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ | 90 | { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ |
89 | { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ | 91 | { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ |
92 | { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ | ||
90 | { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ | 93 | { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ |
91 | { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ | 94 | { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ |
92 | { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ | 95 | { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ |
@@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx { | |||
149 | }; | 152 | }; |
150 | 153 | ||
151 | struct amd_l3_cache { | 154 | struct amd_l3_cache { |
152 | struct pci_dev *dev; | 155 | struct amd_northbridge *nb; |
153 | bool can_disable; | ||
154 | unsigned indices; | 156 | unsigned indices; |
155 | u8 subcaches[4]; | 157 | u8 subcaches[4]; |
156 | }; | 158 | }; |
@@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
266 | line_size = l2.line_size; | 268 | line_size = l2.line_size; |
267 | lines_per_tag = l2.lines_per_tag; | 269 | lines_per_tag = l2.lines_per_tag; |
268 | /* cpu_data has errata corrections for K7 applied */ | 270 | /* cpu_data has errata corrections for K7 applied */ |
269 | size_in_kb = current_cpu_data.x86_cache_size; | 271 | size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); |
270 | break; | 272 | break; |
271 | case 3: | 273 | case 3: |
272 | if (!l3.val) | 274 | if (!l3.val) |
@@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
288 | eax->split.type = types[leaf]; | 290 | eax->split.type = types[leaf]; |
289 | eax->split.level = levels[leaf]; | 291 | eax->split.level = levels[leaf]; |
290 | eax->split.num_threads_sharing = 0; | 292 | eax->split.num_threads_sharing = 0; |
291 | eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; | 293 | eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; |
292 | 294 | ||
293 | 295 | ||
294 | if (assoc == 0xffff) | 296 | if (assoc == 0xffff) |
@@ -311,14 +313,12 @@ struct _cache_attr { | |||
311 | /* | 313 | /* |
312 | * L3 cache descriptors | 314 | * L3 cache descriptors |
313 | */ | 315 | */ |
314 | static struct amd_l3_cache **__cpuinitdata l3_caches; | ||
315 | |||
316 | static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) | 316 | static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) |
317 | { | 317 | { |
318 | unsigned int sc0, sc1, sc2, sc3; | 318 | unsigned int sc0, sc1, sc2, sc3; |
319 | u32 val = 0; | 319 | u32 val = 0; |
320 | 320 | ||
321 | pci_read_config_dword(l3->dev, 0x1C4, &val); | 321 | pci_read_config_dword(l3->nb->misc, 0x1C4, &val); |
322 | 322 | ||
323 | /* calculate subcache sizes */ | 323 | /* calculate subcache sizes */ |
324 | l3->subcaches[0] = sc0 = !(val & BIT(0)); | 324 | l3->subcaches[0] = sc0 = !(val & BIT(0)); |
@@ -330,47 +330,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) | |||
330 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; | 330 | l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; |
331 | } | 331 | } |
332 | 332 | ||
333 | static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) | 333 | static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, |
334 | { | 334 | int index) |
335 | struct amd_l3_cache *l3; | ||
336 | struct pci_dev *dev = node_to_k8_nb_misc(node); | ||
337 | |||
338 | l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); | ||
339 | if (!l3) { | ||
340 | printk(KERN_WARNING "Error allocating L3 struct\n"); | ||
341 | return NULL; | ||
342 | } | ||
343 | |||
344 | l3->dev = dev; | ||
345 | |||
346 | amd_calc_l3_indices(l3); | ||
347 | |||
348 | return l3; | ||
349 | } | ||
350 | |||
351 | static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, | ||
352 | int index) | ||
353 | { | 335 | { |
336 | static struct amd_l3_cache *__cpuinitdata l3_caches; | ||
354 | int node; | 337 | int node; |
355 | 338 | ||
356 | if (boot_cpu_data.x86 != 0x10) | 339 | /* only for L3, and not in virtualized environments */ |
357 | return; | 340 | if (index < 3 || amd_nb_num() == 0) |
358 | |||
359 | if (index < 3) | ||
360 | return; | ||
361 | |||
362 | /* see errata #382 and #388 */ | ||
363 | if (boot_cpu_data.x86_model < 0x8) | ||
364 | return; | ||
365 | |||
366 | if ((boot_cpu_data.x86_model == 0x8 || | ||
367 | boot_cpu_data.x86_model == 0x9) | ||
368 | && | ||
369 | boot_cpu_data.x86_mask < 0x1) | ||
370 | return; | ||
371 | |||
372 | /* not in virtualized environments */ | ||
373 | if (k8_northbridges.num == 0) | ||
374 | return; | 341 | return; |
375 | 342 | ||
376 | /* | 343 | /* |
@@ -378,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, | |||
378 | * never freed but this is done only on shutdown so it doesn't matter. | 345 | * never freed but this is done only on shutdown so it doesn't matter. |
379 | */ | 346 | */ |
380 | if (!l3_caches) { | 347 | if (!l3_caches) { |
381 | int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); | 348 | int size = amd_nb_num() * sizeof(struct amd_l3_cache); |
382 | 349 | ||
383 | l3_caches = kzalloc(size, GFP_ATOMIC); | 350 | l3_caches = kzalloc(size, GFP_ATOMIC); |
384 | if (!l3_caches) | 351 | if (!l3_caches) |
@@ -387,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, | |||
387 | 354 | ||
388 | node = amd_get_nb_id(smp_processor_id()); | 355 | node = amd_get_nb_id(smp_processor_id()); |
389 | 356 | ||
390 | if (!l3_caches[node]) { | 357 | if (!l3_caches[node].nb) { |
391 | l3_caches[node] = amd_init_l3_cache(node); | 358 | l3_caches[node].nb = node_to_amd_nb(node); |
392 | l3_caches[node]->can_disable = true; | 359 | amd_calc_l3_indices(&l3_caches[node]); |
393 | } | 360 | } |
394 | 361 | ||
395 | WARN_ON(!l3_caches[node]); | 362 | this_leaf->l3 = &l3_caches[node]; |
396 | |||
397 | this_leaf->l3 = l3_caches[node]; | ||
398 | } | 363 | } |
399 | 364 | ||
400 | /* | 365 | /* |
@@ -408,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) | |||
408 | { | 373 | { |
409 | unsigned int reg = 0; | 374 | unsigned int reg = 0; |
410 | 375 | ||
411 | pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); | 376 | pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); |
412 | 377 | ||
413 | /* check whether this slot is activated already */ | 378 | /* check whether this slot is activated already */ |
414 | if (reg & (3UL << 30)) | 379 | if (reg & (3UL << 30)) |
@@ -422,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | |||
422 | { | 387 | { |
423 | int index; | 388 | int index; |
424 | 389 | ||
425 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) | 390 | if (!this_leaf->l3 || |
391 | !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | ||
426 | return -EINVAL; | 392 | return -EINVAL; |
427 | 393 | ||
428 | index = amd_get_l3_disable_slot(this_leaf->l3, slot); | 394 | index = amd_get_l3_disable_slot(this_leaf->l3, slot); |
@@ -457,7 +423,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, | |||
457 | if (!l3->subcaches[i]) | 423 | if (!l3->subcaches[i]) |
458 | continue; | 424 | continue; |
459 | 425 | ||
460 | pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); | 426 | pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); |
461 | 427 | ||
462 | /* | 428 | /* |
463 | * We need to WBINVD on a core on the node containing the L3 | 429 | * We need to WBINVD on a core on the node containing the L3 |
@@ -467,7 +433,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, | |||
467 | wbinvd_on_cpu(cpu); | 433 | wbinvd_on_cpu(cpu); |
468 | 434 | ||
469 | reg |= BIT(31); | 435 | reg |= BIT(31); |
470 | pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); | 436 | pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); |
471 | } | 437 | } |
472 | } | 438 | } |
473 | 439 | ||
@@ -524,7 +490,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | |||
524 | if (!capable(CAP_SYS_ADMIN)) | 490 | if (!capable(CAP_SYS_ADMIN)) |
525 | return -EPERM; | 491 | return -EPERM; |
526 | 492 | ||
527 | if (!this_leaf->l3 || !this_leaf->l3->can_disable) | 493 | if (!this_leaf->l3 || |
494 | !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | ||
528 | return -EINVAL; | 495 | return -EINVAL; |
529 | 496 | ||
530 | cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); | 497 | cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); |
@@ -545,7 +512,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | |||
545 | #define STORE_CACHE_DISABLE(slot) \ | 512 | #define STORE_CACHE_DISABLE(slot) \ |
546 | static ssize_t \ | 513 | static ssize_t \ |
547 | store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ | 514 | store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ |
548 | const char *buf, size_t count) \ | 515 | const char *buf, size_t count) \ |
549 | { \ | 516 | { \ |
550 | return store_cache_disable(this_leaf, buf, count, slot); \ | 517 | return store_cache_disable(this_leaf, buf, count, slot); \ |
551 | } | 518 | } |
@@ -558,10 +525,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | |||
558 | show_cache_disable_1, store_cache_disable_1); | 525 | show_cache_disable_1, store_cache_disable_1); |
559 | 526 | ||
560 | #else /* CONFIG_AMD_NB */ | 527 | #else /* CONFIG_AMD_NB */ |
561 | static void __cpuinit | 528 | #define amd_init_l3_cache(x, y) |
562 | amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) | ||
563 | { | ||
564 | }; | ||
565 | #endif /* CONFIG_AMD_NB */ | 529 | #endif /* CONFIG_AMD_NB */ |
566 | 530 | ||
567 | static int | 531 | static int |
@@ -575,7 +539,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, | |||
575 | 539 | ||
576 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | 540 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { |
577 | amd_cpuid4(index, &eax, &ebx, &ecx); | 541 | amd_cpuid4(index, &eax, &ebx, &ecx); |
578 | amd_check_l3_disable(this_leaf, index); | 542 | amd_init_l3_cache(this_leaf, index); |
579 | } else { | 543 | } else { |
580 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | 544 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); |
581 | } | 545 | } |
@@ -983,30 +947,48 @@ define_one_ro(size); | |||
983 | define_one_ro(shared_cpu_map); | 947 | define_one_ro(shared_cpu_map); |
984 | define_one_ro(shared_cpu_list); | 948 | define_one_ro(shared_cpu_list); |
985 | 949 | ||
986 | #define DEFAULT_SYSFS_CACHE_ATTRS \ | ||
987 | &type.attr, \ | ||
988 | &level.attr, \ | ||
989 | &coherency_line_size.attr, \ | ||
990 | &physical_line_partition.attr, \ | ||
991 | &ways_of_associativity.attr, \ | ||
992 | &number_of_sets.attr, \ | ||
993 | &size.attr, \ | ||
994 | &shared_cpu_map.attr, \ | ||
995 | &shared_cpu_list.attr | ||
996 | |||
997 | static struct attribute *default_attrs[] = { | 950 | static struct attribute *default_attrs[] = { |
998 | DEFAULT_SYSFS_CACHE_ATTRS, | 951 | &type.attr, |
952 | &level.attr, | ||
953 | &coherency_line_size.attr, | ||
954 | &physical_line_partition.attr, | ||
955 | &ways_of_associativity.attr, | ||
956 | &number_of_sets.attr, | ||
957 | &size.attr, | ||
958 | &shared_cpu_map.attr, | ||
959 | &shared_cpu_list.attr, | ||
999 | NULL | 960 | NULL |
1000 | }; | 961 | }; |
1001 | 962 | ||
1002 | static struct attribute *default_l3_attrs[] = { | ||
1003 | DEFAULT_SYSFS_CACHE_ATTRS, | ||
1004 | #ifdef CONFIG_AMD_NB | 963 | #ifdef CONFIG_AMD_NB |
1005 | &cache_disable_0.attr, | 964 | static struct attribute ** __cpuinit amd_l3_attrs(void) |
1006 | &cache_disable_1.attr, | 965 | { |
966 | static struct attribute **attrs; | ||
967 | int n; | ||
968 | |||
969 | if (attrs) | ||
970 | return attrs; | ||
971 | |||
972 | n = sizeof (default_attrs) / sizeof (struct attribute *); | ||
973 | |||
974 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | ||
975 | n += 2; | ||
976 | |||
977 | attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); | ||
978 | if (attrs == NULL) | ||
979 | return attrs = default_attrs; | ||
980 | |||
981 | for (n = 0; default_attrs[n]; n++) | ||
982 | attrs[n] = default_attrs[n]; | ||
983 | |||
984 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { | ||
985 | attrs[n++] = &cache_disable_0.attr; | ||
986 | attrs[n++] = &cache_disable_1.attr; | ||
987 | } | ||
988 | |||
989 | return attrs; | ||
990 | } | ||
1007 | #endif | 991 | #endif |
1008 | NULL | ||
1009 | }; | ||
1010 | 992 | ||
1011 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | 993 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) |
1012 | { | 994 | { |
@@ -1117,11 +1099,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
1117 | 1099 | ||
1118 | this_leaf = CPUID4_INFO_IDX(cpu, i); | 1100 | this_leaf = CPUID4_INFO_IDX(cpu, i); |
1119 | 1101 | ||
1120 | if (this_leaf->l3 && this_leaf->l3->can_disable) | 1102 | ktype_cache.default_attrs = default_attrs; |
1121 | ktype_cache.default_attrs = default_l3_attrs; | 1103 | #ifdef CONFIG_AMD_NB |
1122 | else | 1104 | if (this_leaf->l3) |
1123 | ktype_cache.default_attrs = default_attrs; | 1105 | ktype_cache.default_attrs = amd_l3_attrs(); |
1124 | 1106 | #endif | |
1125 | retval = kobject_init_and_add(&(this_object->kobj), | 1107 | retval = kobject_init_and_add(&(this_object->kobj), |
1126 | &ktype_cache, | 1108 | &ktype_cache, |
1127 | per_cpu(ici_cache_kobject, cpu), | 1109 | per_cpu(ici_cache_kobject, cpu), |
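The hunks above replace the static default_l3_attrs[] array with amd_l3_attrs(), which builds the sysfs attribute list at runtime and appends the two cache_disable entries only when the northbridge reports AMD_NB_L3_INDEX_DISABLE. Below is a minimal userspace sketch of that same pattern (copy a NULL-terminated pointer array, append optional entries, keep the terminator); the names and the calloc-based allocation are illustrative, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *default_attrs[] = {
	"type", "level", "coherency_line_size", "size", NULL
};

/* append the optional L3 entries only when the feature is present */
static const char **build_attrs(int have_l3_index_disable)
{
	const char **attrs;
	size_t n = 0;

	while (default_attrs[n])
		n++;

	/* n default entries + 2 optional entries + terminating NULL */
	attrs = calloc(n + 3, sizeof(*attrs));
	if (!attrs)
		return default_attrs;		/* fall back to the static set */

	memcpy(attrs, default_attrs, n * sizeof(*attrs));
	if (have_l3_index_disable) {
		attrs[n++] = "cache_disable_0";
		attrs[n++] = "cache_disable_1";
	}
	return attrs;				/* calloc() left the terminator in place */
}

int main(void)
{
	for (const char **a = build_attrs(1); *a; a++)
		printf("%s\n", *a);
	return 0;
}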
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index e7dbde7bfedb..a77971979564 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
26 | #include <asm/mce.h> | 26 | #include <asm/mce.h> |
27 | #include <asm/apic.h> | 27 | #include <asm/apic.h> |
28 | #include <asm/nmi.h> | ||
28 | 29 | ||
29 | /* Update fake mce registers on current CPU. */ | 30 | /* Update fake mce registers on current CPU. */ |
30 | static void inject_mce(struct mce *m) | 31 | static void inject_mce(struct mce *m) |
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self, | |||
83 | struct die_args *args = (struct die_args *)data; | 84 | struct die_args *args = (struct die_args *)data; |
84 | int cpu = smp_processor_id(); | 85 | int cpu = smp_processor_id(); |
85 | struct mce *m = &__get_cpu_var(injectm); | 86 | struct mce *m = &__get_cpu_var(injectm); |
86 | if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) | 87 | if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) |
87 | return NOTIFY_DONE; | 88 | return NOTIFY_DONE; |
88 | cpumask_clear_cpu(cpu, mce_inject_cpumask); | 89 | cpumask_clear_cpu(cpu, mce_inject_cpumask); |
89 | if (m->inject_flags & MCJ_EXCEPTION) | 90 | if (m->inject_flags & MCJ_EXCEPTION) |
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self, | |||
95 | 96 | ||
96 | static struct notifier_block mce_raise_nb = { | 97 | static struct notifier_block mce_raise_nb = { |
97 | .notifier_call = mce_raise_notify, | 98 | .notifier_call = mce_raise_notify, |
98 | .priority = 1000, | 99 | .priority = NMI_LOCAL_NORMAL_PRIOR, |
99 | }; | 100 | }; |
100 | 101 | ||
101 | /* Inject mce on current CPU */ | 102 | /* Inject mce on current CPU */ |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a35b72d7c03..d916183b7f9c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -326,7 +326,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
326 | 326 | ||
327 | static int msr_to_offset(u32 msr) | 327 | static int msr_to_offset(u32 msr) |
328 | { | 328 | { |
329 | unsigned bank = __get_cpu_var(injectm.bank); | 329 | unsigned bank = __this_cpu_read(injectm.bank); |
330 | 330 | ||
331 | if (msr == rip_msr) | 331 | if (msr == rip_msr) |
332 | return offsetof(struct mce, ip); | 332 | return offsetof(struct mce, ip); |
@@ -346,7 +346,7 @@ static u64 mce_rdmsrl(u32 msr) | |||
346 | { | 346 | { |
347 | u64 v; | 347 | u64 v; |
348 | 348 | ||
349 | if (__get_cpu_var(injectm).finished) { | 349 | if (__this_cpu_read(injectm.finished)) { |
350 | int offset = msr_to_offset(msr); | 350 | int offset = msr_to_offset(msr); |
351 | 351 | ||
352 | if (offset < 0) | 352 | if (offset < 0) |
@@ -369,7 +369,7 @@ static u64 mce_rdmsrl(u32 msr) | |||
369 | 369 | ||
370 | static void mce_wrmsrl(u32 msr, u64 v) | 370 | static void mce_wrmsrl(u32 msr, u64 v) |
371 | { | 371 | { |
372 | if (__get_cpu_var(injectm).finished) { | 372 | if (__this_cpu_read(injectm.finished)) { |
373 | int offset = msr_to_offset(msr); | 373 | int offset = msr_to_offset(msr); |
374 | 374 | ||
375 | if (offset >= 0) | 375 | if (offset >= 0) |
@@ -1159,7 +1159,7 @@ static void mce_start_timer(unsigned long data) | |||
1159 | 1159 | ||
1160 | WARN_ON(smp_processor_id() != data); | 1160 | WARN_ON(smp_processor_id() != data); |
1161 | 1161 | ||
1162 | if (mce_available(¤t_cpu_data)) { | 1162 | if (mce_available(__this_cpu_ptr(&cpu_info))) { |
1163 | machine_check_poll(MCP_TIMESTAMP, | 1163 | machine_check_poll(MCP_TIMESTAMP, |
1164 | &__get_cpu_var(mce_poll_banks)); | 1164 | &__get_cpu_var(mce_poll_banks)); |
1165 | } | 1165 | } |
@@ -1767,7 +1767,7 @@ static int mce_shutdown(struct sys_device *dev) | |||
1767 | static int mce_resume(struct sys_device *dev) | 1767 | static int mce_resume(struct sys_device *dev) |
1768 | { | 1768 | { |
1769 | __mcheck_cpu_init_generic(); | 1769 | __mcheck_cpu_init_generic(); |
1770 | __mcheck_cpu_init_vendor(¤t_cpu_data); | 1770 | __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); |
1771 | 1771 | ||
1772 | return 0; | 1772 | return 0; |
1773 | } | 1773 | } |
@@ -1775,7 +1775,7 @@ static int mce_resume(struct sys_device *dev) | |||
1775 | static void mce_cpu_restart(void *data) | 1775 | static void mce_cpu_restart(void *data) |
1776 | { | 1776 | { |
1777 | del_timer_sync(&__get_cpu_var(mce_timer)); | 1777 | del_timer_sync(&__get_cpu_var(mce_timer)); |
1778 | if (!mce_available(¤t_cpu_data)) | 1778 | if (!mce_available(__this_cpu_ptr(&cpu_info))) |
1779 | return; | 1779 | return; |
1780 | __mcheck_cpu_init_generic(); | 1780 | __mcheck_cpu_init_generic(); |
1781 | __mcheck_cpu_init_timer(); | 1781 | __mcheck_cpu_init_timer(); |
@@ -1790,7 +1790,7 @@ static void mce_restart(void) | |||
1790 | /* Toggle features for corrected errors */ | 1790 | /* Toggle features for corrected errors */ |
1791 | static void mce_disable_ce(void *all) | 1791 | static void mce_disable_ce(void *all) |
1792 | { | 1792 | { |
1793 | if (!mce_available(¤t_cpu_data)) | 1793 | if (!mce_available(__this_cpu_ptr(&cpu_info))) |
1794 | return; | 1794 | return; |
1795 | if (all) | 1795 | if (all) |
1796 | del_timer_sync(&__get_cpu_var(mce_timer)); | 1796 | del_timer_sync(&__get_cpu_var(mce_timer)); |
@@ -1799,7 +1799,7 @@ static void mce_disable_ce(void *all) | |||
1799 | 1799 | ||
1800 | static void mce_enable_ce(void *all) | 1800 | static void mce_enable_ce(void *all) |
1801 | { | 1801 | { |
1802 | if (!mce_available(¤t_cpu_data)) | 1802 | if (!mce_available(__this_cpu_ptr(&cpu_info))) |
1803 | return; | 1803 | return; |
1804 | cmci_reenable(); | 1804 | cmci_reenable(); |
1805 | cmci_recheck(); | 1805 | cmci_recheck(); |
@@ -2022,7 +2022,7 @@ static void __cpuinit mce_disable_cpu(void *h) | |||
2022 | unsigned long action = *(unsigned long *)h; | 2022 | unsigned long action = *(unsigned long *)h; |
2023 | int i; | 2023 | int i; |
2024 | 2024 | ||
2025 | if (!mce_available(¤t_cpu_data)) | 2025 | if (!mce_available(__this_cpu_ptr(&cpu_info))) |
2026 | return; | 2026 | return; |
2027 | 2027 | ||
2028 | if (!(action & CPU_TASKS_FROZEN)) | 2028 | if (!(action & CPU_TASKS_FROZEN)) |
@@ -2040,7 +2040,7 @@ static void __cpuinit mce_reenable_cpu(void *h) | |||
2040 | unsigned long action = *(unsigned long *)h; | 2040 | unsigned long action = *(unsigned long *)h; |
2041 | int i; | 2041 | int i; |
2042 | 2042 | ||
2043 | if (!mce_available(¤t_cpu_data)) | 2043 | if (!mce_available(__this_cpu_ptr(&cpu_info))) |
2044 | return; | 2044 | return; |
2045 | 2045 | ||
2046 | if (!(action & CPU_TASKS_FROZEN)) | 2046 | if (!(action & CPU_TASKS_FROZEN)) |
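The mce.c hunks above convert __get_cpu_var()/&current_cpu_data accesses to __this_cpu_read()/__this_cpu_ptr(), which read a single per-CPU field (one segment-relative load) instead of first materializing a pointer to the whole per-CPU object. A rough userspace analogue using C11 _Thread_local, purely to illustrate the "read the field directly" idea; the structure layout and the availability check are made up.

#include <stdio.h>

struct cpu_info {
	int x86_vendor;
	int x86_cache_size;	/* KB */
};

static _Thread_local struct cpu_info cpu_info = { .x86_cache_size = 512 };

/* stand-in for mce_available(); the real check looks at CPU feature bits */
static int mce_available(const struct cpu_info *c)
{
	return c->x86_cache_size != 0;
}

int main(void)
{
	/* old style: build a pointer to the whole per-CPU structure first */
	struct cpu_info *c = &cpu_info;
	printf("old: %d KB\n", c->x86_cache_size);

	/* new style: read only the field that is needed */
	printf("new: %d KB, mce_available=%d\n",
	       cpu_info.x86_cache_size, mce_available(&cpu_info));
	return 0;
}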
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c482382d5c..5bf2fac52aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -31,8 +31,6 @@ | |||
31 | #include <asm/mce.h> | 31 | #include <asm/mce.h> |
32 | #include <asm/msr.h> | 32 | #include <asm/msr.h> |
33 | 33 | ||
34 | #define PFX "mce_threshold: " | ||
35 | #define VERSION "version 1.1.1" | ||
36 | #define NR_BANKS 6 | 34 | #define NR_BANKS 6 |
37 | #define NR_BLOCKS 9 | 35 | #define NR_BLOCKS 9 |
38 | #define THRESHOLD_MAX 0xFFF | 36 | #define THRESHOLD_MAX 0xFFF |
@@ -59,12 +57,6 @@ struct threshold_block { | |||
59 | struct list_head miscj; | 57 | struct list_head miscj; |
60 | }; | 58 | }; |
61 | 59 | ||
62 | /* defaults used early on boot */ | ||
63 | static struct threshold_block threshold_defaults = { | ||
64 | .interrupt_enable = 0, | ||
65 | .threshold_limit = THRESHOLD_MAX, | ||
66 | }; | ||
67 | |||
68 | struct threshold_bank { | 60 | struct threshold_bank { |
69 | struct kobject *kobj; | 61 | struct kobject *kobj; |
70 | struct threshold_block *blocks; | 62 | struct threshold_block *blocks; |
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void); | |||
89 | struct thresh_restart { | 81 | struct thresh_restart { |
90 | struct threshold_block *b; | 82 | struct threshold_block *b; |
91 | int reset; | 83 | int reset; |
84 | int set_lvt_off; | ||
85 | int lvt_off; | ||
92 | u16 old_limit; | 86 | u16 old_limit; |
93 | }; | 87 | }; |
94 | 88 | ||
89 | static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) | ||
90 | { | ||
91 | int msr = (hi & MASK_LVTOFF_HI) >> 20; | ||
92 | |||
93 | if (apic < 0) { | ||
94 | pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " | ||
95 | "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, | ||
96 | b->bank, b->block, b->address, hi, lo); | ||
97 | return 0; | ||
98 | } | ||
99 | |||
100 | if (apic != msr) { | ||
101 | pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " | ||
102 | "for bank %d, block %d (MSR%08X=0x%x%08x)\n", | ||
103 | b->cpu, apic, b->bank, b->block, b->address, hi, lo); | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | return 1; | ||
108 | }; | ||
109 | |||
95 | /* must be called with correct cpu affinity */ | 110 | /* must be called with correct cpu affinity */ |
96 | /* Called via smp_call_function_single() */ | 111 | /* Called via smp_call_function_single() */ |
97 | static void threshold_restart_bank(void *_tr) | 112 | static void threshold_restart_bank(void *_tr) |
98 | { | 113 | { |
99 | struct thresh_restart *tr = _tr; | 114 | struct thresh_restart *tr = _tr; |
100 | u32 mci_misc_hi, mci_misc_lo; | 115 | u32 hi, lo; |
101 | 116 | ||
102 | rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); | 117 | rdmsr(tr->b->address, lo, hi); |
103 | 118 | ||
104 | if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) | 119 | if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) |
105 | tr->reset = 1; /* limit cannot be lower than err count */ | 120 | tr->reset = 1; /* limit cannot be lower than err count */ |
106 | 121 | ||
107 | if (tr->reset) { /* reset err count and overflow bit */ | 122 | if (tr->reset) { /* reset err count and overflow bit */ |
108 | mci_misc_hi = | 123 | hi = |
109 | (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | | 124 | (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | |
110 | (THRESHOLD_MAX - tr->b->threshold_limit); | 125 | (THRESHOLD_MAX - tr->b->threshold_limit); |
111 | } else if (tr->old_limit) { /* change limit w/o reset */ | 126 | } else if (tr->old_limit) { /* change limit w/o reset */ |
112 | int new_count = (mci_misc_hi & THRESHOLD_MAX) + | 127 | int new_count = (hi & THRESHOLD_MAX) + |
113 | (tr->old_limit - tr->b->threshold_limit); | 128 | (tr->old_limit - tr->b->threshold_limit); |
114 | 129 | ||
115 | mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | | 130 | hi = (hi & ~MASK_ERR_COUNT_HI) | |
116 | (new_count & THRESHOLD_MAX); | 131 | (new_count & THRESHOLD_MAX); |
117 | } | 132 | } |
118 | 133 | ||
134 | if (tr->set_lvt_off) { | ||
135 | if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { | ||
136 | /* set new lvt offset */ | ||
137 | hi &= ~MASK_LVTOFF_HI; | ||
138 | hi |= tr->lvt_off << 20; | ||
139 | } | ||
140 | } | ||
141 | |||
119 | tr->b->interrupt_enable ? | 142 | tr->b->interrupt_enable ? |
120 | (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : | 143 | (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : |
121 | (mci_misc_hi &= ~MASK_INT_TYPE_HI); | 144 | (hi &= ~MASK_INT_TYPE_HI); |
122 | 145 | ||
123 | mci_misc_hi |= MASK_COUNT_EN_HI; | 146 | hi |= MASK_COUNT_EN_HI; |
124 | wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); | 147 | wrmsr(tr->b->address, lo, hi); |
148 | } | ||
149 | |||
150 | static void mce_threshold_block_init(struct threshold_block *b, int offset) | ||
151 | { | ||
152 | struct thresh_restart tr = { | ||
153 | .b = b, | ||
154 | .set_lvt_off = 1, | ||
155 | .lvt_off = offset, | ||
156 | }; | ||
157 | |||
158 | b->threshold_limit = THRESHOLD_MAX; | ||
159 | threshold_restart_bank(&tr); | ||
160 | }; | ||
161 | |||
162 | static int setup_APIC_mce(int reserved, int new) | ||
163 | { | ||
164 | if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, | ||
165 | APIC_EILVT_MSG_FIX, 0)) | ||
166 | return new; | ||
167 | |||
168 | return reserved; | ||
125 | } | 169 | } |
126 | 170 | ||
127 | /* cpu init entry point, called from mce.c with preempt off */ | 171 | /* cpu init entry point, called from mce.c with preempt off */ |
128 | void mce_amd_feature_init(struct cpuinfo_x86 *c) | 172 | void mce_amd_feature_init(struct cpuinfo_x86 *c) |
129 | { | 173 | { |
174 | struct threshold_block b; | ||
130 | unsigned int cpu = smp_processor_id(); | 175 | unsigned int cpu = smp_processor_id(); |
131 | u32 low = 0, high = 0, address = 0; | 176 | u32 low = 0, high = 0, address = 0; |
132 | unsigned int bank, block; | 177 | unsigned int bank, block; |
133 | struct thresh_restart tr; | 178 | int offset = -1; |
134 | int lvt_off = -1; | ||
135 | u8 offset; | ||
136 | 179 | ||
137 | for (bank = 0; bank < NR_BANKS; ++bank) { | 180 | for (bank = 0; bank < NR_BANKS; ++bank) { |
138 | for (block = 0; block < NR_BLOCKS; ++block) { | 181 | for (block = 0; block < NR_BLOCKS; ++block) { |
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) | |||
163 | if (shared_bank[bank] && c->cpu_core_id) | 206 | if (shared_bank[bank] && c->cpu_core_id) |
164 | break; | 207 | break; |
165 | #endif | 208 | #endif |
166 | offset = (high & MASK_LVTOFF_HI) >> 20; | 209 | offset = setup_APIC_mce(offset, |
167 | if (lvt_off < 0) { | 210 | (high & MASK_LVTOFF_HI) >> 20); |
168 | if (setup_APIC_eilvt(offset, | ||
169 | THRESHOLD_APIC_VECTOR, | ||
170 | APIC_EILVT_MSG_FIX, 0)) { | ||
171 | pr_err(FW_BUG "cpu %d, failed to " | ||
172 | "setup threshold interrupt " | ||
173 | "for bank %d, block %d " | ||
174 | "(MSR%08X=0x%x%08x)", | ||
175 | smp_processor_id(), bank, block, | ||
176 | address, high, low); | ||
177 | continue; | ||
178 | } | ||
179 | lvt_off = offset; | ||
180 | } else if (lvt_off != offset) { | ||
181 | pr_err(FW_BUG "cpu %d, invalid threshold " | ||
182 | "interrupt offset %d for bank %d," | ||
183 | "block %d (MSR%08X=0x%x%08x)", | ||
184 | smp_processor_id(), lvt_off, bank, | ||
185 | block, address, high, low); | ||
186 | continue; | ||
187 | } | ||
188 | |||
189 | high &= ~MASK_LVTOFF_HI; | ||
190 | high |= lvt_off << 20; | ||
191 | wrmsr(address, low, high); | ||
192 | 211 | ||
193 | threshold_defaults.address = address; | 212 | memset(&b, 0, sizeof(b)); |
194 | tr.b = &threshold_defaults; | 213 | b.cpu = cpu; |
195 | tr.reset = 0; | 214 | b.bank = bank; |
196 | tr.old_limit = 0; | 215 | b.block = block; |
197 | threshold_restart_bank(&tr); | 216 | b.address = address; |
198 | 217 | ||
218 | mce_threshold_block_init(&b, offset); | ||
199 | mce_threshold_vector = amd_threshold_interrupt; | 219 | mce_threshold_vector = amd_threshold_interrupt; |
200 | } | 220 | } |
201 | } | 221 | } |
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) | |||
298 | 318 | ||
299 | b->interrupt_enable = !!new; | 319 | b->interrupt_enable = !!new; |
300 | 320 | ||
321 | memset(&tr, 0, sizeof(tr)); | ||
301 | tr.b = b; | 322 | tr.b = b; |
302 | tr.reset = 0; | ||
303 | tr.old_limit = 0; | ||
304 | 323 | ||
305 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); | 324 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); |
306 | 325 | ||
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) | |||
321 | if (new < 1) | 340 | if (new < 1) |
322 | new = 1; | 341 | new = 1; |
323 | 342 | ||
343 | memset(&tr, 0, sizeof(tr)); | ||
324 | tr.old_limit = b->threshold_limit; | 344 | tr.old_limit = b->threshold_limit; |
325 | b->threshold_limit = new; | 345 | b->threshold_limit = new; |
326 | tr.b = b; | 346 | tr.b = b; |
327 | tr.reset = 0; | ||
328 | 347 | ||
329 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); | 348 | smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); |
330 | 349 | ||
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu) | |||
603 | continue; | 622 | continue; |
604 | err = threshold_create_bank(cpu, bank); | 623 | err = threshold_create_bank(cpu, bank); |
605 | if (err) | 624 | if (err) |
606 | goto out; | 625 | return err; |
607 | } | 626 | } |
608 | out: | 627 | |
609 | return err; | 628 | return err; |
610 | } | 629 | } |
611 | 630 | ||
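In the mce_amd.c hunk above, setup_APIC_mce() keeps the first threshold LVT offset that setup_APIC_eilvt() accepts and reuses it for every later bank and block, while lvt_off_valid() reports firmware bugs when a bank disagrees. A small sketch of the reserve-once helper; try_reserve() stands in for setup_APIC_eilvt() (which returns 0 on success) and the per-bank offsets are fabricated.

#include <stdio.h>

/* stand-in for setup_APIC_eilvt(); returns 0 when the offset is reserved */
static int try_reserve(int offset)
{
	(void)offset;
	return 0;
}

/* keep the first offset that can be reserved, ignore later candidates */
static int setup_mce_offset(int reserved, int new_offset)
{
	if (reserved < 0 && !try_reserve(new_offset))
		return new_offset;
	return reserved;
}

int main(void)
{
	int per_bank[] = { 1, 1, 2 };	/* offsets as read back from the MSRs */
	int offset = -1;

	/* bank 2's differing offset is ignored; the kernel also logs FW_BUG there */
	for (unsigned int i = 0; i < sizeof(per_bank) / sizeof(per_bank[0]); i++) {
		offset = setup_mce_offset(offset, per_bank[i]);
		printf("bank %u -> using LVT offset %d\n", i, offset);
	}
	return 0;
}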
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 6fcd0936194f..8694ef56459d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -130,7 +130,7 @@ void cmci_recheck(void) | |||
130 | unsigned long flags; | 130 | unsigned long flags; |
131 | int banks; | 131 | int banks; |
132 | 132 | ||
133 | if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) | 133 | if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) |
134 | return; | 134 | return; |
135 | local_irq_save(flags); | 135 | local_irq_save(flags); |
136 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); | 136 | machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); |
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b683267eca5..6f8c5e9da97f 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -53,8 +53,14 @@ struct thermal_state { | |||
53 | struct _thermal_state core_power_limit; | 53 | struct _thermal_state core_power_limit; |
54 | struct _thermal_state package_throttle; | 54 | struct _thermal_state package_throttle; |
55 | struct _thermal_state package_power_limit; | 55 | struct _thermal_state package_power_limit; |
56 | struct _thermal_state core_thresh0; | ||
57 | struct _thermal_state core_thresh1; | ||
56 | }; | 58 | }; |
57 | 59 | ||
60 | /* Callback to handle core threshold interrupts */ | ||
61 | int (*platform_thermal_notify)(__u64 msr_val); | ||
62 | EXPORT_SYMBOL(platform_thermal_notify); | ||
63 | |||
58 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); | 64 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); |
59 | 65 | ||
60 | static atomic_t therm_throt_en = ATOMIC_INIT(0); | 66 | static atomic_t therm_throt_en = ATOMIC_INIT(0); |
@@ -200,6 +206,22 @@ static int therm_throt_process(bool new_event, int event, int level) | |||
200 | return 0; | 206 | return 0; |
201 | } | 207 | } |
202 | 208 | ||
209 | static int thresh_event_valid(int event) | ||
210 | { | ||
211 | struct _thermal_state *state; | ||
212 | unsigned int this_cpu = smp_processor_id(); | ||
213 | struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); | ||
214 | u64 now = get_jiffies_64(); | ||
215 | |||
216 | state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; | ||
217 | |||
218 | if (time_before64(now, state->next_check)) | ||
219 | return 0; | ||
220 | |||
221 | state->next_check = now + CHECK_INTERVAL; | ||
222 | return 1; | ||
223 | } | ||
224 | |||
203 | #ifdef CONFIG_SYSFS | 225 | #ifdef CONFIG_SYSFS |
204 | /* Add/Remove thermal_throttle interface for CPU device: */ | 226 | /* Add/Remove thermal_throttle interface for CPU device: */ |
205 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, | 227 | static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, |
@@ -313,6 +335,22 @@ device_initcall(thermal_throttle_init_device); | |||
313 | #define PACKAGE_THROTTLED ((__u64)2 << 62) | 335 | #define PACKAGE_THROTTLED ((__u64)2 << 62) |
314 | #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) | 336 | #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) |
315 | 337 | ||
338 | static void notify_thresholds(__u64 msr_val) | ||
339 | { | ||
340 | /* check whether the interrupt handler is defined; | ||
341 | * otherwise simply return | ||
342 | */ | ||
343 | if (!platform_thermal_notify) | ||
344 | return; | ||
345 | |||
346 | /* lower threshold reached */ | ||
347 | if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) | ||
348 | platform_thermal_notify(msr_val); | ||
349 | /* higher threshold reached */ | ||
350 | if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) | ||
351 | platform_thermal_notify(msr_val); | ||
352 | } | ||
353 | |||
316 | /* Thermal transition interrupt handler */ | 354 | /* Thermal transition interrupt handler */ |
317 | static void intel_thermal_interrupt(void) | 355 | static void intel_thermal_interrupt(void) |
318 | { | 356 | { |
@@ -321,6 +359,9 @@ static void intel_thermal_interrupt(void) | |||
321 | 359 | ||
322 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 360 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
323 | 361 | ||
362 | /* Check for violation of core thermal thresholds */ | ||
363 | notify_thresholds(msr_val); | ||
364 | |||
324 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, | 365 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, |
325 | THERMAL_THROTTLING_EVENT, | 366 | THERMAL_THROTTLING_EVENT, |
326 | CORE_LEVEL) != 0) | 367 | CORE_LEVEL) != 0) |
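thresh_event_valid() above rate-limits the new platform_thermal_notify() callback: each core threshold keeps its own next_check time, and an event is forwarded at most once per CHECK_INTERVAL. A standalone sketch of that throttle, using wall-clock seconds instead of jiffies; the interval value is illustrative.

#include <stdio.h>
#include <time.h>

#define CHECK_INTERVAL	300		/* seconds; the kernel counts in jiffies */

struct thresh_state {
	time_t next_check;
};

static struct thresh_state thresh[2];	/* core threshold #0 and #1 */

static int thresh_event_valid(int event)
{
	time_t now = time(NULL);

	if (now < thresh[event].next_check)
		return 0;			/* too soon, drop this notification */

	thresh[event].next_check = now + CHECK_INTERVAL;
	return 1;
}

int main(void)
{
	printf("first event:  %d\n", thresh_event_valid(0));	/* 1: forwarded */
	printf("second event: %d\n", thresh_event_valid(0));	/* 0: rate limited */
	return 0;
}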
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ed6310183efb..9d977a2ea693 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void) | |||
330 | { | 330 | { |
331 | int i; | 331 | int i; |
332 | 332 | ||
333 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
334 | disable_lapic_nmi_watchdog(); | ||
335 | |||
336 | for (i = 0; i < x86_pmu.num_counters; i++) { | 333 | for (i = 0; i < x86_pmu.num_counters; i++) { |
337 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) | 334 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) |
338 | goto perfctr_fail; | 335 | goto perfctr_fail; |
@@ -355,9 +352,6 @@ perfctr_fail: | |||
355 | for (i--; i >= 0; i--) | 352 | for (i--; i >= 0; i--) |
356 | release_perfctr_nmi(x86_pmu.perfctr + i); | 353 | release_perfctr_nmi(x86_pmu.perfctr + i); |
357 | 354 | ||
358 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
359 | enable_lapic_nmi_watchdog(); | ||
360 | |||
361 | return false; | 355 | return false; |
362 | } | 356 | } |
363 | 357 | ||
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void) | |||
369 | release_perfctr_nmi(x86_pmu.perfctr + i); | 363 | release_perfctr_nmi(x86_pmu.perfctr + i); |
370 | release_evntsel_nmi(x86_pmu.eventsel + i); | 364 | release_evntsel_nmi(x86_pmu.eventsel + i); |
371 | } | 365 | } |
372 | |||
373 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
374 | enable_lapic_nmi_watchdog(); | ||
375 | } | 366 | } |
376 | 367 | ||
377 | #else | 368 | #else |
@@ -381,6 +372,58 @@ static void release_pmc_hardware(void) {} | |||
381 | 372 | ||
382 | #endif | 373 | #endif |
383 | 374 | ||
375 | static bool check_hw_exists(void) | ||
376 | { | ||
377 | u64 val, val_new = 0; | ||
378 | int i, reg, ret = 0; | ||
379 | |||
380 | /* | ||
381 | * Check to see if the BIOS enabled any of the counters; if so, | ||
382 | * complain and bail. | ||
383 | */ | ||
384 | for (i = 0; i < x86_pmu.num_counters; i++) { | ||
385 | reg = x86_pmu.eventsel + i; | ||
386 | ret = rdmsrl_safe(reg, &val); | ||
387 | if (ret) | ||
388 | goto msr_fail; | ||
389 | if (val & ARCH_PERFMON_EVENTSEL_ENABLE) | ||
390 | goto bios_fail; | ||
391 | } | ||
392 | |||
393 | if (x86_pmu.num_counters_fixed) { | ||
394 | reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; | ||
395 | ret = rdmsrl_safe(reg, &val); | ||
396 | if (ret) | ||
397 | goto msr_fail; | ||
398 | for (i = 0; i < x86_pmu.num_counters_fixed; i++) { | ||
399 | if (val & (0x03 << i*4)) | ||
400 | goto bios_fail; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | /* | ||
405 | * Now write a value and read it back to see if it matches, | ||
406 | * this is needed to detect certain hardware emulators (qemu/kvm) | ||
407 | * that don't trap on the MSR access and always return 0s. | ||
408 | */ | ||
409 | val = 0xabcdUL; | ||
410 | ret = checking_wrmsrl(x86_pmu.perfctr, val); | ||
411 | ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); | ||
412 | if (ret || val != val_new) | ||
413 | goto msr_fail; | ||
414 | |||
415 | return true; | ||
416 | |||
417 | bios_fail: | ||
418 | printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); | ||
419 | printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); | ||
420 | return false; | ||
421 | |||
422 | msr_fail: | ||
423 | printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); | ||
424 | return false; | ||
425 | } | ||
426 | |||
384 | static void reserve_ds_buffers(void); | 427 | static void reserve_ds_buffers(void); |
385 | static void release_ds_buffers(void); | 428 | static void release_ds_buffers(void); |
386 | 429 | ||
@@ -437,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event) | |||
437 | struct hw_perf_event *hwc = &event->hw; | 480 | struct hw_perf_event *hwc = &event->hw; |
438 | u64 config; | 481 | u64 config; |
439 | 482 | ||
440 | if (!hwc->sample_period) { | 483 | if (!is_sampling_event(event)) { |
441 | hwc->sample_period = x86_pmu.max_period; | 484 | hwc->sample_period = x86_pmu.max_period; |
442 | hwc->last_period = hwc->sample_period; | 485 | hwc->last_period = hwc->sample_period; |
443 | local64_set(&hwc->period_left, hwc->sample_period); | 486 | local64_set(&hwc->period_left, hwc->sample_period); |
@@ -954,8 +997,7 @@ x86_perf_event_set_period(struct perf_event *event) | |||
954 | 997 | ||
955 | static void x86_pmu_enable_event(struct perf_event *event) | 998 | static void x86_pmu_enable_event(struct perf_event *event) |
956 | { | 999 | { |
957 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1000 | if (__this_cpu_read(cpu_hw_events.enabled)) |
958 | if (cpuc->enabled) | ||
959 | __x86_pmu_enable_event(&event->hw, | 1001 | __x86_pmu_enable_event(&event->hw, |
960 | ARCH_PERFMON_EVENTSEL_ENABLE); | 1002 | ARCH_PERFMON_EVENTSEL_ENABLE); |
961 | } | 1003 | } |
@@ -1225,11 +1267,10 @@ perf_event_nmi_handler(struct notifier_block *self, | |||
1225 | 1267 | ||
1226 | switch (cmd) { | 1268 | switch (cmd) { |
1227 | case DIE_NMI: | 1269 | case DIE_NMI: |
1228 | case DIE_NMI_IPI: | ||
1229 | break; | 1270 | break; |
1230 | case DIE_NMIUNKNOWN: | 1271 | case DIE_NMIUNKNOWN: |
1231 | this_nmi = percpu_read(irq_stat.__nmi_count); | 1272 | this_nmi = percpu_read(irq_stat.__nmi_count); |
1232 | if (this_nmi != __get_cpu_var(pmu_nmi).marked) | 1273 | if (this_nmi != __this_cpu_read(pmu_nmi.marked)) |
1233 | /* let the kernel handle the unknown nmi */ | 1274 | /* let the kernel handle the unknown nmi */ |
1234 | return NOTIFY_DONE; | 1275 | return NOTIFY_DONE; |
1235 | /* | 1276 | /* |
@@ -1253,8 +1294,8 @@ perf_event_nmi_handler(struct notifier_block *self, | |||
1253 | this_nmi = percpu_read(irq_stat.__nmi_count); | 1294 | this_nmi = percpu_read(irq_stat.__nmi_count); |
1254 | if ((handled > 1) || | 1295 | if ((handled > 1) || |
1255 | /* the next nmi could be a back-to-back nmi */ | 1296 | /* the next nmi could be a back-to-back nmi */ |
1256 | ((__get_cpu_var(pmu_nmi).marked == this_nmi) && | 1297 | ((__this_cpu_read(pmu_nmi.marked) == this_nmi) && |
1257 | (__get_cpu_var(pmu_nmi).handled > 1))) { | 1298 | (__this_cpu_read(pmu_nmi.handled) > 1))) { |
1258 | /* | 1299 | /* |
1259 | * We could have two subsequent back-to-back nmis: The | 1300 | * We could have two subsequent back-to-back nmis: The |
1260 | * first handles more than one counter, the 2nd | 1301 | * first handles more than one counter, the 2nd |
@@ -1265,8 +1306,8 @@ perf_event_nmi_handler(struct notifier_block *self, | |||
1265 | * handling more than one counter. We will mark the | 1306 | * handling more than one counter. We will mark the |
1266 | * next (3rd) and then drop it if unhandled. | 1307 | * next (3rd) and then drop it if unhandled. |
1267 | */ | 1308 | */ |
1268 | __get_cpu_var(pmu_nmi).marked = this_nmi + 1; | 1309 | __this_cpu_write(pmu_nmi.marked, this_nmi + 1); |
1269 | __get_cpu_var(pmu_nmi).handled = handled; | 1310 | __this_cpu_write(pmu_nmi.handled, handled); |
1270 | } | 1311 | } |
1271 | 1312 | ||
1272 | return NOTIFY_STOP; | 1313 | return NOTIFY_STOP; |
@@ -1275,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self, | |||
1275 | static __read_mostly struct notifier_block perf_event_nmi_notifier = { | 1316 | static __read_mostly struct notifier_block perf_event_nmi_notifier = { |
1276 | .notifier_call = perf_event_nmi_handler, | 1317 | .notifier_call = perf_event_nmi_handler, |
1277 | .next = NULL, | 1318 | .next = NULL, |
1278 | .priority = 1 | 1319 | .priority = NMI_LOCAL_LOW_PRIOR, |
1279 | }; | 1320 | }; |
1280 | 1321 | ||
1281 | static struct event_constraint unconstrained; | 1322 | static struct event_constraint unconstrained; |
@@ -1348,7 +1389,7 @@ static void __init pmu_check_apic(void) | |||
1348 | pr_info("no hardware sampling interrupt available.\n"); | 1389 | pr_info("no hardware sampling interrupt available.\n"); |
1349 | } | 1390 | } |
1350 | 1391 | ||
1351 | void __init init_hw_perf_events(void) | 1392 | int __init init_hw_perf_events(void) |
1352 | { | 1393 | { |
1353 | struct event_constraint *c; | 1394 | struct event_constraint *c; |
1354 | int err; | 1395 | int err; |
@@ -1363,15 +1404,19 @@ void __init init_hw_perf_events(void) | |||
1363 | err = amd_pmu_init(); | 1404 | err = amd_pmu_init(); |
1364 | break; | 1405 | break; |
1365 | default: | 1406 | default: |
1366 | return; | 1407 | return 0; |
1367 | } | 1408 | } |
1368 | if (err != 0) { | 1409 | if (err != 0) { |
1369 | pr_cont("no PMU driver, software events only.\n"); | 1410 | pr_cont("no PMU driver, software events only.\n"); |
1370 | return; | 1411 | return 0; |
1371 | } | 1412 | } |
1372 | 1413 | ||
1373 | pmu_check_apic(); | 1414 | pmu_check_apic(); |
1374 | 1415 | ||
1416 | /* sanity check that the hardware exists or is emulated */ | ||
1417 | if (!check_hw_exists()) | ||
1418 | return 0; | ||
1419 | |||
1375 | pr_cont("%s PMU driver.\n", x86_pmu.name); | 1420 | pr_cont("%s PMU driver.\n", x86_pmu.name); |
1376 | 1421 | ||
1377 | if (x86_pmu.quirks) | 1422 | if (x86_pmu.quirks) |
@@ -1418,9 +1463,12 @@ void __init init_hw_perf_events(void) | |||
1418 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); | 1463 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); |
1419 | pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); | 1464 | pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); |
1420 | 1465 | ||
1421 | perf_pmu_register(&pmu); | 1466 | perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); |
1422 | perf_cpu_notifier(x86_pmu_notifier); | 1467 | perf_cpu_notifier(x86_pmu_notifier); |
1468 | |||
1469 | return 0; | ||
1423 | } | 1470 | } |
1471 | early_initcall(init_hw_perf_events); | ||
1424 | 1472 | ||
1425 | static inline void x86_pmu_read(struct perf_event *event) | 1473 | static inline void x86_pmu_read(struct perf_event *event) |
1426 | { | 1474 | { |
@@ -1434,11 +1482,9 @@ static inline void x86_pmu_read(struct perf_event *event) | |||
1434 | */ | 1482 | */ |
1435 | static void x86_pmu_start_txn(struct pmu *pmu) | 1483 | static void x86_pmu_start_txn(struct pmu *pmu) |
1436 | { | 1484 | { |
1437 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1438 | |||
1439 | perf_pmu_disable(pmu); | 1485 | perf_pmu_disable(pmu); |
1440 | cpuc->group_flag |= PERF_EVENT_TXN; | 1486 | __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); |
1441 | cpuc->n_txn = 0; | 1487 | __this_cpu_write(cpu_hw_events.n_txn, 0); |
1442 | } | 1488 | } |
1443 | 1489 | ||
1444 | /* | 1490 | /* |
@@ -1448,14 +1494,12 @@ static void x86_pmu_start_txn(struct pmu *pmu) | |||
1448 | */ | 1494 | */ |
1449 | static void x86_pmu_cancel_txn(struct pmu *pmu) | 1495 | static void x86_pmu_cancel_txn(struct pmu *pmu) |
1450 | { | 1496 | { |
1451 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | 1497 | __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); |
1452 | |||
1453 | cpuc->group_flag &= ~PERF_EVENT_TXN; | ||
1454 | /* | 1498 | /* |
1455 | * Truncate the collected events. | 1499 | * Truncate the collected events. |
1456 | */ | 1500 | */ |
1457 | cpuc->n_added -= cpuc->n_txn; | 1501 | __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); |
1458 | cpuc->n_events -= cpuc->n_txn; | 1502 | __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); |
1459 | perf_pmu_enable(pmu); | 1503 | perf_pmu_enable(pmu); |
1460 | } | 1504 | } |
1461 | 1505 | ||
@@ -1666,7 +1710,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) | |||
1666 | 1710 | ||
1667 | perf_callchain_store(entry, regs->ip); | 1711 | perf_callchain_store(entry, regs->ip); |
1668 | 1712 | ||
1669 | dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); | 1713 | dump_trace(NULL, regs, NULL, &backtrace_ops, entry); |
1670 | } | 1714 | } |
1671 | 1715 | ||
1672 | #ifdef CONFIG_COMPAT | 1716 | #ifdef CONFIG_COMPAT |
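check_hw_exists(), added above, refuses to use the PMU when the BIOS has already enabled a counter, or when writing a test value to a counter MSR does not read back unchanged, which catches emulators that ignore MSR writes and always return zero. The sketch below reproduces only the write/read-back probe; emulated_wrmsr()/emulated_rdmsr() are made-up stand-ins for checking_wrmsrl()/rdmsrl_safe().

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t backing;
static bool honest;		/* set to true to model real counter hardware */

static int emulated_wrmsr(uint64_t val)
{
	if (honest)
		backing = val;	/* a faked MSR silently drops the write */
	return 0;
}

static int emulated_rdmsr(uint64_t *val)
{
	*val = honest ? backing : 0;
	return 0;
}

static bool counter_works(void)
{
	uint64_t val = 0xabcdULL, val_new = 0;
	int ret;

	/* write a known value and make sure it reads back unchanged */
	ret = emulated_wrmsr(val);
	ret |= emulated_rdmsr(&val_new);
	return !ret && val == val_new;
}

int main(void)
{
	printf("hardware counters usable: %s\n", counter_works() ? "yes" : "no");
	return 0;
}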
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 46d58448c3af..67e2202a6039 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c | |||
@@ -1,7 +1,5 @@ | |||
1 | #ifdef CONFIG_CPU_SUP_AMD | 1 | #ifdef CONFIG_CPU_SUP_AMD |
2 | 2 | ||
3 | static DEFINE_RAW_SPINLOCK(amd_nb_lock); | ||
4 | |||
5 | static __initconst const u64 amd_hw_cache_event_ids | 3 | static __initconst const u64 amd_hw_cache_event_ids |
6 | [PERF_COUNT_HW_CACHE_MAX] | 4 | [PERF_COUNT_HW_CACHE_MAX] |
7 | [PERF_COUNT_HW_CACHE_OP_MAX] | 5 | [PERF_COUNT_HW_CACHE_OP_MAX] |
@@ -275,17 +273,17 @@ done: | |||
275 | return &emptyconstraint; | 273 | return &emptyconstraint; |
276 | } | 274 | } |
277 | 275 | ||
278 | static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) | 276 | static struct amd_nb *amd_alloc_nb(int cpu) |
279 | { | 277 | { |
280 | struct amd_nb *nb; | 278 | struct amd_nb *nb; |
281 | int i; | 279 | int i; |
282 | 280 | ||
283 | nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); | 281 | nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, |
282 | cpu_to_node(cpu)); | ||
284 | if (!nb) | 283 | if (!nb) |
285 | return NULL; | 284 | return NULL; |
286 | 285 | ||
287 | memset(nb, 0, sizeof(*nb)); | 286 | nb->nb_id = -1; |
288 | nb->nb_id = nb_id; | ||
289 | 287 | ||
290 | /* | 288 | /* |
291 | * initialize all possible NB constraints | 289 | * initialize all possible NB constraints |
@@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu) | |||
306 | if (boot_cpu_data.x86_max_cores < 2) | 304 | if (boot_cpu_data.x86_max_cores < 2) |
307 | return NOTIFY_OK; | 305 | return NOTIFY_OK; |
308 | 306 | ||
309 | cpuc->amd_nb = amd_alloc_nb(cpu, -1); | 307 | cpuc->amd_nb = amd_alloc_nb(cpu); |
310 | if (!cpuc->amd_nb) | 308 | if (!cpuc->amd_nb) |
311 | return NOTIFY_BAD; | 309 | return NOTIFY_BAD; |
312 | 310 | ||
@@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu) | |||
325 | nb_id = amd_get_nb_id(cpu); | 323 | nb_id = amd_get_nb_id(cpu); |
326 | WARN_ON_ONCE(nb_id == BAD_APICID); | 324 | WARN_ON_ONCE(nb_id == BAD_APICID); |
327 | 325 | ||
328 | raw_spin_lock(&amd_nb_lock); | ||
329 | |||
330 | for_each_online_cpu(i) { | 326 | for_each_online_cpu(i) { |
331 | nb = per_cpu(cpu_hw_events, i).amd_nb; | 327 | nb = per_cpu(cpu_hw_events, i).amd_nb; |
332 | if (WARN_ON_ONCE(!nb)) | 328 | if (WARN_ON_ONCE(!nb)) |
@@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu) | |||
341 | 337 | ||
342 | cpuc->amd_nb->nb_id = nb_id; | 338 | cpuc->amd_nb->nb_id = nb_id; |
343 | cpuc->amd_nb->refcnt++; | 339 | cpuc->amd_nb->refcnt++; |
344 | |||
345 | raw_spin_unlock(&amd_nb_lock); | ||
346 | } | 340 | } |
347 | 341 | ||
348 | static void amd_pmu_cpu_dead(int cpu) | 342 | static void amd_pmu_cpu_dead(int cpu) |
@@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu) | |||
354 | 348 | ||
355 | cpuhw = &per_cpu(cpu_hw_events, cpu); | 349 | cpuhw = &per_cpu(cpu_hw_events, cpu); |
356 | 350 | ||
357 | raw_spin_lock(&amd_nb_lock); | ||
358 | |||
359 | if (cpuhw->amd_nb) { | 351 | if (cpuhw->amd_nb) { |
360 | struct amd_nb *nb = cpuhw->amd_nb; | 352 | struct amd_nb *nb = cpuhw->amd_nb; |
361 | 353 | ||
@@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu) | |||
364 | 356 | ||
365 | cpuhw->amd_nb = NULL; | 357 | cpuhw->amd_nb = NULL; |
366 | } | 358 | } |
367 | |||
368 | raw_spin_unlock(&amd_nb_lock); | ||
369 | } | 359 | } |
370 | 360 | ||
371 | static __initconst const struct x86_pmu amd_pmu = { | 361 | static __initconst const struct x86_pmu amd_pmu = { |
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c088cad1..008835c1d79c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c | |||
@@ -649,7 +649,7 @@ static void intel_pmu_enable_event(struct perf_event *event) | |||
649 | struct hw_perf_event *hwc = &event->hw; | 649 | struct hw_perf_event *hwc = &event->hw; |
650 | 650 | ||
651 | if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { | 651 | if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { |
652 | if (!__get_cpu_var(cpu_hw_events).enabled) | 652 | if (!__this_cpu_read(cpu_hw_events.enabled)) |
653 | return; | 653 | return; |
654 | 654 | ||
655 | intel_pmu_enable_bts(hwc->config); | 655 | intel_pmu_enable_bts(hwc->config); |
@@ -679,7 +679,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event) | |||
679 | 679 | ||
680 | static void intel_pmu_reset(void) | 680 | static void intel_pmu_reset(void) |
681 | { | 681 | { |
682 | struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; | 682 | struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); |
683 | unsigned long flags; | 683 | unsigned long flags; |
684 | int idx; | 684 | int idx; |
685 | 685 | ||
@@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event) | |||
816 | if (ret) | 816 | if (ret) |
817 | return ret; | 817 | return ret; |
818 | 818 | ||
819 | if (event->attr.precise_ip && | ||
820 | (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { | ||
821 | /* | ||
822 | * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P | ||
823 | * (0x003c) so that we can use it with PEBS. | ||
824 | * | ||
825 | * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't | ||
826 | * PEBS capable. However we can use INST_RETIRED.ANY_P | ||
827 | * (0x00c0), which is a PEBS capable event, to get the same | ||
828 | * count. | ||
829 | * | ||
830 | * INST_RETIRED.ANY_P counts the number of cycles that retire | ||
831 | * CNTMASK instructions. By setting CNTMASK to a value (16) | ||
832 | * larger than the maximum number of instructions that can be | ||
833 | * retired per cycle (4) and then inverting the condition, we | ||
834 | * count all cycles that retire 16 or fewer instructions, which | ||
835 | * is every cycle. | ||
836 | * | ||
837 | * Thereby we gain a PEBS capable cycle counter. | ||
838 | */ | ||
839 | u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ | ||
840 | |||
841 | alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); | ||
842 | event->hw.config = alt_config; | ||
843 | } | ||
844 | |||
819 | if (event->attr.type != PERF_TYPE_RAW) | 845 | if (event->attr.type != PERF_TYPE_RAW) |
820 | return 0; | 846 | return 0; |
821 | 847 | ||
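The intel_pmu_hw_config() hunk above rewrites a precise-ip cycles request from CPU_CLK_UNHALTED.THREAD_P (0x003c) to the PEBS-capable INST_RETIRED.TOTAL_CYCLES encoding (0x108000c0: inv=1, cmask=16, event 0xc0), preserving whatever flag bits lie outside the raw event mask. A compile-and-run sketch of that bit surgery; RAW_EVENT_MASK below is a simplified stand-in for X86_RAW_EVENT_MASK.

#include <stdio.h>
#include <stdint.h>

/* simplified stand-in for X86_RAW_EVENT_MASK (event, umask, edge, inv, cmask) */
#define RAW_EVENT_MASK			0xffffffffULL
#define CYCLES_EVENT			0x003cULL	/* CPU_CLK_UNHALTED.THREAD_P */
#define INST_RETIRED_TOTAL_CYCLES	0x108000c0ULL	/* inv=1, cmask=16, event 0xc0 */

static uint64_t pebs_cycles_fixup(uint64_t config, int precise_ip)
{
	if (precise_ip && (config & RAW_EVENT_MASK) == CYCLES_EVENT) {
		uint64_t alt = INST_RETIRED_TOTAL_CYCLES;

		alt |= config & ~RAW_EVENT_MASK;	/* keep the other flag bits */
		return alt;
	}
	return config;
}

int main(void)
{
	/* prints 0x108000c0: the PEBS-capable cycle counter encoding */
	printf("0x%llx\n", (unsigned long long)pebs_cycles_fixup(0x003cULL, 1));
	return 0;
}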
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 81400b93e694..e56b9bfbabd1 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c | |||
@@ -753,19 +753,21 @@ out: | |||
753 | 753 | ||
754 | static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) | 754 | static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) |
755 | { | 755 | { |
756 | int overflow = 0; | 756 | u64 v; |
757 | u32 low, high; | ||
758 | 757 | ||
759 | rdmsr(hwc->config_base + hwc->idx, low, high); | 758 | /* an official way for overflow indication */ |
760 | 759 | rdmsrl(hwc->config_base + hwc->idx, v); | |
761 | /* we need to check high bit for unflagged overflows */ | 760 | if (v & P4_CCCR_OVF) { |
762 | if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { | 761 | wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF); |
763 | overflow = 1; | 762 | return 1; |
764 | (void)checking_wrmsrl(hwc->config_base + hwc->idx, | ||
765 | ((u64)low) & ~P4_CCCR_OVF); | ||
766 | } | 763 | } |
767 | 764 | ||
768 | return overflow; | 765 | /* it might be unflagged overflow */ |
766 | rdmsrl(hwc->event_base + hwc->idx, v); | ||
767 | if (!(v & ARCH_P4_CNTRVAL_MASK)) | ||
768 | return 1; | ||
769 | |||
770 | return 0; | ||
769 | } | 771 | } |
770 | 772 | ||
771 | static void p4_pmu_disable_pebs(void) | 773 | static void p4_pmu_disable_pebs(void) |
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = { | |||
1152 | */ | 1154 | */ |
1153 | .num_counters = ARCH_P4_MAX_CCCR, | 1155 | .num_counters = ARCH_P4_MAX_CCCR, |
1154 | .apic = 1, | 1156 | .apic = 1, |
1155 | .cntval_bits = 40, | 1157 | .cntval_bits = ARCH_P4_CNTRVAL_BITS, |
1156 | .cntval_mask = (1ULL << 40) - 1, | 1158 | .cntval_mask = ARCH_P4_CNTRVAL_MASK, |
1157 | .max_period = (1ULL << 39) - 1, | 1159 | .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, |
1158 | .hw_config = p4_hw_config, | 1160 | .hw_config = p4_hw_config, |
1159 | .schedule_events = p4_pmu_schedule_events, | 1161 | .schedule_events = p4_pmu_schedule_events, |
1160 | /* | 1162 | /* |
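The rewritten p4_pmu_clear_cccr_ovf() above first trusts the CCCR OVF flag and, when it is clear, falls back to treating a counter whose 40 valid bits read back as zero as an unflagged overflow. A simplified model of that two-step test; the constants mirror P4_CCCR_OVF and ARCH_P4_CNTRVAL_MASK, but nothing here touches real MSRs.

#include <stdio.h>
#include <stdint.h>

#define CNTRVAL_BITS	40
#define CNTRVAL_MASK	((1ULL << CNTRVAL_BITS) - 1)
#define CCCR_OVF	(1ULL << 31)

static int counter_overflowed(uint64_t cccr, uint64_t counter)
{
	if (cccr & CCCR_OVF)
		return 1;			/* flagged overflow */
	if (!(counter & CNTRVAL_MASK))
		return 1;			/* unflagged: counter reads back as zero */
	return 0;
}

int main(void)
{
	printf("%d\n", counter_overflowed(CCCR_OVF, 123));	/* 1 */
	printf("%d\n", counter_overflowed(0, 1ULL << 39));	/* 0: still counting */
	return 0;
}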
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d9f4ff8fcd69..d5a236615501 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -16,32 +16,12 @@ | |||
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include <linux/smp.h> | 18 | #include <linux/smp.h> |
19 | #include <linux/nmi.h> | 19 | #include <asm/nmi.h> |
20 | #include <linux/kprobes.h> | 20 | #include <linux/kprobes.h> |
21 | 21 | ||
22 | #include <asm/apic.h> | 22 | #include <asm/apic.h> |
23 | #include <asm/perf_event.h> | 23 | #include <asm/perf_event.h> |
24 | 24 | ||
25 | struct nmi_watchdog_ctlblk { | ||
26 | unsigned int cccr_msr; | ||
27 | unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ | ||
28 | unsigned int evntsel_msr; /* the MSR to select the events to handle */ | ||
29 | }; | ||
30 | |||
31 | /* Interface defining a CPU specific perfctr watchdog */ | ||
32 | struct wd_ops { | ||
33 | int (*reserve)(void); | ||
34 | void (*unreserve)(void); | ||
35 | int (*setup)(unsigned nmi_hz); | ||
36 | void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); | ||
37 | void (*stop)(void); | ||
38 | unsigned perfctr; | ||
39 | unsigned evntsel; | ||
40 | u64 checkbit; | ||
41 | }; | ||
42 | |||
43 | static const struct wd_ops *wd_ops; | ||
44 | |||
45 | /* | 25 | /* |
46 | * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's | 26 | * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's |
47 | * offset from MSR_P4_BSU_ESCR0. | 27 | * offset from MSR_P4_BSU_ESCR0. |
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops; | |||
60 | static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); | 40 | static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); |
61 | static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); | 41 | static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); |
62 | 42 | ||
63 | static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); | ||
64 | |||
65 | /* converts an msr to an appropriate reservation bit */ | 43 | /* converts an msr to an appropriate reservation bit */ |
66 | static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) | 44 | static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) |
67 | { | 45 | { |
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr) | |||
172 | clear_bit(counter, evntsel_nmi_owner); | 150 | clear_bit(counter, evntsel_nmi_owner); |
173 | } | 151 | } |
174 | EXPORT_SYMBOL(release_evntsel_nmi); | 152 | EXPORT_SYMBOL(release_evntsel_nmi); |
175 | |||
176 | void disable_lapic_nmi_watchdog(void) | ||
177 | { | ||
178 | BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); | ||
179 | |||
180 | if (atomic_read(&nmi_active) <= 0) | ||
181 | return; | ||
182 | |||
183 | on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); | ||
184 | |||
185 | if (wd_ops) | ||
186 | wd_ops->unreserve(); | ||
187 | |||
188 | BUG_ON(atomic_read(&nmi_active) != 0); | ||
189 | } | ||
190 | |||
191 | void enable_lapic_nmi_watchdog(void) | ||
192 | { | ||
193 | BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); | ||
194 | |||
195 | /* are we already enabled */ | ||
196 | if (atomic_read(&nmi_active) != 0) | ||
197 | return; | ||
198 | |||
199 | /* are we lapic aware */ | ||
200 | if (!wd_ops) | ||
201 | return; | ||
202 | if (!wd_ops->reserve()) { | ||
203 | printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); | ||
204 | return; | ||
205 | } | ||
206 | |||
207 | on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); | ||
208 | touch_nmi_watchdog(); | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Activate the NMI watchdog via the local APIC. | ||
213 | */ | ||
214 | |||
215 | static unsigned int adjust_for_32bit_ctr(unsigned int hz) | ||
216 | { | ||
217 | u64 counter_val; | ||
218 | unsigned int retval = hz; | ||
219 | |||
220 | /* | ||
221 | * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter | ||
222 | * are writable, with higher bits sign extending from bit 31. | ||
223 | * So we can only program the counter with 31-bit values; bit 31 | ||
224 | * must be set so that bits 32..63 sign-extend to 1 as well. | ||
225 | * Find the appropriate nmi_hz. | ||
226 | */ | ||
227 | counter_val = (u64)cpu_khz * 1000; | ||
228 | do_div(counter_val, retval); | ||
229 | if (counter_val > 0x7fffffffULL) { | ||
230 | u64 count = (u64)cpu_khz * 1000; | ||
231 | do_div(count, 0x7fffffffUL); | ||
232 | retval = count + 1; | ||
233 | } | ||
234 | return retval; | ||
235 | } | ||
236 | |||
237 | static void write_watchdog_counter(unsigned int perfctr_msr, | ||
238 | const char *descr, unsigned nmi_hz) | ||
239 | { | ||
240 | u64 count = (u64)cpu_khz * 1000; | ||
241 | |||
242 | do_div(count, nmi_hz); | ||
243 | if (descr) | ||
244 | pr_debug("setting %s to -0x%08Lx\n", descr, count); | ||
245 | wrmsrl(perfctr_msr, 0 - count); | ||
246 | } | ||
247 | |||
248 | static void write_watchdog_counter32(unsigned int perfctr_msr, | ||
249 | const char *descr, unsigned nmi_hz) | ||
250 | { | ||
251 | u64 count = (u64)cpu_khz * 1000; | ||
252 | |||
253 | do_div(count, nmi_hz); | ||
254 | if (descr) | ||
255 | pr_debug("setting %s to -0x%08Lx\n", descr, count); | ||
256 | wrmsr(perfctr_msr, (u32)(-count), 0); | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * AMD K7/K8/Family10h/Family11h support. | ||
261 | * AMD keeps this interface nicely stable so there is not much variety | ||
262 | */ | ||
263 | #define K7_EVNTSEL_ENABLE (1 << 22) | ||
264 | #define K7_EVNTSEL_INT (1 << 20) | ||
265 | #define K7_EVNTSEL_OS (1 << 17) | ||
266 | #define K7_EVNTSEL_USR (1 << 16) | ||
267 | #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 | ||
268 | #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING | ||
269 | |||
270 | static int setup_k7_watchdog(unsigned nmi_hz) | ||
271 | { | ||
272 | unsigned int perfctr_msr, evntsel_msr; | ||
273 | unsigned int evntsel; | ||
274 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
275 | |||
276 | perfctr_msr = wd_ops->perfctr; | ||
277 | evntsel_msr = wd_ops->evntsel; | ||
278 | |||
279 | wrmsrl(perfctr_msr, 0UL); | ||
280 | |||
281 | evntsel = K7_EVNTSEL_INT | ||
282 | | K7_EVNTSEL_OS | ||
283 | | K7_EVNTSEL_USR | ||
284 | | K7_NMI_EVENT; | ||
285 | |||
286 | /* setup the timer */ | ||
287 | wrmsr(evntsel_msr, evntsel, 0); | ||
288 | write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); | ||
289 | |||
290 | /* initialize the wd struct before enabling */ | ||
291 | wd->perfctr_msr = perfctr_msr; | ||
292 | wd->evntsel_msr = evntsel_msr; | ||
293 | wd->cccr_msr = 0; /* unused */ | ||
294 | |||
295 | /* ok, everything is initialized, announce that we're set */ | ||
296 | cpu_nmi_set_wd_enabled(); | ||
297 | |||
298 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
299 | evntsel |= K7_EVNTSEL_ENABLE; | ||
300 | wrmsr(evntsel_msr, evntsel, 0); | ||
301 | |||
302 | return 1; | ||
303 | } | ||
304 | |||
305 | static void single_msr_stop_watchdog(void) | ||
306 | { | ||
307 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
308 | |||
309 | wrmsr(wd->evntsel_msr, 0, 0); | ||
310 | } | ||
311 | |||
312 | static int single_msr_reserve(void) | ||
313 | { | ||
314 | if (!reserve_perfctr_nmi(wd_ops->perfctr)) | ||
315 | return 0; | ||
316 | |||
317 | if (!reserve_evntsel_nmi(wd_ops->evntsel)) { | ||
318 | release_perfctr_nmi(wd_ops->perfctr); | ||
319 | return 0; | ||
320 | } | ||
321 | return 1; | ||
322 | } | ||
323 | |||
324 | static void single_msr_unreserve(void) | ||
325 | { | ||
326 | release_evntsel_nmi(wd_ops->evntsel); | ||
327 | release_perfctr_nmi(wd_ops->perfctr); | ||
328 | } | ||
329 | |||
330 | static void __kprobes | ||
331 | single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | ||
332 | { | ||
333 | /* start the cycle over again */ | ||
334 | write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); | ||
335 | } | ||
336 | |||
337 | static const struct wd_ops k7_wd_ops = { | ||
338 | .reserve = single_msr_reserve, | ||
339 | .unreserve = single_msr_unreserve, | ||
340 | .setup = setup_k7_watchdog, | ||
341 | .rearm = single_msr_rearm, | ||
342 | .stop = single_msr_stop_watchdog, | ||
343 | .perfctr = MSR_K7_PERFCTR0, | ||
344 | .evntsel = MSR_K7_EVNTSEL0, | ||
345 | .checkbit = 1ULL << 47, | ||
346 | }; | ||
347 | |||
348 | /* | ||
349 | * Intel Model 6 (PPro+,P2,P3,P-M,Core1) | ||
350 | */ | ||
351 | #define P6_EVNTSEL0_ENABLE (1 << 22) | ||
352 | #define P6_EVNTSEL_INT (1 << 20) | ||
353 | #define P6_EVNTSEL_OS (1 << 17) | ||
354 | #define P6_EVNTSEL_USR (1 << 16) | ||
355 | #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 | ||
356 | #define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED | ||
357 | |||
358 | static int setup_p6_watchdog(unsigned nmi_hz) | ||
359 | { | ||
360 | unsigned int perfctr_msr, evntsel_msr; | ||
361 | unsigned int evntsel; | ||
362 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
363 | |||
364 | perfctr_msr = wd_ops->perfctr; | ||
365 | evntsel_msr = wd_ops->evntsel; | ||
366 | |||
367 | /* KVM doesn't implement this MSR */ | ||
368 | if (wrmsr_safe(perfctr_msr, 0, 0) < 0) | ||
369 | return 0; | ||
370 | |||
371 | evntsel = P6_EVNTSEL_INT | ||
372 | | P6_EVNTSEL_OS | ||
373 | | P6_EVNTSEL_USR | ||
374 | | P6_NMI_EVENT; | ||
375 | |||
376 | /* setup the timer */ | ||
377 | wrmsr(evntsel_msr, evntsel, 0); | ||
378 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); | ||
379 | write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); | ||
380 | |||
381 | /* initialize the wd struct before enabling */ | ||
382 | wd->perfctr_msr = perfctr_msr; | ||
383 | wd->evntsel_msr = evntsel_msr; | ||
384 | wd->cccr_msr = 0; /* unused */ | ||
385 | |||
386 | /* ok, everything is initialized, announce that we're set */ | ||
387 | cpu_nmi_set_wd_enabled(); | ||
388 | |||
389 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
390 | evntsel |= P6_EVNTSEL0_ENABLE; | ||
391 | wrmsr(evntsel_msr, evntsel, 0); | ||
392 | |||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | ||
397 | { | ||
398 | /* | ||
399 | * P6 based Pentium M needs to re-unmask | ||
400 | * the apic vector, but it doesn't hurt | ||
401 | * other P6 variants. | ||
402 | * ArchPerfmon/Core Duo also needs this. | ||
403 | */ | ||
404 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
405 | |||
406 | /* P6/ARCH_PERFMON has 32 bit counter write */ | ||
407 | write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); | ||
408 | } | ||
409 | |||
410 | static const struct wd_ops p6_wd_ops = { | ||
411 | .reserve = single_msr_reserve, | ||
412 | .unreserve = single_msr_unreserve, | ||
413 | .setup = setup_p6_watchdog, | ||
414 | .rearm = p6_rearm, | ||
415 | .stop = single_msr_stop_watchdog, | ||
416 | .perfctr = MSR_P6_PERFCTR0, | ||
417 | .evntsel = MSR_P6_EVNTSEL0, | ||
418 | .checkbit = 1ULL << 39, | ||
419 | }; | ||
420 | |||
421 | /* | ||
422 | * Intel P4 performance counters. | ||
423 | * By far the most complicated of all. | ||
424 | */ | ||
425 | #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) | ||
426 | #define P4_ESCR_EVENT_SELECT(N) ((N) << 25) | ||
427 | #define P4_ESCR_OS (1 << 3) | ||
428 | #define P4_ESCR_USR (1 << 2) | ||
429 | #define P4_CCCR_OVF_PMI0 (1 << 26) | ||
430 | #define P4_CCCR_OVF_PMI1 (1 << 27) | ||
431 | #define P4_CCCR_THRESHOLD(N) ((N) << 20) | ||
432 | #define P4_CCCR_COMPLEMENT (1 << 19) | ||
433 | #define P4_CCCR_COMPARE (1 << 18) | ||
434 | #define P4_CCCR_REQUIRED (3 << 16) | ||
435 | #define P4_CCCR_ESCR_SELECT(N) ((N) << 13) | ||
436 | #define P4_CCCR_ENABLE (1 << 12) | ||
437 | #define P4_CCCR_OVF (1 << 31) | ||
438 | |||
439 | #define P4_CONTROLS 18 | ||
440 | static unsigned int p4_controls[18] = { | ||
441 | MSR_P4_BPU_CCCR0, | ||
442 | MSR_P4_BPU_CCCR1, | ||
443 | MSR_P4_BPU_CCCR2, | ||
444 | MSR_P4_BPU_CCCR3, | ||
445 | MSR_P4_MS_CCCR0, | ||
446 | MSR_P4_MS_CCCR1, | ||
447 | MSR_P4_MS_CCCR2, | ||
448 | MSR_P4_MS_CCCR3, | ||
449 | MSR_P4_FLAME_CCCR0, | ||
450 | MSR_P4_FLAME_CCCR1, | ||
451 | MSR_P4_FLAME_CCCR2, | ||
452 | MSR_P4_FLAME_CCCR3, | ||
453 | MSR_P4_IQ_CCCR0, | ||
454 | MSR_P4_IQ_CCCR1, | ||
455 | MSR_P4_IQ_CCCR2, | ||
456 | MSR_P4_IQ_CCCR3, | ||
457 | MSR_P4_IQ_CCCR4, | ||
458 | MSR_P4_IQ_CCCR5, | ||
459 | }; | ||
460 | /* | ||
461 | * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter | ||
462 | * CRU_ESCR0 (with any non-null event selector) through a complemented | ||
463 | * max threshold. [IA32-Vol3, Section 14.9.9] | ||
464 | */ | ||
465 | static int setup_p4_watchdog(unsigned nmi_hz) | ||
466 | { | ||
467 | unsigned int perfctr_msr, evntsel_msr, cccr_msr; | ||
468 | unsigned int evntsel, cccr_val; | ||
469 | unsigned int misc_enable, dummy; | ||
470 | unsigned int ht_num; | ||
471 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
472 | |||
473 | rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); | ||
474 | if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) | ||
475 | return 0; | ||
476 | |||
477 | #ifdef CONFIG_SMP | ||
478 | /* detect which hyperthread we are on */ | ||
479 | if (smp_num_siblings == 2) { | ||
480 | unsigned int ebx, apicid; | ||
481 | |||
482 | ebx = cpuid_ebx(1); | ||
483 | apicid = (ebx >> 24) & 0xff; | ||
484 | ht_num = apicid & 1; | ||
485 | } else | ||
486 | #endif | ||
487 | ht_num = 0; | ||
488 | |||
489 | /* | ||
490 | * performance counters are shared resources | ||
491 | * assign each hyperthread its own set | ||
492 | * (re-use the ESCR0 register, seems safe | ||
493 | * and keeps the cccr_val the same) | ||
494 | */ | ||
495 | if (!ht_num) { | ||
496 | /* logical cpu 0 */ | ||
497 | perfctr_msr = MSR_P4_IQ_PERFCTR0; | ||
498 | evntsel_msr = MSR_P4_CRU_ESCR0; | ||
499 | cccr_msr = MSR_P4_IQ_CCCR0; | ||
500 | cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); | ||
501 | |||
502 | /* | ||
503 | * If we're on the kdump kernel or other situation, we may | ||
504 | * still have other performance counter registers set to | ||
505 | * interrupt and they'll keep interrupting forever because | ||
506 | * of the P4_CCCR_OVF quirk. So we need to ACK all the | ||
507 | * pending interrupts and disable all the registers here, | ||
508 | * before reenabling the NMI delivery. Refer to p4_rearm() | ||
509 | * about the P4_CCCR_OVF quirk. | ||
510 | */ | ||
511 | if (reset_devices) { | ||
512 | unsigned int low, high; | ||
513 | int i; | ||
514 | |||
515 | for (i = 0; i < P4_CONTROLS; i++) { | ||
516 | rdmsr(p4_controls[i], low, high); | ||
517 | low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); | ||
518 | wrmsr(p4_controls[i], low, high); | ||
519 | } | ||
520 | } | ||
521 | } else { | ||
522 | /* logical cpu 1 */ | ||
523 | perfctr_msr = MSR_P4_IQ_PERFCTR1; | ||
524 | evntsel_msr = MSR_P4_CRU_ESCR0; | ||
525 | cccr_msr = MSR_P4_IQ_CCCR1; | ||
526 | |||
527 | /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ | ||
528 | if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) | ||
529 | cccr_val = P4_CCCR_OVF_PMI0; | ||
530 | else | ||
531 | cccr_val = P4_CCCR_OVF_PMI1; | ||
532 | cccr_val |= P4_CCCR_ESCR_SELECT(4); | ||
533 | } | ||
534 | |||
535 | evntsel = P4_ESCR_EVENT_SELECT(0x3F) | ||
536 | | P4_ESCR_OS | ||
537 | | P4_ESCR_USR; | ||
538 | |||
539 | cccr_val |= P4_CCCR_THRESHOLD(15) | ||
540 | | P4_CCCR_COMPLEMENT | ||
541 | | P4_CCCR_COMPARE | ||
542 | | P4_CCCR_REQUIRED; | ||
543 | |||
544 | wrmsr(evntsel_msr, evntsel, 0); | ||
545 | wrmsr(cccr_msr, cccr_val, 0); | ||
546 | write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); | ||
547 | |||
548 | wd->perfctr_msr = perfctr_msr; | ||
549 | wd->evntsel_msr = evntsel_msr; | ||
550 | wd->cccr_msr = cccr_msr; | ||
551 | |||
552 | /* ok, everything is initialized, announce that we're set */ | ||
553 | cpu_nmi_set_wd_enabled(); | ||
554 | |||
555 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
556 | cccr_val |= P4_CCCR_ENABLE; | ||
557 | wrmsr(cccr_msr, cccr_val, 0); | ||
558 | return 1; | ||
559 | } | ||
560 | |||
561 | static void stop_p4_watchdog(void) | ||
562 | { | ||
563 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
564 | wrmsr(wd->cccr_msr, 0, 0); | ||
565 | wrmsr(wd->evntsel_msr, 0, 0); | ||
566 | } | ||
567 | |||
568 | static int p4_reserve(void) | ||
569 | { | ||
570 | if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) | ||
571 | return 0; | ||
572 | #ifdef CONFIG_SMP | ||
573 | if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) | ||
574 | goto fail1; | ||
575 | #endif | ||
576 | if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) | ||
577 | goto fail2; | ||
578 | /* RED-PEN why is ESCR1 not reserved here? */ | ||
579 | return 1; | ||
580 | fail2: | ||
581 | #ifdef CONFIG_SMP | ||
582 | if (smp_num_siblings > 1) | ||
583 | release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); | ||
584 | fail1: | ||
585 | #endif | ||
586 | release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); | ||
587 | return 0; | ||
588 | } | ||
589 | |||
590 | static void p4_unreserve(void) | ||
591 | { | ||
592 | #ifdef CONFIG_SMP | ||
593 | if (smp_num_siblings > 1) | ||
594 | release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); | ||
595 | #endif | ||
596 | release_evntsel_nmi(MSR_P4_CRU_ESCR0); | ||
597 | release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); | ||
598 | } | ||
599 | |||
600 | static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | ||
601 | { | ||
602 | unsigned dummy; | ||
603 | /* | ||
604 | * P4 quirks: | ||
605 | * - An overflown perfctr will assert its interrupt | ||
606 | * until the OVF flag in its CCCR is cleared. | ||
607 | * - LVTPC is masked on interrupt and must be | ||
608 | * unmasked by the LVTPC handler. | ||
609 | */ | ||
610 | rdmsrl(wd->cccr_msr, dummy); | ||
611 | dummy &= ~P4_CCCR_OVF; | ||
612 | wrmsrl(wd->cccr_msr, dummy); | ||
613 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
614 | /* start the cycle over again */ | ||
615 | write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); | ||
616 | } | ||
617 | |||
618 | static const struct wd_ops p4_wd_ops = { | ||
619 | .reserve = p4_reserve, | ||
620 | .unreserve = p4_unreserve, | ||
621 | .setup = setup_p4_watchdog, | ||
622 | .rearm = p4_rearm, | ||
623 | .stop = stop_p4_watchdog, | ||
624 | /* RED-PEN this is wrong for the other sibling */ | ||
625 | .perfctr = MSR_P4_BPU_PERFCTR0, | ||
626 | .evntsel = MSR_P4_BSU_ESCR0, | ||
627 | .checkbit = 1ULL << 39, | ||
628 | }; | ||
629 | |||
630 | /* | ||
631 | * Watchdog using the Intel architected PerfMon. | ||
632 | * Used for Core2 and hopefully all future Intel CPUs. | ||
633 | */ | ||
634 | #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL | ||
635 | #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK | ||
636 | |||
637 | static struct wd_ops intel_arch_wd_ops; | ||
638 | |||
639 | static int setup_intel_arch_watchdog(unsigned nmi_hz) | ||
640 | { | ||
641 | unsigned int ebx; | ||
642 | union cpuid10_eax eax; | ||
643 | unsigned int unused; | ||
644 | unsigned int perfctr_msr, evntsel_msr; | ||
645 | unsigned int evntsel; | ||
646 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
647 | |||
648 | /* | ||
649 | * Check whether the Architectural PerfMon supports | ||
650 | * Unhalted Core Cycles Event or not. | ||
651 | * NOTE: Corresponding bit = 0 in ebx indicates event present. | ||
652 | */ | ||
653 | cpuid(10, &(eax.full), &ebx, &unused, &unused); | ||
654 | if ((eax.split.mask_length < | ||
655 | (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || | ||
656 | (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) | ||
657 | return 0; | ||
658 | |||
659 | perfctr_msr = wd_ops->perfctr; | ||
660 | evntsel_msr = wd_ops->evntsel; | ||
661 | |||
662 | wrmsrl(perfctr_msr, 0UL); | ||
663 | |||
664 | evntsel = ARCH_PERFMON_EVENTSEL_INT | ||
665 | | ARCH_PERFMON_EVENTSEL_OS | ||
666 | | ARCH_PERFMON_EVENTSEL_USR | ||
667 | | ARCH_PERFMON_NMI_EVENT_SEL | ||
668 | | ARCH_PERFMON_NMI_EVENT_UMASK; | ||
669 | |||
670 | /* setup the timer */ | ||
671 | wrmsr(evntsel_msr, evntsel, 0); | ||
672 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); | ||
673 | write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); | ||
674 | |||
675 | wd->perfctr_msr = perfctr_msr; | ||
676 | wd->evntsel_msr = evntsel_msr; | ||
677 | wd->cccr_msr = 0; /* unused */ | ||
678 | |||
679 | /* ok, everything is initialized, announce that we're set */ | ||
680 | cpu_nmi_set_wd_enabled(); | ||
681 | |||
682 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
683 | evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; | ||
684 | wrmsr(evntsel_msr, evntsel, 0); | ||
685 | intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); | ||
686 | return 1; | ||
687 | } | ||
688 | |||
689 | static struct wd_ops intel_arch_wd_ops __read_mostly = { | ||
690 | .reserve = single_msr_reserve, | ||
691 | .unreserve = single_msr_unreserve, | ||
692 | .setup = setup_intel_arch_watchdog, | ||
693 | .rearm = p6_rearm, | ||
694 | .stop = single_msr_stop_watchdog, | ||
695 | .perfctr = MSR_ARCH_PERFMON_PERFCTR1, | ||
696 | .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, | ||
697 | }; | ||
698 | |||
699 | static void probe_nmi_watchdog(void) | ||
700 | { | ||
701 | switch (boot_cpu_data.x86_vendor) { | ||
702 | case X86_VENDOR_AMD: | ||
703 | if (boot_cpu_data.x86 == 6 || | ||
704 | (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) | ||
705 | wd_ops = &k7_wd_ops; | ||
706 | return; | ||
707 | case X86_VENDOR_INTEL: | ||
708 | /* Work around where perfctr1 doesn't have a working enable | ||
709 | * bit as described in the following errata: | ||
710 | * AE49 Core Duo and Intel Core Solo 65 nm | ||
711 | * AN49 Intel Pentium Dual-Core | ||
712 | * AF49 Dual-Core Intel Xeon Processor LV | ||
713 | */ | ||
714 | if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || | ||
715 | ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && | ||
716 | boot_cpu_data.x86_mask == 4))) { | ||
717 | intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; | ||
718 | intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; | ||
719 | } | ||
720 | if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { | ||
721 | wd_ops = &intel_arch_wd_ops; | ||
722 | break; | ||
723 | } | ||
724 | switch (boot_cpu_data.x86) { | ||
725 | case 6: | ||
726 | if (boot_cpu_data.x86_model > 13) | ||
727 | return; | ||
728 | |||
729 | wd_ops = &p6_wd_ops; | ||
730 | break; | ||
731 | case 15: | ||
732 | wd_ops = &p4_wd_ops; | ||
733 | break; | ||
734 | default: | ||
735 | return; | ||
736 | } | ||
737 | break; | ||
738 | } | ||
739 | } | ||
740 | |||
741 | /* Interface to nmi.c */ | ||
742 | |||
743 | int lapic_watchdog_init(unsigned nmi_hz) | ||
744 | { | ||
745 | if (!wd_ops) { | ||
746 | probe_nmi_watchdog(); | ||
747 | if (!wd_ops) { | ||
748 | printk(KERN_INFO "NMI watchdog: CPU not supported\n"); | ||
749 | return -1; | ||
750 | } | ||
751 | |||
752 | if (!wd_ops->reserve()) { | ||
753 | printk(KERN_ERR | ||
754 | "NMI watchdog: cannot reserve perfctrs\n"); | ||
755 | return -1; | ||
756 | } | ||
757 | } | ||
758 | |||
759 | if (!(wd_ops->setup(nmi_hz))) { | ||
760 | printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", | ||
761 | raw_smp_processor_id()); | ||
762 | return -1; | ||
763 | } | ||
764 | |||
765 | return 0; | ||
766 | } | ||
767 | |||
768 | void lapic_watchdog_stop(void) | ||
769 | { | ||
770 | if (wd_ops) | ||
771 | wd_ops->stop(); | ||
772 | } | ||
773 | |||
774 | unsigned lapic_adjust_nmi_hz(unsigned hz) | ||
775 | { | ||
776 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
777 | if (wd->perfctr_msr == MSR_P6_PERFCTR0 || | ||
778 | wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) | ||
779 | hz = adjust_for_32bit_ctr(hz); | ||
780 | return hz; | ||
781 | } | ||
782 | |||
783 | int __kprobes lapic_wd_event(unsigned nmi_hz) | ||
784 | { | ||
785 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | ||
786 | u64 ctr; | ||
787 | |||
788 | rdmsrl(wd->perfctr_msr, ctr); | ||
789 | if (ctr & wd_ops->checkbit) /* perfctr still running? */ | ||
790 | return 0; | ||
791 | |||
792 | wd_ops->rearm(wd, nmi_hz); | ||
793 | return 1; | ||
794 | } | ||
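
The watchdog implementation removed above arms each performance counter with the negative of (cpu_khz * 1000 / nmi_hz), so the counter overflows and raises an NMI roughly nmi_hz times per second; on P6/architectural-perfmon CPUs only 31-bit reload values are usable, so adjust_for_32bit_ctr() raises nmi_hz until the value fits. A minimal user-space sketch of that arithmetic (illustration only, no MSR access; the 3 GHz cpu_khz value is an assumed example):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cpu_khz = 3000000;	/* assumed 3 GHz CPU, for illustration */
	unsigned int nmi_hz = 1;	/* desired watchdog NMI rate */
	uint64_t count = cpu_khz * 1000 / nmi_hz;

	/* 31-bit limit on P6/arch-perfmon counters (adjust_for_32bit_ctr) */
	if (count > 0x7fffffffULL) {
		nmi_hz = (unsigned int)(cpu_khz * 1000 / 0x7fffffffULL) + 1;
		count = cpu_khz * 1000 / nmi_hz;
	}

	/* the kernel then does: wrmsrl(perfctr_msr, 0 - count); */
	printf("nmi_hz = %u, reload value = -0x%08llx\n",
	       nmi_hz, (unsigned long long)count);
	return 0;
}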
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 1b7b31ab7d86..212a6a42527c 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -33,7 +33,6 @@ | |||
33 | #include <linux/init.h> | 33 | #include <linux/init.h> |
34 | #include <linux/poll.h> | 34 | #include <linux/poll.h> |
35 | #include <linux/smp.h> | 35 | #include <linux/smp.h> |
36 | #include <linux/smp_lock.h> | ||
37 | #include <linux/major.h> | 36 | #include <linux/major.h> |
38 | #include <linux/fs.h> | 37 | #include <linux/fs.h> |
39 | #include <linux/device.h> | 38 | #include <linux/device.h> |
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd52..df20723a6a1b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = { | |||
175 | 175 | ||
176 | void | 176 | void |
177 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | 177 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, |
178 | unsigned long *stack, unsigned long bp, char *log_lvl) | 178 | unsigned long *stack, char *log_lvl) |
179 | { | 179 | { |
180 | printk("%sCall Trace:\n", log_lvl); | 180 | printk("%sCall Trace:\n", log_lvl); |
181 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | 181 | dump_trace(task, regs, stack, &print_trace_ops, log_lvl); |
182 | } | 182 | } |
183 | 183 | ||
184 | void show_trace(struct task_struct *task, struct pt_regs *regs, | 184 | void show_trace(struct task_struct *task, struct pt_regs *regs, |
185 | unsigned long *stack, unsigned long bp) | 185 | unsigned long *stack) |
186 | { | 186 | { |
187 | show_trace_log_lvl(task, regs, stack, bp, ""); | 187 | show_trace_log_lvl(task, regs, stack, ""); |
188 | } | 188 | } |
189 | 189 | ||
190 | void show_stack(struct task_struct *task, unsigned long *sp) | 190 | void show_stack(struct task_struct *task, unsigned long *sp) |
191 | { | 191 | { |
192 | show_stack_log_lvl(task, NULL, sp, 0, ""); | 192 | show_stack_log_lvl(task, NULL, sp, ""); |
193 | } | 193 | } |
194 | 194 | ||
195 | /* | 195 | /* |
@@ -197,20 +197,14 @@ void show_stack(struct task_struct *task, unsigned long *sp) | |||
197 | */ | 197 | */ |
198 | void dump_stack(void) | 198 | void dump_stack(void) |
199 | { | 199 | { |
200 | unsigned long bp = 0; | ||
201 | unsigned long stack; | 200 | unsigned long stack; |
202 | 201 | ||
203 | #ifdef CONFIG_FRAME_POINTER | ||
204 | if (!bp) | ||
205 | get_bp(bp); | ||
206 | #endif | ||
207 | |||
208 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | 202 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", |
209 | current->pid, current->comm, print_tainted(), | 203 | current->pid, current->comm, print_tainted(), |
210 | init_utsname()->release, | 204 | init_utsname()->release, |
211 | (int)strcspn(init_utsname()->version, " "), | 205 | (int)strcspn(init_utsname()->version, " "), |
212 | init_utsname()->version); | 206 | init_utsname()->version); |
213 | show_trace(NULL, NULL, &stack, bp); | 207 | show_trace(NULL, NULL, &stack); |
214 | } | 208 | } |
215 | EXPORT_SYMBOL(dump_stack); | 209 | EXPORT_SYMBOL(dump_stack); |
216 | 210 | ||
@@ -240,6 +234,7 @@ unsigned __kprobes long oops_begin(void) | |||
240 | bust_spinlocks(1); | 234 | bust_spinlocks(1); |
241 | return flags; | 235 | return flags; |
242 | } | 236 | } |
237 | EXPORT_SYMBOL_GPL(oops_begin); | ||
243 | 238 | ||
244 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | 239 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) |
245 | { | 240 | { |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 1bc7f75a5bda..74cc1eda384b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -17,11 +17,12 @@ | |||
17 | #include <asm/stacktrace.h> | 17 | #include <asm/stacktrace.h> |
18 | 18 | ||
19 | 19 | ||
20 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 20 | void dump_trace(struct task_struct *task, |
21 | unsigned long *stack, unsigned long bp, | 21 | struct pt_regs *regs, unsigned long *stack, |
22 | const struct stacktrace_ops *ops, void *data) | 22 | const struct stacktrace_ops *ops, void *data) |
23 | { | 23 | { |
24 | int graph = 0; | 24 | int graph = 0; |
25 | unsigned long bp; | ||
25 | 26 | ||
26 | if (!task) | 27 | if (!task) |
27 | task = current; | 28 | task = current; |
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, | |||
34 | stack = (unsigned long *)task->thread.sp; | 35 | stack = (unsigned long *)task->thread.sp; |
35 | } | 36 | } |
36 | 37 | ||
37 | #ifdef CONFIG_FRAME_POINTER | 38 | bp = stack_frame(task, regs); |
38 | if (!bp) { | ||
39 | if (task == current) { | ||
40 | /* Grab bp right from our regs */ | ||
41 | get_bp(bp); | ||
42 | } else { | ||
43 | /* bp is the last reg pushed by switch_to */ | ||
44 | bp = *(unsigned long *) task->thread.sp; | ||
45 | } | ||
46 | } | ||
47 | #endif | ||
48 | |||
49 | for (;;) { | 39 | for (;;) { |
50 | struct thread_info *context; | 40 | struct thread_info *context; |
51 | 41 | ||
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace); | |||
65 | 55 | ||
66 | void | 56 | void |
67 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | 57 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, |
68 | unsigned long *sp, unsigned long bp, char *log_lvl) | 58 | unsigned long *sp, char *log_lvl) |
69 | { | 59 | { |
70 | unsigned long *stack; | 60 | unsigned long *stack; |
71 | int i; | 61 | int i; |
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
87 | touch_nmi_watchdog(); | 77 | touch_nmi_watchdog(); |
88 | } | 78 | } |
89 | printk(KERN_CONT "\n"); | 79 | printk(KERN_CONT "\n"); |
90 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | 80 | show_trace_log_lvl(task, regs, sp, log_lvl); |
91 | } | 81 | } |
92 | 82 | ||
93 | 83 | ||
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs) | |||
112 | u8 *ip; | 102 | u8 *ip; |
113 | 103 | ||
114 | printk(KERN_EMERG "Stack:\n"); | 104 | printk(KERN_EMERG "Stack:\n"); |
115 | show_stack_log_lvl(NULL, regs, ®s->sp, | 105 | show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG); |
116 | 0, KERN_EMERG); | ||
117 | 106 | ||
118 | printk(KERN_EMERG "Code: "); | 107 | printk(KERN_EMERG "Code: "); |
119 | 108 | ||
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6a340485249a..a6b6fcf7f0ae 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, | |||
139 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | 139 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack |
140 | */ | 140 | */ |
141 | 141 | ||
142 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | 142 | void dump_trace(struct task_struct *task, |
143 | unsigned long *stack, unsigned long bp, | 143 | struct pt_regs *regs, unsigned long *stack, |
144 | const struct stacktrace_ops *ops, void *data) | 144 | const struct stacktrace_ops *ops, void *data) |
145 | { | 145 | { |
146 | const unsigned cpu = get_cpu(); | 146 | const unsigned cpu = get_cpu(); |
@@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, | |||
149 | unsigned used = 0; | 149 | unsigned used = 0; |
150 | struct thread_info *tinfo; | 150 | struct thread_info *tinfo; |
151 | int graph = 0; | 151 | int graph = 0; |
152 | unsigned long dummy; | ||
153 | unsigned long bp; | ||
152 | 154 | ||
153 | if (!task) | 155 | if (!task) |
154 | task = current; | 156 | task = current; |
155 | 157 | ||
156 | if (!stack) { | 158 | if (!stack) { |
157 | unsigned long dummy; | ||
158 | stack = &dummy; | 159 | stack = &dummy; |
159 | if (task && task != current) | 160 | if (task && task != current) |
160 | stack = (unsigned long *)task->thread.sp; | 161 | stack = (unsigned long *)task->thread.sp; |
161 | } | 162 | } |
162 | 163 | ||
163 | #ifdef CONFIG_FRAME_POINTER | 164 | bp = stack_frame(task, regs); |
164 | if (!bp) { | ||
165 | if (task == current) { | ||
166 | /* Grab bp right from our regs */ | ||
167 | get_bp(bp); | ||
168 | } else { | ||
169 | /* bp is the last reg pushed by switch_to */ | ||
170 | bp = *(unsigned long *) task->thread.sp; | ||
171 | } | ||
172 | } | ||
173 | #endif | ||
174 | |||
175 | /* | 165 | /* |
176 | * Print function call entries in all stacks, starting at the | 166 | * Print function call entries in all stacks, starting at the |
177 | * current stack address. If the stacks consist of nested | 167 | * current stack address. If the stacks consist of nested |
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace); | |||
235 | 225 | ||
236 | void | 226 | void |
237 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | 227 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, |
238 | unsigned long *sp, unsigned long bp, char *log_lvl) | 228 | unsigned long *sp, char *log_lvl) |
239 | { | 229 | { |
240 | unsigned long *irq_stack_end; | 230 | unsigned long *irq_stack_end; |
241 | unsigned long *irq_stack; | 231 | unsigned long *irq_stack; |
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | |||
279 | preempt_enable(); | 269 | preempt_enable(); |
280 | 270 | ||
281 | printk(KERN_CONT "\n"); | 271 | printk(KERN_CONT "\n"); |
282 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | 272 | show_trace_log_lvl(task, regs, sp, log_lvl); |
283 | } | 273 | } |
284 | 274 | ||
285 | void show_registers(struct pt_regs *regs) | 275 | void show_registers(struct pt_regs *regs) |
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs) | |||
308 | 298 | ||
309 | printk(KERN_EMERG "Stack:\n"); | 299 | printk(KERN_EMERG "Stack:\n"); |
310 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, | 300 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, |
311 | regs->bp, KERN_EMERG); | 301 | KERN_EMERG); |
312 | 302 | ||
313 | printk(KERN_EMERG "Code: "); | 303 | printk(KERN_EMERG "Code: "); |
314 | 304 | ||
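
The dumpstack changes above drop the explicit bp argument and ask stack_frame(task, regs) for the starting frame pointer instead; the helper itself lives outside these hunks (in asm/stacktrace.h in the corresponding tree). Judging from the #ifdef CONFIG_FRAME_POINTER blocks removed above, its behavior is roughly the following sketch (kernel context assumed, reusing the existing get_bp() macro):

#ifdef CONFIG_FRAME_POINTER
static inline unsigned long
stack_frame(struct task_struct *task, struct pt_regs *regs)
{
	unsigned long bp;

	if (regs)
		return regs->bp;	/* frame pointer at exception time */

	if (task == current) {
		get_bp(bp);		/* grab bp right from our regs */
		return bp;
	}

	/* bp is the last reg pushed by switch_to */
	return *(unsigned long *)task->thread.sp;
}
#else
static inline unsigned long
stack_frame(struct task_struct *task, struct pt_regs *regs)
{
	return 0;	/* without frame pointers there is nothing reliable to return */
}
#endif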
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c2b7ef7a34d..294f26da0c0c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
15 | #include <linux/pfn.h> | 15 | #include <linux/pfn.h> |
16 | #include <linux/suspend.h> | 16 | #include <linux/suspend.h> |
17 | #include <linux/acpi.h> | ||
17 | #include <linux/firmware-map.h> | 18 | #include <linux/firmware-map.h> |
18 | #include <linux/memblock.h> | 19 | #include <linux/memblock.h> |
19 | 20 | ||
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 59e175e89599..c8b4efad7ebb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -395,7 +395,7 @@ sysenter_past_esp: | |||
395 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words | 395 | * A tiny bit of offset fixup is necessary - 4*4 means the 4 words |
396 | * pushed above; +8 corresponds to copy_thread's esp0 setting. | 396 | * pushed above; +8 corresponds to copy_thread's esp0 setting. |
397 | */ | 397 | */ |
398 | pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) | 398 | pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) |
399 | CFI_REL_OFFSET eip, 0 | 399 | CFI_REL_OFFSET eip, 0 |
400 | 400 | ||
401 | pushl_cfi %eax | 401 | pushl_cfi %eax |
@@ -1406,6 +1406,16 @@ ENTRY(general_protection) | |||
1406 | CFI_ENDPROC | 1406 | CFI_ENDPROC |
1407 | END(general_protection) | 1407 | END(general_protection) |
1408 | 1408 | ||
1409 | #ifdef CONFIG_KVM_GUEST | ||
1410 | ENTRY(async_page_fault) | ||
1411 | RING0_EC_FRAME | ||
1412 | pushl $do_async_page_fault | ||
1413 | CFI_ADJUST_CFA_OFFSET 4 | ||
1414 | jmp error_code | ||
1415 | CFI_ENDPROC | ||
1416 | END(async_page_fault) | ||
1417 | #endif | ||
1418 | |||
1409 | /* | 1419 | /* |
1410 | * End of kprobes section | 1420 | * End of kprobes section |
1411 | */ | 1421 | */ |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe2690d71c0c..aed1ffbeb0c9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -295,20 +295,25 @@ ENDPROC(native_usergs_sysret64) | |||
295 | .endm | 295 | .endm |
296 | 296 | ||
297 | /* save partial stack frame */ | 297 | /* save partial stack frame */ |
298 | .pushsection .kprobes.text, "ax" | ||
298 | ENTRY(save_args) | 299 | ENTRY(save_args) |
299 | XCPT_FRAME | 300 | XCPT_FRAME |
300 | cld | 301 | cld |
301 | movq_cfi rdi, RDI+16-ARGOFFSET | 302 | /* |
302 | movq_cfi rsi, RSI+16-ARGOFFSET | 303 | * start from rbp in pt_regs and jump over |
303 | movq_cfi rdx, RDX+16-ARGOFFSET | 304 | * return address. |
304 | movq_cfi rcx, RCX+16-ARGOFFSET | 305 | */ |
305 | movq_cfi rax, RAX+16-ARGOFFSET | 306 | movq_cfi rdi, RDI+8-RBP |
306 | movq_cfi r8, R8+16-ARGOFFSET | 307 | movq_cfi rsi, RSI+8-RBP |
307 | movq_cfi r9, R9+16-ARGOFFSET | 308 | movq_cfi rdx, RDX+8-RBP |
308 | movq_cfi r10, R10+16-ARGOFFSET | 309 | movq_cfi rcx, RCX+8-RBP |
309 | movq_cfi r11, R11+16-ARGOFFSET | 310 | movq_cfi rax, RAX+8-RBP |
310 | 311 | movq_cfi r8, R8+8-RBP | |
311 | leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ | 312 | movq_cfi r9, R9+8-RBP |
313 | movq_cfi r10, R10+8-RBP | ||
314 | movq_cfi r11, R11+8-RBP | ||
315 | |||
316 | leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ | ||
312 | movq_cfi rbp, 8 /* push %rbp */ | 317 | movq_cfi rbp, 8 /* push %rbp */ |
313 | leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ | 318 | leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ |
314 | testl $3, CS(%rdi) | 319 | testl $3, CS(%rdi) |
@@ -334,6 +339,7 @@ ENTRY(save_args) | |||
334 | ret | 339 | ret |
335 | CFI_ENDPROC | 340 | CFI_ENDPROC |
336 | END(save_args) | 341 | END(save_args) |
342 | .popsection | ||
337 | 343 | ||
338 | ENTRY(save_rest) | 344 | ENTRY(save_rest) |
339 | PARTIAL_FRAME 1 REST_SKIP+8 | 345 | PARTIAL_FRAME 1 REST_SKIP+8 |
@@ -780,8 +786,9 @@ END(interrupt) | |||
780 | 786 | ||
781 | /* 0(%rsp): ~(interrupt number) */ | 787 | /* 0(%rsp): ~(interrupt number) */ |
782 | .macro interrupt func | 788 | .macro interrupt func |
783 | subq $ORIG_RAX-ARGOFFSET+8, %rsp | 789 | /* reserve pt_regs for scratch regs and rbp */ |
784 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 | 790 | subq $ORIG_RAX-RBP, %rsp |
791 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP | ||
785 | call save_args | 792 | call save_args |
786 | PARTIAL_FRAME 0 | 793 | PARTIAL_FRAME 0 |
787 | call \func | 794 | call \func |
@@ -806,9 +813,14 @@ ret_from_intr: | |||
806 | TRACE_IRQS_OFF | 813 | TRACE_IRQS_OFF |
807 | decl PER_CPU_VAR(irq_count) | 814 | decl PER_CPU_VAR(irq_count) |
808 | leaveq | 815 | leaveq |
816 | |||
809 | CFI_RESTORE rbp | 817 | CFI_RESTORE rbp |
810 | CFI_DEF_CFA_REGISTER rsp | 818 | CFI_DEF_CFA_REGISTER rsp |
811 | CFI_ADJUST_CFA_OFFSET -8 | 819 | CFI_ADJUST_CFA_OFFSET -8 |
820 | |||
821 | /* we did not save rbx, restore only from ARGOFFSET */ | ||
822 | addq $8, %rsp | ||
823 | CFI_ADJUST_CFA_OFFSET -8 | ||
812 | exit_intr: | 824 | exit_intr: |
813 | GET_THREAD_INFO(%rcx) | 825 | GET_THREAD_INFO(%rcx) |
814 | testl $3,CS-ARGOFFSET(%rsp) | 826 | testl $3,CS-ARGOFFSET(%rsp) |
@@ -1317,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment | |||
1317 | #endif | 1329 | #endif |
1318 | errorentry general_protection do_general_protection | 1330 | errorentry general_protection do_general_protection |
1319 | errorentry page_fault do_page_fault | 1331 | errorentry page_fault do_page_fault |
1332 | #ifdef CONFIG_KVM_GUEST | ||
1333 | errorentry async_page_fault do_async_page_fault | ||
1334 | #endif | ||
1320 | #ifdef CONFIG_X86_MCE | 1335 | #ifdef CONFIG_X86_MCE |
1321 | paranoidzeroentry machine_check *machine_check_vector(%rip) | 1336 | paranoidzeroentry machine_check *machine_check_vector(%rip) |
1322 | #endif | 1337 | #endif |
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3afb33f14d2d..382eb2936d4d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/list.h> | 21 | #include <linux/list.h> |
22 | #include <linux/module.h> | ||
22 | 23 | ||
23 | #include <trace/syscall.h> | 24 | #include <trace/syscall.h> |
24 | 25 | ||
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code); | |||
49 | int ftrace_arch_code_modify_prepare(void) | 50 | int ftrace_arch_code_modify_prepare(void) |
50 | { | 51 | { |
51 | set_kernel_text_rw(); | 52 | set_kernel_text_rw(); |
53 | set_all_modules_text_rw(); | ||
52 | modifying_code = 1; | 54 | modifying_code = 1; |
53 | return 0; | 55 | return 0; |
54 | } | 56 | } |
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void) | |||
56 | int ftrace_arch_code_modify_post_process(void) | 58 | int ftrace_arch_code_modify_post_process(void) |
57 | { | 59 | { |
58 | modifying_code = 0; | 60 | modifying_code = 0; |
61 | set_all_modules_text_ro(); | ||
59 | set_kernel_text_ro(); | 62 | set_kernel_text_ro(); |
60 | return 0; | 63 | return 0; |
61 | } | 64 | } |
@@ -167,9 +170,9 @@ static void ftrace_mod_code(void) | |||
167 | 170 | ||
168 | void ftrace_nmi_enter(void) | 171 | void ftrace_nmi_enter(void) |
169 | { | 172 | { |
170 | __get_cpu_var(save_modifying_code) = modifying_code; | 173 | __this_cpu_write(save_modifying_code, modifying_code); |
171 | 174 | ||
172 | if (!__get_cpu_var(save_modifying_code)) | 175 | if (!__this_cpu_read(save_modifying_code)) |
173 | return; | 176 | return; |
174 | 177 | ||
175 | if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { | 178 | if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { |
@@ -183,7 +186,7 @@ void ftrace_nmi_enter(void) | |||
183 | 186 | ||
184 | void ftrace_nmi_exit(void) | 187 | void ftrace_nmi_exit(void) |
185 | { | 188 | { |
186 | if (!__get_cpu_var(save_modifying_code)) | 189 | if (!__this_cpu_read(save_modifying_code)) |
187 | return; | 190 | return; |
188 | 191 | ||
189 | /* Finish all executions before clearing nmi_running */ | 192 | /* Finish all executions before clearing nmi_running */ |
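
Several hunks in this patch (ftrace.c here, plus irq.c, irq_32.c and hw_breakpoint.c further down) convert __get_cpu_var() accesses into __this_cpu_read()/__this_cpu_write() operations, which on x86 typically compile to a single %gs- (or %fs-) relative instruction instead of first computing this CPU's per-cpu offset. A small illustration of the pattern, using a hypothetical per-cpu variable rather than any symbol from this patch:

#include <linux/percpu.h>
#include <linux/kernel.h>

/* Hypothetical per-cpu variable, for illustration only. */
static DEFINE_PER_CPU(int, example_state);

static void example_update(int new_state)
{
	/* old style: computes this CPU's address, then stores through it */
	__get_cpu_var(example_state) = new_state;

	/* new style: one segment-prefixed store, no address computation */
	__this_cpu_write(example_state, new_state);

	if (__this_cpu_read(example_state) != new_state)
		pr_warn("example_state changed unexpectedly\n");
}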
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bcece91dd311..fc293dc8dc35 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -60,16 +60,18 @@ | |||
60 | #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) | 60 | #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) |
61 | #endif | 61 | #endif |
62 | 62 | ||
63 | /* Number of possible pages in the lowmem region */ | ||
64 | LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) | ||
65 | |||
63 | /* Enough space to fit pagetables for the low memory linear map */ | 66 | /* Enough space to fit pagetables for the low memory linear map */ |
64 | MAPPING_BEYOND_END = \ | 67 | MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT |
65 | PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT | ||
66 | 68 | ||
67 | /* | 69 | /* |
68 | * Worst-case size of the kernel mapping we need to make: | 70 | * Worst-case size of the kernel mapping we need to make: |
69 | * the worst-case size of the kernel itself, plus the extra we need | 71 | * a relocatable kernel can live anywhere in lowmem, so we need to be able |
70 | * to map for the linear map. | 72 | * to map all of lowmem. |
71 | */ | 73 | */ |
72 | KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT | 74 | KERNEL_PAGES = LOWMEM_PAGES |
73 | 75 | ||
74 | INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm | 76 | INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm |
75 | RESERVE_BRK(pagetables, INIT_MAP_SIZE) | 77 | RESERVE_BRK(pagetables, INIT_MAP_SIZE) |
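
With LOWMEM_PAGES, the early page tables reserved through RESERVE_BRK are sized to map all of lowmem rather than just the kernel image plus the linear-map tail, since a relocatable kernel may sit anywhere in lowmem. A worked example of the arithmetic, assuming the default __PAGE_OFFSET of 0xC0000000 with 4 KiB pages and non-PAE paging (PTRS_PER_PGD = 1024):

#include <stdio.h>

/* Assumed values: default 32-bit split, non-PAE paging. */
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PTRS_PER_PGD	1024UL
#define __PAGE_OFFSET	0xC0000000UL

#define PAGE_TABLE_SIZE(pages)	((pages) / PTRS_PER_PGD)

int main(void)
{
	unsigned long lowmem_pages =
		(unsigned long)(((1ULL << 32) - __PAGE_OFFSET) >> PAGE_SHIFT);
	unsigned long pt_pages = PAGE_TABLE_SIZE(lowmem_pages);

	printf("LOWMEM_PAGES     = 0x%lx (%lu MiB of lowmem)\n",
	       lowmem_pages, (lowmem_pages * PAGE_SIZE) >> 20);
	printf("page-table pages = %lu (%lu KiB reserved in the brk)\n",
	       pt_pages, pt_pages * PAGE_SIZE / 1024);
	return 0;
}

On such a configuration this reserves 256 page-table pages (about 1 MiB) up front, independent of the kernel image size; the PAE configuration uses a different PAGE_TABLE_SIZE() definition and needs roughly twice as many pages.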
@@ -124,7 +126,7 @@ ENTRY(startup_32) | |||
124 | movsl | 126 | movsl |
125 | movl pa(boot_params) + NEW_CL_POINTER,%esi | 127 | movl pa(boot_params) + NEW_CL_POINTER,%esi |
126 | andl %esi,%esi | 128 | andl %esi,%esi |
127 | jz 1f # No comand line | 129 | jz 1f # No command line |
128 | movl $pa(boot_command_line),%edi | 130 | movl $pa(boot_command_line),%edi |
129 | movl $(COMMAND_LINE_SIZE/4),%ecx | 131 | movl $(COMMAND_LINE_SIZE/4),%ecx |
130 | rep | 132 | rep |
@@ -137,39 +139,6 @@ ENTRY(startup_32) | |||
137 | movl %eax, pa(olpc_ofw_pgd) | 139 | movl %eax, pa(olpc_ofw_pgd) |
138 | #endif | 140 | #endif |
139 | 141 | ||
140 | #ifdef CONFIG_PARAVIRT | ||
141 | /* This can only trip for a broken bootloader... */ | ||
142 | cmpw $0x207, pa(boot_params + BP_version) | ||
143 | jb default_entry | ||
144 | |||
145 | /* Paravirt-compatible boot parameters. Look to see what architecture | ||
146 | we're booting under. */ | ||
147 | movl pa(boot_params + BP_hardware_subarch), %eax | ||
148 | cmpl $num_subarch_entries, %eax | ||
149 | jae bad_subarch | ||
150 | |||
151 | movl pa(subarch_entries)(,%eax,4), %eax | ||
152 | subl $__PAGE_OFFSET, %eax | ||
153 | jmp *%eax | ||
154 | |||
155 | bad_subarch: | ||
156 | WEAK(lguest_entry) | ||
157 | WEAK(xen_entry) | ||
158 | /* Unknown implementation; there's really | ||
159 | nothing we can do at this point. */ | ||
160 | ud2a | ||
161 | |||
162 | __INITDATA | ||
163 | |||
164 | subarch_entries: | ||
165 | .long default_entry /* normal x86/PC */ | ||
166 | .long lguest_entry /* lguest hypervisor */ | ||
167 | .long xen_entry /* Xen hypervisor */ | ||
168 | .long default_entry /* Moorestown MID */ | ||
169 | num_subarch_entries = (. - subarch_entries) / 4 | ||
170 | .previous | ||
171 | #endif /* CONFIG_PARAVIRT */ | ||
172 | |||
173 | /* | 142 | /* |
174 | * Initialize page tables. This creates a PDE and a set of page | 143 | * Initialize page tables. This creates a PDE and a set of page |
175 | * tables, which are located immediately beyond __brk_base. The variable | 144 | * tables, which are located immediately beyond __brk_base. The variable |
@@ -179,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4 | |||
179 | * | 148 | * |
180 | * Note that the stack is not yet set up! | 149 | * Note that the stack is not yet set up! |
181 | */ | 150 | */ |
182 | default_entry: | ||
183 | #ifdef CONFIG_X86_PAE | 151 | #ifdef CONFIG_X86_PAE |
184 | 152 | ||
185 | /* | 153 | /* |
@@ -259,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
259 | movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax | 227 | movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax |
260 | movl %eax,pa(initial_page_table+0xffc) | 228 | movl %eax,pa(initial_page_table+0xffc) |
261 | #endif | 229 | #endif |
262 | jmp 3f | 230 | |
231 | #ifdef CONFIG_PARAVIRT | ||
232 | /* This can only trip for a broken bootloader... */ | ||
233 | cmpw $0x207, pa(boot_params + BP_version) | ||
234 | jb default_entry | ||
235 | |||
236 | /* Paravirt-compatible boot parameters. Look to see what architecture | ||
237 | we're booting under. */ | ||
238 | movl pa(boot_params + BP_hardware_subarch), %eax | ||
239 | cmpl $num_subarch_entries, %eax | ||
240 | jae bad_subarch | ||
241 | |||
242 | movl pa(subarch_entries)(,%eax,4), %eax | ||
243 | subl $__PAGE_OFFSET, %eax | ||
244 | jmp *%eax | ||
245 | |||
246 | bad_subarch: | ||
247 | WEAK(lguest_entry) | ||
248 | WEAK(xen_entry) | ||
249 | /* Unknown implementation; there's really | ||
250 | nothing we can do at this point. */ | ||
251 | ud2a | ||
252 | |||
253 | __INITDATA | ||
254 | |||
255 | subarch_entries: | ||
256 | .long default_entry /* normal x86/PC */ | ||
257 | .long lguest_entry /* lguest hypervisor */ | ||
258 | .long xen_entry /* Xen hypervisor */ | ||
259 | .long default_entry /* Moorestown MID */ | ||
260 | num_subarch_entries = (. - subarch_entries) / 4 | ||
261 | .previous | ||
262 | #else | ||
263 | jmp default_entry | ||
264 | #endif /* CONFIG_PARAVIRT */ | ||
265 | |||
263 | /* | 266 | /* |
264 | * Non-boot CPU entry point; entered from trampoline.S | 267 | * Non-boot CPU entry point; entered from trampoline.S |
265 | * We can't lgdt here, because lgdt itself uses a data segment, but | 268 | * We can't lgdt here, because lgdt itself uses a data segment, but |
@@ -280,7 +283,7 @@ ENTRY(startup_32_smp) | |||
280 | movl %eax,%fs | 283 | movl %eax,%fs |
281 | movl %eax,%gs | 284 | movl %eax,%gs |
282 | #endif /* CONFIG_SMP */ | 285 | #endif /* CONFIG_SMP */ |
283 | 3: | 286 | default_entry: |
284 | 287 | ||
285 | /* | 288 | /* |
286 | * New page tables may be in 4Mbyte page mode and may | 289 | * New page tables may be in 4Mbyte page mode and may |
@@ -314,6 +317,10 @@ ENTRY(startup_32_smp) | |||
314 | subl $0x80000001, %eax | 317 | subl $0x80000001, %eax |
315 | cmpl $(0x8000ffff-0x80000001), %eax | 318 | cmpl $(0x8000ffff-0x80000001), %eax |
316 | ja 6f | 319 | ja 6f |
320 | |||
321 | /* Clear bogus XD_DISABLE bits */ | ||
322 | call verify_cpu | ||
323 | |||
317 | mov $0x80000001, %eax | 324 | mov $0x80000001, %eax |
318 | cpuid | 325 | cpuid |
319 | /* Execute Disable bit supported? */ | 326 | /* Execute Disable bit supported? */ |
@@ -609,6 +616,8 @@ ignore_int: | |||
609 | #endif | 616 | #endif |
610 | iret | 617 | iret |
611 | 618 | ||
619 | #include "verify_cpu.S" | ||
620 | |||
612 | __REFDATA | 621 | __REFDATA |
613 | .align 4 | 622 | .align 4 |
614 | ENTRY(initial_code) | 623 | ENTRY(initial_code) |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ae03cab4352e..4ff5968f12d2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -27,6 +27,9 @@ | |||
27 | #define HPET_DEV_FSB_CAP 0x1000 | 27 | #define HPET_DEV_FSB_CAP 0x1000 |
28 | #define HPET_DEV_PERI_CAP 0x2000 | 28 | #define HPET_DEV_PERI_CAP 0x2000 |
29 | 29 | ||
30 | #define HPET_MIN_CYCLES 128 | ||
31 | #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) | ||
32 | |||
30 | #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) | 33 | #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) |
31 | 34 | ||
32 | /* | 35 | /* |
@@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void) | |||
299 | /* Calculate the min / max delta */ | 302 | /* Calculate the min / max delta */ |
300 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, | 303 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, |
301 | &hpet_clockevent); | 304 | &hpet_clockevent); |
302 | /* 5 usec minimum reprogramming delta. */ | 305 | /* Setup minimum reprogramming delta. */ |
303 | hpet_clockevent.min_delta_ns = 5000; | 306 | hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, |
307 | &hpet_clockevent); | ||
304 | 308 | ||
305 | /* | 309 | /* |
306 | * Start hpet with the boot cpu mask and make it | 310 | * Start hpet with the boot cpu mask and make it |
@@ -393,22 +397,24 @@ static int hpet_next_event(unsigned long delta, | |||
393 | * the wraparound into account) nor a simple count down event | 397 | * the wraparound into account) nor a simple count down event |
394 | * mode. Further the write to the comparator register is | 398 | * mode. Further the write to the comparator register is |
395 | * delayed internally up to two HPET clock cycles in certain | 399 | * delayed internally up to two HPET clock cycles in certain |
396 | * chipsets (ATI, ICH9,10). We worked around that by reading | 400 | * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even |
397 | * back the compare register, but that required another | 401 | * longer delays. We worked around that by reading back the |
398 | * workaround for ICH9,10 chips where the first readout after | 402 | * compare register, but that required another workaround for |
399 | * write can return the old stale value. We already have a | 403 | * ICH9,10 chips where the first readout after write can |
400 | * minimum delta of 5us enforced, but a NMI or SMI hitting | 404 | * return the old stale value. We already had a minimum |
405 | * programming delta of 5us enforced, but a NMI or SMI hitting | ||
401 | * between the counter readout and the comparator write can | 406 | * between the counter readout and the comparator write can |
402 | * move us behind that point easily. Now instead of reading | 407 | * move us behind that point easily. Now instead of reading |
403 | * the compare register back several times, we make the ETIME | 408 | * the compare register back several times, we make the ETIME |
404 | * decision based on the following: Return ETIME if the | 409 | * decision based on the following: Return ETIME if the |
405 | * counter value after the write is less than 8 HPET cycles | 410 | * counter value after the write is less than HPET_MIN_CYCLES |
406 | * away from the event or if the counter is already ahead of | 411 | * away from the event or if the counter is already ahead of |
407 | * the event. | 412 | * the event. The minimum programming delta for the generic |
413 | * clockevents code is set to 1.5 * HPET_MIN_CYCLES. | ||
408 | */ | 414 | */ |
409 | res = (s32)(cnt - hpet_readl(HPET_COUNTER)); | 415 | res = (s32)(cnt - hpet_readl(HPET_COUNTER)); |
410 | 416 | ||
411 | return res < 8 ? -ETIME : 0; | 417 | return res < HPET_MIN_CYCLES ? -ETIME : 0; |
412 | } | 418 | } |
413 | 419 | ||
414 | static void hpet_legacy_set_mode(enum clock_event_mode mode, | 420 | static void hpet_legacy_set_mode(enum clock_event_mode mode, |
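
The hard-coded 5 us min_delta_ns above is replaced by a value derived from HPET_MIN_PROG_DELTA (1.5 * HPET_MIN_CYCLES = 192 HPET cycles), so the clockevents minimum and the -ETIME cutoff in hpet_next_event() are expressed in the same unit and keep a 1.5x safety margin. A rough worked example of what that means in time, assuming a common 14.31818 MHz HPET (the real rate is read from the hardware and converted with clockevent_delta2ns()):

#include <stdio.h>

#define HPET_MIN_CYCLES		128
#define HPET_MIN_PROG_DELTA	(HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))

int main(void)
{
	double hpet_hz = 14318180.0;		/* assumed frequency, not universal */
	double ns_per_cycle = 1e9 / hpet_hz;	/* ~69.8 ns per HPET cycle */

	printf("-ETIME cutoff         : %d cycles (~%.1f us)\n",
	       HPET_MIN_CYCLES, HPET_MIN_CYCLES * ns_per_cycle / 1000.0);
	printf("min programming delta : %d cycles (~%.1f us)\n",
	       HPET_MIN_PROG_DELTA, HPET_MIN_PROG_DELTA * ns_per_cycle / 1000.0);
	return 0;
}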
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index ff15c9dcc25d..02f07634d265 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c | |||
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) | |||
122 | return -EBUSY; | 122 | return -EBUSY; |
123 | 123 | ||
124 | set_debugreg(info->address, i); | 124 | set_debugreg(info->address, i); |
125 | __get_cpu_var(cpu_debugreg[i]) = info->address; | 125 | __this_cpu_write(cpu_debugreg[i], info->address); |
126 | 126 | ||
127 | dr7 = &__get_cpu_var(cpu_dr7); | 127 | dr7 = &__get_cpu_var(cpu_dr7); |
128 | *dr7 |= encode_dr7(i, info->len, info->type); | 128 | *dr7 |= encode_dr7(i, info->len, info->type); |
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) | |||
397 | 397 | ||
398 | void hw_breakpoint_restore(void) | 398 | void hw_breakpoint_restore(void) |
399 | { | 399 | { |
400 | set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); | 400 | set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); |
401 | set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); | 401 | set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); |
402 | set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); | 402 | set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); |
403 | set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); | 403 | set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); |
404 | set_debugreg(current->thread.debugreg6, 6); | 404 | set_debugreg(current->thread.debugreg6, 6); |
405 | set_debugreg(__get_cpu_var(cpu_dr7), 7); | 405 | set_debugreg(__this_cpu_read(cpu_dr7), 7); |
406 | } | 406 | } |
407 | EXPORT_SYMBOL_GPL(hw_breakpoint_restore); | 407 | EXPORT_SYMBOL_GPL(hw_breakpoint_restore); |
408 | 408 | ||
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) | |||
433 | dr6_p = (unsigned long *)ERR_PTR(args->err); | 433 | dr6_p = (unsigned long *)ERR_PTR(args->err); |
434 | dr6 = *dr6_p; | 434 | dr6 = *dr6_p; |
435 | 435 | ||
436 | /* If it's a single step, TRAP bits are random */ | ||
437 | if (dr6 & DR_STEP) | ||
438 | return NOTIFY_DONE; | ||
439 | |||
436 | /* Do an early return if no trap bits are set in DR6 */ | 440 | /* Do an early return if no trap bits are set in DR6 */ |
437 | if ((dr6 & DR_TRAP_BITS) == 0) | 441 | if ((dr6 & DR_TRAP_BITS) == 0) |
438 | return NOTIFY_DONE; | 442 | return NOTIFY_DONE; |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 58bb239a2fd7..e60c38cc0eed 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk) | |||
169 | set_stopped_child_used_math(tsk); | 169 | set_stopped_child_used_math(tsk); |
170 | return 0; | 170 | return 0; |
171 | } | 171 | } |
172 | EXPORT_SYMBOL_GPL(init_fpu); | ||
172 | 173 | ||
173 | /* | 174 | /* |
174 | * The xstateregs_active() routine is the same as the fpregs_active() routine, | 175 | * The xstateregs_active() routine is the same as the fpregs_active() routine, |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 83ec0175f986..52945da52a94 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> |
5 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel_stat.h> | 6 | #include <linux/kernel_stat.h> |
7 | #include <linux/of.h> | ||
7 | #include <linux/seq_file.h> | 8 | #include <linux/seq_file.h> |
8 | #include <linux/smp.h> | 9 | #include <linux/smp.h> |
9 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
@@ -234,7 +235,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) | |||
234 | exit_idle(); | 235 | exit_idle(); |
235 | irq_enter(); | 236 | irq_enter(); |
236 | 237 | ||
237 | irq = __get_cpu_var(vector_irq)[vector]; | 238 | irq = __this_cpu_read(vector_irq[vector]); |
238 | 239 | ||
239 | if (!handle_irq(irq, regs)) { | 240 | if (!handle_irq(irq, regs)) { |
240 | ack_APIC_irq(); | 241 | ack_APIC_irq(); |
@@ -275,6 +276,15 @@ void smp_x86_platform_ipi(struct pt_regs *regs) | |||
275 | 276 | ||
276 | EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); | 277 | EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); |
277 | 278 | ||
279 | #ifdef CONFIG_OF | ||
280 | unsigned int irq_create_of_mapping(struct device_node *controller, | ||
281 | const u32 *intspec, unsigned int intsize) | ||
282 | { | ||
283 | return intspec[0]; | ||
284 | } | ||
285 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); | ||
286 | #endif | ||
287 | |||
278 | #ifdef CONFIG_HOTPLUG_CPU | 288 | #ifdef CONFIG_HOTPLUG_CPU |
279 | /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ | 289 | /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ |
280 | void fixup_irqs(void) | 290 | void fixup_irqs(void) |
@@ -350,12 +360,12 @@ void fixup_irqs(void) | |||
350 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { | 360 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { |
351 | unsigned int irr; | 361 | unsigned int irr; |
352 | 362 | ||
353 | if (__get_cpu_var(vector_irq)[vector] < 0) | 363 | if (__this_cpu_read(vector_irq[vector]) < 0) |
354 | continue; | 364 | continue; |
355 | 365 | ||
356 | irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); | 366 | irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); |
357 | if (irr & (1 << (vector % 32))) { | 367 | if (irr & (1 << (vector % 32))) { |
358 | irq = __get_cpu_var(vector_irq)[vector]; | 368 | irq = __this_cpu_read(vector_irq[vector]); |
359 | 369 | ||
360 | data = irq_get_irq_data(irq); | 370 | data = irq_get_irq_data(irq); |
361 | raw_spin_lock(&desc->lock); | 371 | raw_spin_lock(&desc->lock); |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 96656f207751..9974d21048fd 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -79,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
79 | u32 *isp, arg1, arg2; | 79 | u32 *isp, arg1, arg2; |
80 | 80 | ||
81 | curctx = (union irq_ctx *) current_thread_info(); | 81 | curctx = (union irq_ctx *) current_thread_info(); |
82 | irqctx = __get_cpu_var(hardirq_ctx); | 82 | irqctx = __this_cpu_read(hardirq_ctx); |
83 | 83 | ||
84 | /* | 84 | /* |
85 | * this is where we switch to the IRQ stack. However, if we are | 85 | * this is where we switch to the IRQ stack. However, if we are |
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu) | |||
129 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), | 129 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), |
130 | THREAD_FLAGS, | 130 | THREAD_FLAGS, |
131 | THREAD_ORDER)); | 131 | THREAD_ORDER)); |
132 | irqctx->tinfo.task = NULL; | 132 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); |
133 | irqctx->tinfo.exec_domain = NULL; | ||
134 | irqctx->tinfo.cpu = cpu; | 133 | irqctx->tinfo.cpu = cpu; |
135 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; | 134 | irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; |
136 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 135 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu) | |||
140 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), | 139 | irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), |
141 | THREAD_FLAGS, | 140 | THREAD_FLAGS, |
142 | THREAD_ORDER)); | 141 | THREAD_ORDER)); |
143 | irqctx->tinfo.task = NULL; | 142 | memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); |
144 | irqctx->tinfo.exec_domain = NULL; | ||
145 | irqctx->tinfo.cpu = cpu; | 143 | irqctx->tinfo.cpu = cpu; |
146 | irqctx->tinfo.preempt_count = 0; | ||
147 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); | 144 | irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); |
148 | 145 | ||
149 | per_cpu(softirq_ctx, cpu) = irqctx; | 146 | per_cpu(softirq_ctx, cpu) = irqctx; |
@@ -166,7 +163,7 @@ asmlinkage void do_softirq(void) | |||
166 | 163 | ||
167 | if (local_softirq_pending()) { | 164 | if (local_softirq_pending()) { |
168 | curctx = current_thread_info(); | 165 | curctx = current_thread_info(); |
169 | irqctx = __get_cpu_var(softirq_ctx); | 166 | irqctx = __this_cpu_read(softirq_ctx); |
170 | irqctx->tinfo.task = curctx->task; | 167 | irqctx->tinfo.task = curctx->task; |
171 | irqctx->tinfo.previous_esp = current_stack_pointer; | 168 | irqctx->tinfo.previous_esp = current_stack_pointer; |
172 | 169 | ||
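The irq_32.c change zeroes the whole IRQ-stack thread_info with one memset() and then sets only the non-zero fields, instead of clearing task and exec_domain (and, for the softirq context, preempt_count) one by one. A minimal userspace sketch of the same zero-then-fill pattern, using a made-up struct rather than the kernel's thread_info:

    #include <stdio.h>
    #include <string.h>

    struct ctx_info {                    /* made-up stand-in for struct thread_info */
        void *task;
        void *exec_domain;
        int cpu;
        int preempt_count;
        unsigned long addr_limit;
    };

    int main(void)
    {
        struct ctx_info info;

        memset(&info, 0, sizeof(info));  /* every field starts out as 0/NULL ... */
        info.cpu = 1;                    /* ... then only the non-zero ones are set */
        info.addr_limit = ~0UL;

        printf("cpu=%d preempt_count=%d task=%p\n",
               info.cpu, info.preempt_count, info.task);
        return 0;
    }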
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ec592caac4b4..a4130005028a 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <asm/apicdef.h> | 48 | #include <asm/apicdef.h> |
49 | #include <asm/system.h> | 49 | #include <asm/system.h> |
50 | #include <asm/apic.h> | 50 | #include <asm/apic.h> |
51 | #include <asm/nmi.h> | ||
51 | 52 | ||
52 | struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = | 53 | struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = |
53 | { | 54 | { |
@@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void) | |||
315 | if (!breakinfo[i].enabled) | 316 | if (!breakinfo[i].enabled) |
316 | continue; | 317 | continue; |
317 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); | 318 | bp = *per_cpu_ptr(breakinfo[i].pev, cpu); |
318 | if (bp->attr.disabled == 1) | 319 | if (!bp->attr.disabled) { |
320 | arch_uninstall_hw_breakpoint(bp); | ||
321 | bp->attr.disabled = 1; | ||
319 | continue; | 322 | continue; |
323 | } | ||
320 | if (dbg_is_early) | 324 | if (dbg_is_early) |
321 | early_dr7 &= ~encode_dr7(i, breakinfo[i].len, | 325 | early_dr7 &= ~encode_dr7(i, breakinfo[i].len, |
322 | breakinfo[i].type); | 326 | breakinfo[i].type); |
323 | else | 327 | else if (hw_break_release_slot(i)) |
324 | arch_uninstall_hw_breakpoint(bp); | 328 | printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", |
325 | bp->attr.disabled = 1; | 329 | breakinfo[i].addr); |
330 | breakinfo[i].enabled = 0; | ||
326 | } | 331 | } |
327 | } | 332 | } |
328 | 333 | ||
@@ -521,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) | |||
521 | } | 526 | } |
522 | return NOTIFY_DONE; | 527 | return NOTIFY_DONE; |
523 | 528 | ||
524 | case DIE_NMI_IPI: | ||
525 | /* Just ignore, we will handle the roundup on DIE_NMI. */ | ||
526 | return NOTIFY_DONE; | ||
527 | |||
528 | case DIE_NMIUNKNOWN: | 529 | case DIE_NMIUNKNOWN: |
529 | if (was_in_debug_nmi[raw_smp_processor_id()]) { | 530 | if (was_in_debug_nmi[raw_smp_processor_id()]) { |
530 | was_in_debug_nmi[raw_smp_processor_id()] = 0; | 531 | was_in_debug_nmi[raw_smp_processor_id()] = 0; |
@@ -602,7 +603,7 @@ static struct notifier_block kgdb_notifier = { | |||
602 | /* | 603 | /* |
603 | * Lowest-prio notifier priority, we want to be notified last: | 604 | * Lowest-prio notifier priority, we want to be notified last: |
604 | */ | 605 | */ |
605 | .priority = -INT_MAX, | 606 | .priority = NMI_LOCAL_LOW_PRIOR, |
606 | }; | 607 | }; |
607 | 608 | ||
608 | /** | 609 | /** |
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1cbd54c0df99..d91c477b3f62 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c | |||
@@ -403,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) | |||
403 | 403 | ||
404 | static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) | 404 | static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) |
405 | { | 405 | { |
406 | __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; | 406 | __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); |
407 | kcb->kprobe_status = kcb->prev_kprobe.status; | 407 | kcb->kprobe_status = kcb->prev_kprobe.status; |
408 | kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; | 408 | kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; |
409 | kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; | 409 | kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; |
@@ -412,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) | |||
412 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, | 412 | static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, |
413 | struct kprobe_ctlblk *kcb) | 413 | struct kprobe_ctlblk *kcb) |
414 | { | 414 | { |
415 | __get_cpu_var(current_kprobe) = p; | 415 | __this_cpu_write(current_kprobe, p); |
416 | kcb->kprobe_saved_flags = kcb->kprobe_old_flags | 416 | kcb->kprobe_saved_flags = kcb->kprobe_old_flags |
417 | = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); | 417 | = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); |
418 | if (is_IF_modifier(p->ainsn.insn)) | 418 | if (is_IF_modifier(p->ainsn.insn)) |
@@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) | |||
586 | preempt_enable_no_resched(); | 586 | preempt_enable_no_resched(); |
587 | return 1; | 587 | return 1; |
588 | } else if (kprobe_running()) { | 588 | } else if (kprobe_running()) { |
589 | p = __get_cpu_var(current_kprobe); | 589 | p = __this_cpu_read(current_kprobe); |
590 | if (p->break_handler && p->break_handler(p, regs)) { | 590 | if (p->break_handler && p->break_handler(p, regs)) { |
591 | setup_singlestep(p, regs, kcb, 0); | 591 | setup_singlestep(p, regs, kcb, 0); |
592 | return 1; | 592 | return 1; |
@@ -759,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) | |||
759 | 759 | ||
760 | orig_ret_address = (unsigned long)ri->ret_addr; | 760 | orig_ret_address = (unsigned long)ri->ret_addr; |
761 | if (ri->rp && ri->rp->handler) { | 761 | if (ri->rp && ri->rp->handler) { |
762 | __get_cpu_var(current_kprobe) = &ri->rp->kp; | 762 | __this_cpu_write(current_kprobe, &ri->rp->kp); |
763 | get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; | 763 | get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; |
764 | ri->ret_addr = correct_ret_addr; | 764 | ri->ret_addr = correct_ret_addr; |
765 | ri->rp->handler(ri, regs); | 765 | ri->rp->handler(ri, regs); |
766 | __get_cpu_var(current_kprobe) = NULL; | 766 | __this_cpu_write(current_kprobe, NULL); |
767 | } | 767 | } |
768 | 768 | ||
769 | recycle_rp_inst(ri, &empty_rp); | 769 | recycle_rp_inst(ri, &empty_rp); |
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, | |||
1184 | { | 1184 | { |
1185 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); | 1185 | struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); |
1186 | 1186 | ||
1187 | /* This is possible if op is under delayed unoptimizing */ | ||
1188 | if (kprobe_disabled(&op->kp)) | ||
1189 | return; | ||
1190 | |||
1187 | preempt_disable(); | 1191 | preempt_disable(); |
1188 | if (kprobe_running()) { | 1192 | if (kprobe_running()) { |
1189 | kprobes_inc_nmissed_count(&op->kp); | 1193 | kprobes_inc_nmissed_count(&op->kp); |
@@ -1198,10 +1202,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, | |||
1198 | regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; | 1202 | regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; |
1199 | regs->orig_ax = ~0UL; | 1203 | regs->orig_ax = ~0UL; |
1200 | 1204 | ||
1201 | __get_cpu_var(current_kprobe) = &op->kp; | 1205 | __this_cpu_write(current_kprobe, &op->kp); |
1202 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; | 1206 | kcb->kprobe_status = KPROBE_HIT_ACTIVE; |
1203 | opt_pre_handler(&op->kp, regs); | 1207 | opt_pre_handler(&op->kp, regs); |
1204 | __get_cpu_var(current_kprobe) = NULL; | 1208 | __this_cpu_write(current_kprobe, NULL); |
1205 | } | 1209 | } |
1206 | preempt_enable_no_resched(); | 1210 | preempt_enable_no_resched(); |
1207 | } | 1211 | } |
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) | |||
1401 | return 0; | 1405 | return 0; |
1402 | } | 1406 | } |
1403 | 1407 | ||
1404 | /* Replace a breakpoint (int3) with a relative jump. */ | 1408 | #define MAX_OPTIMIZE_PROBES 256 |
1405 | int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) | 1409 | static struct text_poke_param *jump_poke_params; |
1410 | static struct jump_poke_buffer { | ||
1411 | u8 buf[RELATIVEJUMP_SIZE]; | ||
1412 | } *jump_poke_bufs; | ||
1413 | |||
1414 | static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, | ||
1415 | u8 *insn_buf, | ||
1416 | struct optimized_kprobe *op) | ||
1406 | { | 1417 | { |
1407 | unsigned char jmp_code[RELATIVEJUMP_SIZE]; | ||
1408 | s32 rel = (s32)((long)op->optinsn.insn - | 1418 | s32 rel = (s32)((long)op->optinsn.insn - |
1409 | ((long)op->kp.addr + RELATIVEJUMP_SIZE)); | 1419 | ((long)op->kp.addr + RELATIVEJUMP_SIZE)); |
1410 | 1420 | ||
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) | |||
1412 | memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, | 1422 | memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, |
1413 | RELATIVE_ADDR_SIZE); | 1423 | RELATIVE_ADDR_SIZE); |
1414 | 1424 | ||
1415 | jmp_code[0] = RELATIVEJUMP_OPCODE; | 1425 | insn_buf[0] = RELATIVEJUMP_OPCODE; |
1416 | *(s32 *)(&jmp_code[1]) = rel; | 1426 | *(s32 *)(&insn_buf[1]) = rel; |
1427 | |||
1428 | tprm->addr = op->kp.addr; | ||
1429 | tprm->opcode = insn_buf; | ||
1430 | tprm->len = RELATIVEJUMP_SIZE; | ||
1431 | } | ||
1432 | |||
1433 | /* | ||
1434 | * Replace breakpoints (int3) with relative jumps. | ||
1435 | * Caller must hold kprobe_mutex and text_mutex. | ||
1436 | */ | ||
1437 | void __kprobes arch_optimize_kprobes(struct list_head *oplist) | ||
1438 | { | ||
1439 | struct optimized_kprobe *op, *tmp; | ||
1440 | int c = 0; | ||
1441 | |||
1442 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
1443 | WARN_ON(kprobe_disabled(&op->kp)); | ||
1444 | /* Setup param */ | ||
1445 | setup_optimize_kprobe(&jump_poke_params[c], | ||
1446 | jump_poke_bufs[c].buf, op); | ||
1447 | list_del_init(&op->list); | ||
1448 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
1449 | break; | ||
1450 | } | ||
1417 | 1451 | ||
1418 | /* | 1452 | /* |
1419 | * text_poke_smp doesn't support NMI/MCE code modifying. | 1453 | * text_poke_smp doesn't support NMI/MCE code modifying. |
1420 | * However, since kprobes itself also doesn't support NMI/MCE | 1454 | * However, since kprobes itself also doesn't support NMI/MCE |
1421 | * code probing, it's not a problem. | 1455 | * code probing, it's not a problem. |
1422 | */ | 1456 | */ |
1423 | text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); | 1457 | text_poke_smp_batch(jump_poke_params, c); |
1424 | return 0; | 1458 | } |
1459 | |||
1460 | static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, | ||
1461 | u8 *insn_buf, | ||
1462 | struct optimized_kprobe *op) | ||
1463 | { | ||
1464 | /* Set int3 to first byte for kprobes */ | ||
1465 | insn_buf[0] = BREAKPOINT_INSTRUCTION; | ||
1466 | memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); | ||
1467 | |||
1468 | tprm->addr = op->kp.addr; | ||
1469 | tprm->opcode = insn_buf; | ||
1470 | tprm->len = RELATIVEJUMP_SIZE; | ||
1471 | } | ||
1472 | |||
1473 | /* | ||
1474 | * Recover original instructions and breakpoints from relative jumps. | ||
1475 | * Caller must hold kprobe_mutex. | ||
1476 | */ | ||
1477 | extern void arch_unoptimize_kprobes(struct list_head *oplist, | ||
1478 | struct list_head *done_list) | ||
1479 | { | ||
1480 | struct optimized_kprobe *op, *tmp; | ||
1481 | int c = 0; | ||
1482 | |||
1483 | list_for_each_entry_safe(op, tmp, oplist, list) { | ||
1484 | /* Setup param */ | ||
1485 | setup_unoptimize_kprobe(&jump_poke_params[c], | ||
1486 | jump_poke_bufs[c].buf, op); | ||
1487 | list_move(&op->list, done_list); | ||
1488 | if (++c >= MAX_OPTIMIZE_PROBES) | ||
1489 | break; | ||
1490 | } | ||
1491 | |||
1492 | /* | ||
1493 | * text_poke_smp doesn't support NMI/MCE code modifying. | ||
1494 | * However, since kprobes itself also doesn't support NMI/MCE | ||
1495 | * code probing, it's not a problem. | ||
1496 | */ | ||
1497 | text_poke_smp_batch(jump_poke_params, c); | ||
1425 | } | 1498 | } |
1426 | 1499 | ||
1427 | /* Replace a relative jump with a breakpoint (int3). */ | 1500 | /* Replace a relative jump with a breakpoint (int3). */ |
@@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p, | |||
1453 | } | 1526 | } |
1454 | return 0; | 1527 | return 0; |
1455 | } | 1528 | } |
1529 | |||
1530 | static int __kprobes init_poke_params(void) | ||
1531 | { | ||
1532 | /* Allocate code buffer and parameter array */ | ||
1533 | jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * | ||
1534 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
1535 | if (!jump_poke_bufs) | ||
1536 | return -ENOMEM; | ||
1537 | |||
1538 | jump_poke_params = kmalloc(sizeof(struct text_poke_param) * | ||
1539 | MAX_OPTIMIZE_PROBES, GFP_KERNEL); | ||
1540 | if (!jump_poke_params) { | ||
1541 | kfree(jump_poke_bufs); | ||
1542 | jump_poke_bufs = NULL; | ||
1543 | return -ENOMEM; | ||
1544 | } | ||
1545 | |||
1546 | return 0; | ||
1547 | } | ||
1548 | #else /* !CONFIG_OPTPROBES */ | ||
1549 | static int __kprobes init_poke_params(void) | ||
1550 | { | ||
1551 | return 0; | ||
1552 | } | ||
1456 | #endif | 1553 | #endif |
1457 | 1554 | ||
1458 | int __init arch_init_kprobes(void) | 1555 | int __init arch_init_kprobes(void) |
1459 | { | 1556 | { |
1460 | return 0; | 1557 | return init_poke_params(); |
1461 | } | 1558 | } |
1462 | 1559 | ||
1463 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) | 1560 | int __kprobes arch_trampoline_kprobe(struct kprobe *p) |
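The kprobes.c rework batches jump patching: instead of one text_poke_smp() stop-machine round per optimized probe, it fills an array of up to MAX_OPTIMIZE_PROBES text_poke_param entries from the pending list and commits them with a single text_poke_smp_batch() call. A self-contained sketch of that collect-then-commit loop; the structs and commit_batch() below are stand-ins, not the kernel's types:

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_BATCH 256                   /* mirrors MAX_OPTIMIZE_PROBES */

    struct op {                             /* stand-in for struct optimized_kprobe */
        struct op *next;
        unsigned long addr;
    };

    struct poke_param {                     /* stand-in for struct text_poke_param */
        unsigned long addr;
        unsigned char opcode[5];            /* RELATIVEJUMP_SIZE bytes */
    };

    /* Stand-in for text_poke_smp_batch(): apply every patch in one pass. */
    static void commit_batch(struct poke_param *params, int n)
    {
        for (int i = 0; i < n; i++)
            printf("patching %#lx\n", params[i].addr);
    }

    static void optimize_all(struct op **pending)
    {
        static struct poke_param params[MAX_BATCH];
        int c = 0;

        while (*pending && c < MAX_BATCH) {
            struct op *op = *pending;

            params[c].addr = op->addr;      /* setup_optimize_kprobe() analogue */
            params[c].opcode[0] = 0xe9;     /* RELATIVEJUMP_OPCODE */
            *pending = op->next;            /* list_del_init() analogue */
            free(op);
            c++;
        }
        commit_batch(params, c);            /* one expensive sync, not one per probe */
    }

    int main(void)
    {
        struct op *head = NULL;

        for (int i = 0; i < 3; i++) {
            struct op *op = malloc(sizeof(*op));
            op->addr = 0x1000 + 16ul * i;
            op->next = head;
            head = op;
        }
        optimize_all(&head);
        return 0;
    }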
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 63b0ec8d3d4a..8dc44662394b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -27,16 +27,37 @@ | |||
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
30 | #include <linux/notifier.h> | ||
31 | #include <linux/reboot.h> | ||
32 | #include <linux/hash.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <linux/slab.h> | ||
35 | #include <linux/kprobes.h> | ||
30 | #include <asm/timer.h> | 36 | #include <asm/timer.h> |
37 | #include <asm/cpu.h> | ||
38 | #include <asm/traps.h> | ||
39 | #include <asm/desc.h> | ||
40 | #include <asm/tlbflush.h> | ||
31 | 41 | ||
32 | #define MMU_QUEUE_SIZE 1024 | 42 | #define MMU_QUEUE_SIZE 1024 |
33 | 43 | ||
44 | static int kvmapf = 1; | ||
45 | |||
46 | static int parse_no_kvmapf(char *arg) | ||
47 | { | ||
48 | kvmapf = 0; | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | early_param("no-kvmapf", parse_no_kvmapf); | ||
53 | |||
34 | struct kvm_para_state { | 54 | struct kvm_para_state { |
35 | u8 mmu_queue[MMU_QUEUE_SIZE]; | 55 | u8 mmu_queue[MMU_QUEUE_SIZE]; |
36 | int mmu_queue_len; | 56 | int mmu_queue_len; |
37 | }; | 57 | }; |
38 | 58 | ||
39 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | 59 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); |
60 | static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); | ||
40 | 61 | ||
41 | static struct kvm_para_state *kvm_para_state(void) | 62 | static struct kvm_para_state *kvm_para_state(void) |
42 | { | 63 | { |
@@ -50,6 +71,195 @@ static void kvm_io_delay(void) | |||
50 | { | 71 | { |
51 | } | 72 | } |
52 | 73 | ||
74 | #define KVM_TASK_SLEEP_HASHBITS 8 | ||
75 | #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) | ||
76 | |||
77 | struct kvm_task_sleep_node { | ||
78 | struct hlist_node link; | ||
79 | wait_queue_head_t wq; | ||
80 | u32 token; | ||
81 | int cpu; | ||
82 | bool halted; | ||
83 | struct mm_struct *mm; | ||
84 | }; | ||
85 | |||
86 | static struct kvm_task_sleep_head { | ||
87 | spinlock_t lock; | ||
88 | struct hlist_head list; | ||
89 | } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; | ||
90 | |||
91 | static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, | ||
92 | u32 token) | ||
93 | { | ||
94 | struct hlist_node *p; | ||
95 | |||
96 | hlist_for_each(p, &b->list) { | ||
97 | struct kvm_task_sleep_node *n = | ||
98 | hlist_entry(p, typeof(*n), link); | ||
99 | if (n->token == token) | ||
100 | return n; | ||
101 | } | ||
102 | |||
103 | return NULL; | ||
104 | } | ||
105 | |||
106 | void kvm_async_pf_task_wait(u32 token) | ||
107 | { | ||
108 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); | ||
109 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | ||
110 | struct kvm_task_sleep_node n, *e; | ||
111 | DEFINE_WAIT(wait); | ||
112 | int cpu, idle; | ||
113 | |||
114 | cpu = get_cpu(); | ||
115 | idle = idle_cpu(cpu); | ||
116 | put_cpu(); | ||
117 | |||
118 | spin_lock(&b->lock); | ||
119 | e = _find_apf_task(b, token); | ||
120 | if (e) { | ||
121 | /* dummy entry exists -> wake-up was delivered ahead of the PF */ | ||
122 | hlist_del(&e->link); | ||
123 | kfree(e); | ||
124 | spin_unlock(&b->lock); | ||
125 | return; | ||
126 | } | ||
127 | |||
128 | n.token = token; | ||
129 | n.cpu = smp_processor_id(); | ||
130 | n.mm = current->active_mm; | ||
131 | n.halted = idle || preempt_count() > 1; | ||
132 | atomic_inc(&n.mm->mm_count); | ||
133 | init_waitqueue_head(&n.wq); | ||
134 | hlist_add_head(&n.link, &b->list); | ||
135 | spin_unlock(&b->lock); | ||
136 | |||
137 | for (;;) { | ||
138 | if (!n.halted) | ||
139 | prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); | ||
140 | if (hlist_unhashed(&n.link)) | ||
141 | break; | ||
142 | |||
143 | if (!n.halted) { | ||
144 | local_irq_enable(); | ||
145 | schedule(); | ||
146 | local_irq_disable(); | ||
147 | } else { | ||
148 | /* | ||
149 | * We cannot reschedule. So halt. | ||
150 | */ | ||
151 | native_safe_halt(); | ||
152 | local_irq_disable(); | ||
153 | } | ||
154 | } | ||
155 | if (!n.halted) | ||
156 | finish_wait(&n.wq, &wait); | ||
157 | |||
158 | return; | ||
159 | } | ||
160 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); | ||
161 | |||
162 | static void apf_task_wake_one(struct kvm_task_sleep_node *n) | ||
163 | { | ||
164 | hlist_del_init(&n->link); | ||
165 | if (!n->mm) | ||
166 | return; | ||
167 | mmdrop(n->mm); | ||
168 | if (n->halted) | ||
169 | smp_send_reschedule(n->cpu); | ||
170 | else if (waitqueue_active(&n->wq)) | ||
171 | wake_up(&n->wq); | ||
172 | } | ||
173 | |||
174 | static void apf_task_wake_all(void) | ||
175 | { | ||
176 | int i; | ||
177 | |||
178 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { | ||
179 | struct hlist_node *p, *next; | ||
180 | struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; | ||
181 | spin_lock(&b->lock); | ||
182 | hlist_for_each_safe(p, next, &b->list) { | ||
183 | struct kvm_task_sleep_node *n = | ||
184 | hlist_entry(p, typeof(*n), link); | ||
185 | if (n->cpu == smp_processor_id()) | ||
186 | apf_task_wake_one(n); | ||
187 | } | ||
188 | spin_unlock(&b->lock); | ||
189 | } | ||
190 | } | ||
191 | |||
192 | void kvm_async_pf_task_wake(u32 token) | ||
193 | { | ||
194 | u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); | ||
195 | struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; | ||
196 | struct kvm_task_sleep_node *n; | ||
197 | |||
198 | if (token == ~0) { | ||
199 | apf_task_wake_all(); | ||
200 | return; | ||
201 | } | ||
202 | |||
203 | again: | ||
204 | spin_lock(&b->lock); | ||
205 | n = _find_apf_task(b, token); | ||
206 | if (!n) { | ||
207 | /* | ||
208 | * async PF was not yet handled. | ||
209 | * Add dummy entry for the token. | ||
210 | */ | ||
211 | n = kmalloc(sizeof(*n), GFP_ATOMIC); | ||
212 | if (!n) { | ||
213 | /* | ||
214 | * Allocation failed! Busy wait while other cpu | ||
215 | * handles async PF. | ||
216 | */ | ||
217 | spin_unlock(&b->lock); | ||
218 | cpu_relax(); | ||
219 | goto again; | ||
220 | } | ||
221 | n->token = token; | ||
222 | n->cpu = smp_processor_id(); | ||
223 | n->mm = NULL; | ||
224 | init_waitqueue_head(&n->wq); | ||
225 | hlist_add_head(&n->link, &b->list); | ||
226 | } else | ||
227 | apf_task_wake_one(n); | ||
228 | spin_unlock(&b->lock); | ||
229 | return; | ||
230 | } | ||
231 | EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); | ||
232 | |||
233 | u32 kvm_read_and_reset_pf_reason(void) | ||
234 | { | ||
235 | u32 reason = 0; | ||
236 | |||
237 | if (__get_cpu_var(apf_reason).enabled) { | ||
238 | reason = __get_cpu_var(apf_reason).reason; | ||
239 | __get_cpu_var(apf_reason).reason = 0; | ||
240 | } | ||
241 | |||
242 | return reason; | ||
243 | } | ||
244 | EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); | ||
245 | |||
246 | dotraplinkage void __kprobes | ||
247 | do_async_page_fault(struct pt_regs *regs, unsigned long error_code) | ||
248 | { | ||
249 | switch (kvm_read_and_reset_pf_reason()) { | ||
250 | default: | ||
251 | do_page_fault(regs, error_code); | ||
252 | break; | ||
253 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | ||
254 | /* page is swapped out by the host. */ | ||
255 | kvm_async_pf_task_wait((u32)read_cr2()); | ||
256 | break; | ||
257 | case KVM_PV_REASON_PAGE_READY: | ||
258 | kvm_async_pf_task_wake((u32)read_cr2()); | ||
259 | break; | ||
260 | } | ||
261 | } | ||
262 | |||
53 | static void kvm_mmu_op(void *buffer, unsigned len) | 263 | static void kvm_mmu_op(void *buffer, unsigned len) |
54 | { | 264 | { |
55 | int r; | 265 | int r; |
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void) | |||
231 | #endif | 441 | #endif |
232 | } | 442 | } |
233 | 443 | ||
444 | void __cpuinit kvm_guest_cpu_init(void) | ||
445 | { | ||
446 | if (!kvm_para_available()) | ||
447 | return; | ||
448 | |||
449 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { | ||
450 | u64 pa = __pa(&__get_cpu_var(apf_reason)); | ||
451 | |||
452 | #ifdef CONFIG_PREEMPT | ||
453 | pa |= KVM_ASYNC_PF_SEND_ALWAYS; | ||
454 | #endif | ||
455 | wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); | ||
456 | __get_cpu_var(apf_reason).enabled = 1; | ||
457 | printk(KERN_INFO"KVM setup async PF for cpu %d\n", | ||
458 | smp_processor_id()); | ||
459 | } | ||
460 | } | ||
461 | |||
462 | static void kvm_pv_disable_apf(void *unused) | ||
463 | { | ||
464 | if (!__get_cpu_var(apf_reason).enabled) | ||
465 | return; | ||
466 | |||
467 | wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); | ||
468 | __get_cpu_var(apf_reason).enabled = 0; | ||
469 | |||
470 | printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", | ||
471 | smp_processor_id()); | ||
472 | } | ||
473 | |||
474 | static int kvm_pv_reboot_notify(struct notifier_block *nb, | ||
475 | unsigned long code, void *unused) | ||
476 | { | ||
477 | if (code == SYS_RESTART) | ||
478 | on_each_cpu(kvm_pv_disable_apf, NULL, 1); | ||
479 | return NOTIFY_DONE; | ||
480 | } | ||
481 | |||
482 | static struct notifier_block kvm_pv_reboot_nb = { | ||
483 | .notifier_call = kvm_pv_reboot_notify, | ||
484 | }; | ||
485 | |||
486 | #ifdef CONFIG_SMP | ||
487 | static void __init kvm_smp_prepare_boot_cpu(void) | ||
488 | { | ||
489 | #ifdef CONFIG_KVM_CLOCK | ||
490 | WARN_ON(kvm_register_clock("primary cpu clock")); | ||
491 | #endif | ||
492 | kvm_guest_cpu_init(); | ||
493 | native_smp_prepare_boot_cpu(); | ||
494 | } | ||
495 | |||
496 | static void kvm_guest_cpu_online(void *dummy) | ||
497 | { | ||
498 | kvm_guest_cpu_init(); | ||
499 | } | ||
500 | |||
501 | static void kvm_guest_cpu_offline(void *dummy) | ||
502 | { | ||
503 | kvm_pv_disable_apf(NULL); | ||
504 | apf_task_wake_all(); | ||
505 | } | ||
506 | |||
507 | static int __cpuinit kvm_cpu_notify(struct notifier_block *self, | ||
508 | unsigned long action, void *hcpu) | ||
509 | { | ||
510 | int cpu = (unsigned long)hcpu; | ||
511 | switch (action) { | ||
512 | case CPU_ONLINE: | ||
513 | case CPU_DOWN_FAILED: | ||
514 | case CPU_ONLINE_FROZEN: | ||
515 | smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); | ||
516 | break; | ||
517 | case CPU_DOWN_PREPARE: | ||
518 | case CPU_DOWN_PREPARE_FROZEN: | ||
519 | smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); | ||
520 | break; | ||
521 | default: | ||
522 | break; | ||
523 | } | ||
524 | return NOTIFY_OK; | ||
525 | } | ||
526 | |||
527 | static struct notifier_block __cpuinitdata kvm_cpu_notifier = { | ||
528 | .notifier_call = kvm_cpu_notify, | ||
529 | }; | ||
530 | #endif | ||
531 | |||
532 | static void __init kvm_apf_trap_init(void) | ||
533 | { | ||
534 | set_intr_gate(14, &async_page_fault); | ||
535 | } | ||
536 | |||
234 | void __init kvm_guest_init(void) | 537 | void __init kvm_guest_init(void) |
235 | { | 538 | { |
539 | int i; | ||
540 | |||
236 | if (!kvm_para_available()) | 541 | if (!kvm_para_available()) |
237 | return; | 542 | return; |
238 | 543 | ||
239 | paravirt_ops_setup(); | 544 | paravirt_ops_setup(); |
545 | register_reboot_notifier(&kvm_pv_reboot_nb); | ||
546 | for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) | ||
547 | spin_lock_init(&async_pf_sleepers[i].lock); | ||
548 | if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) | ||
549 | x86_init.irqs.trap_init = kvm_apf_trap_init; | ||
550 | |||
551 | #ifdef CONFIG_SMP | ||
552 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | ||
553 | register_cpu_notifier(&kvm_cpu_notifier); | ||
554 | #else | ||
555 | kvm_guest_cpu_init(); | ||
556 | #endif | ||
240 | } | 557 | } |
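The async-PF support above parks each waiting task in a small hash table: hash_32(token, KVM_TASK_SLEEP_HASHBITS) selects one of 256 buckets, each bucket carries its own spinlock and hlist, and a dummy node is inserted when the wake-up arrives before the fault side has gone to sleep. A runnable userspace sketch of just the bucket selection and lookup; the multiplier is an illustrative golden-ratio-style constant rather than necessarily the kernel's, and the per-bucket locking is omitted:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define HASHBITS 8                      /* mirrors KVM_TASK_SLEEP_HASHBITS */
    #define HASHSIZE (1u << HASHBITS)

    struct sleep_node {                     /* stand-in for kvm_task_sleep_node */
        struct sleep_node *next;
        uint32_t token;
    };

    static struct sleep_node *buckets[HASHSIZE];

    /* Multiplicative hash in the spirit of the kernel's hash_32(). */
    static uint32_t hash_token(uint32_t val, unsigned int bits)
    {
        return (uint32_t)(val * 0x61C88647u) >> (32 - bits);
    }

    static struct sleep_node *find_node(uint32_t token)
    {
        struct sleep_node *n = buckets[hash_token(token, HASHBITS)];

        for (; n; n = n->next)
            if (n->token == token)
                return n;
        return NULL;                        /* caller would insert a dummy node here */
    }

    static void add_node(uint32_t token)
    {
        struct sleep_node *n = malloc(sizeof(*n));
        uint32_t key = hash_token(token, HASHBITS);

        n->token = token;
        n->next = buckets[key];
        buckets[key] = n;
    }

    int main(void)
    {
        add_node(0xdeadbeef);
        printf("0xdeadbeef found: %d\n", find_node(0xdeadbeef) != NULL);
        printf("0x12345678 found: %d\n", find_node(0x12345678) != NULL);
        return 0;
    }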
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca43ce31a19c..f98d3eafe07a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = { | |||
125 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | 125 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, |
126 | }; | 126 | }; |
127 | 127 | ||
128 | static int kvm_register_clock(char *txt) | 128 | int kvm_register_clock(char *txt) |
129 | { | 129 | { |
130 | int cpu = smp_processor_id(); | 130 | int cpu = smp_processor_id(); |
131 | int low, high, ret; | 131 | int low, high, ret; |
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) | |||
152 | } | 152 | } |
153 | #endif | 153 | #endif |
154 | 154 | ||
155 | #ifdef CONFIG_SMP | ||
156 | static void __init kvm_smp_prepare_boot_cpu(void) | ||
157 | { | ||
158 | WARN_ON(kvm_register_clock("primary cpu clock")); | ||
159 | native_smp_prepare_boot_cpu(); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | /* | 155 | /* |
164 | * After the clock is registered, the host will keep writing to the | 156 | * After the clock is registered, the host will keep writing to the |
165 | * registered memory location. If the guest happens to shutdown, this memory | 157 | * registered memory location. If the guest happens to shutdown, this memory |
@@ -206,9 +198,6 @@ void __init kvmclock_init(void) | |||
206 | x86_cpuinit.setup_percpu_clockev = | 198 | x86_cpuinit.setup_percpu_clockev = |
207 | kvm_setup_secondary_clock; | 199 | kvm_setup_secondary_clock; |
208 | #endif | 200 | #endif |
209 | #ifdef CONFIG_SMP | ||
210 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | ||
211 | #endif | ||
212 | machine_ops.shutdown = kvm_shutdown; | 201 | machine_ops.shutdown = kvm_shutdown; |
213 | #ifdef CONFIG_KEXEC | 202 | #ifdef CONFIG_KEXEC |
214 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 203 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c055c7d..0fe6d1a66c38 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c | |||
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu) | |||
155 | return 0; | 155 | return 0; |
156 | } | 156 | } |
157 | 157 | ||
158 | static int get_ucode_data(void *to, const u8 *from, size_t n) | ||
159 | { | ||
160 | memcpy(to, from, n); | ||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | static void * | 158 | static void * |
165 | get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) | 159 | get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) |
166 | { | 160 | { |
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) | |||
168 | u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; | 162 | u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; |
169 | void *mc; | 163 | void *mc; |
170 | 164 | ||
171 | if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) | 165 | get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR); |
172 | return NULL; | ||
173 | 166 | ||
174 | if (section_hdr[0] != UCODE_UCODE_TYPE) { | 167 | if (section_hdr[0] != UCODE_UCODE_TYPE) { |
175 | pr_err("error: invalid type field in container file section header\n"); | 168 | pr_err("error: invalid type field in container file section header\n"); |
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) | |||
183 | return NULL; | 176 | return NULL; |
184 | } | 177 | } |
185 | 178 | ||
186 | mc = vmalloc(UCODE_MAX_SIZE); | 179 | mc = vzalloc(UCODE_MAX_SIZE); |
187 | if (mc) { | 180 | if (!mc) |
188 | memset(mc, 0, UCODE_MAX_SIZE); | 181 | return NULL; |
189 | if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, | 182 | |
190 | total_size)) { | 183 | get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size); |
191 | vfree(mc); | 184 | *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; |
192 | mc = NULL; | 185 | |
193 | } else | ||
194 | *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; | ||
195 | } | ||
196 | return mc; | 186 | return mc; |
197 | } | 187 | } |
198 | 188 | ||
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf) | |||
202 | unsigned int *buf_pos = (unsigned int *)container_hdr; | 192 | unsigned int *buf_pos = (unsigned int *)container_hdr; |
203 | unsigned long size; | 193 | unsigned long size; |
204 | 194 | ||
205 | if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) | 195 | get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE); |
206 | return 0; | ||
207 | 196 | ||
208 | size = buf_pos[2]; | 197 | size = buf_pos[2]; |
209 | 198 | ||
@@ -212,17 +201,14 @@ static int install_equiv_cpu_table(const u8 *buf) | |||
212 | return 0; | 201 | return 0; |
213 | } | 202 | } |
214 | 203 | ||
215 | equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); | 204 | equiv_cpu_table = vmalloc(size); |
216 | if (!equiv_cpu_table) { | 205 | if (!equiv_cpu_table) { |
217 | pr_err("failed to allocate equivalent CPU table\n"); | 206 | pr_err("failed to allocate equivalent CPU table\n"); |
218 | return 0; | 207 | return 0; |
219 | } | 208 | } |
220 | 209 | ||
221 | buf += UCODE_CONTAINER_HEADER_SIZE; | 210 | buf += UCODE_CONTAINER_HEADER_SIZE; |
222 | if (get_ucode_data(equiv_cpu_table, buf, size)) { | 211 | get_ucode_data(equiv_cpu_table, buf, size); |
223 | vfree(equiv_cpu_table); | ||
224 | return 0; | ||
225 | } | ||
226 | 212 | ||
227 | return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ | 213 | return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ |
228 | } | 214 | } |
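Two simplifications in microcode_amd.c: vmalloc()+memset() collapses into vzalloc(), and get_ucode_data(), which could never fail, is dropped in favour of plain memcpy() at the call sites. The userspace equivalent of the first change is calloc() versus malloc()+memset(); a tiny sketch:

    #include <stdlib.h>
    #include <string.h>

    #define BUF_SIZE 4096                   /* stand-in for UCODE_MAX_SIZE */

    int main(void)
    {
        /* before: allocate, then clear by hand */
        unsigned char *a = malloc(BUF_SIZE);
        if (a)
            memset(a, 0, BUF_SIZE);

        /* after: one call that hands back zeroed memory (vzalloc() analogue) */
        unsigned char *b = calloc(1, BUF_SIZE);

        free(a);
        free(b);
        return 0;
    }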
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index dcb65cc0a053..1a1b606d3e92 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c | |||
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
364 | 364 | ||
365 | /* For performance reasons, reuse mc area when possible */ | 365 | /* For performance reasons, reuse mc area when possible */ |
366 | if (!mc || mc_size > curr_mc_size) { | 366 | if (!mc || mc_size > curr_mc_size) { |
367 | if (mc) | 367 | vfree(mc); |
368 | vfree(mc); | ||
369 | mc = vmalloc(mc_size); | 368 | mc = vmalloc(mc_size); |
370 | if (!mc) | 369 | if (!mc) |
371 | break; | 370 | break; |
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
374 | 373 | ||
375 | if (get_ucode_data(mc, ucode_ptr, mc_size) || | 374 | if (get_ucode_data(mc, ucode_ptr, mc_size) || |
376 | microcode_sanity_check(mc) < 0) { | 375 | microcode_sanity_check(mc) < 0) { |
377 | vfree(mc); | ||
378 | break; | 376 | break; |
379 | } | 377 | } |
380 | 378 | ||
381 | if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { | 379 | if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { |
382 | if (new_mc) | 380 | vfree(new_mc); |
383 | vfree(new_mc); | ||
384 | new_rev = mc_header.rev; | 381 | new_rev = mc_header.rev; |
385 | new_mc = mc; | 382 | new_mc = mc; |
386 | mc = NULL; /* trigger new vmalloc */ | 383 | mc = NULL; /* trigger new vmalloc */ |
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
390 | leftover -= mc_size; | 387 | leftover -= mc_size; |
391 | } | 388 | } |
392 | 389 | ||
393 | if (mc) | 390 | vfree(mc); |
394 | vfree(mc); | ||
395 | 391 | ||
396 | if (leftover) { | 392 | if (leftover) { |
397 | if (new_mc) | 393 | vfree(new_mc); |
398 | vfree(new_mc); | ||
399 | state = UCODE_ERROR; | 394 | state = UCODE_ERROR; |
400 | goto out; | 395 | goto out; |
401 | } | 396 | } |
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, | |||
405 | goto out; | 400 | goto out; |
406 | } | 401 | } |
407 | 402 | ||
408 | if (uci->mc) | 403 | vfree(uci->mc); |
409 | vfree(uci->mc); | ||
410 | uci->mc = (struct microcode_intel *)new_mc; | 404 | uci->mc = (struct microcode_intel *)new_mc; |
411 | 405 | ||
412 | pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", | 406 | pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", |
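The microcode_intel.c cleanup leans on vfree(NULL) being a no-op, so every "if (p) vfree(p)" guard can become a bare vfree(p). C's free() gives the same guarantee, which the sketch below illustrates:

    #include <stdlib.h>

    int main(void)
    {
        char *p = NULL;

        /* before: redundant NULL guard */
        if (p)
            free(p);

        /* after: free(NULL) is guaranteed to do nothing, so the guard goes away */
        free(p);
        return 0;
    }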
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 71825806cd44..ac861b8348e2 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c | |||
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe { | |||
25 | }; | 25 | }; |
26 | 26 | ||
27 | static u64 __cpuinitdata fam10h_pci_mmconf_base; | 27 | static u64 __cpuinitdata fam10h_pci_mmconf_base; |
28 | static int __cpuinitdata fam10h_pci_mmconf_base_status; | ||
29 | 28 | ||
30 | static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { | 29 | static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { |
31 | { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, | 30 | { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, |
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2) | |||
44 | return start1 - start2; | 43 | return start1 - start2; |
45 | } | 44 | } |
46 | 45 | ||
47 | /*[47:0] */ | 46 | #define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) |
48 | /* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ | 47 | #define MMCONF_MASK (~(MMCONF_UNIT - 1)) |
48 | #define MMCONF_SIZE (MMCONF_UNIT << 8) | ||
49 | /* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ | ||
49 | #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) | 50 | #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) |
50 | #define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) | 51 | #define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) |
51 | static void __cpuinit get_fam10h_pci_mmconf_base(void) | 52 | static void __cpuinit get_fam10h_pci_mmconf_base(void) |
52 | { | 53 | { |
53 | int i; | 54 | int i; |
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) | |||
64 | struct range range[8]; | 65 | struct range range[8]; |
65 | 66 | ||
66 | /* only try to get setting from BSP */ | 67 | /* only try to get setting from BSP */ |
67 | /* -1 or 1 */ | 68 | if (fam10h_pci_mmconf_base) |
68 | if (fam10h_pci_mmconf_base_status) | ||
69 | return; | 69 | return; |
70 | 70 | ||
71 | if (!early_pci_allowed()) | 71 | if (!early_pci_allowed()) |
72 | goto fail; | 72 | return; |
73 | 73 | ||
74 | found = 0; | 74 | found = 0; |
75 | for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { | 75 | for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { |
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) | |||
91 | } | 91 | } |
92 | 92 | ||
93 | if (!found) | 93 | if (!found) |
94 | goto fail; | 94 | return; |
95 | 95 | ||
96 | /* SYS_CFG */ | 96 | /* SYS_CFG */ |
97 | address = MSR_K8_SYSCFG; | 97 | address = MSR_K8_SYSCFG; |
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) | |||
99 | 99 | ||
100 | /* TOP_MEM2 is not enabled? */ | 100 | /* TOP_MEM2 is not enabled? */ |
101 | if (!(val & (1<<21))) { | 101 | if (!(val & (1<<21))) { |
102 | tom2 = 0; | 102 | tom2 = 1ULL << 32; |
103 | } else { | 103 | } else { |
104 | /* TOP_MEM2 */ | 104 | /* TOP_MEM2 */ |
105 | address = MSR_K8_TOP_MEM2; | 105 | address = MSR_K8_TOP_MEM2; |
106 | rdmsrl(address, val); | 106 | rdmsrl(address, val); |
107 | tom2 = val & (0xffffULL<<32); | 107 | tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); |
108 | } | 108 | } |
109 | 109 | ||
110 | if (base <= tom2) | 110 | if (base <= tom2) |
111 | base = tom2 + (1ULL<<32); | 111 | base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; |
112 | 112 | ||
113 | /* | 113 | /* |
114 | * need to check if the range is in the high mmio range that is | 114 | * need to check if the range is in the high mmio range that is |
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) | |||
123 | if (!(reg & 3)) | 123 | if (!(reg & 3)) |
124 | continue; | 124 | continue; |
125 | 125 | ||
126 | start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ | 126 | start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ |
127 | reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); | 127 | reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); |
128 | end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ | 128 | end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ |
129 | 129 | ||
130 | if (!end) | 130 | if (end < tom2) |
131 | continue; | 131 | continue; |
132 | 132 | ||
133 | range[hi_mmio_num].start = start; | 133 | range[hi_mmio_num].start = start; |
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) | |||
143 | 143 | ||
144 | if (range[hi_mmio_num - 1].end < base) | 144 | if (range[hi_mmio_num - 1].end < base) |
145 | goto out; | 145 | goto out; |
146 | if (range[0].start > base) | 146 | if (range[0].start > base + MMCONF_SIZE) |
147 | goto out; | 147 | goto out; |
148 | 148 | ||
149 | /* need to find one window */ | 149 | /* need to find one window */ |
150 | base = range[0].start - (1ULL << 32); | 150 | base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; |
151 | if ((base > tom2) && BASE_VALID(base)) | 151 | if ((base > tom2) && BASE_VALID(base)) |
152 | goto out; | 152 | goto out; |
153 | base = range[hi_mmio_num - 1].end + (1ULL << 32); | 153 | base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; |
154 | if ((base > tom2) && BASE_VALID(base)) | 154 | if (BASE_VALID(base)) |
155 | goto out; | 155 | goto out; |
156 | /* need to find window between ranges */ | 156 | /* need to find window between ranges */ |
157 | if (hi_mmio_num > 1) | 157 | for (i = 1; i < hi_mmio_num; i++) { |
158 | for (i = 0; i < hi_mmio_num - 1; i++) { | 158 | base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; |
159 | if (range[i + 1].start > (range[i].end + (1ULL << 32))) { | 159 | val = range[i].start & MMCONF_MASK; |
160 | base = range[i].end + (1ULL << 32); | 160 | if (val >= base + MMCONF_SIZE && BASE_VALID(base)) |
161 | if ((base > tom2) && BASE_VALID(base)) | 161 | goto out; |
162 | goto out; | ||
163 | } | ||
164 | } | 162 | } |
165 | |||
166 | fail: | ||
167 | fam10h_pci_mmconf_base_status = -1; | ||
168 | return; | 163 | return; |
164 | |||
169 | out: | 165 | out: |
170 | fam10h_pci_mmconf_base = base; | 166 | fam10h_pci_mmconf_base = base; |
171 | fam10h_pci_mmconf_base_status = 1; | ||
172 | } | 167 | } |
173 | 168 | ||
174 | void __cpuinit fam10h_check_enable_mmcfg(void) | 169 | void __cpuinit fam10h_check_enable_mmcfg(void) |
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) | |||
190 | 185 | ||
191 | /* only trust the one handle 256 buses, if acpi=off */ | 186 | /* only trust the one handle 256 buses, if acpi=off */ |
192 | if (!acpi_pci_disabled || busnbits >= 8) { | 187 | if (!acpi_pci_disabled || busnbits >= 8) { |
193 | u64 base; | 188 | u64 base = val & MMCONF_MASK; |
194 | base = val & (0xffffULL << 32); | 189 | |
195 | if (fam10h_pci_mmconf_base_status <= 0) { | 190 | if (!fam10h_pci_mmconf_base) { |
196 | fam10h_pci_mmconf_base = base; | 191 | fam10h_pci_mmconf_base = base; |
197 | fam10h_pci_mmconf_base_status = 1; | ||
198 | return; | 192 | return; |
199 | } else if (fam10h_pci_mmconf_base == base) | 193 | } else if (fam10h_pci_mmconf_base == base) |
200 | return; | 194 | return; |
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) | |||
206 | * with 256 buses | 200 | * with 256 buses |
207 | */ | 201 | */ |
208 | get_fam10h_pci_mmconf_base(); | 202 | get_fam10h_pci_mmconf_base(); |
209 | if (fam10h_pci_mmconf_base_status <= 0) | 203 | if (!fam10h_pci_mmconf_base) { |
204 | pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; | ||
210 | return; | 205 | return; |
206 | } | ||
211 | 207 | ||
212 | printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); | 208 | printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); |
213 | val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | | 209 | val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | |
@@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void) | |||
217 | wrmsrl(address, val); | 213 | wrmsrl(address, val); |
218 | } | 214 | } |
219 | 215 | ||
220 | static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) | 216 | static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) |
221 | { | 217 | { |
222 | pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; | 218 | pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; |
223 | return 0; | 219 | return 0; |
224 | } | 220 | } |
225 | 221 | ||
226 | static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { | 222 | static const struct dmi_system_id __initconst mmconf_dmi_table[] = { |
227 | { | 223 | { |
228 | .callback = set_check_enable_amd_mmconf, | 224 | .callback = set_check_enable_amd_mmconf, |
229 | .ident = "Sun Microsystems Machine", | 225 | .ident = "Sun Microsystems Machine", |
@@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { | |||
234 | {} | 230 | {} |
235 | }; | 231 | }; |
236 | 232 | ||
237 | void __cpuinit check_enable_amd_mmconf_dmi(void) | 233 | /* Called from a __cpuinit function, but only on the BSP. */ |
234 | void __ref check_enable_amd_mmconf_dmi(void) | ||
238 | { | 235 | { |
239 | dmi_check_system(mmconf_dmi_table); | 236 | dmi_check_system(mmconf_dmi_table); |
240 | } | 237 | } |
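The mmconf-fam10h_64.c rework replaces the ad-hoc "+ (1ULL << 32)" stepping with explicit unit arithmetic: MMCONF_UNIT is the base alignment, MMCONF_MASK clears the sub-unit bits, and "(x + 2 * MMCONF_UNIT - 1) & MMCONF_MASK" yields an aligned base at least one full unit above x. A small worked example of that rounding; the shift value below is only an assumption standing in for FAM10H_MMIO_CONF_BASE_SHIFT:

    #include <stdint.h>
    #include <stdio.h>

    #define MMCONF_SHIFT 20                          /* assumed stand-in for FAM10H_MMIO_CONF_BASE_SHIFT */
    #define MMCONF_UNIT  (1ULL << MMCONF_SHIFT)
    #define MMCONF_MASK  (~(MMCONF_UNIT - 1))

    int main(void)
    {
        uint64_t tom2 = 0x123456789ULL;              /* an unaligned top-of-memory value */
        uint64_t base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;

        /* base is MMCONF_UNIT-aligned and at least one full unit above tom2 */
        printf("tom2=%#llx -> base=%#llx\n",
               (unsigned long long)tom2, (unsigned long long)base);
        return 0;
    }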
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f2956091735..ab23f1ad4bf1 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c | |||
@@ -37,20 +37,11 @@ | |||
37 | 37 | ||
38 | void *module_alloc(unsigned long size) | 38 | void *module_alloc(unsigned long size) |
39 | { | 39 | { |
40 | struct vm_struct *area; | 40 | if (PAGE_ALIGN(size) > MODULES_LEN) |
41 | |||
42 | if (!size) | ||
43 | return NULL; | ||
44 | size = PAGE_ALIGN(size); | ||
45 | if (size > MODULES_LEN) | ||
46 | return NULL; | 41 | return NULL; |
47 | 42 | return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, | |
48 | area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); | 43 | GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
49 | if (!area) | 44 | -1, __builtin_return_address(0)); |
50 | return NULL; | ||
51 | |||
52 | return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, | ||
53 | PAGE_KERNEL_EXEC); | ||
54 | } | 45 | } |
55 | 46 | ||
56 | /* Free memory returned from module_alloc */ | 47 | /* Free memory returned from module_alloc */ |
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9af64d9c4b67..01b0f6d06451 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -118,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m) | |||
118 | 118 | ||
119 | static void __init MP_ioapic_info(struct mpc_ioapic *m) | 119 | static void __init MP_ioapic_info(struct mpc_ioapic *m) |
120 | { | 120 | { |
121 | if (!(m->flags & MPC_APIC_USABLE)) | 121 | if (m->flags & MPC_APIC_USABLE) |
122 | return; | 122 | mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); |
123 | |||
124 | printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", | ||
125 | m->apicid, m->apicver, m->apicaddr); | ||
126 | |||
127 | mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); | ||
128 | } | ||
129 | |||
130 | static void print_MP_intsrc_info(struct mpc_intsrc *m) | ||
131 | { | ||
132 | apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," | ||
133 | " IRQ %02x, APIC ID %x, APIC INT %02x\n", | ||
134 | m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, | ||
135 | m->srcbusirq, m->dstapic, m->dstirq); | ||
136 | } | 123 | } |
137 | 124 | ||
138 | static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) | 125 | static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) |
@@ -144,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) | |||
144 | mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); | 131 | mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); |
145 | } | 132 | } |
146 | 133 | ||
147 | static void __init assign_to_mp_irq(struct mpc_intsrc *m, | ||
148 | struct mpc_intsrc *mp_irq) | ||
149 | { | ||
150 | mp_irq->dstapic = m->dstapic; | ||
151 | mp_irq->type = m->type; | ||
152 | mp_irq->irqtype = m->irqtype; | ||
153 | mp_irq->irqflag = m->irqflag; | ||
154 | mp_irq->srcbus = m->srcbus; | ||
155 | mp_irq->srcbusirq = m->srcbusirq; | ||
156 | mp_irq->dstirq = m->dstirq; | ||
157 | } | ||
158 | |||
159 | static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq, | ||
160 | struct mpc_intsrc *m) | ||
161 | { | ||
162 | m->dstapic = mp_irq->dstapic; | ||
163 | m->type = mp_irq->type; | ||
164 | m->irqtype = mp_irq->irqtype; | ||
165 | m->irqflag = mp_irq->irqflag; | ||
166 | m->srcbus = mp_irq->srcbus; | ||
167 | m->srcbusirq = mp_irq->srcbusirq; | ||
168 | m->dstirq = mp_irq->dstirq; | ||
169 | } | ||
170 | |||
171 | static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq, | ||
172 | struct mpc_intsrc *m) | ||
173 | { | ||
174 | if (mp_irq->dstapic != m->dstapic) | ||
175 | return 1; | ||
176 | if (mp_irq->type != m->type) | ||
177 | return 2; | ||
178 | if (mp_irq->irqtype != m->irqtype) | ||
179 | return 3; | ||
180 | if (mp_irq->irqflag != m->irqflag) | ||
181 | return 4; | ||
182 | if (mp_irq->srcbus != m->srcbus) | ||
183 | return 5; | ||
184 | if (mp_irq->srcbusirq != m->srcbusirq) | ||
185 | return 6; | ||
186 | if (mp_irq->dstirq != m->dstirq) | ||
187 | return 7; | ||
188 | |||
189 | return 0; | ||
190 | } | ||
191 | |||
192 | static void __init MP_intsrc_info(struct mpc_intsrc *m) | ||
193 | { | ||
194 | int i; | ||
195 | |||
196 | print_MP_intsrc_info(m); | ||
197 | |||
198 | for (i = 0; i < mp_irq_entries; i++) { | ||
199 | if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) | ||
200 | return; | ||
201 | } | ||
202 | |||
203 | assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); | ||
204 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
205 | panic("Max # of irq sources exceeded!!\n"); | ||
206 | } | ||
207 | #else /* CONFIG_X86_IO_APIC */ | 134 | #else /* CONFIG_X86_IO_APIC */ |
208 | static inline void __init MP_bus_info(struct mpc_bus *m) {} | 135 | static inline void __init MP_bus_info(struct mpc_bus *m) {} |
209 | static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} | 136 | static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} |
210 | static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} | ||
211 | #endif /* CONFIG_X86_IO_APIC */ | 137 | #endif /* CONFIG_X86_IO_APIC */ |
212 | 138 | ||
213 | |||
214 | static void __init MP_lintsrc_info(struct mpc_lintsrc *m) | 139 | static void __init MP_lintsrc_info(struct mpc_lintsrc *m) |
215 | { | 140 | { |
216 | apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," | 141 | apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," |
@@ -222,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m) | |||
222 | /* | 147 | /* |
223 | * Read/parse the MPC | 148 | * Read/parse the MPC |
224 | */ | 149 | */ |
225 | |||
226 | static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) | 150 | static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) |
227 | { | 151 | { |
228 | 152 | ||
@@ -275,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) | |||
275 | 199 | ||
276 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } | 200 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } |
277 | 201 | ||
278 | static void __init smp_register_lapic_address(unsigned long address) | ||
279 | { | ||
280 | mp_lapic_addr = address; | ||
281 | |||
282 | set_fixmap_nocache(FIX_APIC_BASE, address); | ||
283 | if (boot_cpu_physical_apicid == -1U) { | ||
284 | boot_cpu_physical_apicid = read_apic_id(); | ||
285 | apic_version[boot_cpu_physical_apicid] = | ||
286 | GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
287 | } | ||
288 | } | ||
289 | |||
290 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | 202 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) |
291 | { | 203 | { |
292 | char str[16]; | 204 | char str[16]; |
@@ -301,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | |||
301 | #ifdef CONFIG_X86_32 | 213 | #ifdef CONFIG_X86_32 |
302 | generic_mps_oem_check(mpc, oem, str); | 214 | generic_mps_oem_check(mpc, oem, str); |
303 | #endif | 215 | #endif |
304 | /* save the local APIC address, it might be non-default */ | 216 | /* Initialize the lapic mapping */ |
305 | if (!acpi_lapic) | 217 | if (!acpi_lapic) |
306 | mp_lapic_addr = mpc->lapic; | 218 | register_lapic_address(mpc->lapic); |
307 | 219 | ||
308 | if (early) | 220 | if (early) |
309 | return 1; | 221 | return 1; |
310 | 222 | ||
311 | /* Initialize the lapic mapping */ | ||
312 | if (!acpi_lapic) | ||
313 | smp_register_lapic_address(mpc->lapic); | ||
314 | |||
315 | if (mpc->oemptr) | 223 | if (mpc->oemptr) |
316 | x86_init.mpparse.smp_read_mpc_oem(mpc); | 224 | x86_init.mpparse.smp_read_mpc_oem(mpc); |
317 | 225 | ||
@@ -337,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | |||
337 | skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); | 245 | skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); |
338 | break; | 246 | break; |
339 | case MP_INTSRC: | 247 | case MP_INTSRC: |
340 | MP_intsrc_info((struct mpc_intsrc *)mpt); | 248 | mp_save_irq((struct mpc_intsrc *)mpt); |
341 | skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); | 249 | skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); |
342 | break; | 250 | break; |
343 | case MP_LINTSRC: | 251 | case MP_LINTSRC: |
@@ -429,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) | |||
429 | 337 | ||
430 | intsrc.srcbusirq = i; | 338 | intsrc.srcbusirq = i; |
431 | intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ | 339 | intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ |
432 | MP_intsrc_info(&intsrc); | 340 | mp_save_irq(&intsrc); |
433 | } | 341 | } |
434 | 342 | ||
435 | intsrc.irqtype = mp_ExtINT; | 343 | intsrc.irqtype = mp_ExtINT; |
436 | intsrc.srcbusirq = 0; | 344 | intsrc.srcbusirq = 0; |
437 | intsrc.dstirq = 0; /* 8259A to INTIN0 */ | 345 | intsrc.dstirq = 0; /* 8259A to INTIN0 */ |
438 | MP_intsrc_info(&intsrc); | 346 | mp_save_irq(&intsrc); |
439 | } | 347 | } |
440 | 348 | ||
441 | 349 | ||
@@ -784,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) | |||
784 | int i; | 692 | int i; |
785 | 693 | ||
786 | apic_printk(APIC_VERBOSE, "OLD "); | 694 | apic_printk(APIC_VERBOSE, "OLD "); |
787 | print_MP_intsrc_info(m); | 695 | print_mp_irq_info(m); |
788 | 696 | ||
789 | i = get_MP_intsrc_index(m); | 697 | i = get_MP_intsrc_index(m); |
790 | if (i > 0) { | 698 | if (i > 0) { |
791 | assign_to_mpc_intsrc(&mp_irqs[i], m); | 699 | memcpy(m, &mp_irqs[i], sizeof(*m)); |
792 | apic_printk(APIC_VERBOSE, "NEW "); | 700 | apic_printk(APIC_VERBOSE, "NEW "); |
793 | print_mp_irq_info(&mp_irqs[i]); | 701 | print_mp_irq_info(&mp_irqs[i]); |
794 | return; | 702 | return; |
@@ -875,14 +783,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, | |||
875 | if (nr_m_spare > 0) { | 783 | if (nr_m_spare > 0) { |
876 | apic_printk(APIC_VERBOSE, "*NEW* found\n"); | 784 | apic_printk(APIC_VERBOSE, "*NEW* found\n"); |
877 | nr_m_spare--; | 785 | nr_m_spare--; |
878 | assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); | 786 | memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); |
879 | m_spare[nr_m_spare] = NULL; | 787 | m_spare[nr_m_spare] = NULL; |
880 | } else { | 788 | } else { |
881 | struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; | 789 | struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; |
882 | count += sizeof(struct mpc_intsrc); | 790 | count += sizeof(struct mpc_intsrc); |
883 | if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) | 791 | if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) |
884 | goto out; | 792 | goto out; |
885 | assign_to_mpc_intsrc(&mp_irqs[i], m); | 793 | memcpy(m, &mp_irqs[i], sizeof(*m)); |
886 | mpc->length = count; | 794 | mpc->length = count; |
887 | mpt += sizeof(struct mpc_intsrc); | 795 | mpt += sizeof(struct mpc_intsrc); |
888 | } | 796 | } |
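The mpparse.c cleanup drops the hand-rolled assign_to_mp_irq()/assign_to_mpc_intsrc() field copiers and uses memcpy(), which is equivalent because source and destination share the struct mpc_intsrc layout. A trivial userspace illustration with a made-up struct:

    #include <stdio.h>
    #include <string.h>

    struct intsrc {                     /* made-up stand-in for struct mpc_intsrc */
        unsigned char type, irqtype, srcbus, srcbusirq, dstapic, dstirq;
        unsigned short irqflag;
    };

    int main(void)
    {
        struct intsrc saved = { .type = 3, .srcbusirq = 9, .dstirq = 9 };
        struct intsrc m;

        memcpy(&m, &saved, sizeof(m));  /* replaces seven field-by-field assignments */
        printf("type=%u dstirq=%u\n", m.type, m.dstirq);
        return 0;
    }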
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bf2dc4c8f70..12fcbe2c143e 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/init.h> | 30 | #include <linux/init.h> |
31 | #include <linux/poll.h> | 31 | #include <linux/poll.h> |
32 | #include <linux/smp.h> | 32 | #include <linux/smp.h> |
33 | #include <linux/smp_lock.h> | ||
34 | #include <linux/major.h> | 33 | #include <linux/major.h> |
35 | #include <linux/fs.h> | 34 | #include <linux/fs.h> |
36 | #include <linux/device.h> | 35 | #include <linux/device.h> |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c5b250011fd4..869e1aeeb71b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
421 | .set_pte = native_set_pte, | 421 | .set_pte = native_set_pte, |
422 | .set_pte_at = native_set_pte_at, | 422 | .set_pte_at = native_set_pte_at, |
423 | .set_pmd = native_set_pmd, | 423 | .set_pmd = native_set_pmd, |
424 | .set_pmd_at = native_set_pmd_at, | ||
424 | .pte_update = paravirt_nop, | 425 | .pte_update = paravirt_nop, |
425 | .pte_update_defer = paravirt_nop, | 426 | .pte_update_defer = paravirt_nop, |
427 | .pmd_update = paravirt_nop, | ||
428 | .pmd_update_defer = paravirt_nop, | ||
426 | 429 | ||
427 | .ptep_modify_prot_start = __ptep_modify_prot_start, | 430 | .ptep_modify_prot_start = __ptep_modify_prot_start, |
428 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, | 431 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, |
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba0f0ca9f280..c01ffa5b9b87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -143,7 +143,7 @@ static void flush_gart(void) | |||
143 | 143 | ||
144 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | 144 | spin_lock_irqsave(&iommu_bitmap_lock, flags); |
145 | if (need_flush) { | 145 | if (need_flush) { |
146 | k8_flush_garts(); | 146 | amd_flush_garts(); |
147 | need_flush = false; | 147 | need_flush = false; |
148 | } | 148 | } |
149 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | 149 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); |
@@ -561,17 +561,17 @@ static void enable_gart_translations(void) | |||
561 | { | 561 | { |
562 | int i; | 562 | int i; |
563 | 563 | ||
564 | if (!k8_northbridges.gart_supported) | 564 | if (!amd_nb_has_feature(AMD_NB_GART)) |
565 | return; | 565 | return; |
566 | 566 | ||
567 | for (i = 0; i < k8_northbridges.num; i++) { | 567 | for (i = 0; i < amd_nb_num(); i++) { |
568 | struct pci_dev *dev = k8_northbridges.nb_misc[i]; | 568 | struct pci_dev *dev = node_to_amd_nb(i)->misc; |
569 | 569 | ||
570 | enable_gart_translation(dev, __pa(agp_gatt_table)); | 570 | enable_gart_translation(dev, __pa(agp_gatt_table)); |
571 | } | 571 | } |
572 | 572 | ||
573 | /* Flush the GART-TLB to remove stale entries */ | 573 | /* Flush the GART-TLB to remove stale entries */ |
574 | k8_flush_garts(); | 574 | amd_flush_garts(); |
575 | } | 575 | } |
576 | 576 | ||
577 | /* | 577 | /* |
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) | |||
596 | if (!fix_up_north_bridges) | 596 | if (!fix_up_north_bridges) |
597 | return; | 597 | return; |
598 | 598 | ||
599 | if (!k8_northbridges.gart_supported) | 599 | if (!amd_nb_has_feature(AMD_NB_GART)) |
600 | return; | 600 | return; |
601 | 601 | ||
602 | pr_info("PCI-DMA: Restoring GART aperture settings\n"); | 602 | pr_info("PCI-DMA: Restoring GART aperture settings\n"); |
603 | 603 | ||
604 | for (i = 0; i < k8_northbridges.num; i++) { | 604 | for (i = 0; i < amd_nb_num(); i++) { |
605 | struct pci_dev *dev = k8_northbridges.nb_misc[i]; | 605 | struct pci_dev *dev = node_to_amd_nb(i)->misc; |
606 | 606 | ||
607 | /* | 607 | /* |
608 | * Don't enable translations just yet. That is the next | 608 | * Don't enable translations just yet. That is the next |
@@ -644,7 +644,7 @@ static struct sys_device device_gart = { | |||
644 | * Private Northbridge GATT initialization in case we cannot use the | 644 | * Private Northbridge GATT initialization in case we cannot use the |
645 | * AGP driver for some reason. | 645 | * AGP driver for some reason. |
646 | */ | 646 | */ |
647 | static __init int init_k8_gatt(struct agp_kern_info *info) | 647 | static __init int init_amd_gatt(struct agp_kern_info *info) |
648 | { | 648 | { |
649 | unsigned aper_size, gatt_size, new_aper_size; | 649 | unsigned aper_size, gatt_size, new_aper_size; |
650 | unsigned aper_base, new_aper_base; | 650 | unsigned aper_base, new_aper_base; |
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
656 | 656 | ||
657 | aper_size = aper_base = info->aper_size = 0; | 657 | aper_size = aper_base = info->aper_size = 0; |
658 | dev = NULL; | 658 | dev = NULL; |
659 | for (i = 0; i < k8_northbridges.num; i++) { | 659 | for (i = 0; i < amd_nb_num(); i++) { |
660 | dev = k8_northbridges.nb_misc[i]; | 660 | dev = node_to_amd_nb(i)->misc; |
661 | new_aper_base = read_aperture(dev, &new_aper_size); | 661 | new_aper_base = read_aperture(dev, &new_aper_size); |
662 | if (!new_aper_base) | 662 | if (!new_aper_base) |
663 | goto nommu; | 663 | goto nommu; |
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) | |||
725 | if (!no_agp) | 725 | if (!no_agp) |
726 | return; | 726 | return; |
727 | 727 | ||
728 | if (!k8_northbridges.gart_supported) | 728 | if (!amd_nb_has_feature(AMD_NB_GART)) |
729 | return; | 729 | return; |
730 | 730 | ||
731 | for (i = 0; i < k8_northbridges.num; i++) { | 731 | for (i = 0; i < amd_nb_num(); i++) { |
732 | u32 ctl; | 732 | u32 ctl; |
733 | 733 | ||
734 | dev = k8_northbridges.nb_misc[i]; | 734 | dev = node_to_amd_nb(i)->misc; |
735 | pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); | 735 | pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); |
736 | 736 | ||
737 | ctl &= ~GARTEN; | 737 | ctl &= ~GARTEN; |
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void) | |||
749 | unsigned long scratch; | 749 | unsigned long scratch; |
750 | long i; | 750 | long i; |
751 | 751 | ||
752 | if (!k8_northbridges.gart_supported) | 752 | if (!amd_nb_has_feature(AMD_NB_GART)) |
753 | return 0; | 753 | return 0; |
754 | 754 | ||
755 | #ifndef CONFIG_AGP_AMD64 | 755 | #ifndef CONFIG_AGP_AMD64 |
756 | no_agp = 1; | 756 | no_agp = 1; |
757 | #else | 757 | #else |
758 | /* Makefile puts PCI initialization via subsys_initcall first. */ | 758 | /* Makefile puts PCI initialization via subsys_initcall first. */ |
759 | /* Add other K8 AGP bridge drivers here */ | 759 | /* Add other AMD AGP bridge drivers here */ |
760 | no_agp = no_agp || | 760 | no_agp = no_agp || |
761 | (agp_amd64_init() < 0) || | 761 | (agp_amd64_init() < 0) || |
762 | (agp_copy_info(agp_bridge, &info) < 0); | 762 | (agp_copy_info(agp_bridge, &info) < 0); |
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void) | |||
765 | if (no_iommu || | 765 | if (no_iommu || |
766 | (!force_iommu && max_pfn <= MAX_DMA32_PFN) || | 766 | (!force_iommu && max_pfn <= MAX_DMA32_PFN) || |
767 | !gart_iommu_aperture || | 767 | !gart_iommu_aperture || |
768 | (no_agp && init_k8_gatt(&info) < 0)) { | 768 | (no_agp && init_amd_gatt(&info) < 0)) { |
769 | if (max_pfn > MAX_DMA32_PFN) { | 769 | if (max_pfn > MAX_DMA32_PFN) { |
770 | pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); | 770 | pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); |
771 | pr_warning("falling back to iommu=soft.\n"); | 771 | pr_warning("falling back to iommu=soft.\n"); |
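Note: the hunks above swap the old k8_northbridges globals for the amd_nb_*() accessors (amd_nb_has_feature(), amd_nb_num(), node_to_amd_nb()). A minimal user-space sketch of that accessor pattern follows; the struct, flag value and helpers below are stand-ins written for illustration only, not the real arch/x86 AMD northbridge code.

/* Stand-in sketch of the accessor pattern used above; everything here is
 * a fake reimplementation so the loop shape can be run in user space. */
#include <stdio.h>

#define AMD_NB_GART 0x1                 /* assumed flag value, illustration only */

struct amd_northbridge { int misc; };   /* stands in for a struct pci_dev *     */

static struct amd_northbridge nbs[] = { { .misc = 0x18 }, { .misc = 0x19 } };
static unsigned int nb_flags = AMD_NB_GART;

static int amd_nb_num(void) { return sizeof(nbs) / sizeof(nbs[0]); }
static struct amd_northbridge *node_to_amd_nb(int node) { return &nbs[node]; }
static int amd_nb_has_feature(unsigned int f) { return (nb_flags & f) == f; }

int main(void)
{
        int i;

        if (!amd_nb_has_feature(AMD_NB_GART))
                return 0;               /* bail out early, as the hunks above do */

        for (i = 0; i < amd_nb_num(); i++)      /* per-northbridge loop pattern */
                printf("northbridge %d: misc device %#x\n",
                       i, node_to_amd_nb(i)->misc);
        return 0;
}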
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 57d1868a86aa..e764fc05d700 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
15 | #include <trace/events/power.h> | 15 | #include <trace/events/power.h> |
16 | #include <linux/hw_breakpoint.h> | 16 | #include <linux/hw_breakpoint.h> |
17 | #include <asm/cpu.h> | ||
17 | #include <asm/system.h> | 18 | #include <asm/system.h> |
18 | #include <asm/apic.h> | 19 | #include <asm/apic.h> |
19 | #include <asm/syscalls.h> | 20 | #include <asm/syscalls.h> |
@@ -22,11 +23,6 @@ | |||
22 | #include <asm/i387.h> | 23 | #include <asm/i387.h> |
23 | #include <asm/debugreg.h> | 24 | #include <asm/debugreg.h> |
24 | 25 | ||
25 | unsigned long idle_halt; | ||
26 | EXPORT_SYMBOL(idle_halt); | ||
27 | unsigned long idle_nomwait; | ||
28 | EXPORT_SYMBOL(idle_nomwait); | ||
29 | |||
30 | struct kmem_cache *task_xstate_cachep; | 26 | struct kmem_cache *task_xstate_cachep; |
31 | EXPORT_SYMBOL_GPL(task_xstate_cachep); | 27 | EXPORT_SYMBOL_GPL(task_xstate_cachep); |
32 | 28 | ||
@@ -91,8 +87,7 @@ void exit_thread(void) | |||
91 | void show_regs(struct pt_regs *regs) | 87 | void show_regs(struct pt_regs *regs) |
92 | { | 88 | { |
93 | show_registers(regs); | 89 | show_registers(regs); |
94 | show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), | 90 | show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); |
95 | regs->bp); | ||
96 | } | 91 | } |
97 | 92 | ||
98 | void show_regs_common(void) | 93 | void show_regs_common(void) |
@@ -328,7 +323,7 @@ long sys_execve(const char __user *name, | |||
328 | /* | 323 | /* |
329 | * Idle related variables and functions | 324 | * Idle related variables and functions |
330 | */ | 325 | */ |
331 | unsigned long boot_option_idle_override = 0; | 326 | unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; |
332 | EXPORT_SYMBOL(boot_option_idle_override); | 327 | EXPORT_SYMBOL(boot_option_idle_override); |
333 | 328 | ||
334 | /* | 329 | /* |
@@ -374,6 +369,7 @@ void default_idle(void) | |||
374 | { | 369 | { |
375 | if (hlt_use_halt()) { | 370 | if (hlt_use_halt()) { |
376 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); | 371 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); |
372 | trace_cpu_idle(1, smp_processor_id()); | ||
377 | current_thread_info()->status &= ~TS_POLLING; | 373 | current_thread_info()->status &= ~TS_POLLING; |
378 | /* | 374 | /* |
379 | * TS_POLLING-cleared state must be visible before we | 375 | * TS_POLLING-cleared state must be visible before we |
@@ -386,6 +382,8 @@ void default_idle(void) | |||
386 | else | 382 | else |
387 | local_irq_enable(); | 383 | local_irq_enable(); |
388 | current_thread_info()->status |= TS_POLLING; | 384 | current_thread_info()->status |= TS_POLLING; |
385 | trace_power_end(smp_processor_id()); | ||
386 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | ||
389 | } else { | 387 | } else { |
390 | local_irq_enable(); | 388 | local_irq_enable(); |
391 | /* loop is done by the caller */ | 389 | /* loop is done by the caller */ |
@@ -443,9 +441,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); | |||
443 | */ | 441 | */ |
444 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | 442 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) |
445 | { | 443 | { |
446 | trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); | ||
447 | if (!need_resched()) { | 444 | if (!need_resched()) { |
448 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 445 | if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) |
449 | clflush((void *)¤t_thread_info()->flags); | 446 | clflush((void *)¤t_thread_info()->flags); |
450 | 447 | ||
451 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | 448 | __monitor((void *)¤t_thread_info()->flags, 0, 0); |
@@ -460,7 +457,8 @@ static void mwait_idle(void) | |||
460 | { | 457 | { |
461 | if (!need_resched()) { | 458 | if (!need_resched()) { |
462 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); | 459 | trace_power_start(POWER_CSTATE, 1, smp_processor_id()); |
463 | if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 460 | trace_cpu_idle(1, smp_processor_id()); |
461 | if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) | ||
464 | clflush((void *)¤t_thread_info()->flags); | 462 | clflush((void *)¤t_thread_info()->flags); |
465 | 463 | ||
466 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | 464 | __monitor((void *)¤t_thread_info()->flags, 0, 0); |
@@ -469,6 +467,8 @@ static void mwait_idle(void) | |||
469 | __sti_mwait(0, 0); | 467 | __sti_mwait(0, 0); |
470 | else | 468 | else |
471 | local_irq_enable(); | 469 | local_irq_enable(); |
470 | trace_power_end(smp_processor_id()); | ||
471 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | ||
472 | } else | 472 | } else |
473 | local_irq_enable(); | 473 | local_irq_enable(); |
474 | } | 474 | } |
@@ -481,10 +481,12 @@ static void mwait_idle(void) | |||
481 | static void poll_idle(void) | 481 | static void poll_idle(void) |
482 | { | 482 | { |
483 | trace_power_start(POWER_CSTATE, 0, smp_processor_id()); | 483 | trace_power_start(POWER_CSTATE, 0, smp_processor_id()); |
484 | trace_cpu_idle(0, smp_processor_id()); | ||
484 | local_irq_enable(); | 485 | local_irq_enable(); |
485 | while (!need_resched()) | 486 | while (!need_resched()) |
486 | cpu_relax(); | 487 | cpu_relax(); |
487 | trace_power_end(0); | 488 | trace_power_end(smp_processor_id()); |
489 | trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); | ||
488 | } | 490 | } |
489 | 491 | ||
490 | /* | 492 | /* |
@@ -499,17 +501,16 @@ static void poll_idle(void) | |||
499 | * | 501 | * |
500 | * idle=mwait overrides this decision and forces the usage of mwait. | 502 | * idle=mwait overrides this decision and forces the usage of mwait. |
501 | */ | 503 | */ |
502 | static int __cpuinitdata force_mwait; | ||
503 | 504 | ||
504 | #define MWAIT_INFO 0x05 | 505 | #define MWAIT_INFO 0x05 |
505 | #define MWAIT_ECX_EXTENDED_INFO 0x01 | 506 | #define MWAIT_ECX_EXTENDED_INFO 0x01 |
506 | #define MWAIT_EDX_C1 0xf0 | 507 | #define MWAIT_EDX_C1 0xf0 |
507 | 508 | ||
508 | static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) | 509 | int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) |
509 | { | 510 | { |
510 | u32 eax, ebx, ecx, edx; | 511 | u32 eax, ebx, ecx, edx; |
511 | 512 | ||
512 | if (force_mwait) | 513 | if (boot_option_idle_override == IDLE_FORCE_MWAIT) |
513 | return 1; | 514 | return 1; |
514 | 515 | ||
515 | if (c->cpuid_level < MWAIT_INFO) | 516 | if (c->cpuid_level < MWAIT_INFO) |
@@ -629,9 +630,10 @@ static int __init idle_setup(char *str) | |||
629 | if (!strcmp(str, "poll")) { | 630 | if (!strcmp(str, "poll")) { |
630 | printk("using polling idle threads.\n"); | 631 | printk("using polling idle threads.\n"); |
631 | pm_idle = poll_idle; | 632 | pm_idle = poll_idle; |
632 | } else if (!strcmp(str, "mwait")) | 633 | boot_option_idle_override = IDLE_POLL; |
633 | force_mwait = 1; | 634 | } else if (!strcmp(str, "mwait")) { |
634 | else if (!strcmp(str, "halt")) { | 635 | boot_option_idle_override = IDLE_FORCE_MWAIT; |
636 | } else if (!strcmp(str, "halt")) { | ||
635 | /* | 637 | /* |
636 | * When the boot option of idle=halt is added, halt is | 638 | * When the boot option of idle=halt is added, halt is |
637 | * forced to be used for CPU idle. In such case CPU C2/C3 | 639 | * forced to be used for CPU idle. In such case CPU C2/C3 |
@@ -640,8 +642,7 @@ static int __init idle_setup(char *str) | |||
640 | * the boot_option_idle_override. | 642 | * the boot_option_idle_override. |
641 | */ | 643 | */ |
642 | pm_idle = default_idle; | 644 | pm_idle = default_idle; |
643 | idle_halt = 1; | 645 | boot_option_idle_override = IDLE_HALT; |
644 | return 0; | ||
645 | } else if (!strcmp(str, "nomwait")) { | 646 | } else if (!strcmp(str, "nomwait")) { |
646 | /* | 647 | /* |
647 | * If the boot option of "idle=nomwait" is added, | 648 | * If the boot option of "idle=nomwait" is added, |
@@ -649,12 +650,10 @@ static int __init idle_setup(char *str) | |||
649 | * states. In such case it won't touch the variable | 650 | * states. In such case it won't touch the variable |
650 | * of boot_option_idle_override. | 651 | * of boot_option_idle_override. |
651 | */ | 652 | */ |
652 | idle_nomwait = 1; | 653 | boot_option_idle_override = IDLE_NOMWAIT; |
653 | return 0; | ||
654 | } else | 654 | } else |
655 | return -1; | 655 | return -1; |
656 | 656 | ||
657 | boot_option_idle_override = 1; | ||
658 | return 0; | 657 | return 0; |
659 | } | 658 | } |
660 | early_param("idle", idle_setup); | 659 | early_param("idle", idle_setup); |
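The separate idle_halt/idle_nomwait/force_mwait flags above collapse into a single boot_option_idle_override value. A small sketch of that consolidation, assuming the enum constants seen in the hunk (their actual definition lives in a header that is not part of this diff):

/* Sketch of "idle=" option parsing into one override value; the enum
 * below is assumed from the constant names used in the hunk above. */
#include <stdio.h>
#include <string.h>

enum idle_boot_override { IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT,
                          IDLE_POLL, IDLE_FORCE_MWAIT };

static enum idle_boot_override boot_option_idle_override = IDLE_NO_OVERRIDE;

static int idle_setup(const char *str)
{
        if (!strcmp(str, "poll"))
                boot_option_idle_override = IDLE_POLL;
        else if (!strcmp(str, "mwait"))
                boot_option_idle_override = IDLE_FORCE_MWAIT;
        else if (!strcmp(str, "halt"))
                boot_option_idle_override = IDLE_HALT;
        else if (!strcmp(str, "nomwait"))
                boot_option_idle_override = IDLE_NOMWAIT;
        else
                return -1;
        return 0;
}

int main(void)
{
        const char *opts[] = { "poll", "mwait", "halt", "nomwait", "bogus" };

        for (unsigned i = 0; i < sizeof(opts) / sizeof(opts[0]); i++)
                printf("idle=%-8s -> %d (override=%d)\n", opts[i],
                       idle_setup(opts[i]), boot_option_idle_override);
        return 0;
}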
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 96586c3cbbbf..8d128783af47 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -57,8 +57,6 @@ | |||
57 | #include <asm/syscalls.h> | 57 | #include <asm/syscalls.h> |
58 | #include <asm/debugreg.h> | 58 | #include <asm/debugreg.h> |
59 | 59 | ||
60 | #include <trace/events/power.h> | ||
61 | |||
62 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 60 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
63 | 61 | ||
64 | /* | 62 | /* |
@@ -113,8 +111,6 @@ void cpu_idle(void) | |||
113 | stop_critical_timings(); | 111 | stop_critical_timings(); |
114 | pm_idle(); | 112 | pm_idle(); |
115 | start_critical_timings(); | 113 | start_critical_timings(); |
116 | |||
117 | trace_power_end(smp_processor_id()); | ||
118 | } | 114 | } |
119 | tick_nohz_restart_sched_tick(); | 115 | tick_nohz_restart_sched_tick(); |
120 | preempt_enable_no_resched(); | 116 | preempt_enable_no_resched(); |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3d7a3a04f38..bd387e8f73b4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -51,8 +51,6 @@ | |||
51 | #include <asm/syscalls.h> | 51 | #include <asm/syscalls.h> |
52 | #include <asm/debugreg.h> | 52 | #include <asm/debugreg.h> |
53 | 53 | ||
54 | #include <trace/events/power.h> | ||
55 | |||
56 | asmlinkage extern void ret_from_fork(void); | 54 | asmlinkage extern void ret_from_fork(void); |
57 | 55 | ||
58 | DEFINE_PER_CPU(unsigned long, old_rsp); | 56 | DEFINE_PER_CPU(unsigned long, old_rsp); |
@@ -141,8 +139,6 @@ void cpu_idle(void) | |||
141 | pm_idle(); | 139 | pm_idle(); |
142 | start_critical_timings(); | 140 | start_critical_timings(); |
143 | 141 | ||
144 | trace_power_end(smp_processor_id()); | ||
145 | |||
146 | /* In many cases the interrupt that ended idle | 142 | /* In many cases the interrupt that ended idle |
147 | has already called exit_idle. But some idle | 143 | has already called exit_idle. But some idle |
148 | loops can be woken up without interrupt. */ | 144 | loops can be woken up without interrupt. */ |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index bab3b9e6f66d..42eb3300dfc6 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -41,44 +41,6 @@ void pvclock_set_flags(u8 flags) | |||
41 | valid_flags = flags; | 41 | valid_flags = flags; |
42 | } | 42 | } |
43 | 43 | ||
44 | /* | ||
45 | * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | ||
46 | * yielding a 64-bit result. | ||
47 | */ | ||
48 | static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) | ||
49 | { | ||
50 | u64 product; | ||
51 | #ifdef __i386__ | ||
52 | u32 tmp1, tmp2; | ||
53 | #endif | ||
54 | |||
55 | if (shift < 0) | ||
56 | delta >>= -shift; | ||
57 | else | ||
58 | delta <<= shift; | ||
59 | |||
60 | #ifdef __i386__ | ||
61 | __asm__ ( | ||
62 | "mul %5 ; " | ||
63 | "mov %4,%%eax ; " | ||
64 | "mov %%edx,%4 ; " | ||
65 | "mul %5 ; " | ||
66 | "xor %5,%5 ; " | ||
67 | "add %4,%%eax ; " | ||
68 | "adc %5,%%edx ; " | ||
69 | : "=A" (product), "=r" (tmp1), "=r" (tmp2) | ||
70 | : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); | ||
71 | #elif defined(__x86_64__) | ||
72 | __asm__ ( | ||
73 | "mul %%rdx ; shrd $32,%%rdx,%%rax" | ||
74 | : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); | ||
75 | #else | ||
76 | #error implement me! | ||
77 | #endif | ||
78 | |||
79 | return product; | ||
80 | } | ||
81 | |||
82 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) | 44 | static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) |
83 | { | 45 | { |
84 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; | 46 | u64 delta = native_read_tsc() - shadow->tsc_timestamp; |
@@ -121,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) | |||
121 | 83 | ||
122 | static atomic64_t last_value = ATOMIC64_INIT(0); | 84 | static atomic64_t last_value = ATOMIC64_INIT(0); |
123 | 85 | ||
86 | void pvclock_resume(void) | ||
87 | { | ||
88 | atomic64_set(&last_value, 0); | ||
89 | } | ||
90 | |||
124 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | 91 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
125 | { | 92 | { |
126 | struct pvclock_shadow_time shadow; | 93 | struct pvclock_shadow_time shadow; |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index c495aa8d4815..fc7aae1e2bc7 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/pci_x86.h> | 18 | #include <asm/pci_x86.h> |
19 | #include <asm/virtext.h> | 19 | #include <asm/virtext.h> |
20 | #include <asm/cpu.h> | 20 | #include <asm/cpu.h> |
21 | #include <asm/nmi.h> | ||
21 | 22 | ||
22 | #ifdef CONFIG_X86_32 | 23 | #ifdef CONFIG_X86_32 |
23 | # include <linux/ctype.h> | 24 | # include <linux/ctype.h> |
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self, | |||
747 | { | 748 | { |
748 | int cpu; | 749 | int cpu; |
749 | 750 | ||
750 | if (val != DIE_NMI_IPI) | 751 | if (val != DIE_NMI) |
751 | return NOTIFY_OK; | 752 | return NOTIFY_OK; |
752 | 753 | ||
753 | cpu = raw_smp_processor_id(); | 754 | cpu = raw_smp_processor_id(); |
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void) | |||
778 | 779 | ||
779 | static struct notifier_block crash_nmi_nb = { | 780 | static struct notifier_block crash_nmi_nb = { |
780 | .notifier_call = crash_nmi_callback, | 781 | .notifier_call = crash_nmi_callback, |
782 | /* we want to be the first one called */ | ||
783 | .priority = NMI_LOCAL_HIGH_PRIOR+1, | ||
781 | }; | 784 | }; |
782 | 785 | ||
783 | /* Halt all other CPUs, calling the specified function on each of them | 786 | /* Halt all other CPUs, calling the specified function on each of them |
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 000000000000..2a26819bb6a8 --- /dev/null +++ b/arch/x86/kernel/resource.c | |||
@@ -0,0 +1,48 @@ | |||
1 | #include <linux/ioport.h> | ||
2 | #include <asm/e820.h> | ||
3 | |||
4 | static void resource_clip(struct resource *res, resource_size_t start, | ||
5 | resource_size_t end) | ||
6 | { | ||
7 | resource_size_t low = 0, high = 0; | ||
8 | |||
9 | if (res->end < start || res->start > end) | ||
10 | return; /* no conflict */ | ||
11 | |||
12 | if (res->start < start) | ||
13 | low = start - res->start; | ||
14 | |||
15 | if (res->end > end) | ||
16 | high = res->end - end; | ||
17 | |||
18 | /* Keep the area above or below the conflict, whichever is larger */ | ||
19 | if (low > high) | ||
20 | res->end = start - 1; | ||
21 | else | ||
22 | res->start = end + 1; | ||
23 | } | ||
24 | |||
25 | static void remove_e820_regions(struct resource *avail) | ||
26 | { | ||
27 | int i; | ||
28 | struct e820entry *entry; | ||
29 | |||
30 | for (i = 0; i < e820.nr_map; i++) { | ||
31 | entry = &e820.map[i]; | ||
32 | |||
33 | resource_clip(avail, entry->addr, | ||
34 | entry->addr + entry->size - 1); | ||
35 | } | ||
36 | } | ||
37 | |||
38 | void arch_remove_reservations(struct resource *avail) | ||
39 | { | ||
40 | /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ | ||
41 | if (avail->flags & IORESOURCE_MEM) { | ||
42 | if (avail->start < BIOS_END) | ||
43 | avail->start = BIOS_END; | ||
44 | resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); | ||
45 | |||
46 | remove_e820_regions(avail); | ||
47 | } | ||
48 | } | ||
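resource_clip() in the new file above trims a conflicting range out of an available resource, keeping whichever leftover piece is larger. A standalone rerun of that logic with made-up numbers, handy for convincing yourself which side survives:

/* User-space rerun of the resource_clip() logic above: trim [start,end]
 * out of *r, keeping the larger of the two leftover pieces. */
#include <stdio.h>
#include <stdint.h>

struct res { uint64_t start, end; };

static void resource_clip(struct res *r, uint64_t start, uint64_t end)
{
        uint64_t low = 0, high = 0;

        if (r->end < start || r->start > end)
                return;                         /* no conflict */
        if (r->start < start)
                low = start - r->start;         /* room left below the conflict */
        if (r->end > end)
                high = r->end - end;            /* room left above the conflict */

        if (low > high)
                r->end = start - 1;             /* keep the lower piece */
        else
                r->start = end + 1;             /* keep the upper piece */
}

int main(void)
{
        struct res avail = { 0x00000000, 0x000fffff };  /* 1 MiB window */

        /* Clip out a made-up reserved region at 0xc0000-0xdffff: the 768 KiB
         * below it beats the 128 KiB above it, so the window ends at 0xbffff. */
        resource_clip(&avail, 0xc0000, 0xdffff);
        printf("available: %#llx-%#llx\n",
               (unsigned long long)avail.start, (unsigned long long)avail.end);
        return 0;
}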
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 1cfbbfc3ae26..6f39cab052d5 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c | |||
@@ -76,7 +76,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) | |||
76 | CMOS_WRITE(real_seconds, RTC_SECONDS); | 76 | CMOS_WRITE(real_seconds, RTC_SECONDS); |
77 | CMOS_WRITE(real_minutes, RTC_MINUTES); | 77 | CMOS_WRITE(real_minutes, RTC_MINUTES); |
78 | } else { | 78 | } else { |
79 | printk(KERN_WARNING | 79 | printk_once(KERN_NOTICE |
80 | "set_rtc_mmss: can't update from %d to %d\n", | 80 | "set_rtc_mmss: can't update from %d to %d\n", |
81 | cmos_minutes, real_minutes); | 81 | cmos_minutes, real_minutes); |
82 | retval = -1; | 82 | retval = -1; |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 68d535a77df0..ca2f10622a79 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void) | |||
501 | return total << PAGE_SHIFT; | 501 | return total << PAGE_SHIFT; |
502 | } | 502 | } |
503 | 503 | ||
504 | #define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF | 504 | /* |
505 | * Keep the crash kernel below this limit. On 32 bits earlier kernels | ||
506 | * would limit the kernel to the low 512 MiB due to mapping restrictions. | ||
507 | * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this | ||
508 | * limit once kexec-tools are fixed. | ||
509 | */ | ||
510 | #ifdef CONFIG_X86_32 | ||
511 | # define CRASH_KERNEL_ADDR_MAX (512 << 20) | ||
512 | #else | ||
513 | # define CRASH_KERNEL_ADDR_MAX (896 << 20) | ||
514 | #endif | ||
515 | |||
505 | static void __init reserve_crashkernel(void) | 516 | static void __init reserve_crashkernel(void) |
506 | { | 517 | { |
507 | unsigned long long total_mem; | 518 | unsigned long long total_mem; |
@@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void) | |||
520 | const unsigned long long alignment = 16<<20; /* 16M */ | 531 | const unsigned long long alignment = 16<<20; /* 16M */ |
521 | 532 | ||
522 | /* | 533 | /* |
523 | * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX | 534 | * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX |
524 | */ | 535 | */ |
525 | crash_base = memblock_find_in_range(alignment, | 536 | crash_base = memblock_find_in_range(alignment, |
526 | DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); | 537 | CRASH_KERNEL_ADDR_MAX, crash_size, alignment); |
527 | 538 | ||
528 | if (crash_base == MEMBLOCK_ERROR) { | 539 | if (crash_base == MEMBLOCK_ERROR) { |
529 | pr_info("crashkernel reservation failed - No suitable area found.\n"); | 540 | pr_info("crashkernel reservation failed - No suitable area found.\n"); |
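The crashkernel search window changes from the single DEFAULT_BZIMAGE_ADDR_MAX to per-arch CRASH_KERNEL_ADDR_MAX limits (512 MiB on 32-bit, 896 MiB on 64-bit). A tiny sketch of the placement constraint; find_in_range() below is a made-up stand-in for memblock_find_in_range(), and the 128 MiB size is an arbitrary example:

/* Sketch of the crashkernel placement constraint above: find an aligned
 * block of crash_size below CRASH_KERNEL_ADDR_MAX. */
#include <stdio.h>
#include <stdint.h>

#define CRASH_KERNEL_ADDR_MAX   (896ULL << 20)  /* 64-bit limit from the hunk */

static uint64_t find_in_range(uint64_t start, uint64_t end,
                              uint64_t size, uint64_t align)
{
        uint64_t base = (start + align - 1) & ~(align - 1);

        return (base + size <= end) ? base : UINT64_MAX;
}

int main(void)
{
        uint64_t alignment  = 16 << 20;         /* 16 MiB, as in the hunk */
        uint64_t crash_size = 128ULL << 20;     /* invented crashkernel=128M */
        uint64_t base = find_in_range(alignment, CRASH_KERNEL_ADDR_MAX,
                                      crash_size, alignment);

        if (base == UINT64_MAX)
                printf("crashkernel reservation failed\n");
        else
                printf("crashkernel: %llu MiB at %#llx (below %llu MiB)\n",
                       (unsigned long long)(crash_size >> 20),
                       (unsigned long long)base,
                       (unsigned long long)(CRASH_KERNEL_ADDR_MAX >> 20));
        return 0;
}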
@@ -694,7 +705,7 @@ static u64 __init get_max_mapped(void) | |||
694 | void __init setup_arch(char **cmdline_p) | 705 | void __init setup_arch(char **cmdline_p) |
695 | { | 706 | { |
696 | int acpi = 0; | 707 | int acpi = 0; |
697 | int k8 = 0; | 708 | int amd = 0; |
698 | unsigned long flags; | 709 | unsigned long flags; |
699 | 710 | ||
700 | #ifdef CONFIG_X86_32 | 711 | #ifdef CONFIG_X86_32 |
@@ -769,7 +780,6 @@ void __init setup_arch(char **cmdline_p) | |||
769 | 780 | ||
770 | x86_init.oem.arch_setup(); | 781 | x86_init.oem.arch_setup(); |
771 | 782 | ||
772 | resource_alloc_from_bottom = 0; | ||
773 | iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; | 783 | iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; |
774 | setup_memory_map(); | 784 | setup_memory_map(); |
775 | parse_setup_data(); | 785 | parse_setup_data(); |
@@ -981,12 +991,12 @@ void __init setup_arch(char **cmdline_p) | |||
981 | acpi = acpi_numa_init(); | 991 | acpi = acpi_numa_init(); |
982 | #endif | 992 | #endif |
983 | 993 | ||
984 | #ifdef CONFIG_K8_NUMA | 994 | #ifdef CONFIG_AMD_NUMA |
985 | if (!acpi) | 995 | if (!acpi) |
986 | k8 = !k8_numa_init(0, max_pfn); | 996 | amd = !amd_numa_init(0, max_pfn); |
987 | #endif | 997 | #endif |
988 | 998 | ||
989 | initmem_init(0, max_pfn, acpi, k8); | 999 | initmem_init(0, max_pfn, acpi, amd); |
990 | memblock_find_dma_reserve(); | 1000 | memblock_find_dma_reserve(); |
991 | dma32_reserve_bootmem(); | 1001 | dma32_reserve_bootmem(); |
992 | 1002 | ||
@@ -1035,10 +1045,7 @@ void __init setup_arch(char **cmdline_p) | |||
1035 | #endif | 1045 | #endif |
1036 | 1046 | ||
1037 | init_apic_mappings(); | 1047 | init_apic_mappings(); |
1038 | ioapic_init_mappings(); | 1048 | ioapic_and_gsi_init(); |
1039 | |||
1040 | /* need to wait for io_apic is mapped */ | ||
1041 | probe_nr_irqs_gsi(); | ||
1042 | 1049 | ||
1043 | kvm_guest_init(); | 1050 | kvm_guest_init(); |
1044 | 1051 | ||
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 083e99d1b7df..0cbe8c0b35ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); | |||
97 | */ | 97 | */ |
98 | static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); | 98 | static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); |
99 | 99 | ||
100 | void cpu_hotplug_driver_lock() | 100 | void cpu_hotplug_driver_lock(void) |
101 | { | 101 | { |
102 | mutex_lock(&x86_cpu_hotplug_driver_mutex); | 102 | mutex_lock(&x86_cpu_hotplug_driver_mutex); |
103 | } | 103 | } |
104 | 104 | ||
105 | void cpu_hotplug_driver_unlock() | 105 | void cpu_hotplug_driver_unlock(void) |
106 | { | 106 | { |
107 | mutex_unlock(&x86_cpu_hotplug_driver_mutex); | 107 | mutex_unlock(&x86_cpu_hotplug_driver_mutex); |
108 | } | 108 | } |
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void) | |||
281 | */ | 281 | */ |
282 | smp_store_cpu_info(cpuid); | 282 | smp_store_cpu_info(cpuid); |
283 | 283 | ||
284 | /* | ||
285 | * This must be done before setting cpu_online_mask | ||
286 | * or calling notify_cpu_starting. | ||
287 | */ | ||
288 | set_cpu_sibling_map(raw_smp_processor_id()); | ||
289 | wmb(); | ||
290 | |||
284 | notify_cpu_starting(cpuid); | 291 | notify_cpu_starting(cpuid); |
285 | 292 | ||
286 | /* | 293 | /* |
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
316 | */ | 323 | */ |
317 | check_tsc_sync_target(); | 324 | check_tsc_sync_target(); |
318 | 325 | ||
319 | if (nmi_watchdog == NMI_IO_APIC) { | ||
320 | legacy_pic->mask(0); | ||
321 | enable_NMI_through_LVT0(); | ||
322 | legacy_pic->unmask(0); | ||
323 | } | ||
324 | |||
325 | /* This must be done before setting cpu_online_mask */ | ||
326 | set_cpu_sibling_map(raw_smp_processor_id()); | ||
327 | wmb(); | ||
328 | |||
329 | /* | 326 | /* |
330 | * We need to hold call_lock, so there is no inconsistency | 327 | * We need to hold call_lock, so there is no inconsistency |
331 | * between the time smp_call_function() determines number of | 328 | * between the time smp_call_function() determines number of |
@@ -430,7 +427,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) | |||
430 | 427 | ||
431 | cpumask_set_cpu(cpu, c->llc_shared_map); | 428 | cpumask_set_cpu(cpu, c->llc_shared_map); |
432 | 429 | ||
433 | if (current_cpu_data.x86_max_cores == 1) { | 430 | if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { |
434 | cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); | 431 | cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); |
435 | c->booted_cores = 1; | 432 | c->booted_cores = 1; |
436 | return; | 433 | return; |
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus) | |||
1061 | printk(KERN_INFO "SMP mode deactivated.\n"); | 1058 | printk(KERN_INFO "SMP mode deactivated.\n"); |
1062 | smpboot_clear_io_apic(); | 1059 | smpboot_clear_io_apic(); |
1063 | 1060 | ||
1064 | localise_nmi_watchdog(); | ||
1065 | |||
1066 | connect_bsp_APIC(); | 1061 | connect_bsp_APIC(); |
1067 | setup_local_APIC(); | 1062 | setup_local_APIC(); |
1068 | end_local_APIC_setup(); | 1063 | end_local_APIC_setup(); |
@@ -1094,7 +1089,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1094 | 1089 | ||
1095 | preempt_disable(); | 1090 | preempt_disable(); |
1096 | smp_cpu_index_default(); | 1091 | smp_cpu_index_default(); |
1097 | current_cpu_data = boot_cpu_data; | 1092 | memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info)); |
1098 | cpumask_copy(cpu_callin_mask, cpumask_of(0)); | 1093 | cpumask_copy(cpu_callin_mask, cpumask_of(0)); |
1099 | mb(); | 1094 | mb(); |
1100 | /* | 1095 | /* |
@@ -1166,6 +1161,20 @@ out: | |||
1166 | preempt_enable(); | 1161 | preempt_enable(); |
1167 | } | 1162 | } |
1168 | 1163 | ||
1164 | void arch_disable_nonboot_cpus_begin(void) | ||
1165 | { | ||
1166 | /* | ||
1167 | * Avoid the smp alternatives switch during the disable_nonboot_cpus(). | ||
1168 | * In the suspend path, we will be back in the SMP mode shortly anyways. | ||
1169 | */ | ||
1170 | skip_smp_alternatives = true; | ||
1171 | } | ||
1172 | |||
1173 | void arch_disable_nonboot_cpus_end(void) | ||
1174 | { | ||
1175 | skip_smp_alternatives = false; | ||
1176 | } | ||
1177 | |||
1169 | void arch_enable_nonboot_cpus_begin(void) | 1178 | void arch_enable_nonboot_cpus_begin(void) |
1170 | { | 1179 | { |
1171 | set_mtrr_aps_delayed_init(); | 1180 | set_mtrr_aps_delayed_init(); |
@@ -1196,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) | |||
1196 | #ifdef CONFIG_X86_IO_APIC | 1205 | #ifdef CONFIG_X86_IO_APIC |
1197 | setup_ioapic_dest(); | 1206 | setup_ioapic_dest(); |
1198 | #endif | 1207 | #endif |
1199 | check_nmi_watchdog(); | ||
1200 | mtrr_aps_init(); | 1208 | mtrr_aps_init(); |
1201 | } | 1209 | } |
1202 | 1210 | ||
@@ -1341,8 +1349,6 @@ int native_cpu_disable(void) | |||
1341 | if (cpu == 0) | 1349 | if (cpu == 0) |
1342 | return -EBUSY; | 1350 | return -EBUSY; |
1343 | 1351 | ||
1344 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1345 | stop_apic_nmi_watchdog(NULL); | ||
1346 | clear_local_APIC(); | 1352 | clear_local_APIC(); |
1347 | 1353 | ||
1348 | cpu_disable_common(); | 1354 | cpu_disable_common(); |
@@ -1377,7 +1383,7 @@ void play_dead_common(void) | |||
1377 | 1383 | ||
1378 | mb(); | 1384 | mb(); |
1379 | /* Ack it */ | 1385 | /* Ack it */ |
1380 | __get_cpu_var(cpu_state) = CPU_DEAD; | 1386 | __this_cpu_write(cpu_state, CPU_DEAD); |
1381 | 1387 | ||
1382 | /* | 1388 | /* |
1383 | * With physical CPU hotplug, we should halt the cpu | 1389 | * With physical CPU hotplug, we should halt the cpu |
@@ -1396,12 +1402,13 @@ static inline void mwait_play_dead(void) | |||
1396 | unsigned int highest_subcstate = 0; | 1402 | unsigned int highest_subcstate = 0; |
1397 | int i; | 1403 | int i; |
1398 | void *mwait_ptr; | 1404 | void *mwait_ptr; |
1405 | struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); | ||
1399 | 1406 | ||
1400 | if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) | 1407 | if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) |
1401 | return; | 1408 | return; |
1402 | if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) | 1409 | if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) |
1403 | return; | 1410 | return; |
1404 | if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) | 1411 | if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) |
1405 | return; | 1412 | return; |
1406 | 1413 | ||
1407 | eax = CPUID_MWAIT_LEAF; | 1414 | eax = CPUID_MWAIT_LEAF; |
@@ -1452,7 +1459,7 @@ static inline void mwait_play_dead(void) | |||
1452 | 1459 | ||
1453 | static inline void hlt_play_dead(void) | 1460 | static inline void hlt_play_dead(void) |
1454 | { | 1461 | { |
1455 | if (current_cpu_data.x86 >= 4) | 1462 | if (__this_cpu_read(cpu_info.x86) >= 4) |
1456 | wbinvd(); | 1463 | wbinvd(); |
1457 | 1464 | ||
1458 | while (1) { | 1465 | while (1) { |
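Several hunks above convert current_cpu_data accesses into per-CPU cpu_info reads via __this_cpu_read()/__this_cpu_ptr(). A rough user-space analogy of that pattern; NR_CPUS, this_cpu_id() and the macros below are fakes that only illustrate the per-CPU indirection, not the kernel's percpu machinery:

/* Analogy for current_cpu_data -> __this_cpu_read(cpu_info.field): every
 * CPU gets its own cpuinfo slot, and code asks for "this CPU's" slot
 * instead of reading a single global struct. */
#include <stdio.h>

#define NR_CPUS 4

struct cpuinfo { int x86; int cpuid_level; int x86_max_cores; };

static struct cpuinfo cpu_info[NR_CPUS];

static int this_cpu_id(void) { return 2; }      /* fake smp_processor_id() */

#define this_cpu_read(field)    (cpu_info[this_cpu_id()].field)
#define this_cpu_info_ptr()     (&cpu_info[this_cpu_id()])

int main(void)
{
        cpu_info[this_cpu_id()] = (struct cpuinfo){ .x86 = 6, .cpuid_level = 13,
                                                    .x86_max_cores = 4 };

        /* mirrors e.g. __this_cpu_read(cpu_info.x86) >= 4 in hlt_play_dead() */
        printf("family >= 4: %s\n", this_cpu_read(x86) >= 4 ? "yes" : "no");
        printf("max cores : %d\n", this_cpu_info_ptr()->x86_max_cores);
        return 0;
}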
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b53c525368a7..938c8e10a19a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c | |||
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = { | |||
73 | */ | 73 | */ |
74 | void save_stack_trace(struct stack_trace *trace) | 74 | void save_stack_trace(struct stack_trace *trace) |
75 | { | 75 | { |
76 | dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); | 76 | dump_trace(current, NULL, NULL, &save_stack_ops, trace); |
77 | if (trace->nr_entries < trace->max_entries) | 77 | if (trace->nr_entries < trace->max_entries) |
78 | trace->entries[trace->nr_entries++] = ULONG_MAX; | 78 | trace->entries[trace->nr_entries++] = ULONG_MAX; |
79 | } | 79 | } |
80 | EXPORT_SYMBOL_GPL(save_stack_trace); | 80 | EXPORT_SYMBOL_GPL(save_stack_trace); |
81 | 81 | ||
82 | void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) | 82 | void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) |
83 | { | 83 | { |
84 | dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); | 84 | dump_trace(current, regs, NULL, &save_stack_ops, trace); |
85 | if (trace->nr_entries < trace->max_entries) | 85 | if (trace->nr_entries < trace->max_entries) |
86 | trace->entries[trace->nr_entries++] = ULONG_MAX; | 86 | trace->entries[trace->nr_entries++] = ULONG_MAX; |
87 | } | 87 | } |
88 | 88 | ||
89 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | 89 | void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) |
90 | { | 90 | { |
91 | dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); | 91 | dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); |
92 | if (trace->nr_entries < trace->max_entries) | 92 | if (trace->nr_entries < trace->max_entries) |
93 | trace->entries[trace->nr_entries++] = ULONG_MAX; | 93 | trace->entries[trace->nr_entries++] = ULONG_MAX; |
94 | } | 94 | } |
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26141e2..998e972f3b1a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c | |||
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, | |||
133 | pmd = pmd_alloc(&tboot_mm, pud, vaddr); | 133 | pmd = pmd_alloc(&tboot_mm, pud, vaddr); |
134 | if (!pmd) | 134 | if (!pmd) |
135 | return -1; | 135 | return -1; |
136 | pte = pte_alloc_map(&tboot_mm, pmd, vaddr); | 136 | pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); |
137 | if (!pte) | 137 | if (!pte) |
138 | return -1; | 138 | return -1; |
139 | set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); | 139 | set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); |
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fb5cc5e14cfa..25a28a245937 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c | |||
@@ -22,10 +22,6 @@ | |||
22 | #include <asm/hpet.h> | 22 | #include <asm/hpet.h> |
23 | #include <asm/time.h> | 23 | #include <asm/time.h> |
24 | 24 | ||
25 | #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) | ||
26 | int timer_ack; | ||
27 | #endif | ||
28 | |||
29 | #ifdef CONFIG_X86_64 | 25 | #ifdef CONFIG_X86_64 |
30 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | 26 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; |
31 | #endif | 27 | #endif |
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
63 | /* Keep nmi watchdog up to date */ | 59 | /* Keep nmi watchdog up to date */ |
64 | inc_irq_stat(irq0_irqs); | 60 | inc_irq_stat(irq0_irqs); |
65 | 61 | ||
66 | /* Optimized out for !IO_APIC and x86_64 */ | ||
67 | if (timer_ack) { | ||
68 | /* | ||
69 | * Subtle, when I/O APICs are used we have to ack timer IRQ | ||
70 | * manually to deassert NMI lines for the watchdog if run | ||
71 | * on an 82489DX-based system. | ||
72 | */ | ||
73 | raw_spin_lock(&i8259A_lock); | ||
74 | outb(0x0c, PIC_MASTER_OCW3); | ||
75 | /* Ack the IRQ; AEOI will end it automatically. */ | ||
76 | inb(PIC_MASTER_POLL); | ||
77 | raw_spin_unlock(&i8259A_lock); | ||
78 | } | ||
79 | |||
80 | global_clock_event->event_handler(global_clock_event); | 62 | global_clock_event->event_handler(global_clock_event); |
81 | 63 | ||
82 | /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ | 64 | /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ |
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 3af2dff58b21..075d130efcf9 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S | |||
@@ -127,7 +127,7 @@ startup_64: | |||
127 | no_longmode: | 127 | no_longmode: |
128 | hlt | 128 | hlt |
129 | jmp no_longmode | 129 | jmp no_longmode |
130 | #include "verify_cpu_64.S" | 130 | #include "verify_cpu.S" |
131 | 131 | ||
132 | # Careful these need to be in the same 64K segment as the above; | 132 | # Careful these need to be in the same 64K segment as the above; |
133 | tidt: | 133 | tidt: |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cb838ca42c96..b9b67166f9de 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors); | |||
83 | 83 | ||
84 | static int ignore_nmis; | 84 | static int ignore_nmis; |
85 | 85 | ||
86 | int unknown_nmi_panic; | ||
87 | /* | ||
88 | * Prevent NMI reason port (0x61) being accessed simultaneously, can | ||
89 | * only be used in NMI handler. | ||
90 | */ | ||
91 | static DEFINE_RAW_SPINLOCK(nmi_reason_lock); | ||
92 | |||
86 | static inline void conditional_sti(struct pt_regs *regs) | 93 | static inline void conditional_sti(struct pt_regs *regs) |
87 | { | 94 | { |
88 | if (regs->flags & X86_EFLAGS_IF) | 95 | if (regs->flags & X86_EFLAGS_IF) |
@@ -300,16 +307,23 @@ gp_in_kernel: | |||
300 | die("general protection fault", regs, error_code); | 307 | die("general protection fault", regs, error_code); |
301 | } | 308 | } |
302 | 309 | ||
303 | static notrace __kprobes void | 310 | static int __init setup_unknown_nmi_panic(char *str) |
304 | mem_parity_error(unsigned char reason, struct pt_regs *regs) | ||
305 | { | 311 | { |
306 | printk(KERN_EMERG | 312 | unknown_nmi_panic = 1; |
307 | "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", | 313 | return 1; |
308 | reason, smp_processor_id()); | 314 | } |
315 | __setup("unknown_nmi_panic", setup_unknown_nmi_panic); | ||
309 | 316 | ||
310 | printk(KERN_EMERG | 317 | static notrace __kprobes void |
311 | "You have some hardware problem, likely on the PCI bus.\n"); | 318 | pci_serr_error(unsigned char reason, struct pt_regs *regs) |
319 | { | ||
320 | pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", | ||
321 | reason, smp_processor_id()); | ||
312 | 322 | ||
323 | /* | ||
324 | * On some machines, PCI SERR line is used to report memory | ||
325 | * errors. EDAC makes use of it. | ||
326 | */ | ||
313 | #if defined(CONFIG_EDAC) | 327 | #if defined(CONFIG_EDAC) |
314 | if (edac_handler_set()) { | 328 | if (edac_handler_set()) { |
315 | edac_atomic_assert_error(); | 329 | edac_atomic_assert_error(); |
@@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) | |||
320 | if (panic_on_unrecovered_nmi) | 334 | if (panic_on_unrecovered_nmi) |
321 | panic("NMI: Not continuing"); | 335 | panic("NMI: Not continuing"); |
322 | 336 | ||
323 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | 337 | pr_emerg("Dazed and confused, but trying to continue\n"); |
324 | 338 | ||
325 | /* Clear and disable the memory parity error line. */ | 339 | /* Clear and disable the PCI SERR error line. */ |
326 | reason = (reason & 0xf) | 4; | 340 | reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; |
327 | outb(reason, 0x61); | 341 | outb(reason, NMI_REASON_PORT); |
328 | } | 342 | } |
329 | 343 | ||
330 | static notrace __kprobes void | 344 | static notrace __kprobes void |
@@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs) | |||
332 | { | 346 | { |
333 | unsigned long i; | 347 | unsigned long i; |
334 | 348 | ||
335 | printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); | 349 | pr_emerg( |
350 | "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", | ||
351 | reason, smp_processor_id()); | ||
336 | show_registers(regs); | 352 | show_registers(regs); |
337 | 353 | ||
338 | if (panic_on_io_nmi) | 354 | if (panic_on_io_nmi) |
339 | panic("NMI IOCK error: Not continuing"); | 355 | panic("NMI IOCK error: Not continuing"); |
340 | 356 | ||
341 | /* Re-enable the IOCK line, wait for a few seconds */ | 357 | /* Re-enable the IOCK line, wait for a few seconds */ |
342 | reason = (reason & 0xf) | 8; | 358 | reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; |
343 | outb(reason, 0x61); | 359 | outb(reason, NMI_REASON_PORT); |
344 | 360 | ||
345 | i = 2000; | 361 | i = 20000; |
346 | while (--i) | 362 | while (--i) { |
347 | udelay(1000); | 363 | touch_nmi_watchdog(); |
364 | udelay(100); | ||
365 | } | ||
348 | 366 | ||
349 | reason &= ~8; | 367 | reason &= ~NMI_REASON_CLEAR_IOCHK; |
350 | outb(reason, 0x61); | 368 | outb(reason, NMI_REASON_PORT); |
351 | } | 369 | } |
352 | 370 | ||
353 | static notrace __kprobes void | 371 | static notrace __kprobes void |
@@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | |||
366 | return; | 384 | return; |
367 | } | 385 | } |
368 | #endif | 386 | #endif |
369 | printk(KERN_EMERG | 387 | pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", |
370 | "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", | 388 | reason, smp_processor_id()); |
371 | reason, smp_processor_id()); | ||
372 | 389 | ||
373 | printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); | 390 | pr_emerg("Do you have a strange power saving mode enabled?\n"); |
374 | if (panic_on_unrecovered_nmi) | 391 | if (unknown_nmi_panic || panic_on_unrecovered_nmi) |
375 | panic("NMI: Not continuing"); | 392 | panic("NMI: Not continuing"); |
376 | 393 | ||
377 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | 394 | pr_emerg("Dazed and confused, but trying to continue\n"); |
378 | } | 395 | } |
379 | 396 | ||
380 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | 397 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) |
381 | { | 398 | { |
382 | unsigned char reason = 0; | 399 | unsigned char reason = 0; |
383 | int cpu; | ||
384 | 400 | ||
385 | cpu = smp_processor_id(); | 401 | /* |
386 | 402 | * CPU-specific NMI must be processed before non-CPU-specific | |
387 | /* Only the BSP gets external NMIs from the system. */ | 403 | * NMI, otherwise we may lose it, because the CPU-specific |
388 | if (!cpu) | 404 | * NMI can not be detected/processed on other CPUs. |
389 | reason = get_nmi_reason(); | 405 | */ |
390 | 406 | if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) | |
391 | if (!(reason & 0xc0)) { | 407 | return; |
392 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | ||
393 | == NOTIFY_STOP) | ||
394 | return; | ||
395 | 408 | ||
396 | #ifdef CONFIG_X86_LOCAL_APIC | 409 | /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ |
397 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) | 410 | raw_spin_lock(&nmi_reason_lock); |
398 | == NOTIFY_STOP) | 411 | reason = get_nmi_reason(); |
399 | return; | ||
400 | 412 | ||
401 | #ifndef CONFIG_LOCKUP_DETECTOR | 413 | if (reason & NMI_REASON_MASK) { |
414 | if (reason & NMI_REASON_SERR) | ||
415 | pci_serr_error(reason, regs); | ||
416 | else if (reason & NMI_REASON_IOCHK) | ||
417 | io_check_error(reason, regs); | ||
418 | #ifdef CONFIG_X86_32 | ||
402 | /* | 419 | /* |
403 | * Ok, so this is none of the documented NMI sources, | 420 | * Reassert NMI in case it became active |
404 | * so it must be the NMI watchdog. | 421 | * meanwhile as it's edge-triggered: |
405 | */ | 422 | */ |
406 | if (nmi_watchdog_tick(regs, reason)) | 423 | reassert_nmi(); |
407 | return; | ||
408 | if (!do_nmi_callback(regs, cpu)) | ||
409 | #endif /* !CONFIG_LOCKUP_DETECTOR */ | ||
410 | unknown_nmi_error(reason, regs); | ||
411 | #else | ||
412 | unknown_nmi_error(reason, regs); | ||
413 | #endif | 424 | #endif |
414 | 425 | raw_spin_unlock(&nmi_reason_lock); | |
415 | return; | 426 | return; |
416 | } | 427 | } |
417 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | 428 | raw_spin_unlock(&nmi_reason_lock); |
418 | return; | ||
419 | 429 | ||
420 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | 430 | unknown_nmi_error(reason, regs); |
421 | if (reason & 0x80) | ||
422 | mem_parity_error(reason, regs); | ||
423 | if (reason & 0x40) | ||
424 | io_check_error(reason, regs); | ||
425 | #ifdef CONFIG_X86_32 | ||
426 | /* | ||
427 | * Reassert NMI in case it became active meanwhile | ||
428 | * as it's edge-triggered: | ||
429 | */ | ||
430 | reassert_nmi(); | ||
431 | #endif | ||
432 | } | 431 | } |
433 | 432 | ||
434 | dotraplinkage notrace __kprobes void | 433 | dotraplinkage notrace __kprobes void |
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code) | |||
446 | 445 | ||
447 | void stop_nmi(void) | 446 | void stop_nmi(void) |
448 | { | 447 | { |
449 | acpi_nmi_disable(); | ||
450 | ignore_nmis++; | 448 | ignore_nmis++; |
451 | } | 449 | } |
452 | 450 | ||
453 | void restart_nmi(void) | 451 | void restart_nmi(void) |
454 | { | 452 | { |
455 | ignore_nmis--; | 453 | ignore_nmis--; |
456 | acpi_nmi_enable(); | ||
457 | } | 454 | } |
458 | 455 | ||
459 | /* May run on IST stack. */ | 456 | /* May run on IST stack. */ |
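The rewritten default_do_nmi() decodes the reason port through named masks rather than the magic numbers visible in the removed lines. A small standalone decoder using the same values; the constant definitions below are inferred from the literals they replace (0x80, 0x40, 0xf, 4, 8) and are shown here only for illustration:

/* Decode an NMI reason byte the way the rewritten handler does.  The
 * constant values are inferred from the removed magic numbers. */
#include <stdio.h>

#define NMI_REASON_SERR         0x80
#define NMI_REASON_IOCHK        0x40
#define NMI_REASON_MASK         (NMI_REASON_SERR | NMI_REASON_IOCHK)
#define NMI_REASON_CLEAR_MASK   0x0f
#define NMI_REASON_CLEAR_SERR   0x04
#define NMI_REASON_CLEAR_IOCHK  0x08

int main(void)
{
        unsigned char reason = 0x85;    /* sample value as read from port 0x61 */

        if (reason & NMI_REASON_MASK) {
                if (reason & NMI_REASON_SERR)
                        printf("PCI SERR, clear with %#x\n",
                               (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR);
                else if (reason & NMI_REASON_IOCHK)
                        printf("IOCHK error, clear with %#x\n",
                               (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK);
        } else {
                printf("unknown NMI reason %#x\n", reason);
        }
        return 0;
}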
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0c40d8b72416..ffe5755caa8b 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void) | |||
464 | tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); | 464 | tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); |
465 | 465 | ||
466 | /* hpet or pmtimer available ? */ | 466 | /* hpet or pmtimer available ? */ |
467 | if (!hpet && !ref1 && !ref2) | 467 | if (ref1 == ref2) |
468 | continue; | 468 | continue; |
469 | 469 | ||
470 | /* Check, whether the sampling was disturbed by an SMI */ | 470 | /* Check, whether the sampling was disturbed by an SMI */ |
@@ -659,7 +659,7 @@ void restore_sched_clock_state(void) | |||
659 | 659 | ||
660 | local_irq_save(flags); | 660 | local_irq_save(flags); |
661 | 661 | ||
662 | __get_cpu_var(cyc2ns_offset) = 0; | 662 | __this_cpu_write(cyc2ns_offset, 0); |
663 | offset = cyc2ns_suspend - sched_clock(); | 663 | offset = cyc2ns_suspend - sched_clock(); |
664 | 664 | ||
665 | for_each_possible_cpu(cpu) | 665 | for_each_possible_cpu(cpu) |
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void) | |||
872 | 872 | ||
873 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) | 873 | if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) |
874 | return 0; | 874 | return 0; |
875 | |||
876 | if (tsc_clocksource_reliable) | ||
877 | return 0; | ||
875 | /* | 878 | /* |
876 | * Intel systems are normally all synchronized. | 879 | * Intel systems are normally all synchronized. |
877 | * Exceptions must mark TSC as unstable: | 880 | * Exceptions must mark TSC as unstable: |
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void) | |||
879 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { | 882 | if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { |
880 | /* assume multi socket systems are not synchronized: */ | 883 | /* assume multi socket systems are not synchronized: */ |
881 | if (num_possible_cpus() > 1) | 884 | if (num_possible_cpus() > 1) |
882 | tsc_unstable = 1; | 885 | return 1; |
883 | } | 886 | } |
884 | 887 | ||
885 | return tsc_unstable; | 888 | return 0; |
889 | } | ||
890 | |||
891 | |||
892 | static void tsc_refine_calibration_work(struct work_struct *work); | ||
893 | static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); | ||
894 | /** | ||
895 | * tsc_refine_calibration_work - Further refine tsc freq calibration | ||
896 | * @work - ignored. | ||
897 | * | ||
898 | * This function uses delayed work over a period of a | ||
899 | * second to further refine the TSC freq value. Since this is | ||
900 | * timer based, instead of loop based, we don't block the boot | ||
901 | * process while this longer calibration is done. | ||
902 | * | ||
903 | * If there are any calibration anomalies (too many SMIs, etc.), | ||
904 | * or the refined calibration is off by more than 1% from the fast early | ||
905 | * calibration, we throw out the new calibration and use the | ||
906 | * early calibration. | ||
907 | */ | ||
908 | static void tsc_refine_calibration_work(struct work_struct *work) | ||
909 | { | ||
910 | static u64 tsc_start = -1, ref_start; | ||
911 | static int hpet; | ||
912 | u64 tsc_stop, ref_stop, delta; | ||
913 | unsigned long freq; | ||
914 | |||
915 | /* Don't bother refining TSC on unstable systems */ | ||
916 | if (check_tsc_unstable()) | ||
917 | goto out; | ||
918 | |||
919 | /* | ||
920 | * Since the work is started early in boot, we may be | ||
921 | * delayed the first time we expire. So set the workqueue | ||
922 | * again once we know timers are working. | ||
923 | */ | ||
924 | if (tsc_start == -1) { | ||
925 | /* | ||
926 | * Only set hpet once, to avoid mixing hardware | ||
927 | * if the hpet becomes enabled later. | ||
928 | */ | ||
929 | hpet = is_hpet_enabled(); | ||
930 | schedule_delayed_work(&tsc_irqwork, HZ); | ||
931 | tsc_start = tsc_read_refs(&ref_start, hpet); | ||
932 | return; | ||
933 | } | ||
934 | |||
935 | tsc_stop = tsc_read_refs(&ref_stop, hpet); | ||
936 | |||
937 | /* hpet or pmtimer available ? */ | ||
938 | if (ref_start == ref_stop) | ||
939 | goto out; | ||
940 | |||
941 | /* Check, whether the sampling was disturbed by an SMI */ | ||
942 | if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) | ||
943 | goto out; | ||
944 | |||
945 | delta = tsc_stop - tsc_start; | ||
946 | delta *= 1000000LL; | ||
947 | if (hpet) | ||
948 | freq = calc_hpet_ref(delta, ref_start, ref_stop); | ||
949 | else | ||
950 | freq = calc_pmtimer_ref(delta, ref_start, ref_stop); | ||
951 | |||
952 | /* Make sure we're within 1% */ | ||
953 | if (abs(tsc_khz - freq) > tsc_khz/100) | ||
954 | goto out; | ||
955 | |||
956 | tsc_khz = freq; | ||
957 | printk(KERN_INFO "Refined TSC clocksource calibration: " | ||
958 | "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, | ||
959 | (unsigned long)tsc_khz % 1000); | ||
960 | |||
961 | out: | ||
962 | clocksource_register_khz(&clocksource_tsc, tsc_khz); | ||
886 | } | 963 | } |
887 | 964 | ||
888 | static void __init init_tsc_clocksource(void) | 965 | |
966 | static int __init init_tsc_clocksource(void) | ||
889 | { | 967 | { |
968 | if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) | ||
969 | return 0; | ||
970 | |||
890 | if (tsc_clocksource_reliable) | 971 | if (tsc_clocksource_reliable) |
891 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; | 972 | clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; |
892 | /* lower the rating if we already know its unstable: */ | 973 | /* lower the rating if we already know its unstable: */ |
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void) | |||
894 | clocksource_tsc.rating = 0; | 975 | clocksource_tsc.rating = 0; |
895 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; | 976 | clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; |
896 | } | 977 | } |
897 | clocksource_register_khz(&clocksource_tsc, tsc_khz); | 978 | schedule_delayed_work(&tsc_irqwork, 0); |
979 | return 0; | ||
898 | } | 980 | } |
981 | /* | ||
982 | * We use device_initcall here, to ensure we run after the hpet | ||
983 | * is fully initialized, which may occur at fs_initcall time. | ||
984 | */ | ||
985 | device_initcall(init_tsc_clocksource); | ||
899 | 986 | ||
900 | void __init tsc_init(void) | 987 | void __init tsc_init(void) |
901 | { | 988 | { |
@@ -949,6 +1036,5 @@ void __init tsc_init(void) | |||
949 | mark_tsc_unstable("TSCs unsynchronized"); | 1036 | mark_tsc_unstable("TSCs unsynchronized"); |
950 | 1037 | ||
951 | check_system_tsc_reliable(); | 1038 | check_system_tsc_reliable(); |
952 | init_tsc_clocksource(); | ||
953 | } | 1039 | } |
954 | 1040 | ||
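tsc_refine_calibration_work() above recalculates the TSC frequency over roughly a second of HPET/pmtimer reference time and keeps the result only if it lands within 1% of the early boot calibration. A minimal numeric rerun of that acceptance test; the frequency math is a simplified ratio rather than the kernel's calc_hpet_ref()/calc_pmtimer_ref() helpers, and the sample values are invented:

/* Rerun of the 1% acceptance check: the refined frequency is kept only
 * if |tsc_khz - freq| <= tsc_khz / 100. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
        unsigned long tsc_khz  = 2893300;       /* early, fast calibration      */
        uint64_t tsc_delta     = 2894123456ull; /* TSC ticks over the interval  */
        uint64_t ref_ns        = 1000123000ull; /* ~1 s measured by HPET/pmtimer */

        /* kHz = ticks per nanosecond * 10^6 (simplified) */
        unsigned long freq = (unsigned long)(tsc_delta * 1000000ull / ref_ns);

        if (labs((long)tsc_khz - (long)freq) > (long)(tsc_khz / 100))
                printf("refined %lu kHz rejected, keeping %lu kHz\n", freq, tsc_khz);
        else
                printf("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
                       freq / 1000, freq % 1000);
        return 0;
}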
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S index 56a8c2a867d9..0edefc19a113 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu.S | |||
@@ -7,6 +7,7 @@ | |||
7 | * Copyright (c) 2007 Andi Kleen (ak@suse.de) | 7 | * Copyright (c) 2007 Andi Kleen (ak@suse.de) |
8 | * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) | 8 | * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) |
9 | * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) | 9 | * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) |
10 | * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) | ||
10 | * | 11 | * |
11 | * This source code is licensed under the GNU General Public License, | 12 | * This source code is licensed under the GNU General Public License, |
12 | * Version 2. See the file COPYING for more details. | 13 | * Version 2. See the file COPYING for more details. |
@@ -14,18 +15,17 @@ | |||
14 | * This is a common code for verification whether CPU supports | 15 | * This is a common code for verification whether CPU supports |
15 | * long mode and SSE or not. It is not called directly instead this | 16 | * long mode and SSE or not. It is not called directly instead this |
16 | * file is included at various places and compiled in that context. | 17 | * file is included at various places and compiled in that context. |
17 | * Following are the current usage. | 18 | * This file is expected to run in 32bit code. Currently: |
18 | * | 19 | * |
19 | * This file is included by both 16bit and 32bit code. | 20 | * arch/x86/boot/compressed/head_64.S: Boot cpu verification |
21 | * arch/x86/kernel/trampoline_64.S: secondary processor verification | ||
22 | * arch/x86/kernel/head_32.S: processor startup | ||
20 | * | 23 | * |
21 | * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) | 24 | * verify_cpu, returns the status of longmode and SSE in register %eax. |
22 | * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) | ||
23 | * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) | ||
24 | * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) | ||
25 | * | ||
26 | * verify_cpu, returns the status of cpu check in register %eax. | ||
27 | * 0: Success 1: Failure | 25 | * 0: Success 1: Failure |
28 | * | 26 | * |
27 | * On Intel, the XD_DISABLE flag will be cleared as a side-effect. | ||
28 | * | ||
29 | * The caller needs to check for the error code and take the action | 29 | * The caller needs to check for the error code and take the action |
30 | * appropriately. Either display a message or halt. | 30 | * appropriately. Either display a message or halt. |
31 | */ | 31 | */ |
@@ -62,8 +62,41 @@ verify_cpu: | |||
62 | cmpl $0x444d4163,%ecx | 62 | cmpl $0x444d4163,%ecx |
63 | jnz verify_cpu_noamd | 63 | jnz verify_cpu_noamd |
64 | mov $1,%di # cpu is from AMD | 64 | mov $1,%di # cpu is from AMD |
65 | jmp verify_cpu_check | ||
65 | 66 | ||
66 | verify_cpu_noamd: | 67 | verify_cpu_noamd: |
68 | cmpl $0x756e6547,%ebx # GenuineIntel? | ||
69 | jnz verify_cpu_check | ||
70 | cmpl $0x49656e69,%edx | ||
71 | jnz verify_cpu_check | ||
72 | cmpl $0x6c65746e,%ecx | ||
73 | jnz verify_cpu_check | ||
74 | |||
75 | # only access MSR_IA32_MISC_ENABLE when: | ||
76 | # family > 6 || (family == 6 && model >= 0xd) | ||
77 | movl $0x1, %eax # check CPU family and model | ||
78 | cpuid | ||
79 | movl %eax, %ecx | ||
80 | |||
81 | andl $0x0ff00f00, %eax # mask family and extended family | ||
82 | shrl $8, %eax | ||
83 | cmpl $6, %eax | ||
84 | ja verify_cpu_clear_xd # family > 6, ok | ||
85 | jb verify_cpu_check # family < 6, skip | ||
86 | |||
87 | andl $0x000f00f0, %ecx # mask model and extended model | ||
88 | shrl $4, %ecx | ||
89 | cmpl $0xd, %ecx | ||
90 | jb verify_cpu_check # family == 6, model < 0xd, skip | ||
91 | |||
92 | verify_cpu_clear_xd: | ||
93 | movl $MSR_IA32_MISC_ENABLE, %ecx | ||
94 | rdmsr | ||
95 | btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE | ||
96 | jnc verify_cpu_check # only write MSR if bit was changed | ||
97 | wrmsr | ||
98 | |||
99 | verify_cpu_check: | ||
67 | movl $0x1,%eax # Does the cpu have what it takes | 100 | movl $0x1,%eax # Does the cpu have what it takes |
68 | cpuid | 101 | cpuid |
69 | andl $REQUIRED_MASK0,%edx | 102 | andl $REQUIRED_MASK0,%edx |
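The verify_cpu.S additions above detect a GenuineIntel CPU and, for any family above 6 or for family 6 with model 0xd and later, clear the XD_DISABLE bit in MSR_IA32_MISC_ENABLE (bit 2 of the high dword in %edx, i.e. MSR bit 34), writing the MSR back only if the bit was actually set. The family/model gate packs the CPUID leaf 1 fields with a mask-and-shift; a C restatement of that arithmetic may help when reading the assembly. This is an illustration only, not code from the patch.

#include <stdint.h>

/* CPUID leaf 1, EAX layout: model bits 4-7, family bits 8-11,
 * extended model bits 16-19, extended family bits 20-27. */
static int should_clear_xd_disable(uint32_t cpuid_1_eax)
{
	/* same mask/shift as "andl $0x0ff00f00; shrl $8" above */
	uint32_t family = (cpuid_1_eax & 0x0ff00f00) >> 8;
	/* same mask/shift as "andl $0x000f00f0; shrl $4" above */
	uint32_t model  = (cpuid_1_eax & 0x000f00f0) >> 4;

	if (family > 6)
		return 1;		/* newer than family 6: always */
	if (family < 6)
		return 0;		/* older parts: never */
	return model >= 0xd;		/* family 6: only model 0xd and up */
}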
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 61fb98519622..863f8753ab0a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
179 | if (pud_none_or_clear_bad(pud)) | 179 | if (pud_none_or_clear_bad(pud)) |
180 | goto out; | 180 | goto out; |
181 | pmd = pmd_offset(pud, 0xA0000); | 181 | pmd = pmd_offset(pud, 0xA0000); |
182 | split_huge_page_pmd(mm, pmd); | ||
182 | if (pmd_none_or_clear_bad(pmd)) | 183 | if (pmd_none_or_clear_bad(pmd)) |
183 | goto out; | 184 | goto out; |
184 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); | 185 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); |
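The single added line in vm86_32.c splits any transparent hugepage mapping the VGA window back into normal ptes before the walk continues, because the pte_offset_map_lock() call below it can only operate on regular page tables. For context, here is a sketch of the full walk as mark_screen_rdonly() presumably performs it; only the pmd_offset/split/pte_offset_map_lock lines are visible in this hunk, the rest is reconstructed for illustration and the loop that actually write-protects the ptes is omitted.

#include <linux/mm.h>
#include <linux/huge_mm.h>

static void mark_screen_rdonly_sketch(struct mm_struct *mm)
{
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(mm, 0xA0000);
	if (pgd_none_or_clear_bad(pgd))
		return;
	pud = pud_offset(pgd, 0xA0000);
	if (pud_none_or_clear_bad(pud))
		return;
	pmd = pmd_offset(pud, 0xA0000);
	split_huge_page_pmd(mm, pmd);	/* THP: force back to 4k ptes */
	if (pmd_none_or_clear_bad(pmd))
		return;
	pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
	/* ... mark the 0xA0000-0xBFFFF ptes read-only here ... */
	pte_unmap_unlock(pte, ptl);
}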
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e03530aebfd0..bf4700755184 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -69,7 +69,7 @@ jiffies_64 = jiffies; | |||
69 | 69 | ||
70 | PHDRS { | 70 | PHDRS { |
71 | text PT_LOAD FLAGS(5); /* R_E */ | 71 | text PT_LOAD FLAGS(5); /* R_E */ |
72 | data PT_LOAD FLAGS(7); /* RWE */ | 72 | data PT_LOAD FLAGS(6); /* RW_ */ |
73 | #ifdef CONFIG_X86_64 | 73 | #ifdef CONFIG_X86_64 |
74 | user PT_LOAD FLAGS(5); /* R_E */ | 74 | user PT_LOAD FLAGS(5); /* R_E */ |
75 | #ifdef CONFIG_SMP | 75 | #ifdef CONFIG_SMP |
@@ -116,6 +116,10 @@ SECTIONS | |||
116 | 116 | ||
117 | EXCEPTION_TABLE(16) :text = 0x9090 | 117 | EXCEPTION_TABLE(16) :text = 0x9090 |
118 | 118 | ||
119 | #if defined(CONFIG_DEBUG_RODATA) | ||
120 | /* .text should occupy a whole number of pages */ | ||
121 | . = ALIGN(PAGE_SIZE); | ||
122 | #endif | ||
119 | X64_ALIGN_DEBUG_RODATA_BEGIN | 123 | X64_ALIGN_DEBUG_RODATA_BEGIN |
120 | RO_DATA(PAGE_SIZE) | 124 | RO_DATA(PAGE_SIZE) |
121 | X64_ALIGN_DEBUG_RODATA_END | 125 | X64_ALIGN_DEBUG_RODATA_END |
@@ -335,7 +339,7 @@ SECTIONS | |||
335 | __bss_start = .; | 339 | __bss_start = .; |
336 | *(.bss..page_aligned) | 340 | *(.bss..page_aligned) |
337 | *(.bss) | 341 | *(.bss) |
338 | . = ALIGN(4); | 342 | . = ALIGN(PAGE_SIZE); |
339 | __bss_stop = .; | 343 | __bss_stop = .; |
340 | } | 344 | } |
341 | 345 | ||
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9c253bd65e24..547128546cc3 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void) | |||
394 | * Setup init_xstate_buf to represent the init state of | 394 | * Setup init_xstate_buf to represent the init state of |
395 | * all the features managed by the xsave | 395 | * all the features managed by the xsave |
396 | */ | 396 | */ |
397 | init_xstate_buf = alloc_bootmem(xstate_size); | 397 | init_xstate_buf = alloc_bootmem_align(xstate_size, |
398 | __alignof__(struct xsave_struct)); | ||
398 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; | 399 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; |
399 | 400 | ||
400 | clts(); | 401 | clts(); |
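The xsave.c hunk replaces alloc_bootmem() with an alignment-aware allocation: XSAVE/XRSTOR require their save area to be 64-byte aligned, so init_xstate_buf must honor __alignof__(struct xsave_struct) rather than the bootmem cache-line default. As a rough sketch of what the helper is presumably doing, modeled on the existing alloc_bootmem() macro in <linux/bootmem.h> (an assumption; the actual definition may differ):

/* assumed shape of the helper used above: pass the caller's alignment
 * through instead of the SMP_CACHE_BYTES default used by alloc_bootmem() */
#define alloc_bootmem_align(x, align) \
	__alloc_bootmem(x, align, __pa(MAX_DMA_ADDRESS))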
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ddc131ff438f..50f63648ce1b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -28,6 +28,7 @@ config KVM | |||
28 | select HAVE_KVM_IRQCHIP | 28 | select HAVE_KVM_IRQCHIP |
29 | select HAVE_KVM_EVENTFD | 29 | select HAVE_KVM_EVENTFD |
30 | select KVM_APIC_ARCHITECTURE | 30 | select KVM_APIC_ARCHITECTURE |
31 | select KVM_ASYNC_PF | ||
31 | select USER_RETURN_NOTIFIER | 32 | select USER_RETURN_NOTIFIER |
32 | select KVM_MMIO | 33 | select KVM_MMIO |
33 | ---help--- | 34 | ---help--- |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31a7035c4bd9..f15501f431c8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 2 | ccflags-y += -Ivirt/kvm -Iarch/x86/kvm |
3 | 3 | ||
4 | CFLAGS_x86.o := -I. | 4 | CFLAGS_x86.o := -I. |
5 | CFLAGS_svm.o := -I. | 5 | CFLAGS_svm.o := -I. |
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | |||
9 | coalesced_mmio.o irq_comm.o eventfd.o \ | 9 | coalesced_mmio.o irq_comm.o eventfd.o \ |
10 | assigned-dev.o) | 10 | assigned-dev.o) |
11 | kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) | 11 | kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) |
12 | kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) | ||
12 | 13 | ||
13 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | 14 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ |
14 | i8254.o timer.o | 15 | i8254.o timer.o |
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 38b6e8dafaff..caf966781d25 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -20,16 +20,8 @@ | |||
20 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | 20 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #ifndef __KERNEL__ | ||
24 | #include <stdio.h> | ||
25 | #include <stdint.h> | ||
26 | #include <public/xen.h> | ||
27 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | ||
28 | #else | ||
29 | #include <linux/kvm_host.h> | 23 | #include <linux/kvm_host.h> |
30 | #include "kvm_cache_regs.h" | 24 | #include "kvm_cache_regs.h" |
31 | #define DPRINTF(x...) do {} while (0) | ||
32 | #endif | ||
33 | #include <linux/module.h> | 25 | #include <linux/module.h> |
34 | #include <asm/kvm_emulate.h> | 26 | #include <asm/kvm_emulate.h> |
35 | 27 | ||
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg) | |||
418 | } | 410 | } |
419 | 411 | ||
420 | static inline unsigned long | 412 | static inline unsigned long |
421 | register_address(struct decode_cache *c, unsigned long base, unsigned long reg) | 413 | register_address(struct decode_cache *c, unsigned long reg) |
422 | { | 414 | { |
423 | return base + address_mask(c, reg); | 415 | return address_mask(c, reg); |
424 | } | 416 | } |
425 | 417 | ||
426 | static inline void | 418 | static inline void |
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, | |||
452 | return ops->get_cached_segment_base(seg, ctxt->vcpu); | 444 | return ops->get_cached_segment_base(seg, ctxt->vcpu); |
453 | } | 445 | } |
454 | 446 | ||
455 | static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, | 447 | static unsigned seg_override(struct x86_emulate_ctxt *ctxt, |
456 | struct x86_emulate_ops *ops, | 448 | struct x86_emulate_ops *ops, |
457 | struct decode_cache *c) | 449 | struct decode_cache *c) |
458 | { | 450 | { |
459 | if (!c->has_seg_override) | 451 | if (!c->has_seg_override) |
460 | return 0; | 452 | return 0; |
461 | 453 | ||
462 | return seg_base(ctxt, ops, c->seg_override); | 454 | return c->seg_override; |
463 | } | 455 | } |
464 | 456 | ||
465 | static unsigned long es_base(struct x86_emulate_ctxt *ctxt, | 457 | static ulong linear(struct x86_emulate_ctxt *ctxt, |
466 | struct x86_emulate_ops *ops) | 458 | struct segmented_address addr) |
467 | { | 459 | { |
468 | return seg_base(ctxt, ops, VCPU_SREG_ES); | 460 | struct decode_cache *c = &ctxt->decode; |
469 | } | 461 | ulong la; |
470 | |||
471 | static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, | ||
472 | struct x86_emulate_ops *ops) | ||
473 | { | ||
474 | return seg_base(ctxt, ops, VCPU_SREG_SS); | ||
475 | } | ||
476 | 462 | ||
477 | static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, | 463 | la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; |
478 | u32 error, bool valid) | 464 | if (c->ad_bytes != 8) |
479 | { | 465 | la &= (u32)-1; |
480 | ctxt->exception = vec; | 466 | return la; |
481 | ctxt->error_code = error; | ||
482 | ctxt->error_code_valid = valid; | ||
483 | } | 467 | } |
484 | 468 | ||
485 | static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) | 469 | static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, |
470 | u32 error, bool valid) | ||
486 | { | 471 | { |
487 | emulate_exception(ctxt, GP_VECTOR, err, true); | 472 | ctxt->exception.vector = vec; |
473 | ctxt->exception.error_code = error; | ||
474 | ctxt->exception.error_code_valid = valid; | ||
475 | return X86EMUL_PROPAGATE_FAULT; | ||
488 | } | 476 | } |
489 | 477 | ||
490 | static void emulate_pf(struct x86_emulate_ctxt *ctxt) | 478 | static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) |
491 | { | 479 | { |
492 | emulate_exception(ctxt, PF_VECTOR, 0, true); | 480 | return emulate_exception(ctxt, GP_VECTOR, err, true); |
493 | } | 481 | } |
494 | 482 | ||
495 | static void emulate_ud(struct x86_emulate_ctxt *ctxt) | 483 | static int emulate_ud(struct x86_emulate_ctxt *ctxt) |
496 | { | 484 | { |
497 | emulate_exception(ctxt, UD_VECTOR, 0, false); | 485 | return emulate_exception(ctxt, UD_VECTOR, 0, false); |
498 | } | 486 | } |
499 | 487 | ||
500 | static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) | 488 | static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) |
501 | { | 489 | { |
502 | emulate_exception(ctxt, TS_VECTOR, err, true); | 490 | return emulate_exception(ctxt, TS_VECTOR, err, true); |
503 | } | 491 | } |
504 | 492 | ||
505 | static int emulate_de(struct x86_emulate_ctxt *ctxt) | 493 | static int emulate_de(struct x86_emulate_ctxt *ctxt) |
506 | { | 494 | { |
507 | emulate_exception(ctxt, DE_VECTOR, 0, false); | 495 | return emulate_exception(ctxt, DE_VECTOR, 0, false); |
508 | return X86EMUL_PROPAGATE_FAULT; | ||
509 | } | 496 | } |
510 | 497 | ||
511 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 498 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, |
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | |||
520 | cur_size = fc->end - fc->start; | 507 | cur_size = fc->end - fc->start; |
521 | size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); | 508 | size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); |
522 | rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, | 509 | rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, |
523 | size, ctxt->vcpu, NULL); | 510 | size, ctxt->vcpu, &ctxt->exception); |
524 | if (rc != X86EMUL_CONTINUE) | 511 | if (rc != X86EMUL_CONTINUE) |
525 | return rc; | 512 | return rc; |
526 | fc->end += size; | 513 | fc->end += size; |
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, | |||
564 | 551 | ||
565 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | 552 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, |
566 | struct x86_emulate_ops *ops, | 553 | struct x86_emulate_ops *ops, |
567 | ulong addr, | 554 | struct segmented_address addr, |
568 | u16 *size, unsigned long *address, int op_bytes) | 555 | u16 *size, unsigned long *address, int op_bytes) |
569 | { | 556 | { |
570 | int rc; | 557 | int rc; |
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, | |||
572 | if (op_bytes == 2) | 559 | if (op_bytes == 2) |
573 | op_bytes = 3; | 560 | op_bytes = 3; |
574 | *address = 0; | 561 | *address = 0; |
575 | rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); | 562 | rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, |
563 | ctxt->vcpu, &ctxt->exception); | ||
576 | if (rc != X86EMUL_CONTINUE) | 564 | if (rc != X86EMUL_CONTINUE) |
577 | return rc; | 565 | return rc; |
578 | rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); | 566 | addr.ea += 2; |
567 | rc = ops->read_std(linear(ctxt, addr), address, op_bytes, | ||
568 | ctxt->vcpu, &ctxt->exception); | ||
579 | return rc; | 569 | return rc; |
580 | } | 570 | } |
581 | 571 | ||
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, | |||
768 | break; | 758 | break; |
769 | } | 759 | } |
770 | } | 760 | } |
771 | op->addr.mem = modrm_ea; | 761 | op->addr.mem.ea = modrm_ea; |
772 | done: | 762 | done: |
773 | return rc; | 763 | return rc; |
774 | } | 764 | } |
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, | |||
783 | op->type = OP_MEM; | 773 | op->type = OP_MEM; |
784 | switch (c->ad_bytes) { | 774 | switch (c->ad_bytes) { |
785 | case 2: | 775 | case 2: |
786 | op->addr.mem = insn_fetch(u16, 2, c->eip); | 776 | op->addr.mem.ea = insn_fetch(u16, 2, c->eip); |
787 | break; | 777 | break; |
788 | case 4: | 778 | case 4: |
789 | op->addr.mem = insn_fetch(u32, 4, c->eip); | 779 | op->addr.mem.ea = insn_fetch(u32, 4, c->eip); |
790 | break; | 780 | break; |
791 | case 8: | 781 | case 8: |
792 | op->addr.mem = insn_fetch(u64, 8, c->eip); | 782 | op->addr.mem.ea = insn_fetch(u64, 8, c->eip); |
793 | break; | 783 | break; |
794 | } | 784 | } |
795 | done: | 785 | done: |
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c) | |||
808 | else if (c->src.bytes == 4) | 798 | else if (c->src.bytes == 4) |
809 | sv = (s32)c->src.val & (s32)mask; | 799 | sv = (s32)c->src.val & (s32)mask; |
810 | 800 | ||
811 | c->dst.addr.mem += (sv >> 3); | 801 | c->dst.addr.mem.ea += (sv >> 3); |
812 | } | 802 | } |
813 | 803 | ||
814 | /* only subword offset */ | 804 | /* only subword offset */ |
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
821 | { | 811 | { |
822 | int rc; | 812 | int rc; |
823 | struct read_cache *mc = &ctxt->decode.mem_read; | 813 | struct read_cache *mc = &ctxt->decode.mem_read; |
824 | u32 err; | ||
825 | 814 | ||
826 | while (size) { | 815 | while (size) { |
827 | int n = min(size, 8u); | 816 | int n = min(size, 8u); |
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, | |||
829 | if (mc->pos < mc->end) | 818 | if (mc->pos < mc->end) |
830 | goto read_cached; | 819 | goto read_cached; |
831 | 820 | ||
832 | rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, | 821 | rc = ops->read_emulated(addr, mc->data + mc->end, n, |
833 | ctxt->vcpu); | 822 | &ctxt->exception, ctxt->vcpu); |
834 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
835 | emulate_pf(ctxt); | ||
836 | if (rc != X86EMUL_CONTINUE) | 823 | if (rc != X86EMUL_CONTINUE) |
837 | return rc; | 824 | return rc; |
838 | mc->end += n; | 825 | mc->end += n; |
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
907 | struct desc_ptr dt; | 894 | struct desc_ptr dt; |
908 | u16 index = selector >> 3; | 895 | u16 index = selector >> 3; |
909 | int ret; | 896 | int ret; |
910 | u32 err; | ||
911 | ulong addr; | 897 | ulong addr; |
912 | 898 | ||
913 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 899 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
914 | 900 | ||
915 | if (dt.size < index * 8 + 7) { | 901 | if (dt.size < index * 8 + 7) |
916 | emulate_gp(ctxt, selector & 0xfffc); | 902 | return emulate_gp(ctxt, selector & 0xfffc); |
917 | return X86EMUL_PROPAGATE_FAULT; | ||
918 | } | ||
919 | addr = dt.address + index * 8; | 903 | addr = dt.address + index * 8; |
920 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 904 | ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, |
921 | if (ret == X86EMUL_PROPAGATE_FAULT) | 905 | &ctxt->exception); |
922 | emulate_pf(ctxt); | ||
923 | 906 | ||
924 | return ret; | 907 | return ret; |
925 | } | 908 | } |
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, | |||
931 | { | 914 | { |
932 | struct desc_ptr dt; | 915 | struct desc_ptr dt; |
933 | u16 index = selector >> 3; | 916 | u16 index = selector >> 3; |
934 | u32 err; | ||
935 | ulong addr; | 917 | ulong addr; |
936 | int ret; | 918 | int ret; |
937 | 919 | ||
938 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); | 920 | get_descriptor_table_ptr(ctxt, ops, selector, &dt); |
939 | 921 | ||
940 | if (dt.size < index * 8 + 7) { | 922 | if (dt.size < index * 8 + 7) |
941 | emulate_gp(ctxt, selector & 0xfffc); | 923 | return emulate_gp(ctxt, selector & 0xfffc); |
942 | return X86EMUL_PROPAGATE_FAULT; | ||
943 | } | ||
944 | 924 | ||
945 | addr = dt.address + index * 8; | 925 | addr = dt.address + index * 8; |
946 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); | 926 | ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, |
947 | if (ret == X86EMUL_PROPAGATE_FAULT) | 927 | &ctxt->exception); |
948 | emulate_pf(ctxt); | ||
949 | 928 | ||
950 | return ret; | 929 | return ret; |
951 | } | 930 | } |
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1092 | { | 1071 | { |
1093 | int rc; | 1072 | int rc; |
1094 | struct decode_cache *c = &ctxt->decode; | 1073 | struct decode_cache *c = &ctxt->decode; |
1095 | u32 err; | ||
1096 | 1074 | ||
1097 | switch (c->dst.type) { | 1075 | switch (c->dst.type) { |
1098 | case OP_REG: | 1076 | case OP_REG: |
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, | |||
1101 | case OP_MEM: | 1079 | case OP_MEM: |
1102 | if (c->lock_prefix) | 1080 | if (c->lock_prefix) |
1103 | rc = ops->cmpxchg_emulated( | 1081 | rc = ops->cmpxchg_emulated( |
1104 | c->dst.addr.mem, | 1082 | linear(ctxt, c->dst.addr.mem), |
1105 | &c->dst.orig_val, | 1083 | &c->dst.orig_val, |
1106 | &c->dst.val, | 1084 | &c->dst.val, |
1107 | c->dst.bytes, | 1085 | c->dst.bytes, |
1108 | &err, | 1086 | &ctxt->exception, |
1109 | ctxt->vcpu); | 1087 | ctxt->vcpu); |
1110 | else | 1088 | else |
1111 | rc = ops->write_emulated( | 1089 | rc = ops->write_emulated( |
1112 | c->dst.addr.mem, | 1090 | linear(ctxt, c->dst.addr.mem), |
1113 | &c->dst.val, | 1091 | &c->dst.val, |
1114 | c->dst.bytes, | 1092 | c->dst.bytes, |
1115 | &err, | 1093 | &ctxt->exception, |
1116 | ctxt->vcpu); | 1094 | ctxt->vcpu); |
1117 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1118 | emulate_pf(ctxt); | ||
1119 | if (rc != X86EMUL_CONTINUE) | 1095 | if (rc != X86EMUL_CONTINUE) |
1120 | return rc; | 1096 | return rc; |
1121 | break; | 1097 | break; |
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, | |||
1137 | c->dst.bytes = c->op_bytes; | 1113 | c->dst.bytes = c->op_bytes; |
1138 | c->dst.val = c->src.val; | 1114 | c->dst.val = c->src.val; |
1139 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1115 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); |
1140 | c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), | 1116 | c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); |
1141 | c->regs[VCPU_REGS_RSP]); | 1117 | c->dst.addr.mem.seg = VCPU_SREG_SS; |
1142 | } | 1118 | } |
1143 | 1119 | ||
1144 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, | 1120 | static int emulate_pop(struct x86_emulate_ctxt *ctxt, |
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, | |||
1147 | { | 1123 | { |
1148 | struct decode_cache *c = &ctxt->decode; | 1124 | struct decode_cache *c = &ctxt->decode; |
1149 | int rc; | 1125 | int rc; |
1126 | struct segmented_address addr; | ||
1150 | 1127 | ||
1151 | rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), | 1128 | addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); |
1152 | c->regs[VCPU_REGS_RSP]), | 1129 | addr.seg = VCPU_SREG_SS; |
1153 | dest, len); | 1130 | rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); |
1154 | if (rc != X86EMUL_CONTINUE) | 1131 | if (rc != X86EMUL_CONTINUE) |
1155 | return rc; | 1132 | return rc; |
1156 | 1133 | ||
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1184 | change_mask |= EFLG_IF; | 1161 | change_mask |= EFLG_IF; |
1185 | break; | 1162 | break; |
1186 | case X86EMUL_MODE_VM86: | 1163 | case X86EMUL_MODE_VM86: |
1187 | if (iopl < 3) { | 1164 | if (iopl < 3) |
1188 | emulate_gp(ctxt, 0); | 1165 | return emulate_gp(ctxt, 0); |
1189 | return X86EMUL_PROPAGATE_FAULT; | ||
1190 | } | ||
1191 | change_mask |= EFLG_IF; | 1166 | change_mask |= EFLG_IF; |
1192 | break; | 1167 | break; |
1193 | default: /* real mode */ | 1168 | default: /* real mode */ |
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, | |||
1198 | *(unsigned long *)dest = | 1173 | *(unsigned long *)dest = |
1199 | (ctxt->eflags & ~change_mask) | (val & change_mask); | 1174 | (ctxt->eflags & ~change_mask) | (val & change_mask); |
1200 | 1175 | ||
1201 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
1202 | emulate_pf(ctxt); | ||
1203 | |||
1204 | return rc; | 1176 | return rc; |
1205 | } | 1177 | } |
1206 | 1178 | ||
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |||
1287 | gva_t cs_addr; | 1259 | gva_t cs_addr; |
1288 | gva_t eip_addr; | 1260 | gva_t eip_addr; |
1289 | u16 cs, eip; | 1261 | u16 cs, eip; |
1290 | u32 err; | ||
1291 | 1262 | ||
1292 | /* TODO: Add limit checks */ | 1263 | /* TODO: Add limit checks */ |
1293 | c->src.val = ctxt->eflags; | 1264 | c->src.val = ctxt->eflags; |
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, | |||
1317 | eip_addr = dt.address + (irq << 2); | 1288 | eip_addr = dt.address + (irq << 2); |
1318 | cs_addr = dt.address + (irq << 2) + 2; | 1289 | cs_addr = dt.address + (irq << 2) + 2; |
1319 | 1290 | ||
1320 | rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); | 1291 | rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); |
1321 | if (rc != X86EMUL_CONTINUE) | 1292 | if (rc != X86EMUL_CONTINUE) |
1322 | return rc; | 1293 | return rc; |
1323 | 1294 | ||
1324 | rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); | 1295 | rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); |
1325 | if (rc != X86EMUL_CONTINUE) | 1296 | if (rc != X86EMUL_CONTINUE) |
1326 | return rc; | 1297 | return rc; |
1327 | 1298 | ||
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, | |||
1370 | if (rc != X86EMUL_CONTINUE) | 1341 | if (rc != X86EMUL_CONTINUE) |
1371 | return rc; | 1342 | return rc; |
1372 | 1343 | ||
1373 | if (temp_eip & ~0xffff) { | 1344 | if (temp_eip & ~0xffff) |
1374 | emulate_gp(ctxt, 0); | 1345 | return emulate_gp(ctxt, 0); |
1375 | return X86EMUL_PROPAGATE_FAULT; | ||
1376 | } | ||
1377 | 1346 | ||
1378 | rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); | 1347 | rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); |
1379 | 1348 | ||
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1624 | 1593 | ||
1625 | /* syscall is not available in real mode */ | 1594 | /* syscall is not available in real mode */ |
1626 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1595 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1627 | ctxt->mode == X86EMUL_MODE_VM86) { | 1596 | ctxt->mode == X86EMUL_MODE_VM86) |
1628 | emulate_ud(ctxt); | 1597 | return emulate_ud(ctxt); |
1629 | return X86EMUL_PROPAGATE_FAULT; | ||
1630 | } | ||
1631 | 1598 | ||
1632 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1599 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1633 | 1600 | ||
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1678 | u16 cs_sel, ss_sel; | 1645 | u16 cs_sel, ss_sel; |
1679 | 1646 | ||
1680 | /* inject #GP if in real mode */ | 1647 | /* inject #GP if in real mode */ |
1681 | if (ctxt->mode == X86EMUL_MODE_REAL) { | 1648 | if (ctxt->mode == X86EMUL_MODE_REAL) |
1682 | emulate_gp(ctxt, 0); | 1649 | return emulate_gp(ctxt, 0); |
1683 | return X86EMUL_PROPAGATE_FAULT; | ||
1684 | } | ||
1685 | 1650 | ||
1686 | /* XXX sysenter/sysexit have not been tested in 64bit mode. | 1651 | /* XXX sysenter/sysexit have not been tested in 64bit mode. |
1687 | * Therefore, we inject an #UD. | 1652 | * Therefore, we inject an #UD. |
1688 | */ | 1653 | */ |
1689 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | 1654 | if (ctxt->mode == X86EMUL_MODE_PROT64) |
1690 | emulate_ud(ctxt); | 1655 | return emulate_ud(ctxt); |
1691 | return X86EMUL_PROPAGATE_FAULT; | ||
1692 | } | ||
1693 | 1656 | ||
1694 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1657 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1695 | 1658 | ||
1696 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | 1659 | ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); |
1697 | switch (ctxt->mode) { | 1660 | switch (ctxt->mode) { |
1698 | case X86EMUL_MODE_PROT32: | 1661 | case X86EMUL_MODE_PROT32: |
1699 | if ((msr_data & 0xfffc) == 0x0) { | 1662 | if ((msr_data & 0xfffc) == 0x0) |
1700 | emulate_gp(ctxt, 0); | 1663 | return emulate_gp(ctxt, 0); |
1701 | return X86EMUL_PROPAGATE_FAULT; | ||
1702 | } | ||
1703 | break; | 1664 | break; |
1704 | case X86EMUL_MODE_PROT64: | 1665 | case X86EMUL_MODE_PROT64: |
1705 | if (msr_data == 0x0) { | 1666 | if (msr_data == 0x0) |
1706 | emulate_gp(ctxt, 0); | 1667 | return emulate_gp(ctxt, 0); |
1707 | return X86EMUL_PROPAGATE_FAULT; | ||
1708 | } | ||
1709 | break; | 1668 | break; |
1710 | } | 1669 | } |
1711 | 1670 | ||
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1745 | 1704 | ||
1746 | /* inject #GP if in real mode or Virtual 8086 mode */ | 1705 | /* inject #GP if in real mode or Virtual 8086 mode */ |
1747 | if (ctxt->mode == X86EMUL_MODE_REAL || | 1706 | if (ctxt->mode == X86EMUL_MODE_REAL || |
1748 | ctxt->mode == X86EMUL_MODE_VM86) { | 1707 | ctxt->mode == X86EMUL_MODE_VM86) |
1749 | emulate_gp(ctxt, 0); | 1708 | return emulate_gp(ctxt, 0); |
1750 | return X86EMUL_PROPAGATE_FAULT; | ||
1751 | } | ||
1752 | 1709 | ||
1753 | setup_syscalls_segments(ctxt, ops, &cs, &ss); | 1710 | setup_syscalls_segments(ctxt, ops, &cs, &ss); |
1754 | 1711 | ||
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1763 | switch (usermode) { | 1720 | switch (usermode) { |
1764 | case X86EMUL_MODE_PROT32: | 1721 | case X86EMUL_MODE_PROT32: |
1765 | cs_sel = (u16)(msr_data + 16); | 1722 | cs_sel = (u16)(msr_data + 16); |
1766 | if ((msr_data & 0xfffc) == 0x0) { | 1723 | if ((msr_data & 0xfffc) == 0x0) |
1767 | emulate_gp(ctxt, 0); | 1724 | return emulate_gp(ctxt, 0); |
1768 | return X86EMUL_PROPAGATE_FAULT; | ||
1769 | } | ||
1770 | ss_sel = (u16)(msr_data + 24); | 1725 | ss_sel = (u16)(msr_data + 24); |
1771 | break; | 1726 | break; |
1772 | case X86EMUL_MODE_PROT64: | 1727 | case X86EMUL_MODE_PROT64: |
1773 | cs_sel = (u16)(msr_data + 32); | 1728 | cs_sel = (u16)(msr_data + 32); |
1774 | if (msr_data == 0x0) { | 1729 | if (msr_data == 0x0) |
1775 | emulate_gp(ctxt, 0); | 1730 | return emulate_gp(ctxt, 0); |
1776 | return X86EMUL_PROPAGATE_FAULT; | ||
1777 | } | ||
1778 | ss_sel = cs_sel + 8; | 1731 | ss_sel = cs_sel + 8; |
1779 | cs.d = 0; | 1732 | cs.d = 0; |
1780 | cs.l = 1; | 1733 | cs.l = 1; |
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
1934 | { | 1887 | { |
1935 | struct tss_segment_16 tss_seg; | 1888 | struct tss_segment_16 tss_seg; |
1936 | int ret; | 1889 | int ret; |
1937 | u32 err, new_tss_base = get_desc_base(new_desc); | 1890 | u32 new_tss_base = get_desc_base(new_desc); |
1938 | 1891 | ||
1939 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 1892 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
1940 | &err); | 1893 | &ctxt->exception); |
1941 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1894 | if (ret != X86EMUL_CONTINUE) |
1942 | /* FIXME: need to provide precise fault address */ | 1895 | /* FIXME: need to provide precise fault address */ |
1943 | emulate_pf(ctxt); | ||
1944 | return ret; | 1896 | return ret; |
1945 | } | ||
1946 | 1897 | ||
1947 | save_state_to_tss16(ctxt, ops, &tss_seg); | 1898 | save_state_to_tss16(ctxt, ops, &tss_seg); |
1948 | 1899 | ||
1949 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 1900 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
1950 | &err); | 1901 | &ctxt->exception); |
1951 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1902 | if (ret != X86EMUL_CONTINUE) |
1952 | /* FIXME: need to provide precise fault address */ | 1903 | /* FIXME: need to provide precise fault address */ |
1953 | emulate_pf(ctxt); | ||
1954 | return ret; | 1904 | return ret; |
1955 | } | ||
1956 | 1905 | ||
1957 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 1906 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
1958 | &err); | 1907 | &ctxt->exception); |
1959 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1908 | if (ret != X86EMUL_CONTINUE) |
1960 | /* FIXME: need to provide precise fault address */ | 1909 | /* FIXME: need to provide precise fault address */ |
1961 | emulate_pf(ctxt); | ||
1962 | return ret; | 1910 | return ret; |
1963 | } | ||
1964 | 1911 | ||
1965 | if (old_tss_sel != 0xffff) { | 1912 | if (old_tss_sel != 0xffff) { |
1966 | tss_seg.prev_task_link = old_tss_sel; | 1913 | tss_seg.prev_task_link = old_tss_sel; |
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, | |||
1968 | ret = ops->write_std(new_tss_base, | 1915 | ret = ops->write_std(new_tss_base, |
1969 | &tss_seg.prev_task_link, | 1916 | &tss_seg.prev_task_link, |
1970 | sizeof tss_seg.prev_task_link, | 1917 | sizeof tss_seg.prev_task_link, |
1971 | ctxt->vcpu, &err); | 1918 | ctxt->vcpu, &ctxt->exception); |
1972 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 1919 | if (ret != X86EMUL_CONTINUE) |
1973 | /* FIXME: need to provide precise fault address */ | 1920 | /* FIXME: need to provide precise fault address */ |
1974 | emulate_pf(ctxt); | ||
1975 | return ret; | 1921 | return ret; |
1976 | } | ||
1977 | } | 1922 | } |
1978 | 1923 | ||
1979 | return load_state_from_tss16(ctxt, ops, &tss_seg); | 1924 | return load_state_from_tss16(ctxt, ops, &tss_seg); |
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, | |||
2013 | struct decode_cache *c = &ctxt->decode; | 1958 | struct decode_cache *c = &ctxt->decode; |
2014 | int ret; | 1959 | int ret; |
2015 | 1960 | ||
2016 | if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { | 1961 | if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) |
2017 | emulate_gp(ctxt, 0); | 1962 | return emulate_gp(ctxt, 0); |
2018 | return X86EMUL_PROPAGATE_FAULT; | ||
2019 | } | ||
2020 | c->eip = tss->eip; | 1963 | c->eip = tss->eip; |
2021 | ctxt->eflags = tss->eflags | 2; | 1964 | ctxt->eflags = tss->eflags | 2; |
2022 | c->regs[VCPU_REGS_RAX] = tss->eax; | 1965 | c->regs[VCPU_REGS_RAX] = tss->eax; |
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2076 | { | 2019 | { |
2077 | struct tss_segment_32 tss_seg; | 2020 | struct tss_segment_32 tss_seg; |
2078 | int ret; | 2021 | int ret; |
2079 | u32 err, new_tss_base = get_desc_base(new_desc); | 2022 | u32 new_tss_base = get_desc_base(new_desc); |
2080 | 2023 | ||
2081 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2024 | ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
2082 | &err); | 2025 | &ctxt->exception); |
2083 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2026 | if (ret != X86EMUL_CONTINUE) |
2084 | /* FIXME: need to provide precise fault address */ | 2027 | /* FIXME: need to provide precise fault address */ |
2085 | emulate_pf(ctxt); | ||
2086 | return ret; | 2028 | return ret; |
2087 | } | ||
2088 | 2029 | ||
2089 | save_state_to_tss32(ctxt, ops, &tss_seg); | 2030 | save_state_to_tss32(ctxt, ops, &tss_seg); |
2090 | 2031 | ||
2091 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2032 | ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
2092 | &err); | 2033 | &ctxt->exception); |
2093 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2034 | if (ret != X86EMUL_CONTINUE) |
2094 | /* FIXME: need to provide precise fault address */ | 2035 | /* FIXME: need to provide precise fault address */ |
2095 | emulate_pf(ctxt); | ||
2096 | return ret; | 2036 | return ret; |
2097 | } | ||
2098 | 2037 | ||
2099 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, | 2038 | ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, |
2100 | &err); | 2039 | &ctxt->exception); |
2101 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2040 | if (ret != X86EMUL_CONTINUE) |
2102 | /* FIXME: need to provide precise fault address */ | 2041 | /* FIXME: need to provide precise fault address */ |
2103 | emulate_pf(ctxt); | ||
2104 | return ret; | 2042 | return ret; |
2105 | } | ||
2106 | 2043 | ||
2107 | if (old_tss_sel != 0xffff) { | 2044 | if (old_tss_sel != 0xffff) { |
2108 | tss_seg.prev_task_link = old_tss_sel; | 2045 | tss_seg.prev_task_link = old_tss_sel; |
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, | |||
2110 | ret = ops->write_std(new_tss_base, | 2047 | ret = ops->write_std(new_tss_base, |
2111 | &tss_seg.prev_task_link, | 2048 | &tss_seg.prev_task_link, |
2112 | sizeof tss_seg.prev_task_link, | 2049 | sizeof tss_seg.prev_task_link, |
2113 | ctxt->vcpu, &err); | 2050 | ctxt->vcpu, &ctxt->exception); |
2114 | if (ret == X86EMUL_PROPAGATE_FAULT) { | 2051 | if (ret != X86EMUL_CONTINUE) |
2115 | /* FIXME: need to provide precise fault address */ | 2052 | /* FIXME: need to provide precise fault address */ |
2116 | emulate_pf(ctxt); | ||
2117 | return ret; | 2053 | return ret; |
2118 | } | ||
2119 | } | 2054 | } |
2120 | 2055 | ||
2121 | return load_state_from_tss32(ctxt, ops, &tss_seg); | 2056 | return load_state_from_tss32(ctxt, ops, &tss_seg); |
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2146 | 2081 | ||
2147 | if (reason != TASK_SWITCH_IRET) { | 2082 | if (reason != TASK_SWITCH_IRET) { |
2148 | if ((tss_selector & 3) > next_tss_desc.dpl || | 2083 | if ((tss_selector & 3) > next_tss_desc.dpl || |
2149 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { | 2084 | ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) |
2150 | emulate_gp(ctxt, 0); | 2085 | return emulate_gp(ctxt, 0); |
2151 | return X86EMUL_PROPAGATE_FAULT; | ||
2152 | } | ||
2153 | } | 2086 | } |
2154 | 2087 | ||
2155 | desc_limit = desc_limit_scaled(&next_tss_desc); | 2088 | desc_limit = desc_limit_scaled(&next_tss_desc); |
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, | |||
2231 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | 2164 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; |
2232 | } | 2165 | } |
2233 | 2166 | ||
2234 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, | 2167 | static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, |
2235 | int reg, struct operand *op) | 2168 | int reg, struct operand *op) |
2236 | { | 2169 | { |
2237 | struct decode_cache *c = &ctxt->decode; | 2170 | struct decode_cache *c = &ctxt->decode; |
2238 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; | 2171 | int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; |
2239 | 2172 | ||
2240 | register_address_increment(c, &c->regs[reg], df * op->bytes); | 2173 | register_address_increment(c, &c->regs[reg], df * op->bytes); |
2241 | op->addr.mem = register_address(c, base, c->regs[reg]); | 2174 | op->addr.mem.ea = register_address(c, c->regs[reg]); |
2175 | op->addr.mem.seg = seg; | ||
2242 | } | 2176 | } |
2243 | 2177 | ||
2244 | static int em_push(struct x86_emulate_ctxt *ctxt) | 2178 | static int em_push(struct x86_emulate_ctxt *ctxt) |
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) | |||
2369 | struct decode_cache *c = &ctxt->decode; | 2303 | struct decode_cache *c = &ctxt->decode; |
2370 | u64 tsc = 0; | 2304 | u64 tsc = 0; |
2371 | 2305 | ||
2372 | if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { | 2306 | if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) |
2373 | emulate_gp(ctxt, 0); | 2307 | return emulate_gp(ctxt, 0); |
2374 | return X86EMUL_PROPAGATE_FAULT; | ||
2375 | } | ||
2376 | ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); | 2308 | ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); |
2377 | c->regs[VCPU_REGS_RAX] = (u32)tsc; | 2309 | c->regs[VCPU_REGS_RAX] = (u32)tsc; |
2378 | c->regs[VCPU_REGS_RDX] = tsc >> 32; | 2310 | c->regs[VCPU_REGS_RDX] = tsc >> 32; |
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, | |||
2647 | 2579 | ||
2648 | op->type = OP_IMM; | 2580 | op->type = OP_IMM; |
2649 | op->bytes = size; | 2581 | op->bytes = size; |
2650 | op->addr.mem = c->eip; | 2582 | op->addr.mem.ea = c->eip; |
2651 | /* NB. Immediates are sign-extended as necessary. */ | 2583 | /* NB. Immediates are sign-extended as necessary. */ |
2652 | switch (op->bytes) { | 2584 | switch (op->bytes) { |
2653 | case 1: | 2585 | case 1: |
@@ -2678,7 +2610,7 @@ done: | |||
2678 | } | 2610 | } |
2679 | 2611 | ||
2680 | int | 2612 | int |
2681 | x86_decode_insn(struct x86_emulate_ctxt *ctxt) | 2613 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) |
2682 | { | 2614 | { |
2683 | struct x86_emulate_ops *ops = ctxt->ops; | 2615 | struct x86_emulate_ops *ops = ctxt->ops; |
2684 | struct decode_cache *c = &ctxt->decode; | 2616 | struct decode_cache *c = &ctxt->decode; |
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) | |||
2689 | struct operand memop = { .type = OP_NONE }; | 2621 | struct operand memop = { .type = OP_NONE }; |
2690 | 2622 | ||
2691 | c->eip = ctxt->eip; | 2623 | c->eip = ctxt->eip; |
2692 | c->fetch.start = c->fetch.end = c->eip; | 2624 | c->fetch.start = c->eip; |
2625 | c->fetch.end = c->fetch.start + insn_len; | ||
2626 | if (insn_len > 0) | ||
2627 | memcpy(c->fetch.data, insn, insn_len); | ||
2693 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); | 2628 | ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); |
2694 | 2629 | ||
2695 | switch (mode) { | 2630 | switch (mode) { |
@@ -2803,10 +2738,8 @@ done_prefixes: | |||
2803 | c->execute = opcode.u.execute; | 2738 | c->execute = opcode.u.execute; |
2804 | 2739 | ||
2805 | /* Unrecognised? */ | 2740 | /* Unrecognised? */ |
2806 | if (c->d == 0 || (c->d & Undefined)) { | 2741 | if (c->d == 0 || (c->d & Undefined)) |
2807 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
2808 | return -1; | 2742 | return -1; |
2809 | } | ||
2810 | 2743 | ||
2811 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | 2744 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) |
2812 | c->op_bytes = 8; | 2745 | c->op_bytes = 8; |
@@ -2831,14 +2764,13 @@ done_prefixes: | |||
2831 | if (!c->has_seg_override) | 2764 | if (!c->has_seg_override) |
2832 | set_seg_override(c, VCPU_SREG_DS); | 2765 | set_seg_override(c, VCPU_SREG_DS); |
2833 | 2766 | ||
2834 | if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) | 2767 | memop.addr.mem.seg = seg_override(ctxt, ops, c); |
2835 | memop.addr.mem += seg_override_base(ctxt, ops, c); | ||
2836 | 2768 | ||
2837 | if (memop.type == OP_MEM && c->ad_bytes != 8) | 2769 | if (memop.type == OP_MEM && c->ad_bytes != 8) |
2838 | memop.addr.mem = (u32)memop.addr.mem; | 2770 | memop.addr.mem.ea = (u32)memop.addr.mem.ea; |
2839 | 2771 | ||
2840 | if (memop.type == OP_MEM && c->rip_relative) | 2772 | if (memop.type == OP_MEM && c->rip_relative) |
2841 | memop.addr.mem += c->eip; | 2773 | memop.addr.mem.ea += c->eip; |
2842 | 2774 | ||
2843 | /* | 2775 | /* |
2844 | * Decode and fetch the source operand: register, memory | 2776 | * Decode and fetch the source operand: register, memory |
@@ -2890,14 +2822,14 @@ done_prefixes: | |||
2890 | case SrcSI: | 2822 | case SrcSI: |
2891 | c->src.type = OP_MEM; | 2823 | c->src.type = OP_MEM; |
2892 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 2824 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
2893 | c->src.addr.mem = | 2825 | c->src.addr.mem.ea = |
2894 | register_address(c, seg_override_base(ctxt, ops, c), | 2826 | register_address(c, c->regs[VCPU_REGS_RSI]); |
2895 | c->regs[VCPU_REGS_RSI]); | 2827 | c->src.addr.mem.seg = seg_override(ctxt, ops, c); |
2896 | c->src.val = 0; | 2828 | c->src.val = 0; |
2897 | break; | 2829 | break; |
2898 | case SrcImmFAddr: | 2830 | case SrcImmFAddr: |
2899 | c->src.type = OP_IMM; | 2831 | c->src.type = OP_IMM; |
2900 | c->src.addr.mem = c->eip; | 2832 | c->src.addr.mem.ea = c->eip; |
2901 | c->src.bytes = c->op_bytes + 2; | 2833 | c->src.bytes = c->op_bytes + 2; |
2902 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); | 2834 | insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); |
2903 | break; | 2835 | break; |
@@ -2944,7 +2876,7 @@ done_prefixes: | |||
2944 | break; | 2876 | break; |
2945 | case DstImmUByte: | 2877 | case DstImmUByte: |
2946 | c->dst.type = OP_IMM; | 2878 | c->dst.type = OP_IMM; |
2947 | c->dst.addr.mem = c->eip; | 2879 | c->dst.addr.mem.ea = c->eip; |
2948 | c->dst.bytes = 1; | 2880 | c->dst.bytes = 1; |
2949 | c->dst.val = insn_fetch(u8, 1, c->eip); | 2881 | c->dst.val = insn_fetch(u8, 1, c->eip); |
2950 | break; | 2882 | break; |
@@ -2969,9 +2901,9 @@ done_prefixes: | |||
2969 | case DstDI: | 2901 | case DstDI: |
2970 | c->dst.type = OP_MEM; | 2902 | c->dst.type = OP_MEM; |
2971 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 2903 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
2972 | c->dst.addr.mem = | 2904 | c->dst.addr.mem.ea = |
2973 | register_address(c, es_base(ctxt, ops), | 2905 | register_address(c, c->regs[VCPU_REGS_RDI]); |
2974 | c->regs[VCPU_REGS_RDI]); | 2906 | c->dst.addr.mem.seg = VCPU_SREG_ES; |
2975 | c->dst.val = 0; | 2907 | c->dst.val = 0; |
2976 | break; | 2908 | break; |
2977 | case ImplicitOps: | 2909 | case ImplicitOps: |
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3020 | ctxt->decode.mem_read.pos = 0; | 2952 | ctxt->decode.mem_read.pos = 0; |
3021 | 2953 | ||
3022 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { | 2954 | if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { |
3023 | emulate_ud(ctxt); | 2955 | rc = emulate_ud(ctxt); |
3024 | goto done; | 2956 | goto done; |
3025 | } | 2957 | } |
3026 | 2958 | ||
3027 | /* LOCK prefix is allowed only with some instructions */ | 2959 | /* LOCK prefix is allowed only with some instructions */ |
3028 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { | 2960 | if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { |
3029 | emulate_ud(ctxt); | 2961 | rc = emulate_ud(ctxt); |
3030 | goto done; | 2962 | goto done; |
3031 | } | 2963 | } |
3032 | 2964 | ||
3033 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { | 2965 | if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { |
3034 | emulate_ud(ctxt); | 2966 | rc = emulate_ud(ctxt); |
3035 | goto done; | 2967 | goto done; |
3036 | } | 2968 | } |
3037 | 2969 | ||
3038 | /* Privileged instruction can be executed only in CPL=0 */ | 2970 | /* Privileged instruction can be executed only in CPL=0 */ |
3039 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { | 2971 | if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { |
3040 | emulate_gp(ctxt, 0); | 2972 | rc = emulate_gp(ctxt, 0); |
3041 | goto done; | 2973 | goto done; |
3042 | } | 2974 | } |
3043 | 2975 | ||
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3050 | } | 2982 | } |
3051 | 2983 | ||
3052 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { | 2984 | if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { |
3053 | rc = read_emulated(ctxt, ops, c->src.addr.mem, | 2985 | rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), |
3054 | c->src.valptr, c->src.bytes); | 2986 | c->src.valptr, c->src.bytes); |
3055 | if (rc != X86EMUL_CONTINUE) | 2987 | if (rc != X86EMUL_CONTINUE) |
3056 | goto done; | 2988 | goto done; |
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3058 | } | 2990 | } |
3059 | 2991 | ||
3060 | if (c->src2.type == OP_MEM) { | 2992 | if (c->src2.type == OP_MEM) { |
3061 | rc = read_emulated(ctxt, ops, c->src2.addr.mem, | 2993 | rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), |
3062 | &c->src2.val, c->src2.bytes); | 2994 | &c->src2.val, c->src2.bytes); |
3063 | if (rc != X86EMUL_CONTINUE) | 2995 | if (rc != X86EMUL_CONTINUE) |
3064 | goto done; | 2996 | goto done; |
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) | |||
3070 | 3002 | ||
3071 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { | 3003 | if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { |
3072 | /* optimisation - avoid slow emulated read if Mov */ | 3004 | /* optimisation - avoid slow emulated read if Mov */ |
3073 | rc = read_emulated(ctxt, ops, c->dst.addr.mem, | 3005 | rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), |
3074 | &c->dst.val, c->dst.bytes); | 3006 | &c->dst.val, c->dst.bytes); |
3075 | if (rc != X86EMUL_CONTINUE) | 3007 | if (rc != X86EMUL_CONTINUE) |
3076 | goto done; | 3008 | goto done; |
@@ -3215,13 +3147,13 @@ special_insn: | |||
3215 | break; | 3147 | break; |
3216 | case 0x8c: /* mov r/m, sreg */ | 3148 | case 0x8c: /* mov r/m, sreg */ |
3217 | if (c->modrm_reg > VCPU_SREG_GS) { | 3149 | if (c->modrm_reg > VCPU_SREG_GS) { |
3218 | emulate_ud(ctxt); | 3150 | rc = emulate_ud(ctxt); |
3219 | goto done; | 3151 | goto done; |
3220 | } | 3152 | } |
3221 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); | 3153 | c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); |
3222 | break; | 3154 | break; |
3223 | case 0x8d: /* lea r16/r32, m */ | 3155 | case 0x8d: /* lea r16/r32, m */ |
3224 | c->dst.val = c->src.addr.mem; | 3156 | c->dst.val = c->src.addr.mem.ea; |
3225 | break; | 3157 | break; |
3226 | case 0x8e: { /* mov seg, r/m16 */ | 3158 | case 0x8e: { /* mov seg, r/m16 */ |
3227 | uint16_t sel; | 3159 | uint16_t sel; |
@@ -3230,7 +3162,7 @@ special_insn: | |||
3230 | 3162 | ||
3231 | if (c->modrm_reg == VCPU_SREG_CS || | 3163 | if (c->modrm_reg == VCPU_SREG_CS || |
3232 | c->modrm_reg > VCPU_SREG_GS) { | 3164 | c->modrm_reg > VCPU_SREG_GS) { |
3233 | emulate_ud(ctxt); | 3165 | rc = emulate_ud(ctxt); |
3234 | goto done; | 3166 | goto done; |
3235 | } | 3167 | } |
3236 | 3168 | ||
@@ -3268,7 +3200,6 @@ special_insn: | |||
3268 | break; | 3200 | break; |
3269 | case 0xa6 ... 0xa7: /* cmps */ | 3201 | case 0xa6 ... 0xa7: /* cmps */ |
3270 | c->dst.type = OP_NONE; /* Disable writeback. */ | 3202 | c->dst.type = OP_NONE; /* Disable writeback. */ |
3271 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); | ||
3272 | goto cmp; | 3203 | goto cmp; |
3273 | case 0xa8 ... 0xa9: /* test ax, imm */ | 3204 | case 0xa8 ... 0xa9: /* test ax, imm */ |
3274 | goto test; | 3205 | goto test; |
@@ -3363,7 +3294,7 @@ special_insn: | |||
3363 | do_io_in: | 3294 | do_io_in: |
3364 | c->dst.bytes = min(c->dst.bytes, 4u); | 3295 | c->dst.bytes = min(c->dst.bytes, 4u); |
3365 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { | 3296 | if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { |
3366 | emulate_gp(ctxt, 0); | 3297 | rc = emulate_gp(ctxt, 0); |
3367 | goto done; | 3298 | goto done; |
3368 | } | 3299 | } |
3369 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, | 3300 | if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, |
@@ -3377,7 +3308,7 @@ special_insn: | |||
3377 | c->src.bytes = min(c->src.bytes, 4u); | 3308 | c->src.bytes = min(c->src.bytes, 4u); |
3378 | if (!emulator_io_permited(ctxt, ops, c->dst.val, | 3309 | if (!emulator_io_permited(ctxt, ops, c->dst.val, |
3379 | c->src.bytes)) { | 3310 | c->src.bytes)) { |
3380 | emulate_gp(ctxt, 0); | 3311 | rc = emulate_gp(ctxt, 0); |
3381 | goto done; | 3312 | goto done; |
3382 | } | 3313 | } |
3383 | ops->pio_out_emulated(c->src.bytes, c->dst.val, | 3314 | ops->pio_out_emulated(c->src.bytes, c->dst.val, |
@@ -3402,14 +3333,14 @@ special_insn: | |||
3402 | break; | 3333 | break; |
3403 | case 0xfa: /* cli */ | 3334 | case 0xfa: /* cli */ |
3404 | if (emulator_bad_iopl(ctxt, ops)) { | 3335 | if (emulator_bad_iopl(ctxt, ops)) { |
3405 | emulate_gp(ctxt, 0); | 3336 | rc = emulate_gp(ctxt, 0); |
3406 | goto done; | 3337 | goto done; |
3407 | } else | 3338 | } else |
3408 | ctxt->eflags &= ~X86_EFLAGS_IF; | 3339 | ctxt->eflags &= ~X86_EFLAGS_IF; |
3409 | break; | 3340 | break; |
3410 | case 0xfb: /* sti */ | 3341 | case 0xfb: /* sti */ |
3411 | if (emulator_bad_iopl(ctxt, ops)) { | 3342 | if (emulator_bad_iopl(ctxt, ops)) { |
3412 | emulate_gp(ctxt, 0); | 3343 | rc = emulate_gp(ctxt, 0); |
3413 | goto done; | 3344 | goto done; |
3414 | } else { | 3345 | } else { |
3415 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; | 3346 | ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; |
@@ -3449,11 +3380,11 @@ writeback: | |||
3449 | c->dst.type = saved_dst_type; | 3380 | c->dst.type = saved_dst_type; |
3450 | 3381 | ||
3451 | if ((c->d & SrcMask) == SrcSI) | 3382 | if ((c->d & SrcMask) == SrcSI) |
3452 | string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), | 3383 | string_addr_inc(ctxt, seg_override(ctxt, ops, c), |
3453 | VCPU_REGS_RSI, &c->src); | 3384 | VCPU_REGS_RSI, &c->src); |
3454 | 3385 | ||
3455 | if ((c->d & DstMask) == DstDI) | 3386 | if ((c->d & DstMask) == DstDI) |
3456 | string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, | 3387 | string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, |
3457 | &c->dst); | 3388 | &c->dst); |
3458 | 3389 | ||
3459 | if (c->rep_prefix && (c->d & String)) { | 3390 | if (c->rep_prefix && (c->d & String)) { |
@@ -3482,6 +3413,8 @@ writeback: | |||
3482 | ctxt->eip = c->eip; | 3413 | ctxt->eip = c->eip; |
3483 | 3414 | ||
3484 | done: | 3415 | done: |
3416 | if (rc == X86EMUL_PROPAGATE_FAULT) | ||
3417 | ctxt->have_exception = true; | ||
3485 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; | 3418 | return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; |
3486 | 3419 | ||
3487 | twobyte_insn: | 3420 | twobyte_insn: |
@@ -3544,9 +3477,11 @@ twobyte_insn: | |||
3544 | break; | 3477 | break; |
3545 | case 5: /* not defined */ | 3478 | case 5: /* not defined */ |
3546 | emulate_ud(ctxt); | 3479 | emulate_ud(ctxt); |
3480 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3547 | goto done; | 3481 | goto done; |
3548 | case 7: /* invlpg*/ | 3482 | case 7: /* invlpg*/ |
3549 | emulate_invlpg(ctxt->vcpu, c->src.addr.mem); | 3483 | emulate_invlpg(ctxt->vcpu, |
3484 | linear(ctxt, c->src.addr.mem)); | ||
3550 | /* Disable writeback. */ | 3485 | /* Disable writeback. */ |
3551 | c->dst.type = OP_NONE; | 3486 | c->dst.type = OP_NONE; |
3552 | break; | 3487 | break; |
@@ -3573,6 +3508,7 @@ twobyte_insn: | |||
3573 | case 5 ... 7: | 3508 | case 5 ... 7: |
3574 | case 9 ... 15: | 3509 | case 9 ... 15: |
3575 | emulate_ud(ctxt); | 3510 | emulate_ud(ctxt); |
3511 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3576 | goto done; | 3512 | goto done; |
3577 | } | 3513 | } |
3578 | c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); | 3514 | c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); |
@@ -3581,6 +3517,7 @@ twobyte_insn: | |||
3581 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3517 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
3582 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | 3518 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { |
3583 | emulate_ud(ctxt); | 3519 | emulate_ud(ctxt); |
3520 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3584 | goto done; | 3521 | goto done; |
3585 | } | 3522 | } |
3586 | ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); | 3523 | ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); |
@@ -3588,6 +3525,7 @@ twobyte_insn: | |||
3588 | case 0x22: /* mov reg, cr */ | 3525 | case 0x22: /* mov reg, cr */ |
3589 | if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { | 3526 | if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { |
3590 | emulate_gp(ctxt, 0); | 3527 | emulate_gp(ctxt, 0); |
3528 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3591 | goto done; | 3529 | goto done; |
3592 | } | 3530 | } |
3593 | c->dst.type = OP_NONE; | 3531 | c->dst.type = OP_NONE; |
@@ -3596,6 +3534,7 @@ twobyte_insn: | |||
3596 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && | 3534 | if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && |
3597 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { | 3535 | (c->modrm_reg == 4 || c->modrm_reg == 5)) { |
3598 | emulate_ud(ctxt); | 3536 | emulate_ud(ctxt); |
3537 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3599 | goto done; | 3538 | goto done; |
3600 | } | 3539 | } |
3601 | 3540 | ||
@@ -3604,6 +3543,7 @@ twobyte_insn: | |||
3604 | ~0ULL : ~0U), ctxt->vcpu) < 0) { | 3543 | ~0ULL : ~0U), ctxt->vcpu) < 0) { |
3605 | /* #UD condition is already handled by the code above */ | 3544 | /* #UD condition is already handled by the code above */ |
3606 | emulate_gp(ctxt, 0); | 3545 | emulate_gp(ctxt, 0); |
3546 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3607 | goto done; | 3547 | goto done; |
3608 | } | 3548 | } |
3609 | 3549 | ||
@@ -3615,6 +3555,7 @@ twobyte_insn: | |||
3615 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | 3555 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); |
3616 | if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { | 3556 | if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { |
3617 | emulate_gp(ctxt, 0); | 3557 | emulate_gp(ctxt, 0); |
3558 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3618 | goto done; | 3559 | goto done; |
3619 | } | 3560 | } |
3620 | rc = X86EMUL_CONTINUE; | 3561 | rc = X86EMUL_CONTINUE; |
@@ -3623,6 +3564,7 @@ twobyte_insn: | |||
3623 | /* rdmsr */ | 3564 | /* rdmsr */ |
3624 | if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { | 3565 | if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { |
3625 | emulate_gp(ctxt, 0); | 3566 | emulate_gp(ctxt, 0); |
3567 | rc = X86EMUL_PROPAGATE_FAULT; | ||
3626 | goto done; | 3568 | goto done; |
3627 | } else { | 3569 | } else { |
3628 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 3570 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
@@ -3785,6 +3727,5 @@ twobyte_insn: | |||
3785 | goto writeback; | 3727 | goto writeback; |
3786 | 3728 | ||
3787 | cannot_emulate: | 3729 | cannot_emulate: |
3788 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
3789 | return -1; | 3730 | return -1; |
3790 | } | 3731 | } |
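The emulate.c rework above threads two ideas through the whole file: memory operands now carry a struct segmented_address (effective address plus segment) that is flattened only at access time by the new linear() helper, and faults are reported by filling ctxt->exception and returning X86EMUL_PROPAGATE_FAULT from the emulate_*() helpers instead of the old emulate_pf()/DPRINTF pattern. The struct itself is defined in asm/kvm_emulate.h, which is outside this section; its presumed shape and a typical use, mirroring the emulate_pop() locals shown above, look like this (illustration only):

/* presumed definition from asm/kvm_emulate.h (not shown in this diff) */
struct segmented_address {
	unsigned long ea;	/* effective, intra-segment address */
	unsigned int seg;	/* VCPU_SREG_* index, applied by linear() */
};

/* typical use, as in emulate_pop() above: build the segmented address,
 * then flatten it only when the memory access is actually issued */
struct segmented_address addr = {
	.ea  = register_address(c, c->regs[VCPU_REGS_RSP]),
	.seg = VCPU_SREG_SS,
};
rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);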
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index f628234fbeca..3cece05e4ac4 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
575 | s->pics[1].elcr_mask = 0xde; | 575 | s->pics[1].elcr_mask = 0xde; |
576 | s->pics[0].pics_state = s; | 576 | s->pics[0].pics_state = s; |
577 | s->pics[1].pics_state = s; | 577 | s->pics[1].pics_state = s; |
578 | s->pics[0].isr_ack = 0xff; | ||
579 | s->pics[1].isr_ack = 0xff; | ||
578 | 580 | ||
579 | /* | 581 | /* |
580 | * Initialize PIO device | 582 | * Initialize PIO device |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 975bb45329a1..3377d53fcd36 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) | |||
73 | return vcpu->arch.cr4 & mask; | 73 | return vcpu->arch.cr4 & mask; |
74 | } | 74 | } |
75 | 75 | ||
76 | static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) | ||
77 | { | ||
78 | if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) | ||
79 | kvm_x86_ops->decache_cr3(vcpu); | ||
80 | return vcpu->arch.cr3; | ||
81 | } | ||
82 | |||
76 | static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) | 83 | static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) |
77 | { | 84 | { |
78 | return kvm_read_cr4_bits(vcpu, ~0UL); | 85 | return kvm_read_cr4_bits(vcpu, ~0UL); |
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | |||
84 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); | 91 | | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); |
85 | } | 92 | } |
86 | 93 | ||
94 | static inline void enter_guest_mode(struct kvm_vcpu *vcpu) | ||
95 | { | ||
96 | vcpu->arch.hflags |= HF_GUEST_MASK; | ||
97 | } | ||
98 | |||
99 | static inline void leave_guest_mode(struct kvm_vcpu *vcpu) | ||
100 | { | ||
101 | vcpu->arch.hflags &= ~HF_GUEST_MASK; | ||
102 | } | ||
103 | |||
104 | static inline bool is_guest_mode(struct kvm_vcpu *vcpu) | ||
105 | { | ||
106 | return vcpu->arch.hflags & HF_GUEST_MASK; | ||
107 | } | ||
108 | |||
87 | #endif | 109 | #endif |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 413f8973a855..93cf9d0d3653 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) | |||
277 | 277 | ||
278 | if (old_ppr != ppr) { | 278 | if (old_ppr != ppr) { |
279 | apic_set_reg(apic, APIC_PROCPRI, ppr); | 279 | apic_set_reg(apic, APIC_PROCPRI, ppr); |
280 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | 280 | if (ppr < old_ppr) |
281 | kvm_make_request(KVM_REQ_EVENT, apic->vcpu); | ||
281 | } | 282 | } |
282 | } | 283 | } |
283 | 284 | ||
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fb8b376bf28c..f02b8edc3d44 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -18,9 +18,11 @@ | |||
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "irq.h" | ||
21 | #include "mmu.h" | 22 | #include "mmu.h" |
22 | #include "x86.h" | 23 | #include "x86.h" |
23 | #include "kvm_cache_regs.h" | 24 | #include "kvm_cache_regs.h" |
25 | #include "x86.h" | ||
24 | 26 | ||
25 | #include <linux/kvm_host.h> | 27 | #include <linux/kvm_host.h> |
26 | #include <linux/types.h> | 28 | #include <linux/types.h> |
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages; | |||
194 | 196 | ||
195 | static u64 __read_mostly shadow_trap_nonpresent_pte; | 197 | static u64 __read_mostly shadow_trap_nonpresent_pte; |
196 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | 198 | static u64 __read_mostly shadow_notrap_nonpresent_pte; |
197 | static u64 __read_mostly shadow_base_present_pte; | ||
198 | static u64 __read_mostly shadow_nx_mask; | 199 | static u64 __read_mostly shadow_nx_mask; |
199 | static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ | 200 | static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ |
200 | static u64 __read_mostly shadow_user_mask; | 201 | static u64 __read_mostly shadow_user_mask; |
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | |||
213 | } | 214 | } |
214 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | 215 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); |
215 | 216 | ||
216 | void kvm_mmu_set_base_ptes(u64 base_pte) | ||
217 | { | ||
218 | shadow_base_present_pte = base_pte; | ||
219 | } | ||
220 | EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); | ||
221 | |||
222 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, | 217 | void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, |
223 | u64 dirty_mask, u64 nx_mask, u64 x_mask) | 218 | u64 dirty_mask, u64 nx_mask, u64 x_mask) |
224 | { | 219 | { |
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) | |||
482 | } | 477 | } |
483 | 478 | ||
484 | /* | 479 | /* |
485 | * Return the pointer to the largepage write count for a given | 480 | * Return the pointer to the large page information for a given gfn, |
486 | * gfn, handling slots that are not large page aligned. | 481 | * handling slots that are not large page aligned. |
487 | */ | 482 | */ |
488 | static int *slot_largepage_idx(gfn_t gfn, | 483 | static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, |
489 | struct kvm_memory_slot *slot, | 484 | struct kvm_memory_slot *slot, |
490 | int level) | 485 | int level) |
491 | { | 486 | { |
492 | unsigned long idx; | 487 | unsigned long idx; |
493 | 488 | ||
494 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 489 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - |
495 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | 490 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); |
496 | return &slot->lpage_info[level - 2][idx].write_count; | 491 | return &slot->lpage_info[level - 2][idx]; |
497 | } | 492 | } |
498 | 493 | ||
499 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | 494 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) |
500 | { | 495 | { |
501 | struct kvm_memory_slot *slot; | 496 | struct kvm_memory_slot *slot; |
502 | int *write_count; | 497 | struct kvm_lpage_info *linfo; |
503 | int i; | 498 | int i; |
504 | 499 | ||
505 | slot = gfn_to_memslot(kvm, gfn); | 500 | slot = gfn_to_memslot(kvm, gfn); |
506 | for (i = PT_DIRECTORY_LEVEL; | 501 | for (i = PT_DIRECTORY_LEVEL; |
507 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 502 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
508 | write_count = slot_largepage_idx(gfn, slot, i); | 503 | linfo = lpage_info_slot(gfn, slot, i); |
509 | *write_count += 1; | 504 | linfo->write_count += 1; |
510 | } | 505 | } |
511 | } | 506 | } |
512 | 507 | ||
513 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 508 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
514 | { | 509 | { |
515 | struct kvm_memory_slot *slot; | 510 | struct kvm_memory_slot *slot; |
516 | int *write_count; | 511 | struct kvm_lpage_info *linfo; |
517 | int i; | 512 | int i; |
518 | 513 | ||
519 | slot = gfn_to_memslot(kvm, gfn); | 514 | slot = gfn_to_memslot(kvm, gfn); |
520 | for (i = PT_DIRECTORY_LEVEL; | 515 | for (i = PT_DIRECTORY_LEVEL; |
521 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | 516 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
522 | write_count = slot_largepage_idx(gfn, slot, i); | 517 | linfo = lpage_info_slot(gfn, slot, i); |
523 | *write_count -= 1; | 518 | linfo->write_count -= 1; |
524 | WARN_ON(*write_count < 0); | 519 | WARN_ON(linfo->write_count < 0); |
525 | } | 520 | } |
526 | } | 521 | } |
527 | 522 | ||
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm, | |||
530 | int level) | 525 | int level) |
531 | { | 526 | { |
532 | struct kvm_memory_slot *slot; | 527 | struct kvm_memory_slot *slot; |
533 | int *largepage_idx; | 528 | struct kvm_lpage_info *linfo; |
534 | 529 | ||
535 | slot = gfn_to_memslot(kvm, gfn); | 530 | slot = gfn_to_memslot(kvm, gfn); |
536 | if (slot) { | 531 | if (slot) { |
537 | largepage_idx = slot_largepage_idx(gfn, slot, level); | 532 | linfo = lpage_info_slot(gfn, slot, level); |
538 | return *largepage_idx; | 533 | return linfo->write_count; |
539 | } | 534 | } |
540 | 535 | ||
541 | return 1; | 536 | return 1; |
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) | |||
559 | return ret; | 554 | return ret; |
560 | } | 555 | } |
561 | 556 | ||
562 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 557 | static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) |
563 | { | 558 | { |
564 | struct kvm_memory_slot *slot; | 559 | struct kvm_memory_slot *slot; |
565 | int host_level, level, max_level; | ||
566 | |||
567 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); | 560 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); |
568 | if (slot && slot->dirty_bitmap) | 561 | if (slot && slot->dirty_bitmap) |
569 | return PT_PAGE_TABLE_LEVEL; | 562 | return true; |
563 | return false; | ||
564 | } | ||
565 | |||
566 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | ||
567 | { | ||
568 | int host_level, level, max_level; | ||
570 | 569 | ||
571 | host_level = host_mapping_level(vcpu->kvm, large_gfn); | 570 | host_level = host_mapping_level(vcpu->kvm, large_gfn); |
572 | 571 | ||
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
590 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) | 589 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
591 | { | 590 | { |
592 | struct kvm_memory_slot *slot; | 591 | struct kvm_memory_slot *slot; |
593 | unsigned long idx; | 592 | struct kvm_lpage_info *linfo; |
594 | 593 | ||
595 | slot = gfn_to_memslot(kvm, gfn); | 594 | slot = gfn_to_memslot(kvm, gfn); |
596 | if (likely(level == PT_PAGE_TABLE_LEVEL)) | 595 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
597 | return &slot->rmap[gfn - slot->base_gfn]; | 596 | return &slot->rmap[gfn - slot->base_gfn]; |
598 | 597 | ||
599 | idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - | 598 | linfo = lpage_info_slot(gfn, slot, level); |
600 | (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); | ||
601 | 599 | ||
602 | return &slot->lpage_info[level - 2][idx].rmap_pde; | 600 | return &linfo->rmap_pde; |
603 | } | 601 | } |
604 | 602 | ||
605 | /* | 603 | /* |
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
887 | end = start + (memslot->npages << PAGE_SHIFT); | 885 | end = start + (memslot->npages << PAGE_SHIFT); |
888 | if (hva >= start && hva < end) { | 886 | if (hva >= start && hva < end) { |
889 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 887 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; |
888 | gfn_t gfn = memslot->base_gfn + gfn_offset; | ||
890 | 889 | ||
891 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); | 890 | ret = handler(kvm, &memslot->rmap[gfn_offset], data); |
892 | 891 | ||
893 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { | 892 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
894 | unsigned long idx; | 893 | struct kvm_lpage_info *linfo; |
895 | int sh; | 894 | |
896 | 895 | linfo = lpage_info_slot(gfn, memslot, | |
897 | sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); | 896 | PT_DIRECTORY_LEVEL + j); |
898 | idx = ((memslot->base_gfn+gfn_offset) >> sh) - | 897 | ret |= handler(kvm, &linfo->rmap_pde, data); |
899 | (memslot->base_gfn >> sh); | ||
900 | ret |= handler(kvm, | ||
901 | &memslot->lpage_info[j][idx].rmap_pde, | ||
902 | data); | ||
903 | } | 898 | } |
904 | trace_kvm_age_page(hva, memslot, ret); | 899 | trace_kvm_age_page(hva, memslot, ret); |
905 | retval |= ret; | 900 | retval |= ret; |
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | |||
950 | return young; | 945 | return young; |
951 | } | 946 | } |
952 | 947 | ||
948 | static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, | ||
949 | unsigned long data) | ||
950 | { | ||
951 | u64 *spte; | ||
952 | int young = 0; | ||
953 | |||
954 | /* | ||
955 | * If there's no access bit in the secondary pte set by the | ||
956 | * hardware it's up to gup-fast/gup to set the access bit in | ||
957 | * the primary pte or in the page structure. | ||
958 | */ | ||
959 | if (!shadow_accessed_mask) | ||
960 | goto out; | ||
961 | |||
962 | spte = rmap_next(kvm, rmapp, NULL); | ||
963 | while (spte) { | ||
964 | u64 _spte = *spte; | ||
965 | BUG_ON(!(_spte & PT_PRESENT_MASK)); | ||
966 | young = _spte & PT_ACCESSED_MASK; | ||
967 | if (young) { | ||
968 | young = 1; | ||
969 | break; | ||
970 | } | ||
971 | spte = rmap_next(kvm, rmapp, spte); | ||
972 | } | ||
973 | out: | ||
974 | return young; | ||
975 | } | ||
976 | |||
953 | #define RMAP_RECYCLE_THRESHOLD 1000 | 977 | #define RMAP_RECYCLE_THRESHOLD 1000 |
954 | 978 | ||
955 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 979 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) | |||
970 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); | 994 | return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); |
971 | } | 995 | } |
972 | 996 | ||
997 | int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) | ||
998 | { | ||
999 | return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); | ||
1000 | } | ||
1001 | |||
973 | #ifdef MMU_DEBUG | 1002 | #ifdef MMU_DEBUG |
974 | static int is_empty_shadow_page(u64 *spt) | 1003 | static int is_empty_shadow_page(u64 *spt) |
975 | { | 1004 | { |
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
1161 | } | 1190 | } |
1162 | 1191 | ||
1163 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | 1192 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, |
1164 | struct kvm_mmu_page *sp, bool clear_unsync) | 1193 | struct kvm_mmu_page *sp) |
1165 | { | 1194 | { |
1166 | return 1; | 1195 | return 1; |
1167 | } | 1196 | } |
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
1291 | if (clear_unsync) | 1320 | if (clear_unsync) |
1292 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1321 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
1293 | 1322 | ||
1294 | if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { | 1323 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { |
1295 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); | 1324 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); |
1296 | return 1; | 1325 | return 1; |
1297 | } | 1326 | } |
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) | |||
1332 | continue; | 1361 | continue; |
1333 | 1362 | ||
1334 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); | 1363 | WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); |
1364 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1335 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || | 1365 | if ((s->role.cr4_pae != !!is_pae(vcpu)) || |
1336 | (vcpu->arch.mmu.sync_page(vcpu, s, true))) { | 1366 | (vcpu->arch.mmu.sync_page(vcpu, s))) { |
1337 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); | 1367 | kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); |
1338 | continue; | 1368 | continue; |
1339 | } | 1369 | } |
1340 | kvm_unlink_unsync_page(vcpu->kvm, s); | ||
1341 | flush = true; | 1370 | flush = true; |
1342 | } | 1371 | } |
1343 | 1372 | ||
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1963 | unsigned pte_access, int user_fault, | 1992 | unsigned pte_access, int user_fault, |
1964 | int write_fault, int dirty, int level, | 1993 | int write_fault, int dirty, int level, |
1965 | gfn_t gfn, pfn_t pfn, bool speculative, | 1994 | gfn_t gfn, pfn_t pfn, bool speculative, |
1966 | bool can_unsync, bool reset_host_protection) | 1995 | bool can_unsync, bool host_writable) |
1967 | { | 1996 | { |
1968 | u64 spte; | 1997 | u64 spte, entry = *sptep; |
1969 | int ret = 0; | 1998 | int ret = 0; |
1970 | 1999 | ||
1971 | /* | 2000 | /* |
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1973 | * whether the guest actually used the pte (in order to detect | 2002 | * whether the guest actually used the pte (in order to detect |
1974 | * demand paging). | 2003 | * demand paging). |
1975 | */ | 2004 | */ |
1976 | spte = shadow_base_present_pte; | 2005 | spte = PT_PRESENT_MASK; |
1977 | if (!speculative) | 2006 | if (!speculative) |
1978 | spte |= shadow_accessed_mask; | 2007 | spte |= shadow_accessed_mask; |
1979 | if (!dirty) | 2008 | if (!dirty) |
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
1990 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | 2019 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
1991 | kvm_is_mmio_pfn(pfn)); | 2020 | kvm_is_mmio_pfn(pfn)); |
1992 | 2021 | ||
1993 | if (reset_host_protection) | 2022 | if (host_writable) |
1994 | spte |= SPTE_HOST_WRITEABLE; | 2023 | spte |= SPTE_HOST_WRITEABLE; |
2024 | else | ||
2025 | pte_access &= ~ACC_WRITE_MASK; | ||
1995 | 2026 | ||
1996 | spte |= (u64)pfn << PAGE_SHIFT; | 2027 | spte |= (u64)pfn << PAGE_SHIFT; |
1997 | 2028 | ||
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2036 | 2067 | ||
2037 | set_pte: | 2068 | set_pte: |
2038 | update_spte(sptep, spte); | 2069 | update_spte(sptep, spte); |
2070 | /* | ||
2071 | * If we overwrite a writable spte with a read-only one we | ||
2072 | * should flush remote TLBs. Otherwise rmap_write_protect | ||
2073 | * will find a read-only spte, even though the writable spte | ||
2074 | * might be cached on a CPU's TLB. | ||
2075 | */ | ||
2076 | if (is_writable_pte(entry) && !is_writable_pte(*sptep)) | ||
2077 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
2039 | done: | 2078 | done: |
2040 | return ret; | 2079 | return ret; |
2041 | } | 2080 | } |
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2045 | int user_fault, int write_fault, int dirty, | 2084 | int user_fault, int write_fault, int dirty, |
2046 | int *ptwrite, int level, gfn_t gfn, | 2085 | int *ptwrite, int level, gfn_t gfn, |
2047 | pfn_t pfn, bool speculative, | 2086 | pfn_t pfn, bool speculative, |
2048 | bool reset_host_protection) | 2087 | bool host_writable) |
2049 | { | 2088 | { |
2050 | int was_rmapped = 0; | 2089 | int was_rmapped = 0; |
2051 | int rmap_count; | 2090 | int rmap_count; |
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2080 | 2119 | ||
2081 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, | 2120 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
2082 | dirty, level, gfn, pfn, speculative, true, | 2121 | dirty, level, gfn, pfn, speculative, true, |
2083 | reset_host_protection)) { | 2122 | host_writable)) { |
2084 | if (write_fault) | 2123 | if (write_fault) |
2085 | *ptwrite = 1; | 2124 | *ptwrite = 1; |
2086 | kvm_mmu_flush_tlb(vcpu); | 2125 | kvm_mmu_flush_tlb(vcpu); |
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | |||
2211 | } | 2250 | } |
2212 | 2251 | ||
2213 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 2252 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
2214 | int level, gfn_t gfn, pfn_t pfn) | 2253 | int map_writable, int level, gfn_t gfn, pfn_t pfn, |
2254 | bool prefault) | ||
2215 | { | 2255 | { |
2216 | struct kvm_shadow_walk_iterator iterator; | 2256 | struct kvm_shadow_walk_iterator iterator; |
2217 | struct kvm_mmu_page *sp; | 2257 | struct kvm_mmu_page *sp; |
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
2220 | 2260 | ||
2221 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 2261 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
2222 | if (iterator.level == level) { | 2262 | if (iterator.level == level) { |
2223 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 2263 | unsigned pte_access = ACC_ALL; |
2264 | |||
2265 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, | ||
2224 | 0, write, 1, &pt_write, | 2266 | 0, write, 1, &pt_write, |
2225 | level, gfn, pfn, false, true); | 2267 | level, gfn, pfn, prefault, map_writable); |
2226 | direct_pte_prefetch(vcpu, iterator.sptep); | 2268 | direct_pte_prefetch(vcpu, iterator.sptep); |
2227 | ++vcpu->stat.pf_fixed; | 2269 | ++vcpu->stat.pf_fixed; |
2228 | break; | 2270 | break; |
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) | |||
2277 | return 1; | 2319 | return 1; |
2278 | } | 2320 | } |
2279 | 2321 | ||
2280 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 2322 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
2323 | gfn_t *gfnp, pfn_t *pfnp, int *levelp) | ||
2324 | { | ||
2325 | pfn_t pfn = *pfnp; | ||
2326 | gfn_t gfn = *gfnp; | ||
2327 | int level = *levelp; | ||
2328 | |||
2329 | /* | ||
2330 | * Check if it's a transparent hugepage. If this would be a | ||
2331 | * hugetlbfs page, level wouldn't be set to | ||
2332 | * PT_PAGE_TABLE_LEVEL and there would be no adjustment done | ||
2333 | * here. | ||
2334 | */ | ||
2335 | if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && | ||
2336 | level == PT_PAGE_TABLE_LEVEL && | ||
2337 | PageTransCompound(pfn_to_page(pfn)) && | ||
2338 | !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { | ||
2339 | unsigned long mask; | ||
2340 | /* | ||
2341 | * mmu_notifier_retry was successful and we hold the | ||
2342 | * mmu_lock here, so the pmd can't become splitting | ||
2343 | * from under us, and in turn | ||
2344 | * __split_huge_page_refcount() can't run from under | ||
2345 | * us and we can safely transfer the refcount from | ||
2346 | * PG_tail to PG_head as we switch the pfn to tail to | ||
2347 | * head. | ||
2348 | */ | ||
2349 | *levelp = level = PT_DIRECTORY_LEVEL; | ||
2350 | mask = KVM_PAGES_PER_HPAGE(level) - 1; | ||
2351 | VM_BUG_ON((gfn & mask) != (pfn & mask)); | ||
2352 | if (pfn & mask) { | ||
2353 | gfn &= ~mask; | ||
2354 | *gfnp = gfn; | ||
2355 | kvm_release_pfn_clean(pfn); | ||
2356 | pfn &= ~mask; | ||
2357 | if (!get_page_unless_zero(pfn_to_page(pfn))) | ||
2358 | BUG(); | ||
2359 | *pfnp = pfn; | ||
2360 | } | ||
2361 | } | ||
2362 | } | ||
2363 | |||
2364 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | ||
2365 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | ||
2366 | |||
2367 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, | ||
2368 | bool prefault) | ||
2281 | { | 2369 | { |
2282 | int r; | 2370 | int r; |
2283 | int level; | 2371 | int level; |
2372 | int force_pt_level; | ||
2284 | pfn_t pfn; | 2373 | pfn_t pfn; |
2285 | unsigned long mmu_seq; | 2374 | unsigned long mmu_seq; |
2375 | bool map_writable; | ||
2286 | 2376 | ||
2287 | level = mapping_level(vcpu, gfn); | 2377 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2288 | 2378 | if (likely(!force_pt_level)) { | |
2289 | /* | 2379 | level = mapping_level(vcpu, gfn); |
2290 | * This path builds a PAE pagetable - so we can map 2mb pages at | 2380 | /* |
2291 | * maximum. Therefore check if the level is larger than that. | 2381 | * This path builds a PAE pagetable - so we can map |
2292 | */ | 2382 | * 2mb pages at maximum. Therefore check if the level |
2293 | if (level > PT_DIRECTORY_LEVEL) | 2383 | * is larger than that. |
2294 | level = PT_DIRECTORY_LEVEL; | 2384 | */ |
2385 | if (level > PT_DIRECTORY_LEVEL) | ||
2386 | level = PT_DIRECTORY_LEVEL; | ||
2295 | 2387 | ||
2296 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | 2388 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); |
2389 | } else | ||
2390 | level = PT_PAGE_TABLE_LEVEL; | ||
2297 | 2391 | ||
2298 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2392 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2299 | smp_rmb(); | 2393 | smp_rmb(); |
2300 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2394 | |
2395 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | ||
2396 | return 0; | ||
2301 | 2397 | ||
2302 | /* mmio */ | 2398 | /* mmio */ |
2303 | if (is_error_pfn(pfn)) | 2399 | if (is_error_pfn(pfn)) |
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
2307 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2403 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2308 | goto out_unlock; | 2404 | goto out_unlock; |
2309 | kvm_mmu_free_some_pages(vcpu); | 2405 | kvm_mmu_free_some_pages(vcpu); |
2310 | r = __direct_map(vcpu, v, write, level, gfn, pfn); | 2406 | if (likely(!force_pt_level)) |
2407 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | ||
2408 | r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, | ||
2409 | prefault); | ||
2311 | spin_unlock(&vcpu->kvm->mmu_lock); | 2410 | spin_unlock(&vcpu->kvm->mmu_lock); |
2312 | 2411 | ||
2313 | 2412 | ||
@@ -2394,7 +2493,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) | |||
2394 | ASSERT(!VALID_PAGE(root)); | 2493 | ASSERT(!VALID_PAGE(root)); |
2395 | spin_lock(&vcpu->kvm->mmu_lock); | 2494 | spin_lock(&vcpu->kvm->mmu_lock); |
2396 | kvm_mmu_free_some_pages(vcpu); | 2495 | kvm_mmu_free_some_pages(vcpu); |
2397 | sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, | 2496 | sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), |
2497 | i << 30, | ||
2398 | PT32_ROOT_LEVEL, 1, ACC_ALL, | 2498 | PT32_ROOT_LEVEL, 1, ACC_ALL, |
2399 | NULL); | 2499 | NULL); |
2400 | root = __pa(sp->spt); | 2500 | root = __pa(sp->spt); |
@@ -2529,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2529 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2629 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2530 | sp = page_header(root); | 2630 | sp = page_header(root); |
2531 | mmu_sync_children(vcpu, sp); | 2631 | mmu_sync_children(vcpu, sp); |
2632 | trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); | ||
2532 | return; | 2633 | return; |
2533 | } | 2634 | } |
2534 | for (i = 0; i < 4; ++i) { | 2635 | for (i = 0; i < 4; ++i) { |
@@ -2551,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | |||
2551 | } | 2652 | } |
2552 | 2653 | ||
2553 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, | 2654 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, |
2554 | u32 access, u32 *error) | 2655 | u32 access, struct x86_exception *exception) |
2555 | { | 2656 | { |
2556 | if (error) | 2657 | if (exception) |
2557 | *error = 0; | 2658 | exception->error_code = 0; |
2558 | return vaddr; | 2659 | return vaddr; |
2559 | } | 2660 | } |
2560 | 2661 | ||
2561 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, | 2662 | static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, |
2562 | u32 access, u32 *error) | 2663 | u32 access, |
2664 | struct x86_exception *exception) | ||
2563 | { | 2665 | { |
2564 | if (error) | 2666 | if (exception) |
2565 | *error = 0; | 2667 | exception->error_code = 0; |
2566 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); | 2668 | return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); |
2567 | } | 2669 | } |
2568 | 2670 | ||
2569 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 2671 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, |
2570 | u32 error_code) | 2672 | u32 error_code, bool prefault) |
2571 | { | 2673 | { |
2572 | gfn_t gfn; | 2674 | gfn_t gfn; |
2573 | int r; | 2675 | int r; |
@@ -2583,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
2583 | gfn = gva >> PAGE_SHIFT; | 2685 | gfn = gva >> PAGE_SHIFT; |
2584 | 2686 | ||
2585 | return nonpaging_map(vcpu, gva & PAGE_MASK, | 2687 | return nonpaging_map(vcpu, gva & PAGE_MASK, |
2586 | error_code & PFERR_WRITE_MASK, gfn); | 2688 | error_code & PFERR_WRITE_MASK, gfn, prefault); |
2689 | } | ||
2690 | |||
2691 | static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) | ||
2692 | { | ||
2693 | struct kvm_arch_async_pf arch; | ||
2694 | |||
2695 | arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; | ||
2696 | arch.gfn = gfn; | ||
2697 | arch.direct_map = vcpu->arch.mmu.direct_map; | ||
2698 | arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); | ||
2699 | |||
2700 | return kvm_setup_async_pf(vcpu, gva, gfn, &arch); | ||
2701 | } | ||
2702 | |||
2703 | static bool can_do_async_pf(struct kvm_vcpu *vcpu) | ||
2704 | { | ||
2705 | if (unlikely(!irqchip_in_kernel(vcpu->kvm) || | ||
2706 | kvm_event_needs_reinjection(vcpu))) | ||
2707 | return false; | ||
2708 | |||
2709 | return kvm_x86_ops->interrupt_allowed(vcpu); | ||
2587 | } | 2710 | } |
2588 | 2711 | ||
2589 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | 2712 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
2590 | u32 error_code) | 2713 | gva_t gva, pfn_t *pfn, bool write, bool *writable) |
2714 | { | ||
2715 | bool async; | ||
2716 | |||
2717 | *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); | ||
2718 | |||
2719 | if (!async) | ||
2720 | return false; /* *pfn has correct page already */ | ||
2721 | |||
2722 | put_page(pfn_to_page(*pfn)); | ||
2723 | |||
2724 | if (!prefault && can_do_async_pf(vcpu)) { | ||
2725 | trace_kvm_try_async_get_page(gva, gfn); | ||
2726 | if (kvm_find_async_pf_gfn(vcpu, gfn)) { | ||
2727 | trace_kvm_async_pf_doublefault(gva, gfn); | ||
2728 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
2729 | return true; | ||
2730 | } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) | ||
2731 | return true; | ||
2732 | } | ||
2733 | |||
2734 | *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); | ||
2735 | |||
2736 | return false; | ||
2737 | } | ||
2738 | |||
2739 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | ||
2740 | bool prefault) | ||
2591 | { | 2741 | { |
2592 | pfn_t pfn; | 2742 | pfn_t pfn; |
2593 | int r; | 2743 | int r; |
2594 | int level; | 2744 | int level; |
2745 | int force_pt_level; | ||
2595 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2746 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2596 | unsigned long mmu_seq; | 2747 | unsigned long mmu_seq; |
2748 | int write = error_code & PFERR_WRITE_MASK; | ||
2749 | bool map_writable; | ||
2597 | 2750 | ||
2598 | ASSERT(vcpu); | 2751 | ASSERT(vcpu); |
2599 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 2752 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
@@ -2602,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2602 | if (r) | 2755 | if (r) |
2603 | return r; | 2756 | return r; |
2604 | 2757 | ||
2605 | level = mapping_level(vcpu, gfn); | 2758 | force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); |
2606 | 2759 | if (likely(!force_pt_level)) { | |
2607 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | 2760 | level = mapping_level(vcpu, gfn); |
2761 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | ||
2762 | } else | ||
2763 | level = PT_PAGE_TABLE_LEVEL; | ||
2608 | 2764 | ||
2609 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2765 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2610 | smp_rmb(); | 2766 | smp_rmb(); |
2611 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2767 | |
2768 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | ||
2769 | return 0; | ||
2770 | |||
2771 | /* mmio */ | ||
2612 | if (is_error_pfn(pfn)) | 2772 | if (is_error_pfn(pfn)) |
2613 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); | 2773 | return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); |
2614 | spin_lock(&vcpu->kvm->mmu_lock); | 2774 | spin_lock(&vcpu->kvm->mmu_lock); |
2615 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 2775 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
2616 | goto out_unlock; | 2776 | goto out_unlock; |
2617 | kvm_mmu_free_some_pages(vcpu); | 2777 | kvm_mmu_free_some_pages(vcpu); |
2618 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 2778 | if (likely(!force_pt_level)) |
2619 | level, gfn, pfn); | 2779 | transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); |
2780 | r = __direct_map(vcpu, gpa, write, map_writable, | ||
2781 | level, gfn, pfn, prefault); | ||
2620 | spin_unlock(&vcpu->kvm->mmu_lock); | 2782 | spin_unlock(&vcpu->kvm->mmu_lock); |
2621 | 2783 | ||
2622 | return r; | 2784 | return r; |
@@ -2658,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |||
2658 | 2820 | ||
2659 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 2821 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
2660 | { | 2822 | { |
2661 | pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); | 2823 | pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); |
2662 | mmu_free_roots(vcpu); | 2824 | mmu_free_roots(vcpu); |
2663 | } | 2825 | } |
2664 | 2826 | ||
2665 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) | 2827 | static unsigned long get_cr3(struct kvm_vcpu *vcpu) |
2666 | { | 2828 | { |
2667 | return vcpu->arch.cr3; | 2829 | return kvm_read_cr3(vcpu); |
2668 | } | 2830 | } |
2669 | 2831 | ||
2670 | static void inject_page_fault(struct kvm_vcpu *vcpu) | 2832 | static void inject_page_fault(struct kvm_vcpu *vcpu, |
2833 | struct x86_exception *fault) | ||
2671 | { | 2834 | { |
2672 | vcpu->arch.mmu.inject_page_fault(vcpu); | 2835 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); |
2673 | } | 2836 | } |
2674 | 2837 | ||
2675 | static void paging_free(struct kvm_vcpu *vcpu) | 2838 | static void paging_free(struct kvm_vcpu *vcpu) |
@@ -2815,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
2815 | { | 2978 | { |
2816 | struct kvm_mmu *context = vcpu->arch.walk_mmu; | 2979 | struct kvm_mmu *context = vcpu->arch.walk_mmu; |
2817 | 2980 | ||
2981 | context->base_role.word = 0; | ||
2818 | context->new_cr3 = nonpaging_new_cr3; | 2982 | context->new_cr3 = nonpaging_new_cr3; |
2819 | context->page_fault = tdp_page_fault; | 2983 | context->page_fault = tdp_page_fault; |
2820 | context->free = nonpaging_free; | 2984 | context->free = nonpaging_free; |
@@ -3007,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
3007 | return; | 3171 | return; |
3008 | } | 3172 | } |
3009 | 3173 | ||
3010 | if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) | ||
3011 | return; | ||
3012 | |||
3013 | ++vcpu->kvm->stat.mmu_pte_updated; | 3174 | ++vcpu->kvm->stat.mmu_pte_updated; |
3014 | if (!sp->role.cr4_pae) | 3175 | if (!sp->role.cr4_pae) |
3015 | paging32_update_pte(vcpu, sp, spte, new); | 3176 | paging32_update_pte(vcpu, sp, spte, new); |
@@ -3263,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | |||
3263 | } | 3424 | } |
3264 | } | 3425 | } |
3265 | 3426 | ||
3266 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | 3427 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, |
3428 | void *insn, int insn_len) | ||
3267 | { | 3429 | { |
3268 | int r; | 3430 | int r; |
3269 | enum emulation_result er; | 3431 | enum emulation_result er; |
3270 | 3432 | ||
3271 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | 3433 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); |
3272 | if (r < 0) | 3434 | if (r < 0) |
3273 | goto out; | 3435 | goto out; |
3274 | 3436 | ||
@@ -3281,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
3281 | if (r) | 3443 | if (r) |
3282 | goto out; | 3444 | goto out; |
3283 | 3445 | ||
3284 | er = emulate_instruction(vcpu, cr2, error_code, 0); | 3446 | er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); |
3285 | 3447 | ||
3286 | switch (er) { | 3448 | switch (er) { |
3287 | case EMULATE_DONE: | 3449 | case EMULATE_DONE: |
@@ -3376,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
3376 | if (!test_bit(slot, sp->slot_bitmap)) | 3538 | if (!test_bit(slot, sp->slot_bitmap)) |
3377 | continue; | 3539 | continue; |
3378 | 3540 | ||
3541 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
3542 | continue; | ||
3543 | |||
3379 | pt = sp->spt; | 3544 | pt = sp->spt; |
3380 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | 3545 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) |
3381 | /* avoid RMW */ | 3546 | /* avoid RMW */ |
3382 | if (is_writable_pte(pt[i])) | 3547 | if (is_writable_pte(pt[i])) |
3383 | pt[i] &= ~PT_WRITABLE_MASK; | 3548 | update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); |
3384 | } | 3549 | } |
3385 | kvm_flush_remote_tlbs(kvm); | 3550 | kvm_flush_remote_tlbs(kvm); |
3386 | } | 3551 | } |
@@ -3462,13 +3627,6 @@ static void mmu_destroy_caches(void) | |||
3462 | kmem_cache_destroy(mmu_page_header_cache); | 3627 | kmem_cache_destroy(mmu_page_header_cache); |
3463 | } | 3628 | } |
3464 | 3629 | ||
3465 | void kvm_mmu_module_exit(void) | ||
3466 | { | ||
3467 | mmu_destroy_caches(); | ||
3468 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | ||
3469 | unregister_shrinker(&mmu_shrinker); | ||
3470 | } | ||
3471 | |||
3472 | int kvm_mmu_module_init(void) | 3630 | int kvm_mmu_module_init(void) |
3473 | { | 3631 | { |
3474 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 3632 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", |
@@ -3565,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | |||
3565 | 3723 | ||
3566 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | 3724 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) |
3567 | { | 3725 | { |
3568 | (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); | 3726 | (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); |
3569 | return 1; | 3727 | return 1; |
3570 | } | 3728 | } |
3571 | 3729 | ||
@@ -3661,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | |||
3661 | } | 3819 | } |
3662 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | 3820 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); |
3663 | 3821 | ||
3664 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
3665 | #include "mmu_audit.c" | ||
3666 | #else | ||
3667 | static void mmu_audit_disable(void) { } | ||
3668 | #endif | ||
3669 | |||
3670 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | 3822 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) |
3671 | { | 3823 | { |
3672 | ASSERT(vcpu); | 3824 | ASSERT(vcpu); |
@@ -3674,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | |||
3674 | destroy_kvm_mmu(vcpu); | 3826 | destroy_kvm_mmu(vcpu); |
3675 | free_mmu_pages(vcpu); | 3827 | free_mmu_pages(vcpu); |
3676 | mmu_free_memory_caches(vcpu); | 3828 | mmu_free_memory_caches(vcpu); |
3829 | } | ||
3830 | |||
3831 | #ifdef CONFIG_KVM_MMU_AUDIT | ||
3832 | #include "mmu_audit.c" | ||
3833 | #else | ||
3834 | static void mmu_audit_disable(void) { } | ||
3835 | #endif | ||
3836 | |||
3837 | void kvm_mmu_module_exit(void) | ||
3838 | { | ||
3839 | mmu_destroy_caches(); | ||
3840 | percpu_counter_destroy(&kvm_total_used_mmu_pages); | ||
3841 | unregister_shrinker(&mmu_shrinker); | ||
3677 | mmu_audit_disable(); | 3842 | mmu_audit_disable(); |
3678 | } | 3843 | } |
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ba2bcdde6221..5f6223b8bcf7 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -19,11 +19,9 @@ | |||
19 | 19 | ||
20 | #include <linux/ratelimit.h> | 20 | #include <linux/ratelimit.h> |
21 | 21 | ||
22 | static int audit_point; | 22 | #define audit_printk(kvm, fmt, args...) \ |
23 | |||
24 | #define audit_printk(fmt, args...) \ | ||
25 | printk(KERN_ERR "audit: (%s) error: " \ | 23 | printk(KERN_ERR "audit: (%s) error: " \ |
26 | fmt, audit_point_name[audit_point], ##args) | 24 | fmt, audit_point_name[kvm->arch.audit_point], ##args) |
27 | 25 | ||
28 | typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); | 26 | typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); |
29 | 27 | ||
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
97 | 95 | ||
98 | if (sp->unsync) { | 96 | if (sp->unsync) { |
99 | if (level != PT_PAGE_TABLE_LEVEL) { | 97 | if (level != PT_PAGE_TABLE_LEVEL) { |
100 | audit_printk("unsync sp: %p level = %d\n", sp, level); | 98 | audit_printk(vcpu->kvm, "unsync sp: %p " |
99 | "level = %d\n", sp, level); | ||
101 | return; | 100 | return; |
102 | } | 101 | } |
103 | 102 | ||
104 | if (*sptep == shadow_notrap_nonpresent_pte) { | 103 | if (*sptep == shadow_notrap_nonpresent_pte) { |
105 | audit_printk("notrap spte in unsync sp: %p\n", sp); | 104 | audit_printk(vcpu->kvm, "notrap spte in unsync " |
105 | "sp: %p\n", sp); | ||
106 | return; | 106 | return; |
107 | } | 107 | } |
108 | } | 108 | } |
109 | 109 | ||
110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { | 110 | if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { |
111 | audit_printk("notrap spte in direct sp: %p\n", sp); | 111 | audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n", |
112 | sp); | ||
112 | return; | 113 | return; |
113 | } | 114 | } |
114 | 115 | ||
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
125 | 126 | ||
126 | hpa = pfn << PAGE_SHIFT; | 127 | hpa = pfn << PAGE_SHIFT; |
127 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) | 128 | if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) |
128 | audit_printk("levels %d pfn %llx hpa %llx ent %llxn", | 129 | audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " |
129 | vcpu->arch.mmu.root_level, pfn, hpa, *sptep); | 130 | "ent %llxn", vcpu->arch.mmu.root_level, pfn, |
131 | hpa, *sptep); | ||
130 | } | 132 | } |
131 | 133 | ||
132 | static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | 134 | static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) |
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | |||
142 | if (!gfn_to_memslot(kvm, gfn)) { | 144 | if (!gfn_to_memslot(kvm, gfn)) { |
143 | if (!printk_ratelimit()) | 145 | if (!printk_ratelimit()) |
144 | return; | 146 | return; |
145 | audit_printk("no memslot for gfn %llx\n", gfn); | 147 | audit_printk(kvm, "no memslot for gfn %llx\n", gfn); |
146 | audit_printk("index %ld of sp (gfn=%llx)\n", | 148 | audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", |
147 | (long int)(sptep - rev_sp->spt), rev_sp->gfn); | 149 | (long int)(sptep - rev_sp->spt), rev_sp->gfn); |
148 | dump_stack(); | 150 | dump_stack(); |
149 | return; | 151 | return; |
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) | |||
153 | if (!*rmapp) { | 155 | if (!*rmapp) { |
154 | if (!printk_ratelimit()) | 156 | if (!printk_ratelimit()) |
155 | return; | 157 | return; |
156 | audit_printk("no rmap for writable spte %llx\n", *sptep); | 158 | audit_printk(kvm, "no rmap for writable spte %llx\n", |
159 | *sptep); | ||
157 | dump_stack(); | 160 | dump_stack(); |
158 | } | 161 | } |
159 | } | 162 | } |
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
168 | { | 171 | { |
169 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | 172 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
170 | 173 | ||
171 | if (audit_point == AUDIT_POST_SYNC && sp->unsync) | 174 | if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) |
172 | audit_printk("meet unsync sp(%p) after sync root.\n", sp); | 175 | audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " |
176 | "root.\n", sp); | ||
173 | } | 177 | } |
174 | 178 | ||
175 | static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) | 179 | static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) |
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
202 | spte = rmap_next(kvm, rmapp, NULL); | 206 | spte = rmap_next(kvm, rmapp, NULL); |
203 | while (spte) { | 207 | while (spte) { |
204 | if (is_writable_pte(*spte)) | 208 | if (is_writable_pte(*spte)) |
205 | audit_printk("shadow page has writable mappings: gfn " | 209 | audit_printk(kvm, "shadow page has writable " |
206 | "%llx role %x\n", sp->gfn, sp->role.word); | 210 | "mappings: gfn %llx role %x\n", |
211 | sp->gfn, sp->role.word); | ||
207 | spte = rmap_next(kvm, rmapp, spte); | 212 | spte = rmap_next(kvm, rmapp, spte); |
208 | } | 213 | } |
209 | } | 214 | } |
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) | |||
238 | if (!__ratelimit(&ratelimit_state)) | 243 | if (!__ratelimit(&ratelimit_state)) |
239 | return; | 244 | return; |
240 | 245 | ||
241 | audit_point = point; | 246 | vcpu->kvm->arch.audit_point = point; |
242 | audit_all_active_sps(vcpu->kvm); | 247 | audit_all_active_sps(vcpu->kvm); |
243 | audit_vcpu_spte(vcpu); | 248 | audit_vcpu_spte(vcpu); |
244 | } | 249 | } |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cd7a833a3b52..6bccc24c4181 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -72,7 +72,7 @@ struct guest_walker { | |||
72 | unsigned pt_access; | 72 | unsigned pt_access; |
73 | unsigned pte_access; | 73 | unsigned pte_access; |
74 | gfn_t gfn; | 74 | gfn_t gfn; |
75 | u32 error_code; | 75 | struct x86_exception fault; |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) | 78 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) |
@@ -266,21 +266,23 @@ walk: | |||
266 | return 1; | 266 | return 1; |
267 | 267 | ||
268 | error: | 268 | error: |
269 | walker->error_code = 0; | 269 | walker->fault.vector = PF_VECTOR; |
270 | walker->fault.error_code_valid = true; | ||
271 | walker->fault.error_code = 0; | ||
270 | if (present) | 272 | if (present) |
271 | walker->error_code |= PFERR_PRESENT_MASK; | 273 | walker->fault.error_code |= PFERR_PRESENT_MASK; |
272 | 274 | ||
273 | walker->error_code |= write_fault | user_fault; | 275 | walker->fault.error_code |= write_fault | user_fault; |
274 | 276 | ||
275 | if (fetch_fault && mmu->nx) | 277 | if (fetch_fault && mmu->nx) |
276 | walker->error_code |= PFERR_FETCH_MASK; | 278 | walker->fault.error_code |= PFERR_FETCH_MASK; |
277 | if (rsvd_fault) | 279 | if (rsvd_fault) |
278 | walker->error_code |= PFERR_RSVD_MASK; | 280 | walker->fault.error_code |= PFERR_RSVD_MASK; |
279 | 281 | ||
280 | vcpu->arch.fault.address = addr; | 282 | walker->fault.address = addr; |
281 | vcpu->arch.fault.error_code = walker->error_code; | 283 | walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; |
282 | 284 | ||
283 | trace_kvm_mmu_walker_error(walker->error_code); | 285 | trace_kvm_mmu_walker_error(walker->fault.error_code); |
284 | return 0; | 286 | return 0; |
285 | } | 287 | } |
286 | 288 | ||
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, | |||
299 | addr, access); | 301 | addr, access); |
300 | } | 302 | } |
301 | 303 | ||
304 | static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, | ||
305 | struct kvm_mmu_page *sp, u64 *spte, | ||
306 | pt_element_t gpte) | ||
307 | { | ||
308 | u64 nonpresent = shadow_trap_nonpresent_pte; | ||
309 | |||
310 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) | ||
311 | goto no_present; | ||
312 | |||
313 | if (!is_present_gpte(gpte)) { | ||
314 | if (!sp->unsync) | ||
315 | nonpresent = shadow_notrap_nonpresent_pte; | ||
316 | goto no_present; | ||
317 | } | ||
318 | |||
319 | if (!(gpte & PT_ACCESSED_MASK)) | ||
320 | goto no_present; | ||
321 | |||
322 | return false; | ||
323 | |||
324 | no_present: | ||
325 | drop_spte(vcpu->kvm, spte, nonpresent); | ||
326 | return true; | ||
327 | } | ||
328 | |||
302 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 329 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, |
303 | u64 *spte, const void *pte) | 330 | u64 *spte, const void *pte) |
304 | { | 331 | { |
305 | pt_element_t gpte; | 332 | pt_element_t gpte; |
306 | unsigned pte_access; | 333 | unsigned pte_access; |
307 | pfn_t pfn; | 334 | pfn_t pfn; |
308 | u64 new_spte; | ||
309 | 335 | ||
310 | gpte = *(const pt_element_t *)pte; | 336 | gpte = *(const pt_element_t *)pte; |
311 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 337 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
312 | if (!is_present_gpte(gpte)) { | ||
313 | if (sp->unsync) | ||
314 | new_spte = shadow_trap_nonpresent_pte; | ||
315 | else | ||
316 | new_spte = shadow_notrap_nonpresent_pte; | ||
317 | __set_spte(spte, new_spte); | ||
318 | } | ||
319 | return; | 338 | return; |
320 | } | 339 | |
321 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 340 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
322 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 341 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
323 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | 342 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) |
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
329 | return; | 348 | return; |
330 | kvm_get_pfn(pfn); | 349 | kvm_get_pfn(pfn); |
331 | /* | 350 | /* |
332 | * we call mmu_set_spte() with reset_host_protection = true because that | 351 | * we call mmu_set_spte() with host_writable = true because that |
333 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). | 352 | * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). |
334 | */ | 353 | */ |
335 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, | 354 | mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, |
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
364 | u64 *sptep) | 383 | u64 *sptep) |
365 | { | 384 | { |
366 | struct kvm_mmu_page *sp; | 385 | struct kvm_mmu_page *sp; |
367 | struct kvm_mmu *mmu = &vcpu->arch.mmu; | ||
368 | pt_element_t *gptep = gw->prefetch_ptes; | 386 | pt_element_t *gptep = gw->prefetch_ptes; |
369 | u64 *spte; | 387 | u64 *spte; |
370 | int i; | 388 | int i; |
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
395 | 413 | ||
396 | gpte = gptep[i]; | 414 | gpte = gptep[i]; |
397 | 415 | ||
398 | if (!is_present_gpte(gpte) || | 416 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
399 | is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { | ||
400 | if (!sp->unsync) | ||
401 | __set_spte(spte, shadow_notrap_nonpresent_pte); | ||
402 | continue; | ||
403 | } | ||
404 | |||
405 | if (!(gpte & PT_ACCESSED_MASK)) | ||
406 | continue; | 417 | continue; |
407 | 418 | ||
408 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 419 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
427 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 438 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
428 | struct guest_walker *gw, | 439 | struct guest_walker *gw, |
429 | int user_fault, int write_fault, int hlevel, | 440 | int user_fault, int write_fault, int hlevel, |
430 | int *ptwrite, pfn_t pfn) | 441 | int *ptwrite, pfn_t pfn, bool map_writable, |
442 | bool prefault) | ||
431 | { | 443 | { |
432 | unsigned access = gw->pt_access; | 444 | unsigned access = gw->pt_access; |
433 | struct kvm_mmu_page *sp = NULL; | 445 | struct kvm_mmu_page *sp = NULL; |
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
501 | 513 | ||
502 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, | 514 | mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, |
503 | user_fault, write_fault, dirty, ptwrite, it.level, | 515 | user_fault, write_fault, dirty, ptwrite, it.level, |
504 | gw->gfn, pfn, false, true); | 516 | gw->gfn, pfn, prefault, map_writable); |
505 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | 517 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); |
506 | 518 | ||
507 | return it.sptep; | 519 | return it.sptep; |
@@ -527,8 +539,8 @@ out_gpte_changed: | |||
527 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | 539 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or |
528 | * a negative value on error. | 540 | * a negative value on error. |
529 | */ | 541 | */ |
530 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | 542 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, |
531 | u32 error_code) | 543 | bool prefault) |
532 | { | 544 | { |
533 | int write_fault = error_code & PFERR_WRITE_MASK; | 545 | int write_fault = error_code & PFERR_WRITE_MASK; |
534 | int user_fault = error_code & PFERR_USER_MASK; | 546 | int user_fault = error_code & PFERR_USER_MASK; |
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
538 | int r; | 550 | int r; |
539 | pfn_t pfn; | 551 | pfn_t pfn; |
540 | int level = PT_PAGE_TABLE_LEVEL; | 552 | int level = PT_PAGE_TABLE_LEVEL; |
553 | int force_pt_level; | ||
541 | unsigned long mmu_seq; | 554 | unsigned long mmu_seq; |
555 | bool map_writable; | ||
542 | 556 | ||
543 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 557 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
544 | 558 | ||
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
556 | */ | 570 | */ |
557 | if (!r) { | 571 | if (!r) { |
558 | pgprintk("%s: guest page fault\n", __func__); | 572 | pgprintk("%s: guest page fault\n", __func__); |
559 | inject_page_fault(vcpu); | 573 | if (!prefault) { |
560 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 574 | inject_page_fault(vcpu, &walker.fault); |
575 | /* reset fork detector */ | ||
576 | vcpu->arch.last_pt_write_count = 0; | ||
577 | } | ||
561 | return 0; | 578 | return 0; |
562 | } | 579 | } |
563 | 580 | ||
564 | if (walker.level >= PT_DIRECTORY_LEVEL) { | 581 | if (walker.level >= PT_DIRECTORY_LEVEL) |
582 | force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); | ||
583 | else | ||
584 | force_pt_level = 1; | ||
585 | if (!force_pt_level) { | ||
565 | level = min(walker.level, mapping_level(vcpu, walker.gfn)); | 586 | level = min(walker.level, mapping_level(vcpu, walker.gfn)); |
566 | walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); | 587 | walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); |
567 | } | 588 | } |
568 | 589 | ||
569 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 590 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
570 | smp_rmb(); | 591 | smp_rmb(); |
571 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 592 | |
593 | if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, | ||
594 | &map_writable)) | ||
595 | return 0; | ||
572 | 596 | ||
573 | /* mmio */ | 597 | /* mmio */ |
574 | if (is_error_pfn(pfn)) | 598 | if (is_error_pfn(pfn)) |
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
580 | 604 | ||
581 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | 605 | trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); |
582 | kvm_mmu_free_some_pages(vcpu); | 606 | kvm_mmu_free_some_pages(vcpu); |
607 | if (!force_pt_level) | ||
608 | transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | ||
583 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 609 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
584 | level, &write_pt, pfn); | 610 | level, &write_pt, pfn, map_writable, prefault); |
585 | (void)sptep; | 611 | (void)sptep; |
586 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 612 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, |
587 | sptep, *sptep, write_pt); | 613 | sptep, *sptep, write_pt); |
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
661 | } | 687 | } |
662 | 688 | ||
663 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | 689 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, |
664 | u32 *error) | 690 | struct x86_exception *exception) |
665 | { | 691 | { |
666 | struct guest_walker walker; | 692 | struct guest_walker walker; |
667 | gpa_t gpa = UNMAPPED_GVA; | 693 | gpa_t gpa = UNMAPPED_GVA; |
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, | |||
672 | if (r) { | 698 | if (r) { |
673 | gpa = gfn_to_gpa(walker.gfn); | 699 | gpa = gfn_to_gpa(walker.gfn); |
674 | gpa |= vaddr & ~PAGE_MASK; | 700 | gpa |= vaddr & ~PAGE_MASK; |
675 | } else if (error) | 701 | } else if (exception) |
676 | *error = walker.error_code; | 702 | *exception = walker.fault; |
677 | 703 | ||
678 | return gpa; | 704 | return gpa; |
679 | } | 705 | } |
680 | 706 | ||
681 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | 707 | static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, |
682 | u32 access, u32 *error) | 708 | u32 access, |
709 | struct x86_exception *exception) | ||
683 | { | 710 | { |
684 | struct guest_walker walker; | 711 | struct guest_walker walker; |
685 | gpa_t gpa = UNMAPPED_GVA; | 712 | gpa_t gpa = UNMAPPED_GVA; |
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, | |||
690 | if (r) { | 717 | if (r) { |
691 | gpa = gfn_to_gpa(walker.gfn); | 718 | gpa = gfn_to_gpa(walker.gfn); |
692 | gpa |= vaddr & ~PAGE_MASK; | 719 | gpa |= vaddr & ~PAGE_MASK; |
693 | } else if (error) | 720 | } else if (exception) |
694 | *error = walker.error_code; | 721 | *exception = walker.fault; |
695 | 722 | ||
696 | return gpa; | 723 | return gpa; |
697 | } | 724 | } |
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
730 | * Using the cached information from sp->gfns is safe because: | 757 | * Using the cached information from sp->gfns is safe because: |
731 | * - The spte has a reference to the struct page, so the pfn for a given gfn | 758 | * - The spte has a reference to the struct page, so the pfn for a given gfn |
732 | * can't change unless all sptes pointing to it are nuked first. | 759 | * can't change unless all sptes pointing to it are nuked first. |
760 | * | ||
761 | * Note: | ||
762 | * We should flush all TLBs if an spte is dropped, even though the guest ||
763 | * is responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and ||
764 | * kvm_mmu_notifier_invalidate_range_start see that the mapped page is no ||
765 | * longer used by the guest and skip the TLB flush, so the guest could still ||
766 | * access the freed pages. ||
767 | * We increase kvm->tlbs_dirty to delay the TLB flush in this case. ||
733 | */ | 768 | */ |
734 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | 769 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
735 | bool clear_unsync) | ||
736 | { | 770 | { |
737 | int i, offset, nr_present; | 771 | int i, offset, nr_present; |
738 | bool reset_host_protection; | 772 | bool host_writable; |
739 | gpa_t first_pte_gpa; | 773 | gpa_t first_pte_gpa; |
740 | 774 | ||
741 | offset = nr_present = 0; | 775 | offset = nr_present = 0; |
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
764 | return -EINVAL; | 798 | return -EINVAL; |
765 | 799 | ||
766 | gfn = gpte_to_gfn(gpte); | 800 | gfn = gpte_to_gfn(gpte); |
767 | if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) | ||
768 | || gfn != sp->gfns[i] || !is_present_gpte(gpte) | ||
769 | || !(gpte & PT_ACCESSED_MASK)) { | ||
770 | u64 nonpresent; | ||
771 | 801 | ||
772 | if (is_present_gpte(gpte) || !clear_unsync) | 802 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { |
773 | nonpresent = shadow_trap_nonpresent_pte; | 803 | vcpu->kvm->tlbs_dirty++; |
774 | else | 804 | continue; |
775 | nonpresent = shadow_notrap_nonpresent_pte; | 805 | } |
776 | drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); | 806 | |
807 | if (gfn != sp->gfns[i]) { | ||
808 | drop_spte(vcpu->kvm, &sp->spt[i], | ||
809 | shadow_trap_nonpresent_pte); | ||
810 | vcpu->kvm->tlbs_dirty++; | ||
777 | continue; | 811 | continue; |
778 | } | 812 | } |
779 | 813 | ||
780 | nr_present++; | 814 | nr_present++; |
781 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 815 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
782 | if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { | 816 | host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; |
783 | pte_access &= ~ACC_WRITE_MASK; | 817 | |
784 | reset_host_protection = 0; | ||
785 | } else { | ||
786 | reset_host_protection = 1; | ||
787 | } | ||
788 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 818 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
789 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, | 819 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, |
790 | spte_to_pfn(sp->spt[i]), true, false, | 820 | spte_to_pfn(sp->spt[i]), true, false, |
791 | reset_host_protection); | 821 | host_writable); |
792 | } | 822 | } |
793 | 823 | ||
794 | return !nr_present; | 824 | return !nr_present; |
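
The sync_page() rework above stops issuing a flush for every dropped spte and instead bumps kvm->tlbs_dirty, deferring the TLB flush until it is actually needed. A minimal standalone sketch of that deferred-flush pattern follows; the names are illustrative, not the kernel's API.

/*
 * Deferred-flush pattern: dropping a stale entry only bumps a dirty
 * counter, and the (expensive) flush is issued lazily before the
 * entries are relied on again.  Illustrative only.
 */
#include <stdio.h>

struct fake_vm {
	unsigned long tlbs_dirty;	/* drops pending, not yet flushed */
};

static void drop_stale_entry(struct fake_vm *vm, int idx)
{
	printf("dropping stale entry %d\n", idx);
	vm->tlbs_dirty++;		/* remember that a flush is owed */
}

static void flush_if_dirty(struct fake_vm *vm)
{
	if (!vm->tlbs_dirty)
		return;			/* nothing pending, skip the cost */
	printf("flushing TLBs (%lu pending drops)\n", vm->tlbs_dirty);
	vm->tlbs_dirty = 0;
}

int main(void)
{
	struct fake_vm vm = { 0 };

	drop_stale_entry(&vm, 3);
	drop_stale_entry(&vm, 7);
	flush_if_dirty(&vm);		/* one flush covers both drops */
	flush_if_dirty(&vm);		/* no-op: counter already cleared */
	return 0;
}
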
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 82e144a4e514..25bd1bc5aad2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -31,6 +31,7 @@ | |||
31 | 31 | ||
32 | #include <asm/tlbflush.h> | 32 | #include <asm/tlbflush.h> |
33 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
34 | #include <asm/kvm_para.h> | ||
34 | 35 | ||
35 | #include <asm/virtext.h> | 36 | #include <asm/virtext.h> |
36 | #include "trace.h" | 37 | #include "trace.h" |
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL"); | |||
50 | #define SVM_FEATURE_LBRV (1 << 1) | 51 | #define SVM_FEATURE_LBRV (1 << 1) |
51 | #define SVM_FEATURE_SVML (1 << 2) | 52 | #define SVM_FEATURE_SVML (1 << 2) |
52 | #define SVM_FEATURE_NRIP (1 << 3) | 53 | #define SVM_FEATURE_NRIP (1 << 3) |
54 | #define SVM_FEATURE_TSC_RATE (1 << 4) | ||
55 | #define SVM_FEATURE_VMCB_CLEAN (1 << 5) | ||
56 | #define SVM_FEATURE_FLUSH_ASID (1 << 6) | ||
57 | #define SVM_FEATURE_DECODE_ASSIST (1 << 7) | ||
53 | #define SVM_FEATURE_PAUSE_FILTER (1 << 10) | 58 | #define SVM_FEATURE_PAUSE_FILTER (1 << 10) |
54 | 59 | ||
55 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ | 60 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ |
@@ -97,10 +102,8 @@ struct nested_state { | |||
97 | unsigned long vmexit_rax; | 102 | unsigned long vmexit_rax; |
98 | 103 | ||
99 | /* cache for intercepts of the guest */ | 104 | /* cache for intercepts of the guest */ |
100 | u16 intercept_cr_read; | 105 | u32 intercept_cr; |
101 | u16 intercept_cr_write; | 106 | u32 intercept_dr; |
102 | u16 intercept_dr_read; | ||
103 | u16 intercept_dr_write; | ||
104 | u32 intercept_exceptions; | 107 | u32 intercept_exceptions; |
105 | u64 intercept; | 108 | u64 intercept; |
106 | 109 | ||
@@ -123,7 +126,12 @@ struct vcpu_svm { | |||
123 | u64 next_rip; | 126 | u64 next_rip; |
124 | 127 | ||
125 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | 128 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; |
126 | u64 host_gs_base; | 129 | struct { |
130 | u16 fs; | ||
131 | u16 gs; | ||
132 | u16 ldt; | ||
133 | u64 gs_base; | ||
134 | } host; | ||
127 | 135 | ||
128 | u32 *msrpm; | 136 | u32 *msrpm; |
129 | 137 | ||
@@ -133,6 +141,7 @@ struct vcpu_svm { | |||
133 | 141 | ||
134 | unsigned int3_injected; | 142 | unsigned int3_injected; |
135 | unsigned long int3_rip; | 143 | unsigned long int3_rip; |
144 | u32 apf_reason; | ||
136 | }; | 145 | }; |
137 | 146 | ||
138 | #define MSR_INVALID 0xffffffffU | 147 | #define MSR_INVALID 0xffffffffU |
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm); | |||
180 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 189 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
181 | bool has_error_code, u32 error_code); | 190 | bool has_error_code, u32 error_code); |
182 | 191 | ||
192 | enum { | ||
193 | VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, | ||
194 | pause filter count */ | ||
195 | VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ | ||
196 | VMCB_ASID, /* ASID */ | ||
197 | VMCB_INTR, /* int_ctl, int_vector */ | ||
198 | VMCB_NPT, /* npt_en, nCR3, gPAT */ | ||
199 | VMCB_CR, /* CR0, CR3, CR4, EFER */ | ||
200 | VMCB_DR, /* DR6, DR7 */ | ||
201 | VMCB_DT, /* GDT, IDT */ | ||
202 | VMCB_SEG, /* CS, DS, SS, ES, CPL */ | ||
203 | VMCB_CR2, /* CR2 only */ | ||
204 | VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ | ||
205 | VMCB_DIRTY_MAX, | ||
206 | }; | ||
207 | |||
208 | /* TPR and CR2 are always written before VMRUN */ | ||
209 | #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) | ||
210 | |||
211 | static inline void mark_all_dirty(struct vmcb *vmcb) | ||
212 | { | ||
213 | vmcb->control.clean = 0; | ||
214 | } | ||
215 | |||
216 | static inline void mark_all_clean(struct vmcb *vmcb) | ||
217 | { | ||
218 | vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) | ||
219 | & ~VMCB_ALWAYS_DIRTY_MASK; | ||
220 | } | ||
221 | |||
222 | static inline void mark_dirty(struct vmcb *vmcb, int bit) | ||
223 | { | ||
224 | vmcb->control.clean &= ~(1 << bit); | ||
225 | } | ||
226 | |||
183 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | 227 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) |
184 | { | 228 | { |
185 | return container_of(vcpu, struct vcpu_svm, vcpu); | 229 | return container_of(vcpu, struct vcpu_svm, vcpu); |
186 | } | 230 | } |
187 | 231 | ||
188 | static inline bool is_nested(struct vcpu_svm *svm) | 232 | static void recalc_intercepts(struct vcpu_svm *svm) |
233 | { | ||
234 | struct vmcb_control_area *c, *h; | ||
235 | struct nested_state *g; | ||
236 | |||
237 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
238 | |||
239 | if (!is_guest_mode(&svm->vcpu)) | ||
240 | return; | ||
241 | |||
242 | c = &svm->vmcb->control; | ||
243 | h = &svm->nested.hsave->control; | ||
244 | g = &svm->nested; | ||
245 | |||
246 | c->intercept_cr = h->intercept_cr | g->intercept_cr; | ||
247 | c->intercept_dr = h->intercept_dr | g->intercept_dr; | ||
248 | c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; | ||
249 | c->intercept = h->intercept | g->intercept; | ||
250 | } | ||
251 | |||
252 | static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) | ||
253 | { | ||
254 | if (is_guest_mode(&svm->vcpu)) | ||
255 | return svm->nested.hsave; | ||
256 | else | ||
257 | return svm->vmcb; | ||
258 | } | ||
259 | |||
260 | static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) | ||
261 | { | ||
262 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
263 | |||
264 | vmcb->control.intercept_cr |= (1U << bit); | ||
265 | |||
266 | recalc_intercepts(svm); | ||
267 | } | ||
268 | |||
269 | static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) | ||
270 | { | ||
271 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
272 | |||
273 | vmcb->control.intercept_cr &= ~(1U << bit); | ||
274 | |||
275 | recalc_intercepts(svm); | ||
276 | } | ||
277 | |||
278 | static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) | ||
279 | { | ||
280 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
281 | |||
282 | return vmcb->control.intercept_cr & (1U << bit); | ||
283 | } | ||
284 | |||
285 | static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) | ||
286 | { | ||
287 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
288 | |||
289 | vmcb->control.intercept_dr |= (1U << bit); | ||
290 | |||
291 | recalc_intercepts(svm); | ||
292 | } | ||
293 | |||
294 | static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) | ||
295 | { | ||
296 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
297 | |||
298 | vmcb->control.intercept_dr &= ~(1U << bit); | ||
299 | |||
300 | recalc_intercepts(svm); | ||
301 | } | ||
302 | |||
303 | static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) | ||
304 | { | ||
305 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
306 | |||
307 | vmcb->control.intercept_exceptions |= (1U << bit); | ||
308 | |||
309 | recalc_intercepts(svm); | ||
310 | } | ||
311 | |||
312 | static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) | ||
189 | { | 313 | { |
190 | return svm->nested.vmcb; | 314 | struct vmcb *vmcb = get_host_vmcb(svm); |
315 | |||
316 | vmcb->control.intercept_exceptions &= ~(1U << bit); | ||
317 | |||
318 | recalc_intercepts(svm); | ||
319 | } | ||
320 | |||
321 | static inline void set_intercept(struct vcpu_svm *svm, int bit) | ||
322 | { | ||
323 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
324 | |||
325 | vmcb->control.intercept |= (1ULL << bit); | ||
326 | |||
327 | recalc_intercepts(svm); | ||
328 | } | ||
329 | |||
330 | static inline void clr_intercept(struct vcpu_svm *svm, int bit) | ||
331 | { | ||
332 | struct vmcb *vmcb = get_host_vmcb(svm); | ||
333 | |||
334 | vmcb->control.intercept &= ~(1ULL << bit); | ||
335 | |||
336 | recalc_intercepts(svm); | ||
191 | } | 337 | } |
192 | 338 | ||
193 | static inline void enable_gif(struct vcpu_svm *svm) | 339 | static inline void enable_gif(struct vcpu_svm *svm) |
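
The VMCB_* enum and the mark_all_dirty()/mark_all_clean()/mark_dirty() helpers added above implement the clean-bits optimization advertised by SVM_FEATURE_VMCB_CLEAN: a set bit tells the CPU that the corresponding VMCB area is unchanged and may be reused from its cache, so only the areas that were touched since the last VMRUN need to be re-read. A standalone sketch of the bookkeeping, with hypothetical area names rather than the kernel structures:

/*
 * Clean-bits sketch: writers clear the bit for whatever they touch;
 * the interrupt fields and CR2 are always treated as dirty.
 */
#include <stdio.h>
#include <stdint.h>

enum { AREA_INTERCEPTS, AREA_INTR, AREA_CR, AREA_CR2, AREA_MAX };

#define ALWAYS_DIRTY ((1U << AREA_INTR) | (1U << AREA_CR2))

struct fake_vmcb { uint32_t clean; };

static void mark_all_dirty(struct fake_vmcb *v)  { v->clean = 0; }
static void mark_all_clean(struct fake_vmcb *v)
{
	v->clean = ((1U << AREA_MAX) - 1) & ~ALWAYS_DIRTY;
}
static void mark_dirty(struct fake_vmcb *v, int bit) { v->clean &= ~(1U << bit); }

int main(void)
{
	struct fake_vmcb v;

	mark_all_dirty(&v);		/* e.g. first run on a new CPU */
	printf("after mark_all_dirty: %#x\n", (unsigned)v.clean);	/* 0x0 */

	mark_all_clean(&v);		/* done with a VMRUN */
	printf("after mark_all_clean: %#x\n", (unsigned)v.clean);	/* INTERCEPTS and CR set */

	mark_dirty(&v, AREA_CR);	/* guest CR0/CR3/CR4 changed */
	printf("after touching CRs:  %#x\n", (unsigned)v.clean);
	return 0;
}
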
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr) | |||
264 | 410 | ||
265 | #define MAX_INST_SIZE 15 | 411 | #define MAX_INST_SIZE 15 |
266 | 412 | ||
267 | static inline u32 svm_has(u32 feat) | ||
268 | { | ||
269 | return svm_features & feat; | ||
270 | } | ||
271 | |||
272 | static inline void clgi(void) | 413 | static inline void clgi(void) |
273 | { | 414 | { |
274 | asm volatile (__ex(SVM_CLGI)); | 415 | asm volatile (__ex(SVM_CLGI)); |
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid) | |||
284 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); | 425 | asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); |
285 | } | 426 | } |
286 | 427 | ||
287 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | ||
288 | { | ||
289 | to_svm(vcpu)->asid_generation--; | ||
290 | } | ||
291 | |||
292 | static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | ||
293 | { | ||
294 | force_new_asid(vcpu); | ||
295 | } | ||
296 | |||
297 | static int get_npt_level(void) | 428 | static int get_npt_level(void) |
298 | { | 429 | { |
299 | #ifdef CONFIG_X86_64 | 430 | #ifdef CONFIG_X86_64 |
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
310 | efer &= ~EFER_LME; | 441 | efer &= ~EFER_LME; |
311 | 442 | ||
312 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; | 443 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; |
444 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
313 | } | 445 | } |
314 | 446 | ||
315 | static int is_external_interrupt(u32 info) | 447 | static int is_external_interrupt(u32 info) |
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
347 | svm->next_rip = svm->vmcb->control.next_rip; | 479 | svm->next_rip = svm->vmcb->control.next_rip; |
348 | 480 | ||
349 | if (!svm->next_rip) { | 481 | if (!svm->next_rip) { |
350 | if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != | 482 | if (emulate_instruction(vcpu, EMULTYPE_SKIP) != |
351 | EMULATE_DONE) | 483 | EMULATE_DONE) |
352 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 484 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
353 | return; | 485 | return; |
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
374 | nested_svm_check_exception(svm, nr, has_error_code, error_code)) | 506 | nested_svm_check_exception(svm, nr, has_error_code, error_code)) |
375 | return; | 507 | return; |
376 | 508 | ||
377 | if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { | 509 | if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { |
378 | unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); | 510 | unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); |
379 | 511 | ||
380 | /* | 512 | /* |
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void) | |||
670 | 802 | ||
671 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | 803 | svm_features = cpuid_edx(SVM_CPUID_FUNC); |
672 | 804 | ||
673 | if (!svm_has(SVM_FEATURE_NPT)) | 805 | if (!boot_cpu_has(X86_FEATURE_NPT)) |
674 | npt_enabled = false; | 806 | npt_enabled = false; |
675 | 807 | ||
676 | if (npt_enabled && !npt) { | 808 | if (npt_enabled && !npt) { |
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) | |||
725 | struct vcpu_svm *svm = to_svm(vcpu); | 857 | struct vcpu_svm *svm = to_svm(vcpu); |
726 | u64 g_tsc_offset = 0; | 858 | u64 g_tsc_offset = 0; |
727 | 859 | ||
728 | if (is_nested(svm)) { | 860 | if (is_guest_mode(vcpu)) { |
729 | g_tsc_offset = svm->vmcb->control.tsc_offset - | 861 | g_tsc_offset = svm->vmcb->control.tsc_offset - |
730 | svm->nested.hsave->control.tsc_offset; | 862 | svm->nested.hsave->control.tsc_offset; |
731 | svm->nested.hsave->control.tsc_offset = offset; | 863 | svm->nested.hsave->control.tsc_offset = offset; |
732 | } | 864 | } |
733 | 865 | ||
734 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; | 866 | svm->vmcb->control.tsc_offset = offset + g_tsc_offset; |
867 | |||
868 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
735 | } | 869 | } |
736 | 870 | ||
737 | static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | 871 | static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) |
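
svm_write_tsc_offset() above keeps the nested guest's view stable: hsave holds the L1 offset, the active VMCB holds L1's offset plus L2's, and a new host offset is applied so the delta between the two is preserved. The same arithmetic with plain numbers, purely illustrative:

/* Nested TSC-offset bookkeeping with plain numbers, not kernel state. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t hsave_offset = 1000;		/* offset L1 currently gets   */
	int64_t vmcb_offset  = 1000 + 250;	/* L1's offset + L2's own 250 */

	/* the delta the nested guest was given on VMRUN */
	int64_t g_tsc_offset = vmcb_offset - hsave_offset;

	int64_t new_offset = 4000;		/* host installs a new L1 offset */
	hsave_offset = new_offset;
	vmcb_offset  = new_offset + g_tsc_offset;

	printf("hsave=%lld vmcb=%lld (delta still %lld)\n",
	       (long long)hsave_offset, (long long)vmcb_offset,
	       (long long)(vmcb_offset - hsave_offset));
	return 0;
}
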
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) | |||
739 | struct vcpu_svm *svm = to_svm(vcpu); | 873 | struct vcpu_svm *svm = to_svm(vcpu); |
740 | 874 | ||
741 | svm->vmcb->control.tsc_offset += adjustment; | 875 | svm->vmcb->control.tsc_offset += adjustment; |
742 | if (is_nested(svm)) | 876 | if (is_guest_mode(vcpu)) |
743 | svm->nested.hsave->control.tsc_offset += adjustment; | 877 | svm->nested.hsave->control.tsc_offset += adjustment; |
878 | mark_dirty(svm->vmcb, VMCB_INTERCEPTS); | ||
744 | } | 879 | } |
745 | 880 | ||
746 | static void init_vmcb(struct vcpu_svm *svm) | 881 | static void init_vmcb(struct vcpu_svm *svm) |
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
749 | struct vmcb_save_area *save = &svm->vmcb->save; | 884 | struct vmcb_save_area *save = &svm->vmcb->save; |
750 | 885 | ||
751 | svm->vcpu.fpu_active = 1; | 886 | svm->vcpu.fpu_active = 1; |
887 | svm->vcpu.arch.hflags = 0; | ||
752 | 888 | ||
753 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | 889 | set_cr_intercept(svm, INTERCEPT_CR0_READ); |
754 | INTERCEPT_CR3_MASK | | 890 | set_cr_intercept(svm, INTERCEPT_CR3_READ); |
755 | INTERCEPT_CR4_MASK; | 891 | set_cr_intercept(svm, INTERCEPT_CR4_READ); |
756 | 892 | set_cr_intercept(svm, INTERCEPT_CR0_WRITE); | |
757 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | 893 | set_cr_intercept(svm, INTERCEPT_CR3_WRITE); |
758 | INTERCEPT_CR3_MASK | | 894 | set_cr_intercept(svm, INTERCEPT_CR4_WRITE); |
759 | INTERCEPT_CR4_MASK | | 895 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
760 | INTERCEPT_CR8_MASK; | 896 | |
761 | 897 | set_dr_intercept(svm, INTERCEPT_DR0_READ); | |
762 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | 898 | set_dr_intercept(svm, INTERCEPT_DR1_READ); |
763 | INTERCEPT_DR1_MASK | | 899 | set_dr_intercept(svm, INTERCEPT_DR2_READ); |
764 | INTERCEPT_DR2_MASK | | 900 | set_dr_intercept(svm, INTERCEPT_DR3_READ); |
765 | INTERCEPT_DR3_MASK | | 901 | set_dr_intercept(svm, INTERCEPT_DR4_READ); |
766 | INTERCEPT_DR4_MASK | | 902 | set_dr_intercept(svm, INTERCEPT_DR5_READ); |
767 | INTERCEPT_DR5_MASK | | 903 | set_dr_intercept(svm, INTERCEPT_DR6_READ); |
768 | INTERCEPT_DR6_MASK | | 904 | set_dr_intercept(svm, INTERCEPT_DR7_READ); |
769 | INTERCEPT_DR7_MASK; | 905 | |
770 | 906 | set_dr_intercept(svm, INTERCEPT_DR0_WRITE); | |
771 | control->intercept_dr_write = INTERCEPT_DR0_MASK | | 907 | set_dr_intercept(svm, INTERCEPT_DR1_WRITE); |
772 | INTERCEPT_DR1_MASK | | 908 | set_dr_intercept(svm, INTERCEPT_DR2_WRITE); |
773 | INTERCEPT_DR2_MASK | | 909 | set_dr_intercept(svm, INTERCEPT_DR3_WRITE); |
774 | INTERCEPT_DR3_MASK | | 910 | set_dr_intercept(svm, INTERCEPT_DR4_WRITE); |
775 | INTERCEPT_DR4_MASK | | 911 | set_dr_intercept(svm, INTERCEPT_DR5_WRITE); |
776 | INTERCEPT_DR5_MASK | | 912 | set_dr_intercept(svm, INTERCEPT_DR6_WRITE); |
777 | INTERCEPT_DR6_MASK | | 913 | set_dr_intercept(svm, INTERCEPT_DR7_WRITE); |
778 | INTERCEPT_DR7_MASK; | 914 | |
779 | 915 | set_exception_intercept(svm, PF_VECTOR); | |
780 | control->intercept_exceptions = (1 << PF_VECTOR) | | 916 | set_exception_intercept(svm, UD_VECTOR); |
781 | (1 << UD_VECTOR) | | 917 | set_exception_intercept(svm, MC_VECTOR); |
782 | (1 << MC_VECTOR); | 918 | |
783 | 919 | set_intercept(svm, INTERCEPT_INTR); | |
784 | 920 | set_intercept(svm, INTERCEPT_NMI); | |
785 | control->intercept = (1ULL << INTERCEPT_INTR) | | 921 | set_intercept(svm, INTERCEPT_SMI); |
786 | (1ULL << INTERCEPT_NMI) | | 922 | set_intercept(svm, INTERCEPT_SELECTIVE_CR0); |
787 | (1ULL << INTERCEPT_SMI) | | 923 | set_intercept(svm, INTERCEPT_CPUID); |
788 | (1ULL << INTERCEPT_SELECTIVE_CR0) | | 924 | set_intercept(svm, INTERCEPT_INVD); |
789 | (1ULL << INTERCEPT_CPUID) | | 925 | set_intercept(svm, INTERCEPT_HLT); |
790 | (1ULL << INTERCEPT_INVD) | | 926 | set_intercept(svm, INTERCEPT_INVLPG); |
791 | (1ULL << INTERCEPT_HLT) | | 927 | set_intercept(svm, INTERCEPT_INVLPGA); |
792 | (1ULL << INTERCEPT_INVLPG) | | 928 | set_intercept(svm, INTERCEPT_IOIO_PROT); |
793 | (1ULL << INTERCEPT_INVLPGA) | | 929 | set_intercept(svm, INTERCEPT_MSR_PROT); |
794 | (1ULL << INTERCEPT_IOIO_PROT) | | 930 | set_intercept(svm, INTERCEPT_TASK_SWITCH); |
795 | (1ULL << INTERCEPT_MSR_PROT) | | 931 | set_intercept(svm, INTERCEPT_SHUTDOWN); |
796 | (1ULL << INTERCEPT_TASK_SWITCH) | | 932 | set_intercept(svm, INTERCEPT_VMRUN); |
797 | (1ULL << INTERCEPT_SHUTDOWN) | | 933 | set_intercept(svm, INTERCEPT_VMMCALL); |
798 | (1ULL << INTERCEPT_VMRUN) | | 934 | set_intercept(svm, INTERCEPT_VMLOAD); |
799 | (1ULL << INTERCEPT_VMMCALL) | | 935 | set_intercept(svm, INTERCEPT_VMSAVE); |
800 | (1ULL << INTERCEPT_VMLOAD) | | 936 | set_intercept(svm, INTERCEPT_STGI); |
801 | (1ULL << INTERCEPT_VMSAVE) | | 937 | set_intercept(svm, INTERCEPT_CLGI); |
802 | (1ULL << INTERCEPT_STGI) | | 938 | set_intercept(svm, INTERCEPT_SKINIT); |
803 | (1ULL << INTERCEPT_CLGI) | | 939 | set_intercept(svm, INTERCEPT_WBINVD); |
804 | (1ULL << INTERCEPT_SKINIT) | | 940 | set_intercept(svm, INTERCEPT_MONITOR); |
805 | (1ULL << INTERCEPT_WBINVD) | | 941 | set_intercept(svm, INTERCEPT_MWAIT); |
806 | (1ULL << INTERCEPT_MONITOR) | | 942 | set_intercept(svm, INTERCEPT_XSETBV); |
807 | (1ULL << INTERCEPT_MWAIT); | ||
808 | 943 | ||
809 | control->iopm_base_pa = iopm_base; | 944 | control->iopm_base_pa = iopm_base; |
810 | control->msrpm_base_pa = __pa(svm->msrpm); | 945 | control->msrpm_base_pa = __pa(svm->msrpm); |
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
855 | if (npt_enabled) { | 990 | if (npt_enabled) { |
856 | /* Setup VMCB for Nested Paging */ | 991 | /* Setup VMCB for Nested Paging */ |
857 | control->nested_ctl = 1; | 992 | control->nested_ctl = 1; |
858 | control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | | 993 | clr_intercept(svm, INTERCEPT_TASK_SWITCH); |
859 | (1ULL << INTERCEPT_INVLPG)); | 994 | clr_intercept(svm, INTERCEPT_INVLPG); |
860 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | 995 | clr_exception_intercept(svm, PF_VECTOR); |
861 | control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; | 996 | clr_cr_intercept(svm, INTERCEPT_CR3_READ); |
862 | control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; | 997 | clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); |
863 | save->g_pat = 0x0007040600070406ULL; | 998 | save->g_pat = 0x0007040600070406ULL; |
864 | save->cr3 = 0; | 999 | save->cr3 = 0; |
865 | save->cr4 = 0; | 1000 | save->cr4 = 0; |
866 | } | 1001 | } |
867 | force_new_asid(&svm->vcpu); | 1002 | svm->asid_generation = 0; |
868 | 1003 | ||
869 | svm->nested.vmcb = 0; | 1004 | svm->nested.vmcb = 0; |
870 | svm->vcpu.arch.hflags = 0; | 1005 | svm->vcpu.arch.hflags = 0; |
871 | 1006 | ||
872 | if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { | 1007 | if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { |
873 | control->pause_filter_count = 3000; | 1008 | control->pause_filter_count = 3000; |
874 | control->intercept |= (1ULL << INTERCEPT_PAUSE); | 1009 | set_intercept(svm, INTERCEPT_PAUSE); |
875 | } | 1010 | } |
876 | 1011 | ||
1012 | mark_all_dirty(svm->vmcb); | ||
1013 | |||
877 | enable_gif(svm); | 1014 | enable_gif(svm); |
878 | } | 1015 | } |
879 | 1016 | ||
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
990 | 1127 | ||
991 | if (unlikely(cpu != vcpu->cpu)) { | 1128 | if (unlikely(cpu != vcpu->cpu)) { |
992 | svm->asid_generation = 0; | 1129 | svm->asid_generation = 0; |
1130 | mark_all_dirty(svm->vmcb); | ||
993 | } | 1131 | } |
994 | 1132 | ||
1133 | #ifdef CONFIG_X86_64 | ||
1134 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); | ||
1135 | #endif | ||
1136 | savesegment(fs, svm->host.fs); | ||
1137 | savesegment(gs, svm->host.gs); | ||
1138 | svm->host.ldt = kvm_read_ldt(); | ||
1139 | |||
995 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1140 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
996 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1141 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
997 | } | 1142 | } |
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
1002 | int i; | 1147 | int i; |
1003 | 1148 | ||
1004 | ++vcpu->stat.host_state_reload; | 1149 | ++vcpu->stat.host_state_reload; |
1150 | kvm_load_ldt(svm->host.ldt); | ||
1151 | #ifdef CONFIG_X86_64 | ||
1152 | loadsegment(fs, svm->host.fs); | ||
1153 | load_gs_index(svm->host.gs); | ||
1154 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
1155 | #else | ||
1156 | loadsegment(gs, svm->host.gs); | ||
1157 | #endif | ||
1005 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 1158 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
1006 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 1159 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
1007 | } | 1160 | } |
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
1021 | switch (reg) { | 1174 | switch (reg) { |
1022 | case VCPU_EXREG_PDPTR: | 1175 | case VCPU_EXREG_PDPTR: |
1023 | BUG_ON(!npt_enabled); | 1176 | BUG_ON(!npt_enabled); |
1024 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); | 1177 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); |
1025 | break; | 1178 | break; |
1026 | default: | 1179 | default: |
1027 | BUG(); | 1180 | BUG(); |
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
1030 | 1183 | ||
1031 | static void svm_set_vintr(struct vcpu_svm *svm) | 1184 | static void svm_set_vintr(struct vcpu_svm *svm) |
1032 | { | 1185 | { |
1033 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; | 1186 | set_intercept(svm, INTERCEPT_VINTR); |
1034 | } | 1187 | } |
1035 | 1188 | ||
1036 | static void svm_clear_vintr(struct vcpu_svm *svm) | 1189 | static void svm_clear_vintr(struct vcpu_svm *svm) |
1037 | { | 1190 | { |
1038 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); | 1191 | clr_intercept(svm, INTERCEPT_VINTR); |
1039 | } | 1192 | } |
1040 | 1193 | ||
1041 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) | 1194 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) |
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | |||
1150 | 1303 | ||
1151 | svm->vmcb->save.idtr.limit = dt->size; | 1304 | svm->vmcb->save.idtr.limit = dt->size; |
1152 | svm->vmcb->save.idtr.base = dt->address ; | 1305 | svm->vmcb->save.idtr.base = dt->address ; |
1306 | mark_dirty(svm->vmcb, VMCB_DT); | ||
1153 | } | 1307 | } |
1154 | 1308 | ||
1155 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | 1309 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | |||
1166 | 1320 | ||
1167 | svm->vmcb->save.gdtr.limit = dt->size; | 1321 | svm->vmcb->save.gdtr.limit = dt->size; |
1168 | svm->vmcb->save.gdtr.base = dt->address ; | 1322 | svm->vmcb->save.gdtr.base = dt->address ; |
1323 | mark_dirty(svm->vmcb, VMCB_DT); | ||
1169 | } | 1324 | } |
1170 | 1325 | ||
1171 | static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | 1326 | static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) |
1172 | { | 1327 | { |
1173 | } | 1328 | } |
1174 | 1329 | ||
1330 | static void svm_decache_cr3(struct kvm_vcpu *vcpu) | ||
1331 | { | ||
1332 | } | ||
1333 | |||
1175 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1334 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1176 | { | 1335 | { |
1177 | } | 1336 | } |
1178 | 1337 | ||
1179 | static void update_cr0_intercept(struct vcpu_svm *svm) | 1338 | static void update_cr0_intercept(struct vcpu_svm *svm) |
1180 | { | 1339 | { |
1181 | struct vmcb *vmcb = svm->vmcb; | ||
1182 | ulong gcr0 = svm->vcpu.arch.cr0; | 1340 | ulong gcr0 = svm->vcpu.arch.cr0; |
1183 | u64 *hcr0 = &svm->vmcb->save.cr0; | 1341 | u64 *hcr0 = &svm->vmcb->save.cr0; |
1184 | 1342 | ||
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm) | |||
1188 | *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | 1346 | *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) |
1189 | | (gcr0 & SVM_CR0_SELECTIVE_MASK); | 1347 | | (gcr0 & SVM_CR0_SELECTIVE_MASK); |
1190 | 1348 | ||
1349 | mark_dirty(svm->vmcb, VMCB_CR); | ||
1191 | 1350 | ||
1192 | if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { | 1351 | if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { |
1193 | vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; | 1352 | clr_cr_intercept(svm, INTERCEPT_CR0_READ); |
1194 | vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; | 1353 | clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); |
1195 | if (is_nested(svm)) { | ||
1196 | struct vmcb *hsave = svm->nested.hsave; | ||
1197 | |||
1198 | hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; | ||
1199 | hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; | ||
1200 | vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read; | ||
1201 | vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; | ||
1202 | } | ||
1203 | } else { | 1354 | } else { |
1204 | svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; | 1355 | set_cr_intercept(svm, INTERCEPT_CR0_READ); |
1205 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; | 1356 | set_cr_intercept(svm, INTERCEPT_CR0_WRITE); |
1206 | if (is_nested(svm)) { | ||
1207 | struct vmcb *hsave = svm->nested.hsave; | ||
1208 | |||
1209 | hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; | ||
1210 | hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; | ||
1211 | } | ||
1212 | } | 1357 | } |
1213 | } | 1358 | } |
1214 | 1359 | ||
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1216 | { | 1361 | { |
1217 | struct vcpu_svm *svm = to_svm(vcpu); | 1362 | struct vcpu_svm *svm = to_svm(vcpu); |
1218 | 1363 | ||
1219 | if (is_nested(svm)) { | 1364 | if (is_guest_mode(vcpu)) { |
1220 | /* | 1365 | /* |
1221 | * We are here because we run in nested mode, the host kvm | 1366 | * We are here because we run in nested mode, the host kvm |
1222 | * intercepts cr0 writes but the l1 hypervisor does not. | 1367 | * intercepts cr0 writes but the l1 hypervisor does not. |
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1268 | */ | 1413 | */ |
1269 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | 1414 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); |
1270 | svm->vmcb->save.cr0 = cr0; | 1415 | svm->vmcb->save.cr0 = cr0; |
1416 | mark_dirty(svm->vmcb, VMCB_CR); | ||
1271 | update_cr0_intercept(svm); | 1417 | update_cr0_intercept(svm); |
1272 | } | 1418 | } |
1273 | 1419 | ||
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1277 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; | 1423 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; |
1278 | 1424 | ||
1279 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) | 1425 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) |
1280 | force_new_asid(vcpu); | 1426 | svm_flush_tlb(vcpu); |
1281 | 1427 | ||
1282 | vcpu->arch.cr4 = cr4; | 1428 | vcpu->arch.cr4 = cr4; |
1283 | if (!npt_enabled) | 1429 | if (!npt_enabled) |
1284 | cr4 |= X86_CR4_PAE; | 1430 | cr4 |= X86_CR4_PAE; |
1285 | cr4 |= host_cr4_mce; | 1431 | cr4 |= host_cr4_mce; |
1286 | to_svm(vcpu)->vmcb->save.cr4 = cr4; | 1432 | to_svm(vcpu)->vmcb->save.cr4 = cr4; |
1433 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
1287 | } | 1434 | } |
1288 | 1435 | ||
1289 | static void svm_set_segment(struct kvm_vcpu *vcpu, | 1436 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, | |||
1312 | = (svm->vmcb->save.cs.attrib | 1459 | = (svm->vmcb->save.cs.attrib |
1313 | >> SVM_SELECTOR_DPL_SHIFT) & 3; | 1460 | >> SVM_SELECTOR_DPL_SHIFT) & 3; |
1314 | 1461 | ||
1462 | mark_dirty(svm->vmcb, VMCB_SEG); | ||
1315 | } | 1463 | } |
1316 | 1464 | ||
1317 | static void update_db_intercept(struct kvm_vcpu *vcpu) | 1465 | static void update_db_intercept(struct kvm_vcpu *vcpu) |
1318 | { | 1466 | { |
1319 | struct vcpu_svm *svm = to_svm(vcpu); | 1467 | struct vcpu_svm *svm = to_svm(vcpu); |
1320 | 1468 | ||
1321 | svm->vmcb->control.intercept_exceptions &= | 1469 | clr_exception_intercept(svm, DB_VECTOR); |
1322 | ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); | 1470 | clr_exception_intercept(svm, BP_VECTOR); |
1323 | 1471 | ||
1324 | if (svm->nmi_singlestep) | 1472 | if (svm->nmi_singlestep) |
1325 | svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); | 1473 | set_exception_intercept(svm, DB_VECTOR); |
1326 | 1474 | ||
1327 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 1475 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
1328 | if (vcpu->guest_debug & | 1476 | if (vcpu->guest_debug & |
1329 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | 1477 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) |
1330 | svm->vmcb->control.intercept_exceptions |= | 1478 | set_exception_intercept(svm, DB_VECTOR); |
1331 | 1 << DB_VECTOR; | ||
1332 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 1479 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
1333 | svm->vmcb->control.intercept_exceptions |= | 1480 | set_exception_intercept(svm, BP_VECTOR); |
1334 | 1 << BP_VECTOR; | ||
1335 | } else | 1481 | } else |
1336 | vcpu->guest_debug = 0; | 1482 | vcpu->guest_debug = 0; |
1337 | } | 1483 | } |
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) | |||
1345 | else | 1491 | else |
1346 | svm->vmcb->save.dr7 = vcpu->arch.dr7; | 1492 | svm->vmcb->save.dr7 = vcpu->arch.dr7; |
1347 | 1493 | ||
1348 | update_db_intercept(vcpu); | 1494 | mark_dirty(svm->vmcb, VMCB_DR); |
1349 | } | ||
1350 | |||
1351 | static void load_host_msrs(struct kvm_vcpu *vcpu) | ||
1352 | { | ||
1353 | #ifdef CONFIG_X86_64 | ||
1354 | wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
1355 | #endif | ||
1356 | } | ||
1357 | 1495 | ||
1358 | static void save_host_msrs(struct kvm_vcpu *vcpu) | 1496 | update_db_intercept(vcpu); |
1359 | { | ||
1360 | #ifdef CONFIG_X86_64 | ||
1361 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
1362 | #endif | ||
1363 | } | 1497 | } |
1364 | 1498 | ||
1365 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | 1499 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) |
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) | |||
1372 | 1506 | ||
1373 | svm->asid_generation = sd->asid_generation; | 1507 | svm->asid_generation = sd->asid_generation; |
1374 | svm->vmcb->control.asid = sd->next_asid++; | 1508 | svm->vmcb->control.asid = sd->next_asid++; |
1509 | |||
1510 | mark_dirty(svm->vmcb, VMCB_ASID); | ||
1375 | } | 1511 | } |
1376 | 1512 | ||
1377 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) | 1513 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) |
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) | |||
1379 | struct vcpu_svm *svm = to_svm(vcpu); | 1515 | struct vcpu_svm *svm = to_svm(vcpu); |
1380 | 1516 | ||
1381 | svm->vmcb->save.dr7 = value; | 1517 | svm->vmcb->save.dr7 = value; |
1518 | mark_dirty(svm->vmcb, VMCB_DR); | ||
1382 | } | 1519 | } |
1383 | 1520 | ||
1384 | static int pf_interception(struct vcpu_svm *svm) | 1521 | static int pf_interception(struct vcpu_svm *svm) |
1385 | { | 1522 | { |
1386 | u64 fault_address; | 1523 | u64 fault_address = svm->vmcb->control.exit_info_2; |
1387 | u32 error_code; | 1524 | u32 error_code; |
1525 | int r = 1; | ||
1388 | 1526 | ||
1389 | fault_address = svm->vmcb->control.exit_info_2; | 1527 | switch (svm->apf_reason) { |
1390 | error_code = svm->vmcb->control.exit_info_1; | 1528 | default: |
1391 | 1529 | error_code = svm->vmcb->control.exit_info_1; | |
1392 | trace_kvm_page_fault(fault_address, error_code); | 1530 | |
1393 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) | 1531 | trace_kvm_page_fault(fault_address, error_code); |
1394 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1532 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) |
1395 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1533 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1534 | r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, | ||
1535 | svm->vmcb->control.insn_bytes, | ||
1536 | svm->vmcb->control.insn_len); | ||
1537 | break; | ||
1538 | case KVM_PV_REASON_PAGE_NOT_PRESENT: | ||
1539 | svm->apf_reason = 0; | ||
1540 | local_irq_disable(); | ||
1541 | kvm_async_pf_task_wait(fault_address); | ||
1542 | local_irq_enable(); | ||
1543 | break; | ||
1544 | case KVM_PV_REASON_PAGE_READY: | ||
1545 | svm->apf_reason = 0; | ||
1546 | local_irq_disable(); | ||
1547 | kvm_async_pf_task_wake(fault_address); | ||
1548 | local_irq_enable(); | ||
1549 | break; | ||
1550 | } | ||
1551 | return r; | ||
1396 | } | 1552 | } |
1397 | 1553 | ||
1398 | static int db_interception(struct vcpu_svm *svm) | 1554 | static int db_interception(struct vcpu_svm *svm) |
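
pf_interception() above now distinguishes an ordinary guest #PF (default case, handed to the MMU along with any decode-assist instruction bytes) from the two paravirtual async-page-fault notifications, which only suspend or wake the waiting task. A trivial standalone sketch of that reason dispatch; the names are made up, only the shape of the switch mirrors the code:

#include <stdio.h>

enum pf_reason { PF_REAL_FAULT = 0, PF_NOT_PRESENT = 1, PF_READY = 2 };

static void handle_pf(enum pf_reason reason, unsigned long addr)
{
	switch (reason) {
	default:		/* ordinary guest page fault */
		printf("real #PF at %#lx -> MMU\n", addr);
		break;
	case PF_NOT_PRESENT:	/* host says the page is being paged in */
		printf("async PF pending at %#lx -> put task to sleep\n", addr);
		break;
	case PF_READY:		/* host finished paging the page in */
		printf("async PF done at %#lx -> wake task\n", addr);
		break;
	}
}

int main(void)
{
	handle_pf(PF_REAL_FAULT, 0x1000);
	handle_pf(PF_NOT_PRESENT, 0x2000);
	handle_pf(PF_READY, 0x2000);
	return 0;
}
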
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm) | |||
1440 | { | 1596 | { |
1441 | int er; | 1597 | int er; |
1442 | 1598 | ||
1443 | er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); | 1599 | er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); |
1444 | if (er != EMULATE_DONE) | 1600 | if (er != EMULATE_DONE) |
1445 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 1601 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
1446 | return 1; | 1602 | return 1; |
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm) | |||
1449 | static void svm_fpu_activate(struct kvm_vcpu *vcpu) | 1605 | static void svm_fpu_activate(struct kvm_vcpu *vcpu) |
1450 | { | 1606 | { |
1451 | struct vcpu_svm *svm = to_svm(vcpu); | 1607 | struct vcpu_svm *svm = to_svm(vcpu); |
1452 | u32 excp; | ||
1453 | |||
1454 | if (is_nested(svm)) { | ||
1455 | u32 h_excp, n_excp; | ||
1456 | |||
1457 | h_excp = svm->nested.hsave->control.intercept_exceptions; | ||
1458 | n_excp = svm->nested.intercept_exceptions; | ||
1459 | h_excp &= ~(1 << NM_VECTOR); | ||
1460 | excp = h_excp | n_excp; | ||
1461 | } else { | ||
1462 | excp = svm->vmcb->control.intercept_exceptions; | ||
1463 | excp &= ~(1 << NM_VECTOR); | ||
1464 | } | ||
1465 | 1608 | ||
1466 | svm->vmcb->control.intercept_exceptions = excp; | 1609 | clr_exception_intercept(svm, NM_VECTOR); |
1467 | 1610 | ||
1468 | svm->vcpu.fpu_active = 1; | 1611 | svm->vcpu.fpu_active = 1; |
1469 | update_cr0_intercept(svm); | 1612 | update_cr0_intercept(svm); |
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm) | |||
1570 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | 1713 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
1571 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | 1714 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; |
1572 | if (string || in) | 1715 | if (string || in) |
1573 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 1716 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
1574 | 1717 | ||
1575 | port = io_info >> 16; | 1718 | port = io_info >> 16; |
1576 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | 1719 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; |
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, | |||
1624 | struct vcpu_svm *svm = to_svm(vcpu); | 1767 | struct vcpu_svm *svm = to_svm(vcpu); |
1625 | 1768 | ||
1626 | svm->vmcb->control.nested_cr3 = root; | 1769 | svm->vmcb->control.nested_cr3 = root; |
1627 | force_new_asid(vcpu); | 1770 | mark_dirty(svm->vmcb, VMCB_NPT); |
1771 | svm_flush_tlb(vcpu); | ||
1628 | } | 1772 | } |
1629 | 1773 | ||
1630 | static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) | 1774 | static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, |
1775 | struct x86_exception *fault) | ||
1631 | { | 1776 | { |
1632 | struct vcpu_svm *svm = to_svm(vcpu); | 1777 | struct vcpu_svm *svm = to_svm(vcpu); |
1633 | 1778 | ||
1634 | svm->vmcb->control.exit_code = SVM_EXIT_NPF; | 1779 | svm->vmcb->control.exit_code = SVM_EXIT_NPF; |
1635 | svm->vmcb->control.exit_code_hi = 0; | 1780 | svm->vmcb->control.exit_code_hi = 0; |
1636 | svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; | 1781 | svm->vmcb->control.exit_info_1 = fault->error_code; |
1637 | svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; | 1782 | svm->vmcb->control.exit_info_2 = fault->address; |
1638 | 1783 | ||
1639 | nested_svm_vmexit(svm); | 1784 | nested_svm_vmexit(svm); |
1640 | } | 1785 | } |
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | |||
1680 | { | 1825 | { |
1681 | int vmexit; | 1826 | int vmexit; |
1682 | 1827 | ||
1683 | if (!is_nested(svm)) | 1828 | if (!is_guest_mode(&svm->vcpu)) |
1684 | return 0; | 1829 | return 0; |
1685 | 1830 | ||
1686 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; | 1831 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; |
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | |||
1698 | /* This function returns true if it is safe to enable the irq window */ | 1843 | /* This function returns true if it is safe to enable the irq window */ |
1699 | static inline bool nested_svm_intr(struct vcpu_svm *svm) | 1844 | static inline bool nested_svm_intr(struct vcpu_svm *svm) |
1700 | { | 1845 | { |
1701 | if (!is_nested(svm)) | 1846 | if (!is_guest_mode(&svm->vcpu)) |
1702 | return true; | 1847 | return true; |
1703 | 1848 | ||
1704 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1849 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) | |||
1737 | /* This function returns true if it is safe to enable the nmi window */ | 1882 | /* This function returns true if it is safe to enable the nmi window */ |
1738 | static inline bool nested_svm_nmi(struct vcpu_svm *svm) | 1883 | static inline bool nested_svm_nmi(struct vcpu_svm *svm) |
1739 | { | 1884 | { |
1740 | if (!is_nested(svm)) | 1885 | if (!is_guest_mode(&svm->vcpu)) |
1741 | return true; | 1886 | return true; |
1742 | 1887 | ||
1743 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) | 1888 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) |
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) | |||
1836 | return NESTED_EXIT_HOST; | 1981 | return NESTED_EXIT_HOST; |
1837 | break; | 1982 | break; |
1838 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: | 1983 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: |
1839 | /* When we're shadowing, trap PFs */ | 1984 | /* When we're shadowing, trap PFs, but not async PF */ |
1840 | if (!npt_enabled) | 1985 | if (!npt_enabled && svm->apf_reason == 0) |
1841 | return NESTED_EXIT_HOST; | 1986 | return NESTED_EXIT_HOST; |
1842 | break; | 1987 | break; |
1843 | case SVM_EXIT_EXCP_BASE + NM_VECTOR: | 1988 | case SVM_EXIT_EXCP_BASE + NM_VECTOR: |
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm) | |||
1865 | case SVM_EXIT_IOIO: | 2010 | case SVM_EXIT_IOIO: |
1866 | vmexit = nested_svm_intercept_ioio(svm); | 2011 | vmexit = nested_svm_intercept_ioio(svm); |
1867 | break; | 2012 | break; |
1868 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { | 2013 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { |
1869 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); | 2014 | u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); |
1870 | if (svm->nested.intercept_cr_read & cr_bits) | 2015 | if (svm->nested.intercept_cr & bit) |
1871 | vmexit = NESTED_EXIT_DONE; | 2016 | vmexit = NESTED_EXIT_DONE; |
1872 | break; | 2017 | break; |
1873 | } | 2018 | } |
1874 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { | 2019 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { |
1875 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); | 2020 | u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); |
1876 | if (svm->nested.intercept_cr_write & cr_bits) | 2021 | if (svm->nested.intercept_dr & bit) |
1877 | vmexit = NESTED_EXIT_DONE; | ||
1878 | break; | ||
1879 | } | ||
1880 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { | ||
1881 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); | ||
1882 | if (svm->nested.intercept_dr_read & dr_bits) | ||
1883 | vmexit = NESTED_EXIT_DONE; | ||
1884 | break; | ||
1885 | } | ||
1886 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { | ||
1887 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); | ||
1888 | if (svm->nested.intercept_dr_write & dr_bits) | ||
1889 | vmexit = NESTED_EXIT_DONE; | 2022 | vmexit = NESTED_EXIT_DONE; |
1890 | break; | 2023 | break; |
1891 | } | 2024 | } |
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm) | |||
1893 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); | 2026 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); |
1894 | if (svm->nested.intercept_exceptions & excp_bits) | 2027 | if (svm->nested.intercept_exceptions & excp_bits) |
1895 | vmexit = NESTED_EXIT_DONE; | 2028 | vmexit = NESTED_EXIT_DONE; |
2029 | /* an async page fault always causes a vmexit */ ||
2030 | else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && | ||
2031 | svm->apf_reason != 0) | ||
2032 | vmexit = NESTED_EXIT_DONE; | ||
1896 | break; | 2033 | break; |
1897 | } | 2034 | } |
1898 | case SVM_EXIT_ERR: { | 2035 | case SVM_EXIT_ERR: { |
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr | |||
1926 | struct vmcb_control_area *dst = &dst_vmcb->control; | 2063 | struct vmcb_control_area *dst = &dst_vmcb->control; |
1927 | struct vmcb_control_area *from = &from_vmcb->control; | 2064 | struct vmcb_control_area *from = &from_vmcb->control; |
1928 | 2065 | ||
1929 | dst->intercept_cr_read = from->intercept_cr_read; | 2066 | dst->intercept_cr = from->intercept_cr; |
1930 | dst->intercept_cr_write = from->intercept_cr_write; | 2067 | dst->intercept_dr = from->intercept_dr; |
1931 | dst->intercept_dr_read = from->intercept_dr_read; | ||
1932 | dst->intercept_dr_write = from->intercept_dr_write; | ||
1933 | dst->intercept_exceptions = from->intercept_exceptions; | 2068 | dst->intercept_exceptions = from->intercept_exceptions; |
1934 | dst->intercept = from->intercept; | 2069 | dst->intercept = from->intercept; |
1935 | dst->iopm_base_pa = from->iopm_base_pa; | 2070 | dst->iopm_base_pa = from->iopm_base_pa; |
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1970 | if (!nested_vmcb) | 2105 | if (!nested_vmcb) |
1971 | return 1; | 2106 | return 1; |
1972 | 2107 | ||
1973 | /* Exit nested SVM mode */ | 2108 | /* Exit Guest-Mode */ |
2109 | leave_guest_mode(&svm->vcpu); | ||
1974 | svm->nested.vmcb = 0; | 2110 | svm->nested.vmcb = 0; |
1975 | 2111 | ||
1976 | /* Give the current vmcb to the guest */ | 2112 | /* Give the current vmcb to the guest */ |
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1984 | nested_vmcb->save.idtr = vmcb->save.idtr; | 2120 | nested_vmcb->save.idtr = vmcb->save.idtr; |
1985 | nested_vmcb->save.efer = svm->vcpu.arch.efer; | 2121 | nested_vmcb->save.efer = svm->vcpu.arch.efer; |
1986 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); | 2122 | nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); |
1987 | nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; | 2123 | nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); |
1988 | nested_vmcb->save.cr2 = vmcb->save.cr2; | 2124 | nested_vmcb->save.cr2 = vmcb->save.cr2; |
1989 | nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; | 2125 | nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; |
1990 | nested_vmcb->save.rflags = vmcb->save.rflags; | 2126 | nested_vmcb->save.rflags = vmcb->save.rflags; |
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
2061 | svm->vmcb->save.cpl = 0; | 2197 | svm->vmcb->save.cpl = 0; |
2062 | svm->vmcb->control.exit_int_info = 0; | 2198 | svm->vmcb->control.exit_int_info = 0; |
2063 | 2199 | ||
2200 | mark_all_dirty(svm->vmcb); | ||
2201 | |||
2064 | nested_svm_unmap(page); | 2202 | nested_svm_unmap(page); |
2065 | 2203 | ||
2066 | nested_svm_uninit_mmu_context(&svm->vcpu); | 2204 | nested_svm_uninit_mmu_context(&svm->vcpu); |
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2148 | nested_vmcb->control.event_inj, | 2286 | nested_vmcb->control.event_inj, |
2149 | nested_vmcb->control.nested_ctl); | 2287 | nested_vmcb->control.nested_ctl); |
2150 | 2288 | ||
2151 | trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, | 2289 | trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, |
2152 | nested_vmcb->control.intercept_cr_write, | 2290 | nested_vmcb->control.intercept_cr >> 16, |
2153 | nested_vmcb->control.intercept_exceptions, | 2291 | nested_vmcb->control.intercept_exceptions, |
2154 | nested_vmcb->control.intercept); | 2292 | nested_vmcb->control.intercept); |
2155 | 2293 | ||
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2177 | if (npt_enabled) | 2315 | if (npt_enabled) |
2178 | hsave->save.cr3 = vmcb->save.cr3; | 2316 | hsave->save.cr3 = vmcb->save.cr3; |
2179 | else | 2317 | else |
2180 | hsave->save.cr3 = svm->vcpu.arch.cr3; | 2318 | hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); |
2181 | 2319 | ||
2182 | copy_vmcb_control_area(hsave, vmcb); | 2320 | copy_vmcb_control_area(hsave, vmcb); |
2183 | 2321 | ||
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2229 | svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; | 2367 | svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; |
2230 | 2368 | ||
2231 | /* cache intercepts */ | 2369 | /* cache intercepts */ |
2232 | svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; | 2370 | svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; |
2233 | svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; | 2371 | svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; |
2234 | svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; | ||
2235 | svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; | ||
2236 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; | 2372 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; |
2237 | svm->nested.intercept = nested_vmcb->control.intercept; | 2373 | svm->nested.intercept = nested_vmcb->control.intercept; |
2238 | 2374 | ||
2239 | force_new_asid(&svm->vcpu); | 2375 | svm_flush_tlb(&svm->vcpu); |
2240 | svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; | 2376 | svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; |
2241 | if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) | 2377 | if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) |
2242 | svm->vcpu.arch.hflags |= HF_VINTR_MASK; | 2378 | svm->vcpu.arch.hflags |= HF_VINTR_MASK; |
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2245 | 2381 | ||
2246 | if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { | 2382 | if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { |
2247 | /* We only want the cr8 intercept bits of the guest */ | 2383 | /* We only want the cr8 intercept bits of the guest */ |
2248 | svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; | 2384 | clr_cr_intercept(svm, INTERCEPT_CR8_READ); |
2249 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2385 | clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2250 | } | 2386 | } |
2251 | 2387 | ||
2252 | /* We don't want to see VMMCALLs from a nested guest */ | 2388 | /* We don't want to see VMMCALLs from a nested guest */ |
2253 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); | 2389 | clr_intercept(svm, INTERCEPT_VMMCALL); |
2254 | |||
2255 | /* | ||
2256 | * We don't want a nested guest to be more powerful than the guest, so | ||
2257 | * all intercepts are ORed | ||
2258 | */ | ||
2259 | svm->vmcb->control.intercept_cr_read |= | ||
2260 | nested_vmcb->control.intercept_cr_read; | ||
2261 | svm->vmcb->control.intercept_cr_write |= | ||
2262 | nested_vmcb->control.intercept_cr_write; | ||
2263 | svm->vmcb->control.intercept_dr_read |= | ||
2264 | nested_vmcb->control.intercept_dr_read; | ||
2265 | svm->vmcb->control.intercept_dr_write |= | ||
2266 | nested_vmcb->control.intercept_dr_write; | ||
2267 | svm->vmcb->control.intercept_exceptions |= | ||
2268 | nested_vmcb->control.intercept_exceptions; | ||
2269 | |||
2270 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; | ||
2271 | 2390 | ||
2272 | svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; | 2391 | svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; |
2273 | svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; | 2392 | svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; |
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) | |||
2278 | 2397 | ||
2279 | nested_svm_unmap(page); | 2398 | nested_svm_unmap(page); |
2280 | 2399 | ||
2281 | /* nested_vmcb is our indicator if nested SVM is activated */ | 2400 | /* Enter Guest-Mode */ |
2401 | enter_guest_mode(&svm->vcpu); | ||
2402 | |||
2403 | /* | ||
2404 | * Merge guest and host intercepts - must be called with vcpu in | ||
2405 | * guest-mode to take effect here ||
2406 | */ | ||
2407 | recalc_intercepts(svm); | ||
2408 | |||
2282 | svm->nested.vmcb = vmcb_gpa; | 2409 | svm->nested.vmcb = vmcb_gpa; |
2283 | 2410 | ||
2284 | enable_gif(svm); | 2411 | enable_gif(svm); |
2285 | 2412 | ||
2413 | mark_all_dirty(svm->vmcb); | ||
2414 | |||
2286 | return true; | 2415 | return true; |
2287 | } | 2416 | } |
2288 | 2417 | ||
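
With guest-mode entered, recalc_intercepts() replaces the open-coded ORing removed above: the masks programmed into the active VMCB are always the host's intercepts ORed with the nested guest's, so a nested guest can never clear an intercept the host relies on. A small sketch of that merge with illustrative types, not the kernel's:

#include <stdio.h>
#include <stdint.h>

struct masks { uint32_t cr, dr, exceptions; };

static void merge_intercepts(struct masks *active,
			     const struct masks *host,
			     const struct masks *nested)
{
	active->cr         = host->cr         | nested->cr;
	active->dr         = host->dr         | nested->dr;
	active->exceptions = host->exceptions | nested->exceptions;
}

int main(void)
{
	struct masks host   = { .cr = 0x00ff, .dr = 0x0,  .exceptions = 1u << 14 };
	struct masks nested = { .cr = 0x0100, .dr = 0xff, .exceptions = 0 };
	struct masks active;

	merge_intercepts(&active, &host, &nested);
	/* every bit the host set is still set, plus the guest's own */
	printf("cr=%#x dr=%#x exc=%#x\n", (unsigned)active.cr,
	       (unsigned)active.dr, (unsigned)active.exceptions);
	return 0;
}
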
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm) | |||
2400 | svm_clear_vintr(svm); | 2529 | svm_clear_vintr(svm); |
2401 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 2530 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
2402 | 2531 | ||
2532 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2533 | |||
2403 | return 1; | 2534 | return 1; |
2404 | } | 2535 | } |
2405 | 2536 | ||
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm) | |||
2426 | return 1; | 2557 | return 1; |
2427 | } | 2558 | } |
2428 | 2559 | ||
2560 | static int xsetbv_interception(struct vcpu_svm *svm) | ||
2561 | { | ||
2562 | u64 new_bv = kvm_read_edx_eax(&svm->vcpu); | ||
2563 | u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); | ||
2564 | |||
2565 | if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { | ||
2566 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
2567 | skip_emulated_instruction(&svm->vcpu); | ||
2568 | } | ||
2569 | |||
2570 | return 1; | ||
2571 | } | ||
2572 | |||
2429 | static int invalid_op_interception(struct vcpu_svm *svm) | 2573 | static int invalid_op_interception(struct vcpu_svm *svm) |
2430 | { | 2574 | { |
2431 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | 2575 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
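
xsetbv_interception() above builds the 64-bit XCR value from the guest's EDX:EAX pair, with ECX selecting the register, then skips the 3-byte instruction on success. A tiny standalone sketch of the EDX:EAX composition, illustrative only:

#include <stdio.h>
#include <stdint.h>

static uint64_t edx_eax_to_u64(uint32_t edx, uint32_t eax)
{
	return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
	uint32_t eax = 0x00000007;	/* x87 | SSE | AVX state enabled */
	uint32_t edx = 0x00000000;
	uint32_t ecx = 0;		/* index 0 selects XCR0 */

	printf("XCR%u <- %#llx\n", (unsigned)ecx,
	       (unsigned long long)edx_eax_to_u64(edx, eax));
	return 0;
}
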
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm) | |||
2507 | static int iret_interception(struct vcpu_svm *svm) | 2651 | static int iret_interception(struct vcpu_svm *svm) |
2508 | { | 2652 | { |
2509 | ++svm->vcpu.stat.nmi_window_exits; | 2653 | ++svm->vcpu.stat.nmi_window_exits; |
2510 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); | 2654 | clr_intercept(svm, INTERCEPT_IRET); |
2511 | svm->vcpu.arch.hflags |= HF_IRET_MASK; | 2655 | svm->vcpu.arch.hflags |= HF_IRET_MASK; |
2512 | return 1; | 2656 | return 1; |
2513 | } | 2657 | } |
2514 | 2658 | ||
2515 | static int invlpg_interception(struct vcpu_svm *svm) | 2659 | static int invlpg_interception(struct vcpu_svm *svm) |
2516 | { | 2660 | { |
2517 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2661 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) |
2662 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; | ||
2663 | |||
2664 | kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); | ||
2665 | skip_emulated_instruction(&svm->vcpu); | ||
2666 | return 1; | ||
2518 | } | 2667 | } |
2519 | 2668 | ||
2520 | static int emulate_on_interception(struct vcpu_svm *svm) | 2669 | static int emulate_on_interception(struct vcpu_svm *svm) |
2521 | { | 2670 | { |
2522 | return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; | 2671 | return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; |
2672 | } | ||
2673 | |||
2674 | #define CR_VALID (1ULL << 63) | ||
2675 | |||
2676 | static int cr_interception(struct vcpu_svm *svm) | ||
2677 | { | ||
2678 | int reg, cr; | ||
2679 | unsigned long val; | ||
2680 | int err; | ||
2681 | |||
2682 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) | ||
2683 | return emulate_on_interception(svm); | ||
2684 | |||
2685 | if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) | ||
2686 | return emulate_on_interception(svm); | ||
2687 | |||
2688 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | ||
2689 | cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; | ||
2690 | |||
2691 | err = 0; | ||
2692 | if (cr >= 16) { /* mov to cr */ | ||
2693 | cr -= 16; | ||
2694 | val = kvm_register_read(&svm->vcpu, reg); | ||
2695 | switch (cr) { | ||
2696 | case 0: | ||
2697 | err = kvm_set_cr0(&svm->vcpu, val); | ||
2698 | break; | ||
2699 | case 3: | ||
2700 | err = kvm_set_cr3(&svm->vcpu, val); | ||
2701 | break; | ||
2702 | case 4: | ||
2703 | err = kvm_set_cr4(&svm->vcpu, val); | ||
2704 | break; | ||
2705 | case 8: | ||
2706 | err = kvm_set_cr8(&svm->vcpu, val); | ||
2707 | break; | ||
2708 | default: | ||
2709 | WARN(1, "unhandled write to CR%d", cr); | ||
2710 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
2711 | return 1; | ||
2712 | } | ||
2713 | } else { /* mov from cr */ | ||
2714 | switch (cr) { | ||
2715 | case 0: | ||
2716 | val = kvm_read_cr0(&svm->vcpu); | ||
2717 | break; | ||
2718 | case 2: | ||
2719 | val = svm->vcpu.arch.cr2; | ||
2720 | break; | ||
2721 | case 3: | ||
2722 | val = kvm_read_cr3(&svm->vcpu); | ||
2723 | break; | ||
2724 | case 4: | ||
2725 | val = kvm_read_cr4(&svm->vcpu); | ||
2726 | break; | ||
2727 | case 8: | ||
2728 | val = kvm_get_cr8(&svm->vcpu); | ||
2729 | break; | ||
2730 | default: | ||
2731 | WARN(1, "unhandled read from CR%d", cr); | ||
2732 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
2733 | return 1; | ||
2734 | } | ||
2735 | kvm_register_write(&svm->vcpu, reg, val); | ||
2736 | } | ||
2737 | kvm_complete_insn_gp(&svm->vcpu, err); | ||
2738 | |||
2739 | return 1; | ||
2523 | } | 2740 | } |
2524 | 2741 | ||
2525 | static int cr0_write_interception(struct vcpu_svm *svm) | 2742 | static int cr0_write_interception(struct vcpu_svm *svm) |
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm) | |||
2527 | struct kvm_vcpu *vcpu = &svm->vcpu; | 2744 | struct kvm_vcpu *vcpu = &svm->vcpu; |
2528 | int r; | 2745 | int r; |
2529 | 2746 | ||
2530 | r = emulate_instruction(&svm->vcpu, 0, 0, 0); | 2747 | r = cr_interception(svm); |
2531 | 2748 | ||
2532 | if (svm->nested.vmexit_rip) { | 2749 | if (svm->nested.vmexit_rip) { |
2533 | kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); | 2750 | kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); |
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm) | |||
2536 | svm->nested.vmexit_rip = 0; | 2753 | svm->nested.vmexit_rip = 0; |
2537 | } | 2754 | } |
2538 | 2755 | ||
2539 | return r == EMULATE_DONE; | 2756 | return r; |
2757 | } | ||
2758 | |||
2759 | static int dr_interception(struct vcpu_svm *svm) | ||
2760 | { | ||
2761 | int reg, dr; | ||
2762 | unsigned long val; | ||
2763 | int err; | ||
2764 | |||
2765 | if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) | ||
2766 | return emulate_on_interception(svm); | ||
2767 | |||
2768 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; | ||
2769 | dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; | ||
2770 | |||
2771 | if (dr >= 16) { /* mov to DRn */ | ||
2772 | val = kvm_register_read(&svm->vcpu, reg); | ||
2773 | kvm_set_dr(&svm->vcpu, dr - 16, val); | ||
2774 | } else { | ||
2775 | err = kvm_get_dr(&svm->vcpu, dr, &val); | ||
2776 | if (!err) | ||
2777 | kvm_register_write(&svm->vcpu, reg, val); | ||
2778 | } | ||
2779 | |||
2780 | return 1; | ||
2540 | } | 2781 | } |
2541 | 2782 | ||
2542 | static int cr8_write_interception(struct vcpu_svm *svm) | 2783 | static int cr8_write_interception(struct vcpu_svm *svm) |
2543 | { | 2784 | { |
2544 | struct kvm_run *kvm_run = svm->vcpu.run; | 2785 | struct kvm_run *kvm_run = svm->vcpu.run; |
2786 | int r; | ||
2545 | 2787 | ||
2546 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); | 2788 | u8 cr8_prev = kvm_get_cr8(&svm->vcpu); |
2547 | /* instruction emulation calls kvm_set_cr8() */ | 2789 | /* instruction emulation calls kvm_set_cr8() */ |
2548 | emulate_instruction(&svm->vcpu, 0, 0, 0); | 2790 | r = cr_interception(svm); |
2549 | if (irqchip_in_kernel(svm->vcpu.kvm)) { | 2791 | if (irqchip_in_kernel(svm->vcpu.kvm)) { |
2550 | svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | 2792 | clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
2551 | return 1; | 2793 | return r; |
2552 | } | 2794 | } |
2553 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) | 2795 | if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) |
2554 | return 1; | 2796 | return r; |
2555 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 2797 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; |
2556 | return 0; | 2798 | return 0; |
2557 | } | 2799 | } |
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
2562 | 2804 | ||
2563 | switch (ecx) { | 2805 | switch (ecx) { |
2564 | case MSR_IA32_TSC: { | 2806 | case MSR_IA32_TSC: { |
2565 | u64 tsc_offset; | 2807 | struct vmcb *vmcb = get_host_vmcb(svm); |
2566 | 2808 | ||
2567 | if (is_nested(svm)) | 2809 | *data = vmcb->control.tsc_offset + native_read_tsc(); |
2568 | tsc_offset = svm->nested.hsave->control.tsc_offset; | ||
2569 | else | ||
2570 | tsc_offset = svm->vmcb->control.tsc_offset; | ||
2571 | |||
2572 | *data = tsc_offset + native_read_tsc(); | ||
2573 | break; | 2810 | break; |
2574 | } | 2811 | } |
2575 | case MSR_STAR: | 2812 | case MSR_STAR: |
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2714 | svm->vmcb->save.sysenter_esp = data; | 2951 | svm->vmcb->save.sysenter_esp = data; |
2715 | break; | 2952 | break; |
2716 | case MSR_IA32_DEBUGCTLMSR: | 2953 | case MSR_IA32_DEBUGCTLMSR: |
2717 | if (!svm_has(SVM_FEATURE_LBRV)) { | 2954 | if (!boot_cpu_has(X86_FEATURE_LBRV)) { |
2718 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", | 2955 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", |
2719 | __func__, data); | 2956 | __func__, data); |
2720 | break; | 2957 | break; |
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2723 | return 1; | 2960 | return 1; |
2724 | 2961 | ||
2725 | svm->vmcb->save.dbgctl = data; | 2962 | svm->vmcb->save.dbgctl = data; |
2963 | mark_dirty(svm->vmcb, VMCB_LBR); | ||
2726 | if (data & (1ULL<<0)) | 2964 | if (data & (1ULL<<0)) |
2727 | svm_enable_lbrv(svm); | 2965 | svm_enable_lbrv(svm); |
2728 | else | 2966 | else |
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) | |||
2775 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); | 3013 | kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); |
2776 | svm_clear_vintr(svm); | 3014 | svm_clear_vintr(svm); |
2777 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 3015 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
3016 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
2778 | /* | 3017 | /* |
2779 | * If the user space waits to inject interrupts, exit as soon as | 3018 | * If the user space waits to inject interrupts, exit as soon as |
2780 | * possible | 3019 | * possible |
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm) | |||
2797 | } | 3036 | } |
2798 | 3037 | ||
2799 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | 3038 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { |
2800 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | 3039 | [SVM_EXIT_READ_CR0] = cr_interception, |
2801 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | 3040 | [SVM_EXIT_READ_CR3] = cr_interception, |
2802 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | 3041 | [SVM_EXIT_READ_CR4] = cr_interception, |
2803 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | 3042 | [SVM_EXIT_READ_CR8] = cr_interception, |
2804 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, | 3043 | [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, |
2805 | [SVM_EXIT_WRITE_CR0] = cr0_write_interception, | 3044 | [SVM_EXIT_WRITE_CR0] = cr0_write_interception, |
2806 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | 3045 | [SVM_EXIT_WRITE_CR3] = cr_interception, |
2807 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | 3046 | [SVM_EXIT_WRITE_CR4] = cr_interception, |
2808 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | 3047 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, |
2809 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | 3048 | [SVM_EXIT_READ_DR0] = dr_interception, |
2810 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | 3049 | [SVM_EXIT_READ_DR1] = dr_interception, |
2811 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | 3050 | [SVM_EXIT_READ_DR2] = dr_interception, |
2812 | [SVM_EXIT_READ_DR3] = emulate_on_interception, | 3051 | [SVM_EXIT_READ_DR3] = dr_interception, |
2813 | [SVM_EXIT_READ_DR4] = emulate_on_interception, | 3052 | [SVM_EXIT_READ_DR4] = dr_interception, |
2814 | [SVM_EXIT_READ_DR5] = emulate_on_interception, | 3053 | [SVM_EXIT_READ_DR5] = dr_interception, |
2815 | [SVM_EXIT_READ_DR6] = emulate_on_interception, | 3054 | [SVM_EXIT_READ_DR6] = dr_interception, |
2816 | [SVM_EXIT_READ_DR7] = emulate_on_interception, | 3055 | [SVM_EXIT_READ_DR7] = dr_interception, |
2817 | [SVM_EXIT_WRITE_DR0] = emulate_on_interception, | 3056 | [SVM_EXIT_WRITE_DR0] = dr_interception, |
2818 | [SVM_EXIT_WRITE_DR1] = emulate_on_interception, | 3057 | [SVM_EXIT_WRITE_DR1] = dr_interception, |
2819 | [SVM_EXIT_WRITE_DR2] = emulate_on_interception, | 3058 | [SVM_EXIT_WRITE_DR2] = dr_interception, |
2820 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | 3059 | [SVM_EXIT_WRITE_DR3] = dr_interception, |
2821 | [SVM_EXIT_WRITE_DR4] = emulate_on_interception, | 3060 | [SVM_EXIT_WRITE_DR4] = dr_interception, |
2822 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | 3061 | [SVM_EXIT_WRITE_DR5] = dr_interception, |
2823 | [SVM_EXIT_WRITE_DR6] = emulate_on_interception, | 3062 | [SVM_EXIT_WRITE_DR6] = dr_interception, |
2824 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | 3063 | [SVM_EXIT_WRITE_DR7] = dr_interception, |
2825 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, | 3064 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, |
2826 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, | 3065 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, |
2827 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | 3066 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, |
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { | |||
2854 | [SVM_EXIT_WBINVD] = emulate_on_interception, | 3093 | [SVM_EXIT_WBINVD] = emulate_on_interception, |
2855 | [SVM_EXIT_MONITOR] = invalid_op_interception, | 3094 | [SVM_EXIT_MONITOR] = invalid_op_interception, |
2856 | [SVM_EXIT_MWAIT] = invalid_op_interception, | 3095 | [SVM_EXIT_MWAIT] = invalid_op_interception, |
3096 | [SVM_EXIT_XSETBV] = xsetbv_interception, | ||
2857 | [SVM_EXIT_NPF] = pf_interception, | 3097 | [SVM_EXIT_NPF] = pf_interception, |
2858 | }; | 3098 | }; |
2859 | 3099 | ||
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu) | |||
2864 | struct vmcb_save_area *save = &svm->vmcb->save; | 3104 | struct vmcb_save_area *save = &svm->vmcb->save; |
2865 | 3105 | ||
2866 | pr_err("VMCB Control Area:\n"); | 3106 | pr_err("VMCB Control Area:\n"); |
2867 | pr_err("cr_read: %04x\n", control->intercept_cr_read); | 3107 | pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); |
2868 | pr_err("cr_write: %04x\n", control->intercept_cr_write); | 3108 | pr_err("cr_write: %04x\n", control->intercept_cr >> 16); |
2869 | pr_err("dr_read: %04x\n", control->intercept_dr_read); | 3109 | pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); |
2870 | pr_err("dr_write: %04x\n", control->intercept_dr_write); | 3110 | pr_err("dr_write: %04x\n", control->intercept_dr >> 16); |
2871 | pr_err("exceptions: %08x\n", control->intercept_exceptions); | 3111 | pr_err("exceptions: %08x\n", control->intercept_exceptions); |
2872 | pr_err("intercepts: %016llx\n", control->intercept); | 3112 | pr_err("intercepts: %016llx\n", control->intercept); |
2873 | pr_err("pause filter count: %d\n", control->pause_filter_count); | 3113 | pr_err("pause filter count: %d\n", control->pause_filter_count); |
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu) | |||
2950 | 3190 | ||
2951 | } | 3191 | } |
2952 | 3192 | ||
3193 | static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
3194 | { | ||
3195 | struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; | ||
3196 | |||
3197 | *info1 = control->exit_info_1; | ||
3198 | *info2 = control->exit_info_2; | ||
3199 | } | ||
3200 | |||
2953 | static int handle_exit(struct kvm_vcpu *vcpu) | 3201 | static int handle_exit(struct kvm_vcpu *vcpu) |
2954 | { | 3202 | { |
2955 | struct vcpu_svm *svm = to_svm(vcpu); | 3203 | struct vcpu_svm *svm = to_svm(vcpu); |
2956 | struct kvm_run *kvm_run = vcpu->run; | 3204 | struct kvm_run *kvm_run = vcpu->run; |
2957 | u32 exit_code = svm->vmcb->control.exit_code; | 3205 | u32 exit_code = svm->vmcb->control.exit_code; |
2958 | 3206 | ||
2959 | trace_kvm_exit(exit_code, vcpu); | 3207 | trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); |
2960 | 3208 | ||
2961 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) | 3209 | if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) |
2962 | vcpu->arch.cr0 = svm->vmcb->save.cr0; | 3210 | vcpu->arch.cr0 = svm->vmcb->save.cr0; |
2963 | if (npt_enabled) | 3211 | if (npt_enabled) |
2964 | vcpu->arch.cr3 = svm->vmcb->save.cr3; | 3212 | vcpu->arch.cr3 = svm->vmcb->save.cr3; |
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) | |||
2970 | return 1; | 3218 | return 1; |
2971 | } | 3219 | } |
2972 | 3220 | ||
2973 | if (is_nested(svm)) { | 3221 | if (is_guest_mode(vcpu)) { |
2974 | int vmexit; | 3222 | int vmexit; |
2975 | 3223 | ||
2976 | trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, | 3224 | trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, |
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm) | |||
3033 | 3281 | ||
3034 | struct svm_cpu_data *sd = per_cpu(svm_data, cpu); | 3282 | struct svm_cpu_data *sd = per_cpu(svm_data, cpu); |
3035 | 3283 | ||
3036 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
3037 | /* FIXME: handle wraparound of asid_generation */ | 3284 | /* FIXME: handle wraparound of asid_generation */ |
3038 | if (svm->asid_generation != sd->asid_generation) | 3285 | if (svm->asid_generation != sd->asid_generation) |
3039 | new_asid(svm, sd); | 3286 | new_asid(svm, sd); |
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) | |||
3045 | 3292 | ||
3046 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; | 3293 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; |
3047 | vcpu->arch.hflags |= HF_NMI_MASK; | 3294 | vcpu->arch.hflags |= HF_NMI_MASK; |
3048 | svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); | 3295 | set_intercept(svm, INTERCEPT_IRET); |
3049 | ++vcpu->stat.nmi_injections; | 3296 | ++vcpu->stat.nmi_injections; |
3050 | } | 3297 | } |
3051 | 3298 | ||
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
3058 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 3305 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
3059 | control->int_ctl |= V_IRQ_MASK | | 3306 | control->int_ctl |= V_IRQ_MASK | |
3060 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | 3307 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
3308 | mark_dirty(svm->vmcb, VMCB_INTR); | ||
3061 | } | 3309 | } |
3062 | 3310 | ||
3063 | static void svm_set_irq(struct kvm_vcpu *vcpu) | 3311 | static void svm_set_irq(struct kvm_vcpu *vcpu) |
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | |||
3077 | { | 3325 | { |
3078 | struct vcpu_svm *svm = to_svm(vcpu); | 3326 | struct vcpu_svm *svm = to_svm(vcpu); |
3079 | 3327 | ||
3080 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3328 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3081 | return; | 3329 | return; |
3082 | 3330 | ||
3083 | if (irr == -1) | 3331 | if (irr == -1) |
3084 | return; | 3332 | return; |
3085 | 3333 | ||
3086 | if (tpr >= irr) | 3334 | if (tpr >= irr) |
3087 | svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; | 3335 | set_cr_intercept(svm, INTERCEPT_CR8_WRITE); |
3088 | } | 3336 | } |
3089 | 3337 | ||
3090 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) | 3338 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu) |
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) | |||
3112 | 3360 | ||
3113 | if (masked) { | 3361 | if (masked) { |
3114 | svm->vcpu.arch.hflags |= HF_NMI_MASK; | 3362 | svm->vcpu.arch.hflags |= HF_NMI_MASK; |
3115 | svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); | 3363 | set_intercept(svm, INTERCEPT_IRET); |
3116 | } else { | 3364 | } else { |
3117 | svm->vcpu.arch.hflags &= ~HF_NMI_MASK; | 3365 | svm->vcpu.arch.hflags &= ~HF_NMI_MASK; |
3118 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); | 3366 | clr_intercept(svm, INTERCEPT_IRET); |
3119 | } | 3367 | } |
3120 | } | 3368 | } |
3121 | 3369 | ||
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
3131 | 3379 | ||
3132 | ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); | 3380 | ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); |
3133 | 3381 | ||
3134 | if (is_nested(svm)) | 3382 | if (is_guest_mode(vcpu)) |
3135 | return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); | 3383 | return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); |
3136 | 3384 | ||
3137 | return ret; | 3385 | return ret; |
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | |||
3177 | 3425 | ||
3178 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | 3426 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) |
3179 | { | 3427 | { |
3180 | force_new_asid(vcpu); | 3428 | struct vcpu_svm *svm = to_svm(vcpu); |
3429 | |||
3430 | if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) | ||
3431 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; | ||
3432 | else | ||
3433 | svm->asid_generation--; | ||
3181 | } | 3434 | } |
3182 | 3435 | ||
3183 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) | 3436 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) |
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) | |||
3188 | { | 3441 | { |
3189 | struct vcpu_svm *svm = to_svm(vcpu); | 3442 | struct vcpu_svm *svm = to_svm(vcpu); |
3190 | 3443 | ||
3191 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3444 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3192 | return; | 3445 | return; |
3193 | 3446 | ||
3194 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { | 3447 | if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { |
3195 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; | 3448 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; |
3196 | kvm_set_cr8(vcpu, cr8); | 3449 | kvm_set_cr8(vcpu, cr8); |
3197 | } | 3450 | } |
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
3202 | struct vcpu_svm *svm = to_svm(vcpu); | 3455 | struct vcpu_svm *svm = to_svm(vcpu); |
3203 | u64 cr8; | 3456 | u64 cr8; |
3204 | 3457 | ||
3205 | if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) | 3458 | if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) |
3206 | return; | 3459 | return; |
3207 | 3460 | ||
3208 | cr8 = kvm_get_cr8(vcpu); | 3461 | cr8 = kvm_get_cr8(vcpu); |
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) | |||
3289 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) | 3542 | static void svm_vcpu_run(struct kvm_vcpu *vcpu) |
3290 | { | 3543 | { |
3291 | struct vcpu_svm *svm = to_svm(vcpu); | 3544 | struct vcpu_svm *svm = to_svm(vcpu); |
3292 | u16 fs_selector; | ||
3293 | u16 gs_selector; | ||
3294 | u16 ldt_selector; | ||
3295 | 3545 | ||
3296 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3546 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
3297 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3547 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3308 | 3558 | ||
3309 | sync_lapic_to_cr8(vcpu); | 3559 | sync_lapic_to_cr8(vcpu); |
3310 | 3560 | ||
3311 | save_host_msrs(vcpu); | ||
3312 | savesegment(fs, fs_selector); | ||
3313 | savesegment(gs, gs_selector); | ||
3314 | ldt_selector = kvm_read_ldt(); | ||
3315 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | 3561 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
3316 | 3562 | ||
3317 | clgi(); | 3563 | clgi(); |
@@ -3389,20 +3635,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3389 | #endif | 3635 | #endif |
3390 | ); | 3636 | ); |
3391 | 3637 | ||
3392 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
3393 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
3394 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
3395 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
3396 | |||
3397 | load_host_msrs(vcpu); | ||
3398 | loadsegment(fs, fs_selector); | ||
3399 | #ifdef CONFIG_X86_64 | 3638 | #ifdef CONFIG_X86_64 |
3400 | load_gs_index(gs_selector); | 3639 | wrmsrl(MSR_GS_BASE, svm->host.gs_base); |
3401 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
3402 | #else | 3640 | #else |
3403 | loadsegment(gs, gs_selector); | 3641 | loadsegment(fs, svm->host.fs); |
3404 | #endif | 3642 | #endif |
3405 | kvm_load_ldt(ldt_selector); | ||
3406 | 3643 | ||
3407 | reload_tss(vcpu); | 3644 | reload_tss(vcpu); |
3408 | 3645 | ||
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3410 | 3647 | ||
3411 | stgi(); | 3648 | stgi(); |
3412 | 3649 | ||
3650 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
3651 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
3652 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
3653 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
3654 | |||
3413 | sync_cr8_to_lapic(vcpu); | 3655 | sync_cr8_to_lapic(vcpu); |
3414 | 3656 | ||
3415 | svm->next_rip = 0; | 3657 | svm->next_rip = 0; |
3416 | 3658 | ||
3659 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
3660 | |||
3661 | /* if exit due to PF check for async PF */ | ||
3662 | if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) | ||
3663 | svm->apf_reason = kvm_read_and_reset_pf_reason(); | ||
3664 | |||
3417 | if (npt_enabled) { | 3665 | if (npt_enabled) { |
3418 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); | 3666 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); |
3419 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); | 3667 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); |
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) | |||
3426 | if (unlikely(svm->vmcb->control.exit_code == | 3674 | if (unlikely(svm->vmcb->control.exit_code == |
3427 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) | 3675 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) |
3428 | svm_handle_mce(svm); | 3676 | svm_handle_mce(svm); |
3677 | |||
3678 | mark_all_clean(svm->vmcb); | ||
3429 | } | 3679 | } |
3430 | 3680 | ||
3431 | #undef R | 3681 | #undef R |
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
3435 | struct vcpu_svm *svm = to_svm(vcpu); | 3685 | struct vcpu_svm *svm = to_svm(vcpu); |
3436 | 3686 | ||
3437 | svm->vmcb->save.cr3 = root; | 3687 | svm->vmcb->save.cr3 = root; |
3438 | force_new_asid(vcpu); | 3688 | mark_dirty(svm->vmcb, VMCB_CR); |
3689 | svm_flush_tlb(vcpu); | ||
3439 | } | 3690 | } |
3440 | 3691 | ||
3441 | static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) | 3692 | static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) |
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
3443 | struct vcpu_svm *svm = to_svm(vcpu); | 3694 | struct vcpu_svm *svm = to_svm(vcpu); |
3444 | 3695 | ||
3445 | svm->vmcb->control.nested_cr3 = root; | 3696 | svm->vmcb->control.nested_cr3 = root; |
3697 | mark_dirty(svm->vmcb, VMCB_NPT); | ||
3446 | 3698 | ||
3447 | /* Also sync guest cr3 here in case we live migrate */ | 3699 | /* Also sync guest cr3 here in case we live migrate */ |
3448 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | 3700 | svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); |
3701 | mark_dirty(svm->vmcb, VMCB_CR); | ||
3449 | 3702 | ||
3450 | force_new_asid(vcpu); | 3703 | svm_flush_tlb(vcpu); |
3451 | } | 3704 | } |
3452 | 3705 | ||
3453 | static int is_disabled(void) | 3706 | static int is_disabled(void) |
@@ -3507,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) | |||
3507 | additional features */ | 3760 | additional features */ |
3508 | 3761 | ||
3509 | /* Support next_rip if host supports it */ | 3762 | /* Support next_rip if host supports it */ |
3510 | if (svm_has(SVM_FEATURE_NRIP)) | 3763 | if (boot_cpu_has(X86_FEATURE_NRIPS)) |
3511 | entry->edx |= SVM_FEATURE_NRIP; | 3764 | entry->edx |= SVM_FEATURE_NRIP; |
3512 | 3765 | ||
3513 | /* Support NPT for the guest if enabled */ | 3766 | /* Support NPT for the guest if enabled */ |
@@ -3567,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { | |||
3567 | { SVM_EXIT_WBINVD, "wbinvd" }, | 3820 | { SVM_EXIT_WBINVD, "wbinvd" }, |
3568 | { SVM_EXIT_MONITOR, "monitor" }, | 3821 | { SVM_EXIT_MONITOR, "monitor" }, |
3569 | { SVM_EXIT_MWAIT, "mwait" }, | 3822 | { SVM_EXIT_MWAIT, "mwait" }, |
3823 | { SVM_EXIT_XSETBV, "xsetbv" }, | ||
3570 | { SVM_EXIT_NPF, "npf" }, | 3824 | { SVM_EXIT_NPF, "npf" }, |
3571 | { -1, NULL } | 3825 | { -1, NULL } |
3572 | }; | 3826 | }; |
@@ -3590,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) | |||
3590 | { | 3844 | { |
3591 | struct vcpu_svm *svm = to_svm(vcpu); | 3845 | struct vcpu_svm *svm = to_svm(vcpu); |
3592 | 3846 | ||
3593 | svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; | 3847 | set_exception_intercept(svm, NM_VECTOR); |
3594 | if (is_nested(svm)) | ||
3595 | svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; | ||
3596 | update_cr0_intercept(svm); | 3848 | update_cr0_intercept(svm); |
3597 | } | 3849 | } |
3598 | 3850 | ||
@@ -3623,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3623 | .get_cpl = svm_get_cpl, | 3875 | .get_cpl = svm_get_cpl, |
3624 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, | 3876 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, |
3625 | .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, | 3877 | .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, |
3878 | .decache_cr3 = svm_decache_cr3, | ||
3626 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, | 3879 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, |
3627 | .set_cr0 = svm_set_cr0, | 3880 | .set_cr0 = svm_set_cr0, |
3628 | .set_cr3 = svm_set_cr3, | 3881 | .set_cr3 = svm_set_cr3, |
@@ -3663,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
3663 | .get_tdp_level = get_npt_level, | 3916 | .get_tdp_level = get_npt_level, |
3664 | .get_mt_mask = svm_get_mt_mask, | 3917 | .get_mt_mask = svm_get_mt_mask, |
3665 | 3918 | ||
3919 | .get_exit_info = svm_get_exit_info, | ||
3666 | .exit_reasons_str = svm_exit_reasons_str, | 3920 | .exit_reasons_str = svm_exit_reasons_str, |
3921 | |||
3667 | .get_lpage_level = svm_get_lpage_level, | 3922 | .get_lpage_level = svm_get_lpage_level, |
3668 | 3923 | ||
3669 | .cpuid_update = svm_cpuid_update, | 3924 | .cpuid_update = svm_cpuid_update, |
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a6544b8e7c0f..1357d7cf4ec8 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h | |||
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic, | |||
178 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) | 178 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) |
179 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) | 179 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) |
180 | 180 | ||
181 | #define KVM_ISA_VMX 1 | ||
182 | #define KVM_ISA_SVM 2 | ||
183 | |||
181 | /* | 184 | /* |
182 | * Tracepoint for kvm guest exit: | 185 | * Tracepoint for kvm guest exit: |
183 | */ | 186 | */ |
184 | TRACE_EVENT(kvm_exit, | 187 | TRACE_EVENT(kvm_exit, |
185 | TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), | 188 | TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), |
186 | TP_ARGS(exit_reason, vcpu), | 189 | TP_ARGS(exit_reason, vcpu, isa), |
187 | 190 | ||
188 | TP_STRUCT__entry( | 191 | TP_STRUCT__entry( |
189 | __field( unsigned int, exit_reason ) | 192 | __field( unsigned int, exit_reason ) |
190 | __field( unsigned long, guest_rip ) | 193 | __field( unsigned long, guest_rip ) |
194 | __field( u32, isa ) | ||
195 | __field( u64, info1 ) | ||
196 | __field( u64, info2 ) | ||
191 | ), | 197 | ), |
192 | 198 | ||
193 | TP_fast_assign( | 199 | TP_fast_assign( |
194 | __entry->exit_reason = exit_reason; | 200 | __entry->exit_reason = exit_reason; |
195 | __entry->guest_rip = kvm_rip_read(vcpu); | 201 | __entry->guest_rip = kvm_rip_read(vcpu); |
202 | __entry->isa = isa; | ||
203 | kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, | ||
204 | &__entry->info2); | ||
196 | ), | 205 | ), |
197 | 206 | ||
198 | TP_printk("reason %s rip 0x%lx", | 207 | TP_printk("reason %s rip 0x%lx info %llx %llx", |
199 | ftrace_print_symbols_seq(p, __entry->exit_reason, | 208 | ftrace_print_symbols_seq(p, __entry->exit_reason, |
200 | kvm_x86_ops->exit_reasons_str), | 209 | kvm_x86_ops->exit_reasons_str), |
201 | __entry->guest_rip) | 210 | __entry->guest_rip, __entry->info1, __entry->info2) |
202 | ); | 211 | ); |
203 | 212 | ||
204 | /* | 213 | /* |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8da0e45ff7c9..bf89ec2cfb82 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); | |||
69 | static int __read_mostly vmm_exclusive = 1; | 69 | static int __read_mostly vmm_exclusive = 1; |
70 | module_param(vmm_exclusive, bool, S_IRUGO); | 70 | module_param(vmm_exclusive, bool, S_IRUGO); |
71 | 71 | ||
72 | static int __read_mostly yield_on_hlt = 1; | ||
73 | module_param(yield_on_hlt, bool, S_IRUGO); | ||
74 | |||
72 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ | 75 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
73 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) | 76 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
74 | #define KVM_GUEST_CR0_MASK \ | 77 | #define KVM_GUEST_CR0_MASK \ |
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm); | |||
177 | static u64 construct_eptp(unsigned long root_hpa); | 180 | static u64 construct_eptp(unsigned long root_hpa); |
178 | static void kvm_cpu_vmxon(u64 addr); | 181 | static void kvm_cpu_vmxon(u64 addr); |
179 | static void kvm_cpu_vmxoff(void); | 182 | static void kvm_cpu_vmxoff(void); |
183 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
180 | 184 | ||
181 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | 185 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); |
182 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | 186 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); |
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b; | |||
188 | static unsigned long *vmx_msr_bitmap_legacy; | 192 | static unsigned long *vmx_msr_bitmap_legacy; |
189 | static unsigned long *vmx_msr_bitmap_longmode; | 193 | static unsigned long *vmx_msr_bitmap_longmode; |
190 | 194 | ||
195 | static bool cpu_has_load_ia32_efer; | ||
196 | |||
191 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | 197 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); |
192 | static DEFINE_SPINLOCK(vmx_vpid_lock); | 198 | static DEFINE_SPINLOCK(vmx_vpid_lock); |
193 | 199 | ||
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs) | |||
472 | u8 error; | 478 | u8 error; |
473 | 479 | ||
474 | asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" | 480 | asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" |
475 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | 481 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
476 | : "cc", "memory"); | 482 | : "cc", "memory"); |
477 | if (error) | 483 | if (error) |
478 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | 484 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", |
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs) | |||
485 | u8 error; | 491 | u8 error; |
486 | 492 | ||
487 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" | 493 | asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" |
488 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | 494 | : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) |
489 | : "cc", "memory"); | 495 | : "cc", "memory"); |
490 | if (error) | 496 | if (error) |
491 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | 497 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", |
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) | |||
565 | 571 | ||
566 | static unsigned long vmcs_readl(unsigned long field) | 572 | static unsigned long vmcs_readl(unsigned long field) |
567 | { | 573 | { |
568 | unsigned long value; | 574 | unsigned long value = 0; |
569 | 575 | ||
570 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) | 576 | asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) |
571 | : "=a"(value) : "d"(field) : "cc"); | 577 | : "+a"(value) : "d"(field) : "cc"); |
572 | return value; | 578 | return value; |
573 | } | 579 | } |
574 | 580 | ||
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) | |||
661 | unsigned i; | 667 | unsigned i; |
662 | struct msr_autoload *m = &vmx->msr_autoload; | 668 | struct msr_autoload *m = &vmx->msr_autoload; |
663 | 669 | ||
670 | if (msr == MSR_EFER && cpu_has_load_ia32_efer) { | ||
671 | vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); | ||
672 | vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); | ||
673 | return; | ||
674 | } | ||
675 | |||
664 | for (i = 0; i < m->nr; ++i) | 676 | for (i = 0; i < m->nr; ++i) |
665 | if (m->guest[i].index == msr) | 677 | if (m->guest[i].index == msr) |
666 | break; | 678 | break; |
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, | |||
680 | unsigned i; | 692 | unsigned i; |
681 | struct msr_autoload *m = &vmx->msr_autoload; | 693 | struct msr_autoload *m = &vmx->msr_autoload; |
682 | 694 | ||
695 | if (msr == MSR_EFER && cpu_has_load_ia32_efer) { | ||
696 | vmcs_write64(GUEST_IA32_EFER, guest_val); | ||
697 | vmcs_write64(HOST_IA32_EFER, host_val); | ||
698 | vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); | ||
699 | vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); | ||
700 | return; | ||
701 | } | ||
702 | |||
683 | for (i = 0; i < m->nr; ++i) | 703 | for (i = 0; i < m->nr; ++i) |
684 | if (m->guest[i].index == msr) | 704 | if (m->guest[i].index == msr) |
685 | break; | 705 | break; |
@@ -821,10 +841,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
821 | #endif | 841 | #endif |
822 | 842 | ||
823 | #ifdef CONFIG_X86_64 | 843 | #ifdef CONFIG_X86_64 |
824 | if (is_long_mode(&vmx->vcpu)) { | 844 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); |
825 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | 845 | if (is_long_mode(&vmx->vcpu)) |
826 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | 846 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); |
827 | } | ||
828 | #endif | 847 | #endif |
829 | for (i = 0; i < vmx->save_nmsrs; ++i) | 848 | for (i = 0; i < vmx->save_nmsrs; ++i) |
830 | kvm_set_shared_msr(vmx->guest_msrs[i].index, | 849 | kvm_set_shared_msr(vmx->guest_msrs[i].index, |
@@ -839,23 +858,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) | |||
839 | 858 | ||
840 | ++vmx->vcpu.stat.host_state_reload; | 859 | ++vmx->vcpu.stat.host_state_reload; |
841 | vmx->host_state.loaded = 0; | 860 | vmx->host_state.loaded = 0; |
842 | if (vmx->host_state.fs_reload_needed) | 861 | #ifdef CONFIG_X86_64 |
843 | loadsegment(fs, vmx->host_state.fs_sel); | 862 | if (is_long_mode(&vmx->vcpu)) |
863 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
864 | #endif | ||
844 | if (vmx->host_state.gs_ldt_reload_needed) { | 865 | if (vmx->host_state.gs_ldt_reload_needed) { |
845 | kvm_load_ldt(vmx->host_state.ldt_sel); | 866 | kvm_load_ldt(vmx->host_state.ldt_sel); |
846 | #ifdef CONFIG_X86_64 | 867 | #ifdef CONFIG_X86_64 |
847 | load_gs_index(vmx->host_state.gs_sel); | 868 | load_gs_index(vmx->host_state.gs_sel); |
848 | wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); | ||
849 | #else | 869 | #else |
850 | loadsegment(gs, vmx->host_state.gs_sel); | 870 | loadsegment(gs, vmx->host_state.gs_sel); |
851 | #endif | 871 | #endif |
852 | } | 872 | } |
873 | if (vmx->host_state.fs_reload_needed) | ||
874 | loadsegment(fs, vmx->host_state.fs_sel); | ||
853 | reload_tss(); | 875 | reload_tss(); |
854 | #ifdef CONFIG_X86_64 | 876 | #ifdef CONFIG_X86_64 |
855 | if (is_long_mode(&vmx->vcpu)) { | 877 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); |
856 | rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); | ||
857 | wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); | ||
858 | } | ||
859 | #endif | 878 | #endif |
860 | if (current_thread_info()->status & TS_USEDFPU) | 879 | if (current_thread_info()->status & TS_USEDFPU) |
861 | clts(); | 880 | clts(); |
@@ -1010,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
1010 | vmx_set_interrupt_shadow(vcpu, 0); | 1029 | vmx_set_interrupt_shadow(vcpu, 0); |
1011 | } | 1030 | } |
1012 | 1031 | ||
1032 | static void vmx_clear_hlt(struct kvm_vcpu *vcpu) | ||
1033 | { | ||
1034 | /* Ensure that we clear the HLT state in the VMCS. We don't need to | ||
1035 | * explicitly skip the instruction because if the HLT state is set, then | ||
1036 | * the instruction is already executing and RIP has already been | ||
1037 | * advanced. */ | ||
1038 | if (!yield_on_hlt && | ||
1039 | vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) | ||
1040 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); | ||
1041 | } | ||
1042 | |||
1013 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 1043 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
1014 | bool has_error_code, u32 error_code, | 1044 | bool has_error_code, u32 error_code, |
1015 | bool reinject) | 1045 | bool reinject) |
@@ -1036,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1036 | intr_info |= INTR_TYPE_HARD_EXCEPTION; | 1066 | intr_info |= INTR_TYPE_HARD_EXCEPTION; |
1037 | 1067 | ||
1038 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | 1068 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); |
1069 | vmx_clear_hlt(vcpu); | ||
1039 | } | 1070 | } |
1040 | 1071 | ||
1041 | static bool vmx_rdtscp_supported(void) | 1072 | static bool vmx_rdtscp_supported(void) |
@@ -1306,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void) | |||
1306 | && tboot_enabled()) | 1337 | && tboot_enabled()) |
1307 | return 1; | 1338 | return 1; |
1308 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) | 1339 | if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) |
1309 | && !tboot_enabled()) | 1340 | && !tboot_enabled()) { |
1341 | printk(KERN_WARNING "kvm: disable TXT in the BIOS or " | ||
1342 | " activate TXT before enabling KVM\n"); | ||
1310 | return 1; | 1343 | return 1; |
1344 | } | ||
1311 | } | 1345 | } |
1312 | 1346 | ||
1313 | return 0; | 1347 | return 0; |
@@ -1401,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | |||
1401 | return 0; | 1435 | return 0; |
1402 | } | 1436 | } |
1403 | 1437 | ||
1438 | static __init bool allow_1_setting(u32 msr, u32 ctl) | ||
1439 | { | ||
1440 | u32 vmx_msr_low, vmx_msr_high; | ||
1441 | |||
1442 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
1443 | return vmx_msr_high & ctl; | ||
1444 | } | ||
1445 | |||
1404 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | 1446 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) |
1405 | { | 1447 | { |
1406 | u32 vmx_msr_low, vmx_msr_high; | 1448 | u32 vmx_msr_low, vmx_msr_high; |
@@ -1417,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1417 | &_pin_based_exec_control) < 0) | 1459 | &_pin_based_exec_control) < 0) |
1418 | return -EIO; | 1460 | return -EIO; |
1419 | 1461 | ||
1420 | min = CPU_BASED_HLT_EXITING | | 1462 | min = |
1421 | #ifdef CONFIG_X86_64 | 1463 | #ifdef CONFIG_X86_64 |
1422 | CPU_BASED_CR8_LOAD_EXITING | | 1464 | CPU_BASED_CR8_LOAD_EXITING | |
1423 | CPU_BASED_CR8_STORE_EXITING | | 1465 | CPU_BASED_CR8_STORE_EXITING | |
@@ -1430,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1430 | CPU_BASED_MWAIT_EXITING | | 1472 | CPU_BASED_MWAIT_EXITING | |
1431 | CPU_BASED_MONITOR_EXITING | | 1473 | CPU_BASED_MONITOR_EXITING | |
1432 | CPU_BASED_INVLPG_EXITING; | 1474 | CPU_BASED_INVLPG_EXITING; |
1475 | |||
1476 | if (yield_on_hlt) | ||
1477 | min |= CPU_BASED_HLT_EXITING; | ||
1478 | |||
1433 | opt = CPU_BASED_TPR_SHADOW | | 1479 | opt = CPU_BASED_TPR_SHADOW | |
1434 | CPU_BASED_USE_MSR_BITMAPS | | 1480 | CPU_BASED_USE_MSR_BITMAPS | |
1435 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1481 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
@@ -1511,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1511 | vmcs_conf->vmexit_ctrl = _vmexit_control; | 1557 | vmcs_conf->vmexit_ctrl = _vmexit_control; |
1512 | vmcs_conf->vmentry_ctrl = _vmentry_control; | 1558 | vmcs_conf->vmentry_ctrl = _vmentry_control; |
1513 | 1559 | ||
1560 | cpu_has_load_ia32_efer = | ||
1561 | allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, | ||
1562 | VM_ENTRY_LOAD_IA32_EFER) | ||
1563 | && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, | ||
1564 | VM_EXIT_LOAD_IA32_EFER); | ||
1565 | |||
1514 | return 0; | 1566 | return 0; |
1515 | } | 1567 | } |
1516 | 1568 | ||
@@ -1684,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1684 | save->limit = vmcs_read32(sf->limit); | 1736 | save->limit = vmcs_read32(sf->limit); |
1685 | save->ar = vmcs_read32(sf->ar_bytes); | 1737 | save->ar = vmcs_read32(sf->ar_bytes); |
1686 | vmcs_write16(sf->selector, save->base >> 4); | 1738 | vmcs_write16(sf->selector, save->base >> 4); |
1687 | vmcs_write32(sf->base, save->base & 0xfffff); | 1739 | vmcs_write32(sf->base, save->base & 0xffff0); |
1688 | vmcs_write32(sf->limit, 0xffff); | 1740 | vmcs_write32(sf->limit, 0xffff); |
1689 | vmcs_write32(sf->ar_bytes, 0xf3); | 1741 | vmcs_write32(sf->ar_bytes, 0xf3); |
1742 | if (save->base & 0xf) | ||
1743 | printk_once(KERN_WARNING "kvm: segment base is not paragraph" | ||
1744 | " aligned when entering protected mode (seg=%d)", | ||
1745 | seg); | ||
1690 | } | 1746 | } |
1691 | 1747 | ||
1692 | static void enter_rmode(struct kvm_vcpu *vcpu) | 1748 | static void enter_rmode(struct kvm_vcpu *vcpu) |
@@ -1815,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) | |||
1815 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; | 1871 | vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; |
1816 | } | 1872 | } |
1817 | 1873 | ||
1874 | static void vmx_decache_cr3(struct kvm_vcpu *vcpu) | ||
1875 | { | ||
1876 | if (enable_ept && is_paging(vcpu)) | ||
1877 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
1878 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
1879 | } | ||
1880 | |||
1818 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1881 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1819 | { | 1882 | { |
1820 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; | 1883 | ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; |
@@ -1858,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
1858 | unsigned long cr0, | 1921 | unsigned long cr0, |
1859 | struct kvm_vcpu *vcpu) | 1922 | struct kvm_vcpu *vcpu) |
1860 | { | 1923 | { |
1924 | vmx_decache_cr3(vcpu); | ||
1861 | if (!(cr0 & X86_CR0_PG)) { | 1925 | if (!(cr0 & X86_CR0_PG)) { |
1862 | /* From paging/starting to nonpaging */ | 1926 | /* From paging/starting to nonpaging */ |
1863 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 1927 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
@@ -1938,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1938 | if (enable_ept) { | 2002 | if (enable_ept) { |
1939 | eptp = construct_eptp(cr3); | 2003 | eptp = construct_eptp(cr3); |
1940 | vmcs_write64(EPT_POINTER, eptp); | 2004 | vmcs_write64(EPT_POINTER, eptp); |
1941 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : | 2005 | guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : |
1942 | vcpu->kvm->arch.ept_identity_map_addr; | 2006 | vcpu->kvm->arch.ept_identity_map_addr; |
1943 | ept_load_pdptrs(vcpu); | 2007 | ept_load_pdptrs(vcpu); |
1944 | } | 2008 | } |
@@ -2726,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2726 | vmcs_writel(GUEST_IDTR_BASE, 0); | 2790 | vmcs_writel(GUEST_IDTR_BASE, 0); |
2727 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | 2791 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); |
2728 | 2792 | ||
2729 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | 2793 | vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); |
2730 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | 2794 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); |
2731 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | 2795 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); |
2732 | 2796 | ||
@@ -2788,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) | |||
2788 | return; | 2852 | return; |
2789 | } | 2853 | } |
2790 | 2854 | ||
2855 | if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { | ||
2856 | enable_irq_window(vcpu); | ||
2857 | return; | ||
2858 | } | ||
2791 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2859 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2792 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; | 2860 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; |
2793 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 2861 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
@@ -2815,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
2815 | } else | 2883 | } else |
2816 | intr |= INTR_TYPE_EXT_INTR; | 2884 | intr |= INTR_TYPE_EXT_INTR; |
2817 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); | 2885 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); |
2886 | vmx_clear_hlt(vcpu); | ||
2818 | } | 2887 | } |
2819 | 2888 | ||
2820 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | 2889 | static void vmx_inject_nmi(struct kvm_vcpu *vcpu) |
@@ -2842,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2842 | } | 2911 | } |
2843 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2912 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2844 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2913 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2914 | vmx_clear_hlt(vcpu); | ||
2845 | } | 2915 | } |
2846 | 2916 | ||
2847 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | 2917 | static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) |
@@ -2850,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) | |||
2850 | return 0; | 2920 | return 0; |
2851 | 2921 | ||
2852 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 2922 | return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & |
2853 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); | 2923 | (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
2924 | | GUEST_INTR_STATE_NMI)); | ||
2854 | } | 2925 | } |
2855 | 2926 | ||
2856 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) | 2927 | static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) |
@@ -2911,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2911 | * Cause the #SS fault with 0 error code in VM86 mode. | 2982 | * Cause the #SS fault with 0 error code in VM86 mode. |
2912 | */ | 2983 | */ |
2913 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 2984 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
2914 | if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) | 2985 | if (emulate_instruction(vcpu, 0) == EMULATE_DONE) |
2915 | return 1; | 2986 | return 1; |
2916 | /* | 2987 | /* |
2917 | * Forward all other exceptions that are valid in real mode. | 2988 | * Forward all other exceptions that are valid in real mode. |
@@ -3008,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3008 | } | 3079 | } |
3009 | 3080 | ||
3010 | if (is_invalid_opcode(intr_info)) { | 3081 | if (is_invalid_opcode(intr_info)) { |
3011 | er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); | 3082 | er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); |
3012 | if (er != EMULATE_DONE) | 3083 | if (er != EMULATE_DONE) |
3013 | kvm_queue_exception(vcpu, UD_VECTOR); | 3084 | kvm_queue_exception(vcpu, UD_VECTOR); |
3014 | return 1; | 3085 | return 1; |
@@ -3027,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) | |||
3027 | 3098 | ||
3028 | if (kvm_event_needs_reinjection(vcpu)) | 3099 | if (kvm_event_needs_reinjection(vcpu)) |
3029 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 3100 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
3030 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 3101 | return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); |
3031 | } | 3102 | } |
3032 | 3103 | ||
3033 | if (vmx->rmode.vm86_active && | 3104 | if (vmx->rmode.vm86_active && |
@@ -3099,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu) | |||
3099 | ++vcpu->stat.io_exits; | 3170 | ++vcpu->stat.io_exits; |
3100 | 3171 | ||
3101 | if (string || in) | 3172 | if (string || in) |
3102 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 3173 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
3103 | 3174 | ||
3104 | port = exit_qualification >> 16; | 3175 | port = exit_qualification >> 16; |
3105 | size = (exit_qualification & 7) + 1; | 3176 | size = (exit_qualification & 7) + 1; |
@@ -3119,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
3119 | hypercall[2] = 0xc1; | 3190 | hypercall[2] = 0xc1; |
3120 | } | 3191 | } |
3121 | 3192 | ||
3122 | static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) | ||
3123 | { | ||
3124 | if (err) | ||
3125 | kvm_inject_gp(vcpu, 0); | ||
3126 | else | ||
3127 | skip_emulated_instruction(vcpu); | ||
3128 | } | ||
3129 | |||
3130 | static int handle_cr(struct kvm_vcpu *vcpu) | 3193 | static int handle_cr(struct kvm_vcpu *vcpu) |
3131 | { | 3194 | { |
3132 | unsigned long exit_qualification, val; | 3195 | unsigned long exit_qualification, val; |
@@ -3144,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3144 | switch (cr) { | 3207 | switch (cr) { |
3145 | case 0: | 3208 | case 0: |
3146 | err = kvm_set_cr0(vcpu, val); | 3209 | err = kvm_set_cr0(vcpu, val); |
3147 | complete_insn_gp(vcpu, err); | 3210 | kvm_complete_insn_gp(vcpu, err); |
3148 | return 1; | 3211 | return 1; |
3149 | case 3: | 3212 | case 3: |
3150 | err = kvm_set_cr3(vcpu, val); | 3213 | err = kvm_set_cr3(vcpu, val); |
3151 | complete_insn_gp(vcpu, err); | 3214 | kvm_complete_insn_gp(vcpu, err); |
3152 | return 1; | 3215 | return 1; |
3153 | case 4: | 3216 | case 4: |
3154 | err = kvm_set_cr4(vcpu, val); | 3217 | err = kvm_set_cr4(vcpu, val); |
3155 | complete_insn_gp(vcpu, err); | 3218 | kvm_complete_insn_gp(vcpu, err); |
3156 | return 1; | 3219 | return 1; |
3157 | case 8: { | 3220 | case 8: { |
3158 | u8 cr8_prev = kvm_get_cr8(vcpu); | 3221 | u8 cr8_prev = kvm_get_cr8(vcpu); |
3159 | u8 cr8 = kvm_register_read(vcpu, reg); | 3222 | u8 cr8 = kvm_register_read(vcpu, reg); |
3160 | kvm_set_cr8(vcpu, cr8); | 3223 | err = kvm_set_cr8(vcpu, cr8); |
3161 | skip_emulated_instruction(vcpu); | 3224 | kvm_complete_insn_gp(vcpu, err); |
3162 | if (irqchip_in_kernel(vcpu->kvm)) | 3225 | if (irqchip_in_kernel(vcpu->kvm)) |
3163 | return 1; | 3226 | return 1; |
3164 | if (cr8_prev <= cr8) | 3227 | if (cr8_prev <= cr8) |
@@ -3177,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) | |||
3177 | case 1: /*mov from cr*/ | 3240 | case 1: /*mov from cr*/ |
3178 | switch (cr) { | 3241 | switch (cr) { |
3179 | case 3: | 3242 | case 3: |
3180 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); | 3243 | val = kvm_read_cr3(vcpu); |
3181 | trace_kvm_cr_read(cr, vcpu->arch.cr3); | 3244 | kvm_register_write(vcpu, reg, val); |
3245 | trace_kvm_cr_read(cr, val); | ||
3182 | skip_emulated_instruction(vcpu); | 3246 | skip_emulated_instruction(vcpu); |
3183 | return 1; | 3247 | return 1; |
3184 | case 8: | 3248 | case 8: |
@@ -3350,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu) | |||
3350 | return 1; | 3414 | return 1; |
3351 | } | 3415 | } |
3352 | 3416 | ||
3417 | static int handle_invd(struct kvm_vcpu *vcpu) | ||
3418 | { | ||
3419 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; | ||
3420 | } | ||
3421 | |||
3353 | static int handle_invlpg(struct kvm_vcpu *vcpu) | 3422 | static int handle_invlpg(struct kvm_vcpu *vcpu) |
3354 | { | 3423 | { |
3355 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 3424 | unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
@@ -3378,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) | |||
3378 | 3447 | ||
3379 | static int handle_apic_access(struct kvm_vcpu *vcpu) | 3448 | static int handle_apic_access(struct kvm_vcpu *vcpu) |
3380 | { | 3449 | { |
3381 | return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; | 3450 | return emulate_instruction(vcpu, 0) == EMULATE_DONE; |
3382 | } | 3451 | } |
3383 | 3452 | ||
3384 | static int handle_task_switch(struct kvm_vcpu *vcpu) | 3453 | static int handle_task_switch(struct kvm_vcpu *vcpu) |
@@ -3477,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) | |||
3477 | 3546 | ||
3478 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 3547 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3479 | trace_kvm_page_fault(gpa, exit_qualification); | 3548 | trace_kvm_page_fault(gpa, exit_qualification); |
3480 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); | 3549 | return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); |
3481 | } | 3550 | } |
3482 | 3551 | ||
3483 | static u64 ept_rsvd_mask(u64 spte, int level) | 3552 | static u64 ept_rsvd_mask(u64 spte, int level) |
@@ -3593,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) | |||
3593 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) | 3662 | && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) |
3594 | return handle_interrupt_window(&vmx->vcpu); | 3663 | return handle_interrupt_window(&vmx->vcpu); |
3595 | 3664 | ||
3596 | err = emulate_instruction(vcpu, 0, 0, 0); | 3665 | err = emulate_instruction(vcpu, 0); |
3597 | 3666 | ||
3598 | if (err == EMULATE_DO_MMIO) { | 3667 | if (err == EMULATE_DO_MMIO) { |
3599 | ret = 0; | 3668 | ret = 0; |
@@ -3650,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3650 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | 3719 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
3651 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 3720 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
3652 | [EXIT_REASON_HLT] = handle_halt, | 3721 | [EXIT_REASON_HLT] = handle_halt, |
3722 | [EXIT_REASON_INVD] = handle_invd, | ||
3653 | [EXIT_REASON_INVLPG] = handle_invlpg, | 3723 | [EXIT_REASON_INVLPG] = handle_invlpg, |
3654 | [EXIT_REASON_VMCALL] = handle_vmcall, | 3724 | [EXIT_REASON_VMCALL] = handle_vmcall, |
3655 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, | 3725 | [EXIT_REASON_VMCLEAR] = handle_vmx_insn, |
@@ -3677,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { | |||
3677 | static const int kvm_vmx_max_exit_handlers = | 3747 | static const int kvm_vmx_max_exit_handlers = |
3678 | ARRAY_SIZE(kvm_vmx_exit_handlers); | 3748 | ARRAY_SIZE(kvm_vmx_exit_handlers); |
3679 | 3749 | ||
3750 | static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | ||
3751 | { | ||
3752 | *info1 = vmcs_readl(EXIT_QUALIFICATION); | ||
3753 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | ||
3754 | } | ||
3755 | |||
3680 | /* | 3756 | /* |
3681 | * The guest has exited. See if we can fix it or if we need userspace | 3757 | * The guest has exited. See if we can fix it or if we need userspace |
3682 | * assistance. | 3758 | * assistance. |
@@ -3687,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
3687 | u32 exit_reason = vmx->exit_reason; | 3763 | u32 exit_reason = vmx->exit_reason; |
3688 | u32 vectoring_info = vmx->idt_vectoring_info; | 3764 | u32 vectoring_info = vmx->idt_vectoring_info; |
3689 | 3765 | ||
3690 | trace_kvm_exit(exit_reason, vcpu); | 3766 | trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); |
3691 | 3767 | ||
3692 | /* If guest state is invalid, start emulating */ | 3768 | /* If guest state is invalid, start emulating */ |
3693 | if (vmx->emulation_required && emulate_invalid_guest_state) | 3769 | if (vmx->emulation_required && emulate_invalid_guest_state) |
3694 | return handle_invalid_guest_state(vcpu); | 3770 | return handle_invalid_guest_state(vcpu); |
3695 | 3771 | ||
3696 | /* Access CR3 don't cause VMExit in paging mode, so we need | ||
3697 | * to sync with guest real CR3. */ | ||
3698 | if (enable_ept && is_paging(vcpu)) | ||
3699 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | ||
3700 | |||
3701 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { | 3772 | if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { |
3702 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 3773 | vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3703 | vcpu->run->fail_entry.hardware_entry_failure_reason | 3774 | vcpu->run->fail_entry.hardware_entry_failure_reason |
@@ -4014,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
4014 | ); | 4085 | ); |
4015 | 4086 | ||
4016 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) | 4087 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) |
4017 | | (1 << VCPU_EXREG_PDPTR)); | 4088 | | (1 << VCPU_EXREG_PDPTR) |
4089 | | (1 << VCPU_EXREG_CR3)); | ||
4018 | vcpu->arch.regs_dirty = 0; | 4090 | vcpu->arch.regs_dirty = 0; |
4019 | 4091 | ||
4020 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 4092 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
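The hunk above stops treating CR3 as always-available after a VM exit: VCPU_EXREG_CR3 is now cleared from regs_avail together with RIP/RSP/PDPTR, and readers go through kvm_read_cr3(), which decaches GUEST_CR3 only when it is actually needed. A minimal userspace sketch of that lazy register-cache pattern (the struct, stub and constants here are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

enum { REG_RIP, REG_RSP, REG_PDPTR, REG_CR3 };

struct vcpu {
        uint64_t cr3;
        uint32_t regs_avail;    /* bit set => cached value is current */
};

/* stand-in for vmcs_readl(GUEST_CR3) */
static uint64_t read_guest_cr3_from_vmcs(void)
{
        return 0x1000;
}

static uint64_t read_cr3(struct vcpu *v)
{
        if (!(v->regs_avail & (1u << REG_CR3))) {       /* decache on first use */
                v->cr3 = read_guest_cr3_from_vmcs();
                v->regs_avail |= 1u << REG_CR3;
        }
        return v->cr3;
}

int main(void)
{
        /* after a VM exit the CR3 bit is cleared instead of reading GUEST_CR3
         * unconditionally; the first read_cr3() call refills the cache */
        struct vcpu v = { .cr3 = 0, .regs_avail = 0 };

        printf("cr3 = %#llx\n", (unsigned long long)read_cr3(&v));
        return 0;
}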
@@ -4228,11 +4300,6 @@ static int vmx_get_lpage_level(void) | |||
4228 | return PT_PDPE_LEVEL; | 4300 | return PT_PDPE_LEVEL; |
4229 | } | 4301 | } |
4230 | 4302 | ||
4231 | static inline u32 bit(int bitno) | ||
4232 | { | ||
4233 | return 1 << (bitno & 31); | ||
4234 | } | ||
4235 | |||
4236 | static void vmx_cpuid_update(struct kvm_vcpu *vcpu) | 4303 | static void vmx_cpuid_update(struct kvm_vcpu *vcpu) |
4237 | { | 4304 | { |
4238 | struct kvm_cpuid_entry2 *best; | 4305 | struct kvm_cpuid_entry2 *best; |
@@ -4286,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4286 | .get_cpl = vmx_get_cpl, | 4353 | .get_cpl = vmx_get_cpl, |
4287 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | 4354 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, |
4288 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, | 4355 | .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, |
4356 | .decache_cr3 = vmx_decache_cr3, | ||
4289 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | 4357 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, |
4290 | .set_cr0 = vmx_set_cr0, | 4358 | .set_cr0 = vmx_set_cr0, |
4291 | .set_cr3 = vmx_set_cr3, | 4359 | .set_cr3 = vmx_set_cr3, |
@@ -4326,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
4326 | .get_tdp_level = get_ept_level, | 4394 | .get_tdp_level = get_ept_level, |
4327 | .get_mt_mask = vmx_get_mt_mask, | 4395 | .get_mt_mask = vmx_get_mt_mask, |
4328 | 4396 | ||
4397 | .get_exit_info = vmx_get_exit_info, | ||
4329 | .exit_reasons_str = vmx_exit_reasons_str, | 4398 | .exit_reasons_str = vmx_exit_reasons_str, |
4399 | |||
4330 | .get_lpage_level = vmx_get_lpage_level, | 4400 | .get_lpage_level = vmx_get_lpage_level, |
4331 | 4401 | ||
4332 | .cpuid_update = vmx_cpuid_update, | 4402 | .cpuid_update = vmx_cpuid_update, |
@@ -4402,8 +4472,6 @@ static int __init vmx_init(void) | |||
4402 | 4472 | ||
4403 | if (enable_ept) { | 4473 | if (enable_ept) { |
4404 | bypass_guest_pf = 0; | 4474 | bypass_guest_pf = 0; |
4405 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | | ||
4406 | VMX_EPT_WRITABLE_MASK); | ||
4407 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, | 4475 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
4408 | VMX_EPT_EXECUTABLE_MASK); | 4476 | VMX_EPT_EXECUTABLE_MASK); |
4409 | kvm_enable_tdp(); | 4477 | kvm_enable_tdp(); |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cdac9e592aa5..bcc0efce85bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/perf_event.h> | 44 | #include <linux/perf_event.h> |
45 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
46 | #include <linux/hash.h> | ||
46 | #include <trace/events/kvm.h> | 47 | #include <trace/events/kvm.h> |
47 | 48 | ||
48 | #define CREATE_TRACE_POINTS | 49 | #define CREATE_TRACE_POINTS |
@@ -155,9 +156,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
155 | 156 | ||
156 | u64 __read_mostly host_xcr0; | 157 | u64 __read_mostly host_xcr0; |
157 | 158 | ||
158 | static inline u32 bit(int bitno) | 159 | static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) |
159 | { | 160 | { |
160 | return 1 << (bitno & 31); | 161 | int i; |
162 | for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) | ||
163 | vcpu->arch.apf.gfns[i] = ~0; | ||
161 | } | 164 | } |
162 | 165 | ||
163 | static void kvm_on_user_return(struct user_return_notifier *urn) | 166 | static void kvm_on_user_return(struct user_return_notifier *urn) |
@@ -331,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) | |||
331 | } | 334 | } |
332 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); | 335 | EXPORT_SYMBOL_GPL(kvm_requeue_exception); |
333 | 336 | ||
334 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu) | 337 | void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) |
335 | { | 338 | { |
336 | unsigned error_code = vcpu->arch.fault.error_code; | 339 | if (err) |
340 | kvm_inject_gp(vcpu, 0); | ||
341 | else | ||
342 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
343 | } | ||
344 | EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); | ||
337 | 345 | ||
346 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) | ||
347 | { | ||
338 | ++vcpu->stat.pf_guest; | 348 | ++vcpu->stat.pf_guest; |
339 | vcpu->arch.cr2 = vcpu->arch.fault.address; | 349 | vcpu->arch.cr2 = fault->address; |
340 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | 350 | kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); |
341 | } | 351 | } |
342 | 352 | ||
343 | void kvm_propagate_fault(struct kvm_vcpu *vcpu) | 353 | void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) |
344 | { | 354 | { |
345 | if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) | 355 | if (mmu_is_nested(vcpu) && !fault->nested_page_fault) |
346 | vcpu->arch.nested_mmu.inject_page_fault(vcpu); | 356 | vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); |
347 | else | 357 | else |
348 | vcpu->arch.mmu.inject_page_fault(vcpu); | 358 | vcpu->arch.mmu.inject_page_fault(vcpu, fault); |
349 | |||
350 | vcpu->arch.fault.nested = false; | ||
351 | } | 359 | } |
352 | 360 | ||
353 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) | 361 | void kvm_inject_nmi(struct kvm_vcpu *vcpu) |
@@ -465,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
465 | (unsigned long *)&vcpu->arch.regs_avail)) | 473 | (unsigned long *)&vcpu->arch.regs_avail)) |
466 | return true; | 474 | return true; |
467 | 475 | ||
468 | gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; | 476 | gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; |
469 | offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); | 477 | offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); |
470 | r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), | 478 | r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), |
471 | PFERR_USER_MASK | PFERR_WRITE_MASK); | 479 | PFERR_USER_MASK | PFERR_WRITE_MASK); |
472 | if (r < 0) | 480 | if (r < 0) |
@@ -511,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
511 | } else | 519 | } else |
512 | #endif | 520 | #endif |
513 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, | 521 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
514 | vcpu->arch.cr3)) | 522 | kvm_read_cr3(vcpu))) |
515 | return 1; | 523 | return 1; |
516 | } | 524 | } |
517 | 525 | ||
518 | kvm_x86_ops->set_cr0(vcpu, cr0); | 526 | kvm_x86_ops->set_cr0(vcpu, cr0); |
519 | 527 | ||
528 | if ((cr0 ^ old_cr0) & X86_CR0_PG) | ||
529 | kvm_clear_async_pf_completion_queue(vcpu); | ||
530 | |||
520 | if ((cr0 ^ old_cr0) & update_bits) | 531 | if ((cr0 ^ old_cr0) & update_bits) |
521 | kvm_mmu_reset_context(vcpu); | 532 | kvm_mmu_reset_context(vcpu); |
522 | return 0; | 533 | return 0; |
@@ -600,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
600 | return 1; | 611 | return 1; |
601 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) | 612 | } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
602 | && ((cr4 ^ old_cr4) & pdptr_bits) | 613 | && ((cr4 ^ old_cr4) & pdptr_bits) |
603 | && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) | 614 | && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
615 | kvm_read_cr3(vcpu))) | ||
604 | return 1; | 616 | return 1; |
605 | 617 | ||
606 | if (cr4 & X86_CR4_VMXE) | 618 | if (cr4 & X86_CR4_VMXE) |
@@ -620,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); | |||
620 | 632 | ||
621 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 633 | int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
622 | { | 634 | { |
623 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 635 | if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { |
624 | kvm_mmu_sync_roots(vcpu); | 636 | kvm_mmu_sync_roots(vcpu); |
625 | kvm_mmu_flush_tlb(vcpu); | 637 | kvm_mmu_flush_tlb(vcpu); |
626 | return 0; | 638 | return 0; |
@@ -655,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
655 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | 667 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) |
656 | return 1; | 668 | return 1; |
657 | vcpu->arch.cr3 = cr3; | 669 | vcpu->arch.cr3 = cr3; |
670 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
658 | vcpu->arch.mmu.new_cr3(vcpu); | 671 | vcpu->arch.mmu.new_cr3(vcpu); |
659 | return 0; | 672 | return 0; |
660 | } | 673 | } |
661 | EXPORT_SYMBOL_GPL(kvm_set_cr3); | 674 | EXPORT_SYMBOL_GPL(kvm_set_cr3); |
662 | 675 | ||
663 | int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | 676 | int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
664 | { | 677 | { |
665 | if (cr8 & CR8_RESERVED_BITS) | 678 | if (cr8 & CR8_RESERVED_BITS) |
666 | return 1; | 679 | return 1; |
@@ -670,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
670 | vcpu->arch.cr8 = cr8; | 683 | vcpu->arch.cr8 = cr8; |
671 | return 0; | 684 | return 0; |
672 | } | 685 | } |
673 | |||
674 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
675 | { | ||
676 | if (__kvm_set_cr8(vcpu, cr8)) | ||
677 | kvm_inject_gp(vcpu, 0); | ||
678 | } | ||
679 | EXPORT_SYMBOL_GPL(kvm_set_cr8); | 686 | EXPORT_SYMBOL_GPL(kvm_set_cr8); |
680 | 687 | ||
681 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) | 688 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) |
@@ -780,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); | |||
780 | * kvm-specific. Those are put in the beginning of the list. | 787 | * kvm-specific. Those are put in the beginning of the list. |
781 | */ | 788 | */ |
782 | 789 | ||
783 | #define KVM_SAVE_MSRS_BEGIN 7 | 790 | #define KVM_SAVE_MSRS_BEGIN 8 |
784 | static u32 msrs_to_save[] = { | 791 | static u32 msrs_to_save[] = { |
785 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 792 | MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
786 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, | 793 | MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
787 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, | 794 | HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
788 | HV_X64_MSR_APIC_ASSIST_PAGE, | 795 | HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, |
789 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | 796 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
790 | MSR_STAR, | 797 | MSR_STAR, |
791 | #ifdef CONFIG_X86_64 | 798 | #ifdef CONFIG_X86_64 |
@@ -835,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
835 | kvm_x86_ops->set_efer(vcpu, efer); | 842 | kvm_x86_ops->set_efer(vcpu, efer); |
836 | 843 | ||
837 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; | 844 | vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; |
838 | kvm_mmu_reset_context(vcpu); | ||
839 | 845 | ||
840 | /* Update reserved bits */ | 846 | /* Update reserved bits */ |
841 | if ((efer ^ old_efer) & EFER_NX) | 847 | if ((efer ^ old_efer) & EFER_NX) |
@@ -981,7 +987,7 @@ static inline u64 nsec_to_cycles(u64 nsec) | |||
981 | if (kvm_tsc_changes_freq()) | 987 | if (kvm_tsc_changes_freq()) |
982 | printk_once(KERN_WARNING | 988 | printk_once(KERN_WARNING |
983 | "kvm: unreliable cycle conversion on adjustable rate TSC\n"); | 989 | "kvm: unreliable cycle conversion on adjustable rate TSC\n"); |
984 | ret = nsec * __get_cpu_var(cpu_tsc_khz); | 990 | ret = nsec * __this_cpu_read(cpu_tsc_khz); |
985 | do_div(ret, USEC_PER_SEC); | 991 | do_div(ret, USEC_PER_SEC); |
986 | return ret; | 992 | return ret; |
987 | } | 993 | } |
@@ -1066,7 +1072,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) | |||
1066 | local_irq_save(flags); | 1072 | local_irq_save(flags); |
1067 | kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); | 1073 | kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); |
1068 | kernel_ns = get_kernel_ns(); | 1074 | kernel_ns = get_kernel_ns(); |
1069 | this_tsc_khz = __get_cpu_var(cpu_tsc_khz); | 1075 | this_tsc_khz = __this_cpu_read(cpu_tsc_khz); |
1070 | 1076 | ||
1071 | if (unlikely(this_tsc_khz == 0)) { | 1077 | if (unlikely(this_tsc_khz == 0)) { |
1072 | local_irq_restore(flags); | 1078 | local_irq_restore(flags); |
@@ -1423,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1423 | return 0; | 1429 | return 0; |
1424 | } | 1430 | } |
1425 | 1431 | ||
1432 | static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) | ||
1433 | { | ||
1434 | gpa_t gpa = data & ~0x3f; | ||
1435 | |||
1436 | /* Bits 2:5 are reserved, should be zero */ | ||
1437 | if (data & 0x3c) | ||
1438 | return 1; | ||
1439 | |||
1440 | vcpu->arch.apf.msr_val = data; | ||
1441 | |||
1442 | if (!(data & KVM_ASYNC_PF_ENABLED)) { | ||
1443 | kvm_clear_async_pf_completion_queue(vcpu); | ||
1444 | kvm_async_pf_hash_reset(vcpu); | ||
1445 | return 0; | ||
1446 | } | ||
1447 | |||
1448 | if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) | ||
1449 | return 1; | ||
1450 | |||
1451 | vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); | ||
1452 | kvm_async_pf_wakeup_all(vcpu); | ||
1453 | return 0; | ||
1454 | } | ||
1455 | |||
1426 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 1456 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
1427 | { | 1457 | { |
1428 | switch (msr) { | 1458 | switch (msr) { |
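kvm_pv_enable_async_pf(), added above, treats the MSR_KVM_ASYNC_PF_EN value as a packed word: bit 0 (KVM_ASYNC_PF_ENABLED) turns the feature on, bit 1 (KVM_ASYNC_PF_SEND_ALWAYS) controls send_user_only, bits 2:5 are reserved and must be zero, and the remaining bits carry the guest physical address of the 64-byte-aligned shared area. A small userspace sketch of that decoding (macro and function names are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define ASYNC_PF_ENABLED        (1ULL << 0)     /* illustrative stand-ins for */
#define ASYNC_PF_SEND_ALWAYS    (1ULL << 1)     /* the KVM_ASYNC_PF_* flags   */

static int decode_async_pf_msr(uint64_t data, uint64_t *gpa,
                               int *enabled, int *send_user_only)
{
        if (data & 0x3c)                /* reserved bits 2:5 set -> reject */
                return -1;
        *gpa = data & ~0x3fULL;         /* 64-byte-aligned shared area */
        *enabled = !!(data & ASYNC_PF_ENABLED);
        *send_user_only = !(data & ASYNC_PF_SEND_ALWAYS);
        return 0;
}

int main(void)
{
        uint64_t gpa;
        int enabled, user_only;

        if (!decode_async_pf_msr(0x12340000ULL | ASYNC_PF_ENABLED,
                                 &gpa, &enabled, &user_only))
                printf("gpa=%#llx enabled=%d user_only=%d\n",
                       (unsigned long long)gpa, enabled, user_only);
        return 0;
}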
@@ -1504,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
1504 | } | 1534 | } |
1505 | break; | 1535 | break; |
1506 | } | 1536 | } |
1537 | case MSR_KVM_ASYNC_PF_EN: | ||
1538 | if (kvm_pv_enable_async_pf(vcpu, data)) | ||
1539 | return 1; | ||
1540 | break; | ||
1507 | case MSR_IA32_MCG_CTL: | 1541 | case MSR_IA32_MCG_CTL: |
1508 | case MSR_IA32_MCG_STATUS: | 1542 | case MSR_IA32_MCG_STATUS: |
1509 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | 1543 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: |
@@ -1780,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
1780 | case MSR_KVM_SYSTEM_TIME_NEW: | 1814 | case MSR_KVM_SYSTEM_TIME_NEW: |
1781 | data = vcpu->arch.time; | 1815 | data = vcpu->arch.time; |
1782 | break; | 1816 | break; |
1817 | case MSR_KVM_ASYNC_PF_EN: | ||
1818 | data = vcpu->arch.apf.msr_val; | ||
1819 | break; | ||
1783 | case MSR_IA32_P5_MC_ADDR: | 1820 | case MSR_IA32_P5_MC_ADDR: |
1784 | case MSR_IA32_P5_MC_TYPE: | 1821 | case MSR_IA32_P5_MC_TYPE: |
1785 | case MSR_IA32_MCG_CAP: | 1822 | case MSR_IA32_MCG_CAP: |
@@ -1909,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1909 | case KVM_CAP_NOP_IO_DELAY: | 1946 | case KVM_CAP_NOP_IO_DELAY: |
1910 | case KVM_CAP_MP_STATE: | 1947 | case KVM_CAP_MP_STATE: |
1911 | case KVM_CAP_SYNC_MMU: | 1948 | case KVM_CAP_SYNC_MMU: |
1949 | case KVM_CAP_USER_NMI: | ||
1912 | case KVM_CAP_REINJECT_CONTROL: | 1950 | case KVM_CAP_REINJECT_CONTROL: |
1913 | case KVM_CAP_IRQ_INJECT_STATUS: | 1951 | case KVM_CAP_IRQ_INJECT_STATUS: |
1914 | case KVM_CAP_ASSIGN_DEV_IRQ: | 1952 | case KVM_CAP_ASSIGN_DEV_IRQ: |
@@ -1927,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1927 | case KVM_CAP_DEBUGREGS: | 1965 | case KVM_CAP_DEBUGREGS: |
1928 | case KVM_CAP_X86_ROBUST_SINGLESTEP: | 1966 | case KVM_CAP_X86_ROBUST_SINGLESTEP: |
1929 | case KVM_CAP_XSAVE: | 1967 | case KVM_CAP_XSAVE: |
1968 | case KVM_CAP_ASYNC_PF: | ||
1930 | r = 1; | 1969 | r = 1; |
1931 | break; | 1970 | break; |
1932 | case KVM_CAP_COALESCED_MMIO: | 1971 | case KVM_CAP_COALESCED_MMIO: |
@@ -2190,6 +2229,11 @@ out: | |||
2190 | return r; | 2229 | return r; |
2191 | } | 2230 | } |
2192 | 2231 | ||
2232 | static void cpuid_mask(u32 *word, int wordnum) | ||
2233 | { | ||
2234 | *word &= boot_cpu_data.x86_capability[wordnum]; | ||
2235 | } | ||
2236 | |||
2193 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | 2237 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
2194 | u32 index) | 2238 | u32 index) |
2195 | { | 2239 | { |
@@ -2264,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2264 | break; | 2308 | break; |
2265 | case 1: | 2309 | case 1: |
2266 | entry->edx &= kvm_supported_word0_x86_features; | 2310 | entry->edx &= kvm_supported_word0_x86_features; |
2311 | cpuid_mask(&entry->edx, 0); | ||
2267 | entry->ecx &= kvm_supported_word4_x86_features; | 2312 | entry->ecx &= kvm_supported_word4_x86_features; |
2313 | cpuid_mask(&entry->ecx, 4); | ||
2268 | /* we support x2apic emulation even if host does not support | 2314 | /* we support x2apic emulation even if host does not support |
2269 | * it since we emulate x2apic in software */ | 2315 | * it since we emulate x2apic in software */ |
2270 | entry->ecx |= F(X2APIC); | 2316 | entry->ecx |= F(X2APIC); |
@@ -2355,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
2355 | break; | 2401 | break; |
2356 | case 0x80000001: | 2402 | case 0x80000001: |
2357 | entry->edx &= kvm_supported_word1_x86_features; | 2403 | entry->edx &= kvm_supported_word1_x86_features; |
2404 | cpuid_mask(&entry->edx, 1); | ||
2358 | entry->ecx &= kvm_supported_word6_x86_features; | 2405 | entry->ecx &= kvm_supported_word6_x86_features; |
2406 | cpuid_mask(&entry->ecx, 6); | ||
2359 | break; | 2407 | break; |
2360 | } | 2408 | } |
2361 | 2409 | ||
@@ -3174,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
3174 | struct kvm_memslots *slots, *old_slots; | 3222 | struct kvm_memslots *slots, *old_slots; |
3175 | unsigned long *dirty_bitmap; | 3223 | unsigned long *dirty_bitmap; |
3176 | 3224 | ||
3177 | r = -ENOMEM; | 3225 | dirty_bitmap = memslot->dirty_bitmap_head; |
3178 | dirty_bitmap = vmalloc(n); | 3226 | if (memslot->dirty_bitmap == dirty_bitmap) |
3179 | if (!dirty_bitmap) | 3227 | dirty_bitmap += n / sizeof(long); |
3180 | goto out; | ||
3181 | memset(dirty_bitmap, 0, n); | 3228 | memset(dirty_bitmap, 0, n); |
3182 | 3229 | ||
3183 | r = -ENOMEM; | 3230 | r = -ENOMEM; |
3184 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); | 3231 | slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); |
3185 | if (!slots) { | 3232 | if (!slots) |
3186 | vfree(dirty_bitmap); | ||
3187 | goto out; | 3233 | goto out; |
3188 | } | ||
3189 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); | 3234 | memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); |
3190 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; | 3235 | slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; |
3236 | slots->generation++; | ||
3191 | 3237 | ||
3192 | old_slots = kvm->memslots; | 3238 | old_slots = kvm->memslots; |
3193 | rcu_assign_pointer(kvm->memslots, slots); | 3239 | rcu_assign_pointer(kvm->memslots, slots); |
@@ -3200,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
3200 | spin_unlock(&kvm->mmu_lock); | 3246 | spin_unlock(&kvm->mmu_lock); |
3201 | 3247 | ||
3202 | r = -EFAULT; | 3248 | r = -EFAULT; |
3203 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { | 3249 | if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) |
3204 | vfree(dirty_bitmap); | ||
3205 | goto out; | 3250 | goto out; |
3206 | } | ||
3207 | vfree(dirty_bitmap); | ||
3208 | } else { | 3251 | } else { |
3209 | r = -EFAULT; | 3252 | r = -EFAULT; |
3210 | if (clear_user(log->dirty_bitmap, n)) | 3253 | if (clear_user(log->dirty_bitmap, n)) |
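The KVM_GET_DIRTY_LOG rework above drops the per-call vmalloc(): dirty_bitmap_head now points at a buffer holding two copies of the bitmap, the code picks whichever half is not currently installed in the memslot, zeroes it, and publishes it through a fresh memslots copy, leaving the previously live half free to be reported to userspace. A userspace sketch of that double-buffer flip (names are illustrative; the kernel publishes the new half via an RCU-protected memslots swap, not a plain pointer store):

#include <stdio.h>
#include <string.h>

/* head:  start of the double-sized buffer (dirty_bitmap_head)
 * cur:   currently published bitmap (dirty_bitmap)
 * bytes: size of one bitmap
 * Returns the half that was live before the flip, i.e. the bits to report. */
static unsigned long *flip_dirty_bitmap(unsigned long *head,
                                        unsigned long **cur, size_t bytes)
{
        unsigned long *old = *cur;
        unsigned long *fresh = head;

        if (old == head)                /* first half is live, reuse the second */
                fresh = head + bytes / sizeof(unsigned long);
        memset(fresh, 0, bytes);        /* fresh, empty bitmap for new dirty bits */
        *cur = fresh;                   /* publish the flip */
        return old;
}

int main(void)
{
        unsigned long buf[8] = { 0 };   /* room for two 4-word bitmaps */
        unsigned long *cur = buf;
        size_t n = 4 * sizeof(unsigned long);
        unsigned long *report;

        buf[0] = 0x5;                   /* pretend two pages were dirtied */
        report = flip_dirty_bitmap(buf, &cur, n);
        printf("report[0]=%#lx, now logging into the %s half\n",
               report[0], cur == buf ? "first" : "second");
        return 0;
}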
@@ -3271,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3271 | if (vpic) { | 3314 | if (vpic) { |
3272 | r = kvm_ioapic_init(kvm); | 3315 | r = kvm_ioapic_init(kvm); |
3273 | if (r) { | 3316 | if (r) { |
3317 | mutex_lock(&kvm->slots_lock); | ||
3274 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, | 3318 | kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, |
3275 | &vpic->dev); | 3319 | &vpic->dev); |
3320 | mutex_unlock(&kvm->slots_lock); | ||
3276 | kfree(vpic); | 3321 | kfree(vpic); |
3277 | goto create_irqchip_unlock; | 3322 | goto create_irqchip_unlock; |
3278 | } | 3323 | } |
@@ -3283,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
3283 | smp_wmb(); | 3328 | smp_wmb(); |
3284 | r = kvm_setup_default_irq_routing(kvm); | 3329 | r = kvm_setup_default_irq_routing(kvm); |
3285 | if (r) { | 3330 | if (r) { |
3331 | mutex_lock(&kvm->slots_lock); | ||
3286 | mutex_lock(&kvm->irq_lock); | 3332 | mutex_lock(&kvm->irq_lock); |
3287 | kvm_ioapic_destroy(kvm); | 3333 | kvm_ioapic_destroy(kvm); |
3288 | kvm_destroy_pic(kvm); | 3334 | kvm_destroy_pic(kvm); |
3289 | mutex_unlock(&kvm->irq_lock); | 3335 | mutex_unlock(&kvm->irq_lock); |
3336 | mutex_unlock(&kvm->slots_lock); | ||
3290 | } | 3337 | } |
3291 | create_irqchip_unlock: | 3338 | create_irqchip_unlock: |
3292 | mutex_unlock(&kvm->lock); | 3339 | mutex_unlock(&kvm->lock); |
@@ -3562,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | |||
3562 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) | 3609 | static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) |
3563 | { | 3610 | { |
3564 | gpa_t t_gpa; | 3611 | gpa_t t_gpa; |
3565 | u32 error; | 3612 | struct x86_exception exception; |
3566 | 3613 | ||
3567 | BUG_ON(!mmu_is_nested(vcpu)); | 3614 | BUG_ON(!mmu_is_nested(vcpu)); |
3568 | 3615 | ||
3569 | /* NPT walks are always user-walks */ | 3616 | /* NPT walks are always user-walks */ |
3570 | access |= PFERR_USER_MASK; | 3617 | access |= PFERR_USER_MASK; |
3571 | t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); | 3618 | t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); |
3572 | if (t_gpa == UNMAPPED_GVA) | ||
3573 | vcpu->arch.fault.nested = true; | ||
3574 | 3619 | ||
3575 | return t_gpa; | 3620 | return t_gpa; |
3576 | } | 3621 | } |
3577 | 3622 | ||
3578 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3623 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, |
3624 | struct x86_exception *exception) | ||
3579 | { | 3625 | { |
3580 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3626 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3581 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); | 3627 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3582 | } | 3628 | } |
3583 | 3629 | ||
3584 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3630 | gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, |
3631 | struct x86_exception *exception) | ||
3585 | { | 3632 | { |
3586 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3633 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3587 | access |= PFERR_FETCH_MASK; | 3634 | access |= PFERR_FETCH_MASK; |
3588 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); | 3635 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3589 | } | 3636 | } |
3590 | 3637 | ||
3591 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3638 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, |
3639 | struct x86_exception *exception) | ||
3592 | { | 3640 | { |
3593 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3641 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3594 | access |= PFERR_WRITE_MASK; | 3642 | access |= PFERR_WRITE_MASK; |
3595 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); | 3643 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); |
3596 | } | 3644 | } |
3597 | 3645 | ||
3598 | /* uses this to access any guest's mapped memory without checking CPL */ | 3646 | /* uses this to access any guest's mapped memory without checking CPL */ |
3599 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) | 3647 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, |
3648 | struct x86_exception *exception) | ||
3600 | { | 3649 | { |
3601 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); | 3650 | return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); |
3602 | } | 3651 | } |
3603 | 3652 | ||
3604 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, | 3653 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, |
3605 | struct kvm_vcpu *vcpu, u32 access, | 3654 | struct kvm_vcpu *vcpu, u32 access, |
3606 | u32 *error) | 3655 | struct x86_exception *exception) |
3607 | { | 3656 | { |
3608 | void *data = val; | 3657 | void *data = val; |
3609 | int r = X86EMUL_CONTINUE; | 3658 | int r = X86EMUL_CONTINUE; |
3610 | 3659 | ||
3611 | while (bytes) { | 3660 | while (bytes) { |
3612 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, | 3661 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, |
3613 | error); | 3662 | exception); |
3614 | unsigned offset = addr & (PAGE_SIZE-1); | 3663 | unsigned offset = addr & (PAGE_SIZE-1); |
3615 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); | 3664 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); |
3616 | int ret; | 3665 | int ret; |
3617 | 3666 | ||
3618 | if (gpa == UNMAPPED_GVA) { | 3667 | if (gpa == UNMAPPED_GVA) |
3619 | r = X86EMUL_PROPAGATE_FAULT; | 3668 | return X86EMUL_PROPAGATE_FAULT; |
3620 | goto out; | ||
3621 | } | ||
3622 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); | 3669 | ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); |
3623 | if (ret < 0) { | 3670 | if (ret < 0) { |
3624 | r = X86EMUL_IO_NEEDED; | 3671 | r = X86EMUL_IO_NEEDED; |
@@ -3635,31 +3682,35 @@ out: | |||
3635 | 3682 | ||
3636 | /* used for instruction fetching */ | 3683 | /* used for instruction fetching */ |
3637 | static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, | 3684 | static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, |
3638 | struct kvm_vcpu *vcpu, u32 *error) | 3685 | struct kvm_vcpu *vcpu, |
3686 | struct x86_exception *exception) | ||
3639 | { | 3687 | { |
3640 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3688 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3641 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, | 3689 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, |
3642 | access | PFERR_FETCH_MASK, error); | 3690 | access | PFERR_FETCH_MASK, |
3691 | exception); | ||
3643 | } | 3692 | } |
3644 | 3693 | ||
3645 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, | 3694 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, |
3646 | struct kvm_vcpu *vcpu, u32 *error) | 3695 | struct kvm_vcpu *vcpu, |
3696 | struct x86_exception *exception) | ||
3647 | { | 3697 | { |
3648 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; | 3698 | u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; |
3649 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, | 3699 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, |
3650 | error); | 3700 | exception); |
3651 | } | 3701 | } |
3652 | 3702 | ||
3653 | static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, | 3703 | static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, |
3654 | struct kvm_vcpu *vcpu, u32 *error) | 3704 | struct kvm_vcpu *vcpu, |
3705 | struct x86_exception *exception) | ||
3655 | { | 3706 | { |
3656 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); | 3707 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); |
3657 | } | 3708 | } |
3658 | 3709 | ||
3659 | static int kvm_write_guest_virt_system(gva_t addr, void *val, | 3710 | static int kvm_write_guest_virt_system(gva_t addr, void *val, |
3660 | unsigned int bytes, | 3711 | unsigned int bytes, |
3661 | struct kvm_vcpu *vcpu, | 3712 | struct kvm_vcpu *vcpu, |
3662 | u32 *error) | 3713 | struct x86_exception *exception) |
3663 | { | 3714 | { |
3664 | void *data = val; | 3715 | void *data = val; |
3665 | int r = X86EMUL_CONTINUE; | 3716 | int r = X86EMUL_CONTINUE; |
@@ -3667,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, | |||
3667 | while (bytes) { | 3718 | while (bytes) { |
3668 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, | 3719 | gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, |
3669 | PFERR_WRITE_MASK, | 3720 | PFERR_WRITE_MASK, |
3670 | error); | 3721 | exception); |
3671 | unsigned offset = addr & (PAGE_SIZE-1); | 3722 | unsigned offset = addr & (PAGE_SIZE-1); |
3672 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); | 3723 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); |
3673 | int ret; | 3724 | int ret; |
3674 | 3725 | ||
3675 | if (gpa == UNMAPPED_GVA) { | 3726 | if (gpa == UNMAPPED_GVA) |
3676 | r = X86EMUL_PROPAGATE_FAULT; | 3727 | return X86EMUL_PROPAGATE_FAULT; |
3677 | goto out; | ||
3678 | } | ||
3679 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); | 3728 | ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); |
3680 | if (ret < 0) { | 3729 | if (ret < 0) { |
3681 | r = X86EMUL_IO_NEEDED; | 3730 | r = X86EMUL_IO_NEEDED; |
@@ -3693,7 +3742,7 @@ out: | |||
3693 | static int emulator_read_emulated(unsigned long addr, | 3742 | static int emulator_read_emulated(unsigned long addr, |
3694 | void *val, | 3743 | void *val, |
3695 | unsigned int bytes, | 3744 | unsigned int bytes, |
3696 | unsigned int *error_code, | 3745 | struct x86_exception *exception, |
3697 | struct kvm_vcpu *vcpu) | 3746 | struct kvm_vcpu *vcpu) |
3698 | { | 3747 | { |
3699 | gpa_t gpa; | 3748 | gpa_t gpa; |
@@ -3706,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr, | |||
3706 | return X86EMUL_CONTINUE; | 3755 | return X86EMUL_CONTINUE; |
3707 | } | 3756 | } |
3708 | 3757 | ||
3709 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); | 3758 | gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); |
3710 | 3759 | ||
3711 | if (gpa == UNMAPPED_GVA) | 3760 | if (gpa == UNMAPPED_GVA) |
3712 | return X86EMUL_PROPAGATE_FAULT; | 3761 | return X86EMUL_PROPAGATE_FAULT; |
@@ -3715,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr, | |||
3715 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 3764 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
3716 | goto mmio; | 3765 | goto mmio; |
3717 | 3766 | ||
3718 | if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) | 3767 | if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) |
3719 | == X86EMUL_CONTINUE) | 3768 | == X86EMUL_CONTINUE) |
3720 | return X86EMUL_CONTINUE; | 3769 | return X86EMUL_CONTINUE; |
3721 | 3770 | ||
3722 | mmio: | 3771 | mmio: |
@@ -3740,7 +3789,7 @@ mmio: | |||
3740 | } | 3789 | } |
3741 | 3790 | ||
3742 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 3791 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
3743 | const void *val, int bytes) | 3792 | const void *val, int bytes) |
3744 | { | 3793 | { |
3745 | int ret; | 3794 | int ret; |
3746 | 3795 | ||
@@ -3754,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
3754 | static int emulator_write_emulated_onepage(unsigned long addr, | 3803 | static int emulator_write_emulated_onepage(unsigned long addr, |
3755 | const void *val, | 3804 | const void *val, |
3756 | unsigned int bytes, | 3805 | unsigned int bytes, |
3757 | unsigned int *error_code, | 3806 | struct x86_exception *exception, |
3758 | struct kvm_vcpu *vcpu) | 3807 | struct kvm_vcpu *vcpu) |
3759 | { | 3808 | { |
3760 | gpa_t gpa; | 3809 | gpa_t gpa; |
3761 | 3810 | ||
3762 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); | 3811 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); |
3763 | 3812 | ||
3764 | if (gpa == UNMAPPED_GVA) | 3813 | if (gpa == UNMAPPED_GVA) |
3765 | return X86EMUL_PROPAGATE_FAULT; | 3814 | return X86EMUL_PROPAGATE_FAULT; |
@@ -3792,7 +3841,7 @@ mmio: | |||
3792 | int emulator_write_emulated(unsigned long addr, | 3841 | int emulator_write_emulated(unsigned long addr, |
3793 | const void *val, | 3842 | const void *val, |
3794 | unsigned int bytes, | 3843 | unsigned int bytes, |
3795 | unsigned int *error_code, | 3844 | struct x86_exception *exception, |
3796 | struct kvm_vcpu *vcpu) | 3845 | struct kvm_vcpu *vcpu) |
3797 | { | 3846 | { |
3798 | /* Crossing a page boundary? */ | 3847 | /* Crossing a page boundary? */ |
@@ -3800,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr, | |||
3800 | int rc, now; | 3849 | int rc, now; |
3801 | 3850 | ||
3802 | now = -addr & ~PAGE_MASK; | 3851 | now = -addr & ~PAGE_MASK; |
3803 | rc = emulator_write_emulated_onepage(addr, val, now, error_code, | 3852 | rc = emulator_write_emulated_onepage(addr, val, now, exception, |
3804 | vcpu); | 3853 | vcpu); |
3805 | if (rc != X86EMUL_CONTINUE) | 3854 | if (rc != X86EMUL_CONTINUE) |
3806 | return rc; | 3855 | return rc; |
@@ -3808,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr, | |||
3808 | val += now; | 3857 | val += now; |
3809 | bytes -= now; | 3858 | bytes -= now; |
3810 | } | 3859 | } |
3811 | return emulator_write_emulated_onepage(addr, val, bytes, error_code, | 3860 | return emulator_write_emulated_onepage(addr, val, bytes, exception, |
3812 | vcpu); | 3861 | vcpu); |
3813 | } | 3862 | } |
3814 | 3863 | ||
@@ -3826,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3826 | const void *old, | 3875 | const void *old, |
3827 | const void *new, | 3876 | const void *new, |
3828 | unsigned int bytes, | 3877 | unsigned int bytes, |
3829 | unsigned int *error_code, | 3878 | struct x86_exception *exception, |
3830 | struct kvm_vcpu *vcpu) | 3879 | struct kvm_vcpu *vcpu) |
3831 | { | 3880 | { |
3832 | gpa_t gpa; | 3881 | gpa_t gpa; |
@@ -3884,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
3884 | emul_write: | 3933 | emul_write: |
3885 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); | 3934 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); |
3886 | 3935 | ||
3887 | return emulator_write_emulated(addr, new, bytes, error_code, vcpu); | 3936 | return emulator_write_emulated(addr, new, bytes, exception, vcpu); |
3888 | } | 3937 | } |
3889 | 3938 | ||
3890 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) | 3939 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) |
@@ -3909,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, | |||
3909 | if (vcpu->arch.pio.count) | 3958 | if (vcpu->arch.pio.count) |
3910 | goto data_avail; | 3959 | goto data_avail; |
3911 | 3960 | ||
3912 | trace_kvm_pio(0, port, size, 1); | 3961 | trace_kvm_pio(0, port, size, count); |
3913 | 3962 | ||
3914 | vcpu->arch.pio.port = port; | 3963 | vcpu->arch.pio.port = port; |
3915 | vcpu->arch.pio.in = 1; | 3964 | vcpu->arch.pio.in = 1; |
@@ -3937,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, | |||
3937 | const void *val, unsigned int count, | 3986 | const void *val, unsigned int count, |
3938 | struct kvm_vcpu *vcpu) | 3987 | struct kvm_vcpu *vcpu) |
3939 | { | 3988 | { |
3940 | trace_kvm_pio(1, port, size, 1); | 3989 | trace_kvm_pio(1, port, size, count); |
3941 | 3990 | ||
3942 | vcpu->arch.pio.port = port; | 3991 | vcpu->arch.pio.port = port; |
3943 | vcpu->arch.pio.in = 0; | 3992 | vcpu->arch.pio.in = 0; |
@@ -3978,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) | |||
3978 | return X86EMUL_CONTINUE; | 4027 | return X86EMUL_CONTINUE; |
3979 | 4028 | ||
3980 | if (kvm_x86_ops->has_wbinvd_exit()) { | 4029 | if (kvm_x86_ops->has_wbinvd_exit()) { |
3981 | preempt_disable(); | 4030 | int cpu = get_cpu(); |
4031 | |||
4032 | cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); | ||
3982 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, | 4033 | smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, |
3983 | wbinvd_ipi, NULL, 1); | 4034 | wbinvd_ipi, NULL, 1); |
3984 | preempt_enable(); | 4035 | put_cpu(); |
3985 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); | 4036 | cpumask_clear(vcpu->arch.wbinvd_dirty_mask); |
3986 | } | 4037 | } else |
3987 | wbinvd(); | 4038 | wbinvd(); |
3988 | return X86EMUL_CONTINUE; | 4039 | return X86EMUL_CONTINUE; |
3989 | } | 4040 | } |
3990 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); | 4041 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); |
@@ -4024,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) | |||
4024 | value = vcpu->arch.cr2; | 4075 | value = vcpu->arch.cr2; |
4025 | break; | 4076 | break; |
4026 | case 3: | 4077 | case 3: |
4027 | value = vcpu->arch.cr3; | 4078 | value = kvm_read_cr3(vcpu); |
4028 | break; | 4079 | break; |
4029 | case 4: | 4080 | case 4: |
4030 | value = kvm_read_cr4(vcpu); | 4081 | value = kvm_read_cr4(vcpu); |
@@ -4058,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) | |||
4058 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); | 4109 | res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); |
4059 | break; | 4110 | break; |
4060 | case 8: | 4111 | case 8: |
4061 | res = __kvm_set_cr8(vcpu, val & 0xfUL); | 4112 | res = kvm_set_cr8(vcpu, val); |
4062 | break; | 4113 | break; |
4063 | default: | 4114 | default: |
4064 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 4115 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
@@ -4211,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) | |||
4211 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) | 4262 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) |
4212 | { | 4263 | { |
4213 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; | 4264 | struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; |
4214 | if (ctxt->exception == PF_VECTOR) | 4265 | if (ctxt->exception.vector == PF_VECTOR) |
4215 | kvm_propagate_fault(vcpu); | 4266 | kvm_propagate_fault(vcpu, &ctxt->exception); |
4216 | else if (ctxt->error_code_valid) | 4267 | else if (ctxt->exception.error_code_valid) |
4217 | kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); | 4268 | kvm_queue_exception_e(vcpu, ctxt->exception.vector, |
4269 | ctxt->exception.error_code); | ||
4218 | else | 4270 | else |
4219 | kvm_queue_exception(vcpu, ctxt->exception); | 4271 | kvm_queue_exception(vcpu, ctxt->exception.vector); |
4220 | } | 4272 | } |
4221 | 4273 | ||
4222 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) | 4274 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) |
@@ -4272,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); | |||
4272 | 4324 | ||
4273 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) | 4325 | static int handle_emulation_failure(struct kvm_vcpu *vcpu) |
4274 | { | 4326 | { |
4327 | int r = EMULATE_DONE; | ||
4328 | |||
4275 | ++vcpu->stat.insn_emulation_fail; | 4329 | ++vcpu->stat.insn_emulation_fail; |
4276 | trace_kvm_emulate_insn_failed(vcpu); | 4330 | trace_kvm_emulate_insn_failed(vcpu); |
4277 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; | 4331 | if (!is_guest_mode(vcpu)) { |
4278 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; | 4332 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
4279 | vcpu->run->internal.ndata = 0; | 4333 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
4334 | vcpu->run->internal.ndata = 0; | ||
4335 | r = EMULATE_FAIL; | ||
4336 | } | ||
4280 | kvm_queue_exception(vcpu, UD_VECTOR); | 4337 | kvm_queue_exception(vcpu, UD_VECTOR); |
4281 | return EMULATE_FAIL; | 4338 | |
4339 | return r; | ||
4282 | } | 4340 | } |
4283 | 4341 | ||
4284 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | 4342 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) |
@@ -4307,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) | |||
4307 | return false; | 4365 | return false; |
4308 | } | 4366 | } |
4309 | 4367 | ||
4310 | int emulate_instruction(struct kvm_vcpu *vcpu, | 4368 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, |
4311 | unsigned long cr2, | 4369 | unsigned long cr2, |
4312 | u16 error_code, | 4370 | int emulation_type, |
4313 | int emulation_type) | 4371 | void *insn, |
4372 | int insn_len) | ||
4314 | { | 4373 | { |
4315 | int r; | 4374 | int r; |
4316 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; | 4375 | struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; |
@@ -4328,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
4328 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { | 4387 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
4329 | init_emulate_ctxt(vcpu); | 4388 | init_emulate_ctxt(vcpu); |
4330 | vcpu->arch.emulate_ctxt.interruptibility = 0; | 4389 | vcpu->arch.emulate_ctxt.interruptibility = 0; |
4331 | vcpu->arch.emulate_ctxt.exception = -1; | 4390 | vcpu->arch.emulate_ctxt.have_exception = false; |
4332 | vcpu->arch.emulate_ctxt.perm_ok = false; | 4391 | vcpu->arch.emulate_ctxt.perm_ok = false; |
4333 | 4392 | ||
4334 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt); | 4393 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); |
4335 | if (r == X86EMUL_PROPAGATE_FAULT) | 4394 | if (r == X86EMUL_PROPAGATE_FAULT) |
4336 | goto done; | 4395 | goto done; |
4337 | 4396 | ||
@@ -4394,7 +4453,7 @@ restart: | |||
4394 | } | 4453 | } |
4395 | 4454 | ||
4396 | done: | 4455 | done: |
4397 | if (vcpu->arch.emulate_ctxt.exception >= 0) { | 4456 | if (vcpu->arch.emulate_ctxt.have_exception) { |
4398 | inject_emulated_exception(vcpu); | 4457 | inject_emulated_exception(vcpu); |
4399 | r = EMULATE_DONE; | 4458 | r = EMULATE_DONE; |
4400 | } else if (vcpu->arch.pio.count) { | 4459 | } else if (vcpu->arch.pio.count) { |
@@ -4418,7 +4477,7 @@ done: | |||
4418 | 4477 | ||
4419 | return r; | 4478 | return r; |
4420 | } | 4479 | } |
4421 | EXPORT_SYMBOL_GPL(emulate_instruction); | 4480 | EXPORT_SYMBOL_GPL(x86_emulate_instruction); |
4422 | 4481 | ||
4423 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) | 4482 | int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) |
4424 | { | 4483 | { |
@@ -4432,7 +4491,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out); | |||
4432 | 4491 | ||
4433 | static void tsc_bad(void *info) | 4492 | static void tsc_bad(void *info) |
4434 | { | 4493 | { |
4435 | __get_cpu_var(cpu_tsc_khz) = 0; | 4494 | __this_cpu_write(cpu_tsc_khz, 0); |
4436 | } | 4495 | } |
4437 | 4496 | ||
4438 | static void tsc_khz_changed(void *data) | 4497 | static void tsc_khz_changed(void *data) |
@@ -4446,7 +4505,7 @@ static void tsc_khz_changed(void *data) | |||
4446 | khz = cpufreq_quick_get(raw_smp_processor_id()); | 4505 | khz = cpufreq_quick_get(raw_smp_processor_id()); |
4447 | if (!khz) | 4506 | if (!khz) |
4448 | khz = tsc_khz; | 4507 | khz = tsc_khz; |
4449 | __get_cpu_var(cpu_tsc_khz) = khz; | 4508 | __this_cpu_write(cpu_tsc_khz, khz); |
4450 | } | 4509 | } |
4451 | 4510 | ||
4452 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | 4511 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, |
@@ -4569,9 +4628,11 @@ static void kvm_timer_init(void) | |||
4569 | #ifdef CONFIG_CPU_FREQ | 4628 | #ifdef CONFIG_CPU_FREQ |
4570 | struct cpufreq_policy policy; | 4629 | struct cpufreq_policy policy; |
4571 | memset(&policy, 0, sizeof(policy)); | 4630 | memset(&policy, 0, sizeof(policy)); |
4572 | cpufreq_get_policy(&policy, get_cpu()); | 4631 | cpu = get_cpu(); |
4632 | cpufreq_get_policy(&policy, cpu); | ||
4573 | if (policy.cpuinfo.max_freq) | 4633 | if (policy.cpuinfo.max_freq) |
4574 | max_tsc_khz = policy.cpuinfo.max_freq; | 4634 | max_tsc_khz = policy.cpuinfo.max_freq; |
4635 | put_cpu(); | ||
4575 | #endif | 4636 | #endif |
4576 | cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, | 4637 | cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, |
4577 | CPUFREQ_TRANSITION_NOTIFIER); | 4638 | CPUFREQ_TRANSITION_NOTIFIER); |
@@ -4656,7 +4717,6 @@ int kvm_arch_init(void *opaque) | |||
4656 | 4717 | ||
4657 | kvm_x86_ops = ops; | 4718 | kvm_x86_ops = ops; |
4658 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | 4719 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); |
4659 | kvm_mmu_set_base_ptes(PT_PRESENT_MASK); | ||
4660 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, | 4720 | kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |
4661 | PT_DIRTY_MASK, PT64_NX_MASK, 0); | 4721 | PT_DIRTY_MASK, PT64_NX_MASK, 0); |
4662 | 4722 | ||
@@ -5119,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5119 | vcpu->fpu_active = 0; | 5179 | vcpu->fpu_active = 0; |
5120 | kvm_x86_ops->fpu_deactivate(vcpu); | 5180 | kvm_x86_ops->fpu_deactivate(vcpu); |
5121 | } | 5181 | } |
5182 | if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { | ||
5183 | /* Page is swapped out. Do synthetic halt */ | ||
5184 | vcpu->arch.apf.halted = true; | ||
5185 | r = 1; | ||
5186 | goto out; | ||
5187 | } | ||
5122 | } | 5188 | } |
5123 | 5189 | ||
5124 | r = kvm_mmu_reload(vcpu); | 5190 | r = kvm_mmu_reload(vcpu); |
@@ -5247,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5247 | 5313 | ||
5248 | r = 1; | 5314 | r = 1; |
5249 | while (r > 0) { | 5315 | while (r > 0) { |
5250 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 5316 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
5317 | !vcpu->arch.apf.halted) | ||
5251 | r = vcpu_enter_guest(vcpu); | 5318 | r = vcpu_enter_guest(vcpu); |
5252 | else { | 5319 | else { |
5253 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); | 5320 | srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); |
@@ -5260,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5260 | vcpu->arch.mp_state = | 5327 | vcpu->arch.mp_state = |
5261 | KVM_MP_STATE_RUNNABLE; | 5328 | KVM_MP_STATE_RUNNABLE; |
5262 | case KVM_MP_STATE_RUNNABLE: | 5329 | case KVM_MP_STATE_RUNNABLE: |
5330 | vcpu->arch.apf.halted = false; | ||
5263 | break; | 5331 | break; |
5264 | case KVM_MP_STATE_SIPI_RECEIVED: | 5332 | case KVM_MP_STATE_SIPI_RECEIVED: |
5265 | default: | 5333 | default: |
@@ -5281,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) | |||
5281 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 5349 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
5282 | ++vcpu->stat.request_irq_exits; | 5350 | ++vcpu->stat.request_irq_exits; |
5283 | } | 5351 | } |
5352 | |||
5353 | kvm_check_async_pf_completion(vcpu); | ||
5354 | |||
5284 | if (signal_pending(current)) { | 5355 | if (signal_pending(current)) { |
5285 | r = -EINTR; | 5356 | r = -EINTR; |
5286 | vcpu->run->exit_reason = KVM_EXIT_INTR; | 5357 | vcpu->run->exit_reason = KVM_EXIT_INTR; |
@@ -5305,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5305 | int r; | 5376 | int r; |
5306 | sigset_t sigsaved; | 5377 | sigset_t sigsaved; |
5307 | 5378 | ||
5379 | if (!tsk_used_math(current) && init_fpu(current)) | ||
5380 | return -ENOMEM; | ||
5381 | |||
5308 | if (vcpu->sigset_active) | 5382 | if (vcpu->sigset_active) |
5309 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); | 5383 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); |
5310 | 5384 | ||
@@ -5316,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5316 | } | 5390 | } |
5317 | 5391 | ||
5318 | /* re-sync apic's tpr */ | 5392 | /* re-sync apic's tpr */ |
5319 | if (!irqchip_in_kernel(vcpu->kvm)) | 5393 | if (!irqchip_in_kernel(vcpu->kvm)) { |
5320 | kvm_set_cr8(vcpu, kvm_run->cr8); | 5394 | if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { |
5395 | r = -EINVAL; | ||
5396 | goto out; | ||
5397 | } | ||
5398 | } | ||
5321 | 5399 | ||
5322 | if (vcpu->arch.pio.count || vcpu->mmio_needed) { | 5400 | if (vcpu->arch.pio.count || vcpu->mmio_needed) { |
5323 | if (vcpu->mmio_needed) { | 5401 | if (vcpu->mmio_needed) { |
@@ -5326,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
5326 | vcpu->mmio_needed = 0; | 5404 | vcpu->mmio_needed = 0; |
5327 | } | 5405 | } |
5328 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | 5406 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); |
5329 | r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); | 5407 | r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); |
5330 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | 5408 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); |
5331 | if (r != EMULATE_DONE) { | 5409 | if (r != EMULATE_DONE) { |
5332 | r = 0; | 5410 | r = 0; |
@@ -5439,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
5439 | 5517 | ||
5440 | sregs->cr0 = kvm_read_cr0(vcpu); | 5518 | sregs->cr0 = kvm_read_cr0(vcpu); |
5441 | sregs->cr2 = vcpu->arch.cr2; | 5519 | sregs->cr2 = vcpu->arch.cr2; |
5442 | sregs->cr3 = vcpu->arch.cr3; | 5520 | sregs->cr3 = kvm_read_cr3(vcpu); |
5443 | sregs->cr4 = kvm_read_cr4(vcpu); | 5521 | sregs->cr4 = kvm_read_cr4(vcpu); |
5444 | sregs->cr8 = kvm_get_cr8(vcpu); | 5522 | sregs->cr8 = kvm_get_cr8(vcpu); |
5445 | sregs->efer = vcpu->arch.efer; | 5523 | sregs->efer = vcpu->arch.efer; |
@@ -5507,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5507 | kvm_x86_ops->set_gdt(vcpu, &dt); | 5585 | kvm_x86_ops->set_gdt(vcpu, &dt); |
5508 | 5586 | ||
5509 | vcpu->arch.cr2 = sregs->cr2; | 5587 | vcpu->arch.cr2 = sregs->cr2; |
5510 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 5588 | mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; |
5511 | vcpu->arch.cr3 = sregs->cr3; | 5589 | vcpu->arch.cr3 = sregs->cr3; |
5590 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | ||
5512 | 5591 | ||
5513 | kvm_set_cr8(vcpu, sregs->cr8); | 5592 | kvm_set_cr8(vcpu, sregs->cr8); |
5514 | 5593 | ||
@@ -5522,8 +5601,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
5522 | 5601 | ||
5523 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; | 5602 | mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; |
5524 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | 5603 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); |
5604 | if (sregs->cr4 & X86_CR4_OSXSAVE) | ||
5605 | update_cpuid(vcpu); | ||
5525 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { | 5606 | if (!is_long_mode(vcpu) && is_pae(vcpu)) { |
5526 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); | 5607 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); |
5527 | mmu_reset_needed = 1; | 5608 | mmu_reset_needed = 1; |
5528 | } | 5609 | } |
5529 | 5610 | ||
@@ -5774,6 +5855,8 @@ free_vcpu: | |||
5774 | 5855 | ||
5775 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | 5856 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
5776 | { | 5857 | { |
5858 | vcpu->arch.apf.msr_val = 0; | ||
5859 | |||
5777 | vcpu_load(vcpu); | 5860 | vcpu_load(vcpu); |
5778 | kvm_mmu_unload(vcpu); | 5861 | kvm_mmu_unload(vcpu); |
5779 | vcpu_put(vcpu); | 5862 | vcpu_put(vcpu); |
@@ -5793,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | |||
5793 | vcpu->arch.dr7 = DR7_FIXED_1; | 5876 | vcpu->arch.dr7 = DR7_FIXED_1; |
5794 | 5877 | ||
5795 | kvm_make_request(KVM_REQ_EVENT, vcpu); | 5878 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5879 | vcpu->arch.apf.msr_val = 0; | ||
5880 | |||
5881 | kvm_clear_async_pf_completion_queue(vcpu); | ||
5882 | kvm_async_pf_hash_reset(vcpu); | ||
5883 | vcpu->arch.apf.halted = false; | ||
5796 | 5884 | ||
5797 | return kvm_x86_ops->vcpu_reset(vcpu); | 5885 | return kvm_x86_ops->vcpu_reset(vcpu); |
5798 | } | 5886 | } |
@@ -5882,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
5882 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) | 5970 | if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) |
5883 | goto fail_free_mce_banks; | 5971 | goto fail_free_mce_banks; |
5884 | 5972 | ||
5973 | kvm_async_pf_hash_reset(vcpu); | ||
5974 | |||
5885 | return 0; | 5975 | return 0; |
5886 | fail_free_mce_banks: | 5976 | fail_free_mce_banks: |
5887 | kfree(vcpu->arch.mce_banks); | 5977 | kfree(vcpu->arch.mce_banks); |
@@ -5907,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | |||
5907 | free_page((unsigned long)vcpu->arch.pio_data); | 5997 | free_page((unsigned long)vcpu->arch.pio_data); |
5908 | } | 5998 | } |
5909 | 5999 | ||
5910 | struct kvm *kvm_arch_create_vm(void) | 6000 | int kvm_arch_init_vm(struct kvm *kvm) |
5911 | { | 6001 | { |
5912 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
5913 | |||
5914 | if (!kvm) | ||
5915 | return ERR_PTR(-ENOMEM); | ||
5916 | |||
5917 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 6002 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
5918 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 6003 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
5919 | 6004 | ||
@@ -5922,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
5922 | 6007 | ||
5923 | spin_lock_init(&kvm->arch.tsc_write_lock); | 6008 | spin_lock_init(&kvm->arch.tsc_write_lock); |
5924 | 6009 | ||
5925 | return kvm; | 6010 | return 0; |
5926 | } | 6011 | } |
5927 | 6012 | ||
5928 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | 6013 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
@@ -5940,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
5940 | /* | 6025 | /* |
5941 | * Unpin any mmu pages first. | 6026 | * Unpin any mmu pages first. |
5942 | */ | 6027 | */ |
5943 | kvm_for_each_vcpu(i, vcpu, kvm) | 6028 | kvm_for_each_vcpu(i, vcpu, kvm) { |
6029 | kvm_clear_async_pf_completion_queue(vcpu); | ||
5944 | kvm_unload_vcpu_mmu(vcpu); | 6030 | kvm_unload_vcpu_mmu(vcpu); |
6031 | } | ||
5945 | kvm_for_each_vcpu(i, vcpu, kvm) | 6032 | kvm_for_each_vcpu(i, vcpu, kvm) |
5946 | kvm_arch_vcpu_free(vcpu); | 6033 | kvm_arch_vcpu_free(vcpu); |
5947 | 6034 | ||
@@ -5965,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
5965 | kfree(kvm->arch.vpic); | 6052 | kfree(kvm->arch.vpic); |
5966 | kfree(kvm->arch.vioapic); | 6053 | kfree(kvm->arch.vioapic); |
5967 | kvm_free_vcpus(kvm); | 6054 | kvm_free_vcpus(kvm); |
5968 | kvm_free_physmem(kvm); | ||
5969 | if (kvm->arch.apic_access_page) | 6055 | if (kvm->arch.apic_access_page) |
5970 | put_page(kvm->arch.apic_access_page); | 6056 | put_page(kvm->arch.apic_access_page); |
5971 | if (kvm->arch.ept_identity_pagetable) | 6057 | if (kvm->arch.ept_identity_pagetable) |
5972 | put_page(kvm->arch.ept_identity_pagetable); | 6058 | put_page(kvm->arch.ept_identity_pagetable); |
5973 | cleanup_srcu_struct(&kvm->srcu); | ||
5974 | kfree(kvm); | ||
5975 | } | 6059 | } |
5976 | 6060 | ||
5977 | int kvm_arch_prepare_memory_region(struct kvm *kvm, | 6061 | int kvm_arch_prepare_memory_region(struct kvm *kvm, |
@@ -6052,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm) | |||
6052 | 6136 | ||
6053 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 6137 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
6054 | { | 6138 | { |
6055 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE | 6139 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
6140 | !vcpu->arch.apf.halted) | ||
6141 | || !list_empty_careful(&vcpu->async_pf.done) | ||
6056 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED | 6142 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED |
6057 | || vcpu->arch.nmi_pending || | 6143 | || vcpu->arch.nmi_pending || |
6058 | (kvm_arch_interrupt_allowed(vcpu) && | 6144 | (kvm_arch_interrupt_allowed(vcpu) && |
@@ -6111,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | |||
6111 | } | 6197 | } |
6112 | EXPORT_SYMBOL_GPL(kvm_set_rflags); | 6198 | EXPORT_SYMBOL_GPL(kvm_set_rflags); |
6113 | 6199 | ||
6200 | void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) | ||
6201 | { | ||
6202 | int r; | ||
6203 | |||
6204 | if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || | ||
6205 | is_error_page(work->page)) | ||
6206 | return; | ||
6207 | |||
6208 | r = kvm_mmu_reload(vcpu); | ||
6209 | if (unlikely(r)) | ||
6210 | return; | ||
6211 | |||
6212 | if (!vcpu->arch.mmu.direct_map && | ||
6213 | work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) | ||
6214 | return; | ||
6215 | |||
6216 | vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); | ||
6217 | } | ||
6218 | |||
6219 | static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) | ||
6220 | { | ||
6221 | return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); | ||
6222 | } | ||
6223 | |||
6224 | static inline u32 kvm_async_pf_next_probe(u32 key) | ||
6225 | { | ||
6226 | return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); | ||
6227 | } | ||
6228 | |||
6229 | static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6230 | { | ||
6231 | u32 key = kvm_async_pf_hash_fn(gfn); | ||
6232 | |||
6233 | while (vcpu->arch.apf.gfns[key] != ~0) | ||
6234 | key = kvm_async_pf_next_probe(key); | ||
6235 | |||
6236 | vcpu->arch.apf.gfns[key] = gfn; | ||
6237 | } | ||
6238 | |||
6239 | static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6240 | { | ||
6241 | int i; | ||
6242 | u32 key = kvm_async_pf_hash_fn(gfn); | ||
6243 | |||
6244 | for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && | ||
6245 | (vcpu->arch.apf.gfns[key] != gfn && | ||
6246 | vcpu->arch.apf.gfns[key] != ~0); i++) | ||
6247 | key = kvm_async_pf_next_probe(key); | ||
6248 | |||
6249 | return key; | ||
6250 | } | ||
6251 | |||
6252 | bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6253 | { | ||
6254 | return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; | ||
6255 | } | ||
6256 | |||
6257 | static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
6258 | { | ||
6259 | u32 i, j, k; | ||
6260 | |||
6261 | i = j = kvm_async_pf_gfn_slot(vcpu, gfn); | ||
6262 | while (true) { | ||
6263 | vcpu->arch.apf.gfns[i] = ~0; | ||
6264 | do { | ||
6265 | j = kvm_async_pf_next_probe(j); | ||
6266 | if (vcpu->arch.apf.gfns[j] == ~0) | ||
6267 | return; | ||
6268 | k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); | ||
6269 | /* | ||
6270 | * k lies cyclically in ]i,j] | ||
6271 | * | i.k.j | | ||
6272 | * |....j i.k.| or |.k..j i...| | ||
6273 | */ | ||
6274 | } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); | ||
6275 | vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; | ||
6276 | i = j; | ||
6277 | } | ||
6278 | } | ||
6279 | |||
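For reference, the three helpers above keep the outstanding async-PF gfns in a small open-addressing hash. The following is a minimal userspace sketch of the same scheme, not the kernel code: linear probing on insert and lookup, and a backward-shift delete that re-fills the hole so probe chains stay intact without tombstones. Table size, hash function and test values are illustrative only.

/* standalone sketch of the async-PF gfn table scheme */
#include <stdint.h>
#include <stdio.h>

#define NSLOTS 64u			/* power of two, like roundup_pow_of_two(ASYNC_PF_PER_VCPU) */
#define EMPTY  (~(uint64_t)0)

static uint64_t slots[NSLOTS];

static uint32_t hash(uint64_t gfn)   { return (uint32_t)(gfn * 2654435761u) & (NSLOTS - 1); }
static uint32_t next_p(uint32_t key) { return (key + 1) & (NSLOTS - 1); }

static void table_add(uint64_t gfn)
{
	uint32_t key = hash(gfn);

	while (slots[key] != EMPTY)
		key = next_p(key);
	slots[key] = gfn;
}

static uint32_t table_slot(uint64_t gfn)
{
	uint32_t key = hash(gfn);

	for (uint32_t i = 0; i < NSLOTS && slots[key] != gfn && slots[key] != EMPTY; i++)
		key = next_p(key);
	return key;
}

static void table_del(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = table_slot(gfn);
	while (1) {
		slots[i] = EMPTY;
		do {
			j = next_p(j);
			if (slots[j] == EMPTY)
				return;
			k = hash(slots[j]);
			/* skip entries whose home slot k lies in ]i, j]; the first
			 * one outside that range is moved back into the hole at i */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		slots[i] = slots[j];
		i = j;
	}
}

int main(void)
{
	for (uint32_t i = 0; i < NSLOTS; i++)
		slots[i] = EMPTY;
	table_add(0x1000);
	table_add(0x2000);
	table_del(0x1000);
	printf("0x2000 still found: %d\n", slots[table_slot(0x2000)] == 0x2000);
	return 0;
}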
6280 | static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) | ||
6281 | { | ||
6282 | |||
6283 | return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, | ||
6284 | sizeof(val)); | ||
6285 | } | ||
6286 | |||
6287 | void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, | ||
6288 | struct kvm_async_pf *work) | ||
6289 | { | ||
6290 | struct x86_exception fault; | ||
6291 | |||
6292 | trace_kvm_async_pf_not_present(work->arch.token, work->gva); | ||
6293 | kvm_add_async_pf_gfn(vcpu, work->arch.gfn); | ||
6294 | |||
6295 | if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || | ||
6296 | (vcpu->arch.apf.send_user_only && | ||
6297 | kvm_x86_ops->get_cpl(vcpu) == 0)) | ||
6298 | kvm_make_request(KVM_REQ_APF_HALT, vcpu); | ||
6299 | else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { | ||
6300 | fault.vector = PF_VECTOR; | ||
6301 | fault.error_code_valid = true; | ||
6302 | fault.error_code = 0; | ||
6303 | fault.nested_page_fault = false; | ||
6304 | fault.address = work->arch.token; | ||
6305 | kvm_inject_page_fault(vcpu, &fault); | ||
6306 | } | ||
6307 | } | ||
6308 | |||
6309 | void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, | ||
6310 | struct kvm_async_pf *work) | ||
6311 | { | ||
6312 | struct x86_exception fault; | ||
6313 | |||
6314 | trace_kvm_async_pf_ready(work->arch.token, work->gva); | ||
6315 | if (is_error_page(work->page)) | ||
6316 | work->arch.token = ~0; /* broadcast wakeup */ | ||
6317 | else | ||
6318 | kvm_del_async_pf_gfn(vcpu, work->arch.gfn); | ||
6319 | |||
6320 | if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && | ||
6321 | !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { | ||
6322 | fault.vector = PF_VECTOR; | ||
6323 | fault.error_code_valid = true; | ||
6324 | fault.error_code = 0; | ||
6325 | fault.nested_page_fault = false; | ||
6326 | fault.address = work->arch.token; | ||
6327 | kvm_inject_page_fault(vcpu, &fault); | ||
6328 | } | ||
6329 | vcpu->arch.apf.halted = false; | ||
6330 | } | ||
6331 | |||
6332 | bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) | ||
6333 | { | ||
6334 | if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) | ||
6335 | return true; | ||
6336 | else | ||
6337 | return !kvm_event_needs_reinjection(vcpu) && | ||
6338 | kvm_x86_ops->interrupt_allowed(vcpu); | ||
6339 | } | ||
6340 | |||
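The two handlers above drive the guest-visible half of the protocol: the host writes a reason word into the area the guest registered through the async-PF MSR and injects a #PF whose address field carries the token, with ~0 meaning "wake everyone". Below is a hedged, self-contained sketch of how a guest could dispatch on that word; the names park_current_task()/wake_parked_task(), the structure layout and the reason values as spelled here are hypothetical stand-ins, not the in-tree guest code.

#include <stdio.h>

enum { APF_NO_REASON = 0, APF_PAGE_NOT_PRESENT = 1, APF_PAGE_READY = 2 };

struct apf_shared { unsigned int reason; unsigned int pad[15]; };

static struct apf_shared apf_area;	/* would be registered via the async-PF MSR */

/* hypothetical helpers standing in for the guest's sleep/wake machinery */
static void park_current_task(unsigned long token)
{
	printf("park task until token %#lx completes\n", token);
}

static void wake_parked_task(unsigned long token)
{
	printf("wake task(s) waiting on token %#lx\n", token);
}

/* returns 1 if the #PF was an async-PF notification, 0 if it is a real fault */
static int guest_apf_fault(unsigned long token)
{
	unsigned int reason = apf_area.reason;

	apf_area.reason = APF_NO_REASON;
	switch (reason) {
	case APF_PAGE_NOT_PRESENT:
		park_current_task(token);
		return 1;
	case APF_PAGE_READY:
		wake_parked_task(token);	/* token ~0 would mean broadcast */
		return 1;
	default:
		return 0;
	}
}

int main(void)
{
	/* simulate the host's two writes from the handlers above */
	apf_area.reason = APF_PAGE_NOT_PRESENT;
	guest_apf_fault(0x42);
	apf_area.reason = APF_PAGE_READY;
	guest_apf_fault(0x42);
	return 0;
}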
6114 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); | 6341 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); |
6115 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); | 6342 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); |
6116 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); | 6343 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); |
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2cea414489f3..c600da830ce0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
70 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); | 70 | return kvm_read_cr0_bits(vcpu, X86_CR0_PG); |
71 | } | 71 | } |
72 | 72 | ||
73 | static inline u32 bit(int bitno) | ||
74 | { | ||
75 | return 1 << (bitno & 31); | ||
76 | } | ||
77 | |||
73 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); | 78 | void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); |
74 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); | 79 | void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |
75 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); | 80 | int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); |
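A quick note on the bit() helper added above: the "& 31" takes the argument modulo 32, so a bit number from any 32-bit feature word maps directly into that word. A standalone copy for illustration, with made-up bit numbers:

#include <stdio.h>

typedef unsigned int u32;

static inline u32 bit(int bitno)
{
	return 1 << (bitno & 31);
}

int main(void)
{
	printf("%#x %#x\n", bit(5), bit(37));	/* both 0x20, since 37 & 31 == 5 */
	return 0;
}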
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 38718041efc3..6e121a2a49e1 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig | |||
@@ -2,6 +2,7 @@ config LGUEST_GUEST | |||
2 | bool "Lguest guest support" | 2 | bool "Lguest guest support" |
3 | select PARAVIRT | 3 | select PARAVIRT |
4 | depends on X86_32 | 4 | depends on X86_32 |
5 | select VIRTUALIZATION | ||
5 | select VIRTIO | 6 | select VIRTIO |
6 | select VIRTIO_RING | 7 | select VIRTIO_RING |
7 | select VIRTIO_CONSOLE | 8 | select VIRTIO_CONSOLE |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 73b1e1a1f489..eba687f0cc0c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3) | |||
531 | { | 531 | { |
532 | lguest_data.pgdir = cr3; | 532 | lguest_data.pgdir = cr3; |
533 | lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); | 533 | lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); |
534 | cr3_changed = true; | 534 | |
535 | /* These two page tables are simple, linear, and used during boot */ | ||
536 | if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) | ||
537 | cr3_changed = true; | ||
535 | } | 538 | } |
536 | 539 | ||
537 | static unsigned long lguest_read_cr3(void) | 540 | static unsigned long lguest_read_cr3(void) |
@@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
703 | * to forget all of them. Fortunately, this is very rare. | 706 | * to forget all of them. Fortunately, this is very rare. |
704 | * | 707 | * |
705 | * ... except in early boot when the kernel sets up the initial pagetables, | 708 | * ... except in early boot when the kernel sets up the initial pagetables, |
706 | * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell | 709 | * which makes booting astonishingly slow: 48 seconds! So we don't even tell |
707 | * the Host anything changed until we've done the first page table switch, | 710 | * the Host anything changed until we've done the first real page table switch, |
708 | * which brings boot back to 0.25 seconds. | 711 | * which brings boot back to 4.3 seconds. |
709 | */ | 712 | */ |
710 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 713 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
711 | { | 714 | { |
@@ -821,7 +824,7 @@ static void __init lguest_init_IRQ(void) | |||
821 | 824 | ||
822 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { | 825 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
823 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ | 826 | /* Some systems map "vectors" to interrupts weirdly. Not us! */ |
824 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; | 827 | __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); |
825 | if (i != SYSCALL_VECTOR) | 828 | if (i != SYSCALL_VECTOR) |
826 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); | 829 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
827 | } | 830 | } |
@@ -1002,7 +1005,7 @@ static void lguest_time_init(void) | |||
1002 | clockevents_register_device(&lguest_clockevent); | 1005 | clockevents_register_device(&lguest_clockevent); |
1003 | 1006 | ||
1004 | /* Finally, we unblock the timer interrupt. */ | 1007 | /* Finally, we unblock the timer interrupt. */ |
1005 | enable_lguest_irq(0); | 1008 | clear_bit(0, lguest_data.blocked_interrupts); |
1006 | } | 1009 | } |
1007 | 1010 | ||
1008 | /* | 1011 | /* |
@@ -1349,9 +1352,6 @@ __init void lguest_init(void) | |||
1349 | */ | 1352 | */ |
1350 | switch_to_new_gdt(0); | 1353 | switch_to_new_gdt(0); |
1351 | 1354 | ||
1352 | /* We actually boot with all memory mapped, but let's say 128MB. */ | ||
1353 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | ||
1354 | |||
1355 | /* | 1355 | /* |
1356 | * The Host<->Guest Switcher lives at the top of our address space, and | 1356 | * The Host<->Guest Switcher lives at the top of our address space, and |
1357 | * the Host told us how big it is when we made LGUEST_INIT hypercall: | 1357 | * the Host told us how big it is when we made LGUEST_INIT hypercall: |
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index ff485d361182..fc45ba887d05 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c | |||
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops) | |||
121 | asm("mull %%edx" | 121 | asm("mull %%edx" |
122 | :"=d" (xloops), "=&a" (d0) | 122 | :"=d" (xloops), "=&a" (d0) |
123 | :"1" (xloops), "0" | 123 | :"1" (xloops), "0" |
124 | (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); | 124 | (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4))); |
125 | 125 | ||
126 | __delay(++xloops); | 126 | __delay(++xloops); |
127 | } | 127 | } |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 55543397a8a7..09df2f9a3d69 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o | |||
23 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | 23 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o |
24 | 24 | ||
25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o | 25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o |
26 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o | 26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o |
27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o |
28 | 28 | ||
29 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 29 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c index 804a3b6c6e14..f21962c435ed 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/amdtopology_64.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * AMD K8 NUMA support. | 2 | * AMD NUMA support. |
3 | * Discover the memory map and associated nodes. | 3 | * Discover the memory map and associated nodes. |
4 | * | 4 | * |
5 | * This version reads it directly from the K8 northbridge. | 5 | * This version reads it directly from the AMD northbridge. |
6 | * | 6 | * |
7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | 7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. |
8 | */ | 8 | */ |
@@ -27,6 +27,7 @@ | |||
27 | #include <asm/amd_nb.h> | 27 | #include <asm/amd_nb.h> |
28 | 28 | ||
29 | static struct bootnode __initdata nodes[8]; | 29 | static struct bootnode __initdata nodes[8]; |
30 | static unsigned char __initdata nodeids[8]; | ||
30 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; | 31 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; |
31 | 32 | ||
32 | static __init int find_northbridge(void) | 33 | static __init int find_northbridge(void) |
@@ -57,7 +58,7 @@ static __init void early_get_boot_cpu_id(void) | |||
57 | { | 58 | { |
58 | /* | 59 | /* |
59 | * need to get the APIC ID of the BSP so we can use that to | 60 | * need to get the APIC ID of the BSP so we can use that to |

60 | * create apicid_to_node in k8_scan_nodes() | 61 | * create apicid_to_node in amd_scan_nodes() |
61 | */ | 62 | */ |
62 | #ifdef CONFIG_X86_MPPARSE | 63 | #ifdef CONFIG_X86_MPPARSE |
63 | /* | 64 | /* |
@@ -66,23 +67,9 @@ static __init void early_get_boot_cpu_id(void) | |||
66 | if (smp_found_config) | 67 | if (smp_found_config) |
67 | early_get_smp_config(); | 68 | early_get_smp_config(); |
68 | #endif | 69 | #endif |
69 | early_init_lapic_mapping(); | ||
70 | } | 70 | } |
71 | 71 | ||
72 | int __init k8_get_nodes(struct bootnode *physnodes) | 72 | int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) |
73 | { | ||
74 | int i; | ||
75 | int ret = 0; | ||
76 | |||
77 | for_each_node_mask(i, nodes_parsed) { | ||
78 | physnodes[ret].start = nodes[i].start; | ||
79 | physnodes[ret].end = nodes[i].end; | ||
80 | ret++; | ||
81 | } | ||
82 | return ret; | ||
83 | } | ||
84 | |||
85 | int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) | ||
86 | { | 73 | { |
87 | unsigned long start = PFN_PHYS(start_pfn); | 74 | unsigned long start = PFN_PHYS(start_pfn); |
88 | unsigned long end = PFN_PHYS(end_pfn); | 75 | unsigned long end = PFN_PHYS(end_pfn); |
@@ -114,7 +101,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
114 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | 101 | base = read_pci_config(0, nb, 1, 0x40 + i*8); |
115 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | 102 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); |
116 | 103 | ||
117 | nodeid = limit & 7; | 104 | nodeids[i] = nodeid = limit & 7; |
118 | if ((base & 3) == 0) { | 105 | if ((base & 3) == 0) { |
119 | if (i < numnodes) | 106 | if (i < numnodes) |
120 | pr_info("Skipping disabled node %d\n", i); | 107 | pr_info("Skipping disabled node %d\n", i); |
@@ -194,7 +181,77 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
194 | return 0; | 181 | return 0; |
195 | } | 182 | } |
196 | 183 | ||
197 | int __init k8_scan_nodes(void) | 184 | #ifdef CONFIG_NUMA_EMU |
185 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
186 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
187 | }; | ||
188 | |||
189 | void __init amd_get_nodes(struct bootnode *physnodes) | ||
190 | { | ||
191 | int i; | ||
192 | |||
193 | for_each_node_mask(i, nodes_parsed) { | ||
194 | physnodes[i].start = nodes[i].start; | ||
195 | physnodes[i].end = nodes[i].end; | ||
196 | } | ||
197 | } | ||
198 | |||
199 | static int __init find_node_by_addr(unsigned long addr) | ||
200 | { | ||
201 | int ret = NUMA_NO_NODE; | ||
202 | int i; | ||
203 | |||
204 | for (i = 0; i < 8; i++) | ||
205 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
206 | ret = i; | ||
207 | break; | ||
208 | } | ||
209 | return ret; | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be | ||
214 | * set up to represent the physical topology but reflect the emulated | ||
215 | * environment. For each emulated node, the real node which it appears on is | ||
216 | * found and a fake pxm to nid mapping is created which mirrors the actual | ||
217 | * locality. node_distance() then represents the correct distances between | ||
218 | * emulated nodes by using the fake acpi mappings to pxms. | ||
219 | */ | ||
220 | void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) | ||
221 | { | ||
222 | unsigned int bits; | ||
223 | unsigned int cores; | ||
224 | unsigned int apicid_base = 0; | ||
225 | int i; | ||
226 | |||
227 | bits = boot_cpu_data.x86_coreid_bits; | ||
228 | cores = 1 << bits; | ||
229 | early_get_boot_cpu_id(); | ||
230 | if (boot_cpu_physical_apicid > 0) | ||
231 | apicid_base = boot_cpu_physical_apicid; | ||
232 | |||
233 | for (i = 0; i < nr_nodes; i++) { | ||
234 | int index; | ||
235 | int nid; | ||
236 | int j; | ||
237 | |||
238 | nid = find_node_by_addr(nodes[i].start); | ||
239 | if (nid == NUMA_NO_NODE) | ||
240 | continue; | ||
241 | |||
242 | index = nodeids[nid] << bits; | ||
243 | if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) | ||
244 | for (j = apicid_base; j < cores + apicid_base; j++) | ||
245 | fake_apicid_to_node[index + j] = i; | ||
246 | #ifdef CONFIG_ACPI_NUMA | ||
247 | __acpi_map_pxm_to_node(nid, i); | ||
248 | #endif | ||
249 | } | ||
250 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
251 | } | ||
252 | #endif /* CONFIG_NUMA_EMU */ | ||
253 | |||
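To make the comment above concrete, the sketch below shows the core idea in plain userspace C: each emulated node inherits the locality of the physical node whose address range contains its start, so inter-node distances keep mirroring the real topology. Ranges and node counts are made up for illustration.

#include <stdio.h>

struct range { unsigned long start, end; };

/* two hypothetical physical nodes of 1G each */
static const struct range phys[2] = {
	{ 0x00000000UL, 0x40000000UL },
	{ 0x40000000UL, 0x80000000UL },
};

static int find_node_by_addr(unsigned long addr)
{
	for (int i = 0; i < 2; i++)
		if (addr >= phys[i].start && addr < phys[i].end)
			return i;
	return -1;
}

int main(void)
{
	/* four emulated nodes, two carved out of each physical node */
	static const struct range fake[4] = {
		{ 0x00000000UL, 0x20000000UL }, { 0x20000000UL, 0x40000000UL },
		{ 0x40000000UL, 0x60000000UL }, { 0x60000000UL, 0x80000000UL },
	};

	for (int i = 0; i < 4; i++)
		printf("emulated node %d -> physical node / fake PXM %d\n",
		       i, find_node_by_addr(fake[i].start));
	return 0;
}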
254 | int __init amd_scan_nodes(void) | ||
198 | { | 255 | { |
199 | unsigned int bits; | 256 | unsigned int bits; |
200 | unsigned int cores; | 257 | unsigned int cores; |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799d..dbe34b931374 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/vmstat.h> | 9 | #include <linux/vmstat.h> |
10 | #include <linux/highmem.h> | 10 | #include <linux/highmem.h> |
11 | #include <linux/swap.h> | ||
11 | 12 | ||
12 | #include <asm/pgtable.h> | 13 | #include <asm/pgtable.h> |
13 | 14 | ||
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
89 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 90 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
90 | page = pte_page(pte); | 91 | page = pte_page(pte); |
91 | get_page(page); | 92 | get_page(page); |
93 | SetPageReferenced(page); | ||
92 | pages[*nr] = page; | 94 | pages[*nr] = page; |
93 | (*nr)++; | 95 | (*nr)++; |
94 | 96 | ||
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr) | |||
103 | VM_BUG_ON(page != compound_head(page)); | 105 | VM_BUG_ON(page != compound_head(page)); |
104 | VM_BUG_ON(page_count(page) == 0); | 106 | VM_BUG_ON(page_count(page) == 0); |
105 | atomic_add(nr, &page->_count); | 107 | atomic_add(nr, &page->_count); |
108 | SetPageReferenced(page); | ||
109 | } | ||
110 | |||
111 | static inline void get_huge_page_tail(struct page *page) | ||
112 | { | ||
113 | /* | ||
114 | * __split_huge_page_refcount() cannot run | ||
115 | * from under us. | ||
116 | */ | ||
117 | VM_BUG_ON(atomic_read(&page->_count) < 0); | ||
118 | atomic_inc(&page->_count); | ||
106 | } | 119 | } |
107 | 120 | ||
108 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | 121 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, |
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | |||
128 | do { | 141 | do { |
129 | VM_BUG_ON(compound_head(page) != head); | 142 | VM_BUG_ON(compound_head(page) != head); |
130 | pages[*nr] = page; | 143 | pages[*nr] = page; |
144 | if (PageTail(page)) | ||
145 | get_huge_page_tail(page); | ||
131 | (*nr)++; | 146 | (*nr)++; |
132 | page++; | 147 | page++; |
133 | refs++; | 148 | refs++; |
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
148 | pmd_t pmd = *pmdp; | 163 | pmd_t pmd = *pmdp; |
149 | 164 | ||
150 | next = pmd_addr_end(addr, end); | 165 | next = pmd_addr_end(addr, end); |
151 | if (pmd_none(pmd)) | 166 | /* |
167 | * The pmd_trans_splitting() check below explains why | ||
168 | * pmdp_splitting_flush has to flush the tlb, to stop | ||
169 | * this gup-fast code from running while we set the | ||
170 | * splitting bit in the pmd. Returning zero will take | ||
171 | * the slow path that will call wait_split_huge_page() | ||
172 | * if the pmd is still in splitting state. gup-fast | ||
173 | * can't because it has irq disabled and | ||
174 | * wait_split_huge_page() would never return as the | ||
175 | * tlb flush IPI wouldn't run. | ||
176 | */ | ||
177 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
152 | return 0; | 178 | return 0; |
153 | if (unlikely(pmd_large(pmd))) { | 179 | if (unlikely(pmd_large(pmd))) { |
154 | if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) | 180 | if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) |
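The comment above relies on the walk running with interrupts disabled. Roughly, the fast-GUP entry point in this file looks like the kernel-context sketch below (simplified and renamed, not the exact source, and assuming the headers this file already includes): because IRQs stay off for the whole walk, the TLB-flush IPI sent by pmdp_splitting_flush() cannot be delivered until the walk finishes, so a PMD seen here without the splitting bit cannot start splitting underneath us.

static int get_user_pages_fast_sketch(unsigned long start, int nr_pages,
				      int write, struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr = start;
	unsigned long end = start + ((unsigned long)nr_pages << PAGE_SHIFT);
	unsigned long flags, next;
	pgd_t *pgdp;
	int nr = 0;

	local_irq_save(flags);			/* holds off the splitting-flush IPI */
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			break;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	local_irq_restore(flags);

	return nr;	/* the caller falls back to the slow path for what's left */
}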
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index c0e28a13de7d..947f42abe820 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) | |||
364 | /* | 364 | /* |
365 | * We just marked the kernel text read only above, now that | 365 | * We just marked the kernel text read only above, now that |
366 | * we are going to free part of that, we need to make that | 366 | * we are going to free part of that, we need to make that |
367 | * writeable first. | 367 | * writeable and non-executable first. |
368 | */ | 368 | */ |
369 | set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); | ||
369 | set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); | 370 | set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); |
370 | 371 | ||
371 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | 372 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e969f9f401b..c821074b7f0b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <asm/bugs.h> | 45 | #include <asm/bugs.h> |
46 | #include <asm/tlb.h> | 46 | #include <asm/tlb.h> |
47 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
48 | #include <asm/olpc_ofw.h> | ||
48 | #include <asm/pgalloc.h> | 49 | #include <asm/pgalloc.h> |
49 | #include <asm/sections.h> | 50 | #include <asm/sections.h> |
50 | #include <asm/paravirt.h> | 51 | #include <asm/paravirt.h> |
@@ -226,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) | |||
226 | 227 | ||
227 | static inline int is_kernel_text(unsigned long addr) | 228 | static inline int is_kernel_text(unsigned long addr) |
228 | { | 229 | { |
229 | if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) | 230 | if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) |
230 | return 1; | 231 | return 1; |
231 | return 0; | 232 | return 0; |
232 | } | 233 | } |
@@ -715,6 +716,7 @@ void __init paging_init(void) | |||
715 | /* | 716 | /* |
716 | * NOTE: at this point the bootmem allocator is fully available. | 717 | * NOTE: at this point the bootmem allocator is fully available. |
717 | */ | 718 | */ |
719 | olpc_dt_build_devicetree(); | ||
718 | sparse_init(); | 720 | sparse_init(); |
719 | zone_sizes_init(); | 721 | zone_sizes_init(); |
720 | } | 722 | } |
@@ -912,6 +914,23 @@ void set_kernel_text_ro(void) | |||
912 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 914 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
913 | } | 915 | } |
914 | 916 | ||
917 | static void mark_nxdata_nx(void) | ||
918 | { | ||
919 | /* | ||
920 | * When this called, init has already been executed and released, | ||
921 | * so everything past _etext sould be NX. | ||
922 | */ | ||
923 | unsigned long start = PFN_ALIGN(_etext); | ||
924 | /* | ||
925 | * This comes from the is_kernel_text() upper limit; also HPAGE-rounded where huge pages are used: | ||
926 | */ | ||
927 | unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; | ||
928 | |||
929 | if (__supported_pte_mask & _PAGE_NX) | ||
930 | printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); | ||
931 | set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); | ||
932 | } | ||
933 | |||
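A worked example of the range computed by mark_nxdata_nx() above, using hypothetical _etext/__init_end addresses: start is _etext rounded up to a page, and the end is __init_end pushed up past the next 2M boundary, matching the is_kernel_text() upper limit when huge pages map the kernel.

#include <stdio.h>

#define PAGE_SIZE    4096UL
#define HPAGE_SIZE   (2UL * 1024 * 1024)
#define HPAGE_MASK   (~(HPAGE_SIZE - 1))
#define PFN_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long etext    = 0xc05f1234UL;	/* hypothetical _etext */
	unsigned long init_end = 0xc08a0000UL;	/* hypothetical __init_end */
	unsigned long start = PFN_ALIGN(etext);
	unsigned long size  = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

	printf("NX-protecting %#lx - %#lx (%luk)\n", start, start + size, size >> 10);
	return 0;
}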
915 | void mark_rodata_ro(void) | 934 | void mark_rodata_ro(void) |
916 | { | 935 | { |
917 | unsigned long start = PFN_ALIGN(_text); | 936 | unsigned long start = PFN_ALIGN(_text); |
@@ -946,6 +965,7 @@ void mark_rodata_ro(void) | |||
946 | printk(KERN_INFO "Testing CPA: write protecting again\n"); | 965 | printk(KERN_INFO "Testing CPA: write protecting again\n"); |
947 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 966 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
948 | #endif | 967 | #endif |
968 | mark_nxdata_nx(); | ||
949 | } | 969 | } |
950 | #endif | 970 | #endif |
951 | 971 | ||
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436f..704a37cedddb 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c | |||
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, | |||
185 | e->trace.entries = e->trace_entries; | 185 | e->trace.entries = e->trace_entries; |
186 | e->trace.max_entries = ARRAY_SIZE(e->trace_entries); | 186 | e->trace.max_entries = ARRAY_SIZE(e->trace_entries); |
187 | e->trace.skip = 0; | 187 | e->trace.skip = 0; |
188 | save_stack_trace_bp(&e->trace, regs->bp); | 188 | save_stack_trace_regs(&e->trace, regs); |
189 | 189 | ||
190 | /* Round address down to nearest 16 bytes */ | 190 | /* Round address down to nearest 16 bytes */ |
191 | shadow_copy = kmemcheck_shadow_lookup(address | 191 | shadow_copy = kmemcheck_shadow_lookup(address |
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 787c52ca49c3..ebf6d7887a38 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -2,6 +2,28 @@ | |||
2 | #include <linux/topology.h> | 2 | #include <linux/topology.h> |
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/bootmem.h> | 4 | #include <linux/bootmem.h> |
5 | #include <asm/numa.h> | ||
6 | #include <asm/acpi.h> | ||
7 | |||
8 | int __initdata numa_off; | ||
9 | |||
10 | static __init int numa_setup(char *opt) | ||
11 | { | ||
12 | if (!opt) | ||
13 | return -EINVAL; | ||
14 | if (!strncmp(opt, "off", 3)) | ||
15 | numa_off = 1; | ||
16 | #ifdef CONFIG_NUMA_EMU | ||
17 | if (!strncmp(opt, "fake=", 5)) | ||
18 | numa_emu_cmdline(opt + 5); | ||
19 | #endif | ||
20 | #ifdef CONFIG_ACPI_NUMA | ||
21 | if (!strncmp(opt, "noacpi", 6)) | ||
22 | acpi_numa = -1; | ||
23 | #endif | ||
24 | return 0; | ||
25 | } | ||
26 | early_param("numa", numa_setup); | ||
5 | 27 | ||
6 | /* | 28 | /* |
7 | * Which logical CPUs are on which nodes | 29 | * Which logical CPUs are on which nodes |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7ffc9b727efd..95ea1551eebc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | |||
30 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | 30 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE |
31 | }; | 31 | }; |
32 | 32 | ||
33 | int numa_off __initdata; | ||
34 | static unsigned long __initdata nodemap_addr; | 33 | static unsigned long __initdata nodemap_addr; |
35 | static unsigned long __initdata nodemap_size; | 34 | static unsigned long __initdata nodemap_size; |
36 | 35 | ||
@@ -260,30 +259,35 @@ void __init numa_init_array(void) | |||
260 | #ifdef CONFIG_NUMA_EMU | 259 | #ifdef CONFIG_NUMA_EMU |
261 | /* Numa emulation */ | 260 | /* Numa emulation */ |
262 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | 261 | static struct bootnode nodes[MAX_NUMNODES] __initdata; |
263 | static struct bootnode physnodes[MAX_NUMNODES] __initdata; | 262 | static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; |
264 | static char *cmdline __initdata; | 263 | static char *cmdline __initdata; |
265 | 264 | ||
265 | void __init numa_emu_cmdline(char *str) | ||
266 | { | ||
267 | cmdline = str; | ||
268 | } | ||
269 | |||
266 | static int __init setup_physnodes(unsigned long start, unsigned long end, | 270 | static int __init setup_physnodes(unsigned long start, unsigned long end, |
267 | int acpi, int k8) | 271 | int acpi, int amd) |
268 | { | 272 | { |
269 | int nr_nodes = 0; | ||
270 | int ret = 0; | 273 | int ret = 0; |
271 | int i; | 274 | int i; |
272 | 275 | ||
276 | memset(physnodes, 0, sizeof(physnodes)); | ||
273 | #ifdef CONFIG_ACPI_NUMA | 277 | #ifdef CONFIG_ACPI_NUMA |
274 | if (acpi) | 278 | if (acpi) |
275 | nr_nodes = acpi_get_nodes(physnodes); | 279 | acpi_get_nodes(physnodes, start, end); |
276 | #endif | 280 | #endif |
277 | #ifdef CONFIG_K8_NUMA | 281 | #ifdef CONFIG_AMD_NUMA |
278 | if (k8) | 282 | if (amd) |
279 | nr_nodes = k8_get_nodes(physnodes); | 283 | amd_get_nodes(physnodes); |
280 | #endif | 284 | #endif |
281 | /* | 285 | /* |
282 | * Basic sanity checking on the physical node map: there may be errors | 286 | * Basic sanity checking on the physical node map: there may be errors |
283 | * if the SRAT or K8 incorrectly reported the topology or the mem= | 287 | * if the SRAT or AMD code incorrectly reported the topology or the mem= |
284 | * kernel parameter is used. | 288 | * kernel parameter is used. |
285 | */ | 289 | */ |
286 | for (i = 0; i < nr_nodes; i++) { | 290 | for (i = 0; i < MAX_NUMNODES; i++) { |
287 | if (physnodes[i].start == physnodes[i].end) | 291 | if (physnodes[i].start == physnodes[i].end) |
288 | continue; | 292 | continue; |
289 | if (physnodes[i].start > end) { | 293 | if (physnodes[i].start > end) { |
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, | |||
298 | physnodes[i].start = start; | 302 | physnodes[i].start = start; |
299 | if (physnodes[i].end > end) | 303 | if (physnodes[i].end > end) |
300 | physnodes[i].end = end; | 304 | physnodes[i].end = end; |
301 | } | ||
302 | |||
303 | /* | ||
304 | * Remove all nodes that have no memory or were truncated because of the | ||
305 | * limited address range. | ||
306 | */ | ||
307 | for (i = 0; i < nr_nodes; i++) { | ||
308 | if (physnodes[i].start == physnodes[i].end) | ||
309 | continue; | ||
310 | physnodes[ret].start = physnodes[i].start; | ||
311 | physnodes[ret].end = physnodes[i].end; | ||
312 | ret++; | 305 | ret++; |
313 | } | 306 | } |
314 | 307 | ||
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, | |||
324 | return ret; | 317 | return ret; |
325 | } | 318 | } |
326 | 319 | ||
320 | static void __init fake_physnodes(int acpi, int amd, int nr_nodes) | ||
321 | { | ||
322 | int i; | ||
323 | |||
324 | BUG_ON(acpi && amd); | ||
325 | #ifdef CONFIG_ACPI_NUMA | ||
326 | if (acpi) | ||
327 | acpi_fake_nodes(nodes, nr_nodes); | ||
328 | #endif | ||
329 | #ifdef CONFIG_AMD_NUMA | ||
330 | if (amd) | ||
331 | amd_fake_nodes(nodes, nr_nodes); | ||
332 | #endif | ||
333 | if (!acpi && !amd) | ||
334 | for (i = 0; i < nr_cpu_ids; i++) | ||
335 | numa_set_node(i, 0); | ||
336 | } | ||
337 | |||
327 | /* | 338 | /* |
328 | * Sets up nid to range from addr to addr + size. If the end | 339 | * Sets up nid to range from addr to addr + size. If the end |
329 | * boundary is greater than max_addr, then max_addr is used instead. | 340 | * boundary is greater than max_addr, then max_addr is used instead. |
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) | |||
352 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | 363 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr |
353 | * to max_addr. The return value is the number of nodes allocated. | 364 | * to max_addr. The return value is the number of nodes allocated. |
354 | */ | 365 | */ |
355 | static int __init split_nodes_interleave(u64 addr, u64 max_addr, | 366 | static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) |
356 | int nr_phys_nodes, int nr_nodes) | ||
357 | { | 367 | { |
358 | nodemask_t physnode_mask = NODE_MASK_NONE; | 368 | nodemask_t physnode_mask = NODE_MASK_NONE; |
359 | u64 size; | 369 | u64 size; |
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
384 | return -1; | 394 | return -1; |
385 | } | 395 | } |
386 | 396 | ||
387 | for (i = 0; i < nr_phys_nodes; i++) | 397 | for (i = 0; i < MAX_NUMNODES; i++) |
388 | if (physnodes[i].start != physnodes[i].end) | 398 | if (physnodes[i].start != physnodes[i].end) |
389 | node_set(i, physnode_mask); | 399 | node_set(i, physnode_mask); |
390 | 400 | ||
@@ -549,15 +559,13 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | |||
549 | * numa=fake command-line option. | 559 | * numa=fake command-line option. |
550 | */ | 560 | */ |
551 | static int __init numa_emulation(unsigned long start_pfn, | 561 | static int __init numa_emulation(unsigned long start_pfn, |
552 | unsigned long last_pfn, int acpi, int k8) | 562 | unsigned long last_pfn, int acpi, int amd) |
553 | { | 563 | { |
554 | u64 addr = start_pfn << PAGE_SHIFT; | 564 | u64 addr = start_pfn << PAGE_SHIFT; |
555 | u64 max_addr = last_pfn << PAGE_SHIFT; | 565 | u64 max_addr = last_pfn << PAGE_SHIFT; |
556 | int num_phys_nodes; | ||
557 | int num_nodes; | 566 | int num_nodes; |
558 | int i; | 567 | int i; |
559 | 568 | ||
560 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); | ||
561 | /* | 569 | /* |
562 | * If the numa=fake command-line contains a 'M' or 'G', it represents | 570 | * If the numa=fake command-line contains a 'M' or 'G', it represents |
563 | * the fixed node size. Otherwise, if it is just a single number N, | 571 | * the fixed node size. Otherwise, if it is just a single number N, |
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn, | |||
572 | unsigned long n; | 580 | unsigned long n; |
573 | 581 | ||
574 | n = simple_strtoul(cmdline, NULL, 0); | 582 | n = simple_strtoul(cmdline, NULL, 0); |
575 | num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); | 583 | num_nodes = split_nodes_interleave(addr, max_addr, n); |
576 | } | 584 | } |
577 | 585 | ||
578 | if (num_nodes < 0) | 586 | if (num_nodes < 0) |
@@ -595,14 +603,15 @@ static int __init numa_emulation(unsigned long start_pfn, | |||
595 | nodes[i].end >> PAGE_SHIFT); | 603 | nodes[i].end >> PAGE_SHIFT); |
596 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | 604 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); |
597 | } | 605 | } |
598 | acpi_fake_nodes(nodes, num_nodes); | 606 | setup_physnodes(addr, max_addr, acpi, amd); |
607 | fake_physnodes(acpi, amd, num_nodes); | ||
599 | numa_init_array(); | 608 | numa_init_array(); |
600 | return 0; | 609 | return 0; |
601 | } | 610 | } |
602 | #endif /* CONFIG_NUMA_EMU */ | 611 | #endif /* CONFIG_NUMA_EMU */ |
603 | 612 | ||
604 | void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | 613 | void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, |
605 | int acpi, int k8) | 614 | int acpi, int amd) |
606 | { | 615 | { |
607 | int i; | 616 | int i; |
608 | 617 | ||
@@ -610,8 +619,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | |||
610 | nodes_clear(node_online_map); | 619 | nodes_clear(node_online_map); |
611 | 620 | ||
612 | #ifdef CONFIG_NUMA_EMU | 621 | #ifdef CONFIG_NUMA_EMU |
613 | if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) | 622 | setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, |
623 | acpi, amd); | ||
624 | if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) | ||
614 | return; | 625 | return; |
626 | setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, | ||
627 | acpi, amd); | ||
615 | nodes_clear(node_possible_map); | 628 | nodes_clear(node_possible_map); |
616 | nodes_clear(node_online_map); | 629 | nodes_clear(node_online_map); |
617 | #endif | 630 | #endif |
@@ -624,8 +637,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | |||
624 | nodes_clear(node_online_map); | 637 | nodes_clear(node_online_map); |
625 | #endif | 638 | #endif |
626 | 639 | ||
627 | #ifdef CONFIG_K8_NUMA | 640 | #ifdef CONFIG_AMD_NUMA |
628 | if (!numa_off && k8 && !k8_scan_nodes()) | 641 | if (!numa_off && amd && !amd_scan_nodes()) |
629 | return; | 642 | return; |
630 | nodes_clear(node_possible_map); | 643 | nodes_clear(node_possible_map); |
631 | nodes_clear(node_online_map); | 644 | nodes_clear(node_online_map); |
@@ -661,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void) | |||
661 | return pages; | 674 | return pages; |
662 | } | 675 | } |
663 | 676 | ||
664 | static __init int numa_setup(char *opt) | ||
665 | { | ||
666 | if (!opt) | ||
667 | return -EINVAL; | ||
668 | if (!strncmp(opt, "off", 3)) | ||
669 | numa_off = 1; | ||
670 | #ifdef CONFIG_NUMA_EMU | ||
671 | if (!strncmp(opt, "fake=", 5)) | ||
672 | cmdline = opt + 5; | ||
673 | #endif | ||
674 | #ifdef CONFIG_ACPI_NUMA | ||
675 | if (!strncmp(opt, "noacpi", 6)) | ||
676 | acpi_numa = -1; | ||
677 | #endif | ||
678 | return 0; | ||
679 | } | ||
680 | early_param("numa", numa_setup); | ||
681 | |||
682 | #ifdef CONFIG_NUMA | 677 | #ifdef CONFIG_NUMA |
683 | 678 | ||
684 | static __init int find_near_online_node(int node) | 679 | static __init int find_near_online_node(int node) |
@@ -767,6 +762,7 @@ void __cpuinit numa_clear_node(int cpu) | |||
767 | 762 | ||
768 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | 763 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS |
769 | 764 | ||
765 | #ifndef CONFIG_NUMA_EMU | ||
770 | void __cpuinit numa_add_cpu(int cpu) | 766 | void __cpuinit numa_add_cpu(int cpu) |
771 | { | 767 | { |
772 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | 768 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); |
@@ -776,34 +772,115 @@ void __cpuinit numa_remove_cpu(int cpu) | |||
776 | { | 772 | { |
777 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | 773 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); |
778 | } | 774 | } |
775 | #else | ||
776 | void __cpuinit numa_add_cpu(int cpu) | ||
777 | { | ||
778 | unsigned long addr; | ||
779 | u16 apicid; | ||
780 | int physnid; | ||
781 | int nid = NUMA_NO_NODE; | ||
782 | |||
783 | apicid = early_per_cpu(x86_cpu_to_apicid, cpu); | ||
784 | if (apicid != BAD_APICID) | ||
785 | nid = apicid_to_node[apicid]; | ||
786 | if (nid == NUMA_NO_NODE) | ||
787 | nid = early_cpu_to_node(cpu); | ||
788 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); | ||
789 | |||
790 | /* | ||
791 | * Use the starting address of the emulated node to find which physical | ||
792 | * node it is allocated on. | ||
793 | */ | ||
794 | addr = node_start_pfn(nid) << PAGE_SHIFT; | ||
795 | for (physnid = 0; physnid < MAX_NUMNODES; physnid++) | ||
796 | if (addr >= physnodes[physnid].start && | ||
797 | addr < physnodes[physnid].end) | ||
798 | break; | ||
799 | |||
800 | /* | ||
801 | * Map the cpu to each emulated node that is allocated on the physical | ||
802 | * node of the cpu's apic id. | ||
803 | */ | ||
804 | for_each_online_node(nid) { | ||
805 | addr = node_start_pfn(nid) << PAGE_SHIFT; | ||
806 | if (addr >= physnodes[physnid].start && | ||
807 | addr < physnodes[physnid].end) | ||
808 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | ||
809 | } | ||
810 | } | ||
811 | |||
812 | void __cpuinit numa_remove_cpu(int cpu) | ||
813 | { | ||
814 | int i; | ||
815 | |||
816 | for_each_online_node(i) | ||
817 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | ||
818 | } | ||
819 | #endif /* !CONFIG_NUMA_EMU */ | ||
779 | 820 | ||
780 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ | 821 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ |
822 | static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) | ||
823 | { | ||
824 | int node = early_cpu_to_node(cpu); | ||
825 | struct cpumask *mask; | ||
826 | char buf[64]; | ||
827 | |||
828 | mask = node_to_cpumask_map[node]; | ||
829 | if (!mask) { | ||
830 | pr_err("node_to_cpumask_map[%i] NULL\n", node); | ||
831 | dump_stack(); | ||
832 | return NULL; | ||
833 | } | ||
834 | |||
835 | cpulist_scnprintf(buf, sizeof(buf), mask); | ||
836 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
837 | enable ? "numa_add_cpu" : "numa_remove_cpu", | ||
838 | cpu, node, buf); | ||
839 | return mask; | ||
840 | } | ||
781 | 841 | ||
782 | /* | 842 | /* |
783 | * --------- debug versions of the numa functions --------- | 843 | * --------- debug versions of the numa functions --------- |
784 | */ | 844 | */ |
845 | #ifndef CONFIG_NUMA_EMU | ||
785 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | 846 | static void __cpuinit numa_set_cpumask(int cpu, int enable) |
786 | { | 847 | { |
787 | int node = early_cpu_to_node(cpu); | ||
788 | struct cpumask *mask; | 848 | struct cpumask *mask; |
789 | char buf[64]; | ||
790 | 849 | ||
791 | mask = node_to_cpumask_map[node]; | 850 | mask = debug_cpumask_set_cpu(cpu, enable); |
792 | if (mask == NULL) { | 851 | if (!mask) |
793 | printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); | ||
794 | dump_stack(); | ||
795 | return; | 852 | return; |
796 | } | ||
797 | 853 | ||
798 | if (enable) | 854 | if (enable) |
799 | cpumask_set_cpu(cpu, mask); | 855 | cpumask_set_cpu(cpu, mask); |
800 | else | 856 | else |
801 | cpumask_clear_cpu(cpu, mask); | 857 | cpumask_clear_cpu(cpu, mask); |
858 | } | ||
859 | #else | ||
860 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
861 | { | ||
862 | int node = early_cpu_to_node(cpu); | ||
863 | struct cpumask *mask; | ||
864 | int i; | ||
802 | 865 | ||
803 | cpulist_scnprintf(buf, sizeof(buf), mask); | 866 | for_each_online_node(i) { |
804 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | 867 | unsigned long addr; |
805 | enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); | 868 | |
869 | addr = node_start_pfn(i) << PAGE_SHIFT; | ||
870 | if (addr < physnodes[node].start || | ||
871 | addr >= physnodes[node].end) | ||
872 | continue; | ||
873 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
874 | if (!mask) | ||
875 | return; | ||
876 | |||
877 | if (enable) | ||
878 | cpumask_set_cpu(cpu, mask); | ||
879 | else | ||
880 | cpumask_clear_cpu(cpu, mask); | ||
881 | } | ||
806 | } | 882 | } |
883 | #endif /* CONFIG_NUMA_EMU */ | ||
807 | 884 | ||
808 | void __cpuinit numa_add_cpu(int cpu) | 885 | void __cpuinit numa_add_cpu(int cpu) |
809 | { | 886 | { |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e7933d606..8b830ca14ac4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/pfn.h> | 13 | #include <linux/pfn.h> |
14 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
15 | #include <linux/gfp.h> | 15 | #include <linux/gfp.h> |
16 | #include <linux/pci.h> | ||
16 | 17 | ||
17 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
18 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |||
255 | unsigned long pfn) | 256 | unsigned long pfn) |
256 | { | 257 | { |
257 | pgprot_t forbidden = __pgprot(0); | 258 | pgprot_t forbidden = __pgprot(0); |
259 | pgprot_t required = __pgprot(0); | ||
258 | 260 | ||
259 | /* | 261 | /* |
260 | * The BIOS area between 640k and 1Mb needs to be executable for | 262 | * The BIOS area between 640k and 1Mb needs to be executable for |
261 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | 263 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. |
262 | */ | 264 | */ |
263 | if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | 265 | #ifdef CONFIG_PCI_BIOS |
266 | if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | ||
264 | pgprot_val(forbidden) |= _PAGE_NX; | 267 | pgprot_val(forbidden) |= _PAGE_NX; |
268 | #endif | ||
265 | 269 | ||
266 | /* | 270 | /* |
267 | * The kernel text needs to be executable for obvious reasons | 271 | * The kernel text needs to be executable for obvious reasons |
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |||
278 | if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, | 282 | if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, |
279 | __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) | 283 | __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) |
280 | pgprot_val(forbidden) |= _PAGE_RW; | 284 | pgprot_val(forbidden) |= _PAGE_RW; |
285 | /* | ||
286 | * .data and .bss should always be writable. | ||
287 | */ | ||
288 | if (within(address, (unsigned long)_sdata, (unsigned long)_edata) || | ||
289 | within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop)) | ||
290 | pgprot_val(required) |= _PAGE_RW; | ||
281 | 291 | ||
282 | #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) | 292 | #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) |
283 | /* | 293 | /* |
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |||
317 | #endif | 327 | #endif |
318 | 328 | ||
319 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | 329 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); |
330 | prot = __pgprot(pgprot_val(prot) | pgprot_val(required)); | ||
320 | 331 | ||
321 | return prot; | 332 | return prot; |
322 | } | 333 | } |
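The order of the two lines above is what makes the new "required" mask work: forbidden bits are cleared first, then required bits are OR-ed back in, so a required bit such as _PAGE_RW over .data/.bss survives even if the caller asked for it to be dropped. A tiny illustration with made-up bit values, not the real pgprot layout:

#include <stdio.h>

#define FAKE_RW (1u << 1)	/* illustrative stand-ins for the pgprot bits */
#define FAKE_NX (1u << 2)

static unsigned int apply_protections(unsigned int prot,
				      unsigned int forbidden,
				      unsigned int required)
{
	prot &= ~forbidden;	/* e.g. strip RW over rodata */
	prot |= required;	/* e.g. force RW back on over .data/.bss */
	return prot;
}

int main(void)
{
	/* caller requested a non-writable mapping of a .data page; RW is forced back on */
	printf("%#x\n", apply_protections(FAKE_NX, 0, FAKE_RW));
	return 0;
}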
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
393 | { | 404 | { |
394 | unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; | 405 | unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; |
395 | pte_t new_pte, old_pte, *tmp; | 406 | pte_t new_pte, old_pte, *tmp; |
396 | pgprot_t old_prot, new_prot; | 407 | pgprot_t old_prot, new_prot, req_prot; |
397 | int i, do_split = 1; | 408 | int i, do_split = 1; |
398 | unsigned int level; | 409 | unsigned int level; |
399 | 410 | ||
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
438 | * We are safe now. Check whether the new pgprot is the same: | 449 | * We are safe now. Check whether the new pgprot is the same: |
439 | */ | 450 | */ |
440 | old_pte = *kpte; | 451 | old_pte = *kpte; |
441 | old_prot = new_prot = pte_pgprot(old_pte); | 452 | old_prot = new_prot = req_prot = pte_pgprot(old_pte); |
442 | 453 | ||
443 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | 454 | pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); |
444 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | 455 | pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); |
445 | 456 | ||
446 | /* | 457 | /* |
447 | * old_pte points to the large page base address. So we need | 458 | * old_pte points to the large page base address. So we need |
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
450 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); | 461 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); |
451 | cpa->pfn = pfn; | 462 | cpa->pfn = pfn; |
452 | 463 | ||
453 | new_prot = static_protections(new_prot, address, pfn); | 464 | new_prot = static_protections(req_prot, address, pfn); |
454 | 465 | ||
455 | /* | 466 | /* |
456 | * We need to check the full range, whether | 467 | * We need to check the full range, whether |
457 | * static_protections() requires a different pgprot for one of | 468 | * static_protections() requires a different pgprot for one of |
458 | * the pages in the range we try to preserve: | 469 | * the pages in the range we try to preserve: |
459 | */ | 470 | */ |
460 | addr = address + PAGE_SIZE; | 471 | addr = address & pmask; |
461 | pfn++; | 472 | pfn = pte_pfn(old_pte); |
462 | for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { | 473 | for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { |
463 | pgprot_t chk_prot = static_protections(new_prot, addr, pfn); | 474 | pgprot_t chk_prot = static_protections(req_prot, addr, pfn); |
464 | 475 | ||
465 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) | 476 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) |
466 | goto out_unlock; | 477 | goto out_unlock; |
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
483 | * that we limited the number of possible pages already to | 494 | * that we limited the number of possible pages already to |
484 | * the number of pages in the large page. | 495 | * the number of pages in the large page. |
485 | */ | 496 | */ |
486 | if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { | 497 | if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { |
487 | /* | 498 | /* |
488 | * The address is aligned and the number of pages | 499 | * The address is aligned and the number of pages |
489 | * covers the full page. | 500 | * covers the full page. |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8be8c7d7bc89..500242d3c96d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
320 | return changed; | 320 | return changed; |
321 | } | 321 | } |
322 | 322 | ||
323 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
324 | int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
325 | unsigned long address, pmd_t *pmdp, | ||
326 | pmd_t entry, int dirty) | ||
327 | { | ||
328 | int changed = !pmd_same(*pmdp, entry); | ||
329 | |||
330 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
331 | |||
332 | if (changed && dirty) { | ||
333 | *pmdp = entry; | ||
334 | pmd_update_defer(vma->vm_mm, address, pmdp); | ||
335 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
336 | } | ||
337 | |||
338 | return changed; | ||
339 | } | ||
340 | #endif | ||
341 | |||
323 | int ptep_test_and_clear_young(struct vm_area_struct *vma, | 342 | int ptep_test_and_clear_young(struct vm_area_struct *vma, |
324 | unsigned long addr, pte_t *ptep) | 343 | unsigned long addr, pte_t *ptep) |
325 | { | 344 | { |
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, | |||
335 | return ret; | 354 | return ret; |
336 | } | 355 | } |
337 | 356 | ||
357 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
358 | int pmdp_test_and_clear_young(struct vm_area_struct *vma, | ||
359 | unsigned long addr, pmd_t *pmdp) | ||
360 | { | ||
361 | int ret = 0; | ||
362 | |||
363 | if (pmd_young(*pmdp)) | ||
364 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, | ||
365 | (unsigned long *)pmdp); | ||
366 | |||
367 | if (ret) | ||
368 | pmd_update(vma->vm_mm, addr, pmdp); | ||
369 | |||
370 | return ret; | ||
371 | } | ||
372 | #endif | ||
373 | |||
338 | int ptep_clear_flush_young(struct vm_area_struct *vma, | 374 | int ptep_clear_flush_young(struct vm_area_struct *vma, |
339 | unsigned long address, pte_t *ptep) | 375 | unsigned long address, pte_t *ptep) |
340 | { | 376 | { |
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, | |||
347 | return young; | 383 | return young; |
348 | } | 384 | } |
349 | 385 | ||
386 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
387 | int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
388 | unsigned long address, pmd_t *pmdp) | ||
389 | { | ||
390 | int young; | ||
391 | |||
392 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
393 | |||
394 | young = pmdp_test_and_clear_young(vma, address, pmdp); | ||
395 | if (young) | ||
396 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
397 | |||
398 | return young; | ||
399 | } | ||
400 | |||
401 | void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
402 | unsigned long address, pmd_t *pmdp) | ||
403 | { | ||
404 | int set; | ||
405 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
406 | set = !test_and_set_bit(_PAGE_BIT_SPLITTING, | ||
407 | (unsigned long *)pmdp); | ||
408 | if (set) { | ||
409 | pmd_update(vma->vm_mm, address, pmdp); | ||
410 | /* need tlb flush only to serialize against gup-fast */ | ||
411 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
412 | } | ||
413 | } | ||
414 | #endif | ||
415 | |||
350 | /** | 416 | /** |
351 | * reserve_top_address - reserves a hole in the top of kernel address space | 417 | * reserve_top_address - reserves a hole in the top of kernel address space |
352 | * @reserve - size of hole to reserve | 418 | * @reserve - size of hole to reserve |
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa34086..410531d3c292 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c | |||
@@ -41,7 +41,7 @@ void __init x86_report_nx(void) | |||
41 | { | 41 | { |
42 | if (!cpu_has_nx) { | 42 | if (!cpu_has_nx) { |
43 | printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " | 43 | printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " |
44 | "missing in CPU or disabled in BIOS!\n"); | 44 | "missing in CPU!\n"); |
45 | } else { | 45 | } else { |
46 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 46 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
47 | if (disable_nx) { | 47 | if (disable_nx) { |
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index a17dffd136c1..ae96e7b8051d 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c | |||
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; | |||
59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ | 59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ |
60 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | 60 | static u8 __initdata apicid_to_pxm[MAX_APICID]; |
61 | 61 | ||
62 | int numa_off __initdata; | ||
63 | int acpi_numa __initdata; | 62 | int acpi_numa __initdata; |
64 | 63 | ||
65 | static __init void bad_srat(void) | 64 | static __init void bad_srat(void) |
@@ -92,6 +91,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) | |||
92 | /* mark this node as "seen" in node bitmap */ | 91 | /* mark this node as "seen" in node bitmap */ |
93 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); | 92 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); |
94 | 93 | ||
94 | /* don't need to check apic_id here, because it is always 8 bits */ | ||
95 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; | 95 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; |
96 | 96 | ||
97 | printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", | 97 | printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..603d285d1daa 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) | |||
134 | } | 134 | } |
135 | 135 | ||
136 | apic_id = pa->apic_id; | 136 | apic_id = pa->apic_id; |
137 | if (apic_id >= MAX_LOCAL_APIC) { | ||
138 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | ||
139 | return; | ||
140 | } | ||
137 | apicid_to_node[apic_id] = node; | 141 | apicid_to_node[apic_id] = node; |
138 | node_set(node, cpu_nodes_parsed); | 142 | node_set(node, cpu_nodes_parsed); |
139 | acpi_numa = 1; | 143 | acpi_numa = 1; |
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | |||
168 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; | 172 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; |
169 | else | 173 | else |
170 | apic_id = pa->apic_id; | 174 | apic_id = pa->apic_id; |
175 | |||
176 | if (apic_id >= MAX_LOCAL_APIC) { | ||
177 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | ||
178 | return; | ||
179 | } | ||
180 | |||
171 | apicid_to_node[apic_id] = node; | 181 | apicid_to_node[apic_id] = node; |
172 | node_set(node, cpu_nodes_parsed); | 182 | node_set(node, cpu_nodes_parsed); |
173 | acpi_numa = 1; | 183 | acpi_numa = 1; |
@@ -339,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) | |||
339 | 349 | ||
340 | void __init acpi_numa_arch_fixup(void) {} | 350 | void __init acpi_numa_arch_fixup(void) {} |
341 | 351 | ||
342 | int __init acpi_get_nodes(struct bootnode *physnodes) | 352 | #ifdef CONFIG_NUMA_EMU |
353 | void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, | ||
354 | unsigned long end) | ||
343 | { | 355 | { |
344 | int i; | 356 | int i; |
345 | int ret = 0; | ||
346 | 357 | ||
347 | for_each_node_mask(i, nodes_parsed) { | 358 | for_each_node_mask(i, nodes_parsed) { |
348 | physnodes[ret].start = nodes[i].start; | 359 | cutoff_node(i, start, end); |
349 | physnodes[ret].end = nodes[i].end; | 360 | physnodes[i].start = nodes[i].start; |
350 | ret++; | 361 | physnodes[i].end = nodes[i].end; |
351 | } | 362 | } |
352 | return ret; | ||
353 | } | 363 | } |
364 | #endif /* CONFIG_NUMA_EMU */ | ||
354 | 365 | ||
355 | /* Use the information discovered above to actually set up the nodes. */ | 366 | /* Use the information discovered above to actually set up the nodes. */ |
356 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | 367 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) |
@@ -495,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | |||
495 | { | 506 | { |
496 | int i, j; | 507 | int i, j; |
497 | 508 | ||
498 | printk(KERN_INFO "Faking PXM affinity for fake nodes on real " | ||
499 | "topology.\n"); | ||
500 | for (i = 0; i < num_nodes; i++) { | 509 | for (i = 0; i < num_nodes; i++) { |
501 | int nid, pxm; | 510 | int nid, pxm; |
502 | 511 | ||
@@ -516,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | |||
516 | fake_apicid_to_node[j] == NUMA_NO_NODE) | 525 | fake_apicid_to_node[j] == NUMA_NO_NODE) |
517 | fake_apicid_to_node[j] = i; | 526 | fake_apicid_to_node[j] = i; |
518 | } | 527 | } |
528 | |||
529 | /* | ||
530 | * If there are apicid-to-node mappings for physical nodes that do not | ||
531 | * have a corresponding emulated node, it should default to a guaranteed | ||
532 | * value. | ||
533 | */ | ||
534 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
535 | if (apicid_to_node[i] != NUMA_NO_NODE && | ||
536 | fake_apicid_to_node[i] == NUMA_NO_NODE) | ||
537 | fake_apicid_to_node[i] = 0; | ||
538 | |||
519 | for (i = 0; i < num_nodes; i++) | 539 | for (i = 0; i < num_nodes; i++) |
520 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); | 540 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); |
521 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | 541 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 49358481c733..6acc724d5d8f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
223 | 223 | ||
224 | static void __cpuinit calculate_tlb_offset(void) | 224 | static void __cpuinit calculate_tlb_offset(void) |
225 | { | 225 | { |
226 | int cpu, node, nr_node_vecs; | 226 | int cpu, node, nr_node_vecs, idx = 0; |
227 | /* | 227 | /* |
228 | * we are changing tlb_vector_offset for each CPU in runtime, but this | 228 | * we are changing tlb_vector_offset for each CPU in runtime, but this |
229 | * will not cause inconsistency, as the write is atomic under X86. we | 229 | * will not cause inconsistency, as the write is atomic under X86. we |
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void) | |||
239 | nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; | 239 | nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; |
240 | 240 | ||
241 | for_each_online_node(node) { | 241 | for_each_online_node(node) { |
242 | int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * | 242 | int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * |
243 | nr_node_vecs; | 243 | nr_node_vecs; |
244 | int cpu_offset = 0; | 244 | int cpu_offset = 0; |
245 | for_each_cpu(cpu, cpumask_of_node(node)) { | 245 | for_each_cpu(cpu, cpumask_of_node(node)) { |
@@ -248,10 +248,11 @@ static void __cpuinit calculate_tlb_offset(void) | |||
248 | cpu_offset++; | 248 | cpu_offset++; |
249 | cpu_offset = cpu_offset % nr_node_vecs; | 249 | cpu_offset = cpu_offset % nr_node_vecs; |
250 | } | 250 | } |
251 | idx++; | ||
251 | } | 252 | } |
252 | } | 253 | } |
253 | 254 | ||
254 | static int tlb_cpuhp_notify(struct notifier_block *n, | 255 | static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, |
255 | unsigned long action, void *hcpu) | 256 | unsigned long action, void *hcpu) |
256 | { | 257 | { |
257 | switch (action & 0xf) { | 258 | switch (action & 0xf) { |
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2d49d4e19a36..72cbec14d783 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c | |||
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) | |||
126 | if (!user_mode_vm(regs)) { | 126 | if (!user_mode_vm(regs)) { |
127 | unsigned long stack = kernel_stack_pointer(regs); | 127 | unsigned long stack = kernel_stack_pointer(regs); |
128 | if (depth) | 128 | if (depth) |
129 | dump_trace(NULL, regs, (unsigned long *)stack, 0, | 129 | dump_trace(NULL, regs, (unsigned long *)stack, |
130 | &backtrace_ops, &depth); | 130 | &backtrace_ops, &depth); |
131 | return; | 131 | return; |
132 | } | 132 | } |
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 4e8baad36d37..e2b7b0c06cdf 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self, | |||
65 | 65 | ||
66 | switch (val) { | 66 | switch (val) { |
67 | case DIE_NMI: | 67 | case DIE_NMI: |
68 | case DIE_NMI_IPI: | ||
69 | if (ctr_running) | 68 | if (ctr_running) |
70 | model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); | 69 | model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); |
71 | else if (!nmi_enabled) | 70 | else if (!nmi_enabled) |
@@ -143,7 +142,7 @@ static inline int has_mux(void) | |||
143 | 142 | ||
144 | inline int op_x86_phys_to_virt(int phys) | 143 | inline int op_x86_phys_to_virt(int phys) |
145 | { | 144 | { |
146 | return __get_cpu_var(switch_index) + phys; | 145 | return __this_cpu_read(switch_index) + phys; |
147 | } | 146 | } |
148 | 147 | ||
149 | inline int op_x86_virt_to_phys(int virt) | 148 | inline int op_x86_virt_to_phys(int virt) |
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy) | |||
361 | static struct notifier_block profile_exceptions_nb = { | 360 | static struct notifier_block profile_exceptions_nb = { |
362 | .notifier_call = profile_exceptions_notify, | 361 | .notifier_call = profile_exceptions_notify, |
363 | .next = NULL, | 362 | .next = NULL, |
364 | .priority = 2 | 363 | .priority = NMI_LOCAL_LOW_PRIOR, |
365 | }; | 364 | }; |
366 | 365 | ||
367 | static void nmi_cpu_restore_registers(struct op_msrs *msrs) | 366 | static void nmi_cpu_restore_registers(struct op_msrs *msrs) |
@@ -732,6 +731,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
732 | case 0x14: | 731 | case 0x14: |
733 | cpu_type = "x86-64/family14h"; | 732 | cpu_type = "x86-64/family14h"; |
734 | break; | 733 | break; |
734 | case 0x15: | ||
735 | cpu_type = "x86-64/family15h"; | ||
736 | break; | ||
735 | default: | 737 | default: |
736 | return -ENODEV; | 738 | return -ENODEV; |
737 | } | 739 | } |
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c index e3ecb71b5790..720bf5a53c51 100644 --- a/arch/x86/oprofile/nmi_timer_int.c +++ b/arch/x86/oprofile/nmi_timer_int.c | |||
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self, | |||
38 | static struct notifier_block profile_timer_exceptions_nb = { | 38 | static struct notifier_block profile_timer_exceptions_nb = { |
39 | .notifier_call = profile_timer_exceptions_notify, | 39 | .notifier_call = profile_timer_exceptions_notify, |
40 | .next = NULL, | 40 | .next = NULL, |
41 | .priority = 0 | 41 | .priority = NMI_LOW_PRIOR, |
42 | }; | 42 | }; |
43 | 43 | ||
44 | static int timer_start(void) | 44 | static int timer_start(void) |
@@ -58,9 +58,6 @@ static void timer_stop(void) | |||
58 | 58 | ||
59 | int __init op_nmi_timer_init(struct oprofile_operations *ops) | 59 | int __init op_nmi_timer_init(struct oprofile_operations *ops) |
60 | { | 60 | { |
61 | if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0)) | ||
62 | return -ENODEV; | ||
63 | |||
64 | ops->start = timer_start; | 61 | ops->start = timer_start; |
65 | ops->stop = timer_stop; | 62 | ops->stop = timer_stop; |
66 | ops->cpu_type = "timer"; | 63 | ops->cpu_type = "timer"; |
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index a011bcc0f943..c3b8e24f2b16 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c | |||
@@ -29,11 +29,12 @@ | |||
29 | #include "op_x86_model.h" | 29 | #include "op_x86_model.h" |
30 | #include "op_counter.h" | 30 | #include "op_counter.h" |
31 | 31 | ||
32 | #define NUM_COUNTERS 4 | 32 | #define NUM_COUNTERS 4 |
33 | #define NUM_COUNTERS_F15H 6 | ||
33 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | 34 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX |
34 | #define NUM_VIRT_COUNTERS 32 | 35 | #define NUM_VIRT_COUNTERS 32 |
35 | #else | 36 | #else |
36 | #define NUM_VIRT_COUNTERS NUM_COUNTERS | 37 | #define NUM_VIRT_COUNTERS 0 |
37 | #endif | 38 | #endif |
38 | 39 | ||
39 | #define OP_EVENT_MASK 0x0FFF | 40 | #define OP_EVENT_MASK 0x0FFF |
@@ -41,7 +42,8 @@ | |||
41 | 42 | ||
42 | #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) | 43 | #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) |
43 | 44 | ||
44 | static unsigned long reset_value[NUM_VIRT_COUNTERS]; | 45 | static int num_counters; |
46 | static unsigned long reset_value[OP_MAX_COUNTER]; | ||
45 | 47 | ||
46 | #define IBS_FETCH_SIZE 6 | 48 | #define IBS_FETCH_SIZE 6 |
47 | #define IBS_OP_SIZE 12 | 49 | #define IBS_OP_SIZE 12 |
@@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, | |||
387 | int i; | 389 | int i; |
388 | 390 | ||
389 | /* enable active counters */ | 391 | /* enable active counters */ |
390 | for (i = 0; i < NUM_COUNTERS; ++i) { | 392 | for (i = 0; i < num_counters; ++i) { |
391 | int virt = op_x86_phys_to_virt(i); | 393 | int virt = op_x86_phys_to_virt(i); |
392 | if (!reset_value[virt]) | 394 | if (!reset_value[virt]) |
393 | continue; | 395 | continue; |
@@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) | |||
406 | { | 408 | { |
407 | int i; | 409 | int i; |
408 | 410 | ||
409 | for (i = 0; i < NUM_COUNTERS; ++i) { | 411 | for (i = 0; i < num_counters; ++i) { |
410 | if (!msrs->counters[i].addr) | 412 | if (!msrs->counters[i].addr) |
411 | continue; | 413 | continue; |
412 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | 414 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); |
@@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) | |||
418 | { | 420 | { |
419 | int i; | 421 | int i; |
420 | 422 | ||
421 | for (i = 0; i < NUM_COUNTERS; i++) { | 423 | for (i = 0; i < num_counters; i++) { |
422 | if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) | 424 | if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) |
423 | goto fail; | 425 | goto fail; |
424 | if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { | 426 | if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { |
@@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) | |||
426 | goto fail; | 428 | goto fail; |
427 | } | 429 | } |
428 | /* both registers must be reserved */ | 430 | /* both registers must be reserved */ |
429 | msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; | 431 | if (num_counters == NUM_COUNTERS_F15H) { |
430 | msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; | 432 | msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); |
433 | msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); | ||
434 | } else { | ||
435 | msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; | ||
436 | msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; | ||
437 | } | ||
431 | continue; | 438 | continue; |
432 | fail: | 439 | fail: |
433 | if (!counter_config[i].enabled) | 440 | if (!counter_config[i].enabled) |
@@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, | |||
447 | int i; | 454 | int i; |
448 | 455 | ||
449 | /* setup reset_value */ | 456 | /* setup reset_value */ |
450 | for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { | 457 | for (i = 0; i < OP_MAX_COUNTER; ++i) { |
451 | if (counter_config[i].enabled | 458 | if (counter_config[i].enabled |
452 | && msrs->counters[op_x86_virt_to_phys(i)].addr) | 459 | && msrs->counters[op_x86_virt_to_phys(i)].addr) |
453 | reset_value[i] = counter_config[i].count; | 460 | reset_value[i] = counter_config[i].count; |
@@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, | |||
456 | } | 463 | } |
457 | 464 | ||
458 | /* clear all counters */ | 465 | /* clear all counters */ |
459 | for (i = 0; i < NUM_COUNTERS; ++i) { | 466 | for (i = 0; i < num_counters; ++i) { |
460 | if (!msrs->controls[i].addr) | 467 | if (!msrs->controls[i].addr) |
461 | continue; | 468 | continue; |
462 | rdmsrl(msrs->controls[i].addr, val); | 469 | rdmsrl(msrs->controls[i].addr, val); |
@@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, | |||
472 | } | 479 | } |
473 | 480 | ||
474 | /* enable active counters */ | 481 | /* enable active counters */ |
475 | for (i = 0; i < NUM_COUNTERS; ++i) { | 482 | for (i = 0; i < num_counters; ++i) { |
476 | int virt = op_x86_phys_to_virt(i); | 483 | int virt = op_x86_phys_to_virt(i); |
477 | if (!reset_value[virt]) | 484 | if (!reset_value[virt]) |
478 | continue; | 485 | continue; |
@@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, | |||
503 | u64 val; | 510 | u64 val; |
504 | int i; | 511 | int i; |
505 | 512 | ||
506 | for (i = 0; i < NUM_COUNTERS; ++i) { | 513 | for (i = 0; i < num_counters; ++i) { |
507 | int virt = op_x86_phys_to_virt(i); | 514 | int virt = op_x86_phys_to_virt(i); |
508 | if (!reset_value[virt]) | 515 | if (!reset_value[virt]) |
509 | continue; | 516 | continue; |
@@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs) | |||
526 | u64 val; | 533 | u64 val; |
527 | int i; | 534 | int i; |
528 | 535 | ||
529 | for (i = 0; i < NUM_COUNTERS; ++i) { | 536 | for (i = 0; i < num_counters; ++i) { |
530 | if (!reset_value[op_x86_phys_to_virt(i)]) | 537 | if (!reset_value[op_x86_phys_to_virt(i)]) |
531 | continue; | 538 | continue; |
532 | rdmsrl(msrs->controls[i].addr, val); | 539 | rdmsrl(msrs->controls[i].addr, val); |
@@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) | |||
546 | * Subtle: stop on all counters to avoid race with setting our | 553 | * Subtle: stop on all counters to avoid race with setting our |
547 | * pm callback | 554 | * pm callback |
548 | */ | 555 | */ |
549 | for (i = 0; i < NUM_COUNTERS; ++i) { | 556 | for (i = 0; i < num_counters; ++i) { |
550 | if (!reset_value[op_x86_phys_to_virt(i)]) | 557 | if (!reset_value[op_x86_phys_to_virt(i)]) |
551 | continue; | 558 | continue; |
552 | rdmsrl(msrs->controls[i].addr, val); | 559 | rdmsrl(msrs->controls[i].addr, val); |
@@ -603,6 +610,7 @@ static int force_ibs_eilvt_setup(void) | |||
603 | ret = setup_ibs_ctl(i); | 610 | ret = setup_ibs_ctl(i); |
604 | if (ret) | 611 | if (ret) |
605 | return ret; | 612 | return ret; |
613 | pr_err(FW_BUG "using offset %d for IBS interrupts\n", i); | ||
606 | return 0; | 614 | return 0; |
607 | } | 615 | } |
608 | 616 | ||
@@ -630,21 +638,29 @@ static int __init_ibs_nmi(void) | |||
630 | return 0; | 638 | return 0; |
631 | } | 639 | } |
632 | 640 | ||
633 | /* initialize the APIC for the IBS interrupts if available */ | 641 | /* |
642 | * check and reserve APIC extended interrupt LVT offset for IBS if | ||
643 | * available | ||
644 | * | ||
645 | * init_ibs() implicitly performs cpu-local operations, so pin this | ||
646 | * thread to its current CPU | ||
647 | */ | ||
648 | |||
634 | static void init_ibs(void) | 649 | static void init_ibs(void) |
635 | { | 650 | { |
636 | ibs_caps = get_ibs_caps(); | 651 | preempt_disable(); |
637 | 652 | ||
653 | ibs_caps = get_ibs_caps(); | ||
638 | if (!ibs_caps) | 654 | if (!ibs_caps) |
639 | return; | 655 | goto out; |
640 | 656 | ||
641 | if (__init_ibs_nmi()) { | 657 | if (__init_ibs_nmi() < 0) |
642 | ibs_caps = 0; | 658 | ibs_caps = 0; |
643 | return; | 659 | else |
644 | } | 660 | printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); |
645 | 661 | ||
646 | printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", | 662 | out: |
647 | (unsigned)ibs_caps); | 663 | preempt_enable(); |
648 | } | 664 | } |
649 | 665 | ||
650 | static int (*create_arch_files)(struct super_block *sb, struct dentry *root); | 666 | static int (*create_arch_files)(struct super_block *sb, struct dentry *root); |
@@ -698,18 +714,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) | |||
698 | return 0; | 714 | return 0; |
699 | } | 715 | } |
700 | 716 | ||
717 | struct op_x86_model_spec op_amd_spec; | ||
718 | |||
701 | static int op_amd_init(struct oprofile_operations *ops) | 719 | static int op_amd_init(struct oprofile_operations *ops) |
702 | { | 720 | { |
703 | init_ibs(); | 721 | init_ibs(); |
704 | create_arch_files = ops->create_files; | 722 | create_arch_files = ops->create_files; |
705 | ops->create_files = setup_ibs_files; | 723 | ops->create_files = setup_ibs_files; |
724 | |||
725 | if (boot_cpu_data.x86 == 0x15) { | ||
726 | num_counters = NUM_COUNTERS_F15H; | ||
727 | } else { | ||
728 | num_counters = NUM_COUNTERS; | ||
729 | } | ||
730 | |||
731 | op_amd_spec.num_counters = num_counters; | ||
732 | op_amd_spec.num_controls = num_counters; | ||
733 | op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); | ||
734 | |||
706 | return 0; | 735 | return 0; |
707 | } | 736 | } |
708 | 737 | ||
709 | struct op_x86_model_spec op_amd_spec = { | 738 | struct op_x86_model_spec op_amd_spec = { |
710 | .num_counters = NUM_COUNTERS, | 739 | /* num_counters/num_controls filled in at runtime */ |
711 | .num_controls = NUM_COUNTERS, | ||
712 | .num_virt_counters = NUM_VIRT_COUNTERS, | ||
713 | .reserved = MSR_AMD_EVENTSEL_RESERVED, | 740 | .reserved = MSR_AMD_EVENTSEL_RESERVED, |
714 | .event_mask = OP_EVENT_MASK, | 741 | .event_mask = OP_EVENT_MASK, |
715 | .init = op_amd_init, | 742 | .init = op_amd_init, |
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 182558dd5515..9fadec074142 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/oprofile.h> | 11 | #include <linux/oprofile.h> |
12 | #include <linux/smp.h> | 12 | #include <linux/smp.h> |
13 | #include <linux/ptrace.h> | 13 | #include <linux/ptrace.h> |
14 | #include <linux/nmi.h> | 14 | #include <asm/nmi.h> |
15 | #include <asm/msr.h> | 15 | #include <asm/msr.h> |
16 | #include <asm/fixmap.h> | 16 | #include <asm/fixmap.h> |
17 | #include <asm/apic.h> | 17 | #include <asm/apic.h> |
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index d769cda54082..94b745045e45 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c | |||
@@ -95,8 +95,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, | |||
95 | * counter width: | 95 | * counter width: |
96 | */ | 96 | */ |
97 | if (!(eax.split.version_id == 0 && | 97 | if (!(eax.split.version_id == 0 && |
98 | current_cpu_data.x86 == 6 && | 98 | __this_cpu_read(cpu_info.x86) == 6 && |
99 | current_cpu_data.x86_model == 15)) { | 99 | __this_cpu_read(cpu_info.x86_model) == 15)) { |
100 | 100 | ||
101 | if (counter_width < eax.split.bit_width) | 101 | if (counter_width < eax.split.bit_width) |
102 | counter_width = eax.split.bit_width; | 102 | counter_width = eax.split.bit_width; |
@@ -235,8 +235,8 @@ static void arch_perfmon_setup_counters(void) | |||
235 | eax.full = cpuid_eax(0xa); | 235 | eax.full = cpuid_eax(0xa); |
236 | 236 | ||
237 | /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ | 237 | /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ |
238 | if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && | 238 | if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 && |
239 | current_cpu_data.x86_model == 15) { | 239 | __this_cpu_read(cpu_info.x86_model) == 15) { |
240 | eax.split.version_id = 2; | 240 | eax.split.version_id = 2; |
241 | eax.split.num_counters = 2; | 241 | eax.split.num_counters = 2; |
242 | eax.split.bit_width = 40; | 242 | eax.split.bit_width = 40; |
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 15466c096ba5..0972315c3860 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c | |||
@@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) | |||
138 | struct acpi_resource_address64 addr; | 138 | struct acpi_resource_address64 addr; |
139 | acpi_status status; | 139 | acpi_status status; |
140 | unsigned long flags; | 140 | unsigned long flags; |
141 | struct resource *root, *conflict; | ||
142 | u64 start, end; | 141 | u64 start, end; |
143 | 142 | ||
144 | status = resource_to_addr(acpi_res, &addr); | 143 | status = resource_to_addr(acpi_res, &addr); |
@@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data) | |||
146 | return AE_OK; | 145 | return AE_OK; |
147 | 146 | ||
148 | if (addr.resource_type == ACPI_MEMORY_RANGE) { | 147 | if (addr.resource_type == ACPI_MEMORY_RANGE) { |
149 | root = &iomem_resource; | ||
150 | flags = IORESOURCE_MEM; | 148 | flags = IORESOURCE_MEM; |
151 | if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) | 149 | if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) |
152 | flags |= IORESOURCE_PREFETCH; | 150 | flags |= IORESOURCE_PREFETCH; |
153 | } else if (addr.resource_type == ACPI_IO_RANGE) { | 151 | } else if (addr.resource_type == ACPI_IO_RANGE) { |
154 | root = &ioport_resource; | ||
155 | flags = IORESOURCE_IO; | 152 | flags = IORESOURCE_IO; |
156 | } else | 153 | } else |
157 | return AE_OK; | 154 | return AE_OK; |
@@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data) | |||
172 | return AE_OK; | 169 | return AE_OK; |
173 | } | 170 | } |
174 | 171 | ||
175 | conflict = insert_resource_conflict(root, res); | 172 | info->res_num++; |
176 | if (conflict) { | 173 | if (addr.translation_offset) |
177 | dev_err(&info->bridge->dev, | 174 | dev_info(&info->bridge->dev, "host bridge window %pR " |
178 | "address space collision: host bridge window %pR " | 175 | "(PCI address [%#llx-%#llx])\n", |
179 | "conflicts with %s %pR\n", | 176 | res, res->start - addr.translation_offset, |
180 | res, conflict->name, conflict); | 177 | res->end - addr.translation_offset); |
181 | } else { | 178 | else |
182 | pci_bus_add_resource(info->bus, res, 0); | 179 | dev_info(&info->bridge->dev, "host bridge window %pR\n", res); |
183 | info->res_num++; | 180 | |
184 | if (addr.translation_offset) | 181 | return AE_OK; |
185 | dev_info(&info->bridge->dev, "host bridge window %pR " | 182 | } |
186 | "(PCI address [%#llx-%#llx])\n", | 183 | |
187 | res, res->start - addr.translation_offset, | 184 | static bool resource_contains(struct resource *res, resource_size_t point) |
188 | res->end - addr.translation_offset); | 185 | { |
186 | if (res->start <= point && point <= res->end) | ||
187 | return true; | ||
188 | return false; | ||
189 | } | ||
190 | |||
191 | static void coalesce_windows(struct pci_root_info *info, int type) | ||
192 | { | ||
193 | int i, j; | ||
194 | struct resource *res1, *res2; | ||
195 | |||
196 | for (i = 0; i < info->res_num; i++) { | ||
197 | res1 = &info->res[i]; | ||
198 | if (!(res1->flags & type)) | ||
199 | continue; | ||
200 | |||
201 | for (j = i + 1; j < info->res_num; j++) { | ||
202 | res2 = &info->res[j]; | ||
203 | if (!(res2->flags & type)) | ||
204 | continue; | ||
205 | |||
206 | /* | ||
207 | * I don't like throwing away windows because then | ||
208 | * our resources no longer match the ACPI _CRS, but | ||
209 | * the kernel resource tree doesn't allow overlaps. | ||
210 | */ | ||
211 | if (resource_contains(res1, res2->start) || | ||
212 | resource_contains(res1, res2->end) || | ||
213 | resource_contains(res2, res1->start) || | ||
214 | resource_contains(res2, res1->end)) { | ||
215 | res1->start = min(res1->start, res2->start); | ||
216 | res1->end = max(res1->end, res2->end); | ||
217 | dev_info(&info->bridge->dev, | ||
218 | "host bridge window expanded to %pR; %pR ignored\n", | ||
219 | res1, res2); | ||
220 | res2->flags = 0; | ||
221 | } | ||
222 | } | ||
223 | } | ||
224 | } | ||
225 | |||
226 | static void add_resources(struct pci_root_info *info) | ||
227 | { | ||
228 | int i; | ||
229 | struct resource *res, *root, *conflict; | ||
230 | |||
231 | if (!pci_use_crs) | ||
232 | return; | ||
233 | |||
234 | coalesce_windows(info, IORESOURCE_MEM); | ||
235 | coalesce_windows(info, IORESOURCE_IO); | ||
236 | |||
237 | for (i = 0; i < info->res_num; i++) { | ||
238 | res = &info->res[i]; | ||
239 | |||
240 | if (res->flags & IORESOURCE_MEM) | ||
241 | root = &iomem_resource; | ||
242 | else if (res->flags & IORESOURCE_IO) | ||
243 | root = &ioport_resource; | ||
189 | else | 244 | else |
190 | dev_info(&info->bridge->dev, | 245 | continue; |
191 | "host bridge window %pR\n", res); | 246 | |
247 | conflict = insert_resource_conflict(root, res); | ||
248 | if (conflict) | ||
249 | dev_err(&info->bridge->dev, | ||
250 | "address space collision: host bridge window %pR " | ||
251 | "conflicts with %s %pR\n", | ||
252 | res, conflict->name, conflict); | ||
253 | else | ||
254 | pci_bus_add_resource(info->bus, res, 0); | ||
192 | } | 255 | } |
193 | return AE_OK; | ||
194 | } | 256 | } |
195 | 257 | ||
196 | static void | 258 | static void |
@@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum, | |||
224 | acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, | 286 | acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, |
225 | &info); | 287 | &info); |
226 | 288 | ||
289 | add_resources(&info); | ||
227 | return; | 290 | return; |
228 | 291 | ||
229 | name_alloc_fail: | 292 | name_alloc_fail: |
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index fc1e8fe07e5c..e27dffbbb1a7 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> |
5 | #include <linux/range.h> | 5 | #include <linux/range.h> |
6 | 6 | ||
7 | #include <asm/amd_nb.h> | ||
7 | #include <asm/pci_x86.h> | 8 | #include <asm/pci_x86.h> |
8 | 9 | ||
9 | #include <asm/pci-direct.h> | 10 | #include <asm/pci-direct.h> |
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = { | |||
378 | .notifier_call = amd_cpu_notify, | 379 | .notifier_call = amd_cpu_notify, |
379 | }; | 380 | }; |
380 | 381 | ||
382 | static void __init pci_enable_pci_io_ecs(void) | ||
383 | { | ||
384 | #ifdef CONFIG_AMD_NB | ||
385 | unsigned int i, n; | ||
386 | |||
387 | for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) { | ||
388 | u8 bus = amd_nb_bus_dev_ranges[i].bus; | ||
389 | u8 slot = amd_nb_bus_dev_ranges[i].dev_base; | ||
390 | u8 limit = amd_nb_bus_dev_ranges[i].dev_limit; | ||
391 | |||
392 | for (; slot < limit; ++slot) { | ||
393 | u32 val = read_pci_config(bus, slot, 3, 0); | ||
394 | |||
395 | if (!early_is_amd_nb(val)) | ||
396 | continue; | ||
397 | |||
398 | val = read_pci_config(bus, slot, 3, 0x8c); | ||
399 | if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) { | ||
400 | val |= ENABLE_CF8_EXT_CFG >> 32; | ||
401 | write_pci_config(bus, slot, 3, 0x8c, val); | ||
402 | } | ||
403 | ++n; | ||
404 | } | ||
405 | } | ||
406 | pr_info("Extended Config Space enabled on %u nodes\n", n); | ||
407 | #endif | ||
408 | } | ||
409 | |||
381 | static int __init pci_io_ecs_init(void) | 410 | static int __init pci_io_ecs_init(void) |
382 | { | 411 | { |
383 | int cpu; | 412 | int cpu; |
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void) | |||
386 | if (boot_cpu_data.x86 < 0x10) | 415 | if (boot_cpu_data.x86 < 0x10) |
387 | return 0; | 416 | return 0; |
388 | 417 | ||
418 | /* Try the PCI method first. */ | ||
419 | if (early_pci_allowed()) | ||
420 | pci_enable_pci_io_ecs(); | ||
421 | |||
389 | register_cpu_notifier(&amd_cpu_notifier); | 422 | register_cpu_notifier(&amd_cpu_notifier); |
390 | for_each_online_cpu(cpu) | 423 | for_each_online_cpu(cpu) |
391 | amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, | 424 | amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, |
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index 0846a5bbbfbd..ab8269b0da29 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c | |||
@@ -9,6 +9,7 @@ | |||
9 | * option) any later version. | 9 | * option) any later version. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/acpi.h> | ||
12 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
13 | #include <linux/dmi.h> | 14 | #include <linux/dmi.h> |
14 | #include <linux/pci.h> | 15 | #include <linux/pci.h> |
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev) | |||
25 | u8 fbus, lbus; | 26 | u8 fbus, lbus; |
26 | int i; | 27 | int i; |
27 | 28 | ||
29 | #ifdef CONFIG_ACPI | ||
28 | /* | 30 | /* |
29 | * The x86_pci_root_bus_res_quirks() function already refuses to use | 31 | * We should get host bridge information from ACPI unless the BIOS |
30 | * this information if ACPI _CRS was used. Therefore, we don't bother | 32 | * doesn't support it. |
31 | * checking if ACPI is enabled, and just generate the information | ||
32 | * for both the ACPI _CRS and no ACPI cases. | ||
33 | */ | 33 | */ |
34 | if (acpi_os_get_root_pointer()) | ||
35 | return; | ||
36 | #endif | ||
34 | 37 | ||
35 | info = &pci_root_info[pci_root_num]; | 38 | info = &pci_root_info[pci_root_num]; |
36 | pci_root_num++; | 39 | pci_root_num++; |
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index f7c8a399978c..5fe75026ecc2 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c | |||
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | | |||
22 | 22 | ||
23 | unsigned int pci_early_dump_regs; | 23 | unsigned int pci_early_dump_regs; |
24 | static int pci_bf_sort; | 24 | static int pci_bf_sort; |
25 | static int smbios_type_b1_flag; | ||
25 | int pci_routeirq; | 26 | int pci_routeirq; |
26 | int noioapicquirk; | 27 | int noioapicquirk; |
27 | #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS | 28 | #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS |
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d) | |||
185 | return 0; | 186 | return 0; |
186 | } | 187 | } |
187 | 188 | ||
189 | static void __devinit read_dmi_type_b1(const struct dmi_header *dm, | ||
190 | void *private_data) | ||
191 | { | ||
192 | u8 *d = (u8 *)dm + 4; | ||
193 | |||
194 | if (dm->type != 0xB1) | ||
195 | return; | ||
196 | switch (((*(u32 *)d) >> 9) & 0x03) { | ||
197 | case 0x00: | ||
198 | printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n"); | ||
199 | break; | ||
200 | case 0x01: /* set pci=bfsort */ | ||
201 | smbios_type_b1_flag = 1; | ||
202 | break; | ||
203 | case 0x02: /* do not set pci=bfsort */ | ||
204 | smbios_type_b1_flag = 2; | ||
205 | break; | ||
206 | default: | ||
207 | break; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | static int __devinit find_sort_method(const struct dmi_system_id *d) | ||
212 | { | ||
213 | dmi_walk(read_dmi_type_b1, NULL); | ||
214 | |||
215 | if (smbios_type_b1_flag == 1) { | ||
216 | set_bf_sort(d); | ||
217 | return 0; | ||
218 | } | ||
219 | return -1; | ||
220 | } | ||
221 | |||
188 | /* | 222 | /* |
189 | * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) | 223 | * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) |
190 | */ | 224 | */ |
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { | |||
213 | }, | 247 | }, |
214 | #endif /* __i386__ */ | 248 | #endif /* __i386__ */ |
215 | { | 249 | { |
250 | .callback = find_sort_method, | ||
251 | .ident = "Dell System", | ||
252 | .matches = { | ||
253 | DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), | ||
254 | }, | ||
255 | }, | ||
256 | { | ||
216 | .callback = set_bf_sort, | 257 | .callback = set_bf_sort, |
217 | .ident = "Dell PowerEdge 1950", | 258 | .ident = "Dell PowerEdge 1950", |
218 | .matches = { | 259 | .matches = { |
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index c4bb261c106e..b1805b78842f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c | |||
@@ -65,21 +65,13 @@ pcibios_align_resource(void *data, const struct resource *res, | |||
65 | resource_size_t size, resource_size_t align) | 65 | resource_size_t size, resource_size_t align) |
66 | { | 66 | { |
67 | struct pci_dev *dev = data; | 67 | struct pci_dev *dev = data; |
68 | resource_size_t start = round_down(res->end - size + 1, align); | 68 | resource_size_t start = res->start; |
69 | 69 | ||
70 | if (res->flags & IORESOURCE_IO) { | 70 | if (res->flags & IORESOURCE_IO) { |
71 | 71 | if (skip_isa_ioresource_align(dev)) | |
72 | /* | 72 | return start; |
73 | * If we're avoiding ISA aliases, the largest contiguous I/O | 73 | if (start & 0x300) |
74 | * port space is 256 bytes. Clearing bits 9 and 10 preserves | 74 | start = (start + 0x3ff) & ~0x3ff; |
75 | * all 256-byte and smaller alignments, so the result will | ||
76 | * still be correctly aligned. | ||
77 | */ | ||
78 | if (!skip_isa_ioresource_align(dev)) | ||
79 | start &= ~0x300; | ||
80 | } else if (res->flags & IORESOURCE_MEM) { | ||
81 | if (start < BIOS_END) | ||
82 | start = res->end; /* fail; no space */ | ||
83 | } | 75 | } |
84 | return start; | 76 | return start; |
85 | } | 77 | } |
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9f9bfb705cf9..87e6c8323117 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c | |||
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route | |||
589 | case PCI_DEVICE_ID_INTEL_ICH10_1: | 589 | case PCI_DEVICE_ID_INTEL_ICH10_1: |
590 | case PCI_DEVICE_ID_INTEL_ICH10_2: | 590 | case PCI_DEVICE_ID_INTEL_ICH10_2: |
591 | case PCI_DEVICE_ID_INTEL_ICH10_3: | 591 | case PCI_DEVICE_ID_INTEL_ICH10_3: |
592 | case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: | 592 | case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0: |
593 | case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1: | ||
593 | r->name = "PIIX/ICH"; | 594 | r->name = "PIIX/ICH"; |
594 | r->get = pirq_piix_get; | 595 | r->get = pirq_piix_get; |
595 | r->set = pirq_piix_set; | 596 | r->set = pirq_piix_set; |
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 2492d165096a..a5f7d0d63de0 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
10 | #include <asm/pci_x86.h> | 10 | #include <asm/pci_x86.h> |
11 | #include <asm/pci-functions.h> | 11 | #include <asm/pci-functions.h> |
12 | #include <asm/cacheflush.h> | ||
12 | 13 | ||
13 | /* BIOS32 signature: "_32_" */ | 14 | /* BIOS32 signature: "_32_" */ |
14 | #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) | 15 | #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) |
@@ -25,6 +26,27 @@ | |||
25 | #define PCIBIOS_HW_TYPE1_SPEC 0x10 | 26 | #define PCIBIOS_HW_TYPE1_SPEC 0x10 |
26 | #define PCIBIOS_HW_TYPE2_SPEC 0x20 | 27 | #define PCIBIOS_HW_TYPE2_SPEC 0x20 |
27 | 28 | ||
29 | int pcibios_enabled; | ||
30 | |||
31 | /* According to the BIOS specification at: | ||
32 | * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could | ||
33 | * restrict the x zone to some pages and make it ro. But this may be | ||
34 | * broken on some BIOSes and is complex to handle with static_protections. | ||
35 | * We could make the 0xe0000-0x100000 range rox, but this can break | ||
36 | * some ISA mapping. | ||
37 | * | ||
38 | * So we leave an rw and x hole when pcibios is used. This shouldn't | ||
39 | * happen on modern systems with mmconfig, and if you don't want it | ||
40 | * you could disable pcibios... | ||
41 | */ | ||
42 | static inline void set_bios_x(void) | ||
43 | { | ||
44 | pcibios_enabled = 1; | ||
45 | set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); | ||
46 | if (__supported_pte_mask & _PAGE_NX) | ||
47 | printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); | ||
48 | } | ||
49 | |||
28 | /* | 50 | /* |
29 | * This is the standard structure used to identify the entry point | 51 | * This is the standard structure used to identify the entry point |
30 | * to the BIOS32 Service Directory, as documented in | 52 | * to the BIOS32 Service Directory, as documented in |
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) | |||
332 | DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", | 354 | DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", |
333 | bios32_entry); | 355 | bios32_entry); |
334 | bios32_indirect.address = bios32_entry + PAGE_OFFSET; | 356 | bios32_indirect.address = bios32_entry + PAGE_OFFSET; |
357 | set_bios_x(); | ||
335 | if (check_pcibios()) | 358 | if (check_pcibios()) |
336 | return &pci_bios_access; | 359 | return &pci_bios_access; |
337 | } | 360 | } |
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 117f5b8daf75..25cd4a07d09f 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c | |||
@@ -70,6 +70,9 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, | |||
70 | struct xen_pci_frontend_ops *xen_pci_frontend; | 70 | struct xen_pci_frontend_ops *xen_pci_frontend; |
71 | EXPORT_SYMBOL_GPL(xen_pci_frontend); | 71 | EXPORT_SYMBOL_GPL(xen_pci_frontend); |
72 | 72 | ||
73 | #define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ | ||
74 | MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) | ||
75 | |||
73 | static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, | 76 | static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, |
74 | struct msi_msg *msg) | 77 | struct msi_msg *msg) |
75 | { | 78 | { |
@@ -83,12 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, | |||
83 | MSI_ADDR_REDIRECTION_CPU | | 86 | MSI_ADDR_REDIRECTION_CPU | |
84 | MSI_ADDR_DEST_ID(pirq); | 87 | MSI_ADDR_DEST_ID(pirq); |
85 | 88 | ||
86 | msg->data = | 89 | msg->data = XEN_PIRQ_MSI_DATA; |
87 | MSI_DATA_TRIGGER_EDGE | | ||
88 | MSI_DATA_LEVEL_ASSERT | | ||
89 | /* delivery mode reserved */ | ||
90 | (3 << 8) | | ||
91 | MSI_DATA_VECTOR(0); | ||
92 | } | 90 | } |
93 | 91 | ||
94 | static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | 92 | static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) |
@@ -98,8 +96,23 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | |||
98 | struct msi_msg msg; | 96 | struct msi_msg msg; |
99 | 97 | ||
100 | list_for_each_entry(msidesc, &dev->msi_list, list) { | 98 | list_for_each_entry(msidesc, &dev->msi_list, list) { |
99 | __read_msi_msg(msidesc, &msg); | ||
100 | pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | | ||
101 | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); | ||
102 | if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { | ||
103 | xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? | ||
104 | "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); | ||
105 | if (irq < 0) | ||
106 | goto error; | ||
107 | ret = set_irq_msi(irq, msidesc); | ||
108 | if (ret < 0) | ||
109 | goto error_while; | ||
110 | printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" | ||
111 | " pirq=%d\n", irq, pirq); | ||
112 | return 0; | ||
113 | } | ||
101 | xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? | 114 | xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? |
102 | "msi-x" : "msi", &irq, &pirq); | 115 | "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); |
103 | if (irq < 0 || pirq < 0) | 116 | if (irq < 0 || pirq < 0) |
104 | goto error; | 117 | goto error; |
105 | printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); | 118 | printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); |
@@ -147,8 +160,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | |||
147 | irq = xen_allocate_pirq(v[i], 0, /* not sharable */ | 160 | irq = xen_allocate_pirq(v[i], 0, /* not sharable */ |
148 | (type == PCI_CAP_ID_MSIX) ? | 161 | (type == PCI_CAP_ID_MSIX) ? |
149 | "pcifront-msi-x" : "pcifront-msi"); | 162 | "pcifront-msi-x" : "pcifront-msi"); |
150 | if (irq < 0) | 163 | if (irq < 0) { |
151 | return -1; | 164 | ret = -1; |
165 | goto free; | ||
166 | } | ||
152 | 167 | ||
153 | ret = set_irq_msi(irq, msidesc); | 168 | ret = set_irq_msi(irq, msidesc); |
154 | if (ret) | 169 | if (ret) |
@@ -164,7 +179,7 @@ error: | |||
164 | if (ret == -ENODEV) | 179 | if (ret == -ENODEV) |
165 | dev_err(&dev->dev, "Xen PCI frontend has not registered" \ | 180 | dev_err(&dev->dev, "Xen PCI frontend has not registered" \ |
166 | " MSI/MSI-X support!\n"); | 181 | " MSI/MSI-X support!\n"); |
167 | 182 | free: | |
168 | kfree(v); | 183 | kfree(v); |
169 | return ret; | 184 | return ret; |
170 | } | 185 | } |
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 65df603622b2..25bfdbb5b130 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c | |||
@@ -103,7 +103,7 @@ struct dw_spi_reg { | |||
103 | static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; | 103 | static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; |
104 | 104 | ||
105 | static u32 *pclk_spi0; | 105 | static u32 *pclk_spi0; |
106 | /* Always contains an accessable address, start with 0 */ | 106 | /* Always contains an accessible address, start with 0 */ |
107 | static struct dw_spi_reg *pspi; | 107 | static struct dw_spi_reg *pspi; |
108 | 108 | ||
109 | static struct kmsg_dumper dw_dumper; | 109 | static struct kmsg_dumper dw_dumper; |
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 4c542c757cb4..5c0207bf959b 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c | |||
@@ -72,32 +72,6 @@ struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; | |||
72 | EXPORT_SYMBOL_GPL(sfi_mrtc_array); | 72 | EXPORT_SYMBOL_GPL(sfi_mrtc_array); |
73 | int sfi_mrtc_num; | 73 | int sfi_mrtc_num; |
74 | 74 | ||
75 | static inline void assign_to_mp_irq(struct mpc_intsrc *m, | ||
76 | struct mpc_intsrc *mp_irq) | ||
77 | { | ||
78 | memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); | ||
79 | } | ||
80 | |||
81 | static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, | ||
82 | struct mpc_intsrc *m) | ||
83 | { | ||
84 | return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); | ||
85 | } | ||
86 | |||
87 | static void save_mp_irq(struct mpc_intsrc *m) | ||
88 | { | ||
89 | int i; | ||
90 | |||
91 | for (i = 0; i < mp_irq_entries; i++) { | ||
92 | if (!mp_irq_cmp(&mp_irqs[i], m)) | ||
93 | return; | ||
94 | } | ||
95 | |||
96 | assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); | ||
97 | if (++mp_irq_entries == MAX_IRQ_SOURCES) | ||
98 | panic("Max # of irq sources exceeded!!\n"); | ||
99 | } | ||
100 | |||
101 | /* parse all the mtimer info to a static mtimer array */ | 75 | /* parse all the mtimer info to a static mtimer array */ |
102 | static int __init sfi_parse_mtmr(struct sfi_table_header *table) | 76 | static int __init sfi_parse_mtmr(struct sfi_table_header *table) |
103 | { | 77 | { |
@@ -131,7 +105,7 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table) | |||
131 | mp_irq.srcbusirq = pentry->irq; /* IRQ */ | 105 | mp_irq.srcbusirq = pentry->irq; /* IRQ */ |
132 | mp_irq.dstapic = MP_APIC_ALL; | 106 | mp_irq.dstapic = MP_APIC_ALL; |
133 | mp_irq.dstirq = pentry->irq; | 107 | mp_irq.dstirq = pentry->irq; |
134 | save_mp_irq(&mp_irq); | 108 | mp_save_irq(&mp_irq); |
135 | } | 109 | } |
136 | 110 | ||
137 | return 0; | 111 | return 0; |
@@ -201,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) | |||
201 | mp_irq.srcbusirq = pentry->irq; /* IRQ */ | 175 | mp_irq.srcbusirq = pentry->irq; /* IRQ */ |
202 | mp_irq.dstapic = MP_APIC_ALL; | 176 | mp_irq.dstapic = MP_APIC_ALL; |
203 | mp_irq.dstirq = pentry->irq; | 177 | mp_irq.dstirq = pentry->irq; |
204 | save_mp_irq(&mp_irq); | 178 | mp_save_irq(&mp_irq); |
205 | } | 179 | } |
206 | return 0; | 180 | return 0; |
207 | } | 181 | } |
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index c31b8fcb5a86..e797428b163b 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile | |||
@@ -1,3 +1,4 @@ | |||
1 | obj-$(CONFIG_OLPC) += olpc.o | 1 | obj-$(CONFIG_OLPC) += olpc.o |
2 | obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o | 2 | obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o |
3 | obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o | 3 | obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o |
4 | obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o | ||
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index f5442c03abc3..127775696d6c 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c | |||
@@ -1,6 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Support for features of the OLPC XO-1 laptop | 2 | * Support for features of the OLPC XO-1 laptop |
3 | * | 3 | * |
4 | * Copyright (C) 2010 Andres Salomon <dilinger@queued.net> | ||
4 | * Copyright (C) 2010 One Laptop per Child | 5 | * Copyright (C) 2010 One Laptop per Child |
5 | * Copyright (C) 2006 Red Hat, Inc. | 6 | * Copyright (C) 2006 Red Hat, Inc. |
6 | * Copyright (C) 2006 Advanced Micro Devices, Inc. | 7 | * Copyright (C) 2006 Advanced Micro Devices, Inc. |
@@ -12,8 +13,6 @@ | |||
12 | */ | 13 | */ |
13 | 14 | ||
14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
15 | #include <linux/pci.h> | ||
16 | #include <linux/pci_ids.h> | ||
17 | #include <linux/platform_device.h> | 16 | #include <linux/platform_device.h> |
18 | #include <linux/pm.h> | 17 | #include <linux/pm.h> |
19 | 18 | ||
@@ -22,9 +21,6 @@ | |||
22 | 21 | ||
23 | #define DRV_NAME "olpc-xo1" | 22 | #define DRV_NAME "olpc-xo1" |
24 | 23 | ||
25 | #define PMS_BAR 4 | ||
26 | #define ACPI_BAR 5 | ||
27 | |||
28 | /* PMC registers (PMS block) */ | 24 | /* PMC registers (PMS block) */ |
29 | #define PM_SCLK 0x10 | 25 | #define PM_SCLK 0x10 |
30 | #define PM_IN_SLPCTL 0x20 | 26 | #define PM_IN_SLPCTL 0x20 |
@@ -57,65 +53,67 @@ static void xo1_power_off(void) | |||
57 | outl(0x00002000, acpi_base + PM1_CNT); | 53 | outl(0x00002000, acpi_base + PM1_CNT); |
58 | } | 54 | } |
59 | 55 | ||
60 | /* Read the base addresses from the PCI BAR info */ | 56 | static int __devinit olpc_xo1_probe(struct platform_device *pdev) |
61 | static int __devinit setup_bases(struct pci_dev *pdev) | ||
62 | { | 57 | { |
63 | int r; | 58 | struct resource *res; |
64 | 59 | ||
65 | r = pci_enable_device_io(pdev); | 60 | /* don't run on non-XOs */ |
66 | if (r) { | 61 | if (!machine_is_olpc()) |
67 | dev_err(&pdev->dev, "can't enable device IO\n"); | 62 | return -ENODEV; |
68 | return r; | ||
69 | } | ||
70 | 63 | ||
71 | r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); | 64 | res = platform_get_resource(pdev, IORESOURCE_IO, 0); |
72 | if (r) { | 65 | if (!res) { |
73 | dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); | 66 | dev_err(&pdev->dev, "can't fetch device resource info\n"); |
74 | return r; | 67 | return -EIO; |
75 | } | 68 | } |
76 | 69 | ||
77 | r = pci_request_region(pdev, PMS_BAR, DRV_NAME); | 70 | if (!request_region(res->start, resource_size(res), DRV_NAME)) { |
78 | if (r) { | 71 | dev_err(&pdev->dev, "can't request region\n"); |
79 | dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); | 72 | return -EIO; |
80 | pci_release_region(pdev, ACPI_BAR); | ||
81 | return r; | ||
82 | } | 73 | } |
83 | 74 | ||
84 | acpi_base = pci_resource_start(pdev, ACPI_BAR); | 75 | if (strcmp(pdev->name, "cs5535-pms") == 0) |
85 | pms_base = pci_resource_start(pdev, PMS_BAR); | 76 | pms_base = res->start; |
77 | else if (strcmp(pdev->name, "cs5535-acpi") == 0) | ||
78 | acpi_base = res->start; | ||
79 | |||
80 | /* If we have both addresses, we can override the poweroff hook */ | ||
81 | if (pms_base && acpi_base) { | ||
82 | pm_power_off = xo1_power_off; | ||
83 | printk(KERN_INFO "OLPC XO-1 support registered\n"); | ||
84 | } | ||
86 | 85 | ||
87 | return 0; | 86 | return 0; |
88 | } | 87 | } |
89 | 88 | ||
90 | static int __devinit olpc_xo1_probe(struct platform_device *pdev) | 89 | static int __devexit olpc_xo1_remove(struct platform_device *pdev) |
91 | { | 90 | { |
92 | struct pci_dev *pcidev; | 91 | struct resource *r; |
93 | int r; | ||
94 | |||
95 | pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, | ||
96 | NULL); | ||
97 | if (!pdev) | ||
98 | return -ENODEV; | ||
99 | |||
100 | r = setup_bases(pcidev); | ||
101 | if (r) | ||
102 | return r; | ||
103 | 92 | ||
104 | pm_power_off = xo1_power_off; | 93 | r = platform_get_resource(pdev, IORESOURCE_IO, 0); |
94 | release_region(r->start, resource_size(r)); | ||
105 | 95 | ||
106 | printk(KERN_INFO "OLPC XO-1 support registered\n"); | 96 | if (strcmp(pdev->name, "cs5535-pms") == 0) |
107 | return 0; | 97 | pms_base = 0; |
108 | } | 98 | else if (strcmp(pdev->name, "cs5535-acpi") == 0) |
99 | acpi_base = 0; | ||
109 | 100 | ||
110 | static int __devexit olpc_xo1_remove(struct platform_device *pdev) | ||
111 | { | ||
112 | pm_power_off = NULL; | 101 | pm_power_off = NULL; |
113 | return 0; | 102 | return 0; |
114 | } | 103 | } |
115 | 104 | ||
116 | static struct platform_driver olpc_xo1_driver = { | 105 | static struct platform_driver cs5535_pms_drv = { |
106 | .driver = { | ||
107 | .name = "cs5535-pms", | ||
108 | .owner = THIS_MODULE, | ||
109 | }, | ||
110 | .probe = olpc_xo1_probe, | ||
111 | .remove = __devexit_p(olpc_xo1_remove), | ||
112 | }; | ||
113 | |||
114 | static struct platform_driver cs5535_acpi_drv = { | ||
117 | .driver = { | 115 | .driver = { |
118 | .name = DRV_NAME, | 116 | .name = "cs5535-acpi", |
119 | .owner = THIS_MODULE, | 117 | .owner = THIS_MODULE, |
120 | }, | 118 | }, |
121 | .probe = olpc_xo1_probe, | 119 | .probe = olpc_xo1_probe, |
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = { | |||
124 | 122 | ||
125 | static int __init olpc_xo1_init(void) | 123 | static int __init olpc_xo1_init(void) |
126 | { | 124 | { |
127 | return platform_driver_register(&olpc_xo1_driver); | 125 | int r; |
126 | |||
127 | r = platform_driver_register(&cs5535_pms_drv); | ||
128 | if (r) | ||
129 | return r; | ||
130 | |||
131 | r = platform_driver_register(&cs5535_acpi_drv); | ||
132 | if (r) | ||
133 | platform_driver_unregister(&cs5535_pms_drv); | ||
134 | |||
135 | return r; | ||
128 | } | 136 | } |
129 | 137 | ||
130 | static void __exit olpc_xo1_exit(void) | 138 | static void __exit olpc_xo1_exit(void) |
131 | { | 139 | { |
132 | platform_driver_unregister(&olpc_xo1_driver); | 140 | platform_driver_unregister(&cs5535_acpi_drv); |
141 | platform_driver_unregister(&cs5535_pms_drv); | ||
133 | } | 142 | } |
134 | 143 | ||
135 | MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); | 144 | MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); |
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c new file mode 100644 index 000000000000..dab874647530 --- /dev/null +++ b/arch/x86/platform/olpc/olpc_dt.c | |||
@@ -0,0 +1,183 @@ | |||
1 | /* | ||
2 | * OLPC-specific OFW device tree support code. | ||
3 | * | ||
4 | * Paul Mackerras August 1996. | ||
5 | * Copyright (C) 1996-2005 Paul Mackerras. | ||
6 | * | ||
7 | * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner. | ||
8 | * {engebret|bergner}@us.ibm.com | ||
9 | * | ||
10 | * Adapted for sparc by David S. Miller davem@davemloft.net | ||
11 | * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net> | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public License | ||
15 | * as published by the Free Software Foundation; either version | ||
16 | * 2 of the License, or (at your option) any later version. | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/bootmem.h> | ||
21 | #include <linux/of.h> | ||
22 | #include <linux/of_pdt.h> | ||
23 | #include <asm/olpc_ofw.h> | ||
24 | |||
25 | static phandle __init olpc_dt_getsibling(phandle node) | ||
26 | { | ||
27 | const void *args[] = { (void *)node }; | ||
28 | void *res[] = { &node }; | ||
29 | |||
30 | if ((s32)node == -1) | ||
31 | return 0; | ||
32 | |||
33 | if (olpc_ofw("peer", args, res) || (s32)node == -1) | ||
34 | return 0; | ||
35 | |||
36 | return node; | ||
37 | } | ||
38 | |||
39 | static phandle __init olpc_dt_getchild(phandle node) | ||
40 | { | ||
41 | const void *args[] = { (void *)node }; | ||
42 | void *res[] = { &node }; | ||
43 | |||
44 | if ((s32)node == -1) | ||
45 | return 0; | ||
46 | |||
47 | if (olpc_ofw("child", args, res) || (s32)node == -1) { | ||
48 | pr_err("PROM: %s: fetching child failed!\n", __func__); | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | return node; | ||
53 | } | ||
54 | |||
55 | static int __init olpc_dt_getproplen(phandle node, const char *prop) | ||
56 | { | ||
57 | const void *args[] = { (void *)node, prop }; | ||
58 | int len; | ||
59 | void *res[] = { &len }; | ||
60 | |||
61 | if ((s32)node == -1) | ||
62 | return -1; | ||
63 | |||
64 | if (olpc_ofw("getproplen", args, res)) { | ||
65 | pr_err("PROM: %s: getproplen failed!\n", __func__); | ||
66 | return -1; | ||
67 | } | ||
68 | |||
69 | return len; | ||
70 | } | ||
71 | |||
72 | static int __init olpc_dt_getproperty(phandle node, const char *prop, | ||
73 | char *buf, int bufsize) | ||
74 | { | ||
75 | int plen; | ||
76 | |||
77 | plen = olpc_dt_getproplen(node, prop); | ||
78 | if (plen > bufsize || plen < 1) { | ||
79 | return -1; | ||
80 | } else { | ||
81 | const void *args[] = { (void *)node, prop, buf, (void *)plen }; | ||
82 | void *res[] = { &plen }; | ||
83 | |||
84 | if (olpc_ofw("getprop", args, res)) { | ||
85 | pr_err("PROM: %s: getprop failed!\n", __func__); | ||
86 | return -1; | ||
87 | } | ||
88 | } | ||
89 | |||
90 | return plen; | ||
91 | } | ||
92 | |||
93 | static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf) | ||
94 | { | ||
95 | const void *args[] = { (void *)node, prev, buf }; | ||
96 | int success; | ||
97 | void *res[] = { &success }; | ||
98 | |||
99 | buf[0] = '\0'; | ||
100 | |||
101 | if ((s32)node == -1) | ||
102 | return -1; | ||
103 | |||
104 | if (olpc_ofw("nextprop", args, res) || success != 1) | ||
105 | return -1; | ||
106 | |||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | static int __init olpc_dt_pkg2path(phandle node, char *buf, | ||
111 | const int buflen, int *len) | ||
112 | { | ||
113 | const void *args[] = { (void *)node, buf, (void *)buflen }; | ||
114 | void *res[] = { len }; | ||
115 | |||
116 | if ((s32)node == -1) | ||
117 | return -1; | ||
118 | |||
119 | if (olpc_ofw("package-to-path", args, res) || *len < 1) | ||
120 | return -1; | ||
121 | |||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static unsigned int prom_early_allocated __initdata; | ||
126 | |||
127 | void * __init prom_early_alloc(unsigned long size) | ||
128 | { | ||
129 | static u8 *mem; | ||
130 | static size_t free_mem; | ||
131 | void *res; | ||
132 | |||
133 | if (free_mem < size) { | ||
134 | const size_t chunk_size = max(PAGE_SIZE, size); | ||
135 | |||
136 | /* | ||
137 | * To minimize the number of allocations, grab at least | ||
138 | * PAGE_SIZE of memory (that's an arbitrary choice that's | ||
139 | * fast enough on the platforms we care about while minimizing | ||
140 | * wasted bootmem) and hand off chunks of it to callers. | ||
141 | */ | ||
142 | res = alloc_bootmem(chunk_size); | ||
143 | if (!res) | ||
144 | return NULL; | ||
145 | prom_early_allocated += chunk_size; | ||
146 | memset(res, 0, chunk_size); | ||
147 | free_mem = chunk_size; | ||
148 | mem = res; | ||
149 | } | ||
150 | |||
151 | /* allocate from the local cache */ | ||
152 | free_mem -= size; | ||
153 | res = mem; | ||
154 | mem += size; | ||
155 | return res; | ||
156 | } | ||
157 | |||
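The chunking policy in prom_early_alloc() is easiest to follow with concrete numbers; a hypothetical walk-through assuming 4 KiB pages:

/*
 * Hypothetical walk-through of prom_early_alloc() with PAGE_SIZE == 4096:
 *
 *   prom_early_alloc(100)   grabs a fresh 4096-byte chunk, returns bytes
 *                           0..99 of it, leaving free_mem = 3996
 *   prom_early_alloc(4000)  4000 > 3996, so the 3996-byte tail is abandoned
 *                           and a new 4096-byte chunk is grabbed, leaving
 *                           free_mem = 96
 *   prom_early_alloc(5000)  larger than PAGE_SIZE, so a dedicated 5000-byte
 *                           chunk is allocated and consumed in one go
 */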
158 | static struct of_pdt_ops prom_olpc_ops __initdata = { | ||
159 | .nextprop = olpc_dt_nextprop, | ||
160 | .getproplen = olpc_dt_getproplen, | ||
161 | .getproperty = olpc_dt_getproperty, | ||
162 | .getchild = olpc_dt_getchild, | ||
163 | .getsibling = olpc_dt_getsibling, | ||
164 | .pkg2path = olpc_dt_pkg2path, | ||
165 | }; | ||
166 | |||
167 | void __init olpc_dt_build_devicetree(void) | ||
168 | { | ||
169 | phandle root; | ||
170 | |||
171 | if (!olpc_ofw_is_installed()) | ||
172 | return; | ||
173 | |||
174 | root = olpc_dt_getsibling(0); | ||
175 | if (!root) { | ||
176 | pr_err("PROM: unable to get root node from OFW!\n"); | ||
177 | return; | ||
178 | } | ||
179 | of_pdt_build_devicetree(root, &prom_olpc_ops); | ||
180 | |||
181 | pr_info("PROM DT: Built device tree with %u bytes of memory.\n", | ||
182 | prom_early_allocated); | ||
183 | } | ||
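For reference, the getchild/getsibling callbacks registered above are what drive the tree walk inside of_pdt_build_devicetree(); a minimal sketch of such a child-first, sibling-next traversal (hypothetical helper, not part of the patch):

static void __init olpc_dt_walk(phandle node, int depth)
{
	while (node) {
		pr_debug("%*snode %u\n", depth, "", (unsigned int)node);
		olpc_dt_walk(olpc_dt_getchild(node), depth + 1);
		node = olpc_dt_getsibling(node);
	}
}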
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c index 787320464379..e7604f62870d 100644 --- a/arch/x86/platform/olpc/olpc_ofw.c +++ b/arch/x86/platform/olpc/olpc_ofw.c | |||
@@ -110,3 +110,8 @@ void __init olpc_ofw_detect(void) | |||
110 | (unsigned long)olpc_ofw_cif, (-start) >> 20); | 110 | (unsigned long)olpc_ofw_cif, (-start) >> 20); |
111 | reserve_top_address(-start); | 111 | reserve_top_address(-start); |
112 | } | 112 | } |
113 | |||
114 | bool __init olpc_ofw_is_installed(void) | ||
115 | { | ||
116 | return olpc_ofw_cif != NULL; | ||
117 | } | ||
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index dd4c281ffe57..7785b72ecc3a 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c | |||
@@ -34,23 +34,12 @@ | |||
34 | #ifdef CONFIG_X86_LOCAL_APIC | 34 | #ifdef CONFIG_X86_LOCAL_APIC |
35 | static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; | 35 | static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; |
36 | 36 | ||
37 | static void __init mp_sfi_register_lapic_address(unsigned long address) | ||
38 | { | ||
39 | mp_lapic_addr = address; | ||
40 | |||
41 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
42 | if (boot_cpu_physical_apicid == -1U) | ||
43 | boot_cpu_physical_apicid = read_apic_id(); | ||
44 | |||
45 | pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); | ||
46 | } | ||
47 | |||
48 | /* All CPUs enumerated by SFI must be present and enabled */ | 37 | /* All CPUs enumerated by SFI must be present and enabled */ |
49 | static void __cpuinit mp_sfi_register_lapic(u8 id) | 38 | static void __cpuinit mp_sfi_register_lapic(u8 id) |
50 | { | 39 | { |
51 | if (MAX_APICS - id <= 0) { | 40 | if (MAX_LOCAL_APIC - id <= 0) { |
52 | pr_warning("Processor #%d invalid (max %d)\n", | 41 | pr_warning("Processor #%d invalid (max %d)\n", |
53 | id, MAX_APICS); | 42 | id, MAX_LOCAL_APIC); |
54 | return; | 43 | return; |
55 | } | 44 | } |
56 | 45 | ||
@@ -110,7 +99,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) | |||
110 | int __init sfi_platform_init(void) | 99 | int __init sfi_platform_init(void) |
111 | { | 100 | { |
112 | #ifdef CONFIG_X86_LOCAL_APIC | 101 | #ifdef CONFIG_X86_LOCAL_APIC |
113 | mp_sfi_register_lapic_address(sfi_lapic_addr); | 102 | register_lapic_address(sfi_lapic_addr); |
114 | sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); | 103 | sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); |
115 | #endif | 104 | #endif |
116 | #ifdef CONFIG_X86_IO_APIC | 105 | #ifdef CONFIG_X86_IO_APIC |
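The only change to the guard in mp_sfi_register_lapic() is the switch to the MAX_LOCAL_APIC limit; its arithmetic is unchanged and is equivalent to a plain upper-bound comparison (sketch only, not part of the patch):

	if (id >= MAX_LOCAL_APIC) {
		pr_warning("Processor #%d invalid (max %d)\n",
			   id, MAX_LOCAL_APIC);
		return;
	}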
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 20ea20a39e2a..df58e9cad96a 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c | |||
@@ -1341,10 +1341,10 @@ uv_activation_descriptor_init(int node, int pnode) | |||
1341 | 1341 | ||
1342 | /* | 1342 | /* |
1343 | * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) | 1343 | * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) |
1344 | * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub | 1344 | * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE) |
1345 | */ | 1345 | */ |
1346 | bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* | 1346 | bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE |
1347 | UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); | 1347 | * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); |
1348 | BUG_ON(!bau_desc); | 1348 | BUG_ON(!bau_desc); |
1349 | 1349 | ||
1350 | pa = uv_gpa(bau_desc); /* need the real nasid*/ | 1350 | pa = uv_gpa(bau_desc); /* need the real nasid*/ |
@@ -1402,9 +1402,9 @@ uv_payload_queue_init(int node, int pnode) | |||
1402 | struct bau_payload_queue_entry *pqp_malloc; | 1402 | struct bau_payload_queue_entry *pqp_malloc; |
1403 | struct bau_control *bcp; | 1403 | struct bau_control *bcp; |
1404 | 1404 | ||
1405 | pqp = (struct bau_payload_queue_entry *) kmalloc_node( | 1405 | pqp = kmalloc_node((DEST_Q_SIZE + 1) |
1406 | (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), | 1406 | * sizeof(struct bau_payload_queue_entry), |
1407 | GFP_KERNEL, node); | 1407 | GFP_KERNEL, node); |
1408 | BUG_ON(!pqp); | 1408 | BUG_ON(!pqp); |
1409 | pqp_malloc = pqp; | 1409 | pqp_malloc = pqp; |
1410 | 1410 | ||
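The two allocation changes above only drop redundant casts: kmalloc_node() returns void *, so its result can be assigned directly. The same style, with a hypothetical structure and count, looks like:

	struct foo *p;

	/* no cast needed, and sizeof(*p) follows the pointer's type */
	p = kmalloc_node(count * sizeof(*p), GFP_KERNEL, node);
	if (!p)
		return -ENOMEM;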
@@ -1455,7 +1455,7 @@ static void __init uv_init_uvhub(int uvhub, int vector) | |||
1455 | * the below initialization can't be in firmware because the | 1455 | * the below initialization can't be in firmware because the |
1456 | * messaging IRQ will be determined by the OS | 1456 | * messaging IRQ will be determined by the OS |
1457 | */ | 1457 | */ |
1458 | apicid = uvhub_to_first_apicid(uvhub); | 1458 | apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; |
1459 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, | 1459 | uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, |
1460 | ((apicid << 32) | vector)); | 1460 | ((apicid << 32) | vector)); |
1461 | } | 1461 | } |
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void) | |||
1490 | /* | 1490 | /* |
1491 | * initialize the bau_control structure for each cpu | 1491 | * initialize the bau_control structure for each cpu |
1492 | */ | 1492 | */ |
1493 | static void __init uv_init_per_cpu(int nuvhubs) | 1493 | static int __init uv_init_per_cpu(int nuvhubs) |
1494 | { | 1494 | { |
1495 | int i; | 1495 | int i; |
1496 | int cpu; | 1496 | int cpu; |
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs) | |||
1507 | struct bau_control *smaster = NULL; | 1507 | struct bau_control *smaster = NULL; |
1508 | struct socket_desc { | 1508 | struct socket_desc { |
1509 | short num_cpus; | 1509 | short num_cpus; |
1510 | short cpu_number[16]; | 1510 | short cpu_number[MAX_CPUS_PER_SOCKET]; |
1511 | }; | 1511 | }; |
1512 | struct uvhub_desc { | 1512 | struct uvhub_desc { |
1513 | unsigned short socket_mask; | 1513 | unsigned short socket_mask; |
@@ -1520,8 +1520,7 @@ static void __init uv_init_per_cpu(int nuvhubs) | |||
1520 | 1520 | ||
1521 | timeout_us = calculate_destination_timeout(); | 1521 | timeout_us = calculate_destination_timeout(); |
1522 | 1522 | ||
1523 | uvhub_descs = (struct uvhub_desc *) | 1523 | uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); |
1524 | kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); | ||
1525 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); | 1524 | memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); |
1526 | uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); | 1525 | uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); |
1527 | for_each_present_cpu(cpu) { | 1526 | for_each_present_cpu(cpu) { |
@@ -1541,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs) | |||
1541 | sdp = &bdp->socket[socket]; | 1540 | sdp = &bdp->socket[socket]; |
1542 | sdp->cpu_number[sdp->num_cpus] = cpu; | 1541 | sdp->cpu_number[sdp->num_cpus] = cpu; |
1543 | sdp->num_cpus++; | 1542 | sdp->num_cpus++; |
1543 | if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { | ||
1544 | printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus); | ||
1545 | return 1; | ||
1546 | } | ||
1544 | } | 1547 | } |
1545 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) { | 1548 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) { |
1546 | if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) | 1549 | if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) |
@@ -1571,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs) | |||
1571 | bcp->uvhub_master = hmaster; | 1574 | bcp->uvhub_master = hmaster; |
1572 | bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> | 1575 | bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> |
1573 | blade_processor_id; | 1576 | blade_processor_id; |
1577 | if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { | ||
1578 | printk(KERN_EMERG | ||
1579 | "%d cpus per uvhub invalid\n", | ||
1580 | bcp->uvhub_cpu); | ||
1581 | return 1; | ||
1582 | } | ||
1574 | } | 1583 | } |
1575 | nextsocket: | 1584 | nextsocket: |
1576 | socket++; | 1585 | socket++; |
@@ -1596,6 +1605,7 @@ nextsocket: | |||
1596 | bcp->congested_reps = congested_reps; | 1605 | bcp->congested_reps = congested_reps; |
1597 | bcp->congested_period = congested_period; | 1606 | bcp->congested_period = congested_period; |
1598 | } | 1607 | } |
1608 | return 0; | ||
1599 | } | 1609 | } |
1600 | 1610 | ||
1601 | /* | 1611 | /* |
@@ -1626,7 +1636,10 @@ static int __init uv_bau_init(void) | |||
1626 | spin_lock_init(&disable_lock); | 1636 | spin_lock_init(&disable_lock); |
1627 | congested_cycles = microsec_2_cycles(congested_response_us); | 1637 | congested_cycles = microsec_2_cycles(congested_response_us); |
1628 | 1638 | ||
1629 | uv_init_per_cpu(nuvhubs); | 1639 | if (uv_init_per_cpu(nuvhubs)) { |
1640 | nobau = 1; | ||
1641 | return 0; | ||
1642 | } | ||
1630 | 1643 | ||
1631 | uv_partition_base_pnode = 0x7fffffff; | 1644 | uv_partition_base_pnode = 0x7fffffff; |
1632 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) | 1645 | for (uvhub = 0; uvhub < nuvhubs; uvhub++) |
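One related cleanup that this patch does not make, noted here only as a sketch: the kmalloc()+memset() pair used for uvhub_descs in uv_init_per_cpu() above is equivalent to a single zeroing allocation:

	/* equivalent to kmalloc() followed by memset(..., 0, ...) */
	uvhub_descs = kzalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);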
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 56e421bc379b..9daf5d1af9f1 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c | |||
@@ -89,6 +89,7 @@ static void uv_rtc_send_IPI(int cpu) | |||
89 | 89 | ||
90 | apicid = cpu_physical_id(cpu); | 90 | apicid = cpu_physical_id(cpu); |
91 | pnode = uv_apicid_to_pnode(apicid); | 91 | pnode = uv_apicid_to_pnode(apicid); |
92 | apicid |= uv_apicid_hibits; | ||
92 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | | 93 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | |
93 | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | | 94 | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | |
94 | (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); | 95 | (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); |
@@ -107,6 +108,7 @@ static int uv_intr_pending(int pnode) | |||
107 | static int uv_setup_intr(int cpu, u64 expires) | 108 | static int uv_setup_intr(int cpu, u64 expires) |
108 | { | 109 | { |
109 | u64 val; | 110 | u64 val; |
111 | unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits; | ||
110 | int pnode = uv_cpu_to_pnode(cpu); | 112 | int pnode = uv_cpu_to_pnode(cpu); |
111 | 113 | ||
112 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, | 114 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, |
@@ -117,7 +119,7 @@ static int uv_setup_intr(int cpu, u64 expires) | |||
117 | UVH_EVENT_OCCURRED0_RTC1_MASK); | 119 | UVH_EVENT_OCCURRED0_RTC1_MASK); |
118 | 120 | ||
119 | val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | | 121 | val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | |
120 | ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); | 122 | ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); |
121 | 123 | ||
122 | /* Set configuration */ | 124 | /* Set configuration */ |
123 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); | 125 | uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); |
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 3371bd053b89..632037671746 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c | |||
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) | |||
171 | ver = m->apicver; | 171 | ver = m->apicver; |
172 | if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { | 172 | if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { |
173 | printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", | 173 | printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", |
174 | m->apicid, MAX_APICS); | 174 | m->apicid, MAX_LOCAL_APIC); |
175 | return; | 175 | return; |
176 | } | 176 | } |
177 | 177 | ||
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 4a2afa1bac51..b6552b189bcd 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile | |||
@@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) | |||
25 | 25 | ||
26 | export CPPFLAGS_vdso.lds += -P -C | 26 | export CPPFLAGS_vdso.lds += -P -C |
27 | 27 | ||
28 | VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ | 28 | VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ |
29 | -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 | 29 | -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 |
30 | 30 | ||
31 | $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so | 31 | $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so |
@@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter | |||
69 | vdso32-images = $(vdso32.so-y:%=vdso32-%.so) | 69 | vdso32-images = $(vdso32.so-y:%=vdso32-%.so) |
70 | 70 | ||
71 | CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) | 71 | CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) |
72 | VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 | 72 | VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1 |
73 | 73 | ||
74 | # This makes sure the $(obj) subdirectory exists even though vdso32/ | 74 | # This makes sure the $(obj) subdirectory exists even though vdso32/ |
75 | # is not a kbuild sub-make subdirectory. | 75 | # is not a kbuild sub-make subdirectory. |
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 779385158915..17c565de3d64 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp) | |||
12 | 12 | ||
13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | 13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ |
14 | time.o xen-asm.o xen-asm_$(BITS).o \ | 14 | time.o xen-asm.o xen-asm_$(BITS).o \ |
15 | grant-table.o suspend.o platform-pci-unplug.o | 15 | grant-table.o suspend.o platform-pci-unplug.o \ |
16 | p2m.o | ||
16 | 17 | ||
17 | obj-$(CONFIG_SMP) += smp.o | 18 | obj-$(CONFIG_SMP) += smp.o |
18 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o | 19 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o |
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 235c0f4d3861..50542efe45fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); | |||
75 | enum xen_domain_type xen_domain_type = XEN_NATIVE; | 75 | enum xen_domain_type xen_domain_type = XEN_NATIVE; |
76 | EXPORT_SYMBOL_GPL(xen_domain_type); | 76 | EXPORT_SYMBOL_GPL(xen_domain_type); |
77 | 77 | ||
78 | unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; | ||
79 | EXPORT_SYMBOL(machine_to_phys_mapping); | ||
80 | unsigned int machine_to_phys_order; | ||
81 | EXPORT_SYMBOL(machine_to_phys_order); | ||
82 | |||
78 | struct start_info *xen_start_info; | 83 | struct start_info *xen_start_info; |
79 | EXPORT_SYMBOL_GPL(xen_start_info); | 84 | EXPORT_SYMBOL_GPL(xen_start_info); |
80 | 85 | ||
@@ -569,8 +574,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) | |||
569 | 574 | ||
570 | preempt_disable(); | 575 | preempt_disable(); |
571 | 576 | ||
572 | start = __get_cpu_var(idt_desc).address; | 577 | start = __this_cpu_read(idt_desc.address); |
573 | end = start + __get_cpu_var(idt_desc).size + 1; | 578 | end = start + __this_cpu_read(idt_desc.size) + 1; |
574 | 579 | ||
575 | xen_mc_flush(); | 580 | xen_mc_flush(); |
576 | 581 | ||
@@ -1016,10 +1021,6 @@ static void xen_reboot(int reason) | |||
1016 | { | 1021 | { |
1017 | struct sched_shutdown r = { .reason = reason }; | 1022 | struct sched_shutdown r = { .reason = reason }; |
1018 | 1023 | ||
1019 | #ifdef CONFIG_SMP | ||
1020 | stop_other_cpus(); | ||
1021 | #endif | ||
1022 | |||
1023 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) | 1024 | if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) |
1024 | BUG(); | 1025 | BUG(); |
1025 | } | 1026 | } |
@@ -1090,6 +1091,8 @@ static void __init xen_setup_stackprotector(void) | |||
1090 | /* First C function to be called on Xen boot */ | 1091 | /* First C function to be called on Xen boot */ |
1091 | asmlinkage void __init xen_start_kernel(void) | 1092 | asmlinkage void __init xen_start_kernel(void) |
1092 | { | 1093 | { |
1094 | struct physdev_set_iopl set_iopl; | ||
1095 | int rc; | ||
1093 | pgd_t *pgd; | 1096 | pgd_t *pgd; |
1094 | 1097 | ||
1095 | if (!xen_start_info) | 1098 | if (!xen_start_info) |
@@ -1097,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
1097 | 1100 | ||
1098 | xen_domain_type = XEN_PV_DOMAIN; | 1101 | xen_domain_type = XEN_PV_DOMAIN; |
1099 | 1102 | ||
1103 | xen_setup_machphys_mapping(); | ||
1104 | |||
1100 | /* Install Xen paravirt ops */ | 1105 | /* Install Xen paravirt ops */ |
1101 | pv_info = xen_info; | 1106 | pv_info = xen_info; |
1102 | pv_init_ops = xen_init_ops; | 1107 | pv_init_ops = xen_init_ops; |
@@ -1169,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void) | |||
1169 | 1174 | ||
1170 | xen_smp_init(); | 1175 | xen_smp_init(); |
1171 | 1176 | ||
1177 | #ifdef CONFIG_ACPI_NUMA | ||
1178 | /* | ||
1179 | * The pages we get from Xen are not related to machine pages, so | ||
1180 | * any NUMA information the kernel tries to get from ACPI will | ||
1181 | * be meaningless. Prevent it from trying. | ||
1182 | */ | ||
1183 | acpi_numa = -1; | ||
1184 | #endif | ||
1185 | |||
1172 | pgd = (pgd_t *)xen_start_info->pt_base; | 1186 | pgd = (pgd_t *)xen_start_info->pt_base; |
1173 | 1187 | ||
1174 | if (!xen_initial_domain()) | 1188 | if (!xen_initial_domain()) |
@@ -1180,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1180 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; | 1194 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; |
1181 | 1195 | ||
1182 | local_irq_disable(); | 1196 | local_irq_disable(); |
1183 | early_boot_irqs_off(); | 1197 | early_boot_irqs_disabled = true; |
1184 | 1198 | ||
1185 | memblock_init(); | 1199 | memblock_init(); |
1186 | 1200 | ||
@@ -1191,8 +1205,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1191 | /* Allocate and initialize top and mid mfn levels for p2m structure */ | 1205 | /* Allocate and initialize top and mid mfn levels for p2m structure */ |
1192 | xen_build_mfn_list_list(); | 1206 | xen_build_mfn_list_list(); |
1193 | 1207 | ||
1194 | init_mm.pgd = pgd; | ||
1195 | |||
1196 | /* keep using Xen gdt for now; no urgent need to change it */ | 1208 | /* keep using Xen gdt for now; no urgent need to change it */ |
1197 | 1209 | ||
1198 | #ifdef CONFIG_X86_32 | 1210 | #ifdef CONFIG_X86_32 |
@@ -1202,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void) | |||
1202 | #else | 1214 | #else |
1203 | pv_info.kernel_rpl = 0; | 1215 | pv_info.kernel_rpl = 0; |
1204 | #endif | 1216 | #endif |
1205 | |||
1206 | /* set the limit of our address space */ | 1217 | /* set the limit of our address space */ |
1207 | xen_reserve_top(); | 1218 | xen_reserve_top(); |
1208 | 1219 | ||
1220 | /* We used to do this in xen_arch_setup, but that is too late on AMD | ||
1221 | * where early_cpu_init (run before ->arch_setup()) calls early_amd_init | ||
1222 | * which pokes the 0xcf8 port. | ||
1223 | */ | ||
1224 | set_iopl.iopl = 1; | ||
1225 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
1226 | if (rc != 0) | ||
1227 | xen_raw_printk("physdev_op failed %d\n", rc); | ||
1228 | |||
1209 | #ifdef CONFIG_X86_32 | 1229 | #ifdef CONFIG_X86_32 |
1210 | /* set up basic CPUID stuff */ | 1230 | /* set up basic CPUID stuff */ |
1211 | cpu_detect(&new_cpu_data); | 1231 | cpu_detect(&new_cpu_data); |
@@ -1245,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1245 | #endif | 1265 | #endif |
1246 | } | 1266 | } |
1247 | 1267 | ||
1248 | static uint32_t xen_cpuid_base(void) | ||
1249 | { | ||
1250 | uint32_t base, eax, ebx, ecx, edx; | ||
1251 | char signature[13]; | ||
1252 | |||
1253 | for (base = 0x40000000; base < 0x40010000; base += 0x100) { | ||
1254 | cpuid(base, &eax, &ebx, &ecx, &edx); | ||
1255 | *(uint32_t *)(signature + 0) = ebx; | ||
1256 | *(uint32_t *)(signature + 4) = ecx; | ||
1257 | *(uint32_t *)(signature + 8) = edx; | ||
1258 | signature[12] = 0; | ||
1259 | |||
1260 | if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) | ||
1261 | return base; | ||
1262 | } | ||
1263 | |||
1264 | return 0; | ||
1265 | } | ||
1266 | |||
1267 | static int init_hvm_pv_info(int *major, int *minor) | 1268 | static int init_hvm_pv_info(int *major, int *minor) |
1268 | { | 1269 | { |
1269 | uint32_t eax, ebx, ecx, edx, pages, msr, base; | 1270 | uint32_t eax, ebx, ecx, edx, pages, msr, base; |
@@ -1373,6 +1374,18 @@ static bool __init xen_hvm_platform(void) | |||
1373 | return true; | 1374 | return true; |
1374 | } | 1375 | } |
1375 | 1376 | ||
1377 | bool xen_hvm_need_lapic(void) | ||
1378 | { | ||
1379 | if (xen_pv_domain()) | ||
1380 | return false; | ||
1381 | if (!xen_hvm_domain()) | ||
1382 | return false; | ||
1383 | if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) | ||
1384 | return false; | ||
1385 | return true; | ||
1386 | } | ||
1387 | EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); | ||
1388 | |||
1376 | const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { | 1389 | const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { |
1377 | .name = "Xen HVM", | 1390 | .name = "Xen HVM", |
1378 | .detect = xen_hvm_platform, | 1391 | .detect = xen_hvm_platform, |
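The machine_to_phys_mapping / machine_to_phys_order pair exported near the top of this file is the table that mfn-to-pfn style lookups index into. A minimal sketch of such a lookup, with a hypothetical helper name and assuming the table layout established by xen_setup_machphys_mapping():

static inline unsigned long example_mfn_to_pfn(unsigned long mfn)
{
	/* machine frames beyond the table have no pseudo-physical mapping */
	if (mfn >> machine_to_phys_order)
		return ~0UL;

	return machine_to_phys_mapping[mfn];
}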
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 9d30105a0c4a..6a6fe8939645 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c | |||
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { | |||
126 | #endif | 126 | #endif |
127 | }; | 127 | }; |
128 | 128 | ||
129 | void __init xen_init_irq_ops() | 129 | void __init xen_init_irq_ops(void) |
130 | { | 130 | { |
131 | pv_irq_ops = xen_irq_ops; | 131 | pv_irq_ops = xen_irq_ops; |
132 | x86_init.irqs.intr_init = xen_init_IRQ; | 132 | x86_init.irqs.intr_init = xen_init_IRQ; |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c237b810b03f..5e92b61ad574 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ | |||
173 | */ | 173 | */ |
174 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) | 174 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) |
175 | 175 | ||
176 | /* | ||
177 | * Xen leaves the responsibility for maintaining p2m mappings to the | ||
178 | * guests themselves, but it must also access and update the p2m array | ||
179 | * during suspend/resume when all the pages are reallocated. | ||
180 | * | ||
181 | * The p2m table is logically a flat array, but we implement it as a | ||
182 | * three-level tree to allow the address space to be sparse. | ||
183 | * | ||
184 | * Xen | ||
185 | * | | ||
186 | * p2m_top p2m_top_mfn | ||
187 | * / \ / \ | ||
188 | * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn | ||
189 | * / \ / \ / / | ||
190 | * p2m p2m p2m p2m p2m p2m p2m ... | ||
191 | * | ||
192 | * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. | ||
193 | * | ||
194 | * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the | ||
195 | * maximum representable pseudo-physical address space is: | ||
196 | * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages | ||
197 | * | ||
198 | * P2M_PER_PAGE depends on the architecture, as an mfn is always | ||
199 | * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to | ||
200 | * 512 and 1024 entries respectively. | ||
201 | */ | ||
202 | |||
203 | unsigned long xen_max_p2m_pfn __read_mostly; | ||
204 | |||
205 | #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) | ||
206 | #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) | ||
207 | #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) | ||
208 | |||
209 | #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) | ||
210 | |||
211 | /* Placeholders for holes in the address space */ | ||
212 | static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); | ||
213 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); | ||
214 | static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); | ||
215 | |||
216 | static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); | ||
217 | static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); | ||
218 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); | ||
219 | |||
220 | RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
221 | RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
222 | |||
223 | static inline unsigned p2m_top_index(unsigned long pfn) | ||
224 | { | ||
225 | BUG_ON(pfn >= MAX_P2M_PFN); | ||
226 | return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); | ||
227 | } | ||
228 | |||
229 | static inline unsigned p2m_mid_index(unsigned long pfn) | ||
230 | { | ||
231 | return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; | ||
232 | } | ||
233 | |||
234 | static inline unsigned p2m_index(unsigned long pfn) | ||
235 | { | ||
236 | return pfn % P2M_PER_PAGE; | ||
237 | } | ||
238 | |||
239 | static void p2m_top_init(unsigned long ***top) | ||
240 | { | ||
241 | unsigned i; | ||
242 | |||
243 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
244 | top[i] = p2m_mid_missing; | ||
245 | } | ||
246 | |||
247 | static void p2m_top_mfn_init(unsigned long *top) | ||
248 | { | ||
249 | unsigned i; | ||
250 | |||
251 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
252 | top[i] = virt_to_mfn(p2m_mid_missing_mfn); | ||
253 | } | ||
254 | |||
255 | static void p2m_top_mfn_p_init(unsigned long **top) | ||
256 | { | ||
257 | unsigned i; | ||
258 | |||
259 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
260 | top[i] = p2m_mid_missing_mfn; | ||
261 | } | ||
262 | |||
263 | static void p2m_mid_init(unsigned long **mid) | ||
264 | { | ||
265 | unsigned i; | ||
266 | |||
267 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
268 | mid[i] = p2m_missing; | ||
269 | } | ||
270 | |||
271 | static void p2m_mid_mfn_init(unsigned long *mid) | ||
272 | { | ||
273 | unsigned i; | ||
274 | |||
275 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
276 | mid[i] = virt_to_mfn(p2m_missing); | ||
277 | } | ||
278 | |||
279 | static void p2m_init(unsigned long *p2m) | ||
280 | { | ||
281 | unsigned i; | ||
282 | |||
283 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
284 | p2m[i] = INVALID_P2M_ENTRY; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * Build the parallel p2m_top_mfn and p2m_mid_mfn structures | ||
289 | * | ||
290 | * This is called both at boot time, and after resuming from suspend: | ||
291 | * - At boot time we're called very early, and must use extend_brk() | ||
292 | * to allocate memory. | ||
293 | * | ||
294 | * - After resume we're called from within stop_machine, but the mfn | ||
295 | * tree should already be completely allocated. | ||
296 | */ | ||
297 | void xen_build_mfn_list_list(void) | ||
298 | { | ||
299 | unsigned long pfn; | ||
300 | |||
301 | /* Pre-initialize p2m_top_mfn to be completely missing */ | ||
302 | if (p2m_top_mfn == NULL) { | ||
303 | p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
304 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
305 | |||
306 | p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
307 | p2m_top_mfn_p_init(p2m_top_mfn_p); | ||
308 | |||
309 | p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
310 | p2m_top_mfn_init(p2m_top_mfn); | ||
311 | } else { | ||
312 | /* Reinitialise, mfn's all change after migration */ | ||
313 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
314 | } | ||
315 | |||
316 | for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { | ||
317 | unsigned topidx = p2m_top_index(pfn); | ||
318 | unsigned mididx = p2m_mid_index(pfn); | ||
319 | unsigned long **mid; | ||
320 | unsigned long *mid_mfn_p; | ||
321 | |||
322 | mid = p2m_top[topidx]; | ||
323 | mid_mfn_p = p2m_top_mfn_p[topidx]; | ||
324 | |||
325 | /* Don't bother allocating any mfn mid levels if | ||
326 | * they're just missing, just update the stored mfn, | ||
327 | * since all could have changed over a migrate. | ||
328 | */ | ||
329 | if (mid == p2m_mid_missing) { | ||
330 | BUG_ON(mididx); | ||
331 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); | ||
332 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); | ||
333 | pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; | ||
334 | continue; | ||
335 | } | ||
336 | |||
337 | if (mid_mfn_p == p2m_mid_missing_mfn) { | ||
338 | /* | ||
339 | * XXX boot-time only! We should never find | ||
340 | * missing parts of the mfn tree after | ||
341 | * runtime. extend_brk() will BUG if we call | ||
342 | * it too late. | ||
343 | */ | ||
344 | mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
345 | p2m_mid_mfn_init(mid_mfn_p); | ||
346 | |||
347 | p2m_top_mfn_p[topidx] = mid_mfn_p; | ||
348 | } | ||
349 | |||
350 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); | ||
351 | mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); | ||
352 | } | ||
353 | } | ||
354 | |||
355 | void xen_setup_mfn_list_list(void) | ||
356 | { | ||
357 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | ||
358 | |||
359 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | ||
360 | virt_to_mfn(p2m_top_mfn); | ||
361 | HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; | ||
362 | } | ||
363 | |||
364 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ | ||
365 | void __init xen_build_dynamic_phys_to_machine(void) | ||
366 | { | ||
367 | unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; | ||
368 | unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); | ||
369 | unsigned long pfn; | ||
370 | |||
371 | xen_max_p2m_pfn = max_pfn; | ||
372 | |||
373 | p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
374 | p2m_init(p2m_missing); | ||
375 | |||
376 | p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
377 | p2m_mid_init(p2m_mid_missing); | ||
378 | |||
379 | p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
380 | p2m_top_init(p2m_top); | ||
381 | |||
382 | /* | ||
383 | * The domain builder gives us a pre-constructed p2m array in | ||
384 | * mfn_list for all the pages initially given to us, so we just | ||
385 | * need to graft that into our tree structure. | ||
386 | */ | ||
387 | for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { | ||
388 | unsigned topidx = p2m_top_index(pfn); | ||
389 | unsigned mididx = p2m_mid_index(pfn); | ||
390 | |||
391 | if (p2m_top[topidx] == p2m_mid_missing) { | ||
392 | unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
393 | p2m_mid_init(mid); | ||
394 | |||
395 | p2m_top[topidx] = mid; | ||
396 | } | ||
397 | |||
398 | p2m_top[topidx][mididx] = &mfn_list[pfn]; | ||
399 | } | ||
400 | } | ||
401 | |||
402 | unsigned long get_phys_to_machine(unsigned long pfn) | ||
403 | { | ||
404 | unsigned topidx, mididx, idx; | ||
405 | |||
406 | if (unlikely(pfn >= MAX_P2M_PFN)) | ||
407 | return INVALID_P2M_ENTRY; | ||
408 | |||
409 | topidx = p2m_top_index(pfn); | ||
410 | mididx = p2m_mid_index(pfn); | ||
411 | idx = p2m_index(pfn); | ||
412 | |||
413 | return p2m_top[topidx][mididx][idx]; | ||
414 | } | ||
415 | EXPORT_SYMBOL_GPL(get_phys_to_machine); | ||
416 | |||
417 | static void *alloc_p2m_page(void) | ||
418 | { | ||
419 | return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); | ||
420 | } | ||
421 | |||
422 | static void free_p2m_page(void *p) | ||
423 | { | ||
424 | free_page((unsigned long)p); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * Fully allocate the p2m structure for a given pfn. We need to check | ||
429 | * that both the top and mid levels are allocated, and make sure the | ||
430 | * parallel mfn tree is kept in sync. We may race with other cpus, so | ||
431 | * the new pages are installed with cmpxchg; if we lose the race then | ||
432 | * simply free the page we allocated and use the one that's there. | ||
433 | */ | ||
434 | static bool alloc_p2m(unsigned long pfn) | ||
435 | { | ||
436 | unsigned topidx, mididx; | ||
437 | unsigned long ***top_p, **mid; | ||
438 | unsigned long *top_mfn_p, *mid_mfn; | ||
439 | |||
440 | topidx = p2m_top_index(pfn); | ||
441 | mididx = p2m_mid_index(pfn); | ||
442 | |||
443 | top_p = &p2m_top[topidx]; | ||
444 | mid = *top_p; | ||
445 | |||
446 | if (mid == p2m_mid_missing) { | ||
447 | /* Mid level is missing, allocate a new one */ | ||
448 | mid = alloc_p2m_page(); | ||
449 | if (!mid) | ||
450 | return false; | ||
451 | |||
452 | p2m_mid_init(mid); | ||
453 | |||
454 | if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) | ||
455 | free_p2m_page(mid); | ||
456 | } | ||
457 | |||
458 | top_mfn_p = &p2m_top_mfn[topidx]; | ||
459 | mid_mfn = p2m_top_mfn_p[topidx]; | ||
460 | |||
461 | BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); | ||
462 | |||
463 | if (mid_mfn == p2m_mid_missing_mfn) { | ||
464 | /* Separately check the mid mfn level */ | ||
465 | unsigned long missing_mfn; | ||
466 | unsigned long mid_mfn_mfn; | ||
467 | |||
468 | mid_mfn = alloc_p2m_page(); | ||
469 | if (!mid_mfn) | ||
470 | return false; | ||
471 | |||
472 | p2m_mid_mfn_init(mid_mfn); | ||
473 | |||
474 | missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); | ||
475 | mid_mfn_mfn = virt_to_mfn(mid_mfn); | ||
476 | if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) | ||
477 | free_p2m_page(mid_mfn); | ||
478 | else | ||
479 | p2m_top_mfn_p[topidx] = mid_mfn; | ||
480 | } | ||
481 | |||
482 | if (p2m_top[topidx][mididx] == p2m_missing) { | ||
483 | /* p2m leaf page is missing */ | ||
484 | unsigned long *p2m; | ||
485 | |||
486 | p2m = alloc_p2m_page(); | ||
487 | if (!p2m) | ||
488 | return false; | ||
489 | |||
490 | p2m_init(p2m); | ||
491 | |||
492 | if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) | ||
493 | free_p2m_page(p2m); | ||
494 | else | ||
495 | mid_mfn[mididx] = virt_to_mfn(p2m); | ||
496 | } | ||
497 | |||
498 | return true; | ||
499 | } | ||
500 | |||
501 | /* Try to install p2m mapping; fail if intermediate bits missing */ | ||
502 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
503 | { | ||
504 | unsigned topidx, mididx, idx; | ||
505 | |||
506 | if (unlikely(pfn >= MAX_P2M_PFN)) { | ||
507 | BUG_ON(mfn != INVALID_P2M_ENTRY); | ||
508 | return true; | ||
509 | } | ||
510 | |||
511 | topidx = p2m_top_index(pfn); | ||
512 | mididx = p2m_mid_index(pfn); | ||
513 | idx = p2m_index(pfn); | ||
514 | |||
515 | if (p2m_top[topidx][mididx] == p2m_missing) | ||
516 | return mfn == INVALID_P2M_ENTRY; | ||
517 | |||
518 | p2m_top[topidx][mididx][idx] = mfn; | ||
519 | |||
520 | return true; | ||
521 | } | ||
522 | |||
523 | bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
524 | { | ||
525 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { | ||
526 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | ||
527 | return true; | ||
528 | } | ||
529 | |||
530 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | ||
531 | if (!alloc_p2m(pfn)) | ||
532 | return false; | ||
533 | |||
534 | if (!__set_phys_to_machine(pfn, mfn)) | ||
535 | return false; | ||
536 | } | ||
537 | |||
538 | return true; | ||
539 | } | ||
540 | |||
541 | unsigned long arbitrary_virt_to_mfn(void *vaddr) | 176 | unsigned long arbitrary_virt_to_mfn(void *vaddr) |
542 | { | 177 | { |
543 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); | 178 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); |
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) | |||
566 | offset = address & ~PAGE_MASK; | 201 | offset = address & ~PAGE_MASK; |
567 | return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); | 202 | return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); |
568 | } | 203 | } |
204 | EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); | ||
569 | 205 | ||
570 | void make_lowmem_page_readonly(void *vaddr) | 206 | void make_lowmem_page_readonly(void *vaddr) |
571 | { | 207 | { |
@@ -2034,6 +1670,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | |||
2034 | set_page_prot(pmd, PAGE_KERNEL_RO); | 1670 | set_page_prot(pmd, PAGE_KERNEL_RO); |
2035 | } | 1671 | } |
2036 | 1672 | ||
1673 | void __init xen_setup_machphys_mapping(void) | ||
1674 | { | ||
1675 | struct xen_machphys_mapping mapping; | ||
1676 | unsigned long machine_to_phys_nr_ents; | ||
1677 | |||
1678 | if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { | ||
1679 | machine_to_phys_mapping = (unsigned long *)mapping.v_start; | ||
1680 | machine_to_phys_nr_ents = mapping.max_mfn + 1; | ||
1681 | } else { | ||
1682 | machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; | ||
1683 | } | ||
1684 | machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); | ||
1685 | } | ||
1686 | |||
2037 | #ifdef CONFIG_X86_64 | 1687 | #ifdef CONFIG_X86_64 |
2038 | static void convert_pfn_mfn(void *v) | 1688 | static void convert_pfn_mfn(void *v) |
2039 | { | 1689 | { |
@@ -2119,44 +1769,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | |||
2119 | return pgd; | 1769 | return pgd; |
2120 | } | 1770 | } |
2121 | #else /* !CONFIG_X86_64 */ | 1771 | #else /* !CONFIG_X86_64 */ |
2122 | static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); | 1772 | static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); |
1773 | static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); | ||
1774 | |||
1775 | static __init void xen_write_cr3_init(unsigned long cr3) | ||
1776 | { | ||
1777 | unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); | ||
1778 | |||
1779 | BUG_ON(read_cr3() != __pa(initial_page_table)); | ||
1780 | BUG_ON(cr3 != __pa(swapper_pg_dir)); | ||
1781 | |||
1782 | /* | ||
1783 | * We are switching to swapper_pg_dir for the first time (from | ||
1784 | * initial_page_table) and therefore need to mark that page | ||
1785 | * read-only and then pin it. | ||
1786 | * | ||
1787 | * Xen disallows sharing of kernel PMDs for PAE | ||
1788 | * guests. Therefore we must copy the kernel PMD from | ||
1789 | * initial_page_table into a new kernel PMD to be used in | ||
1790 | * swapper_pg_dir. | ||
1791 | */ | ||
1792 | swapper_kernel_pmd = | ||
1793 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); | ||
1794 | memcpy(swapper_kernel_pmd, initial_kernel_pmd, | ||
1795 | sizeof(pmd_t) * PTRS_PER_PMD); | ||
1796 | swapper_pg_dir[KERNEL_PGD_BOUNDARY] = | ||
1797 | __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); | ||
1798 | set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); | ||
1799 | |||
1800 | set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); | ||
1801 | xen_write_cr3(cr3); | ||
1802 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); | ||
1803 | |||
1804 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, | ||
1805 | PFN_DOWN(__pa(initial_page_table))); | ||
1806 | set_page_prot(initial_page_table, PAGE_KERNEL); | ||
1807 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL); | ||
1808 | |||
1809 | pv_mmu_ops.write_cr3 = &xen_write_cr3; | ||
1810 | } | ||
2123 | 1811 | ||
2124 | __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, | 1812 | __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, |
2125 | unsigned long max_pfn) | 1813 | unsigned long max_pfn) |
2126 | { | 1814 | { |
2127 | pmd_t *kernel_pmd; | 1815 | pmd_t *kernel_pmd; |
2128 | 1816 | ||
2129 | level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); | 1817 | initial_kernel_pmd = |
1818 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); | ||
2130 | 1819 | ||
2131 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + | 1820 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + |
2132 | xen_start_info->nr_pt_frames * PAGE_SIZE + | 1821 | xen_start_info->nr_pt_frames * PAGE_SIZE + |
2133 | 512*1024); | 1822 | 512*1024); |
2134 | 1823 | ||
2135 | kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); | 1824 | kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); |
2136 | memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); | 1825 | memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); |
2137 | 1826 | ||
2138 | xen_map_identity_early(level2_kernel_pgt, max_pfn); | 1827 | xen_map_identity_early(initial_kernel_pmd, max_pfn); |
2139 | 1828 | ||
2140 | memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); | 1829 | memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); |
2141 | set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], | 1830 | initial_page_table[KERNEL_PGD_BOUNDARY] = |
2142 | __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); | 1831 | __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); |
2143 | 1832 | ||
2144 | set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); | 1833 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); |
2145 | set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); | 1834 | set_page_prot(initial_page_table, PAGE_KERNEL_RO); |
2146 | set_page_prot(empty_zero_page, PAGE_KERNEL_RO); | 1835 | set_page_prot(empty_zero_page, PAGE_KERNEL_RO); |
2147 | 1836 | ||
2148 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); | 1837 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
2149 | 1838 | ||
2150 | xen_write_cr3(__pa(swapper_pg_dir)); | 1839 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, |
2151 | 1840 | PFN_DOWN(__pa(initial_page_table))); | |
2152 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); | 1841 | xen_write_cr3(__pa(initial_page_table)); |
2153 | 1842 | ||
2154 | memblock_x86_reserve_range(__pa(xen_start_info->pt_base), | 1843 | memblock_x86_reserve_range(__pa(xen_start_info->pt_base), |
2155 | __pa(xen_start_info->pt_base + | 1844 | __pa(xen_start_info->pt_base + |
2156 | xen_start_info->nr_pt_frames * PAGE_SIZE), | 1845 | xen_start_info->nr_pt_frames * PAGE_SIZE), |
2157 | "XEN PAGETABLES"); | 1846 | "XEN PAGETABLES"); |
2158 | 1847 | ||
2159 | return swapper_pg_dir; | 1848 | return initial_page_table; |
2160 | } | 1849 | } |
2161 | #endif /* CONFIG_X86_64 */ | 1850 | #endif /* CONFIG_X86_64 */ |
2162 | 1851 | ||
@@ -2290,7 +1979,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
2290 | .write_cr2 = xen_write_cr2, | 1979 | .write_cr2 = xen_write_cr2, |
2291 | 1980 | ||
2292 | .read_cr3 = xen_read_cr3, | 1981 | .read_cr3 = xen_read_cr3, |
1982 | #ifdef CONFIG_X86_32 | ||
1983 | .write_cr3 = xen_write_cr3_init, | ||
1984 | #else | ||
2293 | .write_cr3 = xen_write_cr3, | 1985 | .write_cr3 = xen_write_cr3, |
1986 | #endif | ||
2294 | 1987 | ||
2295 | .flush_tlb_user = xen_flush_tlb, | 1988 | .flush_tlb_user = xen_flush_tlb, |
2296 | .flush_tlb_kernel = xen_flush_tlb, | 1989 | .flush_tlb_kernel = xen_flush_tlb, |
@@ -2358,8 +2051,6 @@ void __init xen_init_mmu_ops(void) | |||
2358 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; | 2051 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; |
2359 | pv_mmu_ops = xen_mmu_ops; | 2052 | pv_mmu_ops = xen_mmu_ops; |
2360 | 2053 | ||
2361 | vmap_lazy_unmap = false; | ||
2362 | |||
2363 | memset(dummy_mapping, 0xff, PAGE_SIZE); | 2054 | memset(dummy_mapping, 0xff, PAGE_SIZE); |
2364 | } | 2055 | } |
2365 | 2056 | ||
@@ -2627,7 +2318,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | |||
2627 | 2318 | ||
2628 | prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); | 2319 | prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); |
2629 | 2320 | ||
2630 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 2321 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == |
2322 | (VM_PFNMAP | VM_RESERVED | VM_IO))); | ||
2631 | 2323 | ||
2632 | rmd.mfn = mfn; | 2324 | rmd.mfn = mfn; |
2633 | rmd.prot = prot; | 2325 | rmd.prot = prot; |
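For the xen_setup_machphys_mapping() helper added earlier in this file, a concrete example of the order calculation (the numbers are hypothetical):

	/*
	 * Hypothetical example: if the hypervisor reports
	 * mapping.max_mfn == 0xfffff, then
	 *   machine_to_phys_nr_ents = 0x100000          (2^20 entries)
	 *   machine_to_phys_order   = fls(0x100000 - 1) = 20
	 * i.e. the table covers machine frames 0 .. 2^20 - 1.
	 */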
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 9e565da5d1f7..4ec8035e3216 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h | |||
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void) | |||
22 | unsigned long flags; | 22 | unsigned long flags; |
23 | /* need to disable interrupts until this entry is complete */ | 23 | /* need to disable interrupts until this entry is complete */ |
24 | local_irq_save(flags); | 24 | local_irq_save(flags); |
25 | __get_cpu_var(xen_mc_irq_flags) = flags; | 25 | __this_cpu_write(xen_mc_irq_flags, flags); |
26 | } | 26 | } |
27 | 27 | ||
28 | static inline struct multicall_space xen_mc_entry(size_t args) | 28 | static inline struct multicall_space xen_mc_entry(size_t args) |
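The one-line conversion above is part of the wider __get_cpu_var() to this_cpu-op migration: rather than computing the per-CPU address and storing through it, __this_cpu_write() typically compiles to a single segment-relative store on x86. The same substitution on a hypothetical per-CPU variable:

	DEFINE_PER_CPU(unsigned long, example_flags);	/* file scope */

	/* old style: form the per-cpu address, then store through it */
	__get_cpu_var(example_flags) = flags;

	/* new style: one %fs/%gs-relative store, no address generation */
	__this_cpu_write(example_flags, flags);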
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c new file mode 100644 index 000000000000..fd12d7ce7ff9 --- /dev/null +++ b/arch/x86/xen/p2m.c | |||
@@ -0,0 +1,522 @@ | |||
1 | /* | ||
2 | * Xen leaves the responsibility for maintaining p2m mappings to the | ||
3 | * guests themselves, but it must also access and update the p2m array | ||
4 | * during suspend/resume when all the pages are reallocated. | ||
5 | * | ||
6 | * The p2m table is logically a flat array, but we implement it as a | ||
7 | * three-level tree to allow the address space to be sparse. | ||
8 | * | ||
9 | * Xen | ||
10 | * | | ||
11 | * p2m_top p2m_top_mfn | ||
12 | * / \ / \ | ||
13 | * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn | ||
14 | * / \ / \ / / | ||
15 | * p2m p2m p2m p2m p2m p2m p2m ... | ||
16 | * | ||
17 | * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. | ||
18 | * | ||
19 | * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the | ||
20 | * maximum representable pseudo-physical address space is: | ||
21 | * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages | ||
22 | * | ||
23 | * P2M_PER_PAGE depends on the architecture, as an mfn is always | ||
24 | * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to | ||
25 | * 512 and 1024 entries respectively. | ||
26 | */ | ||
27 | |||
28 | #include <linux/init.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/list.h> | ||
31 | #include <linux/hash.h> | ||
32 | #include <linux/sched.h> | ||
33 | |||
34 | #include <asm/cache.h> | ||
35 | #include <asm/setup.h> | ||
36 | |||
37 | #include <asm/xen/page.h> | ||
38 | #include <asm/xen/hypercall.h> | ||
39 | #include <asm/xen/hypervisor.h> | ||
40 | |||
41 | #include "xen-ops.h" | ||
42 | |||
43 | static void __init m2p_override_init(void); | ||
44 | |||
45 | unsigned long xen_max_p2m_pfn __read_mostly; | ||
46 | |||
47 | #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) | ||
48 | #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) | ||
49 | #define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) | ||
50 | |||
51 | #define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) | ||
52 | |||
53 | /* Placeholders for holes in the address space */ | ||
54 | static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); | ||
55 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); | ||
56 | static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); | ||
57 | |||
58 | static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); | ||
59 | static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); | ||
60 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); | ||
61 | |||
62 | RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
63 | RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
64 | |||
65 | static inline unsigned p2m_top_index(unsigned long pfn) | ||
66 | { | ||
67 | BUG_ON(pfn >= MAX_P2M_PFN); | ||
68 | return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); | ||
69 | } | ||
70 | |||
71 | static inline unsigned p2m_mid_index(unsigned long pfn) | ||
72 | { | ||
73 | return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; | ||
74 | } | ||
75 | |||
76 | static inline unsigned p2m_index(unsigned long pfn) | ||
77 | { | ||
78 | return pfn % P2M_PER_PAGE; | ||
79 | } | ||
80 | |||
81 | static void p2m_top_init(unsigned long ***top) | ||
82 | { | ||
83 | unsigned i; | ||
84 | |||
85 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
86 | top[i] = p2m_mid_missing; | ||
87 | } | ||
88 | |||
89 | static void p2m_top_mfn_init(unsigned long *top) | ||
90 | { | ||
91 | unsigned i; | ||
92 | |||
93 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
94 | top[i] = virt_to_mfn(p2m_mid_missing_mfn); | ||
95 | } | ||
96 | |||
97 | static void p2m_top_mfn_p_init(unsigned long **top) | ||
98 | { | ||
99 | unsigned i; | ||
100 | |||
101 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
102 | top[i] = p2m_mid_missing_mfn; | ||
103 | } | ||
104 | |||
105 | static void p2m_mid_init(unsigned long **mid) | ||
106 | { | ||
107 | unsigned i; | ||
108 | |||
109 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
110 | mid[i] = p2m_missing; | ||
111 | } | ||
112 | |||
113 | static void p2m_mid_mfn_init(unsigned long *mid) | ||
114 | { | ||
115 | unsigned i; | ||
116 | |||
117 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
118 | mid[i] = virt_to_mfn(p2m_missing); | ||
119 | } | ||
120 | |||
121 | static void p2m_init(unsigned long *p2m) | ||
122 | { | ||
123 | unsigned i; | ||
124 | |||
125 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | ||
126 | p2m[i] = INVALID_P2M_ENTRY; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * Build the parallel p2m_top_mfn and p2m_mid_mfn structures | ||
131 | * | ||
132 | * This is called both at boot time, and after resuming from suspend: | ||
133 | * - At boot time we're called very early, and must use extend_brk() | ||
134 | * to allocate memory. | ||
135 | * | ||
136 | * - After resume we're called from within stop_machine, but the mfn | ||
137 | * tree should already be completely allocated. | ||
138 | */ | ||
139 | void xen_build_mfn_list_list(void) | ||
140 | { | ||
141 | unsigned long pfn; | ||
142 | |||
143 | /* Pre-initialize p2m_top_mfn to be completely missing */ | ||
144 | if (p2m_top_mfn == NULL) { | ||
145 | p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
146 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
147 | |||
148 | p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
149 | p2m_top_mfn_p_init(p2m_top_mfn_p); | ||
150 | |||
151 | p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
152 | p2m_top_mfn_init(p2m_top_mfn); | ||
153 | } else { | ||
154 | /* Reinitialise, mfn's all change after migration */ | ||
155 | p2m_mid_mfn_init(p2m_mid_missing_mfn); | ||
156 | } | ||
157 | |||
158 | for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { | ||
159 | unsigned topidx = p2m_top_index(pfn); | ||
160 | unsigned mididx = p2m_mid_index(pfn); | ||
161 | unsigned long **mid; | ||
162 | unsigned long *mid_mfn_p; | ||
163 | |||
164 | mid = p2m_top[topidx]; | ||
165 | mid_mfn_p = p2m_top_mfn_p[topidx]; | ||
166 | |||
167 | /* Don't bother allocating any mfn mid levels if | ||
168 | * they're just missing, just update the stored mfn, | ||
169 | * since all could have changed over a migrate. | ||
170 | */ | ||
171 | if (mid == p2m_mid_missing) { | ||
172 | BUG_ON(mididx); | ||
173 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); | ||
174 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); | ||
175 | pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; | ||
176 | continue; | ||
177 | } | ||
178 | |||
179 | if (mid_mfn_p == p2m_mid_missing_mfn) { | ||
180 | /* | ||
181 | * XXX boot-time only! We should never find | ||
182 | * missing parts of the mfn tree after | ||
183 | * runtime. extend_brk() will BUG if we call | ||
184 | * it too late. | ||
185 | */ | ||
186 | mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
187 | p2m_mid_mfn_init(mid_mfn_p); | ||
188 | |||
189 | p2m_top_mfn_p[topidx] = mid_mfn_p; | ||
190 | } | ||
191 | |||
192 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); | ||
193 | mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); | ||
194 | } | ||
195 | } | ||
196 | |||
197 | void xen_setup_mfn_list_list(void) | ||
198 | { | ||
199 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | ||
200 | |||
201 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | ||
202 | virt_to_mfn(p2m_top_mfn); | ||
203 | HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; | ||
204 | } | ||
205 | |||
206 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ | ||
207 | void __init xen_build_dynamic_phys_to_machine(void) | ||
208 | { | ||
209 | unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; | ||
210 | unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); | ||
211 | unsigned long pfn; | ||
212 | |||
213 | xen_max_p2m_pfn = max_pfn; | ||
214 | |||
215 | p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
216 | p2m_init(p2m_missing); | ||
217 | |||
218 | p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
219 | p2m_mid_init(p2m_mid_missing); | ||
220 | |||
221 | p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
222 | p2m_top_init(p2m_top); | ||
223 | |||
224 | /* | ||
225 | * The domain builder gives us a pre-constructed p2m array in | ||
226 | * mfn_list for all the pages initially given to us, so we just | ||
227 | * need to graft that into our tree structure. | ||
228 | */ | ||
229 | for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { | ||
230 | unsigned topidx = p2m_top_index(pfn); | ||
231 | unsigned mididx = p2m_mid_index(pfn); | ||
232 | |||
233 | if (p2m_top[topidx] == p2m_mid_missing) { | ||
234 | unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
235 | p2m_mid_init(mid); | ||
236 | |||
237 | p2m_top[topidx] = mid; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * As long as the mfn_list has enough entries to completely | ||
242 | * fill a p2m page, pointing into the array is ok. But if | ||
243 | * not, the entries beyond the last pfn will be undefined. | ||
244 | */ | ||
245 | if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { | ||
246 | unsigned long p2midx; | ||
247 | |||
248 | p2midx = max_pfn % P2M_PER_PAGE; | ||
249 | for ( ; p2midx < P2M_PER_PAGE; p2midx++) | ||
250 | mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; | ||
251 | } | ||
252 | p2m_top[topidx][mididx] = &mfn_list[pfn]; | ||
253 | } | ||
254 | |||
255 | m2p_override_init(); | ||
256 | } | ||
257 | |||
258 | unsigned long get_phys_to_machine(unsigned long pfn) | ||
259 | { | ||
260 | unsigned topidx, mididx, idx; | ||
261 | |||
262 | if (unlikely(pfn >= MAX_P2M_PFN)) | ||
263 | return INVALID_P2M_ENTRY; | ||
264 | |||
265 | topidx = p2m_top_index(pfn); | ||
266 | mididx = p2m_mid_index(pfn); | ||
267 | idx = p2m_index(pfn); | ||
268 | |||
269 | return p2m_top[topidx][mididx][idx]; | ||
270 | } | ||
271 | EXPORT_SYMBOL_GPL(get_phys_to_machine); | ||
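Tying the index helpers together, a worked example for the 64-bit layout described in the header comment (P2M_PER_PAGE = P2M_MID_PER_PAGE = 512; the pfn value is arbitrary):

	/*
	 * get_phys_to_machine(0x12345) on 64-bit:
	 *   topidx = 0x12345 / (512 * 512) = 0
	 *   mididx = (0x12345 / 512) % 512 = 145
	 *   idx    = 0x12345 % 512        = 325
	 * so the mfn is read from p2m_top[0][145][325].
	 */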
272 | |||
273 | static void *alloc_p2m_page(void) | ||
274 | { | ||
275 | return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); | ||
276 | } | ||
277 | |||
278 | static void free_p2m_page(void *p) | ||
279 | { | ||
280 | free_page((unsigned long)p); | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * Fully allocate the p2m structure for a given pfn. We need to check | ||
285 | * that both the top and mid levels are allocated, and make sure the | ||
286 | * parallel mfn tree is kept in sync. We may race with other cpus, so | ||
287 | * the new pages are installed with cmpxchg; if we lose the race then | ||
288 | * simply free the page we allocated and use the one that's there. | ||
289 | */ | ||
290 | static bool alloc_p2m(unsigned long pfn) | ||
291 | { | ||
292 | unsigned topidx, mididx; | ||
293 | unsigned long ***top_p, **mid; | ||
294 | unsigned long *top_mfn_p, *mid_mfn; | ||
295 | |||
296 | topidx = p2m_top_index(pfn); | ||
297 | mididx = p2m_mid_index(pfn); | ||
298 | |||
299 | top_p = &p2m_top[topidx]; | ||
300 | mid = *top_p; | ||
301 | |||
302 | if (mid == p2m_mid_missing) { | ||
303 | /* Mid level is missing, allocate a new one */ | ||
304 | mid = alloc_p2m_page(); | ||
305 | if (!mid) | ||
306 | return false; | ||
307 | |||
308 | p2m_mid_init(mid); | ||
309 | |||
310 | if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) | ||
311 | free_p2m_page(mid); | ||
312 | } | ||
313 | |||
314 | top_mfn_p = &p2m_top_mfn[topidx]; | ||
315 | mid_mfn = p2m_top_mfn_p[topidx]; | ||
316 | |||
317 | BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); | ||
318 | |||
319 | if (mid_mfn == p2m_mid_missing_mfn) { | ||
320 | /* Separately check the mid mfn level */ | ||
321 | unsigned long missing_mfn; | ||
322 | unsigned long mid_mfn_mfn; | ||
323 | |||
324 | mid_mfn = alloc_p2m_page(); | ||
325 | if (!mid_mfn) | ||
326 | return false; | ||
327 | |||
328 | p2m_mid_mfn_init(mid_mfn); | ||
329 | |||
330 | missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); | ||
331 | mid_mfn_mfn = virt_to_mfn(mid_mfn); | ||
332 | if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) | ||
333 | free_p2m_page(mid_mfn); | ||
334 | else | ||
335 | p2m_top_mfn_p[topidx] = mid_mfn; | ||
336 | } | ||
337 | |||
338 | if (p2m_top[topidx][mididx] == p2m_missing) { | ||
339 | /* p2m leaf page is missing */ | ||
340 | unsigned long *p2m; | ||
341 | |||
342 | p2m = alloc_p2m_page(); | ||
343 | if (!p2m) | ||
344 | return false; | ||
345 | |||
346 | p2m_init(p2m); | ||
347 | |||
348 | if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) | ||
349 | free_p2m_page(p2m); | ||
350 | else | ||
351 | mid_mfn[mididx] = virt_to_mfn(p2m); | ||
352 | } | ||
353 | |||
354 | return true; | ||
355 | } | ||
356 | |||
357 | /* Try to install a p2m mapping; fail if the backing p2m page is missing */ | ||
358 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
359 | { | ||
360 | unsigned topidx, mididx, idx; | ||
361 | |||
362 | if (unlikely(pfn >= MAX_P2M_PFN)) { | ||
363 | BUG_ON(mfn != INVALID_P2M_ENTRY); | ||
364 | return true; | ||
365 | } | ||
366 | |||
367 | topidx = p2m_top_index(pfn); | ||
368 | mididx = p2m_mid_index(pfn); | ||
369 | idx = p2m_index(pfn); | ||
370 | |||
371 | if (p2m_top[topidx][mididx] == p2m_missing) | ||
372 | return mfn == INVALID_P2M_ENTRY; | ||
373 | |||
374 | p2m_top[topidx][mididx][idx] = mfn; | ||
375 | |||
376 | return true; | ||
377 | } | ||
378 | |||
379 | bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
380 | { | ||
381 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { | ||
382 | BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); | ||
383 | return true; | ||
384 | } | ||
385 | |||
386 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | ||
387 | if (!alloc_p2m(pfn)) | ||
388 | return false; | ||
389 | |||
390 | if (!__set_phys_to_machine(pfn, mfn)) | ||
391 | return false; | ||
392 | } | ||
393 | |||
394 | return true; | ||
395 | } | ||
396 | |||
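Editorial note, not part of the patch: a hypothetical caller pattern, relying on set_phys_to_machine() to grow missing p2m levels on demand and treating allocation failure as fatal, as users of this API typically would.

        /* Hypothetical helper: record the frame now backing this pfn.
         * alloc_p2m() only fails when the kernel cannot allocate a page. */
        static void record_new_frame(unsigned long pfn, unsigned long mfn)
        {
                if (!set_phys_to_machine(pfn, mfn))
                        BUG();
        }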
397 | #define M2P_OVERRIDE_HASH_SHIFT 10 | ||
398 | #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) | ||
399 | |||
400 | static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); | ||
401 | static DEFINE_SPINLOCK(m2p_override_lock); | ||
402 | |||
403 | static void __init m2p_override_init(void) | ||
404 | { | ||
405 | unsigned i; | ||
406 | |||
407 | m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, | ||
408 | sizeof(unsigned long)); | ||
409 | |||
410 | for (i = 0; i < M2P_OVERRIDE_HASH; i++) | ||
411 | INIT_LIST_HEAD(&m2p_overrides[i]); | ||
412 | } | ||
413 | |||
414 | static unsigned long mfn_hash(unsigned long mfn) | ||
415 | { | ||
416 | return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); | ||
417 | } | ||
418 | |||
419 | /* Add an MFN override for a particular page */ | ||
420 | int m2p_add_override(unsigned long mfn, struct page *page) | ||
421 | { | ||
422 | unsigned long flags; | ||
423 | unsigned long pfn; | ||
424 | unsigned long address; | ||
425 | unsigned level; | ||
426 | pte_t *ptep = NULL; | ||
427 | |||
428 | pfn = page_to_pfn(page); | ||
429 | if (!PageHighMem(page)) { | ||
430 | address = (unsigned long)__va(pfn << PAGE_SHIFT); | ||
431 | ptep = lookup_address(address, &level); | ||
432 | |||
433 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, | ||
434 | "m2p_add_override: pfn %lx not mapped", pfn)) | ||
435 | return -EINVAL; | ||
436 | } | ||
437 | |||
438 | page->private = mfn; | ||
439 | page->index = pfn_to_mfn(pfn); | ||
440 | |||
441 | __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); | ||
442 | if (!PageHighMem(page)) | ||
443 | /* Just zap old mapping for now */ | ||
444 | pte_clear(&init_mm, address, ptep); | ||
445 | |||
446 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
447 | list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); | ||
448 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
449 | |||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | int m2p_remove_override(struct page *page) | ||
454 | { | ||
455 | unsigned long flags; | ||
456 | unsigned long mfn; | ||
457 | unsigned long pfn; | ||
458 | unsigned long address; | ||
459 | unsigned level; | ||
460 | pte_t *ptep = NULL; | ||
461 | |||
462 | pfn = page_to_pfn(page); | ||
463 | mfn = get_phys_to_machine(pfn); | ||
464 | if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) | ||
465 | return -EINVAL; | ||
466 | |||
467 | if (!PageHighMem(page)) { | ||
468 | address = (unsigned long)__va(pfn << PAGE_SHIFT); | ||
469 | ptep = lookup_address(address, &level); | ||
470 | |||
471 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, | ||
472 | "m2p_remove_override: pfn %lx not mapped", pfn)) | ||
473 | return -EINVAL; | ||
474 | } | ||
475 | |||
476 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
477 | list_del(&page->lru); | ||
478 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
479 | __set_phys_to_machine(pfn, page->index); | ||
480 | |||
481 | if (!PageHighMem(page)) | ||
482 | set_pte_at(&init_mm, address, ptep, | ||
483 | pfn_pte(pfn, PAGE_KERNEL)); | ||
484 | /* No TLB flush necessary because the caller already | ||
485 | * left the pte unmapped. */ | ||
486 | |||
487 | return 0; | ||
488 | } | ||
489 | |||
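Editorial sketch, not part of the patch: the add/remove pair above is meant to bracket the lifetime of a grant mapping, so that while a foreign mfn is mapped over a local page the reverse (m2p) lookup resolves to that local page. A hypothetical call site:

        /* Hypothetical gntdev-style user; error handling trimmed for brevity. */
        static int with_foreign_frame(unsigned long foreign_mfn, struct page *page)
        {
                int ret = m2p_add_override(foreign_mfn, page);
                if (ret)
                        return ret;

                /* ... use the page while the grant mapping is active ... */

                return m2p_remove_override(page);
        }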
490 | struct page *m2p_find_override(unsigned long mfn) | ||
491 | { | ||
492 | unsigned long flags; | ||
493 | struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)]; | ||
494 | struct page *p, *ret; | ||
495 | |||
496 | ret = NULL; | ||
497 | |||
498 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
499 | |||
500 | list_for_each_entry(p, bucket, lru) { | ||
501 | if (p->private == mfn) { | ||
502 | ret = p; | ||
503 | break; | ||
504 | } | ||
505 | } | ||
506 | |||
507 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
508 | |||
509 | return ret; | ||
510 | } | ||
511 | |||
512 | unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) | ||
513 | { | ||
514 | struct page *p = m2p_find_override(mfn); | ||
515 | unsigned long ret = pfn; | ||
516 | |||
517 | if (p) | ||
518 | ret = page_to_pfn(p); | ||
519 | |||
520 | return ret; | ||
521 | } | ||
522 | EXPORT_SYMBOL_GPL(m2p_find_override_pfn); | ||
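Editorial sketch, not part of the patch: m2p_find_override_pfn() is intended to sit on the machine-to-physical lookup path, so a reverse translation prefers the overriding local page when one has been registered. Roughly, assuming the machine_to_phys_mapping table declared elsewhere in the Xen headers:

        static unsigned long mfn_to_pfn_sketch(unsigned long mfn)
        {
                unsigned long pfn = machine_to_phys_mapping[mfn];   /* raw M2P table */

                /* If a local pfn claims this mfn via m2p_add_override(),
                 * return that pfn instead of what the M2P table says. */
                return m2p_find_override_pfn(mfn, pfn);
        }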
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0f456386cce5..25c52f94a27c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c | |||
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void) | |||
68 | return 0; | 68 | return 0; |
69 | } | 69 | } |
70 | 70 | ||
71 | void __init xen_unplug_emulated_devices(void) | 71 | void xen_unplug_emulated_devices(void) |
72 | { | 72 | { |
73 | int r; | 73 | int r; |
74 | 74 | ||
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b1dbdaa23ecc..a8a66a50d446 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -23,7 +23,6 @@ | |||
23 | #include <xen/interface/callback.h> | 23 | #include <xen/interface/callback.h> |
24 | #include <xen/interface/memory.h> | 24 | #include <xen/interface/memory.h> |
25 | #include <xen/interface/physdev.h> | 25 | #include <xen/interface/physdev.h> |
26 | #include <xen/interface/memory.h> | ||
27 | #include <xen/features.h> | 26 | #include <xen/features.h> |
28 | 27 | ||
29 | #include "xen-ops.h" | 28 | #include "xen-ops.h" |
@@ -118,16 +117,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, | |||
118 | const struct e820map *e820) | 117 | const struct e820map *e820) |
119 | { | 118 | { |
120 | phys_addr_t max_addr = PFN_PHYS(max_pfn); | 119 | phys_addr_t max_addr = PFN_PHYS(max_pfn); |
121 | phys_addr_t last_end = 0; | 120 | phys_addr_t last_end = ISA_END_ADDRESS; |
122 | unsigned long released = 0; | 121 | unsigned long released = 0; |
123 | int i; | 122 | int i; |
124 | 123 | ||
124 | /* Free any unused memory above the low 1Mbyte. */ | ||
125 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { | 125 | for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { |
126 | phys_addr_t end = e820->map[i].addr; | 126 | phys_addr_t end = e820->map[i].addr; |
127 | end = min(max_addr, end); | 127 | end = min(max_addr, end); |
128 | 128 | ||
129 | released += xen_release_chunk(last_end, end); | 129 | if (last_end < end) |
130 | last_end = e820->map[i].addr + e820->map[i].size; | 130 | released += xen_release_chunk(last_end, end); |
131 | last_end = max(last_end, e820->map[i].addr + e820->map[i].size); | ||
131 | } | 132 | } |
132 | 133 | ||
133 | if (last_end < max_addr) | 134 | if (last_end < max_addr) |
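Editorial worked example, not part of the patch: take a hypothetical map with RAM at [0x0, 0x9f000) and [0x100000, ...). Because last_end now starts at ISA_END_ADDRESS (0x100000), the 0x9f000-0x100000 hole below 1 MB is never handed back via xen_release_chunk(); the added last_end < end check and the max() on last_end also keep out-of-order or overlapping entries from producing an inverted (end below start) release range.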
@@ -164,6 +165,7 @@ char * __init xen_memory_setup(void) | |||
164 | XENMEM_memory_map; | 165 | XENMEM_memory_map; |
165 | rc = HYPERVISOR_memory_op(op, &memmap); | 166 | rc = HYPERVISOR_memory_op(op, &memmap); |
166 | if (rc == -ENOSYS) { | 167 | if (rc == -ENOSYS) { |
168 | BUG_ON(xen_initial_domain()); | ||
167 | memmap.nr_entries = 1; | 169 | memmap.nr_entries = 1; |
168 | map[0].addr = 0ULL; | 170 | map[0].addr = 0ULL; |
169 | map[0].size = mem_end; | 171 | map[0].size = mem_end; |
@@ -177,36 +179,39 @@ char * __init xen_memory_setup(void) | |||
177 | e820.nr_map = 0; | 179 | e820.nr_map = 0; |
178 | xen_extra_mem_start = mem_end; | 180 | xen_extra_mem_start = mem_end; |
179 | for (i = 0; i < memmap.nr_entries; i++) { | 181 | for (i = 0; i < memmap.nr_entries; i++) { |
180 | unsigned long long end = map[i].addr + map[i].size; | 182 | unsigned long long end; |
183 | |||
184 | /* Guard against non-page aligned E820 entries. */ | ||
185 | if (map[i].type == E820_RAM) | ||
186 | map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE; | ||
181 | 187 | ||
182 | if (map[i].type == E820_RAM) { | 188 | end = map[i].addr + map[i].size; |
183 | if (map[i].addr < mem_end && end > mem_end) { | 189 | if (map[i].type == E820_RAM && end > mem_end) { |
184 | /* Truncate region to max_mem. */ | 190 | /* RAM off the end - may be partially included */ |
185 | u64 delta = end - mem_end; | 191 | u64 delta = min(map[i].size, end - mem_end); |
186 | 192 | ||
187 | map[i].size -= delta; | 193 | map[i].size -= delta; |
188 | extra_pages += PFN_DOWN(delta); | 194 | end -= delta; |
189 | 195 | ||
190 | end = mem_end; | 196 | extra_pages += PFN_DOWN(delta); |
191 | } | ||
192 | } | 197 | } |
193 | 198 | ||
194 | if (end > xen_extra_mem_start) | 199 | if (map[i].size > 0 && end > xen_extra_mem_start) |
195 | xen_extra_mem_start = end; | 200 | xen_extra_mem_start = end; |
196 | 201 | ||
197 | /* If region is non-RAM or below mem_end, add what remains */ | 202 | /* Add region if any remains */ |
198 | if ((map[i].type != E820_RAM || map[i].addr < mem_end) && | 203 | if (map[i].size > 0) |
199 | map[i].size > 0) | ||
200 | e820_add_region(map[i].addr, map[i].size, map[i].type); | 204 | e820_add_region(map[i].addr, map[i].size, map[i].type); |
201 | } | 205 | } |
202 | 206 | ||
203 | /* | 207 | /* |
204 | * Even though this is normal, usable memory under Xen, reserve | 208 | * In domU, the ISA region is normal, usable memory, but we |
205 | * ISA memory anyway because too many things think they can poke | 209 | * reserve ISA memory anyway because too many things poke |
206 | * about in there. | 210 | * about in there. |
207 | * | 211 | * |
208 | * In a dom0 kernel, this region is identity mapped with the | 212 | * In Dom0, the host E820 information can leave gaps in the |
209 | * hardware ISA area, so it really is out of bounds. | 213 | * ISA range, which would cause us to release those pages. To |
214 | * avoid this, we unconditionally reserve them here. | ||
210 | */ | 215 | */ |
211 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, | 216 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, |
212 | E820_RESERVED); | 217 | E820_RESERVED); |
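Editorial worked example, not part of the patch: the alignment guard clips a RAM entry so that it ends on a page boundary. For a hypothetical entry with addr = 0x1000 and size = 0x1fff, (size + addr) % PAGE_SIZE = 0xfff, so the size is trimmed to 0x1000 and the entry now ends at 0x2000; without the clip, the partially backed tail page would be registered as usable RAM.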
@@ -244,26 +249,11 @@ char * __init xen_memory_setup(void) | |||
244 | else | 249 | else |
245 | extra_pages = 0; | 250 | extra_pages = 0; |
246 | 251 | ||
247 | if (!xen_initial_domain()) | 252 | xen_add_extra_mem(extra_pages); |
248 | xen_add_extra_mem(extra_pages); | ||
249 | 253 | ||
250 | return "Xen"; | 254 | return "Xen"; |
251 | } | 255 | } |
252 | 256 | ||
253 | static void xen_idle(void) | ||
254 | { | ||
255 | local_irq_disable(); | ||
256 | |||
257 | if (need_resched()) | ||
258 | local_irq_enable(); | ||
259 | else { | ||
260 | current_thread_info()->status &= ~TS_POLLING; | ||
261 | smp_mb__after_clear_bit(); | ||
262 | safe_halt(); | ||
263 | current_thread_info()->status |= TS_POLLING; | ||
264 | } | ||
265 | } | ||
266 | |||
267 | /* | 257 | /* |
268 | * Set the bit indicating "nosegneg" library variants should be used. | 258 | * Set the bit indicating "nosegneg" library variants should be used. |
269 | * We only need to bother in pure 32-bit mode; compat 32-bit processes | 259 | * We only need to bother in pure 32-bit mode; compat 32-bit processes |
@@ -333,9 +323,6 @@ void __cpuinit xen_enable_syscall(void) | |||
333 | 323 | ||
334 | void __init xen_arch_setup(void) | 324 | void __init xen_arch_setup(void) |
335 | { | 325 | { |
336 | struct physdev_set_iopl set_iopl; | ||
337 | int rc; | ||
338 | |||
339 | xen_panic_handler_init(); | 326 | xen_panic_handler_init(); |
340 | 327 | ||
341 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); | 328 | HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); |
@@ -352,11 +339,6 @@ void __init xen_arch_setup(void) | |||
352 | xen_enable_sysenter(); | 339 | xen_enable_sysenter(); |
353 | xen_enable_syscall(); | 340 | xen_enable_syscall(); |
354 | 341 | ||
355 | set_iopl.iopl = 1; | ||
356 | rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); | ||
357 | if (rc != 0) | ||
358 | printk(KERN_INFO "physdev_op failed %d\n", rc); | ||
359 | |||
360 | #ifdef CONFIG_ACPI | 342 | #ifdef CONFIG_ACPI |
361 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { | 343 | if (!(xen_start_info->flags & SIF_INITDOMAIN)) { |
362 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); | 344 | printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); |
@@ -368,7 +350,12 @@ void __init xen_arch_setup(void) | |||
368 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? | 350 | MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? |
369 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); | 351 | COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); |
370 | 352 | ||
371 | pm_idle = xen_idle; | 353 | /* Set up idle, making sure it calls safe_halt() pvop */ |
354 | #ifdef CONFIG_X86_32 | ||
355 | boot_cpu_data.hlt_works_ok = 1; | ||
356 | #endif | ||
357 | pm_idle = default_idle; | ||
358 | boot_option_idle_override = IDLE_HALT; | ||
372 | 359 | ||
373 | fiddle_vdso(); | 360 | fiddle_vdso(); |
374 | } | 361 | } |
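Editorial sketch, not part of the patch: with pm_idle = default_idle and boot_option_idle_override = IDLE_HALT, the common idle loop ends in safe_halt(), which under CONFIG_PARAVIRT is routed through pv_irq_ops. For a Xen PV guest that hook is, roughly, a blocking hypercall, which is why the dedicated xen_idle() above can be removed:

        /* Roughly what the Xen safe_halt pvop does (cf. arch/x86/xen/irq.c). */
        static void xen_safe_halt(void)
        {
                /* Blocking the vcpu implicitly re-enables event delivery. */
                if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
                        BUG();
        }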
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23e061b9327b..cc9b1e182fcf 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c | |||
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) | |||
159 | { | 159 | { |
160 | struct xen_spinlock *prev; | 160 | struct xen_spinlock *prev; |
161 | 161 | ||
162 | prev = __get_cpu_var(lock_spinners); | 162 | prev = __this_cpu_read(lock_spinners); |
163 | __get_cpu_var(lock_spinners) = xl; | 163 | __this_cpu_write(lock_spinners, xl); |
164 | 164 | ||
165 | wmb(); /* set lock of interest before count */ | 165 | wmb(); /* set lock of interest before count */ |
166 | 166 | ||
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock | |||
179 | asm(LOCK_PREFIX " decw %0" | 179 | asm(LOCK_PREFIX " decw %0" |
180 | : "+m" (xl->spinners) : : "memory"); | 180 | : "+m" (xl->spinners) : : "memory"); |
181 | wmb(); /* decrement count before restoring lock */ | 181 | wmb(); /* decrement count before restoring lock */ |
182 | __get_cpu_var(lock_spinners) = prev; | 182 | __this_cpu_write(lock_spinners, prev); |
183 | } | 183 | } |
184 | 184 | ||
185 | static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) | 185 | static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) |
186 | { | 186 | { |
187 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | 187 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; |
188 | struct xen_spinlock *prev; | 188 | struct xen_spinlock *prev; |
189 | int irq = __get_cpu_var(lock_kicker_irq); | 189 | int irq = __this_cpu_read(lock_kicker_irq); |
190 | int ret; | 190 | int ret; |
191 | u64 start; | 191 | u64 start; |
192 | 192 | ||
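Editorial note, not part of the patch: the __get_cpu_var() to __this_cpu_read()/__this_cpu_write() conversions in this file, and the matching ones in time.c below, are behaviour-preserving. The this_cpu accessors tell the compiler the access targets the local CPU's copy, so on x86 it can emit a single segment-relative move instead of first computing the per-cpu address and then dereferencing it.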
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d789d56877c..9bbd63a129b5 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c | |||
@@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled) | |||
31 | int cpu; | 31 | int cpu; |
32 | xen_hvm_init_shared_info(); | 32 | xen_hvm_init_shared_info(); |
33 | xen_callback_vector(); | 33 | xen_callback_vector(); |
34 | xen_unplug_emulated_devices(); | ||
34 | if (xen_feature(XENFEAT_hvm_safe_pvclock)) { | 35 | if (xen_feature(XENFEAT_hvm_safe_pvclock)) { |
35 | for_each_online_cpu(cpu) { | 36 | for_each_online_cpu(cpu) { |
36 | xen_setup_runstate_info(cpu); | 37 | xen_setup_runstate_info(cpu); |
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b2bb5aa3b054..067759e3d6a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -135,24 +135,24 @@ static void do_stolen_accounting(void) | |||
135 | 135 | ||
136 | /* Add the appropriate number of ticks of stolen time, | 136 | /* Add the appropriate number of ticks of stolen time, |
137 | including any left-overs from last time. */ | 137 | including any left-overs from last time. */ |
138 | stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); | 138 | stolen = runnable + offline + __this_cpu_read(xen_residual_stolen); |
139 | 139 | ||
140 | if (stolen < 0) | 140 | if (stolen < 0) |
141 | stolen = 0; | 141 | stolen = 0; |
142 | 142 | ||
143 | ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); | 143 | ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); |
144 | __get_cpu_var(xen_residual_stolen) = stolen; | 144 | __this_cpu_write(xen_residual_stolen, stolen); |
145 | account_steal_ticks(ticks); | 145 | account_steal_ticks(ticks); |
146 | 146 | ||
147 | /* Add the appropriate number of ticks of blocked time, | 147 | /* Add the appropriate number of ticks of blocked time, |
148 | including any left-overs from last time. */ | 148 | including any left-overs from last time. */ |
149 | blocked += __get_cpu_var(xen_residual_blocked); | 149 | blocked += __this_cpu_read(xen_residual_blocked); |
150 | 150 | ||
151 | if (blocked < 0) | 151 | if (blocked < 0) |
152 | blocked = 0; | 152 | blocked = 0; |
153 | 153 | ||
154 | ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); | 154 | ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); |
155 | __get_cpu_var(xen_residual_blocked) = blocked; | 155 | __this_cpu_write(xen_residual_blocked, blocked); |
156 | account_idle_ticks(ticks); | 156 | account_idle_ticks(ticks); |
157 | } | 157 | } |
158 | 158 | ||
@@ -426,6 +426,8 @@ void xen_timer_resume(void) | |||
426 | { | 426 | { |
427 | int cpu; | 427 | int cpu; |
428 | 428 | ||
429 | pvclock_resume(); | ||
430 | |||
429 | if (xen_clockevent != &xen_vcpuop_clockevent) | 431 | if (xen_clockevent != &xen_vcpuop_clockevent) |
430 | return; | 432 | return; |
431 | 433 | ||
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 64044747348e..9d41bf985757 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -43,7 +43,7 @@ void xen_vcpu_restore(void); | |||
43 | 43 | ||
44 | void xen_callback_vector(void); | 44 | void xen_callback_vector(void); |
45 | void xen_hvm_init_shared_info(void); | 45 | void xen_hvm_init_shared_info(void); |
46 | void __init xen_unplug_emulated_devices(void); | 46 | void xen_unplug_emulated_devices(void); |
47 | 47 | ||
48 | void __init xen_build_dynamic_phys_to_machine(void); | 48 | void __init xen_build_dynamic_phys_to_machine(void); |
49 | 49 | ||