diff options
Diffstat (limited to 'arch/x86')
290 files changed, 11083 insertions, 9107 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d..8da93745c08 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -24,7 +24,7 @@ config X86 | |||
24 | select HAVE_UNSTABLE_SCHED_CLOCK | 24 | select HAVE_UNSTABLE_SCHED_CLOCK |
25 | select HAVE_IDE | 25 | select HAVE_IDE |
26 | select HAVE_OPROFILE | 26 | select HAVE_OPROFILE |
27 | select HAVE_PERF_COUNTERS if (!M386 && !M486) | 27 | select HAVE_PERF_EVENTS if (!M386 && !M486) |
28 | select HAVE_IOREMAP_PROT | 28 | select HAVE_IOREMAP_PROT |
29 | select HAVE_KPROBES | 29 | select HAVE_KPROBES |
30 | select ARCH_WANT_OPTIONAL_GPIOLIB | 30 | select ARCH_WANT_OPTIONAL_GPIOLIB |
@@ -38,7 +38,7 @@ config X86 | |||
38 | select HAVE_FUNCTION_GRAPH_FP_TEST | 38 | select HAVE_FUNCTION_GRAPH_FP_TEST |
39 | select HAVE_FUNCTION_TRACE_MCOUNT_TEST | 39 | select HAVE_FUNCTION_TRACE_MCOUNT_TEST |
40 | select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE | 40 | select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE |
41 | select HAVE_FTRACE_SYSCALLS | 41 | select HAVE_SYSCALL_TRACEPOINTS |
42 | select HAVE_KVM | 42 | select HAVE_KVM |
43 | select HAVE_ARCH_KGDB | 43 | select HAVE_ARCH_KGDB |
44 | select HAVE_ARCH_TRACEHOOK | 44 | select HAVE_ARCH_TRACEHOOK |
@@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE | |||
150 | config HAVE_SETUP_PER_CPU_AREA | 150 | config HAVE_SETUP_PER_CPU_AREA |
151 | def_bool y | 151 | def_bool y |
152 | 152 | ||
153 | config HAVE_DYNAMIC_PER_CPU_AREA | 153 | config NEED_PER_CPU_EMBED_FIRST_CHUNK |
154 | def_bool y | ||
155 | |||
156 | config NEED_PER_CPU_PAGE_FIRST_CHUNK | ||
154 | def_bool y | 157 | def_bool y |
155 | 158 | ||
156 | config HAVE_CPUMASK_OF_CPU_MAP | 159 | config HAVE_CPUMASK_OF_CPU_MAP |
@@ -179,6 +182,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING | |||
179 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC | 182 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC |
180 | def_bool y | 183 | def_bool y |
181 | 184 | ||
185 | config HAVE_INTEL_TXT | ||
186 | def_bool y | ||
187 | depends on EXPERIMENTAL && DMAR && ACPI | ||
188 | |||
182 | # Use the generic interrupt handling code in kernel/irq/: | 189 | # Use the generic interrupt handling code in kernel/irq/: |
183 | config GENERIC_HARDIRQS | 190 | config GENERIC_HARDIRQS |
184 | bool | 191 | bool |
@@ -318,6 +325,7 @@ config X86_EXTENDED_PLATFORM | |||
318 | SGI 320/540 (Visual Workstation) | 325 | SGI 320/540 (Visual Workstation) |
319 | Summit/EXA (IBM x440) | 326 | Summit/EXA (IBM x440) |
320 | Unisys ES7000 IA32 series | 327 | Unisys ES7000 IA32 series |
328 | Moorestown MID devices | ||
321 | 329 | ||
322 | If you have one of these systems, or if you want to build a | 330 | If you have one of these systems, or if you want to build a |
323 | generic distribution kernel, say Y here - otherwise say N. | 331 | generic distribution kernel, say Y here - otherwise say N. |
@@ -377,6 +385,18 @@ config X86_ELAN | |||
377 | 385 | ||
378 | If unsure, choose "PC-compatible" instead. | 386 | If unsure, choose "PC-compatible" instead. |
379 | 387 | ||
388 | config X86_MRST | ||
389 | bool "Moorestown MID platform" | ||
390 | depends on X86_32 | ||
391 | depends on X86_EXTENDED_PLATFORM | ||
392 | ---help--- | ||
393 | Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin | ||
394 | Internet Device(MID) platform. Moorestown consists of two chips: | ||
395 | Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. | ||
396 | Unlike standard x86 PCs, Moorestown does not have many legacy devices | ||
397 | nor standard legacy replacement devices/features. e.g. Moorestown does | ||
398 | not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. | ||
399 | |||
380 | config X86_RDC321X | 400 | config X86_RDC321X |
381 | bool "RDC R-321x SoC" | 401 | bool "RDC R-321x SoC" |
382 | depends on X86_32 | 402 | depends on X86_32 |
@@ -412,6 +432,17 @@ config X86_NUMAQ | |||
412 | of Flat Logical. You will need a new lynxer.elf file to flash your | 432 | of Flat Logical. You will need a new lynxer.elf file to flash your |
413 | firmware with - send email to <Martin.Bligh@us.ibm.com>. | 433 | firmware with - send email to <Martin.Bligh@us.ibm.com>. |
414 | 434 | ||
435 | config X86_SUPPORTS_MEMORY_FAILURE | ||
436 | bool | ||
437 | # MCE code calls memory_failure(): | ||
438 | depends on X86_MCE | ||
439 | # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags: | ||
440 | depends on !X86_NUMAQ | ||
441 | # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH: | ||
442 | depends on X86_64 || !SPARSEMEM | ||
443 | select ARCH_SUPPORTS_MEMORY_FAILURE | ||
444 | default y | ||
445 | |||
415 | config X86_VISWS | 446 | config X86_VISWS |
416 | bool "SGI 320/540 (Visual Workstation)" | 447 | bool "SGI 320/540 (Visual Workstation)" |
417 | depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT | 448 | depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT |
@@ -586,7 +617,6 @@ config GART_IOMMU | |||
586 | bool "GART IOMMU support" if EMBEDDED | 617 | bool "GART IOMMU support" if EMBEDDED |
587 | default y | 618 | default y |
588 | select SWIOTLB | 619 | select SWIOTLB |
589 | select AGP | ||
590 | depends on X86_64 && PCI | 620 | depends on X86_64 && PCI |
591 | ---help--- | 621 | ---help--- |
592 | Support for full DMA access of devices with 32bit memory access only | 622 | Support for full DMA access of devices with 32bit memory access only |
@@ -777,41 +807,17 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS | |||
777 | increased on these systems. | 807 | increased on these systems. |
778 | 808 | ||
779 | config X86_MCE | 809 | config X86_MCE |
780 | bool "Machine Check Exception" | 810 | bool "Machine Check / overheating reporting" |
781 | ---help--- | 811 | ---help--- |
782 | Machine Check Exception support allows the processor to notify the | 812 | Machine Check support allows the processor to notify the |
783 | kernel if it detects a problem (e.g. overheating, component failure). | 813 | kernel if it detects a problem (e.g. overheating, data corruption). |
784 | The action the kernel takes depends on the severity of the problem, | 814 | The action the kernel takes depends on the severity of the problem, |
785 | ranging from a warning message on the console, to halting the machine. | 815 | ranging from warning messages to halting the machine. |
786 | Your processor must be a Pentium or newer to support this - check the | ||
787 | flags in /proc/cpuinfo for mce. Note that some older Pentium systems | ||
788 | have a design flaw which leads to false MCE events - hence MCE is | ||
789 | disabled on all P5 processors, unless explicitly enabled with "mce" | ||
790 | as a boot argument. Similarly, if MCE is built in and creates a | ||
791 | problem on some new non-standard machine, you can boot with "nomce" | ||
792 | to disable it. MCE support simply ignores non-MCE processors like | ||
793 | the 386 and 486, so nearly everyone can say Y here. | ||
794 | |||
795 | config X86_OLD_MCE | ||
796 | depends on X86_32 && X86_MCE | ||
797 | bool "Use legacy machine check code (will go away)" | ||
798 | default n | ||
799 | select X86_ANCIENT_MCE | ||
800 | ---help--- | ||
801 | Use the old i386 machine check code. This is merely intended for | ||
802 | testing in a transition period. Try this if you run into any machine | ||
803 | check related software problems, but report the problem to | ||
804 | linux-kernel. When in doubt say no. | ||
805 | |||
806 | config X86_NEW_MCE | ||
807 | depends on X86_MCE | ||
808 | bool | ||
809 | default y if (!X86_OLD_MCE && X86_32) || X86_64 | ||
810 | 816 | ||
811 | config X86_MCE_INTEL | 817 | config X86_MCE_INTEL |
812 | def_bool y | 818 | def_bool y |
813 | prompt "Intel MCE features" | 819 | prompt "Intel MCE features" |
814 | depends on X86_NEW_MCE && X86_LOCAL_APIC | 820 | depends on X86_MCE && X86_LOCAL_APIC |
815 | ---help--- | 821 | ---help--- |
816 | Additional support for intel specific MCE features such as | 822 | Additional support for intel specific MCE features such as |
817 | the thermal monitor. | 823 | the thermal monitor. |
@@ -819,14 +825,14 @@ config X86_MCE_INTEL | |||
819 | config X86_MCE_AMD | 825 | config X86_MCE_AMD |
820 | def_bool y | 826 | def_bool y |
821 | prompt "AMD MCE features" | 827 | prompt "AMD MCE features" |
822 | depends on X86_NEW_MCE && X86_LOCAL_APIC | 828 | depends on X86_MCE && X86_LOCAL_APIC |
823 | ---help--- | 829 | ---help--- |
824 | Additional support for AMD specific MCE features such as | 830 | Additional support for AMD specific MCE features such as |
825 | the DRAM Error Threshold. | 831 | the DRAM Error Threshold. |
826 | 832 | ||
827 | config X86_ANCIENT_MCE | 833 | config X86_ANCIENT_MCE |
828 | def_bool n | 834 | def_bool n |
829 | depends on X86_32 | 835 | depends on X86_32 && X86_MCE |
830 | prompt "Support for old Pentium 5 / WinChip machine checks" | 836 | prompt "Support for old Pentium 5 / WinChip machine checks" |
831 | ---help--- | 837 | ---help--- |
832 | Include support for machine check handling on old Pentium 5 or WinChip | 838 | Include support for machine check handling on old Pentium 5 or WinChip |
@@ -839,36 +845,16 @@ config X86_MCE_THRESHOLD | |||
839 | default y | 845 | default y |
840 | 846 | ||
841 | config X86_MCE_INJECT | 847 | config X86_MCE_INJECT |
842 | depends on X86_NEW_MCE | 848 | depends on X86_MCE |
843 | tristate "Machine check injector support" | 849 | tristate "Machine check injector support" |
844 | ---help--- | 850 | ---help--- |
845 | Provide support for injecting machine checks for testing purposes. | 851 | Provide support for injecting machine checks for testing purposes. |
846 | If you don't know what a machine check is and you don't do kernel | 852 | If you don't know what a machine check is and you don't do kernel |
847 | QA it is safe to say n. | 853 | QA it is safe to say n. |
848 | 854 | ||
849 | config X86_MCE_NONFATAL | ||
850 | tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" | ||
851 | depends on X86_OLD_MCE | ||
852 | ---help--- | ||
853 | Enabling this feature starts a timer that triggers every 5 seconds which | ||
854 | will look at the machine check registers to see if anything happened. | ||
855 | Non-fatal problems automatically get corrected (but still logged). | ||
856 | Disable this if you don't want to see these messages. | ||
857 | Seeing the messages this option prints out may be indicative of dying | ||
858 | or out-of-spec (ie, overclocked) hardware. | ||
859 | This option only does something on certain CPUs. | ||
860 | (AMD Athlon/Duron and Intel Pentium 4) | ||
861 | |||
862 | config X86_MCE_P4THERMAL | ||
863 | bool "check for P4 thermal throttling interrupt." | ||
864 | depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) | ||
865 | ---help--- | ||
866 | Enabling this feature will cause a message to be printed when the P4 | ||
867 | enters thermal throttling. | ||
868 | |||
869 | config X86_THERMAL_VECTOR | 855 | config X86_THERMAL_VECTOR |
870 | def_bool y | 856 | def_bool y |
871 | depends on X86_MCE_P4THERMAL || X86_MCE_INTEL | 857 | depends on X86_MCE_INTEL |
872 | 858 | ||
873 | config VM86 | 859 | config VM86 |
874 | bool "Enable VM86 support" if EMBEDDED | 860 | bool "Enable VM86 support" if EMBEDDED |
@@ -1229,6 +1215,10 @@ config ARCH_DISCONTIGMEM_DEFAULT | |||
1229 | def_bool y | 1215 | def_bool y |
1230 | depends on NUMA && X86_32 | 1216 | depends on NUMA && X86_32 |
1231 | 1217 | ||
1218 | config ARCH_PROC_KCORE_TEXT | ||
1219 | def_bool y | ||
1220 | depends on X86_64 && PROC_KCORE | ||
1221 | |||
1232 | config ARCH_SPARSEMEM_DEFAULT | 1222 | config ARCH_SPARSEMEM_DEFAULT |
1233 | def_bool y | 1223 | def_bool y |
1234 | depends on X86_64 | 1224 | depends on X86_64 |
@@ -1414,6 +1404,10 @@ config X86_PAT | |||
1414 | 1404 | ||
1415 | If unsure, say Y. | 1405 | If unsure, say Y. |
1416 | 1406 | ||
1407 | config ARCH_USES_PG_UNCACHED | ||
1408 | def_bool y | ||
1409 | depends on X86_PAT | ||
1410 | |||
1417 | config EFI | 1411 | config EFI |
1418 | bool "EFI runtime service support" | 1412 | bool "EFI runtime service support" |
1419 | depends on ACPI | 1413 | depends on ACPI |
@@ -1683,6 +1677,8 @@ source "kernel/power/Kconfig" | |||
1683 | 1677 | ||
1684 | source "drivers/acpi/Kconfig" | 1678 | source "drivers/acpi/Kconfig" |
1685 | 1679 | ||
1680 | source "drivers/sfi/Kconfig" | ||
1681 | |||
1686 | config X86_APM_BOOT | 1682 | config X86_APM_BOOT |
1687 | bool | 1683 | bool |
1688 | default y | 1684 | default y |
@@ -1878,7 +1874,7 @@ config PCI_DIRECT | |||
1878 | 1874 | ||
1879 | config PCI_MMCONFIG | 1875 | config PCI_MMCONFIG |
1880 | def_bool y | 1876 | def_bool y |
1881 | depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) | 1877 | depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY) |
1882 | 1878 | ||
1883 | config PCI_OLPC | 1879 | config PCI_OLPC |
1884 | def_bool y | 1880 | def_bool y |
@@ -1916,7 +1912,7 @@ config DMAR_DEFAULT_ON | |||
1916 | config DMAR_BROKEN_GFX_WA | 1912 | config DMAR_BROKEN_GFX_WA |
1917 | def_bool n | 1913 | def_bool n |
1918 | prompt "Workaround broken graphics drivers (going away soon)" | 1914 | prompt "Workaround broken graphics drivers (going away soon)" |
1919 | depends on DMAR | 1915 | depends on DMAR && BROKEN |
1920 | ---help--- | 1916 | ---help--- |
1921 | Current Graphics drivers tend to use physical address | 1917 | Current Graphics drivers tend to use physical address |
1922 | for DMA and avoid using DMA APIs. Setting this config | 1918 | for DMA and avoid using DMA APIs. Setting this config |
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8130334329c..527519b8a9f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu | |||
@@ -262,6 +262,15 @@ config MCORE2 | |||
262 | family in /proc/cpuinfo. Newer ones have 6 and older ones 15 | 262 | family in /proc/cpuinfo. Newer ones have 6 and older ones 15 |
263 | (not a typo) | 263 | (not a typo) |
264 | 264 | ||
265 | config MATOM | ||
266 | bool "Intel Atom" | ||
267 | ---help--- | ||
268 | |||
269 | Select this for the Intel Atom platform. Intel Atom CPUs have an | ||
270 | in-order pipelining architecture and thus can benefit from | ||
271 | accordingly optimized code. Use a recent GCC with specific Atom | ||
272 | support in order to fully benefit from selecting this option. | ||
273 | |||
265 | config GENERIC_CPU | 274 | config GENERIC_CPU |
266 | bool "Generic-x86-64" | 275 | bool "Generic-x86-64" |
267 | depends on X86_64 | 276 | depends on X86_64 |
@@ -295,7 +304,7 @@ config X86_CPU | |||
295 | config X86_L1_CACHE_BYTES | 304 | config X86_L1_CACHE_BYTES |
296 | int | 305 | int |
297 | default "128" if MPSC | 306 | default "128" if MPSC |
298 | default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32 | 307 | default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 |
299 | 308 | ||
300 | config X86_INTERNODE_CACHE_BYTES | 309 | config X86_INTERNODE_CACHE_BYTES |
301 | int | 310 | int |
@@ -310,7 +319,7 @@ config X86_L1_CACHE_SHIFT | |||
310 | default "7" if MPENTIUM4 || MPSC | 319 | default "7" if MPENTIUM4 || MPSC |
311 | default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 | 320 | default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 |
312 | default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX | 321 | default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX |
313 | default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU | 322 | default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU |
314 | 323 | ||
315 | config X86_XADD | 324 | config X86_XADD |
316 | def_bool y | 325 | def_bool y |
@@ -359,7 +368,7 @@ config X86_INTEL_USERCOPY | |||
359 | 368 | ||
360 | config X86_USE_PPRO_CHECKSUM | 369 | config X86_USE_PPRO_CHECKSUM |
361 | def_bool y | 370 | def_bool y |
362 | depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 | 371 | depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM |
363 | 372 | ||
364 | config X86_USE_3DNOW | 373 | config X86_USE_3DNOW |
365 | def_bool y | 374 | def_bool y |
@@ -387,7 +396,7 @@ config X86_P6_NOP | |||
387 | 396 | ||
388 | config X86_TSC | 397 | config X86_TSC |
389 | def_bool y | 398 | def_bool y |
390 | depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 | 399 | depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 |
391 | 400 | ||
392 | config X86_CMPXCHG64 | 401 | config X86_CMPXCHG64 |
393 | def_bool y | 402 | def_bool y |
@@ -397,7 +406,7 @@ config X86_CMPXCHG64 | |||
397 | # generates cmov. | 406 | # generates cmov. |
398 | config X86_CMOV | 407 | config X86_CMOV |
399 | def_bool y | 408 | def_bool y |
400 | depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) | 409 | depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) |
401 | 410 | ||
402 | config X86_MINIMUM_CPU_FAMILY | 411 | config X86_MINIMUM_CPU_FAMILY |
403 | int | 412 | int |
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b..a012ee8ef80 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile | |||
@@ -32,8 +32,8 @@ ifeq ($(CONFIG_X86_32),y) | |||
32 | 32 | ||
33 | # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use | 33 | # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use |
34 | # a lot more stack due to the lack of sharing of stacklots: | 34 | # a lot more stack due to the lack of sharing of stacklots: |
35 | KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ | 35 | KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ |
36 | echo $(call cc-option,-fno-unit-at-a-time); fi ;) | 36 | $(call cc-option,-fno-unit-at-a-time)) |
37 | 37 | ||
38 | # CPU-specific tuning. Anything which can be shared with UML should go here. | 38 | # CPU-specific tuning. Anything which can be shared with UML should go here. |
39 | include $(srctree)/arch/x86/Makefile_32.cpu | 39 | include $(srctree)/arch/x86/Makefile_32.cpu |
@@ -55,6 +55,8 @@ else | |||
55 | 55 | ||
56 | cflags-$(CONFIG_MCORE2) += \ | 56 | cflags-$(CONFIG_MCORE2) += \ |
57 | $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) | 57 | $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) |
58 | cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ | ||
59 | $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) | ||
58 | cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) | 60 | cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) |
59 | KBUILD_CFLAGS += $(cflags-y) | 61 | KBUILD_CFLAGS += $(cflags-y) |
60 | 62 | ||
@@ -72,7 +74,7 @@ endif | |||
72 | 74 | ||
73 | ifdef CONFIG_CC_STACKPROTECTOR | 75 | ifdef CONFIG_CC_STACKPROTECTOR |
74 | cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh | 76 | cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh |
75 | ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y) | 77 | ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) |
76 | stackp-y := -fstack-protector | 78 | stackp-y := -fstack-protector |
77 | stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all | 79 | stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all |
78 | KBUILD_CFLAGS += $(stackp-y) | 80 | KBUILD_CFLAGS += $(stackp-y) |
@@ -177,8 +179,8 @@ archclean: | |||
177 | define archhelp | 179 | define archhelp |
178 | echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' | 180 | echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' |
179 | echo ' install - Install kernel using' | 181 | echo ' install - Install kernel using' |
180 | echo ' (your) ~/bin/installkernel or' | 182 | echo ' (your) ~/bin/$(INSTALLKERNEL) or' |
181 | echo ' (distribution) /sbin/installkernel or' | 183 | echo ' (distribution) /sbin/$(INSTALLKERNEL) or' |
182 | echo ' install to $$(INSTALL_PATH) and run lilo' | 184 | echo ' install to $$(INSTALL_PATH) and run lilo' |
183 | echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' | 185 | echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' |
184 | echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' | 186 | echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' |
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 80177ec052f..30e9a264f69 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu | |||
@@ -33,6 +33,8 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-f | |||
33 | cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) | 33 | cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) |
34 | cflags-$(CONFIG_MVIAC7) += -march=i686 | 34 | cflags-$(CONFIG_MVIAC7) += -march=i686 |
35 | cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) | 35 | cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) |
36 | cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ | ||
37 | $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) | ||
36 | 38 | ||
37 | # AMD Elan support | 39 | # AMD Elan support |
38 | cflags-$(CONFIG_X86_ELAN) += -march=i486 | 40 | cflags-$(CONFIG_X86_ELAN) += -march=i486 |
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e2ff504b4dd..f8ed0658404 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile | |||
@@ -4,7 +4,7 @@ | |||
4 | # create a compressed vmlinux image from the original vmlinux | 4 | # create a compressed vmlinux image from the original vmlinux |
5 | # | 5 | # |
6 | 6 | ||
7 | targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o | 7 | targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o |
8 | 8 | ||
9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 | 9 | KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 |
10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC | 10 | KBUILD_CFLAGS += -fno-strict-aliasing -fPIC |
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 75e4f001e70..f543b70ffae 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S | |||
@@ -23,13 +23,14 @@ | |||
23 | */ | 23 | */ |
24 | .text | 24 | .text |
25 | 25 | ||
26 | #include <linux/init.h> | ||
26 | #include <linux/linkage.h> | 27 | #include <linux/linkage.h> |
27 | #include <asm/segment.h> | 28 | #include <asm/segment.h> |
28 | #include <asm/page_types.h> | 29 | #include <asm/page_types.h> |
29 | #include <asm/boot.h> | 30 | #include <asm/boot.h> |
30 | #include <asm/asm-offsets.h> | 31 | #include <asm/asm-offsets.h> |
31 | 32 | ||
32 | .section ".text.head","ax",@progbits | 33 | __HEAD |
33 | ENTRY(startup_32) | 34 | ENTRY(startup_32) |
34 | cld | 35 | cld |
35 | /* | 36 | /* |
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index f62c284db9e..077e1b69198 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S | |||
@@ -24,6 +24,7 @@ | |||
24 | .code32 | 24 | .code32 |
25 | .text | 25 | .text |
26 | 26 | ||
27 | #include <linux/init.h> | ||
27 | #include <linux/linkage.h> | 28 | #include <linux/linkage.h> |
28 | #include <asm/segment.h> | 29 | #include <asm/segment.h> |
29 | #include <asm/pgtable_types.h> | 30 | #include <asm/pgtable_types.h> |
@@ -33,7 +34,7 @@ | |||
33 | #include <asm/processor-flags.h> | 34 | #include <asm/processor-flags.h> |
34 | #include <asm/asm-offsets.h> | 35 | #include <asm/asm-offsets.h> |
35 | 36 | ||
36 | .section ".text.head" | 37 | __HEAD |
37 | .code32 | 38 | .code32 |
38 | ENTRY(startup_32) | 39 | ENTRY(startup_32) |
39 | cld | 40 | cld |
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index cc353e1b3ff..f4193bb4878 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S | |||
@@ -1,3 +1,5 @@ | |||
1 | #include <asm-generic/vmlinux.lds.h> | ||
2 | |||
1 | OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) | 3 | OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) |
2 | 4 | ||
3 | #undef i386 | 5 | #undef i386 |
@@ -18,9 +20,9 @@ SECTIONS | |||
18 | * address 0. | 20 | * address 0. |
19 | */ | 21 | */ |
20 | . = 0; | 22 | . = 0; |
21 | .text.head : { | 23 | .head.text : { |
22 | _head = . ; | 24 | _head = . ; |
23 | *(.text.head) | 25 | HEAD_TEXT |
24 | _ehead = . ; | 26 | _ehead = . ; |
25 | } | 27 | } |
26 | .rodata.compressed : { | 28 | .rodata.compressed : { |
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh index 8d60ee15dfd..d13ec1c3864 100644 --- a/arch/x86/boot/install.sh +++ b/arch/x86/boot/install.sh | |||
@@ -33,8 +33,8 @@ verify "$3" | |||
33 | 33 | ||
34 | # User may have a custom install script | 34 | # User may have a custom install script |
35 | 35 | ||
36 | if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi | 36 | if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi |
37 | if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi | 37 | if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi |
38 | 38 | ||
39 | # Default install - same as make zlilo | 39 | # Default install - same as make zlilo |
40 | 40 | ||
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 275dd177f19..11e8c6eb80a 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c | |||
@@ -31,7 +31,6 @@ static inline void vesa_store_mode_params_graphics(void) {} | |||
31 | 31 | ||
32 | static int vesa_probe(void) | 32 | static int vesa_probe(void) |
33 | { | 33 | { |
34 | #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) | ||
35 | struct biosregs ireg, oreg; | 34 | struct biosregs ireg, oreg; |
36 | u16 mode; | 35 | u16 mode; |
37 | addr_t mode_ptr; | 36 | addr_t mode_ptr; |
@@ -49,8 +48,7 @@ static int vesa_probe(void) | |||
49 | vginfo.signature != VESA_MAGIC || | 48 | vginfo.signature != VESA_MAGIC || |
50 | vginfo.version < 0x0102) | 49 | vginfo.version < 0x0102) |
51 | return 0; /* Not present */ | 50 | return 0; /* Not present */ |
52 | #endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ | 51 | |
53 | #ifdef CONFIG_VIDEO_VESA | ||
54 | set_fs(vginfo.video_mode_ptr.seg); | 52 | set_fs(vginfo.video_mode_ptr.seg); |
55 | mode_ptr = vginfo.video_mode_ptr.off; | 53 | mode_ptr = vginfo.video_mode_ptr.off; |
56 | 54 | ||
@@ -102,9 +100,6 @@ static int vesa_probe(void) | |||
102 | } | 100 | } |
103 | 101 | ||
104 | return nmodes; | 102 | return nmodes; |
105 | #else | ||
106 | return 0; | ||
107 | #endif /* CONFIG_VIDEO_VESA */ | ||
108 | } | 103 | } |
109 | 104 | ||
110 | static int vesa_set_mode(struct mode_info *mode) | 105 | static int vesa_set_mode(struct mode_info *mode) |
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 8f8d827e254..819caa1f200 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c | |||
@@ -47,14 +47,6 @@ static u8 vga_set_basic_mode(void) | |||
47 | 47 | ||
48 | initregs(&ireg); | 48 | initregs(&ireg); |
49 | 49 | ||
50 | #ifdef CONFIG_VIDEO_400_HACK | ||
51 | if (adapter >= ADAPTER_VGA) { | ||
52 | ireg.ax = 0x1202; | ||
53 | ireg.bx = 0x0030; | ||
54 | intcall(0x10, &ireg, NULL); | ||
55 | } | ||
56 | #endif | ||
57 | |||
58 | ax = 0x0f00; | 50 | ax = 0x0f00; |
59 | intcall(0x10, &ireg, &oreg); | 51 | intcall(0x10, &ireg, &oreg); |
60 | mode = oreg.al; | 52 | mode = oreg.al; |
@@ -62,11 +54,9 @@ static u8 vga_set_basic_mode(void) | |||
62 | set_fs(0); | 54 | set_fs(0); |
63 | rows = rdfs8(0x484); /* rows minus one */ | 55 | rows = rdfs8(0x484); /* rows minus one */ |
64 | 56 | ||
65 | #ifndef CONFIG_VIDEO_400_HACK | ||
66 | if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && | 57 | if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && |
67 | (rows == 0 || rows == 24)) | 58 | (rows == 0 || rows == 24)) |
68 | return mode; | 59 | return mode; |
69 | #endif | ||
70 | 60 | ||
71 | if (mode != 3 && mode != 7) | 61 | if (mode != 3 && mode != 7) |
72 | mode = 3; | 62 | mode = 3; |
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index bad728b76fc..d42da380249 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c | |||
@@ -221,7 +221,6 @@ static unsigned int mode_menu(void) | |||
221 | } | 221 | } |
222 | } | 222 | } |
223 | 223 | ||
224 | #ifdef CONFIG_VIDEO_RETAIN | ||
225 | /* Save screen content to the heap */ | 224 | /* Save screen content to the heap */ |
226 | static struct saved_screen { | 225 | static struct saved_screen { |
227 | int x, y; | 226 | int x, y; |
@@ -299,10 +298,6 @@ static void restore_screen(void) | |||
299 | ireg.dl = saved.curx; | 298 | ireg.dl = saved.curx; |
300 | intcall(0x10, &ireg, NULL); | 299 | intcall(0x10, &ireg, NULL); |
301 | } | 300 | } |
302 | #else | ||
303 | #define save_screen() ((void)0) | ||
304 | #define restore_screen() ((void)0) | ||
305 | #endif | ||
306 | 301 | ||
307 | void set_video(void) | 302 | void set_video(void) |
308 | { | 303 | { |
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index 5bb174a997f..ff339c5db31 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h | |||
@@ -17,19 +17,8 @@ | |||
17 | 17 | ||
18 | #include <linux/types.h> | 18 | #include <linux/types.h> |
19 | 19 | ||
20 | /* Enable autodetection of SVGA adapters and modes. */ | 20 | /* |
21 | #undef CONFIG_VIDEO_SVGA | 21 | * This code uses an extended set of video mode numbers. These include: |
22 | |||
23 | /* Enable autodetection of VESA modes */ | ||
24 | #define CONFIG_VIDEO_VESA | ||
25 | |||
26 | /* Retain screen contents when switching modes */ | ||
27 | #define CONFIG_VIDEO_RETAIN | ||
28 | |||
29 | /* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ | ||
30 | #undef CONFIG_VIDEO_400_HACK | ||
31 | |||
32 | /* This code uses an extended set of video mode numbers. These include: | ||
33 | * Aliases for standard modes | 22 | * Aliases for standard modes |
34 | * NORMAL_VGA (-1) | 23 | * NORMAL_VGA (-1) |
35 | * EXTENDED_VGA (-2) | 24 | * EXTENDED_VGA (-2) |
@@ -67,13 +56,8 @@ | |||
67 | /* The "recalculate timings" flag */ | 56 | /* The "recalculate timings" flag */ |
68 | #define VIDEO_RECALC 0x8000 | 57 | #define VIDEO_RECALC 0x8000 |
69 | 58 | ||
70 | /* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ | ||
71 | #ifdef CONFIG_VIDEO_RETAIN | ||
72 | void store_screen(void); | 59 | void store_screen(void); |
73 | #define DO_STORE() store_screen() | 60 | #define DO_STORE() store_screen() |
74 | #else | ||
75 | #define DO_STORE() ((void)0) | ||
76 | #endif /* CONFIG_VIDEO_RETAIN */ | ||
77 | 61 | ||
78 | /* | 62 | /* |
79 | * Mode table structures | 63 | * Mode table structures |
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index edb992ebef9..d28fad19654 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig | |||
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y | |||
2355 | CONFIG_HAVE_DYNAMIC_FTRACE=y | 2355 | CONFIG_HAVE_DYNAMIC_FTRACE=y |
2356 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y | 2356 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y |
2357 | CONFIG_HAVE_HW_BRANCH_TRACER=y | 2357 | CONFIG_HAVE_HW_BRANCH_TRACER=y |
2358 | CONFIG_HAVE_FTRACE_SYSCALLS=y | 2358 | CONFIG_HAVE_SYSCALL_TRACEPOINTS=y |
2359 | CONFIG_RING_BUFFER=y | 2359 | CONFIG_RING_BUFFER=y |
2360 | CONFIG_TRACING=y | 2360 | CONFIG_TRACING=y |
2361 | CONFIG_TRACING_SUPPORT=y | 2361 | CONFIG_TRACING_SUPPORT=y |
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cee1dd2e69b..6c86acd847a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig | |||
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y | |||
2329 | CONFIG_HAVE_DYNAMIC_FTRACE=y | 2329 | CONFIG_HAVE_DYNAMIC_FTRACE=y |
2330 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y | 2330 | CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y |
2331 | CONFIG_HAVE_HW_BRANCH_TRACER=y | 2331 | CONFIG_HAVE_HW_BRANCH_TRACER=y |
2332 | CONFIG_HAVE_FTRACE_SYSCALLS=y | 2332 | CONFIG_HAVE_SYSCALL_TRACEPOINTS=y |
2333 | CONFIG_RING_BUFFER=y | 2333 | CONFIG_RING_BUFFER=y |
2334 | CONFIG_TRACING=y | 2334 | CONFIG_TRACING=y |
2335 | CONFIG_TRACING_SUPPORT=y | 2335 | CONFIG_TRACING_SUPPORT=y |
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index c580c5ec1ca..585edebe12c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c | |||
@@ -59,13 +59,6 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, | |||
59 | asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, | 59 | asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, |
60 | const u8 *in, unsigned int len, u8 *iv); | 60 | const u8 *in, unsigned int len, u8 *iv); |
61 | 61 | ||
62 | static inline int kernel_fpu_using(void) | ||
63 | { | ||
64 | if (in_interrupt() && !(read_cr0() & X86_CR0_TS)) | ||
65 | return 1; | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) | 62 | static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) |
70 | { | 63 | { |
71 | unsigned long addr = (unsigned long)raw_ctx; | 64 | unsigned long addr = (unsigned long)raw_ctx; |
@@ -89,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, | |||
89 | return -EINVAL; | 82 | return -EINVAL; |
90 | } | 83 | } |
91 | 84 | ||
92 | if (kernel_fpu_using()) | 85 | if (irq_fpu_usable()) |
93 | err = crypto_aes_expand_key(ctx, in_key, key_len); | 86 | err = crypto_aes_expand_key(ctx, in_key, key_len); |
94 | else { | 87 | else { |
95 | kernel_fpu_begin(); | 88 | kernel_fpu_begin(); |
@@ -110,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | |||
110 | { | 103 | { |
111 | struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); | 104 | struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); |
112 | 105 | ||
113 | if (kernel_fpu_using()) | 106 | if (irq_fpu_usable()) |
114 | crypto_aes_encrypt_x86(ctx, dst, src); | 107 | crypto_aes_encrypt_x86(ctx, dst, src); |
115 | else { | 108 | else { |
116 | kernel_fpu_begin(); | 109 | kernel_fpu_begin(); |
@@ -123,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | |||
123 | { | 116 | { |
124 | struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); | 117 | struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); |
125 | 118 | ||
126 | if (kernel_fpu_using()) | 119 | if (irq_fpu_usable()) |
127 | crypto_aes_decrypt_x86(ctx, dst, src); | 120 | crypto_aes_decrypt_x86(ctx, dst, src); |
128 | else { | 121 | else { |
129 | kernel_fpu_begin(); | 122 | kernel_fpu_begin(); |
@@ -349,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req) | |||
349 | struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); | 342 | struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); |
350 | struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); | 343 | struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); |
351 | 344 | ||
352 | if (kernel_fpu_using()) { | 345 | if (irq_fpu_usable()) { |
353 | struct ablkcipher_request *cryptd_req = | 346 | struct ablkcipher_request *cryptd_req = |
354 | ablkcipher_request_ctx(req); | 347 | ablkcipher_request_ctx(req); |
355 | memcpy(cryptd_req, req, sizeof(*req)); | 348 | memcpy(cryptd_req, req, sizeof(*req)); |
@@ -370,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req) | |||
370 | struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); | 363 | struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); |
371 | struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); | 364 | struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); |
372 | 365 | ||
373 | if (kernel_fpu_using()) { | 366 | if (irq_fpu_usable()) { |
374 | struct ablkcipher_request *cryptd_req = | 367 | struct ablkcipher_request *cryptd_req = |
375 | ablkcipher_request_ctx(req); | 368 | ablkcipher_request_ctx(req); |
376 | memcpy(cryptd_req, req, sizeof(*req)); | 369 | memcpy(cryptd_req, req, sizeof(*req)); |
@@ -636,7 +629,7 @@ static int __init aesni_init(void) | |||
636 | int err; | 629 | int err; |
637 | 630 | ||
638 | if (!cpu_has_aes) { | 631 | if (!cpu_has_aes) { |
639 | printk(KERN_ERR "Intel AES-NI instructions are not detected.\n"); | 632 | printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); |
640 | return -ENODEV; | 633 | return -ENODEV; |
641 | } | 634 | } |
642 | if ((err = crypto_register_alg(&aesni_alg))) | 635 | if ((err = crypto_register_alg(&aesni_alg))) |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e590261ba05..74619c4f9fd 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -537,7 +537,7 @@ ia32_sys_call_table: | |||
537 | .quad sys_mkdir | 537 | .quad sys_mkdir |
538 | .quad sys_rmdir /* 40 */ | 538 | .quad sys_rmdir /* 40 */ |
539 | .quad sys_dup | 539 | .quad sys_dup |
540 | .quad sys32_pipe | 540 | .quad sys_pipe |
541 | .quad compat_sys_times | 541 | .quad compat_sys_times |
542 | .quad quiet_ni_syscall /* old prof syscall holder */ | 542 | .quad quiet_ni_syscall /* old prof syscall holder */ |
543 | .quad sys_brk /* 45 */ | 543 | .quad sys_brk /* 45 */ |
@@ -831,5 +831,5 @@ ia32_sys_call_table: | |||
831 | .quad compat_sys_preadv | 831 | .quad compat_sys_preadv |
832 | .quad compat_sys_pwritev | 832 | .quad compat_sys_pwritev |
833 | .quad compat_sys_rt_tgsigqueueinfo /* 335 */ | 833 | .quad compat_sys_rt_tgsigqueueinfo /* 335 */ |
834 | .quad sys_perf_counter_open | 834 | .quad sys_perf_event_open |
835 | ia32_syscall_end: | 835 | ia32_syscall_end: |
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 085a8c35f14..9f552719882 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c | |||
@@ -189,20 +189,6 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len, | |||
189 | return sys_mprotect(start, len, prot); | 189 | return sys_mprotect(start, len, prot); |
190 | } | 190 | } |
191 | 191 | ||
192 | asmlinkage long sys32_pipe(int __user *fd) | ||
193 | { | ||
194 | int retval; | ||
195 | int fds[2]; | ||
196 | |||
197 | retval = do_pipe_flags(fds, 0); | ||
198 | if (retval) | ||
199 | goto out; | ||
200 | if (copy_to_user(fd, fds, sizeof(fds))) | ||
201 | retval = -EFAULT; | ||
202 | out: | ||
203 | return retval; | ||
204 | } | ||
205 | |||
206 | asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, | 192 | asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, |
207 | struct sigaction32 __user *oact, | 193 | struct sigaction32 __user *oact, |
208 | unsigned int sigsetsize) | 194 | unsigned int sigsetsize) |
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 20d1465a2ab..4518dc50090 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h | |||
@@ -144,7 +144,6 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) | |||
144 | 144 | ||
145 | #else /* !CONFIG_ACPI */ | 145 | #else /* !CONFIG_ACPI */ |
146 | 146 | ||
147 | #define acpi_disabled 1 | ||
148 | #define acpi_lapic 0 | 147 | #define acpi_lapic 0 |
149 | #define acpi_ioapic 0 | 148 | #define acpi_ioapic 0 |
150 | static inline void acpi_noirq_set(void) { } | 149 | static inline void acpi_noirq_set(void) { } |
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h | |||
@@ -22,10 +22,6 @@ | |||
22 | */ | 22 | */ |
23 | #define flush_agp_cache() wbinvd() | 23 | #define flush_agp_cache() wbinvd() |
24 | 24 | ||
25 | /* Convert a physical address to an address suitable for the GART. */ | ||
26 | #define phys_to_gart(x) (x) | ||
27 | #define gart_to_phys(x) (x) | ||
28 | |||
29 | /* GATT allocation. Returns/accepts GATT kernel virtual address. */ | 25 | /* GATT allocation. Returns/accepts GATT kernel virtual address. */ |
30 | #define alloc_gatt_pages(order) \ | 26 | #define alloc_gatt_pages(order) \ |
31 | ((char *)__get_free_pages(GFP_KERNEL, (order))) | 27 | ((char *)__get_free_pages(GFP_KERNEL, (order))) |
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1a37bcdc860..c240efc74e0 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h | |||
@@ -73,8 +73,6 @@ static inline void alternatives_smp_module_del(struct module *mod) {} | |||
73 | static inline void alternatives_smp_switch(int smp) {} | 73 | static inline void alternatives_smp_switch(int smp) {} |
74 | #endif /* CONFIG_SMP */ | 74 | #endif /* CONFIG_SMP */ |
75 | 75 | ||
76 | const unsigned char *const *find_nop_table(void); | ||
77 | |||
78 | /* alternative assembly primitive: */ | 76 | /* alternative assembly primitive: */ |
79 | #define ALTERNATIVE(oldinstr, newinstr, feature) \ | 77 | #define ALTERNATIVE(oldinstr, newinstr, feature) \ |
80 | \ | 78 | \ |
@@ -144,8 +142,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, | |||
144 | #define __parainstructions_end NULL | 142 | #define __parainstructions_end NULL |
145 | #endif | 143 | #endif |
146 | 144 | ||
147 | extern void add_nops(void *insns, unsigned int len); | ||
148 | |||
149 | /* | 145 | /* |
150 | * Clear and restore the kernel write-protection flag on the local CPU. | 146 | * Clear and restore the kernel write-protection flag on the local CPU. |
151 | * Allows the kernel to edit read-only pages. | 147 | * Allows the kernel to edit read-only pages. |
@@ -161,10 +157,7 @@ extern void add_nops(void *insns, unsigned int len); | |||
161 | * Intel's errata. | 157 | * Intel's errata. |
162 | * On the local CPU you need to be protected again NMI or MCE handlers seeing an | 158 | * On the local CPU you need to be protected again NMI or MCE handlers seeing an |
163 | * inconsistent instruction while you patch. | 159 | * inconsistent instruction while you patch. |
164 | * The _early version expects the memory to already be RW. | ||
165 | */ | 160 | */ |
166 | |||
167 | extern void *text_poke(void *addr, const void *opcode, size_t len); | 161 | extern void *text_poke(void *addr, const void *opcode, size_t len); |
168 | extern void *text_poke_early(void *addr, const void *opcode, size_t len); | ||
169 | 162 | ||
170 | #endif /* _ASM_X86_ALTERNATIVE_H */ | 163 | #endif /* _ASM_X86_ALTERNATIVE_H */ |
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index bdf96f119f0..ac95995b7ba 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h | |||
@@ -25,6 +25,7 @@ | |||
25 | #ifdef CONFIG_AMD_IOMMU | 25 | #ifdef CONFIG_AMD_IOMMU |
26 | extern int amd_iommu_init(void); | 26 | extern int amd_iommu_init(void); |
27 | extern int amd_iommu_init_dma_ops(void); | 27 | extern int amd_iommu_init_dma_ops(void); |
28 | extern int amd_iommu_init_passthrough(void); | ||
28 | extern void amd_iommu_detect(void); | 29 | extern void amd_iommu_detect(void); |
29 | extern irqreturn_t amd_iommu_int_handler(int irq, void *data); | 30 | extern irqreturn_t amd_iommu_int_handler(int irq, void *data); |
30 | extern void amd_iommu_flush_all_domains(void); | 31 | extern void amd_iommu_flush_all_domains(void); |
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 0c878caaa0a..2a2cc7a78a8 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h | |||
@@ -143,22 +143,29 @@ | |||
143 | #define EVT_BUFFER_SIZE 8192 /* 512 entries */ | 143 | #define EVT_BUFFER_SIZE 8192 /* 512 entries */ |
144 | #define EVT_LEN_MASK (0x9ULL << 56) | 144 | #define EVT_LEN_MASK (0x9ULL << 56) |
145 | 145 | ||
146 | #define PAGE_MODE_NONE 0x00 | ||
146 | #define PAGE_MODE_1_LEVEL 0x01 | 147 | #define PAGE_MODE_1_LEVEL 0x01 |
147 | #define PAGE_MODE_2_LEVEL 0x02 | 148 | #define PAGE_MODE_2_LEVEL 0x02 |
148 | #define PAGE_MODE_3_LEVEL 0x03 | 149 | #define PAGE_MODE_3_LEVEL 0x03 |
149 | 150 | #define PAGE_MODE_4_LEVEL 0x04 | |
150 | #define IOMMU_PDE_NL_0 0x000ULL | 151 | #define PAGE_MODE_5_LEVEL 0x05 |
151 | #define IOMMU_PDE_NL_1 0x200ULL | 152 | #define PAGE_MODE_6_LEVEL 0x06 |
152 | #define IOMMU_PDE_NL_2 0x400ULL | 153 | |
153 | #define IOMMU_PDE_NL_3 0x600ULL | 154 | #define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) |
154 | 155 | #define PM_LEVEL_SIZE(x) (((x) < 6) ? \ | |
155 | #define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) | 156 | ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ |
156 | #define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) | 157 | (0xffffffffffffffffULL)) |
157 | #define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) | 158 | #define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) |
158 | 159 | #define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) | |
159 | #define IOMMU_MAP_SIZE_L1 (1ULL << 21) | 160 | #define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ |
160 | #define IOMMU_MAP_SIZE_L2 (1ULL << 30) | 161 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) |
161 | #define IOMMU_MAP_SIZE_L3 (1ULL << 39) | 162 | #define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) |
163 | |||
164 | #define PM_MAP_4k 0 | ||
165 | #define PM_ADDR_MASK 0x000ffffffffff000ULL | ||
166 | #define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ | ||
167 | (~((1ULL << (12 + ((lvl) * 9))) - 1))) | ||
168 | #define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) | ||
162 | 169 | ||
163 | #define IOMMU_PTE_P (1ULL << 0) | 170 | #define IOMMU_PTE_P (1ULL << 0) |
164 | #define IOMMU_PTE_TV (1ULL << 1) | 171 | #define IOMMU_PTE_TV (1ULL << 1) |
@@ -167,11 +174,6 @@ | |||
167 | #define IOMMU_PTE_IR (1ULL << 61) | 174 | #define IOMMU_PTE_IR (1ULL << 61) |
168 | #define IOMMU_PTE_IW (1ULL << 62) | 175 | #define IOMMU_PTE_IW (1ULL << 62) |
169 | 176 | ||
170 | #define IOMMU_L1_PDE(address) \ | ||
171 | ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) | ||
172 | #define IOMMU_L2_PDE(address) \ | ||
173 | ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) | ||
174 | |||
175 | #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) | 177 | #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) |
176 | #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) | 178 | #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) |
177 | #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) | 179 | #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) |
@@ -194,11 +196,14 @@ | |||
194 | #define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ | 196 | #define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ |
195 | #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops | 197 | #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops |
196 | domain for an IOMMU */ | 198 | domain for an IOMMU */ |
199 | #define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page | ||
200 | translation */ | ||
201 | |||
197 | extern bool amd_iommu_dump; | 202 | extern bool amd_iommu_dump; |
198 | #define DUMP_printk(format, arg...) \ | 203 | #define DUMP_printk(format, arg...) \ |
199 | do { \ | 204 | do { \ |
200 | if (amd_iommu_dump) \ | 205 | if (amd_iommu_dump) \ |
201 | printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ | 206 | printk(KERN_INFO "AMD-Vi: " format, ## arg); \ |
202 | } while(0); | 207 | } while(0); |
203 | 208 | ||
204 | /* | 209 | /* |
@@ -226,6 +231,7 @@ struct protection_domain { | |||
226 | int mode; /* paging mode (0-6 levels) */ | 231 | int mode; /* paging mode (0-6 levels) */ |
227 | u64 *pt_root; /* page table root pointer */ | 232 | u64 *pt_root; /* page table root pointer */ |
228 | unsigned long flags; /* flags to find out type of domain */ | 233 | unsigned long flags; /* flags to find out type of domain */ |
234 | bool updated; /* complete domain flush required */ | ||
229 | unsigned dev_cnt; /* devices assigned to this domain */ | 235 | unsigned dev_cnt; /* devices assigned to this domain */ |
230 | void *priv; /* private data */ | 236 | void *priv; /* private data */ |
231 | }; | 237 | }; |
@@ -337,6 +343,9 @@ struct amd_iommu { | |||
337 | /* if one, we need to send a completion wait command */ | 343 | /* if one, we need to send a completion wait command */ |
338 | bool need_sync; | 344 | bool need_sync; |
339 | 345 | ||
346 | /* becomes true if a command buffer reset is running */ | ||
347 | bool reset_in_progress; | ||
348 | |||
340 | /* default dma_ops domain for that IOMMU */ | 349 | /* default dma_ops domain for that IOMMU */ |
341 | struct dma_ops_domain *default_dom; | 350 | struct dma_ops_domain *default_dom; |
342 | }; | 351 | }; |
@@ -457,4 +466,7 @@ static inline void amd_iommu_stats_init(void) { } | |||
457 | 466 | ||
458 | #endif /* CONFIG_AMD_IOMMU_STATS */ | 467 | #endif /* CONFIG_AMD_IOMMU_STATS */ |
459 | 468 | ||
469 | /* some function prototypes */ | ||
470 | extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); | ||
471 | |||
460 | #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ | 472 | #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ |
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d4792584..474d80d3e6c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h | |||
@@ -66,13 +66,23 @@ static inline void default_inquire_remote_apic(int apicid) | |||
66 | } | 66 | } |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * With 82489DX we can't rely on apic feature bit | ||
70 | * retrieved via cpuid but still have to deal with | ||
71 | * such an apic chip so we assume that SMP configuration | ||
72 | * is found from MP table (64bit case uses ACPI mostly | ||
73 | * which set smp presence flag as well so we are safe | ||
74 | * to use this helper too). | ||
75 | */ | ||
76 | static inline bool apic_from_smp_config(void) | ||
77 | { | ||
78 | return smp_found_config && !disable_apic; | ||
79 | } | ||
80 | |||
81 | /* | ||
69 | * Basic functions accessing APICs. | 82 | * Basic functions accessing APICs. |
70 | */ | 83 | */ |
71 | #ifdef CONFIG_PARAVIRT | 84 | #ifdef CONFIG_PARAVIRT |
72 | #include <asm/paravirt.h> | 85 | #include <asm/paravirt.h> |
73 | #else | ||
74 | #define setup_boot_clock setup_boot_APIC_clock | ||
75 | #define setup_secondary_clock setup_secondary_APIC_clock | ||
76 | #endif | 86 | #endif |
77 | 87 | ||
78 | #ifdef CONFIG_X86_64 | 88 | #ifdef CONFIG_X86_64 |
@@ -183,6 +193,10 @@ static inline int x2apic_enabled(void) | |||
183 | } | 193 | } |
184 | 194 | ||
185 | #define x2apic_supported() (cpu_has_x2apic) | 195 | #define x2apic_supported() (cpu_has_x2apic) |
196 | static inline void x2apic_force_phys(void) | ||
197 | { | ||
198 | x2apic_phys = 1; | ||
199 | } | ||
186 | #else | 200 | #else |
187 | static inline void check_x2apic(void) | 201 | static inline void check_x2apic(void) |
188 | { | 202 | { |
@@ -194,6 +208,9 @@ static inline int x2apic_enabled(void) | |||
194 | { | 208 | { |
195 | return 0; | 209 | return 0; |
196 | } | 210 | } |
211 | static inline void x2apic_force_phys(void) | ||
212 | { | ||
213 | } | ||
197 | 214 | ||
198 | #define x2apic_preenabled 0 | 215 | #define x2apic_preenabled 0 |
199 | #define x2apic_supported() 0 | 216 | #define x2apic_supported() 0 |
@@ -245,6 +262,8 @@ static inline void lapic_shutdown(void) { } | |||
245 | static inline void init_apic_mappings(void) { } | 262 | static inline void init_apic_mappings(void) { } |
246 | static inline void disable_local_APIC(void) { } | 263 | static inline void disable_local_APIC(void) { } |
247 | static inline void apic_disable(void) { } | 264 | static inline void apic_disable(void) { } |
265 | # define setup_boot_APIC_clock x86_init_noop | ||
266 | # define setup_secondary_APIC_clock x86_init_noop | ||
248 | #endif /* !CONFIG_X86_LOCAL_APIC */ | 267 | #endif /* !CONFIG_X86_LOCAL_APIC */ |
249 | 268 | ||
250 | #ifdef CONFIG_X86_64 | 269 | #ifdef CONFIG_X86_64 |
@@ -293,7 +312,7 @@ struct apic { | |||
293 | int (*cpu_present_to_apicid)(int mps_cpu); | 312 | int (*cpu_present_to_apicid)(int mps_cpu); |
294 | physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); | 313 | physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); |
295 | void (*setup_portio_remap)(void); | 314 | void (*setup_portio_remap)(void); |
296 | int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); | 315 | int (*check_phys_apicid_present)(int phys_apicid); |
297 | void (*enable_apic_mode)(void); | 316 | void (*enable_apic_mode)(void); |
298 | int (*phys_pkg_id)(int cpuid_apic, int index_msb); | 317 | int (*phys_pkg_id)(int cpuid_apic, int index_msb); |
299 | 318 | ||
@@ -427,7 +446,7 @@ extern struct apic apic_x2apic_uv_x; | |||
427 | DECLARE_PER_CPU(int, x2apic_extra_bits); | 446 | DECLARE_PER_CPU(int, x2apic_extra_bits); |
428 | 447 | ||
429 | extern int default_cpu_present_to_apicid(int mps_cpu); | 448 | extern int default_cpu_present_to_apicid(int mps_cpu); |
430 | extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); | 449 | extern int default_check_phys_apicid_present(int phys_apicid); |
431 | #endif | 450 | #endif |
432 | 451 | ||
433 | static inline void default_wait_for_init_deassert(atomic_t *deassert) | 452 | static inline void default_wait_for_init_deassert(atomic_t *deassert) |
@@ -543,9 +562,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu) | |||
543 | } | 562 | } |
544 | 563 | ||
545 | static inline int | 564 | static inline int |
546 | __default_check_phys_apicid_present(int boot_cpu_physical_apicid) | 565 | __default_check_phys_apicid_present(int phys_apicid) |
547 | { | 566 | { |
548 | return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); | 567 | return physid_isset(phys_apicid, phys_cpu_present_map); |
549 | } | 568 | } |
550 | 569 | ||
551 | #ifdef CONFIG_X86_32 | 570 | #ifdef CONFIG_X86_32 |
@@ -555,13 +574,13 @@ static inline int default_cpu_present_to_apicid(int mps_cpu) | |||
555 | } | 574 | } |
556 | 575 | ||
557 | static inline int | 576 | static inline int |
558 | default_check_phys_apicid_present(int boot_cpu_physical_apicid) | 577 | default_check_phys_apicid_present(int phys_apicid) |
559 | { | 578 | { |
560 | return __default_check_phys_apicid_present(boot_cpu_physical_apicid); | 579 | return __default_check_phys_apicid_present(phys_apicid); |
561 | } | 580 | } |
562 | #else | 581 | #else |
563 | extern int default_cpu_present_to_apicid(int mps_cpu); | 582 | extern int default_cpu_present_to_apicid(int mps_cpu); |
564 | extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); | 583 | extern int default_check_phys_apicid_present(int phys_apicid); |
565 | #endif | 584 | #endif |
566 | 585 | ||
567 | static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) | 586 | static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) |
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7ddb36ab933..3b62da926de 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h | |||
@@ -8,12 +8,14 @@ | |||
8 | * Ingo Molnar <mingo@redhat.com>, 1999, 2000 | 8 | * Ingo Molnar <mingo@redhat.com>, 1999, 2000 |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #define APIC_DEFAULT_PHYS_BASE 0xfee00000 | 11 | #define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 |
12 | #define APIC_DEFAULT_PHYS_BASE 0xfee00000 | ||
12 | 13 | ||
13 | #define APIC_ID 0x20 | 14 | #define APIC_ID 0x20 |
14 | 15 | ||
15 | #define APIC_LVR 0x30 | 16 | #define APIC_LVR 0x30 |
16 | #define APIC_LVR_MASK 0xFF00FF | 17 | #define APIC_LVR_MASK 0xFF00FF |
18 | #define APIC_LVR_DIRECTED_EOI (1 << 24) | ||
17 | #define GET_APIC_VERSION(x) ((x) & 0xFFu) | 19 | #define GET_APIC_VERSION(x) ((x) & 0xFFu) |
18 | #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) | 20 | #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) |
19 | #ifdef CONFIG_X86_32 | 21 | #ifdef CONFIG_X86_32 |
@@ -40,6 +42,7 @@ | |||
40 | #define APIC_DFR_CLUSTER 0x0FFFFFFFul | 42 | #define APIC_DFR_CLUSTER 0x0FFFFFFFul |
41 | #define APIC_DFR_FLAT 0xFFFFFFFFul | 43 | #define APIC_DFR_FLAT 0xFFFFFFFFul |
42 | #define APIC_SPIV 0xF0 | 44 | #define APIC_SPIV 0xF0 |
45 | #define APIC_SPIV_DIRECTED_EOI (1 << 12) | ||
43 | #define APIC_SPIV_FOCUS_DISABLED (1 << 9) | 46 | #define APIC_SPIV_FOCUS_DISABLED (1 << 9) |
44 | #define APIC_SPIV_APIC_ENABLED (1 << 8) | 47 | #define APIC_SPIV_APIC_ENABLED (1 << 8) |
45 | #define APIC_ISR 0x100 | 48 | #define APIC_ISR 0x100 |
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 56be78f582f..b3ed1e1460f 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | #ifdef __ASSEMBLY__ | 4 | #ifdef __ASSEMBLY__ |
5 | # define __ASM_FORM(x) x | 5 | # define __ASM_FORM(x) x |
6 | # define __ASM_EX_SEC .section __ex_table | 6 | # define __ASM_EX_SEC .section __ex_table, "a" |
7 | #else | 7 | #else |
8 | # define __ASM_FORM(x) " " #x " " | 8 | # define __ASM_FORM(x) " " #x " " |
9 | # define __ASM_EX_SEC " .section __ex_table,\"a\"\n" | 9 | # define __ASM_EX_SEC " .section __ex_table,\"a\"\n" |
@@ -38,10 +38,18 @@ | |||
38 | #define _ASM_DI __ASM_REG(di) | 38 | #define _ASM_DI __ASM_REG(di) |
39 | 39 | ||
40 | /* Exception table entry */ | 40 | /* Exception table entry */ |
41 | #ifdef __ASSEMBLY__ | ||
42 | # define _ASM_EXTABLE(from,to) \ | ||
43 | __ASM_EX_SEC ; \ | ||
44 | _ASM_ALIGN ; \ | ||
45 | _ASM_PTR from , to ; \ | ||
46 | .previous | ||
47 | #else | ||
41 | # define _ASM_EXTABLE(from,to) \ | 48 | # define _ASM_EXTABLE(from,to) \ |
42 | __ASM_EX_SEC \ | 49 | __ASM_EX_SEC \ |
43 | _ASM_ALIGN "\n" \ | 50 | _ASM_ALIGN "\n" \ |
44 | _ASM_PTR #from "," #to "\n" \ | 51 | _ASM_PTR #from "," #to "\n" \ |
45 | " .previous\n" | 52 | " .previous\n" |
53 | #endif | ||
46 | 54 | ||
47 | #endif /* _ASM_X86_ASM_H */ | 55 | #endif /* _ASM_X86_ASM_H */ |
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..6be33d83c71 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h | |||
@@ -85,7 +85,8 @@ struct efi_info { | |||
85 | struct boot_params { | 85 | struct boot_params { |
86 | struct screen_info screen_info; /* 0x000 */ | 86 | struct screen_info screen_info; /* 0x000 */ |
87 | struct apm_bios_info apm_bios_info; /* 0x040 */ | 87 | struct apm_bios_info apm_bios_info; /* 0x040 */ |
88 | __u8 _pad2[12]; /* 0x054 */ | 88 | __u8 _pad2[4]; /* 0x054 */ |
89 | __u64 tboot_addr; /* 0x058 */ | ||
89 | struct ist_info ist_info; /* 0x060 */ | 90 | struct ist_info ist_info; /* 0x060 */ |
90 | __u8 _pad3[16]; /* 0x070 */ | 91 | __u8 _pad3[16]; /* 0x070 */ |
91 | __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ | 92 | __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ |
@@ -109,4 +110,14 @@ struct boot_params { | |||
109 | __u8 _pad9[276]; /* 0xeec */ | 110 | __u8 _pad9[276]; /* 0xeec */ |
110 | } __attribute__((packed)); | 111 | } __attribute__((packed)); |
111 | 112 | ||
113 | enum { | ||
114 | X86_SUBARCH_PC = 0, | ||
115 | X86_SUBARCH_LGUEST, | ||
116 | X86_SUBARCH_XEN, | ||
117 | X86_SUBARCH_MRST, | ||
118 | X86_NR_SUBARCHS, | ||
119 | }; | ||
120 | |||
121 | |||
122 | |||
112 | #endif /* _ASM_X86_BOOTPARAM_H */ | 123 | #endif /* _ASM_X86_BOOTPARAM_H */ |
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 5d367caa0e3..549860d3be8 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _ASM_X86_CACHE_H | 1 | #ifndef _ASM_X86_CACHE_H |
2 | #define _ASM_X86_CACHE_H | 2 | #define _ASM_X86_CACHE_H |
3 | 3 | ||
4 | #include <linux/linkage.h> | ||
5 | |||
4 | /* L1 cache line size */ | 6 | /* L1 cache line size */ |
5 | #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) | 7 | #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) |
6 | #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) | 8 | #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) |
@@ -13,7 +15,7 @@ | |||
13 | #ifdef CONFIG_SMP | 15 | #ifdef CONFIG_SMP |
14 | #define __cacheline_aligned_in_smp \ | 16 | #define __cacheline_aligned_in_smp \ |
15 | __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ | 17 | __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ |
16 | __attribute__((__section__(".data.page_aligned"))) | 18 | __page_aligned_data |
17 | #endif | 19 | #endif |
18 | #endif | 20 | #endif |
19 | 21 | ||
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e55dfc1ad45..b54f6afe7ec 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h | |||
@@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma, | |||
43 | memcpy(dst, src, len); | 43 | memcpy(dst, src, len); |
44 | } | 44 | } |
45 | 45 | ||
46 | #define PG_non_WB PG_arch_1 | 46 | #define PG_WC PG_arch_1 |
47 | PAGEFLAG(NonWB, non_WB) | 47 | PAGEFLAG(WC, WC) |
48 | |||
49 | #ifdef CONFIG_X86_PAT | ||
50 | /* | ||
51 | * X86 PAT uses page flags WC and Uncached together to keep track of | ||
52 | * memory type of pages that have backing page struct. X86 PAT supports 3 | ||
53 | * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and | ||
54 | * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not | ||
55 | * been changed from its default (value of -1 used to denote this). | ||
56 | * Note we do not support _PAGE_CACHE_UC here. | ||
57 | * | ||
58 | * Caller must hold memtype_lock for atomicity. | ||
59 | */ | ||
60 | static inline unsigned long get_page_memtype(struct page *pg) | ||
61 | { | ||
62 | if (!PageUncached(pg) && !PageWC(pg)) | ||
63 | return -1; | ||
64 | else if (!PageUncached(pg) && PageWC(pg)) | ||
65 | return _PAGE_CACHE_WC; | ||
66 | else if (PageUncached(pg) && !PageWC(pg)) | ||
67 | return _PAGE_CACHE_UC_MINUS; | ||
68 | else | ||
69 | return _PAGE_CACHE_WB; | ||
70 | } | ||
71 | |||
72 | static inline void set_page_memtype(struct page *pg, unsigned long memtype) | ||
73 | { | ||
74 | switch (memtype) { | ||
75 | case _PAGE_CACHE_WC: | ||
76 | ClearPageUncached(pg); | ||
77 | SetPageWC(pg); | ||
78 | break; | ||
79 | case _PAGE_CACHE_UC_MINUS: | ||
80 | SetPageUncached(pg); | ||
81 | ClearPageWC(pg); | ||
82 | break; | ||
83 | case _PAGE_CACHE_WB: | ||
84 | SetPageUncached(pg); | ||
85 | SetPageWC(pg); | ||
86 | break; | ||
87 | default: | ||
88 | case -1: | ||
89 | ClearPageUncached(pg); | ||
90 | ClearPageWC(pg); | ||
91 | break; | ||
92 | } | ||
93 | } | ||
94 | #else | ||
95 | static inline unsigned long get_page_memtype(struct page *pg) { return -1; } | ||
96 | static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } | ||
97 | #endif | ||
48 | 98 | ||
49 | /* | 99 | /* |
50 | * The set_memory_* API can be used to change various attributes of a virtual | 100 | * The set_memory_* API can be used to change various attributes of a virtual |
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4a28d22d479..9cfc88b9774 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h | |||
@@ -95,6 +95,8 @@ | |||
95 | #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ | 95 | #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ |
96 | #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ | 96 | #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ |
97 | #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ | 97 | #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ |
98 | #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ | ||
99 | #define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ | ||
98 | 100 | ||
99 | /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ | 101 | /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ |
100 | #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ | 102 | #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ |
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index c68c361697e..4d447b732d8 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h | |||
@@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); | |||
11 | 11 | ||
12 | static __always_inline struct task_struct *get_current(void) | 12 | static __always_inline struct task_struct *get_current(void) |
13 | { | 13 | { |
14 | return percpu_read(current_task); | 14 | return percpu_read_stable(current_task); |
15 | } | 15 | } |
16 | 16 | ||
17 | #define current get_current() | 17 | #define current get_current() |
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c993e9e0fed..e8de2f6f5ca 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h | |||
@@ -291,11 +291,24 @@ static inline unsigned long get_desc_base(const struct desc_struct *desc) | |||
291 | return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); | 291 | return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); |
292 | } | 292 | } |
293 | 293 | ||
294 | static inline void set_desc_base(struct desc_struct *desc, unsigned long base) | ||
295 | { | ||
296 | desc->base0 = base & 0xffff; | ||
297 | desc->base1 = (base >> 16) & 0xff; | ||
298 | desc->base2 = (base >> 24) & 0xff; | ||
299 | } | ||
300 | |||
294 | static inline unsigned long get_desc_limit(const struct desc_struct *desc) | 301 | static inline unsigned long get_desc_limit(const struct desc_struct *desc) |
295 | { | 302 | { |
296 | return desc->limit0 | (desc->limit << 16); | 303 | return desc->limit0 | (desc->limit << 16); |
297 | } | 304 | } |
298 | 305 | ||
306 | static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) | ||
307 | { | ||
308 | desc->limit0 = limit & 0xffff; | ||
309 | desc->limit = (limit >> 16) & 0xf; | ||
310 | } | ||
311 | |||
299 | static inline void _set_gate(int gate, unsigned type, void *addr, | 312 | static inline void _set_gate(int gate, unsigned type, void *addr, |
300 | unsigned dpl, unsigned ist, unsigned seg) | 313 | unsigned dpl, unsigned ist, unsigned seg) |
301 | { | 314 | { |
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a6adefa28b9..9d6684849fd 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h | |||
@@ -34,6 +34,12 @@ struct desc_struct { | |||
34 | }; | 34 | }; |
35 | } __attribute__((packed)); | 35 | } __attribute__((packed)); |
36 | 36 | ||
37 | #define GDT_ENTRY_INIT(flags, base, limit) { { { \ | ||
38 | .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ | ||
39 | .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ | ||
40 | ((limit) & 0xf0000) | ((base) & 0xff000000), \ | ||
41 | } } } | ||
42 | |||
37 | enum { | 43 | enum { |
38 | GATE_INTERRUPT = 0xE, | 44 | GATE_INTERRUPT = 0xE, |
39 | GATE_TRAP = 0xF, | 45 | GATE_TRAP = 0xF, |
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 4994a20acbc..cee34e9ca45 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h | |||
@@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops; | |||
13 | #endif | 13 | #endif |
14 | }; | 14 | }; |
15 | 15 | ||
16 | struct pdev_archdata { | ||
17 | }; | ||
18 | |||
16 | #endif /* _ASM_X86_DEVICE_H */ | 19 | #endif /* _ASM_X86_DEVICE_H */ |
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1c3f9435f1c..0ee770d23d0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h | |||
@@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask); | |||
55 | extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, | 55 | extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, |
56 | dma_addr_t *dma_addr, gfp_t flag); | 56 | dma_addr_t *dma_addr, gfp_t flag); |
57 | 57 | ||
58 | static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) | ||
59 | { | ||
60 | if (!dev->dma_mask) | ||
61 | return 0; | ||
62 | |||
63 | return addr + size <= *dev->dma_mask; | ||
64 | } | ||
65 | |||
66 | static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) | ||
67 | { | ||
68 | return paddr; | ||
69 | } | ||
70 | |||
71 | static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) | ||
72 | { | ||
73 | return daddr; | ||
74 | } | ||
75 | |||
58 | static inline void | 76 | static inline void |
59 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, | 77 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, |
60 | enum dma_data_direction dir) | 78 | enum dma_data_direction dir) |
diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h deleted file mode 100644 index 23ecda0b28a..00000000000 --- a/arch/x86/include/asm/do_timer.h +++ /dev/null | |||
@@ -1,16 +0,0 @@ | |||
1 | /* defines for inline arch setup functions */ | ||
2 | #include <linux/clockchips.h> | ||
3 | |||
4 | #include <asm/i8259.h> | ||
5 | #include <asm/i8253.h> | ||
6 | |||
7 | /** | ||
8 | * do_timer_interrupt_hook - hook into timer tick | ||
9 | * | ||
10 | * Call the pit clock event handler. see asm/i8253.h | ||
11 | **/ | ||
12 | |||
13 | static inline void do_timer_interrupt_hook(void) | ||
14 | { | ||
15 | global_clock_event->event_handler(global_clock_event); | ||
16 | } | ||
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 3afc5e87cfd..ae6253ab902 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h | |||
@@ -87,9 +87,25 @@ | |||
87 | CFI_RESTORE \reg | 87 | CFI_RESTORE \reg |
88 | .endm | 88 | .endm |
89 | #else /*!CONFIG_X86_64*/ | 89 | #else /*!CONFIG_X86_64*/ |
90 | .macro pushl_cfi reg | ||
91 | pushl \reg | ||
92 | CFI_ADJUST_CFA_OFFSET 4 | ||
93 | .endm | ||
90 | 94 | ||
91 | /* 32bit defenitions are missed yet */ | 95 | .macro popl_cfi reg |
96 | popl \reg | ||
97 | CFI_ADJUST_CFA_OFFSET -4 | ||
98 | .endm | ||
92 | 99 | ||
100 | .macro movl_cfi reg offset=0 | ||
101 | movl %\reg, \offset(%esp) | ||
102 | CFI_REL_OFFSET \reg, \offset | ||
103 | .endm | ||
104 | |||
105 | .macro movl_cfi_restore offset reg | ||
106 | movl \offset(%esp), %\reg | ||
107 | CFI_RESTORE \reg | ||
108 | .endm | ||
93 | #endif /*!CONFIG_X86_64*/ | 109 | #endif /*!CONFIG_X86_64*/ |
94 | #endif /*__ASSEMBLY__*/ | 110 | #endif /*__ASSEMBLY__*/ |
95 | 111 | ||
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7ecba4d8508..40b4e614fe7 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h | |||
@@ -126,8 +126,6 @@ extern void e820_reserve_resources(void); | |||
126 | extern void e820_reserve_resources_late(void); | 126 | extern void e820_reserve_resources_late(void); |
127 | extern void setup_memory_map(void); | 127 | extern void setup_memory_map(void); |
128 | extern char *default_machine_specific_memory_setup(void); | 128 | extern char *default_machine_specific_memory_setup(void); |
129 | extern char *machine_specific_memory_setup(void); | ||
130 | extern char *memory_setup(void); | ||
131 | #endif /* __KERNEL__ */ | 129 | #endif /* __KERNEL__ */ |
132 | #endif /* __ASSEMBLY__ */ | 130 | #endif /* __ASSEMBLY__ */ |
133 | 131 | ||
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 83c1bc8d2e8..456a304b817 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h | |||
@@ -299,6 +299,8 @@ do { \ | |||
299 | 299 | ||
300 | #ifdef CONFIG_X86_32 | 300 | #ifdef CONFIG_X86_32 |
301 | 301 | ||
302 | #define STACK_RND_MASK (0x7ff) | ||
303 | |||
302 | #define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) | 304 | #define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO)) |
303 | 305 | ||
304 | #define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) | 306 | #define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) |
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa0785..f5693c81a1d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h | |||
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) | |||
49 | BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) | 49 | BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) |
50 | BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) | 50 | BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) |
51 | 51 | ||
52 | #ifdef CONFIG_PERF_COUNTERS | 52 | #ifdef CONFIG_PERF_EVENTS |
53 | BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) | 53 | BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) |
54 | #endif | 54 | #endif |
55 | 55 | ||
@@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) | |||
61 | BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) | 61 | BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) |
62 | #endif | 62 | #endif |
63 | 63 | ||
64 | #ifdef CONFIG_X86_NEW_MCE | 64 | #ifdef CONFIG_X86_MCE |
65 | BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) | 65 | BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) |
66 | #endif | 66 | #endif |
67 | 67 | ||
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a..14f9890eb49 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h | |||
@@ -132,6 +132,9 @@ enum fixed_addresses { | |||
132 | #ifdef CONFIG_X86_32 | 132 | #ifdef CONFIG_X86_32 |
133 | FIX_WP_TEST, | 133 | FIX_WP_TEST, |
134 | #endif | 134 | #endif |
135 | #ifdef CONFIG_INTEL_TXT | ||
136 | FIX_TBOOT_BASE, | ||
137 | #endif | ||
135 | __end_of_fixed_addresses | 138 | __end_of_fixed_addresses |
136 | }; | 139 | }; |
137 | 140 | ||
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index bd2c6511c88..db24c2278be 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h | |||
@@ -28,13 +28,6 @@ | |||
28 | 28 | ||
29 | #endif | 29 | #endif |
30 | 30 | ||
31 | /* FIXME: I don't want to stay hardcoded */ | ||
32 | #ifdef CONFIG_X86_64 | ||
33 | # define FTRACE_SYSCALL_MAX 296 | ||
34 | #else | ||
35 | # define FTRACE_SYSCALL_MAX 333 | ||
36 | #endif | ||
37 | |||
38 | #ifdef CONFIG_FUNCTION_TRACER | 31 | #ifdef CONFIG_FUNCTION_TRACER |
39 | #define MCOUNT_ADDR ((long)(mcount)) | 32 | #define MCOUNT_ADDR ((long)(mcount)) |
40 | #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ | 33 | #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ |
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 369f5c5d09a..b78c0941e42 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h | |||
@@ -20,7 +20,7 @@ | |||
20 | #ifndef ASM_X86__HYPERVISOR_H | 20 | #ifndef ASM_X86__HYPERVISOR_H |
21 | #define ASM_X86__HYPERVISOR_H | 21 | #define ASM_X86__HYPERVISOR_H |
22 | 22 | ||
23 | extern unsigned long get_hypervisor_tsc_freq(void); | ||
24 | extern void init_hypervisor(struct cpuinfo_x86 *c); | 23 | extern void init_hypervisor(struct cpuinfo_x86 *c); |
24 | extern void init_hypervisor_platform(void); | ||
25 | 25 | ||
26 | #endif | 26 | #endif |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 175adf58dd4..0b20bbb758f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
@@ -26,6 +26,7 @@ extern void fpu_init(void); | |||
26 | extern void mxcsr_feature_mask_init(void); | 26 | extern void mxcsr_feature_mask_init(void); |
27 | extern int init_fpu(struct task_struct *child); | 27 | extern int init_fpu(struct task_struct *child); |
28 | extern asmlinkage void math_state_restore(void); | 28 | extern asmlinkage void math_state_restore(void); |
29 | extern void __math_state_restore(void); | ||
29 | extern void init_thread_xstate(void); | 30 | extern void init_thread_xstate(void); |
30 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); | 31 | extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); |
31 | 32 | ||
@@ -301,6 +302,14 @@ static inline void kernel_fpu_end(void) | |||
301 | preempt_enable(); | 302 | preempt_enable(); |
302 | } | 303 | } |
303 | 304 | ||
305 | static inline bool irq_fpu_usable(void) | ||
306 | { | ||
307 | struct pt_regs *regs; | ||
308 | |||
309 | return !in_interrupt() || !(regs = get_irq_regs()) || \ | ||
310 | user_mode(regs) || (read_cr0() & X86_CR0_TS); | ||
311 | } | ||
312 | |||
304 | /* | 313 | /* |
305 | * Some instructions like VIA's padlock instructions generate a spurious | 314 | * Some instructions like VIA's padlock instructions generate a spurious |
306 | * DNA fault but don't modify SSE registers. And these instructions | 315 | * DNA fault but don't modify SSE registers. And these instructions |
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 330ee807f89..7c7c16cde1f 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h | |||
@@ -143,6 +143,8 @@ extern int noioapicreroute; | |||
143 | /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ | 143 | /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ |
144 | extern int timer_through_8259; | 144 | extern int timer_through_8259; |
145 | 145 | ||
146 | extern void io_apic_disable_legacy(void); | ||
147 | |||
146 | /* | 148 | /* |
147 | * If we use the IO-APIC for IRQ routing, disable automatic | 149 | * If we use the IO-APIC for IRQ routing, disable automatic |
148 | * assignment of PCI IRQ's. | 150 | * assignment of PCI IRQ's. |
@@ -150,11 +152,10 @@ extern int timer_through_8259; | |||
150 | #define io_apic_assign_pci_irqs \ | 152 | #define io_apic_assign_pci_irqs \ |
151 | (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) | 153 | (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) |
152 | 154 | ||
153 | #ifdef CONFIG_ACPI | 155 | extern u8 io_apic_unique_id(u8 id); |
154 | extern int io_apic_get_unique_id(int ioapic, int apic_id); | 156 | extern int io_apic_get_unique_id(int ioapic, int apic_id); |
155 | extern int io_apic_get_version(int ioapic); | 157 | extern int io_apic_get_version(int ioapic); |
156 | extern int io_apic_get_redir_entries(int ioapic); | 158 | extern int io_apic_get_redir_entries(int ioapic); |
157 | #endif /* CONFIG_ACPI */ | ||
158 | 159 | ||
159 | struct io_apic_irq_attr; | 160 | struct io_apic_irq_attr; |
160 | extern int io_apic_set_pci_routing(struct device *dev, int irq, | 161 | extern int io_apic_set_pci_routing(struct device *dev, int irq, |
@@ -177,13 +178,26 @@ extern int setup_ioapic_entry(int apic, int irq, | |||
177 | int polarity, int vector, int pin); | 178 | int polarity, int vector, int pin); |
178 | extern void ioapic_write_entry(int apic, int pin, | 179 | extern void ioapic_write_entry(int apic, int pin, |
179 | struct IO_APIC_route_entry e); | 180 | struct IO_APIC_route_entry e); |
181 | extern void setup_ioapic_ids_from_mpc(void); | ||
182 | |||
183 | struct mp_ioapic_gsi{ | ||
184 | int gsi_base; | ||
185 | int gsi_end; | ||
186 | }; | ||
187 | extern struct mp_ioapic_gsi mp_gsi_routing[]; | ||
188 | int mp_find_ioapic(int gsi); | ||
189 | int mp_find_ioapic_pin(int ioapic, int gsi); | ||
190 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); | ||
191 | |||
180 | #else /* !CONFIG_X86_IO_APIC */ | 192 | #else /* !CONFIG_X86_IO_APIC */ |
193 | |||
181 | #define io_apic_assign_pci_irqs 0 | 194 | #define io_apic_assign_pci_irqs 0 |
195 | #define setup_ioapic_ids_from_mpc x86_init_noop | ||
182 | static const int timer_through_8259 = 0; | 196 | static const int timer_through_8259 = 0; |
183 | static inline void ioapic_init_mappings(void) { } | 197 | static inline void ioapic_init_mappings(void) { } |
184 | static inline void ioapic_insert_resources(void) { } | 198 | static inline void ioapic_insert_resources(void) { } |
185 | |||
186 | static inline void probe_nr_irqs_gsi(void) { } | 199 | static inline void probe_nr_irqs_gsi(void) { } |
200 | |||
187 | #endif | 201 | #endif |
188 | 202 | ||
189 | #endif /* _ASM_X86_IO_APIC_H */ | 203 | #endif /* _ASM_X86_IO_APIC_H */ |
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index 0d5b23b7b06..ec34c760665 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h | |||
@@ -1,94 +1 @@ | |||
1 | #ifndef _ASM_X86_IOCTLS_H | #include <asm-generic/ioctls.h> | |
2 | #define _ASM_X86_IOCTLS_H | ||
3 | |||
4 | #include <asm/ioctl.h> | ||
5 | |||
6 | /* 0x54 is just a magic number to make these relatively unique ('T') */ | ||
7 | |||
8 | #define TCGETS 0x5401 | ||
9 | #define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ | ||
10 | #define TCSETSW 0x5403 | ||
11 | #define TCSETSF 0x5404 | ||
12 | #define TCGETA 0x5405 | ||
13 | #define TCSETA 0x5406 | ||
14 | #define TCSETAW 0x5407 | ||
15 | #define TCSETAF 0x5408 | ||
16 | #define TCSBRK 0x5409 | ||
17 | #define TCXONC 0x540A | ||
18 | #define TCFLSH 0x540B | ||
19 | #define TIOCEXCL 0x540C | ||
20 | #define TIOCNXCL 0x540D | ||
21 | #define TIOCSCTTY 0x540E | ||
22 | #define TIOCGPGRP 0x540F | ||
23 | #define TIOCSPGRP 0x5410 | ||
24 | #define TIOCOUTQ 0x5411 | ||
25 | #define TIOCSTI 0x5412 | ||
26 | #define TIOCGWINSZ 0x5413 | ||
27 | #define TIOCSWINSZ 0x5414 | ||
28 | #define TIOCMGET 0x5415 | ||
29 | #define TIOCMBIS 0x5416 | ||
30 | #define TIOCMBIC 0x5417 | ||
31 | #define TIOCMSET 0x5418 | ||
32 | #define TIOCGSOFTCAR 0x5419 | ||
33 | #define TIOCSSOFTCAR 0x541A | ||
34 | #define FIONREAD 0x541B | ||
35 | #define TIOCINQ FIONREAD | ||
36 | #define TIOCLINUX 0x541C | ||
37 | #define TIOCCONS 0x541D | ||
38 | #define TIOCGSERIAL 0x541E | ||
39 | #define TIOCSSERIAL 0x541F | ||
40 | #define TIOCPKT 0x5420 | ||
41 | #define FIONBIO 0x5421 | ||
42 | #define TIOCNOTTY 0x5422 | ||
43 | #define TIOCSETD 0x5423 | ||
44 | #define TIOCGETD 0x5424 | ||
45 | #define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ | ||
46 | /* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ | ||
47 | #define TIOCSBRK 0x5427 /* BSD compatibility */ | ||
48 | #define TIOCCBRK 0x5428 /* BSD compatibility */ | ||
49 | #define TIOCGSID 0x5429 /* Return the session ID of FD */ | ||
50 | #define TCGETS2 _IOR('T', 0x2A, struct termios2) | ||
51 | #define TCSETS2 _IOW('T', 0x2B, struct termios2) | ||
52 | #define TCSETSW2 _IOW('T', 0x2C, struct termios2) | ||
53 | #define TCSETSF2 _IOW('T', 0x2D, struct termios2) | ||
54 | #define TIOCGRS485 0x542E | ||
55 | #define TIOCSRS485 0x542F | ||
56 | #define TIOCGPTN _IOR('T', 0x30, unsigned int) | ||
57 | /* Get Pty Number (of pty-mux device) */ | ||
58 | #define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ | ||
59 | #define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ | ||
60 | #define TCSETX 0x5433 | ||
61 | #define TCSETXF 0x5434 | ||
62 | #define TCSETXW 0x5435 | ||
63 | |||
64 | #define FIONCLEX 0x5450 | ||
65 | #define FIOCLEX 0x5451 | ||
66 | #define FIOASYNC 0x5452 | ||
67 | #define TIOCSERCONFIG 0x5453 | ||
68 | #define TIOCSERGWILD 0x5454 | ||
69 | #define TIOCSERSWILD 0x5455 | ||
70 | #define TIOCGLCKTRMIOS 0x5456 | ||
71 | #define TIOCSLCKTRMIOS 0x5457 | ||
72 | #define TIOCSERGSTRUCT 0x5458 /* For debugging only */ | ||
73 | #define TIOCSERGETLSR 0x5459 /* Get line status register */ | ||
74 | #define TIOCSERGETMULTI 0x545A /* Get multiport config */ | ||
75 | #define TIOCSERSETMULTI 0x545B /* Set multiport config */ | ||
76 | |||
77 | #define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ | ||
78 | #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ | ||
79 | #define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ | ||
80 | #define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ | ||
81 | #define FIOQSIZE 0x5460 | ||
82 | |||
83 | /* Used for packet mode */ | ||
84 | #define TIOCPKT_DATA 0 | ||
85 | #define TIOCPKT_FLUSHREAD 1 | ||
86 | #define TIOCPKT_FLUSHWRITE 2 | ||
87 | #define TIOCPKT_STOP 4 | ||
88 | #define TIOCPKT_START 8 | ||
89 | #define TIOCPKT_NOSTOP 16 | ||
90 | #define TIOCPKT_DOSTOP 32 | ||
91 | |||
92 | #define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ | ||
93 | |||
94 | #endif /* _ASM_X86_IOCTLS_H */ | ||
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d97..f35eb45d657 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h | |||
@@ -26,13 +26,16 @@ | |||
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | 28 | ||
29 | int | ||
30 | is_io_mapping_possible(resource_size_t base, unsigned long size); | ||
31 | |||
32 | void * | 29 | void * |
33 | iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); | 30 | iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); |
34 | 31 | ||
35 | void | 32 | void |
36 | iounmap_atomic(void *kvaddr, enum km_type type); | 33 | iounmap_atomic(void *kvaddr, enum km_type type); |
37 | 34 | ||
35 | int | ||
36 | iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); | ||
37 | |||
38 | void | ||
39 | iomap_free(resource_size_t base, unsigned long size); | ||
40 | |||
38 | #endif /* _ASM_X86_IOMAP_H */ | 41 | #endif /* _ASM_X86_IOMAP_H */ |
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index ee678fd5159..84c7e51cb6d 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h | |||
@@ -1,28 +1 @@ | |||
1 | #ifndef _ASM_X86_IPCBUF_H | #include <asm-generic/ipcbuf.h> | |
2 | #define _ASM_X86_IPCBUF_H | ||
3 | |||
4 | /* | ||
5 | * The ipc64_perm structure for x86 architecture. | ||
6 | * Note extra padding because this structure is passed back and forth | ||
7 | * between kernel and user space. | ||
8 | * | ||
9 | * Pad space is left for: | ||
10 | * - 32-bit mode_t and seq | ||
11 | * - 2 miscellaneous 32-bit values | ||
12 | */ | ||
13 | |||
14 | struct ipc64_perm { | ||
15 | __kernel_key_t key; | ||
16 | __kernel_uid32_t uid; | ||
17 | __kernel_gid32_t gid; | ||
18 | __kernel_uid32_t cuid; | ||
19 | __kernel_gid32_t cgid; | ||
20 | __kernel_mode_t mode; | ||
21 | unsigned short __pad1; | ||
22 | unsigned short seq; | ||
23 | unsigned short __pad2; | ||
24 | unsigned long __unused1; | ||
25 | unsigned long __unused2; | ||
26 | }; | ||
27 | |||
28 | #endif /* _ASM_X86_IPCBUF_H */ | ||
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index f38481bcd45..ddda6cbed6f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h | |||
@@ -37,7 +37,6 @@ extern void fixup_irqs(void); | |||
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | extern void (*generic_interrupt_extension)(void); | 39 | extern void (*generic_interrupt_extension)(void); |
40 | extern void init_IRQ(void); | ||
41 | extern void native_init_IRQ(void); | 40 | extern void native_init_IRQ(void); |
42 | extern bool handle_irq(unsigned irq, struct pt_regs *regs); | 41 | extern bool handle_irq(unsigned irq, struct pt_regs *regs); |
43 | 42 | ||
@@ -47,4 +46,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs); | |||
47 | extern DECLARE_BITMAP(used_vectors, NR_VECTORS); | 46 | extern DECLARE_BITMAP(used_vectors, NR_VECTORS); |
48 | extern int vector_used_by_percpu_irq(unsigned int vector); | 47 | extern int vector_used_by_percpu_irq(unsigned int vector); |
49 | 48 | ||
49 | extern void init_ISA_irqs(void); | ||
50 | |||
50 | #endif /* _ASM_X86_IRQ_H */ | 51 | #endif /* _ASM_X86_IRQ_H */ |
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c6ccbe7e81a..9e2b952f810 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h | |||
@@ -13,14 +13,13 @@ static inline unsigned long native_save_fl(void) | |||
13 | unsigned long flags; | 13 | unsigned long flags; |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Note: this needs to be "=r" not "=rm", because we have the | 16 | * "=rm" is safe here, because "pop" adjusts the stack before |
17 | * stack offset from what gcc expects at the time the "pop" is | 17 | * it evaluates its effective address -- this is part of the |
18 | * executed, and so a memory reference with respect to the stack | 18 | * documented behavior of the "pop" instruction. |
19 | * would end up using the wrong address. | ||
20 | */ | 19 | */ |
21 | asm volatile("# __raw_save_flags\n\t" | 20 | asm volatile("# __raw_save_flags\n\t" |
22 | "pushf ; pop %0" | 21 | "pushf ; pop %0" |
23 | : "=r" (flags) | 22 | : "=rm" (flags) |
24 | : /* no input */ | 23 | : /* no input */ |
25 | : "memory"); | 24 | : "memory"); |
26 | 25 | ||
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 125be8b1956..4a5fe914dc5 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h | |||
@@ -17,6 +17,8 @@ | |||
17 | #define __KVM_HAVE_USER_NMI | 17 | #define __KVM_HAVE_USER_NMI |
18 | #define __KVM_HAVE_GUEST_DEBUG | 18 | #define __KVM_HAVE_GUEST_DEBUG |
19 | #define __KVM_HAVE_MSIX | 19 | #define __KVM_HAVE_MSIX |
20 | #define __KVM_HAVE_MCE | ||
21 | #define __KVM_HAVE_PIT_STATE2 | ||
20 | 22 | ||
21 | /* Architectural interrupt line count. */ | 23 | /* Architectural interrupt line count. */ |
22 | #define KVM_NR_INTERRUPTS 256 | 24 | #define KVM_NR_INTERRUPTS 256 |
@@ -236,6 +238,14 @@ struct kvm_pit_state { | |||
236 | struct kvm_pit_channel_state channels[3]; | 238 | struct kvm_pit_channel_state channels[3]; |
237 | }; | 239 | }; |
238 | 240 | ||
241 | #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 | ||
242 | |||
243 | struct kvm_pit_state2 { | ||
244 | struct kvm_pit_channel_state channels[3]; | ||
245 | __u32 flags; | ||
246 | __u32 reserved[9]; | ||
247 | }; | ||
248 | |||
239 | struct kvm_reinject_control { | 249 | struct kvm_reinject_control { |
240 | __u8 pit_reinject; | 250 | __u8 pit_reinject; |
241 | __u8 reserved[31]; | 251 | __u8 reserved[31]; |
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7ed2c42311..b7ed2c42311 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h | |||
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eabdc1cfab5..3be000435fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
17 | #include <linux/tracepoint.h> | ||
17 | 18 | ||
18 | #include <linux/kvm.h> | 19 | #include <linux/kvm.h> |
19 | #include <linux/kvm_para.h> | 20 | #include <linux/kvm_para.h> |
@@ -37,12 +38,14 @@ | |||
37 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ | 38 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ |
38 | 0xFFFFFF0000000000ULL) | 39 | 0xFFFFFF0000000000ULL) |
39 | 40 | ||
40 | #define KVM_GUEST_CR0_MASK \ | 41 | #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ |
41 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ | 42 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) |
42 | | X86_CR0_NW | X86_CR0_CD) | 43 | #define KVM_GUEST_CR0_MASK \ |
44 | (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) | ||
45 | #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ | ||
46 | (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) | ||
43 | #define KVM_VM_CR0_ALWAYS_ON \ | 47 | #define KVM_VM_CR0_ALWAYS_ON \ |
44 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ | 48 | (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) |
45 | | X86_CR0_MP) | ||
46 | #define KVM_GUEST_CR4_MASK \ | 49 | #define KVM_GUEST_CR4_MASK \ |
47 | (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) | 50 | (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) |
48 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) | 51 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) |
@@ -51,12 +54,12 @@ | |||
51 | #define INVALID_PAGE (~(hpa_t)0) | 54 | #define INVALID_PAGE (~(hpa_t)0) |
52 | #define UNMAPPED_GVA (~(gpa_t)0) | 55 | #define UNMAPPED_GVA (~(gpa_t)0) |
53 | 56 | ||
54 | /* shadow tables are PAE even on non-PAE hosts */ | 57 | /* KVM Hugepage definitions for x86 */ |
55 | #define KVM_HPAGE_SHIFT 21 | 58 | #define KVM_NR_PAGE_SIZES 3 |
56 | #define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) | 59 | #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) |
57 | #define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) | 60 | #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) |
58 | 61 | #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) | |
59 | #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) | 62 | #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) |
60 | 63 | ||
61 | #define DE_VECTOR 0 | 64 | #define DE_VECTOR 0 |
62 | #define DB_VECTOR 1 | 65 | #define DB_VECTOR 1 |
@@ -120,6 +123,10 @@ enum kvm_reg { | |||
120 | NR_VCPU_REGS | 123 | NR_VCPU_REGS |
121 | }; | 124 | }; |
122 | 125 | ||
126 | enum kvm_reg_ex { | ||
127 | VCPU_EXREG_PDPTR = NR_VCPU_REGS, | ||
128 | }; | ||
129 | |||
123 | enum { | 130 | enum { |
124 | VCPU_SREG_ES, | 131 | VCPU_SREG_ES, |
125 | VCPU_SREG_CS, | 132 | VCPU_SREG_CS, |
@@ -131,7 +138,7 @@ enum { | |||
131 | VCPU_SREG_LDTR, | 138 | VCPU_SREG_LDTR, |
132 | }; | 139 | }; |
133 | 140 | ||
134 | #include <asm/kvm_x86_emulate.h> | 141 | #include <asm/kvm_emulate.h> |
135 | 142 | ||
136 | #define KVM_NR_MEM_OBJS 40 | 143 | #define KVM_NR_MEM_OBJS 40 |
137 | 144 | ||
@@ -308,7 +315,6 @@ struct kvm_vcpu_arch { | |||
308 | struct { | 315 | struct { |
309 | gfn_t gfn; /* presumed gfn during guest pte update */ | 316 | gfn_t gfn; /* presumed gfn during guest pte update */ |
310 | pfn_t pfn; /* pfn corresponding to that gfn */ | 317 | pfn_t pfn; /* pfn corresponding to that gfn */ |
311 | int largepage; | ||
312 | unsigned long mmu_seq; | 318 | unsigned long mmu_seq; |
313 | } update_pte; | 319 | } update_pte; |
314 | 320 | ||
@@ -334,16 +340,6 @@ struct kvm_vcpu_arch { | |||
334 | u8 nr; | 340 | u8 nr; |
335 | } interrupt; | 341 | } interrupt; |
336 | 342 | ||
337 | struct { | ||
338 | int vm86_active; | ||
339 | u8 save_iopl; | ||
340 | struct kvm_save_segment { | ||
341 | u16 selector; | ||
342 | unsigned long base; | ||
343 | u32 limit; | ||
344 | u32 ar; | ||
345 | } tr, es, ds, fs, gs; | ||
346 | } rmode; | ||
347 | int halt_request; /* real mode on Intel only */ | 343 | int halt_request; /* real mode on Intel only */ |
348 | 344 | ||
349 | int cpuid_nent; | 345 | int cpuid_nent; |
@@ -366,13 +362,15 @@ struct kvm_vcpu_arch { | |||
366 | u32 pat; | 362 | u32 pat; |
367 | 363 | ||
368 | int switch_db_regs; | 364 | int switch_db_regs; |
369 | unsigned long host_db[KVM_NR_DB_REGS]; | ||
370 | unsigned long host_dr6; | ||
371 | unsigned long host_dr7; | ||
372 | unsigned long db[KVM_NR_DB_REGS]; | 365 | unsigned long db[KVM_NR_DB_REGS]; |
373 | unsigned long dr6; | 366 | unsigned long dr6; |
374 | unsigned long dr7; | 367 | unsigned long dr7; |
375 | unsigned long eff_db[KVM_NR_DB_REGS]; | 368 | unsigned long eff_db[KVM_NR_DB_REGS]; |
369 | |||
370 | u64 mcg_cap; | ||
371 | u64 mcg_status; | ||
372 | u64 mcg_ctl; | ||
373 | u64 *mce_banks; | ||
376 | }; | 374 | }; |
377 | 375 | ||
378 | struct kvm_mem_alias { | 376 | struct kvm_mem_alias { |
@@ -409,6 +407,7 @@ struct kvm_arch{ | |||
409 | 407 | ||
410 | struct page *ept_identity_pagetable; | 408 | struct page *ept_identity_pagetable; |
411 | bool ept_identity_pagetable_done; | 409 | bool ept_identity_pagetable_done; |
410 | gpa_t ept_identity_map_addr; | ||
412 | 411 | ||
413 | unsigned long irq_sources_bitmap; | 412 | unsigned long irq_sources_bitmap; |
414 | unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; | 413 | unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; |
@@ -526,6 +525,9 @@ struct kvm_x86_ops { | |||
526 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); | 525 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); |
527 | int (*get_tdp_level)(void); | 526 | int (*get_tdp_level)(void); |
528 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); | 527 | u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); |
528 | bool (*gb_page_enable)(void); | ||
529 | |||
530 | const struct trace_print_flags *exit_reasons_str; | ||
529 | }; | 531 | }; |
530 | 532 | ||
531 | extern struct kvm_x86_ops *kvm_x86_ops; | 533 | extern struct kvm_x86_ops *kvm_x86_ops; |
@@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | |||
618 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | 620 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); |
619 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, | 621 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, |
620 | u32 error_code); | 622 | u32 error_code); |
623 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); | ||
621 | 624 | ||
622 | int kvm_pic_set_irq(void *opaque, int irq, int level); | 625 | int kvm_pic_set_irq(void *opaque, int irq, int level); |
623 | 626 | ||
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) | |||
752 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | 755 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); |
753 | } | 756 | } |
754 | 757 | ||
755 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 | ||
756 | |||
757 | #define TSS_IOPB_BASE_OFFSET 0x66 | 758 | #define TSS_IOPB_BASE_OFFSET 0x66 |
758 | #define TSS_BASE_SIZE 0x68 | 759 | #define TSS_BASE_SIZE 0x68 |
759 | #define TSS_IOPB_SIZE (65536 / 8) | 760 | #define TSS_IOPB_SIZE (65536 / 8) |
@@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); | |||
796 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); | 797 | int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); |
797 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); | 798 | int kvm_age_hva(struct kvm *kvm, unsigned long hva); |
798 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); | 799 | int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); |
800 | int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); | ||
801 | int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); | ||
802 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
799 | 803 | ||
800 | #endif /* _ASM_X86_KVM_HOST_H */ | 804 | #endif /* _ASM_X86_KVM_HOST_H */ |
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305ae09..c584076a47f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _ASM_X86_KVM_PARA_H | 1 | #ifndef _ASM_X86_KVM_PARA_H |
2 | #define _ASM_X86_KVM_PARA_H | 2 | #define _ASM_X86_KVM_PARA_H |
3 | 3 | ||
4 | #include <linux/types.h> | ||
5 | |||
4 | /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It | 6 | /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It |
5 | * should be used to determine that a VM is running under KVM. | 7 | * should be used to determine that a VM is running under KVM. |
6 | */ | 8 | */ |
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 5136dad57cb..0d97deba1e3 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h | |||
@@ -90,8 +90,9 @@ static inline void lguest_set_ts(void) | |||
90 | } | 90 | } |
91 | 91 | ||
92 | /* Full 4G segment descriptors, suitable for CS and DS. */ | 92 | /* Full 4G segment descriptors, suitable for CS and DS. */ |
93 | #define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) | 93 | #define FULL_EXEC_SEGMENT \ |
94 | #define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) | 94 | ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) |
95 | #define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) | ||
95 | 96 | ||
96 | #endif /* __ASSEMBLY__ */ | 97 | #endif /* __ASSEMBLY__ */ |
97 | 98 | ||
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec..b608a64c581 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -9,7 +9,7 @@ | |||
9 | */ | 9 | */ |
10 | 10 | ||
11 | #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ | 11 | #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ |
12 | #define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ | 12 | #define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ |
13 | #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ | 13 | #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ |
14 | #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ | 14 | #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ |
15 | #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ | 15 | #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ |
@@ -38,6 +38,14 @@ | |||
38 | #define MCM_ADDR_MEM 3 /* memory address */ | 38 | #define MCM_ADDR_MEM 3 /* memory address */ |
39 | #define MCM_ADDR_GENERIC 7 /* generic */ | 39 | #define MCM_ADDR_GENERIC 7 /* generic */ |
40 | 40 | ||
41 | #define MCJ_CTX_MASK 3 | ||
42 | #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) | ||
43 | #define MCJ_CTX_RANDOM 0 /* inject context: random */ | ||
44 | #define MCJ_CTX_PROCESS 1 /* inject context: process */ | ||
45 | #define MCJ_CTX_IRQ 2 /* inject context: IRQ */ | ||
46 | #define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ | ||
47 | #define MCJ_EXCEPTION 8 /* raise as exception */ | ||
48 | |||
41 | /* Fields are zero when not available */ | 49 | /* Fields are zero when not available */ |
42 | struct mce { | 50 | struct mce { |
43 | __u64 status; | 51 | __u64 status; |
@@ -48,8 +56,8 @@ struct mce { | |||
48 | __u64 tsc; /* cpu time stamp counter */ | 56 | __u64 tsc; /* cpu time stamp counter */ |
49 | __u64 time; /* wall time_t when error was detected */ | 57 | __u64 time; /* wall time_t when error was detected */ |
50 | __u8 cpuvendor; /* cpu vendor as encoded in system.h */ | 58 | __u8 cpuvendor; /* cpu vendor as encoded in system.h */ |
51 | __u8 pad1; | 59 | __u8 inject_flags; /* software inject flags */ |
52 | __u16 pad2; | 60 | __u16 pad; |
53 | __u32 cpuid; /* CPUID 1 EAX */ | 61 | __u32 cpuid; /* CPUID 1 EAX */ |
54 | __u8 cs; /* code segment */ | 62 | __u8 cs; /* code segment */ |
55 | __u8 bank; /* machine check bank */ | 63 | __u8 bank; /* machine check bank */ |
@@ -115,13 +123,6 @@ void mcheck_init(struct cpuinfo_x86 *c); | |||
115 | static inline void mcheck_init(struct cpuinfo_x86 *c) {} | 123 | static inline void mcheck_init(struct cpuinfo_x86 *c) {} |
116 | #endif | 124 | #endif |
117 | 125 | ||
118 | #ifdef CONFIG_X86_OLD_MCE | ||
119 | extern int nr_mce_banks; | ||
120 | void amd_mcheck_init(struct cpuinfo_x86 *c); | ||
121 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c); | ||
122 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c); | ||
123 | #endif | ||
124 | |||
125 | #ifdef CONFIG_X86_ANCIENT_MCE | 126 | #ifdef CONFIG_X86_ANCIENT_MCE |
126 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); | 127 | void intel_p5_mcheck_init(struct cpuinfo_x86 *c); |
127 | void winchip_mcheck_init(struct cpuinfo_x86 *c); | 128 | void winchip_mcheck_init(struct cpuinfo_x86 *c); |
@@ -137,10 +138,11 @@ void mce_log(struct mce *m); | |||
137 | DECLARE_PER_CPU(struct sys_device, mce_dev); | 138 | DECLARE_PER_CPU(struct sys_device, mce_dev); |
138 | 139 | ||
139 | /* | 140 | /* |
140 | * To support more than 128 would need to escape the predefined | 141 | * Maximum banks number. |
141 | * Linux defined extended banks first. | 142 | * This is the limit of the current register layout on |
143 | * Intel CPUs. | ||
142 | */ | 144 | */ |
143 | #define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) | 145 | #define MAX_NR_BANKS 32 |
144 | 146 | ||
145 | #ifdef CONFIG_X86_MCE_INTEL | 147 | #ifdef CONFIG_X86_MCE_INTEL |
146 | extern int mce_cmci_disabled; | 148 | extern int mce_cmci_disabled; |
@@ -208,11 +210,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | |||
208 | 210 | ||
209 | void intel_init_thermal(struct cpuinfo_x86 *c); | 211 | void intel_init_thermal(struct cpuinfo_x86 *c); |
210 | 212 | ||
211 | #ifdef CONFIG_X86_NEW_MCE | ||
212 | void mce_log_therm_throt_event(__u64 status); | 213 | void mce_log_therm_throt_event(__u64 status); |
213 | #else | ||
214 | static inline void mce_log_therm_throt_event(__u64 status) {} | ||
215 | #endif | ||
216 | 214 | ||
217 | #endif /* __KERNEL__ */ | 215 | #endif /* __KERNEL__ */ |
218 | #endif /* _ASM_X86_MCE_H */ | 216 | #endif /* _ASM_X86_MCE_H */ |
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 751af2550ed..593e51d4643 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h | |||
@@ -1,20 +1,8 @@ | |||
1 | #ifndef _ASM_X86_MMAN_H | 1 | #ifndef _ASM_X86_MMAN_H |
2 | #define _ASM_X86_MMAN_H | 2 | #define _ASM_X86_MMAN_H |
3 | 3 | ||
4 | #include <asm-generic/mman-common.h> | ||
5 | |||
6 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ | 4 | #define MAP_32BIT 0x40 /* only give out 32bit addresses */ |
7 | 5 | ||
8 | #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ | 6 | #include <asm-generic/mman.h> |
9 | #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ | ||
10 | #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ | ||
11 | #define MAP_LOCKED 0x2000 /* pages are locked */ | ||
12 | #define MAP_NORESERVE 0x4000 /* don't check for reservations */ | ||
13 | #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ | ||
14 | #define MAP_NONBLOCK 0x10000 /* do not block on IO */ | ||
15 | #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ | ||
16 | |||
17 | #define MCL_CURRENT 1 /* lock all current mappings */ | ||
18 | #define MCL_FUTURE 2 /* lock all future mappings */ | ||
19 | 7 | ||
20 | #endif /* _ASM_X86_MMAN_H */ | 8 | #endif /* _ASM_X86_MMAN_H */ |
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index f923203dc39..4a2d4e0c18d 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h | |||
@@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
37 | 37 | ||
38 | if (likely(prev != next)) { | 38 | if (likely(prev != next)) { |
39 | /* stop flush ipis for the previous mm */ | 39 | /* stop flush ipis for the previous mm */ |
40 | cpu_clear(cpu, prev->cpu_vm_mask); | 40 | cpumask_clear_cpu(cpu, mm_cpumask(prev)); |
41 | #ifdef CONFIG_SMP | 41 | #ifdef CONFIG_SMP |
42 | percpu_write(cpu_tlbstate.state, TLBSTATE_OK); | 42 | percpu_write(cpu_tlbstate.state, TLBSTATE_OK); |
43 | percpu_write(cpu_tlbstate.active_mm, next); | 43 | percpu_write(cpu_tlbstate.active_mm, next); |
44 | #endif | 44 | #endif |
45 | cpu_set(cpu, next->cpu_vm_mask); | 45 | cpumask_set_cpu(cpu, mm_cpumask(next)); |
46 | 46 | ||
47 | /* Re-load page tables */ | 47 | /* Re-load page tables */ |
48 | load_cr3(next->pgd); | 48 | load_cr3(next->pgd); |
@@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
58 | percpu_write(cpu_tlbstate.state, TLBSTATE_OK); | 58 | percpu_write(cpu_tlbstate.state, TLBSTATE_OK); |
59 | BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); | 59 | BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); |
60 | 60 | ||
61 | if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { | 61 | if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { |
62 | /* We were in lazy tlb mode and leave_mm disabled | 62 | /* We were in lazy tlb mode and leave_mm disabled |
63 | * tlb flush IPI delivery. We must reload CR3 | 63 | * tlb flush IPI delivery. We must reload CR3 |
64 | * to make sure to use no freed page tables. | 64 | * to make sure to use no freed page tables. |
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d..3e2ce58a31a 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h | |||
@@ -1,18 +1,7 @@ | |||
1 | #ifndef _ASM_X86_MODULE_H | 1 | #ifndef _ASM_X86_MODULE_H |
2 | #define _ASM_X86_MODULE_H | 2 | #define _ASM_X86_MODULE_H |
3 | 3 | ||
4 | /* x86_32/64 are simple */ | 4 | #include <asm-generic/module.h> |
5 | struct mod_arch_specific {}; | ||
6 | |||
7 | #ifdef CONFIG_X86_32 | ||
8 | # define Elf_Shdr Elf32_Shdr | ||
9 | # define Elf_Sym Elf32_Sym | ||
10 | # define Elf_Ehdr Elf32_Ehdr | ||
11 | #else | ||
12 | # define Elf_Shdr Elf64_Shdr | ||
13 | # define Elf_Sym Elf64_Sym | ||
14 | # define Elf_Ehdr Elf64_Ehdr | ||
15 | #endif | ||
16 | 5 | ||
17 | #ifdef CONFIG_X86_64 | 6 | #ifdef CONFIG_X86_64 |
18 | /* X86_64 does not define MODULE_PROC_FAMILY */ | 7 | /* X86_64 does not define MODULE_PROC_FAMILY */ |
@@ -28,6 +17,8 @@ struct mod_arch_specific {}; | |||
28 | #define MODULE_PROC_FAMILY "586MMX " | 17 | #define MODULE_PROC_FAMILY "586MMX " |
29 | #elif defined CONFIG_MCORE2 | 18 | #elif defined CONFIG_MCORE2 |
30 | #define MODULE_PROC_FAMILY "CORE2 " | 19 | #define MODULE_PROC_FAMILY "CORE2 " |
20 | #elif defined CONFIG_MATOM | ||
21 | #define MODULE_PROC_FAMILY "ATOM " | ||
31 | #elif defined CONFIG_M686 | 22 | #elif defined CONFIG_M686 |
32 | #define MODULE_PROC_FAMILY "686 " | 23 | #define MODULE_PROC_FAMILY "686 " |
33 | #elif defined CONFIG_MPENTIUMII | 24 | #elif defined CONFIG_MPENTIUMII |
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e2a1bb6d71e..79c94500c0b 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/init.h> | 4 | #include <linux/init.h> |
5 | 5 | ||
6 | #include <asm/mpspec_def.h> | 6 | #include <asm/mpspec_def.h> |
7 | #include <asm/x86_init.h> | ||
7 | 8 | ||
8 | extern int apic_version[MAX_APICS]; | 9 | extern int apic_version[MAX_APICS]; |
9 | extern int pic_mode; | 10 | extern int pic_mode; |
@@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; | |||
41 | 42 | ||
42 | #endif /* CONFIG_X86_64 */ | 43 | #endif /* CONFIG_X86_64 */ |
43 | 44 | ||
44 | extern void early_find_smp_config(void); | ||
45 | extern void early_get_smp_config(void); | ||
46 | |||
47 | #if defined(CONFIG_MCA) || defined(CONFIG_EISA) | 45 | #if defined(CONFIG_MCA) || defined(CONFIG_EISA) |
48 | extern int mp_bus_id_to_type[MAX_MP_BUSSES]; | 46 | extern int mp_bus_id_to_type[MAX_MP_BUSSES]; |
49 | #endif | 47 | #endif |
@@ -52,20 +50,55 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | |||
52 | 50 | ||
53 | extern unsigned int boot_cpu_physical_apicid; | 51 | extern unsigned int boot_cpu_physical_apicid; |
54 | extern unsigned int max_physical_apicid; | 52 | extern unsigned int max_physical_apicid; |
55 | extern int smp_found_config; | ||
56 | extern int mpc_default_type; | 53 | extern int mpc_default_type; |
57 | extern unsigned long mp_lapic_addr; | 54 | extern unsigned long mp_lapic_addr; |
58 | 55 | ||
59 | extern void get_smp_config(void); | 56 | #ifdef CONFIG_X86_LOCAL_APIC |
57 | extern int smp_found_config; | ||
58 | #else | ||
59 | # define smp_found_config 0 | ||
60 | #endif | ||
61 | |||
62 | static inline void get_smp_config(void) | ||
63 | { | ||
64 | x86_init.mpparse.get_smp_config(0); | ||
65 | } | ||
66 | |||
67 | static inline void early_get_smp_config(void) | ||
68 | { | ||
69 | x86_init.mpparse.get_smp_config(1); | ||
70 | } | ||
71 | |||
72 | static inline void find_smp_config(void) | ||
73 | { | ||
74 | x86_init.mpparse.find_smp_config(1); | ||
75 | } | ||
76 | |||
77 | static inline void early_find_smp_config(void) | ||
78 | { | ||
79 | x86_init.mpparse.find_smp_config(0); | ||
80 | } | ||
60 | 81 | ||
61 | #ifdef CONFIG_X86_MPPARSE | 82 | #ifdef CONFIG_X86_MPPARSE |
62 | extern void find_smp_config(void); | ||
63 | extern void early_reserve_e820_mpc_new(void); | 83 | extern void early_reserve_e820_mpc_new(void); |
64 | extern int enable_update_mptable; | 84 | extern int enable_update_mptable; |
85 | extern int default_mpc_apic_id(struct mpc_cpu *m); | ||
86 | extern void default_smp_read_mpc_oem(struct mpc_table *mpc); | ||
87 | # ifdef CONFIG_X86_IO_APIC | ||
88 | extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); | ||
89 | # else | ||
90 | # define default_mpc_oem_bus_info NULL | ||
91 | # endif | ||
92 | extern void default_find_smp_config(unsigned int reserve); | ||
93 | extern void default_get_smp_config(unsigned int early); | ||
65 | #else | 94 | #else |
66 | static inline void find_smp_config(void) { } | ||
67 | static inline void early_reserve_e820_mpc_new(void) { } | 95 | static inline void early_reserve_e820_mpc_new(void) { } |
68 | #define enable_update_mptable 0 | 96 | #define enable_update_mptable 0 |
97 | #define default_mpc_apic_id NULL | ||
98 | #define default_smp_read_mpc_oem NULL | ||
99 | #define default_mpc_oem_bus_info NULL | ||
100 | #define default_find_smp_config x86_init_uint_noop | ||
101 | #define default_get_smp_config x86_init_uint_noop | ||
69 | #endif | 102 | #endif |
70 | 103 | ||
71 | void __cpuinit generic_processor_info(int apicid, int version); | 104 | void __cpuinit generic_processor_info(int apicid, int version); |
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index 7e4e9481f51..809134c644a 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h | |||
@@ -1,39 +1 @@ | |||
1 | #ifndef _ASM_X86_MSGBUF_H | #include <asm-generic/msgbuf.h> | |
2 | #define _ASM_X86_MSGBUF_H | ||
3 | |||
4 | /* | ||
5 | * The msqid64_ds structure for i386 architecture. | ||
6 | * Note extra padding because this structure is passed back and forth | ||
7 | * between kernel and user space. | ||
8 | * | ||
9 | * Pad space on i386 is left for: | ||
10 | * - 64-bit time_t to solve y2038 problem | ||
11 | * - 2 miscellaneous 32-bit values | ||
12 | * | ||
13 | * Pad space on x8664 is left for: | ||
14 | * - 2 miscellaneous 64-bit values | ||
15 | */ | ||
16 | struct msqid64_ds { | ||
17 | struct ipc64_perm msg_perm; | ||
18 | __kernel_time_t msg_stime; /* last msgsnd time */ | ||
19 | #ifdef __i386__ | ||
20 | unsigned long __unused1; | ||
21 | #endif | ||
22 | __kernel_time_t msg_rtime; /* last msgrcv time */ | ||
23 | #ifdef __i386__ | ||
24 | unsigned long __unused2; | ||
25 | #endif | ||
26 | __kernel_time_t msg_ctime; /* last change time */ | ||
27 | #ifdef __i386__ | ||
28 | unsigned long __unused3; | ||
29 | #endif | ||
30 | unsigned long msg_cbytes; /* current number of bytes on queue */ | ||
31 | unsigned long msg_qnum; /* number of messages in queue */ | ||
32 | unsigned long msg_qbytes; /* max number of bytes on queue */ | ||
33 | __kernel_pid_t msg_lspid; /* pid of last msgsnd */ | ||
34 | __kernel_pid_t msg_lrpid; /* last receive pid */ | ||
35 | unsigned long __unused4; | ||
36 | unsigned long __unused5; | ||
37 | }; | ||
38 | |||
39 | #endif /* _ASM_X86_MSGBUF_H */ | ||
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6be7fc254b5..4ffe09b2ad7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h | |||
@@ -81,8 +81,15 @@ | |||
81 | #define MSR_IA32_MC0_ADDR 0x00000402 | 81 | #define MSR_IA32_MC0_ADDR 0x00000402 |
82 | #define MSR_IA32_MC0_MISC 0x00000403 | 82 | #define MSR_IA32_MC0_MISC 0x00000403 |
83 | 83 | ||
84 | #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) | ||
85 | #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) | ||
86 | #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) | ||
87 | #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) | ||
88 | |||
84 | /* These are consecutive and not in the normal 4er MCE bank block */ | 89 | /* These are consecutive and not in the normal 4er MCE bank block */ |
85 | #define MSR_IA32_MC0_CTL2 0x00000280 | 90 | #define MSR_IA32_MC0_CTL2 0x00000280 |
91 | #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) | ||
92 | |||
86 | #define CMCI_EN (1ULL << 30) | 93 | #define CMCI_EN (1ULL << 30) |
87 | #define CMCI_THRESHOLD_MASK 0xffffULL | 94 | #define CMCI_THRESHOLD_MASK 0xffffULL |
88 | 95 | ||
@@ -215,6 +222,10 @@ | |||
215 | 222 | ||
216 | #define THERM_STATUS_PROCHOT (1 << 0) | 223 | #define THERM_STATUS_PROCHOT (1 << 0) |
217 | 224 | ||
225 | #define MSR_THERM2_CTL 0x0000019d | ||
226 | |||
227 | #define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) | ||
228 | |||
218 | #define MSR_IA32_MISC_ENABLE 0x000001a0 | 229 | #define MSR_IA32_MISC_ENABLE 0x000001a0 |
219 | 230 | ||
220 | /* MISC_ENABLE bits: architectural */ | 231 | /* MISC_ENABLE bits: architectural */ |
@@ -374,6 +385,7 @@ | |||
374 | /* AMD-V MSRs */ | 385 | /* AMD-V MSRs */ |
375 | 386 | ||
376 | #define MSR_VM_CR 0xc0010114 | 387 | #define MSR_VM_CR 0xc0010114 |
388 | #define MSR_VM_IGNNE 0xc0010115 | ||
377 | #define MSR_VM_HSAVE_PA 0xc0010117 | 389 | #define MSR_VM_HSAVE_PA 0xc0010117 |
378 | 390 | ||
379 | #endif /* _ASM_X86_MSR_INDEX_H */ | 391 | #endif /* _ASM_X86_MSR_INDEX_H */ |
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 48ad9d29484..7e2b6ba962f 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h | |||
@@ -3,10 +3,16 @@ | |||
3 | 3 | ||
4 | #include <asm/msr-index.h> | 4 | #include <asm/msr-index.h> |
5 | 5 | ||
6 | #ifdef __KERNEL__ | ||
7 | #ifndef __ASSEMBLY__ | 6 | #ifndef __ASSEMBLY__ |
8 | 7 | ||
9 | #include <linux/types.h> | 8 | #include <linux/types.h> |
9 | #include <linux/ioctl.h> | ||
10 | |||
11 | #define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) | ||
12 | #define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) | ||
13 | |||
14 | #ifdef __KERNEL__ | ||
15 | |||
10 | #include <asm/asm.h> | 16 | #include <asm/asm.h> |
11 | #include <asm/errno.h> | 17 | #include <asm/errno.h> |
12 | #include <asm/cpumask.h> | 18 | #include <asm/cpumask.h> |
@@ -67,23 +73,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, | |||
67 | ".previous\n\t" | 73 | ".previous\n\t" |
68 | _ASM_EXTABLE(2b, 3b) | 74 | _ASM_EXTABLE(2b, 3b) |
69 | : [err] "=r" (*err), EAX_EDX_RET(val, low, high) | 75 | : [err] "=r" (*err), EAX_EDX_RET(val, low, high) |
70 | : "c" (msr), [fault] "i" (-EFAULT)); | 76 | : "c" (msr), [fault] "i" (-EIO)); |
71 | return EAX_EDX_VAL(val, low, high); | ||
72 | } | ||
73 | |||
74 | static inline unsigned long long native_read_msr_amd_safe(unsigned int msr, | ||
75 | int *err) | ||
76 | { | ||
77 | DECLARE_ARGS(val, low, high); | ||
78 | |||
79 | asm volatile("2: rdmsr ; xor %0,%0\n" | ||
80 | "1:\n\t" | ||
81 | ".section .fixup,\"ax\"\n\t" | ||
82 | "3: mov %3,%0 ; jmp 1b\n\t" | ||
83 | ".previous\n\t" | ||
84 | _ASM_EXTABLE(2b, 3b) | ||
85 | : "=r" (*err), EAX_EDX_RET(val, low, high) | ||
86 | : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT)); | ||
87 | return EAX_EDX_VAL(val, low, high); | 77 | return EAX_EDX_VAL(val, low, high); |
88 | } | 78 | } |
89 | 79 | ||
@@ -106,13 +96,16 @@ notrace static inline int native_write_msr_safe(unsigned int msr, | |||
106 | _ASM_EXTABLE(2b, 3b) | 96 | _ASM_EXTABLE(2b, 3b) |
107 | : [err] "=a" (err) | 97 | : [err] "=a" (err) |
108 | : "c" (msr), "0" (low), "d" (high), | 98 | : "c" (msr), "0" (low), "d" (high), |
109 | [fault] "i" (-EFAULT) | 99 | [fault] "i" (-EIO) |
110 | : "memory"); | 100 | : "memory"); |
111 | return err; | 101 | return err; |
112 | } | 102 | } |
113 | 103 | ||
114 | extern unsigned long long native_read_tsc(void); | 104 | extern unsigned long long native_read_tsc(void); |
115 | 105 | ||
106 | extern int native_rdmsr_safe_regs(u32 regs[8]); | ||
107 | extern int native_wrmsr_safe_regs(u32 regs[8]); | ||
108 | |||
116 | static __always_inline unsigned long long __native_read_tsc(void) | 109 | static __always_inline unsigned long long __native_read_tsc(void) |
117 | { | 110 | { |
118 | DECLARE_ARGS(val, low, high); | 111 | DECLARE_ARGS(val, low, high); |
@@ -181,14 +174,44 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) | |||
181 | *p = native_read_msr_safe(msr, &err); | 174 | *p = native_read_msr_safe(msr, &err); |
182 | return err; | 175 | return err; |
183 | } | 176 | } |
177 | |||
184 | static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) | 178 | static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) |
185 | { | 179 | { |
180 | u32 gprs[8] = { 0 }; | ||
186 | int err; | 181 | int err; |
187 | 182 | ||
188 | *p = native_read_msr_amd_safe(msr, &err); | 183 | gprs[1] = msr; |
184 | gprs[7] = 0x9c5a203a; | ||
185 | |||
186 | err = native_rdmsr_safe_regs(gprs); | ||
187 | |||
188 | *p = gprs[0] | ((u64)gprs[2] << 32); | ||
189 | |||
189 | return err; | 190 | return err; |
190 | } | 191 | } |
191 | 192 | ||
193 | static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) | ||
194 | { | ||
195 | u32 gprs[8] = { 0 }; | ||
196 | |||
197 | gprs[0] = (u32)val; | ||
198 | gprs[1] = msr; | ||
199 | gprs[2] = val >> 32; | ||
200 | gprs[7] = 0x9c5a203a; | ||
201 | |||
202 | return native_wrmsr_safe_regs(gprs); | ||
203 | } | ||
204 | |||
205 | static inline int rdmsr_safe_regs(u32 regs[8]) | ||
206 | { | ||
207 | return native_rdmsr_safe_regs(regs); | ||
208 | } | ||
209 | |||
210 | static inline int wrmsr_safe_regs(u32 regs[8]) | ||
211 | { | ||
212 | return native_wrmsr_safe_regs(regs); | ||
213 | } | ||
214 | |||
192 | #define rdtscl(low) \ | 215 | #define rdtscl(low) \ |
193 | ((low) = (u32)__native_read_tsc()) | 216 | ((low) = (u32)__native_read_tsc()) |
194 | 217 | ||
@@ -228,6 +251,8 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); | |||
228 | void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); | 251 | void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); |
229 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); | 252 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); |
230 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); | 253 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); |
254 | int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); | ||
255 | int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); | ||
231 | #else /* CONFIG_SMP */ | 256 | #else /* CONFIG_SMP */ |
232 | static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | 257 | static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) |
233 | { | 258 | { |
@@ -258,7 +283,15 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | |||
258 | { | 283 | { |
259 | return wrmsr_safe(msr_no, l, h); | 284 | return wrmsr_safe(msr_no, l, h); |
260 | } | 285 | } |
286 | static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) | ||
287 | { | ||
288 | return rdmsr_safe_regs(regs); | ||
289 | } | ||
290 | static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) | ||
291 | { | ||
292 | return wrmsr_safe_regs(regs); | ||
293 | } | ||
261 | #endif /* CONFIG_SMP */ | 294 | #endif /* CONFIG_SMP */ |
262 | #endif /* __ASSEMBLY__ */ | ||
263 | #endif /* __KERNEL__ */ | 295 | #endif /* __KERNEL__ */ |
296 | #endif /* __ASSEMBLY__ */ | ||
264 | #endif /* _ASM_X86_MSR_H */ | 297 | #endif /* _ASM_X86_MSR_H */ |
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..4365ffdb461 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h | |||
@@ -121,6 +121,9 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); | |||
121 | extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); | 121 | extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); |
122 | extern void mtrr_ap_init(void); | 122 | extern void mtrr_ap_init(void); |
123 | extern void mtrr_bp_init(void); | 123 | extern void mtrr_bp_init(void); |
124 | extern void set_mtrr_aps_delayed_init(void); | ||
125 | extern void mtrr_aps_init(void); | ||
126 | extern void mtrr_bp_restore(void); | ||
124 | extern int mtrr_trim_uncached_memory(unsigned long end_pfn); | 127 | extern int mtrr_trim_uncached_memory(unsigned long end_pfn); |
125 | extern int amd_special_default_mtrr(void); | 128 | extern int amd_special_default_mtrr(void); |
126 | # else | 129 | # else |
@@ -161,6 +164,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) | |||
161 | 164 | ||
162 | #define mtrr_ap_init() do {} while (0) | 165 | #define mtrr_ap_init() do {} while (0) |
163 | #define mtrr_bp_init() do {} while (0) | 166 | #define mtrr_bp_init() do {} while (0) |
167 | #define set_mtrr_aps_delayed_init() do {} while (0) | ||
168 | #define mtrr_aps_init() do {} while (0) | ||
169 | #define mtrr_bp_restore() do {} while (0) | ||
164 | # endif | 170 | # endif |
165 | 171 | ||
166 | #ifdef CONFIG_COMPAT | 172 | #ifdef CONFIG_COMPAT |
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c86e5ed4af5..139d4c1a33a 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h | |||
@@ -40,13 +40,12 @@ extern unsigned int nmi_watchdog; | |||
40 | #define NMI_INVALID 3 | 40 | #define NMI_INVALID 3 |
41 | 41 | ||
42 | struct ctl_table; | 42 | struct ctl_table; |
43 | struct file; | 43 | extern int proc_nmi_enabled(struct ctl_table *, int , |
44 | extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, | ||
45 | void __user *, size_t *, loff_t *); | 44 | void __user *, size_t *, loff_t *); |
46 | extern int unknown_nmi_panic; | 45 | extern int unknown_nmi_panic; |
47 | 46 | ||
48 | void __trigger_all_cpu_backtrace(void); | 47 | void arch_trigger_all_cpu_backtrace(void); |
49 | #define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() | 48 | #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace |
50 | 49 | ||
51 | static inline void localise_nmi_watchdog(void) | 50 | static inline void localise_nmi_watchdog(void) |
52 | { | 51 | { |
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index ad2668ee1aa..6d8723a766c 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h | |||
@@ -65,6 +65,8 @@ | |||
65 | 6: osp nopl 0x00(%eax,%eax,1) | 65 | 6: osp nopl 0x00(%eax,%eax,1) |
66 | 7: nopl 0x00000000(%eax) | 66 | 7: nopl 0x00000000(%eax) |
67 | 8: nopl 0x00000000(%eax,%eax,1) | 67 | 8: nopl 0x00000000(%eax,%eax,1) |
68 | Note: All the above are assumed to be a single instruction. | ||
69 | There is kernel code that depends on this. | ||
68 | */ | 70 | */ |
69 | #define P6_NOP1 GENERIC_NOP1 | 71 | #define P6_NOP1 GENERIC_NOP1 |
70 | #define P6_NOP2 ".byte 0x66,0x90\n" | 72 | #define P6_NOP2 ".byte 0x66,0x90\n" |
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index 6f0d0422f4c..965d4542797 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h | |||
@@ -1,22 +1 @@ | |||
1 | #ifndef _ASM_X86_PARAM_H | #include <asm-generic/param.h> | |
2 | #define _ASM_X86_PARAM_H | ||
3 | |||
4 | #ifdef __KERNEL__ | ||
5 | # define HZ CONFIG_HZ /* Internal kernel timer frequency */ | ||
6 | # define USER_HZ 100 /* some user interfaces are */ | ||
7 | # define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ | ||
8 | #endif | ||
9 | |||
10 | #ifndef HZ | ||
11 | #define HZ 100 | ||
12 | #endif | ||
13 | |||
14 | #define EXEC_PAGESIZE 4096 | ||
15 | |||
16 | #ifndef NOGROUP | ||
17 | #define NOGROUP (-1) | ||
18 | #endif | ||
19 | |||
20 | #define MAXHOSTNAMELEN 64 /* max length of hostname */ | ||
21 | |||
22 | #endif /* _ASM_X86_PARAM_H */ | ||
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a083..8aebcc41041 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -7,689 +7,11 @@ | |||
7 | #include <asm/pgtable_types.h> | 7 | #include <asm/pgtable_types.h> |
8 | #include <asm/asm.h> | 8 | #include <asm/asm.h> |
9 | 9 | ||
10 | /* Bitmask of what can be clobbered: usually at least eax. */ | 10 | #include <asm/paravirt_types.h> |
11 | #define CLBR_NONE 0 | ||
12 | #define CLBR_EAX (1 << 0) | ||
13 | #define CLBR_ECX (1 << 1) | ||
14 | #define CLBR_EDX (1 << 2) | ||
15 | #define CLBR_EDI (1 << 3) | ||
16 | |||
17 | #ifdef CONFIG_X86_32 | ||
18 | /* CLBR_ANY should match all regs platform has. For i386, that's just it */ | ||
19 | #define CLBR_ANY ((1 << 4) - 1) | ||
20 | |||
21 | #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) | ||
22 | #define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) | ||
23 | #define CLBR_SCRATCH (0) | ||
24 | #else | ||
25 | #define CLBR_RAX CLBR_EAX | ||
26 | #define CLBR_RCX CLBR_ECX | ||
27 | #define CLBR_RDX CLBR_EDX | ||
28 | #define CLBR_RDI CLBR_EDI | ||
29 | #define CLBR_RSI (1 << 4) | ||
30 | #define CLBR_R8 (1 << 5) | ||
31 | #define CLBR_R9 (1 << 6) | ||
32 | #define CLBR_R10 (1 << 7) | ||
33 | #define CLBR_R11 (1 << 8) | ||
34 | |||
35 | #define CLBR_ANY ((1 << 9) - 1) | ||
36 | |||
37 | #define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ | ||
38 | CLBR_RCX | CLBR_R8 | CLBR_R9) | ||
39 | #define CLBR_RET_REG (CLBR_RAX) | ||
40 | #define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) | ||
41 | |||
42 | #include <asm/desc_defs.h> | ||
43 | #endif /* X86_64 */ | ||
44 | |||
45 | #define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) | ||
46 | 11 | ||
47 | #ifndef __ASSEMBLY__ | 12 | #ifndef __ASSEMBLY__ |
48 | #include <linux/types.h> | 13 | #include <linux/types.h> |
49 | #include <linux/cpumask.h> | 14 | #include <linux/cpumask.h> |
50 | #include <asm/kmap_types.h> | ||
51 | #include <asm/desc_defs.h> | ||
52 | |||
53 | struct page; | ||
54 | struct thread_struct; | ||
55 | struct desc_ptr; | ||
56 | struct tss_struct; | ||
57 | struct mm_struct; | ||
58 | struct desc_struct; | ||
59 | struct task_struct; | ||
60 | |||
61 | /* | ||
62 | * Wrapper type for pointers to code which uses the non-standard | ||
63 | * calling convention. See PV_CALL_SAVE_REGS_THUNK below. | ||
64 | */ | ||
65 | struct paravirt_callee_save { | ||
66 | void *func; | ||
67 | }; | ||
68 | |||
69 | /* general info */ | ||
70 | struct pv_info { | ||
71 | unsigned int kernel_rpl; | ||
72 | int shared_kernel_pmd; | ||
73 | int paravirt_enabled; | ||
74 | const char *name; | ||
75 | }; | ||
76 | |||
77 | struct pv_init_ops { | ||
78 | /* | ||
79 | * Patch may replace one of the defined code sequences with | ||
80 | * arbitrary code, subject to the same register constraints. | ||
81 | * This generally means the code is not free to clobber any | ||
82 | * registers other than EAX. The patch function should return | ||
83 | * the number of bytes of code generated, as we nop pad the | ||
84 | * rest in generic code. | ||
85 | */ | ||
86 | unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, | ||
87 | unsigned long addr, unsigned len); | ||
88 | |||
89 | /* Basic arch-specific setup */ | ||
90 | void (*arch_setup)(void); | ||
91 | char *(*memory_setup)(void); | ||
92 | void (*post_allocator_init)(void); | ||
93 | |||
94 | /* Print a banner to identify the environment */ | ||
95 | void (*banner)(void); | ||
96 | }; | ||
97 | |||
98 | |||
99 | struct pv_lazy_ops { | ||
100 | /* Set deferred update mode, used for batching operations. */ | ||
101 | void (*enter)(void); | ||
102 | void (*leave)(void); | ||
103 | }; | ||
104 | |||
105 | struct pv_time_ops { | ||
106 | void (*time_init)(void); | ||
107 | |||
108 | /* Get and set time of day */ | ||
109 | unsigned long (*get_wallclock)(void); | ||
110 | int (*set_wallclock)(unsigned long); | ||
111 | |||
112 | unsigned long long (*sched_clock)(void); | ||
113 | unsigned long (*get_tsc_khz)(void); | ||
114 | }; | ||
115 | |||
116 | struct pv_cpu_ops { | ||
117 | /* hooks for various privileged instructions */ | ||
118 | unsigned long (*get_debugreg)(int regno); | ||
119 | void (*set_debugreg)(int regno, unsigned long value); | ||
120 | |||
121 | void (*clts)(void); | ||
122 | |||
123 | unsigned long (*read_cr0)(void); | ||
124 | void (*write_cr0)(unsigned long); | ||
125 | |||
126 | unsigned long (*read_cr4_safe)(void); | ||
127 | unsigned long (*read_cr4)(void); | ||
128 | void (*write_cr4)(unsigned long); | ||
129 | |||
130 | #ifdef CONFIG_X86_64 | ||
131 | unsigned long (*read_cr8)(void); | ||
132 | void (*write_cr8)(unsigned long); | ||
133 | #endif | ||
134 | |||
135 | /* Segment descriptor handling */ | ||
136 | void (*load_tr_desc)(void); | ||
137 | void (*load_gdt)(const struct desc_ptr *); | ||
138 | void (*load_idt)(const struct desc_ptr *); | ||
139 | void (*store_gdt)(struct desc_ptr *); | ||
140 | void (*store_idt)(struct desc_ptr *); | ||
141 | void (*set_ldt)(const void *desc, unsigned entries); | ||
142 | unsigned long (*store_tr)(void); | ||
143 | void (*load_tls)(struct thread_struct *t, unsigned int cpu); | ||
144 | #ifdef CONFIG_X86_64 | ||
145 | void (*load_gs_index)(unsigned int idx); | ||
146 | #endif | ||
147 | void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, | ||
148 | const void *desc); | ||
149 | void (*write_gdt_entry)(struct desc_struct *, | ||
150 | int entrynum, const void *desc, int size); | ||
151 | void (*write_idt_entry)(gate_desc *, | ||
152 | int entrynum, const gate_desc *gate); | ||
153 | void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); | ||
154 | void (*free_ldt)(struct desc_struct *ldt, unsigned entries); | ||
155 | |||
156 | void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); | ||
157 | |||
158 | void (*set_iopl_mask)(unsigned mask); | ||
159 | |||
160 | void (*wbinvd)(void); | ||
161 | void (*io_delay)(void); | ||
162 | |||
163 | /* cpuid emulation, mostly so that caps bits can be disabled */ | ||
164 | void (*cpuid)(unsigned int *eax, unsigned int *ebx, | ||
165 | unsigned int *ecx, unsigned int *edx); | ||
166 | |||
167 | /* MSR, PMC and TSC operations. | ||
168 | err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ | ||
169 | u64 (*read_msr_amd)(unsigned int msr, int *err); | ||
170 | u64 (*read_msr)(unsigned int msr, int *err); | ||
171 | int (*write_msr)(unsigned int msr, unsigned low, unsigned high); | ||
172 | |||
173 | u64 (*read_tsc)(void); | ||
174 | u64 (*read_pmc)(int counter); | ||
175 | unsigned long long (*read_tscp)(unsigned int *aux); | ||
176 | |||
177 | /* | ||
178 | * Atomically enable interrupts and return to userspace. This | ||
179 | * is only ever used to return to 32-bit processes; in a | ||
180 | * 64-bit kernel, it's used for 32-on-64 compat processes, but | ||
181 | * never native 64-bit processes. (Jump, not call.) | ||
182 | */ | ||
183 | void (*irq_enable_sysexit)(void); | ||
184 | |||
185 | /* | ||
186 | * Switch to usermode gs and return to 64-bit usermode using | ||
187 | * sysret. Only used in 64-bit kernels to return to 64-bit | ||
188 | * processes. Usermode register state, including %rsp, must | ||
189 | * already be restored. | ||
190 | */ | ||
191 | void (*usergs_sysret64)(void); | ||
192 | |||
193 | /* | ||
194 | * Switch to usermode gs and return to 32-bit usermode using | ||
195 | * sysret. Used to return to 32-on-64 compat processes. | ||
196 | * Other usermode register state, including %esp, must already | ||
197 | * be restored. | ||
198 | */ | ||
199 | void (*usergs_sysret32)(void); | ||
200 | |||
201 | /* Normal iret. Jump to this with the standard iret stack | ||
202 | frame set up. */ | ||
203 | void (*iret)(void); | ||
204 | |||
205 | void (*swapgs)(void); | ||
206 | |||
207 | void (*start_context_switch)(struct task_struct *prev); | ||
208 | void (*end_context_switch)(struct task_struct *next); | ||
209 | }; | ||
210 | |||
211 | struct pv_irq_ops { | ||
212 | void (*init_IRQ)(void); | ||
213 | |||
214 | /* | ||
215 | * Get/set interrupt state. save_fl and restore_fl are only | ||
216 | * expected to use X86_EFLAGS_IF; all other bits | ||
217 | * returned from save_fl are undefined, and may be ignored by | ||
218 | * restore_fl. | ||
219 | * | ||
220 | * NOTE: Callers of these functions expect the callee to preserve | ||
221 | * more registers than the standard C calling convention. | ||
222 | */ | ||
223 | struct paravirt_callee_save save_fl; | ||
224 | struct paravirt_callee_save restore_fl; | ||
225 | struct paravirt_callee_save irq_disable; | ||
226 | struct paravirt_callee_save irq_enable; | ||
227 | |||
228 | void (*safe_halt)(void); | ||
229 | void (*halt)(void); | ||
230 | |||
231 | #ifdef CONFIG_X86_64 | ||
232 | void (*adjust_exception_frame)(void); | ||
233 | #endif | ||
234 | }; | ||
235 | |||
236 | struct pv_apic_ops { | ||
237 | #ifdef CONFIG_X86_LOCAL_APIC | ||
238 | void (*setup_boot_clock)(void); | ||
239 | void (*setup_secondary_clock)(void); | ||
240 | |||
241 | void (*startup_ipi_hook)(int phys_apicid, | ||
242 | unsigned long start_eip, | ||
243 | unsigned long start_esp); | ||
244 | #endif | ||
245 | }; | ||
246 | |||
247 | struct pv_mmu_ops { | ||
248 | /* | ||
249 | * Called before/after init_mm pagetable setup. setup_start | ||
250 | * may reset %cr3, and may pre-install parts of the pagetable; | ||
251 | * pagetable setup is expected to preserve any existing | ||
252 | * mapping. | ||
253 | */ | ||
254 | void (*pagetable_setup_start)(pgd_t *pgd_base); | ||
255 | void (*pagetable_setup_done)(pgd_t *pgd_base); | ||
256 | |||
257 | unsigned long (*read_cr2)(void); | ||
258 | void (*write_cr2)(unsigned long); | ||
259 | |||
260 | unsigned long (*read_cr3)(void); | ||
261 | void (*write_cr3)(unsigned long); | ||
262 | |||
263 | /* | ||
264 | * Hooks for intercepting the creation/use/destruction of an | ||
265 | * mm_struct. | ||
266 | */ | ||
267 | void (*activate_mm)(struct mm_struct *prev, | ||
268 | struct mm_struct *next); | ||
269 | void (*dup_mmap)(struct mm_struct *oldmm, | ||
270 | struct mm_struct *mm); | ||
271 | void (*exit_mmap)(struct mm_struct *mm); | ||
272 | |||
273 | |||
274 | /* TLB operations */ | ||
275 | void (*flush_tlb_user)(void); | ||
276 | void (*flush_tlb_kernel)(void); | ||
277 | void (*flush_tlb_single)(unsigned long addr); | ||
278 | void (*flush_tlb_others)(const struct cpumask *cpus, | ||
279 | struct mm_struct *mm, | ||
280 | unsigned long va); | ||
281 | |||
282 | /* Hooks for allocating and freeing a pagetable top-level */ | ||
283 | int (*pgd_alloc)(struct mm_struct *mm); | ||
284 | void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); | ||
285 | |||
286 | /* | ||
287 | * Hooks for allocating/releasing pagetable pages when they're | ||
288 | * attached to a pagetable | ||
289 | */ | ||
290 | void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); | ||
291 | void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); | ||
292 | void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); | ||
293 | void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); | ||
294 | void (*release_pte)(unsigned long pfn); | ||
295 | void (*release_pmd)(unsigned long pfn); | ||
296 | void (*release_pud)(unsigned long pfn); | ||
297 | |||
298 | /* Pagetable manipulation functions */ | ||
299 | void (*set_pte)(pte_t *ptep, pte_t pteval); | ||
300 | void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, | ||
301 | pte_t *ptep, pte_t pteval); | ||
302 | void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); | ||
303 | void (*pte_update)(struct mm_struct *mm, unsigned long addr, | ||
304 | pte_t *ptep); | ||
305 | void (*pte_update_defer)(struct mm_struct *mm, | ||
306 | unsigned long addr, pte_t *ptep); | ||
307 | |||
308 | pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, | ||
309 | pte_t *ptep); | ||
310 | void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, | ||
311 | pte_t *ptep, pte_t pte); | ||
312 | |||
313 | struct paravirt_callee_save pte_val; | ||
314 | struct paravirt_callee_save make_pte; | ||
315 | |||
316 | struct paravirt_callee_save pgd_val; | ||
317 | struct paravirt_callee_save make_pgd; | ||
318 | |||
319 | #if PAGETABLE_LEVELS >= 3 | ||
320 | #ifdef CONFIG_X86_PAE | ||
321 | void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); | ||
322 | void (*pte_clear)(struct mm_struct *mm, unsigned long addr, | ||
323 | pte_t *ptep); | ||
324 | void (*pmd_clear)(pmd_t *pmdp); | ||
325 | |||
326 | #endif /* CONFIG_X86_PAE */ | ||
327 | |||
328 | void (*set_pud)(pud_t *pudp, pud_t pudval); | ||
329 | |||
330 | struct paravirt_callee_save pmd_val; | ||
331 | struct paravirt_callee_save make_pmd; | ||
332 | |||
333 | #if PAGETABLE_LEVELS == 4 | ||
334 | struct paravirt_callee_save pud_val; | ||
335 | struct paravirt_callee_save make_pud; | ||
336 | |||
337 | void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); | ||
338 | #endif /* PAGETABLE_LEVELS == 4 */ | ||
339 | #endif /* PAGETABLE_LEVELS >= 3 */ | ||
340 | |||
341 | #ifdef CONFIG_HIGHPTE | ||
342 | void *(*kmap_atomic_pte)(struct page *page, enum km_type type); | ||
343 | #endif | ||
344 | |||
345 | struct pv_lazy_ops lazy_mode; | ||
346 | |||
347 | /* dom0 ops */ | ||
348 | |||
349 | /* Sometimes the physical address is a pfn, and sometimes it's | ||
350 | an mfn. We can tell which is which from the index. */ | ||
351 | void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, | ||
352 | phys_addr_t phys, pgprot_t flags); | ||
353 | }; | ||
354 | |||
355 | struct raw_spinlock; | ||
356 | struct pv_lock_ops { | ||
357 | int (*spin_is_locked)(struct raw_spinlock *lock); | ||
358 | int (*spin_is_contended)(struct raw_spinlock *lock); | ||
359 | void (*spin_lock)(struct raw_spinlock *lock); | ||
360 | void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); | ||
361 | int (*spin_trylock)(struct raw_spinlock *lock); | ||
362 | void (*spin_unlock)(struct raw_spinlock *lock); | ||
363 | }; | ||
364 | |||
365 | /* This contains all the paravirt structures: we get a convenient | ||
366 | * number for each function using the offset which we use to indicate | ||
367 | * what to patch. */ | ||
368 | struct paravirt_patch_template { | ||
369 | struct pv_init_ops pv_init_ops; | ||
370 | struct pv_time_ops pv_time_ops; | ||
371 | struct pv_cpu_ops pv_cpu_ops; | ||
372 | struct pv_irq_ops pv_irq_ops; | ||
373 | struct pv_apic_ops pv_apic_ops; | ||
374 | struct pv_mmu_ops pv_mmu_ops; | ||
375 | struct pv_lock_ops pv_lock_ops; | ||
376 | }; | ||
377 | |||
378 | extern struct pv_info pv_info; | ||
379 | extern struct pv_init_ops pv_init_ops; | ||
380 | extern struct pv_time_ops pv_time_ops; | ||
381 | extern struct pv_cpu_ops pv_cpu_ops; | ||
382 | extern struct pv_irq_ops pv_irq_ops; | ||
383 | extern struct pv_apic_ops pv_apic_ops; | ||
384 | extern struct pv_mmu_ops pv_mmu_ops; | ||
385 | extern struct pv_lock_ops pv_lock_ops; | ||
386 | |||
387 | #define PARAVIRT_PATCH(x) \ | ||
388 | (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) | ||
389 | |||
390 | #define paravirt_type(op) \ | ||
391 | [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ | ||
392 | [paravirt_opptr] "i" (&(op)) | ||
393 | #define paravirt_clobber(clobber) \ | ||
394 | [paravirt_clobber] "i" (clobber) | ||
395 | |||
396 | /* | ||
397 | * Generate some code, and mark it as patchable by the | ||
398 | * apply_paravirt() alternate instruction patcher. | ||
399 | */ | ||
400 | #define _paravirt_alt(insn_string, type, clobber) \ | ||
401 | "771:\n\t" insn_string "\n" "772:\n" \ | ||
402 | ".pushsection .parainstructions,\"a\"\n" \ | ||
403 | _ASM_ALIGN "\n" \ | ||
404 | _ASM_PTR " 771b\n" \ | ||
405 | " .byte " type "\n" \ | ||
406 | " .byte 772b-771b\n" \ | ||
407 | " .short " clobber "\n" \ | ||
408 | ".popsection\n" | ||
409 | |||
410 | /* Generate patchable code, with the default asm parameters. */ | ||
411 | #define paravirt_alt(insn_string) \ | ||
412 | _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") | ||
413 | |||
414 | /* Simple instruction patching code. */ | ||
415 | #define DEF_NATIVE(ops, name, code) \ | ||
416 | extern const char start_##ops##_##name[], end_##ops##_##name[]; \ | ||
417 | asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") | ||
418 | |||
419 | unsigned paravirt_patch_nop(void); | ||
420 | unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); | ||
421 | unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); | ||
422 | unsigned paravirt_patch_ignore(unsigned len); | ||
423 | unsigned paravirt_patch_call(void *insnbuf, | ||
424 | const void *target, u16 tgt_clobbers, | ||
425 | unsigned long addr, u16 site_clobbers, | ||
426 | unsigned len); | ||
427 | unsigned paravirt_patch_jmp(void *insnbuf, const void *target, | ||
428 | unsigned long addr, unsigned len); | ||
429 | unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, | ||
430 | unsigned long addr, unsigned len); | ||
431 | |||
432 | unsigned paravirt_patch_insns(void *insnbuf, unsigned len, | ||
433 | const char *start, const char *end); | ||
434 | |||
435 | unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | ||
436 | unsigned long addr, unsigned len); | ||
437 | |||
438 | int paravirt_disable_iospace(void); | ||
439 | |||
440 | /* | ||
441 | * This generates an indirect call based on the operation type number. | ||
442 | * The type number, computed in PARAVIRT_PATCH, is derived from the | ||
443 | * offset into the paravirt_patch_template structure, and can therefore be | ||
444 | * freely converted back into a structure offset. | ||
445 | */ | ||
446 | #define PARAVIRT_CALL "call *%c[paravirt_opptr];" | ||
447 | |||
448 | /* | ||
449 | * These macros are intended to wrap calls through one of the paravirt | ||
450 | * ops structs, so that they can be later identified and patched at | ||
451 | * runtime. | ||
452 | * | ||
453 | * Normally, a call to a pv_op function is a simple indirect call: | ||
454 | * (pv_op_struct.operations)(args...). | ||
455 | * | ||
456 | * Unfortunately, this is a relatively slow operation for modern CPUs, | ||
457 | * because it cannot necessarily determine what the destination | ||
458 | * address is. In this case, the address is a runtime constant, so at | ||
459 | * the very least we can patch the call to be a simple direct call, or | ||
460 | * ideally, patch an inline implementation into the callsite. (Direct | ||
461 | * calls are essentially free, because the call and return addresses | ||
462 | * are completely predictable.) | ||
463 | * | ||
464 | * For i386, these macros rely on the standard gcc "regparm(3)" calling | ||
465 | * convention, in which the first three arguments are placed in %eax, | ||
466 | * %edx, %ecx (in that order), and the remaining arguments are placed | ||
467 | * on the stack. All caller-save registers (eax,edx,ecx) are expected | ||
468 | * to be modified (either clobbered or used for return values). | ||
469 | * X86_64, on the other hand, already specifies a register-based calling | ||
470 | * convention, returning in %rax, with parameters going in %rdi, %rsi, | ||
471 | * %rdx, and %rcx. Note that for this reason, x86_64 does not need any | ||
472 | * special handling for dealing with 4 arguments, unlike i386. | ||
473 | * However, x86_64 also has to clobber all caller-saved registers, of | ||
474 | * which there are, unfortunately, quite a few (r8 - r11). | ||
475 | * | ||
476 | * The call instruction itself is marked by placing its start address | ||
477 | * and size into the .parainstructions section, so that | ||
478 | * apply_paravirt() in arch/i386/kernel/alternative.c can do the | ||
479 | * appropriate patching under the control of the backend pv_init_ops | ||
480 | * implementation. | ||
481 | * | ||
482 | * Unfortunately there's no way to get gcc to generate the args setup | ||
483 | * for the call, and then allow the call itself to be generated by an | ||
484 | * inline asm. Because of this, we must do the complete arg setup and | ||
485 | * return value handling from within these macros. This is fairly | ||
486 | * cumbersome. | ||
487 | * | ||
488 | * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. | ||
489 | * It could be extended to more arguments, but there would be little | ||
490 | * to be gained from that. For each number of arguments, there are | ||
491 | * the two VCALL and CALL variants for void and non-void functions. | ||
492 | * | ||
493 | * When there is a return value, the invoker of the macro must specify | ||
494 | * the return type. The macro then uses sizeof() on that type to | ||
495 | * determine whether it's a 32- or 64-bit value, and places the return | ||
496 | * in the right register(s) (just %eax for 32-bit, and %edx:%eax for | ||
497 | * 64-bit). For x86_64 machines, it just returns at %rax regardless of | ||
498 | * the return value size. | ||
499 | * | ||
500 | * On i386, 64-bit arguments are passed as a pair of | ||
501 | * adjacent 32-bit arguments, with the low half first and | ||
502 | * the high half second (low,high order). | ||
503 | * | ||
504 | * Small structures are passed and returned in registers. The macro | ||
505 | * calling convention can't directly deal with this, so the wrapper | ||
506 | * functions must do this. | ||
507 | * | ||
508 | * These PVOP_* macros are only defined within this header. This | ||
509 | * means that all uses must be wrapped in inline functions. This also | ||
510 | * makes sure the incoming and outgoing types are always correct. | ||
511 | */ | ||
512 | #ifdef CONFIG_X86_32 | ||
513 | #define PVOP_VCALL_ARGS \ | ||
514 | unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx | ||
515 | #define PVOP_CALL_ARGS PVOP_VCALL_ARGS | ||
516 | |||
517 | #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) | ||
518 | #define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) | ||
519 | #define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) | ||
520 | |||
521 | #define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ | ||
522 | "=c" (__ecx) | ||
523 | #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS | ||
524 | |||
525 | #define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) | ||
526 | #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS | ||
527 | |||
528 | #define EXTRA_CLOBBERS | ||
529 | #define VEXTRA_CLOBBERS | ||
530 | #else /* CONFIG_X86_64 */ | ||
531 | #define PVOP_VCALL_ARGS \ | ||
532 | unsigned long __edi = __edi, __esi = __esi, \ | ||
533 | __edx = __edx, __ecx = __ecx | ||
534 | #define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax | ||
535 | |||
536 | #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) | ||
537 | #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) | ||
538 | #define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) | ||
539 | #define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) | ||
540 | |||
541 | #define PVOP_VCALL_CLOBBERS "=D" (__edi), \ | ||
542 | "=S" (__esi), "=d" (__edx), \ | ||
543 | "=c" (__ecx) | ||
544 | #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) | ||
545 | |||
546 | #define PVOP_VCALLEE_CLOBBERS "=a" (__eax) | ||
547 | #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS | ||
548 | |||
549 | #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" | ||
550 | #define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" | ||
551 | #endif /* CONFIG_X86_32 */ | ||
552 | |||
553 | #ifdef CONFIG_PARAVIRT_DEBUG | ||
554 | #define PVOP_TEST_NULL(op) BUG_ON(op == NULL) | ||
555 | #else | ||
556 | #define PVOP_TEST_NULL(op) ((void)op) | ||
557 | #endif | ||
558 | |||
559 | #define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ | ||
560 | pre, post, ...) \ | ||
561 | ({ \ | ||
562 | rettype __ret; \ | ||
563 | PVOP_CALL_ARGS; \ | ||
564 | PVOP_TEST_NULL(op); \ | ||
565 | /* This is 32-bit specific, but is okay in 64-bit */ \ | ||
566 | /* since this condition will never hold */ \ | ||
567 | if (sizeof(rettype) > sizeof(unsigned long)) { \ | ||
568 | asm volatile(pre \ | ||
569 | paravirt_alt(PARAVIRT_CALL) \ | ||
570 | post \ | ||
571 | : call_clbr \ | ||
572 | : paravirt_type(op), \ | ||
573 | paravirt_clobber(clbr), \ | ||
574 | ##__VA_ARGS__ \ | ||
575 | : "memory", "cc" extra_clbr); \ | ||
576 | __ret = (rettype)((((u64)__edx) << 32) | __eax); \ | ||
577 | } else { \ | ||
578 | asm volatile(pre \ | ||
579 | paravirt_alt(PARAVIRT_CALL) \ | ||
580 | post \ | ||
581 | : call_clbr \ | ||
582 | : paravirt_type(op), \ | ||
583 | paravirt_clobber(clbr), \ | ||
584 | ##__VA_ARGS__ \ | ||
585 | : "memory", "cc" extra_clbr); \ | ||
586 | __ret = (rettype)__eax; \ | ||
587 | } \ | ||
588 | __ret; \ | ||
589 | }) | ||
590 | |||
591 | #define __PVOP_CALL(rettype, op, pre, post, ...) \ | ||
592 | ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ | ||
593 | EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) | ||
594 | |||
595 | #define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ | ||
596 | ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ | ||
597 | PVOP_CALLEE_CLOBBERS, , \ | ||
598 | pre, post, ##__VA_ARGS__) | ||
599 | |||
600 | |||
601 | #define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ | ||
602 | ({ \ | ||
603 | PVOP_VCALL_ARGS; \ | ||
604 | PVOP_TEST_NULL(op); \ | ||
605 | asm volatile(pre \ | ||
606 | paravirt_alt(PARAVIRT_CALL) \ | ||
607 | post \ | ||
608 | : call_clbr \ | ||
609 | : paravirt_type(op), \ | ||
610 | paravirt_clobber(clbr), \ | ||
611 | ##__VA_ARGS__ \ | ||
612 | : "memory", "cc" extra_clbr); \ | ||
613 | }) | ||
614 | |||
615 | #define __PVOP_VCALL(op, pre, post, ...) \ | ||
616 | ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ | ||
617 | VEXTRA_CLOBBERS, \ | ||
618 | pre, post, ##__VA_ARGS__) | ||
619 | |||
620 | #define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ | ||
621 | ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ | ||
622 | PVOP_VCALLEE_CLOBBERS, , \ | ||
623 | pre, post, ##__VA_ARGS__) | ||
624 | |||
625 | |||
626 | |||
627 | #define PVOP_CALL0(rettype, op) \ | ||
628 | __PVOP_CALL(rettype, op, "", "") | ||
629 | #define PVOP_VCALL0(op) \ | ||
630 | __PVOP_VCALL(op, "", "") | ||
631 | |||
632 | #define PVOP_CALLEE0(rettype, op) \ | ||
633 | __PVOP_CALLEESAVE(rettype, op, "", "") | ||
634 | #define PVOP_VCALLEE0(op) \ | ||
635 | __PVOP_VCALLEESAVE(op, "", "") | ||
636 | |||
637 | |||
638 | #define PVOP_CALL1(rettype, op, arg1) \ | ||
639 | __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) | ||
640 | #define PVOP_VCALL1(op, arg1) \ | ||
641 | __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) | ||
642 | |||
643 | #define PVOP_CALLEE1(rettype, op, arg1) \ | ||
644 | __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) | ||
645 | #define PVOP_VCALLEE1(op, arg1) \ | ||
646 | __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) | ||
647 | |||
648 | |||
649 | #define PVOP_CALL2(rettype, op, arg1, arg2) \ | ||
650 | __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
651 | PVOP_CALL_ARG2(arg2)) | ||
652 | #define PVOP_VCALL2(op, arg1, arg2) \ | ||
653 | __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
654 | PVOP_CALL_ARG2(arg2)) | ||
655 | |||
656 | #define PVOP_CALLEE2(rettype, op, arg1, arg2) \ | ||
657 | __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
658 | PVOP_CALL_ARG2(arg2)) | ||
659 | #define PVOP_VCALLEE2(op, arg1, arg2) \ | ||
660 | __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
661 | PVOP_CALL_ARG2(arg2)) | ||
662 | |||
663 | |||
664 | #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ | ||
665 | __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
666 | PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) | ||
667 | #define PVOP_VCALL3(op, arg1, arg2, arg3) \ | ||
668 | __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
669 | PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) | ||
670 | |||
671 | /* This is the only difference in x86_64. We can make it much simpler */ | ||
672 | #ifdef CONFIG_X86_32 | ||
673 | #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ | ||
674 | __PVOP_CALL(rettype, op, \ | ||
675 | "push %[_arg4];", "lea 4(%%esp),%%esp;", \ | ||
676 | PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ | ||
677 | PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) | ||
678 | #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ | ||
679 | __PVOP_VCALL(op, \ | ||
680 | "push %[_arg4];", "lea 4(%%esp),%%esp;", \ | ||
681 | "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ | ||
682 | "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) | ||
683 | #else | ||
684 | #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ | ||
685 | __PVOP_CALL(rettype, op, "", "", \ | ||
686 | PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ | ||
687 | PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) | ||
688 | #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ | ||
689 | __PVOP_VCALL(op, "", "", \ | ||
690 | PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ | ||
691 | PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) | ||
692 | #endif | ||
693 | 15 | ||
694 | static inline int paravirt_enabled(void) | 16 | static inline int paravirt_enabled(void) |
695 | { | 17 | { |
@@ -702,22 +24,6 @@ static inline void load_sp0(struct tss_struct *tss, | |||
702 | PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); | 24 | PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); |
703 | } | 25 | } |
704 | 26 | ||
705 | #define ARCH_SETUP pv_init_ops.arch_setup(); | ||
706 | static inline unsigned long get_wallclock(void) | ||
707 | { | ||
708 | return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); | ||
709 | } | ||
710 | |||
711 | static inline int set_wallclock(unsigned long nowtime) | ||
712 | { | ||
713 | return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); | ||
714 | } | ||
715 | |||
716 | static inline void (*choose_time_init(void))(void) | ||
717 | { | ||
718 | return pv_time_ops.time_init; | ||
719 | } | ||
720 | |||
721 | /* The paravirtualized CPUID instruction. */ | 27 | /* The paravirtualized CPUID instruction. */ |
722 | static inline void __cpuid(unsigned int *eax, unsigned int *ebx, | 28 | static inline void __cpuid(unsigned int *eax, unsigned int *ebx, |
723 | unsigned int *ecx, unsigned int *edx) | 29 | unsigned int *ecx, unsigned int *edx) |
@@ -820,15 +126,22 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) | |||
820 | { | 126 | { |
821 | return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); | 127 | return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); |
822 | } | 128 | } |
823 | static inline u64 paravirt_read_msr_amd(unsigned msr, int *err) | 129 | |
130 | static inline int paravirt_rdmsr_regs(u32 *regs) | ||
824 | { | 131 | { |
825 | return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err); | 132 | return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); |
826 | } | 133 | } |
134 | |||
827 | static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) | 135 | static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) |
828 | { | 136 | { |
829 | return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); | 137 | return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); |
830 | } | 138 | } |
831 | 139 | ||
140 | static inline int paravirt_wrmsr_regs(u32 *regs) | ||
141 | { | ||
142 | return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); | ||
143 | } | ||
144 | |||
832 | /* These should all do BUG_ON(_err), but our headers are too tangled. */ | 145 | /* These should all do BUG_ON(_err), but our headers are too tangled. */ |
833 | #define rdmsr(msr, val1, val2) \ | 146 | #define rdmsr(msr, val1, val2) \ |
834 | do { \ | 147 | do { \ |
@@ -862,6 +175,9 @@ do { \ | |||
862 | _err; \ | 175 | _err; \ |
863 | }) | 176 | }) |
864 | 177 | ||
178 | #define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) | ||
179 | #define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) | ||
180 | |||
865 | static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) | 181 | static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) |
866 | { | 182 | { |
867 | int err; | 183 | int err; |
@@ -871,12 +187,31 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) | |||
871 | } | 187 | } |
872 | static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) | 188 | static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) |
873 | { | 189 | { |
190 | u32 gprs[8] = { 0 }; | ||
874 | int err; | 191 | int err; |
875 | 192 | ||
876 | *p = paravirt_read_msr_amd(msr, &err); | 193 | gprs[1] = msr; |
194 | gprs[7] = 0x9c5a203a; | ||
195 | |||
196 | err = paravirt_rdmsr_regs(gprs); | ||
197 | |||
198 | *p = gprs[0] | ((u64)gprs[2] << 32); | ||
199 | |||
877 | return err; | 200 | return err; |
878 | } | 201 | } |
879 | 202 | ||
203 | static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) | ||
204 | { | ||
205 | u32 gprs[8] = { 0 }; | ||
206 | |||
207 | gprs[0] = (u32)val; | ||
208 | gprs[1] = msr; | ||
209 | gprs[2] = val >> 32; | ||
210 | gprs[7] = 0x9c5a203a; | ||
211 | |||
212 | return paravirt_wrmsr_regs(gprs); | ||
213 | } | ||
214 | |||
880 | static inline u64 paravirt_read_tsc(void) | 215 | static inline u64 paravirt_read_tsc(void) |
881 | { | 216 | { |
882 | return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); | 217 | return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); |
@@ -894,7 +229,6 @@ static inline unsigned long long paravirt_sched_clock(void) | |||
894 | { | 229 | { |
895 | return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); | 230 | return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); |
896 | } | 231 | } |
897 | #define calibrate_tsc() (pv_time_ops.get_tsc_khz()) | ||
898 | 232 | ||
899 | static inline unsigned long long paravirt_read_pmc(int counter) | 233 | static inline unsigned long long paravirt_read_pmc(int counter) |
900 | { | 234 | { |
@@ -1012,34 +346,6 @@ static inline void slow_down_io(void) | |||
1012 | #endif | 346 | #endif |
1013 | } | 347 | } |
1014 | 348 | ||
1015 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1016 | static inline void setup_boot_clock(void) | ||
1017 | { | ||
1018 | PVOP_VCALL0(pv_apic_ops.setup_boot_clock); | ||
1019 | } | ||
1020 | |||
1021 | static inline void setup_secondary_clock(void) | ||
1022 | { | ||
1023 | PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); | ||
1024 | } | ||
1025 | #endif | ||
1026 | |||
1027 | static inline void paravirt_post_allocator_init(void) | ||
1028 | { | ||
1029 | if (pv_init_ops.post_allocator_init) | ||
1030 | (*pv_init_ops.post_allocator_init)(); | ||
1031 | } | ||
1032 | |||
1033 | static inline void paravirt_pagetable_setup_start(pgd_t *base) | ||
1034 | { | ||
1035 | (*pv_mmu_ops.pagetable_setup_start)(base); | ||
1036 | } | ||
1037 | |||
1038 | static inline void paravirt_pagetable_setup_done(pgd_t *base) | ||
1039 | { | ||
1040 | (*pv_mmu_ops.pagetable_setup_done)(base); | ||
1041 | } | ||
1042 | |||
1043 | #ifdef CONFIG_SMP | 349 | #ifdef CONFIG_SMP |
1044 | static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, | 350 | static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, |
1045 | unsigned long start_esp) | 351 | unsigned long start_esp) |
@@ -1393,20 +699,6 @@ static inline void pmd_clear(pmd_t *pmdp) | |||
1393 | } | 699 | } |
1394 | #endif /* CONFIG_X86_PAE */ | 700 | #endif /* CONFIG_X86_PAE */ |
1395 | 701 | ||
1396 | /* Lazy mode for batching updates / context switch */ | ||
1397 | enum paravirt_lazy_mode { | ||
1398 | PARAVIRT_LAZY_NONE, | ||
1399 | PARAVIRT_LAZY_MMU, | ||
1400 | PARAVIRT_LAZY_CPU, | ||
1401 | }; | ||
1402 | |||
1403 | enum paravirt_lazy_mode paravirt_get_lazy_mode(void); | ||
1404 | void paravirt_start_context_switch(struct task_struct *prev); | ||
1405 | void paravirt_end_context_switch(struct task_struct *next); | ||
1406 | |||
1407 | void paravirt_enter_lazy_mmu(void); | ||
1408 | void paravirt_leave_lazy_mmu(void); | ||
1409 | |||
1410 | #define __HAVE_ARCH_START_CONTEXT_SWITCH | 702 | #define __HAVE_ARCH_START_CONTEXT_SWITCH |
1411 | static inline void arch_start_context_switch(struct task_struct *prev) | 703 | static inline void arch_start_context_switch(struct task_struct *prev) |
1412 | { | 704 | { |
@@ -1437,12 +729,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, | |||
1437 | pv_mmu_ops.set_fixmap(idx, phys, flags); | 729 | pv_mmu_ops.set_fixmap(idx, phys, flags); |
1438 | } | 730 | } |
1439 | 731 | ||
1440 | void _paravirt_nop(void); | ||
1441 | u32 _paravirt_ident_32(u32); | ||
1442 | u64 _paravirt_ident_64(u64); | ||
1443 | |||
1444 | #define paravirt_nop ((void *)_paravirt_nop) | ||
1445 | |||
1446 | #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) | 732 | #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) |
1447 | 733 | ||
1448 | static inline int __raw_spin_is_locked(struct raw_spinlock *lock) | 734 | static inline int __raw_spin_is_locked(struct raw_spinlock *lock) |
@@ -1479,17 +765,6 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) | |||
1479 | 765 | ||
1480 | #endif | 766 | #endif |
1481 | 767 | ||
1482 | /* These all sit in the .parainstructions section to tell us what to patch. */ | ||
1483 | struct paravirt_patch_site { | ||
1484 | u8 *instr; /* original instructions */ | ||
1485 | u8 instrtype; /* type of this instruction */ | ||
1486 | u8 len; /* length of original instruction */ | ||
1487 | u16 clobbers; /* what registers you may clobber */ | ||
1488 | }; | ||
1489 | |||
1490 | extern struct paravirt_patch_site __parainstructions[], | ||
1491 | __parainstructions_end[]; | ||
1492 | |||
1493 | #ifdef CONFIG_X86_32 | 768 | #ifdef CONFIG_X86_32 |
1494 | #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" | 769 | #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" |
1495 | #define PV_RESTORE_REGS "popl %edx; popl %ecx;" | 770 | #define PV_RESTORE_REGS "popl %edx; popl %ecx;" |
@@ -1628,6 +903,8 @@ static inline unsigned long __raw_local_irq_save(void) | |||
1628 | #undef PVOP_VCALL4 | 903 | #undef PVOP_VCALL4 |
1629 | #undef PVOP_CALL4 | 904 | #undef PVOP_CALL4 |
1630 | 905 | ||
906 | extern void default_banner(void); | ||
907 | |||
1631 | #else /* __ASSEMBLY__ */ | 908 | #else /* __ASSEMBLY__ */ |
1632 | 909 | ||
1633 | #define _PVSITE(ptype, clobbers, ops, word, algn) \ | 910 | #define _PVSITE(ptype, clobbers, ops, word, algn) \ |
@@ -1768,5 +1045,7 @@ static inline unsigned long __raw_local_irq_save(void) | |||
1768 | #endif /* CONFIG_X86_32 */ | 1045 | #endif /* CONFIG_X86_32 */ |
1769 | 1046 | ||
1770 | #endif /* __ASSEMBLY__ */ | 1047 | #endif /* __ASSEMBLY__ */ |
1771 | #endif /* CONFIG_PARAVIRT */ | 1048 | #else /* CONFIG_PARAVIRT */ |
1049 | # define default_banner x86_init_noop | ||
1050 | #endif /* !CONFIG_PARAVIRT */ | ||
1772 | #endif /* _ASM_X86_PARAVIRT_H */ | 1051 | #endif /* _ASM_X86_PARAVIRT_H */ |
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h new file mode 100644 index 00000000000..dd0f5b32489 --- /dev/null +++ b/arch/x86/include/asm/paravirt_types.h | |||
@@ -0,0 +1,693 @@ | |||
1 | #ifndef _ASM_X86_PARAVIRT_TYPES_H | ||
2 | #define _ASM_X86_PARAVIRT_TYPES_H | ||
3 | |||
4 | /* Bitmask of what can be clobbered: usually at least eax. */ | ||
5 | #define CLBR_NONE 0 | ||
6 | #define CLBR_EAX (1 << 0) | ||
7 | #define CLBR_ECX (1 << 1) | ||
8 | #define CLBR_EDX (1 << 2) | ||
9 | #define CLBR_EDI (1 << 3) | ||
10 | |||
11 | #ifdef CONFIG_X86_32 | ||
12 | /* CLBR_ANY should match all regs the platform has. For i386, that's just the four above. */ | ||
13 | #define CLBR_ANY ((1 << 4) - 1) | ||
14 | |||
15 | #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) | ||
16 | #define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) | ||
17 | #define CLBR_SCRATCH (0) | ||
18 | #else | ||
19 | #define CLBR_RAX CLBR_EAX | ||
20 | #define CLBR_RCX CLBR_ECX | ||
21 | #define CLBR_RDX CLBR_EDX | ||
22 | #define CLBR_RDI CLBR_EDI | ||
23 | #define CLBR_RSI (1 << 4) | ||
24 | #define CLBR_R8 (1 << 5) | ||
25 | #define CLBR_R9 (1 << 6) | ||
26 | #define CLBR_R10 (1 << 7) | ||
27 | #define CLBR_R11 (1 << 8) | ||
28 | |||
29 | #define CLBR_ANY ((1 << 9) - 1) | ||
30 | |||
31 | #define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ | ||
32 | CLBR_RCX | CLBR_R8 | CLBR_R9) | ||
33 | #define CLBR_RET_REG (CLBR_RAX) | ||
34 | #define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) | ||
35 | |||
36 | #endif /* X86_64 */ | ||
37 | |||
38 | #define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) | ||
39 | |||
40 | #ifndef __ASSEMBLY__ | ||
41 | |||
42 | #include <asm/desc_defs.h> | ||
43 | #include <asm/kmap_types.h> | ||
44 | |||
45 | struct page; | ||
46 | struct thread_struct; | ||
47 | struct desc_ptr; | ||
48 | struct tss_struct; | ||
49 | struct mm_struct; | ||
50 | struct desc_struct; | ||
51 | struct task_struct; | ||
52 | struct cpumask; | ||
53 | |||
54 | /* | ||
55 | * Wrapper type for pointers to code which uses the non-standard | ||
56 | * calling convention. See PV_CALL_SAVE_REGS_THUNK below. | ||
57 | */ | ||
58 | struct paravirt_callee_save { | ||
59 | void *func; | ||
60 | }; | ||
61 | |||
62 | /* general info */ | ||
63 | struct pv_info { | ||
64 | unsigned int kernel_rpl; | ||
65 | int shared_kernel_pmd; | ||
66 | int paravirt_enabled; | ||
67 | const char *name; | ||
68 | }; | ||
69 | |||
70 | struct pv_init_ops { | ||
71 | /* | ||
72 | * Patch may replace one of the defined code sequences with | ||
73 | * arbitrary code, subject to the same register constraints. | ||
74 | * This generally means the code is not free to clobber any | ||
75 | * registers other than EAX. The patch function should return | ||
76 | * the number of bytes of code generated, as we nop pad the | ||
77 | * rest in generic code. | ||
78 | */ | ||
79 | unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, | ||
80 | unsigned long addr, unsigned len); | ||
81 | }; | ||
82 | |||
83 | |||
84 | struct pv_lazy_ops { | ||
85 | /* Set deferred update mode, used for batching operations. */ | ||
86 | void (*enter)(void); | ||
87 | void (*leave)(void); | ||
88 | }; | ||
89 | |||
90 | struct pv_time_ops { | ||
91 | unsigned long long (*sched_clock)(void); | ||
92 | unsigned long (*get_tsc_khz)(void); | ||
93 | }; | ||
94 | |||
95 | struct pv_cpu_ops { | ||
96 | /* hooks for various privileged instructions */ | ||
97 | unsigned long (*get_debugreg)(int regno); | ||
98 | void (*set_debugreg)(int regno, unsigned long value); | ||
99 | |||
100 | void (*clts)(void); | ||
101 | |||
102 | unsigned long (*read_cr0)(void); | ||
103 | void (*write_cr0)(unsigned long); | ||
104 | |||
105 | unsigned long (*read_cr4_safe)(void); | ||
106 | unsigned long (*read_cr4)(void); | ||
107 | void (*write_cr4)(unsigned long); | ||
108 | |||
109 | #ifdef CONFIG_X86_64 | ||
110 | unsigned long (*read_cr8)(void); | ||
111 | void (*write_cr8)(unsigned long); | ||
112 | #endif | ||
113 | |||
114 | /* Segment descriptor handling */ | ||
115 | void (*load_tr_desc)(void); | ||
116 | void (*load_gdt)(const struct desc_ptr *); | ||
117 | void (*load_idt)(const struct desc_ptr *); | ||
118 | void (*store_gdt)(struct desc_ptr *); | ||
119 | void (*store_idt)(struct desc_ptr *); | ||
120 | void (*set_ldt)(const void *desc, unsigned entries); | ||
121 | unsigned long (*store_tr)(void); | ||
122 | void (*load_tls)(struct thread_struct *t, unsigned int cpu); | ||
123 | #ifdef CONFIG_X86_64 | ||
124 | void (*load_gs_index)(unsigned int idx); | ||
125 | #endif | ||
126 | void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, | ||
127 | const void *desc); | ||
128 | void (*write_gdt_entry)(struct desc_struct *, | ||
129 | int entrynum, const void *desc, int size); | ||
130 | void (*write_idt_entry)(gate_desc *, | ||
131 | int entrynum, const gate_desc *gate); | ||
132 | void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); | ||
133 | void (*free_ldt)(struct desc_struct *ldt, unsigned entries); | ||
134 | |||
135 | void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); | ||
136 | |||
137 | void (*set_iopl_mask)(unsigned mask); | ||
138 | |||
139 | void (*wbinvd)(void); | ||
140 | void (*io_delay)(void); | ||
141 | |||
142 | /* cpuid emulation, mostly so that caps bits can be disabled */ | ||
143 | void (*cpuid)(unsigned int *eax, unsigned int *ebx, | ||
144 | unsigned int *ecx, unsigned int *edx); | ||
145 | |||
146 | /* MSR, PMC and TSC operations. | ||
147 | err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ | ||
148 | u64 (*read_msr)(unsigned int msr, int *err); | ||
149 | int (*rdmsr_regs)(u32 *regs); | ||
150 | int (*write_msr)(unsigned int msr, unsigned low, unsigned high); | ||
151 | int (*wrmsr_regs)(u32 *regs); | ||
152 | |||
153 | u64 (*read_tsc)(void); | ||
154 | u64 (*read_pmc)(int counter); | ||
155 | unsigned long long (*read_tscp)(unsigned int *aux); | ||
156 | |||
157 | /* | ||
158 | * Atomically enable interrupts and return to userspace. This | ||
159 | * is only ever used to return to 32-bit processes; in a | ||
160 | * 64-bit kernel, it's used for 32-on-64 compat processes, but | ||
161 | * never native 64-bit processes. (Jump, not call.) | ||
162 | */ | ||
163 | void (*irq_enable_sysexit)(void); | ||
164 | |||
165 | /* | ||
166 | * Switch to usermode gs and return to 64-bit usermode using | ||
167 | * sysret. Only used in 64-bit kernels to return to 64-bit | ||
168 | * processes. Usermode register state, including %rsp, must | ||
169 | * already be restored. | ||
170 | */ | ||
171 | void (*usergs_sysret64)(void); | ||
172 | |||
173 | /* | ||
174 | * Switch to usermode gs and return to 32-bit usermode using | ||
175 | * sysret. Used to return to 32-on-64 compat processes. | ||
176 | * Other usermode register state, including %esp, must already | ||
177 | * be restored. | ||
178 | */ | ||
179 | void (*usergs_sysret32)(void); | ||
180 | |||
181 | /* Normal iret. Jump to this with the standard iret stack | ||
182 | frame set up. */ | ||
183 | void (*iret)(void); | ||
184 | |||
185 | void (*swapgs)(void); | ||
186 | |||
187 | void (*start_context_switch)(struct task_struct *prev); | ||
188 | void (*end_context_switch)(struct task_struct *next); | ||
189 | }; | ||
190 | |||
191 | struct pv_irq_ops { | ||
192 | /* | ||
193 | * Get/set interrupt state. save_fl and restore_fl are only | ||
194 | * expected to use X86_EFLAGS_IF; all other bits | ||
195 | * returned from save_fl are undefined, and may be ignored by | ||
196 | * restore_fl. | ||
197 | * | ||
198 | * NOTE: Callers of these functions expect the callee to preserve | ||
199 | * more registers than the standard C calling convention. | ||
200 | */ | ||
201 | struct paravirt_callee_save save_fl; | ||
202 | struct paravirt_callee_save restore_fl; | ||
203 | struct paravirt_callee_save irq_disable; | ||
204 | struct paravirt_callee_save irq_enable; | ||
205 | |||
206 | void (*safe_halt)(void); | ||
207 | void (*halt)(void); | ||
208 | |||
209 | #ifdef CONFIG_X86_64 | ||
210 | void (*adjust_exception_frame)(void); | ||
211 | #endif | ||
212 | }; | ||
213 | |||
214 | struct pv_apic_ops { | ||
215 | #ifdef CONFIG_X86_LOCAL_APIC | ||
216 | void (*startup_ipi_hook)(int phys_apicid, | ||
217 | unsigned long start_eip, | ||
218 | unsigned long start_esp); | ||
219 | #endif | ||
220 | }; | ||
221 | |||
222 | struct pv_mmu_ops { | ||
223 | unsigned long (*read_cr2)(void); | ||
224 | void (*write_cr2)(unsigned long); | ||
225 | |||
226 | unsigned long (*read_cr3)(void); | ||
227 | void (*write_cr3)(unsigned long); | ||
228 | |||
229 | /* | ||
230 | * Hooks for intercepting the creation/use/destruction of an | ||
231 | * mm_struct. | ||
232 | */ | ||
233 | void (*activate_mm)(struct mm_struct *prev, | ||
234 | struct mm_struct *next); | ||
235 | void (*dup_mmap)(struct mm_struct *oldmm, | ||
236 | struct mm_struct *mm); | ||
237 | void (*exit_mmap)(struct mm_struct *mm); | ||
238 | |||
239 | |||
240 | /* TLB operations */ | ||
241 | void (*flush_tlb_user)(void); | ||
242 | void (*flush_tlb_kernel)(void); | ||
243 | void (*flush_tlb_single)(unsigned long addr); | ||
244 | void (*flush_tlb_others)(const struct cpumask *cpus, | ||
245 | struct mm_struct *mm, | ||
246 | unsigned long va); | ||
247 | |||
248 | /* Hooks for allocating and freeing a pagetable top-level */ | ||
249 | int (*pgd_alloc)(struct mm_struct *mm); | ||
250 | void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); | ||
251 | |||
252 | /* | ||
253 | * Hooks for allocating/releasing pagetable pages when they're | ||
254 | * attached to a pagetable | ||
255 | */ | ||
256 | void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); | ||
257 | void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); | ||
258 | void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); | ||
259 | void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); | ||
260 | void (*release_pte)(unsigned long pfn); | ||
261 | void (*release_pmd)(unsigned long pfn); | ||
262 | void (*release_pud)(unsigned long pfn); | ||
263 | |||
264 | /* Pagetable manipulation functions */ | ||
265 | void (*set_pte)(pte_t *ptep, pte_t pteval); | ||
266 | void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, | ||
267 | pte_t *ptep, pte_t pteval); | ||
268 | void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); | ||
269 | void (*pte_update)(struct mm_struct *mm, unsigned long addr, | ||
270 | pte_t *ptep); | ||
271 | void (*pte_update_defer)(struct mm_struct *mm, | ||
272 | unsigned long addr, pte_t *ptep); | ||
273 | |||
274 | pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, | ||
275 | pte_t *ptep); | ||
276 | void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, | ||
277 | pte_t *ptep, pte_t pte); | ||
278 | |||
279 | struct paravirt_callee_save pte_val; | ||
280 | struct paravirt_callee_save make_pte; | ||
281 | |||
282 | struct paravirt_callee_save pgd_val; | ||
283 | struct paravirt_callee_save make_pgd; | ||
284 | |||
285 | #if PAGETABLE_LEVELS >= 3 | ||
286 | #ifdef CONFIG_X86_PAE | ||
287 | void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); | ||
288 | void (*pte_clear)(struct mm_struct *mm, unsigned long addr, | ||
289 | pte_t *ptep); | ||
290 | void (*pmd_clear)(pmd_t *pmdp); | ||
291 | |||
292 | #endif /* CONFIG_X86_PAE */ | ||
293 | |||
294 | void (*set_pud)(pud_t *pudp, pud_t pudval); | ||
295 | |||
296 | struct paravirt_callee_save pmd_val; | ||
297 | struct paravirt_callee_save make_pmd; | ||
298 | |||
299 | #if PAGETABLE_LEVELS == 4 | ||
300 | struct paravirt_callee_save pud_val; | ||
301 | struct paravirt_callee_save make_pud; | ||
302 | |||
303 | void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); | ||
304 | #endif /* PAGETABLE_LEVELS == 4 */ | ||
305 | #endif /* PAGETABLE_LEVELS >= 3 */ | ||
306 | |||
307 | #ifdef CONFIG_HIGHPTE | ||
308 | void *(*kmap_atomic_pte)(struct page *page, enum km_type type); | ||
309 | #endif | ||
310 | |||
311 | struct pv_lazy_ops lazy_mode; | ||
312 | |||
313 | /* dom0 ops */ | ||
314 | |||
315 | /* Sometimes the physical address is a pfn, and sometimes it's | ||
316 | an mfn. We can tell which is which from the index. */ | ||
317 | void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, | ||
318 | phys_addr_t phys, pgprot_t flags); | ||
319 | }; | ||
320 | |||
321 | struct raw_spinlock; | ||
322 | struct pv_lock_ops { | ||
323 | int (*spin_is_locked)(struct raw_spinlock *lock); | ||
324 | int (*spin_is_contended)(struct raw_spinlock *lock); | ||
325 | void (*spin_lock)(struct raw_spinlock *lock); | ||
326 | void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); | ||
327 | int (*spin_trylock)(struct raw_spinlock *lock); | ||
328 | void (*spin_unlock)(struct raw_spinlock *lock); | ||
329 | }; | ||
330 | |||
331 | /* This contains all the paravirt structures: we get a convenient | ||
332 | * number for each function using the offset which we use to indicate | ||
333 | * what to patch. */ | ||
334 | struct paravirt_patch_template { | ||
335 | struct pv_init_ops pv_init_ops; | ||
336 | struct pv_time_ops pv_time_ops; | ||
337 | struct pv_cpu_ops pv_cpu_ops; | ||
338 | struct pv_irq_ops pv_irq_ops; | ||
339 | struct pv_apic_ops pv_apic_ops; | ||
340 | struct pv_mmu_ops pv_mmu_ops; | ||
341 | struct pv_lock_ops pv_lock_ops; | ||
342 | }; | ||
343 | |||
344 | extern struct pv_info pv_info; | ||
345 | extern struct pv_init_ops pv_init_ops; | ||
346 | extern struct pv_time_ops pv_time_ops; | ||
347 | extern struct pv_cpu_ops pv_cpu_ops; | ||
348 | extern struct pv_irq_ops pv_irq_ops; | ||
349 | extern struct pv_apic_ops pv_apic_ops; | ||
350 | extern struct pv_mmu_ops pv_mmu_ops; | ||
351 | extern struct pv_lock_ops pv_lock_ops; | ||
352 | |||
353 | #define PARAVIRT_PATCH(x) \ | ||
354 | (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) | ||
355 | |||
356 | #define paravirt_type(op) \ | ||
357 | [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ | ||
358 | [paravirt_opptr] "i" (&(op)) | ||
359 | #define paravirt_clobber(clobber) \ | ||
360 | [paravirt_clobber] "i" (clobber) | ||
361 | |||
362 | /* | ||
363 | * Generate some code, and mark it as patchable by the | ||
364 | * apply_paravirt() alternate instruction patcher. | ||
365 | */ | ||
366 | #define _paravirt_alt(insn_string, type, clobber) \ | ||
367 | "771:\n\t" insn_string "\n" "772:\n" \ | ||
368 | ".pushsection .parainstructions,\"a\"\n" \ | ||
369 | _ASM_ALIGN "\n" \ | ||
370 | _ASM_PTR " 771b\n" \ | ||
371 | " .byte " type "\n" \ | ||
372 | " .byte 772b-771b\n" \ | ||
373 | " .short " clobber "\n" \ | ||
374 | ".popsection\n" | ||
375 | |||
376 | /* Generate patchable code, with the default asm parameters. */ | ||
377 | #define paravirt_alt(insn_string) \ | ||
378 | _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") | ||
379 | |||
380 | /* Simple instruction patching code. */ | ||
381 | #define DEF_NATIVE(ops, name, code) \ | ||
382 | extern const char start_##ops##_##name[], end_##ops##_##name[]; \ | ||
383 | asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") | ||
384 | |||
385 | unsigned paravirt_patch_nop(void); | ||
386 | unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); | ||
387 | unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); | ||
388 | unsigned paravirt_patch_ignore(unsigned len); | ||
389 | unsigned paravirt_patch_call(void *insnbuf, | ||
390 | const void *target, u16 tgt_clobbers, | ||
391 | unsigned long addr, u16 site_clobbers, | ||
392 | unsigned len); | ||
393 | unsigned paravirt_patch_jmp(void *insnbuf, const void *target, | ||
394 | unsigned long addr, unsigned len); | ||
395 | unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, | ||
396 | unsigned long addr, unsigned len); | ||
397 | |||
398 | unsigned paravirt_patch_insns(void *insnbuf, unsigned len, | ||
399 | const char *start, const char *end); | ||
400 | |||
401 | unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | ||
402 | unsigned long addr, unsigned len); | ||
403 | |||
404 | int paravirt_disable_iospace(void); | ||
405 | |||
406 | /* | ||
407 | * This generates an indirect call based on the operation type number. | ||
408 | * The type number, computed in PARAVIRT_PATCH, is derived from the | ||
409 | * offset into the paravirt_patch_template structure, and can therefore be | ||
410 | * freely converted back into a structure offset. | ||
411 | */ | ||
412 | #define PARAVIRT_CALL "call *%c[paravirt_opptr];" | ||
413 | |||
414 | /* | ||
415 | * These macros are intended to wrap calls through one of the paravirt | ||
416 | * ops structs, so that they can be later identified and patched at | ||
417 | * runtime. | ||
418 | * | ||
419 | * Normally, a call to a pv_op function is a simple indirect call: | ||
420 | * (pv_op_struct.operations)(args...). | ||
421 | * | ||
422 | * Unfortunately, this is a relatively slow operation for modern CPUs, | ||
423 | * because it cannot necessarily determine what the destination | ||
424 | * address is. In this case, the address is a runtime constant, so at | ||
425 | * the very least we can patch the call to be a simple direct call, or | ||
426 | * ideally, patch an inline implementation into the callsite. (Direct | ||
427 | * calls are essentially free, because the call and return addresses | ||
428 | * are completely predictable.) | ||
429 | * | ||
430 | * For i386, these macros rely on the standard gcc "regparm(3)" calling | ||
431 | * convention, in which the first three arguments are placed in %eax, | ||
432 | * %edx, %ecx (in that order), and the remaining arguments are placed | ||
433 | * on the stack. All caller-save registers (eax,edx,ecx) are expected | ||
434 | * to be modified (either clobbered or used for return values). | ||
435 | * X86_64, on the other hand, already specifies a register-based calling | ||
436 | * convention, returning in %rax, with parameters going in %rdi, %rsi, | ||
437 | * %rdx, and %rcx. Note that for this reason, x86_64 does not need any | ||
438 | * special handling for dealing with 4 arguments, unlike i386. | ||
439 | * However, x86_64 also has to clobber all caller-saved registers, which, | ||
440 | * unfortunately, are quite a few (r8 - r11). | ||
441 | * | ||
442 | * The call instruction itself is marked by placing its start address | ||
443 | * and size into the .parainstructions section, so that | ||
444 | * apply_paravirt() in arch/x86/kernel/alternative.c can do the | ||
445 | * appropriate patching under the control of the backend pv_init_ops | ||
446 | * implementation. | ||
447 | * | ||
448 | * Unfortunately there's no way to get gcc to generate the args setup | ||
449 | * for the call, and then allow the call itself to be generated by an | ||
450 | * inline asm. Because of this, we must do the complete arg setup and | ||
451 | * return value handling from within these macros. This is fairly | ||
452 | * cumbersome. | ||
453 | * | ||
454 | * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. | ||
455 | * It could be extended to more arguments, but there would be little | ||
456 | * to be gained from that. For each number of arguments, there are | ||
457 | * the two VCALL and CALL variants for void and non-void functions. | ||
458 | * | ||
459 | * When there is a return value, the invoker of the macro must specify | ||
460 | * the return type. The macro then uses sizeof() on that type to | ||
461 | * determine whether it's a 32- or 64-bit value, and places the return | ||
462 | * in the right register(s) (just %eax for 32-bit, and %edx:%eax for | ||
463 | * 64-bit). For x86_64 machines, it just returns at %rax regardless of | ||
464 | * the return value size. | ||
465 | * | ||
466 | * On i386, 64-bit arguments are passed as a pair of adjacent 32-bit | ||
467 | * arguments in low,high order. (x86_64 passes a 64-bit argument in a | ||
468 | * single register.) | ||
469 | * | ||
470 | * Small structures are passed and returned in registers. The macro | ||
471 | * calling convention can't directly deal with this, so the wrapper | ||
472 | * functions must do this. | ||
473 | * | ||
474 | * These PVOP_* macros are only defined within this header. This | ||
475 | * means that all uses must be wrapped in inline functions. This also | ||
476 | * makes sure the incoming and outgoing types are always correct. | ||
477 | */ | ||
478 | #ifdef CONFIG_X86_32 | ||
479 | #define PVOP_VCALL_ARGS \ | ||
480 | unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx | ||
481 | #define PVOP_CALL_ARGS PVOP_VCALL_ARGS | ||
482 | |||
483 | #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) | ||
484 | #define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) | ||
485 | #define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) | ||
486 | |||
487 | #define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ | ||
488 | "=c" (__ecx) | ||
489 | #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS | ||
490 | |||
491 | #define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) | ||
492 | #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS | ||
493 | |||
494 | #define EXTRA_CLOBBERS | ||
495 | #define VEXTRA_CLOBBERS | ||
496 | #else /* CONFIG_X86_64 */ | ||
497 | #define PVOP_VCALL_ARGS \ | ||
498 | unsigned long __edi = __edi, __esi = __esi, \ | ||
499 | __edx = __edx, __ecx = __ecx | ||
500 | #define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax | ||
501 | |||
502 | #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) | ||
503 | #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) | ||
504 | #define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) | ||
505 | #define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) | ||
506 | |||
507 | #define PVOP_VCALL_CLOBBERS "=D" (__edi), \ | ||
508 | "=S" (__esi), "=d" (__edx), \ | ||
509 | "=c" (__ecx) | ||
510 | #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) | ||
511 | |||
512 | #define PVOP_VCALLEE_CLOBBERS "=a" (__eax) | ||
513 | #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS | ||
514 | |||
515 | #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" | ||
516 | #define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" | ||
517 | #endif /* CONFIG_X86_32 */ | ||
518 | |||
519 | #ifdef CONFIG_PARAVIRT_DEBUG | ||
520 | #define PVOP_TEST_NULL(op) BUG_ON(op == NULL) | ||
521 | #else | ||
522 | #define PVOP_TEST_NULL(op) ((void)op) | ||
523 | #endif | ||
524 | |||
525 | #define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ | ||
526 | pre, post, ...) \ | ||
527 | ({ \ | ||
528 | rettype __ret; \ | ||
529 | PVOP_CALL_ARGS; \ | ||
530 | PVOP_TEST_NULL(op); \ | ||
531 | /* This is 32-bit specific, but is okay in 64-bit */ \ | ||
532 | /* since this condition will never hold */ \ | ||
533 | if (sizeof(rettype) > sizeof(unsigned long)) { \ | ||
534 | asm volatile(pre \ | ||
535 | paravirt_alt(PARAVIRT_CALL) \ | ||
536 | post \ | ||
537 | : call_clbr \ | ||
538 | : paravirt_type(op), \ | ||
539 | paravirt_clobber(clbr), \ | ||
540 | ##__VA_ARGS__ \ | ||
541 | : "memory", "cc" extra_clbr); \ | ||
542 | __ret = (rettype)((((u64)__edx) << 32) | __eax); \ | ||
543 | } else { \ | ||
544 | asm volatile(pre \ | ||
545 | paravirt_alt(PARAVIRT_CALL) \ | ||
546 | post \ | ||
547 | : call_clbr \ | ||
548 | : paravirt_type(op), \ | ||
549 | paravirt_clobber(clbr), \ | ||
550 | ##__VA_ARGS__ \ | ||
551 | : "memory", "cc" extra_clbr); \ | ||
552 | __ret = (rettype)__eax; \ | ||
553 | } \ | ||
554 | __ret; \ | ||
555 | }) | ||
556 | |||
557 | #define __PVOP_CALL(rettype, op, pre, post, ...) \ | ||
558 | ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ | ||
559 | EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) | ||
560 | |||
561 | #define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ | ||
562 | ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ | ||
563 | PVOP_CALLEE_CLOBBERS, , \ | ||
564 | pre, post, ##__VA_ARGS__) | ||
565 | |||
566 | |||
567 | #define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ | ||
568 | ({ \ | ||
569 | PVOP_VCALL_ARGS; \ | ||
570 | PVOP_TEST_NULL(op); \ | ||
571 | asm volatile(pre \ | ||
572 | paravirt_alt(PARAVIRT_CALL) \ | ||
573 | post \ | ||
574 | : call_clbr \ | ||
575 | : paravirt_type(op), \ | ||
576 | paravirt_clobber(clbr), \ | ||
577 | ##__VA_ARGS__ \ | ||
578 | : "memory", "cc" extra_clbr); \ | ||
579 | }) | ||
580 | |||
581 | #define __PVOP_VCALL(op, pre, post, ...) \ | ||
582 | ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ | ||
583 | VEXTRA_CLOBBERS, \ | ||
584 | pre, post, ##__VA_ARGS__) | ||
585 | |||
586 | #define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ | ||
587 | ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ | ||
588 | PVOP_VCALLEE_CLOBBERS, , \ | ||
589 | pre, post, ##__VA_ARGS__) | ||
590 | |||
591 | |||
592 | |||
593 | #define PVOP_CALL0(rettype, op) \ | ||
594 | __PVOP_CALL(rettype, op, "", "") | ||
595 | #define PVOP_VCALL0(op) \ | ||
596 | __PVOP_VCALL(op, "", "") | ||
597 | |||
598 | #define PVOP_CALLEE0(rettype, op) \ | ||
599 | __PVOP_CALLEESAVE(rettype, op, "", "") | ||
600 | #define PVOP_VCALLEE0(op) \ | ||
601 | __PVOP_VCALLEESAVE(op, "", "") | ||
602 | |||
603 | |||
604 | #define PVOP_CALL1(rettype, op, arg1) \ | ||
605 | __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) | ||
606 | #define PVOP_VCALL1(op, arg1) \ | ||
607 | __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) | ||
608 | |||
609 | #define PVOP_CALLEE1(rettype, op, arg1) \ | ||
610 | __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) | ||
611 | #define PVOP_VCALLEE1(op, arg1) \ | ||
612 | __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) | ||
613 | |||
614 | |||
615 | #define PVOP_CALL2(rettype, op, arg1, arg2) \ | ||
616 | __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
617 | PVOP_CALL_ARG2(arg2)) | ||
618 | #define PVOP_VCALL2(op, arg1, arg2) \ | ||
619 | __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
620 | PVOP_CALL_ARG2(arg2)) | ||
621 | |||
622 | #define PVOP_CALLEE2(rettype, op, arg1, arg2) \ | ||
623 | __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
624 | PVOP_CALL_ARG2(arg2)) | ||
625 | #define PVOP_VCALLEE2(op, arg1, arg2) \ | ||
626 | __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
627 | PVOP_CALL_ARG2(arg2)) | ||
628 | |||
629 | |||
630 | #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ | ||
631 | __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
632 | PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) | ||
633 | #define PVOP_VCALL3(op, arg1, arg2, arg3) \ | ||
634 | __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ | ||
635 | PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) | ||
636 | |||
637 | /* This is the only difference in x86_64. We can make it much simpler */ | ||
638 | #ifdef CONFIG_X86_32 | ||
639 | #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ | ||
640 | __PVOP_CALL(rettype, op, \ | ||
641 | "push %[_arg4];", "lea 4(%%esp),%%esp;", \ | ||
642 | PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ | ||
643 | PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) | ||
644 | #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ | ||
645 | __PVOP_VCALL(op, \ | ||
646 | "push %[_arg4];", "lea 4(%%esp),%%esp;", \ | ||
647 | "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ | ||
648 | "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) | ||
649 | #else | ||
650 | #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ | ||
651 | __PVOP_CALL(rettype, op, "", "", \ | ||
652 | PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ | ||
653 | PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) | ||
654 | #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ | ||
655 | __PVOP_VCALL(op, "", "", \ | ||
656 | PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ | ||
657 | PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) | ||
658 | #endif | ||
659 | |||
660 | /* Lazy mode for batching updates / context switch */ | ||
661 | enum paravirt_lazy_mode { | ||
662 | PARAVIRT_LAZY_NONE, | ||
663 | PARAVIRT_LAZY_MMU, | ||
664 | PARAVIRT_LAZY_CPU, | ||
665 | }; | ||
666 | |||
667 | enum paravirt_lazy_mode paravirt_get_lazy_mode(void); | ||
668 | void paravirt_start_context_switch(struct task_struct *prev); | ||
669 | void paravirt_end_context_switch(struct task_struct *next); | ||
670 | |||
671 | void paravirt_enter_lazy_mmu(void); | ||
672 | void paravirt_leave_lazy_mmu(void); | ||
673 | |||
674 | void _paravirt_nop(void); | ||
675 | u32 _paravirt_ident_32(u32); | ||
676 | u64 _paravirt_ident_64(u64); | ||
677 | |||
678 | #define paravirt_nop ((void *)_paravirt_nop) | ||
679 | |||
680 | /* These all sit in the .parainstructions section to tell us what to patch. */ | ||
681 | struct paravirt_patch_site { | ||
682 | u8 *instr; /* original instructions */ | ||
683 | u8 instrtype; /* type of this instruction */ | ||
684 | u8 len; /* length of original instruction */ | ||
685 | u16 clobbers; /* what registers you may clobber */ | ||
686 | }; | ||
687 | |||
688 | extern struct paravirt_patch_site __parainstructions[], | ||
689 | __parainstructions_end[]; | ||
690 | |||
691 | #endif /* __ASSEMBLY__ */ | ||
692 | |||
693 | #endif /* _ASM_X86_PARAVIRT_TYPES_H */ | ||
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 7af14e512f9..e2c1668dde7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h | |||
@@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end); | |||
19 | extern int kernel_map_sync_memtype(u64 base, unsigned long size, | 19 | extern int kernel_map_sync_memtype(u64 base, unsigned long size, |
20 | unsigned long flag); | 20 | unsigned long flag); |
21 | 21 | ||
22 | int io_reserve_memtype(resource_size_t start, resource_size_t end, | ||
23 | unsigned long *type); | ||
24 | |||
25 | void io_free_memtype(resource_size_t start, resource_size_t end); | ||
26 | |||
22 | #endif /* _ASM_X86_PAT_H */ | 27 | #endif /* _ASM_X86_PAT_H */ |
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1ff685ca221..ada8c201d51 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h | |||
@@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void); | |||
48 | #else | 48 | #else |
49 | #define pcibios_assign_all_busses() 0 | 49 | #define pcibios_assign_all_busses() 0 |
50 | #endif | 50 | #endif |
51 | #define pcibios_scan_all_fns(a, b) 0 | ||
52 | 51 | ||
53 | extern unsigned long pci_mem_start; | 52 | extern unsigned long pci_mem_start; |
54 | #define PCIBIOS_MIN_IO 0x1000 | 53 | #define PCIBIOS_MIN_IO 0x1000 |
@@ -144,7 +143,11 @@ static inline int __pcibus_to_node(const struct pci_bus *bus) | |||
144 | static inline const struct cpumask * | 143 | static inline const struct cpumask * |
145 | cpumask_of_pcibus(const struct pci_bus *bus) | 144 | cpumask_of_pcibus(const struct pci_bus *bus) |
146 | { | 145 | { |
147 | return cpumask_of_node(__pcibus_to_node(bus)); | 146 | int node; |
147 | |||
148 | node = __pcibus_to_node(bus); | ||
149 | return (node == -1) ? cpu_online_mask : | ||
150 | cpumask_of_node(node); | ||
148 | } | 151 | } |
149 | #endif | 152 | #endif |
150 | 153 | ||
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..b65a36defeb 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h | |||
@@ -49,7 +49,7 @@ | |||
49 | #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x | 49 | #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x |
50 | #define __my_cpu_offset percpu_read(this_cpu_off) | 50 | #define __my_cpu_offset percpu_read(this_cpu_off) |
51 | #else | 51 | #else |
52 | #define __percpu_arg(x) "%" #x | 52 | #define __percpu_arg(x) "%P" #x |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | /* | 55 | /* |
@@ -104,36 +104,48 @@ do { \ | |||
104 | } \ | 104 | } \ |
105 | } while (0) | 105 | } while (0) |
106 | 106 | ||
107 | #define percpu_from_op(op, var) \ | 107 | #define percpu_from_op(op, var, constraint) \ |
108 | ({ \ | 108 | ({ \ |
109 | typeof(var) ret__; \ | 109 | typeof(var) ret__; \ |
110 | switch (sizeof(var)) { \ | 110 | switch (sizeof(var)) { \ |
111 | case 1: \ | 111 | case 1: \ |
112 | asm(op "b "__percpu_arg(1)",%0" \ | 112 | asm(op "b "__percpu_arg(1)",%0" \ |
113 | : "=q" (ret__) \ | 113 | : "=q" (ret__) \ |
114 | : "m" (var)); \ | 114 | : constraint); \ |
115 | break; \ | 115 | break; \ |
116 | case 2: \ | 116 | case 2: \ |
117 | asm(op "w "__percpu_arg(1)",%0" \ | 117 | asm(op "w "__percpu_arg(1)",%0" \ |
118 | : "=r" (ret__) \ | 118 | : "=r" (ret__) \ |
119 | : "m" (var)); \ | 119 | : constraint); \ |
120 | break; \ | 120 | break; \ |
121 | case 4: \ | 121 | case 4: \ |
122 | asm(op "l "__percpu_arg(1)",%0" \ | 122 | asm(op "l "__percpu_arg(1)",%0" \ |
123 | : "=r" (ret__) \ | 123 | : "=r" (ret__) \ |
124 | : "m" (var)); \ | 124 | : constraint); \ |
125 | break; \ | 125 | break; \ |
126 | case 8: \ | 126 | case 8: \ |
127 | asm(op "q "__percpu_arg(1)",%0" \ | 127 | asm(op "q "__percpu_arg(1)",%0" \ |
128 | : "=r" (ret__) \ | 128 | : "=r" (ret__) \ |
129 | : "m" (var)); \ | 129 | : constraint); \ |
130 | break; \ | 130 | break; \ |
131 | default: __bad_percpu_size(); \ | 131 | default: __bad_percpu_size(); \ |
132 | } \ | 132 | } \ |
133 | ret__; \ | 133 | ret__; \ |
134 | }) | 134 | }) |
135 | 135 | ||
136 | #define percpu_read(var) percpu_from_op("mov", per_cpu__##var) | 136 | /* |
137 | * percpu_read() makes gcc load the percpu variable every time it is | ||
138 | * accessed while percpu_read_stable() allows the value to be cached. | ||
139 | * percpu_read_stable() is more efficient and can be used if its value | ||
140 | * is guaranteed to be valid across cpus. The current users include | ||
141 | * get_current() and get_thread_info() both of which are actually | ||
142 | * per-thread variables implemented as per-cpu variables and thus | ||
143 | * stable for the duration of the respective task. | ||
144 | */ | ||
145 | #define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ | ||
146 | "m" (per_cpu__##var)) | ||
147 | #define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ | ||
148 | "p" (&per_cpu__##var)) | ||
137 | #define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) | 149 | #define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) |
138 | #define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) | 150 | #define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) |
139 | #define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) | 151 | #define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) |
@@ -156,15 +168,6 @@ do { \ | |||
156 | /* We can use this directly for local CPU (faster). */ | 168 | /* We can use this directly for local CPU (faster). */ |
157 | DECLARE_PER_CPU(unsigned long, this_cpu_off); | 169 | DECLARE_PER_CPU(unsigned long, this_cpu_off); |
158 | 170 | ||
159 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
160 | void *pcpu_lpage_remapped(void *kaddr); | ||
161 | #else | ||
162 | static inline void *pcpu_lpage_remapped(void *kaddr) | ||
163 | { | ||
164 | return NULL; | ||
165 | } | ||
166 | #endif | ||
167 | |||
168 | #endif /* !__ASSEMBLY__ */ | 171 | #endif /* !__ASSEMBLY__ */ |
169 | 172 | ||
170 | #ifdef CONFIG_SMP | 173 | #ifdef CONFIG_SMP |
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_event.h index fa64e401589..ad7ce3fd506 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_event.h | |||
@@ -1,8 +1,8 @@ | |||
1 | #ifndef _ASM_X86_PERF_COUNTER_H | 1 | #ifndef _ASM_X86_PERF_EVENT_H |
2 | #define _ASM_X86_PERF_COUNTER_H | 2 | #define _ASM_X86_PERF_EVENT_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * Performance counter hw details: | 5 | * Performance event hw details: |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #define X86_PMC_MAX_GENERIC 8 | 8 | #define X86_PMC_MAX_GENERIC 8 |
@@ -43,7 +43,7 @@ | |||
43 | union cpuid10_eax { | 43 | union cpuid10_eax { |
44 | struct { | 44 | struct { |
45 | unsigned int version_id:8; | 45 | unsigned int version_id:8; |
46 | unsigned int num_counters:8; | 46 | unsigned int num_events:8; |
47 | unsigned int bit_width:8; | 47 | unsigned int bit_width:8; |
48 | unsigned int mask_length:8; | 48 | unsigned int mask_length:8; |
49 | } split; | 49 | } split; |
@@ -52,7 +52,7 @@ union cpuid10_eax { | |||
52 | 52 | ||
53 | union cpuid10_edx { | 53 | union cpuid10_edx { |
54 | struct { | 54 | struct { |
55 | unsigned int num_counters_fixed:4; | 55 | unsigned int num_events_fixed:4; |
56 | unsigned int reserved:28; | 56 | unsigned int reserved:28; |
57 | } split; | 57 | } split; |
58 | unsigned int full; | 58 | unsigned int full; |
@@ -60,7 +60,7 @@ union cpuid10_edx { | |||
60 | 60 | ||
61 | 61 | ||
62 | /* | 62 | /* |
63 | * Fixed-purpose performance counters: | 63 | * Fixed-purpose performance events: |
64 | */ | 64 | */ |
65 | 65 | ||
66 | /* | 66 | /* |
@@ -84,15 +84,25 @@ union cpuid10_edx { | |||
84 | #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b | 84 | #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b |
85 | #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) | 85 | #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) |
86 | 86 | ||
87 | #ifdef CONFIG_PERF_COUNTERS | 87 | /* |
88 | extern void init_hw_perf_counters(void); | 88 | * We model BTS tracing as another fixed-mode PMC. |
89 | extern void perf_counters_lapic_init(void); | 89 | * |
90 | * We choose a value in the middle of the fixed event range, since lower | ||
91 | * values are used by actual fixed events and higher values are used | ||
92 | * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. | ||
93 | */ | ||
94 | #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) | ||
95 | |||
96 | |||
97 | #ifdef CONFIG_PERF_EVENTS | ||
98 | extern void init_hw_perf_events(void); | ||
99 | extern void perf_events_lapic_init(void); | ||
90 | 100 | ||
91 | #define PERF_COUNTER_INDEX_OFFSET 0 | 101 | #define PERF_EVENT_INDEX_OFFSET 0 |
92 | 102 | ||
93 | #else | 103 | #else |
94 | static inline void init_hw_perf_counters(void) { } | 104 | static inline void init_hw_perf_events(void) { } |
95 | static inline void perf_counters_lapic_init(void) { } | 105 | static inline void perf_events_lapic_init(void) { } |
96 | #endif | 106 | #endif |
97 | 107 | ||
98 | #endif /* _ASM_X86_PERF_COUNTER_H */ | 108 | #endif /* _ASM_X86_PERF_EVENT_H */ |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3fceb..af6fd360ab3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _ASM_X86_PGTABLE_H | 2 | #define _ASM_X86_PGTABLE_H |
3 | 3 | ||
4 | #include <asm/page.h> | 4 | #include <asm/page.h> |
5 | #include <asm/e820.h> | ||
5 | 6 | ||
6 | #include <asm/pgtable_types.h> | 7 | #include <asm/pgtable_types.h> |
7 | 8 | ||
@@ -55,16 +56,6 @@ extern struct list_head pgd_list; | |||
55 | #define pte_update(mm, addr, ptep) do { } while (0) | 56 | #define pte_update(mm, addr, ptep) do { } while (0) |
56 | #define pte_update_defer(mm, addr, ptep) do { } while (0) | 57 | #define pte_update_defer(mm, addr, ptep) do { } while (0) |
57 | 58 | ||
58 | static inline void __init paravirt_pagetable_setup_start(pgd_t *base) | ||
59 | { | ||
60 | native_pagetable_setup_start(base); | ||
61 | } | ||
62 | |||
63 | static inline void __init paravirt_pagetable_setup_done(pgd_t *base) | ||
64 | { | ||
65 | native_pagetable_setup_done(base); | ||
66 | } | ||
67 | |||
68 | #define pgd_val(x) native_pgd_val(x) | 59 | #define pgd_val(x) native_pgd_val(x) |
69 | #define __pgd(x) native_make_pgd(x) | 60 | #define __pgd(x) native_make_pgd(x) |
70 | 61 | ||
@@ -134,6 +125,11 @@ static inline unsigned long pte_pfn(pte_t pte) | |||
134 | return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; | 125 | return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; |
135 | } | 126 | } |
136 | 127 | ||
128 | static inline unsigned long pmd_pfn(pmd_t pmd) | ||
129 | { | ||
130 | return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; | ||
131 | } | ||
132 | |||
137 | #define pte_page(pte) pfn_to_page(pte_pfn(pte)) | 133 | #define pte_page(pte) pfn_to_page(pte_pfn(pte)) |
138 | 134 | ||
139 | static inline int pmd_large(pmd_t pte) | 135 | static inline int pmd_large(pmd_t pte) |
@@ -269,10 +265,17 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
269 | 265 | ||
270 | #define canon_pgprot(p) __pgprot(massage_pgprot(p)) | 266 | #define canon_pgprot(p) __pgprot(massage_pgprot(p)) |
271 | 267 | ||
272 | static inline int is_new_memtype_allowed(unsigned long flags, | 268 | static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, |
273 | unsigned long new_flags) | 269 | unsigned long flags, |
270 | unsigned long new_flags) | ||
274 | { | 271 | { |
275 | /* | 272 | /* |
273 | * PAT type is always WB for ISA. So no need to check. | ||
274 | */ | ||
275 | if (is_ISA_range(paddr, paddr + size - 1)) | ||
276 | return 1; | ||
277 | |||
278 | /* | ||
276 | * Certain new memtypes are not allowed with certain | 279 | * Certain new memtypes are not allowed with certain |
277 | * requested memtype: | 280 | * requested memtype: |
278 | * - request is uncached, return cannot be write-back | 281 | * - request is uncached, return cannot be write-back |
@@ -351,7 +354,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) | |||
351 | * this macro returns the index of the entry in the pmd page which would | 354 | * this macro returns the index of the entry in the pmd page which would |
352 | * control the given virtual address | 355 | * control the given virtual address |
353 | */ | 356 | */ |
354 | static inline unsigned pmd_index(unsigned long address) | 357 | static inline unsigned long pmd_index(unsigned long address) |
355 | { | 358 | { |
356 | return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); | 359 | return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); |
357 | } | 360 | } |
@@ -371,7 +374,7 @@ static inline unsigned pmd_index(unsigned long address) | |||
371 | * this function returns the index of the entry in the pte page which would | 374 | * this function returns the index of the entry in the pte page which would |
372 | * control the given virtual address | 375 | * control the given virtual address |
373 | */ | 376 | */ |
374 | static inline unsigned pte_index(unsigned long address) | 377 | static inline unsigned long pte_index(unsigned long address) |
375 | { | 378 | { |
376 | return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 379 | return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
377 | } | 380 | } |
@@ -422,11 +425,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) | |||
422 | return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); | 425 | return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); |
423 | } | 426 | } |
424 | 427 | ||
425 | static inline unsigned long pmd_pfn(pmd_t pmd) | ||
426 | { | ||
427 | return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; | ||
428 | } | ||
429 | |||
430 | static inline int pud_large(pud_t pud) | 428 | static inline int pud_large(pud_t pud) |
431 | { | 429 | { |
432 | return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == | 430 | return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == |
@@ -462,7 +460,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) | |||
462 | #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) | 460 | #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) |
463 | 461 | ||
464 | /* to find an entry in a page-table-directory. */ | 462 | /* to find an entry in a page-table-directory. */ |
465 | static inline unsigned pud_index(unsigned long address) | 463 | static inline unsigned long pud_index(unsigned long address) |
466 | { | 464 | { |
467 | return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); | 465 | return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); |
468 | } | 466 | } |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 54cb697f490..d1f4a760be2 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -277,6 +277,7 @@ static inline pteval_t pte_flags(pte_t pte) | |||
277 | typedef struct page *pgtable_t; | 277 | typedef struct page *pgtable_t; |
278 | 278 | ||
279 | extern pteval_t __supported_pte_mask; | 279 | extern pteval_t __supported_pte_mask; |
280 | extern void set_nx(void); | ||
280 | extern int nx_enabled; | 281 | extern int nx_enabled; |
281 | 282 | ||
282 | #define pgprot_writecombine pgprot_writecombine | 283 | #define pgprot_writecombine pgprot_writecombine |
@@ -299,8 +300,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); | |||
299 | extern void native_pagetable_setup_start(pgd_t *base); | 300 | extern void native_pagetable_setup_start(pgd_t *base); |
300 | extern void native_pagetable_setup_done(pgd_t *base); | 301 | extern void native_pagetable_setup_done(pgd_t *base); |
301 | #else | 302 | #else |
302 | static inline void native_pagetable_setup_start(pgd_t *base) {} | 303 | #define native_pagetable_setup_start x86_init_pgd_noop |
303 | static inline void native_pagetable_setup_done(pgd_t *base) {} | 304 | #define native_pagetable_setup_done x86_init_pgd_noop |
304 | #endif | 305 | #endif |
305 | 306 | ||
306 | struct seq_file; | 307 | struct seq_file; |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c7768269b1c..c3429e8b242 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -27,6 +27,7 @@ struct mm_struct; | |||
27 | #include <linux/cpumask.h> | 27 | #include <linux/cpumask.h> |
28 | #include <linux/cache.h> | 28 | #include <linux/cache.h> |
29 | #include <linux/threads.h> | 29 | #include <linux/threads.h> |
30 | #include <linux/math64.h> | ||
30 | #include <linux/init.h> | 31 | #include <linux/init.h> |
31 | 32 | ||
32 | /* | 33 | /* |
@@ -403,7 +404,17 @@ extern unsigned long kernel_eflags; | |||
403 | extern asmlinkage void ignore_sysret(void); | 404 | extern asmlinkage void ignore_sysret(void); |
404 | #else /* X86_64 */ | 405 | #else /* X86_64 */ |
405 | #ifdef CONFIG_CC_STACKPROTECTOR | 406 | #ifdef CONFIG_CC_STACKPROTECTOR |
406 | DECLARE_PER_CPU(unsigned long, stack_canary); | 407 | /* |
408 | * Make sure stack canary segment base is cached-aligned: | ||
409 | * "For Intel Atom processors, avoid non zero segment base address | ||
410 | * that is not aligned to cache line boundary at all cost." | ||
411 | * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) | ||
412 | */ | ||
413 | struct stack_canary { | ||
414 | char __pad[20]; /* canary at %gs:20 */ | ||
415 | unsigned long canary; | ||
416 | }; | ||
417 | DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); | ||
407 | #endif | 418 | #endif |
408 | #endif /* X86_64 */ | 419 | #endif /* X86_64 */ |
409 | 420 | ||
@@ -703,13 +714,23 @@ static inline void cpu_relax(void) | |||
703 | rep_nop(); | 714 | rep_nop(); |
704 | } | 715 | } |
705 | 716 | ||
706 | /* Stop speculative execution: */ | 717 | /* Stop speculative execution and prefetching of modified code. */ |
707 | static inline void sync_core(void) | 718 | static inline void sync_core(void) |
708 | { | 719 | { |
709 | int tmp; | 720 | int tmp; |
710 | 721 | ||
711 | asm volatile("cpuid" : "=a" (tmp) : "0" (1) | 722 | #if defined(CONFIG_M386) || defined(CONFIG_M486) |
712 | : "ebx", "ecx", "edx", "memory"); | 723 | if (boot_cpu_data.x86 < 5) |
724 | /* There is no speculative execution. | ||
725 | * jmp is a barrier to prefetching. */ | ||
726 | asm volatile("jmp 1f\n1:\n" ::: "memory"); | ||
727 | else | ||
728 | #endif | ||
729 | /* cpuid is a barrier to speculative execution. | ||
730 | * Prefetched instructions are automatically | ||
731 | * invalidated when modified. */ | ||
732 | asm volatile("cpuid" : "=a" (tmp) : "0" (1) | ||
733 | : "ebx", "ecx", "edx", "memory"); | ||
713 | } | 734 | } |
714 | 735 | ||
715 | static inline void __monitor(const void *eax, unsigned long ecx, | 736 | static inline void __monitor(const void *eax, unsigned long ecx, |
@@ -1000,4 +1021,35 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, | |||
1000 | extern int get_tsc_mode(unsigned long adr); | 1021 | extern int get_tsc_mode(unsigned long adr); |
1001 | extern int set_tsc_mode(unsigned int val); | 1022 | extern int set_tsc_mode(unsigned int val); |
1002 | 1023 | ||
1024 | extern int amd_get_nb_id(int cpu); | ||
1025 | |||
1026 | struct aperfmperf { | ||
1027 | u64 aperf, mperf; | ||
1028 | }; | ||
1029 | |||
1030 | static inline void get_aperfmperf(struct aperfmperf *am) | ||
1031 | { | ||
1032 | WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); | ||
1033 | |||
1034 | rdmsrl(MSR_IA32_APERF, am->aperf); | ||
1035 | rdmsrl(MSR_IA32_MPERF, am->mperf); | ||
1036 | } | ||
1037 | |||
1038 | #define APERFMPERF_SHIFT 10 | ||
1039 | |||
1040 | static inline | ||
1041 | unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, | ||
1042 | struct aperfmperf *new) | ||
1043 | { | ||
1044 | u64 aperf = new->aperf - old->aperf; | ||
1045 | u64 mperf = new->mperf - old->mperf; | ||
1046 | unsigned long ratio = aperf; | ||
1047 | |||
1048 | mperf >>= APERFMPERF_SHIFT; | ||
1049 | if (mperf) | ||
1050 | ratio = div64_u64(aperf, mperf); | ||
1051 | |||
1052 | return ratio; | ||
1053 | } | ||
1054 | |||
1003 | #endif /* _ASM_X86_PROCESSOR_H */ | 1055 | #endif /* _ASM_X86_PROCESSOR_H */ |
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 263d397d2ee..75af592677e 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h | |||
@@ -1,33 +1,8 @@ | |||
1 | #ifndef _ASM_X86_SCATTERLIST_H | 1 | #ifndef _ASM_X86_SCATTERLIST_H |
2 | #define _ASM_X86_SCATTERLIST_H | 2 | #define _ASM_X86_SCATTERLIST_H |
3 | 3 | ||
4 | #include <asm/types.h> | ||
5 | |||
6 | struct scatterlist { | ||
7 | #ifdef CONFIG_DEBUG_SG | ||
8 | unsigned long sg_magic; | ||
9 | #endif | ||
10 | unsigned long page_link; | ||
11 | unsigned int offset; | ||
12 | unsigned int length; | ||
13 | dma_addr_t dma_address; | ||
14 | unsigned int dma_length; | ||
15 | }; | ||
16 | |||
17 | #define ARCH_HAS_SG_CHAIN | ||
18 | #define ISA_DMA_THRESHOLD (0x00ffffff) | 4 | #define ISA_DMA_THRESHOLD (0x00ffffff) |
19 | 5 | ||
20 | /* | 6 | #include <asm-generic/scatterlist.h> |
21 | * These macros should be used after a pci_map_sg call has been done | ||
22 | * to get bus addresses of each of the SG entries and their lengths. | ||
23 | * You should only work with the number of sg entries pci_map_sg | ||
24 | * returns. | ||
25 | */ | ||
26 | #define sg_dma_address(sg) ((sg)->dma_address) | ||
27 | #ifdef CONFIG_X86_32 | ||
28 | # define sg_dma_len(sg) ((sg)->length) | ||
29 | #else | ||
30 | # define sg_dma_len(sg) ((sg)->dma_length) | ||
31 | #endif | ||
32 | 7 | ||
33 | #endif /* _ASM_X86_SCATTERLIST_H */ | 8 | #endif /* _ASM_X86_SCATTERLIST_H */ |
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 4093d1ed6db..18e496c98ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h | |||
@@ -5,43 +5,6 @@ | |||
5 | 5 | ||
6 | #define COMMAND_LINE_SIZE 2048 | 6 | #define COMMAND_LINE_SIZE 2048 |
7 | 7 | ||
8 | #ifndef __ASSEMBLY__ | ||
9 | |||
10 | /* | ||
11 | * Any setup quirks to be performed? | ||
12 | */ | ||
13 | struct mpc_cpu; | ||
14 | struct mpc_bus; | ||
15 | struct mpc_oemtable; | ||
16 | |||
17 | struct x86_quirks { | ||
18 | int (*arch_pre_time_init)(void); | ||
19 | int (*arch_time_init)(void); | ||
20 | int (*arch_pre_intr_init)(void); | ||
21 | int (*arch_intr_init)(void); | ||
22 | int (*arch_trap_init)(void); | ||
23 | char * (*arch_memory_setup)(void); | ||
24 | int (*mach_get_smp_config)(unsigned int early); | ||
25 | int (*mach_find_smp_config)(unsigned int reserve); | ||
26 | |||
27 | int *mpc_record; | ||
28 | int (*mpc_apic_id)(struct mpc_cpu *m); | ||
29 | void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); | ||
30 | void (*mpc_oem_pci_bus)(struct mpc_bus *m); | ||
31 | void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, | ||
32 | unsigned short oemsize); | ||
33 | int (*setup_ioapic_ids)(void); | ||
34 | }; | ||
35 | |||
36 | extern void x86_quirk_intr_init(void); | ||
37 | |||
38 | extern void x86_quirk_trap_init(void); | ||
39 | |||
40 | extern void x86_quirk_pre_time_init(void); | ||
41 | extern void x86_quirk_time_init(void); | ||
42 | |||
43 | #endif /* __ASSEMBLY__ */ | ||
44 | |||
45 | #ifdef __i386__ | 8 | #ifdef __i386__ |
46 | 9 | ||
47 | #include <linux/pfn.h> | 10 | #include <linux/pfn.h> |
@@ -61,6 +24,7 @@ extern void x86_quirk_time_init(void); | |||
61 | 24 | ||
62 | #ifndef __ASSEMBLY__ | 25 | #ifndef __ASSEMBLY__ |
63 | #include <asm/bootparam.h> | 26 | #include <asm/bootparam.h> |
27 | #include <asm/x86_init.h> | ||
64 | 28 | ||
65 | /* Interrupt control for vSMPowered x86_64 systems */ | 29 | /* Interrupt control for vSMPowered x86_64 systems */ |
66 | #ifdef CONFIG_X86_64 | 30 | #ifdef CONFIG_X86_64 |
@@ -79,11 +43,16 @@ static inline void visws_early_detect(void) { } | |||
79 | static inline int is_visws_box(void) { return 0; } | 43 | static inline int is_visws_box(void) { return 0; } |
80 | #endif | 44 | #endif |
81 | 45 | ||
82 | extern struct x86_quirks *x86_quirks; | ||
83 | extern unsigned long saved_video_mode; | 46 | extern unsigned long saved_video_mode; |
84 | 47 | ||
85 | #ifndef CONFIG_PARAVIRT | 48 | extern void reserve_standard_io_resources(void); |
86 | #define paravirt_post_allocator_init() do {} while (0) | 49 | extern void i386_reserve_resources(void); |
50 | extern void setup_default_timer_irq(void); | ||
51 | |||
52 | #ifdef CONFIG_X86_MRST | ||
53 | extern void x86_mrst_early_setup(void); | ||
54 | #else | ||
55 | static inline void x86_mrst_early_setup(void) { } | ||
87 | #endif | 56 | #endif |
88 | 57 | ||
89 | #ifndef _SETUP | 58 | #ifndef _SETUP |
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index b51413b7497..83c05fc2de3 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h | |||
@@ -1,51 +1 @@ | |||
1 | #ifndef _ASM_X86_SHMBUF_H | #include <asm-generic/shmbuf.h> | |
2 | #define _ASM_X86_SHMBUF_H | ||
3 | |||
4 | /* | ||
5 | * The shmid64_ds structure for x86 architecture. | ||
6 | * Note extra padding because this structure is passed back and forth | ||
7 | * between kernel and user space. | ||
8 | * | ||
9 | * Pad space on 32 bit is left for: | ||
10 | * - 64-bit time_t to solve y2038 problem | ||
11 | * - 2 miscellaneous 32-bit values | ||
12 | * | ||
13 | * Pad space on 64 bit is left for: | ||
14 | * - 2 miscellaneous 64-bit values | ||
15 | */ | ||
16 | |||
17 | struct shmid64_ds { | ||
18 | struct ipc64_perm shm_perm; /* operation perms */ | ||
19 | size_t shm_segsz; /* size of segment (bytes) */ | ||
20 | __kernel_time_t shm_atime; /* last attach time */ | ||
21 | #ifdef __i386__ | ||
22 | unsigned long __unused1; | ||
23 | #endif | ||
24 | __kernel_time_t shm_dtime; /* last detach time */ | ||
25 | #ifdef __i386__ | ||
26 | unsigned long __unused2; | ||
27 | #endif | ||
28 | __kernel_time_t shm_ctime; /* last change time */ | ||
29 | #ifdef __i386__ | ||
30 | unsigned long __unused3; | ||
31 | #endif | ||
32 | __kernel_pid_t shm_cpid; /* pid of creator */ | ||
33 | __kernel_pid_t shm_lpid; /* pid of last operator */ | ||
34 | unsigned long shm_nattch; /* no. of current attaches */ | ||
35 | unsigned long __unused4; | ||
36 | unsigned long __unused5; | ||
37 | }; | ||
38 | |||
39 | struct shminfo64 { | ||
40 | unsigned long shmmax; | ||
41 | unsigned long shmmin; | ||
42 | unsigned long shmmni; | ||
43 | unsigned long shmseg; | ||
44 | unsigned long shmall; | ||
45 | unsigned long __unused1; | ||
46 | unsigned long __unused2; | ||
47 | unsigned long __unused3; | ||
48 | unsigned long __unused4; | ||
49 | }; | ||
50 | |||
51 | #endif /* _ASM_X86_SHMBUF_H */ | ||
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 6a84ed166ae..1e796782cd7 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h | |||
@@ -121,7 +121,6 @@ static inline void arch_send_call_function_single_ipi(int cpu) | |||
121 | smp_ops.send_call_func_single_ipi(cpu); | 121 | smp_ops.send_call_func_single_ipi(cpu); |
122 | } | 122 | } |
123 | 123 | ||
124 | #define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask | ||
125 | static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) | 124 | static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) |
126 | { | 125 | { |
127 | smp_ops.send_call_func_ipi(mask); | 126 | smp_ops.send_call_func_ipi(mask); |
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index ca8bf2cd0ba..6b71384b9d8 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h | |||
@@ -1,60 +1 @@ | |||
1 | #ifndef _ASM_X86_SOCKET_H | #include <asm-generic/socket.h> | |
2 | #define _ASM_X86_SOCKET_H | ||
3 | |||
4 | #include <asm/sockios.h> | ||
5 | |||
6 | /* For setsockopt(2) */ | ||
7 | #define SOL_SOCKET 1 | ||
8 | |||
9 | #define SO_DEBUG 1 | ||
10 | #define SO_REUSEADDR 2 | ||
11 | #define SO_TYPE 3 | ||
12 | #define SO_ERROR 4 | ||
13 | #define SO_DONTROUTE 5 | ||
14 | #define SO_BROADCAST 6 | ||
15 | #define SO_SNDBUF 7 | ||
16 | #define SO_RCVBUF 8 | ||
17 | #define SO_SNDBUFFORCE 32 | ||
18 | #define SO_RCVBUFFORCE 33 | ||
19 | #define SO_KEEPALIVE 9 | ||
20 | #define SO_OOBINLINE 10 | ||
21 | #define SO_NO_CHECK 11 | ||
22 | #define SO_PRIORITY 12 | ||
23 | #define SO_LINGER 13 | ||
24 | #define SO_BSDCOMPAT 14 | ||
25 | /* To add :#define SO_REUSEPORT 15 */ | ||
26 | #define SO_PASSCRED 16 | ||
27 | #define SO_PEERCRED 17 | ||
28 | #define SO_RCVLOWAT 18 | ||
29 | #define SO_SNDLOWAT 19 | ||
30 | #define SO_RCVTIMEO 20 | ||
31 | #define SO_SNDTIMEO 21 | ||
32 | |||
33 | /* Security levels - as per NRL IPv6 - don't actually do anything */ | ||
34 | #define SO_SECURITY_AUTHENTICATION 22 | ||
35 | #define SO_SECURITY_ENCRYPTION_TRANSPORT 23 | ||
36 | #define SO_SECURITY_ENCRYPTION_NETWORK 24 | ||
37 | |||
38 | #define SO_BINDTODEVICE 25 | ||
39 | |||
40 | /* Socket filtering */ | ||
41 | #define SO_ATTACH_FILTER 26 | ||
42 | #define SO_DETACH_FILTER 27 | ||
43 | |||
44 | #define SO_PEERNAME 28 | ||
45 | #define SO_TIMESTAMP 29 | ||
46 | #define SCM_TIMESTAMP SO_TIMESTAMP | ||
47 | |||
48 | #define SO_ACCEPTCONN 30 | ||
49 | |||
50 | #define SO_PEERSEC 31 | ||
51 | #define SO_PASSSEC 34 | ||
52 | #define SO_TIMESTAMPNS 35 | ||
53 | #define SCM_TIMESTAMPNS SO_TIMESTAMPNS | ||
54 | |||
55 | #define SO_MARK 36 | ||
56 | |||
57 | #define SO_TIMESTAMPING 37 | ||
58 | #define SCM_TIMESTAMPING SO_TIMESTAMPING | ||
59 | |||
60 | #endif /* _ASM_X86_SOCKET_H */ | ||
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 49cc72b5d3c..def6d4746ee 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h | |||
@@ -1,13 +1 @@ | |||
1 | #ifndef _ASM_X86_SOCKIOS_H | #include <asm-generic/sockios.h> | |
2 | #define _ASM_X86_SOCKIOS_H | ||
3 | |||
4 | /* Socket-level I/O control calls. */ | ||
5 | #define FIOSETOWN 0x8901 | ||
6 | #define SIOCSPGRP 0x8902 | ||
7 | #define FIOGETOWN 0x8903 | ||
8 | #define SIOCGPGRP 0x8904 | ||
9 | #define SIOCATMARK 0x8905 | ||
10 | #define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ | ||
11 | #define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ | ||
12 | |||
13 | #endif /* _ASM_X86_SOCKIOS_H */ | ||
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2d742c6e15..15751776356 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h | |||
@@ -48,7 +48,7 @@ | |||
48 | * head_32 for boot CPU and setup_per_cpu_areas() for others. | 48 | * head_32 for boot CPU and setup_per_cpu_areas() for others. |
49 | */ | 49 | */ |
50 | #define GDT_STACK_CANARY_INIT \ | 50 | #define GDT_STACK_CANARY_INIT \ |
51 | [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, | 51 | [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18), |
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Initialize the stackprotector canary value. | 54 | * Initialize the stackprotector canary value. |
@@ -78,21 +78,19 @@ static __always_inline void boot_init_stack_canary(void) | |||
78 | #ifdef CONFIG_X86_64 | 78 | #ifdef CONFIG_X86_64 |
79 | percpu_write(irq_stack_union.stack_canary, canary); | 79 | percpu_write(irq_stack_union.stack_canary, canary); |
80 | #else | 80 | #else |
81 | percpu_write(stack_canary, canary); | 81 | percpu_write(stack_canary.canary, canary); |
82 | #endif | 82 | #endif |
83 | } | 83 | } |
84 | 84 | ||
85 | static inline void setup_stack_canary_segment(int cpu) | 85 | static inline void setup_stack_canary_segment(int cpu) |
86 | { | 86 | { |
87 | #ifdef CONFIG_X86_32 | 87 | #ifdef CONFIG_X86_32 |
88 | unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20; | 88 | unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); |
89 | struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); | 89 | struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); |
90 | struct desc_struct desc; | 90 | struct desc_struct desc; |
91 | 91 | ||
92 | desc = gdt_table[GDT_ENTRY_STACK_CANARY]; | 92 | desc = gdt_table[GDT_ENTRY_STACK_CANARY]; |
93 | desc.base0 = canary & 0xffff; | 93 | set_desc_base(&desc, canary); |
94 | desc.base1 = (canary >> 16) & 0xff; | ||
95 | desc.base2 = (canary >> 24) & 0xff; | ||
96 | write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); | 94 | write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); |
97 | #endif | 95 | #endif |
98 | } | 96 | } |
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index c86f452256d..ae907e61718 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h | |||
@@ -65,7 +65,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, | |||
65 | case 4: | 65 | case 4: |
66 | *(int *)to = *(int *)from; | 66 | *(int *)to = *(int *)from; |
67 | return to; | 67 | return to; |
68 | |||
69 | case 3: | 68 | case 3: |
70 | *(short *)to = *(short *)from; | 69 | *(short *)to = *(short *)from; |
71 | *((char *)to + 2) = *((char *)from + 2); | 70 | *((char *)to + 2) = *((char *)from + 2); |
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d82f39bb790..8d33bc5462d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Access to user system call parameters and results | 2 | * Access to user system call parameters and results |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. |
5 | * | 5 | * |
6 | * This copyrighted material is made available to anyone wishing to use, | 6 | * This copyrighted material is made available to anyone wishing to use, |
7 | * modify, copy, or redistribute it subject to the terms and conditions | 7 | * modify, copy, or redistribute it subject to the terms and conditions |
@@ -16,13 +16,13 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/err.h> | 17 | #include <linux/err.h> |
18 | 18 | ||
19 | static inline long syscall_get_nr(struct task_struct *task, | 19 | /* |
20 | struct pt_regs *regs) | 20 | * Only the low 32 bits of orig_ax are meaningful, so we return int. |
21 | * This importantly ignores the high bits on 64-bit, so comparisons | ||
22 | * sign-extend the low 32 bits. | ||
23 | */ | ||
24 | static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) | ||
21 | { | 25 | { |
22 | /* | ||
23 | * We always sign-extend a -1 value being set here, | ||
24 | * so this is always either -1L or a syscall number. | ||
25 | */ | ||
26 | return regs->orig_ax; | 26 | return regs->orig_ax; |
27 | } | 27 | } |
28 | 28 | ||
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 643c59b4bc6..f08f9737489 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h | |||
@@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | |||
31 | "movl %P[task_canary](%[next]), %%ebx\n\t" \ | 31 | "movl %P[task_canary](%[next]), %%ebx\n\t" \ |
32 | "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" | 32 | "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" |
33 | #define __switch_canary_oparam \ | 33 | #define __switch_canary_oparam \ |
34 | , [stack_canary] "=m" (per_cpu_var(stack_canary)) | 34 | , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) |
35 | #define __switch_canary_iparam \ | 35 | #define __switch_canary_iparam \ |
36 | , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) | 36 | , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) |
37 | #else /* CC_STACKPROTECTOR */ | 37 | #else /* CC_STACKPROTECTOR */ |
@@ -150,33 +150,6 @@ do { \ | |||
150 | #endif | 150 | #endif |
151 | 151 | ||
152 | #ifdef __KERNEL__ | 152 | #ifdef __KERNEL__ |
153 | #define _set_base(addr, base) do { unsigned long __pr; \ | ||
154 | __asm__ __volatile__ ("movw %%dx,%1\n\t" \ | ||
155 | "rorl $16,%%edx\n\t" \ | ||
156 | "movb %%dl,%2\n\t" \ | ||
157 | "movb %%dh,%3" \ | ||
158 | :"=&d" (__pr) \ | ||
159 | :"m" (*((addr)+2)), \ | ||
160 | "m" (*((addr)+4)), \ | ||
161 | "m" (*((addr)+7)), \ | ||
162 | "0" (base) \ | ||
163 | ); } while (0) | ||
164 | |||
165 | #define _set_limit(addr, limit) do { unsigned long __lr; \ | ||
166 | __asm__ __volatile__ ("movw %%dx,%1\n\t" \ | ||
167 | "rorl $16,%%edx\n\t" \ | ||
168 | "movb %2,%%dh\n\t" \ | ||
169 | "andb $0xf0,%%dh\n\t" \ | ||
170 | "orb %%dh,%%dl\n\t" \ | ||
171 | "movb %%dl,%2" \ | ||
172 | :"=&d" (__lr) \ | ||
173 | :"m" (*(addr)), \ | ||
174 | "m" (*((addr)+6)), \ | ||
175 | "0" (limit) \ | ||
176 | ); } while (0) | ||
177 | |||
178 | #define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) | ||
179 | #define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) | ||
180 | 153 | ||
181 | extern void native_load_gs_index(unsigned); | 154 | extern void native_load_gs_index(unsigned); |
182 | 155 | ||
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index af1b70ea440..3935b106de7 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h | |||
@@ -1,198 +1 @@ | |||
1 | #ifndef _ASM_X86_TERMBITS_H | #include <asm-generic/termbits.h> | |
2 | #define _ASM_X86_TERMBITS_H | ||
3 | |||
4 | #include <linux/posix_types.h> | ||
5 | |||
6 | typedef unsigned char cc_t; | ||
7 | typedef unsigned int speed_t; | ||
8 | typedef unsigned int tcflag_t; | ||
9 | |||
10 | #define NCCS 19 | ||
11 | struct termios { | ||
12 | tcflag_t c_iflag; /* input mode flags */ | ||
13 | tcflag_t c_oflag; /* output mode flags */ | ||
14 | tcflag_t c_cflag; /* control mode flags */ | ||
15 | tcflag_t c_lflag; /* local mode flags */ | ||
16 | cc_t c_line; /* line discipline */ | ||
17 | cc_t c_cc[NCCS]; /* control characters */ | ||
18 | }; | ||
19 | |||
20 | struct termios2 { | ||
21 | tcflag_t c_iflag; /* input mode flags */ | ||
22 | tcflag_t c_oflag; /* output mode flags */ | ||
23 | tcflag_t c_cflag; /* control mode flags */ | ||
24 | tcflag_t c_lflag; /* local mode flags */ | ||
25 | cc_t c_line; /* line discipline */ | ||
26 | cc_t c_cc[NCCS]; /* control characters */ | ||
27 | speed_t c_ispeed; /* input speed */ | ||
28 | speed_t c_ospeed; /* output speed */ | ||
29 | }; | ||
30 | |||
31 | struct ktermios { | ||
32 | tcflag_t c_iflag; /* input mode flags */ | ||
33 | tcflag_t c_oflag; /* output mode flags */ | ||
34 | tcflag_t c_cflag; /* control mode flags */ | ||
35 | tcflag_t c_lflag; /* local mode flags */ | ||
36 | cc_t c_line; /* line discipline */ | ||
37 | cc_t c_cc[NCCS]; /* control characters */ | ||
38 | speed_t c_ispeed; /* input speed */ | ||
39 | speed_t c_ospeed; /* output speed */ | ||
40 | }; | ||
41 | |||
42 | /* c_cc characters */ | ||
43 | #define VINTR 0 | ||
44 | #define VQUIT 1 | ||
45 | #define VERASE 2 | ||
46 | #define VKILL 3 | ||
47 | #define VEOF 4 | ||
48 | #define VTIME 5 | ||
49 | #define VMIN 6 | ||
50 | #define VSWTC 7 | ||
51 | #define VSTART 8 | ||
52 | #define VSTOP 9 | ||
53 | #define VSUSP 10 | ||
54 | #define VEOL 11 | ||
55 | #define VREPRINT 12 | ||
56 | #define VDISCARD 13 | ||
57 | #define VWERASE 14 | ||
58 | #define VLNEXT 15 | ||
59 | #define VEOL2 16 | ||
60 | |||
61 | /* c_iflag bits */ | ||
62 | #define IGNBRK 0000001 | ||
63 | #define BRKINT 0000002 | ||
64 | #define IGNPAR 0000004 | ||
65 | #define PARMRK 0000010 | ||
66 | #define INPCK 0000020 | ||
67 | #define ISTRIP 0000040 | ||
68 | #define INLCR 0000100 | ||
69 | #define IGNCR 0000200 | ||
70 | #define ICRNL 0000400 | ||
71 | #define IUCLC 0001000 | ||
72 | #define IXON 0002000 | ||
73 | #define IXANY 0004000 | ||
74 | #define IXOFF 0010000 | ||
75 | #define IMAXBEL 0020000 | ||
76 | #define IUTF8 0040000 | ||
77 | |||
78 | /* c_oflag bits */ | ||
79 | #define OPOST 0000001 | ||
80 | #define OLCUC 0000002 | ||
81 | #define ONLCR 0000004 | ||
82 | #define OCRNL 0000010 | ||
83 | #define ONOCR 0000020 | ||
84 | #define ONLRET 0000040 | ||
85 | #define OFILL 0000100 | ||
86 | #define OFDEL 0000200 | ||
87 | #define NLDLY 0000400 | ||
88 | #define NL0 0000000 | ||
89 | #define NL1 0000400 | ||
90 | #define CRDLY 0003000 | ||
91 | #define CR0 0000000 | ||
92 | #define CR1 0001000 | ||
93 | #define CR2 0002000 | ||
94 | #define CR3 0003000 | ||
95 | #define TABDLY 0014000 | ||
96 | #define TAB0 0000000 | ||
97 | #define TAB1 0004000 | ||
98 | #define TAB2 0010000 | ||
99 | #define TAB3 0014000 | ||
100 | #define XTABS 0014000 | ||
101 | #define BSDLY 0020000 | ||
102 | #define BS0 0000000 | ||
103 | #define BS1 0020000 | ||
104 | #define VTDLY 0040000 | ||
105 | #define VT0 0000000 | ||
106 | #define VT1 0040000 | ||
107 | #define FFDLY 0100000 | ||
108 | #define FF0 0000000 | ||
109 | #define FF1 0100000 | ||
110 | |||
111 | /* c_cflag bit meaning */ | ||
112 | #define CBAUD 0010017 | ||
113 | #define B0 0000000 /* hang up */ | ||
114 | #define B50 0000001 | ||
115 | #define B75 0000002 | ||
116 | #define B110 0000003 | ||
117 | #define B134 0000004 | ||
118 | #define B150 0000005 | ||
119 | #define B200 0000006 | ||
120 | #define B300 0000007 | ||
121 | #define B600 0000010 | ||
122 | #define B1200 0000011 | ||
123 | #define B1800 0000012 | ||
124 | #define B2400 0000013 | ||
125 | #define B4800 0000014 | ||
126 | #define B9600 0000015 | ||
127 | #define B19200 0000016 | ||
128 | #define B38400 0000017 | ||
129 | #define EXTA B19200 | ||
130 | #define EXTB B38400 | ||
131 | #define CSIZE 0000060 | ||
132 | #define CS5 0000000 | ||
133 | #define CS6 0000020 | ||
134 | #define CS7 0000040 | ||
135 | #define CS8 0000060 | ||
136 | #define CSTOPB 0000100 | ||
137 | #define CREAD 0000200 | ||
138 | #define PARENB 0000400 | ||
139 | #define PARODD 0001000 | ||
140 | #define HUPCL 0002000 | ||
141 | #define CLOCAL 0004000 | ||
142 | #define CBAUDEX 0010000 | ||
143 | #define BOTHER 0010000 /* non standard rate */ | ||
144 | #define B57600 0010001 | ||
145 | #define B115200 0010002 | ||
146 | #define B230400 0010003 | ||
147 | #define B460800 0010004 | ||
148 | #define B500000 0010005 | ||
149 | #define B576000 0010006 | ||
150 | #define B921600 0010007 | ||
151 | #define B1000000 0010010 | ||
152 | #define B1152000 0010011 | ||
153 | #define B1500000 0010012 | ||
154 | #define B2000000 0010013 | ||
155 | #define B2500000 0010014 | ||
156 | #define B3000000 0010015 | ||
157 | #define B3500000 0010016 | ||
158 | #define B4000000 0010017 | ||
159 | #define CIBAUD 002003600000 /* input baud rate */ | ||
160 | #define CMSPAR 010000000000 /* mark or space (stick) parity */ | ||
161 | #define CRTSCTS 020000000000 /* flow control */ | ||
162 | |||
163 | #define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ | ||
164 | |||
165 | /* c_lflag bits */ | ||
166 | #define ISIG 0000001 | ||
167 | #define ICANON 0000002 | ||
168 | #define XCASE 0000004 | ||
169 | #define ECHO 0000010 | ||
170 | #define ECHOE 0000020 | ||
171 | #define ECHOK 0000040 | ||
172 | #define ECHONL 0000100 | ||
173 | #define NOFLSH 0000200 | ||
174 | #define TOSTOP 0000400 | ||
175 | #define ECHOCTL 0001000 | ||
176 | #define ECHOPRT 0002000 | ||
177 | #define ECHOKE 0004000 | ||
178 | #define FLUSHO 0010000 | ||
179 | #define PENDIN 0040000 | ||
180 | #define IEXTEN 0100000 | ||
181 | |||
182 | /* tcflow() and TCXONC use these */ | ||
183 | #define TCOOFF 0 | ||
184 | #define TCOON 1 | ||
185 | #define TCIOFF 2 | ||
186 | #define TCION 3 | ||
187 | |||
188 | /* tcflush() and TCFLSH use these */ | ||
189 | #define TCIFLUSH 0 | ||
190 | #define TCOFLUSH 1 | ||
191 | #define TCIOFLUSH 2 | ||
192 | |||
193 | /* tcsetattr uses these */ | ||
194 | #define TCSANOW 0 | ||
195 | #define TCSADRAIN 1 | ||
196 | #define TCSAFLUSH 2 | ||
197 | |||
198 | #endif /* _ASM_X86_TERMBITS_H */ | ||
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index c4ee8056bac..280d78a9d96 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h | |||
@@ -1,114 +1 @@ | |||
1 | #ifndef _ASM_X86_TERMIOS_H | #include <asm-generic/termios.h> | |
2 | #define _ASM_X86_TERMIOS_H | ||
3 | |||
4 | #include <asm/termbits.h> | ||
5 | #include <asm/ioctls.h> | ||
6 | |||
7 | struct winsize { | ||
8 | unsigned short ws_row; | ||
9 | unsigned short ws_col; | ||
10 | unsigned short ws_xpixel; | ||
11 | unsigned short ws_ypixel; | ||
12 | }; | ||
13 | |||
14 | #define NCC 8 | ||
15 | struct termio { | ||
16 | unsigned short c_iflag; /* input mode flags */ | ||
17 | unsigned short c_oflag; /* output mode flags */ | ||
18 | unsigned short c_cflag; /* control mode flags */ | ||
19 | unsigned short c_lflag; /* local mode flags */ | ||
20 | unsigned char c_line; /* line discipline */ | ||
21 | unsigned char c_cc[NCC]; /* control characters */ | ||
22 | }; | ||
23 | |||
24 | /* modem lines */ | ||
25 | #define TIOCM_LE 0x001 | ||
26 | #define TIOCM_DTR 0x002 | ||
27 | #define TIOCM_RTS 0x004 | ||
28 | #define TIOCM_ST 0x008 | ||
29 | #define TIOCM_SR 0x010 | ||
30 | #define TIOCM_CTS 0x020 | ||
31 | #define TIOCM_CAR 0x040 | ||
32 | #define TIOCM_RNG 0x080 | ||
33 | #define TIOCM_DSR 0x100 | ||
34 | #define TIOCM_CD TIOCM_CAR | ||
35 | #define TIOCM_RI TIOCM_RNG | ||
36 | #define TIOCM_OUT1 0x2000 | ||
37 | #define TIOCM_OUT2 0x4000 | ||
38 | #define TIOCM_LOOP 0x8000 | ||
39 | |||
40 | /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ | ||
41 | |||
42 | #ifdef __KERNEL__ | ||
43 | |||
44 | #include <asm/uaccess.h> | ||
45 | |||
46 | /* intr=^C quit=^\ erase=del kill=^U | ||
47 | eof=^D vtime=\0 vmin=\1 sxtc=\0 | ||
48 | start=^Q stop=^S susp=^Z eol=\0 | ||
49 | reprint=^R discard=^U werase=^W lnext=^V | ||
50 | eol2=\0 | ||
51 | */ | ||
52 | #define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" | ||
53 | |||
54 | /* | ||
55 | * Translate a "termio" structure into a "termios". Ugh. | ||
56 | */ | ||
57 | #define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ | ||
58 | unsigned short __tmp; \ | ||
59 | get_user(__tmp,&(termio)->x); \ | ||
60 | *(unsigned short *) &(termios)->x = __tmp; \ | ||
61 | } | ||
62 | |||
63 | static inline int user_termio_to_kernel_termios(struct ktermios *termios, | ||
64 | struct termio __user *termio) | ||
65 | { | ||
66 | SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); | ||
67 | SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); | ||
68 | SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); | ||
69 | SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); | ||
70 | get_user(termios->c_line, &termio->c_line); | ||
71 | return copy_from_user(termios->c_cc, termio->c_cc, NCC); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Translate a "termios" structure into a "termio". Ugh. | ||
76 | */ | ||
77 | static inline int kernel_termios_to_user_termio(struct termio __user *termio, | ||
78 | struct ktermios *termios) | ||
79 | { | ||
80 | put_user((termios)->c_iflag, &(termio)->c_iflag); | ||
81 | put_user((termios)->c_oflag, &(termio)->c_oflag); | ||
82 | put_user((termios)->c_cflag, &(termio)->c_cflag); | ||
83 | put_user((termios)->c_lflag, &(termio)->c_lflag); | ||
84 | put_user((termios)->c_line, &(termio)->c_line); | ||
85 | return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); | ||
86 | } | ||
87 | |||
88 | static inline int user_termios_to_kernel_termios(struct ktermios *k, | ||
89 | struct termios2 __user *u) | ||
90 | { | ||
91 | return copy_from_user(k, u, sizeof(struct termios2)); | ||
92 | } | ||
93 | |||
94 | static inline int kernel_termios_to_user_termios(struct termios2 __user *u, | ||
95 | struct ktermios *k) | ||
96 | { | ||
97 | return copy_to_user(u, k, sizeof(struct termios2)); | ||
98 | } | ||
99 | |||
100 | static inline int user_termios_to_kernel_termios_1(struct ktermios *k, | ||
101 | struct termios __user *u) | ||
102 | { | ||
103 | return copy_from_user(k, u, sizeof(struct termios)); | ||
104 | } | ||
105 | |||
106 | static inline int kernel_termios_to_user_termios_1(struct termios __user *u, | ||
107 | struct ktermios *k) | ||
108 | { | ||
109 | return copy_to_user(u, k, sizeof(struct termios)); | ||
110 | } | ||
111 | |||
112 | #endif /* __KERNEL__ */ | ||
113 | |||
114 | #endif /* _ASM_X86_TERMIOS_H */ | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40b75f..d27d0a2fec4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -95,7 +95,7 @@ struct thread_info { | |||
95 | #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ | 95 | #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ |
96 | #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ | 96 | #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ |
97 | #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ | 97 | #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ |
98 | #define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ | 98 | #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ |
99 | 99 | ||
100 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | 100 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) |
101 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | 101 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) |
@@ -118,17 +118,17 @@ struct thread_info { | |||
118 | #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) | 118 | #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) |
119 | #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) | 119 | #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) |
120 | #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) | 120 | #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) |
121 | #define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) | 121 | #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) |
122 | 122 | ||
123 | /* work to do in syscall_trace_enter() */ | 123 | /* work to do in syscall_trace_enter() */ |
124 | #define _TIF_WORK_SYSCALL_ENTRY \ | 124 | #define _TIF_WORK_SYSCALL_ENTRY \ |
125 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ | 125 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ |
126 | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) | 126 | _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) |
127 | 127 | ||
128 | /* work to do in syscall_trace_leave() */ | 128 | /* work to do in syscall_trace_leave() */ |
129 | #define _TIF_WORK_SYSCALL_EXIT \ | 129 | #define _TIF_WORK_SYSCALL_EXIT \ |
130 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ | 130 | (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ |
131 | _TIF_SYSCALL_FTRACE) | 131 | _TIF_SYSCALL_TRACEPOINT) |
132 | 132 | ||
133 | /* work to do on interrupt/exception return */ | 133 | /* work to do on interrupt/exception return */ |
134 | #define _TIF_WORK_MASK \ | 134 | #define _TIF_WORK_MASK \ |
@@ -137,7 +137,8 @@ struct thread_info { | |||
137 | _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) | 137 | _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) |
138 | 138 | ||
139 | /* work to do on any return to user space */ | 139 | /* work to do on any return to user space */ |
140 | #define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) | 140 | #define _TIF_ALLWORK_MASK \ |
141 | ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) | ||
141 | 142 | ||
142 | /* Only used for 64 bit */ | 143 | /* Only used for 64 bit */ |
143 | #define _TIF_DO_NOTIFY_MASK \ | 144 | #define _TIF_DO_NOTIFY_MASK \ |
@@ -213,7 +214,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); | |||
213 | static inline struct thread_info *current_thread_info(void) | 214 | static inline struct thread_info *current_thread_info(void) |
214 | { | 215 | { |
215 | struct thread_info *ti; | 216 | struct thread_info *ti; |
216 | ti = (void *)(percpu_read(kernel_stack) + | 217 | ti = (void *)(percpu_read_stable(kernel_stack) + |
217 | KERNEL_STACK_OFFSET - THREAD_SIZE); | 218 | KERNEL_STACK_OFFSET - THREAD_SIZE); |
218 | return ti; | 219 | return ti; |
219 | } | 220 | } |
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 50c733aac42..7bdec4e9b73 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h | |||
@@ -4,60 +4,7 @@ | |||
4 | extern void hpet_time_init(void); | 4 | extern void hpet_time_init(void); |
5 | 5 | ||
6 | #include <asm/mc146818rtc.h> | 6 | #include <asm/mc146818rtc.h> |
7 | #ifdef CONFIG_X86_32 | ||
8 | #include <linux/efi.h> | ||
9 | |||
10 | static inline unsigned long native_get_wallclock(void) | ||
11 | { | ||
12 | unsigned long retval; | ||
13 | |||
14 | if (efi_enabled) | ||
15 | retval = efi_get_time(); | ||
16 | else | ||
17 | retval = mach_get_cmos_time(); | ||
18 | |||
19 | return retval; | ||
20 | } | ||
21 | |||
22 | static inline int native_set_wallclock(unsigned long nowtime) | ||
23 | { | ||
24 | int retval; | ||
25 | |||
26 | if (efi_enabled) | ||
27 | retval = efi_set_rtc_mmss(nowtime); | ||
28 | else | ||
29 | retval = mach_set_rtc_mmss(nowtime); | ||
30 | |||
31 | return retval; | ||
32 | } | ||
33 | |||
34 | #else | ||
35 | extern void native_time_init_hook(void); | ||
36 | |||
37 | static inline unsigned long native_get_wallclock(void) | ||
38 | { | ||
39 | return mach_get_cmos_time(); | ||
40 | } | ||
41 | |||
42 | static inline int native_set_wallclock(unsigned long nowtime) | ||
43 | { | ||
44 | return mach_set_rtc_mmss(nowtime); | ||
45 | } | ||
46 | |||
47 | #endif | ||
48 | 7 | ||
49 | extern void time_init(void); | 8 | extern void time_init(void); |
50 | 9 | ||
51 | #ifdef CONFIG_PARAVIRT | ||
52 | #include <asm/paravirt.h> | ||
53 | #else /* !CONFIG_PARAVIRT */ | ||
54 | |||
55 | #define get_wallclock() native_get_wallclock() | ||
56 | #define set_wallclock(x) native_set_wallclock(x) | ||
57 | #define choose_time_init() hpet_time_init | ||
58 | |||
59 | #endif /* CONFIG_PARAVIRT */ | ||
60 | |||
61 | extern unsigned long __init calibrate_cpu(void); | ||
62 | |||
63 | #endif /* _ASM_X86_TIME_H */ | 10 | #endif /* _ASM_X86_TIME_H */ |
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4d468..5469630b27f 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h | |||
@@ -8,20 +8,16 @@ | |||
8 | #define TICK_SIZE (tick_nsec / 1000) | 8 | #define TICK_SIZE (tick_nsec / 1000) |
9 | 9 | ||
10 | unsigned long long native_sched_clock(void); | 10 | unsigned long long native_sched_clock(void); |
11 | unsigned long native_calibrate_tsc(void); | 11 | extern int recalibrate_cpu_khz(void); |
12 | 12 | ||
13 | #ifdef CONFIG_X86_32 | 13 | #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) |
14 | extern int timer_ack; | 14 | extern int timer_ack; |
15 | extern irqreturn_t timer_interrupt(int irq, void *dev_id); | 15 | #else |
16 | #endif /* CONFIG_X86_32 */ | 16 | # define timer_ack (0) |
17 | extern int recalibrate_cpu_khz(void); | 17 | #endif |
18 | 18 | ||
19 | extern int no_timer_check; | 19 | extern int no_timer_check; |
20 | 20 | ||
21 | #ifndef CONFIG_PARAVIRT | ||
22 | #define calibrate_tsc() native_calibrate_tsc() | ||
23 | #endif | ||
24 | |||
25 | /* Accelerators for sched_clock() | 21 | /* Accelerators for sched_clock() |
26 | * convert from cycles(64bits) => nanoseconds (64bits) | 22 | * convert from cycles(64bits) => nanoseconds (64bits) |
27 | * basic equation: | 23 | * basic equation: |
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 066ef590d7e..25a92842dd9 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h | |||
@@ -116,38 +116,41 @@ extern unsigned long node_remap_size[]; | |||
116 | 116 | ||
117 | # define SD_CACHE_NICE_TRIES 1 | 117 | # define SD_CACHE_NICE_TRIES 1 |
118 | # define SD_IDLE_IDX 1 | 118 | # define SD_IDLE_IDX 1 |
119 | # define SD_NEWIDLE_IDX 2 | ||
120 | # define SD_FORKEXEC_IDX 0 | ||
121 | 119 | ||
122 | #else | 120 | #else |
123 | 121 | ||
124 | # define SD_CACHE_NICE_TRIES 2 | 122 | # define SD_CACHE_NICE_TRIES 2 |
125 | # define SD_IDLE_IDX 2 | 123 | # define SD_IDLE_IDX 2 |
126 | # define SD_NEWIDLE_IDX 2 | ||
127 | # define SD_FORKEXEC_IDX 1 | ||
128 | 124 | ||
129 | #endif | 125 | #endif |
130 | 126 | ||
131 | /* sched_domains SD_NODE_INIT for NUMA machines */ | 127 | /* sched_domains SD_NODE_INIT for NUMA machines */ |
132 | #define SD_NODE_INIT (struct sched_domain) { \ | 128 | #define SD_NODE_INIT (struct sched_domain) { \ |
133 | .min_interval = 8, \ | 129 | .min_interval = 8, \ |
134 | .max_interval = 32, \ | 130 | .max_interval = 32, \ |
135 | .busy_factor = 32, \ | 131 | .busy_factor = 32, \ |
136 | .imbalance_pct = 125, \ | 132 | .imbalance_pct = 125, \ |
137 | .cache_nice_tries = SD_CACHE_NICE_TRIES, \ | 133 | .cache_nice_tries = SD_CACHE_NICE_TRIES, \ |
138 | .busy_idx = 3, \ | 134 | .busy_idx = 3, \ |
139 | .idle_idx = SD_IDLE_IDX, \ | 135 | .idle_idx = SD_IDLE_IDX, \ |
140 | .newidle_idx = SD_NEWIDLE_IDX, \ | 136 | .newidle_idx = 0, \ |
141 | .wake_idx = 1, \ | 137 | .wake_idx = 0, \ |
142 | .forkexec_idx = SD_FORKEXEC_IDX, \ | 138 | .forkexec_idx = 0, \ |
143 | .flags = SD_LOAD_BALANCE \ | 139 | \ |
144 | | SD_BALANCE_EXEC \ | 140 | .flags = 1*SD_LOAD_BALANCE \ |
145 | | SD_BALANCE_FORK \ | 141 | | 1*SD_BALANCE_NEWIDLE \ |
146 | | SD_WAKE_AFFINE \ | 142 | | 1*SD_BALANCE_EXEC \ |
147 | | SD_WAKE_BALANCE \ | 143 | | 1*SD_BALANCE_FORK \ |
148 | | SD_SERIALIZE, \ | 144 | | 0*SD_BALANCE_WAKE \ |
149 | .last_balance = jiffies, \ | 145 | | 1*SD_WAKE_AFFINE \ |
150 | .balance_interval = 1, \ | 146 | | 0*SD_SHARE_CPUPOWER \ |
147 | | 0*SD_POWERSAVINGS_BALANCE \ | ||
148 | | 0*SD_SHARE_PKG_RESOURCES \ | ||
149 | | 1*SD_SERIALIZE \ | ||
150 | | 0*SD_PREFER_SIBLING \ | ||
151 | , \ | ||
152 | .last_balance = jiffies, \ | ||
153 | .balance_interval = 1, \ | ||
151 | } | 154 | } |
152 | 155 | ||
153 | #ifdef CONFIG_X86_64_ACPI_NUMA | 156 | #ifdef CONFIG_X86_64_ACPI_NUMA |
@@ -162,21 +165,11 @@ static inline int numa_node_id(void) | |||
162 | return 0; | 165 | return 0; |
163 | } | 166 | } |
164 | 167 | ||
165 | static inline int cpu_to_node(int cpu) | ||
166 | { | ||
167 | return 0; | ||
168 | } | ||
169 | |||
170 | static inline int early_cpu_to_node(int cpu) | 168 | static inline int early_cpu_to_node(int cpu) |
171 | { | 169 | { |
172 | return 0; | 170 | return 0; |
173 | } | 171 | } |
174 | 172 | ||
175 | static inline const struct cpumask *cpumask_of_node(int node) | ||
176 | { | ||
177 | return cpu_online_mask; | ||
178 | } | ||
179 | |||
180 | static inline void setup_node_to_cpumask_map(void) { } | 173 | static inline void setup_node_to_cpumask_map(void) { } |
181 | 174 | ||
182 | #endif | 175 | #endif |
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bfd74c032fc..4da91ad69e0 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h | |||
@@ -81,9 +81,7 @@ extern int panic_on_unrecovered_nmi; | |||
81 | 81 | ||
82 | void math_error(void __user *); | 82 | void math_error(void __user *); |
83 | void math_emulate(struct math_emu_info *); | 83 | void math_emulate(struct math_emu_info *); |
84 | #ifdef CONFIG_X86_32 | 84 | #ifndef CONFIG_X86_32 |
85 | unsigned long patch_espfix_desc(unsigned long, unsigned long); | ||
86 | #else | ||
87 | asmlinkage void smp_thermal_interrupt(void); | 85 | asmlinkage void smp_thermal_interrupt(void); |
88 | asmlinkage void mce_threshold_interrupt(void); | 86 | asmlinkage void mce_threshold_interrupt(void); |
89 | #endif | 87 | #endif |
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 38ae163cc91..c0427295e8f 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h | |||
@@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void) | |||
48 | extern void tsc_init(void); | 48 | extern void tsc_init(void); |
49 | extern void mark_tsc_unstable(char *reason); | 49 | extern void mark_tsc_unstable(char *reason); |
50 | extern int unsynchronized_tsc(void); | 50 | extern int unsynchronized_tsc(void); |
51 | int check_tsc_unstable(void); | 51 | extern int check_tsc_unstable(void); |
52 | extern unsigned long native_calibrate_tsc(void); | ||
52 | 53 | ||
53 | /* | 54 | /* |
54 | * Boot-time check whether the TSCs are synchronized across | 55 | * Boot-time check whether the TSCs are synchronized across |
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 09b97745772..df1da20f453 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h | |||
@@ -1,19 +1,11 @@ | |||
1 | #ifndef _ASM_X86_TYPES_H | 1 | #ifndef _ASM_X86_TYPES_H |
2 | #define _ASM_X86_TYPES_H | 2 | #define _ASM_X86_TYPES_H |
3 | 3 | ||
4 | #include <asm-generic/int-ll64.h> | 4 | #define dma_addr_t dma_addr_t |
5 | 5 | ||
6 | #ifndef __ASSEMBLY__ | 6 | #include <asm-generic/types.h> |
7 | |||
8 | typedef unsigned short umode_t; | ||
9 | 7 | ||
10 | #endif /* __ASSEMBLY__ */ | ||
11 | |||
12 | /* | ||
13 | * These aren't exported outside the kernel to avoid name space clashes | ||
14 | */ | ||
15 | #ifdef __KERNEL__ | 8 | #ifdef __KERNEL__ |
16 | |||
17 | #ifndef __ASSEMBLY__ | 9 | #ifndef __ASSEMBLY__ |
18 | 10 | ||
19 | typedef u64 dma64_addr_t; | 11 | typedef u64 dma64_addr_t; |
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 5e06259e90e..632fb44b4cb 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h | |||
@@ -33,7 +33,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero | |||
33 | * Copy data from kernel space to user space. Caller must check | 33 | * Copy data from kernel space to user space. Caller must check |
34 | * the specified block with access_ok() before calling this function. | 34 | * the specified block with access_ok() before calling this function. |
35 | * The caller should also make sure he pins the user space address | 35 | * The caller should also make sure he pins the user space address |
36 | * so that the we don't result in page fault and sleep. | 36 | * so that we don't result in page fault and sleep. |
37 | * | 37 | * |
38 | * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault | 38 | * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault |
39 | * we return the initial request size (1, 2 or 4), as copy_*_user should do. | 39 | * we return the initial request size (1, 2 or 4), as copy_*_user should do. |
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 87324cf439d..b7c29c8017f 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h | |||
@@ -7,12 +7,6 @@ | |||
7 | * sigcontext struct (uc_mcontext). | 7 | * sigcontext struct (uc_mcontext). |
8 | */ | 8 | */ |
9 | 9 | ||
10 | struct ucontext { | 10 | #include <asm-generic/ucontext.h> |
11 | unsigned long uc_flags; | ||
12 | struct ucontext *uc_link; | ||
13 | stack_t uc_stack; | ||
14 | struct sigcontext uc_mcontext; | ||
15 | sigset_t uc_sigmask; /* mask last for extensibility */ | ||
16 | }; | ||
17 | 11 | ||
18 | #endif /* _ASM_X86_UCONTEXT_H */ | 12 | #endif /* _ASM_X86_UCONTEXT_H */ |
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 732a3070615..6fb3c209a7e 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h | |||
@@ -341,10 +341,12 @@ | |||
341 | #define __NR_preadv 333 | 341 | #define __NR_preadv 333 |
342 | #define __NR_pwritev 334 | 342 | #define __NR_pwritev 334 |
343 | #define __NR_rt_tgsigqueueinfo 335 | 343 | #define __NR_rt_tgsigqueueinfo 335 |
344 | #define __NR_perf_counter_open 336 | 344 | #define __NR_perf_event_open 336 |
345 | 345 | ||
346 | #ifdef __KERNEL__ | 346 | #ifdef __KERNEL__ |
347 | 347 | ||
348 | #define NR_syscalls 337 | ||
349 | |||
348 | #define __ARCH_WANT_IPC_PARSE_VERSION | 350 | #define __ARCH_WANT_IPC_PARSE_VERSION |
349 | #define __ARCH_WANT_OLD_READDIR | 351 | #define __ARCH_WANT_OLD_READDIR |
350 | #define __ARCH_WANT_OLD_STAT | 352 | #define __ARCH_WANT_OLD_STAT |
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 900e1617e67..8d3ad0adbc6 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h | |||
@@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv) | |||
659 | __SYSCALL(__NR_pwritev, sys_pwritev) | 659 | __SYSCALL(__NR_pwritev, sys_pwritev) |
660 | #define __NR_rt_tgsigqueueinfo 297 | 660 | #define __NR_rt_tgsigqueueinfo 297 |
661 | __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) | 661 | __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) |
662 | #define __NR_perf_counter_open 298 | 662 | #define __NR_perf_event_open 298 |
663 | __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) | 663 | __SYSCALL(__NR_perf_event_open, sys_perf_event_open) |
664 | 664 | ||
665 | #ifndef __NO_STUBS | 665 | #ifndef __NO_STUBS |
666 | #define __ARCH_WANT_OLD_READDIR | 666 | #define __ARCH_WANT_OLD_READDIR |
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) | |||
688 | #endif /* __NO_STUBS */ | 688 | #endif /* __NO_STUBS */ |
689 | 689 | ||
690 | #ifdef __KERNEL__ | 690 | #ifdef __KERNEL__ |
691 | |||
692 | #ifndef COMPILE_OFFSETS | ||
693 | #include <asm/asm-offsets.h> | ||
694 | #define NR_syscalls (__NR_syscall_max + 1) | ||
695 | #endif | ||
696 | |||
691 | /* | 697 | /* |
692 | * "Conditional" syscalls | 698 | * "Conditional" syscalls |
693 | * | 699 | * |
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index bddd44f2f0a..80e2984f521 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h | |||
@@ -133,7 +133,7 @@ struct bau_msg_payload { | |||
133 | * see table 4.2.3.0.1 in broacast_assist spec. | 133 | * see table 4.2.3.0.1 in broacast_assist spec. |
134 | */ | 134 | */ |
135 | struct bau_msg_header { | 135 | struct bau_msg_header { |
136 | unsigned int dest_subnodeid:6; /* must be zero */ | 136 | unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ |
137 | /* bits 5:0 */ | 137 | /* bits 5:0 */ |
138 | unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ | 138 | unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ |
139 | /* bits 20:6 */ /* first bit in node_map */ | 139 | /* bits 20:6 */ /* first bit in node_map */ |
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 77a68505419..04eb6c958b9 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/numa.h> | 15 | #include <linux/numa.h> |
16 | #include <linux/percpu.h> | 16 | #include <linux/percpu.h> |
17 | #include <linux/timer.h> | 17 | #include <linux/timer.h> |
18 | #include <linux/io.h> | ||
18 | #include <asm/types.h> | 19 | #include <asm/types.h> |
19 | #include <asm/percpu.h> | 20 | #include <asm/percpu.h> |
20 | #include <asm/uv/uv_mmrs.h> | 21 | #include <asm/uv/uv_mmrs.h> |
@@ -258,13 +259,13 @@ static inline unsigned long *uv_global_mmr32_address(int pnode, | |||
258 | static inline void uv_write_global_mmr32(int pnode, unsigned long offset, | 259 | static inline void uv_write_global_mmr32(int pnode, unsigned long offset, |
259 | unsigned long val) | 260 | unsigned long val) |
260 | { | 261 | { |
261 | *uv_global_mmr32_address(pnode, offset) = val; | 262 | writeq(val, uv_global_mmr32_address(pnode, offset)); |
262 | } | 263 | } |
263 | 264 | ||
264 | static inline unsigned long uv_read_global_mmr32(int pnode, | 265 | static inline unsigned long uv_read_global_mmr32(int pnode, |
265 | unsigned long offset) | 266 | unsigned long offset) |
266 | { | 267 | { |
267 | return *uv_global_mmr32_address(pnode, offset); | 268 | return readq(uv_global_mmr32_address(pnode, offset)); |
268 | } | 269 | } |
269 | 270 | ||
270 | /* | 271 | /* |
@@ -281,13 +282,13 @@ static inline unsigned long *uv_global_mmr64_address(int pnode, | |||
281 | static inline void uv_write_global_mmr64(int pnode, unsigned long offset, | 282 | static inline void uv_write_global_mmr64(int pnode, unsigned long offset, |
282 | unsigned long val) | 283 | unsigned long val) |
283 | { | 284 | { |
284 | *uv_global_mmr64_address(pnode, offset) = val; | 285 | writeq(val, uv_global_mmr64_address(pnode, offset)); |
285 | } | 286 | } |
286 | 287 | ||
287 | static inline unsigned long uv_read_global_mmr64(int pnode, | 288 | static inline unsigned long uv_read_global_mmr64(int pnode, |
288 | unsigned long offset) | 289 | unsigned long offset) |
289 | { | 290 | { |
290 | return *uv_global_mmr64_address(pnode, offset); | 291 | return readq(uv_global_mmr64_address(pnode, offset)); |
291 | } | 292 | } |
292 | 293 | ||
293 | /* | 294 | /* |
@@ -301,22 +302,22 @@ static inline unsigned long *uv_local_mmr_address(unsigned long offset) | |||
301 | 302 | ||
302 | static inline unsigned long uv_read_local_mmr(unsigned long offset) | 303 | static inline unsigned long uv_read_local_mmr(unsigned long offset) |
303 | { | 304 | { |
304 | return *uv_local_mmr_address(offset); | 305 | return readq(uv_local_mmr_address(offset)); |
305 | } | 306 | } |
306 | 307 | ||
307 | static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) | 308 | static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) |
308 | { | 309 | { |
309 | *uv_local_mmr_address(offset) = val; | 310 | writeq(val, uv_local_mmr_address(offset)); |
310 | } | 311 | } |
311 | 312 | ||
312 | static inline unsigned char uv_read_local_mmr8(unsigned long offset) | 313 | static inline unsigned char uv_read_local_mmr8(unsigned long offset) |
313 | { | 314 | { |
314 | return *((unsigned char *)uv_local_mmr_address(offset)); | 315 | return readb(uv_local_mmr_address(offset)); |
315 | } | 316 | } |
316 | 317 | ||
317 | static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) | 318 | static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) |
318 | { | 319 | { |
319 | *((unsigned char *)uv_local_mmr_address(offset)) = val; | 320 | writeb(val, uv_local_mmr_address(offset)); |
320 | } | 321 | } |
321 | 322 | ||
322 | /* | 323 | /* |
@@ -422,7 +423,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) | |||
422 | unsigned long val; | 423 | unsigned long val; |
423 | 424 | ||
424 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | | 425 | val = (1UL << UVH_IPI_INT_SEND_SHFT) | |
425 | ((apicid & 0x3f) << UVH_IPI_INT_APIC_ID_SHFT) | | 426 | ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | |
426 | (vector << UVH_IPI_INT_VECTOR_SHFT); | 427 | (vector << UVH_IPI_INT_VECTOR_SHFT); |
427 | uv_write_global_mmr64(pnode, UVH_IPI_INT, val); | 428 | uv_write_global_mmr64(pnode, UVH_IPI_INT, val); |
428 | } | 429 | } |
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index c11b7e100d8..e49ed6d2fd4 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h | |||
@@ -20,7 +20,7 @@ | |||
20 | #ifndef ASM_X86__VMWARE_H | 20 | #ifndef ASM_X86__VMWARE_H |
21 | #define ASM_X86__VMWARE_H | 21 | #define ASM_X86__VMWARE_H |
22 | 22 | ||
23 | extern unsigned long vmware_get_tsc_khz(void); | 23 | extern void vmware_platform_setup(void); |
24 | extern int vmware_platform(void); | 24 | extern int vmware_platform(void); |
25 | extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); | 25 | extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); |
26 | 26 | ||
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 11be5ad2e0e..272514c2d45 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h | |||
@@ -55,6 +55,7 @@ | |||
55 | #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 | 55 | #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 |
56 | #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 | 56 | #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 |
57 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | 57 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 |
58 | #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 | ||
58 | 59 | ||
59 | 60 | ||
60 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 61 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
@@ -351,9 +352,16 @@ enum vmcs_field { | |||
351 | #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 | 352 | #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 |
352 | #define VMX_EPT_EXTENT_CONTEXT 1 | 353 | #define VMX_EPT_EXTENT_CONTEXT 1 |
353 | #define VMX_EPT_EXTENT_GLOBAL 2 | 354 | #define VMX_EPT_EXTENT_GLOBAL 2 |
355 | |||
356 | #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) | ||
357 | #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) | ||
358 | #define VMX_EPTP_UC_BIT (1ull << 8) | ||
359 | #define VMX_EPTP_WB_BIT (1ull << 14) | ||
360 | #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) | ||
354 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) | 361 | #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) |
355 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) | 362 | #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) |
356 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) | 363 | #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) |
364 | |||
357 | #define VMX_EPT_DEFAULT_GAW 3 | 365 | #define VMX_EPT_DEFAULT_GAW 3 |
358 | #define VMX_EPT_MAX_GAW 0x4 | 366 | #define VMX_EPT_MAX_GAW 0x4 |
359 | #define VMX_EPT_MT_EPTE_SHIFT 3 | 367 | #define VMX_EPT_MT_EPTE_SHIFT 3 |
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h new file mode 100644 index 00000000000..2c756fd4ab0 --- /dev/null +++ b/arch/x86/include/asm/x86_init.h | |||
@@ -0,0 +1,133 @@ | |||
1 | #ifndef _ASM_X86_PLATFORM_H | ||
2 | #define _ASM_X86_PLATFORM_H | ||
3 | |||
4 | #include <asm/pgtable_types.h> | ||
5 | #include <asm/bootparam.h> | ||
6 | |||
7 | struct mpc_bus; | ||
8 | struct mpc_cpu; | ||
9 | struct mpc_table; | ||
10 | |||
11 | /** | ||
12 | * struct x86_init_mpparse - platform specific mpparse ops | ||
13 | * @mpc_record: platform specific mpc record accounting | ||
14 | * @setup_ioapic_ids: platform specific ioapic id override | ||
15 | * @mpc_apic_id: platform specific mpc apic id assignment | ||
16 | * @smp_read_mpc_oem: platform specific oem mpc table setup | ||
17 | * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) | ||
18 | * @mpc_oem_bus_info: platform specific mpc bus info | ||
19 | * @find_smp_config: find the smp configuration | ||
20 | * @get_smp_config: get the smp configuration | ||
21 | */ | ||
22 | struct x86_init_mpparse { | ||
23 | void (*mpc_record)(unsigned int mode); | ||
24 | void (*setup_ioapic_ids)(void); | ||
25 | int (*mpc_apic_id)(struct mpc_cpu *m); | ||
26 | void (*smp_read_mpc_oem)(struct mpc_table *mpc); | ||
27 | void (*mpc_oem_pci_bus)(struct mpc_bus *m); | ||
28 | void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); | ||
29 | void (*find_smp_config)(unsigned int reserve); | ||
30 | void (*get_smp_config)(unsigned int early); | ||
31 | }; | ||
32 | |||
33 | /** | ||
34 | * struct x86_init_resources - platform specific resource related ops | ||
35 | * @probe_roms: probe BIOS roms | ||
36 | * @reserve_resources: reserve the standard resources for the | ||
37 | * platform | ||
38 | * @memory_setup: platform specific memory setup | ||
39 | * | ||
40 | */ | ||
41 | struct x86_init_resources { | ||
42 | void (*probe_roms)(void); | ||
43 | void (*reserve_resources)(void); | ||
44 | char *(*memory_setup)(void); | ||
45 | }; | ||
46 | |||
47 | /** | ||
48 | * struct x86_init_irqs - platform specific interrupt setup | ||
49 | * @pre_vector_init: init code to run before interrupt vectors | ||
50 | * are set up. | ||
51 | * @intr_init: interrupt init code | ||
52 | * @trap_init: platform specific trap setup | ||
53 | */ | ||
54 | struct x86_init_irqs { | ||
55 | void (*pre_vector_init)(void); | ||
56 | void (*intr_init)(void); | ||
57 | void (*trap_init)(void); | ||
58 | }; | ||
59 | |||
60 | /** | ||
61 | * struct x86_init_oem - oem platform specific customizing functions | ||
62 | * @arch_setup: platform specific architecure setup | ||
63 | * @banner: print a platform specific banner | ||
64 | */ | ||
65 | struct x86_init_oem { | ||
66 | void (*arch_setup)(void); | ||
67 | void (*banner)(void); | ||
68 | }; | ||
69 | |||
70 | /** | ||
71 | * struct x86_init_paging - platform specific paging functions | ||
72 | * @pagetable_setup_start: platform specific pre paging_init() call | ||
73 | * @pagetable_setup_done: platform specific post paging_init() call | ||
74 | */ | ||
75 | struct x86_init_paging { | ||
76 | void (*pagetable_setup_start)(pgd_t *base); | ||
77 | void (*pagetable_setup_done)(pgd_t *base); | ||
78 | }; | ||
79 | |||
80 | /** | ||
81 | * struct x86_init_timers - platform specific timer setup | ||
82 | * @setup_perpcu_clockev: set up the per cpu clock event device for the | ||
83 | * boot cpu | ||
84 | * @tsc_pre_init: platform function called before TSC init | ||
85 | * @timer_init: initialize the platform timer (default PIT/HPET) | ||
86 | */ | ||
87 | struct x86_init_timers { | ||
88 | void (*setup_percpu_clockev)(void); | ||
89 | void (*tsc_pre_init)(void); | ||
90 | void (*timer_init)(void); | ||
91 | }; | ||
92 | |||
93 | /** | ||
94 | * struct x86_init_ops - functions for platform specific setup | ||
95 | * | ||
96 | */ | ||
97 | struct x86_init_ops { | ||
98 | struct x86_init_resources resources; | ||
99 | struct x86_init_mpparse mpparse; | ||
100 | struct x86_init_irqs irqs; | ||
101 | struct x86_init_oem oem; | ||
102 | struct x86_init_paging paging; | ||
103 | struct x86_init_timers timers; | ||
104 | }; | ||
105 | |||
106 | /** | ||
107 | * struct x86_cpuinit_ops - platform specific cpu hotplug setups | ||
108 | * @setup_percpu_clockev: set up the per cpu clock event device | ||
109 | */ | ||
110 | struct x86_cpuinit_ops { | ||
111 | void (*setup_percpu_clockev)(void); | ||
112 | }; | ||
113 | |||
114 | /** | ||
115 | * struct x86_platform_ops - platform specific runtime functions | ||
116 | * @calibrate_tsc: calibrate TSC | ||
117 | * @get_wallclock: get time from HW clock like RTC etc. | ||
118 | * @set_wallclock: set time back to HW clock | ||
119 | */ | ||
120 | struct x86_platform_ops { | ||
121 | unsigned long (*calibrate_tsc)(void); | ||
122 | unsigned long (*get_wallclock)(void); | ||
123 | int (*set_wallclock)(unsigned long nowtime); | ||
124 | }; | ||
125 | |||
126 | extern struct x86_init_ops x86_init; | ||
127 | extern struct x86_cpuinit_ops x86_cpuinit; | ||
128 | extern struct x86_platform_ops x86_platform; | ||
129 | |||
130 | extern void x86_init_noop(void); | ||
131 | extern void x86_init_uint_noop(unsigned int unused); | ||
132 | |||
133 | #endif | ||
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..d8e5d0cdd67 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n | |||
31 | 31 | ||
32 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o | 32 | obj-y := process_$(BITS).o signal.o entry_$(BITS).o |
33 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o | 33 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
34 | obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o | 34 | obj-y += time.o ioport.o ldt.o dumpstack.o |
35 | obj-y += setup.o i8259.o irqinit.o | 35 | obj-y += setup.o x86_init.o i8259.o irqinit.o |
36 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o | 36 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o |
37 | obj-$(CONFIG_X86_32) += probe_roms_32.o | 37 | obj-$(CONFIG_X86_32) += probe_roms_32.o |
38 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o | 38 | obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o |
@@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o | |||
52 | obj-$(CONFIG_X86_32) += tls.o | 52 | obj-$(CONFIG_X86_32) += tls.o |
53 | obj-$(CONFIG_IA32_EMULATION) += tls.o | 53 | obj-$(CONFIG_IA32_EMULATION) += tls.o |
54 | obj-y += step.o | 54 | obj-y += step.o |
55 | obj-$(CONFIG_INTEL_TXT) += tboot.o | ||
55 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 56 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
56 | obj-y += cpu/ | 57 | obj-y += cpu/ |
57 | obj-y += acpi/ | 58 | obj-y += acpi/ |
59 | obj-$(CONFIG_SFI) += sfi.o | ||
58 | obj-y += reboot.o | 60 | obj-y += reboot.o |
59 | obj-$(CONFIG_MCA) += mca_32.o | 61 | obj-$(CONFIG_MCA) += mca_32.o |
60 | obj-$(CONFIG_X86_MSR) += msr.o | 62 | obj-$(CONFIG_X86_MSR) += msr.o |
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o | |||
104 | scx200-y += scx200_32.o | 106 | scx200-y += scx200_32.o |
105 | 107 | ||
106 | obj-$(CONFIG_OLPC) += olpc.o | 108 | obj-$(CONFIG_OLPC) += olpc.o |
109 | obj-$(CONFIG_X86_MRST) += mrst.o | ||
107 | 110 | ||
108 | microcode-y := microcode_core.o | 111 | microcode-y := microcode_core.o |
109 | microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o | 112 | microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285..67e929b8987 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
@@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void) | |||
833 | extern int es7000_plat; | 833 | extern int es7000_plat; |
834 | #endif | 834 | #endif |
835 | 835 | ||
836 | static struct { | ||
837 | int gsi_base; | ||
838 | int gsi_end; | ||
839 | } mp_ioapic_routing[MAX_IO_APICS]; | ||
840 | |||
841 | int mp_find_ioapic(int gsi) | ||
842 | { | ||
843 | int i = 0; | ||
844 | |||
845 | /* Find the IOAPIC that manages this GSI. */ | ||
846 | for (i = 0; i < nr_ioapics; i++) { | ||
847 | if ((gsi >= mp_ioapic_routing[i].gsi_base) | ||
848 | && (gsi <= mp_ioapic_routing[i].gsi_end)) | ||
849 | return i; | ||
850 | } | ||
851 | |||
852 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
853 | return -1; | ||
854 | } | ||
855 | |||
856 | int mp_find_ioapic_pin(int ioapic, int gsi) | ||
857 | { | ||
858 | if (WARN_ON(ioapic == -1)) | ||
859 | return -1; | ||
860 | if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end)) | ||
861 | return -1; | ||
862 | |||
863 | return gsi - mp_ioapic_routing[ioapic].gsi_base; | ||
864 | } | ||
865 | |||
866 | static u8 __init uniq_ioapic_id(u8 id) | ||
867 | { | ||
868 | #ifdef CONFIG_X86_32 | ||
869 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | ||
870 | !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
871 | return io_apic_get_unique_id(nr_ioapics, id); | ||
872 | else | ||
873 | return id; | ||
874 | #else | ||
875 | int i; | ||
876 | DECLARE_BITMAP(used, 256); | ||
877 | bitmap_zero(used, 256); | ||
878 | for (i = 0; i < nr_ioapics; i++) { | ||
879 | struct mpc_ioapic *ia = &mp_ioapics[i]; | ||
880 | __set_bit(ia->apicid, used); | ||
881 | } | ||
882 | if (!test_bit(id, used)) | ||
883 | return id; | ||
884 | return find_first_zero_bit(used, 256); | ||
885 | #endif | ||
886 | } | ||
887 | |||
888 | static int bad_ioapic(unsigned long address) | ||
889 | { | ||
890 | if (nr_ioapics >= MAX_IO_APICS) { | ||
891 | printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " | ||
892 | "(found %d)\n", MAX_IO_APICS, nr_ioapics); | ||
893 | panic("Recompile kernel with bigger MAX_IO_APICS!\n"); | ||
894 | } | ||
895 | if (!address) { | ||
896 | printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" | ||
897 | " found in table, skipping!\n"); | ||
898 | return 1; | ||
899 | } | ||
900 | return 0; | ||
901 | } | ||
902 | |||
903 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | ||
904 | { | ||
905 | int idx = 0; | ||
906 | |||
907 | if (bad_ioapic(address)) | ||
908 | return; | ||
909 | |||
910 | idx = nr_ioapics; | ||
911 | |||
912 | mp_ioapics[idx].type = MP_IOAPIC; | ||
913 | mp_ioapics[idx].flags = MPC_APIC_USABLE; | ||
914 | mp_ioapics[idx].apicaddr = address; | ||
915 | |||
916 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
917 | mp_ioapics[idx].apicid = uniq_ioapic_id(id); | ||
918 | mp_ioapics[idx].apicver = io_apic_get_version(idx); | ||
919 | |||
920 | /* | ||
921 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups | ||
922 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | ||
923 | */ | ||
924 | mp_ioapic_routing[idx].gsi_base = gsi_base; | ||
925 | mp_ioapic_routing[idx].gsi_end = gsi_base + | ||
926 | io_apic_get_redir_entries(idx); | ||
927 | |||
928 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | ||
929 | "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, | ||
930 | mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, | ||
931 | mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); | ||
932 | |||
933 | nr_ioapics++; | ||
934 | } | ||
935 | |||
936 | int __init acpi_probe_gsi(void) | 836 | int __init acpi_probe_gsi(void) |
937 | { | 837 | { |
938 | int idx; | 838 | int idx; |
@@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void) | |||
947 | 847 | ||
948 | max_gsi = 0; | 848 | max_gsi = 0; |
949 | for (idx = 0; idx < nr_ioapics; idx++) { | 849 | for (idx = 0; idx < nr_ioapics; idx++) { |
950 | gsi = mp_ioapic_routing[idx].gsi_end; | 850 | gsi = mp_gsi_routing[idx].gsi_end; |
951 | 851 | ||
952 | if (gsi > max_gsi) | 852 | if (gsi > max_gsi) |
953 | max_gsi = gsi; | 853 | max_gsi = gsi; |
@@ -1179,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) | |||
1179 | * If MPS is present, it will handle them, | 1079 | * If MPS is present, it will handle them, |
1180 | * otherwise the system will stay in PIC mode | 1080 | * otherwise the system will stay in PIC mode |
1181 | */ | 1081 | */ |
1182 | if (acpi_disabled || acpi_noirq) { | 1082 | if (acpi_disabled || acpi_noirq) |
1183 | return -ENODEV; | 1083 | return -ENODEV; |
1184 | } | ||
1185 | 1084 | ||
1186 | if (!cpu_has_apic) | 1085 | if (!cpu_has_apic) |
1187 | return -ENODEV; | 1086 | return -ENODEV; |
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8c44c232efc..59cdfa4686b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c | |||
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, | |||
48 | * P4, Core and beyond CPUs | 48 | * P4, Core and beyond CPUs |
49 | */ | 49 | */ |
50 | if (c->x86_vendor == X86_VENDOR_INTEL && | 50 | if (c->x86_vendor == X86_VENDOR_INTEL && |
51 | (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) | 51 | (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) |
52 | flags->bm_control = 0; | 52 | flags->bm_control = 0; |
53 | } | 53 | } |
54 | EXPORT_SYMBOL(acpi_processor_power_init_bm_check); | 54 | EXPORT_SYMBOL(acpi_processor_power_init_bm_check); |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f5765870257..de7353c0ce9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/mutex.h> | 3 | #include <linux/mutex.h> |
4 | #include <linux/list.h> | 4 | #include <linux/list.h> |
5 | #include <linux/stringify.h> | ||
5 | #include <linux/kprobes.h> | 6 | #include <linux/kprobes.h> |
6 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
7 | #include <linux/vmalloc.h> | 8 | #include <linux/vmalloc.h> |
@@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly); | |||
32 | #define smp_alt_once 1 | 33 | #define smp_alt_once 1 |
33 | #endif | 34 | #endif |
34 | 35 | ||
35 | static int debug_alternative; | 36 | static int __initdata_or_module debug_alternative; |
36 | 37 | ||
37 | static int __init debug_alt(char *str) | 38 | static int __init debug_alt(char *str) |
38 | { | 39 | { |
@@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str) | |||
51 | __setup("noreplace-smp", setup_noreplace_smp); | 52 | __setup("noreplace-smp", setup_noreplace_smp); |
52 | 53 | ||
53 | #ifdef CONFIG_PARAVIRT | 54 | #ifdef CONFIG_PARAVIRT |
54 | static int noreplace_paravirt = 0; | 55 | static int __initdata_or_module noreplace_paravirt = 0; |
55 | 56 | ||
56 | static int __init setup_noreplace_paravirt(char *str) | 57 | static int __init setup_noreplace_paravirt(char *str) |
57 | { | 58 | { |
@@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); | |||
64 | #define DPRINTK(fmt, args...) if (debug_alternative) \ | 65 | #define DPRINTK(fmt, args...) if (debug_alternative) \ |
65 | printk(KERN_DEBUG fmt, args) | 66 | printk(KERN_DEBUG fmt, args) |
66 | 67 | ||
67 | #ifdef GENERIC_NOP1 | 68 | #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) |
68 | /* Use inline assembly to define this because the nops are defined | 69 | /* Use inline assembly to define this because the nops are defined |
69 | as inline assembly strings in the include files and we cannot | 70 | as inline assembly strings in the include files and we cannot |
70 | get them easily into strings. */ | 71 | get them easily into strings. */ |
71 | asm("\t.section .rodata, \"a\"\nintelnops: " | 72 | asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " |
72 | GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 | 73 | GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 |
73 | GENERIC_NOP7 GENERIC_NOP8 | 74 | GENERIC_NOP7 GENERIC_NOP8 |
74 | "\t.previous"); | 75 | "\t.previous"); |
75 | extern const unsigned char intelnops[]; | 76 | extern const unsigned char intelnops[]; |
76 | static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { | 77 | static const unsigned char *const __initconst_or_module |
78 | intel_nops[ASM_NOP_MAX+1] = { | ||
77 | NULL, | 79 | NULL, |
78 | intelnops, | 80 | intelnops, |
79 | intelnops + 1, | 81 | intelnops + 1, |
@@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { | |||
87 | #endif | 89 | #endif |
88 | 90 | ||
89 | #ifdef K8_NOP1 | 91 | #ifdef K8_NOP1 |
90 | asm("\t.section .rodata, \"a\"\nk8nops: " | 92 | asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " |
91 | K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 | 93 | K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 |
92 | K8_NOP7 K8_NOP8 | 94 | K8_NOP7 K8_NOP8 |
93 | "\t.previous"); | 95 | "\t.previous"); |
94 | extern const unsigned char k8nops[]; | 96 | extern const unsigned char k8nops[]; |
95 | static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { | 97 | static const unsigned char *const __initconst_or_module |
98 | k8_nops[ASM_NOP_MAX+1] = { | ||
96 | NULL, | 99 | NULL, |
97 | k8nops, | 100 | k8nops, |
98 | k8nops + 1, | 101 | k8nops + 1, |
@@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { | |||
105 | }; | 108 | }; |
106 | #endif | 109 | #endif |
107 | 110 | ||
108 | #ifdef K7_NOP1 | 111 | #if defined(K7_NOP1) && !defined(CONFIG_X86_64) |
109 | asm("\t.section .rodata, \"a\"\nk7nops: " | 112 | asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " |
110 | K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 | 113 | K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 |
111 | K7_NOP7 K7_NOP8 | 114 | K7_NOP7 K7_NOP8 |
112 | "\t.previous"); | 115 | "\t.previous"); |
113 | extern const unsigned char k7nops[]; | 116 | extern const unsigned char k7nops[]; |
114 | static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { | 117 | static const unsigned char *const __initconst_or_module |
118 | k7_nops[ASM_NOP_MAX+1] = { | ||
115 | NULL, | 119 | NULL, |
116 | k7nops, | 120 | k7nops, |
117 | k7nops + 1, | 121 | k7nops + 1, |
@@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { | |||
125 | #endif | 129 | #endif |
126 | 130 | ||
127 | #ifdef P6_NOP1 | 131 | #ifdef P6_NOP1 |
128 | asm("\t.section .rodata, \"a\"\np6nops: " | 132 | asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " |
129 | P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 | 133 | P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 |
130 | P6_NOP7 P6_NOP8 | 134 | P6_NOP7 P6_NOP8 |
131 | "\t.previous"); | 135 | "\t.previous"); |
132 | extern const unsigned char p6nops[]; | 136 | extern const unsigned char p6nops[]; |
133 | static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { | 137 | static const unsigned char *const __initconst_or_module |
138 | p6_nops[ASM_NOP_MAX+1] = { | ||
134 | NULL, | 139 | NULL, |
135 | p6nops, | 140 | p6nops, |
136 | p6nops + 1, | 141 | p6nops + 1, |
@@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { | |||
146 | #ifdef CONFIG_X86_64 | 151 | #ifdef CONFIG_X86_64 |
147 | 152 | ||
148 | extern char __vsyscall_0; | 153 | extern char __vsyscall_0; |
149 | const unsigned char *const *find_nop_table(void) | 154 | static const unsigned char *const *__init_or_module find_nop_table(void) |
150 | { | 155 | { |
151 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && | 156 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && |
152 | boot_cpu_has(X86_FEATURE_NOPL)) | 157 | boot_cpu_has(X86_FEATURE_NOPL)) |
@@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void) | |||
157 | 162 | ||
158 | #else /* CONFIG_X86_64 */ | 163 | #else /* CONFIG_X86_64 */ |
159 | 164 | ||
160 | const unsigned char *const *find_nop_table(void) | 165 | static const unsigned char *const *__init_or_module find_nop_table(void) |
161 | { | 166 | { |
162 | if (boot_cpu_has(X86_FEATURE_K8)) | 167 | if (boot_cpu_has(X86_FEATURE_K8)) |
163 | return k8_nops; | 168 | return k8_nops; |
@@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void) | |||
172 | #endif /* CONFIG_X86_64 */ | 177 | #endif /* CONFIG_X86_64 */ |
173 | 178 | ||
174 | /* Use this to add nops to a buffer, then text_poke the whole buffer. */ | 179 | /* Use this to add nops to a buffer, then text_poke the whole buffer. */ |
175 | void add_nops(void *insns, unsigned int len) | 180 | static void __init_or_module add_nops(void *insns, unsigned int len) |
176 | { | 181 | { |
177 | const unsigned char *const *noptable = find_nop_table(); | 182 | const unsigned char *const *noptable = find_nop_table(); |
178 | 183 | ||
@@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len) | |||
185 | len -= noplen; | 190 | len -= noplen; |
186 | } | 191 | } |
187 | } | 192 | } |
188 | EXPORT_SYMBOL_GPL(add_nops); | ||
189 | 193 | ||
190 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; | 194 | extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; |
191 | extern u8 *__smp_locks[], *__smp_locks_end[]; | 195 | extern u8 *__smp_locks[], *__smp_locks_end[]; |
196 | static void *text_poke_early(void *addr, const void *opcode, size_t len); | ||
192 | 197 | ||
193 | /* Replace instructions with better alternatives for this CPU type. | 198 | /* Replace instructions with better alternatives for this CPU type. |
194 | This runs before SMP is initialized to avoid SMP problems with | 199 | This runs before SMP is initialized to avoid SMP problems with |
@@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[]; | |||
196 | APs have less capabilities than the boot processor are not handled. | 201 | APs have less capabilities than the boot processor are not handled. |
197 | Tough. Make sure you disable such features by hand. */ | 202 | Tough. Make sure you disable such features by hand. */ |
198 | 203 | ||
199 | void apply_alternatives(struct alt_instr *start, struct alt_instr *end) | 204 | void __init_or_module apply_alternatives(struct alt_instr *start, |
205 | struct alt_instr *end) | ||
200 | { | 206 | { |
201 | struct alt_instr *a; | 207 | struct alt_instr *a; |
202 | char insnbuf[MAX_PATCH_LEN]; | 208 | char insnbuf[MAX_PATCH_LEN]; |
@@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules); | |||
279 | static DEFINE_MUTEX(smp_alt); | 285 | static DEFINE_MUTEX(smp_alt); |
280 | static int smp_mode = 1; /* protected by smp_alt */ | 286 | static int smp_mode = 1; /* protected by smp_alt */ |
281 | 287 | ||
282 | void alternatives_smp_module_add(struct module *mod, char *name, | 288 | void __init_or_module alternatives_smp_module_add(struct module *mod, |
283 | void *locks, void *locks_end, | 289 | char *name, |
284 | void *text, void *text_end) | 290 | void *locks, void *locks_end, |
291 | void *text, void *text_end) | ||
285 | { | 292 | { |
286 | struct smp_alt_module *smp; | 293 | struct smp_alt_module *smp; |
287 | 294 | ||
@@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name, | |||
317 | mutex_unlock(&smp_alt); | 324 | mutex_unlock(&smp_alt); |
318 | } | 325 | } |
319 | 326 | ||
320 | void alternatives_smp_module_del(struct module *mod) | 327 | void __init_or_module alternatives_smp_module_del(struct module *mod) |
321 | { | 328 | { |
322 | struct smp_alt_module *item; | 329 | struct smp_alt_module *item; |
323 | 330 | ||
@@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp) | |||
386 | #endif | 393 | #endif |
387 | 394 | ||
388 | #ifdef CONFIG_PARAVIRT | 395 | #ifdef CONFIG_PARAVIRT |
389 | void apply_paravirt(struct paravirt_patch_site *start, | 396 | void __init_or_module apply_paravirt(struct paravirt_patch_site *start, |
390 | struct paravirt_patch_site *end) | 397 | struct paravirt_patch_site *end) |
391 | { | 398 | { |
392 | struct paravirt_patch_site *p; | 399 | struct paravirt_patch_site *p; |
393 | char insnbuf[MAX_PATCH_LEN]; | 400 | char insnbuf[MAX_PATCH_LEN]; |
@@ -485,13 +492,14 @@ void __init alternative_instructions(void) | |||
485 | * instructions. And on the local CPU you need to be protected again NMI or MCE | 492 | * instructions. And on the local CPU you need to be protected again NMI or MCE |
486 | * handlers seeing an inconsistent instruction while you patch. | 493 | * handlers seeing an inconsistent instruction while you patch. |
487 | */ | 494 | */ |
488 | void *text_poke_early(void *addr, const void *opcode, size_t len) | 495 | static void *__init_or_module text_poke_early(void *addr, const void *opcode, |
496 | size_t len) | ||
489 | { | 497 | { |
490 | unsigned long flags; | 498 | unsigned long flags; |
491 | local_irq_save(flags); | 499 | local_irq_save(flags); |
492 | memcpy(addr, opcode, len); | 500 | memcpy(addr, opcode, len); |
493 | local_irq_restore(flags); | ||
494 | sync_core(); | 501 | sync_core(); |
502 | local_irq_restore(flags); | ||
495 | /* Could also do a CLFLUSH here to speed up CPU recovery; but | 503 | /* Could also do a CLFLUSH here to speed up CPU recovery; but |
496 | that causes hangs on some VIA CPUs. */ | 504 | that causes hangs on some VIA CPUs. */ |
497 | return addr; | 505 | return addr; |
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6c99f503780..98f230f6a28 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); | |||
41 | static LIST_HEAD(iommu_pd_list); | 41 | static LIST_HEAD(iommu_pd_list); |
42 | static DEFINE_SPINLOCK(iommu_pd_list_lock); | 42 | static DEFINE_SPINLOCK(iommu_pd_list_lock); |
43 | 43 | ||
44 | #ifdef CONFIG_IOMMU_API | 44 | /* |
45 | * Domain for untranslated devices - only allocated | ||
46 | * if iommu=pt passed on kernel cmd line. | ||
47 | */ | ||
48 | static struct protection_domain *pt_domain; | ||
49 | |||
45 | static struct iommu_ops amd_iommu_ops; | 50 | static struct iommu_ops amd_iommu_ops; |
46 | #endif | ||
47 | 51 | ||
48 | /* | 52 | /* |
49 | * general struct to manage commands send to an IOMMU | 53 | * general struct to manage commands send to an IOMMU |
@@ -55,16 +59,16 @@ struct iommu_cmd { | |||
55 | static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, | 59 | static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, |
56 | struct unity_map_entry *e); | 60 | struct unity_map_entry *e); |
57 | static struct dma_ops_domain *find_protection_domain(u16 devid); | 61 | static struct dma_ops_domain *find_protection_domain(u16 devid); |
58 | static u64* alloc_pte(struct protection_domain *dom, | 62 | static u64 *alloc_pte(struct protection_domain *domain, |
59 | unsigned long address, u64 | 63 | unsigned long address, int end_lvl, |
60 | **pte_page, gfp_t gfp); | 64 | u64 **pte_page, gfp_t gfp); |
61 | static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, | 65 | static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, |
62 | unsigned long start_page, | 66 | unsigned long start_page, |
63 | unsigned int pages); | 67 | unsigned int pages); |
64 | 68 | static void reset_iommu_command_buffer(struct amd_iommu *iommu); | |
65 | #ifndef BUS_NOTIFY_UNBOUND_DRIVER | 69 | static u64 *fetch_pte(struct protection_domain *domain, |
66 | #define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 | 70 | unsigned long address, int map_size); |
67 | #endif | 71 | static void update_domain(struct protection_domain *domain); |
68 | 72 | ||
69 | #ifdef CONFIG_AMD_IOMMU_STATS | 73 | #ifdef CONFIG_AMD_IOMMU_STATS |
70 | 74 | ||
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu) | |||
138 | * | 142 | * |
139 | ****************************************************************************/ | 143 | ****************************************************************************/ |
140 | 144 | ||
141 | static void iommu_print_event(void *__evt) | 145 | static void dump_dte_entry(u16 devid) |
146 | { | ||
147 | int i; | ||
148 | |||
149 | for (i = 0; i < 8; ++i) | ||
150 | pr_err("AMD-Vi: DTE[%d]: %08x\n", i, | ||
151 | amd_iommu_dev_table[devid].data[i]); | ||
152 | } | ||
153 | |||
154 | static void dump_command(unsigned long phys_addr) | ||
155 | { | ||
156 | struct iommu_cmd *cmd = phys_to_virt(phys_addr); | ||
157 | int i; | ||
158 | |||
159 | for (i = 0; i < 4; ++i) | ||
160 | pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); | ||
161 | } | ||
162 | |||
163 | static void iommu_print_event(struct amd_iommu *iommu, void *__evt) | ||
142 | { | 164 | { |
143 | u32 *event = __evt; | 165 | u32 *event = __evt; |
144 | int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; | 166 | int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; |
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt) | |||
147 | int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; | 169 | int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; |
148 | u64 address = (u64)(((u64)event[3]) << 32) | event[2]; | 170 | u64 address = (u64)(((u64)event[3]) << 32) | event[2]; |
149 | 171 | ||
150 | printk(KERN_ERR "AMD IOMMU: Event logged ["); | 172 | printk(KERN_ERR "AMD-Vi: Event logged ["); |
151 | 173 | ||
152 | switch (type) { | 174 | switch (type) { |
153 | case EVENT_TYPE_ILL_DEV: | 175 | case EVENT_TYPE_ILL_DEV: |
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt) | |||
155 | "address=0x%016llx flags=0x%04x]\n", | 177 | "address=0x%016llx flags=0x%04x]\n", |
156 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | 178 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), |
157 | address, flags); | 179 | address, flags); |
180 | dump_dte_entry(devid); | ||
158 | break; | 181 | break; |
159 | case EVENT_TYPE_IO_FAULT: | 182 | case EVENT_TYPE_IO_FAULT: |
160 | printk("IO_PAGE_FAULT device=%02x:%02x.%x " | 183 | printk("IO_PAGE_FAULT device=%02x:%02x.%x " |
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt) | |||
176 | break; | 199 | break; |
177 | case EVENT_TYPE_ILL_CMD: | 200 | case EVENT_TYPE_ILL_CMD: |
178 | printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); | 201 | printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); |
202 | reset_iommu_command_buffer(iommu); | ||
203 | dump_command(address); | ||
179 | break; | 204 | break; |
180 | case EVENT_TYPE_CMD_HARD_ERR: | 205 | case EVENT_TYPE_CMD_HARD_ERR: |
181 | printk("COMMAND_HARDWARE_ERROR address=0x%016llx " | 206 | printk("COMMAND_HARDWARE_ERROR address=0x%016llx " |
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) | |||
209 | tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); | 234 | tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); |
210 | 235 | ||
211 | while (head != tail) { | 236 | while (head != tail) { |
212 | iommu_print_event(iommu->evt_buf + head); | 237 | iommu_print_event(iommu, iommu->evt_buf + head); |
213 | head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; | 238 | head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; |
214 | } | 239 | } |
215 | 240 | ||
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) | |||
296 | status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; | 321 | status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; |
297 | writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); | 322 | writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); |
298 | 323 | ||
299 | if (unlikely(i == EXIT_LOOP_COUNT)) | 324 | if (unlikely(i == EXIT_LOOP_COUNT)) { |
300 | panic("AMD IOMMU: Completion wait loop failed\n"); | 325 | spin_unlock(&iommu->lock); |
326 | reset_iommu_command_buffer(iommu); | ||
327 | spin_lock(&iommu->lock); | ||
328 | } | ||
301 | } | 329 | } |
302 | 330 | ||
303 | /* | 331 | /* |
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) | |||
445 | } | 473 | } |
446 | 474 | ||
447 | /* | 475 | /* |
476 | * This function flushes one domain on one IOMMU | ||
477 | */ | ||
478 | static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) | ||
479 | { | ||
480 | struct iommu_cmd cmd; | ||
481 | unsigned long flags; | ||
482 | |||
483 | __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, | ||
484 | domid, 1, 1); | ||
485 | |||
486 | spin_lock_irqsave(&iommu->lock, flags); | ||
487 | __iommu_queue_command(iommu, &cmd); | ||
488 | __iommu_completion_wait(iommu); | ||
489 | __iommu_wait_for_completion(iommu); | ||
490 | spin_unlock_irqrestore(&iommu->lock, flags); | ||
491 | } | ||
492 | |||
493 | static void flush_all_domains_on_iommu(struct amd_iommu *iommu) | ||
494 | { | ||
495 | int i; | ||
496 | |||
497 | for (i = 1; i < MAX_DOMAIN_ID; ++i) { | ||
498 | if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) | ||
499 | continue; | ||
500 | flush_domain_on_iommu(iommu, i); | ||
501 | } | ||
502 | |||
503 | } | ||
504 | |||
505 | /* | ||
448 | * This function is used to flush the IO/TLB for a given protection domain | 506 | * This function is used to flush the IO/TLB for a given protection domain |
449 | * on every IOMMU in the system | 507 | * on every IOMMU in the system |
450 | */ | 508 | */ |
451 | static void iommu_flush_domain(u16 domid) | 509 | static void iommu_flush_domain(u16 domid) |
452 | { | 510 | { |
453 | unsigned long flags; | ||
454 | struct amd_iommu *iommu; | 511 | struct amd_iommu *iommu; |
455 | struct iommu_cmd cmd; | ||
456 | 512 | ||
457 | INC_STATS_COUNTER(domain_flush_all); | 513 | INC_STATS_COUNTER(domain_flush_all); |
458 | 514 | ||
459 | __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, | 515 | for_each_iommu(iommu) |
460 | domid, 1, 1); | 516 | flush_domain_on_iommu(iommu, domid); |
461 | |||
462 | for_each_iommu(iommu) { | ||
463 | spin_lock_irqsave(&iommu->lock, flags); | ||
464 | __iommu_queue_command(iommu, &cmd); | ||
465 | __iommu_completion_wait(iommu); | ||
466 | __iommu_wait_for_completion(iommu); | ||
467 | spin_unlock_irqrestore(&iommu->lock, flags); | ||
468 | } | ||
469 | } | 517 | } |
470 | 518 | ||
471 | void amd_iommu_flush_all_domains(void) | 519 | void amd_iommu_flush_all_domains(void) |
472 | { | 520 | { |
521 | struct amd_iommu *iommu; | ||
522 | |||
523 | for_each_iommu(iommu) | ||
524 | flush_all_domains_on_iommu(iommu); | ||
525 | } | ||
526 | |||
527 | static void flush_all_devices_for_iommu(struct amd_iommu *iommu) | ||
528 | { | ||
473 | int i; | 529 | int i; |
474 | 530 | ||
475 | for (i = 1; i < MAX_DOMAIN_ID; ++i) { | 531 | for (i = 0; i <= amd_iommu_last_bdf; ++i) { |
476 | if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) | 532 | if (iommu != amd_iommu_rlookup_table[i]) |
477 | continue; | 533 | continue; |
478 | iommu_flush_domain(i); | 534 | |
535 | iommu_queue_inv_dev_entry(iommu, i); | ||
536 | iommu_completion_wait(iommu); | ||
479 | } | 537 | } |
480 | } | 538 | } |
481 | 539 | ||
482 | void amd_iommu_flush_all_devices(void) | 540 | static void flush_devices_by_domain(struct protection_domain *domain) |
483 | { | 541 | { |
484 | struct amd_iommu *iommu; | 542 | struct amd_iommu *iommu; |
485 | int i; | 543 | int i; |
486 | 544 | ||
487 | for (i = 0; i <= amd_iommu_last_bdf; ++i) { | 545 | for (i = 0; i <= amd_iommu_last_bdf; ++i) { |
488 | if (amd_iommu_pd_table[i] == NULL) | 546 | if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || |
547 | (amd_iommu_pd_table[i] != domain)) | ||
489 | continue; | 548 | continue; |
490 | 549 | ||
491 | iommu = amd_iommu_rlookup_table[i]; | 550 | iommu = amd_iommu_rlookup_table[i]; |
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void) | |||
497 | } | 556 | } |
498 | } | 557 | } |
499 | 558 | ||
559 | static void reset_iommu_command_buffer(struct amd_iommu *iommu) | ||
560 | { | ||
561 | pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); | ||
562 | |||
563 | if (iommu->reset_in_progress) | ||
564 | panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); | ||
565 | |||
566 | iommu->reset_in_progress = true; | ||
567 | |||
568 | amd_iommu_reset_cmd_buffer(iommu); | ||
569 | flush_all_devices_for_iommu(iommu); | ||
570 | flush_all_domains_on_iommu(iommu); | ||
571 | |||
572 | iommu->reset_in_progress = false; | ||
573 | } | ||
574 | |||
575 | void amd_iommu_flush_all_devices(void) | ||
576 | { | ||
577 | flush_devices_by_domain(NULL); | ||
578 | } | ||
579 | |||
500 | /**************************************************************************** | 580 | /**************************************************************************** |
501 | * | 581 | * |
502 | * The functions below are used the create the page table mappings for | 582 | * The functions below are used the create the page table mappings for |
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void) | |||
514 | static int iommu_map_page(struct protection_domain *dom, | 594 | static int iommu_map_page(struct protection_domain *dom, |
515 | unsigned long bus_addr, | 595 | unsigned long bus_addr, |
516 | unsigned long phys_addr, | 596 | unsigned long phys_addr, |
517 | int prot) | 597 | int prot, |
598 | int map_size) | ||
518 | { | 599 | { |
519 | u64 __pte, *pte; | 600 | u64 __pte, *pte; |
520 | 601 | ||
521 | bus_addr = PAGE_ALIGN(bus_addr); | 602 | bus_addr = PAGE_ALIGN(bus_addr); |
522 | phys_addr = PAGE_ALIGN(phys_addr); | 603 | phys_addr = PAGE_ALIGN(phys_addr); |
523 | 604 | ||
524 | /* only support 512GB address spaces for now */ | 605 | BUG_ON(!PM_ALIGNED(map_size, bus_addr)); |
525 | if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) | 606 | BUG_ON(!PM_ALIGNED(map_size, phys_addr)); |
607 | |||
608 | if (!(prot & IOMMU_PROT_MASK)) | ||
526 | return -EINVAL; | 609 | return -EINVAL; |
527 | 610 | ||
528 | pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); | 611 | pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); |
529 | 612 | ||
530 | if (IOMMU_PTE_PRESENT(*pte)) | 613 | if (IOMMU_PTE_PRESENT(*pte)) |
531 | return -EBUSY; | 614 | return -EBUSY; |
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom, | |||
538 | 621 | ||
539 | *pte = __pte; | 622 | *pte = __pte; |
540 | 623 | ||
624 | update_domain(dom); | ||
625 | |||
541 | return 0; | 626 | return 0; |
542 | } | 627 | } |
543 | 628 | ||
544 | static void iommu_unmap_page(struct protection_domain *dom, | 629 | static void iommu_unmap_page(struct protection_domain *dom, |
545 | unsigned long bus_addr) | 630 | unsigned long bus_addr, int map_size) |
546 | { | 631 | { |
547 | u64 *pte; | 632 | u64 *pte = fetch_pte(dom, bus_addr, map_size); |
548 | |||
549 | pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; | ||
550 | |||
551 | if (!IOMMU_PTE_PRESENT(*pte)) | ||
552 | return; | ||
553 | |||
554 | pte = IOMMU_PTE_PAGE(*pte); | ||
555 | pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; | ||
556 | 633 | ||
557 | if (!IOMMU_PTE_PRESENT(*pte)) | 634 | if (pte) |
558 | return; | 635 | *pte = 0; |
559 | |||
560 | pte = IOMMU_PTE_PAGE(*pte); | ||
561 | pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; | ||
562 | |||
563 | *pte = 0; | ||
564 | } | 636 | } |
565 | 637 | ||
566 | /* | 638 | /* |
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, | |||
615 | 687 | ||
616 | for (addr = e->address_start; addr < e->address_end; | 688 | for (addr = e->address_start; addr < e->address_end; |
617 | addr += PAGE_SIZE) { | 689 | addr += PAGE_SIZE) { |
618 | ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); | 690 | ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, |
691 | PM_MAP_4k); | ||
619 | if (ret) | 692 | if (ret) |
620 | return ret; | 693 | return ret; |
621 | /* | 694 | /* |
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, | |||
670 | * This function checks if there is a PTE for a given dma address. If | 743 | * This function checks if there is a PTE for a given dma address. If |
671 | * there is one, it returns the pointer to it. | 744 | * there is one, it returns the pointer to it. |
672 | */ | 745 | */ |
673 | static u64* fetch_pte(struct protection_domain *domain, | 746 | static u64 *fetch_pte(struct protection_domain *domain, |
674 | unsigned long address) | 747 | unsigned long address, int map_size) |
675 | { | 748 | { |
749 | int level; | ||
676 | u64 *pte; | 750 | u64 *pte; |
677 | 751 | ||
678 | pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; | 752 | level = domain->mode - 1; |
753 | pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; | ||
679 | 754 | ||
680 | if (!IOMMU_PTE_PRESENT(*pte)) | 755 | while (level > map_size) { |
681 | return NULL; | 756 | if (!IOMMU_PTE_PRESENT(*pte)) |
757 | return NULL; | ||
682 | 758 | ||
683 | pte = IOMMU_PTE_PAGE(*pte); | 759 | level -= 1; |
684 | pte = &pte[IOMMU_PTE_L1_INDEX(address)]; | ||
685 | 760 | ||
686 | if (!IOMMU_PTE_PRESENT(*pte)) | 761 | pte = IOMMU_PTE_PAGE(*pte); |
687 | return NULL; | 762 | pte = &pte[PM_LEVEL_INDEX(level, address)]; |
688 | 763 | ||
689 | pte = IOMMU_PTE_PAGE(*pte); | 764 | if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { |
690 | pte = &pte[IOMMU_PTE_L0_INDEX(address)]; | 765 | pte = NULL; |
766 | break; | ||
767 | } | ||
768 | } | ||
691 | 769 | ||
692 | return pte; | 770 | return pte; |
693 | } | 771 | } |
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu, | |||
727 | u64 *pte, *pte_page; | 805 | u64 *pte, *pte_page; |
728 | 806 | ||
729 | for (i = 0; i < num_ptes; ++i) { | 807 | for (i = 0; i < num_ptes; ++i) { |
730 | pte = alloc_pte(&dma_dom->domain, address, | 808 | pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, |
731 | &pte_page, gfp); | 809 | &pte_page, gfp); |
732 | if (!pte) | 810 | if (!pte) |
733 | goto out_free; | 811 | goto out_free; |
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu, | |||
760 | for (i = dma_dom->aperture[index]->offset; | 838 | for (i = dma_dom->aperture[index]->offset; |
761 | i < dma_dom->aperture_size; | 839 | i < dma_dom->aperture_size; |
762 | i += PAGE_SIZE) { | 840 | i += PAGE_SIZE) { |
763 | u64 *pte = fetch_pte(&dma_dom->domain, i); | 841 | u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); |
764 | if (!pte || !IOMMU_PTE_PRESENT(*pte)) | 842 | if (!pte || !IOMMU_PTE_PRESENT(*pte)) |
765 | continue; | 843 | continue; |
766 | 844 | ||
767 | dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); | 845 | dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); |
768 | } | 846 | } |
769 | 847 | ||
848 | update_domain(&dma_dom->domain); | ||
849 | |||
770 | return 0; | 850 | return 0; |
771 | 851 | ||
772 | out_free: | 852 | out_free: |
853 | update_domain(&dma_dom->domain); | ||
854 | |||
773 | free_page((unsigned long)dma_dom->aperture[index]->bitmap); | 855 | free_page((unsigned long)dma_dom->aperture[index]->bitmap); |
774 | 856 | ||
775 | kfree(dma_dom->aperture[index]); | 857 | kfree(dma_dom->aperture[index]); |
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) | |||
1009 | dma_dom->domain.id = domain_id_alloc(); | 1091 | dma_dom->domain.id = domain_id_alloc(); |
1010 | if (dma_dom->domain.id == 0) | 1092 | if (dma_dom->domain.id == 0) |
1011 | goto free_dma_dom; | 1093 | goto free_dma_dom; |
1012 | dma_dom->domain.mode = PAGE_MODE_3_LEVEL; | 1094 | dma_dom->domain.mode = PAGE_MODE_2_LEVEL; |
1013 | dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); | 1095 | dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); |
1014 | dma_dom->domain.flags = PD_DMA_OPS_MASK; | 1096 | dma_dom->domain.flags = PD_DMA_OPS_MASK; |
1015 | dma_dom->domain.priv = dma_dom; | 1097 | dma_dom->domain.priv = dma_dom; |
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid) | |||
1063 | return dom; | 1145 | return dom; |
1064 | } | 1146 | } |
1065 | 1147 | ||
1148 | static void set_dte_entry(u16 devid, struct protection_domain *domain) | ||
1149 | { | ||
1150 | u64 pte_root = virt_to_phys(domain->pt_root); | ||
1151 | |||
1152 | pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) | ||
1153 | << DEV_ENTRY_MODE_SHIFT; | ||
1154 | pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; | ||
1155 | |||
1156 | amd_iommu_dev_table[devid].data[2] = domain->id; | ||
1157 | amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); | ||
1158 | amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); | ||
1159 | |||
1160 | amd_iommu_pd_table[devid] = domain; | ||
1161 | } | ||
1162 | |||
1163 | /* | ||
1164 | * If a device is not yet associated with a domain, this function does | ||
1165 | * assigns it visible for the hardware | ||
1166 | */ | ||
1167 | static void __attach_device(struct amd_iommu *iommu, | ||
1168 | struct protection_domain *domain, | ||
1169 | u16 devid) | ||
1170 | { | ||
1171 | /* lock domain */ | ||
1172 | spin_lock(&domain->lock); | ||
1173 | |||
1174 | /* update DTE entry */ | ||
1175 | set_dte_entry(devid, domain); | ||
1176 | |||
1177 | domain->dev_cnt += 1; | ||
1178 | |||
1179 | /* ready */ | ||
1180 | spin_unlock(&domain->lock); | ||
1181 | } | ||
1182 | |||
1066 | /* | 1183 | /* |
1067 | * If a device is not yet associated with a domain, this function does | 1184 | * If a device is not yet associated with a domain, this function does |
1068 | * assigns it visible for the hardware | 1185 | * assigns it visible for the hardware |
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu, | |||
1072 | u16 devid) | 1189 | u16 devid) |
1073 | { | 1190 | { |
1074 | unsigned long flags; | 1191 | unsigned long flags; |
1075 | u64 pte_root = virt_to_phys(domain->pt_root); | ||
1076 | |||
1077 | domain->dev_cnt += 1; | ||
1078 | |||
1079 | pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) | ||
1080 | << DEV_ENTRY_MODE_SHIFT; | ||
1081 | pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; | ||
1082 | 1192 | ||
1083 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); | 1193 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); |
1084 | amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); | 1194 | __attach_device(iommu, domain, devid); |
1085 | amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); | ||
1086 | amd_iommu_dev_table[devid].data[2] = domain->id; | ||
1087 | |||
1088 | amd_iommu_pd_table[devid] = domain; | ||
1089 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | 1195 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); |
1090 | 1196 | ||
1091 | /* | 1197 | /* |
1092 | * We might boot into a crash-kernel here. The crashed kernel | 1198 | * We might boot into a crash-kernel here. The crashed kernel |
1093 | * left the caches in the IOMMU dirty. So we have to flush | 1199 | * left the caches in the IOMMU dirty. So we have to flush |
1094 | * here to evict all dirty stuff. | 1200 | * here to evict all dirty stuff. |
1095 | */ | 1201 | */ |
1096 | iommu_queue_inv_dev_entry(iommu, devid); | 1202 | iommu_queue_inv_dev_entry(iommu, devid); |
1097 | iommu_flush_tlb_pde(iommu, domain->id); | 1203 | iommu_flush_tlb_pde(iommu, domain->id); |
1098 | } | 1204 | } |
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid) | |||
1119 | 1225 | ||
1120 | /* ready */ | 1226 | /* ready */ |
1121 | spin_unlock(&domain->lock); | 1227 | spin_unlock(&domain->lock); |
1228 | |||
1229 | /* | ||
1230 | * If we run in passthrough mode the device must be assigned to the | ||
1231 | * passthrough domain if it is detached from any other domain | ||
1232 | */ | ||
1233 | if (iommu_pass_through) { | ||
1234 | struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; | ||
1235 | __attach_device(iommu, pt_domain, devid); | ||
1236 | } | ||
1122 | } | 1237 | } |
1123 | 1238 | ||
1124 | /* | 1239 | /* |
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb, | |||
1164 | case BUS_NOTIFY_UNBOUND_DRIVER: | 1279 | case BUS_NOTIFY_UNBOUND_DRIVER: |
1165 | if (!domain) | 1280 | if (!domain) |
1166 | goto out; | 1281 | goto out; |
1282 | if (iommu_pass_through) | ||
1283 | break; | ||
1167 | detach_device(domain, devid); | 1284 | detach_device(domain, devid); |
1168 | break; | 1285 | break; |
1169 | case BUS_NOTIFY_ADD_DEVICE: | 1286 | case BUS_NOTIFY_ADD_DEVICE: |
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev, | |||
1292 | return 1; | 1409 | return 1; |
1293 | } | 1410 | } |
1294 | 1411 | ||
1412 | static void update_device_table(struct protection_domain *domain) | ||
1413 | { | ||
1414 | unsigned long flags; | ||
1415 | int i; | ||
1416 | |||
1417 | for (i = 0; i <= amd_iommu_last_bdf; ++i) { | ||
1418 | if (amd_iommu_pd_table[i] != domain) | ||
1419 | continue; | ||
1420 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); | ||
1421 | set_dte_entry(i, domain); | ||
1422 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | ||
1423 | } | ||
1424 | } | ||
1425 | |||
1426 | static void update_domain(struct protection_domain *domain) | ||
1427 | { | ||
1428 | if (!domain->updated) | ||
1429 | return; | ||
1430 | |||
1431 | update_device_table(domain); | ||
1432 | flush_devices_by_domain(domain); | ||
1433 | iommu_flush_domain(domain->id); | ||
1434 | |||
1435 | domain->updated = false; | ||
1436 | } | ||
1437 | |||
1295 | /* | 1438 | /* |
1296 | * If the pte_page is not yet allocated this function is called | 1439 | * This function is used to add another level to an IO page table. Adding |
1440 | * another level increases the size of the address space by 9 bits to a size up | ||
1441 | * to 64 bits. | ||
1297 | */ | 1442 | */ |
1298 | static u64* alloc_pte(struct protection_domain *dom, | 1443 | static bool increase_address_space(struct protection_domain *domain, |
1299 | unsigned long address, u64 **pte_page, gfp_t gfp) | 1444 | gfp_t gfp) |
1445 | { | ||
1446 | u64 *pte; | ||
1447 | |||
1448 | if (domain->mode == PAGE_MODE_6_LEVEL) | ||
1449 | /* address space already 64 bit large */ | ||
1450 | return false; | ||
1451 | |||
1452 | pte = (void *)get_zeroed_page(gfp); | ||
1453 | if (!pte) | ||
1454 | return false; | ||
1455 | |||
1456 | *pte = PM_LEVEL_PDE(domain->mode, | ||
1457 | virt_to_phys(domain->pt_root)); | ||
1458 | domain->pt_root = pte; | ||
1459 | domain->mode += 1; | ||
1460 | domain->updated = true; | ||
1461 | |||
1462 | return true; | ||
1463 | } | ||
1464 | |||
1465 | static u64 *alloc_pte(struct protection_domain *domain, | ||
1466 | unsigned long address, | ||
1467 | int end_lvl, | ||
1468 | u64 **pte_page, | ||
1469 | gfp_t gfp) | ||
1300 | { | 1470 | { |
1301 | u64 *pte, *page; | 1471 | u64 *pte, *page; |
1472 | int level; | ||
1302 | 1473 | ||
1303 | pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; | 1474 | while (address > PM_LEVEL_SIZE(domain->mode)) |
1475 | increase_address_space(domain, gfp); | ||
1304 | 1476 | ||
1305 | if (!IOMMU_PTE_PRESENT(*pte)) { | 1477 | level = domain->mode - 1; |
1306 | page = (u64 *)get_zeroed_page(gfp); | 1478 | pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; |
1307 | if (!page) | ||
1308 | return NULL; | ||
1309 | *pte = IOMMU_L2_PDE(virt_to_phys(page)); | ||
1310 | } | ||
1311 | 1479 | ||
1312 | pte = IOMMU_PTE_PAGE(*pte); | 1480 | while (level > end_lvl) { |
1313 | pte = &pte[IOMMU_PTE_L1_INDEX(address)]; | 1481 | if (!IOMMU_PTE_PRESENT(*pte)) { |
1482 | page = (u64 *)get_zeroed_page(gfp); | ||
1483 | if (!page) | ||
1484 | return NULL; | ||
1485 | *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); | ||
1486 | } | ||
1314 | 1487 | ||
1315 | if (!IOMMU_PTE_PRESENT(*pte)) { | 1488 | level -= 1; |
1316 | page = (u64 *)get_zeroed_page(gfp); | ||
1317 | if (!page) | ||
1318 | return NULL; | ||
1319 | *pte = IOMMU_L1_PDE(virt_to_phys(page)); | ||
1320 | } | ||
1321 | 1489 | ||
1322 | pte = IOMMU_PTE_PAGE(*pte); | 1490 | pte = IOMMU_PTE_PAGE(*pte); |
1323 | 1491 | ||
1324 | if (pte_page) | 1492 | if (pte_page && level == end_lvl) |
1325 | *pte_page = pte; | 1493 | *pte_page = pte; |
1326 | 1494 | ||
1327 | pte = &pte[IOMMU_PTE_L0_INDEX(address)]; | 1495 | pte = &pte[PM_LEVEL_INDEX(level, address)]; |
1496 | } | ||
1328 | 1497 | ||
1329 | return pte; | 1498 | return pte; |
1330 | } | 1499 | } |
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, | |||
1344 | 1513 | ||
1345 | pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; | 1514 | pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; |
1346 | if (!pte) { | 1515 | if (!pte) { |
1347 | pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); | 1516 | pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, |
1517 | GFP_ATOMIC); | ||
1348 | aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; | 1518 | aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; |
1349 | } else | 1519 | } else |
1350 | pte += IOMMU_PTE_L0_INDEX(address); | 1520 | pte += PM_LEVEL_INDEX(0, address); |
1521 | |||
1522 | update_domain(&dom->domain); | ||
1351 | 1523 | ||
1352 | return pte; | 1524 | return pte; |
1353 | } | 1525 | } |
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, | |||
1409 | if (!pte) | 1581 | if (!pte) |
1410 | return; | 1582 | return; |
1411 | 1583 | ||
1412 | pte += IOMMU_PTE_L0_INDEX(address); | 1584 | pte += PM_LEVEL_INDEX(0, address); |
1413 | 1585 | ||
1414 | WARN_ON(!*pte); | 1586 | WARN_ON(!*pte); |
1415 | 1587 | ||
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain) | |||
1988 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | 2160 | write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); |
1989 | } | 2161 | } |
1990 | 2162 | ||
1991 | static int amd_iommu_domain_init(struct iommu_domain *dom) | 2163 | static void protection_domain_free(struct protection_domain *domain) |
2164 | { | ||
2165 | if (!domain) | ||
2166 | return; | ||
2167 | |||
2168 | if (domain->id) | ||
2169 | domain_id_free(domain->id); | ||
2170 | |||
2171 | kfree(domain); | ||
2172 | } | ||
2173 | |||
2174 | static struct protection_domain *protection_domain_alloc(void) | ||
1992 | { | 2175 | { |
1993 | struct protection_domain *domain; | 2176 | struct protection_domain *domain; |
1994 | 2177 | ||
1995 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); | 2178 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); |
1996 | if (!domain) | 2179 | if (!domain) |
1997 | return -ENOMEM; | 2180 | return NULL; |
1998 | 2181 | ||
1999 | spin_lock_init(&domain->lock); | 2182 | spin_lock_init(&domain->lock); |
2000 | domain->mode = PAGE_MODE_3_LEVEL; | ||
2001 | domain->id = domain_id_alloc(); | 2183 | domain->id = domain_id_alloc(); |
2002 | if (!domain->id) | 2184 | if (!domain->id) |
2185 | goto out_err; | ||
2186 | |||
2187 | return domain; | ||
2188 | |||
2189 | out_err: | ||
2190 | kfree(domain); | ||
2191 | |||
2192 | return NULL; | ||
2193 | } | ||
2194 | |||
2195 | static int amd_iommu_domain_init(struct iommu_domain *dom) | ||
2196 | { | ||
2197 | struct protection_domain *domain; | ||
2198 | |||
2199 | domain = protection_domain_alloc(); | ||
2200 | if (!domain) | ||
2003 | goto out_free; | 2201 | goto out_free; |
2202 | |||
2203 | domain->mode = PAGE_MODE_3_LEVEL; | ||
2004 | domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); | 2204 | domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); |
2005 | if (!domain->pt_root) | 2205 | if (!domain->pt_root) |
2006 | goto out_free; | 2206 | goto out_free; |
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) | |||
2010 | return 0; | 2210 | return 0; |
2011 | 2211 | ||
2012 | out_free: | 2212 | out_free: |
2013 | kfree(domain); | 2213 | protection_domain_free(domain); |
2014 | 2214 | ||
2015 | return -ENOMEM; | 2215 | return -ENOMEM; |
2016 | } | 2216 | } |
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom, | |||
2115 | paddr &= PAGE_MASK; | 2315 | paddr &= PAGE_MASK; |
2116 | 2316 | ||
2117 | for (i = 0; i < npages; ++i) { | 2317 | for (i = 0; i < npages; ++i) { |
2118 | ret = iommu_map_page(domain, iova, paddr, prot); | 2318 | ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); |
2119 | if (ret) | 2319 | if (ret) |
2120 | return ret; | 2320 | return ret; |
2121 | 2321 | ||
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, | |||
2136 | iova &= PAGE_MASK; | 2336 | iova &= PAGE_MASK; |
2137 | 2337 | ||
2138 | for (i = 0; i < npages; ++i) { | 2338 | for (i = 0; i < npages; ++i) { |
2139 | iommu_unmap_page(domain, iova); | 2339 | iommu_unmap_page(domain, iova, PM_MAP_4k); |
2140 | iova += PAGE_SIZE; | 2340 | iova += PAGE_SIZE; |
2141 | } | 2341 | } |
2142 | 2342 | ||
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, | |||
2151 | phys_addr_t paddr; | 2351 | phys_addr_t paddr; |
2152 | u64 *pte; | 2352 | u64 *pte; |
2153 | 2353 | ||
2154 | pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; | 2354 | pte = fetch_pte(domain, iova, PM_MAP_4k); |
2155 | |||
2156 | if (!IOMMU_PTE_PRESENT(*pte)) | ||
2157 | return 0; | ||
2158 | |||
2159 | pte = IOMMU_PTE_PAGE(*pte); | ||
2160 | pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; | ||
2161 | |||
2162 | if (!IOMMU_PTE_PRESENT(*pte)) | ||
2163 | return 0; | ||
2164 | |||
2165 | pte = IOMMU_PTE_PAGE(*pte); | ||
2166 | pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; | ||
2167 | 2355 | ||
2168 | if (!IOMMU_PTE_PRESENT(*pte)) | 2356 | if (!pte || !IOMMU_PTE_PRESENT(*pte)) |
2169 | return 0; | 2357 | return 0; |
2170 | 2358 | ||
2171 | paddr = *pte & IOMMU_PAGE_MASK; | 2359 | paddr = *pte & IOMMU_PAGE_MASK; |
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = { | |||
2191 | .domain_has_cap = amd_iommu_domain_has_cap, | 2379 | .domain_has_cap = amd_iommu_domain_has_cap, |
2192 | }; | 2380 | }; |
2193 | 2381 | ||
2382 | /***************************************************************************** | ||
2383 | * | ||
2384 | * The next functions do a basic initialization of IOMMU for pass through | ||
2385 | * mode | ||
2386 | * | ||
2387 | * In passthrough mode the IOMMU is initialized and enabled but not used for | ||
2388 | * DMA-API translation. | ||
2389 | * | ||
2390 | *****************************************************************************/ | ||
2391 | |||
2392 | int __init amd_iommu_init_passthrough(void) | ||
2393 | { | ||
2394 | struct pci_dev *dev = NULL; | ||
2395 | u16 devid, devid2; | ||
2396 | |||
2397 | /* allocate passthrough domain */ | ||
2398 | pt_domain = protection_domain_alloc(); | ||
2399 | if (!pt_domain) | ||
2400 | return -ENOMEM; | ||
2401 | |||
2402 | pt_domain->mode |= PAGE_MODE_NONE; | ||
2403 | |||
2404 | while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { | ||
2405 | struct amd_iommu *iommu; | ||
2406 | |||
2407 | devid = calc_devid(dev->bus->number, dev->devfn); | ||
2408 | if (devid > amd_iommu_last_bdf) | ||
2409 | continue; | ||
2410 | |||
2411 | devid2 = amd_iommu_alias_table[devid]; | ||
2412 | |||
2413 | iommu = amd_iommu_rlookup_table[devid2]; | ||
2414 | if (!iommu) | ||
2415 | continue; | ||
2416 | |||
2417 | __attach_device(iommu, pt_domain, devid); | ||
2418 | __attach_device(iommu, pt_domain, devid2); | ||
2419 | } | ||
2420 | |||
2421 | pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); | ||
2422 | |||
2423 | return 0; | ||
2424 | } | ||
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c1b17e97252..b4b61d462dc 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c | |||
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) | |||
252 | /* Function to enable the hardware */ | 252 | /* Function to enable the hardware */ |
253 | static void iommu_enable(struct amd_iommu *iommu) | 253 | static void iommu_enable(struct amd_iommu *iommu) |
254 | { | 254 | { |
255 | printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", | 255 | printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", |
256 | dev_name(&iommu->dev->dev), iommu->cap_ptr); | 256 | dev_name(&iommu->dev->dev), iommu->cap_ptr); |
257 | 257 | ||
258 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); | 258 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); |
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) | |||
435 | } | 435 | } |
436 | 436 | ||
437 | /* | 437 | /* |
438 | * This function resets the command buffer if the IOMMU stopped fetching | ||
439 | * commands from it. | ||
440 | */ | ||
441 | void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) | ||
442 | { | ||
443 | iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); | ||
444 | |||
445 | writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); | ||
446 | writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); | ||
447 | |||
448 | iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); | ||
449 | } | ||
450 | |||
451 | /* | ||
438 | * This function writes the command buffer address to the hardware and | 452 | * This function writes the command buffer address to the hardware and |
439 | * enables it. | 453 | * enables it. |
440 | */ | 454 | */ |
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) | |||
450 | memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, | 464 | memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, |
451 | &entry, sizeof(entry)); | 465 | &entry, sizeof(entry)); |
452 | 466 | ||
453 | /* set head and tail to zero manually */ | 467 | amd_iommu_reset_cmd_buffer(iommu); |
454 | writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); | ||
455 | writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); | ||
456 | |||
457 | iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); | ||
458 | } | 468 | } |
459 | 469 | ||
460 | static void __init free_command_buffer(struct amd_iommu *iommu) | 470 | static void __init free_command_buffer(struct amd_iommu *iommu) |
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) | |||
858 | switch (*p) { | 868 | switch (*p) { |
859 | case ACPI_IVHD_TYPE: | 869 | case ACPI_IVHD_TYPE: |
860 | 870 | ||
861 | DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " | 871 | DUMP_printk("device: %02x:%02x.%01x cap: %04x " |
862 | "seg: %d flags: %01x info %04x\n", | 872 | "seg: %d flags: %01x info %04x\n", |
863 | PCI_BUS(h->devid), PCI_SLOT(h->devid), | 873 | PCI_BUS(h->devid), PCI_SLOT(h->devid), |
864 | PCI_FUNC(h->devid), h->cap_ptr, | 874 | PCI_FUNC(h->devid), h->cap_ptr, |
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) | |||
902 | 912 | ||
903 | r = request_irq(iommu->dev->irq, amd_iommu_int_handler, | 913 | r = request_irq(iommu->dev->irq, amd_iommu_int_handler, |
904 | IRQF_SAMPLE_RANDOM, | 914 | IRQF_SAMPLE_RANDOM, |
905 | "AMD IOMMU", | 915 | "AMD-Vi", |
906 | NULL); | 916 | NULL); |
907 | 917 | ||
908 | if (r) { | 918 | if (r) { |
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void) | |||
1150 | 1160 | ||
1151 | 1161 | ||
1152 | if (no_iommu) { | 1162 | if (no_iommu) { |
1153 | printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); | 1163 | printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); |
1154 | return 0; | 1164 | return 0; |
1155 | } | 1165 | } |
1156 | 1166 | ||
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void) | |||
1242 | if (ret) | 1252 | if (ret) |
1243 | goto free; | 1253 | goto free; |
1244 | 1254 | ||
1245 | ret = amd_iommu_init_dma_ops(); | 1255 | if (iommu_pass_through) |
1256 | ret = amd_iommu_init_passthrough(); | ||
1257 | else | ||
1258 | ret = amd_iommu_init_dma_ops(); | ||
1246 | if (ret) | 1259 | if (ret) |
1247 | goto free; | 1260 | goto free; |
1248 | 1261 | ||
1249 | enable_iommus(); | 1262 | enable_iommus(); |
1250 | 1263 | ||
1251 | printk(KERN_INFO "AMD IOMMU: device isolation "); | 1264 | if (iommu_pass_through) |
1265 | goto out; | ||
1266 | |||
1267 | printk(KERN_INFO "AMD-Vi: device isolation "); | ||
1252 | if (amd_iommu_isolate) | 1268 | if (amd_iommu_isolate) |
1253 | printk("enabled\n"); | 1269 | printk("enabled\n"); |
1254 | else | 1270 | else |
1255 | printk("disabled\n"); | 1271 | printk("disabled\n"); |
1256 | 1272 | ||
1257 | if (amd_iommu_unmap_flush) | 1273 | if (amd_iommu_unmap_flush) |
1258 | printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); | 1274 | printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); |
1259 | else | 1275 | else |
1260 | printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); | 1276 | printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); |
1261 | 1277 | ||
1262 | out: | 1278 | out: |
1263 | return ret; | 1279 | return ret; |
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debfc170..128111d8ffe 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
21 | #include <linux/ioport.h> | 21 | #include <linux/ioport.h> |
22 | #include <linux/suspend.h> | 22 | #include <linux/suspend.h> |
23 | #include <linux/kmemleak.h> | ||
23 | #include <asm/e820.h> | 24 | #include <asm/e820.h> |
24 | #include <asm/io.h> | 25 | #include <asm/io.h> |
25 | #include <asm/iommu.h> | 26 | #include <asm/iommu.h> |
@@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) | |||
94 | * code for safe | 95 | * code for safe |
95 | */ | 96 | */ |
96 | p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); | 97 | p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); |
98 | /* | ||
99 | * Kmemleak should not scan this block as it may not be mapped via the | ||
100 | * kernel direct mapping. | ||
101 | */ | ||
102 | kmemleak_ignore(p); | ||
97 | if (!p || __pa(p)+aper_size > 0xffffffff) { | 103 | if (!p || __pa(p)+aper_size > 0xffffffff) { |
98 | printk(KERN_ERR | 104 | printk(KERN_ERR |
99 | "Cannot allocate aperture memory hole (%p,%uK)\n", | 105 | "Cannot allocate aperture memory hole (%p,%uK)\n", |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..894aa97f071 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -14,7 +14,7 @@ | |||
14 | * Mikael Pettersson : PM converted to driver model. | 14 | * Mikael Pettersson : PM converted to driver model. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/perf_counter.h> | 17 | #include <linux/perf_event.h> |
18 | #include <linux/kernel_stat.h> | 18 | #include <linux/kernel_stat.h> |
19 | #include <linux/mc146818rtc.h> | 19 | #include <linux/mc146818rtc.h> |
20 | #include <linux/acpi_pmtmr.h> | 20 | #include <linux/acpi_pmtmr.h> |
@@ -35,7 +35,8 @@ | |||
35 | #include <linux/smp.h> | 35 | #include <linux/smp.h> |
36 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
37 | 37 | ||
38 | #include <asm/perf_counter.h> | 38 | #include <asm/perf_event.h> |
39 | #include <asm/x86_init.h> | ||
39 | #include <asm/pgalloc.h> | 40 | #include <asm/pgalloc.h> |
40 | #include <asm/atomic.h> | 41 | #include <asm/atomic.h> |
41 | #include <asm/mpspec.h> | 42 | #include <asm/mpspec.h> |
@@ -49,6 +50,7 @@ | |||
49 | #include <asm/mtrr.h> | 50 | #include <asm/mtrr.h> |
50 | #include <asm/smp.h> | 51 | #include <asm/smp.h> |
51 | #include <asm/mce.h> | 52 | #include <asm/mce.h> |
53 | #include <asm/kvm_para.h> | ||
52 | 54 | ||
53 | unsigned int num_processors; | 55 | unsigned int num_processors; |
54 | 56 | ||
@@ -60,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; | |||
60 | /* | 62 | /* |
61 | * The highest APIC ID seen during enumeration. | 63 | * The highest APIC ID seen during enumeration. |
62 | * | 64 | * |
63 | * This determines the messaging protocol we can use: if all APIC IDs | 65 | * On AMD, this determines the messaging protocol we can use: if all APIC IDs |
64 | * are in the 0 ... 7 range, then we can use logical addressing which | 66 | * are in the 0 ... 7 range, then we can use logical addressing which |
65 | * has some performance advantages (better broadcasting). | 67 | * has some performance advantages (better broadcasting). |
66 | * | 68 | * |
@@ -977,7 +979,7 @@ void lapic_shutdown(void) | |||
977 | { | 979 | { |
978 | unsigned long flags; | 980 | unsigned long flags; |
979 | 981 | ||
980 | if (!cpu_has_apic) | 982 | if (!cpu_has_apic && !apic_from_smp_config()) |
981 | return; | 983 | return; |
982 | 984 | ||
983 | local_irq_save(flags); | 985 | local_irq_save(flags); |
@@ -1187,7 +1189,7 @@ void __cpuinit setup_local_APIC(void) | |||
1187 | apic_write(APIC_ESR, 0); | 1189 | apic_write(APIC_ESR, 0); |
1188 | } | 1190 | } |
1189 | #endif | 1191 | #endif |
1190 | perf_counters_lapic_init(); | 1192 | perf_events_lapic_init(); |
1191 | 1193 | ||
1192 | preempt_disable(); | 1194 | preempt_disable(); |
1193 | 1195 | ||
@@ -1195,8 +1197,7 @@ void __cpuinit setup_local_APIC(void) | |||
1195 | * Double-check whether this APIC is really registered. | 1197 | * Double-check whether this APIC is really registered. |
1196 | * This is meaningless in clustered apic mode, so we skip it. | 1198 | * This is meaningless in clustered apic mode, so we skip it. |
1197 | */ | 1199 | */ |
1198 | if (!apic->apic_id_registered()) | 1200 | BUG_ON(!apic->apic_id_registered()); |
1199 | BUG(); | ||
1200 | 1201 | ||
1201 | /* | 1202 | /* |
1202 | * Intel recommends to set DFR, LDR and TPR before enabling | 1203 | * Intel recommends to set DFR, LDR and TPR before enabling |
@@ -1361,52 +1362,80 @@ void enable_x2apic(void) | |||
1361 | } | 1362 | } |
1362 | #endif /* CONFIG_X86_X2APIC */ | 1363 | #endif /* CONFIG_X86_X2APIC */ |
1363 | 1364 | ||
1364 | void __init enable_IR_x2apic(void) | 1365 | int __init enable_IR(void) |
1365 | { | 1366 | { |
1366 | #ifdef CONFIG_INTR_REMAP | 1367 | #ifdef CONFIG_INTR_REMAP |
1367 | int ret; | ||
1368 | unsigned long flags; | ||
1369 | struct IO_APIC_route_entry **ioapic_entries = NULL; | ||
1370 | |||
1371 | ret = dmar_table_init(); | ||
1372 | if (ret) { | ||
1373 | pr_debug("dmar_table_init() failed with %d:\n", ret); | ||
1374 | goto ir_failed; | ||
1375 | } | ||
1376 | |||
1377 | if (!intr_remapping_supported()) { | 1368 | if (!intr_remapping_supported()) { |
1378 | pr_debug("intr-remapping not supported\n"); | 1369 | pr_debug("intr-remapping not supported\n"); |
1379 | goto ir_failed; | 1370 | return 0; |
1380 | } | 1371 | } |
1381 | 1372 | ||
1382 | |||
1383 | if (!x2apic_preenabled && skip_ioapic_setup) { | 1373 | if (!x2apic_preenabled && skip_ioapic_setup) { |
1384 | pr_info("Skipped enabling intr-remap because of skipping " | 1374 | pr_info("Skipped enabling intr-remap because of skipping " |
1385 | "io-apic setup\n"); | 1375 | "io-apic setup\n"); |
1386 | return; | 1376 | return 0; |
1387 | } | 1377 | } |
1388 | 1378 | ||
1379 | if (enable_intr_remapping(x2apic_supported())) | ||
1380 | return 0; | ||
1381 | |||
1382 | pr_info("Enabled Interrupt-remapping\n"); | ||
1383 | |||
1384 | return 1; | ||
1385 | |||
1386 | #endif | ||
1387 | return 0; | ||
1388 | } | ||
1389 | |||
1390 | void __init enable_IR_x2apic(void) | ||
1391 | { | ||
1392 | unsigned long flags; | ||
1393 | struct IO_APIC_route_entry **ioapic_entries = NULL; | ||
1394 | int ret, x2apic_enabled = 0; | ||
1395 | int dmar_table_init_ret = 0; | ||
1396 | |||
1397 | #ifdef CONFIG_INTR_REMAP | ||
1398 | dmar_table_init_ret = dmar_table_init(); | ||
1399 | if (dmar_table_init_ret) | ||
1400 | pr_debug("dmar_table_init() failed with %d:\n", | ||
1401 | dmar_table_init_ret); | ||
1402 | #endif | ||
1403 | |||
1389 | ioapic_entries = alloc_ioapic_entries(); | 1404 | ioapic_entries = alloc_ioapic_entries(); |
1390 | if (!ioapic_entries) { | 1405 | if (!ioapic_entries) { |
1391 | pr_info("Allocate ioapic_entries failed: %d\n", ret); | 1406 | pr_err("Allocate ioapic_entries failed\n"); |
1392 | goto end; | 1407 | goto out; |
1393 | } | 1408 | } |
1394 | 1409 | ||
1395 | ret = save_IO_APIC_setup(ioapic_entries); | 1410 | ret = save_IO_APIC_setup(ioapic_entries); |
1396 | if (ret) { | 1411 | if (ret) { |
1397 | pr_info("Saving IO-APIC state failed: %d\n", ret); | 1412 | pr_info("Saving IO-APIC state failed: %d\n", ret); |
1398 | goto end; | 1413 | goto out; |
1399 | } | 1414 | } |
1400 | 1415 | ||
1401 | local_irq_save(flags); | 1416 | local_irq_save(flags); |
1402 | mask_IO_APIC_setup(ioapic_entries); | ||
1403 | mask_8259A(); | 1417 | mask_8259A(); |
1418 | mask_IO_APIC_setup(ioapic_entries); | ||
1404 | 1419 | ||
1405 | ret = enable_intr_remapping(x2apic_supported()); | 1420 | if (dmar_table_init_ret) |
1406 | if (ret) | 1421 | ret = 0; |
1407 | goto end_restore; | 1422 | else |
1423 | ret = enable_IR(); | ||
1408 | 1424 | ||
1409 | pr_info("Enabled Interrupt-remapping\n"); | 1425 | if (!ret) { |
1426 | /* IR is required if there is APIC ID > 255 even when running | ||
1427 | * under KVM | ||
1428 | */ | ||
1429 | if (max_physical_apicid > 255 || !kvm_para_available()) | ||
1430 | goto nox2apic; | ||
1431 | /* | ||
1432 | * without IR all CPUs can be addressed by IOAPIC/MSI | ||
1433 | * only in physical mode | ||
1434 | */ | ||
1435 | x2apic_force_phys(); | ||
1436 | } | ||
1437 | |||
1438 | x2apic_enabled = 1; | ||
1410 | 1439 | ||
1411 | if (x2apic_supported() && !x2apic_mode) { | 1440 | if (x2apic_supported() && !x2apic_mode) { |
1412 | x2apic_mode = 1; | 1441 | x2apic_mode = 1; |
@@ -1414,41 +1443,25 @@ void __init enable_IR_x2apic(void) | |||
1414 | pr_info("Enabled x2apic\n"); | 1443 | pr_info("Enabled x2apic\n"); |
1415 | } | 1444 | } |
1416 | 1445 | ||
1417 | end_restore: | 1446 | nox2apic: |
1418 | if (ret) | 1447 | if (!ret) /* IR enabling failed */ |
1419 | /* | ||
1420 | * IR enabling failed | ||
1421 | */ | ||
1422 | restore_IO_APIC_setup(ioapic_entries); | 1448 | restore_IO_APIC_setup(ioapic_entries); |
1423 | |||
1424 | unmask_8259A(); | 1449 | unmask_8259A(); |
1425 | local_irq_restore(flags); | 1450 | local_irq_restore(flags); |
1426 | 1451 | ||
1427 | end: | 1452 | out: |
1428 | if (ioapic_entries) | 1453 | if (ioapic_entries) |
1429 | free_ioapic_entries(ioapic_entries); | 1454 | free_ioapic_entries(ioapic_entries); |
1430 | 1455 | ||
1431 | if (!ret) | 1456 | if (x2apic_enabled) |
1432 | return; | 1457 | return; |
1433 | 1458 | ||
1434 | ir_failed: | ||
1435 | if (x2apic_preenabled) | 1459 | if (x2apic_preenabled) |
1436 | panic("x2apic enabled by bios. But IR enabling failed"); | 1460 | panic("x2apic: enabled by BIOS but kernel init failed."); |
1437 | else if (cpu_has_x2apic) | 1461 | else if (cpu_has_x2apic) |
1438 | pr_info("Not enabling x2apic,Intr-remapping\n"); | 1462 | pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); |
1439 | #else | ||
1440 | if (!cpu_has_x2apic) | ||
1441 | return; | ||
1442 | |||
1443 | if (x2apic_preenabled) | ||
1444 | panic("x2apic enabled prior OS handover," | ||
1445 | " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); | ||
1446 | #endif | ||
1447 | |||
1448 | return; | ||
1449 | } | 1463 | } |
1450 | 1464 | ||
1451 | |||
1452 | #ifdef CONFIG_X86_64 | 1465 | #ifdef CONFIG_X86_64 |
1453 | /* | 1466 | /* |
1454 | * Detect and enable local APICs on non-SMP boards. | 1467 | * Detect and enable local APICs on non-SMP boards. |
@@ -1549,8 +1562,6 @@ no_apic: | |||
1549 | #ifdef CONFIG_X86_64 | 1562 | #ifdef CONFIG_X86_64 |
1550 | void __init early_init_lapic_mapping(void) | 1563 | void __init early_init_lapic_mapping(void) |
1551 | { | 1564 | { |
1552 | unsigned long phys_addr; | ||
1553 | |||
1554 | /* | 1565 | /* |
1555 | * If no local APIC can be found then go out | 1566 | * If no local APIC can be found then go out |
1556 | * : it means there is no mptable and MADT | 1567 | * : it means there is no mptable and MADT |
@@ -1558,11 +1569,9 @@ void __init early_init_lapic_mapping(void) | |||
1558 | if (!smp_found_config) | 1569 | if (!smp_found_config) |
1559 | return; | 1570 | return; |
1560 | 1571 | ||
1561 | phys_addr = mp_lapic_addr; | 1572 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); |
1562 | |||
1563 | set_fixmap_nocache(FIX_APIC_BASE, phys_addr); | ||
1564 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | 1573 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", |
1565 | APIC_BASE, phys_addr); | 1574 | APIC_BASE, mp_lapic_addr); |
1566 | 1575 | ||
1567 | /* | 1576 | /* |
1568 | * Fetch the APIC ID of the BSP in case we have a | 1577 | * Fetch the APIC ID of the BSP in case we have a |
@@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void) | |||
1651 | APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { | 1660 | APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { |
1652 | pr_err("BIOS bug, local APIC 0x%x not detected!...\n", | 1661 | pr_err("BIOS bug, local APIC 0x%x not detected!...\n", |
1653 | boot_cpu_physical_apicid); | 1662 | boot_cpu_physical_apicid); |
1654 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | ||
1655 | return -1; | 1663 | return -1; |
1656 | } | 1664 | } |
1657 | #endif | 1665 | #endif |
@@ -1701,7 +1709,7 @@ int __init APIC_init_uniprocessor(void) | |||
1701 | localise_nmi_watchdog(); | 1709 | localise_nmi_watchdog(); |
1702 | #endif | 1710 | #endif |
1703 | 1711 | ||
1704 | setup_boot_clock(); | 1712 | x86_init.timers.setup_percpu_clockev(); |
1705 | #ifdef CONFIG_X86_64 | 1713 | #ifdef CONFIG_X86_64 |
1706 | check_nmi_watchdog(); | 1714 | check_nmi_watchdog(); |
1707 | #endif | 1715 | #endif |
@@ -1908,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1908 | max_physical_apicid = apicid; | 1916 | max_physical_apicid = apicid; |
1909 | 1917 | ||
1910 | #ifdef CONFIG_X86_32 | 1918 | #ifdef CONFIG_X86_32 |
1911 | /* | 1919 | switch (boot_cpu_data.x86_vendor) { |
1912 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | 1920 | case X86_VENDOR_INTEL: |
1913 | * but we need to work other dependencies like SMP_SUSPEND etc | 1921 | if (num_processors > 8) |
1914 | * before this can be done without some confusion. | 1922 | def_to_bigsmp = 1; |
1915 | * if (CPU_HOTPLUG_ENABLED || num_processors > 8) | 1923 | break; |
1916 | * - Ashok Raj <ashok.raj@intel.com> | 1924 | case X86_VENDOR_AMD: |
1917 | */ | 1925 | if (max_physical_apicid >= 8) |
1918 | if (max_physical_apicid >= 8) { | ||
1919 | switch (boot_cpu_data.x86_vendor) { | ||
1920 | case X86_VENDOR_INTEL: | ||
1921 | if (!APIC_XAPIC(version)) { | ||
1922 | def_to_bigsmp = 0; | ||
1923 | break; | ||
1924 | } | ||
1925 | /* If P4 and above fall through */ | ||
1926 | case X86_VENDOR_AMD: | ||
1927 | def_to_bigsmp = 1; | 1926 | def_to_bigsmp = 1; |
1928 | } | ||
1929 | } | 1927 | } |
1930 | #endif | 1928 | #endif |
1931 | 1929 | ||
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..77a06413b6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c | |||
@@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) | |||
112 | return physids_promote(0xFFL); | 112 | return physids_promote(0xFFL); |
113 | } | 113 | } |
114 | 114 | ||
115 | static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) | 115 | static int bigsmp_check_phys_apicid_present(int phys_apicid) |
116 | { | 116 | { |
117 | return 1; | 117 | return 1; |
118 | } | 118 | } |
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8952a589028..89174f847b4 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c | |||
@@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void) | |||
167 | { | 167 | { |
168 | /* MPENTIUMIII */ | 168 | /* MPENTIUMIII */ |
169 | if (boot_cpu_data.x86 == 6 && | 169 | if (boot_cpu_data.x86 == 6 && |
170 | (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) | 170 | (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) |
171 | return 1; | 171 | return 1; |
172 | 172 | ||
173 | return 0; | 173 | return 0; |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc8..dc69f28489f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -66,6 +66,8 @@ | |||
66 | #include <asm/apic.h> | 66 | #include <asm/apic.h> |
67 | 67 | ||
68 | #define __apicdebuginit(type) static type __init | 68 | #define __apicdebuginit(type) static type __init |
69 | #define for_each_irq_pin(entry, head) \ | ||
70 | for (entry = head; entry; entry = entry->next) | ||
69 | 71 | ||
70 | /* | 72 | /* |
71 | * Is the SiS APIC rmw bug present ? | 73 | * Is the SiS APIC rmw bug present ? |
@@ -85,12 +87,20 @@ int nr_ioapic_registers[MAX_IO_APICS]; | |||
85 | struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; | 87 | struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; |
86 | int nr_ioapics; | 88 | int nr_ioapics; |
87 | 89 | ||
90 | /* IO APIC gsi routing info */ | ||
91 | struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; | ||
92 | |||
88 | /* MP IRQ source entries */ | 93 | /* MP IRQ source entries */ |
89 | struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; | 94 | struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; |
90 | 95 | ||
91 | /* # of MP IRQ source entries */ | 96 | /* # of MP IRQ source entries */ |
92 | int mp_irq_entries; | 97 | int mp_irq_entries; |
93 | 98 | ||
99 | /* Number of legacy interrupts */ | ||
100 | static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; | ||
101 | /* GSI interrupts */ | ||
102 | static int nr_irqs_gsi = NR_IRQS_LEGACY; | ||
103 | |||
94 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) | 104 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) |
95 | int mp_bus_id_to_type[MAX_MP_BUSSES]; | 105 | int mp_bus_id_to_type[MAX_MP_BUSSES]; |
96 | #endif | 106 | #endif |
@@ -116,15 +126,6 @@ static int __init parse_noapic(char *str) | |||
116 | } | 126 | } |
117 | early_param("noapic", parse_noapic); | 127 | early_param("noapic", parse_noapic); |
118 | 128 | ||
119 | struct irq_pin_list; | ||
120 | |||
121 | /* | ||
122 | * This is performance-critical, we want to do it O(1) | ||
123 | * | ||
124 | * the indexing order of this array favors 1:1 mappings | ||
125 | * between pins and IRQs. | ||
126 | */ | ||
127 | |||
128 | struct irq_pin_list { | 129 | struct irq_pin_list { |
129 | int apic, pin; | 130 | int apic, pin; |
130 | struct irq_pin_list *next; | 131 | struct irq_pin_list *next; |
@@ -139,6 +140,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) | |||
139 | return pin; | 140 | return pin; |
140 | } | 141 | } |
141 | 142 | ||
143 | /* | ||
144 | * This is performance-critical, we want to do it O(1) | ||
145 | * | ||
146 | * Most irqs are mapped 1:1 with pins. | ||
147 | */ | ||
142 | struct irq_cfg { | 148 | struct irq_cfg { |
143 | struct irq_pin_list *irq_2_pin; | 149 | struct irq_pin_list *irq_2_pin; |
144 | cpumask_var_t domain; | 150 | cpumask_var_t domain; |
@@ -172,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { | |||
172 | [15] = { .vector = IRQ15_VECTOR, }, | 178 | [15] = { .vector = IRQ15_VECTOR, }, |
173 | }; | 179 | }; |
174 | 180 | ||
181 | void __init io_apic_disable_legacy(void) | ||
182 | { | ||
183 | nr_legacy_irqs = 0; | ||
184 | nr_irqs_gsi = 0; | ||
185 | } | ||
186 | |||
175 | int __init arch_early_irq_init(void) | 187 | int __init arch_early_irq_init(void) |
176 | { | 188 | { |
177 | struct irq_cfg *cfg; | 189 | struct irq_cfg *cfg; |
@@ -189,7 +201,7 @@ int __init arch_early_irq_init(void) | |||
189 | desc->chip_data = &cfg[i]; | 201 | desc->chip_data = &cfg[i]; |
190 | zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); | 202 | zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); |
191 | zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); | 203 | zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); |
192 | if (i < NR_IRQS_LEGACY) | 204 | if (i < nr_legacy_irqs) |
193 | cpumask_setall(cfg[i].domain); | 205 | cpumask_setall(cfg[i].domain); |
194 | } | 206 | } |
195 | 207 | ||
@@ -215,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) | |||
215 | 227 | ||
216 | cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); | 228 | cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); |
217 | if (cfg) { | 229 | if (cfg) { |
218 | if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { | 230 | if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { |
219 | kfree(cfg); | 231 | kfree(cfg); |
220 | cfg = NULL; | 232 | cfg = NULL; |
221 | } else if (!alloc_cpumask_var_node(&cfg->old_domain, | 233 | } else if (!zalloc_cpumask_var_node(&cfg->old_domain, |
222 | GFP_ATOMIC, node)) { | 234 | GFP_ATOMIC, node)) { |
223 | free_cpumask_var(cfg->domain); | 235 | free_cpumask_var(cfg->domain); |
224 | kfree(cfg); | 236 | kfree(cfg); |
225 | cfg = NULL; | 237 | cfg = NULL; |
226 | } else { | ||
227 | cpumask_clear(cfg->domain); | ||
228 | cpumask_clear(cfg->old_domain); | ||
229 | } | 238 | } |
230 | } | 239 | } |
231 | 240 | ||
@@ -414,13 +423,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) | |||
414 | unsigned long flags; | 423 | unsigned long flags; |
415 | 424 | ||
416 | spin_lock_irqsave(&ioapic_lock, flags); | 425 | spin_lock_irqsave(&ioapic_lock, flags); |
417 | entry = cfg->irq_2_pin; | 426 | for_each_irq_pin(entry, cfg->irq_2_pin) { |
418 | for (;;) { | ||
419 | unsigned int reg; | 427 | unsigned int reg; |
420 | int pin; | 428 | int pin; |
421 | 429 | ||
422 | if (!entry) | ||
423 | break; | ||
424 | pin = entry->pin; | 430 | pin = entry->pin; |
425 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | 431 | reg = io_apic_read(entry->apic, 0x10 + pin*2); |
426 | /* Is the remote IRR bit set? */ | 432 | /* Is the remote IRR bit set? */ |
@@ -428,9 +434,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) | |||
428 | spin_unlock_irqrestore(&ioapic_lock, flags); | 434 | spin_unlock_irqrestore(&ioapic_lock, flags); |
429 | return true; | 435 | return true; |
430 | } | 436 | } |
431 | if (!entry->next) | ||
432 | break; | ||
433 | entry = entry->next; | ||
434 | } | 437 | } |
435 | spin_unlock_irqrestore(&ioapic_lock, flags); | 438 | spin_unlock_irqrestore(&ioapic_lock, flags); |
436 | 439 | ||
@@ -498,72 +501,68 @@ static void ioapic_mask_entry(int apic, int pin) | |||
498 | * shared ISA-space IRQs, so we have to support them. We are super | 501 | * shared ISA-space IRQs, so we have to support them. We are super |
499 | * fast in the common case, and fast for shared ISA-space IRQs. | 502 | * fast in the common case, and fast for shared ISA-space IRQs. |
500 | */ | 503 | */ |
501 | static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) | 504 | static int |
505 | add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) | ||
502 | { | 506 | { |
503 | struct irq_pin_list *entry; | 507 | struct irq_pin_list **last, *entry; |
504 | 508 | ||
505 | entry = cfg->irq_2_pin; | 509 | /* don't allow duplicates */ |
506 | if (!entry) { | 510 | last = &cfg->irq_2_pin; |
507 | entry = get_one_free_irq_2_pin(node); | 511 | for_each_irq_pin(entry, cfg->irq_2_pin) { |
508 | if (!entry) { | ||
509 | printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", | ||
510 | apic, pin); | ||
511 | return; | ||
512 | } | ||
513 | cfg->irq_2_pin = entry; | ||
514 | entry->apic = apic; | ||
515 | entry->pin = pin; | ||
516 | return; | ||
517 | } | ||
518 | |||
519 | while (entry->next) { | ||
520 | /* not again, please */ | ||
521 | if (entry->apic == apic && entry->pin == pin) | 512 | if (entry->apic == apic && entry->pin == pin) |
522 | return; | 513 | return 0; |
523 | 514 | last = &entry->next; | |
524 | entry = entry->next; | ||
525 | } | 515 | } |
526 | 516 | ||
527 | entry->next = get_one_free_irq_2_pin(node); | 517 | entry = get_one_free_irq_2_pin(node); |
528 | entry = entry->next; | 518 | if (!entry) { |
519 | printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", | ||
520 | node, apic, pin); | ||
521 | return -ENOMEM; | ||
522 | } | ||
529 | entry->apic = apic; | 523 | entry->apic = apic; |
530 | entry->pin = pin; | 524 | entry->pin = pin; |
525 | |||
526 | *last = entry; | ||
527 | return 0; | ||
528 | } | ||
529 | |||
530 | static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) | ||
531 | { | ||
532 | if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) | ||
533 | panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); | ||
531 | } | 534 | } |
532 | 535 | ||
533 | /* | 536 | /* |
534 | * Reroute an IRQ to a different pin. | 537 | * Reroute an IRQ to a different pin. |
535 | */ | 538 | */ |
536 | static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, | 539 | static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, |
537 | int oldapic, int oldpin, | 540 | int oldapic, int oldpin, |
538 | int newapic, int newpin) | 541 | int newapic, int newpin) |
539 | { | 542 | { |
540 | struct irq_pin_list *entry = cfg->irq_2_pin; | 543 | struct irq_pin_list *entry; |
541 | int replaced = 0; | ||
542 | 544 | ||
543 | while (entry) { | 545 | for_each_irq_pin(entry, cfg->irq_2_pin) { |
544 | if (entry->apic == oldapic && entry->pin == oldpin) { | 546 | if (entry->apic == oldapic && entry->pin == oldpin) { |
545 | entry->apic = newapic; | 547 | entry->apic = newapic; |
546 | entry->pin = newpin; | 548 | entry->pin = newpin; |
547 | replaced = 1; | ||
548 | /* every one is different, right? */ | 549 | /* every one is different, right? */ |
549 | break; | 550 | return; |
550 | } | 551 | } |
551 | entry = entry->next; | ||
552 | } | 552 | } |
553 | 553 | ||
554 | /* why? call replace before add? */ | 554 | /* old apic/pin didn't exist, so just add new ones */ |
555 | if (!replaced) | 555 | add_pin_to_irq_node(cfg, node, newapic, newpin); |
556 | add_pin_to_irq_node(cfg, node, newapic, newpin); | ||
557 | } | 556 | } |
558 | 557 | ||
559 | static inline void io_apic_modify_irq(struct irq_cfg *cfg, | 558 | static void io_apic_modify_irq(struct irq_cfg *cfg, |
560 | int mask_and, int mask_or, | 559 | int mask_and, int mask_or, |
561 | void (*final)(struct irq_pin_list *entry)) | 560 | void (*final)(struct irq_pin_list *entry)) |
562 | { | 561 | { |
563 | int pin; | 562 | int pin; |
564 | struct irq_pin_list *entry; | 563 | struct irq_pin_list *entry; |
565 | 564 | ||
566 | for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { | 565 | for_each_irq_pin(entry, cfg->irq_2_pin) { |
567 | unsigned int reg; | 566 | unsigned int reg; |
568 | pin = entry->pin; | 567 | pin = entry->pin; |
569 | reg = io_apic_read(entry->apic, 0x10 + pin * 2); | 568 | reg = io_apic_read(entry->apic, 0x10 + pin * 2); |
@@ -580,7 +579,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) | |||
580 | io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); | 579 | io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); |
581 | } | 580 | } |
582 | 581 | ||
583 | #ifdef CONFIG_X86_64 | ||
584 | static void io_apic_sync(struct irq_pin_list *entry) | 582 | static void io_apic_sync(struct irq_pin_list *entry) |
585 | { | 583 | { |
586 | /* | 584 | /* |
@@ -596,11 +594,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) | |||
596 | { | 594 | { |
597 | io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); | 595 | io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); |
598 | } | 596 | } |
599 | #else /* CONFIG_X86_32 */ | ||
600 | static void __mask_IO_APIC_irq(struct irq_cfg *cfg) | ||
601 | { | ||
602 | io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); | ||
603 | } | ||
604 | 597 | ||
605 | static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) | 598 | static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) |
606 | { | 599 | { |
@@ -613,7 +606,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) | |||
613 | io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, | 606 | io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, |
614 | IO_APIC_REDIR_LEVEL_TRIGGER, NULL); | 607 | IO_APIC_REDIR_LEVEL_TRIGGER, NULL); |
615 | } | 608 | } |
616 | #endif /* CONFIG_X86_32 */ | ||
617 | 609 | ||
618 | static void mask_IO_APIC_irq_desc(struct irq_desc *desc) | 610 | static void mask_IO_APIC_irq_desc(struct irq_desc *desc) |
619 | { | 611 | { |
@@ -883,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type) | |||
883 | */ | 875 | */ |
884 | static int EISA_ELCR(unsigned int irq) | 876 | static int EISA_ELCR(unsigned int irq) |
885 | { | 877 | { |
886 | if (irq < NR_IRQS_LEGACY) { | 878 | if (irq < nr_legacy_irqs) { |
887 | unsigned int port = 0x4d0 + (irq >> 3); | 879 | unsigned int port = 0x4d0 + (irq >> 3); |
888 | return (inb(port) >> (irq & 7)) & 1; | 880 | return (inb(port) >> (irq & 7)) & 1; |
889 | } | 881 | } |
@@ -1480,7 +1472,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq | |||
1480 | } | 1472 | } |
1481 | 1473 | ||
1482 | ioapic_register_intr(irq, desc, trigger); | 1474 | ioapic_register_intr(irq, desc, trigger); |
1483 | if (irq < NR_IRQS_LEGACY) | 1475 | if (irq < nr_legacy_irqs) |
1484 | disable_8259A_irq(irq); | 1476 | disable_8259A_irq(irq); |
1485 | 1477 | ||
1486 | ioapic_write_entry(apic_id, pin, entry); | 1478 | ioapic_write_entry(apic_id, pin, entry); |
@@ -1702,12 +1694,8 @@ __apicdebuginit(void) print_IO_APIC(void) | |||
1702 | if (!entry) | 1694 | if (!entry) |
1703 | continue; | 1695 | continue; |
1704 | printk(KERN_DEBUG "IRQ%d ", irq); | 1696 | printk(KERN_DEBUG "IRQ%d ", irq); |
1705 | for (;;) { | 1697 | for_each_irq_pin(entry, cfg->irq_2_pin) |
1706 | printk("-> %d:%d", entry->apic, entry->pin); | 1698 | printk("-> %d:%d", entry->apic, entry->pin); |
1707 | if (!entry->next) | ||
1708 | break; | ||
1709 | entry = entry->next; | ||
1710 | } | ||
1711 | printk("\n"); | 1699 | printk("\n"); |
1712 | } | 1700 | } |
1713 | 1701 | ||
@@ -1851,7 +1839,7 @@ __apicdebuginit(void) print_PIC(void) | |||
1851 | unsigned int v; | 1839 | unsigned int v; |
1852 | unsigned long flags; | 1840 | unsigned long flags; |
1853 | 1841 | ||
1854 | if (apic_verbosity == APIC_QUIET) | 1842 | if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) |
1855 | return; | 1843 | return; |
1856 | 1844 | ||
1857 | printk(KERN_DEBUG "\nprinting PIC contents\n"); | 1845 | printk(KERN_DEBUG "\nprinting PIC contents\n"); |
@@ -1883,7 +1871,7 @@ __apicdebuginit(int) print_all_ICs(void) | |||
1883 | print_PIC(); | 1871 | print_PIC(); |
1884 | 1872 | ||
1885 | /* don't print out if apic is not there */ | 1873 | /* don't print out if apic is not there */ |
1886 | if (!cpu_has_apic || disable_apic) | 1874 | if (!cpu_has_apic && !apic_from_smp_config()) |
1887 | return 0; | 1875 | return 0; |
1888 | 1876 | ||
1889 | print_all_local_APICs(); | 1877 | print_all_local_APICs(); |
@@ -1914,6 +1902,10 @@ void __init enable_IO_APIC(void) | |||
1914 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1902 | spin_unlock_irqrestore(&ioapic_lock, flags); |
1915 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | 1903 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; |
1916 | } | 1904 | } |
1905 | |||
1906 | if (!nr_legacy_irqs) | ||
1907 | return; | ||
1908 | |||
1917 | for(apic = 0; apic < nr_ioapics; apic++) { | 1909 | for(apic = 0; apic < nr_ioapics; apic++) { |
1918 | int pin; | 1910 | int pin; |
1919 | /* See if any of the pins is in ExtINT mode */ | 1911 | /* See if any of the pins is in ExtINT mode */ |
@@ -1968,6 +1960,9 @@ void disable_IO_APIC(void) | |||
1968 | */ | 1960 | */ |
1969 | clear_IO_APIC(); | 1961 | clear_IO_APIC(); |
1970 | 1962 | ||
1963 | if (!nr_legacy_irqs) | ||
1964 | return; | ||
1965 | |||
1971 | /* | 1966 | /* |
1972 | * If the i8259 is routed through an IOAPIC | 1967 | * If the i8259 is routed through an IOAPIC |
1973 | * Put that IOAPIC in virtual wire mode | 1968 | * Put that IOAPIC in virtual wire mode |
@@ -2001,7 +1996,7 @@ void disable_IO_APIC(void) | |||
2001 | /* | 1996 | /* |
2002 | * Use virtual wire A mode when interrupt remapping is enabled. | 1997 | * Use virtual wire A mode when interrupt remapping is enabled. |
2003 | */ | 1998 | */ |
2004 | if (cpu_has_apic) | 1999 | if (cpu_has_apic || apic_from_smp_config()) |
2005 | disconnect_bsp_APIC(!intr_remapping_enabled && | 2000 | disconnect_bsp_APIC(!intr_remapping_enabled && |
2006 | ioapic_i8259.pin != -1); | 2001 | ioapic_i8259.pin != -1); |
2007 | } | 2002 | } |
@@ -2014,7 +2009,7 @@ void disable_IO_APIC(void) | |||
2014 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | 2009 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 |
2015 | */ | 2010 | */ |
2016 | 2011 | ||
2017 | static void __init setup_ioapic_ids_from_mpc(void) | 2012 | void __init setup_ioapic_ids_from_mpc(void) |
2018 | { | 2013 | { |
2019 | union IO_APIC_reg_00 reg_00; | 2014 | union IO_APIC_reg_00 reg_00; |
2020 | physid_mask_t phys_id_present_map; | 2015 | physid_mask_t phys_id_present_map; |
@@ -2023,9 +2018,8 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
2023 | unsigned char old_id; | 2018 | unsigned char old_id; |
2024 | unsigned long flags; | 2019 | unsigned long flags; |
2025 | 2020 | ||
2026 | if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) | 2021 | if (acpi_ioapic) |
2027 | return; | 2022 | return; |
2028 | |||
2029 | /* | 2023 | /* |
2030 | * Don't check I/O APIC IDs for xAPIC systems. They have | 2024 | * Don't check I/O APIC IDs for xAPIC systems. They have |
2031 | * no meaning without the serial APIC bus. | 2025 | * no meaning without the serial APIC bus. |
@@ -2199,7 +2193,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) | |||
2199 | struct irq_cfg *cfg; | 2193 | struct irq_cfg *cfg; |
2200 | 2194 | ||
2201 | spin_lock_irqsave(&ioapic_lock, flags); | 2195 | spin_lock_irqsave(&ioapic_lock, flags); |
2202 | if (irq < NR_IRQS_LEGACY) { | 2196 | if (irq < nr_legacy_irqs) { |
2203 | disable_8259A_irq(irq); | 2197 | disable_8259A_irq(irq); |
2204 | if (i8259A_irq_pending(irq)) | 2198 | if (i8259A_irq_pending(irq)) |
2205 | was_pending = 1; | 2199 | was_pending = 1; |
@@ -2211,7 +2205,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) | |||
2211 | return was_pending; | 2205 | return was_pending; |
2212 | } | 2206 | } |
2213 | 2207 | ||
2214 | #ifdef CONFIG_X86_64 | ||
2215 | static int ioapic_retrigger_irq(unsigned int irq) | 2208 | static int ioapic_retrigger_irq(unsigned int irq) |
2216 | { | 2209 | { |
2217 | 2210 | ||
@@ -2224,14 +2217,6 @@ static int ioapic_retrigger_irq(unsigned int irq) | |||
2224 | 2217 | ||
2225 | return 1; | 2218 | return 1; |
2226 | } | 2219 | } |
2227 | #else | ||
2228 | static int ioapic_retrigger_irq(unsigned int irq) | ||
2229 | { | ||
2230 | apic->send_IPI_self(irq_cfg(irq)->vector); | ||
2231 | |||
2232 | return 1; | ||
2233 | } | ||
2234 | #endif | ||
2235 | 2220 | ||
2236 | /* | 2221 | /* |
2237 | * Level and edge triggered IO-APIC interrupts need different handling, | 2222 | * Level and edge triggered IO-APIC interrupts need different handling, |
@@ -2269,13 +2254,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq | |||
2269 | struct irq_pin_list *entry; | 2254 | struct irq_pin_list *entry; |
2270 | u8 vector = cfg->vector; | 2255 | u8 vector = cfg->vector; |
2271 | 2256 | ||
2272 | entry = cfg->irq_2_pin; | 2257 | for_each_irq_pin(entry, cfg->irq_2_pin) { |
2273 | for (;;) { | ||
2274 | unsigned int reg; | 2258 | unsigned int reg; |
2275 | 2259 | ||
2276 | if (!entry) | ||
2277 | break; | ||
2278 | |||
2279 | apic = entry->apic; | 2260 | apic = entry->apic; |
2280 | pin = entry->pin; | 2261 | pin = entry->pin; |
2281 | /* | 2262 | /* |
@@ -2288,9 +2269,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq | |||
2288 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; | 2269 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; |
2289 | reg |= vector; | 2270 | reg |= vector; |
2290 | io_apic_modify(apic, 0x10 + pin*2, reg); | 2271 | io_apic_modify(apic, 0x10 + pin*2, reg); |
2291 | if (!entry->next) | ||
2292 | break; | ||
2293 | entry = entry->next; | ||
2294 | } | 2272 | } |
2295 | } | 2273 | } |
2296 | 2274 | ||
@@ -2515,11 +2493,8 @@ atomic_t irq_mis_count; | |||
2515 | static void ack_apic_level(unsigned int irq) | 2493 | static void ack_apic_level(unsigned int irq) |
2516 | { | 2494 | { |
2517 | struct irq_desc *desc = irq_to_desc(irq); | 2495 | struct irq_desc *desc = irq_to_desc(irq); |
2518 | |||
2519 | #ifdef CONFIG_X86_32 | ||
2520 | unsigned long v; | 2496 | unsigned long v; |
2521 | int i; | 2497 | int i; |
2522 | #endif | ||
2523 | struct irq_cfg *cfg; | 2498 | struct irq_cfg *cfg; |
2524 | int do_unmask_irq = 0; | 2499 | int do_unmask_irq = 0; |
2525 | 2500 | ||
@@ -2532,31 +2507,28 @@ static void ack_apic_level(unsigned int irq) | |||
2532 | } | 2507 | } |
2533 | #endif | 2508 | #endif |
2534 | 2509 | ||
2535 | #ifdef CONFIG_X86_32 | ||
2536 | /* | 2510 | /* |
2537 | * It appears there is an erratum which affects at least version 0x11 | 2511 | * It appears there is an erratum which affects at least version 0x11 |
2538 | * of I/O APIC (that's the 82093AA and cores integrated into various | 2512 | * of I/O APIC (that's the 82093AA and cores integrated into various |
2539 | * chipsets). Under certain conditions a level-triggered interrupt is | 2513 | * chipsets). Under certain conditions a level-triggered interrupt is |
2540 | * erroneously delivered as edge-triggered one but the respective IRR | 2514 | * erroneously delivered as edge-triggered one but the respective IRR |
2541 | * bit gets set nevertheless. As a result the I/O unit expects an EOI | 2515 | * bit gets set nevertheless. As a result the I/O unit expects an EOI |
2542 | * message but it will never arrive and further interrupts are blocked | 2516 | * message but it will never arrive and further interrupts are blocked |
2543 | * from the source. The exact reason is so far unknown, but the | 2517 | * from the source. The exact reason is so far unknown, but the |
2544 | * phenomenon was observed when two consecutive interrupt requests | 2518 | * phenomenon was observed when two consecutive interrupt requests |
2545 | * from a given source get delivered to the same CPU and the source is | 2519 | * from a given source get delivered to the same CPU and the source is |
2546 | * temporarily disabled in between. | 2520 | * temporarily disabled in between. |
2547 | * | 2521 | * |
2548 | * A workaround is to simulate an EOI message manually. We achieve it | 2522 | * A workaround is to simulate an EOI message manually. We achieve it |
2549 | * by setting the trigger mode to edge and then to level when the edge | 2523 | * by setting the trigger mode to edge and then to level when the edge |
2550 | * trigger mode gets detected in the TMR of a local APIC for a | 2524 | * trigger mode gets detected in the TMR of a local APIC for a |
2551 | * level-triggered interrupt. We mask the source for the time of the | 2525 | * level-triggered interrupt. We mask the source for the time of the |
2552 | * operation to prevent an edge-triggered interrupt escaping meanwhile. | 2526 | * operation to prevent an edge-triggered interrupt escaping meanwhile. |
2553 | * The idea is from Manfred Spraul. --macro | 2527 | * The idea is from Manfred Spraul. --macro |
2554 | */ | 2528 | */ |
2555 | cfg = desc->chip_data; | 2529 | cfg = desc->chip_data; |
2556 | i = cfg->vector; | 2530 | i = cfg->vector; |
2557 | |||
2558 | v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); | 2531 | v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); |
2559 | #endif | ||
2560 | 2532 | ||
2561 | /* | 2533 | /* |
2562 | * We must acknowledge the irq before we move it or the acknowledge will | 2534 | * We must acknowledge the irq before we move it or the acknowledge will |
@@ -2598,7 +2570,7 @@ static void ack_apic_level(unsigned int irq) | |||
2598 | unmask_IO_APIC_irq_desc(desc); | 2570 | unmask_IO_APIC_irq_desc(desc); |
2599 | } | 2571 | } |
2600 | 2572 | ||
2601 | #ifdef CONFIG_X86_32 | 2573 | /* Tail end of version 0x11 I/O APIC bug workaround */ |
2602 | if (!(v & (1 << (i & 0x1f)))) { | 2574 | if (!(v & (1 << (i & 0x1f)))) { |
2603 | atomic_inc(&irq_mis_count); | 2575 | atomic_inc(&irq_mis_count); |
2604 | spin_lock(&ioapic_lock); | 2576 | spin_lock(&ioapic_lock); |
@@ -2606,26 +2578,15 @@ static void ack_apic_level(unsigned int irq) | |||
2606 | __unmask_and_level_IO_APIC_irq(cfg); | 2578 | __unmask_and_level_IO_APIC_irq(cfg); |
2607 | spin_unlock(&ioapic_lock); | 2579 | spin_unlock(&ioapic_lock); |
2608 | } | 2580 | } |
2609 | #endif | ||
2610 | } | 2581 | } |
2611 | 2582 | ||
2612 | #ifdef CONFIG_INTR_REMAP | 2583 | #ifdef CONFIG_INTR_REMAP |
2613 | static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) | 2584 | static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) |
2614 | { | 2585 | { |
2615 | int apic, pin; | ||
2616 | struct irq_pin_list *entry; | 2586 | struct irq_pin_list *entry; |
2617 | 2587 | ||
2618 | entry = cfg->irq_2_pin; | 2588 | for_each_irq_pin(entry, cfg->irq_2_pin) |
2619 | for (;;) { | 2589 | io_apic_eoi(entry->apic, entry->pin); |
2620 | |||
2621 | if (!entry) | ||
2622 | break; | ||
2623 | |||
2624 | apic = entry->apic; | ||
2625 | pin = entry->pin; | ||
2626 | io_apic_eoi(apic, pin); | ||
2627 | entry = entry->next; | ||
2628 | } | ||
2629 | } | 2590 | } |
2630 | 2591 | ||
2631 | static void | 2592 | static void |
@@ -2710,7 +2671,7 @@ static inline void init_IO_APIC_traps(void) | |||
2710 | * so default to an old-fashioned 8259 | 2671 | * so default to an old-fashioned 8259 |
2711 | * interrupt if we can.. | 2672 | * interrupt if we can.. |
2712 | */ | 2673 | */ |
2713 | if (irq < NR_IRQS_LEGACY) | 2674 | if (irq < nr_legacy_irqs) |
2714 | make_8259A_irq(irq); | 2675 | make_8259A_irq(irq); |
2715 | else | 2676 | else |
2716 | /* Strange. Oh, well.. */ | 2677 | /* Strange. Oh, well.. */ |
@@ -3046,7 +3007,7 @@ out: | |||
3046 | * the I/O APIC in all cases now. No actual device should request | 3007 | * the I/O APIC in all cases now. No actual device should request |
3047 | * it anyway. --macro | 3008 | * it anyway. --macro |
3048 | */ | 3009 | */ |
3049 | #define PIC_IRQS (1 << PIC_CASCADE_IR) | 3010 | #define PIC_IRQS (1UL << PIC_CASCADE_IR) |
3050 | 3011 | ||
3051 | void __init setup_IO_APIC(void) | 3012 | void __init setup_IO_APIC(void) |
3052 | { | 3013 | { |
@@ -3054,21 +3015,19 @@ void __init setup_IO_APIC(void) | |||
3054 | /* | 3015 | /* |
3055 | * calling enable_IO_APIC() is moved to setup_local_APIC for BP | 3016 | * calling enable_IO_APIC() is moved to setup_local_APIC for BP |
3056 | */ | 3017 | */ |
3057 | 3018 | io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; | |
3058 | io_apic_irqs = ~PIC_IRQS; | ||
3059 | 3019 | ||
3060 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | 3020 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); |
3061 | /* | 3021 | /* |
3062 | * Set up IO-APIC IRQ routing. | 3022 | * Set up IO-APIC IRQ routing. |
3063 | */ | 3023 | */ |
3064 | #ifdef CONFIG_X86_32 | 3024 | x86_init.mpparse.setup_ioapic_ids(); |
3065 | if (!acpi_ioapic) | 3025 | |
3066 | setup_ioapic_ids_from_mpc(); | ||
3067 | #endif | ||
3068 | sync_Arb_IDs(); | 3026 | sync_Arb_IDs(); |
3069 | setup_IO_APIC_irqs(); | 3027 | setup_IO_APIC_irqs(); |
3070 | init_IO_APIC_traps(); | 3028 | init_IO_APIC_traps(); |
3071 | check_timer(); | 3029 | if (nr_legacy_irqs) |
3030 | check_timer(); | ||
3072 | } | 3031 | } |
3073 | 3032 | ||
3074 | /* | 3033 | /* |
@@ -3169,7 +3128,6 @@ static int __init ioapic_init_sysfs(void) | |||
3169 | 3128 | ||
3170 | device_initcall(ioapic_init_sysfs); | 3129 | device_initcall(ioapic_init_sysfs); |
3171 | 3130 | ||
3172 | static int nr_irqs_gsi = NR_IRQS_LEGACY; | ||
3173 | /* | 3131 | /* |
3174 | * Dynamic irq allocate and deallocation | 3132 | * Dynamic irq allocate and deallocation |
3175 | */ | 3133 | */ |
@@ -3241,8 +3199,7 @@ void destroy_irq(unsigned int irq) | |||
3241 | cfg = desc->chip_data; | 3199 | cfg = desc->chip_data; |
3242 | dynamic_irq_cleanup(irq); | 3200 | dynamic_irq_cleanup(irq); |
3243 | /* connect back irq_cfg */ | 3201 | /* connect back irq_cfg */ |
3244 | if (desc) | 3202 | desc->chip_data = cfg; |
3245 | desc->chip_data = cfg; | ||
3246 | 3203 | ||
3247 | free_irte(irq); | 3204 | free_irte(irq); |
3248 | spin_lock_irqsave(&vector_lock, flags); | 3205 | spin_lock_irqsave(&vector_lock, flags); |
@@ -3910,9 +3867,13 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, | |||
3910 | /* | 3867 | /* |
3911 | * IRQs < 16 are already in the irq_2_pin[] map | 3868 | * IRQs < 16 are already in the irq_2_pin[] map |
3912 | */ | 3869 | */ |
3913 | if (irq >= NR_IRQS_LEGACY) { | 3870 | if (irq >= nr_legacy_irqs) { |
3914 | cfg = desc->chip_data; | 3871 | cfg = desc->chip_data; |
3915 | add_pin_to_irq_node(cfg, node, ioapic, pin); | 3872 | if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { |
3873 | printk(KERN_INFO "can not add pin %d for irq %d\n", | ||
3874 | pin, irq); | ||
3875 | return 0; | ||
3876 | } | ||
3916 | } | 3877 | } |
3917 | 3878 | ||
3918 | setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); | 3879 | setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); |
@@ -3941,11 +3902,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq, | |||
3941 | return __io_apic_set_pci_routing(dev, irq, irq_attr); | 3902 | return __io_apic_set_pci_routing(dev, irq, irq_attr); |
3942 | } | 3903 | } |
3943 | 3904 | ||
3944 | /* -------------------------------------------------------------------------- | 3905 | u8 __init io_apic_unique_id(u8 id) |
3945 | ACPI-based IOAPIC Configuration | 3906 | { |
3946 | -------------------------------------------------------------------------- */ | 3907 | #ifdef CONFIG_X86_32 |
3908 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | ||
3909 | !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
3910 | return io_apic_get_unique_id(nr_ioapics, id); | ||
3911 | else | ||
3912 | return id; | ||
3913 | #else | ||
3914 | int i; | ||
3915 | DECLARE_BITMAP(used, 256); | ||
3947 | 3916 | ||
3948 | #ifdef CONFIG_ACPI | 3917 | bitmap_zero(used, 256); |
3918 | for (i = 0; i < nr_ioapics; i++) { | ||
3919 | struct mpc_ioapic *ia = &mp_ioapics[i]; | ||
3920 | __set_bit(ia->apicid, used); | ||
3921 | } | ||
3922 | if (!test_bit(id, used)) | ||
3923 | return id; | ||
3924 | return find_first_zero_bit(used, 256); | ||
3925 | #endif | ||
3926 | } | ||
3949 | 3927 | ||
3950 | #ifdef CONFIG_X86_32 | 3928 | #ifdef CONFIG_X86_32 |
3951 | int __init io_apic_get_unique_id(int ioapic, int apic_id) | 3929 | int __init io_apic_get_unique_id(int ioapic, int apic_id) |
@@ -4054,8 +4032,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | |||
4054 | return 0; | 4032 | return 0; |
4055 | } | 4033 | } |
4056 | 4034 | ||
4057 | #endif /* CONFIG_ACPI */ | ||
4058 | |||
4059 | /* | 4035 | /* |
4060 | * This function currently is only a helper for the i386 smp boot process where | 4036 | * This function currently is only a helper for the i386 smp boot process where |
4061 | * we need to reprogram the ioredtbls to cater for the cpus which have come online | 4037 | * we need to reprogram the ioredtbls to cater for the cpus which have come online |
@@ -4109,7 +4085,7 @@ void __init setup_ioapic_dest(void) | |||
4109 | 4085 | ||
4110 | static struct resource *ioapic_resources; | 4086 | static struct resource *ioapic_resources; |
4111 | 4087 | ||
4112 | static struct resource * __init ioapic_setup_resources(void) | 4088 | static struct resource * __init ioapic_setup_resources(int nr_ioapics) |
4113 | { | 4089 | { |
4114 | unsigned long n; | 4090 | unsigned long n; |
4115 | struct resource *res; | 4091 | struct resource *res; |
@@ -4125,15 +4101,13 @@ static struct resource * __init ioapic_setup_resources(void) | |||
4125 | mem = alloc_bootmem(n); | 4101 | mem = alloc_bootmem(n); |
4126 | res = (void *)mem; | 4102 | res = (void *)mem; |
4127 | 4103 | ||
4128 | if (mem != NULL) { | 4104 | mem += sizeof(struct resource) * nr_ioapics; |
4129 | mem += sizeof(struct resource) * nr_ioapics; | ||
4130 | 4105 | ||
4131 | for (i = 0; i < nr_ioapics; i++) { | 4106 | for (i = 0; i < nr_ioapics; i++) { |
4132 | res[i].name = mem; | 4107 | res[i].name = mem; |
4133 | res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 4108 | res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
4134 | sprintf(mem, "IOAPIC %u", i); | 4109 | sprintf(mem, "IOAPIC %u", i); |
4135 | mem += IOAPIC_RESOURCE_NAME_SIZE; | 4110 | mem += IOAPIC_RESOURCE_NAME_SIZE; |
4136 | } | ||
4137 | } | 4111 | } |
4138 | 4112 | ||
4139 | ioapic_resources = res; | 4113 | ioapic_resources = res; |
@@ -4147,7 +4121,7 @@ void __init ioapic_init_mappings(void) | |||
4147 | struct resource *ioapic_res; | 4121 | struct resource *ioapic_res; |
4148 | int i; | 4122 | int i; |
4149 | 4123 | ||
4150 | ioapic_res = ioapic_setup_resources(); | 4124 | ioapic_res = ioapic_setup_resources(nr_ioapics); |
4151 | for (i = 0; i < nr_ioapics; i++) { | 4125 | for (i = 0; i < nr_ioapics; i++) { |
4152 | if (smp_found_config) { | 4126 | if (smp_found_config) { |
4153 | ioapic_phys = mp_ioapics[i].apicaddr; | 4127 | ioapic_phys = mp_ioapics[i].apicaddr; |
@@ -4176,11 +4150,9 @@ fake_ioapic_page: | |||
4176 | __fix_to_virt(idx), ioapic_phys); | 4150 | __fix_to_virt(idx), ioapic_phys); |
4177 | idx++; | 4151 | idx++; |
4178 | 4152 | ||
4179 | if (ioapic_res != NULL) { | 4153 | ioapic_res->start = ioapic_phys; |
4180 | ioapic_res->start = ioapic_phys; | 4154 | ioapic_res->end = ioapic_phys + (4 * 1024) - 1; |
4181 | ioapic_res->end = ioapic_phys + (4 * 1024) - 1; | 4155 | ioapic_res++; |
4182 | ioapic_res++; | ||
4183 | } | ||
4184 | } | 4156 | } |
4185 | } | 4157 | } |
4186 | 4158 | ||
@@ -4201,3 +4173,76 @@ void __init ioapic_insert_resources(void) | |||
4201 | r++; | 4173 | r++; |
4202 | } | 4174 | } |
4203 | } | 4175 | } |
4176 | |||
4177 | int mp_find_ioapic(int gsi) | ||
4178 | { | ||
4179 | int i = 0; | ||
4180 | |||
4181 | /* Find the IOAPIC that manages this GSI. */ | ||
4182 | for (i = 0; i < nr_ioapics; i++) { | ||
4183 | if ((gsi >= mp_gsi_routing[i].gsi_base) | ||
4184 | && (gsi <= mp_gsi_routing[i].gsi_end)) | ||
4185 | return i; | ||
4186 | } | ||
4187 | |||
4188 | printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); | ||
4189 | return -1; | ||
4190 | } | ||
4191 | |||
4192 | int mp_find_ioapic_pin(int ioapic, int gsi) | ||
4193 | { | ||
4194 | if (WARN_ON(ioapic == -1)) | ||
4195 | return -1; | ||
4196 | if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) | ||
4197 | return -1; | ||
4198 | |||
4199 | return gsi - mp_gsi_routing[ioapic].gsi_base; | ||
4200 | } | ||
4201 | |||
4202 | static int bad_ioapic(unsigned long address) | ||
4203 | { | ||
4204 | if (nr_ioapics >= MAX_IO_APICS) { | ||
4205 | printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " | ||
4206 | "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); | ||
4207 | return 1; | ||
4208 | } | ||
4209 | if (!address) { | ||
4210 | printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" | ||
4211 | " found in table, skipping!\n"); | ||
4212 | return 1; | ||
4213 | } | ||
4214 | return 0; | ||
4215 | } | ||
4216 | |||
4217 | void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) | ||
4218 | { | ||
4219 | int idx = 0; | ||
4220 | |||
4221 | if (bad_ioapic(address)) | ||
4222 | return; | ||
4223 | |||
4224 | idx = nr_ioapics; | ||
4225 | |||
4226 | mp_ioapics[idx].type = MP_IOAPIC; | ||
4227 | mp_ioapics[idx].flags = MPC_APIC_USABLE; | ||
4228 | mp_ioapics[idx].apicaddr = address; | ||
4229 | |||
4230 | set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); | ||
4231 | mp_ioapics[idx].apicid = io_apic_unique_id(id); | ||
4232 | mp_ioapics[idx].apicver = io_apic_get_version(idx); | ||
4233 | |||
4234 | /* | ||
4235 | * Build basic GSI lookup table to facilitate gsi->io_apic lookups | ||
4236 | * and to prevent reprogramming of IOAPIC pins (PCI GSIs). | ||
4237 | */ | ||
4238 | mp_gsi_routing[idx].gsi_base = gsi_base; | ||
4239 | mp_gsi_routing[idx].gsi_end = gsi_base + | ||
4240 | io_apic_get_redir_entries(idx); | ||
4241 | |||
4242 | printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " | ||
4243 | "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, | ||
4244 | mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, | ||
4245 | mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); | ||
4246 | |||
4247 | nr_ioapics++; | ||
4248 | } | ||
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a..08385e090a6 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c | |||
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) | |||
106 | unsigned long mask = cpumask_bits(cpumask)[0]; | 106 | unsigned long mask = cpumask_bits(cpumask)[0]; |
107 | unsigned long flags; | 107 | unsigned long flags; |
108 | 108 | ||
109 | if (WARN_ONCE(!mask, "empty IPI mask")) | ||
110 | return; | ||
111 | |||
109 | local_irq_save(flags); | 112 | local_irq_save(flags); |
110 | WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); | 113 | WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); |
111 | __default_send_IPI_dest_field(mask, vector, apic->dest_logical); | 114 | __default_send_IPI_dest_field(mask, vector, apic->dest_logical); |
@@ -150,7 +153,7 @@ int safe_smp_processor_id(void) | |||
150 | { | 153 | { |
151 | int apicid, cpuid; | 154 | int apicid, cpuid; |
152 | 155 | ||
153 | if (!boot_cpu_has(X86_FEATURE_APIC)) | 156 | if (!cpu_has_apic) |
154 | return 0; | 157 | return 0; |
155 | 158 | ||
156 | apicid = hard_smp_processor_id(); | 159 | apicid = hard_smp_processor_id(); |
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..7ff61d6a188 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c | |||
@@ -39,7 +39,7 @@ | |||
39 | int unknown_nmi_panic; | 39 | int unknown_nmi_panic; |
40 | int nmi_watchdog_enabled; | 40 | int nmi_watchdog_enabled; |
41 | 41 | ||
42 | static cpumask_var_t backtrace_mask; | 42 | static cpumask_t backtrace_mask __read_mostly; |
43 | 43 | ||
44 | /* nmi_active: | 44 | /* nmi_active: |
45 | * >0: the lapic NMI watchdog is active, but can be disabled | 45 | * >0: the lapic NMI watchdog is active, but can be disabled |
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) | |||
66 | 66 | ||
67 | static inline int mce_in_progress(void) | 67 | static inline int mce_in_progress(void) |
68 | { | 68 | { |
69 | #if defined(CONFIG_X86_NEW_MCE) | 69 | #if defined(CONFIG_X86_MCE) |
70 | return atomic_read(&mce_entry) > 0; | 70 | return atomic_read(&mce_entry) > 0; |
71 | #endif | 71 | #endif |
72 | return 0; | 72 | return 0; |
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) | |||
138 | if (!prev_nmi_count) | 138 | if (!prev_nmi_count) |
139 | goto error; | 139 | goto error; |
140 | 140 | ||
141 | alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); | ||
142 | printk(KERN_INFO "Testing NMI watchdog ... "); | 141 | printk(KERN_INFO "Testing NMI watchdog ... "); |
143 | 142 | ||
144 | #ifdef CONFIG_SMP | 143 | #ifdef CONFIG_SMP |
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) | |||
415 | } | 414 | } |
416 | 415 | ||
417 | /* We can be called before check_nmi_watchdog, hence NULL check. */ | 416 | /* We can be called before check_nmi_watchdog, hence NULL check. */ |
418 | if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { | 417 | if (cpumask_test_cpu(cpu, &backtrace_mask)) { |
419 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ | 418 | static DEFINE_SPINLOCK(lock); /* Serialise the printks */ |
420 | 419 | ||
421 | spin_lock(&lock); | 420 | spin_lock(&lock); |
422 | printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); | 421 | printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); |
422 | show_regs(regs); | ||
423 | dump_stack(); | 423 | dump_stack(); |
424 | spin_unlock(&lock); | 424 | spin_unlock(&lock); |
425 | cpumask_clear_cpu(cpu, backtrace_mask); | 425 | cpumask_clear_cpu(cpu, &backtrace_mask); |
426 | |||
427 | rc = 1; | ||
426 | } | 428 | } |
427 | 429 | ||
428 | /* Could check oops_in_progress here too, but it's safer not to */ | 430 | /* Could check oops_in_progress here too, but it's safer not to */ |
@@ -506,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) | |||
506 | /* | 508 | /* |
507 | * proc handler for /proc/sys/kernel/nmi | 509 | * proc handler for /proc/sys/kernel/nmi |
508 | */ | 510 | */ |
509 | int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, | 511 | int proc_nmi_enabled(struct ctl_table *table, int write, |
510 | void __user *buffer, size_t *length, loff_t *ppos) | 512 | void __user *buffer, size_t *length, loff_t *ppos) |
511 | { | 513 | { |
512 | int old_state; | 514 | int old_state; |
513 | 515 | ||
514 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; | 516 | nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; |
515 | old_state = nmi_watchdog_enabled; | 517 | old_state = nmi_watchdog_enabled; |
516 | proc_dointvec(table, write, file, buffer, length, ppos); | 518 | proc_dointvec(table, write, buffer, length, ppos); |
517 | if (!!old_state == !!nmi_watchdog_enabled) | 519 | if (!!old_state == !!nmi_watchdog_enabled) |
518 | return 0; | 520 | return 0; |
519 | 521 | ||
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) | |||
552 | return 0; | 554 | return 0; |
553 | } | 555 | } |
554 | 556 | ||
555 | void __trigger_all_cpu_backtrace(void) | 557 | void arch_trigger_all_cpu_backtrace(void) |
556 | { | 558 | { |
557 | int i; | 559 | int i; |
558 | 560 | ||
559 | cpumask_copy(backtrace_mask, cpu_online_mask); | 561 | cpumask_copy(&backtrace_mask, cpu_online_mask); |
562 | |||
563 | printk(KERN_INFO "sending NMI to all CPUs:\n"); | ||
564 | apic->send_IPI_all(NMI_VECTOR); | ||
565 | |||
560 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ | 566 | /* Wait for up to 10 seconds for all CPUs to do the backtrace */ |
561 | for (i = 0; i < 10 * 1000; i++) { | 567 | for (i = 0; i < 10 * 1000; i++) { |
562 | if (cpumask_empty(backtrace_mask)) | 568 | if (cpumask_empty(&backtrace_mask)) |
563 | break; | 569 | break; |
564 | mdelay(1); | 570 | mdelay(1); |
565 | } | 571 | } |
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..efa00e2b850 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c | |||
@@ -66,7 +66,6 @@ struct mpc_trans { | |||
66 | unsigned short trans_reserved; | 66 | unsigned short trans_reserved; |
67 | }; | 67 | }; |
68 | 68 | ||
69 | /* x86_quirks member */ | ||
70 | static int mpc_record; | 69 | static int mpc_record; |
71 | 70 | ||
72 | static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; | 71 | static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; |
@@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) | |||
130 | } | 129 | } |
131 | } | 130 | } |
132 | 131 | ||
133 | static int __init numaq_pre_time_init(void) | 132 | static void __init numaq_tsc_init(void) |
134 | { | 133 | { |
135 | numaq_tsc_disable(); | 134 | numaq_tsc_disable(); |
136 | return 0; | ||
137 | } | 135 | } |
138 | 136 | ||
139 | static inline int generate_logical_apicid(int quad, int phys_apicid) | 137 | static inline int generate_logical_apicid(int quad, int phys_apicid) |
@@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) | |||
177 | quad_local_to_mp_bus_id[quad][local] = m->busid; | 175 | quad_local_to_mp_bus_id[quad][local] = m->busid; |
178 | } | 176 | } |
179 | 177 | ||
178 | /* | ||
179 | * Called from mpparse code. | ||
180 | * mode = 0: prescan | ||
181 | * mode = 1: one mpc entry scanned | ||
182 | */ | ||
183 | static void numaq_mpc_record(unsigned int mode) | ||
184 | { | ||
185 | if (!mode) | ||
186 | mpc_record = 0; | ||
187 | else | ||
188 | mpc_record++; | ||
189 | } | ||
190 | |||
180 | static void __init MP_translation_info(struct mpc_trans *m) | 191 | static void __init MP_translation_info(struct mpc_trans *m) |
181 | { | 192 | { |
182 | printk(KERN_INFO | 193 | printk(KERN_INFO |
@@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) | |||
206 | /* | 217 | /* |
207 | * Read/parse the MPC oem tables | 218 | * Read/parse the MPC oem tables |
208 | */ | 219 | */ |
209 | static void __init | 220 | static void __init smp_read_mpc_oem(struct mpc_table *mpc) |
210 | smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) | ||
211 | { | 221 | { |
222 | struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; | ||
212 | int count = sizeof(*oemtable); /* the header size */ | 223 | int count = sizeof(*oemtable); /* the header size */ |
213 | unsigned char *oemptr = ((unsigned char *)oemtable) + count; | 224 | unsigned char *oemptr = ((unsigned char *)oemtable) + count; |
214 | 225 | ||
@@ -250,29 +261,6 @@ static void __init | |||
250 | } | 261 | } |
251 | } | 262 | } |
252 | 263 | ||
253 | static int __init numaq_setup_ioapic_ids(void) | ||
254 | { | ||
255 | /* so can skip it */ | ||
256 | return 1; | ||
257 | } | ||
258 | |||
259 | static struct x86_quirks numaq_x86_quirks __initdata = { | ||
260 | .arch_pre_time_init = numaq_pre_time_init, | ||
261 | .arch_time_init = NULL, | ||
262 | .arch_pre_intr_init = NULL, | ||
263 | .arch_memory_setup = NULL, | ||
264 | .arch_intr_init = NULL, | ||
265 | .arch_trap_init = NULL, | ||
266 | .mach_get_smp_config = NULL, | ||
267 | .mach_find_smp_config = NULL, | ||
268 | .mpc_record = &mpc_record, | ||
269 | .mpc_apic_id = mpc_apic_id, | ||
270 | .mpc_oem_bus_info = mpc_oem_bus_info, | ||
271 | .mpc_oem_pci_bus = mpc_oem_pci_bus, | ||
272 | .smp_read_mpc_oem = smp_read_mpc_oem, | ||
273 | .setup_ioapic_ids = numaq_setup_ioapic_ids, | ||
274 | }; | ||
275 | |||
276 | static __init void early_check_numaq(void) | 264 | static __init void early_check_numaq(void) |
277 | { | 265 | { |
278 | /* | 266 | /* |
@@ -286,8 +274,15 @@ static __init void early_check_numaq(void) | |||
286 | if (smp_found_config) | 274 | if (smp_found_config) |
287 | early_get_smp_config(); | 275 | early_get_smp_config(); |
288 | 276 | ||
289 | if (found_numaq) | 277 | if (found_numaq) { |
290 | x86_quirks = &numaq_x86_quirks; | 278 | x86_init.mpparse.mpc_record = numaq_mpc_record; |
279 | x86_init.mpparse.setup_ioapic_ids = x86_init_noop; | ||
280 | x86_init.mpparse.mpc_apic_id = mpc_apic_id; | ||
281 | x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; | ||
282 | x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; | ||
283 | x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; | ||
284 | x86_init.timers.tsc_pre_init = numaq_tsc_init; | ||
285 | } | ||
291 | } | 286 | } |
292 | 287 | ||
293 | int __init get_memcfg_numaq(void) | 288 | int __init get_memcfg_numaq(void) |
@@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) | |||
418 | /* Where the IO area was mapped on multiquad, always 0 otherwise */ | 413 | /* Where the IO area was mapped on multiquad, always 0 otherwise */ |
419 | void *xquad_portio; | 414 | void *xquad_portio; |
420 | 415 | ||
421 | static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) | 416 | static inline int numaq_check_phys_apicid_present(int phys_apicid) |
422 | { | 417 | { |
423 | return 1; | 418 | return 1; |
424 | } | 419 | } |
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b8..c4cbd3080c1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c | |||
@@ -44,29 +44,46 @@ static struct apic *apic_probe[] __initdata = { | |||
44 | NULL, | 44 | NULL, |
45 | }; | 45 | }; |
46 | 46 | ||
47 | static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) | ||
48 | { | ||
49 | return hard_smp_processor_id() >> index_msb; | ||
50 | } | ||
51 | |||
47 | /* | 52 | /* |
48 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | 53 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. |
49 | */ | 54 | */ |
50 | void __init default_setup_apic_routing(void) | 55 | void __init default_setup_apic_routing(void) |
51 | { | 56 | { |
52 | #ifdef CONFIG_X86_X2APIC | 57 | #ifdef CONFIG_X86_X2APIC |
53 | if (x2apic_mode && (apic != &apic_x2apic_phys && | 58 | if (x2apic_mode |
54 | #ifdef CONFIG_X86_UV | 59 | #ifdef CONFIG_X86_UV |
55 | apic != &apic_x2apic_uv_x && | 60 | && apic != &apic_x2apic_uv_x |
56 | #endif | 61 | #endif |
57 | apic != &apic_x2apic_cluster)) { | 62 | ) { |
58 | if (x2apic_phys) | 63 | if (x2apic_phys) |
59 | apic = &apic_x2apic_phys; | 64 | apic = &apic_x2apic_phys; |
60 | else | 65 | else |
61 | apic = &apic_x2apic_cluster; | 66 | apic = &apic_x2apic_cluster; |
62 | printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); | ||
63 | } | 67 | } |
64 | #endif | 68 | #endif |
65 | 69 | ||
66 | if (apic == &apic_flat) { | 70 | if (apic == &apic_flat) { |
67 | if (max_physical_apicid >= 8) | 71 | switch (boot_cpu_data.x86_vendor) { |
68 | apic = &apic_physflat; | 72 | case X86_VENDOR_INTEL: |
69 | printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); | 73 | if (num_processors > 8) |
74 | apic = &apic_physflat; | ||
75 | break; | ||
76 | case X86_VENDOR_AMD: | ||
77 | if (max_physical_apicid >= 8) | ||
78 | apic = &apic_physflat; | ||
79 | } | ||
80 | } | ||
81 | |||
82 | printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); | ||
83 | |||
84 | if (is_vsmp_box()) { | ||
85 | /* need to update phys_pkg_id */ | ||
86 | apic->phys_pkg_id = apicid_phys_pkg_id; | ||
70 | } | 87 | } |
71 | 88 | ||
72 | /* | 89 | /* |
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..645ecc4ff0b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c | |||
@@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid) | |||
272 | return physid_mask_of_physid(0); | 272 | return physid_mask_of_physid(0); |
273 | } | 273 | } |
274 | 274 | ||
275 | static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) | 275 | static int summit_check_phys_apicid_present(int physical_apicid) |
276 | { | 276 | { |
277 | return 1; | 277 | return 1; |
278 | } | 278 | } |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 832e908adcb..f5f5886a6b5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -46,7 +46,7 @@ static int early_get_nodeid(void) | |||
46 | return node_id.s.node_id; | 46 | return node_id.s.node_id; |
47 | } | 47 | } |
48 | 48 | ||
49 | static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 49 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
50 | { | 50 | { |
51 | if (!strcmp(oem_id, "SGI")) { | 51 | if (!strcmp(oem_id, "SGI")) { |
52 | if (!strcmp(oem_table_id, "UVL")) | 52 | if (!strcmp(oem_table_id, "UVL")) |
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector) | |||
253 | apic_write(APIC_SELF_IPI, vector); | 253 | apic_write(APIC_SELF_IPI, vector); |
254 | } | 254 | } |
255 | 255 | ||
256 | struct apic apic_x2apic_uv_x = { | 256 | struct apic __refdata apic_x2apic_uv_x = { |
257 | 257 | ||
258 | .name = "UV large system", | 258 | .name = "UV large system", |
259 | .probe = NULL, | 259 | .probe = NULL, |
@@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode) | |||
389 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); | 389 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); |
390 | } | 390 | } |
391 | 391 | ||
392 | static __init void map_mmr_high(int max_pnode) | ||
393 | { | ||
394 | union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; | ||
395 | int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; | ||
396 | |||
397 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); | ||
398 | if (mmr.s.enable) | ||
399 | map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); | ||
400 | } | ||
401 | |||
392 | static __init void map_mmioh_high(int max_pnode) | 402 | static __init void map_mmioh_high(int max_pnode) |
393 | { | 403 | { |
394 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; | 404 | union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; |
@@ -643,6 +653,7 @@ void __init uv_system_init(void) | |||
643 | } | 653 | } |
644 | 654 | ||
645 | map_gru_high(max_pnode); | 655 | map_gru_high(max_pnode); |
656 | map_mmr_high(max_pnode); | ||
646 | map_mmioh_high(max_pnode); | 657 | map_mmioh_high(max_pnode); |
647 | 658 | ||
648 | uv_cpu_init(); | 659 | uv_cpu_init(); |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 442b5508893..151ace69a5a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); | |||
403 | static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); | 403 | static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); |
404 | static struct apm_user *user_list; | 404 | static struct apm_user *user_list; |
405 | static DEFINE_SPINLOCK(user_list_lock); | 405 | static DEFINE_SPINLOCK(user_list_lock); |
406 | static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; | 406 | |
407 | /* | ||
408 | * Set up a segment that references the real mode segment 0x40 | ||
409 | * that extends up to the end of page zero (that we have reserved). | ||
410 | * This is for buggy BIOS's that refer to (real mode) segment 0x40 | ||
411 | * even though they are called in protected mode. | ||
412 | */ | ||
413 | static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, | ||
414 | (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); | ||
407 | 415 | ||
408 | static const char driver_version[] = "1.16ac"; /* no spaces */ | 416 | static const char driver_version[] = "1.16ac"; /* no spaces */ |
409 | 417 | ||
@@ -2332,15 +2340,6 @@ static int __init apm_init(void) | |||
2332 | pm_flags |= PM_APM; | 2340 | pm_flags |= PM_APM; |
2333 | 2341 | ||
2334 | /* | 2342 | /* |
2335 | * Set up a segment that references the real mode segment 0x40 | ||
2336 | * that extends up to the end of page zero (that we have reserved). | ||
2337 | * This is for buggy BIOS's that refer to (real mode) segment 0x40 | ||
2338 | * even though they are called in protected mode. | ||
2339 | */ | ||
2340 | set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); | ||
2341 | _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); | ||
2342 | |||
2343 | /* | ||
2344 | * Set up the long jump entry point to the APM BIOS, which is called | 2343 | * Set up the long jump entry point to the APM BIOS, which is called |
2345 | * from inline assembly. | 2344 | * from inline assembly. |
2346 | */ | 2345 | */ |
@@ -2358,12 +2357,12 @@ static int __init apm_init(void) | |||
2358 | * code to that CPU. | 2357 | * code to that CPU. |
2359 | */ | 2358 | */ |
2360 | gdt = get_cpu_gdt_table(0); | 2359 | gdt = get_cpu_gdt_table(0); |
2361 | set_base(gdt[APM_CS >> 3], | 2360 | set_desc_base(&gdt[APM_CS >> 3], |
2362 | __va((unsigned long)apm_info.bios.cseg << 4)); | 2361 | (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); |
2363 | set_base(gdt[APM_CS_16 >> 3], | 2362 | set_desc_base(&gdt[APM_CS_16 >> 3], |
2364 | __va((unsigned long)apm_info.bios.cseg_16 << 4)); | 2363 | (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); |
2365 | set_base(gdt[APM_DS >> 3], | 2364 | set_desc_base(&gdt[APM_DS >> 3], |
2366 | __va((unsigned long)apm_info.bios.dseg << 4)); | 2365 | (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); |
2367 | 2366 | ||
2368 | proc_create("apm", 0, NULL, &apm_file_ops); | 2367 | proc_create("apm", 0, NULL, &apm_file_ops); |
2369 | 2368 | ||
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc47e12..4a6aeedcd96 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -3,6 +3,7 @@ | |||
3 | * This code generates raw asm output which is post-processed to extract | 3 | * This code generates raw asm output which is post-processed to extract |
4 | * and format the required data. | 4 | * and format the required data. |
5 | */ | 5 | */ |
6 | #define COMPILE_OFFSETS | ||
6 | 7 | ||
7 | #include <linux/crypto.h> | 8 | #include <linux/crypto.h> |
8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b96a1..68537e957a9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -7,9 +7,13 @@ ifdef CONFIG_FUNCTION_TRACER | |||
7 | CFLAGS_REMOVE_common.o = -pg | 7 | CFLAGS_REMOVE_common.o = -pg |
8 | endif | 8 | endif |
9 | 9 | ||
10 | # Make sure load_percpu_segment has no stackprotector | ||
11 | nostackp := $(call cc-option, -fno-stack-protector) | ||
12 | CFLAGS_common.o := $(nostackp) | ||
13 | |||
10 | obj-y := intel_cacheinfo.o addon_cpuid_features.o | 14 | obj-y := intel_cacheinfo.o addon_cpuid_features.o |
11 | obj-y += proc.o capflags.o powerflags.o common.o | 15 | obj-y += proc.o capflags.o powerflags.o common.o |
12 | obj-y += vmware.o hypervisor.o | 16 | obj-y += vmware.o hypervisor.o sched.o |
13 | 17 | ||
14 | obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o | 18 | obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o |
15 | obj-$(CONFIG_X86_64) += bugs_64.o | 19 | obj-$(CONFIG_X86_64) += bugs_64.o |
@@ -23,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o | |||
23 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o | 27 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
24 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o | 28 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
25 | 29 | ||
26 | obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o | 30 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
27 | 31 | ||
28 | obj-$(CONFIG_X86_MCE) += mcheck/ | 32 | obj-$(CONFIG_X86_MCE) += mcheck/ |
29 | obj-$(CONFIG_MTRR) += mtrr/ | 33 | obj-$(CONFIG_MTRR) += mtrr/ |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 63fddcd082c..c910a716a71 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -2,7 +2,7 @@ | |||
2 | #include <linux/bitops.h> | 2 | #include <linux/bitops.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | 4 | ||
5 | #include <asm/io.h> | 5 | #include <linux/io.h> |
6 | #include <asm/processor.h> | 6 | #include <asm/processor.h> |
7 | #include <asm/apic.h> | 7 | #include <asm/apic.h> |
8 | #include <asm/cpu.h> | 8 | #include <asm/cpu.h> |
@@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) | |||
45 | #define CBAR_ENB (0x80000000) | 45 | #define CBAR_ENB (0x80000000) |
46 | #define CBAR_KEY (0X000000CB) | 46 | #define CBAR_KEY (0X000000CB) |
47 | if (c->x86_model == 9 || c->x86_model == 10) { | 47 | if (c->x86_model == 9 || c->x86_model == 10) { |
48 | if (inl (CBAR) & CBAR_ENB) | 48 | if (inl(CBAR) & CBAR_ENB) |
49 | outl (0 | CBAR_KEY, CBAR); | 49 | outl(0 | CBAR_KEY, CBAR); |
50 | } | 50 | } |
51 | } | 51 | } |
52 | 52 | ||
@@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) | |||
87 | d = d2-d; | 87 | d = d2-d; |
88 | 88 | ||
89 | if (d > 20*K6_BUG_LOOP) | 89 | if (d > 20*K6_BUG_LOOP) |
90 | printk("system stability may be impaired when more than 32 MB are used.\n"); | 90 | printk(KERN_CONT |
91 | "system stability may be impaired when more than 32 MB are used.\n"); | ||
91 | else | 92 | else |
92 | printk("probably OK (after B9730xxxx).\n"); | 93 | printk(KERN_CONT "probably OK (after B9730xxxx).\n"); |
93 | printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); | 94 | printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); |
94 | } | 95 | } |
95 | 96 | ||
@@ -183,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) | |||
183 | * approved Athlon | 184 | * approved Athlon |
184 | */ | 185 | */ |
185 | WARN_ONCE(1, "WARNING: This combination of AMD" | 186 | WARN_ONCE(1, "WARNING: This combination of AMD" |
186 | "processors is not suitable for SMP.\n"); | 187 | " processors is not suitable for SMP.\n"); |
187 | if (!test_taint(TAINT_UNSAFE_SMP)) | 188 | if (!test_taint(TAINT_UNSAFE_SMP)) |
188 | add_taint(TAINT_UNSAFE_SMP); | 189 | add_taint(TAINT_UNSAFE_SMP); |
189 | 190 | ||
@@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) | |||
219 | if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { | 220 | if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { |
220 | rdmsr(MSR_K7_CLK_CTL, l, h); | 221 | rdmsr(MSR_K7_CLK_CTL, l, h); |
221 | if ((l & 0xfff00000) != 0x20000000) { | 222 | if ((l & 0xfff00000) != 0x20000000) { |
222 | printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, | 223 | printk(KERN_INFO |
223 | ((l & 0x000fffff)|0x20000000)); | 224 | "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", |
225 | l, ((l & 0x000fffff)|0x20000000)); | ||
224 | wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); | 226 | wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); |
225 | } | 227 | } |
226 | } | 228 | } |
@@ -251,6 +253,64 @@ static int __cpuinit nearby_node(int apicid) | |||
251 | #endif | 253 | #endif |
252 | 254 | ||
253 | /* | 255 | /* |
256 | * Fixup core topology information for AMD multi-node processors. | ||
257 | * Assumption 1: Number of cores in each internal node is the same. | ||
258 | * Assumption 2: Mixed systems with both single-node and dual-node | ||
259 | * processors are not supported. | ||
260 | */ | ||
261 | #ifdef CONFIG_X86_HT | ||
262 | static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) | ||
263 | { | ||
264 | #ifdef CONFIG_PCI | ||
265 | u32 t, cpn; | ||
266 | u8 n, n_id; | ||
267 | int cpu = smp_processor_id(); | ||
268 | |||
269 | /* fixup topology information only once for a core */ | ||
270 | if (cpu_has(c, X86_FEATURE_AMD_DCM)) | ||
271 | return; | ||
272 | |||
273 | /* check for multi-node processor on boot cpu */ | ||
274 | t = read_pci_config(0, 24, 3, 0xe8); | ||
275 | if (!(t & (1 << 29))) | ||
276 | return; | ||
277 | |||
278 | set_cpu_cap(c, X86_FEATURE_AMD_DCM); | ||
279 | |||
280 | /* cores per node: each internal node has half the number of cores */ | ||
281 | cpn = c->x86_max_cores >> 1; | ||
282 | |||
283 | /* even-numbered NB_id of this dual-node processor */ | ||
284 | n = c->phys_proc_id << 1; | ||
285 | |||
286 | /* | ||
287 | * determine internal node id and assign cores fifty-fifty to | ||
288 | * each node of the dual-node processor | ||
289 | */ | ||
290 | t = read_pci_config(0, 24 + n, 3, 0xe8); | ||
291 | n = (t>>30) & 0x3; | ||
292 | if (n == 0) { | ||
293 | if (c->cpu_core_id < cpn) | ||
294 | n_id = 0; | ||
295 | else | ||
296 | n_id = 1; | ||
297 | } else { | ||
298 | if (c->cpu_core_id < cpn) | ||
299 | n_id = 1; | ||
300 | else | ||
301 | n_id = 0; | ||
302 | } | ||
303 | |||
304 | /* compute entire NodeID, use llc_shared_map to store sibling info */ | ||
305 | per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; | ||
306 | |||
307 | /* fixup core id to be in range from 0 to cpn */ | ||
308 | c->cpu_core_id = c->cpu_core_id % cpn; | ||
309 | #endif | ||
310 | } | ||
311 | #endif | ||
312 | |||
313 | /* | ||
254 | * On a AMD dual core setup the lower bits of the APIC id distingush the cores. | 314 | * On a AMD dual core setup the lower bits of the APIC id distingush the cores. |
255 | * Assumes number of cores is a power of two. | 315 | * Assumes number of cores is a power of two. |
256 | */ | 316 | */ |
@@ -267,17 +327,31 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | |||
267 | c->phys_proc_id = c->initial_apicid >> bits; | 327 | c->phys_proc_id = c->initial_apicid >> bits; |
268 | /* use socket ID also for last level cache */ | 328 | /* use socket ID also for last level cache */ |
269 | per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; | 329 | per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; |
330 | /* fixup topology information on multi-node processors */ | ||
331 | if ((c->x86 == 0x10) && (c->x86_model == 9)) | ||
332 | amd_fixup_dcm(c); | ||
270 | #endif | 333 | #endif |
271 | } | 334 | } |
272 | 335 | ||
336 | int amd_get_nb_id(int cpu) | ||
337 | { | ||
338 | int id = 0; | ||
339 | #ifdef CONFIG_SMP | ||
340 | id = per_cpu(cpu_llc_id, cpu); | ||
341 | #endif | ||
342 | return id; | ||
343 | } | ||
344 | EXPORT_SYMBOL_GPL(amd_get_nb_id); | ||
345 | |||
273 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | 346 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) |
274 | { | 347 | { |
275 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 348 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) |
276 | int cpu = smp_processor_id(); | 349 | int cpu = smp_processor_id(); |
277 | int node; | 350 | int node; |
278 | unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; | 351 | unsigned apicid = c->apicid; |
352 | |||
353 | node = per_cpu(cpu_llc_id, cpu); | ||
279 | 354 | ||
280 | node = c->phys_proc_id; | ||
281 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | 355 | if (apicid_to_node[apicid] != NUMA_NO_NODE) |
282 | node = apicid_to_node[apicid]; | 356 | node = apicid_to_node[apicid]; |
283 | if (!node_online(node)) { | 357 | if (!node_online(node)) { |
@@ -398,18 +472,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
398 | u32 level; | 472 | u32 level; |
399 | 473 | ||
400 | level = cpuid_eax(1); | 474 | level = cpuid_eax(1); |
401 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | 475 | if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) |
402 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 476 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
403 | 477 | ||
404 | /* | 478 | /* |
405 | * Some BIOSes incorrectly force this feature, but only K8 | 479 | * Some BIOSes incorrectly force this feature, but only K8 |
406 | * revision D (model = 0x14) and later actually support it. | 480 | * revision D (model = 0x14) and later actually support it. |
481 | * (AMD Erratum #110, docId: 25759). | ||
407 | */ | 482 | */ |
408 | if (c->x86_model < 0x14) | 483 | if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { |
484 | u64 val; | ||
485 | |||
409 | clear_cpu_cap(c, X86_FEATURE_LAHF_LM); | 486 | clear_cpu_cap(c, X86_FEATURE_LAHF_LM); |
487 | if (!rdmsrl_amd_safe(0xc001100d, &val)) { | ||
488 | val &= ~(1ULL << 32); | ||
489 | wrmsrl_amd_safe(0xc001100d, val); | ||
490 | } | ||
491 | } | ||
492 | |||
410 | } | 493 | } |
411 | if (c->x86 == 0x10 || c->x86 == 0x11) | 494 | if (c->x86 == 0x10 || c->x86 == 0x11) |
412 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 495 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
496 | |||
497 | /* get apicid instead of initial apic id from cpuid */ | ||
498 | c->apicid = hard_smp_processor_id(); | ||
413 | #else | 499 | #else |
414 | 500 | ||
415 | /* | 501 | /* |
@@ -494,27 +580,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
494 | * benefit in doing so. | 580 | * benefit in doing so. |
495 | */ | 581 | */ |
496 | if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { | 582 | if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { |
497 | printk(KERN_DEBUG "tseg: %010llx\n", tseg); | 583 | printk(KERN_DEBUG "tseg: %010llx\n", tseg); |
498 | if ((tseg>>PMD_SHIFT) < | 584 | if ((tseg>>PMD_SHIFT) < |
499 | (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || | 585 | (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || |
500 | ((tseg>>PMD_SHIFT) < | 586 | ((tseg>>PMD_SHIFT) < |
501 | (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && | 587 | (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && |
502 | (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) | 588 | (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) |
503 | set_memory_4k((unsigned long)__va(tseg), 1); | 589 | set_memory_4k((unsigned long)__va(tseg), 1); |
504 | } | 590 | } |
505 | } | 591 | } |
506 | #endif | 592 | #endif |
507 | } | 593 | } |
508 | 594 | ||
509 | #ifdef CONFIG_X86_32 | 595 | #ifdef CONFIG_X86_32 |
510 | static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) | 596 | static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, |
597 | unsigned int size) | ||
511 | { | 598 | { |
512 | /* AMD errata T13 (order #21922) */ | 599 | /* AMD errata T13 (order #21922) */ |
513 | if ((c->x86 == 6)) { | 600 | if ((c->x86 == 6)) { |
514 | if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ | 601 | /* Duron Rev A0 */ |
602 | if (c->x86_model == 3 && c->x86_mask == 0) | ||
515 | size = 64; | 603 | size = 64; |
604 | /* Tbird rev A1/A2 */ | ||
516 | if (c->x86_model == 4 && | 605 | if (c->x86_model == 4 && |
517 | (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ | 606 | (c->x86_mask == 0 || c->x86_mask == 1)) |
518 | size = 256; | 607 | size = 256; |
519 | } | 608 | } |
520 | return size; | 609 | return size; |
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c8e315f1aa8..01a26521239 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c | |||
@@ -81,7 +81,7 @@ static void __init check_fpu(void) | |||
81 | 81 | ||
82 | boot_cpu_data.fdiv_bug = fdiv_bug; | 82 | boot_cpu_data.fdiv_bug = fdiv_bug; |
83 | if (boot_cpu_data.fdiv_bug) | 83 | if (boot_cpu_data.fdiv_bug) |
84 | printk("Hmm, FPU with FDIV bug.\n"); | 84 | printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); |
85 | } | 85 | } |
86 | 86 | ||
87 | static void __init check_hlt(void) | 87 | static void __init check_hlt(void) |
@@ -98,7 +98,7 @@ static void __init check_hlt(void) | |||
98 | halt(); | 98 | halt(); |
99 | halt(); | 99 | halt(); |
100 | halt(); | 100 | halt(); |
101 | printk("OK.\n"); | 101 | printk(KERN_CONT "OK.\n"); |
102 | } | 102 | } |
103 | 103 | ||
104 | /* | 104 | /* |
@@ -122,9 +122,9 @@ static void __init check_popad(void) | |||
122 | * CPU hard. Too bad. | 122 | * CPU hard. Too bad. |
123 | */ | 123 | */ |
124 | if (res != 12345678) | 124 | if (res != 12345678) |
125 | printk("Buggy.\n"); | 125 | printk(KERN_CONT "Buggy.\n"); |
126 | else | 126 | else |
127 | printk("OK.\n"); | 127 | printk(KERN_CONT "OK.\n"); |
128 | #endif | 128 | #endif |
129 | } | 129 | } |
130 | 130 | ||
@@ -156,7 +156,7 @@ void __init check_bugs(void) | |||
156 | { | 156 | { |
157 | identify_boot_cpu(); | 157 | identify_boot_cpu(); |
158 | #ifndef CONFIG_SMP | 158 | #ifndef CONFIG_SMP |
159 | printk("CPU: "); | 159 | printk(KERN_INFO "CPU: "); |
160 | print_cpu_info(&boot_cpu_data); | 160 | print_cpu_info(&boot_cpu_data); |
161 | #endif | 161 | #endif |
162 | check_config(); | 162 | check_config(); |
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed0649d4..04f0fe5af83 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c | |||
@@ -15,7 +15,7 @@ void __init check_bugs(void) | |||
15 | { | 15 | { |
16 | identify_boot_cpu(); | 16 | identify_boot_cpu(); |
17 | #if !defined(CONFIG_SMP) | 17 | #if !defined(CONFIG_SMP) |
18 | printk("CPU: "); | 18 | printk(KERN_INFO "CPU: "); |
19 | print_cpu_info(&boot_cpu_data); | 19 | print_cpu_info(&boot_cpu_data); |
20 | #endif | 20 | #endif |
21 | alternative_instructions(); | 21 | alternative_instructions(); |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5ce60a88027..cc25c2b4a56 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -13,13 +13,13 @@ | |||
13 | #include <linux/io.h> | 13 | #include <linux/io.h> |
14 | 14 | ||
15 | #include <asm/stackprotector.h> | 15 | #include <asm/stackprotector.h> |
16 | #include <asm/perf_counter.h> | 16 | #include <asm/perf_event.h> |
17 | #include <asm/mmu_context.h> | 17 | #include <asm/mmu_context.h> |
18 | #include <asm/hypervisor.h> | 18 | #include <asm/hypervisor.h> |
19 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
20 | #include <asm/sections.h> | 20 | #include <asm/sections.h> |
21 | #include <asm/topology.h> | 21 | #include <linux/topology.h> |
22 | #include <asm/cpumask.h> | 22 | #include <linux/cpumask.h> |
23 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
24 | #include <asm/atomic.h> | 24 | #include <asm/atomic.h> |
25 | #include <asm/proto.h> | 25 | #include <asm/proto.h> |
@@ -28,13 +28,12 @@ | |||
28 | #include <asm/desc.h> | 28 | #include <asm/desc.h> |
29 | #include <asm/i387.h> | 29 | #include <asm/i387.h> |
30 | #include <asm/mtrr.h> | 30 | #include <asm/mtrr.h> |
31 | #include <asm/numa.h> | 31 | #include <linux/numa.h> |
32 | #include <asm/asm.h> | 32 | #include <asm/asm.h> |
33 | #include <asm/cpu.h> | 33 | #include <asm/cpu.h> |
34 | #include <asm/mce.h> | 34 | #include <asm/mce.h> |
35 | #include <asm/msr.h> | 35 | #include <asm/msr.h> |
36 | #include <asm/pat.h> | 36 | #include <asm/pat.h> |
37 | #include <asm/smp.h> | ||
38 | 37 | ||
39 | #ifdef CONFIG_X86_LOCAL_APIC | 38 | #ifdef CONFIG_X86_LOCAL_APIC |
40 | #include <asm/uv/uv.h> | 39 | #include <asm/uv/uv.h> |
@@ -94,45 +93,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { | |||
94 | * TLS descriptors are currently at a different place compared to i386. | 93 | * TLS descriptors are currently at a different place compared to i386. |
95 | * Hopefully nobody expects them at a fixed place (Wine?) | 94 | * Hopefully nobody expects them at a fixed place (Wine?) |
96 | */ | 95 | */ |
97 | [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, | 96 | [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), |
98 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, | 97 | [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), |
99 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, | 98 | [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), |
100 | [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, | 99 | [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), |
101 | [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, | 100 | [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), |
102 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, | 101 | [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), |
103 | #else | 102 | #else |
104 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, | 103 | [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), |
105 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, | 104 | [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), |
106 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, | 105 | [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), |
107 | [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, | 106 | [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), |
108 | /* | 107 | /* |
109 | * Segments used for calling PnP BIOS have byte granularity. | 108 | * Segments used for calling PnP BIOS have byte granularity. |
109 | * The code segments and data segments have fixed 64k limits, | 108 | * The code segments and data segments have fixed 64k limits, |
111 | * the transfer segment sizes are set at run time. | 110 | * the transfer segment sizes are set at run time. |
112 | */ | 111 | */ |
113 | /* 32-bit code */ | 112 | /* 32-bit code */ |
114 | [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, | 113 | [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), |
115 | /* 16-bit code */ | 114 | /* 16-bit code */ |
116 | [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, | 115 | [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), |
117 | /* 16-bit data */ | 116 | /* 16-bit data */ |
118 | [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, | 117 | [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), |
119 | /* 16-bit data */ | 118 | /* 16-bit data */ |
120 | [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, | 119 | [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), |
121 | /* 16-bit data */ | 120 | /* 16-bit data */ |
122 | [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, | 121 | [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), |
123 | /* | 122 | /* |
124 | * The APM segments have byte granularity and their bases | 123 | * The APM segments have byte granularity and their bases |
125 | * are set at run time. All have 64k limits. | 124 | * are set at run time. All have 64k limits. |
126 | */ | 125 | */ |
127 | /* 32-bit code */ | 126 | /* 32-bit code */ |
128 | [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, | 127 | [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), |
129 | /* 16-bit code */ | 128 | /* 16-bit code */ |
130 | [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, | 129 | [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), |
131 | /* data */ | 130 | /* data */ |
132 | [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, | 131 | [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), |
133 | 132 | ||
134 | [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, | 133 | [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), |
135 | [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, | 134 | [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), |
136 | GDT_STACK_CANARY_INIT | 135 | GDT_STACK_CANARY_INIT |
137 | #endif | 136 | #endif |
138 | } }; | 137 | } }; |
@@ -870,7 +869,7 @@ void __init identify_boot_cpu(void) | |||
870 | #else | 869 | #else |
871 | vgetcpu_set_mode(); | 870 | vgetcpu_set_mode(); |
872 | #endif | 871 | #endif |
873 | init_hw_perf_counters(); | 872 | init_hw_perf_events(); |
874 | } | 873 | } |
875 | 874 | ||
876 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | 875 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) |
@@ -982,18 +981,26 @@ static __init int setup_disablecpuid(char *arg) | |||
982 | __setup("clearcpuid=", setup_disablecpuid); | 981 | __setup("clearcpuid=", setup_disablecpuid); |
983 | 982 | ||
984 | #ifdef CONFIG_X86_64 | 983 | #ifdef CONFIG_X86_64 |
985 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | 984 | struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; |
986 | 985 | ||
987 | DEFINE_PER_CPU_FIRST(union irq_stack_union, | 986 | DEFINE_PER_CPU_FIRST(union irq_stack_union, |
988 | irq_stack_union) __aligned(PAGE_SIZE); | 987 | irq_stack_union) __aligned(PAGE_SIZE); |
989 | 988 | ||
990 | DEFINE_PER_CPU(char *, irq_stack_ptr) = | 989 | /* |
991 | init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; | 990 | * The following four percpu variables are hot. Align current_task to |
991 | * cacheline size such that all four fall in the same cacheline. | ||
992 | */ | ||
993 | DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = | ||
994 | &init_task; | ||
995 | EXPORT_PER_CPU_SYMBOL(current_task); | ||
992 | 996 | ||
993 | DEFINE_PER_CPU(unsigned long, kernel_stack) = | 997 | DEFINE_PER_CPU(unsigned long, kernel_stack) = |
994 | (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; | 998 | (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; |
995 | EXPORT_PER_CPU_SYMBOL(kernel_stack); | 999 | EXPORT_PER_CPU_SYMBOL(kernel_stack); |
996 | 1000 | ||
1001 | DEFINE_PER_CPU(char *, irq_stack_ptr) = | ||
1002 | init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; | ||
1003 | |||
997 | DEFINE_PER_CPU(unsigned int, irq_count) = -1; | 1004 | DEFINE_PER_CPU(unsigned int, irq_count) = -1; |
998 | 1005 | ||
999 | /* | 1006 | /* |
@@ -1008,8 +1015,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { | |||
1008 | }; | 1015 | }; |
1009 | 1016 | ||
1010 | static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks | 1017 | static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks |
1011 | [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) | 1018 | [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); |
1012 | __aligned(PAGE_SIZE); | ||
1013 | 1019 | ||
1014 | /* May not be marked __init: used by software suspend */ | 1020 | /* May not be marked __init: used by software suspend */ |
1015 | void syscall_init(void) | 1021 | void syscall_init(void) |
@@ -1042,8 +1048,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); | |||
1042 | 1048 | ||
1043 | #else /* CONFIG_X86_64 */ | 1049 | #else /* CONFIG_X86_64 */ |
1044 | 1050 | ||
1051 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | ||
1052 | EXPORT_PER_CPU_SYMBOL(current_task); | ||
1053 | |||
1045 | #ifdef CONFIG_CC_STACKPROTECTOR | 1054 | #ifdef CONFIG_CC_STACKPROTECTOR |
1046 | DEFINE_PER_CPU(unsigned long, stack_canary); | 1055 | DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); |
1047 | #endif | 1056 | #endif |
1048 | 1057 | ||
1049 | /* Make sure %fs and %gs are initialized properly in idle threads */ | 1058 | /* Make sure %fs and %gs are initialized properly in idle threads */ |
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c | |||
@@ -30,8 +30,8 @@ | |||
30 | #include <asm/apic.h> | 30 | #include <asm/apic.h> |
31 | #include <asm/desc.h> | 31 | #include <asm/desc.h> |
32 | 32 | ||
33 | static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); | 33 | static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); |
34 | static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); | 34 | static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); |
35 | static DEFINE_PER_CPU(int, cpu_priv_count); | 35 | static DEFINE_PER_CPU(int, cpu_priv_count); |
36 | 36 | ||
37 | static DEFINE_MUTEX(cpu_debug_lock); | 37 | static DEFINE_MUTEX(cpu_debug_lock); |
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..7d5c3b0ea8d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -33,7 +33,7 @@ | |||
33 | #include <linux/cpufreq.h> | 33 | #include <linux/cpufreq.h> |
34 | #include <linux/compiler.h> | 34 | #include <linux/compiler.h> |
35 | #include <linux/dmi.h> | 35 | #include <linux/dmi.h> |
36 | #include <trace/power.h> | 36 | #include <trace/events/power.h> |
37 | 37 | ||
38 | #include <linux/acpi.h> | 38 | #include <linux/acpi.h> |
39 | #include <linux/io.h> | 39 | #include <linux/io.h> |
@@ -60,7 +60,6 @@ enum { | |||
60 | }; | 60 | }; |
61 | 61 | ||
62 | #define INTEL_MSR_RANGE (0xffff) | 62 | #define INTEL_MSR_RANGE (0xffff) |
63 | #define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) | ||
64 | 63 | ||
65 | struct acpi_cpufreq_data { | 64 | struct acpi_cpufreq_data { |
66 | struct acpi_processor_performance *acpi_data; | 65 | struct acpi_processor_performance *acpi_data; |
@@ -71,13 +70,7 @@ struct acpi_cpufreq_data { | |||
71 | 70 | ||
72 | static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); | 71 | static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); |
73 | 72 | ||
74 | struct acpi_msr_data { | 73 | static DEFINE_PER_CPU(struct aperfmperf, old_perf); |
75 | u64 saved_aperf, saved_mperf; | ||
76 | }; | ||
77 | |||
78 | static DEFINE_PER_CPU(struct acpi_msr_data, msr_data); | ||
79 | |||
80 | DEFINE_TRACE(power_mark); | ||
81 | 74 | ||
82 | /* acpi_perf_data is a pointer to percpu data. */ | 75 | /* acpi_perf_data is a pointer to percpu data. */ |
83 | static struct acpi_processor_performance *acpi_perf_data; | 76 | static struct acpi_processor_performance *acpi_perf_data; |
@@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask) | |||
244 | return cmd.val; | 237 | return cmd.val; |
245 | } | 238 | } |
246 | 239 | ||
247 | struct perf_pair { | ||
248 | union { | ||
249 | struct { | ||
250 | u32 lo; | ||
251 | u32 hi; | ||
252 | } split; | ||
253 | u64 whole; | ||
254 | } aperf, mperf; | ||
255 | }; | ||
256 | |||
257 | /* Called via smp_call_function_single(), on the target CPU */ | 240 | /* Called via smp_call_function_single(), on the target CPU */ |
258 | static void read_measured_perf_ctrs(void *_cur) | 241 | static void read_measured_perf_ctrs(void *_cur) |
259 | { | 242 | { |
260 | struct perf_pair *cur = _cur; | 243 | struct aperfmperf *am = _cur; |
261 | 244 | ||
262 | rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi); | 245 | get_aperfmperf(am); |
263 | rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi); | ||
264 | } | 246 | } |
265 | 247 | ||
266 | /* | 248 | /* |
@@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur) | |||
279 | static unsigned int get_measured_perf(struct cpufreq_policy *policy, | 261 | static unsigned int get_measured_perf(struct cpufreq_policy *policy, |
280 | unsigned int cpu) | 262 | unsigned int cpu) |
281 | { | 263 | { |
282 | struct perf_pair readin, cur; | 264 | struct aperfmperf perf; |
283 | unsigned int perf_percent; | 265 | unsigned long ratio; |
284 | unsigned int retval; | 266 | unsigned int retval; |
285 | 267 | ||
286 | if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1)) | 268 | if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) |
287 | return 0; | 269 | return 0; |
288 | 270 | ||
289 | cur.aperf.whole = readin.aperf.whole - | 271 | ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); |
290 | per_cpu(msr_data, cpu).saved_aperf; | 272 | per_cpu(old_perf, cpu) = perf; |
291 | cur.mperf.whole = readin.mperf.whole - | ||
292 | per_cpu(msr_data, cpu).saved_mperf; | ||
293 | per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole; | ||
294 | per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole; | ||
295 | |||
296 | #ifdef __i386__ | ||
297 | /* | ||
298 | * We dont want to do 64 bit divide with 32 bit kernel | ||
299 | * Get an approximate value. Return failure in case we cannot get | ||
300 | * an approximate value. | ||
301 | */ | ||
302 | if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) { | ||
303 | int shift_count; | ||
304 | u32 h; | ||
305 | |||
306 | h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi); | ||
307 | shift_count = fls(h); | ||
308 | |||
309 | cur.aperf.whole >>= shift_count; | ||
310 | cur.mperf.whole >>= shift_count; | ||
311 | } | ||
312 | |||
313 | if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) { | ||
314 | int shift_count = 7; | ||
315 | cur.aperf.split.lo >>= shift_count; | ||
316 | cur.mperf.split.lo >>= shift_count; | ||
317 | } | ||
318 | |||
319 | if (cur.aperf.split.lo && cur.mperf.split.lo) | ||
320 | perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo; | ||
321 | else | ||
322 | perf_percent = 0; | ||
323 | |||
324 | #else | ||
325 | if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { | ||
326 | int shift_count = 7; | ||
327 | cur.aperf.whole >>= shift_count; | ||
328 | cur.mperf.whole >>= shift_count; | ||
329 | } | ||
330 | |||
331 | if (cur.aperf.whole && cur.mperf.whole) | ||
332 | perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; | ||
333 | else | ||
334 | perf_percent = 0; | ||
335 | |||
336 | #endif | ||
337 | 273 | ||
338 | retval = (policy->cpuinfo.max_freq * perf_percent) / 100; | 274 | retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; |
339 | 275 | ||
340 | return retval; | 276 | return retval; |
341 | } | 277 | } |
@@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
394 | unsigned int next_perf_state = 0; /* Index into perf table */ | 330 | unsigned int next_perf_state = 0; /* Index into perf table */ |
395 | unsigned int i; | 331 | unsigned int i; |
396 | int result = 0; | 332 | int result = 0; |
397 | struct power_trace it; | ||
398 | 333 | ||
399 | dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); | 334 | dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); |
400 | 335 | ||
@@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, | |||
426 | } | 361 | } |
427 | } | 362 | } |
428 | 363 | ||
429 | trace_power_mark(&it, POWER_PSTATE, next_perf_state); | 364 | trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); |
430 | 365 | ||
431 | switch (data->cpu_feature) { | 366 | switch (data->cpu_feature) { |
432 | case SYSTEM_INTEL_MSR_CAPABLE: | 367 | case SYSTEM_INTEL_MSR_CAPABLE: |
@@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { | |||
588 | }, | 523 | }, |
589 | { } | 524 | { } |
590 | }; | 525 | }; |
526 | |||
527 | static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) | ||
528 | { | ||
529 | /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf | ||
530 | * AL30: A Machine Check Exception (MCE) Occurring during an | ||
531 | * Enhanced Intel SpeedStep Technology Ratio Change May Cause | ||
532 | * Both Processor Cores to Lock Up when HT is enabled*/ | ||
533 | if (c->x86_vendor == X86_VENDOR_INTEL) { | ||
534 | if ((c->x86 == 15) && | ||
535 | (c->x86_model == 6) && | ||
536 | (c->x86_mask == 8) && smt_capable()) | ||
537 | return -ENODEV; | ||
538 | } | ||
539 | return 0; | ||
540 | } | ||
591 | #endif | 541 | #endif |
592 | 542 | ||
593 | static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) | 543 | static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) |
@@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
602 | 552 | ||
603 | dprintk("acpi_cpufreq_cpu_init\n"); | 553 | dprintk("acpi_cpufreq_cpu_init\n"); |
604 | 554 | ||
555 | #ifdef CONFIG_SMP | ||
556 | result = acpi_cpufreq_blacklist(c); | ||
557 | if (result) | ||
558 | return result; | ||
559 | #endif | ||
560 | |||
605 | data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); | 561 | data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); |
606 | if (!data) | 562 | if (!data) |
607 | return -ENOMEM; | 563 | return -ENOMEM; |
@@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) | |||
731 | acpi_processor_notify_smm(THIS_MODULE); | 687 | acpi_processor_notify_smm(THIS_MODULE); |
732 | 688 | ||
733 | /* Check for APERF/MPERF support in hardware */ | 689 | /* Check for APERF/MPERF support in hardware */ |
734 | if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) { | 690 | if (cpu_has(c, X86_FEATURE_APERFMPERF)) |
735 | unsigned int ecx; | 691 | acpi_cpufreq_driver.getavg = get_measured_perf; |
736 | ecx = cpuid_ecx(6); | ||
737 | if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) | ||
738 | acpi_cpufreq_driver.getavg = get_measured_perf; | ||
739 | } | ||
740 | 692 | ||
741 | dprintk("CPU%u - ACPI performance management activated.\n", cpu); | 693 | dprintk("CPU%u - ACPI performance management activated.\n", cpu); |
742 | for (i = 0; i < perf->state_count; i++) | 694 | for (i = 0; i < perf->state_count; i++) |
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..6394aa5c798 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, | |||
605 | return 0; | 605 | return 0; |
606 | } | 606 | } |
607 | 607 | ||
608 | static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry) | 608 | static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, |
609 | unsigned int entry) | ||
609 | { | 610 | { |
610 | data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; | 611 | powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; |
611 | } | 612 | } |
612 | 613 | ||
613 | static void print_basics(struct powernow_k8_data *data) | 614 | static void print_basics(struct powernow_k8_data *data) |
@@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) | |||
854 | goto err_out; | 855 | goto err_out; |
855 | } | 856 | } |
856 | 857 | ||
858 | /* fill in data */ | ||
859 | data->numps = data->acpi_data.state_count; | ||
860 | powernow_k8_acpi_pst_values(data, 0); | ||
861 | |||
857 | if (cpu_family == CPU_HW_PSTATE) | 862 | if (cpu_family == CPU_HW_PSTATE) |
858 | ret_val = fill_powernow_table_pstate(data, powernow_table); | 863 | ret_val = fill_powernow_table_pstate(data, powernow_table); |
859 | else | 864 | else |
@@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) | |||
866 | powernow_table[data->acpi_data.state_count].index = 0; | 871 | powernow_table[data->acpi_data.state_count].index = 0; |
867 | data->powernow_table = powernow_table; | 872 | data->powernow_table = powernow_table; |
868 | 873 | ||
869 | /* fill in data */ | ||
870 | data->numps = data->acpi_data.state_count; | ||
871 | if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) | 874 | if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) |
872 | print_basics(data); | 875 | print_basics(data); |
873 | powernow_k8_acpi_pst_values(data, 0); | ||
874 | 876 | ||
875 | /* notify BIOS that we exist */ | 877 | /* notify BIOS that we exist */ |
876 | acpi_processor_notify_smm(THIS_MODULE); | 878 | acpi_processor_notify_smm(THIS_MODULE); |
@@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, | |||
914 | "bad value %d.\n", i, index); | 916 | "bad value %d.\n", i, index); |
915 | printk(KERN_ERR PFX "Please report to BIOS " | 917 | printk(KERN_ERR PFX "Please report to BIOS " |
916 | "manufacturer\n"); | 918 | "manufacturer\n"); |
917 | invalidate_entry(data, i); | 919 | invalidate_entry(powernow_table, i); |
918 | continue; | 920 | continue; |
919 | } | 921 | } |
920 | rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); | 922 | rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); |
921 | if (!(hi & HW_PSTATE_VALID_MASK)) { | 923 | if (!(hi & HW_PSTATE_VALID_MASK)) { |
922 | dprintk("invalid pstate %d, ignoring\n", index); | 924 | dprintk("invalid pstate %d, ignoring\n", index); |
923 | invalidate_entry(data, i); | 925 | invalidate_entry(powernow_table, i); |
924 | continue; | 926 | continue; |
925 | } | 927 | } |
926 | 928 | ||
@@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, | |||
941 | struct cpufreq_frequency_table *powernow_table) | 943 | struct cpufreq_frequency_table *powernow_table) |
942 | { | 944 | { |
943 | int i; | 945 | int i; |
944 | int cntlofreq = 0; | ||
945 | 946 | ||
946 | for (i = 0; i < data->acpi_data.state_count; i++) { | 947 | for (i = 0; i < data->acpi_data.state_count; i++) { |
947 | u32 fid; | 948 | u32 fid; |
@@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, | |||
970 | /* verify frequency is OK */ | 971 | /* verify frequency is OK */ |
971 | if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { | 972 | if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { |
972 | dprintk("invalid freq %u kHz, ignoring\n", freq); | 973 | dprintk("invalid freq %u kHz, ignoring\n", freq); |
973 | invalidate_entry(data, i); | 974 | invalidate_entry(powernow_table, i); |
974 | continue; | 975 | continue; |
975 | } | 976 | } |
976 | 977 | ||
@@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, | |||
978 | * BIOSs are using "off" to indicate invalid */ | 979 | * BIOSs are using "off" to indicate invalid */ |
979 | if (vid == VID_OFF) { | 980 | if (vid == VID_OFF) { |
980 | dprintk("invalid vid %u, ignoring\n", vid); | 981 | dprintk("invalid vid %u, ignoring\n", vid); |
981 | invalidate_entry(data, i); | 982 | invalidate_entry(powernow_table, i); |
982 | continue; | 983 | continue; |
983 | } | 984 | } |
984 | 985 | ||
985 | /* verify only 1 entry from the lo frequency table */ | ||
986 | if (fid < HI_FID_TABLE_BOTTOM) { | ||
987 | if (cntlofreq) { | ||
988 | /* if both entries are the same, | ||
989 | * ignore this one ... */ | ||
990 | if ((freq != powernow_table[cntlofreq].frequency) || | ||
991 | (index != powernow_table[cntlofreq].index)) { | ||
992 | printk(KERN_ERR PFX | ||
993 | "Too many lo freq table " | ||
994 | "entries\n"); | ||
995 | return 1; | ||
996 | } | ||
997 | |||
998 | dprintk("double low frequency table entry, " | ||
999 | "ignoring it.\n"); | ||
1000 | invalidate_entry(data, i); | ||
1001 | continue; | ||
1002 | } else | ||
1003 | cntlofreq = i; | ||
1004 | } | ||
1005 | |||
1006 | if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { | 986 | if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { |
1007 | printk(KERN_INFO PFX "invalid freq entries " | 987 | printk(KERN_INFO PFX "invalid freq entries " |
1008 | "%u kHz vs. %u kHz\n", freq, | 988 | "%u kHz vs. %u kHz\n", freq, |
1009 | (unsigned int) | 989 | (unsigned int) |
1010 | (data->acpi_data.states[i].core_frequency | 990 | (data->acpi_data.states[i].core_frequency |
1011 | * 1000)); | 991 | * 1000)); |
1012 | invalidate_entry(data, i); | 992 | invalidate_entry(powernow_table, i); |
1013 | continue; | 993 | continue; |
1014 | } | 994 | } |
1015 | } | 995 | } |
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 593171e967e..19807b89f05 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c | |||
@@ -3,10 +3,10 @@ | |||
3 | #include <linux/delay.h> | 3 | #include <linux/delay.h> |
4 | #include <linux/pci.h> | 4 | #include <linux/pci.h> |
5 | #include <asm/dma.h> | 5 | #include <asm/dma.h> |
6 | #include <asm/io.h> | 6 | #include <linux/io.h> |
7 | #include <asm/processor-cyrix.h> | 7 | #include <asm/processor-cyrix.h> |
8 | #include <asm/processor-flags.h> | 8 | #include <asm/processor-flags.h> |
9 | #include <asm/timer.h> | 9 | #include <linux/timer.h> |
10 | #include <asm/pci-direct.h> | 10 | #include <asm/pci-direct.h> |
11 | #include <asm/tsc.h> | 11 | #include <asm/tsc.h> |
12 | 12 | ||
@@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) | |||
282 | * The 5510/5520 companion chips have a funky PIT. | 282 | * The 5510/5520 companion chips have a funky PIT. |
283 | */ | 283 | */ |
284 | if (vendor == PCI_VENDOR_ID_CYRIX && | 284 | if (vendor == PCI_VENDOR_ID_CYRIX && |
285 | (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) | 285 | (device == PCI_DEVICE_ID_CYRIX_5510 || |
286 | device == PCI_DEVICE_ID_CYRIX_5520)) | ||
286 | mark_tsc_unstable("cyrix 5510/5520 detected"); | 287 | mark_tsc_unstable("cyrix 5510/5520 detected"); |
287 | } | 288 | } |
288 | #endif | 289 | #endif |
@@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) | |||
299 | * ? : 0x7x | 300 | * ? : 0x7x |
300 | * GX1 : 0x8x GX1 datasheet 56 | 301 | * GX1 : 0x8x GX1 datasheet 56 |
301 | */ | 302 | */ |
302 | if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) | 303 | if ((0x30 <= dir1 && dir1 <= 0x6f) || |
304 | (0x80 <= dir1 && dir1 <= 0x8f)) | ||
303 | geode_configure(); | 305 | geode_configure(); |
304 | return; | 306 | return; |
305 | } else { /* MediaGX */ | 307 | } else { /* MediaGX */ |
@@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) | |||
427 | printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); | 429 | printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); |
428 | local_irq_save(flags); | 430 | local_irq_save(flags); |
429 | ccr3 = getCx86(CX86_CCR3); | 431 | ccr3 = getCx86(CX86_CCR3); |
430 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ | 432 | /* enable MAPEN */ |
431 | setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ | 433 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); |
432 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ | 434 | /* enable cpuid */ |
435 | setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); | ||
436 | /* disable MAPEN */ | ||
437 | setCx86(CX86_CCR3, ccr3); | ||
433 | local_irq_restore(flags); | 438 | local_irq_restore(flags); |
434 | } | 439 | } |
435 | } | 440 | } |
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index fb5b86af0b0..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c | |||
@@ -28,18 +28,10 @@ | |||
28 | static inline void __cpuinit | 28 | static inline void __cpuinit |
29 | detect_hypervisor_vendor(struct cpuinfo_x86 *c) | 29 | detect_hypervisor_vendor(struct cpuinfo_x86 *c) |
30 | { | 30 | { |
31 | if (vmware_platform()) { | 31 | if (vmware_platform()) |
32 | c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; | 32 | c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; |
33 | } else { | 33 | else |
34 | c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; | 34 | c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; |
35 | } | ||
36 | } | ||
37 | |||
38 | unsigned long get_hypervisor_tsc_freq(void) | ||
39 | { | ||
40 | if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) | ||
41 | return vmware_get_tsc_khz(); | ||
42 | return 0; | ||
43 | } | 35 | } |
44 | 36 | ||
45 | static inline void __cpuinit | 37 | static inline void __cpuinit |
@@ -56,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) | |||
56 | detect_hypervisor_vendor(c); | 48 | detect_hypervisor_vendor(c); |
57 | hypervisor_set_feature_bits(c); | 49 | hypervisor_set_feature_bits(c); |
58 | } | 50 | } |
51 | |||
52 | void __init init_hypervisor_platform(void) | ||
53 | { | ||
54 | init_hypervisor(&boot_cpu_data); | ||
55 | if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) | ||
56 | vmware_platform_setup(); | ||
57 | } | ||
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3260ab04499..40e1835b35e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -7,17 +7,17 @@ | |||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <linux/thread_info.h> | 8 | #include <linux/thread_info.h> |
9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
10 | #include <linux/uaccess.h> | ||
10 | 11 | ||
11 | #include <asm/processor.h> | 12 | #include <asm/processor.h> |
12 | #include <asm/pgtable.h> | 13 | #include <asm/pgtable.h> |
13 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/ds.h> | 15 | #include <asm/ds.h> |
16 | #include <asm/bugs.h> | 16 | #include <asm/bugs.h> |
17 | #include <asm/cpu.h> | 17 | #include <asm/cpu.h> |
18 | 18 | ||
19 | #ifdef CONFIG_X86_64 | 19 | #ifdef CONFIG_X86_64 |
20 | #include <asm/topology.h> | 20 | #include <linux/topology.h> |
21 | #include <asm/numa_64.h> | 21 | #include <asm/numa_64.h> |
22 | #endif | 22 | #endif |
23 | 23 | ||
@@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | |||
174 | #ifdef CONFIG_X86_F00F_BUG | 174 | #ifdef CONFIG_X86_F00F_BUG |
175 | /* | 175 | /* |
176 | * All current models of Pentium and Pentium with MMX technology CPUs | 176 | * All current models of Pentium and Pentium with MMX technology CPUs |
177 | * have the F0 0F bug, which lets nonprivileged users lock up the system. | 177 | * have the F0 0F bug, which lets nonprivileged users lock up the |
178 | * system. | ||
178 | * Note that the workaround should only be initialized once... | 179 | * Note that the workaround should only be initialized once... |
179 | */ | 180 | */ |
180 | c->f00f_bug = 0; | 181 | c->f00f_bug = 0; |
@@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | |||
207 | printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); | 208 | printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); |
208 | printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); | 209 | printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); |
209 | lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; | 210 | lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; |
210 | wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); | 211 | wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); |
211 | } | 212 | } |
212 | } | 213 | } |
213 | 214 | ||
@@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | |||
283 | /* Intel has a non-standard dependency on %ecx for this CPUID level. */ | 284 | /* Intel has a non-standard dependency on %ecx for this CPUID level. */ |
284 | cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); | 285 | cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); |
285 | if (eax & 0x1f) | 286 | if (eax & 0x1f) |
286 | return ((eax >> 26) + 1); | 287 | return (eax >> 26) + 1; |
287 | else | 288 | else |
288 | return 1; | 289 | return 1; |
289 | } | 290 | } |
@@ -349,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
349 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | 350 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); |
350 | } | 351 | } |
351 | 352 | ||
353 | if (c->cpuid_level > 6) { | ||
354 | unsigned ecx = cpuid_ecx(6); | ||
355 | if (ecx & 0x01) | ||
356 | set_cpu_cap(c, X86_FEATURE_APERFMPERF); | ||
357 | } | ||
358 | |||
352 | if (cpu_has_xmm2) | 359 | if (cpu_has_xmm2) |
353 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | 360 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); |
354 | if (cpu_has_ds) { | 361 | if (cpu_has_ds) { |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 789efe217e1..804c40e2bc3 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Changes: | 4 | * Changes: |
5 | * Venkatesh Pallipadi : Adding cache identification through cpuid(4) | 5 | * Venkatesh Pallipadi : Adding cache identification through cpuid(4) |
6 | * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. | 6 | * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. |
7 | * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. | 7 | * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. |
8 | */ | 8 | */ |
9 | 9 | ||
@@ -16,7 +16,7 @@ | |||
16 | #include <linux/pci.h> | 16 | #include <linux/pci.h> |
17 | 17 | ||
18 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
19 | #include <asm/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <asm/k8.h> | 20 | #include <asm/k8.h> |
21 | 21 | ||
22 | #define LVL_1_INST 1 | 22 | #define LVL_1_INST 1 |
@@ -25,14 +25,15 @@ | |||
25 | #define LVL_3 4 | 25 | #define LVL_3 4 |
26 | #define LVL_TRACE 5 | 26 | #define LVL_TRACE 5 |
27 | 27 | ||
28 | struct _cache_table | 28 | struct _cache_table { |
29 | { | ||
30 | unsigned char descriptor; | 29 | unsigned char descriptor; |
31 | char cache_type; | 30 | char cache_type; |
32 | short size; | 31 | short size; |
33 | }; | 32 | }; |
34 | 33 | ||
35 | /* all the cache descriptor types we care about (no TLB or trace cache entries) */ | 34 | /* All the cache descriptor types we care about (no TLB or |
35 | trace cache entries) */ | ||
36 | |||
36 | static const struct _cache_table __cpuinitconst cache_table[] = | 37 | static const struct _cache_table __cpuinitconst cache_table[] = |
37 | { | 38 | { |
38 | { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ | 39 | { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ |
@@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = | |||
105 | }; | 106 | }; |
106 | 107 | ||
107 | 108 | ||
108 | enum _cache_type | 109 | enum _cache_type { |
109 | { | ||
110 | CACHE_TYPE_NULL = 0, | 110 | CACHE_TYPE_NULL = 0, |
111 | CACHE_TYPE_DATA = 1, | 111 | CACHE_TYPE_DATA = 1, |
112 | CACHE_TYPE_INST = 2, | 112 | CACHE_TYPE_INST = 2, |
@@ -170,31 +170,31 @@ unsigned short num_cache_leaves; | |||
170 | Maybe later */ | 170 | Maybe later */ |
171 | union l1_cache { | 171 | union l1_cache { |
172 | struct { | 172 | struct { |
173 | unsigned line_size : 8; | 173 | unsigned line_size:8; |
174 | unsigned lines_per_tag : 8; | 174 | unsigned lines_per_tag:8; |
175 | unsigned assoc : 8; | 175 | unsigned assoc:8; |
176 | unsigned size_in_kb : 8; | 176 | unsigned size_in_kb:8; |
177 | }; | 177 | }; |
178 | unsigned val; | 178 | unsigned val; |
179 | }; | 179 | }; |
180 | 180 | ||
181 | union l2_cache { | 181 | union l2_cache { |
182 | struct { | 182 | struct { |
183 | unsigned line_size : 8; | 183 | unsigned line_size:8; |
184 | unsigned lines_per_tag : 4; | 184 | unsigned lines_per_tag:4; |
185 | unsigned assoc : 4; | 185 | unsigned assoc:4; |
186 | unsigned size_in_kb : 16; | 186 | unsigned size_in_kb:16; |
187 | }; | 187 | }; |
188 | unsigned val; | 188 | unsigned val; |
189 | }; | 189 | }; |
190 | 190 | ||
191 | union l3_cache { | 191 | union l3_cache { |
192 | struct { | 192 | struct { |
193 | unsigned line_size : 8; | 193 | unsigned line_size:8; |
194 | unsigned lines_per_tag : 4; | 194 | unsigned lines_per_tag:4; |
195 | unsigned assoc : 4; | 195 | unsigned assoc:4; |
196 | unsigned res : 2; | 196 | unsigned res:2; |
197 | unsigned size_encoded : 14; | 197 | unsigned size_encoded:14; |
198 | }; | 198 | }; |
199 | unsigned val; | 199 | unsigned val; |
200 | }; | 200 | }; |
@@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
241 | case 0: | 241 | case 0: |
242 | if (!l1->val) | 242 | if (!l1->val) |
243 | return; | 243 | return; |
244 | assoc = l1->assoc; | 244 | assoc = assocs[l1->assoc]; |
245 | line_size = l1->line_size; | 245 | line_size = l1->line_size; |
246 | lines_per_tag = l1->lines_per_tag; | 246 | lines_per_tag = l1->lines_per_tag; |
247 | size_in_kb = l1->size_in_kb; | 247 | size_in_kb = l1->size_in_kb; |
@@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
249 | case 2: | 249 | case 2: |
250 | if (!l2.val) | 250 | if (!l2.val) |
251 | return; | 251 | return; |
252 | assoc = l2.assoc; | 252 | assoc = assocs[l2.assoc]; |
253 | line_size = l2.line_size; | 253 | line_size = l2.line_size; |
254 | lines_per_tag = l2.lines_per_tag; | 254 | lines_per_tag = l2.lines_per_tag; |
255 | /* cpu_data has errata corrections for K7 applied */ | 255 | /* cpu_data has errata corrections for K7 applied */ |
@@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
258 | case 3: | 258 | case 3: |
259 | if (!l3.val) | 259 | if (!l3.val) |
260 | return; | 260 | return; |
261 | assoc = l3.assoc; | 261 | assoc = assocs[l3.assoc]; |
262 | line_size = l3.line_size; | 262 | line_size = l3.line_size; |
263 | lines_per_tag = l3.lines_per_tag; | 263 | lines_per_tag = l3.lines_per_tag; |
264 | size_in_kb = l3.size_encoded * 512; | 264 | size_in_kb = l3.size_encoded * 512; |
265 | if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { | ||
266 | size_in_kb = size_in_kb >> 1; | ||
267 | assoc = assoc >> 1; | ||
268 | } | ||
265 | break; | 269 | break; |
266 | default: | 270 | default: |
267 | return; | 271 | return; |
@@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
270 | eax->split.is_self_initializing = 1; | 274 | eax->split.is_self_initializing = 1; |
271 | eax->split.type = types[leaf]; | 275 | eax->split.type = types[leaf]; |
272 | eax->split.level = levels[leaf]; | 276 | eax->split.level = levels[leaf]; |
273 | if (leaf == 3) | 277 | eax->split.num_threads_sharing = 0; |
274 | eax->split.num_threads_sharing = | ||
275 | current_cpu_data.x86_max_cores - 1; | ||
276 | else | ||
277 | eax->split.num_threads_sharing = 0; | ||
278 | eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; | 278 | eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; |
279 | 279 | ||
280 | 280 | ||
281 | if (assoc == 0xf) | 281 | if (assoc == 0xffff) |
282 | eax->split.is_fully_associative = 1; | 282 | eax->split.is_fully_associative = 1; |
283 | ebx->split.coherency_line_size = line_size - 1; | 283 | ebx->split.coherency_line_size = line_size - 1; |
284 | ebx->split.ways_of_associativity = assocs[assoc] - 1; | 284 | ebx->split.ways_of_associativity = assoc - 1; |
285 | ebx->split.physical_line_partition = lines_per_tag - 1; | 285 | ebx->split.physical_line_partition = lines_per_tag - 1; |
286 | ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / | 286 | ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / |
287 | (ebx->split.ways_of_associativity + 1) - 1; | 287 | (ebx->split.ways_of_associativity + 1) - 1; |
@@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void) | |||
350 | 350 | ||
351 | unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | 351 | unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) |
352 | { | 352 | { |
353 | unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ | 353 | /* Cache sizes */ |
354 | unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; | ||
354 | unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ | 355 | unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ |
355 | unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ | 356 | unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ |
356 | unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; | 357 | unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; |
@@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
377 | 378 | ||
378 | retval = cpuid4_cache_lookup_regs(i, &this_leaf); | 379 | retval = cpuid4_cache_lookup_regs(i, &this_leaf); |
379 | if (retval >= 0) { | 380 | if (retval >= 0) { |
380 | switch(this_leaf.eax.split.level) { | 381 | switch (this_leaf.eax.split.level) { |
381 | case 1: | 382 | case 1: |
382 | if (this_leaf.eax.split.type == | 383 | if (this_leaf.eax.split.type == |
383 | CACHE_TYPE_DATA) | 384 | CACHE_TYPE_DATA) |
384 | new_l1d = this_leaf.size/1024; | 385 | new_l1d = this_leaf.size/1024; |
@@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
386 | CACHE_TYPE_INST) | 387 | CACHE_TYPE_INST) |
387 | new_l1i = this_leaf.size/1024; | 388 | new_l1i = this_leaf.size/1024; |
388 | break; | 389 | break; |
389 | case 2: | 390 | case 2: |
390 | new_l2 = this_leaf.size/1024; | 391 | new_l2 = this_leaf.size/1024; |
391 | num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; | 392 | num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; |
392 | index_msb = get_count_order(num_threads_sharing); | 393 | index_msb = get_count_order(num_threads_sharing); |
393 | l2_id = c->apicid >> index_msb; | 394 | l2_id = c->apicid >> index_msb; |
394 | break; | 395 | break; |
395 | case 3: | 396 | case 3: |
396 | new_l3 = this_leaf.size/1024; | 397 | new_l3 = this_leaf.size/1024; |
397 | num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; | 398 | num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; |
398 | index_msb = get_count_order(num_threads_sharing); | 399 | index_msb = get_count_order( |
400 | num_threads_sharing); | ||
399 | l3_id = c->apicid >> index_msb; | 401 | l3_id = c->apicid >> index_msb; |
400 | break; | 402 | break; |
401 | default: | 403 | default: |
402 | break; | 404 | break; |
403 | } | 405 | } |
404 | } | 406 | } |
@@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
421 | /* Number of times to iterate */ | 423 | /* Number of times to iterate */ |
422 | n = cpuid_eax(2) & 0xFF; | 424 | n = cpuid_eax(2) & 0xFF; |
423 | 425 | ||
424 | for ( i = 0 ; i < n ; i++ ) { | 426 | for (i = 0 ; i < n ; i++) { |
425 | cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); | 427 | cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); |
426 | 428 | ||
427 | /* If bit 31 is set, this is an unknown format */ | 429 | /* If bit 31 is set, this is an unknown format */ |
428 | for ( j = 0 ; j < 3 ; j++ ) { | 430 | for (j = 0 ; j < 3 ; j++) |
429 | if (regs[j] & (1 << 31)) regs[j] = 0; | 431 | if (regs[j] & (1 << 31)) |
430 | } | 432 | regs[j] = 0; |
431 | 433 | ||
432 | /* Byte 0 is level count, not a descriptor */ | 434 | /* Byte 0 is level count, not a descriptor */ |
433 | for ( j = 1 ; j < 16 ; j++ ) { | 435 | for (j = 1 ; j < 16 ; j++) { |
434 | unsigned char des = dp[j]; | 436 | unsigned char des = dp[j]; |
435 | unsigned char k = 0; | 437 | unsigned char k = 0; |
436 | 438 | ||
437 | /* look up this descriptor in the table */ | 439 | /* look up this descriptor in the table */ |
438 | while (cache_table[k].descriptor != 0) | 440 | while (cache_table[k].descriptor != 0) { |
439 | { | ||
440 | if (cache_table[k].descriptor == des) { | 441 | if (cache_table[k].descriptor == des) { |
441 | if (only_trace && cache_table[k].cache_type != LVL_TRACE) | 442 | if (only_trace && cache_table[k].cache_type != LVL_TRACE) |
442 | break; | 443 | break; |
@@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
488 | } | 489 | } |
489 | 490 | ||
490 | if (trace) | 491 | if (trace) |
491 | printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); | 492 | printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); |
492 | else if ( l1i ) | 493 | else if (l1i) |
493 | printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); | 494 | printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); |
494 | 495 | ||
495 | if (l1d) | 496 | if (l1d) |
496 | printk(", L1 D cache: %dK\n", l1d); | 497 | printk(KERN_CONT ", L1 D cache: %dK\n", l1d); |
497 | else | 498 | else |
498 | printk("\n"); | 499 | printk(KERN_CONT "\n"); |
499 | 500 | ||
500 | if (l2) | 501 | if (l2) |
501 | printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); | 502 | printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); |
@@ -522,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | |||
522 | int index_msb, i; | 523 | int index_msb, i; |
523 | struct cpuinfo_x86 *c = &cpu_data(cpu); | 524 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
524 | 525 | ||
526 | if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { | ||
527 | struct cpuinfo_x86 *d; | ||
528 | for_each_online_cpu(i) { | ||
529 | if (!per_cpu(cpuid4_info, i)) | ||
530 | continue; | ||
531 | d = &cpu_data(i); | ||
532 | this_leaf = CPUID4_INFO_IDX(i, index); | ||
533 | cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), | ||
534 | d->llc_shared_map); | ||
535 | } | ||
536 | return; | ||
537 | } | ||
525 | this_leaf = CPUID4_INFO_IDX(cpu, index); | 538 | this_leaf = CPUID4_INFO_IDX(cpu, index); |
526 | num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; | 539 | num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; |
527 | 540 | ||
@@ -558,8 +571,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) | |||
558 | } | 571 | } |
559 | } | 572 | } |
560 | #else | 573 | #else |
561 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} | 574 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) |
562 | static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} | 575 | { |
576 | } | ||
577 | |||
578 | static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) | ||
579 | { | ||
580 | } | ||
563 | #endif | 581 | #endif |
564 | 582 | ||
565 | static void __cpuinit free_cache_attributes(unsigned int cpu) | 583 | static void __cpuinit free_cache_attributes(unsigned int cpu) |
@@ -645,7 +663,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); | |||
645 | static ssize_t show_##file_name \ | 663 | static ssize_t show_##file_name \ |
646 | (struct _cpuid4_info *this_leaf, char *buf) \ | 664 | (struct _cpuid4_info *this_leaf, char *buf) \ |
647 | { \ | 665 | { \ |
648 | return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ | 666 | return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ |
649 | } | 667 | } |
650 | 668 | ||
651 | show_one_plus(level, eax.split.level, 0); | 669 | show_one_plus(level, eax.split.level, 0); |
@@ -656,7 +674,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); | |||
656 | 674 | ||
657 | static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) | 675 | static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) |
658 | { | 676 | { |
659 | return sprintf (buf, "%luK\n", this_leaf->size / 1024); | 677 | return sprintf(buf, "%luK\n", this_leaf->size / 1024); |
660 | } | 678 | } |
661 | 679 | ||
662 | static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, | 680 | static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, |
@@ -669,7 +687,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, | |||
669 | const struct cpumask *mask; | 687 | const struct cpumask *mask; |
670 | 688 | ||
671 | mask = to_cpumask(this_leaf->shared_cpu_map); | 689 | mask = to_cpumask(this_leaf->shared_cpu_map); |
672 | n = type? | 690 | n = type ? |
673 | cpulist_scnprintf(buf, len-2, mask) : | 691 | cpulist_scnprintf(buf, len-2, mask) : |
674 | cpumask_scnprintf(buf, len-2, mask); | 692 | cpumask_scnprintf(buf, len-2, mask); |
675 | buf[n++] = '\n'; | 693 | buf[n++] = '\n'; |
@@ -800,7 +818,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, | |||
800 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | 818 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, |
801 | show_cache_disable_1, store_cache_disable_1); | 819 | show_cache_disable_1, store_cache_disable_1); |
802 | 820 | ||
803 | static struct attribute * default_attrs[] = { | 821 | static struct attribute *default_attrs[] = { |
804 | &type.attr, | 822 | &type.attr, |
805 | &level.attr, | 823 | &level.attr, |
806 | &coherency_line_size.attr, | 824 | &coherency_line_size.attr, |
@@ -815,7 +833,7 @@ static struct attribute * default_attrs[] = { | |||
815 | NULL | 833 | NULL |
816 | }; | 834 | }; |
817 | 835 | ||
818 | static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) | 836 | static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) |
819 | { | 837 | { |
820 | struct _cache_attr *fattr = to_attr(attr); | 838 | struct _cache_attr *fattr = to_attr(attr); |
821 | struct _index_kobject *this_leaf = to_object(kobj); | 839 | struct _index_kobject *this_leaf = to_object(kobj); |
@@ -828,8 +846,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) | |||
828 | return ret; | 846 | return ret; |
829 | } | 847 | } |
830 | 848 | ||
831 | static ssize_t store(struct kobject * kobj, struct attribute * attr, | 849 | static ssize_t store(struct kobject *kobj, struct attribute *attr, |
832 | const char * buf, size_t count) | 850 | const char *buf, size_t count) |
833 | { | 851 | { |
834 | struct _cache_attr *fattr = to_attr(attr); | 852 | struct _cache_attr *fattr = to_attr(attr); |
835 | struct _index_kobject *this_leaf = to_object(kobj); | 853 | struct _index_kobject *this_leaf = to_object(kobj); |
@@ -883,7 +901,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) | |||
883 | goto err_out; | 901 | goto err_out; |
884 | 902 | ||
885 | per_cpu(index_kobject, cpu) = kzalloc( | 903 | per_cpu(index_kobject, cpu) = kzalloc( |
886 | sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); | 904 | sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); |
887 | if (unlikely(per_cpu(index_kobject, cpu) == NULL)) | 905 | if (unlikely(per_cpu(index_kobject, cpu) == NULL)) |
888 | goto err_out; | 906 | goto err_out; |
889 | 907 | ||
@@ -917,7 +935,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
917 | } | 935 | } |
918 | 936 | ||
919 | for (i = 0; i < num_cache_leaves; i++) { | 937 | for (i = 0; i < num_cache_leaves; i++) { |
920 | this_object = INDEX_KOBJECT_PTR(cpu,i); | 938 | this_object = INDEX_KOBJECT_PTR(cpu, i); |
921 | this_object->cpu = cpu; | 939 | this_object->cpu = cpu; |
922 | this_object->index = i; | 940 | this_object->index = i; |
923 | retval = kobject_init_and_add(&(this_object->kobj), | 941 | retval = kobject_init_and_add(&(this_object->kobj), |
@@ -925,9 +943,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) | |||
925 | per_cpu(cache_kobject, cpu), | 943 | per_cpu(cache_kobject, cpu), |
926 | "index%1lu", i); | 944 | "index%1lu", i); |
927 | if (unlikely(retval)) { | 945 | if (unlikely(retval)) { |
928 | for (j = 0; j < i; j++) { | 946 | for (j = 0; j < i; j++) |
929 | kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); | 947 | kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); |
930 | } | ||
931 | kobject_put(per_cpu(cache_kobject, cpu)); | 948 | kobject_put(per_cpu(cache_kobject, cpu)); |
932 | cpuid4_cache_sysfs_exit(cpu); | 949 | cpuid4_cache_sysfs_exit(cpu); |
933 | return retval; | 950 | return retval; |
@@ -952,7 +969,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) | |||
952 | cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); | 969 | cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); |
953 | 970 | ||
954 | for (i = 0; i < num_cache_leaves; i++) | 971 | for (i = 0; i < num_cache_leaves; i++) |
955 | kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); | 972 | kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); |
956 | kobject_put(per_cpu(cache_kobject, cpu)); | 973 | kobject_put(per_cpu(cache_kobject, cpu)); |
957 | cpuid4_cache_sysfs_exit(cpu); | 974 | cpuid4_cache_sysfs_exit(cpu); |
958 | } | 975 | } |
@@ -977,8 +994,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, | |||
977 | return NOTIFY_OK; | 994 | return NOTIFY_OK; |
978 | } | 995 | } |
979 | 996 | ||
980 | static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = | 997 | static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { |
981 | { | ||
982 | .notifier_call = cacheinfo_cpu_callback, | 998 | .notifier_call = cacheinfo_cpu_callback, |
983 | }; | 999 | }; |
984 | 1000 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile | |||
@@ -1,11 +1,8 @@ | |||
1 | obj-y = mce.o | 1 | obj-y = mce.o mce-severity.o |
2 | 2 | ||
3 | obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o | ||
4 | obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o | ||
5 | obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o | 3 | obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o |
6 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o | 4 | obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o |
7 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o | 5 | obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o |
8 | obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o | ||
9 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o | 6 | obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o |
10 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o | 7 | obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o |
11 | 8 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null | |||
@@ -1,116 +0,0 @@ | |||
1 | /* | ||
2 | * Athlon specific Machine Check Exception Reporting | ||
3 | * (C) Copyright 2002 Dave Jones <davej@redhat.com> | ||
4 | */ | ||
5 | #include <linux/interrupt.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/smp.h> | ||
10 | |||
11 | #include <asm/processor.h> | ||
12 | #include <asm/system.h> | ||
13 | #include <asm/mce.h> | ||
14 | #include <asm/msr.h> | ||
15 | |||
16 | /* Machine Check Handler For AMD Athlon/Duron: */ | ||
17 | static void k7_machine_check(struct pt_regs *regs, long error_code) | ||
18 | { | ||
19 | u32 alow, ahigh, high, low; | ||
20 | u32 mcgstl, mcgsth; | ||
21 | int recover = 1; | ||
22 | int i; | ||
23 | |||
24 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | ||
25 | if (mcgstl & (1<<0)) /* Recoverable ? */ | ||
26 | recover = 0; | ||
27 | |||
28 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", | ||
29 | smp_processor_id(), mcgsth, mcgstl); | ||
30 | |||
31 | for (i = 1; i < nr_mce_banks; i++) { | ||
32 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | ||
33 | if (high & (1<<31)) { | ||
34 | char misc[20]; | ||
35 | char addr[24]; | ||
36 | |||
37 | misc[0] = '\0'; | ||
38 | addr[0] = '\0'; | ||
39 | |||
40 | if (high & (1<<29)) | ||
41 | recover |= 1; | ||
42 | if (high & (1<<25)) | ||
43 | recover |= 2; | ||
44 | high &= ~(1<<31); | ||
45 | |||
46 | if (high & (1<<27)) { | ||
47 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | ||
48 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | ||
49 | } | ||
50 | if (high & (1<<26)) { | ||
51 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | ||
52 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | ||
53 | } | ||
54 | |||
55 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | ||
56 | smp_processor_id(), i, high, low, misc, addr); | ||
57 | |||
58 | /* Clear it: */ | ||
59 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | ||
60 | /* Serialize: */ | ||
61 | wmb(); | ||
62 | add_taint(TAINT_MACHINE_CHECK); | ||
63 | } | ||
64 | } | ||
65 | |||
66 | if (recover & 2) | ||
67 | panic("CPU context corrupt"); | ||
68 | if (recover & 1) | ||
69 | panic("Unable to continue"); | ||
70 | |||
71 | printk(KERN_EMERG "Attempting to continue.\n"); | ||
72 | |||
73 | mcgstl &= ~(1<<2); | ||
74 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | ||
75 | } | ||
76 | |||
77 | |||
78 | /* AMD K7 machine check is Intel like: */ | ||
79 | void amd_mcheck_init(struct cpuinfo_x86 *c) | ||
80 | { | ||
81 | u32 l, h; | ||
82 | int i; | ||
83 | |||
84 | if (!cpu_has(c, X86_FEATURE_MCE)) | ||
85 | return; | ||
86 | |||
87 | machine_check_vector = k7_machine_check; | ||
88 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
89 | wmb(); | ||
90 | |||
91 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | ||
92 | |||
93 | rdmsr(MSR_IA32_MCG_CAP, l, h); | ||
94 | if (l & (1<<8)) /* Control register present ? */ | ||
95 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
96 | nr_mce_banks = l & 0xff; | ||
97 | |||
98 | /* | ||
99 | * Clear status for MC index 0 separately, we don't touch CTL, | ||
100 | * as some K7 Athlons cause spurious MCEs when its enabled: | ||
101 | */ | ||
102 | if (boot_cpu_data.x86 == 6) { | ||
103 | wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); | ||
104 | i = 1; | ||
105 | } else | ||
106 | i = 0; | ||
107 | |||
108 | for (; i < nr_mce_banks; i++) { | ||
109 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | ||
110 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | ||
111 | } | ||
112 | |||
113 | set_in_cr4(X86_CR4_MCE); | ||
114 | printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", | ||
115 | smp_processor_id()); | ||
116 | } | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..472763d9209 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c | |||
@@ -18,7 +18,12 @@ | |||
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/fs.h> | 19 | #include <linux/fs.h> |
20 | #include <linux/smp.h> | 20 | #include <linux/smp.h> |
21 | #include <linux/notifier.h> | ||
22 | #include <linux/kdebug.h> | ||
23 | #include <linux/cpu.h> | ||
24 | #include <linux/sched.h> | ||
21 | #include <asm/mce.h> | 25 | #include <asm/mce.h> |
26 | #include <asm/apic.h> | ||
22 | 27 | ||
23 | /* Update fake mce registers on current CPU. */ | 28 | /* Update fake mce registers on current CPU. */ |
24 | static void inject_mce(struct mce *m) | 29 | static void inject_mce(struct mce *m) |
@@ -39,44 +44,142 @@ static void inject_mce(struct mce *m) | |||
39 | i->finished = 1; | 44 | i->finished = 1; |
40 | } | 45 | } |
41 | 46 | ||
42 | struct delayed_mce { | 47 | static void raise_poll(struct mce *m) |
43 | struct timer_list timer; | 48 | { |
44 | struct mce m; | 49 | unsigned long flags; |
45 | }; | 50 | mce_banks_t b; |
46 | 51 | ||
47 | /* Inject mce on current CPU */ | 52 | memset(&b, 0xff, sizeof(mce_banks_t)); |
48 | static void raise_mce(unsigned long data) | 53 | local_irq_save(flags); |
54 | machine_check_poll(0, &b); | ||
55 | local_irq_restore(flags); | ||
56 | m->finished = 0; | ||
57 | } | ||
58 | |||
59 | static void raise_exception(struct mce *m, struct pt_regs *pregs) | ||
49 | { | 60 | { |
50 | struct delayed_mce *dm = (struct delayed_mce *)data; | 61 | struct pt_regs regs; |
51 | struct mce *m = &dm->m; | 62 | unsigned long flags; |
52 | int cpu = m->extcpu; | ||
53 | 63 | ||
54 | inject_mce(m); | 64 | if (!pregs) { |
55 | if (m->status & MCI_STATUS_UC) { | ||
56 | struct pt_regs regs; | ||
57 | memset(&regs, 0, sizeof(struct pt_regs)); | 65 | memset(&regs, 0, sizeof(struct pt_regs)); |
58 | regs.ip = m->ip; | 66 | regs.ip = m->ip; |
59 | regs.cs = m->cs; | 67 | regs.cs = m->cs; |
68 | pregs = &regs; | |
69 | } | ||
70 | /* in mcheck exeception handler, irq will be disabled */ | ||
71 | local_irq_save(flags); | ||
72 | do_machine_check(pregs, 0); | ||
73 | local_irq_restore(flags); | ||
74 | m->finished = 0; | ||
75 | } | ||
76 | |||
77 | static cpumask_t mce_inject_cpumask; | ||
78 | |||
79 | static int mce_raise_notify(struct notifier_block *self, | ||
80 | unsigned long val, void *data) | ||
81 | { | ||
82 | struct die_args *args = (struct die_args *)data; | ||
83 | int cpu = smp_processor_id(); | ||
84 | struct mce *m = &__get_cpu_var(injectm); | ||
85 | if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) | ||
86 | return NOTIFY_DONE; | ||
87 | cpu_clear(cpu, mce_inject_cpumask); | ||
88 | if (m->inject_flags & MCJ_EXCEPTION) | ||
89 | raise_exception(m, args->regs); | ||
90 | else if (m->status) | ||
91 | raise_poll(m); | ||
92 | return NOTIFY_STOP; | ||
93 | } | ||
94 | |||
95 | static struct notifier_block mce_raise_nb = { | ||
96 | .notifier_call = mce_raise_notify, | ||
97 | .priority = 1000, | ||
98 | }; | ||
99 | |||
100 | /* Inject mce on current CPU */ | ||
101 | static int raise_local(void) | ||
102 | { | ||
103 | struct mce *m = &__get_cpu_var(injectm); | ||
104 | int context = MCJ_CTX(m->inject_flags); | ||
105 | int ret = 0; | ||
106 | int cpu = m->extcpu; | ||
107 | |||
108 | if (m->inject_flags & MCJ_EXCEPTION) { | ||
60 | printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); | 109 | printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); |
61 | do_machine_check(&regs, 0); | 110 | switch (context) { |
111 | case MCJ_CTX_IRQ: | ||
112 | /* | ||
113 | * Could do more to fake interrupts like | ||
114 | * calling irq_enter, but the necessary | ||
115 | * machinery isn't exported currently. | ||
116 | */ | ||
117 | /*FALL THROUGH*/ | ||
118 | case MCJ_CTX_PROCESS: | ||
119 | raise_exception(m, NULL); | ||
120 | break; | ||
121 | default: | ||
122 | printk(KERN_INFO "Invalid MCE context\n"); | ||
123 | ret = -EINVAL; | ||
124 | } | ||
62 | printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); | 125 | printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); |
63 | } else { | 126 | } else if (m->status) { |
64 | mce_banks_t b; | ||
65 | memset(&b, 0xff, sizeof(mce_banks_t)); | ||
66 | printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); | 127 | printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); |
67 | machine_check_poll(0, &b); | 128 | raise_poll(m); |
68 | mce_notify_irq(); | 129 | mce_notify_irq(); |
69 | printk(KERN_INFO "Finished machine check poll on CPU %d\n", | 130 | printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); |
70 | cpu); | 131 | } else |
71 | } | 132 | m->finished = 0; |
72 | kfree(dm); | 133 | |
134 | return ret; | ||
135 | } | ||
136 | |||
137 | static void raise_mce(struct mce *m) | ||
138 | { | ||
139 | int context = MCJ_CTX(m->inject_flags); | ||
140 | |||
141 | inject_mce(m); | ||
142 | |||
143 | if (context == MCJ_CTX_RANDOM) | ||
144 | return; | ||
145 | |||
146 | #ifdef CONFIG_X86_LOCAL_APIC | ||
147 | if (m->inject_flags & MCJ_NMI_BROADCAST) { | ||
148 | unsigned long start; | ||
149 | int cpu; | ||
150 | get_online_cpus(); | ||
151 | mce_inject_cpumask = cpu_online_map; | ||
152 | cpu_clear(get_cpu(), mce_inject_cpumask); | ||
153 | for_each_online_cpu(cpu) { | ||
154 | struct mce *mcpu = &per_cpu(injectm, cpu); | ||
155 | if (!mcpu->finished || | ||
156 | MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) | ||
157 | cpu_clear(cpu, mce_inject_cpumask); | ||
158 | } | ||
159 | if (!cpus_empty(mce_inject_cpumask)) | ||
160 | apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); | ||
161 | start = jiffies; | ||
162 | while (!cpus_empty(mce_inject_cpumask)) { | ||
163 | if (!time_before(jiffies, start + 2*HZ)) { | ||
164 | printk(KERN_ERR | ||
165 | "Timeout waiting for mce inject NMI %lx\n", | ||
166 | *cpus_addr(mce_inject_cpumask)); | ||
167 | break; | ||
168 | } | ||
169 | cpu_relax(); | ||
170 | } | ||
171 | raise_local(); | ||
172 | put_cpu(); | ||
173 | put_online_cpus(); | ||
174 | } else | ||
175 | #endif | ||
176 | raise_local(); | ||
73 | } | 177 | } |
74 | 178 | ||
75 | /* Error injection interface */ | 179 | /* Error injection interface */ |
76 | static ssize_t mce_write(struct file *filp, const char __user *ubuf, | 180 | static ssize_t mce_write(struct file *filp, const char __user *ubuf, |
77 | size_t usize, loff_t *off) | 181 | size_t usize, loff_t *off) |
78 | { | 182 | { |
79 | struct delayed_mce *dm; | ||
80 | struct mce m; | 183 | struct mce m; |
81 | 184 | ||
82 | if (!capable(CAP_SYS_ADMIN)) | 185 | if (!capable(CAP_SYS_ADMIN)) |
@@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, | |||
96 | if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) | 199 | if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) |
97 | return -EINVAL; | 200 | return -EINVAL; |
98 | 201 | ||
99 | dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); | ||
100 | if (!dm) | ||
101 | return -ENOMEM; | ||
102 | |||
103 | /* | 202 | /* |
104 | * Need to give user space some time to set everything up, | 203 | * Need to give user space some time to set everything up, |
105 | * so do it a jiffie or two later everywhere. | 204 | * so do it a jiffie or two later everywhere. |
106 | * Should we use a hrtimer here for better synchronization? | ||
107 | */ | 205 | */ |
108 | memcpy(&dm->m, &m, sizeof(struct mce)); | 206 | schedule_timeout(2); |
109 | setup_timer(&dm->timer, raise_mce, (unsigned long)dm); | 207 | raise_mce(&m); |
110 | dm->timer.expires = jiffies + 2; | ||
111 | add_timer_on(&dm->timer, m.extcpu); | ||
112 | return usize; | 208 | return usize; |
113 | } | 209 | } |
114 | 210 | ||
@@ -116,6 +212,7 @@ static int inject_init(void) | |||
116 | { | 212 | { |
117 | printk(KERN_INFO "Machine check injector initialized\n"); | 213 | printk(KERN_INFO "Machine check injector initialized\n"); |
118 | mce_chrdev_ops.write = mce_write; | 214 | mce_chrdev_ops.write = mce_write; |
215 | register_die_notifier(&mce_raise_nb); | ||
119 | return 0; | 216 | return 0; |
120 | } | 217 | } |
121 | 218 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
@@ -1,3 +1,4 @@ | |||
1 | #include <linux/sysdev.h> | ||
1 | #include <asm/mce.h> | 2 | #include <asm/mce.h> |
2 | 3 | ||
3 | enum severity_level { | 4 | enum severity_level { |
@@ -10,6 +11,20 @@ enum severity_level { | |||
10 | MCE_PANIC_SEVERITY, | 11 | MCE_PANIC_SEVERITY, |
11 | }; | 12 | }; |
12 | 13 | ||
14 | #define ATTR_LEN 16 | ||
15 | |||
16 | /* One object for each MCE bank, shared by all CPUs */ | ||
17 | struct mce_bank { | ||
18 | u64 ctl; /* subevents to enable */ | ||
19 | unsigned char init; /* initialise bank? */ | ||
20 | struct sysdev_attribute attr; /* sysdev attribute */ | ||
21 | char attrname[ATTR_LEN]; /* attribute name */ | ||
22 | }; | ||
23 | |||
13 | int mce_severity(struct mce *a, int tolerant, char **msg); | 24 | int mce_severity(struct mce *a, int tolerant, char **msg); |
25 | struct dentry *mce_get_debugfs_dir(void); | ||
14 | 26 | ||
15 | extern int mce_ser; | 27 | extern int mce_ser; |
28 | |||
29 | extern struct mce_bank *mce_banks; | ||
30 | |||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..8a85dd1b1aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg) | |||
139 | } | 139 | } |
140 | } | 140 | } |
141 | 141 | ||
142 | #ifdef CONFIG_DEBUG_FS | ||
142 | static void *s_start(struct seq_file *f, loff_t *pos) | 143 | static void *s_start(struct seq_file *f, loff_t *pos) |
143 | { | 144 | { |
144 | if (*pos >= ARRAY_SIZE(severities)) | 145 | if (*pos >= ARRAY_SIZE(severities)) |
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void) | |||
197 | { | 198 | { |
198 | struct dentry *dmce = NULL, *fseverities_coverage = NULL; | 199 | struct dentry *dmce = NULL, *fseverities_coverage = NULL; |
199 | 200 | ||
200 | dmce = debugfs_create_dir("mce", NULL); | 201 | dmce = mce_get_debugfs_dir(); |
201 | if (dmce == NULL) | 202 | if (dmce == NULL) |
202 | goto err_out; | 203 | goto err_out; |
203 | fseverities_coverage = debugfs_create_file("severities-coverage", | 204 | fseverities_coverage = debugfs_create_file("severities-coverage", |
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void) | |||
209 | return 0; | 210 | return 0; |
210 | 211 | ||
211 | err_out: | 212 | err_out: |
212 | if (fseverities_coverage) | ||
213 | debugfs_remove(fseverities_coverage); | ||
214 | if (dmce) | ||
215 | debugfs_remove(dmce); | ||
216 | return -ENOMEM; | 213 | return -ENOMEM; |
217 | } | 214 | } |
218 | late_initcall(severities_debugfs_init); | 215 | late_initcall(severities_debugfs_init); |
216 | #endif | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1cfb623ce11..4b2af86e3e8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
35 | #include <linux/fs.h> | 35 | #include <linux/fs.h> |
36 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
37 | #include <linux/debugfs.h> | ||
37 | 38 | ||
38 | #include <asm/processor.h> | 39 | #include <asm/processor.h> |
39 | #include <asm/hw_irq.h> | 40 | #include <asm/hw_irq.h> |
@@ -45,21 +46,8 @@ | |||
45 | 46 | ||
46 | #include "mce-internal.h" | 47 | #include "mce-internal.h" |
47 | 48 | ||
48 | /* Handle unconfigured int18 (should never happen) */ | ||
49 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | ||
50 | { | ||
51 | printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", | ||
52 | smp_processor_id()); | ||
53 | } | ||
54 | |||
55 | /* Call the installed machine check handler for this CPU setup. */ | ||
56 | void (*machine_check_vector)(struct pt_regs *, long error_code) = | ||
57 | unexpected_machine_check; | ||
58 | |||
59 | int mce_disabled __read_mostly; | 49 | int mce_disabled __read_mostly; |
60 | 50 | ||
61 | #ifdef CONFIG_X86_NEW_MCE | ||
62 | |||
63 | #define MISC_MCELOG_MINOR 227 | 51 | #define MISC_MCELOG_MINOR 227 |
64 | 52 | ||
65 | #define SPINUNIT 100 /* 100ns */ | 53 | #define SPINUNIT 100 /* 100ns */ |
@@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); | |||
77 | */ | 65 | */ |
78 | static int tolerant __read_mostly = 1; | 66 | static int tolerant __read_mostly = 1; |
79 | static int banks __read_mostly; | 67 | static int banks __read_mostly; |
80 | static u64 *bank __read_mostly; | ||
81 | static int rip_msr __read_mostly; | 68 | static int rip_msr __read_mostly; |
82 | static int mce_bootlog __read_mostly = -1; | 69 | static int mce_bootlog __read_mostly = -1; |
83 | static int monarch_timeout __read_mostly = -1; | 70 | static int monarch_timeout __read_mostly = -1; |
@@ -87,13 +74,13 @@ int mce_cmci_disabled __read_mostly; | |||
87 | int mce_ignore_ce __read_mostly; | 74 | int mce_ignore_ce __read_mostly; |
88 | int mce_ser __read_mostly; | 75 | int mce_ser __read_mostly; |
89 | 76 | ||
77 | struct mce_bank *mce_banks __read_mostly; | ||
78 | |||
90 | /* User mode helper program triggered by machine check event */ | 79 | /* User mode helper program triggered by machine check event */ |
91 | static unsigned long mce_need_notify; | 80 | static unsigned long mce_need_notify; |
92 | static char mce_helper[128]; | 81 | static char mce_helper[128]; |
93 | static char *mce_helper_argv[2] = { mce_helper, NULL }; | 82 | static char *mce_helper_argv[2] = { mce_helper, NULL }; |
94 | 83 | ||
95 | static unsigned long dont_init_banks; | ||
96 | |||
97 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); | 84 | static DECLARE_WAIT_QUEUE_HEAD(mce_wait); |
98 | static DEFINE_PER_CPU(struct mce, mces_seen); | 85 | static DEFINE_PER_CPU(struct mce, mces_seen); |
99 | static int cpu_missing; | 86 | static int cpu_missing; |
@@ -104,11 +91,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { | |||
104 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL | 91 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL |
105 | }; | 92 | }; |
106 | 93 | ||
107 | static inline int skip_bank_init(int i) | ||
108 | { | ||
109 | return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); | ||
110 | } | ||
111 | |||
112 | static DEFINE_PER_CPU(struct work_struct, mce_work); | 94 | static DEFINE_PER_CPU(struct work_struct, mce_work); |
113 | 95 | ||
114 | /* Do initial initialization of a struct mce */ | 96 | /* Do initial initialization of a struct mce */ |
@@ -183,6 +165,11 @@ void mce_log(struct mce *mce) | |||
183 | set_bit(0, &mce_need_notify); | 165 | set_bit(0, &mce_need_notify); |
184 | } | 166 | } |
185 | 167 | ||
168 | void __weak decode_mce(struct mce *m) | ||
169 | { | ||
170 | return; | ||
171 | } | ||
172 | |||
186 | static void print_mce(struct mce *m) | 173 | static void print_mce(struct mce *m) |
187 | { | 174 | { |
188 | printk(KERN_EMERG | 175 | printk(KERN_EMERG |
@@ -205,6 +192,8 @@ static void print_mce(struct mce *m) | |||
205 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", | 192 | printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", |
206 | m->cpuvendor, m->cpuid, m->time, m->socketid, | 193 | m->cpuvendor, m->cpuid, m->time, m->socketid, |
207 | m->apicid); | 194 | m->apicid); |
195 | |||
196 | decode_mce(m); | ||
208 | } | 197 | } |
209 | 198 | ||
210 | static void print_mce_head(void) | 199 | static void print_mce_head(void) |
@@ -215,13 +204,19 @@ static void print_mce_head(void) | |||
215 | static void print_mce_tail(void) | 204 | static void print_mce_tail(void) |
216 | { | 205 | { |
217 | printk(KERN_EMERG "This is not a software problem!\n" | 206 | printk(KERN_EMERG "This is not a software problem!\n" |
218 | "Run through mcelog --ascii to decode and contact your hardware vendor\n"); | 207 | #if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) |
208 | "Run through mcelog --ascii to decode and contact your hardware vendor\n" | ||
209 | #endif | ||
210 | ); | ||
219 | } | 211 | } |
220 | 212 | ||
221 | #define PANIC_TIMEOUT 5 /* 5 seconds */ | 213 | #define PANIC_TIMEOUT 5 /* 5 seconds */ |
222 | 214 | ||
223 | static atomic_t mce_paniced; | 215 | static atomic_t mce_paniced; |
224 | 216 | ||
217 | static int fake_panic; | ||
218 | static atomic_t mce_fake_paniced; | ||
219 | |||
225 | /* Panic in progress. Enable interrupts and wait for final IPI */ | 220 | /* Panic in progress. Enable interrupts and wait for final IPI */ |
226 | static void wait_for_panic(void) | 221 | static void wait_for_panic(void) |
227 | { | 222 | { |
@@ -239,15 +234,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
239 | { | 234 | { |
240 | int i; | 235 | int i; |
241 | 236 | ||
242 | /* | 237 | if (!fake_panic) { |
243 | * Make sure only one CPU runs in machine check panic | 238 | /* |
244 | */ | 239 | * Make sure only one CPU runs in machine check panic |
245 | if (atomic_add_return(1, &mce_paniced) > 1) | 240 | */ |
246 | wait_for_panic(); | 241 | if (atomic_inc_return(&mce_paniced) > 1) |
247 | barrier(); | 242 | wait_for_panic(); |
243 | barrier(); | ||
248 | 244 | ||
249 | bust_spinlocks(1); | 245 | bust_spinlocks(1); |
250 | console_verbose(); | 246 | console_verbose(); |
247 | } else { | ||
248 | /* Don't log too much for fake panic */ | ||
249 | if (atomic_inc_return(&mce_fake_paniced) > 1) | ||
250 | return; | ||
251 | } | ||
251 | print_mce_head(); | 252 | print_mce_head(); |
252 | /* First print corrected ones that are still unlogged */ | 253 | /* First print corrected ones that are still unlogged */ |
253 | for (i = 0; i < MCE_LOG_LEN; i++) { | 254 | for (i = 0; i < MCE_LOG_LEN; i++) { |
@@ -274,9 +275,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) | |||
274 | print_mce_tail(); | 275 | print_mce_tail(); |
275 | if (exp) | 276 | if (exp) |
276 | printk(KERN_EMERG "Machine check: %s\n", exp); | 277 | printk(KERN_EMERG "Machine check: %s\n", exp); |
277 | if (panic_timeout == 0) | 278 | if (!fake_panic) { |
278 | panic_timeout = mce_panic_timeout; | 279 | if (panic_timeout == 0) |
279 | panic(msg); | 280 | panic_timeout = mce_panic_timeout; |
281 | panic(msg); | ||
282 | } else | ||
283 | printk(KERN_EMERG "Fake kernel panic: %s\n", msg); | ||
280 | } | 284 | } |
281 | 285 | ||
282 | /* Support code for software error injection */ | 286 | /* Support code for software error injection */ |
@@ -286,11 +290,11 @@ static int msr_to_offset(u32 msr) | |||
286 | unsigned bank = __get_cpu_var(injectm.bank); | 290 | unsigned bank = __get_cpu_var(injectm.bank); |
287 | if (msr == rip_msr) | 291 | if (msr == rip_msr) |
288 | return offsetof(struct mce, ip); | 292 | return offsetof(struct mce, ip); |
289 | if (msr == MSR_IA32_MC0_STATUS + bank*4) | 293 | if (msr == MSR_IA32_MCx_STATUS(bank)) |
290 | return offsetof(struct mce, status); | 294 | return offsetof(struct mce, status); |
291 | if (msr == MSR_IA32_MC0_ADDR + bank*4) | 295 | if (msr == MSR_IA32_MCx_ADDR(bank)) |
292 | return offsetof(struct mce, addr); | 296 | return offsetof(struct mce, addr); |
293 | if (msr == MSR_IA32_MC0_MISC + bank*4) | 297 | if (msr == MSR_IA32_MCx_MISC(bank)) |
294 | return offsetof(struct mce, misc); | 298 | return offsetof(struct mce, misc); |
295 | if (msr == MSR_IA32_MCG_STATUS) | 299 | if (msr == MSR_IA32_MCG_STATUS) |
296 | return offsetof(struct mce, mcgstatus); | 300 | return offsetof(struct mce, mcgstatus); |
@@ -301,13 +305,25 @@ static int msr_to_offset(u32 msr) | |||
301 | static u64 mce_rdmsrl(u32 msr) | 305 | static u64 mce_rdmsrl(u32 msr) |
302 | { | 306 | { |
303 | u64 v; | 307 | u64 v; |
308 | |||
304 | if (__get_cpu_var(injectm).finished) { | 309 | if (__get_cpu_var(injectm).finished) { |
305 | int offset = msr_to_offset(msr); | 310 | int offset = msr_to_offset(msr); |
311 | |||
306 | if (offset < 0) | 312 | if (offset < 0) |
307 | return 0; | 313 | return 0; |
308 | return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); | 314 | return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); |
309 | } | 315 | } |
310 | rdmsrl(msr, v); | 316 | |
317 | if (rdmsrl_safe(msr, &v)) { | ||
318 | WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); | ||
319 | /* | ||
320 | * Return zero in case the access faulted. This should | ||
321 | * not happen normally but can happen if the CPU does | ||
322 | * something weird, or if the code is buggy. | ||
323 | */ | ||
324 | v = 0; | ||
325 | } | ||
326 | |||
311 | return v; | 327 | return v; |
312 | } | 328 | } |
313 | 329 | ||
@@ -315,6 +331,7 @@ static void mce_wrmsrl(u32 msr, u64 v) | |||
315 | { | 331 | { |
316 | if (__get_cpu_var(injectm).finished) { | 332 | if (__get_cpu_var(injectm).finished) { |
317 | int offset = msr_to_offset(msr); | 333 | int offset = msr_to_offset(msr); |
334 | |||
318 | if (offset >= 0) | 335 | if (offset >= 0) |
319 | *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; | 336 | *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; |
320 | return; | 337 | return; |
@@ -411,7 +428,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | |||
411 | m->ip = mce_rdmsrl(rip_msr); | 428 | m->ip = mce_rdmsrl(rip_msr); |
412 | } | 429 | } |
413 | 430 | ||
414 | #ifdef CONFIG_X86_LOCAL_APIC | 431 | #ifdef CONFIG_X86_LOCAL_APIC |
415 | /* | 432 | /* |
416 | * Called after interrupts have been reenabled again | 433 | * Called after interrupts have been reenabled again |
417 | * when a MCE happened during an interrupts off region | 434 | * when a MCE happened during an interrupts off region |
@@ -495,7 +512,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
495 | 512 | ||
496 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | 513 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); |
497 | for (i = 0; i < banks; i++) { | 514 | for (i = 0; i < banks; i++) { |
498 | if (!bank[i] || !test_bit(i, *b)) | 515 | if (!mce_banks[i].ctl || !test_bit(i, *b)) |
499 | continue; | 516 | continue; |
500 | 517 | ||
501 | m.misc = 0; | 518 | m.misc = 0; |
@@ -504,7 +521,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
504 | m.tsc = 0; | 521 | m.tsc = 0; |
505 | 522 | ||
506 | barrier(); | 523 | barrier(); |
507 | m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | 524 | m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); |
508 | if (!(m.status & MCI_STATUS_VAL)) | 525 | if (!(m.status & MCI_STATUS_VAL)) |
509 | continue; | 526 | continue; |
510 | 527 | ||
@@ -519,9 +536,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
519 | continue; | 536 | continue; |
520 | 537 | ||
521 | if (m.status & MCI_STATUS_MISCV) | 538 | if (m.status & MCI_STATUS_MISCV) |
522 | m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); | 539 | m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); |
523 | if (m.status & MCI_STATUS_ADDRV) | 540 | if (m.status & MCI_STATUS_ADDRV) |
524 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | 541 | m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); |
525 | 542 | ||
526 | if (!(flags & MCP_TIMESTAMP)) | 543 | if (!(flags & MCP_TIMESTAMP)) |
527 | m.tsc = 0; | 544 | m.tsc = 0; |
@@ -537,7 +554,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
537 | /* | 554 | /* |
538 | * Clear state for this bank. | 555 | * Clear state for this bank. |
539 | */ | 556 | */ |
540 | mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 557 | mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); |
541 | } | 558 | } |
542 | 559 | ||
543 | /* | 560 | /* |
@@ -558,7 +575,7 @@ static int mce_no_way_out(struct mce *m, char **msg) | |||
558 | int i; | 575 | int i; |
559 | 576 | ||
560 | for (i = 0; i < banks; i++) { | 577 | for (i = 0; i < banks; i++) { |
561 | m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | 578 | m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); |
562 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) | 579 | if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) |
563 | return 1; | 580 | return 1; |
564 | } | 581 | } |
@@ -618,7 +635,7 @@ out: | |||
618 | * This way we prevent any potential data corruption in a unrecoverable case | 635 | * This way we prevent any potential data corruption in a unrecoverable case |
619 | * and also makes sure always all CPU's errors are examined. | 636 | * and also makes sure always all CPU's errors are examined. |
620 | * | 637 | * |
621 | * Also this detects the case of an machine check event coming from outer | 638 | * Also this detects the case of a machine check event coming from outer |
622 | * space (not detected by any CPUs) In this case some external agent wants | 639 | * space (not detected by any CPUs) In this case some external agent wants |
623 | * us to shut down, so panic too. | 640 | * us to shut down, so panic too. |
624 | * | 641 | * |
@@ -671,7 +688,7 @@ static void mce_reign(void) | |||
671 | * No machine check event found. Must be some external | 688 | * No machine check event found. Must be some external |
672 | * source or one CPU is hung. Panic. | 689 | * source or one CPU is hung. Panic. |
673 | */ | 690 | */ |
674 | if (!m && tolerant < 3) | 691 | if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) |
675 | mce_panic("Machine check from unknown source", NULL, NULL); | 692 | mce_panic("Machine check from unknown source", NULL, NULL); |
676 | 693 | ||
677 | /* | 694 | /* |
@@ -705,7 +722,7 @@ static int mce_start(int *no_way_out) | |||
705 | * global_nwo should be updated before mce_callin | 722 | * global_nwo should be updated before mce_callin |
706 | */ | 723 | */ |
707 | smp_wmb(); | 724 | smp_wmb(); |
708 | order = atomic_add_return(1, &mce_callin); | 725 | order = atomic_inc_return(&mce_callin); |
709 | 726 | ||
710 | /* | 727 | /* |
711 | * Wait for everyone. | 728 | * Wait for everyone. |
@@ -842,7 +859,7 @@ static void mce_clear_state(unsigned long *toclear) | |||
842 | 859 | ||
843 | for (i = 0; i < banks; i++) { | 860 | for (i = 0; i < banks; i++) { |
844 | if (test_bit(i, toclear)) | 861 | if (test_bit(i, toclear)) |
845 | mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 862 | mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); |
846 | } | 863 | } |
847 | } | 864 | } |
848 | 865 | ||
@@ -895,11 +912,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
895 | mce_setup(&m); | 912 | mce_setup(&m); |
896 | 913 | ||
897 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); | 914 | m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); |
898 | no_way_out = mce_no_way_out(&m, &msg); | ||
899 | |||
900 | final = &__get_cpu_var(mces_seen); | 915 | final = &__get_cpu_var(mces_seen); |
901 | *final = m; | 916 | *final = m; |
902 | 917 | ||
918 | no_way_out = mce_no_way_out(&m, &msg); | ||
919 | |||
903 | barrier(); | 920 | barrier(); |
904 | 921 | ||
905 | /* | 922 | /* |
@@ -916,14 +933,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
916 | order = mce_start(&no_way_out); | 933 | order = mce_start(&no_way_out); |
917 | for (i = 0; i < banks; i++) { | 934 | for (i = 0; i < banks; i++) { |
918 | __clear_bit(i, toclear); | 935 | __clear_bit(i, toclear); |
919 | if (!bank[i]) | 936 | if (!mce_banks[i].ctl) |
920 | continue; | 937 | continue; |
921 | 938 | ||
922 | m.misc = 0; | 939 | m.misc = 0; |
923 | m.addr = 0; | 940 | m.addr = 0; |
924 | m.bank = i; | 941 | m.bank = i; |
925 | 942 | ||
926 | m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); | 943 | m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); |
927 | if ((m.status & MCI_STATUS_VAL) == 0) | 944 | if ((m.status & MCI_STATUS_VAL) == 0) |
928 | continue; | 945 | continue; |
929 | 946 | ||
@@ -964,9 +981,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
964 | kill_it = 1; | 981 | kill_it = 1; |
965 | 982 | ||
966 | if (m.status & MCI_STATUS_MISCV) | 983 | if (m.status & MCI_STATUS_MISCV) |
967 | m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); | 984 | m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); |
968 | if (m.status & MCI_STATUS_ADDRV) | 985 | if (m.status & MCI_STATUS_ADDRV) |
969 | m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); | 986 | m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); |
970 | 987 | ||
971 | /* | 988 | /* |
972 | * Action optional error. Queue address for later processing. | 989 | * Action optional error. Queue address for later processing. |
@@ -1091,7 +1108,7 @@ void mce_log_therm_throt_event(__u64 status) | |||
1091 | */ | 1108 | */ |
1092 | static int check_interval = 5 * 60; /* 5 minutes */ | 1109 | static int check_interval = 5 * 60; /* 5 minutes */ |
1093 | 1110 | ||
1094 | static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ | 1111 | static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ |
1095 | static DEFINE_PER_CPU(struct timer_list, mce_timer); | 1112 | static DEFINE_PER_CPU(struct timer_list, mce_timer); |
1096 | 1113 | ||
1097 | static void mcheck_timer(unsigned long data) | 1114 | static void mcheck_timer(unsigned long data) |
@@ -1110,7 +1127,7 @@ static void mcheck_timer(unsigned long data) | |||
1110 | * Alert userspace if needed. If we logged an MCE, reduce the | 1127 | * Alert userspace if needed. If we logged an MCE, reduce the |
1111 | * polling interval, otherwise increase the polling interval. | 1128 | * polling interval, otherwise increase the polling interval. |
1112 | */ | 1129 | */ |
1113 | n = &__get_cpu_var(next_interval); | 1130 | n = &__get_cpu_var(mce_next_interval); |
1114 | if (mce_notify_irq()) | 1131 | if (mce_notify_irq()) |
1115 | *n = max(*n/2, HZ/100); | 1132 | *n = max(*n/2, HZ/100); |
1116 | else | 1133 | else |
@@ -1159,10 +1176,26 @@ int mce_notify_irq(void) | |||
1159 | } | 1176 | } |
1160 | EXPORT_SYMBOL_GPL(mce_notify_irq); | 1177 | EXPORT_SYMBOL_GPL(mce_notify_irq); |
1161 | 1178 | ||
1179 | static int mce_banks_init(void) | ||
1180 | { | ||
1181 | int i; | ||
1182 | |||
1183 | mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); | ||
1184 | if (!mce_banks) | ||
1185 | return -ENOMEM; | ||
1186 | for (i = 0; i < banks; i++) { | ||
1187 | struct mce_bank *b = &mce_banks[i]; | ||
1188 | |||
1189 | b->ctl = -1ULL; | ||
1190 | b->init = 1; | ||
1191 | } | ||
1192 | return 0; | ||
1193 | } | ||
1194 | |||
1162 | /* | 1195 | /* |
1163 | * Initialize Machine Checks for a CPU. | 1196 | * Initialize Machine Checks for a CPU. |
1164 | */ | 1197 | */ |
1165 | static int mce_cap_init(void) | 1198 | static int __cpuinit mce_cap_init(void) |
1166 | { | 1199 | { |
1167 | unsigned b; | 1200 | unsigned b; |
1168 | u64 cap; | 1201 | u64 cap; |
@@ -1182,11 +1215,11 @@ static int mce_cap_init(void) | |||
1182 | /* Don't support asymmetric configurations today */ | 1215 | /* Don't support asymmetric configurations today */ |
1183 | WARN_ON(banks != 0 && b != banks); | 1216 | WARN_ON(banks != 0 && b != banks); |
1184 | banks = b; | 1217 | banks = b; |
1185 | if (!bank) { | 1218 | if (!mce_banks) { |
1186 | bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); | 1219 | int err = mce_banks_init(); |
1187 | if (!bank) | 1220 | |
1188 | return -ENOMEM; | 1221 | if (err) |
1189 | memset(bank, 0xff, banks * sizeof(u64)); | 1222 | return err; |
1190 | } | 1223 | } |
1191 | 1224 | ||
1192 | /* Use accurate RIP reporting if available. */ | 1225 | /* Use accurate RIP reporting if available. */ |
@@ -1218,16 +1251,23 @@ static void mce_init(void) | |||
1218 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | 1251 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); |
1219 | 1252 | ||
1220 | for (i = 0; i < banks; i++) { | 1253 | for (i = 0; i < banks; i++) { |
1221 | if (skip_bank_init(i)) | 1254 | struct mce_bank *b = &mce_banks[i]; |
1255 | |||
1256 | if (!b->init) | ||
1222 | continue; | 1257 | continue; |
1223 | wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); | 1258 | wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); |
1224 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | 1259 | wrmsrl(MSR_IA32_MCx_STATUS(i), 0); |
1225 | } | 1260 | } |
1226 | } | 1261 | } |
1227 | 1262 | ||
1228 | /* Add per CPU specific workarounds here */ | 1263 | /* Add per CPU specific workarounds here */ |
1229 | static void mce_cpu_quirks(struct cpuinfo_x86 *c) | 1264 | static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) |
1230 | { | 1265 | { |
1266 | if (c->x86_vendor == X86_VENDOR_UNKNOWN) { | ||
1267 | pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); | ||
1268 | return -EOPNOTSUPP; | ||
1269 | } | ||
1270 | |||
1231 | /* This should be disabled by the BIOS, but isn't always */ | 1271 | /* This should be disabled by the BIOS, but isn't always */ |
1232 | if (c->x86_vendor == X86_VENDOR_AMD) { | 1272 | if (c->x86_vendor == X86_VENDOR_AMD) { |
1233 | if (c->x86 == 15 && banks > 4) { | 1273 | if (c->x86 == 15 && banks > 4) { |
@@ -1236,7 +1276,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1236 | * trips off incorrectly with the IOMMU & 3ware | 1276 | * trips off incorrectly with the IOMMU & 3ware |
1237 | * & Cerberus: | 1277 | * & Cerberus: |
1238 | */ | 1278 | */ |
1239 | clear_bit(10, (unsigned long *)&bank[4]); | 1279 | clear_bit(10, (unsigned long *)&mce_banks[4].ctl); |
1240 | } | 1280 | } |
1241 | if (c->x86 <= 17 && mce_bootlog < 0) { | 1281 | if (c->x86 <= 17 && mce_bootlog < 0) { |
1242 | /* | 1282 | /* |
@@ -1250,7 +1290,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1250 | * by default. | 1290 | * by default. |
1251 | */ | 1291 | */ |
1252 | if (c->x86 == 6 && banks > 0) | 1292 | if (c->x86 == 6 && banks > 0) |
1253 | bank[0] = 0; | 1293 | mce_banks[0].ctl = 0; |
1254 | } | 1294 | } |
1255 | 1295 | ||
1256 | if (c->x86_vendor == X86_VENDOR_INTEL) { | 1296 | if (c->x86_vendor == X86_VENDOR_INTEL) { |
@@ -1263,8 +1303,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1263 | * valid event later, merely don't write CTL0. | 1303 | * valid event later, merely don't write CTL0. |
1264 | */ | 1304 | */ |
1265 | 1305 | ||
1266 | if (c->x86 == 6 && c->x86_model < 0x1A) | 1306 | if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) |
1267 | __set_bit(0, &dont_init_banks); | 1307 | mce_banks[0].init = 0; |
1268 | 1308 | ||
1269 | /* | 1309 | /* |
1270 | * All newer Intel systems support MCE broadcasting. Enable | 1310 | * All newer Intel systems support MCE broadcasting. Enable |
@@ -1273,11 +1313,20 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) | |||
1273 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && | 1313 | if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && |
1274 | monarch_timeout < 0) | 1314 | monarch_timeout < 0) |
1275 | monarch_timeout = USEC_PER_SEC; | 1315 | monarch_timeout = USEC_PER_SEC; |
1316 | |||
1317 | /* | ||
1318 | * There are also broken BIOSes on some Pentium M and | ||
1319 | * earlier systems: | ||
1320 | */ | ||
1321 | if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) | ||
1322 | mce_bootlog = 0; | ||
1276 | } | 1323 | } |
1277 | if (monarch_timeout < 0) | 1324 | if (monarch_timeout < 0) |
1278 | monarch_timeout = 0; | 1325 | monarch_timeout = 0; |
1279 | if (mce_bootlog != 0) | 1326 | if (mce_bootlog != 0) |
1280 | mce_panic_timeout = 30; | 1327 | mce_panic_timeout = 30; |
1328 | |||
1329 | return 0; | ||
1281 | } | 1330 | } |
1282 | 1331 | ||
1283 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) | 1332 | static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) |
@@ -1311,7 +1360,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) | |||
1311 | static void mce_init_timer(void) | 1360 | static void mce_init_timer(void) |
1312 | { | 1361 | { |
1313 | struct timer_list *t = &__get_cpu_var(mce_timer); | 1362 | struct timer_list *t = &__get_cpu_var(mce_timer); |
1314 | int *n = &__get_cpu_var(next_interval); | 1363 | int *n = &__get_cpu_var(mce_next_interval); |
1315 | 1364 | ||
1316 | if (mce_ignore_ce) | 1365 | if (mce_ignore_ce) |
1317 | return; | 1366 | return; |
@@ -1324,6 +1373,17 @@ static void mce_init_timer(void) | |||
1324 | add_timer_on(t, smp_processor_id()); | 1373 | add_timer_on(t, smp_processor_id()); |
1325 | } | 1374 | } |
1326 | 1375 | ||
1376 | /* Handle unconfigured int18 (should never happen) */ | ||
1377 | static void unexpected_machine_check(struct pt_regs *regs, long error_code) | ||
1378 | { | ||
1379 | printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", | ||
1380 | smp_processor_id()); | ||
1381 | } | ||
1382 | |||
1383 | /* Call the installed machine check handler for this CPU setup. */ | ||
1384 | void (*machine_check_vector)(struct pt_regs *, long error_code) = | ||
1385 | unexpected_machine_check; | ||
1386 | |||
1327 | /* | 1387 | /* |
1328 | * Called for each booted CPU to set up machine checks. | 1388 | * Called for each booted CPU to set up machine checks. |
1329 | * Must be called with preempt off: | 1389 | * Must be called with preempt off: |
@@ -1338,11 +1398,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) | |||
1338 | if (!mce_available(c)) | 1398 | if (!mce_available(c)) |
1339 | return; | 1399 | return; |
1340 | 1400 | ||
1341 | if (mce_cap_init() < 0) { | 1401 | if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { |
1342 | mce_disabled = 1; | 1402 | mce_disabled = 1; |
1343 | return; | 1403 | return; |
1344 | } | 1404 | } |
1345 | mce_cpu_quirks(c); | ||
1346 | 1405 | ||
1347 | machine_check_vector = do_machine_check; | 1406 | machine_check_vector = do_machine_check; |
1348 | 1407 | ||
@@ -1538,8 +1597,10 @@ static struct miscdevice mce_log_device = { | |||
1538 | */ | 1597 | */ |
1539 | static int __init mcheck_enable(char *str) | 1598 | static int __init mcheck_enable(char *str) |
1540 | { | 1599 | { |
1541 | if (*str == 0) | 1600 | if (*str == 0) { |
1542 | enable_p5_mce(); | 1601 | enable_p5_mce(); |
1602 | return 1; | ||
1603 | } | ||
1543 | if (*str == '=') | 1604 | if (*str == '=') |
1544 | str++; | 1605 | str++; |
1545 | if (!strcmp(str, "off")) | 1606 | if (!strcmp(str, "off")) |
@@ -1580,8 +1641,10 @@ static int mce_disable(void) | |||
1580 | int i; | 1641 | int i; |
1581 | 1642 | ||
1582 | for (i = 0; i < banks; i++) { | 1643 | for (i = 0; i < banks; i++) { |
1583 | if (!skip_bank_init(i)) | 1644 | struct mce_bank *b = &mce_banks[i]; |
1584 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | 1645 | |
1646 | if (b->init) | ||
1647 | wrmsrl(MSR_IA32_MCx_CTL(i), 0); | ||
1585 | } | 1648 | } |
1586 | return 0; | 1649 | return 0; |
1587 | } | 1650 | } |
@@ -1656,14 +1719,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); | |||
1656 | __cpuinitdata | 1719 | __cpuinitdata |
1657 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); | 1720 | void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); |
1658 | 1721 | ||
1659 | static struct sysdev_attribute *bank_attrs; | 1722 | static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) |
1723 | { | ||
1724 | return container_of(attr, struct mce_bank, attr); | ||
1725 | } | ||
1660 | 1726 | ||
1661 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, | 1727 | static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, |
1662 | char *buf) | 1728 | char *buf) |
1663 | { | 1729 | { |
1664 | u64 b = bank[attr - bank_attrs]; | 1730 | return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); |
1665 | |||
1666 | return sprintf(buf, "%llx\n", b); | ||
1667 | } | 1731 | } |
1668 | 1732 | ||
1669 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | 1733 | static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, |
@@ -1674,7 +1738,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, | |||
1674 | if (strict_strtoull(buf, 0, &new) < 0) | 1738 | if (strict_strtoull(buf, 0, &new) < 0) |
1675 | return -EINVAL; | 1739 | return -EINVAL; |
1676 | 1740 | ||
1677 | bank[attr - bank_attrs] = new; | 1741 | attr_to_bank(attr)->ctl = new; |
1678 | mce_restart(); | 1742 | mce_restart(); |
1679 | 1743 | ||
1680 | return size; | 1744 | return size; |
@@ -1816,7 +1880,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
1816 | } | 1880 | } |
1817 | for (j = 0; j < banks; j++) { | 1881 | for (j = 0; j < banks; j++) { |
1818 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), | 1882 | err = sysdev_create_file(&per_cpu(mce_dev, cpu), |
1819 | &bank_attrs[j]); | 1883 | &mce_banks[j].attr); |
1820 | if (err) | 1884 | if (err) |
1821 | goto error2; | 1885 | goto error2; |
1822 | } | 1886 | } |
@@ -1825,10 +1889,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
1825 | return 0; | 1889 | return 0; |
1826 | error2: | 1890 | error2: |
1827 | while (--j >= 0) | 1891 | while (--j >= 0) |
1828 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); | 1892 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); |
1829 | error: | 1893 | error: |
1830 | while (--i >= 0) | 1894 | while (--i >= 0) |
1831 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | 1895 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); |
1832 | 1896 | ||
1833 | sysdev_unregister(&per_cpu(mce_dev, cpu)); | 1897 | sysdev_unregister(&per_cpu(mce_dev, cpu)); |
1834 | 1898 | ||
@@ -1846,7 +1910,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) | |||
1846 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); | 1910 | sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); |
1847 | 1911 | ||
1848 | for (i = 0; i < banks; i++) | 1912 | for (i = 0; i < banks; i++) |
1849 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); | 1913 | sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); |
1850 | 1914 | ||
1851 | sysdev_unregister(&per_cpu(mce_dev, cpu)); | 1915 | sysdev_unregister(&per_cpu(mce_dev, cpu)); |
1852 | cpumask_clear_cpu(cpu, mce_dev_initialized); | 1916 | cpumask_clear_cpu(cpu, mce_dev_initialized); |
@@ -1863,8 +1927,10 @@ static void mce_disable_cpu(void *h) | |||
1863 | if (!(action & CPU_TASKS_FROZEN)) | 1927 | if (!(action & CPU_TASKS_FROZEN)) |
1864 | cmci_clear(); | 1928 | cmci_clear(); |
1865 | for (i = 0; i < banks; i++) { | 1929 | for (i = 0; i < banks; i++) { |
1866 | if (!skip_bank_init(i)) | 1930 | struct mce_bank *b = &mce_banks[i]; |
1867 | wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); | 1931 | |
1932 | if (b->init) | ||
1933 | wrmsrl(MSR_IA32_MCx_CTL(i), 0); | ||
1868 | } | 1934 | } |
1869 | } | 1935 | } |
1870 | 1936 | ||
@@ -1879,8 +1945,10 @@ static void mce_reenable_cpu(void *h) | |||
1879 | if (!(action & CPU_TASKS_FROZEN)) | 1945 | if (!(action & CPU_TASKS_FROZEN)) |
1880 | cmci_reenable(); | 1946 | cmci_reenable(); |
1881 | for (i = 0; i < banks; i++) { | 1947 | for (i = 0; i < banks; i++) { |
1882 | if (!skip_bank_init(i)) | 1948 | struct mce_bank *b = &mce_banks[i]; |
1883 | wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); | 1949 | |
1950 | if (b->init) | ||
1951 | wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); | ||
1884 | } | 1952 | } |
1885 | } | 1953 | } |
1886 | 1954 | ||
@@ -1912,7 +1980,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
1912 | case CPU_DOWN_FAILED: | 1980 | case CPU_DOWN_FAILED: |
1913 | case CPU_DOWN_FAILED_FROZEN: | 1981 | case CPU_DOWN_FAILED_FROZEN: |
1914 | t->expires = round_jiffies(jiffies + | 1982 | t->expires = round_jiffies(jiffies + |
1915 | __get_cpu_var(next_interval)); | 1983 | __get_cpu_var(mce_next_interval)); |
1916 | add_timer_on(t, cpu); | 1984 | add_timer_on(t, cpu); |
1917 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); | 1985 | smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); |
1918 | break; | 1986 | break; |
@@ -1928,35 +1996,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { | |||
1928 | .notifier_call = mce_cpu_callback, | 1996 | .notifier_call = mce_cpu_callback, |
1929 | }; | 1997 | }; |
1930 | 1998 | ||
1931 | static __init int mce_init_banks(void) | 1999 | static __init void mce_init_banks(void) |
1932 | { | 2000 | { |
1933 | int i; | 2001 | int i; |
1934 | 2002 | ||
1935 | bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, | ||
1936 | GFP_KERNEL); | ||
1937 | if (!bank_attrs) | ||
1938 | return -ENOMEM; | ||
1939 | |||
1940 | for (i = 0; i < banks; i++) { | 2003 | for (i = 0; i < banks; i++) { |
1941 | struct sysdev_attribute *a = &bank_attrs[i]; | 2004 | struct mce_bank *b = &mce_banks[i]; |
2005 | struct sysdev_attribute *a = &b->attr; | ||
1942 | 2006 | ||
1943 | a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); | 2007 | a->attr.name = b->attrname; |
1944 | if (!a->attr.name) | 2008 | snprintf(b->attrname, ATTR_LEN, "bank%d", i); |
1945 | goto nomem; | ||
1946 | 2009 | ||
1947 | a->attr.mode = 0644; | 2010 | a->attr.mode = 0644; |
1948 | a->show = show_bank; | 2011 | a->show = show_bank; |
1949 | a->store = set_bank; | 2012 | a->store = set_bank; |
1950 | } | 2013 | } |
1951 | return 0; | ||
1952 | |||
1953 | nomem: | ||
1954 | while (--i >= 0) | ||
1955 | kfree(bank_attrs[i].attr.name); | ||
1956 | kfree(bank_attrs); | ||
1957 | bank_attrs = NULL; | ||
1958 | |||
1959 | return -ENOMEM; | ||
1960 | } | 2014 | } |
1961 | 2015 | ||
1962 | static __init int mce_init_device(void) | 2016 | static __init int mce_init_device(void) |
@@ -1969,9 +2023,7 @@ static __init int mce_init_device(void) | |||
1969 | 2023 | ||
1970 | zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); | 2024 | zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); |
1971 | 2025 | ||
1972 | err = mce_init_banks(); | 2026 | mce_init_banks(); |
1973 | if (err) | ||
1974 | return err; | ||
1975 | 2027 | ||
1976 | err = sysdev_class_register(&mce_sysclass); | 2028 | err = sysdev_class_register(&mce_sysclass); |
1977 | if (err) | 2029 | if (err) |
@@ -1991,57 +2043,65 @@ static __init int mce_init_device(void) | |||
1991 | 2043 | ||
1992 | device_initcall(mce_init_device); | 2044 | device_initcall(mce_init_device); |
1993 | 2045 | ||
1994 | #else /* CONFIG_X86_OLD_MCE: */ | 2046 | /* |
1995 | 2047 | * Old style boot options parsing. Only for compatibility. | |
1996 | int nr_mce_banks; | 2048 | */ |
1997 | EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ | 2049 | static int __init mcheck_disable(char *str) |
2050 | { | ||
2051 | mce_disabled = 1; | ||
2052 | return 1; | ||
2053 | } | ||
2054 | __setup("nomce", mcheck_disable); | ||
1998 | 2055 | ||
1999 | /* This has to be run for each processor */ | 2056 | #ifdef CONFIG_DEBUG_FS |
2000 | void mcheck_init(struct cpuinfo_x86 *c) | 2057 | struct dentry *mce_get_debugfs_dir(void) |
2001 | { | 2058 | { |
2002 | if (mce_disabled) | 2059 | static struct dentry *dmce; |
2003 | return; | ||
2004 | 2060 | ||
2005 | switch (c->x86_vendor) { | 2061 | if (!dmce) |
2006 | case X86_VENDOR_AMD: | 2062 | dmce = debugfs_create_dir("mce", NULL); |
2007 | amd_mcheck_init(c); | ||
2008 | break; | ||
2009 | 2063 | ||
2010 | case X86_VENDOR_INTEL: | 2064 | return dmce; |
2011 | if (c->x86 == 5) | 2065 | } |
2012 | intel_p5_mcheck_init(c); | ||
2013 | if (c->x86 == 6) | ||
2014 | intel_p6_mcheck_init(c); | ||
2015 | if (c->x86 == 15) | ||
2016 | intel_p4_mcheck_init(c); | ||
2017 | break; | ||
2018 | 2066 | ||
2019 | case X86_VENDOR_CENTAUR: | 2067 | static void mce_reset(void) |
2020 | if (c->x86 == 5) | 2068 | { |
2021 | winchip_mcheck_init(c); | 2069 | cpu_missing = 0; |
2022 | break; | 2070 | atomic_set(&mce_fake_paniced, 0); |
2071 | atomic_set(&mce_executing, 0); | ||
2072 | atomic_set(&mce_callin, 0); | ||
2073 | atomic_set(&global_nwo, 0); | ||
2074 | } | ||
2023 | 2075 | ||
2024 | default: | 2076 | static int fake_panic_get(void *data, u64 *val) |
2025 | break; | 2077 | { |
2026 | } | 2078 | *val = fake_panic; |
2027 | printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); | 2079 | return 0; |
2028 | } | 2080 | } |
2029 | 2081 | ||
2030 | static int __init mcheck_enable(char *str) | 2082 | static int fake_panic_set(void *data, u64 val) |
2031 | { | 2083 | { |
2032 | mce_p5_enabled = 1; | 2084 | mce_reset(); |
2033 | return 1; | 2085 | fake_panic = val; |
2086 | return 0; | ||
2034 | } | 2087 | } |
2035 | __setup("mce", mcheck_enable); | ||
2036 | 2088 | ||
2037 | #endif /* CONFIG_X86_OLD_MCE */ | 2089 | DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, |
2090 | fake_panic_set, "%llu\n"); | ||
2038 | 2091 | ||
2039 | /* | 2092 | static int __init mce_debugfs_init(void) |
2040 | * Old style boot options parsing. Only for compatibility. | ||
2041 | */ | ||
2042 | static int __init mcheck_disable(char *str) | ||
2043 | { | 2093 | { |
2044 | mce_disabled = 1; | 2094 | struct dentry *dmce, *ffake_panic; |
2045 | return 1; | 2095 | |
2096 | dmce = mce_get_debugfs_dir(); | ||
2097 | if (!dmce) | ||
2098 | return -ENOMEM; | ||
2099 | ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, | ||
2100 | &fake_panic_fops); | ||
2101 | if (!ffake_panic) | ||
2102 | return -ENOMEM; | ||
2103 | |||
2104 | return 0; | ||
2046 | } | 2105 | } |
2047 | __setup("nomce", mcheck_disable); | 2106 | late_initcall(mce_debugfs_init); |
2107 | #endif | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bd..83a3d1f4efc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -69,7 +69,7 @@ struct threshold_bank { | |||
69 | struct threshold_block *blocks; | 69 | struct threshold_block *blocks; |
70 | cpumask_var_t cpus; | 70 | cpumask_var_t cpus; |
71 | }; | 71 | }; |
72 | static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); | 72 | static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); |
73 | 73 | ||
74 | #ifdef CONFIG_SMP | 74 | #ifdef CONFIG_SMP |
75 | static unsigned char shared_bank[NR_BANKS] = { | 75 | static unsigned char shared_bank[NR_BANKS] = { |
@@ -489,12 +489,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
489 | int i, err = 0; | 489 | int i, err = 0; |
490 | struct threshold_bank *b = NULL; | 490 | struct threshold_bank *b = NULL; |
491 | char name[32]; | 491 | char name[32]; |
492 | #ifdef CONFIG_SMP | ||
493 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
494 | #endif | ||
492 | 495 | ||
493 | sprintf(name, "threshold_bank%i", bank); | 496 | sprintf(name, "threshold_bank%i", bank); |
494 | 497 | ||
495 | #ifdef CONFIG_SMP | 498 | #ifdef CONFIG_SMP |
496 | if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ | 499 | if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ |
497 | i = cpumask_first(cpu_core_mask(cpu)); | 500 | i = cpumask_first(c->llc_shared_map); |
498 | 501 | ||
499 | /* first core not up yet */ | 502 | /* first core not up yet */ |
500 | if (cpu_data(i).cpu_core_id) | 503 | if (cpu_data(i).cpu_core_id) |
@@ -514,7 +517,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
514 | if (err) | 517 | if (err) |
515 | goto out; | 518 | goto out; |
516 | 519 | ||
517 | cpumask_copy(b->cpus, cpu_core_mask(cpu)); | 520 | cpumask_copy(b->cpus, c->llc_shared_map); |
518 | per_cpu(threshold_banks, cpu)[bank] = b; | 521 | per_cpu(threshold_banks, cpu)[bank] = b; |
519 | 522 | ||
520 | goto out; | 523 | goto out; |
@@ -539,7 +542,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) | |||
539 | #ifndef CONFIG_SMP | 542 | #ifndef CONFIG_SMP |
540 | cpumask_setall(b->cpus); | 543 | cpumask_setall(b->cpus); |
541 | #else | 544 | #else |
542 | cpumask_copy(b->cpus, cpu_core_mask(cpu)); | 545 | cpumask_copy(b->cpus, c->llc_shared_map); |
543 | #endif | 546 | #endif |
544 | 547 | ||
545 | per_cpu(threshold_banks, cpu)[bank] = b; | 548 | per_cpu(threshold_banks, cpu)[bank] = b; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..889f665fe93 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) | |||
90 | if (test_bit(i, owned)) | 90 | if (test_bit(i, owned)) |
91 | continue; | 91 | continue; |
92 | 92 | ||
93 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | 93 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
94 | 94 | ||
95 | /* Already owned by someone else? */ | 95 | /* Already owned by someone else? */ |
96 | if (val & CMCI_EN) { | 96 | if (val & CMCI_EN) { |
@@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) | |||
101 | } | 101 | } |
102 | 102 | ||
103 | val |= CMCI_EN | CMCI_THRESHOLD; | 103 | val |= CMCI_EN | CMCI_THRESHOLD; |
104 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | 104 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
105 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | 105 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
106 | 106 | ||
107 | /* Did the enable bit stick? -- the bank supports CMCI */ | 107 | /* Did the enable bit stick? -- the bank supports CMCI */ |
108 | if (val & CMCI_EN) { | 108 | if (val & CMCI_EN) { |
@@ -152,9 +152,9 @@ void cmci_clear(void) | |||
152 | if (!test_bit(i, __get_cpu_var(mce_banks_owned))) | 152 | if (!test_bit(i, __get_cpu_var(mce_banks_owned))) |
153 | continue; | 153 | continue; |
154 | /* Disable CMCI */ | 154 | /* Disable CMCI */ |
155 | rdmsrl(MSR_IA32_MC0_CTL2 + i, val); | 155 | rdmsrl(MSR_IA32_MCx_CTL2(i), val); |
156 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); | 156 | val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); |
157 | wrmsrl(MSR_IA32_MC0_CTL2 + i, val); | 157 | wrmsrl(MSR_IA32_MCx_CTL2(i), val); |
158 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); | 158 | __clear_bit(i, __get_cpu_var(mce_banks_owned)); |
159 | } | 159 | } |
160 | spin_unlock_irqrestore(&cmci_discover_lock, flags); | 160 | spin_unlock_irqrestore(&cmci_discover_lock, flags); |
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null | |||
@@ -1,94 +0,0 @@ | |||
1 | /* | ||
2 | * Non Fatal Machine Check Exception Reporting | ||
3 | * | ||
4 | * (C) Copyright 2002 Dave Jones. <davej@redhat.com> | ||
5 | * | ||
6 | * This file contains routines to check for non-fatal MCEs every 15s | ||
7 | * | ||
8 | */ | ||
9 | #include <linux/interrupt.h> | ||
10 | #include <linux/workqueue.h> | ||
11 | #include <linux/jiffies.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/smp.h> | ||
17 | |||
18 | #include <asm/processor.h> | ||
19 | #include <asm/system.h> | ||
20 | #include <asm/mce.h> | ||
21 | #include <asm/msr.h> | ||
22 | |||
23 | static int firstbank; | ||
24 | |||
25 | #define MCE_RATE (15*HZ) /* timer rate is 15s */ | ||
26 | |||
27 | static void mce_checkregs(void *info) | ||
28 | { | ||
29 | u32 low, high; | ||
30 | int i; | ||
31 | |||
32 | for (i = firstbank; i < nr_mce_banks; i++) { | ||
33 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | ||
34 | |||
35 | if (!(high & (1<<31))) | ||
36 | continue; | ||
37 | |||
38 | printk(KERN_INFO "MCE: The hardware reports a non fatal, " | ||
39 | "correctable incident occurred on CPU %d.\n", | ||
40 | smp_processor_id()); | ||
41 | |||
42 | printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); | ||
43 | |||
44 | /* | ||
45 | * Scrub the error so we don't pick it up in MCE_RATE | ||
46 | * seconds time: | ||
47 | */ | ||
48 | wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); | ||
49 | |||
50 | /* Serialize: */ | ||
51 | wmb(); | ||
52 | add_taint(TAINT_MACHINE_CHECK); | ||
53 | } | ||
54 | } | ||
55 | |||
56 | static void mce_work_fn(struct work_struct *work); | ||
57 | static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); | ||
58 | |||
59 | static void mce_work_fn(struct work_struct *work) | ||
60 | { | ||
61 | on_each_cpu(mce_checkregs, NULL, 1); | ||
62 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); | ||
63 | } | ||
64 | |||
65 | static int __init init_nonfatal_mce_checker(void) | ||
66 | { | ||
67 | struct cpuinfo_x86 *c = &boot_cpu_data; | ||
68 | |||
69 | /* Check for MCE support */ | ||
70 | if (!cpu_has(c, X86_FEATURE_MCE)) | ||
71 | return -ENODEV; | ||
72 | |||
73 | /* Check for PPro style MCA */ | ||
74 | if (!cpu_has(c, X86_FEATURE_MCA)) | ||
75 | return -ENODEV; | ||
76 | |||
77 | /* Some Athlons misbehave when we frob bank 0 */ | ||
78 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
79 | boot_cpu_data.x86 == 6) | ||
80 | firstbank = 1; | ||
81 | else | ||
82 | firstbank = 0; | ||
83 | |||
84 | /* | ||
85 | * Check for non-fatal errors every MCE_RATE s | ||
86 | */ | ||
87 | schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); | ||
88 | printk(KERN_INFO "Machine check exception polling timer started.\n"); | ||
89 | |||
90 | return 0; | ||
91 | } | ||
92 | module_init(init_nonfatal_mce_checker); | ||
93 | |||
94 | MODULE_LICENSE("GPL"); | ||
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null | |||
@@ -1,163 +0,0 @@ | |||
1 | /* | ||
2 | * P4 specific Machine Check Exception Reporting | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/smp.h> | ||
8 | |||
9 | #include <asm/processor.h> | ||
10 | #include <asm/mce.h> | ||
11 | #include <asm/msr.h> | ||
12 | |||
13 | /* as supported by the P4/Xeon family */ | ||
14 | struct intel_mce_extended_msrs { | ||
15 | u32 eax; | ||
16 | u32 ebx; | ||
17 | u32 ecx; | ||
18 | u32 edx; | ||
19 | u32 esi; | ||
20 | u32 edi; | ||
21 | u32 ebp; | ||
22 | u32 esp; | ||
23 | u32 eflags; | ||
24 | u32 eip; | ||
25 | /* u32 *reserved[]; */ | ||
26 | }; | ||
27 | |||
28 | static int mce_num_extended_msrs; | ||
29 | |||
30 | /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ | ||
31 | static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) | ||
32 | { | ||
33 | u32 h; | ||
34 | |||
35 | rdmsr(MSR_IA32_MCG_EAX, r->eax, h); | ||
36 | rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); | ||
37 | rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); | ||
38 | rdmsr(MSR_IA32_MCG_EDX, r->edx, h); | ||
39 | rdmsr(MSR_IA32_MCG_ESI, r->esi, h); | ||
40 | rdmsr(MSR_IA32_MCG_EDI, r->edi, h); | ||
41 | rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); | ||
42 | rdmsr(MSR_IA32_MCG_ESP, r->esp, h); | ||
43 | rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); | ||
44 | rdmsr(MSR_IA32_MCG_EIP, r->eip, h); | ||
45 | } | ||
46 | |||
47 | static void intel_machine_check(struct pt_regs *regs, long error_code) | ||
48 | { | ||
49 | u32 alow, ahigh, high, low; | ||
50 | u32 mcgstl, mcgsth; | ||
51 | int recover = 1; | ||
52 | int i; | ||
53 | |||
54 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | ||
55 | if (mcgstl & (1<<0)) /* Recoverable ? */ | ||
56 | recover = 0; | ||
57 | |||
58 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", | ||
59 | smp_processor_id(), mcgsth, mcgstl); | ||
60 | |||
61 | if (mce_num_extended_msrs > 0) { | ||
62 | struct intel_mce_extended_msrs dbg; | ||
63 | |||
64 | intel_get_extended_msrs(&dbg); | ||
65 | |||
66 | printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" | ||
67 | "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" | ||
68 | "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", | ||
69 | smp_processor_id(), dbg.eip, dbg.eflags, | ||
70 | dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, | ||
71 | dbg.esi, dbg.edi, dbg.ebp, dbg.esp); | ||
72 | } | ||
73 | |||
74 | for (i = 0; i < nr_mce_banks; i++) { | ||
75 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | ||
76 | if (high & (1<<31)) { | ||
77 | char misc[20]; | ||
78 | char addr[24]; | ||
79 | |||
80 | misc[0] = addr[0] = '\0'; | ||
81 | if (high & (1<<29)) | ||
82 | recover |= 1; | ||
83 | if (high & (1<<25)) | ||
84 | recover |= 2; | ||
85 | high &= ~(1<<31); | ||
86 | if (high & (1<<27)) { | ||
87 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | ||
88 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | ||
89 | } | ||
90 | if (high & (1<<26)) { | ||
91 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | ||
92 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | ||
93 | } | ||
94 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | ||
95 | smp_processor_id(), i, high, low, misc, addr); | ||
96 | } | ||
97 | } | ||
98 | |||
99 | if (recover & 2) | ||
100 | panic("CPU context corrupt"); | ||
101 | if (recover & 1) | ||
102 | panic("Unable to continue"); | ||
103 | |||
104 | printk(KERN_EMERG "Attempting to continue.\n"); | ||
105 | |||
106 | /* | ||
107 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | ||
108 | * recoverable/continuable.This will allow BIOS to look at the MSRs | ||
109 | * for errors if the OS could not log the error. | ||
110 | */ | ||
111 | for (i = 0; i < nr_mce_banks; i++) { | ||
112 | u32 msr; | ||
113 | msr = MSR_IA32_MC0_STATUS+i*4; | ||
114 | rdmsr(msr, low, high); | ||
115 | if (high&(1<<31)) { | ||
116 | /* Clear it */ | ||
117 | wrmsr(msr, 0UL, 0UL); | ||
118 | /* Serialize */ | ||
119 | wmb(); | ||
120 | add_taint(TAINT_MACHINE_CHECK); | ||
121 | } | ||
122 | } | ||
123 | mcgstl &= ~(1<<2); | ||
124 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | ||
125 | } | ||
126 | |||
127 | void intel_p4_mcheck_init(struct cpuinfo_x86 *c) | ||
128 | { | ||
129 | u32 l, h; | ||
130 | int i; | ||
131 | |||
132 | machine_check_vector = intel_machine_check; | ||
133 | wmb(); | ||
134 | |||
135 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | ||
136 | rdmsr(MSR_IA32_MCG_CAP, l, h); | ||
137 | if (l & (1<<8)) /* Control register present ? */ | ||
138 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
139 | nr_mce_banks = l & 0xff; | ||
140 | |||
141 | for (i = 0; i < nr_mce_banks; i++) { | ||
142 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | ||
143 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | ||
144 | } | ||
145 | |||
146 | set_in_cr4(X86_CR4_MCE); | ||
147 | printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", | ||
148 | smp_processor_id()); | ||
149 | |||
150 | /* Check for P4/Xeon extended MCE MSRs */ | ||
151 | rdmsr(MSR_IA32_MCG_CAP, l, h); | ||
152 | if (l & (1<<9)) {/* MCG_EXT_P */ | ||
153 | mce_num_extended_msrs = (l >> 16) & 0xff; | ||
154 | printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" | ||
155 | " available\n", | ||
156 | smp_processor_id(), mce_num_extended_msrs); | ||
157 | |||
158 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
159 | /* Check for P4/Xeon Thermal monitor */ | ||
160 | intel_init_thermal(c); | ||
161 | #endif | ||
162 | } | ||
163 | } | ||
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null | |||
@@ -1,127 +0,0 @@ | |||
1 | /* | ||
2 | * P6 specific Machine Check Exception Reporting | ||
3 | * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> | ||
4 | */ | ||
5 | #include <linux/interrupt.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/types.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/smp.h> | ||
10 | |||
11 | #include <asm/processor.h> | ||
12 | #include <asm/system.h> | ||
13 | #include <asm/mce.h> | ||
14 | #include <asm/msr.h> | ||
15 | |||
16 | /* Machine Check Handler For PII/PIII */ | ||
17 | static void intel_machine_check(struct pt_regs *regs, long error_code) | ||
18 | { | ||
19 | u32 alow, ahigh, high, low; | ||
20 | u32 mcgstl, mcgsth; | ||
21 | int recover = 1; | ||
22 | int i; | ||
23 | |||
24 | rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | ||
25 | if (mcgstl & (1<<0)) /* Recoverable ? */ | ||
26 | recover = 0; | ||
27 | |||
28 | printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", | ||
29 | smp_processor_id(), mcgsth, mcgstl); | ||
30 | |||
31 | for (i = 0; i < nr_mce_banks; i++) { | ||
32 | rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); | ||
33 | if (high & (1<<31)) { | ||
34 | char misc[20]; | ||
35 | char addr[24]; | ||
36 | |||
37 | misc[0] = '\0'; | ||
38 | addr[0] = '\0'; | ||
39 | |||
40 | if (high & (1<<29)) | ||
41 | recover |= 1; | ||
42 | if (high & (1<<25)) | ||
43 | recover |= 2; | ||
44 | high &= ~(1<<31); | ||
45 | |||
46 | if (high & (1<<27)) { | ||
47 | rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); | ||
48 | snprintf(misc, 20, "[%08x%08x]", ahigh, alow); | ||
49 | } | ||
50 | if (high & (1<<26)) { | ||
51 | rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); | ||
52 | snprintf(addr, 24, " at %08x%08x", ahigh, alow); | ||
53 | } | ||
54 | |||
55 | printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", | ||
56 | smp_processor_id(), i, high, low, misc, addr); | ||
57 | } | ||
58 | } | ||
59 | |||
60 | if (recover & 2) | ||
61 | panic("CPU context corrupt"); | ||
62 | if (recover & 1) | ||
63 | panic("Unable to continue"); | ||
64 | |||
65 | printk(KERN_EMERG "Attempting to continue.\n"); | ||
66 | /* | ||
67 | * Do not clear the MSR_IA32_MCi_STATUS if the error is not | ||
68 | * recoverable/continuable.This will allow BIOS to look at the MSRs | ||
69 | * for errors if the OS could not log the error: | ||
70 | */ | ||
71 | for (i = 0; i < nr_mce_banks; i++) { | ||
72 | unsigned int msr; | ||
73 | |||
74 | msr = MSR_IA32_MC0_STATUS+i*4; | ||
75 | rdmsr(msr, low, high); | ||
76 | if (high & (1<<31)) { | ||
77 | /* Clear it: */ | ||
78 | wrmsr(msr, 0UL, 0UL); | ||
79 | /* Serialize: */ | ||
80 | wmb(); | ||
81 | add_taint(TAINT_MACHINE_CHECK); | ||
82 | } | ||
83 | } | ||
84 | mcgstl &= ~(1<<2); | ||
85 | wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); | ||
86 | } | ||
87 | |||
88 | /* Set up machine check reporting for processors with Intel style MCE: */ | ||
89 | void intel_p6_mcheck_init(struct cpuinfo_x86 *c) | ||
90 | { | ||
91 | u32 l, h; | ||
92 | int i; | ||
93 | |||
94 | /* Check for MCE support */ | ||
95 | if (!cpu_has(c, X86_FEATURE_MCE)) | ||
96 | return; | ||
97 | |||
98 | /* Check for PPro style MCA */ | ||
99 | if (!cpu_has(c, X86_FEATURE_MCA)) | ||
100 | return; | ||
101 | |||
102 | /* Ok machine check is available */ | ||
103 | machine_check_vector = intel_machine_check; | ||
104 | /* Make sure the vector pointer is visible before we enable MCEs: */ | ||
105 | wmb(); | ||
106 | |||
107 | printk(KERN_INFO "Intel machine check architecture supported.\n"); | ||
108 | rdmsr(MSR_IA32_MCG_CAP, l, h); | ||
109 | if (l & (1<<8)) /* Control register present ? */ | ||
110 | wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); | ||
111 | nr_mce_banks = l & 0xff; | ||
112 | |||
113 | /* | ||
114 | * Following the example in IA-32 SDM Vol 3: | ||
115 | * - MC0_CTL should not be written | ||
116 | * - Status registers on all banks should be cleared on reset | ||
117 | */ | ||
118 | for (i = 1; i < nr_mce_banks; i++) | ||
119 | wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); | ||
120 | |||
121 | for (i = 0; i < nr_mce_banks; i++) | ||
122 | wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); | ||
123 | |||
124 | set_in_cr4(X86_CR4_MCE); | ||
125 | printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", | ||
126 | smp_processor_id()); | ||
127 | } | ||
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 8bc64cfbe93..b3a1dba7533 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c | |||
@@ -34,20 +34,31 @@ | |||
34 | /* How long to wait between reporting thermal events */ | 34 | /* How long to wait between reporting thermal events */ |
35 | #define CHECK_INTERVAL (300 * HZ) | 35 | #define CHECK_INTERVAL (300 * HZ) |
36 | 36 | ||
37 | static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; | 37 | /* |
38 | static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); | 38 | * Current thermal throttling state: |
39 | static DEFINE_PER_CPU(bool, thermal_throttle_active); | 39 | */ |
40 | struct thermal_state { | ||
41 | bool is_throttled; | ||
42 | |||
43 | u64 next_check; | ||
44 | unsigned long throttle_count; | ||
45 | unsigned long last_throttle_count; | ||
46 | }; | ||
40 | 47 | ||
41 | static atomic_t therm_throt_en = ATOMIC_INIT(0); | 48 | static DEFINE_PER_CPU(struct thermal_state, thermal_state); |
49 | |||
50 | static atomic_t therm_throt_en = ATOMIC_INIT(0); | ||
42 | 51 | ||
43 | #ifdef CONFIG_SYSFS | 52 | #ifdef CONFIG_SYSFS |
44 | #define define_therm_throt_sysdev_one_ro(_name) \ | 53 | #define define_therm_throt_sysdev_one_ro(_name) \ |
45 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) | 54 | static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) |
46 | 55 | ||
47 | #define define_therm_throt_sysdev_show_func(name) \ | 56 | #define define_therm_throt_sysdev_show_func(name) \ |
48 | static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ | 57 | \ |
49 | struct sysdev_attribute *attr, \ | 58 | static ssize_t therm_throt_sysdev_show_##name( \ |
50 | char *buf) \ | 59 | struct sys_device *dev, \ |
60 | struct sysdev_attribute *attr, \ | ||
61 | char *buf) \ | ||
51 | { \ | 62 | { \ |
52 | unsigned int cpu = dev->id; \ | 63 | unsigned int cpu = dev->id; \ |
53 | ssize_t ret; \ | 64 | ssize_t ret; \ |
@@ -55,7 +66,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ | |||
55 | preempt_disable(); /* CPU hotplug */ \ | 66 | preempt_disable(); /* CPU hotplug */ \ |
56 | if (cpu_online(cpu)) \ | 67 | if (cpu_online(cpu)) \ |
57 | ret = sprintf(buf, "%lu\n", \ | 68 | ret = sprintf(buf, "%lu\n", \ |
58 | per_cpu(thermal_throttle_##name, cpu)); \ | 69 | per_cpu(thermal_state, cpu).name); \ |
59 | else \ | 70 | else \ |
60 | ret = 0; \ | 71 | ret = 0; \ |
61 | preempt_enable(); \ | 72 | preempt_enable(); \ |
@@ -63,11 +74,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ | |||
63 | return ret; \ | 74 | return ret; \ |
64 | } | 75 | } |
65 | 76 | ||
66 | define_therm_throt_sysdev_show_func(count); | 77 | define_therm_throt_sysdev_show_func(throttle_count); |
67 | define_therm_throt_sysdev_one_ro(count); | 78 | define_therm_throt_sysdev_one_ro(throttle_count); |
68 | 79 | ||
69 | static struct attribute *thermal_throttle_attrs[] = { | 80 | static struct attribute *thermal_throttle_attrs[] = { |
70 | &attr_count.attr, | 81 | &attr_throttle_count.attr, |
71 | NULL | 82 | NULL |
72 | }; | 83 | }; |
73 | 84 | ||
@@ -93,34 +104,43 @@ static struct attribute_group thermal_throttle_attr_group = { | |||
93 | * 1 : Event should be logged further, and a message has been | 104 | * 1 : Event should be logged further, and a message has been |
94 | * printed to the syslog. | 105 | * printed to the syslog. |
95 | */ | 106 | */ |
96 | static int therm_throt_process(int curr) | 107 | static int therm_throt_process(bool is_throttled) |
97 | { | 108 | { |
98 | unsigned int cpu = smp_processor_id(); | 109 | struct thermal_state *state; |
99 | __u64 tmp_jiffs = get_jiffies_64(); | 110 | unsigned int this_cpu; |
100 | bool was_throttled = __get_cpu_var(thermal_throttle_active); | 111 | bool was_throttled; |
101 | bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; | 112 | u64 now; |
113 | |||
114 | this_cpu = smp_processor_id(); | ||
115 | now = get_jiffies_64(); | ||
116 | state = &per_cpu(thermal_state, this_cpu); | ||
117 | |||
118 | was_throttled = state->is_throttled; | ||
119 | state->is_throttled = is_throttled; | ||
102 | 120 | ||
103 | if (is_throttled) | 121 | if (is_throttled) |
104 | __get_cpu_var(thermal_throttle_count)++; | 122 | state->throttle_count++; |
105 | 123 | ||
106 | if (!(was_throttled ^ is_throttled) && | 124 | if (time_before64(now, state->next_check) && |
107 | time_before64(tmp_jiffs, __get_cpu_var(next_check))) | 125 | state->throttle_count != state->last_throttle_count) |
108 | return 0; | 126 | return 0; |
109 | 127 | ||
110 | __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; | 128 | state->next_check = now + CHECK_INTERVAL; |
129 | state->last_throttle_count = state->throttle_count; | ||
111 | 130 | ||
112 | /* if we just entered the thermal event */ | 131 | /* if we just entered the thermal event */ |
113 | if (is_throttled) { | 132 | if (is_throttled) { |
114 | printk(KERN_CRIT "CPU%d: Temperature above threshold, " | 133 | printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); |
115 | "cpu clock throttled (total events = %lu)\n", | ||
116 | cpu, __get_cpu_var(thermal_throttle_count)); | ||
117 | 134 | ||
118 | add_taint(TAINT_MACHINE_CHECK); | 135 | add_taint(TAINT_MACHINE_CHECK); |
119 | } else if (was_throttled) { | 136 | return 1; |
120 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); | 137 | } |
138 | if (was_throttled) { | ||
139 | printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); | ||
140 | return 1; | ||
121 | } | 141 | } |
122 | 142 | ||
123 | return 1; | 143 | return 0; |
124 | } | 144 | } |
125 | 145 | ||
126 | #ifdef CONFIG_SYSFS | 146 | #ifdef CONFIG_SYSFS |
@@ -210,7 +230,7 @@ static void intel_thermal_interrupt(void) | |||
210 | __u64 msr_val; | 230 | __u64 msr_val; |
211 | 231 | ||
212 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); | 232 | rdmsrl(MSR_IA32_THERM_STATUS, msr_val); |
213 | if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) | 233 | if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) |
214 | mce_log_therm_throt_event(msr_val); | 234 | mce_log_therm_throt_event(msr_val); |
215 | } | 235 | } |
216 | 236 | ||
@@ -257,9 +277,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) | |||
257 | return; | 277 | return; |
258 | } | 278 | } |
259 | 279 | ||
260 | if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) | ||
261 | tm2 = 1; | ||
262 | |||
263 | /* Check whether a vector already exists */ | 280 | /* Check whether a vector already exists */ |
264 | if (h & APIC_VECTOR_MASK) { | 281 | if (h & APIC_VECTOR_MASK) { |
265 | printk(KERN_DEBUG | 282 | printk(KERN_DEBUG |
@@ -268,6 +285,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) | |||
268 | return; | 285 | return; |
269 | } | 286 | } |
270 | 287 | ||
288 | /* early Pentium M models use different method for enabling TM2 */ | ||
289 | if (cpu_has(c, X86_FEATURE_TM2)) { | ||
290 | if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { | ||
291 | rdmsr(MSR_THERM2_CTL, l, h); | ||
292 | if (l & MSR_THERM2_CTL_TM_SELECT) | ||
293 | tm2 = 1; | ||
294 | } else if (l & MSR_IA32_MISC_ENABLE_TM2) | ||
295 | tm2 = 1; | ||
296 | } | ||
297 | |||
271 | /* We'll mask the thermal vector in the lapic till we're ready: */ | 298 | /* We'll mask the thermal vector in the lapic till we're ready: */ |
272 | h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; | 299 | h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; |
273 | apic_write(APIC_LVTTHMR, h); | 300 | apic_write(APIC_LVTTHMR, h); |
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index ee2331b0e58..33af14110df 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c | |||
@@ -7,15 +7,15 @@ | |||
7 | 7 | ||
8 | static void | 8 | static void |
9 | amd_get_mtrr(unsigned int reg, unsigned long *base, | 9 | amd_get_mtrr(unsigned int reg, unsigned long *base, |
10 | unsigned long *size, mtrr_type * type) | 10 | unsigned long *size, mtrr_type *type) |
11 | { | 11 | { |
12 | unsigned long low, high; | 12 | unsigned long low, high; |
13 | 13 | ||
14 | rdmsr(MSR_K6_UWCCR, low, high); | 14 | rdmsr(MSR_K6_UWCCR, low, high); |
15 | /* Upper dword is region 1, lower is region 0 */ | 15 | /* Upper dword is region 1, lower is region 0 */ |
16 | if (reg == 1) | 16 | if (reg == 1) |
17 | low = high; | 17 | low = high; |
18 | /* The base masks off on the right alignment */ | 18 | /* The base masks off on the right alignment */ |
19 | *base = (low & 0xFFFE0000) >> PAGE_SHIFT; | 19 | *base = (low & 0xFFFE0000) >> PAGE_SHIFT; |
20 | *type = 0; | 20 | *type = 0; |
21 | if (low & 1) | 21 | if (low & 1) |
@@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base, | |||
27 | return; | 27 | return; |
28 | } | 28 | } |
29 | /* | 29 | /* |
30 | * This needs a little explaining. The size is stored as an | 30 | * This needs a little explaining. The size is stored as an |
31 | * inverted mask of bits of 128K granularity 15 bits long offset | 31 | * inverted mask of bits of 128K granularity 15 bits long offset |
32 | * 2 bits | 32 | * 2 bits. |
33 | * | 33 | * |
34 | * So to get a size we do invert the mask and add 1 to the lowest | 34 | * So to get a size we do invert the mask and add 1 to the lowest |
35 | * mask bit (4 as its 2 bits in). This gives us a size we then shift | 35 | * mask bit (4 as its 2 bits in). This gives us a size we then shift |
36 | * to turn into 128K blocks | 36 | * to turn into 128K blocks. |
37 | * | 37 | * |
38 | * eg 111 1111 1111 1100 is 512K | 38 | * eg 111 1111 1111 1100 is 512K |
39 | * | 39 | * |
40 | * invert 000 0000 0000 0011 | 40 | * invert 000 0000 0000 0011 |
41 | * +1 000 0000 0000 0100 | 41 | * +1 000 0000 0000 0100 |
42 | * *128K ... | 42 | * *128K ... |
43 | */ | 43 | */ |
44 | low = (~low) & 0x1FFFC; | 44 | low = (~low) & 0x1FFFC; |
45 | *size = (low + 4) << (15 - PAGE_SHIFT); | 45 | *size = (low + 4) << (15 - PAGE_SHIFT); |
46 | return; | ||
47 | } | 46 | } |
48 | 47 | ||
49 | static void amd_set_mtrr(unsigned int reg, unsigned long base, | 48 | /** |
50 | unsigned long size, mtrr_type type) | 49 | * amd_set_mtrr - Set variable MTRR register on the local CPU. |
51 | /* [SUMMARY] Set variable MTRR register on the local CPU. | 50 | * |
52 | <reg> The register to set. | 51 | * @reg The register to set. |
53 | <base> The base address of the region. | 52 | * @base The base address of the region. |
54 | <size> The size of the region. If this is 0 the region is disabled. | 53 | * @size The size of the region. If this is 0 the region is disabled. |
55 | <type> The type of the region. | 54 | * @type The type of the region. |
56 | [RETURNS] Nothing. | 55 | * |
57 | */ | 56 | * Returns nothing. |
57 | */ | ||
58 | static void | ||
59 | amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) | ||
58 | { | 60 | { |
59 | u32 regs[2]; | 61 | u32 regs[2]; |
60 | 62 | ||
61 | /* | 63 | /* |
62 | * Low is MTRR0 , High MTRR 1 | 64 | * Low is MTRR0, High MTRR 1 |
63 | */ | 65 | */ |
64 | rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); | 66 | rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); |
65 | /* | 67 | /* |
66 | * Blank to disable | 68 | * Blank to disable |
67 | */ | 69 | */ |
68 | if (size == 0) | 70 | if (size == 0) { |
69 | regs[reg] = 0; | 71 | regs[reg] = 0; |
70 | else | 72 | } else { |
71 | /* Set the register to the base, the type (off by one) and an | 73 | /* |
72 | inverted bitmask of the size The size is the only odd | 74 | * Set the register to the base, the type (off by one) and an |
73 | bit. We are fed say 512K We invert this and we get 111 1111 | 75 | * inverted bitmask of the size The size is the only odd |
74 | 1111 1011 but if you subtract one and invert you get the | 76 | * bit. We are fed say 512K We invert this and we get 111 1111 |
75 | desired 111 1111 1111 1100 mask | 77 | * 1111 1011 but if you subtract one and invert you get the |
76 | 78 | * desired 111 1111 1111 1100 mask | |
77 | But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ | 79 | * |
80 | * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! | ||
81 | */ | ||
78 | regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) | 82 | regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) |
79 | | (base << PAGE_SHIFT) | (type + 1); | 83 | | (base << PAGE_SHIFT) | (type + 1); |
84 | } | ||
80 | 85 | ||
81 | /* | 86 | /* |
82 | * The writeback rule is quite specific. See the manual. Its | 87 | * The writeback rule is quite specific. See the manual. Its |
83 | * disable local interrupts, write back the cache, set the mtrr | 88 | * disable local interrupts, write back the cache, set the mtrr |
84 | */ | 89 | */ |
85 | wbinvd(); | 90 | wbinvd(); |
86 | wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); | 91 | wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); |
87 | } | 92 | } |
88 | 93 | ||
89 | static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) | 94 | static int |
95 | amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) | ||
90 | { | 96 | { |
91 | /* Apply the K6 block alignment and size rules | 97 | /* |
92 | In order | 98 | * Apply the K6 block alignment and size rules |
93 | o Uncached or gathering only | 99 | * In order |
94 | o 128K or bigger block | 100 | * o Uncached or gathering only |
95 | o Power of 2 block | 101 | * o 128K or bigger block |
96 | o base suitably aligned to the power | 102 | * o Power of 2 block |
97 | */ | 103 | * o base suitably aligned to the power |
104 | */ | ||
98 | if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) | 105 | if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) |
99 | || (size & ~(size - 1)) - size || (base & (size - 1))) | 106 | || (size & ~(size - 1)) - size || (base & (size - 1))) |
100 | return -EINVAL; | 107 | return -EINVAL; |
@@ -115,5 +122,3 @@ int __init amd_init_mtrr(void) | |||
115 | set_mtrr_ops(&amd_mtrr_ops); | 122 | set_mtrr_ops(&amd_mtrr_ops); |
116 | return 0; | 123 | return 0; |
117 | } | 124 | } |
118 | |||
119 | //arch_initcall(amd_mtrr_init); | ||
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index cb9aa3a7a7a..de89f14eff3 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c | |||
@@ -1,7 +1,9 @@ | |||
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <linux/mm.h> | 2 | #include <linux/mm.h> |
3 | |||
3 | #include <asm/mtrr.h> | 4 | #include <asm/mtrr.h> |
4 | #include <asm/msr.h> | 5 | #include <asm/msr.h> |
6 | |||
5 | #include "mtrr.h" | 7 | #include "mtrr.h" |
6 | 8 | ||
7 | static struct { | 9 | static struct { |
@@ -12,25 +14,25 @@ static struct { | |||
12 | static u8 centaur_mcr_reserved; | 14 | static u8 centaur_mcr_reserved; |
13 | static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ | 15 | static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ |
14 | 16 | ||
15 | /* | 17 | /** |
16 | * Report boot time MCR setups | 18 | * centaur_get_free_region - Get a free MTRR. |
19 | * | ||
20 | * @base: The starting (base) address of the region. | ||
21 | * @size: The size (in bytes) of the region. | ||
22 | * | ||
23 | * Returns: the index of the region on success, else -1 on error. | ||
17 | */ | 24 | */ |
18 | |||
19 | static int | 25 | static int |
20 | centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) | 26 | centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) |
21 | /* [SUMMARY] Get a free MTRR. | ||
22 | <base> The starting (base) address of the region. | ||
23 | <size> The size (in bytes) of the region. | ||
24 | [RETURNS] The index of the region on success, else -1 on error. | ||
25 | */ | ||
26 | { | 27 | { |
27 | int i, max; | ||
28 | mtrr_type ltype; | ||
29 | unsigned long lbase, lsize; | 28 | unsigned long lbase, lsize; |
29 | mtrr_type ltype; | ||
30 | int i, max; | ||
30 | 31 | ||
31 | max = num_var_ranges; | 32 | max = num_var_ranges; |
32 | if (replace_reg >= 0 && replace_reg < max) | 33 | if (replace_reg >= 0 && replace_reg < max) |
33 | return replace_reg; | 34 | return replace_reg; |
35 | |||
34 | for (i = 0; i < max; ++i) { | 36 | for (i = 0; i < max; ++i) { |
35 | if (centaur_mcr_reserved & (1 << i)) | 37 | if (centaur_mcr_reserved & (1 << i)) |
36 | continue; | 38 | continue; |
@@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) | |||
38 | if (lsize == 0) | 40 | if (lsize == 0) |
39 | return i; | 41 | return i; |
40 | } | 42 | } |
43 | |||
41 | return -ENOSPC; | 44 | return -ENOSPC; |
42 | } | 45 | } |
43 | 46 | ||
44 | void | 47 | /* |
45 | mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) | 48 | * Report boot time MCR setups |
49 | */ | ||
50 | void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) | ||
46 | { | 51 | { |
47 | centaur_mcr[mcr].low = lo; | 52 | centaur_mcr[mcr].low = lo; |
48 | centaur_mcr[mcr].high = hi; | 53 | centaur_mcr[mcr].high = hi; |
@@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base, | |||
54 | { | 59 | { |
55 | *base = centaur_mcr[reg].high >> PAGE_SHIFT; | 60 | *base = centaur_mcr[reg].high >> PAGE_SHIFT; |
56 | *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; | 61 | *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; |
57 | *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ | 62 | *type = MTRR_TYPE_WRCOMB; /* write-combining */ |
63 | |||
58 | if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) | 64 | if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) |
59 | *type = MTRR_TYPE_UNCACHABLE; | 65 | *type = MTRR_TYPE_UNCACHABLE; |
60 | if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) | 66 | if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) |
61 | *type = MTRR_TYPE_WRBACK; | 67 | *type = MTRR_TYPE_WRBACK; |
62 | if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) | 68 | if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) |
63 | *type = MTRR_TYPE_WRBACK; | 69 | *type = MTRR_TYPE_WRBACK; |
64 | |||
65 | } | 70 | } |
66 | 71 | ||
67 | static void centaur_set_mcr(unsigned int reg, unsigned long base, | 72 | static void |
68 | unsigned long size, mtrr_type type) | 73 | centaur_set_mcr(unsigned int reg, unsigned long base, |
74 | unsigned long size, mtrr_type type) | ||
69 | { | 75 | { |
70 | unsigned long low, high; | 76 | unsigned long low, high; |
71 | 77 | ||
72 | if (size == 0) { | 78 | if (size == 0) { |
73 | /* Disable */ | 79 | /* Disable */ |
74 | high = low = 0; | 80 | high = low = 0; |
75 | } else { | 81 | } else { |
76 | high = base << PAGE_SHIFT; | 82 | high = base << PAGE_SHIFT; |
77 | if (centaur_mcr_type == 0) | 83 | if (centaur_mcr_type == 0) { |
78 | low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ | 84 | /* Only support write-combining... */ |
79 | else { | 85 | low = -size << PAGE_SHIFT | 0x1f; |
86 | } else { | ||
80 | if (type == MTRR_TYPE_UNCACHABLE) | 87 | if (type == MTRR_TYPE_UNCACHABLE) |
81 | low = -size << PAGE_SHIFT | 0x02; /* NC */ | 88 | low = -size << PAGE_SHIFT | 0x02; /* NC */ |
82 | else | 89 | else |
83 | low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ | 90 | low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ |
84 | } | 91 | } |
85 | } | 92 | } |
86 | centaur_mcr[reg].high = high; | 93 | centaur_mcr[reg].high = high; |
@@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base, | |||
88 | wrmsr(MSR_IDT_MCR0 + reg, low, high); | 95 | wrmsr(MSR_IDT_MCR0 + reg, low, high); |
89 | } | 96 | } |
90 | 97 | ||
91 | #if 0 | 98 | static int |
92 | /* | 99 | centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) |
93 | * Initialise the later (saner) Winchip MCR variant. In this version | ||
94 | * the BIOS can pass us the registers it has used (but not their values) | ||
95 | * and the control register is read/write | ||
96 | */ | ||
97 | |||
98 | static void __init | ||
99 | centaur_mcr1_init(void) | ||
100 | { | ||
101 | unsigned i; | ||
102 | u32 lo, hi; | ||
103 | |||
104 | /* Unfortunately, MCR's are read-only, so there is no way to | ||
105 | * find out what the bios might have done. | ||
106 | */ | ||
107 | |||
108 | rdmsr(MSR_IDT_MCR_CTRL, lo, hi); | ||
109 | if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ | ||
110 | lo &= ~0x1C0; /* clear key */ | ||
111 | lo |= 0x040; /* set key to 1 */ | ||
112 | wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ | ||
113 | } | ||
114 | |||
115 | centaur_mcr_type = 1; | ||
116 | |||
117 | /* | ||
118 | * Clear any unconfigured MCR's. | ||
119 | */ | ||
120 | |||
121 | for (i = 0; i < 8; ++i) { | ||
122 | if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { | ||
123 | if (!(lo & (1 << (9 + i)))) | ||
124 | wrmsr(MSR_IDT_MCR0 + i, 0, 0); | ||
125 | else | ||
126 | /* | ||
127 | * If the BIOS set up an MCR we cannot see it | ||
128 | * but we don't wish to obliterate it | ||
129 | */ | ||
130 | centaur_mcr_reserved |= (1 << i); | ||
131 | } | ||
132 | } | ||
133 | /* | ||
134 | * Throw the main write-combining switch... | ||
135 | * However if OOSTORE is enabled then people have already done far | ||
136 | * cleverer things and we should behave. | ||
137 | */ | ||
138 | |||
139 | lo |= 15; /* Write combine enables */ | ||
140 | wrmsr(MSR_IDT_MCR_CTRL, lo, hi); | ||
141 | } | ||
142 | |||
143 | /* | ||
144 | * Initialise the original winchip with read only MCR registers | ||
145 | * no used bitmask for the BIOS to pass on and write only control | ||
146 | */ | ||
147 | |||
148 | static void __init | ||
149 | centaur_mcr0_init(void) | ||
150 | { | ||
151 | unsigned i; | ||
152 | |||
153 | /* Unfortunately, MCR's are read-only, so there is no way to | ||
154 | * find out what the bios might have done. | ||
155 | */ | ||
156 | |||
157 | /* Clear any unconfigured MCR's. | ||
158 | * This way we are sure that the centaur_mcr array contains the actual | ||
159 | * values. The disadvantage is that any BIOS tweaks are thus undone. | ||
160 | * | ||
161 | */ | ||
162 | for (i = 0; i < 8; ++i) { | ||
163 | if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) | ||
164 | wrmsr(MSR_IDT_MCR0 + i, 0, 0); | ||
165 | } | ||
166 | |||
167 | wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * Initialise Winchip series MCR registers | ||
172 | */ | ||
173 | |||
174 | static void __init | ||
175 | centaur_mcr_init(void) | ||
176 | { | ||
177 | struct set_mtrr_context ctxt; | ||
178 | |||
179 | set_mtrr_prepare_save(&ctxt); | ||
180 | set_mtrr_cache_disable(&ctxt); | ||
181 | |||
182 | if (boot_cpu_data.x86_model == 4) | ||
183 | centaur_mcr0_init(); | ||
184 | else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) | ||
185 | centaur_mcr1_init(); | ||
186 | |||
187 | set_mtrr_done(&ctxt); | ||
188 | } | ||
189 | #endif | ||
190 | |||
191 | static int centaur_validate_add_page(unsigned long base, | ||
192 | unsigned long size, unsigned int type) | ||
193 | { | 100 | { |
194 | /* | 101 | /* |
195 | * FIXME: Winchip2 supports uncached | 102 | * FIXME: Winchip2 supports uncached |
196 | */ | 103 | */ |
197 | if (type != MTRR_TYPE_WRCOMB && | 104 | if (type != MTRR_TYPE_WRCOMB && |
198 | (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { | 105 | (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { |
199 | printk(KERN_WARNING | 106 | pr_warning("mtrr: only write-combining%s supported\n", |
200 | "mtrr: only write-combining%s supported\n", | 107 | centaur_mcr_type ? " and uncacheable are" : " is"); |
201 | centaur_mcr_type ? " and uncacheable are" | ||
202 | : " is"); | ||
203 | return -EINVAL; | 108 | return -EINVAL; |
204 | } | 109 | } |
205 | return 0; | 110 | return 0; |
@@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base, | |||
207 | 112 | ||
208 | static struct mtrr_ops centaur_mtrr_ops = { | 113 | static struct mtrr_ops centaur_mtrr_ops = { |
209 | .vendor = X86_VENDOR_CENTAUR, | 114 | .vendor = X86_VENDOR_CENTAUR, |
210 | // .init = centaur_mcr_init, | ||
211 | .set = centaur_set_mcr, | 115 | .set = centaur_set_mcr, |
212 | .get = centaur_get_mcr, | 116 | .get = centaur_get_mcr, |
213 | .get_free_region = centaur_get_free_region, | 117 | .get_free_region = centaur_get_free_region, |
@@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void) | |||
220 | set_mtrr_ops(¢aur_mtrr_ops); | 124 | set_mtrr_ops(¢aur_mtrr_ops); |
221 | return 0; | 125 | return 0; |
222 | } | 126 | } |
223 | |||
224 | //arch_initcall(centaur_init_mtrr); | ||
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 1d584a18a50..315738c74aa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c | |||
@@ -1,51 +1,75 @@ | |||
1 | /* MTRR (Memory Type Range Register) cleanup | 1 | /* |
2 | 2 | * MTRR (Memory Type Range Register) cleanup | |
3 | Copyright (C) 2009 Yinghai Lu | 3 | * |
4 | 4 | * Copyright (C) 2009 Yinghai Lu | |
5 | This library is free software; you can redistribute it and/or | 5 | * |
6 | modify it under the terms of the GNU Library General Public | 6 | * This library is free software; you can redistribute it and/or |
7 | License as published by the Free Software Foundation; either | 7 | * modify it under the terms of the GNU Library General Public |
8 | version 2 of the License, or (at your option) any later version. | 8 | * License as published by the Free Software Foundation; either |
9 | 9 | * version 2 of the License, or (at your option) any later version. | |
10 | This library is distributed in the hope that it will be useful, | 10 | * |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 | * This library is distributed in the hope that it will be useful, |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | Library General Public License for more details. | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | 14 | * Library General Public License for more details. | |
15 | You should have received a copy of the GNU Library General Public | 15 | * |
16 | License along with this library; if not, write to the Free | 16 | * You should have received a copy of the GNU Library General Public |
17 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 17 | * License along with this library; if not, write to the Free |
18 | */ | 18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | 19 | */ | |
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/init.h> | 21 | #include <linux/init.h> |
22 | #include <linux/pci.h> | 22 | #include <linux/pci.h> |
23 | #include <linux/smp.h> | 23 | #include <linux/smp.h> |
24 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
25 | #include <linux/mutex.h> | ||
26 | #include <linux/sort.h> | 25 | #include <linux/sort.h> |
26 | #include <linux/mutex.h> | ||
27 | #include <linux/uaccess.h> | ||
28 | #include <linux/kvm_para.h> | ||
27 | 29 | ||
30 | #include <asm/processor.h> | ||
28 | #include <asm/e820.h> | 31 | #include <asm/e820.h> |
29 | #include <asm/mtrr.h> | 32 | #include <asm/mtrr.h> |
30 | #include <asm/uaccess.h> | ||
31 | #include <asm/processor.h> | ||
32 | #include <asm/msr.h> | 33 | #include <asm/msr.h> |
33 | #include <asm/kvm_para.h> | ||
34 | #include "mtrr.h" | ||
35 | 34 | ||
36 | /* should be related to MTRR_VAR_RANGES nums */ | 35 | #include "mtrr.h" |
37 | #define RANGE_NUM 256 | ||
38 | 36 | ||
39 | struct res_range { | 37 | struct res_range { |
40 | unsigned long start; | 38 | unsigned long start; |
41 | unsigned long end; | 39 | unsigned long end; |
40 | }; | ||
41 | |||
42 | struct var_mtrr_range_state { | ||
43 | unsigned long base_pfn; | ||
44 | unsigned long size_pfn; | ||
45 | mtrr_type type; | ||
46 | }; | ||
47 | |||
48 | struct var_mtrr_state { | ||
49 | unsigned long range_startk; | ||
50 | unsigned long range_sizek; | ||
51 | unsigned long chunk_sizek; | ||
52 | unsigned long gran_sizek; | ||
53 | unsigned int reg; | ||
42 | }; | 54 | }; |
43 | 55 | ||
56 | /* Should be related to MTRR_VAR_RANGES nums */ | ||
57 | #define RANGE_NUM 256 | ||
58 | |||
59 | static struct res_range __initdata range[RANGE_NUM]; | ||
60 | static int __initdata nr_range; | ||
61 | |||
62 | static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; | ||
63 | |||
64 | static int __initdata debug_print; | ||
65 | #define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) | ||
66 | |||
67 | |||
44 | static int __init | 68 | static int __init |
45 | add_range(struct res_range *range, int nr_range, unsigned long start, | 69 | add_range(struct res_range *range, int nr_range, |
46 | unsigned long end) | 70 | unsigned long start, unsigned long end) |
47 | { | 71 | { |
48 | /* out of slots */ | 72 | /* Out of slots: */ |
49 | if (nr_range >= RANGE_NUM) | 73 | if (nr_range >= RANGE_NUM) |
50 | return nr_range; | 74 | return nr_range; |
51 | 75 | ||
@@ -58,12 +82,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start, | |||
58 | } | 82 | } |
59 | 83 | ||
60 | static int __init | 84 | static int __init |
61 | add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, | 85 | add_range_with_merge(struct res_range *range, int nr_range, |
62 | unsigned long end) | 86 | unsigned long start, unsigned long end) |
63 | { | 87 | { |
64 | int i; | 88 | int i; |
65 | 89 | ||
66 | /* try to merge it with old one */ | 90 | /* Try to merge it with old one: */ |
67 | for (i = 0; i < nr_range; i++) { | 91 | for (i = 0; i < nr_range; i++) { |
68 | unsigned long final_start, final_end; | 92 | unsigned long final_start, final_end; |
69 | unsigned long common_start, common_end; | 93 | unsigned long common_start, common_end; |
@@ -84,7 +108,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, | |||
84 | return nr_range; | 108 | return nr_range; |
85 | } | 109 | } |
86 | 110 | ||
87 | /* need to add that */ | 111 | /* Need to add it: */ |
88 | return add_range(range, nr_range, start, end); | 112 | return add_range(range, nr_range, start, end); |
89 | } | 113 | } |
90 | 114 | ||
@@ -117,7 +141,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) | |||
117 | } | 141 | } |
118 | 142 | ||
119 | if (start > range[j].start && end < range[j].end) { | 143 | if (start > range[j].start && end < range[j].end) { |
120 | /* find the new spare */ | 144 | /* Find the new spare: */ |
121 | for (i = 0; i < RANGE_NUM; i++) { | 145 | for (i = 0; i < RANGE_NUM; i++) { |
122 | if (range[i].end == 0) | 146 | if (range[i].end == 0) |
123 | break; | 147 | break; |
@@ -146,14 +170,8 @@ static int __init cmp_range(const void *x1, const void *x2) | |||
146 | return start1 - start2; | 170 | return start1 - start2; |
147 | } | 171 | } |
148 | 172 | ||
149 | struct var_mtrr_range_state { | 173 | #define BIOS_BUG_MSG KERN_WARNING \ |
150 | unsigned long base_pfn; | 174 | "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" |
151 | unsigned long size_pfn; | ||
152 | mtrr_type type; | ||
153 | }; | ||
154 | |||
155 | static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; | ||
156 | static int __initdata debug_print; | ||
157 | 175 | ||
158 | static int __init | 176 | static int __init |
159 | x86_get_mtrr_mem_range(struct res_range *range, int nr_range, | 177 | x86_get_mtrr_mem_range(struct res_range *range, int nr_range, |
@@ -180,7 +198,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, | |||
180 | range[i].start, range[i].end + 1); | 198 | range[i].start, range[i].end + 1); |
181 | } | 199 | } |
182 | 200 | ||
183 | /* take out UC ranges */ | 201 | /* Take out UC ranges: */ |
184 | for (i = 0; i < num_var_ranges; i++) { | 202 | for (i = 0; i < num_var_ranges; i++) { |
185 | type = range_state[i].type; | 203 | type = range_state[i].type; |
186 | if (type != MTRR_TYPE_UNCACHABLE && | 204 | if (type != MTRR_TYPE_UNCACHABLE && |
@@ -193,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, | |||
193 | if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && | 211 | if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && |
194 | (mtrr_state.enabled & 1)) { | 212 | (mtrr_state.enabled & 1)) { |
195 | /* Var MTRR contains UC entry below 1M? Skip it: */ | 213 | /* Var MTRR contains UC entry below 1M? Skip it: */ |
196 | printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " | 214 | printk(BIOS_BUG_MSG, i); |
197 | "contains strange UC entry under 1M, check " | ||
198 | "with your system vendor!\n", i); | ||
199 | if (base + size <= (1<<(20-PAGE_SHIFT))) | 215 | if (base + size <= (1<<(20-PAGE_SHIFT))) |
200 | continue; | 216 | continue; |
201 | size -= (1<<(20-PAGE_SHIFT)) - base; | 217 | size -= (1<<(20-PAGE_SHIFT)) - base; |
@@ -237,17 +253,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, | |||
237 | return nr_range; | 253 | return nr_range; |
238 | } | 254 | } |
239 | 255 | ||
240 | static struct res_range __initdata range[RANGE_NUM]; | ||
241 | static int __initdata nr_range; | ||
242 | |||
243 | #ifdef CONFIG_MTRR_SANITIZER | 256 | #ifdef CONFIG_MTRR_SANITIZER |
244 | 257 | ||
245 | static unsigned long __init sum_ranges(struct res_range *range, int nr_range) | 258 | static unsigned long __init sum_ranges(struct res_range *range, int nr_range) |
246 | { | 259 | { |
247 | unsigned long sum; | 260 | unsigned long sum = 0; |
248 | int i; | 261 | int i; |
249 | 262 | ||
250 | sum = 0; | ||
251 | for (i = 0; i < nr_range; i++) | 263 | for (i = 0; i < nr_range; i++) |
252 | sum += range[i].end + 1 - range[i].start; | 264 | sum += range[i].end + 1 - range[i].start; |
253 | 265 | ||
@@ -278,17 +290,9 @@ static int __init mtrr_cleanup_debug_setup(char *str) | |||
278 | } | 290 | } |
279 | early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); | 291 | early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); |
280 | 292 | ||
281 | struct var_mtrr_state { | ||
282 | unsigned long range_startk; | ||
283 | unsigned long range_sizek; | ||
284 | unsigned long chunk_sizek; | ||
285 | unsigned long gran_sizek; | ||
286 | unsigned int reg; | ||
287 | }; | ||
288 | |||
289 | static void __init | 293 | static void __init |
290 | set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, | 294 | set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, |
291 | unsigned char type, unsigned int address_bits) | 295 | unsigned char type, unsigned int address_bits) |
292 | { | 296 | { |
293 | u32 base_lo, base_hi, mask_lo, mask_hi; | 297 | u32 base_lo, base_hi, mask_lo, mask_hi; |
294 | u64 base, mask; | 298 | u64 base, mask; |
@@ -301,7 +305,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, | |||
301 | mask = (1ULL << address_bits) - 1; | 305 | mask = (1ULL << address_bits) - 1; |
302 | mask &= ~((((u64)sizek) << 10) - 1); | 306 | mask &= ~((((u64)sizek) << 10) - 1); |
303 | 307 | ||
304 | base = ((u64)basek) << 10; | 308 | base = ((u64)basek) << 10; |
305 | 309 | ||
306 | base |= type; | 310 | base |= type; |
307 | mask |= 0x800; | 311 | mask |= 0x800; |
@@ -317,15 +321,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, | |||
317 | 321 | ||
318 | static void __init | 322 | static void __init |
319 | save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, | 323 | save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, |
320 | unsigned char type) | 324 | unsigned char type) |
321 | { | 325 | { |
322 | range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); | 326 | range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); |
323 | range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); | 327 | range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); |
324 | range_state[reg].type = type; | 328 | range_state[reg].type = type; |
325 | } | 329 | } |
326 | 330 | ||
327 | static void __init | 331 | static void __init set_var_mtrr_all(unsigned int address_bits) |
328 | set_var_mtrr_all(unsigned int address_bits) | ||
329 | { | 332 | { |
330 | unsigned long basek, sizek; | 333 | unsigned long basek, sizek; |
331 | unsigned char type; | 334 | unsigned char type; |
@@ -342,11 +345,11 @@ set_var_mtrr_all(unsigned int address_bits) | |||
342 | 345 | ||
343 | static unsigned long to_size_factor(unsigned long sizek, char *factorp) | 346 | static unsigned long to_size_factor(unsigned long sizek, char *factorp) |
344 | { | 347 | { |
345 | char factor; | ||
346 | unsigned long base = sizek; | 348 | unsigned long base = sizek; |
349 | char factor; | ||
347 | 350 | ||
348 | if (base & ((1<<10) - 1)) { | 351 | if (base & ((1<<10) - 1)) { |
349 | /* not MB alignment */ | 352 | /* Not MB-aligned: */ |
350 | factor = 'K'; | 353 | factor = 'K'; |
351 | } else if (base & ((1<<20) - 1)) { | 354 | } else if (base & ((1<<20) - 1)) { |
352 | factor = 'M'; | 355 | factor = 'M'; |
@@ -372,11 +375,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, | |||
372 | unsigned long max_align, align; | 375 | unsigned long max_align, align; |
373 | unsigned long sizek; | 376 | unsigned long sizek; |
374 | 377 | ||
375 | /* Compute the maximum size I can make a range */ | 378 | /* Compute the maximum size with which we can make a range: */ |
376 | if (range_startk) | 379 | if (range_startk) |
377 | max_align = ffs(range_startk) - 1; | 380 | max_align = ffs(range_startk) - 1; |
378 | else | 381 | else |
379 | max_align = 32; | 382 | max_align = 32; |
383 | |||
380 | align = fls(range_sizek) - 1; | 384 | align = fls(range_sizek) - 1; |
381 | if (align > max_align) | 385 | if (align > max_align) |
382 | align = max_align; | 386 | align = max_align; |
@@ -386,11 +390,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, | |||
386 | char start_factor = 'K', size_factor = 'K'; | 390 | char start_factor = 'K', size_factor = 'K'; |
387 | unsigned long start_base, size_base; | 391 | unsigned long start_base, size_base; |
388 | 392 | ||
389 | start_base = to_size_factor(range_startk, | 393 | start_base = to_size_factor(range_startk, &start_factor); |
390 | &start_factor), | 394 | size_base = to_size_factor(sizek, &size_factor); |
391 | size_base = to_size_factor(sizek, &size_factor), | ||
392 | 395 | ||
393 | printk(KERN_DEBUG "Setting variable MTRR %d, " | 396 | Dprintk("Setting variable MTRR %d, " |
394 | "base: %ld%cB, range: %ld%cB, type %s\n", | 397 | "base: %ld%cB, range: %ld%cB, type %s\n", |
395 | reg, start_base, start_factor, | 398 | reg, start_base, start_factor, |
396 | size_base, size_factor, | 399 | size_base, size_factor, |
@@ -425,10 +428,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | |||
425 | chunk_sizek = state->chunk_sizek; | 428 | chunk_sizek = state->chunk_sizek; |
426 | gran_sizek = state->gran_sizek; | 429 | gran_sizek = state->gran_sizek; |
427 | 430 | ||
428 | /* align with gran size, prevent small block used up MTRRs */ | 431 | /* Align with gran size, prevent small block used up MTRRs: */ |
429 | range_basek = ALIGN(state->range_startk, gran_sizek); | 432 | range_basek = ALIGN(state->range_startk, gran_sizek); |
430 | if ((range_basek > basek) && basek) | 433 | if ((range_basek > basek) && basek) |
431 | return second_sizek; | 434 | return second_sizek; |
435 | |||
432 | state->range_sizek -= (range_basek - state->range_startk); | 436 | state->range_sizek -= (range_basek - state->range_startk); |
433 | range_sizek = ALIGN(state->range_sizek, gran_sizek); | 437 | range_sizek = ALIGN(state->range_sizek, gran_sizek); |
434 | 438 | ||
@@ -439,22 +443,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | |||
439 | } | 443 | } |
440 | state->range_sizek = range_sizek; | 444 | state->range_sizek = range_sizek; |
441 | 445 | ||
442 | /* try to append some small hole */ | 446 | /* Try to append some small hole: */ |
443 | range0_basek = state->range_startk; | 447 | range0_basek = state->range_startk; |
444 | range0_sizek = ALIGN(state->range_sizek, chunk_sizek); | 448 | range0_sizek = ALIGN(state->range_sizek, chunk_sizek); |
445 | 449 | ||
446 | /* no increase */ | 450 | /* No increase: */ |
447 | if (range0_sizek == state->range_sizek) { | 451 | if (range0_sizek == state->range_sizek) { |
448 | if (debug_print) | 452 | Dprintk("rangeX: %016lx - %016lx\n", |
449 | printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", | 453 | range0_basek<<10, |
450 | range0_basek<<10, | 454 | (range0_basek + state->range_sizek)<<10); |
451 | (range0_basek + state->range_sizek)<<10); | ||
452 | state->reg = range_to_mtrr(state->reg, range0_basek, | 455 | state->reg = range_to_mtrr(state->reg, range0_basek, |
453 | state->range_sizek, MTRR_TYPE_WRBACK); | 456 | state->range_sizek, MTRR_TYPE_WRBACK); |
454 | return 0; | 457 | return 0; |
455 | } | 458 | } |
456 | 459 | ||
457 | /* only cut back, when it is not the last */ | 460 | /* Only cut back when it is not the last: */ |
458 | if (sizek) { | 461 | if (sizek) { |
459 | while (range0_basek + range0_sizek > (basek + sizek)) { | 462 | while (range0_basek + range0_sizek > (basek + sizek)) { |
460 | if (range0_sizek >= chunk_sizek) | 463 | if (range0_sizek >= chunk_sizek) |
@@ -470,16 +473,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | |||
470 | second_try: | 473 | second_try: |
471 | range_basek = range0_basek + range0_sizek; | 474 | range_basek = range0_basek + range0_sizek; |
472 | 475 | ||
473 | /* one hole in the middle */ | 476 | /* One hole in the middle: */ |
474 | if (range_basek > basek && range_basek <= (basek + sizek)) | 477 | if (range_basek > basek && range_basek <= (basek + sizek)) |
475 | second_sizek = range_basek - basek; | 478 | second_sizek = range_basek - basek; |
476 | 479 | ||
477 | if (range0_sizek > state->range_sizek) { | 480 | if (range0_sizek > state->range_sizek) { |
478 | 481 | ||
479 | /* one hole in middle or at end */ | 482 | /* One hole in middle or at the end: */ |
480 | hole_sizek = range0_sizek - state->range_sizek - second_sizek; | 483 | hole_sizek = range0_sizek - state->range_sizek - second_sizek; |
481 | 484 | ||
482 | /* hole size should be less than half of range0 size */ | 485 | /* Hole size should be less than half of range0 size: */ |
483 | if (hole_sizek >= (range0_sizek >> 1) && | 486 | if (hole_sizek >= (range0_sizek >> 1) && |
484 | range0_sizek >= chunk_sizek) { | 487 | range0_sizek >= chunk_sizek) { |
485 | range0_sizek -= chunk_sizek; | 488 | range0_sizek -= chunk_sizek; |
@@ -491,32 +494,30 @@ second_try: | |||
491 | } | 494 | } |
492 | 495 | ||
493 | if (range0_sizek) { | 496 | if (range0_sizek) { |
494 | if (debug_print) | 497 | Dprintk("range0: %016lx - %016lx\n", |
495 | printk(KERN_DEBUG "range0: %016lx - %016lx\n", | 498 | range0_basek<<10, |
496 | range0_basek<<10, | 499 | (range0_basek + range0_sizek)<<10); |
497 | (range0_basek + range0_sizek)<<10); | ||
498 | state->reg = range_to_mtrr(state->reg, range0_basek, | 500 | state->reg = range_to_mtrr(state->reg, range0_basek, |
499 | range0_sizek, MTRR_TYPE_WRBACK); | 501 | range0_sizek, MTRR_TYPE_WRBACK); |
500 | } | 502 | } |
501 | 503 | ||
502 | if (range0_sizek < state->range_sizek) { | 504 | if (range0_sizek < state->range_sizek) { |
503 | /* need to handle left over */ | 505 | /* Need to handle left over range: */ |
504 | range_sizek = state->range_sizek - range0_sizek; | 506 | range_sizek = state->range_sizek - range0_sizek; |
505 | 507 | ||
506 | if (debug_print) | 508 | Dprintk("range: %016lx - %016lx\n", |
507 | printk(KERN_DEBUG "range: %016lx - %016lx\n", | 509 | range_basek<<10, |
508 | range_basek<<10, | 510 | (range_basek + range_sizek)<<10); |
509 | (range_basek + range_sizek)<<10); | 511 | |
510 | state->reg = range_to_mtrr(state->reg, range_basek, | 512 | state->reg = range_to_mtrr(state->reg, range_basek, |
511 | range_sizek, MTRR_TYPE_WRBACK); | 513 | range_sizek, MTRR_TYPE_WRBACK); |
512 | } | 514 | } |
513 | 515 | ||
514 | if (hole_sizek) { | 516 | if (hole_sizek) { |
515 | hole_basek = range_basek - hole_sizek - second_sizek; | 517 | hole_basek = range_basek - hole_sizek - second_sizek; |
516 | if (debug_print) | 518 | Dprintk("hole: %016lx - %016lx\n", |
517 | printk(KERN_DEBUG "hole: %016lx - %016lx\n", | 519 | hole_basek<<10, |
518 | hole_basek<<10, | 520 | (hole_basek + hole_sizek)<<10); |
519 | (hole_basek + hole_sizek)<<10); | ||
520 | state->reg = range_to_mtrr(state->reg, hole_basek, | 521 | state->reg = range_to_mtrr(state->reg, hole_basek, |
521 | hole_sizek, MTRR_TYPE_UNCACHABLE); | 522 | hole_sizek, MTRR_TYPE_UNCACHABLE); |
522 | } | 523 | } |
@@ -537,23 +538,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, | |||
537 | basek = base_pfn << (PAGE_SHIFT - 10); | 538 | basek = base_pfn << (PAGE_SHIFT - 10); |
538 | sizek = size_pfn << (PAGE_SHIFT - 10); | 539 | sizek = size_pfn << (PAGE_SHIFT - 10); |
539 | 540 | ||
540 | /* See if I can merge with the last range */ | 541 | /* See if I can merge with the last range: */ |
541 | if ((basek <= 1024) || | 542 | if ((basek <= 1024) || |
542 | (state->range_startk + state->range_sizek == basek)) { | 543 | (state->range_startk + state->range_sizek == basek)) { |
543 | unsigned long endk = basek + sizek; | 544 | unsigned long endk = basek + sizek; |
544 | state->range_sizek = endk - state->range_startk; | 545 | state->range_sizek = endk - state->range_startk; |
545 | return; | 546 | return; |
546 | } | 547 | } |
547 | /* Write the range mtrrs */ | 548 | /* Write the range mtrrs: */ |
548 | if (state->range_sizek != 0) | 549 | if (state->range_sizek != 0) |
549 | second_sizek = range_to_mtrr_with_hole(state, basek, sizek); | 550 | second_sizek = range_to_mtrr_with_hole(state, basek, sizek); |
550 | 551 | ||
551 | /* Allocate an msr */ | 552 | /* Allocate an msr: */ |
552 | state->range_startk = basek + second_sizek; | 553 | state->range_startk = basek + second_sizek; |
553 | state->range_sizek = sizek - second_sizek; | 554 | state->range_sizek = sizek - second_sizek; |
554 | } | 555 | } |
555 | 556 | ||
556 | /* mininum size of mtrr block that can take hole */ | 557 | /* Mininum size of mtrr block that can take hole: */ |
557 | static u64 mtrr_chunk_size __initdata = (256ULL<<20); | 558 | static u64 mtrr_chunk_size __initdata = (256ULL<<20); |
558 | 559 | ||
559 | static int __init parse_mtrr_chunk_size_opt(char *p) | 560 | static int __init parse_mtrr_chunk_size_opt(char *p) |
@@ -565,7 +566,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) | |||
565 | } | 566 | } |
566 | early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); | 567 | early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); |
567 | 568 | ||
568 | /* granity of mtrr of block */ | 569 | /* Granularity of mtrr of block: */ |
569 | static u64 mtrr_gran_size __initdata; | 570 | static u64 mtrr_gran_size __initdata; |
570 | 571 | ||
571 | static int __init parse_mtrr_gran_size_opt(char *p) | 572 | static int __init parse_mtrr_gran_size_opt(char *p) |
@@ -577,7 +578,7 @@ static int __init parse_mtrr_gran_size_opt(char *p) | |||
577 | } | 578 | } |
578 | early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); | 579 | early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); |
579 | 580 | ||
580 | static int nr_mtrr_spare_reg __initdata = | 581 | static unsigned long nr_mtrr_spare_reg __initdata = |
581 | CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; | 582 | CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; |
582 | 583 | ||
583 | static int __init parse_mtrr_spare_reg(char *arg) | 584 | static int __init parse_mtrr_spare_reg(char *arg) |
@@ -586,7 +587,6 @@ static int __init parse_mtrr_spare_reg(char *arg) | |||
586 | nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); | 587 | nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); |
587 | return 0; | 588 | return 0; |
588 | } | 589 | } |
589 | |||
590 | early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); | 590 | early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); |
591 | 591 | ||
592 | static int __init | 592 | static int __init |
@@ -594,8 +594,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, | |||
594 | u64 chunk_size, u64 gran_size) | 594 | u64 chunk_size, u64 gran_size) |
595 | { | 595 | { |
596 | struct var_mtrr_state var_state; | 596 | struct var_mtrr_state var_state; |
597 | int i; | ||
598 | int num_reg; | 597 | int num_reg; |
598 | int i; | ||
599 | 599 | ||
600 | var_state.range_startk = 0; | 600 | var_state.range_startk = 0; |
601 | var_state.range_sizek = 0; | 601 | var_state.range_sizek = 0; |
@@ -605,17 +605,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, | |||
605 | 605 | ||
606 | memset(range_state, 0, sizeof(range_state)); | 606 | memset(range_state, 0, sizeof(range_state)); |
607 | 607 | ||
608 | /* Write the range etc */ | 608 | /* Write the range: */ |
609 | for (i = 0; i < nr_range; i++) | 609 | for (i = 0; i < nr_range; i++) { |
610 | set_var_mtrr_range(&var_state, range[i].start, | 610 | set_var_mtrr_range(&var_state, range[i].start, |
611 | range[i].end - range[i].start + 1); | 611 | range[i].end - range[i].start + 1); |
612 | } | ||
612 | 613 | ||
613 | /* Write the last range */ | 614 | /* Write the last range: */ |
614 | if (var_state.range_sizek != 0) | 615 | if (var_state.range_sizek != 0) |
615 | range_to_mtrr_with_hole(&var_state, 0, 0); | 616 | range_to_mtrr_with_hole(&var_state, 0, 0); |
616 | 617 | ||
617 | num_reg = var_state.reg; | 618 | num_reg = var_state.reg; |
618 | /* Clear out the extra MTRR's */ | 619 | /* Clear out the extra MTRR's: */ |
619 | while (var_state.reg < num_var_ranges) { | 620 | while (var_state.reg < num_var_ranges) { |
620 | save_var_mtrr(var_state.reg, 0, 0, 0); | 621 | save_var_mtrr(var_state.reg, 0, 0, 0); |
621 | var_state.reg++; | 622 | var_state.reg++; |
@@ -625,11 +626,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, | |||
625 | } | 626 | } |
626 | 627 | ||
627 | struct mtrr_cleanup_result { | 628 | struct mtrr_cleanup_result { |
628 | unsigned long gran_sizek; | 629 | unsigned long gran_sizek; |
629 | unsigned long chunk_sizek; | 630 | unsigned long chunk_sizek; |
630 | unsigned long lose_cover_sizek; | 631 | unsigned long lose_cover_sizek; |
631 | unsigned int num_reg; | 632 | unsigned int num_reg; |
632 | int bad; | 633 | int bad; |
633 | }; | 634 | }; |
634 | 635 | ||
635 | /* | 636 | /* |
@@ -645,10 +646,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; | |||
645 | 646 | ||
646 | static void __init print_out_mtrr_range_state(void) | 647 | static void __init print_out_mtrr_range_state(void) |
647 | { | 648 | { |
648 | int i; | ||
649 | char start_factor = 'K', size_factor = 'K'; | 649 | char start_factor = 'K', size_factor = 'K'; |
650 | unsigned long start_base, size_base; | 650 | unsigned long start_base, size_base; |
651 | mtrr_type type; | 651 | mtrr_type type; |
652 | int i; | ||
652 | 653 | ||
653 | for (i = 0; i < num_var_ranges; i++) { | 654 | for (i = 0; i < num_var_ranges; i++) { |
654 | 655 | ||
@@ -676,10 +677,10 @@ static int __init mtrr_need_cleanup(void) | |||
676 | int i; | 677 | int i; |
677 | mtrr_type type; | 678 | mtrr_type type; |
678 | unsigned long size; | 679 | unsigned long size; |
679 | /* extra one for all 0 */ | 680 | /* Extra one for all 0: */ |
680 | int num[MTRR_NUM_TYPES + 1]; | 681 | int num[MTRR_NUM_TYPES + 1]; |
681 | 682 | ||
682 | /* check entries number */ | 683 | /* Check entries number: */ |
683 | memset(num, 0, sizeof(num)); | 684 | memset(num, 0, sizeof(num)); |
684 | for (i = 0; i < num_var_ranges; i++) { | 685 | for (i = 0; i < num_var_ranges; i++) { |
685 | type = range_state[i].type; | 686 | type = range_state[i].type; |
@@ -693,88 +694,86 @@ static int __init mtrr_need_cleanup(void) | |||
693 | num[type]++; | 694 | num[type]++; |
694 | } | 695 | } |
695 | 696 | ||
696 | /* check if we got UC entries */ | 697 | /* Check if we got UC entries: */ |
697 | if (!num[MTRR_TYPE_UNCACHABLE]) | 698 | if (!num[MTRR_TYPE_UNCACHABLE]) |
698 | return 0; | 699 | return 0; |
699 | 700 | ||
700 | /* check if we only had WB and UC */ | 701 | /* Check if we only had WB and UC */ |
701 | if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != | 702 | if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != |
702 | num_var_ranges - num[MTRR_NUM_TYPES]) | 703 | num_var_ranges - num[MTRR_NUM_TYPES]) |
703 | return 0; | 704 | return 0; |
704 | 705 | ||
705 | return 1; | 706 | return 1; |
706 | } | 707 | } |
707 | 708 | ||
708 | static unsigned long __initdata range_sums; | 709 | static unsigned long __initdata range_sums; |
709 | static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, | 710 | |
710 | unsigned long extra_remove_base, | 711 | static void __init |
711 | unsigned long extra_remove_size, | 712 | mtrr_calc_range_state(u64 chunk_size, u64 gran_size, |
712 | int i) | 713 | unsigned long x_remove_base, |
714 | unsigned long x_remove_size, int i) | ||
713 | { | 715 | { |
714 | int num_reg; | ||
715 | static struct res_range range_new[RANGE_NUM]; | 716 | static struct res_range range_new[RANGE_NUM]; |
716 | static int nr_range_new; | ||
717 | unsigned long range_sums_new; | 717 | unsigned long range_sums_new; |
718 | static int nr_range_new; | ||
719 | int num_reg; | ||
718 | 720 | ||
719 | /* convert ranges to var ranges state */ | 721 | /* Convert ranges to var ranges state: */ |
720 | num_reg = x86_setup_var_mtrrs(range, nr_range, | 722 | num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); |
721 | chunk_size, gran_size); | ||
722 | 723 | ||
723 | /* we got new setting in range_state, check it */ | 724 | /* We got new setting in range_state, check it: */ |
724 | memset(range_new, 0, sizeof(range_new)); | 725 | memset(range_new, 0, sizeof(range_new)); |
725 | nr_range_new = x86_get_mtrr_mem_range(range_new, 0, | 726 | nr_range_new = x86_get_mtrr_mem_range(range_new, 0, |
726 | extra_remove_base, extra_remove_size); | 727 | x_remove_base, x_remove_size); |
727 | range_sums_new = sum_ranges(range_new, nr_range_new); | 728 | range_sums_new = sum_ranges(range_new, nr_range_new); |
728 | 729 | ||
729 | result[i].chunk_sizek = chunk_size >> 10; | 730 | result[i].chunk_sizek = chunk_size >> 10; |
730 | result[i].gran_sizek = gran_size >> 10; | 731 | result[i].gran_sizek = gran_size >> 10; |
731 | result[i].num_reg = num_reg; | 732 | result[i].num_reg = num_reg; |
733 | |||
732 | if (range_sums < range_sums_new) { | 734 | if (range_sums < range_sums_new) { |
733 | result[i].lose_cover_sizek = | 735 | result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; |
734 | (range_sums_new - range_sums) << PSHIFT; | ||
735 | result[i].bad = 1; | 736 | result[i].bad = 1; |
736 | } else | 737 | } else { |
737 | result[i].lose_cover_sizek = | 738 | result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; |
738 | (range_sums - range_sums_new) << PSHIFT; | 739 | } |
739 | 740 | ||
740 | /* double check it */ | 741 | /* Double check it: */ |
741 | if (!result[i].bad && !result[i].lose_cover_sizek) { | 742 | if (!result[i].bad && !result[i].lose_cover_sizek) { |
742 | if (nr_range_new != nr_range || | 743 | if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) |
743 | memcmp(range, range_new, sizeof(range))) | 744 | result[i].bad = 1; |
744 | result[i].bad = 1; | ||
745 | } | 745 | } |
746 | 746 | ||
747 | if (!result[i].bad && (range_sums - range_sums_new < | 747 | if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) |
748 | min_loss_pfn[num_reg])) { | 748 | min_loss_pfn[num_reg] = range_sums - range_sums_new; |
749 | min_loss_pfn[num_reg] = | ||
750 | range_sums - range_sums_new; | ||
751 | } | ||
752 | } | 749 | } |
753 | 750 | ||
754 | static void __init mtrr_print_out_one_result(int i) | 751 | static void __init mtrr_print_out_one_result(int i) |
755 | { | 752 | { |
756 | char gran_factor, chunk_factor, lose_factor; | ||
757 | unsigned long gran_base, chunk_base, lose_base; | 753 | unsigned long gran_base, chunk_base, lose_base; |
754 | char gran_factor, chunk_factor, lose_factor; | ||
758 | 755 | ||
759 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), | 756 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), |
760 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), | 757 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), |
761 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), | 758 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), |
762 | printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", | 759 | |
763 | result[i].bad ? "*BAD*" : " ", | 760 | pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", |
764 | gran_base, gran_factor, chunk_base, chunk_factor); | 761 | result[i].bad ? "*BAD*" : " ", |
765 | printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", | 762 | gran_base, gran_factor, chunk_base, chunk_factor); |
766 | result[i].num_reg, result[i].bad ? "-" : "", | 763 | pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", |
767 | lose_base, lose_factor); | 764 | result[i].num_reg, result[i].bad ? "-" : "", |
765 | lose_base, lose_factor); | ||
768 | } | 766 | } |
769 | 767 | ||
770 | static int __init mtrr_search_optimal_index(void) | 768 | static int __init mtrr_search_optimal_index(void) |
771 | { | 769 | { |
772 | int i; | ||
773 | int num_reg_good; | 770 | int num_reg_good; |
774 | int index_good; | 771 | int index_good; |
772 | int i; | ||
775 | 773 | ||
776 | if (nr_mtrr_spare_reg >= num_var_ranges) | 774 | if (nr_mtrr_spare_reg >= num_var_ranges) |
777 | nr_mtrr_spare_reg = num_var_ranges - 1; | 775 | nr_mtrr_spare_reg = num_var_ranges - 1; |
776 | |||
778 | num_reg_good = -1; | 777 | num_reg_good = -1; |
779 | for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { | 778 | for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { |
780 | if (!min_loss_pfn[i]) | 779 | if (!min_loss_pfn[i]) |
@@ -796,24 +795,24 @@ static int __init mtrr_search_optimal_index(void) | |||
796 | return index_good; | 795 | return index_good; |
797 | } | 796 | } |
798 | 797 | ||
799 | |||
800 | int __init mtrr_cleanup(unsigned address_bits) | 798 | int __init mtrr_cleanup(unsigned address_bits) |
801 | { | 799 | { |
802 | unsigned long extra_remove_base, extra_remove_size; | 800 | unsigned long x_remove_base, x_remove_size; |
803 | unsigned long base, size, def, dummy; | 801 | unsigned long base, size, def, dummy; |
804 | mtrr_type type; | ||
805 | u64 chunk_size, gran_size; | 802 | u64 chunk_size, gran_size; |
803 | mtrr_type type; | ||
806 | int index_good; | 804 | int index_good; |
807 | int i; | 805 | int i; |
808 | 806 | ||
809 | if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) | 807 | if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) |
810 | return 0; | 808 | return 0; |
809 | |||
811 | rdmsr(MSR_MTRRdefType, def, dummy); | 810 | rdmsr(MSR_MTRRdefType, def, dummy); |
812 | def &= 0xff; | 811 | def &= 0xff; |
813 | if (def != MTRR_TYPE_UNCACHABLE) | 812 | if (def != MTRR_TYPE_UNCACHABLE) |
814 | return 0; | 813 | return 0; |
815 | 814 | ||
816 | /* get it and store it aside */ | 815 | /* Get it and store it aside: */ |
817 | memset(range_state, 0, sizeof(range_state)); | 816 | memset(range_state, 0, sizeof(range_state)); |
818 | for (i = 0; i < num_var_ranges; i++) { | 817 | for (i = 0; i < num_var_ranges; i++) { |
819 | mtrr_if->get(i, &base, &size, &type); | 818 | mtrr_if->get(i, &base, &size, &type); |
@@ -822,29 +821,28 @@ int __init mtrr_cleanup(unsigned address_bits) | |||
822 | range_state[i].type = type; | 821 | range_state[i].type = type; |
823 | } | 822 | } |
824 | 823 | ||
825 | /* check if we need handle it and can handle it */ | 824 | /* Check if we need handle it and can handle it: */ |
826 | if (!mtrr_need_cleanup()) | 825 | if (!mtrr_need_cleanup()) |
827 | return 0; | 826 | return 0; |
828 | 827 | ||
829 | /* print original var MTRRs at first, for debugging: */ | 828 | /* Print original var MTRRs at first, for debugging: */ |
830 | printk(KERN_DEBUG "original variable MTRRs\n"); | 829 | printk(KERN_DEBUG "original variable MTRRs\n"); |
831 | print_out_mtrr_range_state(); | 830 | print_out_mtrr_range_state(); |
832 | 831 | ||
833 | memset(range, 0, sizeof(range)); | 832 | memset(range, 0, sizeof(range)); |
834 | extra_remove_size = 0; | 833 | x_remove_size = 0; |
835 | extra_remove_base = 1 << (32 - PAGE_SHIFT); | 834 | x_remove_base = 1 << (32 - PAGE_SHIFT); |
836 | if (mtrr_tom2) | 835 | if (mtrr_tom2) |
837 | extra_remove_size = | 836 | x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; |
838 | (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; | 837 | |
839 | nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, | 838 | nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); |
840 | extra_remove_size); | ||
841 | /* | 839 | /* |
842 | * [0, 1M) should always be coverred by var mtrr with WB | 840 | * [0, 1M) should always be covered by var mtrr with WB |
843 | * and fixed mtrrs should take effective before var mtrr for it | 841 | * and fixed mtrrs should take effect before var mtrr for it: |
844 | */ | 842 | */ |
845 | nr_range = add_range_with_merge(range, nr_range, 0, | 843 | nr_range = add_range_with_merge(range, nr_range, 0, |
846 | (1ULL<<(20 - PAGE_SHIFT)) - 1); | 844 | (1ULL<<(20 - PAGE_SHIFT)) - 1); |
847 | /* sort the ranges */ | 845 | /* Sort the ranges: */ |
848 | sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); | 846 | sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); |
849 | 847 | ||
850 | range_sums = sum_ranges(range, nr_range); | 848 | range_sums = sum_ranges(range, nr_range); |
@@ -854,7 +852,7 @@ int __init mtrr_cleanup(unsigned address_bits) | |||
854 | if (mtrr_chunk_size && mtrr_gran_size) { | 852 | if (mtrr_chunk_size && mtrr_gran_size) { |
855 | i = 0; | 853 | i = 0; |
856 | mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, | 854 | mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, |
857 | extra_remove_base, extra_remove_size, i); | 855 | x_remove_base, x_remove_size, i); |
858 | 856 | ||
859 | mtrr_print_out_one_result(i); | 857 | mtrr_print_out_one_result(i); |
860 | 858 | ||
@@ -880,7 +878,7 @@ int __init mtrr_cleanup(unsigned address_bits) | |||
880 | continue; | 878 | continue; |
881 | 879 | ||
882 | mtrr_calc_range_state(chunk_size, gran_size, | 880 | mtrr_calc_range_state(chunk_size, gran_size, |
883 | extra_remove_base, extra_remove_size, i); | 881 | x_remove_base, x_remove_size, i); |
884 | if (debug_print) { | 882 | if (debug_print) { |
885 | mtrr_print_out_one_result(i); | 883 | mtrr_print_out_one_result(i); |
886 | printk(KERN_INFO "\n"); | 884 | printk(KERN_INFO "\n"); |
@@ -890,7 +888,7 @@ int __init mtrr_cleanup(unsigned address_bits) | |||
890 | } | 888 | } |
891 | } | 889 | } |
892 | 890 | ||
893 | /* try to find the optimal index */ | 891 | /* Try to find the optimal index: */ |
894 | index_good = mtrr_search_optimal_index(); | 892 | index_good = mtrr_search_optimal_index(); |
895 | 893 | ||
896 | if (index_good != -1) { | 894 | if (index_good != -1) { |
@@ -898,7 +896,7 @@ int __init mtrr_cleanup(unsigned address_bits) | |||
898 | i = index_good; | 896 | i = index_good; |
899 | mtrr_print_out_one_result(i); | 897 | mtrr_print_out_one_result(i); |
900 | 898 | ||
901 | /* convert ranges to var ranges state */ | 899 | /* Convert ranges to var ranges state: */ |
902 | chunk_size = result[i].chunk_sizek; | 900 | chunk_size = result[i].chunk_sizek; |
903 | chunk_size <<= 10; | 901 | chunk_size <<= 10; |
904 | gran_size = result[i].gran_sizek; | 902 | gran_size = result[i].gran_sizek; |
@@ -941,8 +939,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup); | |||
941 | * Note this won't check if the MTRRs < 4GB where the magic bit doesn't | 939 | * Note this won't check if the MTRRs < 4GB where the magic bit doesn't |
942 | * apply to are wrong, but so far we don't know of any such case in the wild. | 940 | * apply to are wrong, but so far we don't know of any such case in the wild. |
943 | */ | 941 | */ |
944 | #define Tom2Enabled (1U << 21) | 942 | #define Tom2Enabled (1U << 21) |
945 | #define Tom2ForceMemTypeWB (1U << 22) | 943 | #define Tom2ForceMemTypeWB (1U << 22) |
946 | 944 | ||
947 | int __init amd_special_default_mtrr(void) | 945 | int __init amd_special_default_mtrr(void) |
948 | { | 946 | { |
@@ -952,7 +950,7 @@ int __init amd_special_default_mtrr(void) | |||
952 | return 0; | 950 | return 0; |
953 | if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) | 951 | if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) |
954 | return 0; | 952 | return 0; |
955 | /* In case some hypervisor doesn't pass SYSCFG through */ | 953 | /* In case some hypervisor doesn't pass SYSCFG through: */ |
956 | if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) | 954 | if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) |
957 | return 0; | 955 | return 0; |
958 | /* | 956 | /* |
@@ -965,19 +963,21 @@ int __init amd_special_default_mtrr(void) | |||
965 | return 0; | 963 | return 0; |
966 | } | 964 | } |
967 | 965 | ||
968 | static u64 __init real_trim_memory(unsigned long start_pfn, | 966 | static u64 __init |
969 | unsigned long limit_pfn) | 967 | real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) |
970 | { | 968 | { |
971 | u64 trim_start, trim_size; | 969 | u64 trim_start, trim_size; |
970 | |||
972 | trim_start = start_pfn; | 971 | trim_start = start_pfn; |
973 | trim_start <<= PAGE_SHIFT; | 972 | trim_start <<= PAGE_SHIFT; |
973 | |||
974 | trim_size = limit_pfn; | 974 | trim_size = limit_pfn; |
975 | trim_size <<= PAGE_SHIFT; | 975 | trim_size <<= PAGE_SHIFT; |
976 | trim_size -= trim_start; | 976 | trim_size -= trim_start; |
977 | 977 | ||
978 | return e820_update_range(trim_start, trim_size, E820_RAM, | 978 | return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); |
979 | E820_RESERVED); | ||
980 | } | 979 | } |
980 | |||
981 | /** | 981 | /** |
982 | * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs | 982 | * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs |
983 | * @end_pfn: ending page frame number | 983 | * @end_pfn: ending page frame number |
@@ -985,7 +985,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, | |||
985 | * Some buggy BIOSes don't setup the MTRRs properly for systems with certain | 985 | * Some buggy BIOSes don't setup the MTRRs properly for systems with certain |
986 | * memory configurations. This routine checks that the highest MTRR matches | 986 | * memory configurations. This routine checks that the highest MTRR matches |
987 | * the end of memory, to make sure the MTRRs having a write back type cover | 987 | * the end of memory, to make sure the MTRRs having a write back type cover |
988 | * all of the memory the kernel is intending to use. If not, it'll trim any | 988 | * all of the memory the kernel is intending to use. If not, it'll trim any |
989 | * memory off the end by adjusting end_pfn, removing it from the kernel's | 989 | * memory off the end by adjusting end_pfn, removing it from the kernel's |
990 | * allocation pools, warning the user with an obnoxious message. | 990 | * allocation pools, warning the user with an obnoxious message. |
991 | */ | 991 | */ |
@@ -994,21 +994,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
994 | unsigned long i, base, size, highest_pfn = 0, def, dummy; | 994 | unsigned long i, base, size, highest_pfn = 0, def, dummy; |
995 | mtrr_type type; | 995 | mtrr_type type; |
996 | u64 total_trim_size; | 996 | u64 total_trim_size; |
997 | |||
998 | /* extra one for all 0 */ | 997 | /* extra one for all 0 */ |
999 | int num[MTRR_NUM_TYPES + 1]; | 998 | int num[MTRR_NUM_TYPES + 1]; |
999 | |||
1000 | /* | 1000 | /* |
1001 | * Make sure we only trim uncachable memory on machines that | 1001 | * Make sure we only trim uncachable memory on machines that |
1002 | * support the Intel MTRR architecture: | 1002 | * support the Intel MTRR architecture: |
1003 | */ | 1003 | */ |
1004 | if (!is_cpu(INTEL) || disable_mtrr_trim) | 1004 | if (!is_cpu(INTEL) || disable_mtrr_trim) |
1005 | return 0; | 1005 | return 0; |
1006 | |||
1006 | rdmsr(MSR_MTRRdefType, def, dummy); | 1007 | rdmsr(MSR_MTRRdefType, def, dummy); |
1007 | def &= 0xff; | 1008 | def &= 0xff; |
1008 | if (def != MTRR_TYPE_UNCACHABLE) | 1009 | if (def != MTRR_TYPE_UNCACHABLE) |
1009 | return 0; | 1010 | return 0; |
1010 | 1011 | ||
1011 | /* get it and store it aside */ | 1012 | /* Get it and store it aside: */ |
1012 | memset(range_state, 0, sizeof(range_state)); | 1013 | memset(range_state, 0, sizeof(range_state)); |
1013 | for (i = 0; i < num_var_ranges; i++) { | 1014 | for (i = 0; i < num_var_ranges; i++) { |
1014 | mtrr_if->get(i, &base, &size, &type); | 1015 | mtrr_if->get(i, &base, &size, &type); |
@@ -1017,7 +1018,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
1017 | range_state[i].type = type; | 1018 | range_state[i].type = type; |
1018 | } | 1019 | } |
1019 | 1020 | ||
1020 | /* Find highest cached pfn */ | 1021 | /* Find highest cached pfn: */ |
1021 | for (i = 0; i < num_var_ranges; i++) { | 1022 | for (i = 0; i < num_var_ranges; i++) { |
1022 | type = range_state[i].type; | 1023 | type = range_state[i].type; |
1023 | if (type != MTRR_TYPE_WRBACK) | 1024 | if (type != MTRR_TYPE_WRBACK) |
@@ -1028,13 +1029,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
1028 | highest_pfn = base + size; | 1029 | highest_pfn = base + size; |
1029 | } | 1030 | } |
1030 | 1031 | ||
1031 | /* kvm/qemu doesn't have mtrr set right, don't trim them all */ | 1032 | /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ |
1032 | if (!highest_pfn) { | 1033 | if (!highest_pfn) { |
1033 | printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); | 1034 | printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); |
1034 | return 0; | 1035 | return 0; |
1035 | } | 1036 | } |
1036 | 1037 | ||
1037 | /* check entries number */ | 1038 | /* Check entries number: */ |
1038 | memset(num, 0, sizeof(num)); | 1039 | memset(num, 0, sizeof(num)); |
1039 | for (i = 0; i < num_var_ranges; i++) { | 1040 | for (i = 0; i < num_var_ranges; i++) { |
1040 | type = range_state[i].type; | 1041 | type = range_state[i].type; |
@@ -1046,11 +1047,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
1046 | num[type]++; | 1047 | num[type]++; |
1047 | } | 1048 | } |
1048 | 1049 | ||
1049 | /* no entry for WB? */ | 1050 | /* No entry for WB? */ |
1050 | if (!num[MTRR_TYPE_WRBACK]) | 1051 | if (!num[MTRR_TYPE_WRBACK]) |
1051 | return 0; | 1052 | return 0; |
1052 | 1053 | ||
1053 | /* check if we only had WB and UC */ | 1054 | /* Check if we only had WB and UC: */ |
1054 | if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != | 1055 | if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != |
1055 | num_var_ranges - num[MTRR_NUM_TYPES]) | 1056 | num_var_ranges - num[MTRR_NUM_TYPES]) |
1056 | return 0; | 1057 | return 0; |
@@ -1066,31 +1067,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
1066 | } | 1067 | } |
1067 | nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); | 1068 | nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); |
1068 | 1069 | ||
1070 | /* Check the head: */ | ||
1069 | total_trim_size = 0; | 1071 | total_trim_size = 0; |
1070 | /* check the head */ | ||
1071 | if (range[0].start) | 1072 | if (range[0].start) |
1072 | total_trim_size += real_trim_memory(0, range[0].start); | 1073 | total_trim_size += real_trim_memory(0, range[0].start); |
1073 | /* check the holes */ | 1074 | |
1075 | /* Check the holes: */ | ||
1074 | for (i = 0; i < nr_range - 1; i++) { | 1076 | for (i = 0; i < nr_range - 1; i++) { |
1075 | if (range[i].end + 1 < range[i+1].start) | 1077 | if (range[i].end + 1 < range[i+1].start) |
1076 | total_trim_size += real_trim_memory(range[i].end + 1, | 1078 | total_trim_size += real_trim_memory(range[i].end + 1, |
1077 | range[i+1].start); | 1079 | range[i+1].start); |
1078 | } | 1080 | } |
1079 | /* check the top */ | 1081 | |
1082 | /* Check the top: */ | ||
1080 | i = nr_range - 1; | 1083 | i = nr_range - 1; |
1081 | if (range[i].end + 1 < end_pfn) | 1084 | if (range[i].end + 1 < end_pfn) |
1082 | total_trim_size += real_trim_memory(range[i].end + 1, | 1085 | total_trim_size += real_trim_memory(range[i].end + 1, |
1083 | end_pfn); | 1086 | end_pfn); |
1084 | 1087 | ||
1085 | if (total_trim_size) { | 1088 | if (total_trim_size) { |
1086 | printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" | 1089 | pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); |
1087 | " all of memory, losing %lluMB of RAM.\n", | ||
1088 | total_trim_size >> 20); | ||
1089 | 1090 | ||
1090 | if (!changed_by_mtrr_cleanup) | 1091 | if (!changed_by_mtrr_cleanup) |
1091 | WARN_ON(1); | 1092 | WARN_ON(1); |
1092 | 1093 | ||
1093 | printk(KERN_INFO "update e820 for mtrr\n"); | 1094 | pr_info("update e820 for mtrr\n"); |
1094 | update_e820(); | 1095 | update_e820(); |
1095 | 1096 | ||
1096 | return 1; | 1097 | return 1; |
@@ -1098,4 +1099,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) | |||
1098 | 1099 | ||
1099 | return 0; | 1100 | return 0; |
1100 | } | 1101 | } |
1101 | |||
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ff14c320040..228d982ce09 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c | |||
@@ -1,38 +1,40 @@ | |||
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <linux/io.h> | ||
2 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
3 | #include <asm/mtrr.h> | 4 | |
4 | #include <asm/msr.h> | ||
5 | #include <asm/io.h> | ||
6 | #include <asm/processor-cyrix.h> | 5 | #include <asm/processor-cyrix.h> |
7 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
7 | #include <asm/mtrr.h> | ||
8 | #include <asm/msr.h> | ||
9 | |||
8 | #include "mtrr.h" | 10 | #include "mtrr.h" |
9 | 11 | ||
10 | static void | 12 | static void |
11 | cyrix_get_arr(unsigned int reg, unsigned long *base, | 13 | cyrix_get_arr(unsigned int reg, unsigned long *base, |
12 | unsigned long *size, mtrr_type * type) | 14 | unsigned long *size, mtrr_type * type) |
13 | { | 15 | { |
14 | unsigned long flags; | ||
15 | unsigned char arr, ccr3, rcr, shift; | 16 | unsigned char arr, ccr3, rcr, shift; |
17 | unsigned long flags; | ||
16 | 18 | ||
17 | arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ | 19 | arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ |
18 | 20 | ||
19 | /* Save flags and disable interrupts */ | ||
20 | local_irq_save(flags); | 21 | local_irq_save(flags); |
21 | 22 | ||
22 | ccr3 = getCx86(CX86_CCR3); | 23 | ccr3 = getCx86(CX86_CCR3); |
23 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ | 24 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ |
24 | ((unsigned char *) base)[3] = getCx86(arr); | 25 | ((unsigned char *)base)[3] = getCx86(arr); |
25 | ((unsigned char *) base)[2] = getCx86(arr + 1); | 26 | ((unsigned char *)base)[2] = getCx86(arr + 1); |
26 | ((unsigned char *) base)[1] = getCx86(arr + 2); | 27 | ((unsigned char *)base)[1] = getCx86(arr + 2); |
27 | rcr = getCx86(CX86_RCR_BASE + reg); | 28 | rcr = getCx86(CX86_RCR_BASE + reg); |
28 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ | 29 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ |
29 | 30 | ||
30 | /* Enable interrupts if it was enabled previously */ | ||
31 | local_irq_restore(flags); | 31 | local_irq_restore(flags); |
32 | |||
32 | shift = ((unsigned char *) base)[1] & 0x0f; | 33 | shift = ((unsigned char *) base)[1] & 0x0f; |
33 | *base >>= PAGE_SHIFT; | 34 | *base >>= PAGE_SHIFT; |
34 | 35 | ||
35 | /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 | 36 | /* |
37 | * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 | ||
36 | * Note: shift==0xf means 4G, this is unsupported. | 38 | * Note: shift==0xf means 4G, this is unsupported. |
37 | */ | 39 | */ |
38 | if (shift) | 40 | if (shift) |
@@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, | |||
76 | } | 78 | } |
77 | } | 79 | } |
78 | 80 | ||
81 | /* | ||
82 | * cyrix_get_free_region - get a free ARR. | ||
83 | * | ||
84 | * @base: the starting (base) address of the region. | ||
85 | * @size: the size (in bytes) of the region. | ||
86 | * | ||
87 | * Returns: the index of the region on success, else -1 on error. | ||
88 | */ | ||
79 | static int | 89 | static int |
80 | cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) | 90 | cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) |
81 | /* [SUMMARY] Get a free ARR. | ||
82 | <base> The starting (base) address of the region. | ||
83 | <size> The size (in bytes) of the region. | ||
84 | [RETURNS] The index of the region on success, else -1 on error. | ||
85 | */ | ||
86 | { | 91 | { |
87 | int i; | ||
88 | mtrr_type ltype; | ||
89 | unsigned long lbase, lsize; | 92 | unsigned long lbase, lsize; |
93 | mtrr_type ltype; | ||
94 | int i; | ||
90 | 95 | ||
91 | switch (replace_reg) { | 96 | switch (replace_reg) { |
92 | case 7: | 97 | case 7: |
@@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) | |||
107 | cyrix_get_arr(7, &lbase, &lsize, &ltype); | 112 | cyrix_get_arr(7, &lbase, &lsize, &ltype); |
108 | if (lsize == 0) | 113 | if (lsize == 0) |
109 | return 7; | 114 | return 7; |
110 | /* Else try ARR0-ARR6 first */ | 115 | /* Else try ARR0-ARR6 first */ |
111 | } else { | 116 | } else { |
112 | for (i = 0; i < 7; i++) { | 117 | for (i = 0; i < 7; i++) { |
113 | cyrix_get_arr(i, &lbase, &lsize, &ltype); | 118 | cyrix_get_arr(i, &lbase, &lsize, &ltype); |
114 | if (lsize == 0) | 119 | if (lsize == 0) |
115 | return i; | 120 | return i; |
116 | } | 121 | } |
117 | /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ | 122 | /* |
123 | * ARR0-ARR6 isn't free | ||
124 | * try ARR7 but its size must be at least 256K | ||
125 | */ | ||
118 | cyrix_get_arr(i, &lbase, &lsize, &ltype); | 126 | cyrix_get_arr(i, &lbase, &lsize, &ltype); |
119 | if ((lsize == 0) && (size >= 0x40)) | 127 | if ((lsize == 0) && (size >= 0x40)) |
120 | return i; | 128 | return i; |
@@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) | |||
122 | return -ENOSPC; | 130 | return -ENOSPC; |
123 | } | 131 | } |
124 | 132 | ||
125 | static u32 cr4 = 0; | 133 | static u32 cr4, ccr3; |
126 | static u32 ccr3; | ||
127 | 134 | ||
128 | static void prepare_set(void) | 135 | static void prepare_set(void) |
129 | { | 136 | { |
130 | u32 cr0; | 137 | u32 cr0; |
131 | 138 | ||
132 | /* Save value of CR4 and clear Page Global Enable (bit 7) */ | 139 | /* Save value of CR4 and clear Page Global Enable (bit 7) */ |
133 | if ( cpu_has_pge ) { | 140 | if (cpu_has_pge) { |
134 | cr4 = read_cr4(); | 141 | cr4 = read_cr4(); |
135 | write_cr4(cr4 & ~X86_CR4_PGE); | 142 | write_cr4(cr4 & ~X86_CR4_PGE); |
136 | } | 143 | } |
137 | 144 | ||
138 | /* Disable and flush caches. Note that wbinvd flushes the TLBs as | 145 | /* |
139 | a side-effect */ | 146 | * Disable and flush caches. |
147 | * Note that wbinvd flushes the TLBs as a side-effect | ||
148 | */ | ||
140 | cr0 = read_cr0() | X86_CR0_CD; | 149 | cr0 = read_cr0() | X86_CR0_CD; |
141 | wbinvd(); | 150 | wbinvd(); |
142 | write_cr0(cr0); | 151 | write_cr0(cr0); |
@@ -147,22 +156,21 @@ static void prepare_set(void) | |||
147 | 156 | ||
148 | /* Cyrix ARRs - everything else was excluded at the top */ | 157 | /* Cyrix ARRs - everything else was excluded at the top */ |
149 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); | 158 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); |
150 | |||
151 | } | 159 | } |
152 | 160 | ||
153 | static void post_set(void) | 161 | static void post_set(void) |
154 | { | 162 | { |
155 | /* Flush caches and TLBs */ | 163 | /* Flush caches and TLBs */ |
156 | wbinvd(); | 164 | wbinvd(); |
157 | 165 | ||
158 | /* Cyrix ARRs - everything else was excluded at the top */ | 166 | /* Cyrix ARRs - everything else was excluded at the top */ |
159 | setCx86(CX86_CCR3, ccr3); | 167 | setCx86(CX86_CCR3, ccr3); |
160 | 168 | ||
161 | /* Enable caches */ | 169 | /* Enable caches */ |
162 | write_cr0(read_cr0() & 0xbfffffff); | 170 | write_cr0(read_cr0() & 0xbfffffff); |
163 | 171 | ||
164 | /* Restore value of CR4 */ | 172 | /* Restore value of CR4 */ |
165 | if ( cpu_has_pge ) | 173 | if (cpu_has_pge) |
166 | write_cr4(cr4); | 174 | write_cr4(cr4); |
167 | } | 175 | } |
168 | 176 | ||
@@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, | |||
178 | size >>= 6; | 186 | size >>= 6; |
179 | 187 | ||
180 | size &= 0x7fff; /* make sure arr_size <= 14 */ | 188 | size &= 0x7fff; /* make sure arr_size <= 14 */ |
181 | for (arr_size = 0; size; arr_size++, size >>= 1) ; | 189 | for (arr_size = 0; size; arr_size++, size >>= 1) |
190 | ; | ||
182 | 191 | ||
183 | if (reg < 7) { | 192 | if (reg < 7) { |
184 | switch (type) { | 193 | switch (type) { |
@@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, | |||
215 | prepare_set(); | 224 | prepare_set(); |
216 | 225 | ||
217 | base <<= PAGE_SHIFT; | 226 | base <<= PAGE_SHIFT; |
218 | setCx86(arr, ((unsigned char *) &base)[3]); | 227 | setCx86(arr + 0, ((unsigned char *)&base)[3]); |
219 | setCx86(arr + 1, ((unsigned char *) &base)[2]); | 228 | setCx86(arr + 1, ((unsigned char *)&base)[2]); |
220 | setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); | 229 | setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); |
221 | setCx86(CX86_RCR_BASE + reg, arr_type); | 230 | setCx86(CX86_RCR_BASE + reg, arr_type); |
222 | 231 | ||
223 | post_set(); | 232 | post_set(); |
224 | } | 233 | } |
225 | 234 | ||
226 | typedef struct { | 235 | typedef struct { |
227 | unsigned long base; | 236 | unsigned long base; |
228 | unsigned long size; | 237 | unsigned long size; |
229 | mtrr_type type; | 238 | mtrr_type type; |
230 | } arr_state_t; | 239 | } arr_state_t; |
231 | 240 | ||
232 | static arr_state_t arr_state[8] = { | 241 | static arr_state_t arr_state[8] = { |
@@ -247,16 +256,17 @@ static void cyrix_set_all(void) | |||
247 | setCx86(CX86_CCR0 + i, ccr_state[i]); | 256 | setCx86(CX86_CCR0 + i, ccr_state[i]); |
248 | for (; i < 7; i++) | 257 | for (; i < 7; i++) |
249 | setCx86(CX86_CCR4 + i, ccr_state[i]); | 258 | setCx86(CX86_CCR4 + i, ccr_state[i]); |
250 | for (i = 0; i < 8; i++) | 259 | |
251 | cyrix_set_arr(i, arr_state[i].base, | 260 | for (i = 0; i < 8; i++) { |
261 | cyrix_set_arr(i, arr_state[i].base, | ||
252 | arr_state[i].size, arr_state[i].type); | 262 | arr_state[i].size, arr_state[i].type); |
263 | } | ||
253 | 264 | ||
254 | post_set(); | 265 | post_set(); |
255 | } | 266 | } |
256 | 267 | ||
257 | static struct mtrr_ops cyrix_mtrr_ops = { | 268 | static struct mtrr_ops cyrix_mtrr_ops = { |
258 | .vendor = X86_VENDOR_CYRIX, | 269 | .vendor = X86_VENDOR_CYRIX, |
259 | // .init = cyrix_arr_init, | ||
260 | .set_all = cyrix_set_all, | 270 | .set_all = cyrix_set_all, |
261 | .set = cyrix_set_arr, | 271 | .set = cyrix_set_arr, |
262 | .get = cyrix_get_arr, | 272 | .get = cyrix_get_arr, |
@@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void) | |||
270 | set_mtrr_ops(&cyrix_mtrr_ops); | 280 | set_mtrr_ops(&cyrix_mtrr_ops); |
271 | return 0; | 281 | return 0; |
272 | } | 282 | } |
273 | |||
274 | //arch_initcall(cyrix_init_mtrr); | ||
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69f0b2..55da0c5f68d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -1,28 +1,34 @@ | |||
1 | /* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong | 1 | /* |
2 | because MTRRs can span upto 40 bits (36bits on most modern x86) */ | 2 | * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong |
3 | * because MTRRs can span upto 40 bits (36bits on most modern x86) | ||
4 | */ | ||
5 | #define DEBUG | ||
6 | |||
7 | #include <linux/module.h> | ||
3 | #include <linux/init.h> | 8 | #include <linux/init.h> |
4 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/io.h> | ||
5 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
6 | #include <linux/module.h> | 12 | |
7 | #include <asm/io.h> | ||
8 | #include <asm/mtrr.h> | ||
9 | #include <asm/msr.h> | ||
10 | #include <asm/system.h> | ||
11 | #include <asm/cpufeature.h> | ||
12 | #include <asm/processor-flags.h> | 13 | #include <asm/processor-flags.h> |
14 | #include <asm/cpufeature.h> | ||
13 | #include <asm/tlbflush.h> | 15 | #include <asm/tlbflush.h> |
16 | #include <asm/system.h> | ||
17 | #include <asm/mtrr.h> | ||
18 | #include <asm/msr.h> | ||
14 | #include <asm/pat.h> | 19 | #include <asm/pat.h> |
20 | |||
15 | #include "mtrr.h" | 21 | #include "mtrr.h" |
16 | 22 | ||
17 | struct fixed_range_block { | 23 | struct fixed_range_block { |
18 | int base_msr; /* start address of an MTRR block */ | 24 | int base_msr; /* start address of an MTRR block */ |
19 | int ranges; /* number of MTRRs in this block */ | 25 | int ranges; /* number of MTRRs in this block */ |
20 | }; | 26 | }; |
21 | 27 | ||
22 | static struct fixed_range_block fixed_range_blocks[] = { | 28 | static struct fixed_range_block fixed_range_blocks[] = { |
23 | { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ | 29 | { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ |
24 | { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ | 30 | { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ |
25 | { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ | 31 | { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ |
26 | {} | 32 | {} |
27 | }; | 33 | }; |
28 | 34 | ||
@@ -30,10 +36,10 @@ static unsigned long smp_changes_mask; | |||
30 | static int mtrr_state_set; | 36 | static int mtrr_state_set; |
31 | u64 mtrr_tom2; | 37 | u64 mtrr_tom2; |
32 | 38 | ||
33 | struct mtrr_state_type mtrr_state = {}; | 39 | struct mtrr_state_type mtrr_state; |
34 | EXPORT_SYMBOL_GPL(mtrr_state); | 40 | EXPORT_SYMBOL_GPL(mtrr_state); |
35 | 41 | ||
36 | /** | 42 | /* |
37 | * BIOS is expected to clear MtrrFixDramModEn bit, see for example | 43 | * BIOS is expected to clear MtrrFixDramModEn bit, see for example |
38 | * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD | 44 | * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD |
39 | * Opteron Processors" (26094 Rev. 3.30 February 2006), section | 45 | * Opteron Processors" (26094 Rev. 3.30 February 2006), section |
@@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
104 | * Look of multiple ranges matching this address and pick type | 110 | * Look of multiple ranges matching this address and pick type |
105 | * as per MTRR precedence | 111 | * as per MTRR precedence |
106 | */ | 112 | */ |
107 | if (!(mtrr_state.enabled & 2)) { | 113 | if (!(mtrr_state.enabled & 2)) |
108 | return mtrr_state.def_type; | 114 | return mtrr_state.def_type; |
109 | } | ||
110 | 115 | ||
111 | prev_match = 0xFF; | 116 | prev_match = 0xFF; |
112 | for (i = 0; i < num_var_ranges; ++i) { | 117 | for (i = 0; i < num_var_ranges; ++i) { |
@@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
125 | if (start_state != end_state) | 130 | if (start_state != end_state) |
126 | return 0xFE; | 131 | return 0xFE; |
127 | 132 | ||
128 | if ((start & mask) != (base & mask)) { | 133 | if ((start & mask) != (base & mask)) |
129 | continue; | 134 | continue; |
130 | } | ||
131 | 135 | ||
132 | curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; | 136 | curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; |
133 | if (prev_match == 0xFF) { | 137 | if (prev_match == 0xFF) { |
@@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
148 | curr_match = MTRR_TYPE_WRTHROUGH; | 152 | curr_match = MTRR_TYPE_WRTHROUGH; |
149 | } | 153 | } |
150 | 154 | ||
151 | if (prev_match != curr_match) { | 155 | if (prev_match != curr_match) |
152 | return MTRR_TYPE_UNCACHABLE; | 156 | return MTRR_TYPE_UNCACHABLE; |
153 | } | ||
154 | } | 157 | } |
155 | 158 | ||
156 | if (mtrr_tom2) { | 159 | if (mtrr_tom2) { |
@@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end) | |||
164 | return mtrr_state.def_type; | 167 | return mtrr_state.def_type; |
165 | } | 168 | } |
166 | 169 | ||
167 | /* Get the MSR pair relating to a var range */ | 170 | /* Get the MSR pair relating to a var range */ |
168 | static void | 171 | static void |
169 | get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) | 172 | get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) |
170 | { | 173 | { |
@@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) | |||
172 | rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); | 175 | rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); |
173 | } | 176 | } |
174 | 177 | ||
175 | /* fill the MSR pair relating to a var range */ | 178 | /* Fill the MSR pair relating to a var range */ |
176 | void fill_mtrr_var_range(unsigned int index, | 179 | void fill_mtrr_var_range(unsigned int index, |
177 | u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) | 180 | u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) |
178 | { | 181 | { |
@@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index, | |||
186 | vr[index].mask_hi = mask_hi; | 189 | vr[index].mask_hi = mask_hi; |
187 | } | 190 | } |
188 | 191 | ||
189 | static void | 192 | static void get_fixed_ranges(mtrr_type *frs) |
190 | get_fixed_ranges(mtrr_type * frs) | ||
191 | { | 193 | { |
192 | unsigned int *p = (unsigned int *) frs; | 194 | unsigned int *p = (unsigned int *)frs; |
193 | int i; | 195 | int i; |
194 | 196 | ||
195 | k8_check_syscfg_dram_mod_en(); | 197 | k8_check_syscfg_dram_mod_en(); |
@@ -217,22 +219,22 @@ static void __init print_fixed_last(void) | |||
217 | if (!last_fixed_end) | 219 | if (!last_fixed_end) |
218 | return; | 220 | return; |
219 | 221 | ||
220 | printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, | 222 | pr_debug(" %05X-%05X %s\n", last_fixed_start, |
221 | last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); | 223 | last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); |
222 | 224 | ||
223 | last_fixed_end = 0; | 225 | last_fixed_end = 0; |
224 | } | 226 | } |
225 | 227 | ||
226 | static void __init update_fixed_last(unsigned base, unsigned end, | 228 | static void __init update_fixed_last(unsigned base, unsigned end, |
227 | mtrr_type type) | 229 | mtrr_type type) |
228 | { | 230 | { |
229 | last_fixed_start = base; | 231 | last_fixed_start = base; |
230 | last_fixed_end = end; | 232 | last_fixed_end = end; |
231 | last_fixed_type = type; | 233 | last_fixed_type = type; |
232 | } | 234 | } |
233 | 235 | ||
234 | static void __init print_fixed(unsigned base, unsigned step, | 236 | static void __init |
235 | const mtrr_type *types) | 237 | print_fixed(unsigned base, unsigned step, const mtrr_type *types) |
236 | { | 238 | { |
237 | unsigned i; | 239 | unsigned i; |
238 | 240 | ||
@@ -259,54 +261,55 @@ static void __init print_mtrr_state(void) | |||
259 | unsigned int i; | 261 | unsigned int i; |
260 | int high_width; | 262 | int high_width; |
261 | 263 | ||
262 | printk(KERN_DEBUG "MTRR default type: %s\n", | 264 | pr_debug("MTRR default type: %s\n", |
263 | mtrr_attrib_to_str(mtrr_state.def_type)); | 265 | mtrr_attrib_to_str(mtrr_state.def_type)); |
264 | if (mtrr_state.have_fixed) { | 266 | if (mtrr_state.have_fixed) { |
265 | printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", | 267 | pr_debug("MTRR fixed ranges %sabled:\n", |
266 | mtrr_state.enabled & 1 ? "en" : "dis"); | 268 | mtrr_state.enabled & 1 ? "en" : "dis"); |
267 | print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); | 269 | print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); |
268 | for (i = 0; i < 2; ++i) | 270 | for (i = 0; i < 2; ++i) |
269 | print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); | 271 | print_fixed(0x80000 + i * 0x20000, 0x04000, |
272 | mtrr_state.fixed_ranges + (i + 1) * 8); | ||
270 | for (i = 0; i < 8; ++i) | 273 | for (i = 0; i < 8; ++i) |
271 | print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); | 274 | print_fixed(0xC0000 + i * 0x08000, 0x01000, |
275 | mtrr_state.fixed_ranges + (i + 3) * 8); | ||
272 | 276 | ||
273 | /* tail */ | 277 | /* tail */ |
274 | print_fixed_last(); | 278 | print_fixed_last(); |
275 | } | 279 | } |
276 | printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", | 280 | pr_debug("MTRR variable ranges %sabled:\n", |
277 | mtrr_state.enabled & 2 ? "en" : "dis"); | 281 | mtrr_state.enabled & 2 ? "en" : "dis"); |
278 | if (size_or_mask & 0xffffffffUL) | 282 | if (size_or_mask & 0xffffffffUL) |
279 | high_width = ffs(size_or_mask & 0xffffffffUL) - 1; | 283 | high_width = ffs(size_or_mask & 0xffffffffUL) - 1; |
280 | else | 284 | else |
281 | high_width = ffs(size_or_mask>>32) + 32 - 1; | 285 | high_width = ffs(size_or_mask>>32) + 32 - 1; |
282 | high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; | 286 | high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; |
287 | |||
283 | for (i = 0; i < num_var_ranges; ++i) { | 288 | for (i = 0; i < num_var_ranges; ++i) { |
284 | if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) | 289 | if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) |
285 | printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", | 290 | pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", |
286 | i, | 291 | i, |
287 | high_width, | 292 | high_width, |
288 | mtrr_state.var_ranges[i].base_hi, | 293 | mtrr_state.var_ranges[i].base_hi, |
289 | mtrr_state.var_ranges[i].base_lo >> 12, | 294 | mtrr_state.var_ranges[i].base_lo >> 12, |
290 | high_width, | 295 | high_width, |
291 | mtrr_state.var_ranges[i].mask_hi, | 296 | mtrr_state.var_ranges[i].mask_hi, |
292 | mtrr_state.var_ranges[i].mask_lo >> 12, | 297 | mtrr_state.var_ranges[i].mask_lo >> 12, |
293 | mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); | 298 | mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); |
294 | else | 299 | else |
295 | printk(KERN_DEBUG " %u disabled\n", i); | 300 | pr_debug(" %u disabled\n", i); |
296 | } | ||
297 | if (mtrr_tom2) { | ||
298 | printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", | ||
299 | mtrr_tom2, mtrr_tom2>>20); | ||
300 | } | 301 | } |
302 | if (mtrr_tom2) | ||
303 | pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); | ||
301 | } | 304 | } |
302 | 305 | ||
303 | /* Grab all of the MTRR state for this CPU into *state */ | 306 | /* Grab all of the MTRR state for this CPU into *state */ |
304 | void __init get_mtrr_state(void) | 307 | void __init get_mtrr_state(void) |
305 | { | 308 | { |
306 | unsigned int i; | ||
307 | struct mtrr_var_range *vrs; | 309 | struct mtrr_var_range *vrs; |
308 | unsigned lo, dummy; | ||
309 | unsigned long flags; | 310 | unsigned long flags; |
311 | unsigned lo, dummy; | ||
312 | unsigned int i; | ||
310 | 313 | ||
311 | vrs = mtrr_state.var_ranges; | 314 | vrs = mtrr_state.var_ranges; |
312 | 315 | ||
@@ -324,6 +327,7 @@ void __init get_mtrr_state(void) | |||
324 | 327 | ||
325 | if (amd_special_default_mtrr()) { | 328 | if (amd_special_default_mtrr()) { |
326 | unsigned low, high; | 329 | unsigned low, high; |
330 | |||
327 | /* TOP_MEM2 */ | 331 | /* TOP_MEM2 */ |
328 | rdmsr(MSR_K8_TOP_MEM2, low, high); | 332 | rdmsr(MSR_K8_TOP_MEM2, low, high); |
329 | mtrr_tom2 = high; | 333 | mtrr_tom2 = high; |
@@ -344,10 +348,9 @@ void __init get_mtrr_state(void) | |||
344 | 348 | ||
345 | post_set(); | 349 | post_set(); |
346 | local_irq_restore(flags); | 350 | local_irq_restore(flags); |
347 | |||
348 | } | 351 | } |
349 | 352 | ||
350 | /* Some BIOS's are fucked and don't set all MTRRs the same! */ | 353 | /* Some BIOS's are messed up and don't set all MTRRs the same! */ |
351 | void __init mtrr_state_warn(void) | 354 | void __init mtrr_state_warn(void) |
352 | { | 355 | { |
353 | unsigned long mask = smp_changes_mask; | 356 | unsigned long mask = smp_changes_mask; |
@@ -355,28 +358,33 @@ void __init mtrr_state_warn(void) | |||
355 | if (!mask) | 358 | if (!mask) |
356 | return; | 359 | return; |
357 | if (mask & MTRR_CHANGE_MASK_FIXED) | 360 | if (mask & MTRR_CHANGE_MASK_FIXED) |
358 | printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); | 361 | pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); |
359 | if (mask & MTRR_CHANGE_MASK_VARIABLE) | 362 | if (mask & MTRR_CHANGE_MASK_VARIABLE) |
360 | printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); | 363 | pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); |
361 | if (mask & MTRR_CHANGE_MASK_DEFTYPE) | 364 | if (mask & MTRR_CHANGE_MASK_DEFTYPE) |
362 | printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); | 365 | pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); |
366 | |||
363 | printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); | 367 | printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); |
364 | printk(KERN_INFO "mtrr: corrected configuration.\n"); | 368 | printk(KERN_INFO "mtrr: corrected configuration.\n"); |
365 | } | 369 | } |
366 | 370 | ||
367 | /* Doesn't attempt to pass an error out to MTRR users | 371 | /* |
368 | because it's quite complicated in some cases and probably not | 372 | * Doesn't attempt to pass an error out to MTRR users |
369 | worth it because the best error handling is to ignore it. */ | 373 | * because it's quite complicated in some cases and probably not |
374 | * worth it because the best error handling is to ignore it. | ||
375 | */ | ||
370 | void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) | 376 | void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) |
371 | { | 377 | { |
372 | if (wrmsr_safe(msr, a, b) < 0) | 378 | if (wrmsr_safe(msr, a, b) < 0) { |
373 | printk(KERN_ERR | 379 | printk(KERN_ERR |
374 | "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", | 380 | "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", |
375 | smp_processor_id(), msr, a, b); | 381 | smp_processor_id(), msr, a, b); |
382 | } | ||
376 | } | 383 | } |
377 | 384 | ||
378 | /** | 385 | /** |
379 | * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have | 386 | * set_fixed_range - checks & updates a fixed-range MTRR if it |
387 | * differs from the value it should have | ||
380 | * @msr: MSR address of the MTTR which should be checked and updated | 388 | * @msr: MSR address of the MTTR which should be checked and updated |
381 | * @changed: pointer which indicates whether the MTRR needed to be changed | 389 | * @changed: pointer which indicates whether the MTRR needed to be changed |
382 | * @msrwords: pointer to the MSR values which the MSR should have | 390 | * @msrwords: pointer to the MSR values which the MSR should have |
@@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) | |||
401 | * | 409 | * |
402 | * Returns: The index of the region on success, else negative on error. | 410 | * Returns: The index of the region on success, else negative on error. |
403 | */ | 411 | */ |
404 | int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) | 412 | int |
413 | generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) | ||
405 | { | 414 | { |
406 | int i, max; | ||
407 | mtrr_type ltype; | ||
408 | unsigned long lbase, lsize; | 415 | unsigned long lbase, lsize; |
416 | mtrr_type ltype; | ||
417 | int i, max; | ||
409 | 418 | ||
410 | max = num_var_ranges; | 419 | max = num_var_ranges; |
411 | if (replace_reg >= 0 && replace_reg < max) | 420 | if (replace_reg >= 0 && replace_reg < max) |
412 | return replace_reg; | 421 | return replace_reg; |
422 | |||
413 | for (i = 0; i < max; ++i) { | 423 | for (i = 0; i < max; ++i) { |
414 | mtrr_if->get(i, &lbase, &lsize, &ltype); | 424 | mtrr_if->get(i, &lbase, &lsize, &ltype); |
415 | if (lsize == 0) | 425 | if (lsize == 0) |
416 | return i; | 426 | return i; |
417 | } | 427 | } |
428 | |||
418 | return -ENOSPC; | 429 | return -ENOSPC; |
419 | } | 430 | } |
420 | 431 | ||
@@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, | |||
434 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); | 445 | rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); |
435 | 446 | ||
436 | if ((mask_lo & 0x800) == 0) { | 447 | if ((mask_lo & 0x800) == 0) { |
437 | /* Invalid (i.e. free) range */ | 448 | /* Invalid (i.e. free) range */ |
438 | *base = 0; | 449 | *base = 0; |
439 | *size = 0; | 450 | *size = 0; |
440 | *type = 0; | 451 | *type = 0; |
@@ -471,27 +482,31 @@ out_put_cpu: | |||
471 | } | 482 | } |
472 | 483 | ||
473 | /** | 484 | /** |
474 | * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set | 485 | * set_fixed_ranges - checks & updates the fixed-range MTRRs if they |
486 | * differ from the saved set | ||
475 | * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() | 487 | * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() |
476 | */ | 488 | */ |
477 | static int set_fixed_ranges(mtrr_type * frs) | 489 | static int set_fixed_ranges(mtrr_type *frs) |
478 | { | 490 | { |
479 | unsigned long long *saved = (unsigned long long *) frs; | 491 | unsigned long long *saved = (unsigned long long *)frs; |
480 | bool changed = false; | 492 | bool changed = false; |
481 | int block=-1, range; | 493 | int block = -1, range; |
482 | 494 | ||
483 | k8_check_syscfg_dram_mod_en(); | 495 | k8_check_syscfg_dram_mod_en(); |
484 | 496 | ||
485 | while (fixed_range_blocks[++block].ranges) | 497 | while (fixed_range_blocks[++block].ranges) { |
486 | for (range=0; range < fixed_range_blocks[block].ranges; range++) | 498 | for (range = 0; range < fixed_range_blocks[block].ranges; range++) |
487 | set_fixed_range(fixed_range_blocks[block].base_msr + range, | 499 | set_fixed_range(fixed_range_blocks[block].base_msr + range, |
488 | &changed, (unsigned int *) saved++); | 500 | &changed, (unsigned int *)saved++); |
501 | } | ||
489 | 502 | ||
490 | return changed; | 503 | return changed; |
491 | } | 504 | } |
492 | 505 | ||
493 | /* Set the MSR pair relating to a var range. Returns TRUE if | 506 | /* |
494 | changes are made */ | 507 | * Set the MSR pair relating to a var range. |
508 | * Returns true if changes are made. | ||
509 | */ | ||
495 | static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) | 510 | static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) |
496 | { | 511 | { |
497 | unsigned int lo, hi; | 512 | unsigned int lo, hi; |
@@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) | |||
501 | if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) | 516 | if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) |
502 | || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != | 517 | || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != |
503 | (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { | 518 | (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { |
519 | |||
504 | mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); | 520 | mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); |
505 | changed = true; | 521 | changed = true; |
506 | } | 522 | } |
@@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi; | |||
526 | */ | 542 | */ |
527 | static unsigned long set_mtrr_state(void) | 543 | static unsigned long set_mtrr_state(void) |
528 | { | 544 | { |
529 | unsigned int i; | ||
530 | unsigned long change_mask = 0; | 545 | unsigned long change_mask = 0; |
546 | unsigned int i; | ||
531 | 547 | ||
532 | for (i = 0; i < num_var_ranges; i++) | 548 | for (i = 0; i < num_var_ranges; i++) { |
533 | if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) | 549 | if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) |
534 | change_mask |= MTRR_CHANGE_MASK_VARIABLE; | 550 | change_mask |= MTRR_CHANGE_MASK_VARIABLE; |
551 | } | ||
535 | 552 | ||
536 | if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) | 553 | if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) |
537 | change_mask |= MTRR_CHANGE_MASK_FIXED; | 554 | change_mask |= MTRR_CHANGE_MASK_FIXED; |
538 | 555 | ||
539 | /* Set_mtrr_restore restores the old value of MTRRdefType, | 556 | /* |
540 | so to set it we fiddle with the saved value */ | 557 | * Set_mtrr_restore restores the old value of MTRRdefType, |
558 | * so to set it we fiddle with the saved value: | ||
559 | */ | ||
541 | if ((deftype_lo & 0xff) != mtrr_state.def_type | 560 | if ((deftype_lo & 0xff) != mtrr_state.def_type |
542 | || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { | 561 | || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { |
543 | deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); | 562 | |
563 | deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | | ||
564 | (mtrr_state.enabled << 10); | ||
544 | change_mask |= MTRR_CHANGE_MASK_DEFTYPE; | 565 | change_mask |= MTRR_CHANGE_MASK_DEFTYPE; |
545 | } | 566 | } |
546 | 567 | ||
@@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void) | |||
548 | } | 569 | } |
549 | 570 | ||
550 | 571 | ||
551 | static unsigned long cr4 = 0; | 572 | static unsigned long cr4; |
552 | static DEFINE_SPINLOCK(set_atomicity_lock); | 573 | static DEFINE_SPINLOCK(set_atomicity_lock); |
553 | 574 | ||
554 | /* | 575 | /* |
555 | * Since we are disabling the cache don't allow any interrupts - they | 576 | * Since we are disabling the cache don't allow any interrupts, |
556 | * would run extremely slow and would only increase the pain. The caller must | 577 | * they would run extremely slow and would only increase the pain. |
557 | * ensure that local interrupts are disabled and are reenabled after post_set() | 578 | * |
558 | * has been called. | 579 | * The caller must ensure that local interrupts are disabled and |
580 | * are reenabled after post_set() has been called. | ||
559 | */ | 581 | */ |
560 | |||
561 | static void prepare_set(void) __acquires(set_atomicity_lock) | 582 | static void prepare_set(void) __acquires(set_atomicity_lock) |
562 | { | 583 | { |
563 | unsigned long cr0; | 584 | unsigned long cr0; |
564 | 585 | ||
565 | /* Note that this is not ideal, since the cache is only flushed/disabled | 586 | /* |
566 | for this CPU while the MTRRs are changed, but changing this requires | 587 | * Note that this is not ideal |
567 | more invasive changes to the way the kernel boots */ | 588 | * since the cache is only flushed/disabled for this CPU while the |
589 | * MTRRs are changed, but changing this requires more invasive | ||
590 | * changes to the way the kernel boots | ||
591 | */ | ||
568 | 592 | ||
569 | spin_lock(&set_atomicity_lock); | 593 | spin_lock(&set_atomicity_lock); |
570 | 594 | ||
571 | /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ | 595 | /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ |
572 | cr0 = read_cr0() | X86_CR0_CD; | 596 | cr0 = read_cr0() | X86_CR0_CD; |
573 | write_cr0(cr0); | 597 | write_cr0(cr0); |
574 | wbinvd(); | 598 | wbinvd(); |
575 | 599 | ||
576 | /* Save value of CR4 and clear Page Global Enable (bit 7) */ | 600 | /* Save value of CR4 and clear Page Global Enable (bit 7) */ |
577 | if ( cpu_has_pge ) { | 601 | if (cpu_has_pge) { |
578 | cr4 = read_cr4(); | 602 | cr4 = read_cr4(); |
579 | write_cr4(cr4 & ~X86_CR4_PGE); | 603 | write_cr4(cr4 & ~X86_CR4_PGE); |
580 | } | 604 | } |
@@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock) | |||
582 | /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ | 606 | /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ |
583 | __flush_tlb(); | 607 | __flush_tlb(); |
584 | 608 | ||
585 | /* Save MTRR state */ | 609 | /* Save MTRR state */ |
586 | rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); | 610 | rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); |
587 | 611 | ||
588 | /* Disable MTRRs, and set the default type to uncached */ | 612 | /* Disable MTRRs, and set the default type to uncached */ |
589 | mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); | 613 | mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); |
590 | } | 614 | } |
591 | 615 | ||
592 | static void post_set(void) __releases(set_atomicity_lock) | 616 | static void post_set(void) __releases(set_atomicity_lock) |
593 | { | 617 | { |
594 | /* Flush TLBs (no need to flush caches - they are disabled) */ | 618 | /* Flush TLBs (no need to flush caches - they are disabled) */ |
595 | __flush_tlb(); | 619 | __flush_tlb(); |
596 | 620 | ||
597 | /* Intel (P6) standard MTRRs */ | 621 | /* Intel (P6) standard MTRRs */ |
598 | mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); | 622 | mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); |
599 | 623 | ||
600 | /* Enable caches */ | 624 | /* Enable caches */ |
601 | write_cr0(read_cr0() & 0xbfffffff); | 625 | write_cr0(read_cr0() & 0xbfffffff); |
602 | 626 | ||
603 | /* Restore value of CR4 */ | 627 | /* Restore value of CR4 */ |
604 | if ( cpu_has_pge ) | 628 | if (cpu_has_pge) |
605 | write_cr4(cr4); | 629 | write_cr4(cr4); |
606 | spin_unlock(&set_atomicity_lock); | 630 | spin_unlock(&set_atomicity_lock); |
607 | } | 631 | } |
@@ -623,24 +647,27 @@ static void generic_set_all(void) | |||
623 | post_set(); | 647 | post_set(); |
624 | local_irq_restore(flags); | 648 | local_irq_restore(flags); |
625 | 649 | ||
626 | /* Use the atomic bitops to update the global mask */ | 650 | /* Use the atomic bitops to update the global mask */ |
627 | for (count = 0; count < sizeof mask * 8; ++count) { | 651 | for (count = 0; count < sizeof mask * 8; ++count) { |
628 | if (mask & 0x01) | 652 | if (mask & 0x01) |
629 | set_bit(count, &smp_changes_mask); | 653 | set_bit(count, &smp_changes_mask); |
630 | mask >>= 1; | 654 | mask >>= 1; |
631 | } | 655 | } |
632 | 656 | ||
633 | } | 657 | } |
634 | 658 | ||
659 | /** | ||
660 | * generic_set_mtrr - set variable MTRR register on the local CPU. | ||
661 | * | ||
662 | * @reg: The register to set. | ||
663 | * @base: The base address of the region. | ||
664 | * @size: The size of the region. If this is 0 the region is disabled. | ||
665 | * @type: The type of the region. | ||
666 | * | ||
667 | * Returns nothing. | ||
668 | */ | ||
635 | static void generic_set_mtrr(unsigned int reg, unsigned long base, | 669 | static void generic_set_mtrr(unsigned int reg, unsigned long base, |
636 | unsigned long size, mtrr_type type) | 670 | unsigned long size, mtrr_type type) |
637 | /* [SUMMARY] Set variable MTRR register on the local CPU. | ||
638 | <reg> The register to set. | ||
639 | <base> The base address of the region. | ||
640 | <size> The size of the region. If this is 0 the region is disabled. | ||
641 | <type> The type of the region. | ||
642 | [RETURNS] Nothing. | ||
643 | */ | ||
644 | { | 671 | { |
645 | unsigned long flags; | 672 | unsigned long flags; |
646 | struct mtrr_var_range *vr; | 673 | struct mtrr_var_range *vr; |
@@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, | |||
651 | prepare_set(); | 678 | prepare_set(); |
652 | 679 | ||
653 | if (size == 0) { | 680 | if (size == 0) { |
654 | /* The invalid bit is kept in the mask, so we simply clear the | 681 | /* |
655 | relevant mask register to disable a range. */ | 682 | * The invalid bit is kept in the mask, so we simply |
683 | * clear the relevant mask register to disable a range. | ||
684 | */ | ||
656 | mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); | 685 | mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); |
657 | memset(vr, 0, sizeof(struct mtrr_var_range)); | 686 | memset(vr, 0, sizeof(struct mtrr_var_range)); |
658 | } else { | 687 | } else { |
@@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, | |||
669 | local_irq_restore(flags); | 698 | local_irq_restore(flags); |
670 | } | 699 | } |
671 | 700 | ||
672 | int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) | 701 | int generic_validate_add_page(unsigned long base, unsigned long size, |
702 | unsigned int type) | ||
673 | { | 703 | { |
674 | unsigned long lbase, last; | 704 | unsigned long lbase, last; |
675 | 705 | ||
676 | /* For Intel PPro stepping <= 7, must be 4 MiB aligned | 706 | /* |
677 | and not touch 0x70000000->0x7003FFFF */ | 707 | * For Intel PPro stepping <= 7 |
708 | * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF | ||
709 | */ | ||
678 | if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && | 710 | if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && |
679 | boot_cpu_data.x86_model == 1 && | 711 | boot_cpu_data.x86_model == 1 && |
680 | boot_cpu_data.x86_mask <= 7) { | 712 | boot_cpu_data.x86_mask <= 7) { |
681 | if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { | 713 | if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { |
682 | printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); | 714 | pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); |
683 | return -EINVAL; | 715 | return -EINVAL; |
684 | } | 716 | } |
685 | if (!(base + size < 0x70000 || base > 0x7003F) && | 717 | if (!(base + size < 0x70000 || base > 0x7003F) && |
686 | (type == MTRR_TYPE_WRCOMB | 718 | (type == MTRR_TYPE_WRCOMB |
687 | || type == MTRR_TYPE_WRBACK)) { | 719 | || type == MTRR_TYPE_WRBACK)) { |
688 | printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); | 720 | pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); |
689 | return -EINVAL; | 721 | return -EINVAL; |
690 | } | 722 | } |
691 | } | 723 | } |
692 | 724 | ||
693 | /* Check upper bits of base and last are equal and lower bits are 0 | 725 | /* |
694 | for base and 1 for last */ | 726 | * Check upper bits of base and last are equal and lower bits are 0 |
727 | * for base and 1 for last | ||
728 | */ | ||
695 | last = base + size - 1; | 729 | last = base + size - 1; |
696 | for (lbase = base; !(lbase & 1) && (last & 1); | 730 | for (lbase = base; !(lbase & 1) && (last & 1); |
697 | lbase = lbase >> 1, last = last >> 1) ; | 731 | lbase = lbase >> 1, last = last >> 1) |
732 | ; | ||
698 | if (lbase != last) { | 733 | if (lbase != last) { |
699 | printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", | 734 | pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); |
700 | base, size); | ||
701 | return -EINVAL; | 735 | return -EINVAL; |
702 | } | 736 | } |
703 | return 0; | 737 | return 0; |
704 | } | 738 | } |
705 | 739 | ||
706 | |||
707 | static int generic_have_wrcomb(void) | 740 | static int generic_have_wrcomb(void) |
708 | { | 741 | { |
709 | unsigned long config, dummy; | 742 | unsigned long config, dummy; |
710 | rdmsr(MSR_MTRRcap, config, dummy); | 743 | rdmsr(MSR_MTRRcap, config, dummy); |
711 | return (config & (1 << 10)); | 744 | return config & (1 << 10); |
712 | } | 745 | } |
713 | 746 | ||
714 | int positive_have_wrcomb(void) | 747 | int positive_have_wrcomb(void) |
@@ -716,14 +749,15 @@ int positive_have_wrcomb(void) | |||
716 | return 1; | 749 | return 1; |
717 | } | 750 | } |
718 | 751 | ||
719 | /* generic structure... | 752 | /* |
753 | * Generic structure... | ||
720 | */ | 754 | */ |
721 | struct mtrr_ops generic_mtrr_ops = { | 755 | struct mtrr_ops generic_mtrr_ops = { |
722 | .use_intel_if = 1, | 756 | .use_intel_if = 1, |
723 | .set_all = generic_set_all, | 757 | .set_all = generic_set_all, |
724 | .get = generic_get_mtrr, | 758 | .get = generic_get_mtrr, |
725 | .get_free_region = generic_get_free_region, | 759 | .get_free_region = generic_get_free_region, |
726 | .set = generic_set_mtrr, | 760 | .set = generic_set_mtrr, |
727 | .validate_add_page = generic_validate_add_page, | 761 | .validate_add_page = generic_validate_add_page, |
728 | .have_wrcomb = generic_have_wrcomb, | 762 | .have_wrcomb = generic_have_wrcomb, |
729 | }; | 763 | }; |
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index fb73a52913a..f04e7252760 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c | |||
@@ -1,27 +1,28 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/proc_fs.h> | ||
3 | #include <linux/capability.h> | 1 | #include <linux/capability.h> |
4 | #include <linux/ctype.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/seq_file.h> | 2 | #include <linux/seq_file.h> |
7 | #include <asm/uaccess.h> | 3 | #include <linux/uaccess.h> |
4 | #include <linux/proc_fs.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/ctype.h> | ||
7 | #include <linux/init.h> | ||
8 | 8 | ||
9 | #define LINE_SIZE 80 | 9 | #define LINE_SIZE 80 |
10 | 10 | ||
11 | #include <asm/mtrr.h> | 11 | #include <asm/mtrr.h> |
12 | |||
12 | #include "mtrr.h" | 13 | #include "mtrr.h" |
13 | 14 | ||
14 | #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) | 15 | #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) |
15 | 16 | ||
16 | static const char *const mtrr_strings[MTRR_NUM_TYPES] = | 17 | static const char *const mtrr_strings[MTRR_NUM_TYPES] = |
17 | { | 18 | { |
18 | "uncachable", /* 0 */ | 19 | "uncachable", /* 0 */ |
19 | "write-combining", /* 1 */ | 20 | "write-combining", /* 1 */ |
20 | "?", /* 2 */ | 21 | "?", /* 2 */ |
21 | "?", /* 3 */ | 22 | "?", /* 3 */ |
22 | "write-through", /* 4 */ | 23 | "write-through", /* 4 */ |
23 | "write-protect", /* 5 */ | 24 | "write-protect", /* 5 */ |
24 | "write-back", /* 6 */ | 25 | "write-back", /* 6 */ |
25 | }; | 26 | }; |
26 | 27 | ||
27 | const char *mtrr_attrib_to_str(int x) | 28 | const char *mtrr_attrib_to_str(int x) |
@@ -35,8 +36,8 @@ static int | |||
35 | mtrr_file_add(unsigned long base, unsigned long size, | 36 | mtrr_file_add(unsigned long base, unsigned long size, |
36 | unsigned int type, bool increment, struct file *file, int page) | 37 | unsigned int type, bool increment, struct file *file, int page) |
37 | { | 38 | { |
39 | unsigned int *fcount = FILE_FCOUNT(file); | ||
38 | int reg, max; | 40 | int reg, max; |
39 | unsigned int *fcount = FILE_FCOUNT(file); | ||
40 | 41 | ||
41 | max = num_var_ranges; | 42 | max = num_var_ranges; |
42 | if (fcount == NULL) { | 43 | if (fcount == NULL) { |
@@ -61,8 +62,8 @@ static int | |||
61 | mtrr_file_del(unsigned long base, unsigned long size, | 62 | mtrr_file_del(unsigned long base, unsigned long size, |
62 | struct file *file, int page) | 63 | struct file *file, int page) |
63 | { | 64 | { |
64 | int reg; | ||
65 | unsigned int *fcount = FILE_FCOUNT(file); | 65 | unsigned int *fcount = FILE_FCOUNT(file); |
66 | int reg; | ||
66 | 67 | ||
67 | if (!page) { | 68 | if (!page) { |
68 | if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) | 69 | if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) |
@@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size, | |||
81 | return reg; | 82 | return reg; |
82 | } | 83 | } |
83 | 84 | ||
84 | /* RED-PEN: seq_file can seek now. this is ignored. */ | 85 | /* |
86 | * seq_file can seek but we ignore it. | ||
87 | * | ||
88 | * Format of control line: | ||
89 | * "base=%Lx size=%Lx type=%s" or "disable=%d" | ||
90 | */ | ||
85 | static ssize_t | 91 | static ssize_t |
86 | mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) | 92 | mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) |
87 | /* Format of control line: | ||
88 | "base=%Lx size=%Lx type=%s" OR: | ||
89 | "disable=%d" | ||
90 | */ | ||
91 | { | 93 | { |
92 | int i, err; | 94 | int i, err; |
93 | unsigned long reg; | 95 | unsigned long reg; |
@@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) | |||
100 | return -EPERM; | 102 | return -EPERM; |
101 | if (!len) | 103 | if (!len) |
102 | return -EINVAL; | 104 | return -EINVAL; |
105 | |||
103 | memset(line, 0, LINE_SIZE); | 106 | memset(line, 0, LINE_SIZE); |
104 | if (len > LINE_SIZE) | 107 | if (len > LINE_SIZE) |
105 | len = LINE_SIZE; | 108 | len = LINE_SIZE; |
106 | if (copy_from_user(line, buf, len - 1)) | 109 | if (copy_from_user(line, buf, len - 1)) |
107 | return -EFAULT; | 110 | return -EFAULT; |
111 | |||
108 | linelen = strlen(line); | 112 | linelen = strlen(line); |
109 | ptr = line + linelen - 1; | 113 | ptr = line + linelen - 1; |
110 | if (linelen && *ptr == '\n') | 114 | if (linelen && *ptr == '\n') |
111 | *ptr = '\0'; | 115 | *ptr = '\0'; |
116 | |||
112 | if (!strncmp(line, "disable=", 8)) { | 117 | if (!strncmp(line, "disable=", 8)) { |
113 | reg = simple_strtoul(line + 8, &ptr, 0); | 118 | reg = simple_strtoul(line + 8, &ptr, 0); |
114 | err = mtrr_del_page(reg, 0, 0); | 119 | err = mtrr_del_page(reg, 0, 0); |
@@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) | |||
116 | return err; | 121 | return err; |
117 | return len; | 122 | return len; |
118 | } | 123 | } |
124 | |||
119 | if (strncmp(line, "base=", 5)) | 125 | if (strncmp(line, "base=", 5)) |
120 | return -EINVAL; | 126 | return -EINVAL; |
127 | |||
121 | base = simple_strtoull(line + 5, &ptr, 0); | 128 | base = simple_strtoull(line + 5, &ptr, 0); |
122 | for (; isspace(*ptr); ++ptr) ; | 129 | while (isspace(*ptr)) |
130 | ptr++; | ||
131 | |||
123 | if (strncmp(ptr, "size=", 5)) | 132 | if (strncmp(ptr, "size=", 5)) |
124 | return -EINVAL; | 133 | return -EINVAL; |
134 | |||
125 | size = simple_strtoull(ptr + 5, &ptr, 0); | 135 | size = simple_strtoull(ptr + 5, &ptr, 0); |
126 | if ((base & 0xfff) || (size & 0xfff)) | 136 | if ((base & 0xfff) || (size & 0xfff)) |
127 | return -EINVAL; | 137 | return -EINVAL; |
128 | for (; isspace(*ptr); ++ptr) ; | 138 | while (isspace(*ptr)) |
139 | ptr++; | ||
140 | |||
129 | if (strncmp(ptr, "type=", 5)) | 141 | if (strncmp(ptr, "type=", 5)) |
130 | return -EINVAL; | 142 | return -EINVAL; |
131 | ptr += 5; | 143 | ptr += 5; |
132 | for (; isspace(*ptr); ++ptr) ; | 144 | while (isspace(*ptr)) |
145 | ptr++; | ||
146 | |||
133 | for (i = 0; i < MTRR_NUM_TYPES; ++i) { | 147 | for (i = 0; i < MTRR_NUM_TYPES; ++i) { |
134 | if (strcmp(ptr, mtrr_strings[i])) | 148 | if (strcmp(ptr, mtrr_strings[i])) |
135 | continue; | 149 | continue; |
136 | base >>= PAGE_SHIFT; | 150 | base >>= PAGE_SHIFT; |
137 | size >>= PAGE_SHIFT; | 151 | size >>= PAGE_SHIFT; |
138 | err = | 152 | err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); |
139 | mtrr_add_page((unsigned long) base, (unsigned long) size, i, | ||
140 | true); | ||
141 | if (err < 0) | 153 | if (err < 0) |
142 | return err; | 154 | return err; |
143 | return len; | 155 | return len; |
@@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) | |||
181 | case MTRRIOC32_SET_PAGE_ENTRY: | 193 | case MTRRIOC32_SET_PAGE_ENTRY: |
182 | case MTRRIOC32_DEL_PAGE_ENTRY: | 194 | case MTRRIOC32_DEL_PAGE_ENTRY: |
183 | case MTRRIOC32_KILL_PAGE_ENTRY: { | 195 | case MTRRIOC32_KILL_PAGE_ENTRY: { |
184 | struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; | 196 | struct mtrr_sentry32 __user *s32; |
197 | |||
198 | s32 = (struct mtrr_sentry32 __user *)__arg; | ||
185 | err = get_user(sentry.base, &s32->base); | 199 | err = get_user(sentry.base, &s32->base); |
186 | err |= get_user(sentry.size, &s32->size); | 200 | err |= get_user(sentry.size, &s32->size); |
187 | err |= get_user(sentry.type, &s32->type); | 201 | err |= get_user(sentry.type, &s32->type); |
@@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) | |||
191 | } | 205 | } |
192 | case MTRRIOC32_GET_ENTRY: | 206 | case MTRRIOC32_GET_ENTRY: |
193 | case MTRRIOC32_GET_PAGE_ENTRY: { | 207 | case MTRRIOC32_GET_PAGE_ENTRY: { |
194 | struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; | 208 | struct mtrr_gentry32 __user *g32; |
209 | |||
210 | g32 = (struct mtrr_gentry32 __user *)__arg; | ||
195 | err = get_user(gentry.regnum, &g32->regnum); | 211 | err = get_user(gentry.regnum, &g32->regnum); |
196 | err |= get_user(gentry.base, &g32->base); | 212 | err |= get_user(gentry.base, &g32->base); |
197 | err |= get_user(gentry.size, &g32->size); | 213 | err |= get_user(gentry.size, &g32->size); |
@@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) | |||
314 | if (err) | 330 | if (err) |
315 | return err; | 331 | return err; |
316 | 332 | ||
317 | switch(cmd) { | 333 | switch (cmd) { |
318 | case MTRRIOC_GET_ENTRY: | 334 | case MTRRIOC_GET_ENTRY: |
319 | case MTRRIOC_GET_PAGE_ENTRY: | 335 | case MTRRIOC_GET_PAGE_ENTRY: |
320 | if (copy_to_user(arg, &gentry, sizeof gentry)) | 336 | if (copy_to_user(arg, &gentry, sizeof gentry)) |
@@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) | |||
323 | #ifdef CONFIG_COMPAT | 339 | #ifdef CONFIG_COMPAT |
324 | case MTRRIOC32_GET_ENTRY: | 340 | case MTRRIOC32_GET_ENTRY: |
325 | case MTRRIOC32_GET_PAGE_ENTRY: { | 341 | case MTRRIOC32_GET_PAGE_ENTRY: { |
326 | struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; | 342 | struct mtrr_gentry32 __user *g32; |
343 | |||
344 | g32 = (struct mtrr_gentry32 __user *)__arg; | ||
327 | err = put_user(gentry.base, &g32->base); | 345 | err = put_user(gentry.base, &g32->base); |
328 | err |= put_user(gentry.size, &g32->size); | 346 | err |= put_user(gentry.size, &g32->size); |
329 | err |= put_user(gentry.regnum, &g32->regnum); | 347 | err |= put_user(gentry.regnum, &g32->regnum); |
@@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) | |||
335 | return err; | 353 | return err; |
336 | } | 354 | } |
337 | 355 | ||
338 | static int | 356 | static int mtrr_close(struct inode *ino, struct file *file) |
339 | mtrr_close(struct inode *ino, struct file *file) | ||
340 | { | 357 | { |
341 | int i, max; | ||
342 | unsigned int *fcount = FILE_FCOUNT(file); | 358 | unsigned int *fcount = FILE_FCOUNT(file); |
359 | int i, max; | ||
343 | 360 | ||
344 | if (fcount != NULL) { | 361 | if (fcount != NULL) { |
345 | max = num_var_ranges; | 362 | max = num_var_ranges; |
@@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset); | |||
359 | 376 | ||
360 | static int mtrr_open(struct inode *inode, struct file *file) | 377 | static int mtrr_open(struct inode *inode, struct file *file) |
361 | { | 378 | { |
362 | if (!mtrr_if) | 379 | if (!mtrr_if) |
363 | return -EIO; | 380 | return -EIO; |
364 | if (!mtrr_if->get) | 381 | if (!mtrr_if->get) |
365 | return -ENXIO; | 382 | return -ENXIO; |
366 | return single_open(file, mtrr_seq_show, NULL); | 383 | return single_open(file, mtrr_seq_show, NULL); |
367 | } | 384 | } |
368 | 385 | ||
369 | static const struct file_operations mtrr_fops = { | 386 | static const struct file_operations mtrr_fops = { |
370 | .owner = THIS_MODULE, | 387 | .owner = THIS_MODULE, |
371 | .open = mtrr_open, | 388 | .open = mtrr_open, |
372 | .read = seq_read, | 389 | .read = seq_read, |
373 | .llseek = seq_lseek, | 390 | .llseek = seq_lseek, |
374 | .write = mtrr_write, | 391 | .write = mtrr_write, |
375 | .unlocked_ioctl = mtrr_ioctl, | 392 | .unlocked_ioctl = mtrr_ioctl, |
376 | .compat_ioctl = mtrr_ioctl, | 393 | .compat_ioctl = mtrr_ioctl, |
377 | .release = mtrr_close, | 394 | .release = mtrr_close, |
378 | }; | 395 | }; |
379 | 396 | ||
380 | static int mtrr_seq_show(struct seq_file *seq, void *offset) | 397 | static int mtrr_seq_show(struct seq_file *seq, void *offset) |
@@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) | |||
388 | max = num_var_ranges; | 405 | max = num_var_ranges; |
389 | for (i = 0; i < max; i++) { | 406 | for (i = 0; i < max; i++) { |
390 | mtrr_if->get(i, &base, &size, &type); | 407 | mtrr_if->get(i, &base, &size, &type); |
391 | if (size == 0) | 408 | if (size == 0) { |
392 | mtrr_usage_table[i] = 0; | 409 | mtrr_usage_table[i] = 0; |
393 | else { | 410 | continue; |
394 | if (size < (0x100000 >> PAGE_SHIFT)) { | ||
395 | /* less than 1MB */ | ||
396 | factor = 'K'; | ||
397 | size <<= PAGE_SHIFT - 10; | ||
398 | } else { | ||
399 | factor = 'M'; | ||
400 | size >>= 20 - PAGE_SHIFT; | ||
401 | } | ||
402 | /* RED-PEN: base can be > 32bit */ | ||
403 | len += seq_printf(seq, | ||
404 | "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", | ||
405 | i, base, base >> (20 - PAGE_SHIFT), size, factor, | ||
406 | mtrr_usage_table[i], mtrr_attrib_to_str(type)); | ||
407 | } | 411 | } |
412 | if (size < (0x100000 >> PAGE_SHIFT)) { | ||
413 | /* less than 1MB */ | ||
414 | factor = 'K'; | ||
415 | size <<= PAGE_SHIFT - 10; | ||
416 | } else { | ||
417 | factor = 'M'; | ||
418 | size >>= 20 - PAGE_SHIFT; | ||
419 | } | ||
420 | /* Base can be > 32bit */ | ||
421 | len += seq_printf(seq, "reg%02i: base=0x%06lx000 " | ||
422 | "(%5luMB), size=%5lu%cB, count=%d: %s\n", | ||
423 | i, base, base >> (20 - PAGE_SHIFT), size, | ||
424 | factor, mtrr_usage_table[i], | ||
425 | mtrr_attrib_to_str(type)); | ||
408 | } | 426 | } |
409 | return 0; | 427 | return 0; |
410 | } | 428 | } |
@@ -422,6 +440,5 @@ static int __init mtrr_if_init(void) | |||
422 | proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); | 440 | proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); |
423 | return 0; | 441 | return 0; |
424 | } | 442 | } |
425 | |||
426 | arch_initcall(mtrr_if_init); | 443 | arch_initcall(mtrr_if_init); |
427 | #endif /* CONFIG_PROC_FS */ | 444 | #endif /* CONFIG_PROC_FS */ |
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8fc248b5aea..84e83de5457 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -25,43 +25,49 @@ | |||
25 | Operating System Writer's Guide" (Intel document number 242692), | 25 | Operating System Writer's Guide" (Intel document number 242692), |
26 | section 11.11.7 | 26 | section 11.11.7 |
27 | 27 | ||
28 | This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> | 28 | This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> |
29 | on 6-7 March 2002. | 29 | on 6-7 March 2002. |
30 | Source: Intel Architecture Software Developers Manual, Volume 3: | 30 | Source: Intel Architecture Software Developers Manual, Volume 3: |
31 | System Programming Guide; Section 9.11. (1997 edition - PPro). | 31 | System Programming Guide; Section 9.11. (1997 edition - PPro). |
32 | */ | 32 | */ |
33 | 33 | ||
34 | #define DEBUG | ||
35 | |||
36 | #include <linux/types.h> /* FIXME: kvm_para.h needs this */ | ||
37 | |||
38 | #include <linux/kvm_para.h> | ||
39 | #include <linux/uaccess.h> | ||
34 | #include <linux/module.h> | 40 | #include <linux/module.h> |
41 | #include <linux/mutex.h> | ||
35 | #include <linux/init.h> | 42 | #include <linux/init.h> |
43 | #include <linux/sort.h> | ||
44 | #include <linux/cpu.h> | ||
36 | #include <linux/pci.h> | 45 | #include <linux/pci.h> |
37 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
38 | #include <linux/cpu.h> | ||
39 | #include <linux/mutex.h> | ||
40 | #include <linux/sort.h> | ||
41 | 47 | ||
48 | #include <asm/processor.h> | ||
42 | #include <asm/e820.h> | 49 | #include <asm/e820.h> |
43 | #include <asm/mtrr.h> | 50 | #include <asm/mtrr.h> |
44 | #include <asm/uaccess.h> | ||
45 | #include <asm/processor.h> | ||
46 | #include <asm/msr.h> | 51 | #include <asm/msr.h> |
47 | #include <asm/kvm_para.h> | 52 | |
48 | #include "mtrr.h" | 53 | #include "mtrr.h" |
49 | 54 | ||
50 | u32 num_var_ranges = 0; | 55 | u32 num_var_ranges; |
51 | 56 | ||
52 | unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; | 57 | unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; |
53 | static DEFINE_MUTEX(mtrr_mutex); | 58 | static DEFINE_MUTEX(mtrr_mutex); |
54 | 59 | ||
55 | u64 size_or_mask, size_and_mask; | 60 | u64 size_or_mask, size_and_mask; |
61 | static bool mtrr_aps_delayed_init; | ||
56 | 62 | ||
57 | static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; | 63 | static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; |
58 | 64 | ||
59 | struct mtrr_ops * mtrr_if = NULL; | 65 | struct mtrr_ops *mtrr_if; |
60 | 66 | ||
61 | static void set_mtrr(unsigned int reg, unsigned long base, | 67 | static void set_mtrr(unsigned int reg, unsigned long base, |
62 | unsigned long size, mtrr_type type); | 68 | unsigned long size, mtrr_type type); |
63 | 69 | ||
64 | void set_mtrr_ops(struct mtrr_ops * ops) | 70 | void set_mtrr_ops(struct mtrr_ops *ops) |
65 | { | 71 | { |
66 | if (ops->vendor && ops->vendor < X86_VENDOR_NUM) | 72 | if (ops->vendor && ops->vendor < X86_VENDOR_NUM) |
67 | mtrr_ops[ops->vendor] = ops; | 73 | mtrr_ops[ops->vendor] = ops; |
@@ -72,30 +78,36 @@ static int have_wrcomb(void) | |||
72 | { | 78 | { |
73 | struct pci_dev *dev; | 79 | struct pci_dev *dev; |
74 | u8 rev; | 80 | u8 rev; |
75 | 81 | ||
76 | if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { | 82 | dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); |
77 | /* ServerWorks LE chipsets < rev 6 have problems with write-combining | 83 | if (dev != NULL) { |
78 | Don't allow it and leave room for other chipsets to be tagged */ | 84 | /* |
85 | * ServerWorks LE chipsets < rev 6 have problems with | ||
86 | * write-combining. Don't allow it and leave room for other | ||
87 | * chipsets to be tagged | ||
88 | */ | ||
79 | if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && | 89 | if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && |
80 | dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { | 90 | dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { |
81 | pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); | 91 | pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); |
82 | if (rev <= 5) { | 92 | if (rev <= 5) { |
83 | printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); | 93 | pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); |
84 | pci_dev_put(dev); | 94 | pci_dev_put(dev); |
85 | return 0; | 95 | return 0; |
86 | } | 96 | } |
87 | } | 97 | } |
88 | /* Intel 450NX errata # 23. Non ascending cacheline evictions to | 98 | /* |
89 | write combining memory may resulting in data corruption */ | 99 | * Intel 450NX errata # 23. Non ascending cacheline evictions to |
100 | * write combining memory may result in data corruption | ||
101 | */ | ||
90 | if (dev->vendor == PCI_VENDOR_ID_INTEL && | 102 | if (dev->vendor == PCI_VENDOR_ID_INTEL && |
91 | dev->device == PCI_DEVICE_ID_INTEL_82451NX) { | 103 | dev->device == PCI_DEVICE_ID_INTEL_82451NX) { |
92 | printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); | 104 | pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); |
93 | pci_dev_put(dev); | 105 | pci_dev_put(dev); |
94 | return 0; | 106 | return 0; |
95 | } | 107 | } |
96 | pci_dev_put(dev); | 108 | pci_dev_put(dev); |
97 | } | 109 | } |
98 | return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); | 110 | return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; |
99 | } | 111 | } |
100 | 112 | ||
101 | /* This function returns the number of variable MTRRs */ | 113 | /* This function returns the number of variable MTRRs */ |
@@ -103,12 +115,13 @@ static void __init set_num_var_ranges(void) | |||
103 | { | 115 | { |
104 | unsigned long config = 0, dummy; | 116 | unsigned long config = 0, dummy; |
105 | 117 | ||
106 | if (use_intel()) { | 118 | if (use_intel()) |
107 | rdmsr(MSR_MTRRcap, config, dummy); | 119 | rdmsr(MSR_MTRRcap, config, dummy); |
108 | } else if (is_cpu(AMD)) | 120 | else if (is_cpu(AMD)) |
109 | config = 2; | 121 | config = 2; |
110 | else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) | 122 | else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) |
111 | config = 8; | 123 | config = 8; |
124 | |||
112 | num_var_ranges = config & 0xff; | 125 | num_var_ranges = config & 0xff; |
113 | } | 126 | } |
114 | 127 | ||
@@ -130,10 +143,12 @@ struct set_mtrr_data { | |||
130 | mtrr_type smp_type; | 143 | mtrr_type smp_type; |
131 | }; | 144 | }; |
132 | 145 | ||
146 | /** | ||
147 | * ipi_handler - Synchronisation handler. Executed by "other" CPUs. | ||
148 | * | ||
149 | * Returns nothing. | ||
150 | */ | ||
133 | static void ipi_handler(void *info) | 151 | static void ipi_handler(void *info) |
134 | /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. | ||
135 | [RETURNS] Nothing. | ||
136 | */ | ||
137 | { | 152 | { |
138 | #ifdef CONFIG_SMP | 153 | #ifdef CONFIG_SMP |
139 | struct set_mtrr_data *data = info; | 154 | struct set_mtrr_data *data = info; |
@@ -142,18 +157,22 @@ static void ipi_handler(void *info) | |||
142 | local_irq_save(flags); | 157 | local_irq_save(flags); |
143 | 158 | ||
144 | atomic_dec(&data->count); | 159 | atomic_dec(&data->count); |
145 | while(!atomic_read(&data->gate)) | 160 | while (!atomic_read(&data->gate)) |
146 | cpu_relax(); | 161 | cpu_relax(); |
147 | 162 | ||
148 | /* The master has cleared me to execute */ | 163 | /* The master has cleared me to execute */ |
149 | if (data->smp_reg != ~0U) | 164 | if (data->smp_reg != ~0U) { |
150 | mtrr_if->set(data->smp_reg, data->smp_base, | 165 | mtrr_if->set(data->smp_reg, data->smp_base, |
151 | data->smp_size, data->smp_type); | 166 | data->smp_size, data->smp_type); |
152 | else | 167 | } else if (mtrr_aps_delayed_init) { |
168 | /* | ||
169 | * Initialize the MTRRs in addition to the synchronisation. | ||
170 | */ | ||
153 | mtrr_if->set_all(); | 171 | mtrr_if->set_all(); |
172 | } | ||
154 | 173 | ||
155 | atomic_dec(&data->count); | 174 | atomic_dec(&data->count); |
156 | while(atomic_read(&data->gate)) | 175 | while (atomic_read(&data->gate)) |
157 | cpu_relax(); | 176 | cpu_relax(); |
158 | 177 | ||
159 | atomic_dec(&data->count); | 178 | atomic_dec(&data->count); |
@@ -161,7 +180,8 @@ static void ipi_handler(void *info) | |||
161 | #endif | 180 | #endif |
162 | } | 181 | } |
163 | 182 | ||
164 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) { | 183 | static inline int types_compatible(mtrr_type type1, mtrr_type type2) |
184 | { | ||
165 | return type1 == MTRR_TYPE_UNCACHABLE || | 185 | return type1 == MTRR_TYPE_UNCACHABLE || |
166 | type2 == MTRR_TYPE_UNCACHABLE || | 186 | type2 == MTRR_TYPE_UNCACHABLE || |
167 | (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || | 187 | (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || |
@@ -176,10 +196,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { | |||
176 | * @type: mtrr type | 196 | * @type: mtrr type |
177 | * | 197 | * |
178 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: | 198 | * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: |
179 | * | 199 | * |
180 | * 1. Send IPI to do the following: | 200 | * 1. Send IPI to do the following: |
181 | * 2. Disable Interrupts | 201 | * 2. Disable Interrupts |
182 | * 3. Wait for all procs to do so | 202 | * 3. Wait for all procs to do so |
183 | * 4. Enter no-fill cache mode | 203 | * 4. Enter no-fill cache mode |
184 | * 5. Flush caches | 204 | * 5. Flush caches |
185 | * 6. Clear PGE bit | 205 | * 6. Clear PGE bit |
@@ -189,26 +209,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { | |||
189 | * 10. Enable all range registers | 209 | * 10. Enable all range registers |
190 | * 11. Flush all TLBs and caches again | 210 | * 11. Flush all TLBs and caches again |
191 | * 12. Enter normal cache mode and reenable caching | 211 | * 12. Enter normal cache mode and reenable caching |
192 | * 13. Set PGE | 212 | * 13. Set PGE |
193 | * 14. Wait for buddies to catch up | 213 | * 14. Wait for buddies to catch up |
194 | * 15. Enable interrupts. | 214 | * 15. Enable interrupts. |
195 | * | 215 | * |
196 | * What does that mean for us? Well, first we set data.count to the number | 216 | * What does that mean for us? Well, first we set data.count to the number |
197 | * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait | 217 | * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait |
198 | * until it hits 0 and proceed. We set the data.gate flag and reset data.count. | 218 | * until it hits 0 and proceed. We set the data.gate flag and reset data.count. |
199 | * Meanwhile, they are waiting for that flag to be set. Once it's set, each | 219 | * Meanwhile, they are waiting for that flag to be set. Once it's set, each |
200 | * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it | 220 | * CPU goes through the transition of updating MTRRs. |
201 | * differently, so we call mtrr_if->set() callback and let them take care of it. | 221 | * The CPU vendors may each do it differently, |
202 | * When they're done, they again decrement data->count and wait for data.gate to | 222 | * so we call mtrr_if->set() callback and let them take care of it. |
203 | * be reset. | 223 | * When they're done, they again decrement data->count and wait for data.gate |
204 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. | 224 | * to be reset. |
225 | * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag | ||
205 | * Everyone then enables interrupts and we all continue on. | 226 | * Everyone then enables interrupts and we all continue on. |
206 | * | 227 | * |
207 | * Note that the mechanism is the same for UP systems, too; all the SMP stuff | 228 | * Note that the mechanism is the same for UP systems, too; all the SMP stuff |
208 | * becomes nops. | 229 | * becomes nops. |
209 | */ | 230 | */ |
210 | static void set_mtrr(unsigned int reg, unsigned long base, | 231 | static void |
211 | unsigned long size, mtrr_type type) | 232 | set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) |
212 | { | 233 | { |
213 | struct set_mtrr_data data; | 234 | struct set_mtrr_data data; |
214 | unsigned long flags; | 235 | unsigned long flags; |
@@ -218,121 +239,124 @@ static void set_mtrr(unsigned int reg, unsigned long base, | |||
218 | data.smp_size = size; | 239 | data.smp_size = size; |
219 | data.smp_type = type; | 240 | data.smp_type = type; |
220 | atomic_set(&data.count, num_booting_cpus() - 1); | 241 | atomic_set(&data.count, num_booting_cpus() - 1); |
221 | /* make sure data.count is visible before unleashing other CPUs */ | 242 | |
243 | /* Make sure data.count is visible before unleashing other CPUs */ | ||
222 | smp_wmb(); | 244 | smp_wmb(); |
223 | atomic_set(&data.gate,0); | 245 | atomic_set(&data.gate, 0); |
224 | 246 | ||
225 | /* Start the ball rolling on other CPUs */ | 247 | /* Start the ball rolling on other CPUs */ |
226 | if (smp_call_function(ipi_handler, &data, 0) != 0) | 248 | if (smp_call_function(ipi_handler, &data, 0) != 0) |
227 | panic("mtrr: timed out waiting for other CPUs\n"); | 249 | panic("mtrr: timed out waiting for other CPUs\n"); |
228 | 250 | ||
229 | local_irq_save(flags); | 251 | local_irq_save(flags); |
230 | 252 | ||
231 | while(atomic_read(&data.count)) | 253 | while (atomic_read(&data.count)) |
232 | cpu_relax(); | 254 | cpu_relax(); |
233 | 255 | ||
234 | /* ok, reset count and toggle gate */ | 256 | /* Ok, reset count and toggle gate */ |
235 | atomic_set(&data.count, num_booting_cpus() - 1); | 257 | atomic_set(&data.count, num_booting_cpus() - 1); |
236 | smp_wmb(); | 258 | smp_wmb(); |
237 | atomic_set(&data.gate,1); | 259 | atomic_set(&data.gate, 1); |
238 | 260 | ||
239 | /* do our MTRR business */ | 261 | /* Do our MTRR business */ |
240 | 262 | ||
241 | /* HACK! | 263 | /* |
264 | * HACK! | ||
242 | * We use this same function to initialize the mtrrs on boot. | 265 | * We use this same function to initialize the mtrrs on boot. |
243 | * The state of the boot cpu's mtrrs has been saved, and we want | 266 | * The state of the boot cpu's mtrrs has been saved, and we want |
244 | * to replicate across all the APs. | 267 | * to replicate across all the APs. |
245 | * If we're doing that @reg is set to something special... | 268 | * If we're doing that @reg is set to something special... |
246 | */ | 269 | */ |
247 | if (reg != ~0U) | 270 | if (reg != ~0U) |
248 | mtrr_if->set(reg,base,size,type); | 271 | mtrr_if->set(reg, base, size, type); |
272 | else if (!mtrr_aps_delayed_init) | ||
273 | mtrr_if->set_all(); | ||
249 | 274 | ||
250 | /* wait for the others */ | 275 | /* Wait for the others */ |
251 | while(atomic_read(&data.count)) | 276 | while (atomic_read(&data.count)) |
252 | cpu_relax(); | 277 | cpu_relax(); |
253 | 278 | ||
254 | atomic_set(&data.count, num_booting_cpus() - 1); | 279 | atomic_set(&data.count, num_booting_cpus() - 1); |
255 | smp_wmb(); | 280 | smp_wmb(); |
256 | atomic_set(&data.gate,0); | 281 | atomic_set(&data.gate, 0); |
257 | 282 | ||
258 | /* | 283 | /* |
259 | * Wait here for everyone to have seen the gate change | 284 | * Wait here for everyone to have seen the gate change |
260 | * So we're the last ones to touch 'data' | 285 | * So we're the last ones to touch 'data' |
261 | */ | 286 | */ |
262 | while(atomic_read(&data.count)) | 287 | while (atomic_read(&data.count)) |
263 | cpu_relax(); | 288 | cpu_relax(); |
264 | 289 | ||
265 | local_irq_restore(flags); | 290 | local_irq_restore(flags); |
266 | } | 291 | } |
267 | 292 | ||
268 | /** | 293 | /** |
269 | * mtrr_add_page - Add a memory type region | 294 | * mtrr_add_page - Add a memory type region |
270 | * @base: Physical base address of region in pages (in units of 4 kB!) | 295 | * @base: Physical base address of region in pages (in units of 4 kB!) |
271 | * @size: Physical size of region in pages (4 kB) | 296 | * @size: Physical size of region in pages (4 kB) |
272 | * @type: Type of MTRR desired | 297 | * @type: Type of MTRR desired |
273 | * @increment: If this is true do usage counting on the region | 298 | * @increment: If this is true do usage counting on the region |
274 | * | 299 | * |
275 | * Memory type region registers control the caching on newer Intel and | 300 | * Memory type region registers control the caching on newer Intel and |
276 | * non Intel processors. This function allows drivers to request an | 301 | * non Intel processors. This function allows drivers to request an |
277 | * MTRR is added. The details and hardware specifics of each processor's | 302 | * MTRR is added. The details and hardware specifics of each processor's |
278 | * implementation are hidden from the caller, but nevertheless the | 303 | * implementation are hidden from the caller, but nevertheless the |
279 | * caller should expect to need to provide a power of two size on an | 304 | * caller should expect to need to provide a power of two size on an |
280 | * equivalent power of two boundary. | 305 | * equivalent power of two boundary. |
281 | * | 306 | * |
282 | * If the region cannot be added either because all regions are in use | 307 | * If the region cannot be added either because all regions are in use |
283 | * or the CPU cannot support it a negative value is returned. On success | 308 | * or the CPU cannot support it a negative value is returned. On success |
284 | * the register number for this entry is returned, but should be treated | 309 | * the register number for this entry is returned, but should be treated |
285 | * as a cookie only. | 310 | * as a cookie only. |
286 | * | 311 | * |
287 | * On a multiprocessor machine the changes are made to all processors. | 312 | * On a multiprocessor machine the changes are made to all processors. |
288 | * This is required on x86 by the Intel processors. | 313 | * This is required on x86 by the Intel processors. |
289 | * | 314 | * |
290 | * The available types are | 315 | * The available types are |
291 | * | 316 | * |
292 | * %MTRR_TYPE_UNCACHABLE - No caching | 317 | * %MTRR_TYPE_UNCACHABLE - No caching |
293 | * | 318 | * |
294 | * %MTRR_TYPE_WRBACK - Write data back in bursts whenever | 319 | * %MTRR_TYPE_WRBACK - Write data back in bursts whenever |
295 | * | 320 | * |
296 | * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts | 321 | * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts |
297 | * | 322 | * |
298 | * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes | 323 | * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes |
299 | * | 324 | * |
300 | * BUGS: Needs a quiet flag for the cases where drivers do not mind | 325 | * BUGS: Needs a quiet flag for the cases where drivers do not mind |
301 | * failures and do not wish system log messages to be sent. | 326 | * failures and do not wish system log messages to be sent. |
302 | */ | 327 | */ |
303 | 328 | int mtrr_add_page(unsigned long base, unsigned long size, | |
304 | int mtrr_add_page(unsigned long base, unsigned long size, | ||
305 | unsigned int type, bool increment) | 329 | unsigned int type, bool increment) |
306 | { | 330 | { |
331 | unsigned long lbase, lsize; | ||
307 | int i, replace, error; | 332 | int i, replace, error; |
308 | mtrr_type ltype; | 333 | mtrr_type ltype; |
309 | unsigned long lbase, lsize; | ||
310 | 334 | ||
311 | if (!mtrr_if) | 335 | if (!mtrr_if) |
312 | return -ENXIO; | 336 | return -ENXIO; |
313 | 337 | ||
314 | if ((error = mtrr_if->validate_add_page(base,size,type))) | 338 | error = mtrr_if->validate_add_page(base, size, type); |
339 | if (error) | ||
315 | return error; | 340 | return error; |
316 | 341 | ||
317 | if (type >= MTRR_NUM_TYPES) { | 342 | if (type >= MTRR_NUM_TYPES) { |
318 | printk(KERN_WARNING "mtrr: type: %u invalid\n", type); | 343 | pr_warning("mtrr: type: %u invalid\n", type); |
319 | return -EINVAL; | 344 | return -EINVAL; |
320 | } | 345 | } |
321 | 346 | ||
322 | /* If the type is WC, check that this processor supports it */ | 347 | /* If the type is WC, check that this processor supports it */ |
323 | if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { | 348 | if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { |
324 | printk(KERN_WARNING | 349 | pr_warning("mtrr: your processor doesn't support write-combining\n"); |
325 | "mtrr: your processor doesn't support write-combining\n"); | ||
326 | return -ENOSYS; | 350 | return -ENOSYS; |
327 | } | 351 | } |
328 | 352 | ||
329 | if (!size) { | 353 | if (!size) { |
330 | printk(KERN_WARNING "mtrr: zero sized request\n"); | 354 | pr_warning("mtrr: zero sized request\n"); |
331 | return -EINVAL; | 355 | return -EINVAL; |
332 | } | 356 | } |
333 | 357 | ||
334 | if (base & size_or_mask || size & size_or_mask) { | 358 | if (base & size_or_mask || size & size_or_mask) { |
335 | printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); | 359 | pr_warning("mtrr: base or size exceeds the MTRR width\n"); |
336 | return -EINVAL; | 360 | return -EINVAL; |
337 | } | 361 | } |
338 | 362 | ||
@@ -341,36 +365,40 @@ int mtrr_add_page(unsigned long base, unsigned long size, | |||
341 | 365 | ||
342 | /* No CPU hotplug when we change MTRR entries */ | 366 | /* No CPU hotplug when we change MTRR entries */ |
343 | get_online_cpus(); | 367 | get_online_cpus(); |
344 | /* Search for existing MTRR */ | 368 | |
369 | /* Search for existing MTRR */ | ||
345 | mutex_lock(&mtrr_mutex); | 370 | mutex_lock(&mtrr_mutex); |
346 | for (i = 0; i < num_var_ranges; ++i) { | 371 | for (i = 0; i < num_var_ranges; ++i) { |
347 | mtrr_if->get(i, &lbase, &lsize, <ype); | 372 | mtrr_if->get(i, &lbase, &lsize, <ype); |
348 | if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) | 373 | if (!lsize || base > lbase + lsize - 1 || |
374 | base + size - 1 < lbase) | ||
349 | continue; | 375 | continue; |
350 | /* At this point we know there is some kind of overlap/enclosure */ | 376 | /* |
377 | * At this point we know there is some kind of | ||
378 | * overlap/enclosure | ||
379 | */ | ||
351 | if (base < lbase || base + size - 1 > lbase + lsize - 1) { | 380 | if (base < lbase || base + size - 1 > lbase + lsize - 1) { |
352 | if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { | 381 | if (base <= lbase && |
382 | base + size - 1 >= lbase + lsize - 1) { | ||
353 | /* New region encloses an existing region */ | 383 | /* New region encloses an existing region */ |
354 | if (type == ltype) { | 384 | if (type == ltype) { |
355 | replace = replace == -1 ? i : -2; | 385 | replace = replace == -1 ? i : -2; |
356 | continue; | 386 | continue; |
357 | } | 387 | } else if (types_compatible(type, ltype)) |
358 | else if (types_compatible(type, ltype)) | ||
359 | continue; | 388 | continue; |
360 | } | 389 | } |
361 | printk(KERN_WARNING | 390 | pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" |
362 | "mtrr: 0x%lx000,0x%lx000 overlaps existing" | 391 | " 0x%lx000,0x%lx000\n", base, size, lbase, |
363 | " 0x%lx000,0x%lx000\n", base, size, lbase, | 392 | lsize); |
364 | lsize); | ||
365 | goto out; | 393 | goto out; |
366 | } | 394 | } |
367 | /* New region is enclosed by an existing region */ | 395 | /* New region is enclosed by an existing region */ |
368 | if (ltype != type) { | 396 | if (ltype != type) { |
369 | if (types_compatible(type, ltype)) | 397 | if (types_compatible(type, ltype)) |
370 | continue; | 398 | continue; |
371 | printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", | 399 | pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", |
372 | base, size, mtrr_attrib_to_str(ltype), | 400 | base, size, mtrr_attrib_to_str(ltype), |
373 | mtrr_attrib_to_str(type)); | 401 | mtrr_attrib_to_str(type)); |
374 | goto out; | 402 | goto out; |
375 | } | 403 | } |
376 | if (increment) | 404 | if (increment) |
@@ -378,7 +406,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, | |||
378 | error = i; | 406 | error = i; |
379 | goto out; | 407 | goto out; |
380 | } | 408 | } |
381 | /* Search for an empty MTRR */ | 409 | /* Search for an empty MTRR */ |
382 | i = mtrr_if->get_free_region(base, size, replace); | 410 | i = mtrr_if->get_free_region(base, size, replace); |
383 | if (i >= 0) { | 411 | if (i >= 0) { |
384 | set_mtrr(i, base, size, type); | 412 | set_mtrr(i, base, size, type); |
@@ -393,8 +421,9 @@ int mtrr_add_page(unsigned long base, unsigned long size, | |||
393 | mtrr_usage_table[replace] = 0; | 421 | mtrr_usage_table[replace] = 0; |
394 | } | 422 | } |
395 | } | 423 | } |
396 | } else | 424 | } else { |
397 | printk(KERN_INFO "mtrr: no more MTRRs available\n"); | 425 | pr_info("mtrr: no more MTRRs available\n"); |
426 | } | ||
398 | error = i; | 427 | error = i; |
399 | out: | 428 | out: |
400 | mutex_unlock(&mtrr_mutex); | 429 | mutex_unlock(&mtrr_mutex); |
@@ -405,10 +434,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, | |||
405 | static int mtrr_check(unsigned long base, unsigned long size) | 434 | static int mtrr_check(unsigned long base, unsigned long size) |
406 | { | 435 | { |
407 | if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { | 436 | if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { |
408 | printk(KERN_WARNING | 437 | pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); |
409 | "mtrr: size and base must be multiples of 4 kiB\n"); | 438 | pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); |
410 | printk(KERN_DEBUG | ||
411 | "mtrr: size: 0x%lx base: 0x%lx\n", size, base); | ||
412 | dump_stack(); | 439 | dump_stack(); |
413 | return -1; | 440 | return -1; |
414 | } | 441 | } |
@@ -416,66 +443,64 @@ static int mtrr_check(unsigned long base, unsigned long size) | |||
416 | } | 443 | } |
417 | 444 | ||
418 | /** | 445 | /** |
419 | * mtrr_add - Add a memory type region | 446 | * mtrr_add - Add a memory type region |
420 | * @base: Physical base address of region | 447 | * @base: Physical base address of region |
421 | * @size: Physical size of region | 448 | * @size: Physical size of region |
422 | * @type: Type of MTRR desired | 449 | * @type: Type of MTRR desired |
423 | * @increment: If this is true do usage counting on the region | 450 | * @increment: If this is true do usage counting on the region |
424 | * | 451 | * |
425 | * Memory type region registers control the caching on newer Intel and | 452 | * Memory type region registers control the caching on newer Intel and |
426 | * non Intel processors. This function allows drivers to request an | 453 | * non Intel processors. This function allows drivers to request an |
427 | * MTRR is added. The details and hardware specifics of each processor's | 454 | * MTRR is added. The details and hardware specifics of each processor's |
428 | * implementation are hidden from the caller, but nevertheless the | 455 | * implementation are hidden from the caller, but nevertheless the |
429 | * caller should expect to need to provide a power of two size on an | 456 | * caller should expect to need to provide a power of two size on an |
430 | * equivalent power of two boundary. | 457 | * equivalent power of two boundary. |
431 | * | 458 | * |
432 | * If the region cannot be added either because all regions are in use | 459 | * If the region cannot be added either because all regions are in use |
433 | * or the CPU cannot support it a negative value is returned. On success | 460 | * or the CPU cannot support it a negative value is returned. On success |
434 | * the register number for this entry is returned, but should be treated | 461 | * the register number for this entry is returned, but should be treated |
435 | * as a cookie only. | 462 | * as a cookie only. |
436 | * | 463 | * |
437 | * On a multiprocessor machine the changes are made to all processors. | 464 | * On a multiprocessor machine the changes are made to all processors. |
438 | * This is required on x86 by the Intel processors. | 465 | * This is required on x86 by the Intel processors. |
439 | * | 466 | * |
440 | * The available types are | 467 | * The available types are |
441 | * | 468 | * |
442 | * %MTRR_TYPE_UNCACHABLE - No caching | 469 | * %MTRR_TYPE_UNCACHABLE - No caching |
443 | * | 470 | * |
444 | * %MTRR_TYPE_WRBACK - Write data back in bursts whenever | 471 | * %MTRR_TYPE_WRBACK - Write data back in bursts whenever |
445 | * | 472 | * |
446 | * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts | 473 | * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts |
447 | * | 474 | * |
448 | * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes | 475 | * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes |
449 | * | 476 | * |
450 | * BUGS: Needs a quiet flag for the cases where drivers do not mind | 477 | * BUGS: Needs a quiet flag for the cases where drivers do not mind |
451 | * failures and do not wish system log messages to be sent. | 478 | * failures and do not wish system log messages to be sent. |
452 | */ | 479 | */ |
453 | 480 | int mtrr_add(unsigned long base, unsigned long size, unsigned int type, | |
454 | int | 481 | bool increment) |
455 | mtrr_add(unsigned long base, unsigned long size, unsigned int type, | ||
456 | bool increment) | ||
457 | { | 482 | { |
458 | if (mtrr_check(base, size)) | 483 | if (mtrr_check(base, size)) |
459 | return -EINVAL; | 484 | return -EINVAL; |
460 | return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, | 485 | return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, |
461 | increment); | 486 | increment); |
462 | } | 487 | } |
488 | EXPORT_SYMBOL(mtrr_add); | ||
463 | 489 | ||
464 | /** | 490 | /** |
465 | * mtrr_del_page - delete a memory type region | 491 | * mtrr_del_page - delete a memory type region |
466 | * @reg: Register returned by mtrr_add | 492 | * @reg: Register returned by mtrr_add |
467 | * @base: Physical base address | 493 | * @base: Physical base address |
468 | * @size: Size of region | 494 | * @size: Size of region |
469 | * | 495 | * |
470 | * If register is supplied then base and size are ignored. This is | 496 | * If register is supplied then base and size are ignored. This is |
471 | * how drivers should call it. | 497 | * how drivers should call it. |
472 | * | 498 | * |
473 | * Releases an MTRR region. If the usage count drops to zero the | 499 | * Releases an MTRR region. If the usage count drops to zero the |
474 | * register is freed and the region returns to default state. | 500 | * register is freed and the region returns to default state. |
475 | * On success the register is returned, on failure a negative error | 501 | * On success the register is returned, on failure a negative error |
476 | * code. | 502 | * code. |
477 | */ | 503 | */ |
478 | |||
479 | int mtrr_del_page(int reg, unsigned long base, unsigned long size) | 504 | int mtrr_del_page(int reg, unsigned long base, unsigned long size) |
480 | { | 505 | { |
481 | int i, max; | 506 | int i, max; |
@@ -500,22 +525,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) | |||
500 | } | 525 | } |
501 | } | 526 | } |
502 | if (reg < 0) { | 527 | if (reg < 0) { |
503 | printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, | 528 | pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", |
504 | size); | 529 | base, size); |
505 | goto out; | 530 | goto out; |
506 | } | 531 | } |
507 | } | 532 | } |
508 | if (reg >= max) { | 533 | if (reg >= max) { |
509 | printk(KERN_WARNING "mtrr: register: %d too big\n", reg); | 534 | pr_warning("mtrr: register: %d too big\n", reg); |
510 | goto out; | 535 | goto out; |
511 | } | 536 | } |
512 | mtrr_if->get(reg, &lbase, &lsize, <ype); | 537 | mtrr_if->get(reg, &lbase, &lsize, <ype); |
513 | if (lsize < 1) { | 538 | if (lsize < 1) { |
514 | printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); | 539 | pr_warning("mtrr: MTRR %d not used\n", reg); |
515 | goto out; | 540 | goto out; |
516 | } | 541 | } |
517 | if (mtrr_usage_table[reg] < 1) { | 542 | if (mtrr_usage_table[reg] < 1) { |
518 | printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); | 543 | pr_warning("mtrr: reg: %d has count=0\n", reg); |
519 | goto out; | 544 | goto out; |
520 | } | 545 | } |
521 | if (--mtrr_usage_table[reg] < 1) | 546 | if (--mtrr_usage_table[reg] < 1) |
@@ -526,33 +551,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) | |||
526 | put_online_cpus(); | 551 | put_online_cpus(); |
527 | return error; | 552 | return error; |
528 | } | 553 | } |
554 | |||
529 | /** | 555 | /** |
530 | * mtrr_del - delete a memory type region | 556 | * mtrr_del - delete a memory type region |
531 | * @reg: Register returned by mtrr_add | 557 | * @reg: Register returned by mtrr_add |
532 | * @base: Physical base address | 558 | * @base: Physical base address |
533 | * @size: Size of region | 559 | * @size: Size of region |
534 | * | 560 | * |
535 | * If register is supplied then base and size are ignored. This is | 561 | * If register is supplied then base and size are ignored. This is |
536 | * how drivers should call it. | 562 | * how drivers should call it. |
537 | * | 563 | * |
538 | * Releases an MTRR region. If the usage count drops to zero the | 564 | * Releases an MTRR region. If the usage count drops to zero the |
539 | * register is freed and the region returns to default state. | 565 | * register is freed and the region returns to default state. |
540 | * On success the register is returned, on failure a negative error | 566 | * On success the register is returned, on failure a negative error |
541 | * code. | 567 | * code. |
542 | */ | 568 | */ |
543 | 569 | int mtrr_del(int reg, unsigned long base, unsigned long size) | |
544 | int | ||
545 | mtrr_del(int reg, unsigned long base, unsigned long size) | ||
546 | { | 570 | { |
547 | if (mtrr_check(base, size)) | 571 | if (mtrr_check(base, size)) |
548 | return -EINVAL; | 572 | return -EINVAL; |
549 | return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); | 573 | return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); |
550 | } | 574 | } |
551 | |||
552 | EXPORT_SYMBOL(mtrr_add); | ||
553 | EXPORT_SYMBOL(mtrr_del); | 575 | EXPORT_SYMBOL(mtrr_del); |
554 | 576 | ||
555 | /* HACK ALERT! | 577 | /* |
578 | * HACK ALERT! | ||
556 | * These should be called implicitly, but we can't yet until all the initcall | 579 | * These should be called implicitly, but we can't yet until all the initcall |
557 | * stuff is done... | 580 | * stuff is done... |
558 | */ | 581 | */ |
@@ -576,29 +599,28 @@ struct mtrr_value { | |||
576 | 599 | ||
577 | static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; | 600 | static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; |
578 | 601 | ||
579 | static int mtrr_save(struct sys_device * sysdev, pm_message_t state) | 602 | static int mtrr_save(struct sys_device *sysdev, pm_message_t state) |
580 | { | 603 | { |
581 | int i; | 604 | int i; |
582 | 605 | ||
583 | for (i = 0; i < num_var_ranges; i++) { | 606 | for (i = 0; i < num_var_ranges; i++) { |
584 | mtrr_if->get(i, | 607 | mtrr_if->get(i, &mtrr_value[i].lbase, |
585 | &mtrr_value[i].lbase, | 608 | &mtrr_value[i].lsize, |
586 | &mtrr_value[i].lsize, | 609 | &mtrr_value[i].ltype); |
587 | &mtrr_value[i].ltype); | ||
588 | } | 610 | } |
589 | return 0; | 611 | return 0; |
590 | } | 612 | } |
591 | 613 | ||
592 | static int mtrr_restore(struct sys_device * sysdev) | 614 | static int mtrr_restore(struct sys_device *sysdev) |
593 | { | 615 | { |
594 | int i; | 616 | int i; |
595 | 617 | ||
596 | for (i = 0; i < num_var_ranges; i++) { | 618 | for (i = 0; i < num_var_ranges; i++) { |
597 | if (mtrr_value[i].lsize) | 619 | if (mtrr_value[i].lsize) { |
598 | set_mtrr(i, | 620 | set_mtrr(i, mtrr_value[i].lbase, |
599 | mtrr_value[i].lbase, | 621 | mtrr_value[i].lsize, |
600 | mtrr_value[i].lsize, | 622 | mtrr_value[i].ltype); |
601 | mtrr_value[i].ltype); | 623 | } |
602 | } | 624 | } |
603 | return 0; | 625 | return 0; |
604 | } | 626 | } |
@@ -615,26 +637,29 @@ int __initdata changed_by_mtrr_cleanup; | |||
615 | /** | 637 | /** |
616 | * mtrr_bp_init - initialize mtrrs on the boot CPU | 638 | * mtrr_bp_init - initialize mtrrs on the boot CPU |
617 | * | 639 | * |
618 | * This needs to be called early; before any of the other CPUs are | 640 | * This needs to be called early; before any of the other CPUs are |
619 | * initialized (i.e. before smp_init()). | 641 | * initialized (i.e. before smp_init()). |
620 | * | 642 | * |
621 | */ | 643 | */ |
622 | void __init mtrr_bp_init(void) | 644 | void __init mtrr_bp_init(void) |
623 | { | 645 | { |
624 | u32 phys_addr; | 646 | u32 phys_addr; |
647 | |||
625 | init_ifs(); | 648 | init_ifs(); |
626 | 649 | ||
627 | phys_addr = 32; | 650 | phys_addr = 32; |
628 | 651 | ||
629 | if (cpu_has_mtrr) { | 652 | if (cpu_has_mtrr) { |
630 | mtrr_if = &generic_mtrr_ops; | 653 | mtrr_if = &generic_mtrr_ops; |
631 | size_or_mask = 0xff000000; /* 36 bits */ | 654 | size_or_mask = 0xff000000; /* 36 bits */ |
632 | size_and_mask = 0x00f00000; | 655 | size_and_mask = 0x00f00000; |
633 | phys_addr = 36; | 656 | phys_addr = 36; |
634 | 657 | ||
635 | /* This is an AMD specific MSR, but we assume(hope?) that | 658 | /* |
636 | Intel will implement it to when they extend the address | 659 | * This is an AMD specific MSR, but we assume(hope?) that |
637 | bus of the Xeon. */ | 660 | * Intel will implement it to when they extend the address |
661 | * bus of the Xeon. | ||
662 | */ | ||
638 | if (cpuid_eax(0x80000000) >= 0x80000008) { | 663 | if (cpuid_eax(0x80000000) >= 0x80000008) { |
639 | phys_addr = cpuid_eax(0x80000008) & 0xff; | 664 | phys_addr = cpuid_eax(0x80000008) & 0xff; |
640 | /* CPUID workaround for Intel 0F33/0F34 CPU */ | 665 | /* CPUID workaround for Intel 0F33/0F34 CPU */ |
@@ -649,9 +674,11 @@ void __init mtrr_bp_init(void) | |||
649 | size_and_mask = ~size_or_mask & 0xfffff00000ULL; | 674 | size_and_mask = ~size_or_mask & 0xfffff00000ULL; |
650 | } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && | 675 | } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && |
651 | boot_cpu_data.x86 == 6) { | 676 | boot_cpu_data.x86 == 6) { |
652 | /* VIA C* family have Intel style MTRRs, but | 677 | /* |
653 | don't support PAE */ | 678 | * VIA C* family have Intel style MTRRs, |
654 | size_or_mask = 0xfff00000; /* 32 bits */ | 679 | * but don't support PAE |
680 | */ | ||
681 | size_or_mask = 0xfff00000; /* 32 bits */ | ||
655 | size_and_mask = 0; | 682 | size_and_mask = 0; |
656 | phys_addr = 32; | 683 | phys_addr = 32; |
657 | } | 684 | } |
@@ -694,30 +721,28 @@ void __init mtrr_bp_init(void) | |||
694 | changed_by_mtrr_cleanup = 1; | 721 | changed_by_mtrr_cleanup = 1; |
695 | mtrr_if->set_all(); | 722 | mtrr_if->set_all(); |
696 | } | 723 | } |
697 | |||
698 | } | 724 | } |
699 | } | 725 | } |
700 | } | 726 | } |
701 | 727 | ||
702 | void mtrr_ap_init(void) | 728 | void mtrr_ap_init(void) |
703 | { | 729 | { |
704 | unsigned long flags; | 730 | if (!use_intel() || mtrr_aps_delayed_init) |
705 | |||
706 | if (!mtrr_if || !use_intel()) | ||
707 | return; | 731 | return; |
708 | /* | 732 | /* |
709 | * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, | 733 | * Ideally we should hold mtrr_mutex here to avoid mtrr entries |
710 | * but this routine will be called in cpu boot time, holding the lock | 734 | * changed, but this routine will be called in cpu boot time, |
711 | * breaks it. This routine is called in two cases: 1.very earily time | 735 | * holding the lock breaks it. |
712 | * of software resume, when there absolutely isn't mtrr entry changes; | 736 | * |
713 | * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to | 737 | * This routine is called in two cases: |
714 | * prevent mtrr entry changes | 738 | * |
739 | * 1. very earily time of software resume, when there absolutely | ||
740 | * isn't mtrr entry changes; | ||
741 | * | ||
742 | * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug | ||
743 | * lock to prevent mtrr entry changes | ||
715 | */ | 744 | */ |
716 | local_irq_save(flags); | 745 | set_mtrr(~0U, 0, 0, 0); |
717 | |||
718 | mtrr_if->set_all(); | ||
719 | |||
720 | local_irq_restore(flags); | ||
721 | } | 746 | } |
722 | 747 | ||
723 | /** | 748 | /** |
@@ -728,23 +753,55 @@ void mtrr_save_state(void) | |||
728 | smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); | 753 | smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); |
729 | } | 754 | } |
730 | 755 | ||
756 | void set_mtrr_aps_delayed_init(void) | ||
757 | { | ||
758 | if (!use_intel()) | ||
759 | return; | ||
760 | |||
761 | mtrr_aps_delayed_init = true; | ||
762 | } | ||
763 | |||
764 | /* | ||
765 | * MTRR initialization for all AP's | ||
766 | */ | ||
767 | void mtrr_aps_init(void) | ||
768 | { | ||
769 | if (!use_intel()) | ||
770 | return; | ||
771 | |||
772 | set_mtrr(~0U, 0, 0, 0); | ||
773 | mtrr_aps_delayed_init = false; | ||
774 | } | ||
775 | |||
776 | void mtrr_bp_restore(void) | ||
777 | { | ||
778 | if (!use_intel()) | ||
779 | return; | ||
780 | |||
781 | mtrr_if->set_all(); | ||
782 | } | ||
783 | |||
731 | static int __init mtrr_init_finialize(void) | 784 | static int __init mtrr_init_finialize(void) |
732 | { | 785 | { |
733 | if (!mtrr_if) | 786 | if (!mtrr_if) |
734 | return 0; | 787 | return 0; |
788 | |||
735 | if (use_intel()) { | 789 | if (use_intel()) { |
736 | if (!changed_by_mtrr_cleanup) | 790 | if (!changed_by_mtrr_cleanup) |
737 | mtrr_state_warn(); | 791 | mtrr_state_warn(); |
738 | } else { | 792 | return 0; |
739 | /* The CPUs haven't MTRR and seem to not support SMP. They have | ||
740 | * specific drivers, we use a tricky method to support | ||
741 | * suspend/resume for them. | ||
742 | * TBD: is there any system with such CPU which supports | ||
743 | * suspend/resume? if no, we should remove the code. | ||
744 | */ | ||
745 | sysdev_driver_register(&cpu_sysdev_class, | ||
746 | &mtrr_sysdev_driver); | ||
747 | } | 793 | } |
794 | |||
795 | /* | ||
796 | * The CPU has no MTRR and seems to not support SMP. They have | ||
797 | * specific drivers, we use a tricky method to support | ||
798 | * suspend/resume for them. | ||
799 | * | ||
800 | * TBD: is there any system with such CPU which supports | ||
801 | * suspend/resume? If no, we should remove the code. | ||
802 | */ | ||
803 | sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); | ||
804 | |||
748 | return 0; | 805 | return 0; |
749 | } | 806 | } |
750 | subsys_initcall(mtrr_init_finialize); | 807 | subsys_initcall(mtrr_init_finialize); |
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7538b767f20..a501dee9a87 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * local mtrr defines. | 2 | * local MTRR defines. |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
@@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; | |||
14 | struct mtrr_ops { | 14 | struct mtrr_ops { |
15 | u32 vendor; | 15 | u32 vendor; |
16 | u32 use_intel_if; | 16 | u32 use_intel_if; |
17 | // void (*init)(void); | ||
18 | void (*set)(unsigned int reg, unsigned long base, | 17 | void (*set)(unsigned int reg, unsigned long base, |
19 | unsigned long size, mtrr_type type); | 18 | unsigned long size, mtrr_type type); |
20 | void (*set_all)(void); | 19 | void (*set_all)(void); |
21 | 20 | ||
22 | void (*get)(unsigned int reg, unsigned long *base, | 21 | void (*get)(unsigned int reg, unsigned long *base, |
23 | unsigned long *size, mtrr_type * type); | 22 | unsigned long *size, mtrr_type *type); |
24 | int (*get_free_region)(unsigned long base, unsigned long size, | 23 | int (*get_free_region)(unsigned long base, unsigned long size, |
25 | int replace_reg); | 24 | int replace_reg); |
26 | int (*validate_add_page)(unsigned long base, unsigned long size, | 25 | int (*validate_add_page)(unsigned long base, unsigned long size, |
@@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void); | |||
39 | 38 | ||
40 | /* library functions for processor-specific routines */ | 39 | /* library functions for processor-specific routines */ |
41 | struct set_mtrr_context { | 40 | struct set_mtrr_context { |
42 | unsigned long flags; | 41 | unsigned long flags; |
43 | unsigned long cr4val; | 42 | unsigned long cr4val; |
44 | u32 deftype_lo; | 43 | u32 deftype_lo; |
45 | u32 deftype_hi; | 44 | u32 deftype_hi; |
46 | u32 ccr3; | 45 | u32 ccr3; |
47 | }; | 46 | }; |
48 | 47 | ||
49 | void set_mtrr_done(struct set_mtrr_context *ctxt); | 48 | void set_mtrr_done(struct set_mtrr_context *ctxt); |
@@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, | |||
54 | u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); | 53 | u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); |
55 | void get_mtrr_state(void); | 54 | void get_mtrr_state(void); |
56 | 55 | ||
57 | extern void set_mtrr_ops(struct mtrr_ops * ops); | 56 | extern void set_mtrr_ops(struct mtrr_ops *ops); |
58 | 57 | ||
59 | extern u64 size_or_mask, size_and_mask; | 58 | extern u64 size_or_mask, size_and_mask; |
60 | extern struct mtrr_ops * mtrr_if; | 59 | extern struct mtrr_ops *mtrr_if; |
61 | 60 | ||
62 | #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) | 61 | #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) |
63 | #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) | 62 | #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) |
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 1f5fb1588d1..dfc80b4e6b0 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c | |||
@@ -1,24 +1,25 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/init.h> | 1 | #include <linux/init.h> |
3 | #include <asm/io.h> | 2 | #include <linux/io.h> |
4 | #include <asm/mtrr.h> | 3 | #include <linux/mm.h> |
5 | #include <asm/msr.h> | 4 | |
6 | #include <asm/processor-cyrix.h> | 5 | #include <asm/processor-cyrix.h> |
7 | #include <asm/processor-flags.h> | 6 | #include <asm/processor-flags.h> |
8 | #include "mtrr.h" | 7 | #include <asm/mtrr.h> |
8 | #include <asm/msr.h> | ||
9 | 9 | ||
10 | #include "mtrr.h" | ||
10 | 11 | ||
11 | /* Put the processor into a state where MTRRs can be safely set */ | 12 | /* Put the processor into a state where MTRRs can be safely set */ |
12 | void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) | 13 | void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) |
13 | { | 14 | { |
14 | unsigned int cr0; | 15 | unsigned int cr0; |
15 | 16 | ||
16 | /* Disable interrupts locally */ | 17 | /* Disable interrupts locally */ |
17 | local_irq_save(ctxt->flags); | 18 | local_irq_save(ctxt->flags); |
18 | 19 | ||
19 | if (use_intel() || is_cpu(CYRIX)) { | 20 | if (use_intel() || is_cpu(CYRIX)) { |
20 | 21 | ||
21 | /* Save value of CR4 and clear Page Global Enable (bit 7) */ | 22 | /* Save value of CR4 and clear Page Global Enable (bit 7) */ |
22 | if (cpu_has_pge) { | 23 | if (cpu_has_pge) { |
23 | ctxt->cr4val = read_cr4(); | 24 | ctxt->cr4val = read_cr4(); |
24 | write_cr4(ctxt->cr4val & ~X86_CR4_PGE); | 25 | write_cr4(ctxt->cr4val & ~X86_CR4_PGE); |
@@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) | |||
33 | write_cr0(cr0); | 34 | write_cr0(cr0); |
34 | wbinvd(); | 35 | wbinvd(); |
35 | 36 | ||
36 | if (use_intel()) | 37 | if (use_intel()) { |
37 | /* Save MTRR state */ | 38 | /* Save MTRR state */ |
38 | rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); | 39 | rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); |
39 | else | 40 | } else { |
40 | /* Cyrix ARRs - everything else were excluded at the top */ | 41 | /* |
42 | * Cyrix ARRs - | ||
43 | * everything else were excluded at the top | ||
44 | */ | ||
41 | ctxt->ccr3 = getCx86(CX86_CCR3); | 45 | ctxt->ccr3 = getCx86(CX86_CCR3); |
46 | } | ||
42 | } | 47 | } |
43 | } | 48 | } |
44 | 49 | ||
45 | void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) | 50 | void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) |
46 | { | 51 | { |
47 | if (use_intel()) | 52 | if (use_intel()) { |
48 | /* Disable MTRRs, and set the default type to uncached */ | 53 | /* Disable MTRRs, and set the default type to uncached */ |
49 | mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, | 54 | mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, |
50 | ctxt->deftype_hi); | 55 | ctxt->deftype_hi); |
51 | else if (is_cpu(CYRIX)) | 56 | } else { |
52 | /* Cyrix ARRs - everything else were excluded at the top */ | 57 | if (is_cpu(CYRIX)) { |
53 | setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); | 58 | /* Cyrix ARRs - everything else were excluded at the top */ |
59 | setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); | ||
60 | } | ||
61 | } | ||
54 | } | 62 | } |
55 | 63 | ||
56 | /* Restore the processor after a set_mtrr_prepare */ | 64 | /* Restore the processor after a set_mtrr_prepare */ |
57 | void set_mtrr_done(struct set_mtrr_context *ctxt) | 65 | void set_mtrr_done(struct set_mtrr_context *ctxt) |
58 | { | 66 | { |
59 | if (use_intel() || is_cpu(CYRIX)) { | 67 | if (use_intel() || is_cpu(CYRIX)) { |
60 | 68 | ||
61 | /* Flush caches and TLBs */ | 69 | /* Flush caches and TLBs */ |
62 | wbinvd(); | 70 | wbinvd(); |
63 | 71 | ||
64 | /* Restore MTRRdefType */ | 72 | /* Restore MTRRdefType */ |
65 | if (use_intel()) | 73 | if (use_intel()) { |
66 | /* Intel (P6) standard MTRRs */ | 74 | /* Intel (P6) standard MTRRs */ |
67 | mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); | 75 | mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, |
68 | else | 76 | ctxt->deftype_hi); |
69 | /* Cyrix ARRs - everything else was excluded at the top */ | 77 | } else { |
78 | /* | ||
79 | * Cyrix ARRs - | ||
80 | * everything else was excluded at the top | ||
81 | */ | ||
70 | setCx86(CX86_CCR3, ctxt->ccr3); | 82 | setCx86(CX86_CCR3, ctxt->ccr3); |
83 | } | ||
71 | 84 | ||
72 | /* Enable caches */ | 85 | /* Enable caches */ |
73 | write_cr0(read_cr0() & 0xbfffffff); | 86 | write_cr0(read_cr0() & 0xbfffffff); |
74 | 87 | ||
75 | /* Restore value of CR4 */ | 88 | /* Restore value of CR4 */ |
76 | if (cpu_has_pge) | 89 | if (cpu_has_pge) |
77 | write_cr4(ctxt->cr4val); | 90 | write_cr4(ctxt->cr4val); |
78 | } | 91 | } |
79 | /* Re-enable interrupts locally (if enabled previously) */ | 92 | /* Re-enable interrupts locally (if enabled previously) */ |
80 | local_irq_restore(ctxt->flags); | 93 | local_irq_restore(ctxt->flags); |
81 | } | 94 | } |
82 | |||
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c index 900332b800f..b5801c31184 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -1,16 +1,17 @@ | |||
1 | /* | 1 | /* |
2 | * Performance counter x86 architecture code | 2 | * Performance events x86 architecture code |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2009 Jaswinder Singh Rajput | 6 | * Copyright (C) 2009 Jaswinder Singh Rajput |
7 | * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter | 7 | * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter |
8 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 8 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
9 | * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> | ||
9 | * | 10 | * |
10 | * For licencing details see kernel-base/COPYING | 11 | * For licencing details see kernel-base/COPYING |
11 | */ | 12 | */ |
12 | 13 | ||
13 | #include <linux/perf_counter.h> | 14 | #include <linux/perf_event.h> |
14 | #include <linux/capability.h> | 15 | #include <linux/capability.h> |
15 | #include <linux/notifier.h> | 16 | #include <linux/notifier.h> |
16 | #include <linux/hardirq.h> | 17 | #include <linux/hardirq.h> |
@@ -20,19 +21,60 @@ | |||
20 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
21 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
22 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/cpu.h> | ||
23 | 25 | ||
24 | #include <asm/apic.h> | 26 | #include <asm/apic.h> |
25 | #include <asm/stacktrace.h> | 27 | #include <asm/stacktrace.h> |
26 | #include <asm/nmi.h> | 28 | #include <asm/nmi.h> |
27 | 29 | ||
28 | static u64 perf_counter_mask __read_mostly; | 30 | static u64 perf_event_mask __read_mostly; |
29 | 31 | ||
30 | struct cpu_hw_counters { | 32 | /* The maximal number of PEBS events: */ |
31 | struct perf_counter *counters[X86_PMC_IDX_MAX]; | 33 | #define MAX_PEBS_EVENTS 4 |
34 | |||
35 | /* The size of a BTS record in bytes: */ | ||
36 | #define BTS_RECORD_SIZE 24 | ||
37 | |||
38 | /* The size of a per-cpu BTS buffer in bytes: */ | ||
39 | #define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) | ||
40 | |||
41 | /* The BTS overflow threshold in bytes from the end of the buffer: */ | ||
42 | #define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) | ||
43 | |||
44 | |||
45 | /* | ||
46 | * Bits in the debugctlmsr controlling branch tracing. | ||
47 | */ | ||
48 | #define X86_DEBUGCTL_TR (1 << 6) | ||
49 | #define X86_DEBUGCTL_BTS (1 << 7) | ||
50 | #define X86_DEBUGCTL_BTINT (1 << 8) | ||
51 | #define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) | ||
52 | #define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) | ||
53 | |||
54 | /* | ||
55 | * A debug store configuration. | ||
56 | * | ||
57 | * We only support architectures that use 64bit fields. | ||
58 | */ | ||
59 | struct debug_store { | ||
60 | u64 bts_buffer_base; | ||
61 | u64 bts_index; | ||
62 | u64 bts_absolute_maximum; | ||
63 | u64 bts_interrupt_threshold; | ||
64 | u64 pebs_buffer_base; | ||
65 | u64 pebs_index; | ||
66 | u64 pebs_absolute_maximum; | ||
67 | u64 pebs_interrupt_threshold; | ||
68 | u64 pebs_event_reset[MAX_PEBS_EVENTS]; | ||
69 | }; | ||
70 | |||
71 | struct cpu_hw_events { | ||
72 | struct perf_event *events[X86_PMC_IDX_MAX]; | ||
32 | unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | 73 | unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; |
33 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; | 74 | unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; |
34 | unsigned long interrupts; | 75 | unsigned long interrupts; |
35 | int enabled; | 76 | int enabled; |
77 | struct debug_store *ds; | ||
36 | }; | 78 | }; |
37 | 79 | ||
38 | /* | 80 | /* |
@@ -44,25 +86,27 @@ struct x86_pmu { | |||
44 | int (*handle_irq)(struct pt_regs *); | 86 | int (*handle_irq)(struct pt_regs *); |
45 | void (*disable_all)(void); | 87 | void (*disable_all)(void); |
46 | void (*enable_all)(void); | 88 | void (*enable_all)(void); |
47 | void (*enable)(struct hw_perf_counter *, int); | 89 | void (*enable)(struct hw_perf_event *, int); |
48 | void (*disable)(struct hw_perf_counter *, int); | 90 | void (*disable)(struct hw_perf_event *, int); |
49 | unsigned eventsel; | 91 | unsigned eventsel; |
50 | unsigned perfctr; | 92 | unsigned perfctr; |
51 | u64 (*event_map)(int); | 93 | u64 (*event_map)(int); |
52 | u64 (*raw_event)(u64); | 94 | u64 (*raw_event)(u64); |
53 | int max_events; | 95 | int max_events; |
54 | int num_counters; | 96 | int num_events; |
55 | int num_counters_fixed; | 97 | int num_events_fixed; |
56 | int counter_bits; | 98 | int event_bits; |
57 | u64 counter_mask; | 99 | u64 event_mask; |
58 | int apic; | 100 | int apic; |
59 | u64 max_period; | 101 | u64 max_period; |
60 | u64 intel_ctrl; | 102 | u64 intel_ctrl; |
103 | void (*enable_bts)(u64 config); | ||
104 | void (*disable_bts)(void); | ||
61 | }; | 105 | }; |
62 | 106 | ||
63 | static struct x86_pmu x86_pmu __read_mostly; | 107 | static struct x86_pmu x86_pmu __read_mostly; |
64 | 108 | ||
65 | static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { | 109 | static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { |
66 | .enabled = 1, | 110 | .enabled = 1, |
67 | }; | 111 | }; |
68 | 112 | ||
@@ -80,35 +124,35 @@ static const u64 p6_perfmon_event_map[] = | |||
80 | [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, | 124 | [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, |
81 | }; | 125 | }; |
82 | 126 | ||
83 | static u64 p6_pmu_event_map(int event) | 127 | static u64 p6_pmu_event_map(int hw_event) |
84 | { | 128 | { |
85 | return p6_perfmon_event_map[event]; | 129 | return p6_perfmon_event_map[hw_event]; |
86 | } | 130 | } |
87 | 131 | ||
88 | /* | 132 | /* |
89 | * Counter setting that is specified not to count anything. | 133 | * Event setting that is specified not to count anything. |
90 | * We use this to effectively disable a counter. | 134 | * We use this to effectively disable a counter. |
91 | * | 135 | * |
92 | * L2_RQSTS with 0 MESI unit mask. | 136 | * L2_RQSTS with 0 MESI unit mask. |
93 | */ | 137 | */ |
94 | #define P6_NOP_COUNTER 0x0000002EULL | 138 | #define P6_NOP_EVENT 0x0000002EULL |
95 | 139 | ||
96 | static u64 p6_pmu_raw_event(u64 event) | 140 | static u64 p6_pmu_raw_event(u64 hw_event) |
97 | { | 141 | { |
98 | #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL | 142 | #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL |
99 | #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL | 143 | #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL |
100 | #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL | 144 | #define P6_EVNTSEL_EDGE_MASK 0x00040000ULL |
101 | #define P6_EVNTSEL_INV_MASK 0x00800000ULL | 145 | #define P6_EVNTSEL_INV_MASK 0x00800000ULL |
102 | #define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL | 146 | #define P6_EVNTSEL_REG_MASK 0xFF000000ULL |
103 | 147 | ||
104 | #define P6_EVNTSEL_MASK \ | 148 | #define P6_EVNTSEL_MASK \ |
105 | (P6_EVNTSEL_EVENT_MASK | \ | 149 | (P6_EVNTSEL_EVENT_MASK | \ |
106 | P6_EVNTSEL_UNIT_MASK | \ | 150 | P6_EVNTSEL_UNIT_MASK | \ |
107 | P6_EVNTSEL_EDGE_MASK | \ | 151 | P6_EVNTSEL_EDGE_MASK | \ |
108 | P6_EVNTSEL_INV_MASK | \ | 152 | P6_EVNTSEL_INV_MASK | \ |
109 | P6_EVNTSEL_COUNTER_MASK) | 153 | P6_EVNTSEL_REG_MASK) |
110 | 154 | ||
111 | return event & P6_EVNTSEL_MASK; | 155 | return hw_event & P6_EVNTSEL_MASK; |
112 | } | 156 | } |
113 | 157 | ||
114 | 158 | ||
@@ -126,16 +170,16 @@ static const u64 intel_perfmon_event_map[] = | |||
126 | [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, | 170 | [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, |
127 | }; | 171 | }; |
128 | 172 | ||
129 | static u64 intel_pmu_event_map(int event) | 173 | static u64 intel_pmu_event_map(int hw_event) |
130 | { | 174 | { |
131 | return intel_perfmon_event_map[event]; | 175 | return intel_perfmon_event_map[hw_event]; |
132 | } | 176 | } |
133 | 177 | ||
134 | /* | 178 | /* |
135 | * Generalized hw caching related event table, filled | 179 | * Generalized hw caching related hw_event table, filled |
136 | * in on a per model basis. A value of 0 means | 180 | * in on a per model basis. A value of 0 means |
137 | * 'not supported', -1 means 'event makes no sense on | 181 | * 'not supported', -1 means 'hw_event makes no sense on |
138 | * this CPU', any other value means the raw event | 182 | * this CPU', any other value means the raw hw_event |
139 | * ID. | 183 | * ID. |
140 | */ | 184 | */ |
141 | 185 | ||
@@ -419,22 +463,22 @@ static const u64 atom_hw_cache_event_ids | |||
419 | }, | 463 | }, |
420 | }; | 464 | }; |
421 | 465 | ||
422 | static u64 intel_pmu_raw_event(u64 event) | 466 | static u64 intel_pmu_raw_event(u64 hw_event) |
423 | { | 467 | { |
424 | #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL | 468 | #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL |
425 | #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL | 469 | #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL |
426 | #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL | 470 | #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL |
427 | #define CORE_EVNTSEL_INV_MASK 0x00800000ULL | 471 | #define CORE_EVNTSEL_INV_MASK 0x00800000ULL |
428 | #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL | 472 | #define CORE_EVNTSEL_REG_MASK 0xFF000000ULL |
429 | 473 | ||
430 | #define CORE_EVNTSEL_MASK \ | 474 | #define CORE_EVNTSEL_MASK \ |
431 | (CORE_EVNTSEL_EVENT_MASK | \ | 475 | (CORE_EVNTSEL_EVENT_MASK | \ |
432 | CORE_EVNTSEL_UNIT_MASK | \ | 476 | CORE_EVNTSEL_UNIT_MASK | \ |
433 | CORE_EVNTSEL_EDGE_MASK | \ | 477 | CORE_EVNTSEL_EDGE_MASK | \ |
434 | CORE_EVNTSEL_INV_MASK | \ | 478 | CORE_EVNTSEL_INV_MASK | \ |
435 | CORE_EVNTSEL_COUNTER_MASK) | 479 | CORE_EVNTSEL_REG_MASK) |
436 | 480 | ||
437 | return event & CORE_EVNTSEL_MASK; | 481 | return hw_event & CORE_EVNTSEL_MASK; |
438 | } | 482 | } |
439 | 483 | ||
440 | static const u64 amd_hw_cache_event_ids | 484 | static const u64 amd_hw_cache_event_ids |
@@ -541,52 +585,55 @@ static const u64 amd_perfmon_event_map[] = | |||
541 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, | 585 | [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, |
542 | }; | 586 | }; |
543 | 587 | ||
544 | static u64 amd_pmu_event_map(int event) | 588 | static u64 amd_pmu_event_map(int hw_event) |
545 | { | 589 | { |
546 | return amd_perfmon_event_map[event]; | 590 | return amd_perfmon_event_map[hw_event]; |
547 | } | 591 | } |
548 | 592 | ||
549 | static u64 amd_pmu_raw_event(u64 event) | 593 | static u64 amd_pmu_raw_event(u64 hw_event) |
550 | { | 594 | { |
551 | #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL | 595 | #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL |
552 | #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL | 596 | #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL |
553 | #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL | 597 | #define K7_EVNTSEL_EDGE_MASK 0x000040000ULL |
554 | #define K7_EVNTSEL_INV_MASK 0x000800000ULL | 598 | #define K7_EVNTSEL_INV_MASK 0x000800000ULL |
555 | #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL | 599 | #define K7_EVNTSEL_REG_MASK 0x0FF000000ULL |
556 | 600 | ||
557 | #define K7_EVNTSEL_MASK \ | 601 | #define K7_EVNTSEL_MASK \ |
558 | (K7_EVNTSEL_EVENT_MASK | \ | 602 | (K7_EVNTSEL_EVENT_MASK | \ |
559 | K7_EVNTSEL_UNIT_MASK | \ | 603 | K7_EVNTSEL_UNIT_MASK | \ |
560 | K7_EVNTSEL_EDGE_MASK | \ | 604 | K7_EVNTSEL_EDGE_MASK | \ |
561 | K7_EVNTSEL_INV_MASK | \ | 605 | K7_EVNTSEL_INV_MASK | \ |
562 | K7_EVNTSEL_COUNTER_MASK) | 606 | K7_EVNTSEL_REG_MASK) |
563 | 607 | ||
564 | return event & K7_EVNTSEL_MASK; | 608 | return hw_event & K7_EVNTSEL_MASK; |
565 | } | 609 | } |
566 | 610 | ||
567 | /* | 611 | /* |
568 | * Propagate counter elapsed time into the generic counter. | 612 | * Propagate event elapsed time into the generic event. |
569 | * Can only be executed on the CPU where the counter is active. | 613 | * Can only be executed on the CPU where the event is active. |
570 | * Returns the delta events processed. | 614 | * Returns the delta events processed. |
571 | */ | 615 | */ |
572 | static u64 | 616 | static u64 |
573 | x86_perf_counter_update(struct perf_counter *counter, | 617 | x86_perf_event_update(struct perf_event *event, |
574 | struct hw_perf_counter *hwc, int idx) | 618 | struct hw_perf_event *hwc, int idx) |
575 | { | 619 | { |
576 | int shift = 64 - x86_pmu.counter_bits; | 620 | int shift = 64 - x86_pmu.event_bits; |
577 | u64 prev_raw_count, new_raw_count; | 621 | u64 prev_raw_count, new_raw_count; |
578 | s64 delta; | 622 | s64 delta; |
579 | 623 | ||
624 | if (idx == X86_PMC_IDX_FIXED_BTS) | ||
625 | return 0; | ||
626 | |||
580 | /* | 627 | /* |
581 | * Careful: an NMI might modify the previous counter value. | 628 | * Careful: an NMI might modify the previous event value. |
582 | * | 629 | * |
583 | * Our tactic to handle this is to first atomically read and | 630 | * Our tactic to handle this is to first atomically read and |
584 | * exchange a new raw count - then add that new-prev delta | 631 | * exchange a new raw count - then add that new-prev delta |
585 | * count to the generic counter atomically: | 632 | * count to the generic event atomically: |
586 | */ | 633 | */ |
587 | again: | 634 | again: |
588 | prev_raw_count = atomic64_read(&hwc->prev_count); | 635 | prev_raw_count = atomic64_read(&hwc->prev_count); |
589 | rdmsrl(hwc->counter_base + idx, new_raw_count); | 636 | rdmsrl(hwc->event_base + idx, new_raw_count); |
590 | 637 | ||
591 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, | 638 | if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, |
592 | new_raw_count) != prev_raw_count) | 639 | new_raw_count) != prev_raw_count) |
@@ -595,7 +642,7 @@ again: | |||
595 | /* | 642 | /* |
596 | * Now we have the new raw value and have updated the prev | 643 | * Now we have the new raw value and have updated the prev |
597 | * timestamp already. We can now calculate the elapsed delta | 644 | * timestamp already. We can now calculate the elapsed delta |
598 | * (counter-)time and add that to the generic counter. | 645 | * (event-)time and add that to the generic event. |
599 | * | 646 | * |
600 | * Careful, not all hw sign-extends above the physical width | 647 | * Careful, not all hw sign-extends above the physical width |
601 | * of the count. | 648 | * of the count. |
@@ -603,13 +650,13 @@ again: | |||
603 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | 650 | delta = (new_raw_count << shift) - (prev_raw_count << shift); |
604 | delta >>= shift; | 651 | delta >>= shift; |
605 | 652 | ||
606 | atomic64_add(delta, &counter->count); | 653 | atomic64_add(delta, &event->count); |
607 | atomic64_sub(delta, &hwc->period_left); | 654 | atomic64_sub(delta, &hwc->period_left); |
608 | 655 | ||
609 | return new_raw_count; | 656 | return new_raw_count; |
610 | } | 657 | } |
611 | 658 | ||
612 | static atomic_t active_counters; | 659 | static atomic_t active_events; |
613 | static DEFINE_MUTEX(pmc_reserve_mutex); | 660 | static DEFINE_MUTEX(pmc_reserve_mutex); |
614 | 661 | ||
615 | static bool reserve_pmc_hardware(void) | 662 | static bool reserve_pmc_hardware(void) |
@@ -620,12 +667,12 @@ static bool reserve_pmc_hardware(void) | |||
620 | if (nmi_watchdog == NMI_LOCAL_APIC) | 667 | if (nmi_watchdog == NMI_LOCAL_APIC) |
621 | disable_lapic_nmi_watchdog(); | 668 | disable_lapic_nmi_watchdog(); |
622 | 669 | ||
623 | for (i = 0; i < x86_pmu.num_counters; i++) { | 670 | for (i = 0; i < x86_pmu.num_events; i++) { |
624 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) | 671 | if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) |
625 | goto perfctr_fail; | 672 | goto perfctr_fail; |
626 | } | 673 | } |
627 | 674 | ||
628 | for (i = 0; i < x86_pmu.num_counters; i++) { | 675 | for (i = 0; i < x86_pmu.num_events; i++) { |
629 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) | 676 | if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) |
630 | goto eventsel_fail; | 677 | goto eventsel_fail; |
631 | } | 678 | } |
@@ -638,7 +685,7 @@ eventsel_fail: | |||
638 | for (i--; i >= 0; i--) | 685 | for (i--; i >= 0; i--) |
639 | release_evntsel_nmi(x86_pmu.eventsel + i); | 686 | release_evntsel_nmi(x86_pmu.eventsel + i); |
640 | 687 | ||
641 | i = x86_pmu.num_counters; | 688 | i = x86_pmu.num_events; |
642 | 689 | ||
643 | perfctr_fail: | 690 | perfctr_fail: |
644 | for (i--; i >= 0; i--) | 691 | for (i--; i >= 0; i--) |
@@ -656,7 +703,7 @@ static void release_pmc_hardware(void) | |||
656 | #ifdef CONFIG_X86_LOCAL_APIC | 703 | #ifdef CONFIG_X86_LOCAL_APIC |
657 | int i; | 704 | int i; |
658 | 705 | ||
659 | for (i = 0; i < x86_pmu.num_counters; i++) { | 706 | for (i = 0; i < x86_pmu.num_events; i++) { |
660 | release_perfctr_nmi(x86_pmu.perfctr + i); | 707 | release_perfctr_nmi(x86_pmu.perfctr + i); |
661 | release_evntsel_nmi(x86_pmu.eventsel + i); | 708 | release_evntsel_nmi(x86_pmu.eventsel + i); |
662 | } | 709 | } |
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void) | |||
666 | #endif | 713 | #endif |
667 | } | 714 | } |
668 | 715 | ||
669 | static void hw_perf_counter_destroy(struct perf_counter *counter) | 716 | static inline bool bts_available(void) |
717 | { | ||
718 | return x86_pmu.enable_bts != NULL; | ||
719 | } | ||
720 | |||
721 | static inline void init_debug_store_on_cpu(int cpu) | ||
722 | { | ||
723 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
724 | |||
725 | if (!ds) | ||
726 | return; | ||
727 | |||
728 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, | ||
729 | (u32)((u64)(unsigned long)ds), | ||
730 | (u32)((u64)(unsigned long)ds >> 32)); | ||
731 | } | ||
732 | |||
733 | static inline void fini_debug_store_on_cpu(int cpu) | ||
734 | { | ||
735 | if (!per_cpu(cpu_hw_events, cpu).ds) | ||
736 | return; | ||
737 | |||
738 | wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); | ||
739 | } | ||
740 | |||
741 | static void release_bts_hardware(void) | ||
742 | { | ||
743 | int cpu; | ||
744 | |||
745 | if (!bts_available()) | ||
746 | return; | ||
747 | |||
748 | get_online_cpus(); | ||
749 | |||
750 | for_each_online_cpu(cpu) | ||
751 | fini_debug_store_on_cpu(cpu); | ||
752 | |||
753 | for_each_possible_cpu(cpu) { | ||
754 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; | ||
755 | |||
756 | if (!ds) | ||
757 | continue; | ||
758 | |||
759 | per_cpu(cpu_hw_events, cpu).ds = NULL; | ||
760 | |||
761 | kfree((void *)(unsigned long)ds->bts_buffer_base); | ||
762 | kfree(ds); | ||
763 | } | ||
764 | |||
765 | put_online_cpus(); | ||
766 | } | ||
767 | |||
768 | static int reserve_bts_hardware(void) | ||
670 | { | 769 | { |
671 | if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { | 770 | int cpu, err = 0; |
771 | |||
772 | if (!bts_available()) | ||
773 | return 0; | ||
774 | |||
775 | get_online_cpus(); | ||
776 | |||
777 | for_each_possible_cpu(cpu) { | ||
778 | struct debug_store *ds; | ||
779 | void *buffer; | ||
780 | |||
781 | err = -ENOMEM; | ||
782 | buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); | ||
783 | if (unlikely(!buffer)) | ||
784 | break; | ||
785 | |||
786 | ds = kzalloc(sizeof(*ds), GFP_KERNEL); | ||
787 | if (unlikely(!ds)) { | ||
788 | kfree(buffer); | ||
789 | break; | ||
790 | } | ||
791 | |||
792 | ds->bts_buffer_base = (u64)(unsigned long)buffer; | ||
793 | ds->bts_index = ds->bts_buffer_base; | ||
794 | ds->bts_absolute_maximum = | ||
795 | ds->bts_buffer_base + BTS_BUFFER_SIZE; | ||
796 | ds->bts_interrupt_threshold = | ||
797 | ds->bts_absolute_maximum - BTS_OVFL_TH; | ||
798 | |||
799 | per_cpu(cpu_hw_events, cpu).ds = ds; | ||
800 | err = 0; | ||
801 | } | ||
802 | |||
803 | if (err) | ||
804 | release_bts_hardware(); | ||
805 | else { | ||
806 | for_each_online_cpu(cpu) | ||
807 | init_debug_store_on_cpu(cpu); | ||
808 | } | ||
809 | |||
810 | put_online_cpus(); | ||
811 | |||
812 | return err; | ||
813 | } | ||
814 | |||
815 | static void hw_perf_event_destroy(struct perf_event *event) | ||
816 | { | ||
817 | if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { | ||
672 | release_pmc_hardware(); | 818 | release_pmc_hardware(); |
819 | release_bts_hardware(); | ||
673 | mutex_unlock(&pmc_reserve_mutex); | 820 | mutex_unlock(&pmc_reserve_mutex); |
674 | } | 821 | } |
675 | } | 822 | } |
@@ -680,7 +827,7 @@ static inline int x86_pmu_initialized(void) | |||
680 | } | 827 | } |
681 | 828 | ||
682 | static inline int | 829 | static inline int |
683 | set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) | 830 | set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) |
684 | { | 831 | { |
685 | unsigned int cache_type, cache_op, cache_result; | 832 | unsigned int cache_type, cache_op, cache_result; |
686 | u64 config, val; | 833 | u64 config, val; |
@@ -712,13 +859,49 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) | |||
712 | return 0; | 859 | return 0; |
713 | } | 860 | } |
714 | 861 | ||
862 | static void intel_pmu_enable_bts(u64 config) | ||
863 | { | ||
864 | unsigned long debugctlmsr; | ||
865 | |||
866 | debugctlmsr = get_debugctlmsr(); | ||
867 | |||
868 | debugctlmsr |= X86_DEBUGCTL_TR; | ||
869 | debugctlmsr |= X86_DEBUGCTL_BTS; | ||
870 | debugctlmsr |= X86_DEBUGCTL_BTINT; | ||
871 | |||
872 | if (!(config & ARCH_PERFMON_EVENTSEL_OS)) | ||
873 | debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; | ||
874 | |||
875 | if (!(config & ARCH_PERFMON_EVENTSEL_USR)) | ||
876 | debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; | ||
877 | |||
878 | update_debugctlmsr(debugctlmsr); | ||
879 | } | ||
880 | |||
881 | static void intel_pmu_disable_bts(void) | ||
882 | { | ||
883 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
884 | unsigned long debugctlmsr; | ||
885 | |||
886 | if (!cpuc->ds) | ||
887 | return; | ||
888 | |||
889 | debugctlmsr = get_debugctlmsr(); | ||
890 | |||
891 | debugctlmsr &= | ||
892 | ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | | ||
893 | X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); | ||
894 | |||
895 | update_debugctlmsr(debugctlmsr); | ||
896 | } | ||
897 | |||
715 | /* | 898 | /* |
716 | * Setup the hardware configuration for a given attr_type | 899 | * Setup the hardware configuration for a given attr_type |
717 | */ | 900 | */ |
718 | static int __hw_perf_counter_init(struct perf_counter *counter) | 901 | static int __hw_perf_event_init(struct perf_event *event) |
719 | { | 902 | { |
720 | struct perf_counter_attr *attr = &counter->attr; | 903 | struct perf_event_attr *attr = &event->attr; |
721 | struct hw_perf_counter *hwc = &counter->hw; | 904 | struct hw_perf_event *hwc = &event->hw; |
722 | u64 config; | 905 | u64 config; |
723 | int err; | 906 | int err; |
724 | 907 | ||
@@ -726,17 +909,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
726 | return -ENODEV; | 909 | return -ENODEV; |
727 | 910 | ||
728 | err = 0; | 911 | err = 0; |
729 | if (!atomic_inc_not_zero(&active_counters)) { | 912 | if (!atomic_inc_not_zero(&active_events)) { |
730 | mutex_lock(&pmc_reserve_mutex); | 913 | mutex_lock(&pmc_reserve_mutex); |
731 | if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) | 914 | if (atomic_read(&active_events) == 0) { |
732 | err = -EBUSY; | 915 | if (!reserve_pmc_hardware()) |
733 | else | 916 | err = -EBUSY; |
734 | atomic_inc(&active_counters); | 917 | else |
918 | err = reserve_bts_hardware(); | ||
919 | } | ||
920 | if (!err) | ||
921 | atomic_inc(&active_events); | ||
735 | mutex_unlock(&pmc_reserve_mutex); | 922 | mutex_unlock(&pmc_reserve_mutex); |
736 | } | 923 | } |
737 | if (err) | 924 | if (err) |
738 | return err; | 925 | return err; |
739 | 926 | ||
927 | event->destroy = hw_perf_event_destroy; | ||
928 | |||
740 | /* | 929 | /* |
741 | * Generate PMC IRQs: | 930 | * Generate PMC IRQs: |
742 | * (keep 'enabled' bit clear for now) | 931 | * (keep 'enabled' bit clear for now) |
@@ -759,17 +948,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
759 | /* | 948 | /* |
760 | * If we have a PMU initialized but no APIC | 949 | * If we have a PMU initialized but no APIC |
761 | * interrupts, we cannot sample hardware | 950 | * interrupts, we cannot sample hardware |
762 | * counters (user-space has to fall back and | 951 | * events (user-space has to fall back and |
763 | * sample via a hrtimer based software counter): | 952 | * sample via a hrtimer based software event): |
764 | */ | 953 | */ |
765 | if (!x86_pmu.apic) | 954 | if (!x86_pmu.apic) |
766 | return -EOPNOTSUPP; | 955 | return -EOPNOTSUPP; |
767 | } | 956 | } |
768 | 957 | ||
769 | counter->destroy = hw_perf_counter_destroy; | ||
770 | |||
771 | /* | 958 | /* |
772 | * Raw event type provide the config in the event structure | 959 | * Raw hw_event type provide the config in the hw_event structure |
773 | */ | 960 | */ |
774 | if (attr->type == PERF_TYPE_RAW) { | 961 | if (attr->type == PERF_TYPE_RAW) { |
775 | hwc->config |= x86_pmu.raw_event(attr->config); | 962 | hwc->config |= x86_pmu.raw_event(attr->config); |
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
793 | if (config == -1LL) | 980 | if (config == -1LL) |
794 | return -EINVAL; | 981 | return -EINVAL; |
795 | 982 | ||
983 | /* | ||
984 | * Branch tracing: | ||
985 | */ | ||
986 | if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && | ||
987 | (hwc->sample_period == 1)) { | ||
988 | /* BTS is not supported by this architecture. */ | ||
989 | if (!bts_available()) | ||
990 | return -EOPNOTSUPP; | ||
991 | |||
992 | /* BTS is currently only allowed for user-mode. */ | ||
993 | if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) | ||
994 | return -EOPNOTSUPP; | ||
995 | } | ||
996 | |||
796 | hwc->config |= config; | 997 | hwc->config |= config; |
797 | 998 | ||
798 | return 0; | 999 | return 0; |
@@ -800,7 +1001,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) | |||
800 | 1001 | ||
801 | static void p6_pmu_disable_all(void) | 1002 | static void p6_pmu_disable_all(void) |
802 | { | 1003 | { |
803 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1004 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
804 | u64 val; | 1005 | u64 val; |
805 | 1006 | ||
806 | if (!cpuc->enabled) | 1007 | if (!cpuc->enabled) |
@@ -817,12 +1018,23 @@ static void p6_pmu_disable_all(void) | |||
817 | 1018 | ||
818 | static void intel_pmu_disable_all(void) | 1019 | static void intel_pmu_disable_all(void) |
819 | { | 1020 | { |
1021 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1022 | |||
1023 | if (!cpuc->enabled) | ||
1024 | return; | ||
1025 | |||
1026 | cpuc->enabled = 0; | ||
1027 | barrier(); | ||
1028 | |||
820 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); | 1029 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); |
1030 | |||
1031 | if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) | ||
1032 | intel_pmu_disable_bts(); | ||
821 | } | 1033 | } |
822 | 1034 | ||
823 | static void amd_pmu_disable_all(void) | 1035 | static void amd_pmu_disable_all(void) |
824 | { | 1036 | { |
825 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1037 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
826 | int idx; | 1038 | int idx; |
827 | 1039 | ||
828 | if (!cpuc->enabled) | 1040 | if (!cpuc->enabled) |
@@ -831,12 +1043,12 @@ static void amd_pmu_disable_all(void) | |||
831 | cpuc->enabled = 0; | 1043 | cpuc->enabled = 0; |
832 | /* | 1044 | /* |
833 | * ensure we write the disable before we start disabling the | 1045 | * ensure we write the disable before we start disabling the |
834 | * counters proper, so that amd_pmu_enable_counter() does the | 1046 | * events proper, so that amd_pmu_enable_event() does the |
835 | * right thing. | 1047 | * right thing. |
836 | */ | 1048 | */ |
837 | barrier(); | 1049 | barrier(); |
838 | 1050 | ||
839 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1051 | for (idx = 0; idx < x86_pmu.num_events; idx++) { |
840 | u64 val; | 1052 | u64 val; |
841 | 1053 | ||
842 | if (!test_bit(idx, cpuc->active_mask)) | 1054 | if (!test_bit(idx, cpuc->active_mask)) |
@@ -858,7 +1070,7 @@ void hw_perf_disable(void) | |||
858 | 1070 | ||
859 | static void p6_pmu_enable_all(void) | 1071 | static void p6_pmu_enable_all(void) |
860 | { | 1072 | { |
861 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1073 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
862 | unsigned long val; | 1074 | unsigned long val; |
863 | 1075 | ||
864 | if (cpuc->enabled) | 1076 | if (cpuc->enabled) |
@@ -875,12 +1087,30 @@ static void p6_pmu_enable_all(void) | |||
875 | 1087 | ||
876 | static void intel_pmu_enable_all(void) | 1088 | static void intel_pmu_enable_all(void) |
877 | { | 1089 | { |
1090 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1091 | |||
1092 | if (cpuc->enabled) | ||
1093 | return; | ||
1094 | |||
1095 | cpuc->enabled = 1; | ||
1096 | barrier(); | ||
1097 | |||
878 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); | 1098 | wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); |
1099 | |||
1100 | if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { | ||
1101 | struct perf_event *event = | ||
1102 | cpuc->events[X86_PMC_IDX_FIXED_BTS]; | ||
1103 | |||
1104 | if (WARN_ON_ONCE(!event)) | ||
1105 | return; | ||
1106 | |||
1107 | intel_pmu_enable_bts(event->hw.config); | ||
1108 | } | ||
879 | } | 1109 | } |
880 | 1110 | ||
881 | static void amd_pmu_enable_all(void) | 1111 | static void amd_pmu_enable_all(void) |
882 | { | 1112 | { |
883 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1113 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
884 | int idx; | 1114 | int idx; |
885 | 1115 | ||
886 | if (cpuc->enabled) | 1116 | if (cpuc->enabled) |
@@ -889,14 +1119,14 @@ static void amd_pmu_enable_all(void) | |||
889 | cpuc->enabled = 1; | 1119 | cpuc->enabled = 1; |
890 | barrier(); | 1120 | barrier(); |
891 | 1121 | ||
892 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1122 | for (idx = 0; idx < x86_pmu.num_events; idx++) { |
893 | struct perf_counter *counter = cpuc->counters[idx]; | 1123 | struct perf_event *event = cpuc->events[idx]; |
894 | u64 val; | 1124 | u64 val; |
895 | 1125 | ||
896 | if (!test_bit(idx, cpuc->active_mask)) | 1126 | if (!test_bit(idx, cpuc->active_mask)) |
897 | continue; | 1127 | continue; |
898 | 1128 | ||
899 | val = counter->hw.config; | 1129 | val = event->hw.config; |
900 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | 1130 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; |
901 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); | 1131 | wrmsrl(MSR_K7_EVNTSEL0 + idx, val); |
902 | } | 1132 | } |
@@ -923,19 +1153,19 @@ static inline void intel_pmu_ack_status(u64 ack) | |||
923 | wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); | 1153 | wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); |
924 | } | 1154 | } |
925 | 1155 | ||
926 | static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 1156 | static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) |
927 | { | 1157 | { |
928 | (void)checking_wrmsrl(hwc->config_base + idx, | 1158 | (void)checking_wrmsrl(hwc->config_base + idx, |
929 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); | 1159 | hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); |
930 | } | 1160 | } |
931 | 1161 | ||
932 | static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | 1162 | static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) |
933 | { | 1163 | { |
934 | (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); | 1164 | (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); |
935 | } | 1165 | } |
936 | 1166 | ||
937 | static inline void | 1167 | static inline void |
938 | intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) | 1168 | intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) |
939 | { | 1169 | { |
940 | int idx = __idx - X86_PMC_IDX_FIXED; | 1170 | int idx = __idx - X86_PMC_IDX_FIXED; |
941 | u64 ctrl_val, mask; | 1171 | u64 ctrl_val, mask; |
@@ -948,10 +1178,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) | |||
948 | } | 1178 | } |
949 | 1179 | ||
950 | static inline void | 1180 | static inline void |
951 | p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | 1181 | p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) |
952 | { | 1182 | { |
953 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1183 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
954 | u64 val = P6_NOP_COUNTER; | 1184 | u64 val = P6_NOP_EVENT; |
955 | 1185 | ||
956 | if (cpuc->enabled) | 1186 | if (cpuc->enabled) |
957 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; | 1187 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; |
@@ -960,36 +1190,44 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | |||
960 | } | 1190 | } |
961 | 1191 | ||
962 | static inline void | 1192 | static inline void |
963 | intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | 1193 | intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) |
964 | { | 1194 | { |
1195 | if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { | ||
1196 | intel_pmu_disable_bts(); | ||
1197 | return; | ||
1198 | } | ||
1199 | |||
965 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | 1200 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { |
966 | intel_pmu_disable_fixed(hwc, idx); | 1201 | intel_pmu_disable_fixed(hwc, idx); |
967 | return; | 1202 | return; |
968 | } | 1203 | } |
969 | 1204 | ||
970 | x86_pmu_disable_counter(hwc, idx); | 1205 | x86_pmu_disable_event(hwc, idx); |
971 | } | 1206 | } |
972 | 1207 | ||
973 | static inline void | 1208 | static inline void |
974 | amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) | 1209 | amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) |
975 | { | 1210 | { |
976 | x86_pmu_disable_counter(hwc, idx); | 1211 | x86_pmu_disable_event(hwc, idx); |
977 | } | 1212 | } |
978 | 1213 | ||
979 | static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); | 1214 | static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); |
980 | 1215 | ||
981 | /* | 1216 | /* |
982 | * Set the next IRQ period, based on the hwc->period_left value. | 1217 | * Set the next IRQ period, based on the hwc->period_left value. |
983 | * To be called with the counter disabled in hw: | 1218 | * To be called with the event disabled in hw: |
984 | */ | 1219 | */ |
985 | static int | 1220 | static int |
986 | x86_perf_counter_set_period(struct perf_counter *counter, | 1221 | x86_perf_event_set_period(struct perf_event *event, |
987 | struct hw_perf_counter *hwc, int idx) | 1222 | struct hw_perf_event *hwc, int idx) |
988 | { | 1223 | { |
989 | s64 left = atomic64_read(&hwc->period_left); | 1224 | s64 left = atomic64_read(&hwc->period_left); |
990 | s64 period = hwc->sample_period; | 1225 | s64 period = hwc->sample_period; |
991 | int err, ret = 0; | 1226 | int err, ret = 0; |
992 | 1227 | ||
1228 | if (idx == X86_PMC_IDX_FIXED_BTS) | ||
1229 | return 0; | ||
1230 | |||
993 | /* | 1231 | /* |
994 | * If we are way outside a reasoable range then just skip forward: | 1232 | * If we are way outside a reasoable range then just skip forward: |
995 | */ | 1233 | */ |
@@ -1007,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, | |||
1007 | ret = 1; | 1245 | ret = 1; |
1008 | } | 1246 | } |
1009 | /* | 1247 | /* |
1010 | * Quirk: certain CPUs dont like it if just 1 event is left: | 1248 | * Quirk: certain CPUs dont like it if just 1 hw_event is left: |
1011 | */ | 1249 | */ |
1012 | if (unlikely(left < 2)) | 1250 | if (unlikely(left < 2)) |
1013 | left = 2; | 1251 | left = 2; |
@@ -1015,24 +1253,24 @@ x86_perf_counter_set_period(struct perf_counter *counter, | |||
1015 | if (left > x86_pmu.max_period) | 1253 | if (left > x86_pmu.max_period) |
1016 | left = x86_pmu.max_period; | 1254 | left = x86_pmu.max_period; |
1017 | 1255 | ||
1018 | per_cpu(prev_left[idx], smp_processor_id()) = left; | 1256 | per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; |
1019 | 1257 | ||
1020 | /* | 1258 | /* |
1021 | * The hw counter starts counting from this counter offset, | 1259 | * The hw event starts counting from this event offset, |
1022 | * mark it to be able to extra future deltas: | 1260 | * mark it to be able to extra future deltas: |
1023 | */ | 1261 | */ |
1024 | atomic64_set(&hwc->prev_count, (u64)-left); | 1262 | atomic64_set(&hwc->prev_count, (u64)-left); |
1025 | 1263 | ||
1026 | err = checking_wrmsrl(hwc->counter_base + idx, | 1264 | err = checking_wrmsrl(hwc->event_base + idx, |
1027 | (u64)(-left) & x86_pmu.counter_mask); | 1265 | (u64)(-left) & x86_pmu.event_mask); |
1028 | 1266 | ||
1029 | perf_counter_update_userpage(counter); | 1267 | perf_event_update_userpage(event); |
1030 | 1268 | ||
1031 | return ret; | 1269 | return ret; |
1032 | } | 1270 | } |
1033 | 1271 | ||
1034 | static inline void | 1272 | static inline void |
1035 | intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) | 1273 | intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) |
1036 | { | 1274 | { |
1037 | int idx = __idx - X86_PMC_IDX_FIXED; | 1275 | int idx = __idx - X86_PMC_IDX_FIXED; |
1038 | u64 ctrl_val, bits, mask; | 1276 | u64 ctrl_val, bits, mask; |
@@ -1057,9 +1295,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) | |||
1057 | err = checking_wrmsrl(hwc->config_base, ctrl_val); | 1295 | err = checking_wrmsrl(hwc->config_base, ctrl_val); |
1058 | } | 1296 | } |
1059 | 1297 | ||
1060 | static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 1298 | static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) |
1061 | { | 1299 | { |
1062 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1300 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1063 | u64 val; | 1301 | u64 val; |
1064 | 1302 | ||
1065 | val = hwc->config; | 1303 | val = hwc->config; |
@@ -1070,128 +1308,149 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | |||
1070 | } | 1308 | } |
1071 | 1309 | ||
1072 | 1310 | ||
1073 | static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 1311 | static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) |
1074 | { | 1312 | { |
1313 | if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { | ||
1314 | if (!__get_cpu_var(cpu_hw_events).enabled) | ||
1315 | return; | ||
1316 | |||
1317 | intel_pmu_enable_bts(hwc->config); | ||
1318 | return; | ||
1319 | } | ||
1320 | |||
1075 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { | 1321 | if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { |
1076 | intel_pmu_enable_fixed(hwc, idx); | 1322 | intel_pmu_enable_fixed(hwc, idx); |
1077 | return; | 1323 | return; |
1078 | } | 1324 | } |
1079 | 1325 | ||
1080 | x86_pmu_enable_counter(hwc, idx); | 1326 | x86_pmu_enable_event(hwc, idx); |
1081 | } | 1327 | } |
1082 | 1328 | ||
1083 | static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) | 1329 | static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) |
1084 | { | 1330 | { |
1085 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1331 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1086 | 1332 | ||
1087 | if (cpuc->enabled) | 1333 | if (cpuc->enabled) |
1088 | x86_pmu_enable_counter(hwc, idx); | 1334 | x86_pmu_enable_event(hwc, idx); |
1089 | } | 1335 | } |
1090 | 1336 | ||
1091 | static int | 1337 | static int |
1092 | fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) | 1338 | fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) |
1093 | { | 1339 | { |
1094 | unsigned int event; | 1340 | unsigned int hw_event; |
1095 | 1341 | ||
1096 | if (!x86_pmu.num_counters_fixed) | 1342 | hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; |
1097 | return -1; | 1343 | |
1344 | if (unlikely((hw_event == | ||
1345 | x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && | ||
1346 | (hwc->sample_period == 1))) | ||
1347 | return X86_PMC_IDX_FIXED_BTS; | ||
1098 | 1348 | ||
1099 | event = hwc->config & ARCH_PERFMON_EVENT_MASK; | 1349 | if (!x86_pmu.num_events_fixed) |
1350 | return -1; | ||
1100 | 1351 | ||
1101 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) | 1352 | if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) |
1102 | return X86_PMC_IDX_FIXED_INSTRUCTIONS; | 1353 | return X86_PMC_IDX_FIXED_INSTRUCTIONS; |
1103 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) | 1354 | if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) |
1104 | return X86_PMC_IDX_FIXED_CPU_CYCLES; | 1355 | return X86_PMC_IDX_FIXED_CPU_CYCLES; |
1105 | if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) | 1356 | if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) |
1106 | return X86_PMC_IDX_FIXED_BUS_CYCLES; | 1357 | return X86_PMC_IDX_FIXED_BUS_CYCLES; |
1107 | 1358 | ||
1108 | return -1; | 1359 | return -1; |
1109 | } | 1360 | } |
1110 | 1361 | ||
1111 | /* | 1362 | /* |
1112 | * Find a PMC slot for the freshly enabled / scheduled in counter: | 1363 | * Find a PMC slot for the freshly enabled / scheduled in event: |
1113 | */ | 1364 | */ |
1114 | static int x86_pmu_enable(struct perf_counter *counter) | 1365 | static int x86_pmu_enable(struct perf_event *event) |
1115 | { | 1366 | { |
1116 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1367 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1117 | struct hw_perf_counter *hwc = &counter->hw; | 1368 | struct hw_perf_event *hwc = &event->hw; |
1118 | int idx; | 1369 | int idx; |
1119 | 1370 | ||
1120 | idx = fixed_mode_idx(counter, hwc); | 1371 | idx = fixed_mode_idx(event, hwc); |
1121 | if (idx >= 0) { | 1372 | if (idx == X86_PMC_IDX_FIXED_BTS) { |
1373 | /* BTS is already occupied. */ | ||
1374 | if (test_and_set_bit(idx, cpuc->used_mask)) | ||
1375 | return -EAGAIN; | ||
1376 | |||
1377 | hwc->config_base = 0; | ||
1378 | hwc->event_base = 0; | ||
1379 | hwc->idx = idx; | ||
1380 | } else if (idx >= 0) { | ||
1122 | /* | 1381 | /* |
1123 | * Try to get the fixed counter, if that is already taken | 1382 | * Try to get the fixed event, if that is already taken |
1124 | * then try to get a generic counter: | 1383 | * then try to get a generic event: |
1125 | */ | 1384 | */ |
1126 | if (test_and_set_bit(idx, cpuc->used_mask)) | 1385 | if (test_and_set_bit(idx, cpuc->used_mask)) |
1127 | goto try_generic; | 1386 | goto try_generic; |
1128 | 1387 | ||
1129 | hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; | 1388 | hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; |
1130 | /* | 1389 | /* |
1131 | * We set it so that counter_base + idx in wrmsr/rdmsr maps to | 1390 | * We set it so that event_base + idx in wrmsr/rdmsr maps to |
1132 | * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: | 1391 | * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: |
1133 | */ | 1392 | */ |
1134 | hwc->counter_base = | 1393 | hwc->event_base = |
1135 | MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; | 1394 | MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; |
1136 | hwc->idx = idx; | 1395 | hwc->idx = idx; |
1137 | } else { | 1396 | } else { |
1138 | idx = hwc->idx; | 1397 | idx = hwc->idx; |
1139 | /* Try to get the previous generic counter again */ | 1398 | /* Try to get the previous generic event again */ |
1140 | if (test_and_set_bit(idx, cpuc->used_mask)) { | 1399 | if (test_and_set_bit(idx, cpuc->used_mask)) { |
1141 | try_generic: | 1400 | try_generic: |
1142 | idx = find_first_zero_bit(cpuc->used_mask, | 1401 | idx = find_first_zero_bit(cpuc->used_mask, |
1143 | x86_pmu.num_counters); | 1402 | x86_pmu.num_events); |
1144 | if (idx == x86_pmu.num_counters) | 1403 | if (idx == x86_pmu.num_events) |
1145 | return -EAGAIN; | 1404 | return -EAGAIN; |
1146 | 1405 | ||
1147 | set_bit(idx, cpuc->used_mask); | 1406 | set_bit(idx, cpuc->used_mask); |
1148 | hwc->idx = idx; | 1407 | hwc->idx = idx; |
1149 | } | 1408 | } |
1150 | hwc->config_base = x86_pmu.eventsel; | 1409 | hwc->config_base = x86_pmu.eventsel; |
1151 | hwc->counter_base = x86_pmu.perfctr; | 1410 | hwc->event_base = x86_pmu.perfctr; |
1152 | } | 1411 | } |
1153 | 1412 | ||
1154 | perf_counters_lapic_init(); | 1413 | perf_events_lapic_init(); |
1155 | 1414 | ||
1156 | x86_pmu.disable(hwc, idx); | 1415 | x86_pmu.disable(hwc, idx); |
1157 | 1416 | ||
1158 | cpuc->counters[idx] = counter; | 1417 | cpuc->events[idx] = event; |
1159 | set_bit(idx, cpuc->active_mask); | 1418 | set_bit(idx, cpuc->active_mask); |
1160 | 1419 | ||
1161 | x86_perf_counter_set_period(counter, hwc, idx); | 1420 | x86_perf_event_set_period(event, hwc, idx); |
1162 | x86_pmu.enable(hwc, idx); | 1421 | x86_pmu.enable(hwc, idx); |
1163 | 1422 | ||
1164 | perf_counter_update_userpage(counter); | 1423 | perf_event_update_userpage(event); |
1165 | 1424 | ||
1166 | return 0; | 1425 | return 0; |
1167 | } | 1426 | } |
1168 | 1427 | ||
1169 | static void x86_pmu_unthrottle(struct perf_counter *counter) | 1428 | static void x86_pmu_unthrottle(struct perf_event *event) |
1170 | { | 1429 | { |
1171 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1430 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); |
1172 | struct hw_perf_counter *hwc = &counter->hw; | 1431 | struct hw_perf_event *hwc = &event->hw; |
1173 | 1432 | ||
1174 | if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || | 1433 | if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || |
1175 | cpuc->counters[hwc->idx] != counter)) | 1434 | cpuc->events[hwc->idx] != event)) |
1176 | return; | 1435 | return; |
1177 | 1436 | ||
1178 | x86_pmu.enable(hwc, hwc->idx); | 1437 | x86_pmu.enable(hwc, hwc->idx); |
1179 | } | 1438 | } |
1180 | 1439 | ||
1181 | void perf_counter_print_debug(void) | 1440 | void perf_event_print_debug(void) |
1182 | { | 1441 | { |
1183 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; | 1442 | u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; |
1184 | struct cpu_hw_counters *cpuc; | 1443 | struct cpu_hw_events *cpuc; |
1185 | unsigned long flags; | 1444 | unsigned long flags; |
1186 | int cpu, idx; | 1445 | int cpu, idx; |
1187 | 1446 | ||
1188 | if (!x86_pmu.num_counters) | 1447 | if (!x86_pmu.num_events) |
1189 | return; | 1448 | return; |
1190 | 1449 | ||
1191 | local_irq_save(flags); | 1450 | local_irq_save(flags); |
1192 | 1451 | ||
1193 | cpu = smp_processor_id(); | 1452 | cpu = smp_processor_id(); |
1194 | cpuc = &per_cpu(cpu_hw_counters, cpu); | 1453 | cpuc = &per_cpu(cpu_hw_events, cpu); |
1195 | 1454 | ||
1196 | if (x86_pmu.version >= 2) { | 1455 | if (x86_pmu.version >= 2) { |
1197 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); | 1456 | rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); |
@@ -1207,11 +1466,11 @@ void perf_counter_print_debug(void) | |||
1207 | } | 1466 | } |
1208 | pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); | 1467 | pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); |
1209 | 1468 | ||
1210 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1469 | for (idx = 0; idx < x86_pmu.num_events; idx++) { |
1211 | rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); | 1470 | rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); |
1212 | rdmsrl(x86_pmu.perfctr + idx, pmc_count); | 1471 | rdmsrl(x86_pmu.perfctr + idx, pmc_count); |
1213 | 1472 | ||
1214 | prev_left = per_cpu(prev_left[idx], cpu); | 1473 | prev_left = per_cpu(pmc_prev_left[idx], cpu); |
1215 | 1474 | ||
1216 | pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", | 1475 | pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", |
1217 | cpu, idx, pmc_ctrl); | 1476 | cpu, idx, pmc_ctrl); |
@@ -1220,7 +1479,7 @@ void perf_counter_print_debug(void) | |||
1220 | pr_info("CPU#%d: gen-PMC%d left: %016llx\n", | 1479 | pr_info("CPU#%d: gen-PMC%d left: %016llx\n", |
1221 | cpu, idx, prev_left); | 1480 | cpu, idx, prev_left); |
1222 | } | 1481 | } |
1223 | for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { | 1482 | for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { |
1224 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); | 1483 | rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); |
1225 | 1484 | ||
1226 | pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", | 1485 | pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", |
@@ -1229,10 +1488,69 @@ void perf_counter_print_debug(void) | |||
1229 | local_irq_restore(flags); | 1488 | local_irq_restore(flags); |
1230 | } | 1489 | } |
1231 | 1490 | ||
1232 | static void x86_pmu_disable(struct perf_counter *counter) | 1491 | static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) |
1233 | { | 1492 | { |
1234 | struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); | 1493 | struct debug_store *ds = cpuc->ds; |
1235 | struct hw_perf_counter *hwc = &counter->hw; | 1494 | struct bts_record { |
1495 | u64 from; | ||
1496 | u64 to; | ||
1497 | u64 flags; | ||
1498 | }; | ||
1499 | struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; | ||
1500 | struct bts_record *at, *top; | ||
1501 | struct perf_output_handle handle; | ||
1502 | struct perf_event_header header; | ||
1503 | struct perf_sample_data data; | ||
1504 | struct pt_regs regs; | ||
1505 | |||
1506 | if (!event) | ||
1507 | return; | ||
1508 | |||
1509 | if (!ds) | ||
1510 | return; | ||
1511 | |||
1512 | at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; | ||
1513 | top = (struct bts_record *)(unsigned long)ds->bts_index; | ||
1514 | |||
1515 | if (top <= at) | ||
1516 | return; | ||
1517 | |||
1518 | ds->bts_index = ds->bts_buffer_base; | ||
1519 | |||
1520 | |||
1521 | data.period = event->hw.last_period; | ||
1522 | data.addr = 0; | ||
1523 | regs.ip = 0; | ||
1524 | |||
1525 | /* | ||
1526 | * Prepare a generic sample, i.e. fill in the invariant fields. | ||
1527 | * We will overwrite the from and to address before we output | ||
1528 | * the sample. | ||
1529 | */ | ||
1530 | perf_prepare_sample(&header, &data, event, ®s); | ||
1531 | |||
1532 | if (perf_output_begin(&handle, event, | ||
1533 | header.size * (top - at), 1, 1)) | ||
1534 | return; | ||
1535 | |||
1536 | for (; at < top; at++) { | ||
1537 | data.ip = at->from; | ||
1538 | data.addr = at->to; | ||
1539 | |||
1540 | perf_output_sample(&handle, &header, &data, event); | ||
1541 | } | ||
1542 | |||
1543 | perf_output_end(&handle); | ||
1544 | |||
1545 | /* There's new data available. */ | ||
1546 | event->hw.interrupts++; | ||
1547 | event->pending_kill = POLL_IN; | ||
1548 | } | ||
1549 | |||
1550 | static void x86_pmu_disable(struct perf_event *event) | ||
1551 | { | ||
1552 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | ||
1553 | struct hw_perf_event *hwc = &event->hw; | ||
1236 | int idx = hwc->idx; | 1554 | int idx = hwc->idx; |
1237 | 1555 | ||
1238 | /* | 1556 | /* |
@@ -1244,59 +1562,67 @@ static void x86_pmu_disable(struct perf_counter *counter) | |||
1244 | 1562 | ||
1245 | /* | 1563 | /* |
1246 | * Make sure the cleared pointer becomes visible before we | 1564 | * Make sure the cleared pointer becomes visible before we |
1247 | * (potentially) free the counter: | 1565 | * (potentially) free the event: |
1248 | */ | 1566 | */ |
1249 | barrier(); | 1567 | barrier(); |
1250 | 1568 | ||
1251 | /* | 1569 | /* |
1252 | * Drain the remaining delta count out of a counter | 1570 | * Drain the remaining delta count out of a event |
1253 | * that we are disabling: | 1571 | * that we are disabling: |
1254 | */ | 1572 | */ |
1255 | x86_perf_counter_update(counter, hwc, idx); | 1573 | x86_perf_event_update(event, hwc, idx); |
1256 | cpuc->counters[idx] = NULL; | 1574 | |
1575 | /* Drain the remaining BTS records. */ | ||
1576 | if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) | ||
1577 | intel_pmu_drain_bts_buffer(cpuc); | ||
1578 | |||
1579 | cpuc->events[idx] = NULL; | ||
1257 | clear_bit(idx, cpuc->used_mask); | 1580 | clear_bit(idx, cpuc->used_mask); |
1258 | 1581 | ||
1259 | perf_counter_update_userpage(counter); | 1582 | perf_event_update_userpage(event); |
1260 | } | 1583 | } |
1261 | 1584 | ||
1262 | /* | 1585 | /* |
1263 | * Save and restart an expired counter. Called by NMI contexts, | 1586 | * Save and restart an expired event. Called by NMI contexts, |
1264 | * so it has to be careful about preempting normal counter ops: | 1587 | * so it has to be careful about preempting normal event ops: |
1265 | */ | 1588 | */ |
1266 | static int intel_pmu_save_and_restart(struct perf_counter *counter) | 1589 | static int intel_pmu_save_and_restart(struct perf_event *event) |
1267 | { | 1590 | { |
1268 | struct hw_perf_counter *hwc = &counter->hw; | 1591 | struct hw_perf_event *hwc = &event->hw; |
1269 | int idx = hwc->idx; | 1592 | int idx = hwc->idx; |
1270 | int ret; | 1593 | int ret; |
1271 | 1594 | ||
1272 | x86_perf_counter_update(counter, hwc, idx); | 1595 | x86_perf_event_update(event, hwc, idx); |
1273 | ret = x86_perf_counter_set_period(counter, hwc, idx); | 1596 | ret = x86_perf_event_set_period(event, hwc, idx); |
1274 | 1597 | ||
1275 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) | 1598 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
1276 | intel_pmu_enable_counter(hwc, idx); | 1599 | intel_pmu_enable_event(hwc, idx); |
1277 | 1600 | ||
1278 | return ret; | 1601 | return ret; |
1279 | } | 1602 | } |
1280 | 1603 | ||
1281 | static void intel_pmu_reset(void) | 1604 | static void intel_pmu_reset(void) |
1282 | { | 1605 | { |
1606 | struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; | ||
1283 | unsigned long flags; | 1607 | unsigned long flags; |
1284 | int idx; | 1608 | int idx; |
1285 | 1609 | ||
1286 | if (!x86_pmu.num_counters) | 1610 | if (!x86_pmu.num_events) |
1287 | return; | 1611 | return; |
1288 | 1612 | ||
1289 | local_irq_save(flags); | 1613 | local_irq_save(flags); |
1290 | 1614 | ||
1291 | printk("clearing PMU state on CPU#%d\n", smp_processor_id()); | 1615 | printk("clearing PMU state on CPU#%d\n", smp_processor_id()); |
1292 | 1616 | ||
1293 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1617 | for (idx = 0; idx < x86_pmu.num_events; idx++) { |
1294 | checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); | 1618 | checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); |
1295 | checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); | 1619 | checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); |
1296 | } | 1620 | } |
1297 | for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { | 1621 | for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { |
1298 | checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); | 1622 | checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); |
1299 | } | 1623 | } |
1624 | if (ds) | ||
1625 | ds->bts_index = ds->bts_buffer_base; | ||
1300 | 1626 | ||
1301 | local_irq_restore(flags); | 1627 | local_irq_restore(flags); |
1302 | } | 1628 | } |
@@ -1304,39 +1630,38 @@ static void intel_pmu_reset(void) | |||
1304 | static int p6_pmu_handle_irq(struct pt_regs *regs) | 1630 | static int p6_pmu_handle_irq(struct pt_regs *regs) |
1305 | { | 1631 | { |
1306 | struct perf_sample_data data; | 1632 | struct perf_sample_data data; |
1307 | struct cpu_hw_counters *cpuc; | 1633 | struct cpu_hw_events *cpuc; |
1308 | struct perf_counter *counter; | 1634 | struct perf_event *event; |
1309 | struct hw_perf_counter *hwc; | 1635 | struct hw_perf_event *hwc; |
1310 | int idx, handled = 0; | 1636 | int idx, handled = 0; |
1311 | u64 val; | 1637 | u64 val; |
1312 | 1638 | ||
1313 | data.regs = regs; | ||
1314 | data.addr = 0; | 1639 | data.addr = 0; |
1315 | 1640 | ||
1316 | cpuc = &__get_cpu_var(cpu_hw_counters); | 1641 | cpuc = &__get_cpu_var(cpu_hw_events); |
1317 | 1642 | ||
1318 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1643 | for (idx = 0; idx < x86_pmu.num_events; idx++) { |
1319 | if (!test_bit(idx, cpuc->active_mask)) | 1644 | if (!test_bit(idx, cpuc->active_mask)) |
1320 | continue; | 1645 | continue; |
1321 | 1646 | ||
1322 | counter = cpuc->counters[idx]; | 1647 | event = cpuc->events[idx]; |
1323 | hwc = &counter->hw; | 1648 | hwc = &event->hw; |
1324 | 1649 | ||
1325 | val = x86_perf_counter_update(counter, hwc, idx); | 1650 | val = x86_perf_event_update(event, hwc, idx); |
1326 | if (val & (1ULL << (x86_pmu.counter_bits - 1))) | 1651 | if (val & (1ULL << (x86_pmu.event_bits - 1))) |
1327 | continue; | 1652 | continue; |
1328 | 1653 | ||
1329 | /* | 1654 | /* |
1330 | * counter overflow | 1655 | * event overflow |
1331 | */ | 1656 | */ |
1332 | handled = 1; | 1657 | handled = 1; |
1333 | data.period = counter->hw.last_period; | 1658 | data.period = event->hw.last_period; |
1334 | 1659 | ||
1335 | if (!x86_perf_counter_set_period(counter, hwc, idx)) | 1660 | if (!x86_perf_event_set_period(event, hwc, idx)) |
1336 | continue; | 1661 | continue; |
1337 | 1662 | ||
1338 | if (perf_counter_overflow(counter, 1, &data)) | 1663 | if (perf_event_overflow(event, 1, &data, regs)) |
1339 | p6_pmu_disable_counter(hwc, idx); | 1664 | p6_pmu_disable_event(hwc, idx); |
1340 | } | 1665 | } |
1341 | 1666 | ||
1342 | if (handled) | 1667 | if (handled) |
@@ -1352,16 +1677,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) | |||
1352 | static int intel_pmu_handle_irq(struct pt_regs *regs) | 1677 | static int intel_pmu_handle_irq(struct pt_regs *regs) |
1353 | { | 1678 | { |
1354 | struct perf_sample_data data; | 1679 | struct perf_sample_data data; |
1355 | struct cpu_hw_counters *cpuc; | 1680 | struct cpu_hw_events *cpuc; |
1356 | int bit, loops; | 1681 | int bit, loops; |
1357 | u64 ack, status; | 1682 | u64 ack, status; |
1358 | 1683 | ||
1359 | data.regs = regs; | ||
1360 | data.addr = 0; | 1684 | data.addr = 0; |
1361 | 1685 | ||
1362 | cpuc = &__get_cpu_var(cpu_hw_counters); | 1686 | cpuc = &__get_cpu_var(cpu_hw_events); |
1363 | 1687 | ||
1364 | perf_disable(); | 1688 | perf_disable(); |
1689 | intel_pmu_drain_bts_buffer(cpuc); | ||
1365 | status = intel_pmu_get_status(); | 1690 | status = intel_pmu_get_status(); |
1366 | if (!status) { | 1691 | if (!status) { |
1367 | perf_enable(); | 1692 | perf_enable(); |
@@ -1371,8 +1696,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) | |||
1371 | loops = 0; | 1696 | loops = 0; |
1372 | again: | 1697 | again: |
1373 | if (++loops > 100) { | 1698 | if (++loops > 100) { |
1374 | WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); | 1699 | WARN_ONCE(1, "perfevents: irq loop stuck!\n"); |
1375 | perf_counter_print_debug(); | 1700 | perf_event_print_debug(); |
1376 | intel_pmu_reset(); | 1701 | intel_pmu_reset(); |
1377 | perf_enable(); | 1702 | perf_enable(); |
1378 | return 1; | 1703 | return 1; |
@@ -1381,19 +1706,19 @@ again: | |||
1381 | inc_irq_stat(apic_perf_irqs); | 1706 | inc_irq_stat(apic_perf_irqs); |
1382 | ack = status; | 1707 | ack = status; |
1383 | for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { | 1708 | for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { |
1384 | struct perf_counter *counter = cpuc->counters[bit]; | 1709 | struct perf_event *event = cpuc->events[bit]; |
1385 | 1710 | ||
1386 | clear_bit(bit, (unsigned long *) &status); | 1711 | clear_bit(bit, (unsigned long *) &status); |
1387 | if (!test_bit(bit, cpuc->active_mask)) | 1712 | if (!test_bit(bit, cpuc->active_mask)) |
1388 | continue; | 1713 | continue; |
1389 | 1714 | ||
1390 | if (!intel_pmu_save_and_restart(counter)) | 1715 | if (!intel_pmu_save_and_restart(event)) |
1391 | continue; | 1716 | continue; |
1392 | 1717 | ||
1393 | data.period = counter->hw.last_period; | 1718 | data.period = event->hw.last_period; |
1394 | 1719 | ||
1395 | if (perf_counter_overflow(counter, 1, &data)) | 1720 | if (perf_event_overflow(event, 1, &data, regs)) |
1396 | intel_pmu_disable_counter(&counter->hw, bit); | 1721 | intel_pmu_disable_event(&event->hw, bit); |
1397 | } | 1722 | } |
1398 | 1723 | ||
1399 | intel_pmu_ack_status(ack); | 1724 | intel_pmu_ack_status(ack); |
@@ -1413,39 +1738,38 @@ again: | |||
1413 | static int amd_pmu_handle_irq(struct pt_regs *regs) | 1738 | static int amd_pmu_handle_irq(struct pt_regs *regs) |
1414 | { | 1739 | { |
1415 | struct perf_sample_data data; | 1740 | struct perf_sample_data data; |
1416 | struct cpu_hw_counters *cpuc; | 1741 | struct cpu_hw_events *cpuc; |
1417 | struct perf_counter *counter; | 1742 | struct perf_event *event; |
1418 | struct hw_perf_counter *hwc; | 1743 | struct hw_perf_event *hwc; |
1419 | int idx, handled = 0; | 1744 | int idx, handled = 0; |
1420 | u64 val; | 1745 | u64 val; |
1421 | 1746 | ||
1422 | data.regs = regs; | ||
1423 | data.addr = 0; | 1747 | data.addr = 0; |
1424 | 1748 | ||
1425 | cpuc = &__get_cpu_var(cpu_hw_counters); | 1749 | cpuc = &__get_cpu_var(cpu_hw_events); |
1426 | 1750 | ||
1427 | for (idx = 0; idx < x86_pmu.num_counters; idx++) { | 1751 | for (idx = 0; idx < x86_pmu.num_events; idx++) { |
1428 | if (!test_bit(idx, cpuc->active_mask)) | 1752 | if (!test_bit(idx, cpuc->active_mask)) |
1429 | continue; | 1753 | continue; |
1430 | 1754 | ||
1431 | counter = cpuc->counters[idx]; | 1755 | event = cpuc->events[idx]; |
1432 | hwc = &counter->hw; | 1756 | hwc = &event->hw; |
1433 | 1757 | ||
1434 | val = x86_perf_counter_update(counter, hwc, idx); | 1758 | val = x86_perf_event_update(event, hwc, idx); |
1435 | if (val & (1ULL << (x86_pmu.counter_bits - 1))) | 1759 | if (val & (1ULL << (x86_pmu.event_bits - 1))) |
1436 | continue; | 1760 | continue; |
1437 | 1761 | ||
1438 | /* | 1762 | /* |
1439 | * counter overflow | 1763 | * event overflow |
1440 | */ | 1764 | */ |
1441 | handled = 1; | 1765 | handled = 1; |
1442 | data.period = counter->hw.last_period; | 1766 | data.period = event->hw.last_period; |
1443 | 1767 | ||
1444 | if (!x86_perf_counter_set_period(counter, hwc, idx)) | 1768 | if (!x86_perf_event_set_period(event, hwc, idx)) |
1445 | continue; | 1769 | continue; |
1446 | 1770 | ||
1447 | if (perf_counter_overflow(counter, 1, &data)) | 1771 | if (perf_event_overflow(event, 1, &data, regs)) |
1448 | amd_pmu_disable_counter(hwc, idx); | 1772 | amd_pmu_disable_event(hwc, idx); |
1449 | } | 1773 | } |
1450 | 1774 | ||
1451 | if (handled) | 1775 | if (handled) |
@@ -1459,18 +1783,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) | |||
1459 | irq_enter(); | 1783 | irq_enter(); |
1460 | ack_APIC_irq(); | 1784 | ack_APIC_irq(); |
1461 | inc_irq_stat(apic_pending_irqs); | 1785 | inc_irq_stat(apic_pending_irqs); |
1462 | perf_counter_do_pending(); | 1786 | perf_event_do_pending(); |
1463 | irq_exit(); | 1787 | irq_exit(); |
1464 | } | 1788 | } |
1465 | 1789 | ||
1466 | void set_perf_counter_pending(void) | 1790 | void set_perf_event_pending(void) |
1467 | { | 1791 | { |
1468 | #ifdef CONFIG_X86_LOCAL_APIC | 1792 | #ifdef CONFIG_X86_LOCAL_APIC |
1793 | if (!x86_pmu.apic || !x86_pmu_initialized()) | ||
1794 | return; | ||
1795 | |||
1469 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); | 1796 | apic->send_IPI_self(LOCAL_PENDING_VECTOR); |
1470 | #endif | 1797 | #endif |
1471 | } | 1798 | } |
1472 | 1799 | ||
1473 | void perf_counters_lapic_init(void) | 1800 | void perf_events_lapic_init(void) |
1474 | { | 1801 | { |
1475 | #ifdef CONFIG_X86_LOCAL_APIC | 1802 | #ifdef CONFIG_X86_LOCAL_APIC |
1476 | if (!x86_pmu.apic || !x86_pmu_initialized()) | 1803 | if (!x86_pmu.apic || !x86_pmu_initialized()) |
@@ -1484,13 +1811,13 @@ void perf_counters_lapic_init(void) | |||
1484 | } | 1811 | } |
1485 | 1812 | ||
1486 | static int __kprobes | 1813 | static int __kprobes |
1487 | perf_counter_nmi_handler(struct notifier_block *self, | 1814 | perf_event_nmi_handler(struct notifier_block *self, |
1488 | unsigned long cmd, void *__args) | 1815 | unsigned long cmd, void *__args) |
1489 | { | 1816 | { |
1490 | struct die_args *args = __args; | 1817 | struct die_args *args = __args; |
1491 | struct pt_regs *regs; | 1818 | struct pt_regs *regs; |
1492 | 1819 | ||
1493 | if (!atomic_read(&active_counters)) | 1820 | if (!atomic_read(&active_events)) |
1494 | return NOTIFY_DONE; | 1821 | return NOTIFY_DONE; |
1495 | 1822 | ||
1496 | switch (cmd) { | 1823 | switch (cmd) { |
@@ -1509,7 +1836,7 @@ perf_counter_nmi_handler(struct notifier_block *self, | |||
1509 | #endif | 1836 | #endif |
1510 | /* | 1837 | /* |
1511 | * Can't rely on the handled return value to say it was our NMI, two | 1838 | * Can't rely on the handled return value to say it was our NMI, two |
1512 | * counters could trigger 'simultaneously' raising two back-to-back NMIs. | 1839 | * events could trigger 'simultaneously' raising two back-to-back NMIs. |
1513 | * | 1840 | * |
1514 | * If the first NMI handles both, the latter will be empty and daze | 1841 | * If the first NMI handles both, the latter will be empty and daze |
1515 | * the CPU. | 1842 | * the CPU. |
@@ -1519,8 +1846,8 @@ perf_counter_nmi_handler(struct notifier_block *self, | |||
1519 | return NOTIFY_STOP; | 1846 | return NOTIFY_STOP; |
1520 | } | 1847 | } |
1521 | 1848 | ||
1522 | static __read_mostly struct notifier_block perf_counter_nmi_notifier = { | 1849 | static __read_mostly struct notifier_block perf_event_nmi_notifier = { |
1523 | .notifier_call = perf_counter_nmi_handler, | 1850 | .notifier_call = perf_event_nmi_handler, |
1524 | .next = NULL, | 1851 | .next = NULL, |
1525 | .priority = 1 | 1852 | .priority = 1 |
1526 | }; | 1853 | }; |
@@ -1530,8 +1857,8 @@ static struct x86_pmu p6_pmu = { | |||
1530 | .handle_irq = p6_pmu_handle_irq, | 1857 | .handle_irq = p6_pmu_handle_irq, |
1531 | .disable_all = p6_pmu_disable_all, | 1858 | .disable_all = p6_pmu_disable_all, |
1532 | .enable_all = p6_pmu_enable_all, | 1859 | .enable_all = p6_pmu_enable_all, |
1533 | .enable = p6_pmu_enable_counter, | 1860 | .enable = p6_pmu_enable_event, |
1534 | .disable = p6_pmu_disable_counter, | 1861 | .disable = p6_pmu_disable_event, |
1535 | .eventsel = MSR_P6_EVNTSEL0, | 1862 | .eventsel = MSR_P6_EVNTSEL0, |
1536 | .perfctr = MSR_P6_PERFCTR0, | 1863 | .perfctr = MSR_P6_PERFCTR0, |
1537 | .event_map = p6_pmu_event_map, | 1864 | .event_map = p6_pmu_event_map, |
@@ -1540,16 +1867,16 @@ static struct x86_pmu p6_pmu = { | |||
1540 | .apic = 1, | 1867 | .apic = 1, |
1541 | .max_period = (1ULL << 31) - 1, | 1868 | .max_period = (1ULL << 31) - 1, |
1542 | .version = 0, | 1869 | .version = 0, |
1543 | .num_counters = 2, | 1870 | .num_events = 2, |
1544 | /* | 1871 | /* |
1545 | * Counters have 40 bits implemented. However they are designed such | 1872 | * Events have 40 bits implemented. However they are designed such |
1546 | * that bits [32-39] are sign extensions of bit 31. As such the | 1873 | * that bits [32-39] are sign extensions of bit 31. As such the |
1547 | * effective width of a counter for P6-like PMU is 32 bits only. | 1874 | * effective width of a event for P6-like PMU is 32 bits only. |
1548 | * | 1875 | * |
1549 | * See IA-32 Intel Architecture Software developer manual Vol 3B | 1876 | * See IA-32 Intel Architecture Software developer manual Vol 3B |
1550 | */ | 1877 | */ |
1551 | .counter_bits = 32, | 1878 | .event_bits = 32, |
1552 | .counter_mask = (1ULL << 32) - 1, | 1879 | .event_mask = (1ULL << 32) - 1, |
1553 | }; | 1880 | }; |
1554 | 1881 | ||
1555 | static struct x86_pmu intel_pmu = { | 1882 | static struct x86_pmu intel_pmu = { |
@@ -1557,8 +1884,8 @@ static struct x86_pmu intel_pmu = { | |||
1557 | .handle_irq = intel_pmu_handle_irq, | 1884 | .handle_irq = intel_pmu_handle_irq, |
1558 | .disable_all = intel_pmu_disable_all, | 1885 | .disable_all = intel_pmu_disable_all, |
1559 | .enable_all = intel_pmu_enable_all, | 1886 | .enable_all = intel_pmu_enable_all, |
1560 | .enable = intel_pmu_enable_counter, | 1887 | .enable = intel_pmu_enable_event, |
1561 | .disable = intel_pmu_disable_counter, | 1888 | .disable = intel_pmu_disable_event, |
1562 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, | 1889 | .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, |
1563 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, | 1890 | .perfctr = MSR_ARCH_PERFMON_PERFCTR0, |
1564 | .event_map = intel_pmu_event_map, | 1891 | .event_map = intel_pmu_event_map, |
@@ -1568,9 +1895,11 @@ static struct x86_pmu intel_pmu = { | |||
1568 | /* | 1895 | /* |
1569 | * Intel PMCs cannot be accessed sanely above 32 bit width, | 1896 | * Intel PMCs cannot be accessed sanely above 32 bit width, |
1570 | * so we install an artificial 1<<31 period regardless of | 1897 | * so we install an artificial 1<<31 period regardless of |
1571 | * the generic counter period: | 1898 | * the generic event period: |
1572 | */ | 1899 | */ |
1573 | .max_period = (1ULL << 31) - 1, | 1900 | .max_period = (1ULL << 31) - 1, |
1901 | .enable_bts = intel_pmu_enable_bts, | ||
1902 | .disable_bts = intel_pmu_disable_bts, | ||
1574 | }; | 1903 | }; |
1575 | 1904 | ||
1576 | static struct x86_pmu amd_pmu = { | 1905 | static struct x86_pmu amd_pmu = { |
@@ -1578,16 +1907,16 @@ static struct x86_pmu amd_pmu = { | |||
1578 | .handle_irq = amd_pmu_handle_irq, | 1907 | .handle_irq = amd_pmu_handle_irq, |
1579 | .disable_all = amd_pmu_disable_all, | 1908 | .disable_all = amd_pmu_disable_all, |
1580 | .enable_all = amd_pmu_enable_all, | 1909 | .enable_all = amd_pmu_enable_all, |
1581 | .enable = amd_pmu_enable_counter, | 1910 | .enable = amd_pmu_enable_event, |
1582 | .disable = amd_pmu_disable_counter, | 1911 | .disable = amd_pmu_disable_event, |
1583 | .eventsel = MSR_K7_EVNTSEL0, | 1912 | .eventsel = MSR_K7_EVNTSEL0, |
1584 | .perfctr = MSR_K7_PERFCTR0, | 1913 | .perfctr = MSR_K7_PERFCTR0, |
1585 | .event_map = amd_pmu_event_map, | 1914 | .event_map = amd_pmu_event_map, |
1586 | .raw_event = amd_pmu_raw_event, | 1915 | .raw_event = amd_pmu_raw_event, |
1587 | .max_events = ARRAY_SIZE(amd_perfmon_event_map), | 1916 | .max_events = ARRAY_SIZE(amd_perfmon_event_map), |
1588 | .num_counters = 4, | 1917 | .num_events = 4, |
1589 | .counter_bits = 48, | 1918 | .event_bits = 48, |
1590 | .counter_mask = (1ULL << 48) - 1, | 1919 | .event_mask = (1ULL << 48) - 1, |
1591 | .apic = 1, | 1920 | .apic = 1, |
1592 | /* use highest bit to detect overflow */ | 1921 | /* use highest bit to detect overflow */ |
1593 | .max_period = (1ULL << 47) - 1, | 1922 | .max_period = (1ULL << 47) - 1, |
@@ -1644,7 +1973,7 @@ static int intel_pmu_init(void) | |||
1644 | 1973 | ||
1645 | /* | 1974 | /* |
1646 | * Check whether the Architectural PerfMon supports | 1975 | * Check whether the Architectural PerfMon supports |
1647 | * Branch Misses Retired Event or not. | 1976 | * Branch Misses Retired hw_event or not. |
1648 | */ | 1977 | */ |
1649 | cpuid(10, &eax.full, &ebx, &unused, &edx.full); | 1978 | cpuid(10, &eax.full, &ebx, &unused, &edx.full); |
1650 | if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) | 1979 | if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) |
@@ -1656,15 +1985,15 @@ static int intel_pmu_init(void) | |||
1656 | 1985 | ||
1657 | x86_pmu = intel_pmu; | 1986 | x86_pmu = intel_pmu; |
1658 | x86_pmu.version = version; | 1987 | x86_pmu.version = version; |
1659 | x86_pmu.num_counters = eax.split.num_counters; | 1988 | x86_pmu.num_events = eax.split.num_events; |
1660 | x86_pmu.counter_bits = eax.split.bit_width; | 1989 | x86_pmu.event_bits = eax.split.bit_width; |
1661 | x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; | 1990 | x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; |
1662 | 1991 | ||
1663 | /* | 1992 | /* |
1664 | * Quirk: v2 perfmon does not report fixed-purpose counters, so | 1993 | * Quirk: v2 perfmon does not report fixed-purpose events, so |
1665 | * assume at least 3 counters: | 1994 | * assume at least 3 events: |
1666 | */ | 1995 | */ |
1667 | x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); | 1996 | x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); |
1668 | 1997 | ||
1669 | /* | 1998 | /* |
1670 | * Install the hw-cache-events table: | 1999 | * Install the hw-cache-events table: |
@@ -1711,11 +2040,11 @@ static int amd_pmu_init(void) | |||
1711 | return 0; | 2040 | return 0; |
1712 | } | 2041 | } |
1713 | 2042 | ||
1714 | void __init init_hw_perf_counters(void) | 2043 | void __init init_hw_perf_events(void) |
1715 | { | 2044 | { |
1716 | int err; | 2045 | int err; |
1717 | 2046 | ||
1718 | pr_info("Performance Counters: "); | 2047 | pr_info("Performance Events: "); |
1719 | 2048 | ||
1720 | switch (boot_cpu_data.x86_vendor) { | 2049 | switch (boot_cpu_data.x86_vendor) { |
1721 | case X86_VENDOR_INTEL: | 2050 | case X86_VENDOR_INTEL: |
@@ -1728,45 +2057,45 @@ void __init init_hw_perf_counters(void) | |||
1728 | return; | 2057 | return; |
1729 | } | 2058 | } |
1730 | if (err != 0) { | 2059 | if (err != 0) { |
1731 | pr_cont("no PMU driver, software counters only.\n"); | 2060 | pr_cont("no PMU driver, software events only.\n"); |
1732 | return; | 2061 | return; |
1733 | } | 2062 | } |
1734 | 2063 | ||
1735 | pr_cont("%s PMU driver.\n", x86_pmu.name); | 2064 | pr_cont("%s PMU driver.\n", x86_pmu.name); |
1736 | 2065 | ||
1737 | if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { | 2066 | if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { |
1738 | WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", | 2067 | WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", |
1739 | x86_pmu.num_counters, X86_PMC_MAX_GENERIC); | 2068 | x86_pmu.num_events, X86_PMC_MAX_GENERIC); |
1740 | x86_pmu.num_counters = X86_PMC_MAX_GENERIC; | 2069 | x86_pmu.num_events = X86_PMC_MAX_GENERIC; |
1741 | } | 2070 | } |
1742 | perf_counter_mask = (1 << x86_pmu.num_counters) - 1; | 2071 | perf_event_mask = (1 << x86_pmu.num_events) - 1; |
1743 | perf_max_counters = x86_pmu.num_counters; | 2072 | perf_max_events = x86_pmu.num_events; |
1744 | 2073 | ||
1745 | if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { | 2074 | if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { |
1746 | WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", | 2075 | WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", |
1747 | x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); | 2076 | x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); |
1748 | x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; | 2077 | x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; |
1749 | } | 2078 | } |
1750 | 2079 | ||
1751 | perf_counter_mask |= | 2080 | perf_event_mask |= |
1752 | ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; | 2081 | ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; |
1753 | x86_pmu.intel_ctrl = perf_counter_mask; | 2082 | x86_pmu.intel_ctrl = perf_event_mask; |
1754 | 2083 | ||
1755 | perf_counters_lapic_init(); | 2084 | perf_events_lapic_init(); |
1756 | register_die_notifier(&perf_counter_nmi_notifier); | 2085 | register_die_notifier(&perf_event_nmi_notifier); |
1757 | 2086 | ||
1758 | pr_info("... version: %d\n", x86_pmu.version); | 2087 | pr_info("... version: %d\n", x86_pmu.version); |
1759 | pr_info("... bit width: %d\n", x86_pmu.counter_bits); | 2088 | pr_info("... bit width: %d\n", x86_pmu.event_bits); |
1760 | pr_info("... generic counters: %d\n", x86_pmu.num_counters); | 2089 | pr_info("... generic registers: %d\n", x86_pmu.num_events); |
1761 | pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); | 2090 | pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); |
1762 | pr_info("... max period: %016Lx\n", x86_pmu.max_period); | 2091 | pr_info("... max period: %016Lx\n", x86_pmu.max_period); |
1763 | pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); | 2092 | pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); |
1764 | pr_info("... counter mask: %016Lx\n", perf_counter_mask); | 2093 | pr_info("... event mask: %016Lx\n", perf_event_mask); |
1765 | } | 2094 | } |
1766 | 2095 | ||
1767 | static inline void x86_pmu_read(struct perf_counter *counter) | 2096 | static inline void x86_pmu_read(struct perf_event *event) |
1768 | { | 2097 | { |
1769 | x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); | 2098 | x86_perf_event_update(event, &event->hw, event->hw.idx); |
1770 | } | 2099 | } |
1771 | 2100 | ||
1772 | static const struct pmu pmu = { | 2101 | static const struct pmu pmu = { |
@@ -1776,13 +2105,16 @@ static const struct pmu pmu = { | |||
1776 | .unthrottle = x86_pmu_unthrottle, | 2105 | .unthrottle = x86_pmu_unthrottle, |
1777 | }; | 2106 | }; |
1778 | 2107 | ||
1779 | const struct pmu *hw_perf_counter_init(struct perf_counter *counter) | 2108 | const struct pmu *hw_perf_event_init(struct perf_event *event) |
1780 | { | 2109 | { |
1781 | int err; | 2110 | int err; |
1782 | 2111 | ||
1783 | err = __hw_perf_counter_init(counter); | 2112 | err = __hw_perf_event_init(event); |
1784 | if (err) | 2113 | if (err) { |
2114 | if (event->destroy) | ||
2115 | event->destroy(event); | ||
1785 | return ERR_PTR(err); | 2116 | return ERR_PTR(err); |
2117 | } | ||
1786 | 2118 | ||
1787 | return &pmu; | 2119 | return &pmu; |
1788 | } | 2120 | } |
@@ -1798,8 +2130,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) | |||
1798 | entry->ip[entry->nr++] = ip; | 2130 | entry->ip[entry->nr++] = ip; |
1799 | } | 2131 | } |
1800 | 2132 | ||
1801 | static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); | 2133 | static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); |
1802 | static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); | 2134 | static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); |
1803 | static DEFINE_PER_CPU(int, in_nmi_frame); | 2135 | static DEFINE_PER_CPU(int, in_nmi_frame); |
1804 | 2136 | ||
1805 | 2137 | ||
@@ -1952,9 +2284,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1952 | struct perf_callchain_entry *entry; | 2284 | struct perf_callchain_entry *entry; |
1953 | 2285 | ||
1954 | if (in_nmi()) | 2286 | if (in_nmi()) |
1955 | entry = &__get_cpu_var(nmi_entry); | 2287 | entry = &__get_cpu_var(pmc_nmi_entry); |
1956 | else | 2288 | else |
1957 | entry = &__get_cpu_var(irq_entry); | 2289 | entry = &__get_cpu_var(pmc_irq_entry); |
1958 | 2290 | ||
1959 | entry->nr = 0; | 2291 | entry->nr = 0; |
1960 | 2292 | ||
@@ -1962,3 +2294,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
1962 | 2294 | ||
1963 | return entry; | 2295 | return entry; |
1964 | } | 2296 | } |
2297 | |||
2298 | void hw_perf_event_setup_online(int cpu) | ||
2299 | { | ||
2300 | init_debug_store_on_cpu(cpu); | ||
2301 | } | ||
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index e60ed740d2b..fab786f60ed 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -20,7 +20,7 @@ | |||
20 | #include <linux/kprobes.h> | 20 | #include <linux/kprobes.h> |
21 | 21 | ||
22 | #include <asm/apic.h> | 22 | #include <asm/apic.h> |
23 | #include <asm/perf_counter.h> | 23 | #include <asm/perf_event.h> |
24 | 24 | ||
25 | struct nmi_watchdog_ctlblk { | 25 | struct nmi_watchdog_ctlblk { |
26 | unsigned int cccr_msr; | 26 | unsigned int cccr_msr; |
@@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) | |||
68 | /* returns the bit offset of the performance counter register */ | 68 | /* returns the bit offset of the performance counter register */ |
69 | switch (boot_cpu_data.x86_vendor) { | 69 | switch (boot_cpu_data.x86_vendor) { |
70 | case X86_VENDOR_AMD: | 70 | case X86_VENDOR_AMD: |
71 | return (msr - MSR_K7_PERFCTR0); | 71 | return msr - MSR_K7_PERFCTR0; |
72 | case X86_VENDOR_INTEL: | 72 | case X86_VENDOR_INTEL: |
73 | if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | 73 | if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) |
74 | return (msr - MSR_ARCH_PERFMON_PERFCTR0); | 74 | return msr - MSR_ARCH_PERFMON_PERFCTR0; |
75 | 75 | ||
76 | switch (boot_cpu_data.x86) { | 76 | switch (boot_cpu_data.x86) { |
77 | case 6: | 77 | case 6: |
78 | return (msr - MSR_P6_PERFCTR0); | 78 | return msr - MSR_P6_PERFCTR0; |
79 | case 15: | 79 | case 15: |
80 | return (msr - MSR_P4_BPU_PERFCTR0); | 80 | return msr - MSR_P4_BPU_PERFCTR0; |
81 | } | 81 | } |
82 | } | 82 | } |
83 | return 0; | 83 | return 0; |
@@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) | |||
92 | /* returns the bit offset of the event selection register */ | 92 | /* returns the bit offset of the event selection register */ |
93 | switch (boot_cpu_data.x86_vendor) { | 93 | switch (boot_cpu_data.x86_vendor) { |
94 | case X86_VENDOR_AMD: | 94 | case X86_VENDOR_AMD: |
95 | return (msr - MSR_K7_EVNTSEL0); | 95 | return msr - MSR_K7_EVNTSEL0; |
96 | case X86_VENDOR_INTEL: | 96 | case X86_VENDOR_INTEL: |
97 | if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) | 97 | if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) |
98 | return (msr - MSR_ARCH_PERFMON_EVENTSEL0); | 98 | return msr - MSR_ARCH_PERFMON_EVENTSEL0; |
99 | 99 | ||
100 | switch (boot_cpu_data.x86) { | 100 | switch (boot_cpu_data.x86) { |
101 | case 6: | 101 | case 6: |
102 | return (msr - MSR_P6_EVNTSEL0); | 102 | return msr - MSR_P6_EVNTSEL0; |
103 | case 15: | 103 | case 15: |
104 | return (msr - MSR_P4_BSU_ESCR0); | 104 | return msr - MSR_P4_BSU_ESCR0; |
105 | } | 105 | } |
106 | } | 106 | } |
107 | return 0; | 107 | return 0; |
@@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) | |||
113 | { | 113 | { |
114 | BUG_ON(counter > NMI_MAX_COUNTER_BITS); | 114 | BUG_ON(counter > NMI_MAX_COUNTER_BITS); |
115 | 115 | ||
116 | return (!test_bit(counter, perfctr_nmi_owner)); | 116 | return !test_bit(counter, perfctr_nmi_owner); |
117 | } | 117 | } |
118 | 118 | ||
119 | /* checks the an msr for availability */ | 119 | /* checks the an msr for availability */ |
@@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) | |||
124 | counter = nmi_perfctr_msr_to_bit(msr); | 124 | counter = nmi_perfctr_msr_to_bit(msr); |
125 | BUG_ON(counter > NMI_MAX_COUNTER_BITS); | 125 | BUG_ON(counter > NMI_MAX_COUNTER_BITS); |
126 | 126 | ||
127 | return (!test_bit(counter, perfctr_nmi_owner)); | 127 | return !test_bit(counter, perfctr_nmi_owner); |
128 | } | 128 | } |
129 | EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); | 129 | EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); |
130 | 130 | ||
@@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) | |||
237 | */ | 237 | */ |
238 | counter_val = (u64)cpu_khz * 1000; | 238 | counter_val = (u64)cpu_khz * 1000; |
239 | do_div(counter_val, retval); | 239 | do_div(counter_val, retval); |
240 | if (counter_val > 0x7fffffffULL) { | 240 | if (counter_val > 0x7fffffffULL) { |
241 | u64 count = (u64)cpu_khz * 1000; | 241 | u64 count = (u64)cpu_khz * 1000; |
242 | do_div(count, 0x7fffffffUL); | 242 | do_div(count, 0x7fffffffUL); |
243 | retval = count + 1; | 243 | retval = count + 1; |
@@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, | |||
251 | u64 count = (u64)cpu_khz * 1000; | 251 | u64 count = (u64)cpu_khz * 1000; |
252 | 252 | ||
253 | do_div(count, nmi_hz); | 253 | do_div(count, nmi_hz); |
254 | if(descr) | 254 | if (descr) |
255 | pr_debug("setting %s to -0x%08Lx\n", descr, count); | 255 | pr_debug("setting %s to -0x%08Lx\n", descr, count); |
256 | wrmsrl(perfctr_msr, 0 - count); | 256 | wrmsrl(perfctr_msr, 0 - count); |
257 | } | 257 | } |
@@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, | |||
262 | u64 count = (u64)cpu_khz * 1000; | 262 | u64 count = (u64)cpu_khz * 1000; |
263 | 263 | ||
264 | do_div(count, nmi_hz); | 264 | do_div(count, nmi_hz); |
265 | if(descr) | 265 | if (descr) |
266 | pr_debug("setting %s to -0x%08Lx\n", descr, count); | 266 | pr_debug("setting %s to -0x%08Lx\n", descr, count); |
267 | wrmsr(perfctr_msr, (u32)(-count), 0); | 267 | wrmsr(perfctr_msr, (u32)(-count), 0); |
268 | } | 268 | } |
@@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) | |||
296 | 296 | ||
297 | /* setup the timer */ | 297 | /* setup the timer */ |
298 | wrmsr(evntsel_msr, evntsel, 0); | 298 | wrmsr(evntsel_msr, evntsel, 0); |
299 | write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); | 299 | write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); |
300 | 300 | ||
301 | /* initialize the wd struct before enabling */ | 301 | /* initialize the wd struct before enabling */ |
302 | wd->perfctr_msr = perfctr_msr; | 302 | wd->perfctr_msr = perfctr_msr; |
@@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) | |||
387 | /* setup the timer */ | 387 | /* setup the timer */ |
388 | wrmsr(evntsel_msr, evntsel, 0); | 388 | wrmsr(evntsel_msr, evntsel, 0); |
389 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); | 389 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); |
390 | write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); | 390 | write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); |
391 | 391 | ||
392 | /* initialize the wd struct before enabling */ | 392 | /* initialize the wd struct before enabling */ |
393 | wd->perfctr_msr = perfctr_msr; | 393 | wd->perfctr_msr = perfctr_msr; |
@@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | |||
415 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 415 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
416 | 416 | ||
417 | /* P6/ARCH_PERFMON has 32 bit counter write */ | 417 | /* P6/ARCH_PERFMON has 32 bit counter write */ |
418 | write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); | 418 | write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); |
419 | } | 419 | } |
420 | 420 | ||
421 | static const struct wd_ops p6_wd_ops = { | 421 | static const struct wd_ops p6_wd_ops = { |
@@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz) | |||
490 | if (smp_num_siblings == 2) { | 490 | if (smp_num_siblings == 2) { |
491 | unsigned int ebx, apicid; | 491 | unsigned int ebx, apicid; |
492 | 492 | ||
493 | ebx = cpuid_ebx(1); | 493 | ebx = cpuid_ebx(1); |
494 | apicid = (ebx >> 24) & 0xff; | 494 | apicid = (ebx >> 24) & 0xff; |
495 | ht_num = apicid & 1; | 495 | ht_num = apicid & 1; |
496 | } else | 496 | } else |
497 | #endif | 497 | #endif |
498 | ht_num = 0; | 498 | ht_num = 0; |
@@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz) | |||
544 | } | 544 | } |
545 | 545 | ||
546 | evntsel = P4_ESCR_EVENT_SELECT(0x3F) | 546 | evntsel = P4_ESCR_EVENT_SELECT(0x3F) |
547 | | P4_ESCR_OS | 547 | | P4_ESCR_OS |
548 | | P4_ESCR_USR; | 548 | | P4_ESCR_USR; |
549 | 549 | ||
550 | cccr_val |= P4_CCCR_THRESHOLD(15) | 550 | cccr_val |= P4_CCCR_THRESHOLD(15) |
@@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | |||
612 | { | 612 | { |
613 | unsigned dummy; | 613 | unsigned dummy; |
614 | /* | 614 | /* |
615 | * P4 quirks: | 615 | * P4 quirks: |
616 | * - An overflown perfctr will assert its interrupt | 616 | * - An overflown perfctr will assert its interrupt |
617 | * until the OVF flag in its CCCR is cleared. | 617 | * until the OVF flag in its CCCR is cleared. |
618 | * - LVTPC is masked on interrupt and must be | 618 | * - LVTPC is masked on interrupt and must be |
@@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) | |||
662 | * NOTE: Corresponding bit = 0 in ebx indicates event present. | 662 | * NOTE: Corresponding bit = 0 in ebx indicates event present. |
663 | */ | 663 | */ |
664 | cpuid(10, &(eax.full), &ebx, &unused, &unused); | 664 | cpuid(10, &(eax.full), &ebx, &unused, &unused); |
665 | if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || | 665 | if ((eax.split.mask_length < |
666 | (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || | ||
666 | (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) | 667 | (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) |
667 | return 0; | 668 | return 0; |
668 | 669 | ||
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246..62ac8cb6ba2 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c | |||
@@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) | |||
116 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); | 116 | seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); |
117 | #endif | 117 | #endif |
118 | seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); | 118 | seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); |
119 | #ifdef CONFIG_X86_64 | ||
120 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); | 119 | seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); |
121 | seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", | 120 | seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", |
122 | c->x86_phys_bits, c->x86_virt_bits); | 121 | c->x86_phys_bits, c->x86_virt_bits); |
123 | #endif | ||
124 | 122 | ||
125 | seq_printf(m, "power management:"); | 123 | seq_printf(m, "power management:"); |
126 | for (i = 0; i < 32; i++) { | 124 | for (i = 0; i < 32; i++) { |
@@ -128,7 +126,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) | |||
128 | if (i < ARRAY_SIZE(x86_power_flags) && | 126 | if (i < ARRAY_SIZE(x86_power_flags) && |
129 | x86_power_flags[i]) | 127 | x86_power_flags[i]) |
130 | seq_printf(m, "%s%s", | 128 | seq_printf(m, "%s%s", |
131 | x86_power_flags[i][0]?" ":"", | 129 | x86_power_flags[i][0] ? " " : "", |
132 | x86_power_flags[i]); | 130 | x86_power_flags[i]); |
133 | else | 131 | else |
134 | seq_printf(m, " [%d]", i); | 132 | seq_printf(m, " [%d]", i); |
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 00000000000..a640ae5ad20 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c | |||
@@ -0,0 +1,55 @@ | |||
1 | #include <linux/sched.h> | ||
2 | #include <linux/math64.h> | ||
3 | #include <linux/percpu.h> | ||
4 | #include <linux/irqflags.h> | ||
5 | |||
6 | #include <asm/cpufeature.h> | ||
7 | #include <asm/processor.h> | ||
8 | |||
9 | #ifdef CONFIG_SMP | ||
10 | |||
11 | static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); | ||
12 | |||
13 | static unsigned long scale_aperfmperf(void) | ||
14 | { | ||
15 | struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); | ||
16 | unsigned long ratio, flags; | ||
17 | |||
18 | local_irq_save(flags); | ||
19 | get_aperfmperf(&val); | ||
20 | local_irq_restore(flags); | ||
21 | |||
22 | ratio = calc_aperfmperf_ratio(old, &val); | ||
23 | *old = val; | ||
24 | |||
25 | return ratio; | ||
26 | } | ||
27 | |||
28 | unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
29 | { | ||
30 | /* | ||
31 | * do aperf/mperf on the cpu level because it includes things | ||
32 | * like turbo mode, which are relevant to full cores. | ||
33 | */ | ||
34 | if (boot_cpu_has(X86_FEATURE_APERFMPERF)) | ||
35 | return scale_aperfmperf(); | ||
36 | |||
37 | /* | ||
38 | * maybe have something cpufreq here | ||
39 | */ | ||
40 | |||
41 | return default_scale_freq_power(sd, cpu); | ||
42 | } | ||
43 | |||
44 | unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
45 | { | ||
46 | /* | ||
47 | * aperf/mperf already includes the smt gain | ||
48 | */ | ||
49 | if (boot_cpu_has(X86_FEATURE_APERFMPERF)) | ||
50 | return SCHED_LOAD_SCALE; | ||
51 | |||
52 | return default_scale_smt_power(sd, cpu); | ||
53 | } | ||
54 | |||
55 | #endif | ||
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 284c399e323..1cbed97b59c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/dmi.h> | 24 | #include <linux/dmi.h> |
25 | #include <asm/div64.h> | 25 | #include <asm/div64.h> |
26 | #include <asm/vmware.h> | 26 | #include <asm/vmware.h> |
27 | #include <asm/x86_init.h> | ||
27 | 28 | ||
28 | #define CPUID_VMWARE_INFO_LEAF 0x40000000 | 29 | #define CPUID_VMWARE_INFO_LEAF 0x40000000 |
29 | #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 | 30 | #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 |
@@ -47,19 +48,33 @@ static inline int __vmware_platform(void) | |||
47 | return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; | 48 | return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; |
48 | } | 49 | } |
49 | 50 | ||
50 | static unsigned long __vmware_get_tsc_khz(void) | 51 | static unsigned long vmware_get_tsc_khz(void) |
51 | { | 52 | { |
52 | uint64_t tsc_hz; | 53 | uint64_t tsc_hz; |
53 | uint32_t eax, ebx, ecx, edx; | 54 | uint32_t eax, ebx, ecx, edx; |
55 | |||
56 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); | ||
57 | |||
58 | tsc_hz = eax | (((uint64_t)ebx) << 32); | ||
59 | do_div(tsc_hz, 1000); | ||
60 | BUG_ON(tsc_hz >> 32); | ||
61 | printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", | ||
62 | (unsigned long) tsc_hz / 1000, | ||
63 | (unsigned long) tsc_hz % 1000); | ||
64 | return tsc_hz; | ||
65 | } | ||
66 | |||
67 | void __init vmware_platform_setup(void) | ||
68 | { | ||
69 | uint32_t eax, ebx, ecx, edx; | ||
54 | 70 | ||
55 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); | 71 | VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); |
56 | 72 | ||
57 | if (ebx == UINT_MAX) | 73 | if (ebx != UINT_MAX) |
58 | return 0; | 74 | x86_platform.calibrate_tsc = vmware_get_tsc_khz; |
59 | tsc_hz = eax | (((uint64_t)ebx) << 32); | 75 | else |
60 | do_div(tsc_hz, 1000); | 76 | printk(KERN_WARNING |
61 | BUG_ON(tsc_hz >> 32); | 77 | "Failed to get TSC freq from the hypervisor\n"); |
62 | return tsc_hz; | ||
63 | } | 78 | } |
64 | 79 | ||
65 | /* | 80 | /* |
@@ -87,12 +102,6 @@ int vmware_platform(void) | |||
87 | return 0; | 102 | return 0; |
88 | } | 103 | } |
89 | 104 | ||
90 | unsigned long vmware_get_tsc_khz(void) | ||
91 | { | ||
92 | BUG_ON(!vmware_platform()); | ||
93 | return __vmware_get_tsc_khz(); | ||
94 | } | ||
95 | |||
96 | /* | 105 | /* |
97 | * VMware hypervisor takes care of exporting a reliable TSC to the guest. | 106 | * VMware hypervisor takes care of exporting a reliable TSC to the guest. |
98 | * Still, due to timing difference when running on virtual cpus, the TSC can | 107 | * Still, due to timing difference when running on virtual cpus, the TSC can |
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index b07af886124..6a52d4b36a3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = | |||
182 | .notifier_call = cpuid_class_cpu_callback, | 182 | .notifier_call = cpuid_class_cpu_callback, |
183 | }; | 183 | }; |
184 | 184 | ||
185 | static char *cpuid_nodename(struct device *dev) | 185 | static char *cpuid_devnode(struct device *dev, mode_t *mode) |
186 | { | 186 | { |
187 | return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); | 187 | return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); |
188 | } | 188 | } |
@@ -203,7 +203,7 @@ static int __init cpuid_init(void) | |||
203 | err = PTR_ERR(cpuid_class); | 203 | err = PTR_ERR(cpuid_class); |
204 | goto out_chrdev; | 204 | goto out_chrdev; |
205 | } | 205 | } |
206 | cpuid_class->nodename = cpuid_nodename; | 206 | cpuid_class->devnode = cpuid_devnode; |
207 | for_each_online_cpu(i) { | 207 | for_each_online_cpu(i) { |
208 | err = cpuid_device_create(i); | 208 | err = cpuid_device_create(i); |
209 | if (err != 0) | 209 | if (err != 0) |
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6c09d..37250fe490b 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c | |||
@@ -27,9 +27,7 @@ static void doublefault_fn(void) | |||
27 | 27 | ||
28 | if (ptr_ok(gdt)) { | 28 | if (ptr_ok(gdt)) { |
29 | gdt += GDT_ENTRY_TSS << 3; | 29 | gdt += GDT_ENTRY_TSS << 3; |
30 | tss = *(u16 *)(gdt+2); | 30 | tss = get_desc_base((struct desc_struct *)gdt); |
31 | tss += *(u8 *)(gdt+4) << 16; | ||
32 | tss += *(u8 *)(gdt+7) << 24; | ||
33 | printk(KERN_EMERG "double fault, tss at %08lx\n", tss); | 31 | printk(KERN_EMERG "double fault, tss at %08lx\n", tss); |
34 | 32 | ||
35 | if (ptr_ok(tss)) { | 33 | if (ptr_ok(tss)) { |
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe138603..ef42a038f1a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c | |||
@@ -509,15 +509,15 @@ enum bts_field { | |||
509 | bts_escape = ((unsigned long)-1 & ~bts_qual_mask) | 509 | bts_escape = ((unsigned long)-1 & ~bts_qual_mask) |
510 | }; | 510 | }; |
511 | 511 | ||
512 | static inline unsigned long bts_get(const char *base, enum bts_field field) | 512 | static inline unsigned long bts_get(const char *base, unsigned long field) |
513 | { | 513 | { |
514 | base += (ds_cfg.sizeof_ptr_field * field); | 514 | base += (ds_cfg.sizeof_ptr_field * field); |
515 | return *(unsigned long *)base; | 515 | return *(unsigned long *)base; |
516 | } | 516 | } |
517 | 517 | ||
518 | static inline void bts_set(char *base, enum bts_field field, unsigned long val) | 518 | static inline void bts_set(char *base, unsigned long field, unsigned long val) |
519 | { | 519 | { |
520 | base += (ds_cfg.sizeof_ptr_field * field);; | 520 | base += (ds_cfg.sizeof_ptr_field * field); |
521 | (*(unsigned long *)base) = val; | 521 | (*(unsigned long *)base) = val; |
522 | } | 522 | } |
523 | 523 | ||
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c8405718a4c..2d8a371d433 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/bug.h> | 15 | #include <linux/bug.h> |
16 | #include <linux/nmi.h> | 16 | #include <linux/nmi.h> |
17 | #include <linux/sysfs.h> | 17 | #include <linux/sysfs.h> |
18 | #include <linux/ftrace.h> | ||
19 | 18 | ||
20 | #include <asm/stacktrace.h> | 19 | #include <asm/stacktrace.h> |
21 | 20 | ||
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9..f7dd2a7c3bf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -5,7 +5,6 @@ | |||
5 | #include <linux/kallsyms.h> | 5 | #include <linux/kallsyms.h> |
6 | #include <linux/kprobes.h> | 6 | #include <linux/kprobes.h> |
7 | #include <linux/uaccess.h> | 7 | #include <linux/uaccess.h> |
8 | #include <linux/utsname.h> | ||
9 | #include <linux/hardirq.h> | 8 | #include <linux/hardirq.h> |
10 | #include <linux/kdebug.h> | 9 | #include <linux/kdebug.h> |
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a327676..a071e6be177 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -5,7 +5,6 @@ | |||
5 | #include <linux/kallsyms.h> | 5 | #include <linux/kallsyms.h> |
6 | #include <linux/kprobes.h> | 6 | #include <linux/kprobes.h> |
7 | #include <linux/uaccess.h> | 7 | #include <linux/uaccess.h> |
8 | #include <linux/utsname.h> | ||
9 | #include <linux/hardirq.h> | 8 | #include <linux/hardirq.h> |
10 | #include <linux/kdebug.h> | 9 | #include <linux/kdebug.h> |
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5cb5725b2ba..85419bb7d4a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, | |||
115 | { | 115 | { |
116 | int x = e820x->nr_map; | 116 | int x = e820x->nr_map; |
117 | 117 | ||
118 | if (x == ARRAY_SIZE(e820x->map)) { | 118 | if (x >= ARRAY_SIZE(e820x->map)) { |
119 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); | 119 | printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); |
120 | return; | 120 | return; |
121 | } | 121 | } |
@@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) | |||
1331 | struct resource *res; | 1331 | struct resource *res; |
1332 | u64 end; | 1332 | u64 end; |
1333 | 1333 | ||
1334 | res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); | 1334 | res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); |
1335 | e820_res = res; | 1335 | e820_res = res; |
1336 | for (i = 0; i < e820.nr_map; i++) { | 1336 | for (i = 0; i < e820.nr_map; i++) { |
1337 | end = e820.map[i].addr + e820.map[i].size - 1; | 1337 | end = e820.map[i].addr + e820.map[i].size - 1; |
@@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) | |||
1455 | return who; | 1455 | return who; |
1456 | } | 1456 | } |
1457 | 1457 | ||
1458 | char *__init __attribute__((weak)) machine_specific_memory_setup(void) | ||
1459 | { | ||
1460 | if (x86_quirks->arch_memory_setup) { | ||
1461 | char *who = x86_quirks->arch_memory_setup(); | ||
1462 | |||
1463 | if (who) | ||
1464 | return who; | ||
1465 | } | ||
1466 | return default_machine_specific_memory_setup(); | ||
1467 | } | ||
1468 | |||
1469 | /* Overridden in paravirt.c if CONFIG_PARAVIRT */ | ||
1470 | char * __init __attribute__((weak)) memory_setup(void) | ||
1471 | { | ||
1472 | return machine_specific_memory_setup(); | ||
1473 | } | ||
1474 | |||
1475 | void __init setup_memory_map(void) | 1458 | void __init setup_memory_map(void) |
1476 | { | 1459 | { |
1477 | char *who; | 1460 | char *who; |
1478 | 1461 | ||
1479 | who = memory_setup(); | 1462 | who = x86_init.resources.memory_setup(); |
1480 | memcpy(&e820_saved, &e820, sizeof(struct e820map)); | 1463 | memcpy(&e820_saved, &e820, sizeof(struct e820map)); |
1481 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); | 1464 | printk(KERN_INFO "BIOS-provided physical RAM map:\n"); |
1482 | e820_print_map(who); | 1465 | e820_print_map(who); |
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049d110..41fd965c80c 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -160,721 +160,6 @@ static struct console early_serial_console = { | |||
160 | .index = -1, | 160 | .index = -1, |
161 | }; | 161 | }; |
162 | 162 | ||
163 | #ifdef CONFIG_EARLY_PRINTK_DBGP | ||
164 | |||
165 | static struct ehci_caps __iomem *ehci_caps; | ||
166 | static struct ehci_regs __iomem *ehci_regs; | ||
167 | static struct ehci_dbg_port __iomem *ehci_debug; | ||
168 | static unsigned int dbgp_endpoint_out; | ||
169 | |||
170 | struct ehci_dev { | ||
171 | u32 bus; | ||
172 | u32 slot; | ||
173 | u32 func; | ||
174 | }; | ||
175 | |||
176 | static struct ehci_dev ehci_dev; | ||
177 | |||
178 | #define USB_DEBUG_DEVNUM 127 | ||
179 | |||
180 | #define DBGP_DATA_TOGGLE 0x8800 | ||
181 | |||
182 | static inline u32 dbgp_pid_update(u32 x, u32 tok) | ||
183 | { | ||
184 | return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); | ||
185 | } | ||
186 | |||
187 | static inline u32 dbgp_len_update(u32 x, u32 len) | ||
188 | { | ||
189 | return (x & ~0x0f) | (len & 0x0f); | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * USB Packet IDs (PIDs) | ||
194 | */ | ||
195 | |||
196 | /* token */ | ||
197 | #define USB_PID_OUT 0xe1 | ||
198 | #define USB_PID_IN 0x69 | ||
199 | #define USB_PID_SOF 0xa5 | ||
200 | #define USB_PID_SETUP 0x2d | ||
201 | /* handshake */ | ||
202 | #define USB_PID_ACK 0xd2 | ||
203 | #define USB_PID_NAK 0x5a | ||
204 | #define USB_PID_STALL 0x1e | ||
205 | #define USB_PID_NYET 0x96 | ||
206 | /* data */ | ||
207 | #define USB_PID_DATA0 0xc3 | ||
208 | #define USB_PID_DATA1 0x4b | ||
209 | #define USB_PID_DATA2 0x87 | ||
210 | #define USB_PID_MDATA 0x0f | ||
211 | /* Special */ | ||
212 | #define USB_PID_PREAMBLE 0x3c | ||
213 | #define USB_PID_ERR 0x3c | ||
214 | #define USB_PID_SPLIT 0x78 | ||
215 | #define USB_PID_PING 0xb4 | ||
216 | #define USB_PID_UNDEF_0 0xf0 | ||
217 | |||
218 | #define USB_PID_DATA_TOGGLE 0x88 | ||
219 | #define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) | ||
220 | |||
221 | #define PCI_CAP_ID_EHCI_DEBUG 0xa | ||
222 | |||
223 | #define HUB_ROOT_RESET_TIME 50 /* times are in msec */ | ||
224 | #define HUB_SHORT_RESET_TIME 10 | ||
225 | #define HUB_LONG_RESET_TIME 200 | ||
226 | #define HUB_RESET_TIMEOUT 500 | ||
227 | |||
228 | #define DBGP_MAX_PACKET 8 | ||
229 | |||
230 | static int dbgp_wait_until_complete(void) | ||
231 | { | ||
232 | u32 ctrl; | ||
233 | int loop = 0x100000; | ||
234 | |||
235 | do { | ||
236 | ctrl = readl(&ehci_debug->control); | ||
237 | /* Stop when the transaction is finished */ | ||
238 | if (ctrl & DBGP_DONE) | ||
239 | break; | ||
240 | } while (--loop > 0); | ||
241 | |||
242 | if (!loop) | ||
243 | return -1; | ||
244 | |||
245 | /* | ||
246 | * Now that we have observed the completed transaction, | ||
247 | * clear the done bit. | ||
248 | */ | ||
249 | writel(ctrl | DBGP_DONE, &ehci_debug->control); | ||
250 | return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); | ||
251 | } | ||
252 | |||
253 | static void __init dbgp_mdelay(int ms) | ||
254 | { | ||
255 | int i; | ||
256 | |||
257 | while (ms--) { | ||
258 | for (i = 0; i < 1000; i++) | ||
259 | outb(0x1, 0x80); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | static void dbgp_breath(void) | ||
264 | { | ||
265 | /* Sleep to give the debug port a chance to breathe */ | ||
266 | } | ||
267 | |||
268 | static int dbgp_wait_until_done(unsigned ctrl) | ||
269 | { | ||
270 | u32 pids, lpid; | ||
271 | int ret; | ||
272 | int loop = 3; | ||
273 | |||
274 | retry: | ||
275 | writel(ctrl | DBGP_GO, &ehci_debug->control); | ||
276 | ret = dbgp_wait_until_complete(); | ||
277 | pids = readl(&ehci_debug->pids); | ||
278 | lpid = DBGP_PID_GET(pids); | ||
279 | |||
280 | if (ret < 0) | ||
281 | return ret; | ||
282 | |||
283 | /* | ||
284 | * If the port is getting full or it has dropped data | ||
285 | * start pacing ourselves, not necessary but it's friendly. | ||
286 | */ | ||
287 | if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) | ||
288 | dbgp_breath(); | ||
289 | |||
290 | /* If I get a NACK reissue the transmission */ | ||
291 | if (lpid == USB_PID_NAK) { | ||
292 | if (--loop > 0) | ||
293 | goto retry; | ||
294 | } | ||
295 | |||
296 | return ret; | ||
297 | } | ||
298 | |||
299 | static void dbgp_set_data(const void *buf, int size) | ||
300 | { | ||
301 | const unsigned char *bytes = buf; | ||
302 | u32 lo, hi; | ||
303 | int i; | ||
304 | |||
305 | lo = hi = 0; | ||
306 | for (i = 0; i < 4 && i < size; i++) | ||
307 | lo |= bytes[i] << (8*i); | ||
308 | for (; i < 8 && i < size; i++) | ||
309 | hi |= bytes[i] << (8*(i - 4)); | ||
310 | writel(lo, &ehci_debug->data03); | ||
311 | writel(hi, &ehci_debug->data47); | ||
312 | } | ||
313 | |||
314 | static void __init dbgp_get_data(void *buf, int size) | ||
315 | { | ||
316 | unsigned char *bytes = buf; | ||
317 | u32 lo, hi; | ||
318 | int i; | ||
319 | |||
320 | lo = readl(&ehci_debug->data03); | ||
321 | hi = readl(&ehci_debug->data47); | ||
322 | for (i = 0; i < 4 && i < size; i++) | ||
323 | bytes[i] = (lo >> (8*i)) & 0xff; | ||
324 | for (; i < 8 && i < size; i++) | ||
325 | bytes[i] = (hi >> (8*(i - 4))) & 0xff; | ||
326 | } | ||
327 | |||
328 | static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, | ||
329 | const char *bytes, int size) | ||
330 | { | ||
331 | u32 pids, addr, ctrl; | ||
332 | int ret; | ||
333 | |||
334 | if (size > DBGP_MAX_PACKET) | ||
335 | return -1; | ||
336 | |||
337 | addr = DBGP_EPADDR(devnum, endpoint); | ||
338 | |||
339 | pids = readl(&ehci_debug->pids); | ||
340 | pids = dbgp_pid_update(pids, USB_PID_OUT); | ||
341 | |||
342 | ctrl = readl(&ehci_debug->control); | ||
343 | ctrl = dbgp_len_update(ctrl, size); | ||
344 | ctrl |= DBGP_OUT; | ||
345 | ctrl |= DBGP_GO; | ||
346 | |||
347 | dbgp_set_data(bytes, size); | ||
348 | writel(addr, &ehci_debug->address); | ||
349 | writel(pids, &ehci_debug->pids); | ||
350 | |||
351 | ret = dbgp_wait_until_done(ctrl); | ||
352 | if (ret < 0) | ||
353 | return ret; | ||
354 | |||
355 | return ret; | ||
356 | } | ||
357 | |||
358 | static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, | ||
359 | int size) | ||
360 | { | ||
361 | u32 pids, addr, ctrl; | ||
362 | int ret; | ||
363 | |||
364 | if (size > DBGP_MAX_PACKET) | ||
365 | return -1; | ||
366 | |||
367 | addr = DBGP_EPADDR(devnum, endpoint); | ||
368 | |||
369 | pids = readl(&ehci_debug->pids); | ||
370 | pids = dbgp_pid_update(pids, USB_PID_IN); | ||
371 | |||
372 | ctrl = readl(&ehci_debug->control); | ||
373 | ctrl = dbgp_len_update(ctrl, size); | ||
374 | ctrl &= ~DBGP_OUT; | ||
375 | ctrl |= DBGP_GO; | ||
376 | |||
377 | writel(addr, &ehci_debug->address); | ||
378 | writel(pids, &ehci_debug->pids); | ||
379 | ret = dbgp_wait_until_done(ctrl); | ||
380 | if (ret < 0) | ||
381 | return ret; | ||
382 | |||
383 | if (size > ret) | ||
384 | size = ret; | ||
385 | dbgp_get_data(data, size); | ||
386 | return ret; | ||
387 | } | ||
388 | |||
389 | static int __init dbgp_control_msg(unsigned devnum, int requesttype, | ||
390 | int request, int value, int index, void *data, int size) | ||
391 | { | ||
392 | u32 pids, addr, ctrl; | ||
393 | struct usb_ctrlrequest req; | ||
394 | int read; | ||
395 | int ret; | ||
396 | |||
397 | read = (requesttype & USB_DIR_IN) != 0; | ||
398 | if (size > (read ? DBGP_MAX_PACKET:0)) | ||
399 | return -1; | ||
400 | |||
401 | /* Compute the control message */ | ||
402 | req.bRequestType = requesttype; | ||
403 | req.bRequest = request; | ||
404 | req.wValue = cpu_to_le16(value); | ||
405 | req.wIndex = cpu_to_le16(index); | ||
406 | req.wLength = cpu_to_le16(size); | ||
407 | |||
408 | pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); | ||
409 | addr = DBGP_EPADDR(devnum, 0); | ||
410 | |||
411 | ctrl = readl(&ehci_debug->control); | ||
412 | ctrl = dbgp_len_update(ctrl, sizeof(req)); | ||
413 | ctrl |= DBGP_OUT; | ||
414 | ctrl |= DBGP_GO; | ||
415 | |||
416 | /* Send the setup message */ | ||
417 | dbgp_set_data(&req, sizeof(req)); | ||
418 | writel(addr, &ehci_debug->address); | ||
419 | writel(pids, &ehci_debug->pids); | ||
420 | ret = dbgp_wait_until_done(ctrl); | ||
421 | if (ret < 0) | ||
422 | return ret; | ||
423 | |||
424 | /* Read the result */ | ||
425 | return dbgp_bulk_read(devnum, 0, data, size); | ||
426 | } | ||
427 | |||
428 | |||
429 | /* Find a PCI capability */ | ||
430 | static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) | ||
431 | { | ||
432 | u8 pos; | ||
433 | int bytes; | ||
434 | |||
435 | if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & | ||
436 | PCI_STATUS_CAP_LIST)) | ||
437 | return 0; | ||
438 | |||
439 | pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); | ||
440 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | ||
441 | u8 id; | ||
442 | |||
443 | pos &= ~3; | ||
444 | id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); | ||
445 | if (id == 0xff) | ||
446 | break; | ||
447 | if (id == cap) | ||
448 | return pos; | ||
449 | |||
450 | pos = read_pci_config_byte(num, slot, func, | ||
451 | pos+PCI_CAP_LIST_NEXT); | ||
452 | } | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) | ||
457 | { | ||
458 | u32 class; | ||
459 | |||
460 | class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); | ||
461 | if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) | ||
462 | return 0; | ||
463 | |||
464 | return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); | ||
465 | } | ||
466 | |||
467 | static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) | ||
468 | { | ||
469 | u32 bus, slot, func; | ||
470 | |||
471 | for (bus = 0; bus < 256; bus++) { | ||
472 | for (slot = 0; slot < 32; slot++) { | ||
473 | for (func = 0; func < 8; func++) { | ||
474 | unsigned cap; | ||
475 | |||
476 | cap = __find_dbgp(bus, slot, func); | ||
477 | |||
478 | if (!cap) | ||
479 | continue; | ||
480 | if (ehci_num-- != 0) | ||
481 | continue; | ||
482 | *rbus = bus; | ||
483 | *rslot = slot; | ||
484 | *rfunc = func; | ||
485 | return cap; | ||
486 | } | ||
487 | } | ||
488 | } | ||
489 | return 0; | ||
490 | } | ||
491 | |||
492 | static int __init ehci_reset_port(int port) | ||
493 | { | ||
494 | u32 portsc; | ||
495 | u32 delay_time, delay; | ||
496 | int loop; | ||
497 | |||
498 | /* Reset the usb debug port */ | ||
499 | portsc = readl(&ehci_regs->port_status[port - 1]); | ||
500 | portsc &= ~PORT_PE; | ||
501 | portsc |= PORT_RESET; | ||
502 | writel(portsc, &ehci_regs->port_status[port - 1]); | ||
503 | |||
504 | delay = HUB_ROOT_RESET_TIME; | ||
505 | for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; | ||
506 | delay_time += delay) { | ||
507 | dbgp_mdelay(delay); | ||
508 | |||
509 | portsc = readl(&ehci_regs->port_status[port - 1]); | ||
510 | if (portsc & PORT_RESET) { | ||
511 | /* force reset to complete */ | ||
512 | loop = 2; | ||
513 | writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), | ||
514 | &ehci_regs->port_status[port - 1]); | ||
515 | do { | ||
516 | portsc = readl(&ehci_regs->port_status[port-1]); | ||
517 | } while ((portsc & PORT_RESET) && (--loop > 0)); | ||
518 | } | ||
519 | |||
520 | /* Device went away? */ | ||
521 | if (!(portsc & PORT_CONNECT)) | ||
522 | return -ENOTCONN; | ||
523 | |||
524 | /* bomb out completely if something weird happend */ | ||
525 | if ((portsc & PORT_CSC)) | ||
526 | return -EINVAL; | ||
527 | |||
528 | /* If we've finished resetting, then break out of the loop */ | ||
529 | if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) | ||
530 | return 0; | ||
531 | } | ||
532 | return -EBUSY; | ||
533 | } | ||
534 | |||
535 | static int __init ehci_wait_for_port(int port) | ||
536 | { | ||
537 | u32 status; | ||
538 | int ret, reps; | ||
539 | |||
540 | for (reps = 0; reps < 3; reps++) { | ||
541 | dbgp_mdelay(100); | ||
542 | status = readl(&ehci_regs->status); | ||
543 | if (status & STS_PCD) { | ||
544 | ret = ehci_reset_port(port); | ||
545 | if (ret == 0) | ||
546 | return 0; | ||
547 | } | ||
548 | } | ||
549 | return -ENOTCONN; | ||
550 | } | ||
551 | |||
552 | #ifdef DBGP_DEBUG | ||
553 | # define dbgp_printk early_printk | ||
554 | #else | ||
555 | static inline void dbgp_printk(const char *fmt, ...) { } | ||
556 | #endif | ||
557 | |||
558 | typedef void (*set_debug_port_t)(int port); | ||
559 | |||
560 | static void __init default_set_debug_port(int port) | ||
561 | { | ||
562 | } | ||
563 | |||
564 | static set_debug_port_t __initdata set_debug_port = default_set_debug_port; | ||
565 | |||
566 | static void __init nvidia_set_debug_port(int port) | ||
567 | { | ||
568 | u32 dword; | ||
569 | dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, | ||
570 | 0x74); | ||
571 | dword &= ~(0x0f<<12); | ||
572 | dword |= ((port & 0x0f)<<12); | ||
573 | write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, | ||
574 | dword); | ||
575 | dbgp_printk("set debug port to %d\n", port); | ||
576 | } | ||
577 | |||
578 | static void __init detect_set_debug_port(void) | ||
579 | { | ||
580 | u32 vendorid; | ||
581 | |||
582 | vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, | ||
583 | 0x00); | ||
584 | |||
585 | if ((vendorid & 0xffff) == 0x10de) { | ||
586 | dbgp_printk("using nvidia set_debug_port\n"); | ||
587 | set_debug_port = nvidia_set_debug_port; | ||
588 | } | ||
589 | } | ||
590 | |||
591 | static int __init ehci_setup(void) | ||
592 | { | ||
593 | struct usb_debug_descriptor dbgp_desc; | ||
594 | u32 cmd, ctrl, status, portsc, hcs_params; | ||
595 | u32 debug_port, new_debug_port = 0, n_ports; | ||
596 | u32 devnum; | ||
597 | int ret, i; | ||
598 | int loop; | ||
599 | int port_map_tried; | ||
600 | int playtimes = 3; | ||
601 | |||
602 | try_next_time: | ||
603 | port_map_tried = 0; | ||
604 | |||
605 | try_next_port: | ||
606 | |||
607 | hcs_params = readl(&ehci_caps->hcs_params); | ||
608 | debug_port = HCS_DEBUG_PORT(hcs_params); | ||
609 | n_ports = HCS_N_PORTS(hcs_params); | ||
610 | |||
611 | dbgp_printk("debug_port: %d\n", debug_port); | ||
612 | dbgp_printk("n_ports: %d\n", n_ports); | ||
613 | |||
614 | for (i = 1; i <= n_ports; i++) { | ||
615 | portsc = readl(&ehci_regs->port_status[i-1]); | ||
616 | dbgp_printk("portstatus%d: %08x\n", i, portsc); | ||
617 | } | ||
618 | |||
619 | if (port_map_tried && (new_debug_port != debug_port)) { | ||
620 | if (--playtimes) { | ||
621 | set_debug_port(new_debug_port); | ||
622 | goto try_next_time; | ||
623 | } | ||
624 | return -1; | ||
625 | } | ||
626 | |||
627 | loop = 10; | ||
628 | /* Reset the EHCI controller */ | ||
629 | cmd = readl(&ehci_regs->command); | ||
630 | cmd |= CMD_RESET; | ||
631 | writel(cmd, &ehci_regs->command); | ||
632 | do { | ||
633 | cmd = readl(&ehci_regs->command); | ||
634 | } while ((cmd & CMD_RESET) && (--loop > 0)); | ||
635 | |||
636 | if (!loop) { | ||
637 | dbgp_printk("can not reset ehci\n"); | ||
638 | return -1; | ||
639 | } | ||
640 | dbgp_printk("ehci reset done\n"); | ||
641 | |||
642 | /* Claim ownership, but do not enable yet */ | ||
643 | ctrl = readl(&ehci_debug->control); | ||
644 | ctrl |= DBGP_OWNER; | ||
645 | ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); | ||
646 | writel(ctrl, &ehci_debug->control); | ||
647 | |||
648 | /* Start the ehci running */ | ||
649 | cmd = readl(&ehci_regs->command); | ||
650 | cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); | ||
651 | cmd |= CMD_RUN; | ||
652 | writel(cmd, &ehci_regs->command); | ||
653 | |||
654 | /* Ensure everything is routed to the EHCI */ | ||
655 | writel(FLAG_CF, &ehci_regs->configured_flag); | ||
656 | |||
657 | /* Wait until the controller is no longer halted */ | ||
658 | loop = 10; | ||
659 | do { | ||
660 | status = readl(&ehci_regs->status); | ||
661 | } while ((status & STS_HALT) && (--loop > 0)); | ||
662 | |||
663 | if (!loop) { | ||
664 | dbgp_printk("ehci can be started\n"); | ||
665 | return -1; | ||
666 | } | ||
667 | dbgp_printk("ehci started\n"); | ||
668 | |||
669 | /* Wait for a device to show up in the debug port */ | ||
670 | ret = ehci_wait_for_port(debug_port); | ||
671 | if (ret < 0) { | ||
672 | dbgp_printk("No device found in debug port\n"); | ||
673 | goto next_debug_port; | ||
674 | } | ||
675 | dbgp_printk("ehci wait for port done\n"); | ||
676 | |||
677 | /* Enable the debug port */ | ||
678 | ctrl = readl(&ehci_debug->control); | ||
679 | ctrl |= DBGP_CLAIM; | ||
680 | writel(ctrl, &ehci_debug->control); | ||
681 | ctrl = readl(&ehci_debug->control); | ||
682 | if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { | ||
683 | dbgp_printk("No device in debug port\n"); | ||
684 | writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); | ||
685 | goto err; | ||
686 | } | ||
687 | dbgp_printk("debug ported enabled\n"); | ||
688 | |||
689 | /* Completely transfer the debug device to the debug controller */ | ||
690 | portsc = readl(&ehci_regs->port_status[debug_port - 1]); | ||
691 | portsc &= ~PORT_PE; | ||
692 | writel(portsc, &ehci_regs->port_status[debug_port - 1]); | ||
693 | |||
694 | dbgp_mdelay(100); | ||
695 | |||
696 | /* Find the debug device and make it device number 127 */ | ||
697 | for (devnum = 0; devnum <= 127; devnum++) { | ||
698 | ret = dbgp_control_msg(devnum, | ||
699 | USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, | ||
700 | USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, | ||
701 | &dbgp_desc, sizeof(dbgp_desc)); | ||
702 | if (ret > 0) | ||
703 | break; | ||
704 | } | ||
705 | if (devnum > 127) { | ||
706 | dbgp_printk("Could not find attached debug device\n"); | ||
707 | goto err; | ||
708 | } | ||
709 | if (ret < 0) { | ||
710 | dbgp_printk("Attached device is not a debug device\n"); | ||
711 | goto err; | ||
712 | } | ||
713 | dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; | ||
714 | |||
715 | /* Move the device to 127 if it isn't already there */ | ||
716 | if (devnum != USB_DEBUG_DEVNUM) { | ||
717 | ret = dbgp_control_msg(devnum, | ||
718 | USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, | ||
719 | USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); | ||
720 | if (ret < 0) { | ||
721 | dbgp_printk("Could not move attached device to %d\n", | ||
722 | USB_DEBUG_DEVNUM); | ||
723 | goto err; | ||
724 | } | ||
725 | devnum = USB_DEBUG_DEVNUM; | ||
726 | dbgp_printk("debug device renamed to 127\n"); | ||
727 | } | ||
728 | |||
729 | /* Enable the debug interface */ | ||
730 | ret = dbgp_control_msg(USB_DEBUG_DEVNUM, | ||
731 | USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, | ||
732 | USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); | ||
733 | if (ret < 0) { | ||
734 | dbgp_printk(" Could not enable the debug device\n"); | ||
735 | goto err; | ||
736 | } | ||
737 | dbgp_printk("debug interface enabled\n"); | ||
738 | |||
739 | /* Perform a small write to get the even/odd data state in sync | ||
740 | */ | ||
741 | ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); | ||
742 | if (ret < 0) { | ||
743 | dbgp_printk("dbgp_bulk_write failed: %d\n", ret); | ||
744 | goto err; | ||
745 | } | ||
746 | dbgp_printk("small write doned\n"); | ||
747 | |||
748 | return 0; | ||
749 | err: | ||
750 | /* Things didn't work so remove my claim */ | ||
751 | ctrl = readl(&ehci_debug->control); | ||
752 | ctrl &= ~(DBGP_CLAIM | DBGP_OUT); | ||
753 | writel(ctrl, &ehci_debug->control); | ||
754 | return -1; | ||
755 | |||
756 | next_debug_port: | ||
757 | port_map_tried |= (1<<(debug_port - 1)); | ||
758 | new_debug_port = ((debug_port-1+1)%n_ports) + 1; | ||
759 | if (port_map_tried != ((1<<n_ports) - 1)) { | ||
760 | set_debug_port(new_debug_port); | ||
761 | goto try_next_port; | ||
762 | } | ||
763 | if (--playtimes) { | ||
764 | set_debug_port(new_debug_port); | ||
765 | goto try_next_time; | ||
766 | } | ||
767 | |||
768 | return -1; | ||
769 | } | ||
770 | |||
771 | static int __init early_dbgp_init(char *s) | ||
772 | { | ||
773 | u32 debug_port, bar, offset; | ||
774 | u32 bus, slot, func, cap; | ||
775 | void __iomem *ehci_bar; | ||
776 | u32 dbgp_num; | ||
777 | u32 bar_val; | ||
778 | char *e; | ||
779 | int ret; | ||
780 | u8 byte; | ||
781 | |||
782 | if (!early_pci_allowed()) | ||
783 | return -1; | ||
784 | |||
785 | dbgp_num = 0; | ||
786 | if (*s) | ||
787 | dbgp_num = simple_strtoul(s, &e, 10); | ||
788 | dbgp_printk("dbgp_num: %d\n", dbgp_num); | ||
789 | |||
790 | cap = find_dbgp(dbgp_num, &bus, &slot, &func); | ||
791 | if (!cap) | ||
792 | return -1; | ||
793 | |||
794 | dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, | ||
795 | func); | ||
796 | |||
797 | debug_port = read_pci_config(bus, slot, func, cap); | ||
798 | bar = (debug_port >> 29) & 0x7; | ||
799 | bar = (bar * 4) + 0xc; | ||
800 | offset = (debug_port >> 16) & 0xfff; | ||
801 | dbgp_printk("bar: %02x offset: %03x\n", bar, offset); | ||
802 | if (bar != PCI_BASE_ADDRESS_0) { | ||
803 | dbgp_printk("only debug ports on bar 1 handled.\n"); | ||
804 | |||
805 | return -1; | ||
806 | } | ||
807 | |||
808 | bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); | ||
809 | dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); | ||
810 | if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { | ||
811 | dbgp_printk("only simple 32bit mmio bars supported\n"); | ||
812 | |||
813 | return -1; | ||
814 | } | ||
815 | |||
816 | /* double check if the mem space is enabled */ | ||
817 | byte = read_pci_config_byte(bus, slot, func, 0x04); | ||
818 | if (!(byte & 0x2)) { | ||
819 | byte |= 0x02; | ||
820 | write_pci_config_byte(bus, slot, func, 0x04, byte); | ||
821 | dbgp_printk("mmio for ehci enabled\n"); | ||
822 | } | ||
823 | |||
824 | /* | ||
825 | * FIXME I don't have the bar size so just guess PAGE_SIZE is more | ||
826 | * than enough. 1K is the biggest I have seen. | ||
827 | */ | ||
828 | set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); | ||
829 | ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); | ||
830 | ehci_bar += bar_val & ~PAGE_MASK; | ||
831 | dbgp_printk("ehci_bar: %p\n", ehci_bar); | ||
832 | |||
833 | ehci_caps = ehci_bar; | ||
834 | ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); | ||
835 | ehci_debug = ehci_bar + offset; | ||
836 | ehci_dev.bus = bus; | ||
837 | ehci_dev.slot = slot; | ||
838 | ehci_dev.func = func; | ||
839 | |||
840 | detect_set_debug_port(); | ||
841 | |||
842 | ret = ehci_setup(); | ||
843 | if (ret < 0) { | ||
844 | dbgp_printk("ehci_setup failed\n"); | ||
845 | ehci_debug = NULL; | ||
846 | |||
847 | return -1; | ||
848 | } | ||
849 | |||
850 | return 0; | ||
851 | } | ||
852 | |||
853 | static void early_dbgp_write(struct console *con, const char *str, u32 n) | ||
854 | { | ||
855 | int chunk, ret; | ||
856 | |||
857 | if (!ehci_debug) | ||
858 | return; | ||
859 | while (n > 0) { | ||
860 | chunk = n; | ||
861 | if (chunk > DBGP_MAX_PACKET) | ||
862 | chunk = DBGP_MAX_PACKET; | ||
863 | ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, | ||
864 | dbgp_endpoint_out, str, chunk); | ||
865 | str += chunk; | ||
866 | n -= chunk; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | static struct console early_dbgp_console = { | ||
871 | .name = "earlydbg", | ||
872 | .write = early_dbgp_write, | ||
873 | .flags = CON_PRINTBUFFER, | ||
874 | .index = -1, | ||
875 | }; | ||
876 | #endif | ||
877 | |||
878 | /* Direct interface for emergencies */ | 163 | /* Direct interface for emergencies */ |
879 | static struct console *early_console = &early_vga_console; | 164 | static struct console *early_console = &early_vga_console; |
880 | static int __initdata early_console_initialized; | 165 | static int __initdata early_console_initialized; |
@@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...) | |||
891 | va_end(ap); | 176 | va_end(ap); |
892 | } | 177 | } |
893 | 178 | ||
179 | static inline void early_console_register(struct console *con, int keep_early) | ||
180 | { | ||
181 | if (early_console->index != -1) { | ||
182 | printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", | ||
183 | con->name); | ||
184 | return; | ||
185 | } | ||
186 | early_console = con; | ||
187 | if (keep_early) | ||
188 | early_console->flags &= ~CON_BOOT; | ||
189 | else | ||
190 | early_console->flags |= CON_BOOT; | ||
191 | register_console(early_console); | ||
192 | } | ||
894 | 193 | ||
895 | static int __init setup_early_printk(char *buf) | 194 | static int __init setup_early_printk(char *buf) |
896 | { | 195 | { |
897 | int keep_early; | 196 | int keep; |
898 | 197 | ||
899 | if (!buf) | 198 | if (!buf) |
900 | return 0; | 199 | return 0; |
@@ -903,42 +202,34 @@ static int __init setup_early_printk(char *buf) | |||
903 | return 0; | 202 | return 0; |
904 | early_console_initialized = 1; | 203 | early_console_initialized = 1; |
905 | 204 | ||
906 | keep_early = (strstr(buf, "keep") != NULL); | 205 | keep = (strstr(buf, "keep") != NULL); |
907 | 206 | ||
908 | if (!strncmp(buf, "serial", 6)) { | 207 | while (*buf != '\0') { |
909 | early_serial_init(buf + 6); | 208 | if (!strncmp(buf, "serial", 6)) { |
910 | early_console = &early_serial_console; | 209 | early_serial_init(buf + 6); |
911 | } else if (!strncmp(buf, "ttyS", 4)) { | 210 | early_console_register(&early_serial_console, keep); |
912 | early_serial_init(buf); | 211 | } |
913 | early_console = &early_serial_console; | 212 | if (!strncmp(buf, "ttyS", 4)) { |
914 | } else if (!strncmp(buf, "vga", 3) | 213 | early_serial_init(buf + 4); |
915 | && boot_params.screen_info.orig_video_isVGA == 1) { | 214 | early_console_register(&early_serial_console, keep); |
916 | max_xpos = boot_params.screen_info.orig_video_cols; | 215 | } |
917 | max_ypos = boot_params.screen_info.orig_video_lines; | 216 | if (!strncmp(buf, "vga", 3) && |
918 | current_ypos = boot_params.screen_info.orig_y; | 217 | boot_params.screen_info.orig_video_isVGA == 1) { |
919 | early_console = &early_vga_console; | 218 | max_xpos = boot_params.screen_info.orig_video_cols; |
219 | max_ypos = boot_params.screen_info.orig_video_lines; | ||
220 | current_ypos = boot_params.screen_info.orig_y; | ||
221 | early_console_register(&early_vga_console, keep); | ||
222 | } | ||
920 | #ifdef CONFIG_EARLY_PRINTK_DBGP | 223 | #ifdef CONFIG_EARLY_PRINTK_DBGP |
921 | } else if (!strncmp(buf, "dbgp", 4)) { | 224 | if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) |
922 | if (early_dbgp_init(buf+4) < 0) | 225 | early_console_register(&early_dbgp_console, keep); |
923 | return 0; | ||
924 | early_console = &early_dbgp_console; | ||
925 | /* | ||
926 | * usb subsys will reset ehci controller, so don't keep | ||
927 | * that early console | ||
928 | */ | ||
929 | keep_early = 0; | ||
930 | #endif | 226 | #endif |
931 | #ifdef CONFIG_HVC_XEN | 227 | #ifdef CONFIG_HVC_XEN |
932 | } else if (!strncmp(buf, "xen", 3)) { | 228 | if (!strncmp(buf, "xen", 3)) |
933 | early_console = &xenboot_console; | 229 | early_console_register(&xenboot_console, keep); |
934 | #endif | 230 | #endif |
231 | buf++; | ||
935 | } | 232 | } |
936 | |||
937 | if (keep_early) | ||
938 | early_console->flags &= ~CON_BOOT; | ||
939 | else | ||
940 | early_console->flags |= CON_BOOT; | ||
941 | register_console(early_console); | ||
942 | return 0; | 233 | return 0; |
943 | } | 234 | } |
944 | 235 | ||
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index fe26ba3e345..ad5bd988fb7 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <asm/time.h> | 42 | #include <asm/time.h> |
43 | #include <asm/cacheflush.h> | 43 | #include <asm/cacheflush.h> |
44 | #include <asm/tlbflush.h> | 44 | #include <asm/tlbflush.h> |
45 | #include <asm/x86_init.h> | ||
45 | 46 | ||
46 | #define EFI_DEBUG 1 | 47 | #define EFI_DEBUG 1 |
47 | #define PFX "EFI: " | 48 | #define PFX "EFI: " |
@@ -453,6 +454,9 @@ void __init efi_init(void) | |||
453 | if (add_efi_memmap) | 454 | if (add_efi_memmap) |
454 | do_add_efi_memmap(); | 455 | do_add_efi_memmap(); |
455 | 456 | ||
457 | x86_platform.get_wallclock = efi_get_time; | ||
458 | x86_platform.set_wallclock = efi_set_rtc_mmss; | ||
459 | |||
456 | /* Setup for EFI runtime service */ | 460 | /* Setup for EFI runtime service */ |
457 | reboot_type = BOOT_EFI; | 461 | reboot_type = BOOT_EFI; |
458 | 462 | ||
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be74510..b5c061f8f35 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller) | |||
146 | END(ftrace_graph_caller) | 146 | END(ftrace_graph_caller) |
147 | 147 | ||
148 | GLOBAL(return_to_handler) | 148 | GLOBAL(return_to_handler) |
149 | subq $80, %rsp | 149 | subq $24, %rsp |
150 | 150 | ||
151 | /* Save the return values */ | 151 | /* Save the return values */ |
152 | movq %rax, (%rsp) | 152 | movq %rax, (%rsp) |
@@ -155,10 +155,10 @@ GLOBAL(return_to_handler) | |||
155 | 155 | ||
156 | call ftrace_return_to_handler | 156 | call ftrace_return_to_handler |
157 | 157 | ||
158 | movq %rax, 72(%rsp) | 158 | movq %rax, 16(%rsp) |
159 | movq 8(%rsp), %rdx | 159 | movq 8(%rsp), %rdx |
160 | movq (%rsp), %rax | 160 | movq (%rsp), %rax |
161 | addq $72, %rsp | 161 | addq $16, %rsp |
162 | retq | 162 | retq |
163 | #endif | 163 | #endif |
164 | 164 | ||
@@ -536,20 +536,13 @@ sysret_signal: | |||
536 | bt $TIF_SYSCALL_AUDIT,%edx | 536 | bt $TIF_SYSCALL_AUDIT,%edx |
537 | jc sysret_audit | 537 | jc sysret_audit |
538 | #endif | 538 | #endif |
539 | /* edx: work flags (arg3) */ | 539 | /* |
540 | leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 | 540 | * We have a signal, or exit tracing or single-step. |
541 | xorl %esi,%esi # oldset -> arg2 | 541 | * These all wind up with the iret return path anyway, |
542 | SAVE_REST | 542 | * so just join that path right now. |
543 | FIXUP_TOP_OF_STACK %r11 | 543 | */ |
544 | call do_notify_resume | 544 | FIXUP_TOP_OF_STACK %r11, -ARGOFFSET |
545 | RESTORE_TOP_OF_STACK %r11 | 545 | jmp int_check_syscall_exit_work |
546 | RESTORE_REST | ||
547 | movl $_TIF_WORK_MASK,%edi | ||
548 | /* Use IRET because user could have changed frame. This | ||
549 | works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ | ||
550 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
551 | TRACE_IRQS_OFF | ||
552 | jmp int_with_check | ||
553 | 546 | ||
554 | badsys: | 547 | badsys: |
555 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) | 548 | movq $-ENOSYS,RAX-ARGOFFSET(%rsp) |
@@ -654,6 +647,7 @@ int_careful: | |||
654 | int_very_careful: | 647 | int_very_careful: |
655 | TRACE_IRQS_ON | 648 | TRACE_IRQS_ON |
656 | ENABLE_INTERRUPTS(CLBR_NONE) | 649 | ENABLE_INTERRUPTS(CLBR_NONE) |
650 | int_check_syscall_exit_work: | ||
657 | SAVE_REST | 651 | SAVE_REST |
658 | /* Check for syscall exit trace */ | 652 | /* Check for syscall exit trace */ |
659 | testl $_TIF_WORK_SYSCALL_EXIT,%edx | 653 | testl $_TIF_WORK_SYSCALL_EXIT,%edx |
@@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \ | |||
1021 | apicinterrupt SPURIOUS_APIC_VECTOR \ | 1015 | apicinterrupt SPURIOUS_APIC_VECTOR \ |
1022 | spurious_interrupt smp_spurious_interrupt | 1016 | spurious_interrupt smp_spurious_interrupt |
1023 | 1017 | ||
1024 | #ifdef CONFIG_PERF_COUNTERS | 1018 | #ifdef CONFIG_PERF_EVENTS |
1025 | apicinterrupt LOCAL_PENDING_VECTOR \ | 1019 | apicinterrupt LOCAL_PENDING_VECTOR \ |
1026 | perf_pending_interrupt smp_perf_pending_interrupt | 1020 | perf_pending_interrupt smp_perf_pending_interrupt |
1027 | #endif | 1021 | #endif |
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea3b9f..9dbb527e165 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, | |||
417 | unsigned long return_hooker = (unsigned long) | 417 | unsigned long return_hooker = (unsigned long) |
418 | &return_to_handler; | 418 | &return_to_handler; |
419 | 419 | ||
420 | /* Nmi's are currently unsupported */ | ||
421 | if (unlikely(in_nmi())) | ||
422 | return; | ||
423 | |||
424 | if (unlikely(atomic_read(¤t->tracing_graph_pause))) | 420 | if (unlikely(atomic_read(¤t->tracing_graph_pause))) |
425 | return; | 421 | return; |
426 | 422 | ||
@@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) | |||
498 | 494 | ||
499 | struct syscall_metadata *syscall_nr_to_meta(int nr) | 495 | struct syscall_metadata *syscall_nr_to_meta(int nr) |
500 | { | 496 | { |
501 | if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) | 497 | if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) |
502 | return NULL; | 498 | return NULL; |
503 | 499 | ||
504 | return syscalls_metadata[nr]; | 500 | return syscalls_metadata[nr]; |
505 | } | 501 | } |
506 | 502 | ||
507 | void arch_init_ftrace_syscalls(void) | 503 | int syscall_name_to_nr(char *name) |
504 | { | ||
505 | int i; | ||
506 | |||
507 | if (!syscalls_metadata) | ||
508 | return -1; | ||
509 | |||
510 | for (i = 0; i < NR_syscalls; i++) { | ||
511 | if (syscalls_metadata[i]) { | ||
512 | if (!strcmp(syscalls_metadata[i]->name, name)) | ||
513 | return i; | ||
514 | } | ||
515 | } | ||
516 | return -1; | ||
517 | } | ||
518 | |||
519 | void set_syscall_enter_id(int num, int id) | ||
520 | { | ||
521 | syscalls_metadata[num]->enter_id = id; | ||
522 | } | ||
523 | |||
524 | void set_syscall_exit_id(int num, int id) | ||
525 | { | ||
526 | syscalls_metadata[num]->exit_id = id; | ||
527 | } | ||
528 | |||
529 | static int __init arch_init_ftrace_syscalls(void) | ||
508 | { | 530 | { |
509 | int i; | 531 | int i; |
510 | struct syscall_metadata *meta; | 532 | struct syscall_metadata *meta; |
511 | unsigned long **psys_syscall_table = &sys_call_table; | 533 | unsigned long **psys_syscall_table = &sys_call_table; |
512 | static atomic_t refs; | ||
513 | |||
514 | if (atomic_inc_return(&refs) != 1) | ||
515 | goto end; | ||
516 | 534 | ||
517 | syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * | 535 | syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * |
518 | FTRACE_SYSCALL_MAX, GFP_KERNEL); | 536 | NR_syscalls, GFP_KERNEL); |
519 | if (!syscalls_metadata) { | 537 | if (!syscalls_metadata) { |
520 | WARN_ON(1); | 538 | WARN_ON(1); |
521 | return; | 539 | return -ENOMEM; |
522 | } | 540 | } |
523 | 541 | ||
524 | for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { | 542 | for (i = 0; i < NR_syscalls; i++) { |
525 | meta = find_syscall_meta(psys_syscall_table[i]); | 543 | meta = find_syscall_meta(psys_syscall_table[i]); |
526 | syscalls_metadata[i] = meta; | 544 | syscalls_metadata[i] = meta; |
527 | } | 545 | } |
528 | return; | 546 | return 0; |
529 | |||
530 | /* Paranoid: avoid overflow */ | ||
531 | end: | ||
532 | atomic_dec(&refs); | ||
533 | } | 547 | } |
548 | arch_initcall(arch_init_ftrace_syscalls); | ||
534 | #endif | 549 | #endif |
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c | |||
@@ -11,8 +11,21 @@ | |||
11 | #include <asm/setup.h> | 11 | #include <asm/setup.h> |
12 | #include <asm/sections.h> | 12 | #include <asm/sections.h> |
13 | #include <asm/e820.h> | 13 | #include <asm/e820.h> |
14 | #include <asm/bios_ebda.h> | 14 | #include <asm/page.h> |
15 | #include <asm/trampoline.h> | 15 | #include <asm/trampoline.h> |
16 | #include <asm/apic.h> | ||
17 | #include <asm/io_apic.h> | ||
18 | #include <asm/bios_ebda.h> | ||
19 | |||
20 | static void __init i386_default_early_setup(void) | ||
21 | { | ||
22 | /* Initilize 32bit specific setup functions */ | ||
23 | x86_init.resources.probe_roms = probe_roms; | ||
24 | x86_init.resources.reserve_resources = i386_reserve_resources; | ||
25 | x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; | ||
26 | |||
27 | reserve_ebda_region(); | ||
28 | } | ||
16 | 29 | ||
17 | void __init i386_start_kernel(void) | 30 | void __init i386_start_kernel(void) |
18 | { | 31 | { |
@@ -29,7 +42,16 @@ void __init i386_start_kernel(void) | |||
29 | reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); | 42 | reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); |
30 | } | 43 | } |
31 | #endif | 44 | #endif |
32 | reserve_ebda_region(); | 45 | |
46 | /* Call the subarch specific early setup function */ | ||
47 | switch (boot_params.hdr.hardware_subarch) { | ||
48 | case X86_SUBARCH_MRST: | ||
49 | x86_mrst_early_setup(); | ||
50 | break; | ||
51 | default: | ||
52 | i386_default_early_setup(); | ||
53 | break; | ||
54 | } | ||
33 | 55 | ||
34 | /* | 56 | /* |
35 | * At this point everything still needed from the boot loader | 57 | * At this point everything still needed from the boot loader |
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c | |||
@@ -23,8 +23,8 @@ | |||
23 | #include <asm/sections.h> | 23 | #include <asm/sections.h> |
24 | #include <asm/kdebug.h> | 24 | #include <asm/kdebug.h> |
25 | #include <asm/e820.h> | 25 | #include <asm/e820.h> |
26 | #include <asm/bios_ebda.h> | ||
27 | #include <asm/trampoline.h> | 26 | #include <asm/trampoline.h> |
27 | #include <asm/bios_ebda.h> | ||
28 | 28 | ||
29 | static void __init zap_identity_mappings(void) | 29 | static void __init zap_identity_mappings(void) |
30 | { | 30 | { |
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0d98a01cbdb..050c278481b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) | |||
79 | * any particular GDT layout, because we load our own as soon as we | 79 | * any particular GDT layout, because we load our own as soon as we |
80 | * can. | 80 | * can. |
81 | */ | 81 | */ |
82 | .section .text.head,"ax",@progbits | 82 | __HEAD |
83 | ENTRY(startup_32) | 83 | ENTRY(startup_32) |
84 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking | 84 | /* test KEEP_SEGMENTS flag to see if the bootloader is asking |
85 | us to not reload segments */ | 85 | us to not reload segments */ |
@@ -157,6 +157,7 @@ subarch_entries: | |||
157 | .long default_entry /* normal x86/PC */ | 157 | .long default_entry /* normal x86/PC */ |
158 | .long lguest_entry /* lguest hypervisor */ | 158 | .long lguest_entry /* lguest hypervisor */ |
159 | .long xen_entry /* Xen hypervisor */ | 159 | .long xen_entry /* Xen hypervisor */ |
160 | .long default_entry /* Moorestown MID */ | ||
160 | num_subarch_entries = (. - subarch_entries) / 4 | 161 | num_subarch_entries = (. - subarch_entries) / 4 |
161 | .previous | 162 | .previous |
162 | #endif /* CONFIG_PARAVIRT */ | 163 | #endif /* CONFIG_PARAVIRT */ |
@@ -261,9 +262,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
261 | * which will be freed later | 262 | * which will be freed later |
262 | */ | 263 | */ |
263 | 264 | ||
264 | #ifndef CONFIG_HOTPLUG_CPU | 265 | __CPUINIT |
265 | .section .init.text,"ax",@progbits | ||
266 | #endif | ||
267 | 266 | ||
268 | #ifdef CONFIG_SMP | 267 | #ifdef CONFIG_SMP |
269 | ENTRY(startup_32_smp) | 268 | ENTRY(startup_32_smp) |
@@ -441,7 +440,6 @@ is386: movl $2,%ecx # set MP | |||
441 | jne 1f | 440 | jne 1f |
442 | movl $per_cpu__gdt_page,%eax | 441 | movl $per_cpu__gdt_page,%eax |
443 | movl $per_cpu__stack_canary,%ecx | 442 | movl $per_cpu__stack_canary,%ecx |
444 | subl $20, %ecx | ||
445 | movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) | 443 | movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) |
446 | shrl $16, %ecx | 444 | shrl $16, %ecx |
447 | movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) | 445 | movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) |
@@ -602,11 +600,7 @@ ignore_int: | |||
602 | #endif | 600 | #endif |
603 | iret | 601 | iret |
604 | 602 | ||
605 | #ifndef CONFIG_HOTPLUG_CPU | ||
606 | __CPUINITDATA | ||
607 | #else | ||
608 | __REFDATA | 603 | __REFDATA |
609 | #endif | ||
610 | .align 4 | 604 | .align 4 |
611 | ENTRY(initial_code) | 605 | ENTRY(initial_code) |
612 | .long i386_start_kernel | 606 | .long i386_start_kernel |
@@ -614,7 +608,7 @@ ENTRY(initial_code) | |||
614 | /* | 608 | /* |
615 | * BSS section | 609 | * BSS section |
616 | */ | 610 | */ |
617 | .section ".bss.page_aligned","wa" | 611 | __PAGE_ALIGNED_BSS |
618 | .align PAGE_SIZE_asm | 612 | .align PAGE_SIZE_asm |
619 | #ifdef CONFIG_X86_PAE | 613 | #ifdef CONFIG_X86_PAE |
620 | swapper_pg_pmd: | 614 | swapper_pg_pmd: |
@@ -632,7 +626,7 @@ ENTRY(empty_zero_page) | |||
632 | * This starts the data section. | 626 | * This starts the data section. |
633 | */ | 627 | */ |
634 | #ifdef CONFIG_X86_PAE | 628 | #ifdef CONFIG_X86_PAE |
635 | .section ".data.page_aligned","wa" | 629 | __PAGE_ALIGNED_DATA |
636 | /* Page-aligned for the benefit of paravirt? */ | 630 | /* Page-aligned for the benefit of paravirt? */ |
637 | .align PAGE_SIZE_asm | 631 | .align PAGE_SIZE_asm |
638 | ENTRY(swapper_pg_dir) | 632 | ENTRY(swapper_pg_dir) |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a0..780cd928fcd 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map) | |||
40 | L3_START_KERNEL = pud_index(__START_KERNEL_map) | 40 | L3_START_KERNEL = pud_index(__START_KERNEL_map) |
41 | 41 | ||
42 | .text | 42 | .text |
43 | .section .text.head | 43 | __HEAD |
44 | .code64 | 44 | .code64 |
45 | .globl startup_64 | 45 | .globl startup_64 |
46 | startup_64: | 46 | startup_64: |
@@ -418,7 +418,7 @@ ENTRY(phys_base) | |||
418 | ENTRY(idt_table) | 418 | ENTRY(idt_table) |
419 | .skip IDT_ENTRIES * 16 | 419 | .skip IDT_ENTRIES * 16 |
420 | 420 | ||
421 | .section .bss.page_aligned, "aw", @nobits | 421 | __PAGE_ALIGNED_BSS |
422 | .align PAGE_SIZE | 422 | .align PAGE_SIZE |
423 | ENTRY(empty_zero_page) | 423 | ENTRY(empty_zero_page) |
424 | .skip PAGE_SIZE | 424 | .skip PAGE_SIZE |
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc1..3a54dcb9cd0 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c | |||
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); | |||
20 | * way process stacks are handled. This is done by having a special | 20 | * way process stacks are handled. This is done by having a special |
21 | * "init_task" linker map entry.. | 21 | * "init_task" linker map entry.. |
22 | */ | 22 | */ |
23 | union thread_union init_thread_union | 23 | union thread_union init_thread_union __init_task_data = |
24 | __attribute__((__section__(".data.init_task"))) = | 24 | { INIT_THREAD_INFO(init_task) }; |
25 | { INIT_THREAD_INFO(init_task) }; | ||
26 | 25 | ||
27 | /* | 26 | /* |
28 | * Initial task structure. | 27 | * Initial task structure. |
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..74656d1d4e3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) | |||
104 | seq_printf(p, " Threshold APIC interrupts\n"); | 104 | seq_printf(p, " Threshold APIC interrupts\n"); |
105 | # endif | 105 | # endif |
106 | #endif | 106 | #endif |
107 | #ifdef CONFIG_X86_NEW_MCE | 107 | #ifdef CONFIG_X86_MCE |
108 | seq_printf(p, "%*s: ", prec, "MCE"); | 108 | seq_printf(p, "%*s: ", prec, "MCE"); |
109 | for_each_online_cpu(j) | 109 | for_each_online_cpu(j) |
110 | seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); | 110 | seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); |
@@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) | |||
200 | sum += irq_stats(cpu)->irq_threshold_count; | 200 | sum += irq_stats(cpu)->irq_threshold_count; |
201 | # endif | 201 | # endif |
202 | #endif | 202 | #endif |
203 | #ifdef CONFIG_X86_NEW_MCE | 203 | #ifdef CONFIG_X86_MCE |
204 | sum += per_cpu(mce_exception_count, cpu); | 204 | sum += per_cpu(mce_exception_count, cpu); |
205 | sum += per_cpu(mce_poll_count, cpu); | 205 | sum += per_cpu(mce_poll_count, cpu); |
206 | #endif | 206 | #endif |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3b09634a515..7d35d0fe232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) | |||
218 | void fixup_irqs(void) | 218 | void fixup_irqs(void) |
219 | { | 219 | { |
220 | unsigned int irq; | 220 | unsigned int irq; |
221 | static int warned; | ||
222 | struct irq_desc *desc; | 221 | struct irq_desc *desc; |
223 | 222 | ||
224 | for_each_irq_desc(irq, desc) { | 223 | for_each_irq_desc(irq, desc) { |
@@ -236,8 +235,8 @@ void fixup_irqs(void) | |||
236 | } | 235 | } |
237 | if (desc->chip->set_affinity) | 236 | if (desc->chip->set_affinity) |
238 | desc->chip->set_affinity(irq, affinity); | 237 | desc->chip->set_affinity(irq, affinity); |
239 | else if (desc->action && !(warned++)) | 238 | else if (desc->action) |
240 | printk("Cannot set affinity for irq %i\n", irq); | 239 | printk_once("Cannot set affinity for irq %i\n", irq); |
241 | } | 240 | } |
242 | 241 | ||
243 | #if 0 | 242 | #if 0 |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..40f30773fb2 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) | |||
116 | return 0; | 116 | return 0; |
117 | } | 117 | } |
118 | 118 | ||
119 | static void __init init_ISA_irqs(void) | 119 | void __init init_ISA_irqs(void) |
120 | { | 120 | { |
121 | int i; | 121 | int i; |
122 | 122 | ||
@@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void) | |||
140 | } | 140 | } |
141 | } | 141 | } |
142 | 142 | ||
143 | /* Overridden in paravirt.c */ | 143 | void __init init_IRQ(void) |
144 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | 144 | { |
145 | x86_init.irqs.intr_init(); | ||
146 | } | ||
145 | 147 | ||
146 | static void __init smp_intr_init(void) | 148 | static void __init smp_intr_init(void) |
147 | { | 149 | { |
@@ -190,7 +192,7 @@ static void __init apic_intr_init(void) | |||
190 | #ifdef CONFIG_X86_MCE_THRESHOLD | 192 | #ifdef CONFIG_X86_MCE_THRESHOLD |
191 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | 193 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); |
192 | #endif | 194 | #endif |
193 | #if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) | 195 | #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) |
194 | alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); | 196 | alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); |
195 | #endif | 197 | #endif |
196 | 198 | ||
@@ -206,39 +208,19 @@ static void __init apic_intr_init(void) | |||
206 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 208 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
207 | 209 | ||
208 | /* Performance monitoring interrupts: */ | 210 | /* Performance monitoring interrupts: */ |
209 | # ifdef CONFIG_PERF_COUNTERS | 211 | # ifdef CONFIG_PERF_EVENTS |
210 | alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); | 212 | alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); |
211 | # endif | 213 | # endif |
212 | 214 | ||
213 | #endif | 215 | #endif |
214 | } | 216 | } |
215 | 217 | ||
216 | /** | ||
217 | * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors | ||
218 | * | ||
219 | * Description: | ||
220 | * Perform any necessary interrupt initialisation prior to setting up | ||
221 | * the "ordinary" interrupt call gates. For legacy reasons, the ISA | ||
222 | * interrupts should be initialised here if the machine emulates a PC | ||
223 | * in any way. | ||
224 | **/ | ||
225 | static void __init x86_quirk_pre_intr_init(void) | ||
226 | { | ||
227 | #ifdef CONFIG_X86_32 | ||
228 | if (x86_quirks->arch_pre_intr_init) { | ||
229 | if (x86_quirks->arch_pre_intr_init()) | ||
230 | return; | ||
231 | } | ||
232 | #endif | ||
233 | init_ISA_irqs(); | ||
234 | } | ||
235 | |||
236 | void __init native_init_IRQ(void) | 218 | void __init native_init_IRQ(void) |
237 | { | 219 | { |
238 | int i; | 220 | int i; |
239 | 221 | ||
240 | /* Execute any quirks before the call gates are initialised: */ | 222 | /* Execute any quirks before the call gates are initialised: */ |
241 | x86_quirk_pre_intr_init(); | 223 | x86_init.irqs.pre_vector_init(); |
242 | 224 | ||
243 | apic_intr_init(); | 225 | apic_intr_init(); |
244 | 226 | ||
@@ -258,12 +240,6 @@ void __init native_init_IRQ(void) | |||
258 | 240 | ||
259 | #ifdef CONFIG_X86_32 | 241 | #ifdef CONFIG_X86_32 |
260 | /* | 242 | /* |
261 | * Call quirks after call gates are initialised (usually add in | ||
262 | * the architecture specific gates): | ||
263 | */ | ||
264 | x86_quirk_intr_init(); | ||
265 | |||
266 | /* | ||
267 | * External FPU? Set up irq13 if so, for | 243 | * External FPU? Set up irq13 if so, for |
268 | * original braindamaged IBM FERR coupling. | 244 | * original braindamaged IBM FERR coupling. |
269 | */ | 245 | */ |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f61..63b0ec8d3d4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -34,7 +34,6 @@ | |||
34 | struct kvm_para_state { | 34 | struct kvm_para_state { |
35 | u8 mmu_queue[MMU_QUEUE_SIZE]; | 35 | u8 mmu_queue[MMU_QUEUE_SIZE]; |
36 | int mmu_queue_len; | 36 | int mmu_queue_len; |
37 | enum paravirt_lazy_mode mode; | ||
38 | }; | 37 | }; |
39 | 38 | ||
40 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | 39 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); |
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) | |||
77 | { | 76 | { |
78 | struct kvm_para_state *state = kvm_para_state(); | 77 | struct kvm_para_state *state = kvm_para_state(); |
79 | 78 | ||
80 | if (state->mode != PARAVIRT_LAZY_MMU) { | 79 | if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { |
81 | kvm_mmu_op(buffer, len); | 80 | kvm_mmu_op(buffer, len); |
82 | return; | 81 | return; |
83 | } | 82 | } |
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) | |||
185 | 184 | ||
186 | static void kvm_enter_lazy_mmu(void) | 185 | static void kvm_enter_lazy_mmu(void) |
187 | { | 186 | { |
188 | struct kvm_para_state *state = kvm_para_state(); | ||
189 | |||
190 | paravirt_enter_lazy_mmu(); | 187 | paravirt_enter_lazy_mmu(); |
191 | state->mode = paravirt_get_lazy_mode(); | ||
192 | } | 188 | } |
193 | 189 | ||
194 | static void kvm_leave_lazy_mmu(void) | 190 | static void kvm_leave_lazy_mmu(void) |
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) | |||
197 | 193 | ||
198 | mmu_queue_flush(state); | 194 | mmu_queue_flush(state); |
199 | paravirt_leave_lazy_mmu(); | 195 | paravirt_leave_lazy_mmu(); |
200 | state->mode = paravirt_get_lazy_mode(); | ||
201 | } | 196 | } |
202 | 197 | ||
203 | static void __init paravirt_ops_setup(void) | 198 | static void __init paravirt_ops_setup(void) |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..feaeb0d3aa4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <asm/msr.h> | 22 | #include <asm/msr.h> |
23 | #include <asm/apic.h> | 23 | #include <asm/apic.h> |
24 | #include <linux/percpu.h> | 24 | #include <linux/percpu.h> |
25 | |||
26 | #include <asm/x86_init.h> | ||
25 | #include <asm/reboot.h> | 27 | #include <asm/reboot.h> |
26 | 28 | ||
27 | #define KVM_SCALE 22 | 29 | #define KVM_SCALE 22 |
@@ -50,8 +52,8 @@ static unsigned long kvm_get_wallclock(void) | |||
50 | struct timespec ts; | 52 | struct timespec ts; |
51 | int low, high; | 53 | int low, high; |
52 | 54 | ||
53 | low = (int)__pa(&wall_clock); | 55 | low = (int)__pa_symbol(&wall_clock); |
54 | high = ((u64)__pa(&wall_clock) >> 32); | 56 | high = ((u64)__pa_symbol(&wall_clock) >> 32); |
55 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | 57 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); |
56 | 58 | ||
57 | vcpu_time = &get_cpu_var(hv_clock); | 59 | vcpu_time = &get_cpu_var(hv_clock); |
@@ -182,12 +184,13 @@ void __init kvmclock_init(void) | |||
182 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { | 184 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { |
183 | if (kvm_register_clock("boot clock")) | 185 | if (kvm_register_clock("boot clock")) |
184 | return; | 186 | return; |
185 | pv_time_ops.get_wallclock = kvm_get_wallclock; | ||
186 | pv_time_ops.set_wallclock = kvm_set_wallclock; | ||
187 | pv_time_ops.sched_clock = kvm_clock_read; | 187 | pv_time_ops.sched_clock = kvm_clock_read; |
188 | pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; | 188 | x86_platform.calibrate_tsc = kvm_get_tsc_khz; |
189 | x86_platform.get_wallclock = kvm_get_wallclock; | ||
190 | x86_platform.set_wallclock = kvm_set_wallclock; | ||
189 | #ifdef CONFIG_X86_LOCAL_APIC | 191 | #ifdef CONFIG_X86_LOCAL_APIC |
190 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | 192 | x86_cpuinit.setup_percpu_clockev = |
193 | kvm_setup_secondary_clock; | ||
191 | #endif | 194 | #endif |
192 | #ifdef CONFIG_SMP | 195 | #ifdef CONFIG_SMP |
193 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; | 196 | smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; |
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 71f1d99a635..ec6ef60cbd1 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c | |||
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |||
67 | #ifdef CONFIG_SMP | 67 | #ifdef CONFIG_SMP |
68 | preempt_disable(); | 68 | preempt_disable(); |
69 | load_LDT(pc); | 69 | load_LDT(pc); |
70 | if (!cpus_equal(current->mm->cpu_vm_mask, | 70 | if (!cpumask_equal(mm_cpumask(current->mm), |
71 | cpumask_of_cpu(smp_processor_id()))) | 71 | cpumask_of(smp_processor_id()))) |
72 | smp_call_function(flush_ldt, current->mm, 1); | 72 | smp_call_function(flush_ldt, current->mm, 1); |
73 | preempt_enable(); | 73 | preempt_enable(); |
74 | #else | 74 | #else |
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9371448290a..378e9a8f1bf 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, | |||
210 | { | 210 | { |
211 | ssize_t ret = -EINVAL; | 211 | ssize_t ret = -EINVAL; |
212 | 212 | ||
213 | if ((len >> PAGE_SHIFT) > num_physpages) { | 213 | if ((len >> PAGE_SHIFT) > totalram_pages) { |
214 | pr_err("microcode: too much data (max %ld pages)\n", num_physpages); | 214 | pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); |
215 | return ret; | 215 | return ret; |
216 | } | 216 | } |
217 | 217 | ||
@@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = { | |||
236 | static struct miscdevice microcode_dev = { | 236 | static struct miscdevice microcode_dev = { |
237 | .minor = MICROCODE_MINOR, | 237 | .minor = MICROCODE_MINOR, |
238 | .name = "microcode", | 238 | .name = "microcode", |
239 | .devnode = "cpu/microcode", | 239 | .nodename = "cpu/microcode", |
240 | .fops = &microcode_fops, | 240 | .fops = &microcode_fops, |
241 | }; | 241 | }; |
242 | 242 | ||
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..5be95ef4ffe 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len) | |||
45 | return sum & 0xFF; | 45 | return sum & 0xFF; |
46 | } | 46 | } |
47 | 47 | ||
48 | int __init default_mpc_apic_id(struct mpc_cpu *m) | ||
49 | { | ||
50 | return m->apicid; | ||
51 | } | ||
52 | |||
48 | static void __init MP_processor_info(struct mpc_cpu *m) | 53 | static void __init MP_processor_info(struct mpc_cpu *m) |
49 | { | 54 | { |
50 | int apicid; | 55 | int apicid; |
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) | |||
55 | return; | 60 | return; |
56 | } | 61 | } |
57 | 62 | ||
58 | if (x86_quirks->mpc_apic_id) | 63 | apicid = x86_init.mpparse.mpc_apic_id(m); |
59 | apicid = x86_quirks->mpc_apic_id(m); | ||
60 | else | ||
61 | apicid = m->apicid; | ||
62 | 64 | ||
63 | if (m->cpuflag & CPU_BOOTPROCESSOR) { | 65 | if (m->cpuflag & CPU_BOOTPROCESSOR) { |
64 | bootup_cpu = " (Bootup-CPU)"; | 66 | bootup_cpu = " (Bootup-CPU)"; |
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m) | |||
70 | } | 72 | } |
71 | 73 | ||
72 | #ifdef CONFIG_X86_IO_APIC | 74 | #ifdef CONFIG_X86_IO_APIC |
73 | static void __init MP_bus_info(struct mpc_bus *m) | 75 | void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) |
74 | { | 76 | { |
75 | char str[7]; | ||
76 | memcpy(str, m->bustype, 6); | 77 | memcpy(str, m->bustype, 6); |
77 | str[6] = 0; | 78 | str[6] = 0; |
79 | apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); | ||
80 | } | ||
78 | 81 | ||
79 | if (x86_quirks->mpc_oem_bus_info) | 82 | static void __init MP_bus_info(struct mpc_bus *m) |
80 | x86_quirks->mpc_oem_bus_info(m, str); | 83 | { |
81 | else | 84 | char str[7]; |
82 | apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); | 85 | |
86 | x86_init.mpparse.mpc_oem_bus_info(m, str); | ||
83 | 87 | ||
84 | #if MAX_MP_BUSSES < 256 | 88 | #if MAX_MP_BUSSES < 256 |
85 | if (m->busid >= MAX_MP_BUSSES) { | 89 | if (m->busid >= MAX_MP_BUSSES) { |
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m) | |||
96 | mp_bus_id_to_type[m->busid] = MP_BUS_ISA; | 100 | mp_bus_id_to_type[m->busid] = MP_BUS_ISA; |
97 | #endif | 101 | #endif |
98 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { | 102 | } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { |
99 | if (x86_quirks->mpc_oem_pci_bus) | 103 | if (x86_init.mpparse.mpc_oem_pci_bus) |
100 | x86_quirks->mpc_oem_pci_bus(m); | 104 | x86_init.mpparse.mpc_oem_pci_bus(m); |
101 | 105 | ||
102 | clear_bit(m->busid, mp_bus_not_pci); | 106 | clear_bit(m->busid, mp_bus_not_pci); |
103 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | 107 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) |
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) | |||
291 | 1, mpc, mpc->length, 1); | 295 | 1, mpc, mpc->length, 1); |
292 | } | 296 | } |
293 | 297 | ||
298 | void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } | ||
299 | |||
294 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | 300 | static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) |
295 | { | 301 | { |
296 | char str[16]; | 302 | char str[16]; |
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | |||
312 | if (early) | 318 | if (early) |
313 | return 1; | 319 | return 1; |
314 | 320 | ||
315 | if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { | 321 | if (mpc->oemptr) |
316 | struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; | 322 | x86_init.mpparse.smp_read_mpc_oem(mpc); |
317 | x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); | ||
318 | } | ||
319 | 323 | ||
320 | /* | 324 | /* |
321 | * Now process the configuration blocks. | 325 | * Now process the configuration blocks. |
322 | */ | 326 | */ |
323 | if (x86_quirks->mpc_record) | 327 | x86_init.mpparse.mpc_record(0); |
324 | *x86_quirks->mpc_record = 0; | ||
325 | 328 | ||
326 | while (count < mpc->length) { | 329 | while (count < mpc->length) { |
327 | switch (*mpt) { | 330 | switch (*mpt) { |
@@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) | |||
353 | count = mpc->length; | 356 | count = mpc->length; |
354 | break; | 357 | break; |
355 | } | 358 | } |
356 | if (x86_quirks->mpc_record) | 359 | x86_init.mpparse.mpc_record(1); |
357 | (*x86_quirks->mpc_record)++; | ||
358 | } | 360 | } |
359 | 361 | ||
360 | #ifdef CONFIG_X86_BIGSMP | 362 | #ifdef CONFIG_X86_BIGSMP |
@@ -482,11 +484,11 @@ static void __init construct_ioapic_table(int mpc_default_type) | |||
482 | MP_bus_info(&bus); | 484 | MP_bus_info(&bus); |
483 | } | 485 | } |
484 | 486 | ||
485 | ioapic.type = MP_IOAPIC; | 487 | ioapic.type = MP_IOAPIC; |
486 | ioapic.apicid = 2; | 488 | ioapic.apicid = 2; |
487 | ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; | 489 | ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; |
488 | ioapic.flags = MPC_APIC_USABLE; | 490 | ioapic.flags = MPC_APIC_USABLE; |
489 | ioapic.apicaddr = 0xFEC00000; | 491 | ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; |
490 | MP_ioapic_info(&ioapic); | 492 | MP_ioapic_info(&ioapic); |
491 | 493 | ||
492 | /* | 494 | /* |
@@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) | |||
608 | /* | 610 | /* |
609 | * Scan the memory blocks for an SMP configuration block. | 611 | * Scan the memory blocks for an SMP configuration block. |
610 | */ | 612 | */ |
611 | static void __init __get_smp_config(unsigned int early) | 613 | void __init default_get_smp_config(unsigned int early) |
612 | { | 614 | { |
613 | struct mpf_intel *mpf = mpf_found; | 615 | struct mpf_intel *mpf = mpf_found; |
614 | 616 | ||
@@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) | |||
625 | if (acpi_lapic && acpi_ioapic) | 627 | if (acpi_lapic && acpi_ioapic) |
626 | return; | 628 | return; |
627 | 629 | ||
628 | if (x86_quirks->mach_get_smp_config) { | ||
629 | if (x86_quirks->mach_get_smp_config(early)) | ||
630 | return; | ||
631 | } | ||
632 | |||
633 | printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", | 630 | printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", |
634 | mpf->specification); | 631 | mpf->specification); |
635 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) | 632 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) |
@@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) | |||
670 | */ | 667 | */ |
671 | } | 668 | } |
672 | 669 | ||
673 | void __init early_get_smp_config(void) | ||
674 | { | ||
675 | __get_smp_config(1); | ||
676 | } | ||
677 | |||
678 | void __init get_smp_config(void) | ||
679 | { | ||
680 | __get_smp_config(0); | ||
681 | } | ||
682 | |||
683 | static void __init smp_reserve_bootmem(struct mpf_intel *mpf) | 670 | static void __init smp_reserve_bootmem(struct mpf_intel *mpf) |
684 | { | 671 | { |
685 | unsigned long size = get_mpc_size(mpf->physptr); | 672 | unsigned long size = get_mpc_size(mpf->physptr); |
@@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, | |||
745 | return 0; | 732 | return 0; |
746 | } | 733 | } |
747 | 734 | ||
748 | static void __init __find_smp_config(unsigned int reserve) | 735 | void __init default_find_smp_config(unsigned int reserve) |
749 | { | 736 | { |
750 | unsigned int address; | 737 | unsigned int address; |
751 | 738 | ||
752 | if (x86_quirks->mach_find_smp_config) { | ||
753 | if (x86_quirks->mach_find_smp_config(reserve)) | ||
754 | return; | ||
755 | } | ||
756 | /* | 739 | /* |
757 | * FIXME: Linux assumes you have 640K of base ram.. | 740 | * FIXME: Linux assumes you have 640K of base ram.. |
758 | * this continues the error... | 741 | * this continues the error... |
@@ -787,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) | |||
787 | smp_scan_config(address, 0x400, reserve); | 770 | smp_scan_config(address, 0x400, reserve); |
788 | } | 771 | } |
789 | 772 | ||
790 | void __init early_find_smp_config(void) | ||
791 | { | ||
792 | __find_smp_config(0); | ||
793 | } | ||
794 | |||
795 | void __init find_smp_config(void) | ||
796 | { | ||
797 | __find_smp_config(1); | ||
798 | } | ||
799 | |||
800 | #ifdef CONFIG_X86_IO_APIC | 773 | #ifdef CONFIG_X86_IO_APIC |
801 | static u8 __initdata irq_used[MAX_IRQ_SOURCES]; | 774 | static u8 __initdata irq_used[MAX_IRQ_SOURCES]; |
802 | 775 | ||
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c | |||
@@ -0,0 +1,24 @@ | |||
1 | /* | ||
2 | * mrst.c: Intel Moorestown platform specific setup code | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Jacob Pan (jacob.jun.pan@intel.com) | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | #include <linux/init.h> | ||
13 | |||
14 | #include <asm/setup.h> | ||
15 | |||
16 | /* | ||
17 | * Moorestown specific x86_init function overrides and early setup | ||
18 | * calls. | ||
19 | */ | ||
20 | void __init x86_mrst_early_setup(void) | ||
21 | { | ||
22 | x86_init.resources.probe_roms = x86_init_noop; | ||
23 | x86_init.resources.reserve_resources = x86_init_noop; | ||
24 | } | ||
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 98fd6cd4e3a..6a3cefc7dda 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -1,6 +1,7 @@ | |||
1 | /* ----------------------------------------------------------------------- * | 1 | /* ----------------------------------------------------------------------- * |
2 | * | 2 | * |
3 | * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved | 3 | * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved |
4 | * Copyright 2009 Intel Corporation; author: H. Peter Anvin | ||
4 | * | 5 | * |
5 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf, | |||
80 | 81 | ||
81 | for (; count; count -= 8) { | 82 | for (; count; count -= 8) { |
82 | err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); | 83 | err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); |
83 | if (err) { | 84 | if (err) |
84 | if (err == -EFAULT) /* Fix idiotic error code */ | ||
85 | err = -EIO; | ||
86 | break; | 85 | break; |
87 | } | ||
88 | if (copy_to_user(tmp, &data, 8)) { | 86 | if (copy_to_user(tmp, &data, 8)) { |
89 | err = -EFAULT; | 87 | err = -EFAULT; |
90 | break; | 88 | break; |
@@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf, | |||
115 | break; | 113 | break; |
116 | } | 114 | } |
117 | err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); | 115 | err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); |
118 | if (err) { | 116 | if (err) |
119 | if (err == -EFAULT) /* Fix idiotic error code */ | ||
120 | err = -EIO; | ||
121 | break; | 117 | break; |
122 | } | ||
123 | tmp += 2; | 118 | tmp += 2; |
124 | bytes += 8; | 119 | bytes += 8; |
125 | } | 120 | } |
@@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf, | |||
127 | return bytes ? bytes : err; | 122 | return bytes ? bytes : err; |
128 | } | 123 | } |
129 | 124 | ||
125 | static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) | ||
126 | { | ||
127 | u32 __user *uregs = (u32 __user *)arg; | ||
128 | u32 regs[8]; | ||
129 | int cpu = iminor(file->f_path.dentry->d_inode); | ||
130 | int err; | ||
131 | |||
132 | switch (ioc) { | ||
133 | case X86_IOC_RDMSR_REGS: | ||
134 | if (!(file->f_mode & FMODE_READ)) { | ||
135 | err = -EBADF; | ||
136 | break; | ||
137 | } | ||
138 | if (copy_from_user(&regs, uregs, sizeof regs)) { | ||
139 | err = -EFAULT; | ||
140 | break; | ||
141 | } | ||
142 | err = rdmsr_safe_regs_on_cpu(cpu, regs); | ||
143 | if (err) | ||
144 | break; | ||
145 | if (copy_to_user(uregs, &regs, sizeof regs)) | ||
146 | err = -EFAULT; | ||
147 | break; | ||
148 | |||
149 | case X86_IOC_WRMSR_REGS: | ||
150 | if (!(file->f_mode & FMODE_WRITE)) { | ||
151 | err = -EBADF; | ||
152 | break; | ||
153 | } | ||
154 | if (copy_from_user(&regs, uregs, sizeof regs)) { | ||
155 | err = -EFAULT; | ||
156 | break; | ||
157 | } | ||
158 | err = wrmsr_safe_regs_on_cpu(cpu, regs); | ||
159 | if (err) | ||
160 | break; | ||
161 | if (copy_to_user(uregs, &regs, sizeof regs)) | ||
162 | err = -EFAULT; | ||
163 | break; | ||
164 | |||
165 | default: | ||
166 | err = -ENOTTY; | ||
167 | break; | ||
168 | } | ||
169 | |||
170 | return err; | ||
171 | } | ||
172 | |||
130 | static int msr_open(struct inode *inode, struct file *file) | 173 | static int msr_open(struct inode *inode, struct file *file) |
131 | { | 174 | { |
132 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); | 175 | unsigned int cpu = iminor(file->f_path.dentry->d_inode); |
@@ -157,6 +200,8 @@ static const struct file_operations msr_fops = { | |||
157 | .read = msr_read, | 200 | .read = msr_read, |
158 | .write = msr_write, | 201 | .write = msr_write, |
159 | .open = msr_open, | 202 | .open = msr_open, |
203 | .unlocked_ioctl = msr_ioctl, | ||
204 | .compat_ioctl = msr_ioctl, | ||
160 | }; | 205 | }; |
161 | 206 | ||
162 | static int __cpuinit msr_device_create(int cpu) | 207 | static int __cpuinit msr_device_create(int cpu) |
@@ -196,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { | |||
196 | .notifier_call = msr_class_cpu_callback, | 241 | .notifier_call = msr_class_cpu_callback, |
197 | }; | 242 | }; |
198 | 243 | ||
199 | static char *msr_nodename(struct device *dev) | 244 | static char *msr_devnode(struct device *dev, mode_t *mode) |
200 | { | 245 | { |
201 | return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); | 246 | return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); |
202 | } | 247 | } |
@@ -217,7 +262,7 @@ static int __init msr_init(void) | |||
217 | err = PTR_ERR(msr_class); | 262 | err = PTR_ERR(msr_class); |
218 | goto out_chrdev; | 263 | goto out_chrdev; |
219 | } | 264 | } |
220 | msr_class->nodename = msr_nodename; | 265 | msr_class->devnode = msr_devnode; |
221 | for_each_online_cpu(i) { | 266 | for_each_online_cpu(i) { |
222 | err = msr_device_create(i); | 267 | err = msr_device_create(i); |
223 | if (err != 0) | 268 | if (err != 0) |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..1b1739d1631 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x) | |||
54 | return x; | 54 | return x; |
55 | } | 55 | } |
56 | 56 | ||
57 | static void __init default_banner(void) | 57 | void __init default_banner(void) |
58 | { | 58 | { |
59 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", | 59 | printk(KERN_INFO "Booting paravirtualized kernel on %s\n", |
60 | pv_info.name); | 60 | pv_info.name); |
61 | } | 61 | } |
62 | 62 | ||
63 | char *memory_setup(void) | ||
64 | { | ||
65 | return pv_init_ops.memory_setup(); | ||
66 | } | ||
67 | |||
68 | /* Simple instruction patching code. */ | 63 | /* Simple instruction patching code. */ |
69 | #define DEF_NATIVE(ops, name, code) \ | 64 | #define DEF_NATIVE(ops, name, code) \ |
70 | extern const char start_##ops##_##name[], end_##ops##_##name[]; \ | 65 | extern const char start_##ops##_##name[], end_##ops##_##name[]; \ |
@@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, | |||
188 | return insn_len; | 183 | return insn_len; |
189 | } | 184 | } |
190 | 185 | ||
191 | void init_IRQ(void) | ||
192 | { | ||
193 | pv_irq_ops.init_IRQ(); | ||
194 | } | ||
195 | |||
196 | static void native_flush_tlb(void) | 186 | static void native_flush_tlb(void) |
197 | { | 187 | { |
198 | __native_flush_tlb(); | 188 | __native_flush_tlb(); |
@@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void); | |||
218 | extern void native_usergs_sysret32(void); | 208 | extern void native_usergs_sysret32(void); |
219 | extern void native_usergs_sysret64(void); | 209 | extern void native_usergs_sysret64(void); |
220 | 210 | ||
221 | static int __init print_banner(void) | ||
222 | { | ||
223 | pv_init_ops.banner(); | ||
224 | return 0; | ||
225 | } | ||
226 | core_initcall(print_banner); | ||
227 | |||
228 | static struct resource reserve_ioports = { | 211 | static struct resource reserve_ioports = { |
229 | .start = 0, | 212 | .start = 0, |
230 | .end = IO_SPACE_LIMIT, | 213 | .end = IO_SPACE_LIMIT, |
@@ -320,21 +303,13 @@ struct pv_info pv_info = { | |||
320 | 303 | ||
321 | struct pv_init_ops pv_init_ops = { | 304 | struct pv_init_ops pv_init_ops = { |
322 | .patch = native_patch, | 305 | .patch = native_patch, |
323 | .banner = default_banner, | ||
324 | .arch_setup = paravirt_nop, | ||
325 | .memory_setup = machine_specific_memory_setup, | ||
326 | }; | 306 | }; |
327 | 307 | ||
328 | struct pv_time_ops pv_time_ops = { | 308 | struct pv_time_ops pv_time_ops = { |
329 | .time_init = hpet_time_init, | ||
330 | .get_wallclock = native_get_wallclock, | ||
331 | .set_wallclock = native_set_wallclock, | ||
332 | .sched_clock = native_sched_clock, | 309 | .sched_clock = native_sched_clock, |
333 | .get_tsc_khz = native_calibrate_tsc, | ||
334 | }; | 310 | }; |
335 | 311 | ||
336 | struct pv_irq_ops pv_irq_ops = { | 312 | struct pv_irq_ops pv_irq_ops = { |
337 | .init_IRQ = native_init_IRQ, | ||
338 | .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), | 313 | .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), |
339 | .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), | 314 | .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), |
340 | .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), | 315 | .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), |
@@ -362,8 +337,9 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
362 | #endif | 337 | #endif |
363 | .wbinvd = native_wbinvd, | 338 | .wbinvd = native_wbinvd, |
364 | .read_msr = native_read_msr_safe, | 339 | .read_msr = native_read_msr_safe, |
365 | .read_msr_amd = native_read_msr_amd_safe, | 340 | .rdmsr_regs = native_rdmsr_safe_regs, |
366 | .write_msr = native_write_msr_safe, | 341 | .write_msr = native_write_msr_safe, |
342 | .wrmsr_regs = native_wrmsr_safe_regs, | ||
367 | .read_tsc = native_read_tsc, | 343 | .read_tsc = native_read_tsc, |
368 | .read_pmc = native_read_pmc, | 344 | .read_pmc = native_read_pmc, |
369 | .read_tscp = native_read_tscp, | 345 | .read_tscp = native_read_tscp, |
@@ -408,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
408 | 384 | ||
409 | struct pv_apic_ops pv_apic_ops = { | 385 | struct pv_apic_ops pv_apic_ops = { |
410 | #ifdef CONFIG_X86_LOCAL_APIC | 386 | #ifdef CONFIG_X86_LOCAL_APIC |
411 | .setup_boot_clock = setup_boot_APIC_clock, | ||
412 | .setup_secondary_clock = setup_secondary_APIC_clock, | ||
413 | .startup_ipi_hook = paravirt_nop, | 387 | .startup_ipi_hook = paravirt_nop, |
414 | #endif | 388 | #endif |
415 | }; | 389 | }; |
@@ -423,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = { | |||
423 | #endif | 397 | #endif |
424 | 398 | ||
425 | struct pv_mmu_ops pv_mmu_ops = { | 399 | struct pv_mmu_ops pv_mmu_ops = { |
426 | #ifndef CONFIG_X86_64 | ||
427 | .pagetable_setup_start = native_pagetable_setup_start, | ||
428 | .pagetable_setup_done = native_pagetable_setup_done, | ||
429 | #else | ||
430 | .pagetable_setup_start = paravirt_nop, | ||
431 | .pagetable_setup_done = paravirt_nop, | ||
432 | #endif | ||
433 | 400 | ||
434 | .read_cr2 = native_read_cr2, | 401 | .read_cr2 = native_read_cr2, |
435 | .write_cr2 = native_write_cr2, | 402 | .write_cr2 = native_write_cr2, |
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..64b838eac18 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -3,6 +3,7 @@ | |||
3 | #include <linux/dmar.h> | 3 | #include <linux/dmar.h> |
4 | #include <linux/bootmem.h> | 4 | #include <linux/bootmem.h> |
5 | #include <linux/pci.h> | 5 | #include <linux/pci.h> |
6 | #include <linux/kmemleak.h> | ||
6 | 7 | ||
7 | #include <asm/proto.h> | 8 | #include <asm/proto.h> |
8 | #include <asm/dma.h> | 9 | #include <asm/dma.h> |
@@ -32,7 +33,14 @@ int no_iommu __read_mostly; | |||
32 | /* Set this to 1 if there is a HW IOMMU in the system */ | 33 | /* Set this to 1 if there is a HW IOMMU in the system */ |
33 | int iommu_detected __read_mostly = 0; | 34 | int iommu_detected __read_mostly = 0; |
34 | 35 | ||
35 | int iommu_pass_through; | 36 | /* |
37 | * This variable becomes 1 if iommu=pt is passed on the kernel command line. | ||
38 | * If this variable is 1, IOMMU implementations do no DMA ranslation for | ||
39 | * devices and allow every device to access to whole physical memory. This is | ||
40 | * useful if a user want to use an IOMMU only for KVM device assignment to | ||
41 | * guests and not for driver dma translation. | ||
42 | */ | ||
43 | int iommu_pass_through __read_mostly; | ||
36 | 44 | ||
37 | dma_addr_t bad_dma_address __read_mostly = 0; | 45 | dma_addr_t bad_dma_address __read_mostly = 0; |
38 | EXPORT_SYMBOL(bad_dma_address); | 46 | EXPORT_SYMBOL(bad_dma_address); |
@@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void) | |||
88 | size = roundup(dma32_bootmem_size, align); | 96 | size = roundup(dma32_bootmem_size, align); |
89 | dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, | 97 | dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, |
90 | 512ULL<<20); | 98 | 512ULL<<20); |
99 | /* | ||
100 | * Kmemleak should not scan this block as it may not be mapped via the | ||
101 | * kernel direct mapping. | ||
102 | */ | ||
103 | kmemleak_ignore(dma32_bootmem_ptr); | ||
91 | if (dma32_bootmem_ptr) | 104 | if (dma32_bootmem_ptr) |
92 | dma32_bootmem_size = size; | 105 | dma32_bootmem_size = size; |
93 | else | 106 | else |
@@ -147,7 +160,7 @@ again: | |||
147 | return NULL; | 160 | return NULL; |
148 | 161 | ||
149 | addr = page_to_phys(page); | 162 | addr = page_to_phys(page); |
150 | if (!is_buffer_dma_capable(dma_mask, addr, size)) { | 163 | if (addr + size > dma_mask) { |
151 | __free_pages(page, get_order(size)); | 164 | __free_pages(page, get_order(size)); |
152 | 165 | ||
153 | if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { | 166 | if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { |
@@ -212,10 +225,8 @@ static __init int iommu_setup(char *p) | |||
212 | if (!strncmp(p, "soft", 4)) | 225 | if (!strncmp(p, "soft", 4)) |
213 | swiotlb = 1; | 226 | swiotlb = 1; |
214 | #endif | 227 | #endif |
215 | if (!strncmp(p, "pt", 2)) { | 228 | if (!strncmp(p, "pt", 2)) |
216 | iommu_pass_through = 1; | 229 | iommu_pass_through = 1; |
217 | return 1; | ||
218 | } | ||
219 | 230 | ||
220 | gart_parse_options(p); | 231 | gart_parse_options(p); |
221 | 232 | ||
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d2e56b8f48e..98a827ee9ed 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) | |||
190 | static inline int | 190 | static inline int |
191 | need_iommu(struct device *dev, unsigned long addr, size_t size) | 191 | need_iommu(struct device *dev, unsigned long addr, size_t size) |
192 | { | 192 | { |
193 | return force_iommu || | 193 | return force_iommu || !dma_capable(dev, addr, size); |
194 | !is_buffer_dma_capable(*dev->dma_mask, addr, size); | ||
195 | } | 194 | } |
196 | 195 | ||
197 | static inline int | 196 | static inline int |
198 | nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | 197 | nonforced_iommu(struct device *dev, unsigned long addr, size_t size) |
199 | { | 198 | { |
200 | return !is_buffer_dma_capable(*dev->dma_mask, addr, size); | 199 | return !dma_capable(dev, addr, size); |
201 | } | 200 | } |
202 | 201 | ||
203 | /* Map a single continuous physical area into the IOMMU. | 202 | /* Map a single continuous physical area into the IOMMU. |
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a09f3..a3933d4330c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c | |||
@@ -14,7 +14,7 @@ | |||
14 | static int | 14 | static int |
15 | check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) | 15 | check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) |
16 | { | 16 | { |
17 | if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { | 17 | if (hwdev && !dma_capable(hwdev, bus, size)) { |
18 | if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) | 18 | if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) |
19 | printk(KERN_ERR | 19 | printk(KERN_ERR |
20 | "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", | 20 | "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", |
@@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, | |||
79 | free_pages((unsigned long)vaddr, get_order(size)); | 79 | free_pages((unsigned long)vaddr, get_order(size)); |
80 | } | 80 | } |
81 | 81 | ||
82 | static void nommu_sync_single_for_device(struct device *dev, | ||
83 | dma_addr_t addr, size_t size, | ||
84 | enum dma_data_direction dir) | ||
85 | { | ||
86 | flush_write_buffers(); | ||
87 | } | ||
88 | |||
89 | |||
90 | static void nommu_sync_sg_for_device(struct device *dev, | ||
91 | struct scatterlist *sg, int nelems, | ||
92 | enum dma_data_direction dir) | ||
93 | { | ||
94 | flush_write_buffers(); | ||
95 | } | ||
96 | |||
82 | struct dma_map_ops nommu_dma_ops = { | 97 | struct dma_map_ops nommu_dma_ops = { |
83 | .alloc_coherent = dma_generic_alloc_coherent, | 98 | .alloc_coherent = dma_generic_alloc_coherent, |
84 | .free_coherent = nommu_free_coherent, | 99 | .free_coherent = nommu_free_coherent, |
85 | .map_sg = nommu_map_sg, | 100 | .map_sg = nommu_map_sg, |
86 | .map_page = nommu_map_page, | 101 | .map_page = nommu_map_page, |
87 | .is_phys = 1, | 102 | .sync_single_for_device = nommu_sync_single_for_device, |
103 | .sync_sg_for_device = nommu_sync_sg_for_device, | ||
104 | .is_phys = 1, | ||
88 | }; | 105 | }; |
89 | 106 | ||
90 | void __init no_iommu_init(void) | 107 | void __init no_iommu_init(void) |
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee4420..aaa6b7839f1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c | |||
@@ -13,31 +13,6 @@ | |||
13 | 13 | ||
14 | int swiotlb __read_mostly; | 14 | int swiotlb __read_mostly; |
15 | 15 | ||
16 | void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) | ||
17 | { | ||
18 | return alloc_bootmem_low_pages(size); | ||
19 | } | ||
20 | |||
21 | void *swiotlb_alloc(unsigned order, unsigned long nslabs) | ||
22 | { | ||
23 | return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); | ||
24 | } | ||
25 | |||
26 | dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) | ||
27 | { | ||
28 | return paddr; | ||
29 | } | ||
30 | |||
31 | phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) | ||
32 | { | ||
33 | return baddr; | ||
34 | } | ||
35 | |||
36 | int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) | ||
37 | { | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, | 16 | static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, |
42 | dma_addr_t *dma_handle, gfp_t flags) | 17 | dma_addr_t *dma_handle, gfp_t flags) |
43 | { | 18 | { |
@@ -71,9 +46,8 @@ void __init pci_swiotlb_init(void) | |||
71 | { | 46 | { |
72 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ | 47 | /* don't initialize swiotlb if iommu=off (no_iommu=1) */ |
73 | #ifdef CONFIG_X86_64 | 48 | #ifdef CONFIG_X86_64 |
74 | if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || | 49 | if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) |
75 | iommu_pass_through) | 50 | swiotlb = 1; |
76 | swiotlb = 1; | ||
77 | #endif | 51 | #endif |
78 | if (swiotlb_force) | 52 | if (swiotlb_force) |
79 | swiotlb = 1; | 53 | swiotlb = 1; |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994dd6a4a2a..5284cd2b577 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -9,7 +9,7 @@ | |||
9 | #include <linux/pm.h> | 9 | #include <linux/pm.h> |
10 | #include <linux/clockchips.h> | 10 | #include <linux/clockchips.h> |
11 | #include <linux/random.h> | 11 | #include <linux/random.h> |
12 | #include <trace/power.h> | 12 | #include <trace/events/power.h> |
13 | #include <asm/system.h> | 13 | #include <asm/system.h> |
14 | #include <asm/apic.h> | 14 | #include <asm/apic.h> |
15 | #include <asm/syscalls.h> | 15 | #include <asm/syscalls.h> |
@@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait); | |||
25 | 25 | ||
26 | struct kmem_cache *task_xstate_cachep; | 26 | struct kmem_cache *task_xstate_cachep; |
27 | 27 | ||
28 | DEFINE_TRACE(power_start); | ||
29 | DEFINE_TRACE(power_end); | ||
30 | |||
31 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 28 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
32 | { | 29 | { |
33 | *dst = *src; | 30 | *dst = *src; |
@@ -299,9 +296,7 @@ static inline int hlt_use_halt(void) | |||
299 | void default_idle(void) | 296 | void default_idle(void) |
300 | { | 297 | { |
301 | if (hlt_use_halt()) { | 298 | if (hlt_use_halt()) { |
302 | struct power_trace it; | 299 | trace_power_start(POWER_CSTATE, 1); |
303 | |||
304 | trace_power_start(&it, POWER_CSTATE, 1); | ||
305 | current_thread_info()->status &= ~TS_POLLING; | 300 | current_thread_info()->status &= ~TS_POLLING; |
306 | /* | 301 | /* |
307 | * TS_POLLING-cleared state must be visible before we | 302 | * TS_POLLING-cleared state must be visible before we |
@@ -314,7 +309,6 @@ void default_idle(void) | |||
314 | else | 309 | else |
315 | local_irq_enable(); | 310 | local_irq_enable(); |
316 | current_thread_info()->status |= TS_POLLING; | 311 | current_thread_info()->status |= TS_POLLING; |
317 | trace_power_end(&it); | ||
318 | } else { | 312 | } else { |
319 | local_irq_enable(); | 313 | local_irq_enable(); |
320 | /* loop is done by the caller */ | 314 | /* loop is done by the caller */ |
@@ -372,9 +366,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); | |||
372 | */ | 366 | */ |
373 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | 367 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) |
374 | { | 368 | { |
375 | struct power_trace it; | 369 | trace_power_start(POWER_CSTATE, (ax>>4)+1); |
376 | |||
377 | trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); | ||
378 | if (!need_resched()) { | 370 | if (!need_resched()) { |
379 | if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 371 | if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) |
380 | clflush((void *)&current_thread_info()->flags); | 372 | clflush((void *)&current_thread_info()->flags); |
@@ -384,15 +376,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | |||
384 | if (!need_resched()) | 376 | if (!need_resched()) |
385 | __mwait(ax, cx); | 377 | __mwait(ax, cx); |
386 | } | 378 | } |
387 | trace_power_end(&it); | ||
388 | } | 379 | } |
389 | 380 | ||
390 | /* Default MONITOR/MWAIT with no hints, used for default C1 state */ | 381 | /* Default MONITOR/MWAIT with no hints, used for default C1 state */ |
391 | static void mwait_idle(void) | 382 | static void mwait_idle(void) |
392 | { | 383 | { |
393 | struct power_trace it; | ||
394 | if (!need_resched()) { | 384 | if (!need_resched()) { |
395 | trace_power_start(&it, POWER_CSTATE, 1); | 385 | trace_power_start(POWER_CSTATE, 1); |
396 | if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) | 386 | if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) |
397 | clflush((void *)&current_thread_info()->flags); | 387 | clflush((void *)&current_thread_info()->flags); |
398 | 388 | ||
@@ -402,7 +392,6 @@ static void mwait_idle(void) | |||
402 | __sti_mwait(0, 0); | 392 | __sti_mwait(0, 0); |
403 | else | 393 | else |
404 | local_irq_enable(); | 394 | local_irq_enable(); |
405 | trace_power_end(&it); | ||
406 | } else | 395 | } else |
407 | local_irq_enable(); | 396 | local_irq_enable(); |
408 | } | 397 | } |
@@ -414,13 +403,11 @@ static void mwait_idle(void) | |||
414 | */ | 403 | */ |
415 | static void poll_idle(void) | 404 | static void poll_idle(void) |
416 | { | 405 | { |
417 | struct power_trace it; | 406 | trace_power_start(POWER_CSTATE, 0); |
418 | |||
419 | trace_power_start(&it, POWER_CSTATE, 0); | ||
420 | local_irq_enable(); | 407 | local_irq_enable(); |
421 | while (!need_resched()) | 408 | while (!need_resched()) |
422 | cpu_relax(); | 409 | cpu_relax(); |
423 | trace_power_end(&it); | 410 | trace_power_end(0); |
424 | } | 411 | } |
425 | 412 | ||
426 | /* | 413 | /* |
@@ -519,16 +506,12 @@ static void c1e_idle(void) | |||
519 | if (!cpumask_test_cpu(cpu, c1e_mask)) { | 506 | if (!cpumask_test_cpu(cpu, c1e_mask)) { |
520 | cpumask_set_cpu(cpu, c1e_mask); | 507 | cpumask_set_cpu(cpu, c1e_mask); |
521 | /* | 508 | /* |
522 | * Force broadcast so ACPI can not interfere. Needs | 509 | * Force broadcast so ACPI can not interfere. |
523 | * to run with interrupts enabled as it uses | ||
524 | * smp_function_call. | ||
525 | */ | 510 | */ |
526 | local_irq_enable(); | ||
527 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, | 511 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, |
528 | &cpu); | 512 | &cpu); |
529 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", | 513 | printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", |
530 | cpu); | 514 | cpu); |
531 | local_irq_disable(); | ||
532 | } | 515 | } |
533 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); | 516 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); |
534 | 517 | ||
@@ -572,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) | |||
572 | void __init init_c1e_mask(void) | 555 | void __init init_c1e_mask(void) |
573 | { | 556 | { |
574 | /* If we're using c1e_idle, we need to allocate c1e_mask. */ | 557 | /* If we're using c1e_idle, we need to allocate c1e_mask. */ |
575 | if (pm_idle == c1e_idle) { | 558 | if (pm_idle == c1e_idle) |
576 | alloc_cpumask_var(&c1e_mask, GFP_KERNEL); | 559 | zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); |
577 | cpumask_clear(c1e_mask); | ||
578 | } | ||
579 | } | 560 | } |
580 | 561 | ||
581 | static int __init idle_setup(char *str) | 562 | static int __init idle_setup(char *str) |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984a..4cf79567cda 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -61,9 +61,6 @@ | |||
61 | 61 | ||
62 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 62 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
63 | 63 | ||
64 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | ||
65 | EXPORT_PER_CPU_SYMBOL(current_task); | ||
66 | |||
67 | /* | 64 | /* |
68 | * Return saved PC of a blocked thread. | 65 | * Return saved PC of a blocked thread. |
69 | */ | 66 | */ |
@@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
350 | *next = &next_p->thread; | 347 | *next = &next_p->thread; |
351 | int cpu = smp_processor_id(); | 348 | int cpu = smp_processor_id(); |
352 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 349 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
350 | bool preload_fpu; | ||
353 | 351 | ||
354 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ | 352 | /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ |
355 | 353 | ||
356 | __unlazy_fpu(prev_p); | 354 | /* |
355 | * If the task has used fpu the last 5 timeslices, just do a full | ||
356 | * restore of the math state immediately to avoid the trap; the | ||
357 | * chances of needing FPU soon are obviously high now | ||
358 | */ | ||
359 | preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; | ||
357 | 360 | ||
361 | __unlazy_fpu(prev_p); | ||
358 | 362 | ||
359 | /* we're going to use this soon, after a few expensive things */ | 363 | /* we're going to use this soon, after a few expensive things */ |
360 | if (next_p->fpu_counter > 5) | 364 | if (preload_fpu) |
361 | prefetch(next->xstate); | 365 | prefetch(next->xstate); |
362 | 366 | ||
363 | /* | 367 | /* |
@@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
398 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) | 402 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) |
399 | __switch_to_xtra(prev_p, next_p, tss); | 403 | __switch_to_xtra(prev_p, next_p, tss); |
400 | 404 | ||
405 | /* If we're going to preload the fpu context, make sure clts | ||
406 | is run while we're batching the cpu state updates. */ | ||
407 | if (preload_fpu) | ||
408 | clts(); | ||
409 | |||
401 | /* | 410 | /* |
402 | * Leave lazy mode, flushing any hypercalls made here. | 411 | * Leave lazy mode, flushing any hypercalls made here. |
403 | * This must be done before restoring TLS segments so | 412 | * This must be done before restoring TLS segments so |
@@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
407 | */ | 416 | */ |
408 | arch_end_context_switch(next_p); | 417 | arch_end_context_switch(next_p); |
409 | 418 | ||
410 | /* If the task has used fpu the last 5 timeslices, just do a full | 419 | if (preload_fpu) |
411 | * restore of the math state immediately to avoid the trap; the | 420 | __math_state_restore(); |
412 | * chances of needing FPU soon are obviously high now | ||
413 | * | ||
414 | * tsk_used_math() checks prevent calling math_state_restore(), | ||
415 | * which can sleep in the case of !tsk_used_math() | ||
416 | */ | ||
417 | if (tsk_used_math(next_p) && next_p->fpu_counter > 5) | ||
418 | math_state_restore(); | ||
419 | 421 | ||
420 | /* | 422 | /* |
421 | * Restore %gs if needed (which is common) | 423 | * Restore %gs if needed (which is common) |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9..ad535b68317 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -55,9 +55,6 @@ | |||
55 | 55 | ||
56 | asmlinkage extern void ret_from_fork(void); | 56 | asmlinkage extern void ret_from_fork(void); |
57 | 57 | ||
58 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | ||
59 | EXPORT_PER_CPU_SYMBOL(current_task); | ||
60 | |||
61 | DEFINE_PER_CPU(unsigned long, old_rsp); | 58 | DEFINE_PER_CPU(unsigned long, old_rsp); |
62 | static DEFINE_PER_CPU(unsigned char, is_idle); | 59 | static DEFINE_PER_CPU(unsigned char, is_idle); |
63 | 60 | ||
@@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
386 | int cpu = smp_processor_id(); | 383 | int cpu = smp_processor_id(); |
387 | struct tss_struct *tss = &per_cpu(init_tss, cpu); | 384 | struct tss_struct *tss = &per_cpu(init_tss, cpu); |
388 | unsigned fsindex, gsindex; | 385 | unsigned fsindex, gsindex; |
386 | bool preload_fpu; | ||
387 | |||
388 | /* | ||
389 | * If the task has used fpu the last 5 timeslices, just do a full | ||
390 | * restore of the math state immediately to avoid the trap; the | ||
391 | * chances of needing FPU soon are obviously high now | ||
392 | */ | ||
393 | preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; | ||
389 | 394 | ||
390 | /* we're going to use this soon, after a few expensive things */ | 395 | /* we're going to use this soon, after a few expensive things */ |
391 | if (next_p->fpu_counter > 5) | 396 | if (preload_fpu) |
392 | prefetch(next->xstate); | 397 | prefetch(next->xstate); |
393 | 398 | ||
394 | /* | 399 | /* |
@@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
419 | 424 | ||
420 | load_TLS(next, cpu); | 425 | load_TLS(next, cpu); |
421 | 426 | ||
427 | /* Must be after DS reload */ | ||
428 | unlazy_fpu(prev_p); | ||
429 | |||
430 | /* Make sure cpu is ready for new context */ | ||
431 | if (preload_fpu) | ||
432 | clts(); | ||
433 | |||
422 | /* | 434 | /* |
423 | * Leave lazy mode, flushing any hypercalls made here. | 435 | * Leave lazy mode, flushing any hypercalls made here. |
424 | * This must be done before restoring TLS segments so | 436 | * This must be done before restoring TLS segments so |
@@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
459 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | 471 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); |
460 | prev->gsindex = gsindex; | 472 | prev->gsindex = gsindex; |
461 | 473 | ||
462 | /* Must be after DS reload */ | ||
463 | unlazy_fpu(prev_p); | ||
464 | |||
465 | /* | 474 | /* |
466 | * Switch the PDA and FPU contexts. | 475 | * Switch the PDA and FPU contexts. |
467 | */ | 476 | */ |
@@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
480 | task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) | 489 | task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) |
481 | __switch_to_xtra(prev_p, next_p, tss); | 490 | __switch_to_xtra(prev_p, next_p, tss); |
482 | 491 | ||
483 | /* If the task has used fpu the last 5 timeslices, just do a full | 492 | /* |
484 | * restore of the math state immediately to avoid the trap; the | 493 | * Preload the FPU context, now that we've determined that the |
485 | * chances of needing FPU soon are obviously high now | 494 | * task is likely to be using it. |
486 | * | ||
487 | * tsk_used_math() checks prevent calling math_state_restore(), | ||
488 | * which can sleep in the case of !tsk_used_math() | ||
489 | */ | 495 | */ |
490 | if (tsk_used_math(next_p) && next_p->fpu_counter > 5) | 496 | if (preload_fpu) |
491 | math_state_restore(); | 497 | __math_state_restore(); |
492 | return prev_p; | 498 | return prev_p; |
493 | } | 499 | } |
494 | 500 | ||
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c1..7b058a2dc66 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -35,10 +35,11 @@ | |||
35 | #include <asm/proto.h> | 35 | #include <asm/proto.h> |
36 | #include <asm/ds.h> | 36 | #include <asm/ds.h> |
37 | 37 | ||
38 | #include <trace/syscall.h> | ||
39 | |||
40 | #include "tls.h" | 38 | #include "tls.h" |
41 | 39 | ||
40 | #define CREATE_TRACE_POINTS | ||
41 | #include <trace/events/syscalls.h> | ||
42 | |||
42 | enum x86_regset { | 43 | enum x86_regset { |
43 | REGSET_GENERAL, | 44 | REGSET_GENERAL, |
44 | REGSET_FP, | 45 | REGSET_FP, |
@@ -324,16 +325,6 @@ static int putreg(struct task_struct *child, | |||
324 | return set_flags(child, value); | 325 | return set_flags(child, value); |
325 | 326 | ||
326 | #ifdef CONFIG_X86_64 | 327 | #ifdef CONFIG_X86_64 |
327 | /* | ||
328 | * Orig_ax is really just a flag with small positive and | ||
329 | * negative values, so make sure to always sign-extend it | ||
330 | * from 32 bits so that it works correctly regardless of | ||
331 | * whether we come from a 32-bit environment or not. | ||
332 | */ | ||
333 | case offsetof(struct user_regs_struct, orig_ax): | ||
334 | value = (long) (s32) value; | ||
335 | break; | ||
336 | |||
337 | case offsetof(struct user_regs_struct,fs_base): | 328 | case offsetof(struct user_regs_struct,fs_base): |
338 | if (value >= TASK_SIZE_OF(child)) | 329 | if (value >= TASK_SIZE_OF(child)) |
339 | return -EIO; | 330 | return -EIO; |
@@ -1125,10 +1116,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) | |||
1125 | 1116 | ||
1126 | case offsetof(struct user32, regs.orig_eax): | 1117 | case offsetof(struct user32, regs.orig_eax): |
1127 | /* | 1118 | /* |
1128 | * Sign-extend the value so that orig_eax = -1 | 1119 | * A 32-bit debugger setting orig_eax means to restore |
1129 | * causes (long)orig_ax < 0 tests to fire correctly. | 1120 | * the state of the task restarting a 32-bit syscall. |
1121 | * Make sure we interpret the -ERESTART* codes correctly | ||
1122 | * in case the task is not actually still sitting at the | ||
1123 | * exit from a 32-bit syscall with TS_COMPAT still set. | ||
1130 | */ | 1124 | */ |
1131 | regs->orig_ax = (long) (s32) value; | 1125 | regs->orig_ax = value; |
1126 | if (syscall_get_nr(child, regs) >= 0) | ||
1127 | task_thread_info(child)->status |= TS_COMPAT; | ||
1132 | break; | 1128 | break; |
1133 | 1129 | ||
1134 | case offsetof(struct user32, regs.eflags): | 1130 | case offsetof(struct user32, regs.eflags): |
@@ -1497,8 +1493,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) | |||
1497 | tracehook_report_syscall_entry(regs)) | 1493 | tracehook_report_syscall_entry(regs)) |
1498 | ret = -1L; | 1494 | ret = -1L; |
1499 | 1495 | ||
1500 | if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) | 1496 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1501 | ftrace_syscall_enter(regs); | 1497 | trace_sys_enter(regs, regs->orig_ax); |
1502 | 1498 | ||
1503 | if (unlikely(current->audit_context)) { | 1499 | if (unlikely(current->audit_context)) { |
1504 | if (IS_IA32) | 1500 | if (IS_IA32) |
@@ -1523,8 +1519,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) | |||
1523 | if (unlikely(current->audit_context)) | 1519 | if (unlikely(current->audit_context)) |
1524 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); | 1520 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); |
1525 | 1521 | ||
1526 | if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) | 1522 | if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) |
1527 | ftrace_syscall_exit(regs); | 1523 | trace_sys_exit(regs, regs->ax); |
1528 | 1524 | ||
1529 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | 1525 | if (test_thread_flag(TIF_SYSCALL_TRACE)) |
1530 | tracehook_report_syscall_exit(regs, 0); | 1526 | tracehook_report_syscall_exit(regs, 0); |
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06624b..6c3b2c6fd77 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) | |||
508 | 508 | ||
509 | pci_read_config_dword(nb_ht, 0x60, &val); | 509 | pci_read_config_dword(nb_ht, 0x60, &val); |
510 | set_dev_node(&dev->dev, val & 7); | 510 | set_dev_node(&dev->dev, val & 7); |
511 | pci_dev_put(dev); | 511 | pci_dev_put(nb_ht); |
512 | } | 512 | } |
513 | 513 | ||
514 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, | 514 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a06e8d10184..27349f92a6d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/pm.h> | 4 | #include <linux/pm.h> |
5 | #include <linux/efi.h> | 5 | #include <linux/efi.h> |
6 | #include <linux/dmi.h> | 6 | #include <linux/dmi.h> |
7 | #include <linux/tboot.h> | ||
7 | #include <acpi/reboot.h> | 8 | #include <acpi/reboot.h> |
8 | #include <asm/io.h> | 9 | #include <asm/io.h> |
9 | #include <asm/apic.h> | 10 | #include <asm/apic.h> |
@@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void) | |||
508 | if (reboot_emergency) | 509 | if (reboot_emergency) |
509 | emergency_vmx_disable_all(); | 510 | emergency_vmx_disable_all(); |
510 | 511 | ||
512 | tboot_shutdown(TB_SHUTDOWN_REBOOT); | ||
513 | |||
511 | /* Tell the BIOS if we want cold or warm reboot */ | 514 | /* Tell the BIOS if we want cold or warm reboot */ |
512 | *((unsigned short *)__va(0x472)) = reboot_mode; | 515 | *((unsigned short *)__va(0x472)) = reboot_mode; |
513 | 516 | ||
@@ -634,6 +637,8 @@ static void native_machine_halt(void) | |||
634 | /* stop other cpus and apics */ | 637 | /* stop other cpus and apics */ |
635 | machine_shutdown(); | 638 | machine_shutdown(); |
636 | 639 | ||
640 | tboot_shutdown(TB_SHUTDOWN_HALT); | ||
641 | |||
637 | /* stop this cpu */ | 642 | /* stop this cpu */ |
638 | stop_this_cpu(NULL); | 643 | stop_this_cpu(NULL); |
639 | } | 644 | } |
@@ -645,6 +650,8 @@ static void native_machine_power_off(void) | |||
645 | machine_shutdown(); | 650 | machine_shutdown(); |
646 | pm_power_off(); | 651 | pm_power_off(); |
647 | } | 652 | } |
653 | /* a fallback in case there is no PM info available */ | ||
654 | tboot_shutdown(TB_SHUTDOWN_HALT); | ||
648 | } | 655 | } |
649 | 656 | ||
650 | struct machine_ops machine_ops = { | 657 | struct machine_ops machine_ops = { |
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index bf67dcb4a44..1cfbbfc3ae2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/pnp.h> | 8 | #include <linux/pnp.h> |
9 | 9 | ||
10 | #include <asm/vsyscall.h> | 10 | #include <asm/vsyscall.h> |
11 | #include <asm/x86_init.h> | ||
11 | #include <asm/time.h> | 12 | #include <asm/time.h> |
12 | 13 | ||
13 | #ifdef CONFIG_X86_32 | 14 | #ifdef CONFIG_X86_32 |
@@ -165,13 +166,13 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) | |||
165 | } | 166 | } |
166 | EXPORT_SYMBOL(rtc_cmos_write); | 167 | EXPORT_SYMBOL(rtc_cmos_write); |
167 | 168 | ||
168 | static int set_rtc_mmss(unsigned long nowtime) | 169 | int update_persistent_clock(struct timespec now) |
169 | { | 170 | { |
170 | unsigned long flags; | 171 | unsigned long flags; |
171 | int retval; | 172 | int retval; |
172 | 173 | ||
173 | spin_lock_irqsave(&rtc_lock, flags); | 174 | spin_lock_irqsave(&rtc_lock, flags); |
174 | retval = set_wallclock(nowtime); | 175 | retval = x86_platform.set_wallclock(now.tv_sec); |
175 | spin_unlock_irqrestore(&rtc_lock, flags); | 176 | spin_unlock_irqrestore(&rtc_lock, flags); |
176 | 177 | ||
177 | return retval; | 178 | return retval; |
@@ -183,18 +184,13 @@ void read_persistent_clock(struct timespec *ts) | |||
183 | unsigned long retval, flags; | 184 | unsigned long retval, flags; |
184 | 185 | ||
185 | spin_lock_irqsave(&rtc_lock, flags); | 186 | spin_lock_irqsave(&rtc_lock, flags); |
186 | retval = get_wallclock(); | 187 | retval = x86_platform.get_wallclock(); |
187 | spin_unlock_irqrestore(&rtc_lock, flags); | 188 | spin_unlock_irqrestore(&rtc_lock, flags); |
188 | 189 | ||
189 | ts->tv_sec = retval; | 190 | ts->tv_sec = retval; |
190 | ts->tv_nsec = 0; | 191 | ts->tv_nsec = 0; |
191 | } | 192 | } |
192 | 193 | ||
193 | int update_persistent_clock(struct timespec now) | ||
194 | { | ||
195 | return set_rtc_mmss(now.tv_sec); | ||
196 | } | ||
197 | |||
198 | unsigned long long native_read_tsc(void) | 194 | unsigned long long native_read_tsc(void) |
199 | { | 195 | { |
200 | return __native_read_tsc(); | 196 | return __native_read_tsc(); |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..e09f0e2c14b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/screen_info.h> | 27 | #include <linux/screen_info.h> |
28 | #include <linux/ioport.h> | 28 | #include <linux/ioport.h> |
29 | #include <linux/acpi.h> | 29 | #include <linux/acpi.h> |
30 | #include <linux/sfi.h> | ||
30 | #include <linux/apm_bios.h> | 31 | #include <linux/apm_bios.h> |
31 | #include <linux/initrd.h> | 32 | #include <linux/initrd.h> |
32 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
@@ -66,6 +67,7 @@ | |||
66 | 67 | ||
67 | #include <linux/percpu.h> | 68 | #include <linux/percpu.h> |
68 | #include <linux/crash_dump.h> | 69 | #include <linux/crash_dump.h> |
70 | #include <linux/tboot.h> | ||
69 | 71 | ||
70 | #include <video/edid.h> | 72 | #include <video/edid.h> |
71 | 73 | ||
@@ -108,10 +110,6 @@ | |||
108 | #include <asm/numa_64.h> | 110 | #include <asm/numa_64.h> |
109 | #endif | 111 | #endif |
110 | 112 | ||
111 | #ifndef ARCH_SETUP | ||
112 | #define ARCH_SETUP | ||
113 | #endif | ||
114 | |||
115 | /* | 113 | /* |
116 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | 114 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. |
117 | * The direct mapping extends to max_pfn_mapped, so that we can directly access | 115 | * The direct mapping extends to max_pfn_mapped, so that we can directly access |
@@ -133,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu) | |||
133 | return __default_cpu_present_to_apicid(mps_cpu); | 131 | return __default_cpu_present_to_apicid(mps_cpu); |
134 | } | 132 | } |
135 | 133 | ||
136 | int default_check_phys_apicid_present(int boot_cpu_physical_apicid) | 134 | int default_check_phys_apicid_present(int phys_apicid) |
137 | { | 135 | { |
138 | return __default_check_phys_apicid_present(boot_cpu_physical_apicid); | 136 | return __default_check_phys_apicid_present(phys_apicid); |
139 | } | 137 | } |
140 | #endif | 138 | #endif |
141 | 139 | ||
@@ -171,13 +169,6 @@ static struct resource bss_resource = { | |||
171 | 169 | ||
172 | 170 | ||
173 | #ifdef CONFIG_X86_32 | 171 | #ifdef CONFIG_X86_32 |
174 | static struct resource video_ram_resource = { | ||
175 | .name = "Video RAM area", | ||
176 | .start = 0xa0000, | ||
177 | .end = 0xbffff, | ||
178 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
179 | }; | ||
180 | |||
181 | /* cpu data as detected by the assembly code in head.S */ | 172 | /* cpu data as detected by the assembly code in head.S */ |
182 | struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; | 173 | struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; |
183 | /* common cpu data for all cpus */ | 174 | /* common cpu data for all cpus */ |
@@ -605,7 +596,7 @@ static struct resource standard_io_resources[] = { | |||
605 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } | 596 | .flags = IORESOURCE_BUSY | IORESOURCE_IO } |
606 | }; | 597 | }; |
607 | 598 | ||
608 | static void __init reserve_standard_io_resources(void) | 599 | void __init reserve_standard_io_resources(void) |
609 | { | 600 | { |
610 | int i; | 601 | int i; |
611 | 602 | ||
@@ -637,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg) | |||
637 | early_param("elfcorehdr", setup_elfcorehdr); | 628 | early_param("elfcorehdr", setup_elfcorehdr); |
638 | #endif | 629 | #endif |
639 | 630 | ||
640 | static struct x86_quirks default_x86_quirks __initdata; | ||
641 | |||
642 | struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; | ||
643 | |||
644 | #ifdef CONFIG_X86_RESERVE_LOW_64K | 631 | #ifdef CONFIG_X86_RESERVE_LOW_64K |
645 | static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) | 632 | static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) |
646 | { | 633 | { |
@@ -757,7 +744,7 @@ void __init setup_arch(char **cmdline_p) | |||
757 | } | 744 | } |
758 | #endif | 745 | #endif |
759 | 746 | ||
760 | ARCH_SETUP | 747 | x86_init.oem.arch_setup(); |
761 | 748 | ||
762 | setup_memory_map(); | 749 | setup_memory_map(); |
763 | parse_setup_data(); | 750 | parse_setup_data(); |
@@ -796,6 +783,16 @@ void __init setup_arch(char **cmdline_p) | |||
796 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | 783 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); |
797 | *cmdline_p = command_line; | 784 | *cmdline_p = command_line; |
798 | 785 | ||
786 | #ifdef CONFIG_X86_64 | ||
787 | /* | ||
788 | * Must call this twice: Once just to detect whether hardware doesn't | ||
789 | * support NX (so that the early EHCI debug console setup can safely | ||
790 | * call set_fixmap(), and then again after parsing early parameters to | ||
791 | * honor the respective command line option. | ||
792 | */ | ||
793 | check_efer(); | ||
794 | #endif | ||
795 | |||
799 | parse_early_param(); | 796 | parse_early_param(); |
800 | 797 | ||
801 | #ifdef CONFIG_X86_64 | 798 | #ifdef CONFIG_X86_64 |
@@ -833,11 +830,9 @@ void __init setup_arch(char **cmdline_p) | |||
833 | * VMware detection requires dmi to be available, so this | 830 | * VMware detection requires dmi to be available, so this |
834 | * needs to be done after dmi_scan_machine, for the BP. | 831 | * needs to be done after dmi_scan_machine, for the BP. |
835 | */ | 832 | */ |
836 | init_hypervisor(&boot_cpu_data); | 833 | init_hypervisor_platform(); |
837 | 834 | ||
838 | #ifdef CONFIG_X86_32 | 835 | x86_init.resources.probe_roms(); |
839 | probe_roms(); | ||
840 | #endif | ||
841 | 836 | ||
842 | /* after parse_early_param, so could debug it */ | 837 | /* after parse_early_param, so could debug it */ |
843 | insert_resource(&iomem_resource, &code_resource); | 838 | insert_resource(&iomem_resource, &code_resource); |
@@ -972,10 +967,11 @@ void __init setup_arch(char **cmdline_p) | |||
972 | kvmclock_init(); | 967 | kvmclock_init(); |
973 | #endif | 968 | #endif |
974 | 969 | ||
975 | paravirt_pagetable_setup_start(swapper_pg_dir); | 970 | x86_init.paging.pagetable_setup_start(swapper_pg_dir); |
976 | paging_init(); | 971 | paging_init(); |
977 | paravirt_pagetable_setup_done(swapper_pg_dir); | 972 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); |
978 | paravirt_post_allocator_init(); | 973 | |
974 | tboot_probe(); | ||
979 | 975 | ||
980 | #ifdef CONFIG_X86_64 | 976 | #ifdef CONFIG_X86_64 |
981 | map_vsyscall(); | 977 | map_vsyscall(); |
@@ -990,13 +986,13 @@ void __init setup_arch(char **cmdline_p) | |||
990 | */ | 986 | */ |
991 | acpi_boot_init(); | 987 | acpi_boot_init(); |
992 | 988 | ||
993 | #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) | 989 | sfi_init(); |
990 | |||
994 | /* | 991 | /* |
995 | * get boot-time SMP configuration: | 992 | * get boot-time SMP configuration: |
996 | */ | 993 | */ |
997 | if (smp_found_config) | 994 | if (smp_found_config) |
998 | get_smp_config(); | 995 | get_smp_config(); |
999 | #endif | ||
1000 | 996 | ||
1001 | prefill_possible_map(); | 997 | prefill_possible_map(); |
1002 | 998 | ||
@@ -1015,10 +1011,7 @@ void __init setup_arch(char **cmdline_p) | |||
1015 | e820_reserve_resources(); | 1011 | e820_reserve_resources(); |
1016 | e820_mark_nosave_regions(max_low_pfn); | 1012 | e820_mark_nosave_regions(max_low_pfn); |
1017 | 1013 | ||
1018 | #ifdef CONFIG_X86_32 | 1014 | x86_init.resources.reserve_resources(); |
1019 | request_resource(&iomem_resource, &video_ram_resource); | ||
1020 | #endif | ||
1021 | reserve_standard_io_resources(); | ||
1022 | 1015 | ||
1023 | e820_setup_gap(); | 1016 | e820_setup_gap(); |
1024 | 1017 | ||
@@ -1030,78 +1023,22 @@ void __init setup_arch(char **cmdline_p) | |||
1030 | conswitchp = &dummy_con; | 1023 | conswitchp = &dummy_con; |
1031 | #endif | 1024 | #endif |
1032 | #endif | 1025 | #endif |
1026 | x86_init.oem.banner(); | ||
1033 | } | 1027 | } |
1034 | 1028 | ||
1035 | #ifdef CONFIG_X86_32 | 1029 | #ifdef CONFIG_X86_32 |
1036 | 1030 | ||
1037 | /** | 1031 | static struct resource video_ram_resource = { |
1038 | * x86_quirk_intr_init - post gate setup interrupt initialisation | 1032 | .name = "Video RAM area", |
1039 | * | 1033 | .start = 0xa0000, |
1040 | * Description: | 1034 | .end = 0xbffff, |
1041 | * Fill in any interrupts that may have been left out by the general | 1035 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM |
1042 | * init_IRQ() routine. interrupts having to do with the machine rather | ||
1043 | * than the devices on the I/O bus (like APIC interrupts in intel MP | ||
1044 | * systems) are started here. | ||
1045 | **/ | ||
1046 | void __init x86_quirk_intr_init(void) | ||
1047 | { | ||
1048 | if (x86_quirks->arch_intr_init) { | ||
1049 | if (x86_quirks->arch_intr_init()) | ||
1050 | return; | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | /** | ||
1055 | * x86_quirk_trap_init - initialise system specific traps | ||
1056 | * | ||
1057 | * Description: | ||
1058 | * Called as the final act of trap_init(). Used in VISWS to initialise | ||
1059 | * the various board specific APIC traps. | ||
1060 | **/ | ||
1061 | void __init x86_quirk_trap_init(void) | ||
1062 | { | ||
1063 | if (x86_quirks->arch_trap_init) { | ||
1064 | if (x86_quirks->arch_trap_init()) | ||
1065 | return; | ||
1066 | } | ||
1067 | } | ||
1068 | |||
1069 | static struct irqaction irq0 = { | ||
1070 | .handler = timer_interrupt, | ||
1071 | .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, | ||
1072 | .name = "timer" | ||
1073 | }; | 1036 | }; |
1074 | 1037 | ||
1075 | /** | 1038 | void __init i386_reserve_resources(void) |
1076 | * x86_quirk_pre_time_init - do any specific initialisations before. | ||
1077 | * | ||
1078 | **/ | ||
1079 | void __init x86_quirk_pre_time_init(void) | ||
1080 | { | 1039 | { |
1081 | if (x86_quirks->arch_pre_time_init) | 1040 | request_resource(&iomem_resource, &video_ram_resource); |
1082 | x86_quirks->arch_pre_time_init(); | 1041 | reserve_standard_io_resources(); |
1083 | } | 1042 | } |
1084 | 1043 | ||
1085 | /** | ||
1086 | * x86_quirk_time_init - do any specific initialisations for the system timer. | ||
1087 | * | ||
1088 | * Description: | ||
1089 | * Must plug the system timer interrupt source at HZ into the IRQ listed | ||
1090 | * in irq_vectors.h:TIMER_IRQ | ||
1091 | **/ | ||
1092 | void __init x86_quirk_time_init(void) | ||
1093 | { | ||
1094 | if (x86_quirks->arch_time_init) { | ||
1095 | /* | ||
1096 | * A nonzero return code does not mean failure, it means | ||
1097 | * that the architecture quirk does not want any | ||
1098 | * generic (timer) setup to be performed after this: | ||
1099 | */ | ||
1100 | if (x86_quirks->arch_time_init()) | ||
1101 | return; | ||
1102 | } | ||
1103 | |||
1104 | irq0.mask = cpumask_of_cpu(0); | ||
1105 | setup_irq(0, &irq0); | ||
1106 | } | ||
1107 | #endif /* CONFIG_X86_32 */ | 1044 | #endif /* CONFIG_X86_32 */ |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
55 | #define PERCPU_FIRST_CHUNK_RESERVE 0 | 55 | #define PERCPU_FIRST_CHUNK_RESERVE 0 |
56 | #endif | 56 | #endif |
57 | 57 | ||
58 | #ifdef CONFIG_X86_32 | ||
58 | /** | 59 | /** |
59 | * pcpu_need_numa - determine percpu allocation needs to consider NUMA | 60 | * pcpu_need_numa - determine percpu allocation needs to consider NUMA |
60 | * | 61 | * |
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) | |||
83 | #endif | 84 | #endif |
84 | return false; | 85 | return false; |
85 | } | 86 | } |
87 | #endif | ||
86 | 88 | ||
87 | /** | 89 | /** |
88 | * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu | 90 | * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu |
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, | |||
124 | } | 126 | } |
125 | 127 | ||
126 | /* | 128 | /* |
127 | * Large page remap allocator | 129 | * Helpers for first chunk memory allocation |
128 | * | ||
129 | * This allocator uses PMD page as unit. A PMD page is allocated for | ||
130 | * each cpu and each is remapped into vmalloc area using PMD mapping. | ||
131 | * As PMD page is quite large, only part of it is used for the first | ||
132 | * chunk. Unused part is returned to the bootmem allocator. | ||
133 | * | ||
134 | * So, the PMD pages are mapped twice - once to the physical mapping | ||
135 | * and to the vmalloc area for the first percpu chunk. The double | ||
136 | * mapping does add one more PMD TLB entry pressure but still is much | ||
137 | * better than only using 4k mappings while still being NUMA friendly. | ||
138 | */ | 130 | */ |
139 | #ifdef CONFIG_NEED_MULTIPLE_NODES | 131 | static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) |
140 | struct pcpul_ent { | ||
141 | unsigned int cpu; | ||
142 | void *ptr; | ||
143 | }; | ||
144 | |||
145 | static size_t pcpul_size; | ||
146 | static struct pcpul_ent *pcpul_map; | ||
147 | static struct vm_struct pcpul_vm; | ||
148 | |||
149 | static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) | ||
150 | { | 132 | { |
151 | size_t off = (size_t)pageno << PAGE_SHIFT; | 133 | return pcpu_alloc_bootmem(cpu, size, align); |
152 | |||
153 | if (off >= pcpul_size) | ||
154 | return NULL; | ||
155 | |||
156 | return virt_to_page(pcpul_map[cpu].ptr + off); | ||
157 | } | 134 | } |
158 | 135 | ||
159 | static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | 136 | static void __init pcpu_fc_free(void *ptr, size_t size) |
160 | { | 137 | { |
161 | size_t map_size, dyn_size; | 138 | free_bootmem(__pa(ptr), size); |
162 | unsigned int cpu; | ||
163 | int i, j; | ||
164 | ssize_t ret; | ||
165 | |||
166 | if (!chosen) { | ||
167 | size_t vm_size = VMALLOC_END - VMALLOC_START; | ||
168 | size_t tot_size = num_possible_cpus() * PMD_SIZE; | ||
169 | |||
170 | /* on non-NUMA, embedding is better */ | ||
171 | if (!pcpu_need_numa()) | ||
172 | return -EINVAL; | ||
173 | |||
174 | /* don't consume more than 20% of vmalloc area */ | ||
175 | if (tot_size > vm_size / 5) { | ||
176 | pr_info("PERCPU: too large chunk size %zuMB for " | ||
177 | "large page remap\n", tot_size >> 20); | ||
178 | return -EINVAL; | ||
179 | } | ||
180 | } | ||
181 | |||
182 | /* need PSE */ | ||
183 | if (!cpu_has_pse) { | ||
184 | pr_warning("PERCPU: lpage allocator requires PSE\n"); | ||
185 | return -EINVAL; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * Currently supports only single page. Supporting multiple | ||
190 | * pages won't be too difficult if it ever becomes necessary. | ||
191 | */ | ||
192 | pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + | ||
193 | PERCPU_DYNAMIC_RESERVE); | ||
194 | if (pcpul_size > PMD_SIZE) { | ||
195 | pr_warning("PERCPU: static data is larger than large page, " | ||
196 | "can't use large page\n"); | ||
197 | return -EINVAL; | ||
198 | } | ||
199 | dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; | ||
200 | |||
201 | /* allocate pointer array and alloc large pages */ | ||
202 | map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); | ||
203 | pcpul_map = alloc_bootmem(map_size); | ||
204 | |||
205 | for_each_possible_cpu(cpu) { | ||
206 | pcpul_map[cpu].cpu = cpu; | ||
207 | pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, | ||
208 | PMD_SIZE); | ||
209 | if (!pcpul_map[cpu].ptr) { | ||
210 | pr_warning("PERCPU: failed to allocate large page " | ||
211 | "for cpu%u\n", cpu); | ||
212 | goto enomem; | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Only use pcpul_size bytes and give back the rest. | ||
217 | * | ||
218 | * Ingo: The 2MB up-rounding bootmem is needed to make | ||
219 | * sure the partial 2MB page is still fully RAM - it's | ||
220 | * not well-specified to have a PAT-incompatible area | ||
221 | * (unmapped RAM, device memory, etc.) in that hole. | ||
222 | */ | ||
223 | free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), | ||
224 | PMD_SIZE - pcpul_size); | ||
225 | |||
226 | memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); | ||
227 | } | ||
228 | |||
229 | /* allocate address and map */ | ||
230 | pcpul_vm.flags = VM_ALLOC; | ||
231 | pcpul_vm.size = num_possible_cpus() * PMD_SIZE; | ||
232 | vm_area_register_early(&pcpul_vm, PMD_SIZE); | ||
233 | |||
234 | for_each_possible_cpu(cpu) { | ||
235 | pmd_t *pmd, pmd_v; | ||
236 | |||
237 | pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + | ||
238 | cpu * PMD_SIZE); | ||
239 | pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), | ||
240 | PAGE_KERNEL_LARGE); | ||
241 | set_pmd(pmd, pmd_v); | ||
242 | } | ||
243 | |||
244 | /* we're ready, commit */ | ||
245 | pr_info("PERCPU: Remapped at %p with large pages, static data " | ||
246 | "%zu bytes\n", pcpul_vm.addr, static_size); | ||
247 | |||
248 | ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, | ||
249 | PERCPU_FIRST_CHUNK_RESERVE, dyn_size, | ||
250 | PMD_SIZE, pcpul_vm.addr, NULL); | ||
251 | |||
252 | /* sort pcpul_map array for pcpu_lpage_remapped() */ | ||
253 | for (i = 0; i < num_possible_cpus() - 1; i++) | ||
254 | for (j = i + 1; j < num_possible_cpus(); j++) | ||
255 | if (pcpul_map[i].ptr > pcpul_map[j].ptr) { | ||
256 | struct pcpul_ent tmp = pcpul_map[i]; | ||
257 | pcpul_map[i] = pcpul_map[j]; | ||
258 | pcpul_map[j] = tmp; | ||
259 | } | ||
260 | |||
261 | return ret; | ||
262 | |||
263 | enomem: | ||
264 | for_each_possible_cpu(cpu) | ||
265 | if (pcpul_map[cpu].ptr) | ||
266 | free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); | ||
267 | free_bootmem(__pa(pcpul_map), map_size); | ||
268 | return -ENOMEM; | ||
269 | } | 139 | } |
270 | 140 | ||
271 | /** | 141 | static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) |
272 | * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area | ||
273 | * @kaddr: the kernel address in question | ||
274 | * | ||
275 | * Determine whether @kaddr falls in the pcpul recycled area. This is | ||
276 | * used by pageattr to detect VM aliases and break up the pcpu PMD | ||
277 | * mapping such that the same physical page is not mapped under | ||
278 | * different attributes. | ||
279 | * | ||
280 | * The recycled area is always at the tail of a partially used PMD | ||
281 | * page. | ||
282 | * | ||
283 | * RETURNS: | ||
284 | * Address of corresponding remapped pcpu address if match is found; | ||
285 | * otherwise, NULL. | ||
286 | */ | ||
287 | void *pcpu_lpage_remapped(void *kaddr) | ||
288 | { | 142 | { |
289 | void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); | 143 | #ifdef CONFIG_NEED_MULTIPLE_NODES |
290 | unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; | 144 | if (early_cpu_to_node(from) == early_cpu_to_node(to)) |
291 | int left = 0, right = num_possible_cpus() - 1; | 145 | return LOCAL_DISTANCE; |
292 | int pos; | 146 | else |
293 | 147 | return REMOTE_DISTANCE; | |
294 | /* pcpul in use at all? */ | ||
295 | if (!pcpul_map) | ||
296 | return NULL; | ||
297 | |||
298 | /* okay, perform binary search */ | ||
299 | while (left <= right) { | ||
300 | pos = (left + right) / 2; | ||
301 | |||
302 | if (pcpul_map[pos].ptr < pmd_addr) | ||
303 | left = pos + 1; | ||
304 | else if (pcpul_map[pos].ptr > pmd_addr) | ||
305 | right = pos - 1; | ||
306 | else { | ||
307 | /* it shouldn't be in the area for the first chunk */ | ||
308 | WARN_ON(offset < pcpul_size); | ||
309 | |||
310 | return pcpul_vm.addr + | ||
311 | pcpul_map[pos].cpu * PMD_SIZE + offset; | ||
312 | } | ||
313 | } | ||
314 | |||
315 | return NULL; | ||
316 | } | ||
317 | #else | 148 | #else |
318 | static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) | 149 | return LOCAL_DISTANCE; |
319 | { | ||
320 | return -EINVAL; | ||
321 | } | ||
322 | #endif | 150 | #endif |
323 | |||
324 | /* | ||
325 | * Embedding allocator | ||
326 | * | ||
327 | * The first chunk is sized to just contain the static area plus | ||
328 | * module and dynamic reserves and embedded into linear physical | ||
329 | * mapping so that it can use PMD mapping without additional TLB | ||
330 | * pressure. | ||
331 | */ | ||
332 | static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) | ||
333 | { | ||
334 | size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; | ||
335 | |||
336 | /* | ||
337 | * If large page isn't supported, there's no benefit in doing | ||
338 | * this. Also, embedding allocation doesn't play well with | ||
339 | * NUMA. | ||
340 | */ | ||
341 | if (!chosen && (!cpu_has_pse || pcpu_need_numa())) | ||
342 | return -EINVAL; | ||
343 | |||
344 | return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, | ||
345 | reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); | ||
346 | } | 151 | } |
347 | 152 | ||
348 | /* | 153 | static void __init pcpup_populate_pte(unsigned long addr) |
349 | * 4k page allocator | ||
350 | * | ||
351 | * This is the basic allocator. Static percpu area is allocated | ||
352 | * page-by-page and most of initialization is done by the generic | ||
353 | * setup function. | ||
354 | */ | ||
355 | static struct page **pcpu4k_pages __initdata; | ||
356 | static int pcpu4k_nr_static_pages __initdata; | ||
357 | |||
358 | static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) | ||
359 | { | ||
360 | if (pageno < pcpu4k_nr_static_pages) | ||
361 | return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; | ||
362 | return NULL; | ||
363 | } | ||
364 | |||
365 | static void __init pcpu4k_populate_pte(unsigned long addr) | ||
366 | { | 154 | { |
367 | populate_extra_pte(addr); | 155 | populate_extra_pte(addr); |
368 | } | 156 | } |
369 | 157 | ||
370 | static ssize_t __init setup_pcpu_4k(size_t static_size) | ||
371 | { | ||
372 | size_t pages_size; | ||
373 | unsigned int cpu; | ||
374 | int i, j; | ||
375 | ssize_t ret; | ||
376 | |||
377 | pcpu4k_nr_static_pages = PFN_UP(static_size); | ||
378 | |||
379 | /* unaligned allocations can't be freed, round up to page size */ | ||
380 | pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() | ||
381 | * sizeof(pcpu4k_pages[0])); | ||
382 | pcpu4k_pages = alloc_bootmem(pages_size); | ||
383 | |||
384 | /* allocate and copy */ | ||
385 | j = 0; | ||
386 | for_each_possible_cpu(cpu) | ||
387 | for (i = 0; i < pcpu4k_nr_static_pages; i++) { | ||
388 | void *ptr; | ||
389 | |||
390 | ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); | ||
391 | if (!ptr) { | ||
392 | pr_warning("PERCPU: failed to allocate " | ||
393 | "4k page for cpu%u\n", cpu); | ||
394 | goto enomem; | ||
395 | } | ||
396 | |||
397 | memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); | ||
398 | pcpu4k_pages[j++] = virt_to_page(ptr); | ||
399 | } | ||
400 | |||
401 | /* we're ready, commit */ | ||
402 | pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", | ||
403 | pcpu4k_nr_static_pages, static_size); | ||
404 | |||
405 | ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, | ||
406 | PERCPU_FIRST_CHUNK_RESERVE, -1, | ||
407 | -1, NULL, pcpu4k_populate_pte); | ||
408 | goto out_free_ar; | ||
409 | |||
410 | enomem: | ||
411 | while (--j >= 0) | ||
412 | free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); | ||
413 | ret = -ENOMEM; | ||
414 | out_free_ar: | ||
415 | free_bootmem(__pa(pcpu4k_pages), pages_size); | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | /* for explicit first chunk allocator selection */ | ||
420 | static char pcpu_chosen_alloc[16] __initdata; | ||
421 | |||
422 | static int __init percpu_alloc_setup(char *str) | ||
423 | { | ||
424 | strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); | ||
425 | return 0; | ||
426 | } | ||
427 | early_param("percpu_alloc", percpu_alloc_setup); | ||
428 | |||
429 | static inline void setup_percpu_segment(int cpu) | 158 | static inline void setup_percpu_segment(int cpu) |
430 | { | 159 | { |
431 | #ifdef CONFIG_X86_32 | 160 | #ifdef CONFIG_X86_32 |
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu) | |||
441 | 170 | ||
442 | void __init setup_per_cpu_areas(void) | 171 | void __init setup_per_cpu_areas(void) |
443 | { | 172 | { |
444 | size_t static_size = __per_cpu_end - __per_cpu_start; | ||
445 | unsigned int cpu; | 173 | unsigned int cpu; |
446 | unsigned long delta; | 174 | unsigned long delta; |
447 | size_t pcpu_unit_size; | 175 | int rc; |
448 | ssize_t ret; | ||
449 | 176 | ||
450 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", | 177 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", |
451 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); | 178 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); |
452 | 179 | ||
453 | /* | 180 | /* |
454 | * Allocate percpu area. If PSE is supported, try to make use | 181 | * Allocate percpu area. Embedding allocator is our favorite; |
455 | * of large page mappings. Please read comments on top of | 182 | * however, on NUMA configurations, it can result in very |
456 | * each allocator for details. | 183 | * sparse unit mapping and vmalloc area isn't spacious enough |
184 | * on 32bit. Use page in that case. | ||
457 | */ | 185 | */ |
458 | ret = -EINVAL; | 186 | #ifdef CONFIG_X86_32 |
459 | if (strlen(pcpu_chosen_alloc)) { | 187 | if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) |
460 | if (strcmp(pcpu_chosen_alloc, "4k")) { | 188 | pcpu_chosen_fc = PCPU_FC_PAGE; |
461 | if (!strcmp(pcpu_chosen_alloc, "lpage")) | 189 | #endif |
462 | ret = setup_pcpu_lpage(static_size, true); | 190 | rc = -EINVAL; |
463 | else if (!strcmp(pcpu_chosen_alloc, "embed")) | 191 | if (pcpu_chosen_fc != PCPU_FC_PAGE) { |
464 | ret = setup_pcpu_embed(static_size, true); | 192 | const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; |
465 | else | 193 | const size_t dyn_size = PERCPU_MODULE_RESERVE + |
466 | pr_warning("PERCPU: unknown allocator %s " | 194 | PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; |
467 | "specified\n", pcpu_chosen_alloc); | 195 | |
468 | if (ret < 0) | 196 | rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, |
469 | pr_warning("PERCPU: %s allocator failed (%zd), " | 197 | dyn_size, atom_size, |
470 | "falling back to 4k\n", | 198 | pcpu_cpu_distance, |
471 | pcpu_chosen_alloc, ret); | 199 | pcpu_fc_alloc, pcpu_fc_free); |
472 | } | 200 | if (rc < 0) |
473 | } else { | 201 | pr_warning("PERCPU: %s allocator failed (%d), " |
474 | ret = setup_pcpu_lpage(static_size, false); | 202 | "falling back to page size\n", |
475 | if (ret < 0) | 203 | pcpu_fc_names[pcpu_chosen_fc], rc); |
476 | ret = setup_pcpu_embed(static_size, false); | ||
477 | } | 204 | } |
478 | if (ret < 0) | 205 | if (rc < 0) |
479 | ret = setup_pcpu_4k(static_size); | 206 | rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, |
480 | if (ret < 0) | 207 | pcpu_fc_alloc, pcpu_fc_free, |
481 | panic("cannot allocate static percpu area (%zu bytes, err=%zd)", | 208 | pcpup_populate_pte); |
482 | static_size, ret); | 209 | if (rc < 0) |
483 | 210 | panic("cannot initialize percpu area (err=%d)", rc); | |
484 | pcpu_unit_size = ret; | ||
485 | 211 | ||
486 | /* alrighty, percpu areas up and running */ | 212 | /* alrighty, percpu areas up and running */ |
487 | delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; | 213 | delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; |
488 | for_each_possible_cpu(cpu) { | 214 | for_each_possible_cpu(cpu) { |
489 | per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; | 215 | per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; |
490 | per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); | 216 | per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); |
491 | per_cpu(cpu_number, cpu) = cpu; | 217 | per_cpu(cpu_number, cpu) = cpu; |
492 | setup_percpu_segment(cpu); | 218 | setup_percpu_segment(cpu); |
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 00000000000..34e09938265 --- /dev/null +++ b/arch/x86/kernel/sfi.c | |||
@@ -0,0 +1,122 @@ | |||
1 | /* | ||
2 | * sfi.c - x86 architecture SFI support. | ||
3 | * | ||
4 | * Copyright (c) 2009, Intel Corporation. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms and conditions of the GNU General Public License, | ||
8 | * version 2, as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License along with | ||
16 | * this program; if not, write to the Free Software Foundation, Inc., | ||
17 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #define KMSG_COMPONENT "SFI" | ||
22 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt | ||
23 | |||
24 | #include <linux/acpi.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/sfi.h> | ||
27 | #include <linux/io.h> | ||
28 | |||
29 | #include <asm/io_apic.h> | ||
30 | #include <asm/mpspec.h> | ||
31 | #include <asm/setup.h> | ||
32 | #include <asm/apic.h> | ||
33 | |||
34 | #ifdef CONFIG_X86_LOCAL_APIC | ||
35 | static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; | ||
36 | |||
37 | void __init mp_sfi_register_lapic_address(unsigned long address) | ||
38 | { | ||
39 | mp_lapic_addr = address; | ||
40 | |||
41 | set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); | ||
42 | if (boot_cpu_physical_apicid == -1U) | ||
43 | boot_cpu_physical_apicid = read_apic_id(); | ||
44 | |||
45 | pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); | ||
46 | } | ||
47 | |||
48 | /* All CPUs enumerated by SFI must be present and enabled */ | ||
49 | void __cpuinit mp_sfi_register_lapic(u8 id) | ||
50 | { | ||
51 | if (MAX_APICS - id <= 0) { | ||
52 | pr_warning("Processor #%d invalid (max %d)\n", | ||
53 | id, MAX_APICS); | ||
54 | return; | ||
55 | } | ||
56 | |||
57 | pr_info("registering lapic[%d]\n", id); | ||
58 | |||
59 | generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); | ||
60 | } | ||
61 | |||
62 | static int __init sfi_parse_cpus(struct sfi_table_header *table) | ||
63 | { | ||
64 | struct sfi_table_simple *sb; | ||
65 | struct sfi_cpu_table_entry *pentry; | ||
66 | int i; | ||
67 | int cpu_num; | ||
68 | |||
69 | sb = (struct sfi_table_simple *)table; | ||
70 | cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); | ||
71 | pentry = (struct sfi_cpu_table_entry *)sb->pentry; | ||
72 | |||
73 | for (i = 0; i < cpu_num; i++) { | ||
74 | mp_sfi_register_lapic(pentry->apic_id); | ||
75 | pentry++; | ||
76 | } | ||
77 | |||
78 | smp_found_config = 1; | ||
79 | return 0; | ||
80 | } | ||
81 | #endif /* CONFIG_X86_LOCAL_APIC */ | ||
82 | |||
83 | #ifdef CONFIG_X86_IO_APIC | ||
84 | static u32 gsi_base; | ||
85 | |||
86 | static int __init sfi_parse_ioapic(struct sfi_table_header *table) | ||
87 | { | ||
88 | struct sfi_table_simple *sb; | ||
89 | struct sfi_apic_table_entry *pentry; | ||
90 | int i, num; | ||
91 | |||
92 | sb = (struct sfi_table_simple *)table; | ||
93 | num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); | ||
94 | pentry = (struct sfi_apic_table_entry *)sb->pentry; | ||
95 | |||
96 | for (i = 0; i < num; i++) { | ||
97 | mp_register_ioapic(i, pentry->phys_addr, gsi_base); | ||
98 | gsi_base += io_apic_get_redir_entries(i); | ||
99 | pentry++; | ||
100 | } | ||
101 | |||
102 | WARN(pic_mode, KERN_WARNING | ||
103 | "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); | ||
104 | pic_mode = 0; | ||
105 | return 0; | ||
106 | } | ||
107 | #endif /* CONFIG_X86_IO_APIC */ | ||
108 | |||
109 | /* | ||
110 | * sfi_platform_init(): register lapics & io-apics | ||
111 | */ | ||
112 | int __init sfi_platform_init(void) | ||
113 | { | ||
114 | #ifdef CONFIG_X86_LOCAL_APIC | ||
115 | mp_sfi_register_lapic_address(sfi_lapic_addr); | ||
116 | sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); | ||
117 | #endif | ||
118 | #ifdef CONFIG_X86_IO_APIC | ||
119 | sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); | ||
120 | #endif | ||
121 | return 0; | ||
122 | } | ||
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..6a44a76055a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
@@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) | |||
856 | void | 856 | void |
857 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | 857 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) |
858 | { | 858 | { |
859 | #ifdef CONFIG_X86_NEW_MCE | 859 | #ifdef CONFIG_X86_MCE |
860 | /* notify userspace of pending MCEs */ | 860 | /* notify userspace of pending MCEs */ |
861 | if (thread_info_flags & _TIF_MCE_NOTIFY) | 861 | if (thread_info_flags & _TIF_MCE_NOTIFY) |
862 | mce_notify_process(); | 862 | mce_notify_process(); |
@@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | |||
869 | if (thread_info_flags & _TIF_NOTIFY_RESUME) { | 869 | if (thread_info_flags & _TIF_NOTIFY_RESUME) { |
870 | clear_thread_flag(TIF_NOTIFY_RESUME); | 870 | clear_thread_flag(TIF_NOTIFY_RESUME); |
871 | tracehook_notify_resume(regs); | 871 | tracehook_notify_resume(regs); |
872 | if (current->replacement_session_keyring) | ||
873 | key_replace_session_keyring(); | ||
872 | } | 874 | } |
873 | 875 | ||
874 | #ifdef CONFIG_X86_32 | 876 | #ifdef CONFIG_X86_32 |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..565ebc65920 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/bootmem.h> | 47 | #include <linux/bootmem.h> |
48 | #include <linux/err.h> | 48 | #include <linux/err.h> |
49 | #include <linux/nmi.h> | 49 | #include <linux/nmi.h> |
50 | #include <linux/tboot.h> | ||
50 | 51 | ||
51 | #include <asm/acpi.h> | 52 | #include <asm/acpi.h> |
52 | #include <asm/desc.h> | 53 | #include <asm/desc.h> |
@@ -323,7 +324,7 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
323 | /* enable local interrupts */ | 324 | /* enable local interrupts */ |
324 | local_irq_enable(); | 325 | local_irq_enable(); |
325 | 326 | ||
326 | setup_secondary_clock(); | 327 | x86_cpuinit.setup_percpu_clockev(); |
327 | 328 | ||
328 | wmb(); | 329 | wmb(); |
329 | cpu_idle(); | 330 | cpu_idle(); |
@@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu) | |||
434 | * For perf, we return last level cache shared map. | 435 | * For perf, we return last level cache shared map. |
435 | * And for power savings, we return cpu_core_map | 436 | * And for power savings, we return cpu_core_map |
436 | */ | 437 | */ |
437 | if (sched_mc_power_savings || sched_smt_power_savings) | 438 | if ((sched_mc_power_savings || sched_smt_power_savings) && |
439 | !(cpu_has(c, X86_FEATURE_AMD_DCM))) | ||
438 | return cpu_core_mask(cpu); | 440 | return cpu_core_mask(cpu); |
439 | else | 441 | else |
440 | return c->llc_shared_map; | 442 | return c->llc_shared_map; |
@@ -1057,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1057 | #endif | 1059 | #endif |
1058 | current_thread_info()->cpu = 0; /* needed? */ | 1060 | current_thread_info()->cpu = 0; /* needed? */ |
1059 | for_each_possible_cpu(i) { | 1061 | for_each_possible_cpu(i) { |
1060 | alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); | 1062 | zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); |
1061 | alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); | 1063 | zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); |
1062 | alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); | 1064 | zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); |
1063 | cpumask_clear(per_cpu(cpu_core_map, i)); | ||
1064 | cpumask_clear(per_cpu(cpu_sibling_map, i)); | ||
1065 | cpumask_clear(cpu_data(i).llc_shared_map); | ||
1066 | } | 1065 | } |
1067 | set_cpu_sibling_map(0); | 1066 | set_cpu_sibling_map(0); |
1068 | 1067 | ||
@@ -1112,13 +1111,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1112 | 1111 | ||
1113 | printk(KERN_INFO "CPU%d: ", 0); | 1112 | printk(KERN_INFO "CPU%d: ", 0); |
1114 | print_cpu_info(&cpu_data(0)); | 1113 | print_cpu_info(&cpu_data(0)); |
1115 | setup_boot_clock(); | 1114 | x86_init.timers.setup_percpu_clockev(); |
1116 | 1115 | ||
1117 | if (is_uv_system()) | 1116 | if (is_uv_system()) |
1118 | uv_system_init(); | 1117 | uv_system_init(); |
1118 | |||
1119 | set_mtrr_aps_delayed_init(); | ||
1119 | out: | 1120 | out: |
1120 | preempt_enable(); | 1121 | preempt_enable(); |
1121 | } | 1122 | } |
1123 | |||
/*
 * Arch hook that only calls set_mtrr_aps_delayed_init().
 * NOTE(review): presumably invoked by the generic code just before the
 * non-boot CPUs are brought back online - confirm against the caller.
 */
void arch_enable_nonboot_cpus_begin(void)
{
	set_mtrr_aps_delayed_init();
}
1128 | |||
/*
 * Arch hook that only calls mtrr_aps_init().
 * NOTE(review): presumably invoked by the generic code once the non-boot
 * CPUs are back online - confirm against the caller.
 */
void arch_enable_nonboot_cpus_end(void)
{
	mtrr_aps_init();
}
1133 | |||
1122 | /* | 1134 | /* |
1123 | * Early setup to make printk work. | 1135 | * Early setup to make printk work. |
1124 | */ | 1136 | */ |
@@ -1140,6 +1152,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) | |||
1140 | setup_ioapic_dest(); | 1152 | setup_ioapic_dest(); |
1141 | #endif | 1153 | #endif |
1142 | check_nmi_watchdog(); | 1154 | check_nmi_watchdog(); |
1155 | mtrr_aps_init(); | ||
1143 | } | 1156 | } |
1144 | 1157 | ||
1145 | static int __initdata setup_possible_cpus = -1; | 1158 | static int __initdata setup_possible_cpus = -1; |
@@ -1317,6 +1330,7 @@ void play_dead_common(void) | |||
1317 | void native_play_dead(void) | 1330 | void native_play_dead(void) |
1318 | { | 1331 | { |
1319 | play_dead_common(); | 1332 | play_dead_common(); |
1333 | tboot_shutdown(TB_SHUTDOWN_WFS); | ||
1320 | wbinvd_halt(); | 1334 | wbinvd_halt(); |
1321 | } | 1335 | } |
1322 | 1336 | ||
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863ef8c..3149032ff10 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/ptrace.h> | 6 | #include <linux/ptrace.h> |
7 | #include <asm/desc.h> | ||
7 | 8 | ||
8 | unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) | 9 | unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) |
9 | { | 10 | { |
@@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re | |||
23 | * and APM bios ones we just ignore here. | 24 | * and APM bios ones we just ignore here. |
24 | */ | 25 | */ |
25 | if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { | 26 | if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { |
26 | u32 *desc; | 27 | struct desc_struct *desc; |
27 | unsigned long base; | 28 | unsigned long base; |
28 | 29 | ||
29 | seg &= ~7UL; | 30 | seg &= ~7UL; |
@@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re | |||
33 | addr = -1L; /* bogus selector, access would fault */ | 34 | addr = -1L; /* bogus selector, access would fault */ |
34 | else { | 35 | else { |
35 | desc = child->mm->context.ldt + seg; | 36 | desc = child->mm->context.ldt + seg; |
36 | base = ((desc[0] >> 16) | | 37 | base = get_desc_base(desc); |
37 | ((desc[1] & 0xff) << 16) | | ||
38 | (desc[1] & 0xff000000)); | ||
39 | 38 | ||
40 | /* 16-bit code segment? */ | 39 | /* 16-bit code segment? */ |
41 | if (!((desc[1] >> 22) & 1)) | 40 | if (!desc->d) |
42 | addr &= 0xffff; | 41 | addr &= 0xffff; |
43 | addr += base; | 42 | addr += base; |
44 | } | 43 | } |
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211accf0..45e00eb09c3 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -18,9 +18,9 @@ | |||
18 | #include <asm/ia32.h> | 18 | #include <asm/ia32.h> |
19 | #include <asm/syscalls.h> | 19 | #include <asm/syscalls.h> |
20 | 20 | ||
21 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, | 21 | SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, |
22 | unsigned long prot, unsigned long flags, | 22 | unsigned long, prot, unsigned long, flags, |
23 | unsigned long fd, unsigned long off) | 23 | unsigned long, fd, unsigned long, off) |
24 | { | 24 | { |
25 | long error; | 25 | long error; |
26 | struct file *file; | 26 | struct file *file; |
@@ -226,7 +226,7 @@ bottomup: | |||
226 | } | 226 | } |
227 | 227 | ||
228 | 228 | ||
229 | asmlinkage long sys_uname(struct new_utsname __user *name) | 229 | SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) |
230 | { | 230 | { |
231 | int err; | 231 | int err; |
232 | down_read(&uts_sem); | 232 | down_read(&uts_sem); |
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafd..0157cd26d7c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S | |||
@@ -335,4 +335,4 @@ ENTRY(sys_call_table) | |||
335 | .long sys_preadv | 335 | .long sys_preadv |
336 | .long sys_pwritev | 336 | .long sys_pwritev |
337 | .long sys_rt_tgsigqueueinfo /* 335 */ | 337 | .long sys_rt_tgsigqueueinfo /* 335 */ |
338 | .long sys_perf_counter_open | 338 | .long sys_perf_event_open |
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..86c9f91b48a --- /dev/null +++ b/arch/x86/kernel/tboot.c | |||
@@ -0,0 +1,447 @@ | |||
1 | /* | ||
2 | * tboot.c: main implementation of helper functions used by kernel for | ||
3 | * runtime support of Intel(R) Trusted Execution Technology | ||
4 | * | ||
5 | * Copyright (c) 2006-2009, Intel Corporation | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms and conditions of the GNU General Public License, | ||
9 | * version 2, as published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License along with | ||
17 | * this program; if not, write to the Free Software Foundation, Inc., | ||
18 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/dma_remapping.h> | ||
23 | #include <linux/init_task.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/delay.h> | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/dmar.h> | ||
29 | #include <linux/cpu.h> | ||
30 | #include <linux/pfn.h> | ||
31 | #include <linux/mm.h> | ||
32 | #include <linux/tboot.h> | ||
33 | |||
34 | #include <asm/trampoline.h> | ||
35 | #include <asm/processor.h> | ||
36 | #include <asm/bootparam.h> | ||
37 | #include <asm/pgtable.h> | ||
38 | #include <asm/pgalloc.h> | ||
39 | #include <asm/fixmap.h> | ||
40 | #include <asm/proto.h> | ||
41 | #include <asm/setup.h> | ||
42 | #include <asm/e820.h> | ||
43 | #include <asm/io.h> | ||
44 | |||
45 | #include "acpi/realmode/wakeup.h" | ||
46 | |||
47 | /* Global pointer to shared data; NULL means no measured launch. */ | ||
48 | struct tboot *tboot __read_mostly; | ||
49 | |||
50 | /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ | ||
51 | #define AP_WAIT_TIMEOUT 1 | ||
52 | |||
53 | #undef pr_fmt | ||
54 | #define pr_fmt(fmt) "tboot: " fmt | ||
55 | |||
56 | static u8 tboot_uuid[16] __initdata = TBOOT_UUID; | ||
57 | |||
/*
 * tboot_probe(): locate and validate the tboot shared page.
 *
 * Takes the physical address from boot_params.tboot_addr, maps it at
 * FIX_TBOOT_BASE and points the global 'tboot' at the mapping.  'tboot' is
 * left non-NULL only when the page carries the expected UUID and a version
 * of at least 5; every rejection path leaves it NULL.
 */
void __init tboot_probe(void)
{
	/* Look for valid page-aligned address for shared page. */
	/* NOTE(review): only non-zero is tested here; alignment is not checked. */
	if (!boot_params.tboot_addr)
		return;
	/*
	 * also verify that it is mapped as we expect it before calling
	 * set_fixmap(), to reduce chance of garbage value causing crash
	 */
	/*
	 * NOTE(review): start and end of the queried range are the same
	 * address - confirm e820_any_mapped() treats that as intended.
	 */
	if (!e820_any_mapped(boot_params.tboot_addr,
			     boot_params.tboot_addr, E820_RESERVED)) {
		pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
		return;
	}

	/* only a natively booted kernel should be using TXT */
	if (paravirt_enabled()) {
		pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
		return;
	}

	/* Map and check for tboot UUID. */
	set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
	tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
	if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
		pr_warning("tboot at 0x%llx is invalid\n",
			   boot_params.tboot_addr);
		tboot = NULL;
		return;
	}
	/* Versions below 5 are rejected. */
	if (tboot->version < 5) {
		pr_warning("tboot version is invalid: %u\n", tboot->version);
		tboot = NULL;
		return;
	}

	pr_info("found shared page at phys addr 0x%llx:\n",
		boot_params.tboot_addr);
	pr_debug("version: %d\n", tboot->version);
	pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
	pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
	pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
	pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
}
102 | |||
/*
 * Page directory loaded into CR3 by switch_to_tboot_pt(); it stays NULL
 * until map_tboot_pages() allocates it.
 */
static pgd_t *tboot_pg_dir;
/*
 * mm_struct handed to the page-table helpers when building the tboot
 * mapping.  Its pgd starts out as swapper_pg_dir, and the lock/list
 * initializers are given init_mm's member names.
 */
static struct mm_struct tboot_mm = {
	.mm_rb		= RB_ROOT,
	.pgd		= swapper_pg_dir,
	.mm_users	= ATOMIC_INIT(2),
	.mm_count	= ATOMIC_INIT(1),
	.mmap_sem	= __RWSEM_INITIALIZER(init_mm.mmap_sem),
	.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
	.cpu_vm_mask	= CPU_MASK_ALL,
};
114 | |||
115 | static inline void switch_to_tboot_pt(void) | ||
116 | { | ||
117 | write_cr3(virt_to_phys(tboot_pg_dir)); | ||
118 | } | ||
119 | |||
/*
 * map_tboot_page(): install a single PTE mapping @vaddr to @pfn.
 *
 * @vaddr: virtual address to map
 * @pfn:   page frame number to map it to
 * @prot:  protection bits for the new PTE
 *
 * Walks (allocating where needed) the page-table levels reached through
 * tboot_mm.  Returns 0 on success, or -1 when the pud, pmd or pte level
 * cannot be allocated.
 */
static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
			  pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/*
	 * NOTE(review): the walk starts from tboot_mm.pgd, which is statically
	 * initialised to swapper_pg_dir, not from tboot_pg_dir - confirm this
	 * is the intended directory.
	 */
	pgd = pgd_offset(&tboot_mm, vaddr);
	pud = pud_alloc(&tboot_mm, pgd, vaddr);
	if (!pud)
		return -1;
	pmd = pmd_alloc(&tboot_mm, pud, vaddr);
	if (!pmd)
		return -1;
	pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
	if (!pte)
		return -1;
	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
	pte_unmap(pte);
	return 0;
}
142 | |||
143 | static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, | ||
144 | unsigned long nr) | ||
145 | { | ||
146 | /* Reuse the original kernel mapping */ | ||
147 | tboot_pg_dir = pgd_alloc(&tboot_mm); | ||
148 | if (!tboot_pg_dir) | ||
149 | return -1; | ||
150 | |||
151 | for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { | ||
152 | if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) | ||
153 | return -1; | ||
154 | } | ||
155 | |||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static void tboot_create_trampoline(void) | ||
160 | { | ||
161 | u32 map_base, map_size; | ||
162 | |||
163 | /* Create identity map for tboot shutdown code. */ | ||
164 | map_base = PFN_DOWN(tboot->tboot_base); | ||
165 | map_size = PFN_UP(tboot->tboot_size); | ||
166 | if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) | ||
167 | panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", | ||
168 | map_base, map_size); | ||
169 | } | ||
170 | |||
171 | #ifdef CONFIG_ACPI_SLEEP | ||
172 | |||
/*
 * add_mac_region(): append one region to the MAC list in the shared page.
 *
 * @start: physical start address of the region
 * @size:  length of the region in bytes
 *
 * The stored region is widened to page boundaries: start is rounded down
 * and the end is rounded up.  Nothing is recorded when @start or @size is
 * zero.
 */
static void add_mac_region(phys_addr_t start, unsigned long size)
{
	struct tboot_mac_region *mr;
	phys_addr_t end = start + size;

	if (start && size) {
		/*
		 * NOTE(review): num_mac_regions is incremented without being
		 * compared to the capacity of mac_regions[] - verify the
		 * callers can never exceed it.
		 */
		mr = &tboot->mac_regions[tboot->num_mac_regions++];
		mr->start = round_down(start, PAGE_SIZE);
		mr->size = round_up(end, PAGE_SIZE) - mr->start;
	}
}
184 | |||
/*
 * tboot_setup_sleep(): fill in the shared-page data needed for S3.
 *
 * Rebuilds the MAC region list from scratch (wakeup code, the AP trampoline
 * when CONFIG_X86_TRAMPOLINE is set, and the kernel image from _text to
 * _end) and records acpi_wakeup_address as the kernel's S3 resume vector.
 *
 * Always returns 0.
 */
static int tboot_setup_sleep(void)
{
	tboot->num_mac_regions = 0;

	/* S3 resume code */
	add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);

#ifdef CONFIG_X86_TRAMPOLINE
	/* AP trampoline code */
	add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
#endif

	/* kernel code + data + bss */
	add_mac_region(virt_to_phys(_text), _end - _text);

	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;

	return 0;
}
204 | |||
205 | #else /* no CONFIG_ACPI_SLEEP */ | ||
206 | |||
/*
 * Variant built without CONFIG_ACPI_SLEEP: reaching it is treated as a
 * kernel bug.  The return statement after BUG() only satisfies the
 * function's int return type.
 */
static int tboot_setup_sleep(void)
{
	/* S3 shutdown requested, but S3 not supported by the kernel... */
	BUG();
	return -1;
}
213 | |||
214 | #endif | ||
215 | |||
/*
 * tboot_shutdown(): hand control to tboot's shutdown entry point.
 *
 * @shutdown_type: one of the TB_SHUTDOWN_* values; it is stored in the
 *                 shared page for tboot to read.
 *
 * Returns to the caller (so the normal shutdown path can continue) when
 * tboot is not enabled, when the tboot page directory has not been set up
 * yet, or when the S3 preparation fails.  Otherwise it switches to the
 * tboot page tables and jumps to tboot->shutdown_entry, which is not
 * expected to return.
 */
void tboot_shutdown(u32 shutdown_type)
{
	void (*shutdown)(void);

	if (!tboot_enabled())
		return;

	/*
	 * if we're being called before the 1:1 mapping is set up then just
	 * return and let the normal shutdown happen; this should only be
	 * due to very early panic()
	 */
	if (!tboot_pg_dir)
		return;

	/* if this is S3 then set regions to MAC */
	if (shutdown_type == TB_SHUTDOWN_S3)
		if (tboot_setup_sleep())
			return;

	tboot->shutdown_type = shutdown_type;

	switch_to_tboot_pt();

	/* shutdown_entry is stored as an integer address in the shared page */
	shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
	shutdown();

	/* should not reach here */
	while (1)
		halt();
}
247 | |||
248 | static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) | ||
249 | { | ||
250 | #define TB_COPY_GAS(tbg, g) \ | ||
251 | tbg.space_id = g.space_id; \ | ||
252 | tbg.bit_width = g.bit_width; \ | ||
253 | tbg.bit_offset = g.bit_offset; \ | ||
254 | tbg.access_width = g.access_width; \ | ||
255 | tbg.address = g.address; | ||
256 | |||
257 | TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); | ||
258 | TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); | ||
259 | TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); | ||
260 | TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); | ||
261 | |||
262 | /* | ||
263 | * We need phys addr of waking vector, but can't use virt_to_phys() on | ||
264 | * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys | ||
265 | * addr. | ||
266 | */ | ||
267 | tboot->acpi_sinfo.wakeup_vector = fadt->facs + | ||
268 | offsetof(struct acpi_table_facs, firmware_waking_vector); | ||
269 | } | ||
270 | |||
271 | void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) | ||
272 | { | ||
273 | static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { | ||
274 | /* S0,1,2: */ -1, -1, -1, | ||
275 | /* S3: */ TB_SHUTDOWN_S3, | ||
276 | /* S4: */ TB_SHUTDOWN_S4, | ||
277 | /* S5: */ TB_SHUTDOWN_S5 }; | ||
278 | |||
279 | if (!tboot_enabled()) | ||
280 | return; | ||
281 | |||
282 | tboot_copy_fadt(&acpi_gbl_FADT); | ||
283 | tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; | ||
284 | tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; | ||
285 | /* we always use the 32b wakeup vector */ | ||
286 | tboot->acpi_sinfo.vector_width = 32; | ||
287 | |||
288 | if (sleep_state >= ACPI_S_STATE_COUNT || | ||
289 | acpi_shutdown_map[sleep_state] == -1) { | ||
290 | pr_warning("unsupported sleep state 0x%x\n", sleep_state); | ||
291 | return; | ||
292 | } | ||
293 | |||
294 | tboot_shutdown(acpi_shutdown_map[sleep_state]); | ||
295 | } | ||
296 | |||
297 | static atomic_t ap_wfs_count; | ||
298 | |||
299 | static int tboot_wait_for_aps(int num_aps) | ||
300 | { | ||
301 | unsigned long timeout; | ||
302 | |||
303 | timeout = AP_WAIT_TIMEOUT*HZ; | ||
304 | while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && | ||
305 | timeout) { | ||
306 | mdelay(1); | ||
307 | timeout--; | ||
308 | } | ||
309 | |||
310 | if (timeout) | ||
311 | pr_warning("tboot wait for APs timeout\n"); | ||
312 | |||
313 | return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); | ||
314 | } | ||
315 | |||
316 | static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, | ||
317 | unsigned long action, void *hcpu) | ||
318 | { | ||
319 | switch (action) { | ||
320 | case CPU_DYING: | ||
321 | atomic_inc(&ap_wfs_count); | ||
322 | if (num_online_cpus() == 1) | ||
323 | if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) | ||
324 | return NOTIFY_BAD; | ||
325 | break; | ||
326 | } | ||
327 | return NOTIFY_OK; | ||
328 | } | ||
329 | |||
/* Notifier block that routes CPU events to tboot_cpu_callback(). */
static struct notifier_block tboot_cpu_notifier __cpuinitdata =
{
	.notifier_call = tboot_cpu_callback,
};
334 | |||
335 | static __init int tboot_late_init(void) | ||
336 | { | ||
337 | if (!tboot_enabled()) | ||
338 | return 0; | ||
339 | |||
340 | tboot_create_trampoline(); | ||
341 | |||
342 | atomic_set(&ap_wfs_count, 0); | ||
343 | register_hotcpu_notifier(&tboot_cpu_notifier); | ||
344 | return 0; | ||
345 | } | ||
346 | |||
347 | late_initcall(tboot_late_init); | ||
348 | |||
349 | /* | ||
350 | * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) | ||
351 | */ | ||
352 | |||
353 | #define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 | ||
354 | #define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 | ||
355 | |||
356 | /* # pages for each config regs space - used by fixmap */ | ||
357 | #define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ | ||
358 | TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) | ||
359 | |||
360 | /* offsets from pub/priv config space */ | ||
361 | #define TXTCR_HEAP_BASE 0x0300 | ||
362 | #define TXTCR_HEAP_SIZE 0x0308 | ||
363 | |||
364 | #define SHA1_SIZE 20 | ||
365 | |||
/* Fixed-size container for one SHA-1 digest (SHA1_SIZE bytes). */
struct sha1_hash {
	u8 hash[SHA1_SIZE];
};
369 | |||
/*
 * Packed layout overlaid on the SinitMleData area of the TXT heap by
 * tboot_get_dmar_table().  Only vtd_dmars_off is read in this file; the
 * other members exist to keep the field offsets correct.
 * NOTE(review): field meanings come from the TXT specification and should
 * be checked there.
 */
struct sinit_mle_data {
	u32 version; /* currently 6 */
	struct sha1_hash bios_acm_id;
	u32 edx_senter_flags;
	u64 mseg_valid;
	struct sha1_hash sinit_hash;
	struct sha1_hash mle_hash;
	struct sha1_hash stm_hash;
	struct sha1_hash lcp_policy_hash;
	u32 lcp_policy_control;
	u32 rlp_wakeup_addr;
	u32 reserved;
	u32 num_mdrs;
	u32 mdrs_off;
	u32 num_vtd_dmars;
	u32 vtd_dmars_off; /* offset used to locate the DMAR copy */
} __packed;
387 | |||
/*
 * tboot_get_dmar_table(): pick the DMAR table the caller should use.
 *
 * @dmar_tbl: DMAR table found through ACPI
 *
 * Returns @dmar_tbl unchanged when tboot is not enabled.  Otherwise the TXT
 * public config registers are mapped to find the TXT heap, the heap is
 * mapped, and a pointer to the DMAR copy inside SinitMleData is returned.
 * Returns NULL if either ioremap() fails.  The heap mapping is deliberately
 * left in place.
 */
struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
{
	void *heap_base, *heap_ptr, *config;

	if (!tboot_enabled())
		return dmar_tbl;

	/*
	 * ACPI tables may not be DMA protected by tboot, so use DMAR copy
	 * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
	 */

	/* map config space in order to get heap addr */
	config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
			 PAGE_SIZE);
	if (!config)
		return NULL;

	/* now map TXT heap */
	/*
	 * NOTE(review): the registers are read with plain pointer dereferences
	 * of the ioremap()'d area rather than I/O accessors - confirm this is
	 * acceptable here.
	 */
	heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
			    *(u64 *)(config + TXTCR_HEAP_SIZE));
	iounmap(config);
	if (!heap_base)
		return NULL;

	/* walk heap to SinitMleData */
	/*
	 * Each step adds the u64 found at the current position to the
	 * pointer.  NOTE(review): these values are not checked against the
	 * heap size.
	 */
	/* skip BiosData */
	heap_ptr = heap_base + *(u64 *)heap_base;
	/* skip OsMleData */
	heap_ptr += *(u64 *)heap_ptr;
	/* skip OsSinitData */
	heap_ptr += *(u64 *)heap_ptr;
	/* now points to SinitMleDataSize; set to SinitMleData */
	heap_ptr += sizeof(u64);
	/* get addr of DMAR table */
	/*
	 * NOTE(review): subtracting sizeof(u64) suggests vtd_dmars_off is
	 * measured from the preceding size field - confirm against the TXT
	 * specification.
	 */
	dmar_tbl = (struct acpi_table_header *)(heap_ptr +
		   ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
		   sizeof(u64));

	/* don't unmap heap because dmar.c needs access to this */

	return dmar_tbl;
}
431 | |||
432 | int tboot_force_iommu(void) | ||
433 | { | ||
434 | if (!tboot_enabled()) | ||
435 | return 0; | ||
436 | |||
437 | if (no_iommu || swiotlb || dmar_disabled) | ||
438 | pr_warning("Forcing Intel-IOMMU to enabled\n"); | ||
439 | |||
440 | dmar_disabled = 0; | ||
441 | #ifdef CONFIG_SWIOTLB | ||
442 | swiotlb = 0; | ||
443 | #endif | ||
444 | no_iommu = 0; | ||
445 | |||
446 | return 1; | ||
447 | } | ||
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..dcb00d27851 --- /dev/null +++ b/arch/x86/kernel/time.c | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * Copyright (c) 1991,1992,1995 Linus Torvalds | ||
3 | * Copyright (c) 1994 Alan Modra | ||
4 | * Copyright (c) 1995 Markus Kuhn | ||
5 | * Copyright (c) 1996 Ingo Molnar | ||
6 | * Copyright (c) 1998 Andrea Arcangeli | ||
7 | * Copyright (c) 2002,2006 Vojtech Pavlik | ||
8 | * Copyright (c) 2003 Andi Kleen | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | #include <linux/clockchips.h> | ||
13 | #include <linux/interrupt.h> | ||
14 | #include <linux/time.h> | ||
15 | #include <linux/mca.h> | ||
16 | |||
17 | #include <asm/vsyscall.h> | ||
18 | #include <asm/x86_init.h> | ||
19 | #include <asm/i8259.h> | ||
20 | #include <asm/i8253.h> | ||
21 | #include <asm/timer.h> | ||
22 | #include <asm/hpet.h> | ||
23 | #include <asm/time.h> | ||
24 | |||
25 | #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) | ||
26 | int timer_ack; | ||
27 | #endif | ||
28 | |||
29 | #ifdef CONFIG_X86_64 | ||
30 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | ||
31 | #endif | ||
32 | |||
/*
 * profile_pc(): program counter to attribute a profiling sample to.
 *
 * @regs: register state of the interrupted context
 *
 * Normally this is the interrupted instruction pointer.  When the sample
 * landed inside a lock function while not in user mode, the caller's
 * address is returned instead, so the sample is charged to the code that
 * took the lock.
 */
unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);

	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
		/* the word one long above the frame pointer is returned */
		return *(unsigned long *)(regs->bp + sizeof(long));
#else
		unsigned long *sp = (unsigned long *)regs->sp;
		/*
		 * Return address is either directly at stack pointer
		 * or above a saved flags. Eflags has bits 22-31 zero,
		 * kernel addresses don't.
		 */
		if (sp[0] >> 22)
			return sp[0];
		if (sp[1] >> 22)
			return sp[1];
		/* neither slot qualifies: fall through and return pc */
#endif
	}
	return pc;
}
55 | EXPORT_SYMBOL(profile_pc); | ||
56 | |||
/*
 * Default timer interrupt handler for PIT/HPET
 *
 * Counts the interrupt, performs the manual PIC acknowledge when timer_ack
 * is set, forwards the tick to the handler of global_clock_event and
 * applies the MCA acknowledge quirk.  Always returns IRQ_HANDLED.
 */
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
	/* Keep nmi watchdog up to date */
	inc_irq_stat(irq0_irqs);

	/* Optimized out for !IO_APIC and x86_64 */
	if (timer_ack) {
		/*
		 * Subtle, when I/O APICs are used we have to ack timer IRQ
		 * manually to deassert NMI lines for the watchdog if run
		 * on an 82489DX-based system.
		 */
		spin_lock(&i8259A_lock);
		outb(0x0c, PIC_MASTER_OCW3);
		/* Ack the IRQ; AEOI will end it automatically. */
		inb(PIC_MASTER_POLL);
		spin_unlock(&i8259A_lock);
	}

	/* hand the tick to the registered clock event handler */
	global_clock_event->event_handler(global_clock_event);

	/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
	if (MCA_bus)
		outb_p(inb_p(0x61)| 0x80, 0x61);

	return IRQ_HANDLED;
}
87 | |||
/* irqaction for the timer; installed on IRQ 0 by setup_default_timer_irq(). */
static struct irqaction irq0 = {
	.handler = timer_interrupt,
	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
	.name = "timer"
};
93 | |||
/* Install the default timer irqaction (irq0) on interrupt line 0. */
void __init setup_default_timer_irq(void)
{
	setup_irq(0, &irq0);
}
98 | |||
99 | /* Default timer init function */ | ||
100 | void __init hpet_time_init(void) | ||
101 | { | ||
102 | if (!hpet_enable()) | ||
103 | setup_pit_timer(); | ||
104 | setup_default_timer_irq(); | ||
105 | } | ||
106 | |||
/*
 * Late time setup: run the timer_init hook from x86_init.timers, then
 * initialize the TSC.  time_init() installs this as late_time_init.
 */
static __init void x86_late_time_init(void)
{
	x86_init.timers.timer_init();
	tsc_init();
}
112 | |||
/*
 * Defer both the periodic timer init and the TSC init to
 * x86_late_time_init() so ioremap works; this function itself only
 * installs that hook as late_time_init.
 */
void __init time_init(void)
{
	late_time_init = x86_late_time_init;
}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index 5c5d87f0b2e..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null | |||
@@ -1,137 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992, 1995 Linus Torvalds | ||
3 | * | ||
4 | * This file contains the PC-specific time handling details: | ||
5 | * reading the RTC at bootup, etc.. | ||
6 | * 1994-07-02 Alan Modra | ||
7 | * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime | ||
8 | * 1995-03-26 Markus Kuhn | ||
9 | * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 | ||
10 | * precision CMOS clock update | ||
11 | * 1996-05-03 Ingo Molnar | ||
12 | * fixed time warps in do_[slow|fast]_gettimeoffset() | ||
13 | * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 | ||
14 | * "A Kernel Model for Precision Timekeeping" by Dave Mills | ||
15 | * 1998-09-05 (Various) | ||
16 | * More robust do_fast_gettimeoffset() algorithm implemented | ||
17 | * (works with APM, Cyrix 6x86MX and Centaur C6), | ||
18 | * monotonic gettimeofday() with fast_get_timeoffset(), | ||
19 | * drift-proof precision TSC calibration on boot | ||
20 | * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. | ||
21 | * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; | ||
22 | * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). | ||
23 | * 1998-12-16 Andrea Arcangeli | ||
24 | * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy | ||
25 | * because was not accounting lost_ticks. | ||
26 | * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli | ||
27 | * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to | ||
28 | * serialize accesses to xtime/lost_ticks). | ||
29 | */ | ||
30 | |||
31 | #include <linux/init.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/time.h> | ||
34 | #include <linux/mca.h> | ||
35 | |||
36 | #include <asm/setup.h> | ||
37 | #include <asm/hpet.h> | ||
38 | #include <asm/time.h> | ||
39 | #include <asm/timer.h> | ||
40 | |||
41 | #include <asm/do_timer.h> | ||
42 | |||
43 | int timer_ack; | ||
44 | |||
45 | unsigned long profile_pc(struct pt_regs *regs) | ||
46 | { | ||
47 | unsigned long pc = instruction_pointer(regs); | ||
48 | |||
49 | #ifdef CONFIG_SMP | ||
50 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { | ||
51 | #ifdef CONFIG_FRAME_POINTER | ||
52 | return *(unsigned long *)(regs->bp + sizeof(long)); | ||
53 | #else | ||
54 | unsigned long *sp = (unsigned long *)&regs->sp; | ||
55 | |||
56 | /* Return address is either directly at stack pointer | ||
57 | or above a saved flags. Eflags has bits 22-31 zero, | ||
58 | kernel addresses don't. */ | ||
59 | if (sp[0] >> 22) | ||
60 | return sp[0]; | ||
61 | if (sp[1] >> 22) | ||
62 | return sp[1]; | ||
63 | #endif | ||
64 | } | ||
65 | #endif | ||
66 | return pc; | ||
67 | } | ||
68 | EXPORT_SYMBOL(profile_pc); | ||
69 | |||
70 | /* | ||
71 | * This is the same as the above, except we _also_ save the current | ||
72 | * Time Stamp Counter value at the time of the timer interrupt, so that | ||
73 | * we later on can estimate the time of day more exactly. | ||
74 | */ | ||
75 | irqreturn_t timer_interrupt(int irq, void *dev_id) | ||
76 | { | ||
77 | /* Keep nmi watchdog up to date */ | ||
78 | inc_irq_stat(irq0_irqs); | ||
79 | |||
80 | #ifdef CONFIG_X86_IO_APIC | ||
81 | if (timer_ack) { | ||
82 | /* | ||
83 | * Subtle, when I/O APICs are used we have to ack timer IRQ | ||
84 | * manually to deassert NMI lines for the watchdog if run | ||
85 | * on an 82489DX-based system. | ||
86 | */ | ||
87 | spin_lock(&i8259A_lock); | ||
88 | outb(0x0c, PIC_MASTER_OCW3); | ||
89 | /* Ack the IRQ; AEOI will end it automatically. */ | ||
90 | inb(PIC_MASTER_POLL); | ||
91 | spin_unlock(&i8259A_lock); | ||
92 | } | ||
93 | #endif | ||
94 | |||
95 | do_timer_interrupt_hook(); | ||
96 | |||
97 | #ifdef CONFIG_MCA | ||
98 | if (MCA_bus) { | ||
99 | /* The PS/2 uses level-triggered interrupts. You can't | ||
100 | turn them off, nor would you want to (any attempt to | ||
101 | enable edge-triggered interrupts usually gets intercepted by a | ||
102 | special hardware circuit). Hence we have to acknowledge | ||
103 | the timer interrupt. Through some incredibly stupid | ||
104 | design idea, the reset for IRQ 0 is done by setting the | ||
105 | high bit of the PPI port B (0x61). Note that some PS/2s, | ||
106 | notably the 55SX, work fine if this is removed. */ | ||
107 | |||
108 | u8 irq_v = inb_p(0x61); /* read the current state */ | ||
109 | outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ | ||
110 | } | ||
111 | #endif | ||
112 | |||
113 | return IRQ_HANDLED; | ||
114 | } | ||
115 | |||
116 | /* Duplicate of time_init() below, with hpet_enable part added */ | ||
117 | void __init hpet_time_init(void) | ||
118 | { | ||
119 | if (!hpet_enable()) | ||
120 | setup_pit_timer(); | ||
121 | x86_quirk_time_init(); | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * This is called directly from init code; we must delay timer setup in the | ||
126 | * HPET case as we can't make the decision to turn on HPET this early in the | ||
127 | * boot process. | ||
128 | * | ||
129 | * The chosen time_init function will usually be hpet_time_init, above, but | ||
130 | * in the case of virtual hardware, an alternative function may be substituted. | ||
131 | */ | ||
132 | void __init time_init(void) | ||
133 | { | ||
134 | x86_quirk_pre_time_init(); | ||
135 | tsc_init(); | ||
136 | late_time_init = choose_time_init(); | ||
137 | } | ||
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index 5ba343e6184..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null | |||
@@ -1,135 +0,0 @@ | |||
1 | /* | ||
2 | * "High Precision Event Timer" based timekeeping. | ||
3 | * | ||
4 | * Copyright (c) 1991,1992,1995 Linus Torvalds | ||
5 | * Copyright (c) 1994 Alan Modra | ||
6 | * Copyright (c) 1995 Markus Kuhn | ||
7 | * Copyright (c) 1996 Ingo Molnar | ||
8 | * Copyright (c) 1998 Andrea Arcangeli | ||
9 | * Copyright (c) 2002,2006 Vojtech Pavlik | ||
10 | * Copyright (c) 2003 Andi Kleen | ||
11 | * RTC support code taken from arch/i386/kernel/timers/time_hpet.c | ||
12 | */ | ||
13 | |||
14 | #include <linux/clockchips.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/interrupt.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/time.h> | ||
19 | #include <linux/mca.h> | ||
20 | #include <linux/nmi.h> | ||
21 | |||
22 | #include <asm/i8253.h> | ||
23 | #include <asm/hpet.h> | ||
24 | #include <asm/vgtod.h> | ||
25 | #include <asm/time.h> | ||
26 | #include <asm/timer.h> | ||
27 | |||
28 | volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; | ||
29 | |||
30 | unsigned long profile_pc(struct pt_regs *regs) | ||
31 | { | ||
32 | unsigned long pc = instruction_pointer(regs); | ||
33 | |||
34 | /* Assume the lock function has either no stack frame or a copy | ||
35 | of flags from PUSHF | ||
36 | Eflags always has bits 22 and up cleared unlike kernel addresses. */ | ||
37 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { | ||
38 | #ifdef CONFIG_FRAME_POINTER | ||
39 | return *(unsigned long *)(regs->bp + sizeof(long)); | ||
40 | #else | ||
41 | unsigned long *sp = (unsigned long *)regs->sp; | ||
42 | if (sp[0] >> 22) | ||
43 | return sp[0]; | ||
44 | if (sp[1] >> 22) | ||
45 | return sp[1]; | ||
46 | #endif | ||
47 | } | ||
48 | return pc; | ||
49 | } | ||
50 | EXPORT_SYMBOL(profile_pc); | ||
51 | |||
52 | static irqreturn_t timer_interrupt(int irq, void *dev_id) | ||
53 | { | ||
54 | inc_irq_stat(irq0_irqs); | ||
55 | |||
56 | global_clock_event->event_handler(global_clock_event); | ||
57 | |||
58 | #ifdef CONFIG_MCA | ||
59 | if (MCA_bus) { | ||
60 | u8 irq_v = inb_p(0x61); /* read the current state */ | ||
61 | outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ | ||
62 | } | ||
63 | #endif | ||
64 | |||
65 | return IRQ_HANDLED; | ||
66 | } | ||
67 | |||
68 | /* calibrate_cpu is used on systems with fixed rate TSCs to determine | ||
69 | * processor frequency */ | ||
70 | #define TICK_COUNT 100000000 | ||
71 | unsigned long __init calibrate_cpu(void) | ||
72 | { | ||
73 | int tsc_start, tsc_now; | ||
74 | int i, no_ctr_free; | ||
75 | unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; | ||
76 | unsigned long flags; | ||
77 | |||
78 | for (i = 0; i < 4; i++) | ||
79 | if (avail_to_resrv_perfctr_nmi_bit(i)) | ||
80 | break; | ||
81 | no_ctr_free = (i == 4); | ||
82 | if (no_ctr_free) { | ||
83 | WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " | ||
84 | "cpu_khz value may be incorrect.\n"); | ||
85 | i = 3; | ||
86 | rdmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
87 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
88 | rdmsrl(MSR_K7_PERFCTR3, pmc3); | ||
89 | } else { | ||
90 | reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
91 | reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
92 | } | ||
93 | local_irq_save(flags); | ||
94 | /* start measuring cycles, incrementing from 0 */ | ||
95 | wrmsrl(MSR_K7_PERFCTR0 + i, 0); | ||
96 | wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); | ||
97 | rdtscl(tsc_start); | ||
98 | do { | ||
99 | rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); | ||
100 | tsc_now = get_cycles(); | ||
101 | } while ((tsc_now - tsc_start) < TICK_COUNT); | ||
102 | |||
103 | local_irq_restore(flags); | ||
104 | if (no_ctr_free) { | ||
105 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
106 | wrmsrl(MSR_K7_PERFCTR3, pmc3); | ||
107 | wrmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
108 | } else { | ||
109 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
110 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
111 | } | ||
112 | |||
113 | return pmc_now * tsc_khz / (tsc_now - tsc_start); | ||
114 | } | ||
115 | |||
116 | static struct irqaction irq0 = { | ||
117 | .handler = timer_interrupt, | ||
118 | .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, | ||
119 | .name = "timer" | ||
120 | }; | ||
121 | |||
122 | void __init hpet_time_init(void) | ||
123 | { | ||
124 | if (!hpet_enable()) | ||
125 | setup_pit_timer(); | ||
126 | |||
127 | setup_irq(0, &irq0); | ||
128 | } | ||
129 | |||
130 | void __init time_init(void) | ||
131 | { | ||
132 | tsc_init(); | ||
133 | |||
134 | late_time_init = choose_time_init(); | ||
135 | } | ||
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8ccabb8a2f6..503c1f2e883 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c | |||
@@ -640,13 +640,13 @@ static int __init uv_ptc_init(void) | |||
640 | if (!is_uv_system()) | 640 | if (!is_uv_system()) |
641 | return 0; | 641 | return 0; |
642 | 642 | ||
643 | proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); | 643 | proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, |
644 | &proc_uv_ptc_operations); | ||
644 | if (!proc_uv_ptc) { | 645 | if (!proc_uv_ptc) { |
645 | printk(KERN_ERR "unable to create %s proc entry\n", | 646 | printk(KERN_ERR "unable to create %s proc entry\n", |
646 | UV_PTC_BASENAME); | 647 | UV_PTC_BASENAME); |
647 | return -EINVAL; | 648 | return -EINVAL; |
648 | } | 649 | } |
649 | proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; | ||
650 | return 0; | 650 | return 0; |
651 | } | 651 | } |
652 | 652 | ||
@@ -744,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode) | |||
744 | * note that base_dest_nodeid is actually a nasid. | 744 | * note that base_dest_nodeid is actually a nasid. |
745 | */ | 745 | */ |
746 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; | 746 | ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; |
747 | ad2->header.dest_subnodeid = 0x10; /* the LB */ | ||
747 | ad2->header.command = UV_NET_ENDPOINT_INTD; | 748 | ad2->header.command = UV_NET_ENDPOINT_INTD; |
748 | ad2->header.int_both = 1; | 749 | ad2->header.int_both = 1; |
749 | /* | 750 | /* |
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 808031a5ba1..699f7eeb896 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c | |||
@@ -4,7 +4,7 @@ | |||
4 | #include <asm/e820.h> | 4 | #include <asm/e820.h> |
5 | 5 | ||
6 | /* ready for x86_64 and x86 */ | 6 | /* ready for x86_64 and x86 */ |
7 | unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); | 7 | unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); |
8 | 8 | ||
9 | void __init reserve_trampoline_memory(void) | 9 | void __init reserve_trampoline_memory(void) |
10 | { | 10 | { |
@@ -26,7 +26,7 @@ void __init reserve_trampoline_memory(void) | |||
26 | * bootstrap into the page concerned. The caller | 26 | * bootstrap into the page concerned. The caller |
27 | * has made sure it's suitably aligned. | 27 | * has made sure it's suitably aligned. |
28 | */ | 28 | */ |
29 | unsigned long setup_trampoline(void) | 29 | unsigned long __cpuinit setup_trampoline(void) |
30 | { | 30 | { |
31 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); | 31 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); |
32 | return virt_to_phys(trampoline_base); | 32 | return virt_to_phys(trampoline_base); |
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 66d874e5404..8508237e8e4 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S | |||
@@ -28,16 +28,12 @@ | |||
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/linkage.h> | 30 | #include <linux/linkage.h> |
31 | #include <linux/init.h> | ||
31 | #include <asm/segment.h> | 32 | #include <asm/segment.h> |
32 | #include <asm/page_types.h> | 33 | #include <asm/page_types.h> |
33 | 34 | ||
34 | /* We can free up trampoline after bootup if cpu hotplug is not supported. */ | 35 | /* We can free up trampoline after bootup if cpu hotplug is not supported. */ |
35 | #ifndef CONFIG_HOTPLUG_CPU | 36 | __CPUINITRODATA |
36 | .section ".cpuinit.data","aw",@progbits | ||
37 | #else | ||
38 | .section .rodata,"a",@progbits | ||
39 | #endif | ||
40 | |||
41 | .code16 | 37 | .code16 |
42 | 38 | ||
43 | ENTRY(trampoline_data) | 39 | ENTRY(trampoline_data) |
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index cddfb8d386b..596d54c660a 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S | |||
@@ -25,14 +25,15 @@ | |||
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/linkage.h> | 27 | #include <linux/linkage.h> |
28 | #include <linux/init.h> | ||
28 | #include <asm/pgtable_types.h> | 29 | #include <asm/pgtable_types.h> |
29 | #include <asm/page_types.h> | 30 | #include <asm/page_types.h> |
30 | #include <asm/msr.h> | 31 | #include <asm/msr.h> |
31 | #include <asm/segment.h> | 32 | #include <asm/segment.h> |
32 | #include <asm/processor-flags.h> | 33 | #include <asm/processor-flags.h> |
33 | 34 | ||
34 | .section .rodata, "a", @progbits | 35 | /* We can free up the trampoline after bootup if cpu hotplug is not supported. */ |
35 | 36 | __CPUINITRODATA | |
36 | .code16 | 37 | .code16 |
37 | 38 | ||
38 | ENTRY(trampoline_data) | 39 | ENTRY(trampoline_data) |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475..7e37dcee0cc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
15 | #include <linux/kprobes.h> | 15 | #include <linux/kprobes.h> |
16 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | #include <linux/utsname.h> | ||
18 | #include <linux/kdebug.h> | 17 | #include <linux/kdebug.h> |
19 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
20 | #include <linux/module.h> | 19 | #include <linux/module.h> |
@@ -59,12 +58,12 @@ | |||
59 | #include <asm/mach_traps.h> | 58 | #include <asm/mach_traps.h> |
60 | 59 | ||
61 | #ifdef CONFIG_X86_64 | 60 | #ifdef CONFIG_X86_64 |
61 | #include <asm/x86_init.h> | ||
62 | #include <asm/pgalloc.h> | 62 | #include <asm/pgalloc.h> |
63 | #include <asm/proto.h> | 63 | #include <asm/proto.h> |
64 | #else | 64 | #else |
65 | #include <asm/processor-flags.h> | 65 | #include <asm/processor-flags.h> |
66 | #include <asm/setup.h> | 66 | #include <asm/setup.h> |
67 | #include <asm/traps.h> | ||
68 | 67 | ||
69 | asmlinkage int system_call(void); | 68 | asmlinkage int system_call(void); |
70 | 69 | ||
@@ -73,11 +72,9 @@ char ignore_fpu_irq; | |||
73 | 72 | ||
74 | /* | 73 | /* |
75 | * The IDT has to be page-aligned to simplify the Pentium | 74 | * The IDT has to be page-aligned to simplify the Pentium |
76 | * F0 0F bug workaround.. We have a special link segment | 75 | * F0 0F bug workaround. |
77 | * for this. | ||
78 | */ | 76 | */ |
79 | gate_desc idt_table[256] | 77 | gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; |
80 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; | ||
81 | #endif | 78 | #endif |
82 | 79 | ||
83 | DECLARE_BITMAP(used_vectors, NR_VECTORS); | 80 | DECLARE_BITMAP(used_vectors, NR_VECTORS); |
@@ -786,33 +783,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | |||
786 | #endif | 783 | #endif |
787 | } | 784 | } |
788 | 785 | ||
789 | #ifdef CONFIG_X86_32 | 786 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) |
790 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | ||
791 | { | 787 | { |
792 | struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); | ||
793 | unsigned long base = (kesp - uesp) & -THREAD_SIZE; | ||
794 | unsigned long new_kesp = kesp - base; | ||
795 | unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; | ||
796 | __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; | ||
797 | |||
798 | /* Set up base for espfix segment */ | ||
799 | desc &= 0x00f0ff0000000000ULL; | ||
800 | desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | | ||
801 | ((((__u64)base) << 32) & 0xff00000000000000ULL) | | ||
802 | ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | | ||
803 | (lim_pages & 0xffff); | ||
804 | *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; | ||
805 | |||
806 | return new_kesp; | ||
807 | } | 788 | } |
808 | #endif | ||
809 | 789 | ||
810 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | 790 | asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) |
811 | { | 791 | { |
812 | } | 792 | } |
813 | 793 | ||
814 | asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) | 794 | /* |
795 | * __math_state_restore assumes that cr0.TS is already clear and the | ||
796 | * fpu state is all ready for use. Used during context switch. | ||
797 | */ | ||
798 | void __math_state_restore(void) | ||
815 | { | 799 | { |
800 | struct thread_info *thread = current_thread_info(); | ||
801 | struct task_struct *tsk = thread->task; | ||
802 | |||
803 | /* | ||
804 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
805 | */ | ||
806 | if (unlikely(restore_fpu_checking(tsk))) { | ||
807 | stts(); | ||
808 | force_sig(SIGSEGV, tsk); | ||
809 | return; | ||
810 | } | ||
811 | |||
812 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | ||
813 | tsk->fpu_counter++; | ||
816 | } | 814 | } |
817 | 815 | ||
818 | /* | 816 | /* |
@@ -846,17 +844,8 @@ asmlinkage void math_state_restore(void) | |||
846 | } | 844 | } |
847 | 845 | ||
848 | clts(); /* Allow maths ops (or we recurse) */ | 846 | clts(); /* Allow maths ops (or we recurse) */ |
849 | /* | ||
850 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
851 | */ | ||
852 | if (unlikely(restore_fpu_checking(tsk))) { | ||
853 | stts(); | ||
854 | force_sig(SIGSEGV, tsk); | ||
855 | return; | ||
856 | } | ||
857 | 847 | ||
858 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | 848 | __math_state_restore(); |
859 | tsk->fpu_counter++; | ||
860 | } | 849 | } |
861 | EXPORT_SYMBOL_GPL(math_state_restore); | 850 | EXPORT_SYMBOL_GPL(math_state_restore); |
862 | 851 | ||
@@ -980,7 +969,5 @@ void __init trap_init(void) | |||
980 | */ | 969 | */ |
981 | cpu_init(); | 970 | cpu_init(); |
982 | 971 | ||
983 | #ifdef CONFIG_X86_32 | 972 | x86_init.irqs.trap_init(); |
984 | x86_quirk_trap_init(); | ||
985 | #endif | ||
986 | } | 973 | } |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc3672a303d..cd982f48e23 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <asm/time.h> | 17 | #include <asm/time.h> |
18 | #include <asm/delay.h> | 18 | #include <asm/delay.h> |
19 | #include <asm/hypervisor.h> | 19 | #include <asm/hypervisor.h> |
20 | #include <asm/nmi.h> | ||
21 | #include <asm/x86_init.h> | ||
20 | 22 | ||
21 | unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ | 23 | unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ |
22 | EXPORT_SYMBOL(cpu_khz); | 24 | EXPORT_SYMBOL(cpu_khz); |
@@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void) | |||
400 | { | 402 | { |
401 | u64 tsc1, tsc2, delta, ref1, ref2; | 403 | u64 tsc1, tsc2, delta, ref1, ref2; |
402 | unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; | 404 | unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; |
403 | unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; | 405 | unsigned long flags, latch, ms, fast_calibrate; |
404 | int hpet = is_hpet_enabled(), i, loopmin; | 406 | int hpet = is_hpet_enabled(), i, loopmin; |
405 | 407 | ||
406 | hv_tsc_khz = get_hypervisor_tsc_freq(); | ||
407 | if (hv_tsc_khz) { | ||
408 | printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); | ||
409 | return hv_tsc_khz; | ||
410 | } | ||
411 | |||
412 | local_irq_save(flags); | 408 | local_irq_save(flags); |
413 | fast_calibrate = quick_pit_calibrate(); | 409 | fast_calibrate = quick_pit_calibrate(); |
414 | local_irq_restore(flags); | 410 | local_irq_restore(flags); |
@@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void) | |||
566 | unsigned long cpu_khz_old = cpu_khz; | 562 | unsigned long cpu_khz_old = cpu_khz; |
567 | 563 | ||
568 | if (cpu_has_tsc) { | 564 | if (cpu_has_tsc) { |
569 | tsc_khz = calibrate_tsc(); | 565 | tsc_khz = x86_platform.calibrate_tsc(); |
570 | cpu_khz = tsc_khz; | 566 | cpu_khz = tsc_khz; |
571 | cpu_data(0).loops_per_jiffy = | 567 | cpu_data(0).loops_per_jiffy = |
572 | cpufreq_scale(cpu_data(0).loops_per_jiffy, | 568 | cpufreq_scale(cpu_data(0).loops_per_jiffy, |
@@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, | |||
670 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || | 666 | if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || |
671 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || | 667 | (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || |
672 | (val == CPUFREQ_RESUMECHANGE)) { | 668 | (val == CPUFREQ_RESUMECHANGE)) { |
673 | *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); | 669 | *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); |
674 | 670 | ||
675 | tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); | 671 | tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); |
676 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) | 672 | if (!(freq->flags & CPUFREQ_CONST_LOOPS)) |
@@ -860,15 +856,71 @@ static void __init init_tsc_clocksource(void) | |||
860 | clocksource_register(&clocksource_tsc); | 856 | clocksource_register(&clocksource_tsc); |
861 | } | 857 | } |
862 | 858 | ||
859 | #ifdef CONFIG_X86_64 | ||
860 | /* | ||
861 | * calibrate_cpu is used on systems with fixed rate TSCs to determine | ||
862 | * processor frequency | ||
863 | */ | ||
864 | #define TICK_COUNT 100000000 | ||
865 | static unsigned long __init calibrate_cpu(void) | ||
866 | { | ||
867 | int tsc_start, tsc_now; | ||
868 | int i, no_ctr_free; | ||
869 | unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; | ||
870 | unsigned long flags; | ||
871 | |||
872 | for (i = 0; i < 4; i++) | ||
873 | if (avail_to_resrv_perfctr_nmi_bit(i)) | ||
874 | break; | ||
875 | no_ctr_free = (i == 4); | ||
876 | if (no_ctr_free) { | ||
877 | WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " | ||
878 | "cpu_khz value may be incorrect.\n"); | ||
879 | i = 3; | ||
880 | rdmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
881 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
882 | rdmsrl(MSR_K7_PERFCTR3, pmc3); | ||
883 | } else { | ||
884 | reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
885 | reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
886 | } | ||
887 | local_irq_save(flags); | ||
888 | /* start measuring cycles, incrementing from 0 */ | ||
889 | wrmsrl(MSR_K7_PERFCTR0 + i, 0); | ||
890 | wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); | ||
891 | rdtscl(tsc_start); | ||
892 | do { | ||
893 | rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); | ||
894 | tsc_now = get_cycles(); | ||
895 | } while ((tsc_now - tsc_start) < TICK_COUNT); | ||
896 | |||
897 | local_irq_restore(flags); | ||
898 | if (no_ctr_free) { | ||
899 | wrmsrl(MSR_K7_EVNTSEL3, 0); | ||
900 | wrmsrl(MSR_K7_PERFCTR3, pmc3); | ||
901 | wrmsrl(MSR_K7_EVNTSEL3, evntsel3); | ||
902 | } else { | ||
903 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
904 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
905 | } | ||
906 | |||
907 | return pmc_now * tsc_khz / (tsc_now - tsc_start); | ||
908 | } | ||
909 | #else | ||
910 | static inline unsigned long calibrate_cpu(void) { return cpu_khz; } | ||
911 | #endif | ||
912 | |||
863 | void __init tsc_init(void) | 913 | void __init tsc_init(void) |
864 | { | 914 | { |
865 | u64 lpj; | 915 | u64 lpj; |
866 | int cpu; | 916 | int cpu; |
867 | 917 | ||
918 | x86_init.timers.tsc_pre_init(); | ||
919 | |||
868 | if (!cpu_has_tsc) | 920 | if (!cpu_has_tsc) |
869 | return; | 921 | return; |
870 | 922 | ||
871 | tsc_khz = calibrate_tsc(); | 923 | tsc_khz = x86_platform.calibrate_tsc(); |
872 | cpu_khz = tsc_khz; | 924 | cpu_khz = tsc_khz; |
873 | 925 | ||
874 | if (!tsc_khz) { | 926 | if (!tsc_khz) { |
@@ -876,11 +928,9 @@ void __init tsc_init(void) | |||
876 | return; | 928 | return; |
877 | } | 929 | } |
878 | 930 | ||
879 | #ifdef CONFIG_X86_64 | ||
880 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && | 931 | if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && |
881 | (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) | 932 | (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) |
882 | cpu_khz = calibrate_cpu(); | 933 | cpu_khz = calibrate_cpu(); |
883 | #endif | ||
884 | 934 | ||
885 | printk("Detected %lu.%03lu MHz processor.\n", | 935 | printk("Detected %lu.%03lu MHz processor.\n", |
886 | (unsigned long)cpu_khz / 1000, | 936 | (unsigned long)cpu_khz / 1000, |
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b49899..f37930954d1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c | |||
@@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu) | |||
114 | return; | 114 | return; |
115 | 115 | ||
116 | if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { | 116 | if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { |
117 | pr_info("Skipping synchronization checks as TSC is reliable.\n"); | 117 | printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); |
118 | return; | 118 | return; |
119 | } | 119 | } |
120 | 120 | ||
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..f068553a1b1 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <asm/setup.h> | 30 | #include <asm/setup.h> |
31 | #include <asm/apic.h> | 31 | #include <asm/apic.h> |
32 | #include <asm/e820.h> | 32 | #include <asm/e820.h> |
33 | #include <asm/time.h> | ||
33 | #include <asm/io.h> | 34 | #include <asm/io.h> |
34 | 35 | ||
35 | #include <linux/kernel_stat.h> | 36 | #include <linux/kernel_stat.h> |
@@ -53,7 +54,7 @@ int is_visws_box(void) | |||
53 | return visws_board_type >= 0; | 54 | return visws_board_type >= 0; |
54 | } | 55 | } |
55 | 56 | ||
56 | static int __init visws_time_init(void) | 57 | static void __init visws_time_init(void) |
57 | { | 58 | { |
58 | printk(KERN_INFO "Starting Cobalt Timer system clock\n"); | 59 | printk(KERN_INFO "Starting Cobalt Timer system clock\n"); |
59 | 60 | ||
@@ -66,21 +67,13 @@ static int __init visws_time_init(void) | |||
66 | /* Enable (unmask) the timer interrupt */ | 67 | /* Enable (unmask) the timer interrupt */ |
67 | co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); | 68 | co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); |
68 | 69 | ||
69 | /* | 70 | setup_default_timer_irq(); |
70 | * Zero return means the generic timer setup code will set up | ||
71 | * the standard vector: | ||
72 | */ | ||
73 | return 0; | ||
74 | } | 71 | } |
75 | 72 | ||
76 | static int __init visws_pre_intr_init(void) | 73 | /* Replaces the default init_ISA_irqs in the generic setup */ |
74 | static void __init visws_pre_intr_init(void) | ||
77 | { | 75 | { |
78 | init_VISWS_APIC_irqs(); | 76 | init_VISWS_APIC_irqs(); |
79 | |||
80 | /* | ||
81 | * We dont want ISA irqs to be set up by the generic code: | ||
82 | */ | ||
83 | return 1; | ||
84 | } | 77 | } |
85 | 78 | ||
86 | /* Quirk for machine specific memory setup. */ | 79 | /* Quirk for machine specific memory setup. */ |
@@ -156,12 +149,8 @@ static void visws_machine_power_off(void) | |||
156 | outl(PIIX_SPECIAL_STOP, 0xCFC); | 149 | outl(PIIX_SPECIAL_STOP, 0xCFC); |
157 | } | 150 | } |
158 | 151 | ||
159 | static int __init visws_get_smp_config(unsigned int early) | 152 | static void __init visws_get_smp_config(unsigned int early) |
160 | { | 153 | { |
161 | /* | ||
162 | * Prevent MP-table parsing by the generic code: | ||
163 | */ | ||
164 | return 1; | ||
165 | } | 154 | } |
166 | 155 | ||
167 | /* | 156 | /* |
@@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) | |||
208 | apic_version[m->apicid] = ver; | 197 | apic_version[m->apicid] = ver; |
209 | } | 198 | } |
210 | 199 | ||
211 | static int __init visws_find_smp_config(unsigned int reserve) | 200 | static void __init visws_find_smp_config(unsigned int reserve) |
212 | { | 201 | { |
213 | struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); | 202 | struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); |
214 | unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); | 203 | unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); |
@@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve) | |||
230 | MP_processor_info(mp++); | 219 | MP_processor_info(mp++); |
231 | 220 | ||
232 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | 221 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; |
233 | |||
234 | return 1; | ||
235 | } | 222 | } |
236 | 223 | ||
237 | static int visws_trap_init(void); | 224 | static void visws_trap_init(void); |
238 | |||
239 | static struct x86_quirks visws_x86_quirks __initdata = { | ||
240 | .arch_time_init = visws_time_init, | ||
241 | .arch_pre_intr_init = visws_pre_intr_init, | ||
242 | .arch_memory_setup = visws_memory_setup, | ||
243 | .arch_intr_init = NULL, | ||
244 | .arch_trap_init = visws_trap_init, | ||
245 | .mach_get_smp_config = visws_get_smp_config, | ||
246 | .mach_find_smp_config = visws_find_smp_config, | ||
247 | }; | ||
248 | 225 | ||
249 | void __init visws_early_detect(void) | 226 | void __init visws_early_detect(void) |
250 | { | 227 | { |
@@ -257,11 +234,14 @@ void __init visws_early_detect(void) | |||
257 | return; | 234 | return; |
258 | 235 | ||
259 | /* | 236 | /* |
260 | * Install special quirks for timer, interrupt and memory setup: | 237 | * Override the default platform setup functions |
261 | * Fall back to generic behavior for traps: | ||
262 | * Override generic MP-table parsing: | ||
263 | */ | 238 | */ |
264 | x86_quirks = &visws_x86_quirks; | 239 | x86_init.resources.memory_setup = visws_memory_setup; |
240 | x86_init.mpparse.get_smp_config = visws_get_smp_config; | ||
241 | x86_init.mpparse.find_smp_config = visws_find_smp_config; | ||
242 | x86_init.irqs.pre_vector_init = visws_pre_intr_init; | ||
243 | x86_init.irqs.trap_init = visws_trap_init; | ||
244 | x86_init.timers.timer_init = visws_time_init; | ||
265 | 245 | ||
266 | /* | 246 | /* |
267 | * Install reboot quirks: | 247 | * Install reboot quirks: |
@@ -400,12 +380,10 @@ static __init void cobalt_init(void) | |||
400 | co_apic_read(CO_APIC_ID)); | 380 | co_apic_read(CO_APIC_ID)); |
401 | } | 381 | } |
402 | 382 | ||
403 | static int __init visws_trap_init(void) | 383 | static void __init visws_trap_init(void) |
404 | { | 384 | { |
405 | lithium_init(); | 385 | lithium_init(); |
406 | cobalt_init(); | 386 | cobalt_init(); |
407 | |||
408 | return 1; | ||
409 | } | 387 | } |
410 | 388 | ||
411 | /* | 389 | /* |
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0..31e6f6cfe53 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -817,15 +817,15 @@ static inline int __init activate_vmi(void) | |||
817 | vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); | 817 | vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); |
818 | vmi_timer_ops.cancel_alarm = | 818 | vmi_timer_ops.cancel_alarm = |
819 | vmi_get_function(VMI_CALL_CancelAlarm); | 819 | vmi_get_function(VMI_CALL_CancelAlarm); |
820 | pv_time_ops.time_init = vmi_time_init; | 820 | x86_init.timers.timer_init = vmi_time_init; |
821 | pv_time_ops.get_wallclock = vmi_get_wallclock; | ||
822 | pv_time_ops.set_wallclock = vmi_set_wallclock; | ||
823 | #ifdef CONFIG_X86_LOCAL_APIC | 821 | #ifdef CONFIG_X86_LOCAL_APIC |
824 | pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; | 822 | x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; |
825 | pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; | 823 | x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; |
826 | #endif | 824 | #endif |
827 | pv_time_ops.sched_clock = vmi_sched_clock; | 825 | pv_time_ops.sched_clock = vmi_sched_clock; |
828 | pv_time_ops.get_tsc_khz = vmi_tsc_khz; | 826 | x86_platform.calibrate_tsc = vmi_tsc_khz; |
827 | x86_platform.get_wallclock = vmi_get_wallclock; | ||
828 | x86_platform.set_wallclock = vmi_set_wallclock; | ||
829 | 829 | ||
830 | /* We have true wallclock functions; disable CMOS clock sync */ | 830 | /* We have true wallclock functions; disable CMOS clock sync */ |
831 | no_sync_cmos_clock = 1; | 831 | no_sync_cmos_clock = 1; |
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c | |||
@@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) | |||
68 | return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); | 68 | return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); |
69 | } | 69 | } |
70 | 70 | ||
71 | /* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ | 71 | /* x86_platform.calibrate_tsc = vmi_tsc_khz */ |
72 | unsigned long vmi_tsc_khz(void) | 72 | unsigned long vmi_tsc_khz(void) |
73 | { | 73 | { |
74 | unsigned long long khz; | 74 | unsigned long long khz; |
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 78d185d797d..92929fb3f9f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S | |||
@@ -45,12 +45,11 @@ PHDRS { | |||
45 | text PT_LOAD FLAGS(5); /* R_E */ | 45 | text PT_LOAD FLAGS(5); /* R_E */ |
46 | data PT_LOAD FLAGS(7); /* RWE */ | 46 | data PT_LOAD FLAGS(7); /* RWE */ |
47 | #ifdef CONFIG_X86_64 | 47 | #ifdef CONFIG_X86_64 |
48 | user PT_LOAD FLAGS(7); /* RWE */ | 48 | user PT_LOAD FLAGS(5); /* R_E */ |
49 | data.init PT_LOAD FLAGS(7); /* RWE */ | ||
50 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
51 | percpu PT_LOAD FLAGS(7); /* RWE */ | 50 | percpu PT_LOAD FLAGS(6); /* RW_ */ |
52 | #endif | 51 | #endif |
53 | data.init2 PT_LOAD FLAGS(7); /* RWE */ | 52 | init PT_LOAD FLAGS(7); /* RWE */ |
54 | #endif | 53 | #endif |
55 | note PT_NOTE FLAGS(0); /* ___ */ | 54 | note PT_NOTE FLAGS(0); /* ___ */ |
56 | } | 55 | } |
@@ -66,17 +65,11 @@ SECTIONS | |||
66 | #endif | 65 | #endif |
67 | 66 | ||
68 | /* Text and read-only data */ | 67 | /* Text and read-only data */ |
69 | |||
70 | /* bootstrapping code */ | ||
71 | .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { | ||
72 | _text = .; | ||
73 | *(.text.head) | ||
74 | } :text = 0x9090 | ||
75 | |||
76 | /* The rest of the text */ | ||
77 | .text : AT(ADDR(.text) - LOAD_OFFSET) { | 68 | .text : AT(ADDR(.text) - LOAD_OFFSET) { |
69 | _text = .; | ||
70 | /* bootstrapping code */ | ||
71 | HEAD_TEXT | ||
78 | #ifdef CONFIG_X86_32 | 72 | #ifdef CONFIG_X86_32 |
79 | /* not really needed, already page aligned */ | ||
80 | . = ALIGN(PAGE_SIZE); | 73 | . = ALIGN(PAGE_SIZE); |
81 | *(.text.page_aligned) | 74 | *(.text.page_aligned) |
82 | #endif | 75 | #endif |
@@ -95,87 +88,55 @@ SECTIONS | |||
95 | 88 | ||
96 | NOTES :text :note | 89 | NOTES :text :note |
97 | 90 | ||
98 | /* Exception table */ | 91 | EXCEPTION_TABLE(16) :text = 0x9090 |
99 | . = ALIGN(16); | ||
100 | __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { | ||
101 | __start___ex_table = .; | ||
102 | *(__ex_table) | ||
103 | __stop___ex_table = .; | ||
104 | } :text = 0x9090 | ||
105 | 92 | ||
106 | RODATA | 93 | RO_DATA(PAGE_SIZE) |
107 | 94 | ||
108 | /* Data */ | 95 | /* Data */ |
109 | . = ALIGN(PAGE_SIZE); | ||
110 | .data : AT(ADDR(.data) - LOAD_OFFSET) { | 96 | .data : AT(ADDR(.data) - LOAD_OFFSET) { |
111 | /* Start of data section */ | 97 | /* Start of data section */ |
112 | _sdata = .; | 98 | _sdata = .; |
113 | DATA_DATA | 99 | |
114 | CONSTRUCTORS | 100 | /* init_task */ |
115 | } :data | 101 | INIT_TASK_DATA(THREAD_SIZE) |
116 | 102 | ||
117 | #ifdef CONFIG_X86_32 | 103 | #ifdef CONFIG_X86_32 |
118 | /* 32 bit has nosave before _edata */ | 104 | /* 32 bit has nosave before _edata */ |
119 | . = ALIGN(PAGE_SIZE); | 105 | NOSAVE_DATA |
120 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | ||
121 | __nosave_begin = .; | ||
122 | *(.data.nosave) | ||
123 | . = ALIGN(PAGE_SIZE); | ||
124 | __nosave_end = .; | ||
125 | } | ||
126 | #endif | 106 | #endif |
127 | 107 | ||
128 | . = ALIGN(PAGE_SIZE); | 108 | PAGE_ALIGNED_DATA(PAGE_SIZE) |
129 | .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { | ||
130 | *(.data.page_aligned) | ||
131 | *(.data.idt) | ||
132 | } | ||
133 | 109 | ||
134 | #ifdef CONFIG_X86_32 | 110 | CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) |
135 | . = ALIGN(32); | ||
136 | #else | ||
137 | . = ALIGN(PAGE_SIZE); | ||
138 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | ||
139 | #endif | ||
140 | .data.cacheline_aligned : | ||
141 | AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { | ||
142 | *(.data.cacheline_aligned) | ||
143 | } | ||
144 | 111 | ||
145 | /* rarely changed data like cpu maps */ | 112 | DATA_DATA |
146 | #ifdef CONFIG_X86_32 | 113 | CONSTRUCTORS |
147 | . = ALIGN(32); | 114 | |
148 | #else | 115 | /* rarely changed data like cpu maps */ |
149 | . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); | 116 | READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) |
150 | #endif | ||
151 | .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { | ||
152 | *(.data.read_mostly) | ||
153 | 117 | ||
154 | /* End of data section */ | 118 | /* End of data section */ |
155 | _edata = .; | 119 | _edata = .; |
156 | } | 120 | } :data |
157 | 121 | ||
158 | #ifdef CONFIG_X86_64 | 122 | #ifdef CONFIG_X86_64 |
159 | 123 | ||
160 | #define VSYSCALL_ADDR (-10*1024*1024) | 124 | #define VSYSCALL_ADDR (-10*1024*1024) |
161 | #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ | ||
162 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
163 | #define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ | ||
164 | SIZEOF(.data.read_mostly) + 4095) & ~(4095)) | ||
165 | 125 | ||
166 | #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) | 126 | #define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) |
167 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) | 127 | #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) |
168 | 128 | ||
169 | #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) | 129 | #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) |
170 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) | 130 | #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) |
171 | 131 | ||
132 | . = ALIGN(4096); | ||
133 | __vsyscall_0 = .; | ||
134 | |||
172 | . = VSYSCALL_ADDR; | 135 | . = VSYSCALL_ADDR; |
173 | .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { | 136 | .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { |
174 | *(.vsyscall_0) | 137 | *(.vsyscall_0) |
175 | } :user | 138 | } :user |
176 | 139 | ||
177 | __vsyscall_0 = VSYSCALL_VIRT_ADDR; | ||
178 | |||
179 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); | 140 | . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); |
180 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { | 141 | .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { |
181 | *(.vsyscall_fn) | 142 | *(.vsyscall_fn) |
@@ -215,11 +176,9 @@ SECTIONS | |||
215 | *(.vsyscall_3) | 176 | *(.vsyscall_3) |
216 | } | 177 | } |
217 | 178 | ||
218 | . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; | 179 | . = __vsyscall_0 + PAGE_SIZE; |
219 | 180 | ||
220 | #undef VSYSCALL_ADDR | 181 | #undef VSYSCALL_ADDR |
221 | #undef VSYSCALL_PHYS_ADDR | ||
222 | #undef VSYSCALL_VIRT_ADDR | ||
223 | #undef VLOAD_OFFSET | 182 | #undef VLOAD_OFFSET |
224 | #undef VLOAD | 183 | #undef VLOAD |
225 | #undef VVIRT_OFFSET | 184 | #undef VVIRT_OFFSET |
@@ -227,57 +186,27 @@ SECTIONS | |||
227 | 186 | ||
228 | #endif /* CONFIG_X86_64 */ | 187 | #endif /* CONFIG_X86_64 */ |
229 | 188 | ||
230 | /* init_task */ | ||
231 | . = ALIGN(THREAD_SIZE); | ||
232 | .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { | ||
233 | *(.data.init_task) | ||
234 | } | ||
235 | #ifdef CONFIG_X86_64 | ||
236 | :data.init | ||
237 | #endif | ||
238 | |||
239 | /* | ||
240 | * smp_locks might be freed after init | ||
241 | * start/end must be page aligned | ||
242 | */ | ||
243 | . = ALIGN(PAGE_SIZE); | ||
244 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
245 | __smp_locks = .; | ||
246 | *(.smp_locks) | ||
247 | __smp_locks_end = .; | ||
248 | . = ALIGN(PAGE_SIZE); | ||
249 | } | ||
250 | |||
251 | /* Init code and data - will be freed after init */ | 189 | /* Init code and data - will be freed after init */ |
252 | . = ALIGN(PAGE_SIZE); | 190 | . = ALIGN(PAGE_SIZE); |
253 | .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { | 191 | .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { |
254 | __init_begin = .; /* paired with __init_end */ | 192 | __init_begin = .; /* paired with __init_end */ |
255 | _sinittext = .; | ||
256 | INIT_TEXT | ||
257 | _einittext = .; | ||
258 | } | 193 | } |
259 | 194 | ||
260 | .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { | 195 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) |
261 | INIT_DATA | 196 | /* |
262 | } | 197 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the |
198 | * output PHDR, so the next output section - .init.text - should | ||
199 | * start another segment - init. | ||
200 | */ | ||
201 | PERCPU_VADDR(0, :percpu) | ||
202 | #endif | ||
263 | 203 | ||
264 | . = ALIGN(16); | 204 | INIT_TEXT_SECTION(PAGE_SIZE) |
265 | .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { | 205 | #ifdef CONFIG_X86_64 |
266 | __setup_start = .; | 206 | :init |
267 | *(.init.setup) | 207 | #endif |
268 | __setup_end = .; | ||
269 | } | ||
270 | .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { | ||
271 | __initcall_start = .; | ||
272 | INITCALLS | ||
273 | __initcall_end = .; | ||
274 | } | ||
275 | 208 | ||
276 | .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { | 209 | INIT_DATA_SECTION(16) |
277 | __con_initcall_start = .; | ||
278 | *(.con_initcall.init) | ||
279 | __con_initcall_end = .; | ||
280 | } | ||
281 | 210 | ||
282 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { | 211 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { |
283 | __x86_cpu_dev_start = .; | 212 | __x86_cpu_dev_start = .; |
@@ -285,8 +214,6 @@ SECTIONS | |||
285 | __x86_cpu_dev_end = .; | 214 | __x86_cpu_dev_end = .; |
286 | } | 215 | } |
287 | 216 | ||
288 | SECURITY_INIT | ||
289 | |||
290 | . = ALIGN(8); | 217 | . = ALIGN(8); |
291 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { | 218 | .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { |
292 | __parainstructions = .; | 219 | __parainstructions = .; |
@@ -317,26 +244,7 @@ SECTIONS | |||
317 | EXIT_DATA | 244 | EXIT_DATA |
318 | } | 245 | } |
319 | 246 | ||
320 | #ifdef CONFIG_BLK_DEV_INITRD | 247 | #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) |
321 | . = ALIGN(PAGE_SIZE); | ||
322 | .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { | ||
323 | __initramfs_start = .; | ||
324 | *(.init.ramfs) | ||
325 | __initramfs_end = .; | ||
326 | } | ||
327 | #endif | ||
328 | |||
329 | #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) | ||
330 | /* | ||
331 | * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the | ||
332 | * output PHDR, so the next output section - __data_nosave - should | ||
333 | * start another section data.init2. Also, pda should be at the head of | ||
334 | * percpu area. Preallocate it and define the percpu offset symbol | ||
335 | * so that it can be accessed as a percpu variable. | ||
336 | */ | ||
337 | . = ALIGN(PAGE_SIZE); | ||
338 | PERCPU_VADDR(0, :percpu) | ||
339 | #else | ||
340 | PERCPU(PAGE_SIZE) | 248 | PERCPU(PAGE_SIZE) |
341 | #endif | 249 | #endif |
342 | 250 | ||
@@ -347,15 +255,22 @@ SECTIONS | |||
347 | __init_end = .; | 255 | __init_end = .; |
348 | } | 256 | } |
349 | 257 | ||
258 | /* | ||
259 | * smp_locks might be freed after init | ||
260 | * start/end must be page aligned | ||
261 | */ | ||
262 | . = ALIGN(PAGE_SIZE); | ||
263 | .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { | ||
264 | __smp_locks = .; | ||
265 | *(.smp_locks) | ||
266 | __smp_locks_end = .; | ||
267 | . = ALIGN(PAGE_SIZE); | ||
268 | } | ||
269 | |||
350 | #ifdef CONFIG_X86_64 | 270 | #ifdef CONFIG_X86_64 |
351 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { | 271 | .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { |
352 | . = ALIGN(PAGE_SIZE); | 272 | NOSAVE_DATA |
353 | __nosave_begin = .; | 273 | } |
354 | *(.data.nosave) | ||
355 | . = ALIGN(PAGE_SIZE); | ||
356 | __nosave_end = .; | ||
357 | } :data.init2 | ||
358 | /* use another section data.init2, see PERCPU_VADDR() above */ | ||
359 | #endif | 274 | #endif |
360 | 275 | ||
361 | /* BSS */ | 276 | /* BSS */ |
@@ -380,15 +295,12 @@ SECTIONS | |||
380 | _end = .; | 295 | _end = .; |
381 | } | 296 | } |
382 | 297 | ||
383 | /* Sections to be discarded */ | ||
384 | /DISCARD/ : { | ||
385 | *(.exitcall.exit) | ||
386 | *(.eh_frame) | ||
387 | *(.discard) | ||
388 | } | ||
389 | |||
390 | STABS_DEBUG | 298 | STABS_DEBUG |
391 | DWARF_DEBUG | 299 | DWARF_DEBUG |
300 | |||
301 | /* Sections to be discarded */ | ||
302 | DISCARDS | ||
303 | /DISCARD/ : { *(.eh_frame) } | ||
392 | } | 304 | } |
393 | 305 | ||
394 | 306 | ||
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index cf53a78e2dc..8cb4974ff59 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c | |||
@@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void) | |||
228 | } | 228 | } |
229 | 229 | ||
230 | #ifdef CONFIG_SYSCTL | 230 | #ifdef CONFIG_SYSCTL |
231 | |||
232 | static int | ||
233 | vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, | ||
234 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
235 | { | ||
236 | return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); | ||
237 | } | ||
238 | |||
239 | static ctl_table kernel_table2[] = { | 231 | static ctl_table kernel_table2[] = { |
240 | { .procname = "vsyscall64", | 232 | { .procname = "vsyscall64", |
241 | .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), | 233 | .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), |
242 | .mode = 0644, | 234 | .mode = 0644, |
243 | .proc_handler = vsyscall_sysctl_change }, | 235 | .proc_handler = proc_dointvec }, |
244 | {} | 236 | {} |
245 | }; | 237 | }; |
246 | 238 | ||
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..4449a4a2c2e --- /dev/null +++ b/arch/x86/kernel/x86_init.c | |||
@@ -0,0 +1,75 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> | ||
3 | * | ||
4 | * For licencing details see kernel-base/COPYING | ||
5 | */ | ||
6 | #include <linux/init.h> | ||
7 | |||
8 | #include <asm/bios_ebda.h> | ||
9 | #include <asm/paravirt.h> | ||
10 | #include <asm/mpspec.h> | ||
11 | #include <asm/setup.h> | ||
12 | #include <asm/apic.h> | ||
13 | #include <asm/e820.h> | ||
14 | #include <asm/time.h> | ||
15 | #include <asm/irq.h> | ||
16 | #include <asm/tsc.h> | ||
17 | |||
18 | void __cpuinit x86_init_noop(void) { } | ||
19 | void __init x86_init_uint_noop(unsigned int unused) { } | ||
20 | void __init x86_init_pgd_noop(pgd_t *unused) { } | ||
21 | |||
22 | /* | ||
23 | * The platform setup functions are preset with the default functions | ||
24 | * for standard PC hardware. | ||
25 | */ | ||
26 | struct x86_init_ops x86_init __initdata = { | ||
27 | |||
28 | .resources = { | ||
29 | .probe_roms = x86_init_noop, | ||
30 | .reserve_resources = reserve_standard_io_resources, | ||
31 | .memory_setup = default_machine_specific_memory_setup, | ||
32 | }, | ||
33 | |||
34 | .mpparse = { | ||
35 | .mpc_record = x86_init_uint_noop, | ||
36 | .setup_ioapic_ids = x86_init_noop, | ||
37 | .mpc_apic_id = default_mpc_apic_id, | ||
38 | .smp_read_mpc_oem = default_smp_read_mpc_oem, | ||
39 | .mpc_oem_bus_info = default_mpc_oem_bus_info, | ||
40 | .find_smp_config = default_find_smp_config, | ||
41 | .get_smp_config = default_get_smp_config, | ||
42 | }, | ||
43 | |||
44 | .irqs = { | ||
45 | .pre_vector_init = init_ISA_irqs, | ||
46 | .intr_init = native_init_IRQ, | ||
47 | .trap_init = x86_init_noop, | ||
48 | }, | ||
49 | |||
50 | .oem = { | ||
51 | .arch_setup = x86_init_noop, | ||
52 | .banner = default_banner, | ||
53 | }, | ||
54 | |||
55 | .paging = { | ||
56 | .pagetable_setup_start = native_pagetable_setup_start, | ||
57 | .pagetable_setup_done = native_pagetable_setup_done, | ||
58 | }, | ||
59 | |||
60 | .timers = { | ||
61 | .setup_percpu_clockev = setup_boot_APIC_clock, | ||
62 | .tsc_pre_init = x86_init_noop, | ||
63 | .timer_init = hpet_time_init, | ||
64 | }, | ||
65 | }; | ||
66 | |||
67 | struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { | ||
68 | .setup_percpu_clockev = setup_secondary_APIC_clock, | ||
69 | }; | ||
70 | |||
71 | struct x86_platform_ops x86_platform = { | ||
72 | .calibrate_tsc = native_calibrate_tsc, | ||
73 | .get_wallclock = mach_get_cmos_time, | ||
74 | .set_wallclock = mach_set_rtc_mmss, | ||
75 | }; | ||
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8600a09e0c6..b84e571f417 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -1,12 +1,8 @@ | |||
1 | # | 1 | # |
2 | # KVM configuration | 2 | # KVM configuration |
3 | # | 3 | # |
4 | config HAVE_KVM | ||
5 | bool | ||
6 | 4 | ||
7 | config HAVE_KVM_IRQCHIP | 5 | source "virt/kvm/Kconfig" |
8 | bool | ||
9 | default y | ||
10 | 6 | ||
11 | menuconfig VIRTUALIZATION | 7 | menuconfig VIRTUALIZATION |
12 | bool "Virtualization" | 8 | bool "Virtualization" |
@@ -29,6 +25,9 @@ config KVM | |||
29 | select PREEMPT_NOTIFIERS | 25 | select PREEMPT_NOTIFIERS |
30 | select MMU_NOTIFIER | 26 | select MMU_NOTIFIER |
31 | select ANON_INODES | 27 | select ANON_INODES |
28 | select HAVE_KVM_IRQCHIP | ||
29 | select HAVE_KVM_EVENTFD | ||
30 | select KVM_APIC_ARCHITECTURE | ||
32 | ---help--- | 31 | ---help--- |
33 | Support hosting fully virtualized guest machines using hardware | 32 | Support hosting fully virtualized guest machines using hardware |
34 | virtualization extensions. You will need a fairly recent | 33 | virtualization extensions. You will need a fairly recent |
@@ -63,18 +62,6 @@ config KVM_AMD | |||
63 | To compile this as a module, choose M here: the module | 62 | To compile this as a module, choose M here: the module |
64 | will be called kvm-amd. | 63 | will be called kvm-amd. |
65 | 64 | ||
66 | config KVM_TRACE | ||
67 | bool "KVM trace support" | ||
68 | depends on KVM && SYSFS | ||
69 | select MARKERS | ||
70 | select RELAY | ||
71 | select DEBUG_FS | ||
72 | default n | ||
73 | ---help--- | ||
74 | This option allows reading a trace of kvm-related events through | ||
75 | relayfs. Note the ABI is not considered stable and will be | ||
76 | modified in future updates. | ||
77 | |||
78 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | 65 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under |
79 | # the virtualization menu. | 66 | # the virtualization menu. |
80 | source drivers/lguest/Kconfig | 67 | source drivers/lguest/Kconfig |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b43c4efafe8..0e7fe78d0f7 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -1,22 +1,19 @@ | |||
1 | # | ||
2 | # Makefile for Kernel-based Virtual Machine module | ||
3 | # | ||
4 | |||
5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | ||
6 | coalesced_mmio.o irq_comm.o) | ||
7 | ifeq ($(CONFIG_KVM_TRACE),y) | ||
8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) | ||
9 | endif | ||
10 | ifeq ($(CONFIG_IOMMU_API),y) | ||
11 | common-objs += $(addprefix ../../../virt/kvm/, iommu.o) | ||
12 | endif | ||
13 | 1 | ||
14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 2 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
15 | 3 | ||
16 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ | 4 | CFLAGS_x86.o := -I. |
17 | i8254.o timer.o | 5 | CFLAGS_svm.o := -I. |
18 | obj-$(CONFIG_KVM) += kvm.o | 6 | CFLAGS_vmx.o := -I. |
19 | kvm-intel-objs = vmx.o | 7 | |
20 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | 8 | kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ |
21 | kvm-amd-objs = svm.o | 9 | coalesced_mmio.o irq_comm.o eventfd.o) |
22 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | 10 | kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) |
11 | |||
12 | kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ | ||
13 | i8254.o timer.o | ||
14 | kvm-intel-y += vmx.o | ||
15 | kvm-amd-y += svm.o | ||
16 | |||
17 | obj-$(CONFIG_KVM) += kvm.o | ||
18 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | ||
19 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | ||
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c index 616de4628d6..1be5cd640e9 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/emulate.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /****************************************************************************** | 1 | /****************************************************************************** |
2 | * x86_emulate.c | 2 | * emulate.c |
3 | * | 3 | * |
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | 4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. |
5 | * | 5 | * |
@@ -30,7 +30,9 @@ | |||
30 | #define DPRINTF(x...) do {} while (0) | 30 | #define DPRINTF(x...) do {} while (0) |
31 | #endif | 31 | #endif |
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <asm/kvm_x86_emulate.h> | 33 | #include <asm/kvm_emulate.h> |
34 | |||
35 | #include "mmu.h" /* for is_long_mode() */ | ||
34 | 36 | ||
35 | /* | 37 | /* |
36 | * Opcode effective-address decode tables. | 38 | * Opcode effective-address decode tables. |
@@ -60,6 +62,7 @@ | |||
60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ | 62 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
61 | #define SrcOne (7<<4) /* Implied '1' */ | 63 | #define SrcOne (7<<4) /* Implied '1' */ |
62 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ | 64 | #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ |
65 | #define SrcImmU (9<<4) /* Immediate operand, unsigned */ | ||
63 | #define SrcMask (0xf<<4) | 66 | #define SrcMask (0xf<<4) |
64 | /* Generic ModRM decode. */ | 67 | /* Generic ModRM decode. */ |
65 | #define ModRM (1<<8) | 68 | #define ModRM (1<<8) |
@@ -97,11 +100,11 @@ static u32 opcode_table[256] = { | |||
97 | /* 0x10 - 0x17 */ | 100 | /* 0x10 - 0x17 */ |
98 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 101 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
99 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 102 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
100 | 0, 0, 0, 0, | 103 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, |
101 | /* 0x18 - 0x1F */ | 104 | /* 0x18 - 0x1F */ |
102 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 105 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
103 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 106 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
104 | 0, 0, 0, 0, | 107 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, |
105 | /* 0x20 - 0x27 */ | 108 | /* 0x20 - 0x27 */ |
106 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 109 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
107 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 110 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = { | |||
195 | ByteOp | SrcImmUByte, SrcImmUByte, | 198 | ByteOp | SrcImmUByte, SrcImmUByte, |
196 | /* 0xE8 - 0xEF */ | 199 | /* 0xE8 - 0xEF */ |
197 | SrcImm | Stack, SrcImm | ImplicitOps, | 200 | SrcImm | Stack, SrcImm | ImplicitOps, |
198 | SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, | 201 | SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, |
199 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 202 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
200 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | 203 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
201 | /* 0xF0 - 0xF7 */ | 204 | /* 0xF0 - 0xF7 */ |
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = { | |||
208 | 211 | ||
209 | static u32 twobyte_table[256] = { | 212 | static u32 twobyte_table[256] = { |
210 | /* 0x00 - 0x0F */ | 213 | /* 0x00 - 0x0F */ |
211 | 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, | 214 | 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, |
212 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | 215 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, |
213 | /* 0x10 - 0x1F */ | 216 | /* 0x10 - 0x1F */ |
214 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | 217 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, |
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = { | |||
216 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | 219 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, |
217 | 0, 0, 0, 0, 0, 0, 0, 0, | 220 | 0, 0, 0, 0, 0, 0, 0, 0, |
218 | /* 0x30 - 0x3F */ | 221 | /* 0x30 - 0x3F */ |
219 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 222 | ImplicitOps, 0, ImplicitOps, 0, |
223 | ImplicitOps, ImplicitOps, 0, 0, | ||
224 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
220 | /* 0x40 - 0x47 */ | 225 | /* 0x40 - 0x47 */ |
221 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | 226 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, |
222 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | 227 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, |
@@ -319,8 +324,11 @@ static u32 group2_table[] = { | |||
319 | }; | 324 | }; |
320 | 325 | ||
321 | /* EFLAGS bit definitions. */ | 326 | /* EFLAGS bit definitions. */ |
327 | #define EFLG_VM (1<<17) | ||
328 | #define EFLG_RF (1<<16) | ||
322 | #define EFLG_OF (1<<11) | 329 | #define EFLG_OF (1<<11) |
323 | #define EFLG_DF (1<<10) | 330 | #define EFLG_DF (1<<10) |
331 | #define EFLG_IF (1<<9) | ||
324 | #define EFLG_SF (1<<7) | 332 | #define EFLG_SF (1<<7) |
325 | #define EFLG_ZF (1<<6) | 333 | #define EFLG_ZF (1<<6) |
326 | #define EFLG_AF (1<<4) | 334 | #define EFLG_AF (1<<4) |
@@ -1027,6 +1035,7 @@ done_prefixes: | |||
1027 | c->src.type = OP_MEM; | 1035 | c->src.type = OP_MEM; |
1028 | break; | 1036 | break; |
1029 | case SrcImm: | 1037 | case SrcImm: |
1038 | case SrcImmU: | ||
1030 | c->src.type = OP_IMM; | 1039 | c->src.type = OP_IMM; |
1031 | c->src.ptr = (unsigned long *)c->eip; | 1040 | c->src.ptr = (unsigned long *)c->eip; |
1032 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1041 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
@@ -1044,6 +1053,19 @@ done_prefixes: | |||
1044 | c->src.val = insn_fetch(s32, 4, c->eip); | 1053 | c->src.val = insn_fetch(s32, 4, c->eip); |
1045 | break; | 1054 | break; |
1046 | } | 1055 | } |
1056 | if ((c->d & SrcMask) == SrcImmU) { | ||
1057 | switch (c->src.bytes) { | ||
1058 | case 1: | ||
1059 | c->src.val &= 0xff; | ||
1060 | break; | ||
1061 | case 2: | ||
1062 | c->src.val &= 0xffff; | ||
1063 | break; | ||
1064 | case 4: | ||
1065 | c->src.val &= 0xffffffff; | ||
1066 | break; | ||
1067 | } | ||
1068 | } | ||
1047 | break; | 1069 | break; |
1048 | case SrcImmByte: | 1070 | case SrcImmByte: |
1049 | case SrcImmUByte: | 1071 | case SrcImmUByte: |
@@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) | |||
1375 | ctxt->interruptibility = mask; | 1397 | ctxt->interruptibility = mask; |
1376 | } | 1398 | } |
1377 | 1399 | ||
1400 | static inline void | ||
1401 | setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, | ||
1402 | struct kvm_segment *cs, struct kvm_segment *ss) | ||
1403 | { | ||
1404 | memset(cs, 0, sizeof(struct kvm_segment)); | ||
1405 | kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); | ||
1406 | memset(ss, 0, sizeof(struct kvm_segment)); | ||
1407 | |||
1408 | cs->l = 0; /* will be adjusted later */ | ||
1409 | cs->base = 0; /* flat segment */ | ||
1410 | cs->g = 1; /* 4kb granularity */ | ||
1411 | cs->limit = 0xffffffff; /* 4GB limit */ | ||
1412 | cs->type = 0x0b; /* Read, Execute, Accessed */ | ||
1413 | cs->s = 1; | ||
1414 | cs->dpl = 0; /* will be adjusted later */ | ||
1415 | cs->present = 1; | ||
1416 | cs->db = 1; | ||
1417 | |||
1418 | ss->unusable = 0; | ||
1419 | ss->base = 0; /* flat segment */ | ||
1420 | ss->limit = 0xffffffff; /* 4GB limit */ | ||
1421 | ss->g = 1; /* 4kb granularity */ | ||
1422 | ss->s = 1; | ||
1423 | ss->type = 0x03; /* Read/Write, Accessed */ | ||
1424 | ss->db = 1; /* 32bit stack segment */ | ||
1425 | ss->dpl = 0; | ||
1426 | ss->present = 1; | ||
1427 | } | ||
1428 | |||
1429 | static int | ||
1430 | emulate_syscall(struct x86_emulate_ctxt *ctxt) | ||
1431 | { | ||
1432 | struct decode_cache *c = &ctxt->decode; | ||
1433 | struct kvm_segment cs, ss; | ||
1434 | u64 msr_data; | ||
1435 | |||
1436 | /* syscall is not available in real mode */ | ||
1437 | if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL | ||
1438 | || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) | ||
1439 | return -1; | ||
1440 | |||
1441 | setup_syscalls_segments(ctxt, &cs, &ss); | ||
1442 | |||
1443 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | ||
1444 | msr_data >>= 32; | ||
1445 | cs.selector = (u16)(msr_data & 0xfffc); | ||
1446 | ss.selector = (u16)(msr_data + 8); | ||
1447 | |||
1448 | if (is_long_mode(ctxt->vcpu)) { | ||
1449 | cs.db = 0; | ||
1450 | cs.l = 1; | ||
1451 | } | ||
1452 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | ||
1453 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | ||
1454 | |||
1455 | c->regs[VCPU_REGS_RCX] = c->eip; | ||
1456 | if (is_long_mode(ctxt->vcpu)) { | ||
1457 | #ifdef CONFIG_X86_64 | ||
1458 | c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; | ||
1459 | |||
1460 | kvm_x86_ops->get_msr(ctxt->vcpu, | ||
1461 | ctxt->mode == X86EMUL_MODE_PROT64 ? | ||
1462 | MSR_LSTAR : MSR_CSTAR, &msr_data); | ||
1463 | c->eip = msr_data; | ||
1464 | |||
1465 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); | ||
1466 | ctxt->eflags &= ~(msr_data | EFLG_RF); | ||
1467 | #endif | ||
1468 | } else { | ||
1469 | /* legacy mode */ | ||
1470 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); | ||
1471 | c->eip = (u32)msr_data; | ||
1472 | |||
1473 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | ||
1474 | } | ||
1475 | |||
1476 | return 0; | ||
1477 | } | ||
1478 | |||
1479 | static int | ||
1480 | emulate_sysenter(struct x86_emulate_ctxt *ctxt) | ||
1481 | { | ||
1482 | struct decode_cache *c = &ctxt->decode; | ||
1483 | struct kvm_segment cs, ss; | ||
1484 | u64 msr_data; | ||
1485 | |||
1486 | /* inject #UD if LOCK prefix is used */ | ||
1487 | if (c->lock_prefix) | ||
1488 | return -1; | ||
1489 | |||
1490 | /* inject #GP if in real mode or protection is not enabled (CR0.PE clear) */ | ||
1491 | if (ctxt->mode == X86EMUL_MODE_REAL || | ||
1492 | !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { | ||
1493 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1494 | return -1; | ||
1495 | } | ||
1496 | |||
1497 | /* XXX sysenter/sysexit have not been tested in 64bit mode. | ||
1498 | * Therefore, we inject an #UD. | ||
1499 | */ | ||
1500 | if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
1501 | return -1; | ||
1502 | |||
1503 | setup_syscalls_segments(ctxt, &cs, &ss); | ||
1504 | |||
1505 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | ||
1506 | switch (ctxt->mode) { | ||
1507 | case X86EMUL_MODE_PROT32: | ||
1508 | if ((msr_data & 0xfffc) == 0x0) { | ||
1509 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1510 | return -1; | ||
1511 | } | ||
1512 | break; | ||
1513 | case X86EMUL_MODE_PROT64: | ||
1514 | if (msr_data == 0x0) { | ||
1515 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1516 | return -1; | ||
1517 | } | ||
1518 | break; | ||
1519 | } | ||
1520 | |||
1521 | ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); | ||
1522 | cs.selector = (u16)msr_data; | ||
1523 | cs.selector &= ~SELECTOR_RPL_MASK; | ||
1524 | ss.selector = cs.selector + 8; | ||
1525 | ss.selector &= ~SELECTOR_RPL_MASK; | ||
1526 | if (ctxt->mode == X86EMUL_MODE_PROT64 | ||
1527 | || is_long_mode(ctxt->vcpu)) { | ||
1528 | cs.db = 0; | ||
1529 | cs.l = 1; | ||
1530 | } | ||
1531 | |||
1532 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | ||
1533 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | ||
1534 | |||
1535 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); | ||
1536 | c->eip = msr_data; | ||
1537 | |||
1538 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); | ||
1539 | c->regs[VCPU_REGS_RSP] = msr_data; | ||
1540 | |||
1541 | return 0; | ||
1542 | } | ||
1543 | |||
1544 | static int | ||
1545 | emulate_sysexit(struct x86_emulate_ctxt *ctxt) | ||
1546 | { | ||
1547 | struct decode_cache *c = &ctxt->decode; | ||
1548 | struct kvm_segment cs, ss; | ||
1549 | u64 msr_data; | ||
1550 | int usermode; | ||
1551 | |||
1552 | /* inject #UD if LOCK prefix is used */ | ||
1553 | if (c->lock_prefix) | ||
1554 | return -1; | ||
1555 | |||
1556 | /* inject #GP if in real mode or protection is not enabled (CR0.PE clear) */ | ||
1557 | if (ctxt->mode == X86EMUL_MODE_REAL | ||
1558 | || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { | ||
1559 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1560 | return -1; | ||
1561 | } | ||
1562 | |||
1563 | /* sysexit must be called from CPL 0 */ | ||
1564 | if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { | ||
1565 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1566 | return -1; | ||
1567 | } | ||
1568 | |||
1569 | setup_syscalls_segments(ctxt, &cs, &ss); | ||
1570 | |||
1571 | if ((c->rex_prefix & 0x8) != 0x0) | ||
1572 | usermode = X86EMUL_MODE_PROT64; | ||
1573 | else | ||
1574 | usermode = X86EMUL_MODE_PROT32; | ||
1575 | |||
1576 | cs.dpl = 3; | ||
1577 | ss.dpl = 3; | ||
1578 | kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); | ||
1579 | switch (usermode) { | ||
1580 | case X86EMUL_MODE_PROT32: | ||
1581 | cs.selector = (u16)(msr_data + 16); | ||
1582 | if ((msr_data & 0xfffc) == 0x0) { | ||
1583 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1584 | return -1; | ||
1585 | } | ||
1586 | ss.selector = (u16)(msr_data + 24); | ||
1587 | break; | ||
1588 | case X86EMUL_MODE_PROT64: | ||
1589 | cs.selector = (u16)(msr_data + 32); | ||
1590 | if (msr_data == 0x0) { | ||
1591 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1592 | return -1; | ||
1593 | } | ||
1594 | ss.selector = cs.selector + 8; | ||
1595 | cs.db = 0; | ||
1596 | cs.l = 1; | ||
1597 | break; | ||
1598 | } | ||
1599 | cs.selector |= SELECTOR_RPL_MASK; | ||
1600 | ss.selector |= SELECTOR_RPL_MASK; | ||
1601 | |||
1602 | kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); | ||
1603 | kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); | ||
1604 | |||
1605 | c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; | ||
1606 | c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; | ||
1607 | |||
1608 | return 0; | ||
1609 | } | ||
1610 | |||
1378 | int | 1611 | int |
1379 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | 1612 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) |
1380 | { | 1613 | { |
@@ -1970,6 +2203,12 @@ twobyte_insn: | |||
1970 | goto cannot_emulate; | 2203 | goto cannot_emulate; |
1971 | } | 2204 | } |
1972 | break; | 2205 | break; |
2206 | case 0x05: /* syscall */ | ||
2207 | if (emulate_syscall(ctxt) == -1) | ||
2208 | goto cannot_emulate; | ||
2209 | else | ||
2210 | goto writeback; | ||
2211 | break; | ||
1973 | case 0x06: | 2212 | case 0x06: |
1974 | emulate_clts(ctxt->vcpu); | 2213 | emulate_clts(ctxt->vcpu); |
1975 | c->dst.type = OP_NONE; | 2214 | c->dst.type = OP_NONE; |
@@ -2036,6 +2275,18 @@ twobyte_insn: | |||
2036 | rc = X86EMUL_CONTINUE; | 2275 | rc = X86EMUL_CONTINUE; |
2037 | c->dst.type = OP_NONE; | 2276 | c->dst.type = OP_NONE; |
2038 | break; | 2277 | break; |
2278 | case 0x34: /* sysenter */ | ||
2279 | if (emulate_sysenter(ctxt) == -1) | ||
2280 | goto cannot_emulate; | ||
2281 | else | ||
2282 | goto writeback; | ||
2283 | break; | ||
2284 | case 0x35: /* sysexit */ | ||
2285 | if (emulate_sysexit(ctxt) == -1) | ||
2286 | goto cannot_emulate; | ||
2287 | else | ||
2288 | goto writeback; | ||
2289 | break; | ||
2039 | case 0x40 ... 0x4f: /* cmov */ | 2290 | case 0x40 ... 0x4f: /* cmov */ |
2040 | c->dst.val = c->dst.orig_val = c->src.val; | 2291 | c->dst.val = c->dst.orig_val = c->src.val; |
2041 | if (!test_cc(c->b, ctxt->eflags)) | 2292 | if (!test_cc(c->b, ctxt->eflags)) |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 21f68e00524..82ad523b490 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) | |||
231 | { | 231 | { |
232 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 232 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
233 | 233 | ||
234 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) | 234 | if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) |
235 | return atomic_read(&pit->pit_state.pit_timer.pending); | 235 | return atomic_read(&pit->pit_state.pit_timer.pending); |
236 | return 0; | 236 | return 0; |
237 | } | 237 | } |
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) | |||
252 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 252 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
253 | struct hrtimer *timer; | 253 | struct hrtimer *timer; |
254 | 254 | ||
255 | if (vcpu->vcpu_id != 0 || !pit) | 255 | if (!kvm_vcpu_is_bsp(vcpu) || !pit) |
256 | return; | 256 | return; |
257 | 257 | ||
258 | timer = &pit->pit_state.pit_timer.timer; | 258 | timer = &pit->pit_state.pit_timer.timer; |
@@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) | |||
294 | pt->timer.function = kvm_timer_fn; | 294 | pt->timer.function = kvm_timer_fn; |
295 | pt->t_ops = &kpit_ops; | 295 | pt->t_ops = &kpit_ops; |
296 | pt->kvm = ps->pit->kvm; | 296 | pt->kvm = ps->pit->kvm; |
297 | pt->vcpu_id = 0; | 297 | pt->vcpu = pt->kvm->bsp_vcpu; |
298 | 298 | ||
299 | atomic_set(&pt->pending, 0); | 299 | atomic_set(&pt->pending, 0); |
300 | ps->irq_ack = 1; | 300 | ps->irq_ack = 1; |
@@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
332 | case 1: | 332 | case 1: |
333 | /* FIXME: enhance mode 4 precision */ | 333 | /* FIXME: enhance mode 4 precision */ |
334 | case 4: | 334 | case 4: |
335 | create_pit_timer(ps, val, 0); | 335 | if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { |
336 | create_pit_timer(ps, val, 0); | ||
337 | } | ||
336 | break; | 338 | break; |
337 | case 2: | 339 | case 2: |
338 | case 3: | 340 | case 3: |
339 | create_pit_timer(ps, val, 1); | 341 | if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ |
342 | create_pit_timer(ps, val, 1); | ||
343 | } | ||
340 | break; | 344 | break; |
341 | default: | 345 | default: |
342 | destroy_pit_timer(&ps->pit_timer); | 346 | destroy_pit_timer(&ps->pit_timer); |
343 | } | 347 | } |
344 | } | 348 | } |
345 | 349 | ||
346 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) | 350 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) |
351 | { | ||
352 | u8 saved_mode; | ||
353 | if (hpet_legacy_start) { | ||
354 | /* save existing mode for later reenablement */ | ||
355 | saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; | ||
356 | kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ | ||
357 | pit_load_count(kvm, channel, val); | ||
358 | kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; | ||
359 | } else { | ||
360 | pit_load_count(kvm, channel, val); | ||
361 | } | ||
362 | } | ||
363 | |||
364 | static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) | ||
365 | { | ||
366 | return container_of(dev, struct kvm_pit, dev); | ||
367 | } | ||
368 | |||
369 | static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) | ||
347 | { | 370 | { |
348 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 371 | return container_of(dev, struct kvm_pit, speaker_dev); |
349 | pit_load_count(kvm, channel, val); | ||
350 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
351 | } | 372 | } |
352 | 373 | ||
353 | static void pit_ioport_write(struct kvm_io_device *this, | 374 | static inline int pit_in_range(gpa_t addr) |
354 | gpa_t addr, int len, const void *data) | ||
355 | { | 375 | { |
356 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 376 | return ((addr >= KVM_PIT_BASE_ADDRESS) && |
377 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); | ||
378 | } | ||
379 | |||
380 | static int pit_ioport_write(struct kvm_io_device *this, | ||
381 | gpa_t addr, int len, const void *data) | ||
382 | { | ||
383 | struct kvm_pit *pit = dev_to_pit(this); | ||
357 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 384 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
358 | struct kvm *kvm = pit->kvm; | 385 | struct kvm *kvm = pit->kvm; |
359 | int channel, access; | 386 | int channel, access; |
360 | struct kvm_kpit_channel_state *s; | 387 | struct kvm_kpit_channel_state *s; |
361 | u32 val = *(u32 *) data; | 388 | u32 val = *(u32 *) data; |
389 | if (!pit_in_range(addr)) | ||
390 | return -EOPNOTSUPP; | ||
362 | 391 | ||
363 | val &= 0xff; | 392 | val &= 0xff; |
364 | addr &= KVM_PIT_CHANNEL_MASK; | 393 | addr &= KVM_PIT_CHANNEL_MASK; |
@@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this, | |||
421 | } | 450 | } |
422 | 451 | ||
423 | mutex_unlock(&pit_state->lock); | 452 | mutex_unlock(&pit_state->lock); |
453 | return 0; | ||
424 | } | 454 | } |
425 | 455 | ||
426 | static void pit_ioport_read(struct kvm_io_device *this, | 456 | static int pit_ioport_read(struct kvm_io_device *this, |
427 | gpa_t addr, int len, void *data) | 457 | gpa_t addr, int len, void *data) |
428 | { | 458 | { |
429 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 459 | struct kvm_pit *pit = dev_to_pit(this); |
430 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 460 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
431 | struct kvm *kvm = pit->kvm; | 461 | struct kvm *kvm = pit->kvm; |
432 | int ret, count; | 462 | int ret, count; |
433 | struct kvm_kpit_channel_state *s; | 463 | struct kvm_kpit_channel_state *s; |
464 | if (!pit_in_range(addr)) | ||
465 | return -EOPNOTSUPP; | ||
434 | 466 | ||
435 | addr &= KVM_PIT_CHANNEL_MASK; | 467 | addr &= KVM_PIT_CHANNEL_MASK; |
436 | s = &pit_state->channels[addr]; | 468 | s = &pit_state->channels[addr]; |
@@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this, | |||
485 | memcpy(data, (char *)&ret, len); | 517 | memcpy(data, (char *)&ret, len); |
486 | 518 | ||
487 | mutex_unlock(&pit_state->lock); | 519 | mutex_unlock(&pit_state->lock); |
520 | return 0; | ||
488 | } | 521 | } |
489 | 522 | ||
490 | static int pit_in_range(struct kvm_io_device *this, gpa_t addr, | 523 | static int speaker_ioport_write(struct kvm_io_device *this, |
491 | int len, int is_write) | 524 | gpa_t addr, int len, const void *data) |
492 | { | ||
493 | return ((addr >= KVM_PIT_BASE_ADDRESS) && | ||
494 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); | ||
495 | } | ||
496 | |||
497 | static void speaker_ioport_write(struct kvm_io_device *this, | ||
498 | gpa_t addr, int len, const void *data) | ||
499 | { | 525 | { |
500 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 526 | struct kvm_pit *pit = speaker_to_pit(this); |
501 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 527 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
502 | struct kvm *kvm = pit->kvm; | 528 | struct kvm *kvm = pit->kvm; |
503 | u32 val = *(u32 *) data; | 529 | u32 val = *(u32 *) data; |
530 | if (addr != KVM_SPEAKER_BASE_ADDRESS) | ||
531 | return -EOPNOTSUPP; | ||
504 | 532 | ||
505 | mutex_lock(&pit_state->lock); | 533 | mutex_lock(&pit_state->lock); |
506 | pit_state->speaker_data_on = (val >> 1) & 1; | 534 | pit_state->speaker_data_on = (val >> 1) & 1; |
507 | pit_set_gate(kvm, 2, val & 1); | 535 | pit_set_gate(kvm, 2, val & 1); |
508 | mutex_unlock(&pit_state->lock); | 536 | mutex_unlock(&pit_state->lock); |
537 | return 0; | ||
509 | } | 538 | } |
510 | 539 | ||
511 | static void speaker_ioport_read(struct kvm_io_device *this, | 540 | static int speaker_ioport_read(struct kvm_io_device *this, |
512 | gpa_t addr, int len, void *data) | 541 | gpa_t addr, int len, void *data) |
513 | { | 542 | { |
514 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | 543 | struct kvm_pit *pit = speaker_to_pit(this); |
515 | struct kvm_kpit_state *pit_state = &pit->pit_state; | 544 | struct kvm_kpit_state *pit_state = &pit->pit_state; |
516 | struct kvm *kvm = pit->kvm; | 545 | struct kvm *kvm = pit->kvm; |
517 | unsigned int refresh_clock; | 546 | unsigned int refresh_clock; |
518 | int ret; | 547 | int ret; |
548 | if (addr != KVM_SPEAKER_BASE_ADDRESS) | ||
549 | return -EOPNOTSUPP; | ||
519 | 550 | ||
520 | /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ | 551 | /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ |
521 | refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; | 552 | refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; |
@@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this, | |||
527 | len = sizeof(ret); | 558 | len = sizeof(ret); |
528 | memcpy(data, (char *)&ret, len); | 559 | memcpy(data, (char *)&ret, len); |
529 | mutex_unlock(&pit_state->lock); | 560 | mutex_unlock(&pit_state->lock); |
530 | } | 561 | return 0; |
531 | |||
532 | static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, | ||
533 | int len, int is_write) | ||
534 | { | ||
535 | return (addr == KVM_SPEAKER_BASE_ADDRESS); | ||
536 | } | 562 | } |
537 | 563 | ||
538 | void kvm_pit_reset(struct kvm_pit *pit) | 564 | void kvm_pit_reset(struct kvm_pit *pit) |
@@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit) | |||
541 | struct kvm_kpit_channel_state *c; | 567 | struct kvm_kpit_channel_state *c; |
542 | 568 | ||
543 | mutex_lock(&pit->pit_state.lock); | 569 | mutex_lock(&pit->pit_state.lock); |
570 | pit->pit_state.flags = 0; | ||
544 | for (i = 0; i < 3; i++) { | 571 | for (i = 0; i < 3; i++) { |
545 | c = &pit->pit_state.channels[i]; | 572 | c = &pit->pit_state.channels[i]; |
546 | c->mode = 0xff; | 573 | c->mode = 0xff; |
@@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) | |||
563 | } | 590 | } |
564 | } | 591 | } |
565 | 592 | ||
566 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) | 593 | static const struct kvm_io_device_ops pit_dev_ops = { |
594 | .read = pit_ioport_read, | ||
595 | .write = pit_ioport_write, | ||
596 | }; | ||
597 | |||
598 | static const struct kvm_io_device_ops speaker_dev_ops = { | ||
599 | .read = speaker_ioport_read, | ||
600 | .write = speaker_ioport_write, | ||
601 | }; | ||
602 | |||
603 | /* Caller must have writers lock on slots_lock */ | ||
604 | struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) | ||
567 | { | 605 | { |
568 | struct kvm_pit *pit; | 606 | struct kvm_pit *pit; |
569 | struct kvm_kpit_state *pit_state; | 607 | struct kvm_kpit_state *pit_state; |
608 | int ret; | ||
570 | 609 | ||
571 | pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); | 610 | pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); |
572 | if (!pit) | 611 | if (!pit) |
@@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
582 | mutex_lock(&pit->pit_state.lock); | 621 | mutex_lock(&pit->pit_state.lock); |
583 | spin_lock_init(&pit->pit_state.inject_lock); | 622 | spin_lock_init(&pit->pit_state.inject_lock); |
584 | 623 | ||
585 | /* Initialize PIO device */ | ||
586 | pit->dev.read = pit_ioport_read; | ||
587 | pit->dev.write = pit_ioport_write; | ||
588 | pit->dev.in_range = pit_in_range; | ||
589 | pit->dev.private = pit; | ||
590 | kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); | ||
591 | |||
592 | pit->speaker_dev.read = speaker_ioport_read; | ||
593 | pit->speaker_dev.write = speaker_ioport_write; | ||
594 | pit->speaker_dev.in_range = speaker_in_range; | ||
595 | pit->speaker_dev.private = pit; | ||
596 | kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); | ||
597 | |||
598 | kvm->arch.vpit = pit; | 624 | kvm->arch.vpit = pit; |
599 | pit->kvm = kvm; | 625 | pit->kvm = kvm; |
600 | 626 | ||
@@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
613 | pit->mask_notifier.func = pit_mask_notifer; | 639 | pit->mask_notifier.func = pit_mask_notifer; |
614 | kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); | 640 | kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); |
615 | 641 | ||
642 | kvm_iodevice_init(&pit->dev, &pit_dev_ops); | ||
643 | ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); | ||
644 | if (ret < 0) | ||
645 | goto fail; | ||
646 | |||
647 | if (flags & KVM_PIT_SPEAKER_DUMMY) { | ||
648 | kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); | ||
649 | ret = __kvm_io_bus_register_dev(&kvm->pio_bus, | ||
650 | &pit->speaker_dev); | ||
651 | if (ret < 0) | ||
652 | goto fail_unregister; | ||
653 | } | ||
654 | |||
616 | return pit; | 655 | return pit; |
656 | |||
657 | fail_unregister: | ||
658 | __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); | ||
659 | |||
660 | fail: | ||
661 | if (pit->irq_source_id >= 0) | ||
662 | kvm_free_irq_source_id(kvm, pit->irq_source_id); | ||
663 | |||
664 | kfree(pit); | ||
665 | return NULL; | ||
617 | } | 666 | } |
618 | 667 | ||
619 | void kvm_free_pit(struct kvm *kvm) | 668 | void kvm_free_pit(struct kvm *kvm) |
@@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm) | |||
623 | if (kvm->arch.vpit) { | 672 | if (kvm->arch.vpit) { |
624 | kvm_unregister_irq_mask_notifier(kvm, 0, | 673 | kvm_unregister_irq_mask_notifier(kvm, 0, |
625 | &kvm->arch.vpit->mask_notifier); | 674 | &kvm->arch.vpit->mask_notifier); |
675 | kvm_unregister_irq_ack_notifier(kvm, | ||
676 | &kvm->arch.vpit->pit_state.irq_ack_notifier); | ||
626 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | 677 | mutex_lock(&kvm->arch.vpit->pit_state.lock); |
627 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; | 678 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; |
628 | hrtimer_cancel(timer); | 679 | hrtimer_cancel(timer); |
@@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm) | |||
637 | struct kvm_vcpu *vcpu; | 688 | struct kvm_vcpu *vcpu; |
638 | int i; | 689 | int i; |
639 | 690 | ||
640 | mutex_lock(&kvm->lock); | 691 | mutex_lock(&kvm->irq_lock); |
641 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); | 692 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); |
642 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); | 693 | kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); |
643 | mutex_unlock(&kvm->lock); | 694 | mutex_unlock(&kvm->irq_lock); |
644 | 695 | ||
645 | /* | 696 | /* |
646 | * Provides NMI watchdog support via Virtual Wire mode. | 697 | * Provides NMI watchdog support via Virtual Wire mode. |
@@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm) | |||
652 | * VCPU0, and only if its LVT0 is in EXTINT mode. | 703 | * VCPU0, and only if its LVT0 is in EXTINT mode. |
653 | */ | 704 | */ |
654 | if (kvm->arch.vapics_in_nmi_mode > 0) | 705 | if (kvm->arch.vapics_in_nmi_mode > 0) |
655 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 706 | kvm_for_each_vcpu(i, vcpu, kvm) |
656 | vcpu = kvm->vcpus[i]; | 707 | kvm_apic_nmi_wd_deliver(vcpu); |
657 | if (vcpu) | ||
658 | kvm_apic_nmi_wd_deliver(vcpu); | ||
659 | } | ||
660 | } | 708 | } |
661 | 709 | ||
662 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | 710 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) |
@@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | |||
665 | struct kvm *kvm = vcpu->kvm; | 713 | struct kvm *kvm = vcpu->kvm; |
666 | struct kvm_kpit_state *ps; | 714 | struct kvm_kpit_state *ps; |
667 | 715 | ||
668 | if (vcpu && pit) { | 716 | if (pit) { |
669 | int inject = 0; | 717 | int inject = 0; |
670 | ps = &pit->pit_state; | 718 | ps = &pit->pit_state; |
671 | 719 | ||
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index bbd863ff60b..d4c1c7ffdc0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state { | |||
21 | 21 | ||
22 | struct kvm_kpit_state { | 22 | struct kvm_kpit_state { |
23 | struct kvm_kpit_channel_state channels[3]; | 23 | struct kvm_kpit_channel_state channels[3]; |
24 | u32 flags; | ||
24 | struct kvm_timer pit_timer; | 25 | struct kvm_timer pit_timer; |
25 | bool is_periodic; | 26 | bool is_periodic; |
26 | u32 speaker_data_on; | 27 | u32 speaker_data_on; |
@@ -49,8 +50,8 @@ struct kvm_pit { | |||
49 | #define KVM_PIT_CHANNEL_MASK 0x3 | 50 | #define KVM_PIT_CHANNEL_MASK 0x3 |
50 | 51 | ||
51 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | 52 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); |
52 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); | 53 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); |
53 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); | 54 | struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); |
54 | void kvm_free_pit(struct kvm *kvm); | 55 | void kvm_free_pit(struct kvm *kvm); |
55 | void kvm_pit_reset(struct kvm_pit *pit); | 56 | void kvm_pit_reset(struct kvm_pit *pit); |
56 | 57 | ||
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1ccb50c74f1..01f15168280 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -30,50 +30,24 @@ | |||
30 | #include "irq.h" | 30 | #include "irq.h" |
31 | 31 | ||
32 | #include <linux/kvm_host.h> | 32 | #include <linux/kvm_host.h> |
33 | 33 | #include "trace.h" | |
34 | static void pic_lock(struct kvm_pic *s) | ||
35 | __acquires(&s->lock) | ||
36 | { | ||
37 | spin_lock(&s->lock); | ||
38 | } | ||
39 | |||
40 | static void pic_unlock(struct kvm_pic *s) | ||
41 | __releases(&s->lock) | ||
42 | { | ||
43 | struct kvm *kvm = s->kvm; | ||
44 | unsigned acks = s->pending_acks; | ||
45 | bool wakeup = s->wakeup_needed; | ||
46 | struct kvm_vcpu *vcpu; | ||
47 | |||
48 | s->pending_acks = 0; | ||
49 | s->wakeup_needed = false; | ||
50 | |||
51 | spin_unlock(&s->lock); | ||
52 | |||
53 | while (acks) { | ||
54 | kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), | ||
55 | __ffs(acks)); | ||
56 | acks &= acks - 1; | ||
57 | } | ||
58 | |||
59 | if (wakeup) { | ||
60 | vcpu = s->kvm->vcpus[0]; | ||
61 | if (vcpu) | ||
62 | kvm_vcpu_kick(vcpu); | ||
63 | } | ||
64 | } | ||
65 | 34 | ||
66 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | 35 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) |
67 | { | 36 | { |
68 | s->isr &= ~(1 << irq); | 37 | s->isr &= ~(1 << irq); |
69 | s->isr_ack |= (1 << irq); | 38 | s->isr_ack |= (1 << irq); |
39 | if (s != &s->pics_state->pics[0]) | ||
40 | irq += 8; | ||
41 | kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); | ||
70 | } | 42 | } |
71 | 43 | ||
72 | void kvm_pic_clear_isr_ack(struct kvm *kvm) | 44 | void kvm_pic_clear_isr_ack(struct kvm *kvm) |
73 | { | 45 | { |
74 | struct kvm_pic *s = pic_irqchip(kvm); | 46 | struct kvm_pic *s = pic_irqchip(kvm); |
47 | spin_lock(&s->lock); | ||
75 | s->pics[0].isr_ack = 0xff; | 48 | s->pics[0].isr_ack = 0xff; |
76 | s->pics[1].isr_ack = 0xff; | 49 | s->pics[1].isr_ack = 0xff; |
50 | spin_unlock(&s->lock); | ||
77 | } | 51 | } |
78 | 52 | ||
79 | /* | 53 | /* |
@@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s) | |||
174 | 148 | ||
175 | void kvm_pic_update_irq(struct kvm_pic *s) | 149 | void kvm_pic_update_irq(struct kvm_pic *s) |
176 | { | 150 | { |
177 | pic_lock(s); | 151 | spin_lock(&s->lock); |
178 | pic_update_irq(s); | 152 | pic_update_irq(s); |
179 | pic_unlock(s); | 153 | spin_unlock(&s->lock); |
180 | } | 154 | } |
181 | 155 | ||
182 | int kvm_pic_set_irq(void *opaque, int irq, int level) | 156 | int kvm_pic_set_irq(void *opaque, int irq, int level) |
@@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) | |||
184 | struct kvm_pic *s = opaque; | 158 | struct kvm_pic *s = opaque; |
185 | int ret = -1; | 159 | int ret = -1; |
186 | 160 | ||
187 | pic_lock(s); | 161 | spin_lock(&s->lock); |
188 | if (irq >= 0 && irq < PIC_NUM_PINS) { | 162 | if (irq >= 0 && irq < PIC_NUM_PINS) { |
189 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | 163 | ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); |
190 | pic_update_irq(s); | 164 | pic_update_irq(s); |
165 | trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, | ||
166 | s->pics[irq >> 3].imr, ret == 0); | ||
191 | } | 167 | } |
192 | pic_unlock(s); | 168 | spin_unlock(&s->lock); |
193 | 169 | ||
194 | return ret; | 170 | return ret; |
195 | } | 171 | } |
@@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
217 | int irq, irq2, intno; | 193 | int irq, irq2, intno; |
218 | struct kvm_pic *s = pic_irqchip(kvm); | 194 | struct kvm_pic *s = pic_irqchip(kvm); |
219 | 195 | ||
220 | pic_lock(s); | 196 | spin_lock(&s->lock); |
221 | irq = pic_get_irq(&s->pics[0]); | 197 | irq = pic_get_irq(&s->pics[0]); |
222 | if (irq >= 0) { | 198 | if (irq >= 0) { |
223 | pic_intack(&s->pics[0], irq); | 199 | pic_intack(&s->pics[0], irq); |
@@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm) | |||
242 | intno = s->pics[0].irq_base + irq; | 218 | intno = s->pics[0].irq_base + irq; |
243 | } | 219 | } |
244 | pic_update_irq(s); | 220 | pic_update_irq(s); |
245 | pic_unlock(s); | 221 | spin_unlock(&s->lock); |
246 | kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); | ||
247 | 222 | ||
248 | return intno; | 223 | return intno; |
249 | } | 224 | } |
@@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
252 | { | 227 | { |
253 | int irq, irqbase, n; | 228 | int irq, irqbase, n; |
254 | struct kvm *kvm = s->pics_state->irq_request_opaque; | 229 | struct kvm *kvm = s->pics_state->irq_request_opaque; |
255 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; | 230 | struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; |
256 | 231 | ||
257 | if (s == &s->pics_state->pics[0]) | 232 | if (s == &s->pics_state->pics[0]) |
258 | irqbase = 0; | 233 | irqbase = 0; |
@@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) | |||
263 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | 238 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) |
264 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) { | 239 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) { |
265 | n = irq + irqbase; | 240 | n = irq + irqbase; |
266 | s->pics_state->pending_acks |= 1 << n; | 241 | kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); |
267 | } | 242 | } |
268 | } | 243 | } |
269 | s->last_irr = 0; | 244 | s->last_irr = 0; |
@@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) | |||
428 | return s->elcr; | 403 | return s->elcr; |
429 | } | 404 | } |
430 | 405 | ||
431 | static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, | 406 | static int picdev_in_range(gpa_t addr) |
432 | int len, int is_write) | ||
433 | { | 407 | { |
434 | switch (addr) { | 408 | switch (addr) { |
435 | case 0x20: | 409 | case 0x20: |
@@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, | |||
444 | } | 418 | } |
445 | } | 419 | } |
446 | 420 | ||
447 | static void picdev_write(struct kvm_io_device *this, | 421 | static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) |
422 | { | ||
423 | return container_of(dev, struct kvm_pic, dev); | ||
424 | } | ||
425 | |||
426 | static int picdev_write(struct kvm_io_device *this, | ||
448 | gpa_t addr, int len, const void *val) | 427 | gpa_t addr, int len, const void *val) |
449 | { | 428 | { |
450 | struct kvm_pic *s = this->private; | 429 | struct kvm_pic *s = to_pic(this); |
451 | unsigned char data = *(unsigned char *)val; | 430 | unsigned char data = *(unsigned char *)val; |
431 | if (!picdev_in_range(addr)) | ||
432 | return -EOPNOTSUPP; | ||
452 | 433 | ||
453 | if (len != 1) { | 434 | if (len != 1) { |
454 | if (printk_ratelimit()) | 435 | if (printk_ratelimit()) |
455 | printk(KERN_ERR "PIC: non byte write\n"); | 436 | printk(KERN_ERR "PIC: non byte write\n"); |
456 | return; | 437 | return 0; |
457 | } | 438 | } |
458 | pic_lock(s); | 439 | spin_lock(&s->lock); |
459 | switch (addr) { | 440 | switch (addr) { |
460 | case 0x20: | 441 | case 0x20: |
461 | case 0x21: | 442 | case 0x21: |
@@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this, | |||
468 | elcr_ioport_write(&s->pics[addr & 1], addr, data); | 449 | elcr_ioport_write(&s->pics[addr & 1], addr, data); |
469 | break; | 450 | break; |
470 | } | 451 | } |
471 | pic_unlock(s); | 452 | spin_unlock(&s->lock); |
453 | return 0; | ||
472 | } | 454 | } |
473 | 455 | ||
474 | static void picdev_read(struct kvm_io_device *this, | 456 | static int picdev_read(struct kvm_io_device *this, |
475 | gpa_t addr, int len, void *val) | 457 | gpa_t addr, int len, void *val) |
476 | { | 458 | { |
477 | struct kvm_pic *s = this->private; | 459 | struct kvm_pic *s = to_pic(this); |
478 | unsigned char data = 0; | 460 | unsigned char data = 0; |
461 | if (!picdev_in_range(addr)) | ||
462 | return -EOPNOTSUPP; | ||
479 | 463 | ||
480 | if (len != 1) { | 464 | if (len != 1) { |
481 | if (printk_ratelimit()) | 465 | if (printk_ratelimit()) |
482 | printk(KERN_ERR "PIC: non byte read\n"); | 466 | printk(KERN_ERR "PIC: non byte read\n"); |
483 | return; | 467 | return 0; |
484 | } | 468 | } |
485 | pic_lock(s); | 469 | spin_lock(&s->lock); |
486 | switch (addr) { | 470 | switch (addr) { |
487 | case 0x20: | 471 | case 0x20: |
488 | case 0x21: | 472 | case 0x21: |
@@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this, | |||
496 | break; | 480 | break; |
497 | } | 481 | } |
498 | *(unsigned char *)val = data; | 482 | *(unsigned char *)val = data; |
499 | pic_unlock(s); | 483 | spin_unlock(&s->lock); |
484 | return 0; | ||
500 | } | 485 | } |
501 | 486 | ||
502 | /* | 487 | /* |
@@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this, | |||
505 | static void pic_irq_request(void *opaque, int level) | 490 | static void pic_irq_request(void *opaque, int level) |
506 | { | 491 | { |
507 | struct kvm *kvm = opaque; | 492 | struct kvm *kvm = opaque; |
508 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | 493 | struct kvm_vcpu *vcpu = kvm->bsp_vcpu; |
509 | struct kvm_pic *s = pic_irqchip(kvm); | 494 | struct kvm_pic *s = pic_irqchip(kvm); |
510 | int irq = pic_get_irq(&s->pics[0]); | 495 | int irq = pic_get_irq(&s->pics[0]); |
511 | 496 | ||
512 | s->output = level; | 497 | s->output = level; |
513 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { | 498 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { |
514 | s->pics[0].isr_ack &= ~(1 << irq); | 499 | s->pics[0].isr_ack &= ~(1 << irq); |
515 | s->wakeup_needed = true; | 500 | kvm_vcpu_kick(vcpu); |
516 | } | 501 | } |
517 | } | 502 | } |
518 | 503 | ||
504 | static const struct kvm_io_device_ops picdev_ops = { | ||
505 | .read = picdev_read, | ||
506 | .write = picdev_write, | ||
507 | }; | ||
508 | |||
519 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | 509 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) |
520 | { | 510 | { |
521 | struct kvm_pic *s; | 511 | struct kvm_pic *s; |
512 | int ret; | ||
513 | |||
522 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | 514 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); |
523 | if (!s) | 515 | if (!s) |
524 | return NULL; | 516 | return NULL; |
@@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) | |||
534 | /* | 526 | /* |
535 | * Initialize PIO device | 527 | * Initialize PIO device |
536 | */ | 528 | */ |
537 | s->dev.read = picdev_read; | 529 | kvm_iodevice_init(&s->dev, &picdev_ops); |
538 | s->dev.write = picdev_write; | 530 | ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); |
539 | s->dev.in_range = picdev_in_range; | 531 | if (ret < 0) { |
540 | s->dev.private = s; | 532 | kfree(s); |
541 | kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); | 533 | return NULL; |
534 | } | ||
535 | |||
542 | return s; | 536 | return s; |
543 | } | 537 | } |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f593188129..7d6058a2fd3 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -63,7 +63,6 @@ struct kvm_kpic_state { | |||
63 | 63 | ||
64 | struct kvm_pic { | 64 | struct kvm_pic { |
65 | spinlock_t lock; | 65 | spinlock_t lock; |
66 | bool wakeup_needed; | ||
67 | unsigned pending_acks; | 66 | unsigned pending_acks; |
68 | struct kvm *kvm; | 67 | struct kvm *kvm; |
69 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | 68 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1ff819dce7d..7bcc5b6a440 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) | |||
29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); | 29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); |
30 | } | 30 | } |
31 | 31 | ||
32 | static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) | ||
33 | { | ||
34 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
35 | (unsigned long *)&vcpu->arch.regs_avail)) | ||
36 | kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); | ||
37 | |||
38 | return vcpu->arch.pdptrs[index]; | ||
39 | } | ||
40 | |||
32 | #endif | 41 | #endif |
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h deleted file mode 100644 index ed66e4c078d..00000000000 --- a/arch/x86/kvm/kvm_svm.h +++ /dev/null | |||
@@ -1,51 +0,0 @@ | |||
1 | #ifndef __KVM_SVM_H | ||
2 | #define __KVM_SVM_H | ||
3 | |||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/list.h> | ||
7 | #include <linux/kvm_host.h> | ||
8 | #include <asm/msr.h> | ||
9 | |||
10 | #include <asm/svm.h> | ||
11 | |||
12 | static const u32 host_save_user_msrs[] = { | ||
13 | #ifdef CONFIG_X86_64 | ||
14 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
15 | MSR_FS_BASE, | ||
16 | #endif | ||
17 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
18 | }; | ||
19 | |||
20 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
21 | |||
22 | struct kvm_vcpu; | ||
23 | |||
24 | struct vcpu_svm { | ||
25 | struct kvm_vcpu vcpu; | ||
26 | struct vmcb *vmcb; | ||
27 | unsigned long vmcb_pa; | ||
28 | struct svm_cpu_data *svm_data; | ||
29 | uint64_t asid_generation; | ||
30 | |||
31 | u64 next_rip; | ||
32 | |||
33 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | ||
34 | u64 host_gs_base; | ||
35 | unsigned long host_cr2; | ||
36 | |||
37 | u32 *msrpm; | ||
38 | struct vmcb *hsave; | ||
39 | u64 hsave_msr; | ||
40 | |||
41 | u64 nested_vmcb; | ||
42 | |||
43 | /* These are the merged vectors */ | ||
44 | u32 *nested_msrpm; | ||
45 | |||
46 | /* gpa pointers to the real vectors */ | ||
47 | u64 nested_vmcb_msrpm; | ||
48 | }; | ||
49 | |||
50 | #endif | ||
51 | |||
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 26bd6ba74e1..55c7524dda5 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h | |||
@@ -6,7 +6,7 @@ struct kvm_timer { | |||
6 | bool reinject; | 6 | bool reinject; |
7 | struct kvm_timer_ops *t_ops; | 7 | struct kvm_timer_ops *t_ops; |
8 | struct kvm *kvm; | 8 | struct kvm *kvm; |
9 | int vcpu_id; | 9 | struct kvm_vcpu *vcpu; |
10 | }; | 10 | }; |
11 | 11 | ||
12 | struct kvm_timer_ops { | 12 | struct kvm_timer_ops { |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ae99d83f81a..1ae5ceba7eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -32,8 +32,11 @@ | |||
32 | #include <asm/current.h> | 32 | #include <asm/current.h> |
33 | #include <asm/apicdef.h> | 33 | #include <asm/apicdef.h> |
34 | #include <asm/atomic.h> | 34 | #include <asm/atomic.h> |
35 | #include <asm/apicdef.h> | ||
35 | #include "kvm_cache_regs.h" | 36 | #include "kvm_cache_regs.h" |
36 | #include "irq.h" | 37 | #include "irq.h" |
38 | #include "trace.h" | ||
39 | #include "x86.h" | ||
37 | 40 | ||
38 | #ifndef CONFIG_X86_64 | 41 | #ifndef CONFIG_X86_64 |
39 | #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) | 42 | #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) |
@@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) | |||
141 | return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; | 144 | return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; |
142 | } | 145 | } |
143 | 146 | ||
147 | void kvm_apic_set_version(struct kvm_vcpu *vcpu) | ||
148 | { | ||
149 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
150 | struct kvm_cpuid_entry2 *feat; | ||
151 | u32 v = APIC_VERSION; | ||
152 | |||
153 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
154 | return; | ||
155 | |||
156 | feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); | ||
157 | if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) | ||
158 | v |= APIC_LVR_DIRECTED_EOI; | ||
159 | apic_set_reg(apic, APIC_LVR, v); | ||
160 | } | ||
161 | |||
162 | static inline int apic_x2apic_mode(struct kvm_lapic *apic) | ||
163 | { | ||
164 | return apic->vcpu->arch.apic_base & X2APIC_ENABLE; | ||
165 | } | ||
166 | |||
144 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | 167 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { |
145 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ | 168 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ |
146 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ | 169 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ |
@@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap) | |||
165 | 188 | ||
166 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | 189 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) |
167 | { | 190 | { |
191 | apic->irr_pending = true; | ||
168 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); | 192 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); |
169 | } | 193 | } |
170 | 194 | ||
171 | static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | 195 | static inline int apic_search_irr(struct kvm_lapic *apic) |
172 | { | 196 | { |
173 | apic_clear_vector(vec, apic->regs + APIC_IRR); | 197 | return find_highest_vector(apic->regs + APIC_IRR); |
174 | } | 198 | } |
175 | 199 | ||
176 | static inline int apic_find_highest_irr(struct kvm_lapic *apic) | 200 | static inline int apic_find_highest_irr(struct kvm_lapic *apic) |
177 | { | 201 | { |
178 | int result; | 202 | int result; |
179 | 203 | ||
180 | result = find_highest_vector(apic->regs + APIC_IRR); | 204 | if (!apic->irr_pending) |
205 | return -1; | ||
206 | |||
207 | result = apic_search_irr(apic); | ||
181 | ASSERT(result == -1 || result >= 16); | 208 | ASSERT(result == -1 || result >= 16); |
182 | 209 | ||
183 | return result; | 210 | return result; |
184 | } | 211 | } |
185 | 212 | ||
213 | static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | ||
214 | { | ||
215 | apic->irr_pending = false; | ||
216 | apic_clear_vector(vec, apic->regs + APIC_IRR); | ||
217 | if (apic_search_irr(apic) != -1) | ||
218 | apic->irr_pending = true; | ||
219 | } | ||
220 | |||
186 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | 221 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) |
187 | { | 222 | { |
188 | struct kvm_lapic *apic = vcpu->arch.apic; | 223 | struct kvm_lapic *apic = vcpu->arch.apic; |
189 | int highest_irr; | 224 | int highest_irr; |
190 | 225 | ||
226 | /* This may race with setting of irr in __apic_accept_irq() and | ||
227 | * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq | ||
228 | * will cause vmexit immediately and the value will be recalculated | ||
229 | * on the next vmentry. | ||
230 | */ | ||
191 | if (!apic) | 231 | if (!apic) |
192 | return 0; | 232 | return 0; |
193 | highest_irr = apic_find_highest_irr(apic); | 233 | highest_irr = apic_find_highest_irr(apic); |
194 | 234 | ||
195 | return highest_irr; | 235 | return highest_irr; |
196 | } | 236 | } |
197 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); | ||
198 | 237 | ||
199 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | 238 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, |
200 | int vector, int level, int trig_mode); | 239 | int vector, int level, int trig_mode); |
@@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) | |||
251 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | 290 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) |
252 | { | 291 | { |
253 | int result = 0; | 292 | int result = 0; |
254 | u8 logical_id; | 293 | u32 logical_id; |
294 | |||
295 | if (apic_x2apic_mode(apic)) { | ||
296 | logical_id = apic_get_reg(apic, APIC_LDR); | ||
297 | return logical_id & mda; | ||
298 | } | ||
255 | 299 | ||
256 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); | 300 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); |
257 | 301 | ||
@@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
331 | break; | 375 | break; |
332 | 376 | ||
333 | result = !apic_test_and_set_irr(vector, apic); | 377 | result = !apic_test_and_set_irr(vector, apic); |
378 | trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, | ||
379 | trig_mode, vector, !result); | ||
334 | if (!result) { | 380 | if (!result) { |
335 | if (trig_mode) | 381 | if (trig_mode) |
336 | apic_debug("level trig mode repeatedly for " | 382 | apic_debug("level trig mode repeatedly for " |
@@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
425 | trigger_mode = IOAPIC_LEVEL_TRIG; | 471 | trigger_mode = IOAPIC_LEVEL_TRIG; |
426 | else | 472 | else |
427 | trigger_mode = IOAPIC_EDGE_TRIG; | 473 | trigger_mode = IOAPIC_EDGE_TRIG; |
428 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | 474 | if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { |
475 | mutex_lock(&apic->vcpu->kvm->irq_lock); | ||
476 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
477 | mutex_unlock(&apic->vcpu->kvm->irq_lock); | ||
478 | } | ||
429 | } | 479 | } |
430 | 480 | ||
431 | static void apic_send_ipi(struct kvm_lapic *apic) | 481 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic) | |||
440 | irq.level = icr_low & APIC_INT_ASSERT; | 490 | irq.level = icr_low & APIC_INT_ASSERT; |
441 | irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; | 491 | irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; |
442 | irq.shorthand = icr_low & APIC_SHORT_MASK; | 492 | irq.shorthand = icr_low & APIC_SHORT_MASK; |
443 | irq.dest_id = GET_APIC_DEST_FIELD(icr_high); | 493 | if (apic_x2apic_mode(apic)) |
494 | irq.dest_id = icr_high; | ||
495 | else | ||
496 | irq.dest_id = GET_APIC_DEST_FIELD(icr_high); | ||
497 | |||
498 | trace_kvm_apic_ipi(icr_low, irq.dest_id); | ||
444 | 499 | ||
445 | apic_debug("icr_high 0x%x, icr_low 0x%x, " | 500 | apic_debug("icr_high 0x%x, icr_low 0x%x, " |
446 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " | 501 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " |
@@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic) | |||
449 | irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, | 504 | irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, |
450 | irq.vector); | 505 | irq.vector); |
451 | 506 | ||
507 | mutex_lock(&apic->vcpu->kvm->irq_lock); | ||
452 | kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); | 508 | kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); |
509 | mutex_unlock(&apic->vcpu->kvm->irq_lock); | ||
453 | } | 510 | } |
454 | 511 | ||
455 | static u32 apic_get_tmcct(struct kvm_lapic *apic) | 512 | static u32 apic_get_tmcct(struct kvm_lapic *apic) |
@@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | |||
495 | { | 552 | { |
496 | u32 val = 0; | 553 | u32 val = 0; |
497 | 554 | ||
498 | KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); | ||
499 | |||
500 | if (offset >= LAPIC_MMIO_LENGTH) | 555 | if (offset >= LAPIC_MMIO_LENGTH) |
501 | return 0; | 556 | return 0; |
502 | 557 | ||
503 | switch (offset) { | 558 | switch (offset) { |
559 | case APIC_ID: | ||
560 | if (apic_x2apic_mode(apic)) | ||
561 | val = kvm_apic_id(apic); | ||
562 | else | ||
563 | val = kvm_apic_id(apic) << 24; | ||
564 | break; | ||
504 | case APIC_ARBPRI: | 565 | case APIC_ARBPRI: |
505 | printk(KERN_WARNING "Access APIC ARBPRI register " | 566 | printk(KERN_WARNING "Access APIC ARBPRI register " |
506 | "which is for P6\n"); | 567 | "which is for P6\n"); |
@@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | |||
522 | return val; | 583 | return val; |
523 | } | 584 | } |
524 | 585 | ||
525 | static void apic_mmio_read(struct kvm_io_device *this, | 586 | static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) |
526 | gpa_t address, int len, void *data) | 587 | { |
588 | return container_of(dev, struct kvm_lapic, dev); | ||
589 | } | ||
590 | |||
591 | static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, | ||
592 | void *data) | ||
527 | { | 593 | { |
528 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
529 | unsigned int offset = address - apic->base_address; | ||
530 | unsigned char alignment = offset & 0xf; | 594 | unsigned char alignment = offset & 0xf; |
531 | u32 result; | 595 | u32 result; |
596 | /* this bitmask has a bit cleared for each reserved register */ | ||
597 | static const u64 rmask = 0x43ff01ffffffe70cULL; | ||
532 | 598 | ||
533 | if ((alignment + len) > 4) { | 599 | if ((alignment + len) > 4) { |
534 | printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", | 600 | apic_debug("KVM_APIC_READ: alignment error %x %d\n", |
535 | (unsigned long)address, len); | 601 | offset, len); |
536 | return; | 602 | return 1; |
537 | } | 603 | } |
604 | |||
605 | if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { | ||
606 | apic_debug("KVM_APIC_READ: read reserved register %x\n", | ||
607 | offset); | ||
608 | return 1; | ||
609 | } | ||
610 | |||
538 | result = __apic_read(apic, offset & ~0xf); | 611 | result = __apic_read(apic, offset & ~0xf); |
539 | 612 | ||
613 | trace_kvm_apic_read(offset, result); | ||
614 | |||
540 | switch (len) { | 615 | switch (len) { |
541 | case 1: | 616 | case 1: |
542 | case 2: | 617 | case 2: |
@@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this, | |||
548 | "should be 1,2, or 4 instead\n", len); | 623 | "should be 1,2, or 4 instead\n", len); |
549 | break; | 624 | break; |
550 | } | 625 | } |
626 | return 0; | ||
627 | } | ||
628 | |||
629 | static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) | ||
630 | { | ||
631 | return apic_hw_enabled(apic) && | ||
632 | addr >= apic->base_address && | ||
633 | addr < apic->base_address + LAPIC_MMIO_LENGTH; | ||
634 | } | ||
635 | |||
636 | static int apic_mmio_read(struct kvm_io_device *this, | ||
637 | gpa_t address, int len, void *data) | ||
638 | { | ||
639 | struct kvm_lapic *apic = to_lapic(this); | ||
640 | u32 offset = address - apic->base_address; | ||
641 | |||
642 | if (!apic_mmio_in_range(apic, address)) | ||
643 | return -EOPNOTSUPP; | ||
644 | |||
645 | apic_reg_read(apic, offset, len, data); | ||
646 | |||
647 | return 0; | ||
551 | } | 648 | } |
552 | 649 | ||
553 | static void update_divide_count(struct kvm_lapic *apic) | 650 | static void update_divide_count(struct kvm_lapic *apic) |
@@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
573 | 670 | ||
574 | if (!apic->lapic_timer.period) | 671 | if (!apic->lapic_timer.period) |
575 | return; | 672 | return; |
673 | /* | ||
674 | * Do not allow the guest to program periodic timers with small | ||
675 | * interval, since the hrtimers are not throttled by the host | ||
676 | * scheduler. | ||
677 | */ | ||
678 | if (apic_lvtt_period(apic)) { | ||
679 | if (apic->lapic_timer.period < NSEC_PER_MSEC/2) | ||
680 | apic->lapic_timer.period = NSEC_PER_MSEC/2; | ||
681 | } | ||
576 | 682 | ||
577 | hrtimer_start(&apic->lapic_timer.timer, | 683 | hrtimer_start(&apic->lapic_timer.timer, |
578 | ktime_add_ns(now, apic->lapic_timer.period), | 684 | ktime_add_ns(now, apic->lapic_timer.period), |
@@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) | |||
603 | apic->vcpu->kvm->arch.vapics_in_nmi_mode--; | 709 | apic->vcpu->kvm->arch.vapics_in_nmi_mode--; |
604 | } | 710 | } |
605 | 711 | ||
606 | static void apic_mmio_write(struct kvm_io_device *this, | 712 | static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) |
607 | gpa_t address, int len, const void *data) | ||
608 | { | 713 | { |
609 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | 714 | int ret = 0; |
610 | unsigned int offset = address - apic->base_address; | ||
611 | unsigned char alignment = offset & 0xf; | ||
612 | u32 val; | ||
613 | |||
614 | /* | ||
615 | * APIC register must be aligned on 128-bits boundary. | ||
616 | * 32/64/128 bits registers must be accessed thru 32 bits. | ||
617 | * Refer SDM 8.4.1 | ||
618 | */ | ||
619 | if (len != 4 || alignment) { | ||
620 | /* Don't shout loud, $infamous_os would cause only noise. */ | ||
621 | apic_debug("apic write: bad size=%d %lx\n", | ||
622 | len, (long)address); | ||
623 | return; | ||
624 | } | ||
625 | |||
626 | val = *(u32 *) data; | ||
627 | |||
628 | /* too common printing */ | ||
629 | if (offset != APIC_EOI) | ||
630 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | ||
631 | "0x%x\n", __func__, offset, len, val); | ||
632 | |||
633 | offset &= 0xff0; | ||
634 | 715 | ||
635 | KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); | 716 | trace_kvm_apic_write(reg, val); |
636 | 717 | ||
637 | switch (offset) { | 718 | switch (reg) { |
638 | case APIC_ID: /* Local APIC ID */ | 719 | case APIC_ID: /* Local APIC ID */ |
639 | apic_set_reg(apic, APIC_ID, val); | 720 | if (!apic_x2apic_mode(apic)) |
721 | apic_set_reg(apic, APIC_ID, val); | ||
722 | else | ||
723 | ret = 1; | ||
640 | break; | 724 | break; |
641 | 725 | ||
642 | case APIC_TASKPRI: | 726 | case APIC_TASKPRI: |
@@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
649 | break; | 733 | break; |
650 | 734 | ||
651 | case APIC_LDR: | 735 | case APIC_LDR: |
652 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | 736 | if (!apic_x2apic_mode(apic)) |
737 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | ||
738 | else | ||
739 | ret = 1; | ||
653 | break; | 740 | break; |
654 | 741 | ||
655 | case APIC_DFR: | 742 | case APIC_DFR: |
656 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | 743 | if (!apic_x2apic_mode(apic)) |
744 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | ||
745 | else | ||
746 | ret = 1; | ||
657 | break; | 747 | break; |
658 | 748 | ||
659 | case APIC_SPIV: | 749 | case APIC_SPIV: { |
660 | apic_set_reg(apic, APIC_SPIV, val & 0x3ff); | 750 | u32 mask = 0x3ff; |
751 | if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) | ||
752 | mask |= APIC_SPIV_DIRECTED_EOI; | ||
753 | apic_set_reg(apic, APIC_SPIV, val & mask); | ||
661 | if (!(val & APIC_SPIV_APIC_ENABLED)) { | 754 | if (!(val & APIC_SPIV_APIC_ENABLED)) { |
662 | int i; | 755 | int i; |
663 | u32 lvt_val; | 756 | u32 lvt_val; |
@@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
672 | 765 | ||
673 | } | 766 | } |
674 | break; | 767 | break; |
675 | 768 | } | |
676 | case APIC_ICR: | 769 | case APIC_ICR: |
677 | /* No delay here, so we always clear the pending bit */ | 770 | /* No delay here, so we always clear the pending bit */ |
678 | apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); | 771 | apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); |
@@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
680 | break; | 773 | break; |
681 | 774 | ||
682 | case APIC_ICR2: | 775 | case APIC_ICR2: |
683 | apic_set_reg(apic, APIC_ICR2, val & 0xff000000); | 776 | if (!apic_x2apic_mode(apic)) |
777 | val &= 0xff000000; | ||
778 | apic_set_reg(apic, APIC_ICR2, val); | ||
684 | break; | 779 | break; |
685 | 780 | ||
686 | case APIC_LVT0: | 781 | case APIC_LVT0: |
@@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
694 | if (!apic_sw_enabled(apic)) | 789 | if (!apic_sw_enabled(apic)) |
695 | val |= APIC_LVT_MASKED; | 790 | val |= APIC_LVT_MASKED; |
696 | 791 | ||
697 | val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; | 792 | val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; |
698 | apic_set_reg(apic, offset, val); | 793 | apic_set_reg(apic, reg, val); |
699 | 794 | ||
700 | break; | 795 | break; |
701 | 796 | ||
@@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
703 | hrtimer_cancel(&apic->lapic_timer.timer); | 798 | hrtimer_cancel(&apic->lapic_timer.timer); |
704 | apic_set_reg(apic, APIC_TMICT, val); | 799 | apic_set_reg(apic, APIC_TMICT, val); |
705 | start_apic_timer(apic); | 800 | start_apic_timer(apic); |
706 | return; | 801 | break; |
707 | 802 | ||
708 | case APIC_TDCR: | 803 | case APIC_TDCR: |
709 | if (val & 4) | 804 | if (val & 4) |
@@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
712 | update_divide_count(apic); | 807 | update_divide_count(apic); |
713 | break; | 808 | break; |
714 | 809 | ||
810 | case APIC_ESR: | ||
811 | if (apic_x2apic_mode(apic) && val != 0) { | ||
812 | printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); | ||
813 | ret = 1; | ||
814 | } | ||
815 | break; | ||
816 | |||
817 | case APIC_SELF_IPI: | ||
818 | if (apic_x2apic_mode(apic)) { | ||
819 | apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); | ||
820 | } else | ||
821 | ret = 1; | ||
822 | break; | ||
715 | default: | 823 | default: |
716 | apic_debug("Local APIC Write to read-only register %x\n", | 824 | ret = 1; |
717 | offset); | ||
718 | break; | 825 | break; |
719 | } | 826 | } |
720 | 827 | if (ret) | |
828 | apic_debug("Local APIC Write to read-only register %x\n", reg); | ||
829 | return ret; | ||
721 | } | 830 | } |
722 | 831 | ||
723 | static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, | 832 | static int apic_mmio_write(struct kvm_io_device *this, |
724 | int len, int size) | 833 | gpa_t address, int len, const void *data) |
725 | { | 834 | { |
726 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | 835 | struct kvm_lapic *apic = to_lapic(this); |
727 | int ret = 0; | 836 | unsigned int offset = address - apic->base_address; |
837 | u32 val; | ||
728 | 838 | ||
839 | if (!apic_mmio_in_range(apic, address)) | ||
840 | return -EOPNOTSUPP; | ||
729 | 841 | ||
730 | if (apic_hw_enabled(apic) && | 842 | /* |
731 | (addr >= apic->base_address) && | 843 | * APIC register must be aligned on 128-bits boundary. |
732 | (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) | 844 | * 32/64/128 bits registers must be accessed thru 32 bits. |
733 | ret = 1; | 845 | * Refer SDM 8.4.1 |
846 | */ | ||
847 | if (len != 4 || (offset & 0xf)) { | ||
848 | /* Don't shout loud, $infamous_os would cause only noise. */ | ||
849 | apic_debug("apic write: bad size=%d %lx\n", len, (long)address); | ||
850 | return 0; | ||
851 | } | ||
734 | 852 | ||
735 | return ret; | 853 | val = *(u32*)data; |
854 | |||
855 | /* too common printing */ | ||
856 | if (offset != APIC_EOI) | ||
857 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | ||
858 | "0x%x\n", __func__, offset, len, val); | ||
859 | |||
860 | apic_reg_write(apic, offset & 0xff0, val); | ||
861 | |||
862 | return 0; | ||
736 | } | 863 | } |
737 | 864 | ||
738 | void kvm_free_lapic(struct kvm_vcpu *vcpu) | 865 | void kvm_free_lapic(struct kvm_vcpu *vcpu) |
@@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
763 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | 890 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) |
764 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); | 891 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); |
765 | } | 892 | } |
766 | EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); | ||
767 | 893 | ||
768 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | 894 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) |
769 | { | 895 | { |
@@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | |||
776 | 902 | ||
777 | return (tpr & 0xf0) >> 4; | 903 | return (tpr & 0xf0) >> 4; |
778 | } | 904 | } |
779 | EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); | ||
780 | 905 | ||
781 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | 906 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) |
782 | { | 907 | { |
@@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
787 | vcpu->arch.apic_base = value; | 912 | vcpu->arch.apic_base = value; |
788 | return; | 913 | return; |
789 | } | 914 | } |
790 | if (apic->vcpu->vcpu_id) | 915 | |
916 | if (!kvm_vcpu_is_bsp(apic->vcpu)) | ||
791 | value &= ~MSR_IA32_APICBASE_BSP; | 917 | value &= ~MSR_IA32_APICBASE_BSP; |
792 | 918 | ||
793 | vcpu->arch.apic_base = value; | 919 | vcpu->arch.apic_base = value; |
920 | if (apic_x2apic_mode(apic)) { | ||
921 | u32 id = kvm_apic_id(apic); | ||
922 | u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); | ||
923 | apic_set_reg(apic, APIC_LDR, ldr); | ||
924 | } | ||
794 | apic->base_address = apic->vcpu->arch.apic_base & | 925 | apic->base_address = apic->vcpu->arch.apic_base & |
795 | MSR_IA32_APICBASE_BASE; | 926 | MSR_IA32_APICBASE_BASE; |
796 | 927 | ||
@@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | |||
800 | 931 | ||
801 | } | 932 | } |
802 | 933 | ||
803 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) | ||
804 | { | ||
805 | return vcpu->arch.apic_base; | ||
806 | } | ||
807 | EXPORT_SYMBOL_GPL(kvm_lapic_get_base); | ||
808 | |||
809 | void kvm_lapic_reset(struct kvm_vcpu *vcpu) | 934 | void kvm_lapic_reset(struct kvm_vcpu *vcpu) |
810 | { | 935 | { |
811 | struct kvm_lapic *apic; | 936 | struct kvm_lapic *apic; |
@@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
821 | hrtimer_cancel(&apic->lapic_timer.timer); | 946 | hrtimer_cancel(&apic->lapic_timer.timer); |
822 | 947 | ||
823 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); | 948 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); |
824 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | 949 | kvm_apic_set_version(apic->vcpu); |
825 | 950 | ||
826 | for (i = 0; i < APIC_LVT_NUM; i++) | 951 | for (i = 0; i < APIC_LVT_NUM; i++) |
827 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); | 952 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); |
@@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
842 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); | 967 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); |
843 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | 968 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); |
844 | } | 969 | } |
970 | apic->irr_pending = false; | ||
845 | update_divide_count(apic); | 971 | update_divide_count(apic); |
846 | atomic_set(&apic->lapic_timer.pending, 0); | 972 | atomic_set(&apic->lapic_timer.pending, 0); |
847 | if (vcpu->vcpu_id == 0) | 973 | if (kvm_vcpu_is_bsp(vcpu)) |
848 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | 974 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; |
849 | apic_update_ppr(apic); | 975 | apic_update_ppr(apic); |
850 | 976 | ||
@@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
855 | vcpu, kvm_apic_id(apic), | 981 | vcpu, kvm_apic_id(apic), |
856 | vcpu->arch.apic_base, apic->base_address); | 982 | vcpu->arch.apic_base, apic->base_address); |
857 | } | 983 | } |
858 | EXPORT_SYMBOL_GPL(kvm_lapic_reset); | ||
859 | 984 | ||
860 | bool kvm_apic_present(struct kvm_vcpu *vcpu) | 985 | bool kvm_apic_present(struct kvm_vcpu *vcpu) |
861 | { | 986 | { |
@@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | |||
866 | { | 991 | { |
867 | return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); | 992 | return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); |
868 | } | 993 | } |
869 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | ||
870 | 994 | ||
871 | /* | 995 | /* |
872 | *---------------------------------------------------------------------- | 996 | *---------------------------------------------------------------------- |
@@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = { | |||
917 | .is_periodic = lapic_is_periodic, | 1041 | .is_periodic = lapic_is_periodic, |
918 | }; | 1042 | }; |
919 | 1043 | ||
1044 | static const struct kvm_io_device_ops apic_mmio_ops = { | ||
1045 | .read = apic_mmio_read, | ||
1046 | .write = apic_mmio_write, | ||
1047 | }; | ||
1048 | |||
920 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | 1049 | int kvm_create_lapic(struct kvm_vcpu *vcpu) |
921 | { | 1050 | { |
922 | struct kvm_lapic *apic; | 1051 | struct kvm_lapic *apic; |
@@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) | |||
945 | apic->lapic_timer.timer.function = kvm_timer_fn; | 1074 | apic->lapic_timer.timer.function = kvm_timer_fn; |
946 | apic->lapic_timer.t_ops = &lapic_timer_ops; | 1075 | apic->lapic_timer.t_ops = &lapic_timer_ops; |
947 | apic->lapic_timer.kvm = vcpu->kvm; | 1076 | apic->lapic_timer.kvm = vcpu->kvm; |
948 | apic->lapic_timer.vcpu_id = vcpu->vcpu_id; | 1077 | apic->lapic_timer.vcpu = vcpu; |
949 | 1078 | ||
950 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | 1079 | apic->base_address = APIC_DEFAULT_PHYS_BASE; |
951 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; | 1080 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; |
952 | 1081 | ||
953 | kvm_lapic_reset(vcpu); | 1082 | kvm_lapic_reset(vcpu); |
954 | apic->dev.read = apic_mmio_read; | 1083 | kvm_iodevice_init(&apic->dev, &apic_mmio_ops); |
955 | apic->dev.write = apic_mmio_write; | ||
956 | apic->dev.in_range = apic_mmio_range; | ||
957 | apic->dev.private = apic; | ||
958 | 1084 | ||
959 | return 0; | 1085 | return 0; |
960 | nomem_free_apic: | 1086 | nomem_free_apic: |
@@ -962,7 +1088,6 @@ nomem_free_apic: | |||
962 | nomem: | 1088 | nomem: |
963 | return -ENOMEM; | 1089 | return -ENOMEM; |
964 | } | 1090 | } |
965 | EXPORT_SYMBOL_GPL(kvm_create_lapic); | ||
966 | 1091 | ||
967 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) | 1092 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) |
968 | { | 1093 | { |
@@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | |||
985 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); | 1110 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); |
986 | int r = 0; | 1111 | int r = 0; |
987 | 1112 | ||
988 | if (vcpu->vcpu_id == 0) { | 1113 | if (kvm_vcpu_is_bsp(vcpu)) { |
989 | if (!apic_hw_enabled(vcpu->arch.apic)) | 1114 | if (!apic_hw_enabled(vcpu->arch.apic)) |
990 | r = 1; | 1115 | r = 1; |
991 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | 1116 | if ((lvt0 & APIC_LVT_MASKED) == 0 && |
@@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | |||
1025 | 1150 | ||
1026 | apic->base_address = vcpu->arch.apic_base & | 1151 | apic->base_address = vcpu->arch.apic_base & |
1027 | MSR_IA32_APICBASE_BASE; | 1152 | MSR_IA32_APICBASE_BASE; |
1028 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | 1153 | kvm_apic_set_version(vcpu); |
1154 | |||
1029 | apic_update_ppr(apic); | 1155 | apic_update_ppr(apic); |
1030 | hrtimer_cancel(&apic->lapic_timer.timer); | 1156 | hrtimer_cancel(&apic->lapic_timer.timer); |
1031 | update_divide_count(apic); | 1157 | update_divide_count(apic); |
@@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) | |||
1092 | 1218 | ||
1093 | vcpu->arch.apic->vapic_addr = vapic_addr; | 1219 | vcpu->arch.apic->vapic_addr = vapic_addr; |
1094 | } | 1220 | } |
1221 | |||
1222 | int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
1223 | { | ||
1224 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1225 | u32 reg = (msr - APIC_BASE_MSR) << 4; | ||
1226 | |||
1227 | if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) | ||
1228 | return 1; | ||
1229 | |||
1230 | /* if this is an ICR write, write the upper 32 bits to ICR2 before the command */ | ||
1231 | if (msr == 0x830) | ||
1232 | apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); | ||
1233 | return apic_reg_write(apic, reg, (u32)data); | ||
1234 | } | ||
1235 | |||
1236 | int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) | ||
1237 | { | ||
1238 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1239 | u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; | ||
1240 | |||
1241 | if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) | ||
1242 | return 1; | ||
1243 | |||
1244 | if (apic_reg_read(apic, reg, 4, &low)) | ||
1245 | return 1; | ||
1246 | if (msr == 0x830) | ||
1247 | apic_reg_read(apic, APIC_ICR2, 4, &high); | ||
1248 | |||
1249 | *data = (((u64)high) << 32) | low; | ||
1250 | |||
1251 | return 0; | ||
1252 | } | ||
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a587f8349c4..40010b09c4a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h | |||
@@ -12,6 +12,7 @@ struct kvm_lapic { | |||
12 | struct kvm_timer lapic_timer; | 12 | struct kvm_timer lapic_timer; |
13 | u32 divide_count; | 13 | u32 divide_count; |
14 | struct kvm_vcpu *vcpu; | 14 | struct kvm_vcpu *vcpu; |
15 | bool irr_pending; | ||
15 | struct page *regs_page; | 16 | struct page *regs_page; |
16 | void *regs; | 17 | void *regs; |
17 | gpa_t vapic_addr; | 18 | gpa_t vapic_addr; |
@@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | |||
28 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | 29 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); |
29 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | 30 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); |
30 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); | 31 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); |
32 | void kvm_apic_set_version(struct kvm_vcpu *vcpu); | ||
31 | 33 | ||
32 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | 34 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); |
33 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | 35 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); |
@@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); | |||
44 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); | 46 | void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); |
45 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); | 47 | void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); |
46 | 48 | ||
49 | int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); | ||
50 | int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); | ||
47 | #endif | 51 | #endif |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ef5bb2b404..eca41ae9f45 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -18,6 +18,7 @@ | |||
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | ||
21 | 22 | ||
22 | #include <linux/kvm_host.h> | 23 | #include <linux/kvm_host.h> |
23 | #include <linux/types.h> | 24 | #include <linux/types.h> |
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644); | |||
107 | 108 | ||
108 | #define PT32_LEVEL_MASK(level) \ | 109 | #define PT32_LEVEL_MASK(level) \ |
109 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | 110 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) |
111 | #define PT32_LVL_OFFSET_MASK(level) \ | ||
112 | (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
113 | * PT32_LEVEL_BITS))) - 1)) | ||
110 | 114 | ||
111 | #define PT32_INDEX(address, level)\ | 115 | #define PT32_INDEX(address, level)\ |
112 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | 116 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) |
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644); | |||
115 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | 119 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) |
116 | #define PT64_DIR_BASE_ADDR_MASK \ | 120 | #define PT64_DIR_BASE_ADDR_MASK \ |
117 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | 121 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) |
122 | #define PT64_LVL_ADDR_MASK(level) \ | ||
123 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
124 | * PT64_LEVEL_BITS))) - 1)) | ||
125 | #define PT64_LVL_OFFSET_MASK(level) \ | ||
126 | (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
127 | * PT64_LEVEL_BITS))) - 1)) | ||
118 | 128 | ||
119 | #define PT32_BASE_ADDR_MASK PAGE_MASK | 129 | #define PT32_BASE_ADDR_MASK PAGE_MASK |
120 | #define PT32_DIR_BASE_ADDR_MASK \ | 130 | #define PT32_DIR_BASE_ADDR_MASK \ |
121 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | 131 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) |
132 | #define PT32_LVL_ADDR_MASK(level) \ | ||
133 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ | ||
134 | * PT32_LEVEL_BITS))) - 1)) | ||
122 | 135 | ||
123 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | 136 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ |
124 | | PT64_NX_MASK) | 137 | | PT64_NX_MASK) |
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644); | |||
129 | #define PFERR_RSVD_MASK (1U << 3) | 142 | #define PFERR_RSVD_MASK (1U << 3) |
130 | #define PFERR_FETCH_MASK (1U << 4) | 143 | #define PFERR_FETCH_MASK (1U << 4) |
131 | 144 | ||
145 | #define PT_PDPE_LEVEL 3 | ||
132 | #define PT_DIRECTORY_LEVEL 2 | 146 | #define PT_DIRECTORY_LEVEL 2 |
133 | #define PT_PAGE_TABLE_LEVEL 1 | 147 | #define PT_PAGE_TABLE_LEVEL 1 |
134 | 148 | ||
@@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644); | |||
139 | #define ACC_USER_MASK PT_USER_MASK | 153 | #define ACC_USER_MASK PT_USER_MASK |
140 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | 154 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
141 | 155 | ||
156 | #define CREATE_TRACE_POINTS | ||
157 | #include "mmutrace.h" | ||
158 | |||
142 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | 159 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
143 | 160 | ||
144 | struct kvm_rmap_desc { | 161 | struct kvm_rmap_desc { |
145 | u64 *shadow_ptes[RMAP_EXT]; | 162 | u64 *sptes[RMAP_EXT]; |
146 | struct kvm_rmap_desc *more; | 163 | struct kvm_rmap_desc *more; |
147 | }; | 164 | }; |
148 | 165 | ||
@@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte) | |||
239 | return pte & PT_WRITABLE_MASK; | 256 | return pte & PT_WRITABLE_MASK; |
240 | } | 257 | } |
241 | 258 | ||
242 | static int is_dirty_pte(unsigned long pte) | 259 | static int is_dirty_gpte(unsigned long pte) |
243 | { | 260 | { |
244 | return pte & shadow_dirty_mask; | 261 | return pte & PT_DIRTY_MASK; |
245 | } | 262 | } |
246 | 263 | ||
247 | static int is_rmap_pte(u64 pte) | 264 | static int is_rmap_spte(u64 pte) |
248 | { | 265 | { |
249 | return is_shadow_present_pte(pte); | 266 | return is_shadow_present_pte(pte); |
250 | } | 267 | } |
251 | 268 | ||
269 | static int is_last_spte(u64 pte, int level) | ||
270 | { | ||
271 | if (level == PT_PAGE_TABLE_LEVEL) | ||
272 | return 1; | ||
273 | if (is_large_pte(pte)) | ||
274 | return 1; | ||
275 | return 0; | ||
276 | } | ||
277 | |||
252 | static pfn_t spte_to_pfn(u64 pte) | 278 | static pfn_t spte_to_pfn(u64 pte) |
253 | { | 279 | { |
254 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 280 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
@@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) | |||
261 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | 287 | return (gpte & PT32_DIR_PSE36_MASK) << shift; |
262 | } | 288 | } |
263 | 289 | ||
264 | static void set_shadow_pte(u64 *sptep, u64 spte) | 290 | static void __set_spte(u64 *sptep, u64 spte) |
265 | { | 291 | { |
266 | #ifdef CONFIG_X86_64 | 292 | #ifdef CONFIG_X86_64 |
267 | set_64bit((unsigned long *)sptep, spte); | 293 | set_64bit((unsigned long *)sptep, spte); |
@@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | |||
380 | * Return the pointer to the largepage write count for a given | 406 | * Return the pointer to the largepage write count for a given |
381 | * gfn, handling slots that are not large page aligned. | 407 | * gfn, handling slots that are not large page aligned. |
382 | */ | 408 | */ |
383 | static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) | 409 | static int *slot_largepage_idx(gfn_t gfn, |
410 | struct kvm_memory_slot *slot, | ||
411 | int level) | ||
384 | { | 412 | { |
385 | unsigned long idx; | 413 | unsigned long idx; |
386 | 414 | ||
387 | idx = (gfn / KVM_PAGES_PER_HPAGE) - | 415 | idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - |
388 | (slot->base_gfn / KVM_PAGES_PER_HPAGE); | 416 | (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); |
389 | return &slot->lpage_info[idx].write_count; | 417 | return &slot->lpage_info[level - 2][idx].write_count; |
390 | } | 418 | } |
391 | 419 | ||
392 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | 420 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) |
393 | { | 421 | { |
422 | struct kvm_memory_slot *slot; | ||
394 | int *write_count; | 423 | int *write_count; |
424 | int i; | ||
395 | 425 | ||
396 | gfn = unalias_gfn(kvm, gfn); | 426 | gfn = unalias_gfn(kvm, gfn); |
397 | write_count = slot_largepage_idx(gfn, | 427 | |
398 | gfn_to_memslot_unaliased(kvm, gfn)); | 428 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
399 | *write_count += 1; | 429 | for (i = PT_DIRECTORY_LEVEL; |
430 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { | ||
431 | write_count = slot_largepage_idx(gfn, slot, i); | ||
432 | *write_count += 1; | ||
433 | } | ||
400 | } | 434 | } |
401 | 435 | ||
402 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | 436 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) |
403 | { | 437 | { |
438 | struct kvm_memory_slot *slot; | ||
404 | int *write_count; | 439 | int *write_count; |
440 | int i; | ||
405 | 441 | ||
406 | gfn = unalias_gfn(kvm, gfn); | 442 | gfn = unalias_gfn(kvm, gfn); |
407 | write_count = slot_largepage_idx(gfn, | 443 | for (i = PT_DIRECTORY_LEVEL; |
408 | gfn_to_memslot_unaliased(kvm, gfn)); | 444 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
409 | *write_count -= 1; | 445 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
410 | WARN_ON(*write_count < 0); | 446 | write_count = slot_largepage_idx(gfn, slot, i); |
447 | *write_count -= 1; | ||
448 | WARN_ON(*write_count < 0); | ||
449 | } | ||
411 | } | 450 | } |
412 | 451 | ||
413 | static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) | 452 | static int has_wrprotected_page(struct kvm *kvm, |
453 | gfn_t gfn, | ||
454 | int level) | ||
414 | { | 455 | { |
415 | struct kvm_memory_slot *slot; | 456 | struct kvm_memory_slot *slot; |
416 | int *largepage_idx; | 457 | int *largepage_idx; |
@@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) | |||
418 | gfn = unalias_gfn(kvm, gfn); | 459 | gfn = unalias_gfn(kvm, gfn); |
419 | slot = gfn_to_memslot_unaliased(kvm, gfn); | 460 | slot = gfn_to_memslot_unaliased(kvm, gfn); |
420 | if (slot) { | 461 | if (slot) { |
421 | largepage_idx = slot_largepage_idx(gfn, slot); | 462 | largepage_idx = slot_largepage_idx(gfn, slot, level); |
422 | return *largepage_idx; | 463 | return *largepage_idx; |
423 | } | 464 | } |
424 | 465 | ||
425 | return 1; | 466 | return 1; |
426 | } | 467 | } |
427 | 468 | ||
428 | static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) | 469 | static int host_mapping_level(struct kvm *kvm, gfn_t gfn) |
429 | { | 470 | { |
471 | unsigned long page_size = PAGE_SIZE; | ||
430 | struct vm_area_struct *vma; | 472 | struct vm_area_struct *vma; |
431 | unsigned long addr; | 473 | unsigned long addr; |
432 | int ret = 0; | 474 | int i, ret = 0; |
433 | 475 | ||
434 | addr = gfn_to_hva(kvm, gfn); | 476 | addr = gfn_to_hva(kvm, gfn); |
435 | if (kvm_is_error_hva(addr)) | 477 | if (kvm_is_error_hva(addr)) |
436 | return ret; | 478 | return page_size; |
437 | 479 | ||
438 | down_read(¤t->mm->mmap_sem); | 480 | down_read(¤t->mm->mmap_sem); |
439 | vma = find_vma(current->mm, addr); | 481 | vma = find_vma(current->mm, addr); |
440 | if (vma && is_vm_hugetlb_page(vma)) | 482 | if (!vma) |
441 | ret = 1; | 483 | goto out; |
484 | |||
485 | page_size = vma_kernel_pagesize(vma); | ||
486 | |||
487 | out: | ||
442 | up_read(¤t->mm->mmap_sem); | 488 | up_read(¤t->mm->mmap_sem); |
443 | 489 | ||
490 | for (i = PT_PAGE_TABLE_LEVEL; | ||
491 | i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { | ||
492 | if (page_size >= KVM_HPAGE_SIZE(i)) | ||
493 | ret = i; | ||
494 | else | ||
495 | break; | ||
496 | } | ||
497 | |||
444 | return ret; | 498 | return ret; |
445 | } | 499 | } |
446 | 500 | ||
447 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 501 | static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) |
448 | { | 502 | { |
449 | struct kvm_memory_slot *slot; | 503 | struct kvm_memory_slot *slot; |
450 | 504 | int host_level; | |
451 | if (has_wrprotected_page(vcpu->kvm, large_gfn)) | 505 | int level = PT_PAGE_TABLE_LEVEL; |
452 | return 0; | ||
453 | |||
454 | if (!host_largepage_backed(vcpu->kvm, large_gfn)) | ||
455 | return 0; | ||
456 | 506 | ||
457 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); | 507 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); |
458 | if (slot && slot->dirty_bitmap) | 508 | if (slot && slot->dirty_bitmap) |
459 | return 0; | 509 | return PT_PAGE_TABLE_LEVEL; |
460 | 510 | ||
461 | return 1; | 511 | host_level = host_mapping_level(vcpu->kvm, large_gfn); |
512 | |||
513 | if (host_level == PT_PAGE_TABLE_LEVEL) | ||
514 | return host_level; | ||
515 | |||
516 | for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { | ||
517 | |||
518 | if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) | ||
519 | break; | ||
520 | } | ||
521 | |||
522 | return level - 1; | ||
462 | } | 523 | } |
463 | 524 | ||
464 | /* | 525 | /* |
@@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | |||
466 | * Note: gfn must be unaliased before this function get called | 527 | * Note: gfn must be unaliased before this function get called |
467 | */ | 528 | */ |
468 | 529 | ||
469 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) | 530 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) |
470 | { | 531 | { |
471 | struct kvm_memory_slot *slot; | 532 | struct kvm_memory_slot *slot; |
472 | unsigned long idx; | 533 | unsigned long idx; |
473 | 534 | ||
474 | slot = gfn_to_memslot(kvm, gfn); | 535 | slot = gfn_to_memslot(kvm, gfn); |
475 | if (!lpage) | 536 | if (likely(level == PT_PAGE_TABLE_LEVEL)) |
476 | return &slot->rmap[gfn - slot->base_gfn]; | 537 | return &slot->rmap[gfn - slot->base_gfn]; |
477 | 538 | ||
478 | idx = (gfn / KVM_PAGES_PER_HPAGE) - | 539 | idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - |
479 | (slot->base_gfn / KVM_PAGES_PER_HPAGE); | 540 | (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); |
480 | 541 | ||
481 | return &slot->lpage_info[idx].rmap_pde; | 542 | return &slot->lpage_info[level - 2][idx].rmap_pde; |
482 | } | 543 | } |
483 | 544 | ||
484 | /* | 545 | /* |
@@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) | |||
494 | * the spte was not added. | 555 | * the spte was not added. |
495 | * | 556 | * |
496 | */ | 557 | */ |
497 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) | 558 | static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
498 | { | 559 | { |
499 | struct kvm_mmu_page *sp; | 560 | struct kvm_mmu_page *sp; |
500 | struct kvm_rmap_desc *desc; | 561 | struct kvm_rmap_desc *desc; |
501 | unsigned long *rmapp; | 562 | unsigned long *rmapp; |
502 | int i, count = 0; | 563 | int i, count = 0; |
503 | 564 | ||
504 | if (!is_rmap_pte(*spte)) | 565 | if (!is_rmap_spte(*spte)) |
505 | return count; | 566 | return count; |
506 | gfn = unalias_gfn(vcpu->kvm, gfn); | 567 | gfn = unalias_gfn(vcpu->kvm, gfn); |
507 | sp = page_header(__pa(spte)); | 568 | sp = page_header(__pa(spte)); |
508 | sp->gfns[spte - sp->spt] = gfn; | 569 | sp->gfns[spte - sp->spt] = gfn; |
509 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); | 570 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
510 | if (!*rmapp) { | 571 | if (!*rmapp) { |
511 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | 572 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); |
512 | *rmapp = (unsigned long)spte; | 573 | *rmapp = (unsigned long)spte; |
513 | } else if (!(*rmapp & 1)) { | 574 | } else if (!(*rmapp & 1)) { |
514 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | 575 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); |
515 | desc = mmu_alloc_rmap_desc(vcpu); | 576 | desc = mmu_alloc_rmap_desc(vcpu); |
516 | desc->shadow_ptes[0] = (u64 *)*rmapp; | 577 | desc->sptes[0] = (u64 *)*rmapp; |
517 | desc->shadow_ptes[1] = spte; | 578 | desc->sptes[1] = spte; |
518 | *rmapp = (unsigned long)desc | 1; | 579 | *rmapp = (unsigned long)desc | 1; |
519 | } else { | 580 | } else { |
520 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | 581 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); |
521 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 582 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
522 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { | 583 | while (desc->sptes[RMAP_EXT-1] && desc->more) { |
523 | desc = desc->more; | 584 | desc = desc->more; |
524 | count += RMAP_EXT; | 585 | count += RMAP_EXT; |
525 | } | 586 | } |
526 | if (desc->shadow_ptes[RMAP_EXT-1]) { | 587 | if (desc->sptes[RMAP_EXT-1]) { |
527 | desc->more = mmu_alloc_rmap_desc(vcpu); | 588 | desc->more = mmu_alloc_rmap_desc(vcpu); |
528 | desc = desc->more; | 589 | desc = desc->more; |
529 | } | 590 | } |
530 | for (i = 0; desc->shadow_ptes[i]; ++i) | 591 | for (i = 0; desc->sptes[i]; ++i) |
531 | ; | 592 | ; |
532 | desc->shadow_ptes[i] = spte; | 593 | desc->sptes[i] = spte; |
533 | } | 594 | } |
534 | return count; | 595 | return count; |
535 | } | 596 | } |
@@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp, | |||
541 | { | 602 | { |
542 | int j; | 603 | int j; |
543 | 604 | ||
544 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | 605 | for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) |
545 | ; | 606 | ; |
546 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | 607 | desc->sptes[i] = desc->sptes[j]; |
547 | desc->shadow_ptes[j] = NULL; | 608 | desc->sptes[j] = NULL; |
548 | if (j != 0) | 609 | if (j != 0) |
549 | return; | 610 | return; |
550 | if (!prev_desc && !desc->more) | 611 | if (!prev_desc && !desc->more) |
551 | *rmapp = (unsigned long)desc->shadow_ptes[0]; | 612 | *rmapp = (unsigned long)desc->sptes[0]; |
552 | else | 613 | else |
553 | if (prev_desc) | 614 | if (prev_desc) |
554 | prev_desc->more = desc->more; | 615 | prev_desc->more = desc->more; |
@@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
566 | unsigned long *rmapp; | 627 | unsigned long *rmapp; |
567 | int i; | 628 | int i; |
568 | 629 | ||
569 | if (!is_rmap_pte(*spte)) | 630 | if (!is_rmap_spte(*spte)) |
570 | return; | 631 | return; |
571 | sp = page_header(__pa(spte)); | 632 | sp = page_header(__pa(spte)); |
572 | pfn = spte_to_pfn(*spte); | 633 | pfn = spte_to_pfn(*spte); |
@@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
576 | kvm_release_pfn_dirty(pfn); | 637 | kvm_release_pfn_dirty(pfn); |
577 | else | 638 | else |
578 | kvm_release_pfn_clean(pfn); | 639 | kvm_release_pfn_clean(pfn); |
579 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); | 640 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); |
580 | if (!*rmapp) { | 641 | if (!*rmapp) { |
581 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 642 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); |
582 | BUG(); | 643 | BUG(); |
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
593 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 654 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
594 | prev_desc = NULL; | 655 | prev_desc = NULL; |
595 | while (desc) { | 656 | while (desc) { |
596 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | 657 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) |
597 | if (desc->shadow_ptes[i] == spte) { | 658 | if (desc->sptes[i] == spte) { |
598 | rmap_desc_remove_entry(rmapp, | 659 | rmap_desc_remove_entry(rmapp, |
599 | desc, i, | 660 | desc, i, |
600 | prev_desc); | 661 | prev_desc); |
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | |||
625 | prev_desc = NULL; | 686 | prev_desc = NULL; |
626 | prev_spte = NULL; | 687 | prev_spte = NULL; |
627 | while (desc) { | 688 | while (desc) { |
628 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { | 689 | for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { |
629 | if (prev_spte == spte) | 690 | if (prev_spte == spte) |
630 | return desc->shadow_ptes[i]; | 691 | return desc->sptes[i]; |
631 | prev_spte = desc->shadow_ptes[i]; | 692 | prev_spte = desc->sptes[i]; |
632 | } | 693 | } |
633 | desc = desc->more; | 694 | desc = desc->more; |
634 | } | 695 | } |
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
639 | { | 700 | { |
640 | unsigned long *rmapp; | 701 | unsigned long *rmapp; |
641 | u64 *spte; | 702 | u64 *spte; |
642 | int write_protected = 0; | 703 | int i, write_protected = 0; |
643 | 704 | ||
644 | gfn = unalias_gfn(kvm, gfn); | 705 | gfn = unalias_gfn(kvm, gfn); |
645 | rmapp = gfn_to_rmap(kvm, gfn, 0); | 706 | rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); |
646 | 707 | ||
647 | spte = rmap_next(kvm, rmapp, NULL); | 708 | spte = rmap_next(kvm, rmapp, NULL); |
648 | while (spte) { | 709 | while (spte) { |
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
650 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 711 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
651 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | 712 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); |
652 | if (is_writeble_pte(*spte)) { | 713 | if (is_writeble_pte(*spte)) { |
653 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | 714 | __set_spte(spte, *spte & ~PT_WRITABLE_MASK); |
654 | write_protected = 1; | 715 | write_protected = 1; |
655 | } | 716 | } |
656 | spte = rmap_next(kvm, rmapp, spte); | 717 | spte = rmap_next(kvm, rmapp, spte); |
@@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
664 | } | 725 | } |
665 | 726 | ||
666 | /* check for huge page mappings */ | 727 | /* check for huge page mappings */ |
667 | rmapp = gfn_to_rmap(kvm, gfn, 1); | 728 | for (i = PT_DIRECTORY_LEVEL; |
668 | spte = rmap_next(kvm, rmapp, NULL); | 729 | i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { |
669 | while (spte) { | 730 | rmapp = gfn_to_rmap(kvm, gfn, i); |
670 | BUG_ON(!spte); | 731 | spte = rmap_next(kvm, rmapp, NULL); |
671 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 732 | while (spte) { |
672 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | 733 | BUG_ON(!spte); |
673 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | 734 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
674 | if (is_writeble_pte(*spte)) { | 735 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); |
675 | rmap_remove(kvm, spte); | 736 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); |
676 | --kvm->stat.lpages; | 737 | if (is_writeble_pte(*spte)) { |
677 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 738 | rmap_remove(kvm, spte); |
678 | spte = NULL; | 739 | --kvm->stat.lpages; |
679 | write_protected = 1; | 740 | __set_spte(spte, shadow_trap_nonpresent_pte); |
741 | spte = NULL; | ||
742 | write_protected = 1; | ||
743 | } | ||
744 | spte = rmap_next(kvm, rmapp, spte); | ||
680 | } | 745 | } |
681 | spte = rmap_next(kvm, rmapp, spte); | ||
682 | } | 746 | } |
683 | 747 | ||
684 | return write_protected; | 748 | return write_protected; |
@@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
693 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | 757 | BUG_ON(!(*spte & PT_PRESENT_MASK)); |
694 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); | 758 | rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); |
695 | rmap_remove(kvm, spte); | 759 | rmap_remove(kvm, spte); |
696 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 760 | __set_spte(spte, shadow_trap_nonpresent_pte); |
697 | need_tlb_flush = 1; | 761 | need_tlb_flush = 1; |
698 | } | 762 | } |
699 | return need_tlb_flush; | 763 | return need_tlb_flush; |
@@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
702 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | 766 | static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, |
703 | int (*handler)(struct kvm *kvm, unsigned long *rmapp)) | 767 | int (*handler)(struct kvm *kvm, unsigned long *rmapp)) |
704 | { | 768 | { |
705 | int i; | 769 | int i, j; |
706 | int retval = 0; | 770 | int retval = 0; |
707 | 771 | ||
708 | /* | 772 | /* |
@@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, | |||
721 | end = start + (memslot->npages << PAGE_SHIFT); | 785 | end = start + (memslot->npages << PAGE_SHIFT); |
722 | if (hva >= start && hva < end) { | 786 | if (hva >= start && hva < end) { |
723 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; | 787 | gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; |
788 | |||
724 | retval |= handler(kvm, &memslot->rmap[gfn_offset]); | 789 | retval |= handler(kvm, &memslot->rmap[gfn_offset]); |
725 | retval |= handler(kvm, | 790 | |
726 | &memslot->lpage_info[ | 791 | for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { |
727 | gfn_offset / | 792 | int idx = gfn_offset; |
728 | KVM_PAGES_PER_HPAGE].rmap_pde); | 793 | idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); |
794 | retval |= handler(kvm, | ||
795 | &memslot->lpage_info[j][idx].rmap_pde); | ||
796 | } | ||
729 | } | 797 | } |
730 | } | 798 | } |
731 | 799 | ||
@@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
763 | 831 | ||
764 | #define RMAP_RECYCLE_THRESHOLD 1000 | 832 | #define RMAP_RECYCLE_THRESHOLD 1000 |
765 | 833 | ||
766 | static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) | 834 | static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) |
767 | { | 835 | { |
768 | unsigned long *rmapp; | 836 | unsigned long *rmapp; |
837 | struct kvm_mmu_page *sp; | ||
838 | |||
839 | sp = page_header(__pa(spte)); | ||
769 | 840 | ||
770 | gfn = unalias_gfn(vcpu->kvm, gfn); | 841 | gfn = unalias_gfn(vcpu->kvm, gfn); |
771 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); | 842 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); |
772 | 843 | ||
773 | kvm_unmap_rmapp(vcpu->kvm, rmapp); | 844 | kvm_unmap_rmapp(vcpu->kvm, rmapp); |
774 | kvm_flush_remote_tlbs(vcpu->kvm); | 845 | kvm_flush_remote_tlbs(vcpu->kvm); |
@@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1109 | return 1; | 1180 | return 1; |
1110 | } | 1181 | } |
1111 | 1182 | ||
1183 | trace_kvm_mmu_sync_page(sp); | ||
1112 | if (rmap_write_protect(vcpu->kvm, sp->gfn)) | 1184 | if (rmap_write_protect(vcpu->kvm, sp->gfn)) |
1113 | kvm_flush_remote_tlbs(vcpu->kvm); | 1185 | kvm_flush_remote_tlbs(vcpu->kvm); |
1114 | kvm_unlink_unsync_page(vcpu->kvm, sp); | 1186 | kvm_unlink_unsync_page(vcpu->kvm, sp); |
@@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1231 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 1303 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
1232 | role.quadrant = quadrant; | 1304 | role.quadrant = quadrant; |
1233 | } | 1305 | } |
1234 | pgprintk("%s: looking gfn %lx role %x\n", __func__, | ||
1235 | gfn, role.word); | ||
1236 | index = kvm_page_table_hashfn(gfn); | 1306 | index = kvm_page_table_hashfn(gfn); |
1237 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1307 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1238 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) | 1308 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) |
@@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1249 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); | 1319 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); |
1250 | kvm_mmu_mark_parents_unsync(vcpu, sp); | 1320 | kvm_mmu_mark_parents_unsync(vcpu, sp); |
1251 | } | 1321 | } |
1252 | pgprintk("%s: found\n", __func__); | 1322 | trace_kvm_mmu_get_page(sp, false); |
1253 | return sp; | 1323 | return sp; |
1254 | } | 1324 | } |
1255 | ++vcpu->kvm->stat.mmu_cache_miss; | 1325 | ++vcpu->kvm->stat.mmu_cache_miss; |
1256 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | 1326 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); |
1257 | if (!sp) | 1327 | if (!sp) |
1258 | return sp; | 1328 | return sp; |
1259 | pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); | ||
1260 | sp->gfn = gfn; | 1329 | sp->gfn = gfn; |
1261 | sp->role = role; | 1330 | sp->role = role; |
1262 | hlist_add_head(&sp->hash_link, bucket); | 1331 | hlist_add_head(&sp->hash_link, bucket); |
@@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1269 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | 1338 | vcpu->arch.mmu.prefetch_page(vcpu, sp); |
1270 | else | 1339 | else |
1271 | nonpaging_prefetch_page(vcpu, sp); | 1340 | nonpaging_prefetch_page(vcpu, sp); |
1341 | trace_kvm_mmu_get_page(sp, true); | ||
1272 | return sp; | 1342 | return sp; |
1273 | } | 1343 | } |
1274 | 1344 | ||
@@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) | |||
1292 | { | 1362 | { |
1293 | if (iterator->level < PT_PAGE_TABLE_LEVEL) | 1363 | if (iterator->level < PT_PAGE_TABLE_LEVEL) |
1294 | return false; | 1364 | return false; |
1365 | |||
1366 | if (iterator->level == PT_PAGE_TABLE_LEVEL) | ||
1367 | if (is_large_pte(*iterator->sptep)) | ||
1368 | return false; | ||
1369 | |||
1295 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); | 1370 | iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); |
1296 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; | 1371 | iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; |
1297 | return true; | 1372 | return true; |
@@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
1312 | 1387 | ||
1313 | pt = sp->spt; | 1388 | pt = sp->spt; |
1314 | 1389 | ||
1315 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) { | ||
1316 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1317 | if (is_shadow_present_pte(pt[i])) | ||
1318 | rmap_remove(kvm, &pt[i]); | ||
1319 | pt[i] = shadow_trap_nonpresent_pte; | ||
1320 | } | ||
1321 | return; | ||
1322 | } | ||
1323 | |||
1324 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 1390 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
1325 | ent = pt[i]; | 1391 | ent = pt[i]; |
1326 | 1392 | ||
1327 | if (is_shadow_present_pte(ent)) { | 1393 | if (is_shadow_present_pte(ent)) { |
1328 | if (!is_large_pte(ent)) { | 1394 | if (!is_last_spte(ent, sp->role.level)) { |
1329 | ent &= PT64_BASE_ADDR_MASK; | 1395 | ent &= PT64_BASE_ADDR_MASK; |
1330 | mmu_page_remove_parent_pte(page_header(ent), | 1396 | mmu_page_remove_parent_pte(page_header(ent), |
1331 | &pt[i]); | 1397 | &pt[i]); |
1332 | } else { | 1398 | } else { |
1333 | --kvm->stat.lpages; | 1399 | if (is_large_pte(ent)) |
1400 | --kvm->stat.lpages; | ||
1334 | rmap_remove(kvm, &pt[i]); | 1401 | rmap_remove(kvm, &pt[i]); |
1335 | } | 1402 | } |
1336 | } | 1403 | } |
@@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | |||
1346 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | 1413 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) |
1347 | { | 1414 | { |
1348 | int i; | 1415 | int i; |
1416 | struct kvm_vcpu *vcpu; | ||
1349 | 1417 | ||
1350 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 1418 | kvm_for_each_vcpu(i, vcpu, kvm) |
1351 | if (kvm->vcpus[i]) | 1419 | vcpu->arch.last_pte_updated = NULL; |
1352 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | ||
1353 | } | 1420 | } |
1354 | 1421 | ||
1355 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | 1422 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
@@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1368 | } | 1435 | } |
1369 | BUG_ON(!parent_pte); | 1436 | BUG_ON(!parent_pte); |
1370 | kvm_mmu_put_page(sp, parent_pte); | 1437 | kvm_mmu_put_page(sp, parent_pte); |
1371 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | 1438 | __set_spte(parent_pte, shadow_trap_nonpresent_pte); |
1372 | } | 1439 | } |
1373 | } | 1440 | } |
1374 | 1441 | ||
@@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, | |||
1400 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1467 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) |
1401 | { | 1468 | { |
1402 | int ret; | 1469 | int ret; |
1470 | |||
1471 | trace_kvm_mmu_zap_page(sp); | ||
1403 | ++kvm->stat.mmu_shadow_zapped; | 1472 | ++kvm->stat.mmu_shadow_zapped; |
1404 | ret = mmu_zap_unsync_children(kvm, sp); | 1473 | ret = mmu_zap_unsync_children(kvm, sp); |
1405 | kvm_mmu_page_unlink_children(kvm, sp); | 1474 | kvm_mmu_page_unlink_children(kvm, sp); |
@@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) | |||
1516 | 1585 | ||
1517 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 1586 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
1518 | if (pt[i] == shadow_notrap_nonpresent_pte) | 1587 | if (pt[i] == shadow_notrap_nonpresent_pte) |
1519 | set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); | 1588 | __set_spte(&pt[i], shadow_trap_nonpresent_pte); |
1520 | } | 1589 | } |
1521 | } | 1590 | } |
1522 | 1591 | ||
@@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
1646 | struct kvm_mmu_page *s; | 1715 | struct kvm_mmu_page *s; |
1647 | struct hlist_node *node, *n; | 1716 | struct hlist_node *node, *n; |
1648 | 1717 | ||
1718 | trace_kvm_mmu_unsync_page(sp); | ||
1649 | index = kvm_page_table_hashfn(sp->gfn); | 1719 | index = kvm_page_table_hashfn(sp->gfn); |
1650 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1720 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1651 | /* don't unsync if pagetable is shadowed with multiple roles */ | 1721 | /* don't unsync if pagetable is shadowed with multiple roles */ |
@@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
1682 | return 0; | 1752 | return 0; |
1683 | } | 1753 | } |
1684 | 1754 | ||
1685 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1755 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
1686 | unsigned pte_access, int user_fault, | 1756 | unsigned pte_access, int user_fault, |
1687 | int write_fault, int dirty, int largepage, | 1757 | int write_fault, int dirty, int level, |
1688 | gfn_t gfn, pfn_t pfn, bool speculative, | 1758 | gfn_t gfn, pfn_t pfn, bool speculative, |
1689 | bool can_unsync) | 1759 | bool can_unsync) |
1690 | { | 1760 | { |
@@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1707 | spte |= shadow_nx_mask; | 1777 | spte |= shadow_nx_mask; |
1708 | if (pte_access & ACC_USER_MASK) | 1778 | if (pte_access & ACC_USER_MASK) |
1709 | spte |= shadow_user_mask; | 1779 | spte |= shadow_user_mask; |
1710 | if (largepage) | 1780 | if (level > PT_PAGE_TABLE_LEVEL) |
1711 | spte |= PT_PAGE_SIZE_MASK; | 1781 | spte |= PT_PAGE_SIZE_MASK; |
1712 | if (tdp_enabled) | 1782 | if (tdp_enabled) |
1713 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, | 1783 | spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, |
@@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1718 | if ((pte_access & ACC_WRITE_MASK) | 1788 | if ((pte_access & ACC_WRITE_MASK) |
1719 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1789 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { |
1720 | 1790 | ||
1721 | if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { | 1791 | if (level > PT_PAGE_TABLE_LEVEL && |
1792 | has_wrprotected_page(vcpu->kvm, gfn, level)) { | ||
1722 | ret = 1; | 1793 | ret = 1; |
1723 | spte = shadow_trap_nonpresent_pte; | 1794 | spte = shadow_trap_nonpresent_pte; |
1724 | goto set_pte; | 1795 | goto set_pte; |
@@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1732 | * is responsibility of mmu_get_page / kvm_sync_page. | 1803 | * is responsibility of mmu_get_page / kvm_sync_page. |
1733 | * Same reasoning can be applied to dirty page accounting. | 1804 | * Same reasoning can be applied to dirty page accounting. |
1734 | */ | 1805 | */ |
1735 | if (!can_unsync && is_writeble_pte(*shadow_pte)) | 1806 | if (!can_unsync && is_writeble_pte(*sptep)) |
1736 | goto set_pte; | 1807 | goto set_pte; |
1737 | 1808 | ||
1738 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { | 1809 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
@@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1749 | mark_page_dirty(vcpu->kvm, gfn); | 1820 | mark_page_dirty(vcpu->kvm, gfn); |
1750 | 1821 | ||
1751 | set_pte: | 1822 | set_pte: |
1752 | set_shadow_pte(shadow_pte, spte); | 1823 | __set_spte(sptep, spte); |
1753 | return ret; | 1824 | return ret; |
1754 | } | 1825 | } |
1755 | 1826 | ||
1756 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1827 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
1757 | unsigned pt_access, unsigned pte_access, | 1828 | unsigned pt_access, unsigned pte_access, |
1758 | int user_fault, int write_fault, int dirty, | 1829 | int user_fault, int write_fault, int dirty, |
1759 | int *ptwrite, int largepage, gfn_t gfn, | 1830 | int *ptwrite, int level, gfn_t gfn, |
1760 | pfn_t pfn, bool speculative) | 1831 | pfn_t pfn, bool speculative) |
1761 | { | 1832 | { |
1762 | int was_rmapped = 0; | 1833 | int was_rmapped = 0; |
1763 | int was_writeble = is_writeble_pte(*shadow_pte); | 1834 | int was_writeble = is_writeble_pte(*sptep); |
1764 | int rmap_count; | 1835 | int rmap_count; |
1765 | 1836 | ||
1766 | pgprintk("%s: spte %llx access %x write_fault %d" | 1837 | pgprintk("%s: spte %llx access %x write_fault %d" |
1767 | " user_fault %d gfn %lx\n", | 1838 | " user_fault %d gfn %lx\n", |
1768 | __func__, *shadow_pte, pt_access, | 1839 | __func__, *sptep, pt_access, |
1769 | write_fault, user_fault, gfn); | 1840 | write_fault, user_fault, gfn); |
1770 | 1841 | ||
1771 | if (is_rmap_pte(*shadow_pte)) { | 1842 | if (is_rmap_spte(*sptep)) { |
1772 | /* | 1843 | /* |
1773 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | 1844 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink |
1774 | * the parent of the now unreachable PTE. | 1845 | * the parent of the now unreachable PTE. |
1775 | */ | 1846 | */ |
1776 | if (largepage && !is_large_pte(*shadow_pte)) { | 1847 | if (level > PT_PAGE_TABLE_LEVEL && |
1848 | !is_large_pte(*sptep)) { | ||
1777 | struct kvm_mmu_page *child; | 1849 | struct kvm_mmu_page *child; |
1778 | u64 pte = *shadow_pte; | 1850 | u64 pte = *sptep; |
1779 | 1851 | ||
1780 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1852 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
1781 | mmu_page_remove_parent_pte(child, shadow_pte); | 1853 | mmu_page_remove_parent_pte(child, sptep); |
1782 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | 1854 | } else if (pfn != spte_to_pfn(*sptep)) { |
1783 | pgprintk("hfn old %lx new %lx\n", | 1855 | pgprintk("hfn old %lx new %lx\n", |
1784 | spte_to_pfn(*shadow_pte), pfn); | 1856 | spte_to_pfn(*sptep), pfn); |
1785 | rmap_remove(vcpu->kvm, shadow_pte); | 1857 | rmap_remove(vcpu->kvm, sptep); |
1786 | } else | 1858 | } else |
1787 | was_rmapped = 1; | 1859 | was_rmapped = 1; |
1788 | } | 1860 | } |
1789 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | 1861 | |
1790 | dirty, largepage, gfn, pfn, speculative, true)) { | 1862 | if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, |
1863 | dirty, level, gfn, pfn, speculative, true)) { | ||
1791 | if (write_fault) | 1864 | if (write_fault) |
1792 | *ptwrite = 1; | 1865 | *ptwrite = 1; |
1793 | kvm_x86_ops->tlb_flush(vcpu); | 1866 | kvm_x86_ops->tlb_flush(vcpu); |
1794 | } | 1867 | } |
1795 | 1868 | ||
1796 | pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); | 1869 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); |
1797 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | 1870 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", |
1798 | is_large_pte(*shadow_pte)? "2MB" : "4kB", | 1871 | is_large_pte(*sptep)? "2MB" : "4kB", |
1799 | is_present_pte(*shadow_pte)?"RW":"R", gfn, | 1872 | *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, |
1800 | *shadow_pte, shadow_pte); | 1873 | *sptep, sptep); |
1801 | if (!was_rmapped && is_large_pte(*shadow_pte)) | 1874 | if (!was_rmapped && is_large_pte(*sptep)) |
1802 | ++vcpu->kvm->stat.lpages; | 1875 | ++vcpu->kvm->stat.lpages; |
1803 | 1876 | ||
1804 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1877 | page_header_update_slot(vcpu->kvm, sptep, gfn); |
1805 | if (!was_rmapped) { | 1878 | if (!was_rmapped) { |
1806 | rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); | 1879 | rmap_count = rmap_add(vcpu, sptep, gfn); |
1807 | if (!is_rmap_pte(*shadow_pte)) | 1880 | if (!is_rmap_spte(*sptep)) |
1808 | kvm_release_pfn_clean(pfn); | 1881 | kvm_release_pfn_clean(pfn); |
1809 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) | 1882 | if (rmap_count > RMAP_RECYCLE_THRESHOLD) |
1810 | rmap_recycle(vcpu, gfn, largepage); | 1883 | rmap_recycle(vcpu, sptep, gfn); |
1811 | } else { | 1884 | } else { |
1812 | if (was_writeble) | 1885 | if (was_writeble) |
1813 | kvm_release_pfn_dirty(pfn); | 1886 | kvm_release_pfn_dirty(pfn); |
@@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1815 | kvm_release_pfn_clean(pfn); | 1888 | kvm_release_pfn_clean(pfn); |
1816 | } | 1889 | } |
1817 | if (speculative) { | 1890 | if (speculative) { |
1818 | vcpu->arch.last_pte_updated = shadow_pte; | 1891 | vcpu->arch.last_pte_updated = sptep; |
1819 | vcpu->arch.last_pte_gfn = gfn; | 1892 | vcpu->arch.last_pte_gfn = gfn; |
1820 | } | 1893 | } |
1821 | } | 1894 | } |
@@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
1825 | } | 1898 | } |
1826 | 1899 | ||
1827 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 1900 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
1828 | int largepage, gfn_t gfn, pfn_t pfn) | 1901 | int level, gfn_t gfn, pfn_t pfn) |
1829 | { | 1902 | { |
1830 | struct kvm_shadow_walk_iterator iterator; | 1903 | struct kvm_shadow_walk_iterator iterator; |
1831 | struct kvm_mmu_page *sp; | 1904 | struct kvm_mmu_page *sp; |
@@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
1833 | gfn_t pseudo_gfn; | 1906 | gfn_t pseudo_gfn; |
1834 | 1907 | ||
1835 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | 1908 | for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { |
1836 | if (iterator.level == PT_PAGE_TABLE_LEVEL | 1909 | if (iterator.level == level) { |
1837 | || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { | ||
1838 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, | 1910 | mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, |
1839 | 0, write, 1, &pt_write, | 1911 | 0, write, 1, &pt_write, |
1840 | largepage, gfn, pfn, false); | 1912 | level, gfn, pfn, false); |
1841 | ++vcpu->stat.pf_fixed; | 1913 | ++vcpu->stat.pf_fixed; |
1842 | break; | 1914 | break; |
1843 | } | 1915 | } |
@@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
1853 | return -ENOMEM; | 1925 | return -ENOMEM; |
1854 | } | 1926 | } |
1855 | 1927 | ||
1856 | set_shadow_pte(iterator.sptep, | 1928 | __set_spte(iterator.sptep, |
1857 | __pa(sp->spt) | 1929 | __pa(sp->spt) |
1858 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | 1930 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1859 | | shadow_user_mask | shadow_x_mask); | 1931 | | shadow_user_mask | shadow_x_mask); |
1860 | } | 1932 | } |
1861 | } | 1933 | } |
1862 | return pt_write; | 1934 | return pt_write; |
@@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | |||
1865 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 1937 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
1866 | { | 1938 | { |
1867 | int r; | 1939 | int r; |
1868 | int largepage = 0; | 1940 | int level; |
1869 | pfn_t pfn; | 1941 | pfn_t pfn; |
1870 | unsigned long mmu_seq; | 1942 | unsigned long mmu_seq; |
1871 | 1943 | ||
1872 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1944 | level = mapping_level(vcpu, gfn); |
1873 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1945 | |
1874 | largepage = 1; | 1946 | /* |
1875 | } | 1947 | * This path builds a PAE pagetable - so we can map 2mb pages at |
1948 | * maximum. Therefore check if the level is larger than that. | ||
1949 | */ | ||
1950 | if (level > PT_DIRECTORY_LEVEL) | ||
1951 | level = PT_DIRECTORY_LEVEL; | ||
1952 | |||
1953 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); | ||
1876 | 1954 | ||
1877 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1955 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1878 | smp_rmb(); | 1956 | smp_rmb(); |
@@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1888 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 1966 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
1889 | goto out_unlock; | 1967 | goto out_unlock; |
1890 | kvm_mmu_free_some_pages(vcpu); | 1968 | kvm_mmu_free_some_pages(vcpu); |
1891 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn); | 1969 | r = __direct_map(vcpu, v, write, level, gfn, pfn); |
1892 | spin_unlock(&vcpu->kvm->mmu_lock); | 1970 | spin_unlock(&vcpu->kvm->mmu_lock); |
1893 | 1971 | ||
1894 | 1972 | ||
@@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1954 | gfn_t root_gfn; | 2032 | gfn_t root_gfn; |
1955 | struct kvm_mmu_page *sp; | 2033 | struct kvm_mmu_page *sp; |
1956 | int direct = 0; | 2034 | int direct = 0; |
2035 | u64 pdptr; | ||
1957 | 2036 | ||
1958 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | 2037 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; |
1959 | 2038 | ||
@@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1981 | 2060 | ||
1982 | ASSERT(!VALID_PAGE(root)); | 2061 | ASSERT(!VALID_PAGE(root)); |
1983 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | 2062 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { |
1984 | if (!is_present_pte(vcpu->arch.pdptrs[i])) { | 2063 | pdptr = kvm_pdptr_read(vcpu, i); |
2064 | if (!is_present_gpte(pdptr)) { | ||
1985 | vcpu->arch.mmu.pae_root[i] = 0; | 2065 | vcpu->arch.mmu.pae_root[i] = 0; |
1986 | continue; | 2066 | continue; |
1987 | } | 2067 | } |
1988 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; | 2068 | root_gfn = pdptr >> PAGE_SHIFT; |
1989 | } else if (vcpu->arch.mmu.root_level == 0) | 2069 | } else if (vcpu->arch.mmu.root_level == 0) |
1990 | root_gfn = 0; | 2070 | root_gfn = 0; |
1991 | if (mmu_check_root(vcpu, root_gfn)) | 2071 | if (mmu_check_root(vcpu, root_gfn)) |
@@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2062 | { | 2142 | { |
2063 | pfn_t pfn; | 2143 | pfn_t pfn; |
2064 | int r; | 2144 | int r; |
2065 | int largepage = 0; | 2145 | int level; |
2066 | gfn_t gfn = gpa >> PAGE_SHIFT; | 2146 | gfn_t gfn = gpa >> PAGE_SHIFT; |
2067 | unsigned long mmu_seq; | 2147 | unsigned long mmu_seq; |
2068 | 2148 | ||
@@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2073 | if (r) | 2153 | if (r) |
2074 | return r; | 2154 | return r; |
2075 | 2155 | ||
2076 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 2156 | level = mapping_level(vcpu, gfn); |
2077 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 2157 | |
2078 | largepage = 1; | 2158 | gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); |
2079 | } | 2159 | |
2080 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2160 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2081 | smp_rmb(); | 2161 | smp_rmb(); |
2082 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2162 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
@@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
2089 | goto out_unlock; | 2169 | goto out_unlock; |
2090 | kvm_mmu_free_some_pages(vcpu); | 2170 | kvm_mmu_free_some_pages(vcpu); |
2091 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 2171 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
2092 | largepage, gfn, pfn); | 2172 | level, gfn, pfn); |
2093 | spin_unlock(&vcpu->kvm->mmu_lock); | 2173 | spin_unlock(&vcpu->kvm->mmu_lock); |
2094 | 2174 | ||
2095 | return r; | 2175 | return r; |
@@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) | |||
2206 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | | 2286 | context->rsvd_bits_mask[0][0] = exb_bit_rsvd | |
2207 | rsvd_bits(maxphyaddr, 51); | 2287 | rsvd_bits(maxphyaddr, 51); |
2208 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; | 2288 | context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; |
2209 | context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; | 2289 | context->rsvd_bits_mask[1][2] = exb_bit_rsvd | |
2290 | rsvd_bits(maxphyaddr, 51) | | ||
2291 | rsvd_bits(13, 29); | ||
2210 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | | 2292 | context->rsvd_bits_mask[1][1] = exb_bit_rsvd | |
2211 | rsvd_bits(maxphyaddr, 51) | | 2293 | rsvd_bits(maxphyaddr, 51) | |
2212 | rsvd_bits(13, 20); /* large page */ | 2294 | rsvd_bits(13, 20); /* large page */ |
@@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
2357 | spin_unlock(&vcpu->kvm->mmu_lock); | 2439 | spin_unlock(&vcpu->kvm->mmu_lock); |
2358 | if (r) | 2440 | if (r) |
2359 | goto out; | 2441 | goto out; |
2442 | /* set_cr3() should ensure TLB has been flushed */ | ||
2360 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2443 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
2361 | kvm_mmu_flush_tlb(vcpu); | ||
2362 | out: | 2444 | out: |
2363 | return r; | 2445 | return r; |
2364 | } | 2446 | } |
@@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
2378 | 2460 | ||
2379 | pte = *spte; | 2461 | pte = *spte; |
2380 | if (is_shadow_present_pte(pte)) { | 2462 | if (is_shadow_present_pte(pte)) { |
2381 | if (sp->role.level == PT_PAGE_TABLE_LEVEL || | 2463 | if (is_last_spte(pte, sp->role.level)) |
2382 | is_large_pte(pte)) | ||
2383 | rmap_remove(vcpu->kvm, spte); | 2464 | rmap_remove(vcpu->kvm, spte); |
2384 | else { | 2465 | else { |
2385 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 2466 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
2386 | mmu_page_remove_parent_pte(child, spte); | 2467 | mmu_page_remove_parent_pte(child, spte); |
2387 | } | 2468 | } |
2388 | } | 2469 | } |
2389 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 2470 | __set_spte(spte, shadow_trap_nonpresent_pte); |
2390 | if (is_large_pte(pte)) | 2471 | if (is_large_pte(pte)) |
2391 | --vcpu->kvm->stat.lpages; | 2472 | --vcpu->kvm->stat.lpages; |
2392 | } | 2473 | } |
@@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | |||
2397 | const void *new) | 2478 | const void *new) |
2398 | { | 2479 | { |
2399 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | 2480 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { |
2400 | if (!vcpu->arch.update_pte.largepage || | 2481 | ++vcpu->kvm->stat.mmu_pde_zapped; |
2401 | sp->role.glevels == PT32_ROOT_LEVEL) { | 2482 | return; |
2402 | ++vcpu->kvm->stat.mmu_pde_zapped; | ||
2403 | return; | ||
2404 | } | ||
2405 | } | 2483 | } |
2406 | 2484 | ||
2407 | ++vcpu->kvm->stat.mmu_pte_updated; | 2485 | ++vcpu->kvm->stat.mmu_pte_updated; |
@@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2447 | u64 gpte = 0; | 2525 | u64 gpte = 0; |
2448 | pfn_t pfn; | 2526 | pfn_t pfn; |
2449 | 2527 | ||
2450 | vcpu->arch.update_pte.largepage = 0; | ||
2451 | |||
2452 | if (bytes != 4 && bytes != 8) | 2528 | if (bytes != 4 && bytes != 8) |
2453 | return; | 2529 | return; |
2454 | 2530 | ||
@@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
2472 | if ((bytes == 4) && (gpa % 4 == 0)) | 2548 | if ((bytes == 4) && (gpa % 4 == 0)) |
2473 | memcpy((void *)&gpte, new, 4); | 2549 | memcpy((void *)&gpte, new, 4); |
2474 | } | 2550 | } |
2475 | if (!is_present_pte(gpte)) | 2551 | if (!is_present_gpte(gpte)) |
2476 | return; | 2552 | return; |
2477 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 2553 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
2478 | 2554 | ||
2479 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { | ||
2480 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | ||
2481 | vcpu->arch.update_pte.largepage = 1; | ||
2482 | } | ||
2483 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2555 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; |
2484 | smp_rmb(); | 2556 | smp_rmb(); |
2485 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2557 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
@@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
2622 | gpa_t gpa; | 2694 | gpa_t gpa; |
2623 | int r; | 2695 | int r; |
2624 | 2696 | ||
2697 | if (tdp_enabled) | ||
2698 | return 0; | ||
2699 | |||
2625 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | 2700 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); |
2626 | 2701 | ||
2627 | spin_lock(&vcpu->kvm->mmu_lock); | 2702 | spin_lock(&vcpu->kvm->mmu_lock); |
@@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); | |||
2633 | 2708 | ||
2634 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 2709 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
2635 | { | 2710 | { |
2636 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { | 2711 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && |
2712 | !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
2637 | struct kvm_mmu_page *sp; | 2713 | struct kvm_mmu_page *sp; |
2638 | 2714 | ||
2639 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | 2715 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, |
@@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | |||
2670 | ++vcpu->stat.mmio_exits; | 2746 | ++vcpu->stat.mmio_exits; |
2671 | return 0; | 2747 | return 0; |
2672 | case EMULATE_FAIL: | 2748 | case EMULATE_FAIL: |
2673 | kvm_report_emulation_failure(vcpu, "pagetable"); | 2749 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
2674 | return 1; | 2750 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
2751 | return 0; | ||
2675 | default: | 2752 | default: |
2676 | BUG(); | 2753 | BUG(); |
2677 | } | 2754 | } |
@@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | |||
2712 | 2789 | ||
2713 | ASSERT(vcpu); | 2790 | ASSERT(vcpu); |
2714 | 2791 | ||
2715 | if (vcpu->kvm->arch.n_requested_mmu_pages) | ||
2716 | vcpu->kvm->arch.n_free_mmu_pages = | ||
2717 | vcpu->kvm->arch.n_requested_mmu_pages; | ||
2718 | else | ||
2719 | vcpu->kvm->arch.n_free_mmu_pages = | ||
2720 | vcpu->kvm->arch.n_alloc_mmu_pages; | ||
2721 | /* | 2792 | /* |
2722 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | 2793 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. |
2723 | * Therefore we need to allocate shadow page tables in the first | 2794 | * Therefore we need to allocate shadow page tables in the first |
@@ -3029,6 +3100,24 @@ out: | |||
3029 | return r; | 3100 | return r; |
3030 | } | 3101 | } |
3031 | 3102 | ||
3103 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) | ||
3104 | { | ||
3105 | struct kvm_shadow_walk_iterator iterator; | ||
3106 | int nr_sptes = 0; | ||
3107 | |||
3108 | spin_lock(&vcpu->kvm->mmu_lock); | ||
3109 | for_each_shadow_entry(vcpu, addr, iterator) { | ||
3110 | sptes[iterator.level-1] = *iterator.sptep; | ||
3111 | nr_sptes++; | ||
3112 | if (!is_shadow_present_pte(*iterator.sptep)) | ||
3113 | break; | ||
3114 | } | ||
3115 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
3116 | |||
3117 | return nr_sptes; | ||
3118 | } | ||
3119 | EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); | ||
3120 | |||
3032 | #ifdef AUDIT | 3121 | #ifdef AUDIT |
3033 | 3122 | ||
3034 | static const char *audit_msg; | 3123 | static const char *audit_msg; |
@@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva) | |||
3041 | return gva; | 3130 | return gva; |
3042 | } | 3131 | } |
3043 | 3132 | ||
3133 | |||
3134 | typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, | ||
3135 | u64 *sptep); | ||
3136 | |||
3137 | static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
3138 | inspect_spte_fn fn) | ||
3139 | { | ||
3140 | int i; | ||
3141 | |||
3142 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
3143 | u64 ent = sp->spt[i]; | ||
3144 | |||
3145 | if (is_shadow_present_pte(ent)) { | ||
3146 | if (!is_last_spte(ent, sp->role.level)) { | ||
3147 | struct kvm_mmu_page *child; | ||
3148 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
3149 | __mmu_spte_walk(kvm, child, fn); | ||
3150 | } else | ||
3151 | fn(kvm, sp, &sp->spt[i]); | ||
3152 | } | ||
3153 | } | ||
3154 | } | ||
3155 | |||
3156 | static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) | ||
3157 | { | ||
3158 | int i; | ||
3159 | struct kvm_mmu_page *sp; | ||
3160 | |||
3161 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
3162 | return; | ||
3163 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
3164 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
3165 | sp = page_header(root); | ||
3166 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3167 | return; | ||
3168 | } | ||
3169 | for (i = 0; i < 4; ++i) { | ||
3170 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
3171 | |||
3172 | if (root && VALID_PAGE(root)) { | ||
3173 | root &= PT64_BASE_ADDR_MASK; | ||
3174 | sp = page_header(root); | ||
3175 | __mmu_spte_walk(vcpu->kvm, sp, fn); | ||
3176 | } | ||
3177 | } | ||
3178 | return; | ||
3179 | } | ||
3180 | |||
3044 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | 3181 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, |
3045 | gva_t va, int level) | 3182 | gva_t va, int level) |
3046 | { | 3183 | { |
@@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | |||
3055 | continue; | 3192 | continue; |
3056 | 3193 | ||
3057 | va = canonicalize(va); | 3194 | va = canonicalize(va); |
3058 | if (level > 1) { | 3195 | if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) |
3059 | if (ent == shadow_notrap_nonpresent_pte) | 3196 | audit_mappings_page(vcpu, ent, va, level - 1); |
3060 | printk(KERN_ERR "audit: (%s) nontrapping pte" | 3197 | else { |
3061 | " in nonleaf level: levels %d gva %lx" | ||
3062 | " level %d pte %llx\n", audit_msg, | ||
3063 | vcpu->arch.mmu.root_level, va, level, ent); | ||
3064 | else | ||
3065 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
3066 | } else { | ||
3067 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | 3198 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); |
3068 | gfn_t gfn = gpa >> PAGE_SHIFT; | 3199 | gfn_t gfn = gpa >> PAGE_SHIFT; |
3069 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); | 3200 | pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); |
3070 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; | 3201 | hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; |
3071 | 3202 | ||
3203 | if (is_error_pfn(pfn)) { | ||
3204 | kvm_release_pfn_clean(pfn); | ||
3205 | continue; | ||
3206 | } | ||
3207 | |||
3072 | if (is_shadow_present_pte(ent) | 3208 | if (is_shadow_present_pte(ent) |
3073 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | 3209 | && (ent & PT64_BASE_ADDR_MASK) != hpa) |
3074 | printk(KERN_ERR "xx audit error: (%s) levels %d" | 3210 | printk(KERN_ERR "xx audit error: (%s) levels %d" |
@@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) | |||
3122 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | 3258 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); |
3123 | while (d) { | 3259 | while (d) { |
3124 | for (k = 0; k < RMAP_EXT; ++k) | 3260 | for (k = 0; k < RMAP_EXT; ++k) |
3125 | if (d->shadow_ptes[k]) | 3261 | if (d->sptes[k]) |
3126 | ++nmaps; | 3262 | ++nmaps; |
3127 | else | 3263 | else |
3128 | break; | 3264 | break; |
@@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu) | |||
3133 | return nmaps; | 3269 | return nmaps; |
3134 | } | 3270 | } |
3135 | 3271 | ||
3136 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | 3272 | void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) |
3273 | { | ||
3274 | unsigned long *rmapp; | ||
3275 | struct kvm_mmu_page *rev_sp; | ||
3276 | gfn_t gfn; | ||
3277 | |||
3278 | if (*sptep & PT_WRITABLE_MASK) { | ||
3279 | rev_sp = page_header(__pa(sptep)); | ||
3280 | gfn = rev_sp->gfns[sptep - rev_sp->spt]; | ||
3281 | |||
3282 | if (!gfn_to_memslot(kvm, gfn)) { | ||
3283 | if (!printk_ratelimit()) | ||
3284 | return; | ||
3285 | printk(KERN_ERR "%s: no memslot for gfn %ld\n", | ||
3286 | audit_msg, gfn); | ||
3287 | printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", | ||
3288 | audit_msg, sptep - rev_sp->spt, | ||
3289 | rev_sp->gfn); | ||
3290 | dump_stack(); | ||
3291 | return; | ||
3292 | } | ||
3293 | |||
3294 | rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], | ||
3295 | is_large_pte(*sptep)); | ||
3296 | if (!*rmapp) { | ||
3297 | if (!printk_ratelimit()) | ||
3298 | return; | ||
3299 | printk(KERN_ERR "%s: no rmap for writable spte %llx\n", | ||
3300 | audit_msg, *sptep); | ||
3301 | dump_stack(); | ||
3302 | } | ||
3303 | } | ||
3304 | |||
3305 | } | ||
3306 | |||
3307 | void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) | ||
3308 | { | ||
3309 | mmu_spte_walk(vcpu, inspect_spte_has_rmap); | ||
3310 | } | ||
3311 | |||
3312 | static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) | ||
3137 | { | 3313 | { |
3138 | int nmaps = 0; | ||
3139 | struct kvm_mmu_page *sp; | 3314 | struct kvm_mmu_page *sp; |
3140 | int i; | 3315 | int i; |
3141 | 3316 | ||
@@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) | |||
3152 | continue; | 3327 | continue; |
3153 | if (!(ent & PT_WRITABLE_MASK)) | 3328 | if (!(ent & PT_WRITABLE_MASK)) |
3154 | continue; | 3329 | continue; |
3155 | ++nmaps; | 3330 | inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); |
3156 | } | 3331 | } |
3157 | } | 3332 | } |
3158 | return nmaps; | 3333 | return; |
3159 | } | 3334 | } |
3160 | 3335 | ||
3161 | static void audit_rmap(struct kvm_vcpu *vcpu) | 3336 | static void audit_rmap(struct kvm_vcpu *vcpu) |
3162 | { | 3337 | { |
3163 | int n_rmap = count_rmaps(vcpu); | 3338 | check_writable_mappings_rmap(vcpu); |
3164 | int n_actual = count_writable_mappings(vcpu); | 3339 | count_rmaps(vcpu); |
3165 | |||
3166 | if (n_rmap != n_actual) | ||
3167 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
3168 | __func__, audit_msg, n_rmap, n_actual); | ||
3169 | } | 3340 | } |
3170 | 3341 | ||
3171 | static void audit_write_protection(struct kvm_vcpu *vcpu) | 3342 | static void audit_write_protection(struct kvm_vcpu *vcpu) |
@@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) | |||
3173 | struct kvm_mmu_page *sp; | 3344 | struct kvm_mmu_page *sp; |
3174 | struct kvm_memory_slot *slot; | 3345 | struct kvm_memory_slot *slot; |
3175 | unsigned long *rmapp; | 3346 | unsigned long *rmapp; |
3347 | u64 *spte; | ||
3176 | gfn_t gfn; | 3348 | gfn_t gfn; |
3177 | 3349 | ||
3178 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | 3350 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { |
3179 | if (sp->role.direct) | 3351 | if (sp->role.direct) |
3180 | continue; | 3352 | continue; |
3353 | if (sp->unsync) | ||
3354 | continue; | ||
3181 | 3355 | ||
3182 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | 3356 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); |
3183 | slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); | 3357 | slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); |
3184 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | 3358 | rmapp = &slot->rmap[gfn - slot->base_gfn]; |
3185 | if (*rmapp) | 3359 | |
3186 | printk(KERN_ERR "%s: (%s) shadow page has writable" | 3360 | spte = rmap_next(vcpu->kvm, rmapp, NULL); |
3187 | " mappings: gfn %lx role %x\n", | 3361 | while (spte) { |
3362 | if (*spte & PT_WRITABLE_MASK) | ||
3363 | printk(KERN_ERR "%s: (%s) shadow page has " | ||
3364 | "writable mappings: gfn %lx role %x\n", | ||
3188 | __func__, audit_msg, sp->gfn, | 3365 | __func__, audit_msg, sp->gfn, |
3189 | sp->role.word); | 3366 | sp->role.word); |
3367 | spte = rmap_next(vcpu->kvm, rmapp, spte); | ||
3368 | } | ||
3190 | } | 3369 | } |
3191 | } | 3370 | } |
3192 | 3371 | ||
@@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | |||
3198 | audit_msg = msg; | 3377 | audit_msg = msg; |
3199 | audit_rmap(vcpu); | 3378 | audit_rmap(vcpu); |
3200 | audit_write_protection(vcpu); | 3379 | audit_write_protection(vcpu); |
3201 | audit_mappings(vcpu); | 3380 | if (strcmp("pre pte write", audit_msg) != 0) |
3381 | audit_mappings(vcpu); | ||
3382 | audit_writable_sptes_have_rmaps(vcpu); | ||
3202 | dbg = olddbg; | 3383 | dbg = olddbg; |
3203 | } | 3384 | } |
3204 | 3385 | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3494a2fb136..61a1b3884b4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -37,6 +37,8 @@ | |||
37 | #define PT32_ROOT_LEVEL 2 | 37 | #define PT32_ROOT_LEVEL 2 |
38 | #define PT32E_ROOT_LEVEL 3 | 38 | #define PT32E_ROOT_LEVEL 3 |
39 | 39 | ||
40 | int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); | ||
41 | |||
40 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 42 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
41 | { | 43 | { |
42 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | 44 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) |
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) | |||
75 | return vcpu->arch.cr0 & X86_CR0_PG; | 77 | return vcpu->arch.cr0 & X86_CR0_PG; |
76 | } | 78 | } |
77 | 79 | ||
78 | static inline int is_present_pte(unsigned long pte) | 80 | static inline int is_present_gpte(unsigned long pte) |
79 | { | 81 | { |
80 | return pte & PT_PRESENT_MASK; | 82 | return pte & PT_PRESENT_MASK; |
81 | } | 83 | } |
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h new file mode 100644 index 00000000000..3e4a5c6ca2a --- /dev/null +++ b/arch/x86/kvm/mmutrace.h | |||
@@ -0,0 +1,220 @@ | |||
1 | #if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) | ||
2 | #define _TRACE_KVMMMU_H | ||
3 | |||
4 | #include <linux/tracepoint.h> | ||
5 | #include <linux/ftrace_event.h> | ||
6 | |||
7 | #undef TRACE_SYSTEM | ||
8 | #define TRACE_SYSTEM kvmmmu | ||
9 | #define TRACE_INCLUDE_PATH . | ||
10 | #define TRACE_INCLUDE_FILE mmutrace | ||
11 | |||
12 | #define KVM_MMU_PAGE_FIELDS \ | ||
13 | __field(__u64, gfn) \ | ||
14 | __field(__u32, role) \ | ||
15 | __field(__u32, root_count) \ | ||
16 | __field(__u32, unsync) | ||
17 | |||
18 | #define KVM_MMU_PAGE_ASSIGN(sp) \ | ||
19 | __entry->gfn = sp->gfn; \ | ||
20 | __entry->role = sp->role.word; \ | ||
21 | __entry->root_count = sp->root_count; \ | ||
22 | __entry->unsync = sp->unsync; | ||
23 | |||
24 | #define KVM_MMU_PAGE_PRINTK() ({ \ | ||
25 | const char *ret = p->buffer + p->len; \ | ||
26 | static const char *access_str[] = { \ | ||
27 | "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ | ||
28 | }; \ | ||
29 | union kvm_mmu_page_role role; \ | ||
30 | \ | ||
31 | role.word = __entry->role; \ | ||
32 | \ | ||
33 | trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ | ||
34 | " %snxe root %u %s%c", \ | ||
35 | __entry->gfn, role.level, role.glevels, \ | ||
36 | role.quadrant, \ | ||
37 | role.direct ? " direct" : "", \ | ||
38 | access_str[role.access], \ | ||
39 | role.invalid ? " invalid" : "", \ | ||
40 | role.cr4_pge ? "" : "!", \ | ||
41 | role.nxe ? "" : "!", \ | ||
42 | __entry->root_count, \ | ||
43 | __entry->unsync ? "unsync" : "sync", 0); \ | ||
44 | ret; \ | ||
45 | }) | ||
46 | |||
47 | #define kvm_mmu_trace_pferr_flags \ | ||
48 | { PFERR_PRESENT_MASK, "P" }, \ | ||
49 | { PFERR_WRITE_MASK, "W" }, \ | ||
50 | { PFERR_USER_MASK, "U" }, \ | ||
51 | { PFERR_RSVD_MASK, "RSVD" }, \ | ||
52 | { PFERR_FETCH_MASK, "F" } | ||
53 | |||
54 | /* | ||
55 | * A pagetable walk has started | ||
56 | */ | ||
57 | TRACE_EVENT( | ||
58 | kvm_mmu_pagetable_walk, | ||
59 | TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), | ||
60 | TP_ARGS(addr, write_fault, user_fault, fetch_fault), | ||
61 | |||
62 | TP_STRUCT__entry( | ||
63 | __field(__u64, addr) | ||
64 | __field(__u32, pferr) | ||
65 | ), | ||
66 | |||
67 | TP_fast_assign( | ||
68 | __entry->addr = addr; | ||
69 | __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) | ||
70 | | (!!fetch_fault << 4); | ||
71 | ), | ||
72 | |||
73 | TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, | ||
74 | __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) | ||
75 | ); | ||
76 | |||
77 | |||
78 | /* We just walked a paging element */ | ||
79 | TRACE_EVENT( | ||
80 | kvm_mmu_paging_element, | ||
81 | TP_PROTO(u64 pte, int level), | ||
82 | TP_ARGS(pte, level), | ||
83 | |||
84 | TP_STRUCT__entry( | ||
85 | __field(__u64, pte) | ||
86 | __field(__u32, level) | ||
87 | ), | ||
88 | |||
89 | TP_fast_assign( | ||
90 | __entry->pte = pte; | ||
91 | __entry->level = level; | ||
92 | ), | ||
93 | |||
94 | TP_printk("pte %llx level %u", __entry->pte, __entry->level) | ||
95 | ); | ||
96 | |||
97 | /* We set a pte accessed bit */ | ||
98 | TRACE_EVENT( | ||
99 | kvm_mmu_set_accessed_bit, | ||
100 | TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), | ||
101 | TP_ARGS(table_gfn, index, size), | ||
102 | |||
103 | TP_STRUCT__entry( | ||
104 | __field(__u64, gpa) | ||
105 | ), | ||
106 | |||
107 | TP_fast_assign( | ||
108 | __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) | ||
109 | + index * size; | ||
110 | ), | ||
111 | |||
112 | TP_printk("gpa %llx", __entry->gpa) | ||
113 | ); | ||
114 | |||
115 | /* We set a pte dirty bit */ | ||
116 | TRACE_EVENT( | ||
117 | kvm_mmu_set_dirty_bit, | ||
118 | TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), | ||
119 | TP_ARGS(table_gfn, index, size), | ||
120 | |||
121 | TP_STRUCT__entry( | ||
122 | __field(__u64, gpa) | ||
123 | ), | ||
124 | |||
125 | TP_fast_assign( | ||
126 | __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) | ||
127 | + index * size; | ||
128 | ), | ||
129 | |||
130 | TP_printk("gpa %llx", __entry->gpa) | ||
131 | ); | ||
132 | |||
133 | TRACE_EVENT( | ||
134 | kvm_mmu_walker_error, | ||
135 | TP_PROTO(u32 pferr), | ||
136 | TP_ARGS(pferr), | ||
137 | |||
138 | TP_STRUCT__entry( | ||
139 | __field(__u32, pferr) | ||
140 | ), | ||
141 | |||
142 | TP_fast_assign( | ||
143 | __entry->pferr = pferr; | ||
144 | ), | ||
145 | |||
146 | TP_printk("pferr %x %s", __entry->pferr, | ||
147 | __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) | ||
148 | ); | ||
149 | |||
150 | TRACE_EVENT( | ||
151 | kvm_mmu_get_page, | ||
152 | TP_PROTO(struct kvm_mmu_page *sp, bool created), | ||
153 | TP_ARGS(sp, created), | ||
154 | |||
155 | TP_STRUCT__entry( | ||
156 | KVM_MMU_PAGE_FIELDS | ||
157 | __field(bool, created) | ||
158 | ), | ||
159 | |||
160 | TP_fast_assign( | ||
161 | KVM_MMU_PAGE_ASSIGN(sp) | ||
162 | __entry->created = created; | ||
163 | ), | ||
164 | |||
165 | TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), | ||
166 | __entry->created ? "new" : "existing") | ||
167 | ); | ||
168 | |||
169 | TRACE_EVENT( | ||
170 | kvm_mmu_sync_page, | ||
171 | TP_PROTO(struct kvm_mmu_page *sp), | ||
172 | TP_ARGS(sp), | ||
173 | |||
174 | TP_STRUCT__entry( | ||
175 | KVM_MMU_PAGE_FIELDS | ||
176 | ), | ||
177 | |||
178 | TP_fast_assign( | ||
179 | KVM_MMU_PAGE_ASSIGN(sp) | ||
180 | ), | ||
181 | |||
182 | TP_printk("%s", KVM_MMU_PAGE_PRINTK()) | ||
183 | ); | ||
184 | |||
185 | TRACE_EVENT( | ||
186 | kvm_mmu_unsync_page, | ||
187 | TP_PROTO(struct kvm_mmu_page *sp), | ||
188 | TP_ARGS(sp), | ||
189 | |||
190 | TP_STRUCT__entry( | ||
191 | KVM_MMU_PAGE_FIELDS | ||
192 | ), | ||
193 | |||
194 | TP_fast_assign( | ||
195 | KVM_MMU_PAGE_ASSIGN(sp) | ||
196 | ), | ||
197 | |||
198 | TP_printk("%s", KVM_MMU_PAGE_PRINTK()) | ||
199 | ); | ||
200 | |||
201 | TRACE_EVENT( | ||
202 | kvm_mmu_zap_page, | ||
203 | TP_PROTO(struct kvm_mmu_page *sp), | ||
204 | TP_ARGS(sp), | ||
205 | |||
206 | TP_STRUCT__entry( | ||
207 | KVM_MMU_PAGE_FIELDS | ||
208 | ), | ||
209 | |||
210 | TP_fast_assign( | ||
211 | KVM_MMU_PAGE_ASSIGN(sp) | ||
212 | ), | ||
213 | |||
214 | TP_printk("%s", KVM_MMU_PAGE_PRINTK()) | ||
215 | ); | ||
216 | |||
217 | #endif /* _TRACE_KVMMMU_H */ | ||
218 | |||
219 | /* This part must be outside protection */ | ||
220 | #include <trace/define_trace.h> | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67785f63539..d2fec9c12d2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -27,7 +27,8 @@ | |||
27 | #define guest_walker guest_walker64 | 27 | #define guest_walker guest_walker64 |
28 | #define FNAME(name) paging##64_##name | 28 | #define FNAME(name) paging##64_##name |
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | 29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK |
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | 30 | #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) |
31 | #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
32 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
33 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
@@ -43,7 +44,8 @@ | |||
43 | #define guest_walker guest_walker32 | 44 | #define guest_walker guest_walker32 |
44 | #define FNAME(name) paging##32_##name | 45 | #define FNAME(name) paging##32_##name |
45 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | 46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK |
46 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | 47 | #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) |
48 | #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) | ||
47 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
48 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
49 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
@@ -53,8 +55,8 @@ | |||
53 | #error Invalid PTTYPE value | 55 | #error Invalid PTTYPE value |
54 | #endif | 56 | #endif |
55 | 57 | ||
56 | #define gpte_to_gfn FNAME(gpte_to_gfn) | 58 | #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) |
57 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | 59 | #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) |
58 | 60 | ||
59 | /* | 61 | /* |
60 | * The guest_walker structure emulates the behavior of the hardware page | 62 | * The guest_walker structure emulates the behavior of the hardware page |
@@ -71,14 +73,9 @@ struct guest_walker { | |||
71 | u32 error_code; | 73 | u32 error_code; |
72 | }; | 74 | }; |
73 | 75 | ||
74 | static gfn_t gpte_to_gfn(pt_element_t gpte) | 76 | static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) |
75 | { | 77 | { |
76 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 78 | return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; |
77 | } | ||
78 | |||
79 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
80 | { | ||
81 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
82 | } | 79 | } |
83 | 80 | ||
84 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | 81 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, |
@@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker, | |||
125 | gpa_t pte_gpa; | 122 | gpa_t pte_gpa; |
126 | int rsvd_fault = 0; | 123 | int rsvd_fault = 0; |
127 | 124 | ||
128 | pgprintk("%s: addr %lx\n", __func__, addr); | 125 | trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, |
126 | fetch_fault); | ||
129 | walk: | 127 | walk: |
130 | walker->level = vcpu->arch.mmu.root_level; | 128 | walker->level = vcpu->arch.mmu.root_level; |
131 | pte = vcpu->arch.cr3; | 129 | pte = vcpu->arch.cr3; |
132 | #if PTTYPE == 64 | 130 | #if PTTYPE == 64 |
133 | if (!is_long_mode(vcpu)) { | 131 | if (!is_long_mode(vcpu)) { |
134 | pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; | 132 | pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); |
135 | if (!is_present_pte(pte)) | 133 | trace_kvm_mmu_paging_element(pte, walker->level); |
134 | if (!is_present_gpte(pte)) | ||
136 | goto not_present; | 135 | goto not_present; |
137 | --walker->level; | 136 | --walker->level; |
138 | } | 137 | } |
@@ -150,12 +149,11 @@ walk: | |||
150 | pte_gpa += index * sizeof(pt_element_t); | 149 | pte_gpa += index * sizeof(pt_element_t); |
151 | walker->table_gfn[walker->level - 1] = table_gfn; | 150 | walker->table_gfn[walker->level - 1] = table_gfn; |
152 | walker->pte_gpa[walker->level - 1] = pte_gpa; | 151 | walker->pte_gpa[walker->level - 1] = pte_gpa; |
153 | pgprintk("%s: table_gfn[%d] %lx\n", __func__, | ||
154 | walker->level - 1, table_gfn); | ||
155 | 152 | ||
156 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | 153 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); |
154 | trace_kvm_mmu_paging_element(pte, walker->level); | ||
157 | 155 | ||
158 | if (!is_present_pte(pte)) | 156 | if (!is_present_gpte(pte)) |
159 | goto not_present; | 157 | goto not_present; |
160 | 158 | ||
161 | rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); | 159 | rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); |
@@ -175,6 +173,8 @@ walk: | |||
175 | #endif | 173 | #endif |
176 | 174 | ||
177 | if (!(pte & PT_ACCESSED_MASK)) { | 175 | if (!(pte & PT_ACCESSED_MASK)) { |
176 | trace_kvm_mmu_set_accessed_bit(table_gfn, index, | ||
177 | sizeof(pte)); | ||
178 | mark_page_dirty(vcpu->kvm, table_gfn); | 178 | mark_page_dirty(vcpu->kvm, table_gfn); |
179 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | 179 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, |
180 | index, pte, pte|PT_ACCESSED_MASK)) | 180 | index, pte, pte|PT_ACCESSED_MASK)) |
@@ -186,18 +186,24 @@ walk: | |||
186 | 186 | ||
187 | walker->ptes[walker->level - 1] = pte; | 187 | walker->ptes[walker->level - 1] = pte; |
188 | 188 | ||
189 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | 189 | if ((walker->level == PT_PAGE_TABLE_LEVEL) || |
190 | walker->gfn = gpte_to_gfn(pte); | 190 | ((walker->level == PT_DIRECTORY_LEVEL) && |
191 | break; | 191 | (pte & PT_PAGE_SIZE_MASK) && |
192 | } | 192 | (PTTYPE == 64 || is_pse(vcpu))) || |
193 | 193 | ((walker->level == PT_PDPE_LEVEL) && | |
194 | if (walker->level == PT_DIRECTORY_LEVEL | 194 | (pte & PT_PAGE_SIZE_MASK) && |
195 | && (pte & PT_PAGE_SIZE_MASK) | 195 | is_long_mode(vcpu))) { |
196 | && (PTTYPE == 64 || is_pse(vcpu))) { | 196 | int lvl = walker->level; |
197 | walker->gfn = gpte_to_gfn_pde(pte); | 197 | |
198 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | 198 | walker->gfn = gpte_to_gfn_lvl(pte, lvl); |
199 | if (PTTYPE == 32 && is_cpuid_PSE36()) | 199 | walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) |
200 | >> PAGE_SHIFT; | ||
201 | |||
202 | if (PTTYPE == 32 && | ||
203 | walker->level == PT_DIRECTORY_LEVEL && | ||
204 | is_cpuid_PSE36()) | ||
200 | walker->gfn += pse36_gfn_delta(pte); | 205 | walker->gfn += pse36_gfn_delta(pte); |
206 | |||
201 | break; | 207 | break; |
202 | } | 208 | } |
203 | 209 | ||
@@ -205,9 +211,10 @@ walk: | |||
205 | --walker->level; | 211 | --walker->level; |
206 | } | 212 | } |
207 | 213 | ||
208 | if (write_fault && !is_dirty_pte(pte)) { | 214 | if (write_fault && !is_dirty_gpte(pte)) { |
209 | bool ret; | 215 | bool ret; |
210 | 216 | ||
217 | trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); | ||
211 | mark_page_dirty(vcpu->kvm, table_gfn); | 218 | mark_page_dirty(vcpu->kvm, table_gfn); |
212 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | 219 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, |
213 | pte|PT_DIRTY_MASK); | 220 | pte|PT_DIRTY_MASK); |
@@ -239,6 +246,7 @@ err: | |||
239 | walker->error_code |= PFERR_FETCH_MASK; | 246 | walker->error_code |= PFERR_FETCH_MASK; |
240 | if (rsvd_fault) | 247 | if (rsvd_fault) |
241 | walker->error_code |= PFERR_RSVD_MASK; | 248 | walker->error_code |= PFERR_RSVD_MASK; |
249 | trace_kvm_mmu_walker_error(walker->error_code); | ||
242 | return 0; | 250 | return 0; |
243 | } | 251 | } |
244 | 252 | ||
@@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
248 | pt_element_t gpte; | 256 | pt_element_t gpte; |
249 | unsigned pte_access; | 257 | unsigned pte_access; |
250 | pfn_t pfn; | 258 | pfn_t pfn; |
251 | int largepage = vcpu->arch.update_pte.largepage; | ||
252 | 259 | ||
253 | gpte = *(const pt_element_t *)pte; | 260 | gpte = *(const pt_element_t *)pte; |
254 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 261 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { |
255 | if (!is_present_pte(gpte)) | 262 | if (!is_present_gpte(gpte)) |
256 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | 263 | __set_spte(spte, shadow_notrap_nonpresent_pte); |
257 | return; | 264 | return; |
258 | } | 265 | } |
259 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); | 266 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
@@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
267 | return; | 274 | return; |
268 | kvm_get_pfn(pfn); | 275 | kvm_get_pfn(pfn); |
269 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 276 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
270 | gpte & PT_DIRTY_MASK, NULL, largepage, | 277 | gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, |
271 | gpte_to_gfn(gpte), pfn, true); | 278 | gpte_to_gfn(gpte), pfn, true); |
272 | } | 279 | } |
273 | 280 | ||
@@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
276 | */ | 283 | */ |
277 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 284 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
278 | struct guest_walker *gw, | 285 | struct guest_walker *gw, |
279 | int user_fault, int write_fault, int largepage, | 286 | int user_fault, int write_fault, int hlevel, |
280 | int *ptwrite, pfn_t pfn) | 287 | int *ptwrite, pfn_t pfn) |
281 | { | 288 | { |
282 | unsigned access = gw->pt_access; | 289 | unsigned access = gw->pt_access; |
@@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
289 | pt_element_t curr_pte; | 296 | pt_element_t curr_pte; |
290 | struct kvm_shadow_walk_iterator iterator; | 297 | struct kvm_shadow_walk_iterator iterator; |
291 | 298 | ||
292 | if (!is_present_pte(gw->ptes[gw->level - 1])) | 299 | if (!is_present_gpte(gw->ptes[gw->level - 1])) |
293 | return NULL; | 300 | return NULL; |
294 | 301 | ||
295 | for_each_shadow_entry(vcpu, addr, iterator) { | 302 | for_each_shadow_entry(vcpu, addr, iterator) { |
296 | level = iterator.level; | 303 | level = iterator.level; |
297 | sptep = iterator.sptep; | 304 | sptep = iterator.sptep; |
298 | if (level == PT_PAGE_TABLE_LEVEL | 305 | if (iterator.level == hlevel) { |
299 | || (largepage && level == PT_DIRECTORY_LEVEL)) { | ||
300 | mmu_set_spte(vcpu, sptep, access, | 306 | mmu_set_spte(vcpu, sptep, access, |
301 | gw->pte_access & access, | 307 | gw->pte_access & access, |
302 | user_fault, write_fault, | 308 | user_fault, write_fault, |
303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | 309 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, |
304 | ptwrite, largepage, | 310 | ptwrite, level, |
305 | gw->gfn, pfn, false); | 311 | gw->gfn, pfn, false); |
306 | break; | 312 | break; |
307 | } | 313 | } |
@@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
311 | 317 | ||
312 | if (is_large_pte(*sptep)) { | 318 | if (is_large_pte(*sptep)) { |
313 | rmap_remove(vcpu->kvm, sptep); | 319 | rmap_remove(vcpu->kvm, sptep); |
314 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | 320 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
315 | kvm_flush_remote_tlbs(vcpu->kvm); | 321 | kvm_flush_remote_tlbs(vcpu->kvm); |
316 | } | 322 | } |
317 | 323 | ||
318 | if (level == PT_DIRECTORY_LEVEL | 324 | if (level <= gw->level) { |
319 | && gw->level == PT_DIRECTORY_LEVEL) { | 325 | int delta = level - gw->level + 1; |
320 | direct = 1; | 326 | direct = 1; |
321 | if (!is_dirty_pte(gw->ptes[level - 1])) | 327 | if (!is_dirty_gpte(gw->ptes[level - delta])) |
322 | access &= ~ACC_WRITE_MASK; | 328 | access &= ~ACC_WRITE_MASK; |
323 | table_gfn = gpte_to_gfn(gw->ptes[level - 1]); | 329 | table_gfn = gpte_to_gfn(gw->ptes[level - delta]); |
330 | /* advance table_gfn when emulating 1gb pages with 4k */ | ||
331 | if (delta == 0) | ||
332 | table_gfn += PT_INDEX(addr, level); | ||
324 | } else { | 333 | } else { |
325 | direct = 0; | 334 | direct = 0; |
326 | table_gfn = gw->table_gfn[level - 2]; | 335 | table_gfn = gw->table_gfn[level - 2]; |
@@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
369 | int user_fault = error_code & PFERR_USER_MASK; | 378 | int user_fault = error_code & PFERR_USER_MASK; |
370 | int fetch_fault = error_code & PFERR_FETCH_MASK; | 379 | int fetch_fault = error_code & PFERR_FETCH_MASK; |
371 | struct guest_walker walker; | 380 | struct guest_walker walker; |
372 | u64 *shadow_pte; | 381 | u64 *sptep; |
373 | int write_pt = 0; | 382 | int write_pt = 0; |
374 | int r; | 383 | int r; |
375 | pfn_t pfn; | 384 | pfn_t pfn; |
376 | int largepage = 0; | 385 | int level = PT_PAGE_TABLE_LEVEL; |
377 | unsigned long mmu_seq; | 386 | unsigned long mmu_seq; |
378 | 387 | ||
379 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | 388 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
@@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
399 | return 0; | 408 | return 0; |
400 | } | 409 | } |
401 | 410 | ||
402 | if (walker.level == PT_DIRECTORY_LEVEL) { | 411 | if (walker.level >= PT_DIRECTORY_LEVEL) { |
403 | gfn_t large_gfn; | 412 | level = min(walker.level, mapping_level(vcpu, walker.gfn)); |
404 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); | 413 | walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); |
405 | if (is_largepage_backed(vcpu, large_gfn)) { | ||
406 | walker.gfn = large_gfn; | ||
407 | largepage = 1; | ||
408 | } | ||
409 | } | 414 | } |
415 | |||
410 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 416 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
411 | smp_rmb(); | 417 | smp_rmb(); |
412 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 418 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
@@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
422 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 428 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
423 | goto out_unlock; | 429 | goto out_unlock; |
424 | kvm_mmu_free_some_pages(vcpu); | 430 | kvm_mmu_free_some_pages(vcpu); |
425 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 431 | sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
426 | largepage, &write_pt, pfn); | 432 | level, &write_pt, pfn); |
427 | |||
428 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | 433 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, |
429 | shadow_pte, *shadow_pte, write_pt); | 434 | sptep, *sptep, write_pt); |
430 | 435 | ||
431 | if (!write_pt) | 436 | if (!write_pt) |
432 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 437 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
@@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
459 | sptep = iterator.sptep; | 464 | sptep = iterator.sptep; |
460 | 465 | ||
461 | /* FIXME: properly handle invlpg on large guest pages */ | 466 | /* FIXME: properly handle invlpg on large guest pages */ |
462 | if (level == PT_PAGE_TABLE_LEVEL || | 467 | if (level == PT_PAGE_TABLE_LEVEL || |
463 | ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { | 468 | ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || |
469 | ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { | ||
464 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | 470 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); |
465 | 471 | ||
466 | pte_gpa = (sp->gfn << PAGE_SHIFT); | 472 | pte_gpa = (sp->gfn << PAGE_SHIFT); |
@@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
472 | --vcpu->kvm->stat.lpages; | 478 | --vcpu->kvm->stat.lpages; |
473 | need_flush = 1; | 479 | need_flush = 1; |
474 | } | 480 | } |
475 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | 481 | __set_spte(sptep, shadow_trap_nonpresent_pte); |
476 | break; | 482 | break; |
477 | } | 483 | } |
478 | 484 | ||
@@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | |||
489 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, | 495 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, |
490 | sizeof(pt_element_t))) | 496 | sizeof(pt_element_t))) |
491 | return; | 497 | return; |
492 | if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { | 498 | if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { |
493 | if (mmu_topup_memory_caches(vcpu)) | 499 | if (mmu_topup_memory_caches(vcpu)) |
494 | return; | 500 | return; |
495 | kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, | 501 | kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, |
@@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
536 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); | 542 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); |
537 | pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); | 543 | pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); |
538 | for (j = 0; j < ARRAY_SIZE(pt); ++j) | 544 | for (j = 0; j < ARRAY_SIZE(pt); ++j) |
539 | if (r || is_present_pte(pt[j])) | 545 | if (r || is_present_gpte(pt[j])) |
540 | sp->spt[i+j] = shadow_trap_nonpresent_pte; | 546 | sp->spt[i+j] = shadow_trap_nonpresent_pte; |
541 | else | 547 | else |
542 | sp->spt[i+j] = shadow_notrap_nonpresent_pte; | 548 | sp->spt[i+j] = shadow_notrap_nonpresent_pte; |
@@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
574 | sizeof(pt_element_t))) | 580 | sizeof(pt_element_t))) |
575 | return -EINVAL; | 581 | return -EINVAL; |
576 | 582 | ||
577 | if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || | 583 | if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || |
578 | !(gpte & PT_ACCESSED_MASK)) { | 584 | !(gpte & PT_ACCESSED_MASK)) { |
579 | u64 nonpresent; | 585 | u64 nonpresent; |
580 | 586 | ||
581 | rmap_remove(vcpu->kvm, &sp->spt[i]); | 587 | rmap_remove(vcpu->kvm, &sp->spt[i]); |
582 | if (is_present_pte(gpte)) | 588 | if (is_present_gpte(gpte)) |
583 | nonpresent = shadow_trap_nonpresent_pte; | 589 | nonpresent = shadow_trap_nonpresent_pte; |
584 | else | 590 | else |
585 | nonpresent = shadow_notrap_nonpresent_pte; | 591 | nonpresent = shadow_notrap_nonpresent_pte; |
586 | set_shadow_pte(&sp->spt[i], nonpresent); | 592 | __set_spte(&sp->spt[i], nonpresent); |
587 | continue; | 593 | continue; |
588 | } | 594 | } |
589 | 595 | ||
590 | nr_present++; | 596 | nr_present++; |
591 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | 597 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); |
592 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | 598 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, |
593 | is_dirty_pte(gpte), 0, gfn, | 599 | is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, |
594 | spte_to_pfn(sp->spt[i]), true, false); | 600 | spte_to_pfn(sp->spt[i]), true, false); |
595 | } | 601 | } |
596 | 602 | ||
@@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | |||
603 | #undef PT_BASE_ADDR_MASK | 609 | #undef PT_BASE_ADDR_MASK |
604 | #undef PT_INDEX | 610 | #undef PT_INDEX |
605 | #undef PT_LEVEL_MASK | 611 | #undef PT_LEVEL_MASK |
606 | #undef PT_DIR_BASE_ADDR_MASK | 612 | #undef PT_LVL_ADDR_MASK |
613 | #undef PT_LVL_OFFSET_MASK | ||
607 | #undef PT_LEVEL_BITS | 614 | #undef PT_LEVEL_BITS |
608 | #undef PT_MAX_FULL_LEVELS | 615 | #undef PT_MAX_FULL_LEVELS |
609 | #undef gpte_to_gfn | 616 | #undef gpte_to_gfn |
610 | #undef gpte_to_gfn_pde | 617 | #undef gpte_to_gfn_lvl |
611 | #undef CMPXCHG | 618 | #undef CMPXCHG |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b1f658ad2f0..944cc9c04b3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -15,7 +15,6 @@ | |||
15 | */ | 15 | */ |
16 | #include <linux/kvm_host.h> | 16 | #include <linux/kvm_host.h> |
17 | 17 | ||
18 | #include "kvm_svm.h" | ||
19 | #include "irq.h" | 18 | #include "irq.h" |
20 | #include "mmu.h" | 19 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | 20 | #include "kvm_cache_regs.h" |
@@ -26,10 +25,12 @@ | |||
26 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
27 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
28 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/ftrace_event.h> | ||
29 | 29 | ||
30 | #include <asm/desc.h> | 30 | #include <asm/desc.h> |
31 | 31 | ||
32 | #include <asm/virtext.h> | 32 | #include <asm/virtext.h> |
33 | #include "trace.h" | ||
33 | 34 | ||
34 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 35 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
35 | 36 | ||
@@ -46,6 +47,10 @@ MODULE_LICENSE("GPL"); | |||
46 | #define SVM_FEATURE_LBRV (1 << 1) | 47 | #define SVM_FEATURE_LBRV (1 << 1) |
47 | #define SVM_FEATURE_SVML (1 << 2) | 48 | #define SVM_FEATURE_SVML (1 << 2) |
48 | 49 | ||
50 | #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ | ||
51 | #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ | ||
52 | #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ | ||
53 | |||
49 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | 54 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) |
50 | 55 | ||
51 | /* Turn on to get debugging output*/ | 56 | /* Turn on to get debugging output*/ |
@@ -57,6 +62,58 @@ MODULE_LICENSE("GPL"); | |||
57 | #define nsvm_printk(fmt, args...) do {} while(0) | 62 | #define nsvm_printk(fmt, args...) do {} while(0) |
58 | #endif | 63 | #endif |
59 | 64 | ||
65 | static const u32 host_save_user_msrs[] = { | ||
66 | #ifdef CONFIG_X86_64 | ||
67 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
68 | MSR_FS_BASE, | ||
69 | #endif | ||
70 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
71 | }; | ||
72 | |||
73 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
74 | |||
75 | struct kvm_vcpu; | ||
76 | |||
77 | struct nested_state { | ||
78 | struct vmcb *hsave; | ||
79 | u64 hsave_msr; | ||
80 | u64 vmcb; | ||
81 | |||
82 | /* These are the merged vectors */ | ||
83 | u32 *msrpm; | ||
84 | |||
85 | /* gpa pointers to the real vectors */ | ||
86 | u64 vmcb_msrpm; | ||
87 | |||
88 | /* cache for intercepts of the guest */ | ||
89 | u16 intercept_cr_read; | ||
90 | u16 intercept_cr_write; | ||
91 | u16 intercept_dr_read; | ||
92 | u16 intercept_dr_write; | ||
93 | u32 intercept_exceptions; | ||
94 | u64 intercept; | ||
95 | |||
96 | }; | ||
97 | |||
98 | struct vcpu_svm { | ||
99 | struct kvm_vcpu vcpu; | ||
100 | struct vmcb *vmcb; | ||
101 | unsigned long vmcb_pa; | ||
102 | struct svm_cpu_data *svm_data; | ||
103 | uint64_t asid_generation; | ||
104 | uint64_t sysenter_esp; | ||
105 | uint64_t sysenter_eip; | ||
106 | |||
107 | u64 next_rip; | ||
108 | |||
109 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | ||
110 | u64 host_gs_base; | ||
111 | |||
112 | u32 *msrpm; | ||
113 | |||
114 | struct nested_state nested; | ||
115 | }; | ||
116 | |||
60 | /* enable NPT for AMD64 and X86 with PAE */ | 117 | /* enable NPT for AMD64 and X86 with PAE */ |
61 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 118 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
62 | static bool npt_enabled = true; | 119 | static bool npt_enabled = true; |
@@ -67,15 +124,14 @@ static int npt = 1; | |||
67 | 124 | ||
68 | module_param(npt, int, S_IRUGO); | 125 | module_param(npt, int, S_IRUGO); |
69 | 126 | ||
70 | static int nested = 0; | 127 | static int nested = 1; |
71 | module_param(nested, int, S_IRUGO); | 128 | module_param(nested, int, S_IRUGO); |
72 | 129 | ||
73 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); | 130 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); |
131 | static void svm_complete_interrupts(struct vcpu_svm *svm); | ||
74 | 132 | ||
75 | static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); | 133 | static int nested_svm_exit_handled(struct vcpu_svm *svm); |
76 | static int nested_svm_vmexit(struct vcpu_svm *svm); | 134 | static int nested_svm_vmexit(struct vcpu_svm *svm); |
77 | static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, | ||
78 | void *arg2, void *opaque); | ||
79 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 135 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
80 | bool has_error_code, u32 error_code); | 136 | bool has_error_code, u32 error_code); |
81 | 137 | ||
@@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | |||
86 | 142 | ||
87 | static inline bool is_nested(struct vcpu_svm *svm) | 143 | static inline bool is_nested(struct vcpu_svm *svm) |
88 | { | 144 | { |
89 | return svm->nested_vmcb; | 145 | return svm->nested.vmcb; |
146 | } | ||
147 | |||
148 | static inline void enable_gif(struct vcpu_svm *svm) | ||
149 | { | ||
150 | svm->vcpu.arch.hflags |= HF_GIF_MASK; | ||
151 | } | ||
152 | |||
153 | static inline void disable_gif(struct vcpu_svm *svm) | ||
154 | { | ||
155 | svm->vcpu.arch.hflags &= ~HF_GIF_MASK; | ||
156 | } | ||
157 | |||
158 | static inline bool gif_set(struct vcpu_svm *svm) | ||
159 | { | ||
160 | return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); | ||
90 | } | 161 | } |
91 | 162 | ||
92 | static unsigned long iopm_base; | 163 | static unsigned long iopm_base; |
@@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid) | |||
147 | asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); | 218 | asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); |
148 | } | 219 | } |
149 | 220 | ||
150 | static inline unsigned long kvm_read_cr2(void) | ||
151 | { | ||
152 | unsigned long cr2; | ||
153 | |||
154 | asm volatile ("mov %%cr2, %0" : "=r" (cr2)); | ||
155 | return cr2; | ||
156 | } | ||
157 | |||
158 | static inline void kvm_write_cr2(unsigned long val) | ||
159 | { | ||
160 | asm volatile ("mov %0, %%cr2" :: "r" (val)); | ||
161 | } | ||
162 | |||
163 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | 221 | static inline void force_new_asid(struct kvm_vcpu *vcpu) |
164 | { | 222 | { |
165 | to_svm(vcpu)->asid_generation--; | 223 | to_svm(vcpu)->asid_generation--; |
@@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage) | |||
263 | 321 | ||
264 | struct svm_cpu_data *svm_data; | 322 | struct svm_cpu_data *svm_data; |
265 | uint64_t efer; | 323 | uint64_t efer; |
266 | struct desc_ptr gdt_descr; | 324 | struct descriptor_table gdt_descr; |
267 | struct desc_struct *gdt; | 325 | struct desc_struct *gdt; |
268 | int me = raw_smp_processor_id(); | 326 | int me = raw_smp_processor_id(); |
269 | 327 | ||
@@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage) | |||
283 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | 341 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; |
284 | svm_data->next_asid = svm_data->max_asid + 1; | 342 | svm_data->next_asid = svm_data->max_asid + 1; |
285 | 343 | ||
286 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); | 344 | kvm_get_gdt(&gdt_descr); |
287 | gdt = (struct desc_struct *)gdt_descr.address; | 345 | gdt = (struct desc_struct *)gdt_descr.base; |
288 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | 346 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); |
289 | 347 | ||
290 | rdmsrl(MSR_EFER, efer); | 348 | rdmsrl(MSR_EFER, efer); |
@@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) | |||
367 | #endif | 425 | #endif |
368 | set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); | 426 | set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); |
369 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); | 427 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); |
370 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
371 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
372 | } | 428 | } |
373 | 429 | ||
374 | static void svm_enable_lbrv(struct vcpu_svm *svm) | 430 | static void svm_enable_lbrv(struct vcpu_svm *svm) |
@@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
595 | } | 651 | } |
596 | force_new_asid(&svm->vcpu); | 652 | force_new_asid(&svm->vcpu); |
597 | 653 | ||
598 | svm->nested_vmcb = 0; | 654 | svm->nested.vmcb = 0; |
599 | svm->vcpu.arch.hflags = HF_GIF_MASK; | 655 | svm->vcpu.arch.hflags = 0; |
656 | |||
657 | enable_gif(svm); | ||
600 | } | 658 | } |
601 | 659 | ||
602 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | 660 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) |
@@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
605 | 663 | ||
606 | init_vmcb(svm); | 664 | init_vmcb(svm); |
607 | 665 | ||
608 | if (vcpu->vcpu_id != 0) { | 666 | if (!kvm_vcpu_is_bsp(vcpu)) { |
609 | kvm_rip_write(vcpu, 0); | 667 | kvm_rip_write(vcpu, 0); |
610 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | 668 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
611 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | 669 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
@@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
656 | hsave_page = alloc_page(GFP_KERNEL); | 714 | hsave_page = alloc_page(GFP_KERNEL); |
657 | if (!hsave_page) | 715 | if (!hsave_page) |
658 | goto uninit; | 716 | goto uninit; |
659 | svm->hsave = page_address(hsave_page); | 717 | svm->nested.hsave = page_address(hsave_page); |
660 | 718 | ||
661 | svm->nested_msrpm = page_address(nested_msrpm_pages); | 719 | svm->nested.msrpm = page_address(nested_msrpm_pages); |
662 | 720 | ||
663 | svm->vmcb = page_address(page); | 721 | svm->vmcb = page_address(page); |
664 | clear_page(svm->vmcb); | 722 | clear_page(svm->vmcb); |
@@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
669 | fx_init(&svm->vcpu); | 727 | fx_init(&svm->vcpu); |
670 | svm->vcpu.fpu_active = 1; | 728 | svm->vcpu.fpu_active = 1; |
671 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 729 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
672 | if (svm->vcpu.vcpu_id == 0) | 730 | if (kvm_vcpu_is_bsp(&svm->vcpu)) |
673 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; | 731 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; |
674 | 732 | ||
675 | return &svm->vcpu; | 733 | return &svm->vcpu; |
@@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) | |||
688 | 746 | ||
689 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); | 747 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); |
690 | __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); | 748 | __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); |
691 | __free_page(virt_to_page(svm->hsave)); | 749 | __free_page(virt_to_page(svm->nested.hsave)); |
692 | __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); | 750 | __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); |
693 | kvm_vcpu_uninit(vcpu); | 751 | kvm_vcpu_uninit(vcpu); |
694 | kmem_cache_free(kvm_vcpu_cache, svm); | 752 | kmem_cache_free(kvm_vcpu_cache, svm); |
695 | } | 753 | } |
@@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | |||
740 | to_svm(vcpu)->vmcb->save.rflags = rflags; | 798 | to_svm(vcpu)->vmcb->save.rflags = rflags; |
741 | } | 799 | } |
742 | 800 | ||
801 | static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | ||
802 | { | ||
803 | switch (reg) { | ||
804 | case VCPU_EXREG_PDPTR: | ||
805 | BUG_ON(!npt_enabled); | ||
806 | load_pdptrs(vcpu, vcpu->arch.cr3); | ||
807 | break; | ||
808 | default: | ||
809 | BUG(); | ||
810 | } | ||
811 | } | ||
812 | |||
743 | static void svm_set_vintr(struct vcpu_svm *svm) | 813 | static void svm_set_vintr(struct vcpu_svm *svm) |
744 | { | 814 | { |
745 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; | 815 | svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; |
@@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) | |||
1061 | val = 0; | 1131 | val = 0; |
1062 | } | 1132 | } |
1063 | 1133 | ||
1064 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | ||
1065 | return val; | 1134 | return val; |
1066 | } | 1135 | } |
1067 | 1136 | ||
@@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | |||
1070 | { | 1139 | { |
1071 | struct vcpu_svm *svm = to_svm(vcpu); | 1140 | struct vcpu_svm *svm = to_svm(vcpu); |
1072 | 1141 | ||
1073 | KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); | ||
1074 | |||
1075 | *exception = 0; | 1142 | *exception = 0; |
1076 | 1143 | ||
1077 | switch (dr) { | 1144 | switch (dr) { |
@@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1119 | fault_address = svm->vmcb->control.exit_info_2; | 1186 | fault_address = svm->vmcb->control.exit_info_2; |
1120 | error_code = svm->vmcb->control.exit_info_1; | 1187 | error_code = svm->vmcb->control.exit_info_1; |
1121 | 1188 | ||
1122 | if (!npt_enabled) | 1189 | trace_kvm_page_fault(fault_address, error_code); |
1123 | KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, | 1190 | if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) |
1124 | (u32)fault_address, (u32)(fault_address >> 32), | 1191 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1125 | handler); | ||
1126 | else | ||
1127 | KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, | ||
1128 | (u32)fault_address, (u32)(fault_address >> 32), | ||
1129 | handler); | ||
1130 | /* | ||
1131 | * FIXME: Tis shouldn't be necessary here, but there is a flush | ||
1132 | * missing in the MMU code. Until we find this bug, flush the | ||
1133 | * complete TLB here on an NPF | ||
1134 | */ | ||
1135 | if (npt_enabled) | ||
1136 | svm_flush_tlb(&svm->vcpu); | ||
1137 | else { | ||
1138 | if (kvm_event_needs_reinjection(&svm->vcpu)) | ||
1139 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | ||
1140 | } | ||
1141 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1192 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
1142 | } | 1193 | } |
1143 | 1194 | ||
@@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1253 | 1304 | ||
1254 | static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1305 | static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1255 | { | 1306 | { |
1256 | KVMTRACE_0D(NMI, &svm->vcpu, handler); | ||
1257 | return 1; | 1307 | return 1; |
1258 | } | 1308 | } |
1259 | 1309 | ||
1260 | static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1310 | static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1261 | { | 1311 | { |
1262 | ++svm->vcpu.stat.irq_exits; | 1312 | ++svm->vcpu.stat.irq_exits; |
1263 | KVMTRACE_0D(INTR, &svm->vcpu, handler); | ||
1264 | return 1; | 1313 | return 1; |
1265 | } | 1314 | } |
1266 | 1315 | ||
@@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) | |||
1303 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, | 1352 | static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, |
1304 | bool has_error_code, u32 error_code) | 1353 | bool has_error_code, u32 error_code) |
1305 | { | 1354 | { |
1306 | if (is_nested(svm)) { | 1355 | if (!is_nested(svm)) |
1307 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; | 1356 | return 0; |
1308 | svm->vmcb->control.exit_code_hi = 0; | ||
1309 | svm->vmcb->control.exit_info_1 = error_code; | ||
1310 | svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; | ||
1311 | if (nested_svm_exit_handled(svm, false)) { | ||
1312 | nsvm_printk("VMexit -> EXCP 0x%x\n", nr); | ||
1313 | |||
1314 | nested_svm_vmexit(svm); | ||
1315 | return 1; | ||
1316 | } | ||
1317 | } | ||
1318 | 1357 | ||
1319 | return 0; | 1358 | svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; |
1359 | svm->vmcb->control.exit_code_hi = 0; | ||
1360 | svm->vmcb->control.exit_info_1 = error_code; | ||
1361 | svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; | ||
1362 | |||
1363 | return nested_svm_exit_handled(svm); | ||
1320 | } | 1364 | } |
1321 | 1365 | ||
1322 | static inline int nested_svm_intr(struct vcpu_svm *svm) | 1366 | static inline int nested_svm_intr(struct vcpu_svm *svm) |
1323 | { | 1367 | { |
1324 | if (is_nested(svm)) { | 1368 | if (!is_nested(svm)) |
1325 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1369 | return 0; |
1326 | return 0; | ||
1327 | 1370 | ||
1328 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) | 1371 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
1329 | return 0; | 1372 | return 0; |
1330 | 1373 | ||
1331 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; | 1374 | if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) |
1375 | return 0; | ||
1332 | 1376 | ||
1333 | if (nested_svm_exit_handled(svm, false)) { | 1377 | svm->vmcb->control.exit_code = SVM_EXIT_INTR; |
1334 | nsvm_printk("VMexit -> INTR\n"); | 1378 | |
1335 | nested_svm_vmexit(svm); | 1379 | if (nested_svm_exit_handled(svm)) { |
1336 | return 1; | 1380 | nsvm_printk("VMexit -> INTR\n"); |
1337 | } | 1381 | return 1; |
1338 | } | 1382 | } |
1339 | 1383 | ||
1340 | return 0; | 1384 | return 0; |
1341 | } | 1385 | } |
1342 | 1386 | ||
1343 | static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) | 1387 | static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) |
1344 | { | 1388 | { |
1345 | struct page *page; | 1389 | struct page *page; |
1346 | 1390 | ||
@@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) | |||
1348 | page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); | 1392 | page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); |
1349 | up_read(¤t->mm->mmap_sem); | 1393 | up_read(¤t->mm->mmap_sem); |
1350 | 1394 | ||
1351 | if (is_error_page(page)) { | 1395 | if (is_error_page(page)) |
1352 | printk(KERN_INFO "%s: could not find page at 0x%llx\n", | 1396 | goto error; |
1353 | __func__, gpa); | 1397 | |
1354 | kvm_release_page_clean(page); | 1398 | return kmap_atomic(page, idx); |
1355 | kvm_inject_gp(&svm->vcpu, 0); | 1399 | |
1356 | return NULL; | 1400 | error: |
1357 | } | 1401 | kvm_release_page_clean(page); |
1358 | return page; | 1402 | kvm_inject_gp(&svm->vcpu, 0); |
1403 | |||
1404 | return NULL; | ||
1359 | } | 1405 | } |
1360 | 1406 | ||
1361 | static int nested_svm_do(struct vcpu_svm *svm, | 1407 | static void nested_svm_unmap(void *addr, enum km_type idx) |
1362 | u64 arg1_gpa, u64 arg2_gpa, void *opaque, | ||
1363 | int (*handler)(struct vcpu_svm *svm, | ||
1364 | void *arg1, | ||
1365 | void *arg2, | ||
1366 | void *opaque)) | ||
1367 | { | 1408 | { |
1368 | struct page *arg1_page; | 1409 | struct page *page; |
1369 | struct page *arg2_page = NULL; | ||
1370 | void *arg1; | ||
1371 | void *arg2 = NULL; | ||
1372 | int retval; | ||
1373 | 1410 | ||
1374 | arg1_page = nested_svm_get_page(svm, arg1_gpa); | 1411 | if (!addr) |
1375 | if(arg1_page == NULL) | 1412 | return; |
1376 | return 1; | ||
1377 | 1413 | ||
1378 | if (arg2_gpa) { | 1414 | page = kmap_atomic_to_page(addr); |
1379 | arg2_page = nested_svm_get_page(svm, arg2_gpa); | 1415 | |
1380 | if(arg2_page == NULL) { | 1416 | kunmap_atomic(addr, idx); |
1381 | kvm_release_page_clean(arg1_page); | 1417 | kvm_release_page_dirty(page); |
1382 | return 1; | 1418 | } |
1383 | } | 1419 | |
1384 | } | 1420 | static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) |
1421 | { | ||
1422 | u32 param = svm->vmcb->control.exit_info_1 & 1; | ||
1423 | u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | ||
1424 | bool ret = false; | ||
1425 | u32 t0, t1; | ||
1426 | u8 *msrpm; | ||
1385 | 1427 | ||
1386 | arg1 = kmap_atomic(arg1_page, KM_USER0); | 1428 | if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) |
1387 | if (arg2_gpa) | 1429 | return false; |
1388 | arg2 = kmap_atomic(arg2_page, KM_USER1); | ||
1389 | 1430 | ||
1390 | retval = handler(svm, arg1, arg2, opaque); | 1431 | msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); |
1432 | |||
1433 | if (!msrpm) | ||
1434 | goto out; | ||
1435 | |||
1436 | switch (msr) { | ||
1437 | case 0 ... 0x1fff: | ||
1438 | t0 = (msr * 2) % 8; | ||
1439 | t1 = msr / 8; | ||
1440 | break; | ||
1441 | case 0xc0000000 ... 0xc0001fff: | ||
1442 | t0 = (8192 + msr - 0xc0000000) * 2; | ||
1443 | t1 = (t0 / 8); | ||
1444 | t0 %= 8; | ||
1445 | break; | ||
1446 | case 0xc0010000 ... 0xc0011fff: | ||
1447 | t0 = (16384 + msr - 0xc0010000) * 2; | ||
1448 | t1 = (t0 / 8); | ||
1449 | t0 %= 8; | ||
1450 | break; | ||
1451 | default: | ||
1452 | ret = true; | ||
1453 | goto out; | ||
1454 | } | ||
1391 | 1455 | ||
1392 | kunmap_atomic(arg1, KM_USER0); | 1456 | ret = msrpm[t1] & ((1 << param) << t0); |
1393 | if (arg2_gpa) | ||
1394 | kunmap_atomic(arg2, KM_USER1); | ||
1395 | 1457 | ||
1396 | kvm_release_page_dirty(arg1_page); | 1458 | out: |
1397 | if (arg2_gpa) | 1459 | nested_svm_unmap(msrpm, KM_USER0); |
1398 | kvm_release_page_dirty(arg2_page); | ||
1399 | 1460 | ||
1400 | return retval; | 1461 | return ret; |
1401 | } | 1462 | } |
1402 | 1463 | ||
1403 | static int nested_svm_exit_handled_real(struct vcpu_svm *svm, | 1464 | static int nested_svm_exit_special(struct vcpu_svm *svm) |
1404 | void *arg1, | ||
1405 | void *arg2, | ||
1406 | void *opaque) | ||
1407 | { | 1465 | { |
1408 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | ||
1409 | bool kvm_overrides = *(bool *)opaque; | ||
1410 | u32 exit_code = svm->vmcb->control.exit_code; | 1466 | u32 exit_code = svm->vmcb->control.exit_code; |
1411 | 1467 | ||
1412 | if (kvm_overrides) { | 1468 | switch (exit_code) { |
1413 | switch (exit_code) { | 1469 | case SVM_EXIT_INTR: |
1414 | case SVM_EXIT_INTR: | 1470 | case SVM_EXIT_NMI: |
1415 | case SVM_EXIT_NMI: | 1471 | return NESTED_EXIT_HOST; |
1416 | return 0; | ||
1417 | /* For now we are always handling NPFs when using them */ | 1472 | /* For now we are always handling NPFs when using them */ |
1418 | case SVM_EXIT_NPF: | 1473 | case SVM_EXIT_NPF: |
1419 | if (npt_enabled) | 1474 | if (npt_enabled) |
1420 | return 0; | 1475 | return NESTED_EXIT_HOST; |
1421 | break; | 1476 | break; |
1422 | /* When we're shadowing, trap PFs */ | 1477 | /* When we're shadowing, trap PFs */ |
1423 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: | 1478 | case SVM_EXIT_EXCP_BASE + PF_VECTOR: |
1424 | if (!npt_enabled) | 1479 | if (!npt_enabled) |
1425 | return 0; | 1480 | return NESTED_EXIT_HOST; |
1426 | break; | 1481 | break; |
1427 | default: | 1482 | default: |
1428 | break; | 1483 | break; |
1429 | } | ||
1430 | } | 1484 | } |
1431 | 1485 | ||
1486 | return NESTED_EXIT_CONTINUE; | ||
1487 | } | ||
1488 | |||
1489 | /* | ||
1490 | * If this function returns true, this #vmexit was already handled | ||
1491 | */ | ||
1492 | static int nested_svm_exit_handled(struct vcpu_svm *svm) | ||
1493 | { | ||
1494 | u32 exit_code = svm->vmcb->control.exit_code; | ||
1495 | int vmexit = NESTED_EXIT_HOST; | ||
1496 | |||
1432 | switch (exit_code) { | 1497 | switch (exit_code) { |
1498 | case SVM_EXIT_MSR: | ||
1499 | vmexit = nested_svm_exit_handled_msr(svm); | ||
1500 | break; | ||
1433 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { | 1501 | case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { |
1434 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); | 1502 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); |
1435 | if (nested_vmcb->control.intercept_cr_read & cr_bits) | 1503 | if (svm->nested.intercept_cr_read & cr_bits) |
1436 | return 1; | 1504 | vmexit = NESTED_EXIT_DONE; |
1437 | break; | 1505 | break; |
1438 | } | 1506 | } |
1439 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { | 1507 | case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { |
1440 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); | 1508 | u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); |
1441 | if (nested_vmcb->control.intercept_cr_write & cr_bits) | 1509 | if (svm->nested.intercept_cr_write & cr_bits) |
1442 | return 1; | 1510 | vmexit = NESTED_EXIT_DONE; |
1443 | break; | 1511 | break; |
1444 | } | 1512 | } |
1445 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { | 1513 | case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { |
1446 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); | 1514 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); |
1447 | if (nested_vmcb->control.intercept_dr_read & dr_bits) | 1515 | if (svm->nested.intercept_dr_read & dr_bits) |
1448 | return 1; | 1516 | vmexit = NESTED_EXIT_DONE; |
1449 | break; | 1517 | break; |
1450 | } | 1518 | } |
1451 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { | 1519 | case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { |
1452 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); | 1520 | u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); |
1453 | if (nested_vmcb->control.intercept_dr_write & dr_bits) | 1521 | if (svm->nested.intercept_dr_write & dr_bits) |
1454 | return 1; | 1522 | vmexit = NESTED_EXIT_DONE; |
1455 | break; | 1523 | break; |
1456 | } | 1524 | } |
1457 | case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { | 1525 | case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { |
1458 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); | 1526 | u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); |
1459 | if (nested_vmcb->control.intercept_exceptions & excp_bits) | 1527 | if (svm->nested.intercept_exceptions & excp_bits) |
1460 | return 1; | 1528 | vmexit = NESTED_EXIT_DONE; |
1461 | break; | 1529 | break; |
1462 | } | 1530 | } |
1463 | default: { | 1531 | default: { |
1464 | u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); | 1532 | u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); |
1465 | nsvm_printk("exit code: 0x%x\n", exit_code); | 1533 | nsvm_printk("exit code: 0x%x\n", exit_code); |
1466 | if (nested_vmcb->control.intercept & exit_bits) | 1534 | if (svm->nested.intercept & exit_bits) |
1467 | return 1; | 1535 | vmexit = NESTED_EXIT_DONE; |
1468 | } | 1536 | } |
1469 | } | 1537 | } |
1470 | 1538 | ||
1471 | return 0; | 1539 | if (vmexit == NESTED_EXIT_DONE) { |
1472 | } | 1540 | nsvm_printk("#VMEXIT reason=%04x\n", exit_code); |
1473 | 1541 | nested_svm_vmexit(svm); | |
1474 | static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, | ||
1475 | void *arg1, void *arg2, | ||
1476 | void *opaque) | ||
1477 | { | ||
1478 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | ||
1479 | u8 *msrpm = (u8 *)arg2; | ||
1480 | u32 t0, t1; | ||
1481 | u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | ||
1482 | u32 param = svm->vmcb->control.exit_info_1 & 1; | ||
1483 | |||
1484 | if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) | ||
1485 | return 0; | ||
1486 | |||
1487 | switch(msr) { | ||
1488 | case 0 ... 0x1fff: | ||
1489 | t0 = (msr * 2) % 8; | ||
1490 | t1 = msr / 8; | ||
1491 | break; | ||
1492 | case 0xc0000000 ... 0xc0001fff: | ||
1493 | t0 = (8192 + msr - 0xc0000000) * 2; | ||
1494 | t1 = (t0 / 8); | ||
1495 | t0 %= 8; | ||
1496 | break; | ||
1497 | case 0xc0010000 ... 0xc0011fff: | ||
1498 | t0 = (16384 + msr - 0xc0010000) * 2; | ||
1499 | t1 = (t0 / 8); | ||
1500 | t0 %= 8; | ||
1501 | break; | ||
1502 | default: | ||
1503 | return 1; | ||
1504 | break; | ||
1505 | } | 1542 | } |
1506 | if (msrpm[t1] & ((1 << param) << t0)) | ||
1507 | return 1; | ||
1508 | 1543 | ||
1509 | return 0; | 1544 | return vmexit; |
1545 | } | ||
1546 | |||
1547 | static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) | ||
1548 | { | ||
1549 | struct vmcb_control_area *dst = &dst_vmcb->control; | ||
1550 | struct vmcb_control_area *from = &from_vmcb->control; | ||
1551 | |||
1552 | dst->intercept_cr_read = from->intercept_cr_read; | ||
1553 | dst->intercept_cr_write = from->intercept_cr_write; | ||
1554 | dst->intercept_dr_read = from->intercept_dr_read; | ||
1555 | dst->intercept_dr_write = from->intercept_dr_write; | ||
1556 | dst->intercept_exceptions = from->intercept_exceptions; | ||
1557 | dst->intercept = from->intercept; | ||
1558 | dst->iopm_base_pa = from->iopm_base_pa; | ||
1559 | dst->msrpm_base_pa = from->msrpm_base_pa; | ||
1560 | dst->tsc_offset = from->tsc_offset; | ||
1561 | dst->asid = from->asid; | ||
1562 | dst->tlb_ctl = from->tlb_ctl; | ||
1563 | dst->int_ctl = from->int_ctl; | ||
1564 | dst->int_vector = from->int_vector; | ||
1565 | dst->int_state = from->int_state; | ||
1566 | dst->exit_code = from->exit_code; | ||
1567 | dst->exit_code_hi = from->exit_code_hi; | ||
1568 | dst->exit_info_1 = from->exit_info_1; | ||
1569 | dst->exit_info_2 = from->exit_info_2; | ||
1570 | dst->exit_int_info = from->exit_int_info; | ||
1571 | dst->exit_int_info_err = from->exit_int_info_err; | ||
1572 | dst->nested_ctl = from->nested_ctl; | ||
1573 | dst->event_inj = from->event_inj; | ||
1574 | dst->event_inj_err = from->event_inj_err; | ||
1575 | dst->nested_cr3 = from->nested_cr3; | ||
1576 | dst->lbr_ctl = from->lbr_ctl; | ||
1510 | } | 1577 | } |
1511 | 1578 | ||
1512 | static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) | 1579 | static int nested_svm_vmexit(struct vcpu_svm *svm) |
1513 | { | 1580 | { |
1514 | bool k = kvm_override; | 1581 | struct vmcb *nested_vmcb; |
1515 | 1582 | struct vmcb *hsave = svm->nested.hsave; | |
1516 | switch (svm->vmcb->control.exit_code) { | 1583 | struct vmcb *vmcb = svm->vmcb; |
1517 | case SVM_EXIT_MSR: | ||
1518 | return nested_svm_do(svm, svm->nested_vmcb, | ||
1519 | svm->nested_vmcb_msrpm, NULL, | ||
1520 | nested_svm_exit_handled_msr); | ||
1521 | default: break; | ||
1522 | } | ||
1523 | 1584 | ||
1524 | return nested_svm_do(svm, svm->nested_vmcb, 0, &k, | 1585 | nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); |
1525 | nested_svm_exit_handled_real); | 1586 | if (!nested_vmcb) |
1526 | } | 1587 | return 1; |
1527 | |||
1528 | static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, | ||
1529 | void *arg2, void *opaque) | ||
1530 | { | ||
1531 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | ||
1532 | struct vmcb *hsave = svm->hsave; | ||
1533 | u64 nested_save[] = { nested_vmcb->save.cr0, | ||
1534 | nested_vmcb->save.cr3, | ||
1535 | nested_vmcb->save.cr4, | ||
1536 | nested_vmcb->save.efer, | ||
1537 | nested_vmcb->control.intercept_cr_read, | ||
1538 | nested_vmcb->control.intercept_cr_write, | ||
1539 | nested_vmcb->control.intercept_dr_read, | ||
1540 | nested_vmcb->control.intercept_dr_write, | ||
1541 | nested_vmcb->control.intercept_exceptions, | ||
1542 | nested_vmcb->control.intercept, | ||
1543 | nested_vmcb->control.msrpm_base_pa, | ||
1544 | nested_vmcb->control.iopm_base_pa, | ||
1545 | nested_vmcb->control.tsc_offset }; | ||
1546 | 1588 | ||
1547 | /* Give the current vmcb to the guest */ | 1589 | /* Give the current vmcb to the guest */ |
1548 | memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); | 1590 | disable_gif(svm); |
1549 | nested_vmcb->save.cr0 = nested_save[0]; | 1591 | |
1550 | if (!npt_enabled) | 1592 | nested_vmcb->save.es = vmcb->save.es; |
1551 | nested_vmcb->save.cr3 = nested_save[1]; | 1593 | nested_vmcb->save.cs = vmcb->save.cs; |
1552 | nested_vmcb->save.cr4 = nested_save[2]; | 1594 | nested_vmcb->save.ss = vmcb->save.ss; |
1553 | nested_vmcb->save.efer = nested_save[3]; | 1595 | nested_vmcb->save.ds = vmcb->save.ds; |
1554 | nested_vmcb->control.intercept_cr_read = nested_save[4]; | 1596 | nested_vmcb->save.gdtr = vmcb->save.gdtr; |
1555 | nested_vmcb->control.intercept_cr_write = nested_save[5]; | 1597 | nested_vmcb->save.idtr = vmcb->save.idtr; |
1556 | nested_vmcb->control.intercept_dr_read = nested_save[6]; | 1598 | if (npt_enabled) |
1557 | nested_vmcb->control.intercept_dr_write = nested_save[7]; | 1599 | nested_vmcb->save.cr3 = vmcb->save.cr3; |
1558 | nested_vmcb->control.intercept_exceptions = nested_save[8]; | 1600 | nested_vmcb->save.cr2 = vmcb->save.cr2; |
1559 | nested_vmcb->control.intercept = nested_save[9]; | 1601 | nested_vmcb->save.rflags = vmcb->save.rflags; |
1560 | nested_vmcb->control.msrpm_base_pa = nested_save[10]; | 1602 | nested_vmcb->save.rip = vmcb->save.rip; |
1561 | nested_vmcb->control.iopm_base_pa = nested_save[11]; | 1603 | nested_vmcb->save.rsp = vmcb->save.rsp; |
1562 | nested_vmcb->control.tsc_offset = nested_save[12]; | 1604 | nested_vmcb->save.rax = vmcb->save.rax; |
1605 | nested_vmcb->save.dr7 = vmcb->save.dr7; | ||
1606 | nested_vmcb->save.dr6 = vmcb->save.dr6; | ||
1607 | nested_vmcb->save.cpl = vmcb->save.cpl; | ||
1608 | |||
1609 | nested_vmcb->control.int_ctl = vmcb->control.int_ctl; | ||
1610 | nested_vmcb->control.int_vector = vmcb->control.int_vector; | ||
1611 | nested_vmcb->control.int_state = vmcb->control.int_state; | ||
1612 | nested_vmcb->control.exit_code = vmcb->control.exit_code; | ||
1613 | nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; | ||
1614 | nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; | ||
1615 | nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; | ||
1616 | nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; | ||
1617 | nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; | ||
1618 | nested_vmcb->control.tlb_ctl = 0; | ||
1619 | nested_vmcb->control.event_inj = 0; | ||
1620 | nested_vmcb->control.event_inj_err = 0; | ||
1563 | 1621 | ||
1564 | /* We always set V_INTR_MASKING and remember the old value in hflags */ | 1622 | /* We always set V_INTR_MASKING and remember the old value in hflags */ |
1565 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) | 1623 | if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) |
1566 | nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; | 1624 | nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; |
1567 | 1625 | ||
1568 | if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && | ||
1569 | (nested_vmcb->control.int_vector)) { | ||
1570 | nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", | ||
1571 | nested_vmcb->control.int_vector); | ||
1572 | } | ||
1573 | |||
1574 | /* Restore the original control entries */ | 1626 | /* Restore the original control entries */ |
1575 | svm->vmcb->control = hsave->control; | 1627 | copy_vmcb_control_area(vmcb, hsave); |
1576 | 1628 | ||
1577 | /* Kill any pending exceptions */ | 1629 | /* Kill any pending exceptions */ |
1578 | if (svm->vcpu.arch.exception.pending == true) | 1630 | if (svm->vcpu.arch.exception.pending == true) |
1579 | nsvm_printk("WARNING: Pending Exception\n"); | 1631 | nsvm_printk("WARNING: Pending Exception\n"); |
1580 | svm->vcpu.arch.exception.pending = false; | 1632 | |
1633 | kvm_clear_exception_queue(&svm->vcpu); | ||
1634 | kvm_clear_interrupt_queue(&svm->vcpu); | ||
1581 | 1635 | ||
1582 | /* Restore selected save entries */ | 1636 | /* Restore selected save entries */ |
1583 | svm->vmcb->save.es = hsave->save.es; | 1637 | svm->vmcb->save.es = hsave->save.es; |
@@ -1603,19 +1657,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, | |||
1603 | svm->vmcb->save.cpl = 0; | 1657 | svm->vmcb->save.cpl = 0; |
1604 | svm->vmcb->control.exit_int_info = 0; | 1658 | svm->vmcb->control.exit_int_info = 0; |
1605 | 1659 | ||
1606 | svm->vcpu.arch.hflags &= ~HF_GIF_MASK; | ||
1607 | /* Exit nested SVM mode */ | 1660 | /* Exit nested SVM mode */ |
1608 | svm->nested_vmcb = 0; | 1661 | svm->nested.vmcb = 0; |
1609 | 1662 | ||
1610 | return 0; | 1663 | nested_svm_unmap(nested_vmcb, KM_USER0); |
1611 | } | ||
1612 | |||
1613 | static int nested_svm_vmexit(struct vcpu_svm *svm) | ||
1614 | { | ||
1615 | nsvm_printk("VMexit\n"); | ||
1616 | if (nested_svm_do(svm, svm->nested_vmcb, 0, | ||
1617 | NULL, nested_svm_vmexit_real)) | ||
1618 | return 1; | ||
1619 | 1664 | ||
1620 | kvm_mmu_reset_context(&svm->vcpu); | 1665 | kvm_mmu_reset_context(&svm->vcpu); |
1621 | kvm_mmu_load(&svm->vcpu); | 1666 | kvm_mmu_load(&svm->vcpu); |
@@ -1623,38 +1668,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) | |||
1623 | return 0; | 1668 | return 0; |
1624 | } | 1669 | } |
1625 | 1670 | ||
1626 | static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, | 1671 | static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) |
1627 | void *arg2, void *opaque) | ||
1628 | { | 1672 | { |
1673 | u32 *nested_msrpm; | ||
1629 | int i; | 1674 | int i; |
1630 | u32 *nested_msrpm = (u32*)arg1; | 1675 | |
1676 | nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); | ||
1677 | if (!nested_msrpm) | ||
1678 | return false; | ||
1679 | |||
1631 | for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) | 1680 | for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) |
1632 | svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; | 1681 | svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; |
1633 | svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); | ||
1634 | 1682 | ||
1635 | return 0; | 1683 | svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); |
1684 | |||
1685 | nested_svm_unmap(nested_msrpm, KM_USER0); | ||
1686 | |||
1687 | return true; | ||
1636 | } | 1688 | } |
1637 | 1689 | ||
1638 | static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | 1690 | static bool nested_svm_vmrun(struct vcpu_svm *svm) |
1639 | void *arg2, void *opaque) | ||
1640 | { | 1691 | { |
1641 | struct vmcb *nested_vmcb = (struct vmcb *)arg1; | 1692 | struct vmcb *nested_vmcb; |
1642 | struct vmcb *hsave = svm->hsave; | 1693 | struct vmcb *hsave = svm->nested.hsave; |
1694 | struct vmcb *vmcb = svm->vmcb; | ||
1695 | |||
1696 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); | ||
1697 | if (!nested_vmcb) | ||
1698 | return false; | ||
1643 | 1699 | ||
1644 | /* nested_vmcb is our indicator if nested SVM is activated */ | 1700 | /* nested_vmcb is our indicator if nested SVM is activated */ |
1645 | svm->nested_vmcb = svm->vmcb->save.rax; | 1701 | svm->nested.vmcb = svm->vmcb->save.rax; |
1646 | 1702 | ||
1647 | /* Clear internal status */ | 1703 | /* Clear internal status */ |
1648 | svm->vcpu.arch.exception.pending = false; | 1704 | kvm_clear_exception_queue(&svm->vcpu); |
1705 | kvm_clear_interrupt_queue(&svm->vcpu); | ||
1649 | 1706 | ||
1650 | /* Save the old vmcb, so we don't need to pick what we save, but | 1707 | /* Save the old vmcb, so we don't need to pick what we save, but |
1651 | can restore everything when a VMEXIT occurs */ | 1708 | can restore everything when a VMEXIT occurs */ |
1652 | memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); | 1709 | hsave->save.es = vmcb->save.es; |
1653 | /* We need to remember the original CR3 in the SPT case */ | 1710 | hsave->save.cs = vmcb->save.cs; |
1654 | if (!npt_enabled) | 1711 | hsave->save.ss = vmcb->save.ss; |
1655 | hsave->save.cr3 = svm->vcpu.arch.cr3; | 1712 | hsave->save.ds = vmcb->save.ds; |
1656 | hsave->save.cr4 = svm->vcpu.arch.cr4; | 1713 | hsave->save.gdtr = vmcb->save.gdtr; |
1657 | hsave->save.rip = svm->next_rip; | 1714 | hsave->save.idtr = vmcb->save.idtr; |
1715 | hsave->save.efer = svm->vcpu.arch.shadow_efer; | ||
1716 | hsave->save.cr0 = svm->vcpu.arch.cr0; | ||
1717 | hsave->save.cr4 = svm->vcpu.arch.cr4; | ||
1718 | hsave->save.rflags = vmcb->save.rflags; | ||
1719 | hsave->save.rip = svm->next_rip; | ||
1720 | hsave->save.rsp = vmcb->save.rsp; | ||
1721 | hsave->save.rax = vmcb->save.rax; | ||
1722 | if (npt_enabled) | ||
1723 | hsave->save.cr3 = vmcb->save.cr3; | ||
1724 | else | ||
1725 | hsave->save.cr3 = svm->vcpu.arch.cr3; | ||
1726 | |||
1727 | copy_vmcb_control_area(hsave, vmcb); | ||
1658 | 1728 | ||
1659 | if (svm->vmcb->save.rflags & X86_EFLAGS_IF) | 1729 | if (svm->vmcb->save.rflags & X86_EFLAGS_IF) |
1660 | svm->vcpu.arch.hflags |= HF_HIF_MASK; | 1730 | svm->vcpu.arch.hflags |= HF_HIF_MASK; |
@@ -1679,7 +1749,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | |||
1679 | kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); | 1749 | kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); |
1680 | kvm_mmu_reset_context(&svm->vcpu); | 1750 | kvm_mmu_reset_context(&svm->vcpu); |
1681 | } | 1751 | } |
1682 | svm->vmcb->save.cr2 = nested_vmcb->save.cr2; | 1752 | svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; |
1683 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); | 1753 | kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); |
1684 | kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); | 1754 | kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); |
1685 | kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); | 1755 | kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); |
@@ -1706,7 +1776,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | |||
1706 | 1776 | ||
1707 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; | 1777 | svm->vmcb->control.intercept |= nested_vmcb->control.intercept; |
1708 | 1778 | ||
1709 | svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; | 1779 | svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; |
1780 | |||
1781 | /* cache intercepts */ | ||
1782 | svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; | ||
1783 | svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; | ||
1784 | svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; | ||
1785 | svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; | ||
1786 | svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; | ||
1787 | svm->nested.intercept = nested_vmcb->control.intercept; | ||
1710 | 1788 | ||
1711 | force_new_asid(&svm->vcpu); | 1789 | force_new_asid(&svm->vcpu); |
1712 | svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; | 1790 | svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; |
@@ -1734,12 +1812,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, | |||
1734 | svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; | 1812 | svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; |
1735 | svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; | 1813 | svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; |
1736 | 1814 | ||
1737 | svm->vcpu.arch.hflags |= HF_GIF_MASK; | 1815 | nested_svm_unmap(nested_vmcb, KM_USER0); |
1738 | 1816 | ||
1739 | return 0; | 1817 | enable_gif(svm); |
1818 | |||
1819 | return true; | ||
1740 | } | 1820 | } |
1741 | 1821 | ||
1742 | static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) | 1822 | static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) |
1743 | { | 1823 | { |
1744 | to_vmcb->save.fs = from_vmcb->save.fs; | 1824 | to_vmcb->save.fs = from_vmcb->save.fs; |
1745 | to_vmcb->save.gs = from_vmcb->save.gs; | 1825 | to_vmcb->save.gs = from_vmcb->save.gs; |
@@ -1753,44 +1833,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) | |||
1753 | to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; | 1833 | to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; |
1754 | to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; | 1834 | to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; |
1755 | to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; | 1835 | to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; |
1756 | |||
1757 | return 1; | ||
1758 | } | ||
1759 | |||
1760 | static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, | ||
1761 | void *arg2, void *opaque) | ||
1762 | { | ||
1763 | return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); | ||
1764 | } | ||
1765 | |||
1766 | static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, | ||
1767 | void *arg2, void *opaque) | ||
1768 | { | ||
1769 | return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); | ||
1770 | } | 1836 | } |
1771 | 1837 | ||
1772 | static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1838 | static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1773 | { | 1839 | { |
1840 | struct vmcb *nested_vmcb; | ||
1841 | |||
1774 | if (nested_svm_check_permissions(svm)) | 1842 | if (nested_svm_check_permissions(svm)) |
1775 | return 1; | 1843 | return 1; |
1776 | 1844 | ||
1777 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1845 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1778 | skip_emulated_instruction(&svm->vcpu); | 1846 | skip_emulated_instruction(&svm->vcpu); |
1779 | 1847 | ||
1780 | nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); | 1848 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); |
1849 | if (!nested_vmcb) | ||
1850 | return 1; | ||
1851 | |||
1852 | nested_svm_vmloadsave(nested_vmcb, svm->vmcb); | ||
1853 | nested_svm_unmap(nested_vmcb, KM_USER0); | ||
1781 | 1854 | ||
1782 | return 1; | 1855 | return 1; |
1783 | } | 1856 | } |
1784 | 1857 | ||
1785 | static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1858 | static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1786 | { | 1859 | { |
1860 | struct vmcb *nested_vmcb; | ||
1861 | |||
1787 | if (nested_svm_check_permissions(svm)) | 1862 | if (nested_svm_check_permissions(svm)) |
1788 | return 1; | 1863 | return 1; |
1789 | 1864 | ||
1790 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1865 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1791 | skip_emulated_instruction(&svm->vcpu); | 1866 | skip_emulated_instruction(&svm->vcpu); |
1792 | 1867 | ||
1793 | nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); | 1868 | nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); |
1869 | if (!nested_vmcb) | ||
1870 | return 1; | ||
1871 | |||
1872 | nested_svm_vmloadsave(svm->vmcb, nested_vmcb); | ||
1873 | nested_svm_unmap(nested_vmcb, KM_USER0); | ||
1794 | 1874 | ||
1795 | return 1; | 1875 | return 1; |
1796 | } | 1876 | } |
@@ -1798,19 +1878,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1798 | static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1878 | static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1799 | { | 1879 | { |
1800 | nsvm_printk("VMrun\n"); | 1880 | nsvm_printk("VMrun\n"); |
1881 | |||
1801 | if (nested_svm_check_permissions(svm)) | 1882 | if (nested_svm_check_permissions(svm)) |
1802 | return 1; | 1883 | return 1; |
1803 | 1884 | ||
1804 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1885 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1805 | skip_emulated_instruction(&svm->vcpu); | 1886 | skip_emulated_instruction(&svm->vcpu); |
1806 | 1887 | ||
1807 | if (nested_svm_do(svm, svm->vmcb->save.rax, 0, | 1888 | if (!nested_svm_vmrun(svm)) |
1808 | NULL, nested_svm_vmrun)) | ||
1809 | return 1; | 1889 | return 1; |
1810 | 1890 | ||
1811 | if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, | 1891 | if (!nested_svm_vmrun_msrpm(svm)) |
1812 | NULL, nested_svm_vmrun_msrpm)) | 1892 | goto failed; |
1813 | return 1; | 1893 | |
1894 | return 1; | ||
1895 | |||
1896 | failed: | ||
1897 | |||
1898 | svm->vmcb->control.exit_code = SVM_EXIT_ERR; | ||
1899 | svm->vmcb->control.exit_code_hi = 0; | ||
1900 | svm->vmcb->control.exit_info_1 = 0; | ||
1901 | svm->vmcb->control.exit_info_2 = 0; | ||
1902 | |||
1903 | nested_svm_vmexit(svm); | ||
1814 | 1904 | ||
1815 | return 1; | 1905 | return 1; |
1816 | } | 1906 | } |
@@ -1823,7 +1913,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1823 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1913 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1824 | skip_emulated_instruction(&svm->vcpu); | 1914 | skip_emulated_instruction(&svm->vcpu); |
1825 | 1915 | ||
1826 | svm->vcpu.arch.hflags |= HF_GIF_MASK; | 1916 | enable_gif(svm); |
1827 | 1917 | ||
1828 | return 1; | 1918 | return 1; |
1829 | } | 1919 | } |
@@ -1836,7 +1926,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1836 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | 1926 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1837 | skip_emulated_instruction(&svm->vcpu); | 1927 | skip_emulated_instruction(&svm->vcpu); |
1838 | 1928 | ||
1839 | svm->vcpu.arch.hflags &= ~HF_GIF_MASK; | 1929 | disable_gif(svm); |
1840 | 1930 | ||
1841 | /* After a CLGI no interrupts should come */ | 1931 | /* After a CLGI no interrupts should come */ |
1842 | svm_clear_vintr(svm); | 1932 | svm_clear_vintr(svm); |
@@ -1845,6 +1935,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1845 | return 1; | 1935 | return 1; |
1846 | } | 1936 | } |
1847 | 1937 | ||
1938 | static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1939 | { | ||
1940 | struct kvm_vcpu *vcpu = &svm->vcpu; | ||
1941 | nsvm_printk("INVLPGA\n"); | ||
1942 | |||
1943 | /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ | ||
1944 | kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); | ||
1945 | |||
1946 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; | ||
1947 | skip_emulated_instruction(&svm->vcpu); | ||
1948 | return 1; | ||
1949 | } | ||
1950 | |||
1848 | static int invalid_op_interception(struct vcpu_svm *svm, | 1951 | static int invalid_op_interception(struct vcpu_svm *svm, |
1849 | struct kvm_run *kvm_run) | 1952 | struct kvm_run *kvm_run) |
1850 | { | 1953 | { |
@@ -1953,7 +2056,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
1953 | struct vcpu_svm *svm = to_svm(vcpu); | 2056 | struct vcpu_svm *svm = to_svm(vcpu); |
1954 | 2057 | ||
1955 | switch (ecx) { | 2058 | switch (ecx) { |
1956 | case MSR_IA32_TIME_STAMP_COUNTER: { | 2059 | case MSR_IA32_TSC: { |
1957 | u64 tsc; | 2060 | u64 tsc; |
1958 | 2061 | ||
1959 | rdtscll(tsc); | 2062 | rdtscll(tsc); |
@@ -1981,10 +2084,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
1981 | *data = svm->vmcb->save.sysenter_cs; | 2084 | *data = svm->vmcb->save.sysenter_cs; |
1982 | break; | 2085 | break; |
1983 | case MSR_IA32_SYSENTER_EIP: | 2086 | case MSR_IA32_SYSENTER_EIP: |
1984 | *data = svm->vmcb->save.sysenter_eip; | 2087 | *data = svm->sysenter_eip; |
1985 | break; | 2088 | break; |
1986 | case MSR_IA32_SYSENTER_ESP: | 2089 | case MSR_IA32_SYSENTER_ESP: |
1987 | *data = svm->vmcb->save.sysenter_esp; | 2090 | *data = svm->sysenter_esp; |
1988 | break; | 2091 | break; |
1989 | /* Nobody will change the following 5 values in the VMCB so | 2092 | /* Nobody will change the following 5 values in the VMCB so |
1990 | we can safely return them on rdmsr. They will always be 0 | 2093 | we can safely return them on rdmsr. They will always be 0 |
@@ -2005,7 +2108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
2005 | *data = svm->vmcb->save.last_excp_to; | 2108 | *data = svm->vmcb->save.last_excp_to; |
2006 | break; | 2109 | break; |
2007 | case MSR_VM_HSAVE_PA: | 2110 | case MSR_VM_HSAVE_PA: |
2008 | *data = svm->hsave_msr; | 2111 | *data = svm->nested.hsave_msr; |
2009 | break; | 2112 | break; |
2010 | case MSR_VM_CR: | 2113 | case MSR_VM_CR: |
2011 | *data = 0; | 2114 | *data = 0; |
@@ -2027,8 +2130,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
2027 | if (svm_get_msr(&svm->vcpu, ecx, &data)) | 2130 | if (svm_get_msr(&svm->vcpu, ecx, &data)) |
2028 | kvm_inject_gp(&svm->vcpu, 0); | 2131 | kvm_inject_gp(&svm->vcpu, 0); |
2029 | else { | 2132 | else { |
2030 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, | 2133 | trace_kvm_msr_read(ecx, data); |
2031 | (u32)(data >> 32), handler); | ||
2032 | 2134 | ||
2033 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; | 2135 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; |
2034 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | 2136 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
@@ -2043,7 +2145,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2043 | struct vcpu_svm *svm = to_svm(vcpu); | 2145 | struct vcpu_svm *svm = to_svm(vcpu); |
2044 | 2146 | ||
2045 | switch (ecx) { | 2147 | switch (ecx) { |
2046 | case MSR_IA32_TIME_STAMP_COUNTER: { | 2148 | case MSR_IA32_TSC: { |
2047 | u64 tsc; | 2149 | u64 tsc; |
2048 | 2150 | ||
2049 | rdtscll(tsc); | 2151 | rdtscll(tsc); |
@@ -2071,9 +2173,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2071 | svm->vmcb->save.sysenter_cs = data; | 2173 | svm->vmcb->save.sysenter_cs = data; |
2072 | break; | 2174 | break; |
2073 | case MSR_IA32_SYSENTER_EIP: | 2175 | case MSR_IA32_SYSENTER_EIP: |
2176 | svm->sysenter_eip = data; | ||
2074 | svm->vmcb->save.sysenter_eip = data; | 2177 | svm->vmcb->save.sysenter_eip = data; |
2075 | break; | 2178 | break; |
2076 | case MSR_IA32_SYSENTER_ESP: | 2179 | case MSR_IA32_SYSENTER_ESP: |
2180 | svm->sysenter_esp = data; | ||
2077 | svm->vmcb->save.sysenter_esp = data; | 2181 | svm->vmcb->save.sysenter_esp = data; |
2078 | break; | 2182 | break; |
2079 | case MSR_IA32_DEBUGCTLMSR: | 2183 | case MSR_IA32_DEBUGCTLMSR: |
@@ -2091,24 +2195,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
2091 | else | 2195 | else |
2092 | svm_disable_lbrv(svm); | 2196 | svm_disable_lbrv(svm); |
2093 | break; | 2197 | break; |
2094 | case MSR_K7_EVNTSEL0: | ||
2095 | case MSR_K7_EVNTSEL1: | ||
2096 | case MSR_K7_EVNTSEL2: | ||
2097 | case MSR_K7_EVNTSEL3: | ||
2098 | case MSR_K7_PERFCTR0: | ||
2099 | case MSR_K7_PERFCTR1: | ||
2100 | case MSR_K7_PERFCTR2: | ||
2101 | case MSR_K7_PERFCTR3: | ||
2102 | /* | ||
2103 | * Just discard all writes to the performance counters; this | ||
2104 | * should keep both older linux and windows 64-bit guests | ||
2105 | * happy | ||
2106 | */ | ||
2107 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); | ||
2108 | |||
2109 | break; | ||
2110 | case MSR_VM_HSAVE_PA: | 2198 | case MSR_VM_HSAVE_PA: |
2111 | svm->hsave_msr = data; | 2199 | svm->nested.hsave_msr = data; |
2200 | break; | ||
2201 | case MSR_VM_CR: | ||
2202 | case MSR_VM_IGNNE: | ||
2203 | pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); | ||
2112 | break; | 2204 | break; |
2113 | default: | 2205 | default: |
2114 | return kvm_set_msr_common(vcpu, ecx, data); | 2206 | return kvm_set_msr_common(vcpu, ecx, data); |
@@ -2122,8 +2214,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
2122 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | 2214 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) |
2123 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 2215 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
2124 | 2216 | ||
2125 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), | 2217 | trace_kvm_msr_write(ecx, data); |
2126 | handler); | ||
2127 | 2218 | ||
2128 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; | 2219 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
2129 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 2220 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
@@ -2144,8 +2235,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
2144 | static int interrupt_window_interception(struct vcpu_svm *svm, | 2235 | static int interrupt_window_interception(struct vcpu_svm *svm, |
2145 | struct kvm_run *kvm_run) | 2236 | struct kvm_run *kvm_run) |
2146 | { | 2237 | { |
2147 | KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); | ||
2148 | |||
2149 | svm_clear_vintr(svm); | 2238 | svm_clear_vintr(svm); |
2150 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | 2239 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; |
2151 | /* | 2240 | /* |
@@ -2201,7 +2290,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
2201 | [SVM_EXIT_INVD] = emulate_on_interception, | 2290 | [SVM_EXIT_INVD] = emulate_on_interception, |
2202 | [SVM_EXIT_HLT] = halt_interception, | 2291 | [SVM_EXIT_HLT] = halt_interception, |
2203 | [SVM_EXIT_INVLPG] = invlpg_interception, | 2292 | [SVM_EXIT_INVLPG] = invlpg_interception, |
2204 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | 2293 | [SVM_EXIT_INVLPGA] = invlpga_interception, |
2205 | [SVM_EXIT_IOIO] = io_interception, | 2294 | [SVM_EXIT_IOIO] = io_interception, |
2206 | [SVM_EXIT_MSR] = msr_interception, | 2295 | [SVM_EXIT_MSR] = msr_interception, |
2207 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, | 2296 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, |
@@ -2224,20 +2313,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2224 | struct vcpu_svm *svm = to_svm(vcpu); | 2313 | struct vcpu_svm *svm = to_svm(vcpu); |
2225 | u32 exit_code = svm->vmcb->control.exit_code; | 2314 | u32 exit_code = svm->vmcb->control.exit_code; |
2226 | 2315 | ||
2227 | KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, | 2316 | trace_kvm_exit(exit_code, svm->vmcb->save.rip); |
2228 | (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); | ||
2229 | 2317 | ||
2230 | if (is_nested(svm)) { | 2318 | if (is_nested(svm)) { |
2319 | int vmexit; | ||
2320 | |||
2231 | nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", | 2321 | nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", |
2232 | exit_code, svm->vmcb->control.exit_info_1, | 2322 | exit_code, svm->vmcb->control.exit_info_1, |
2233 | svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); | 2323 | svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); |
2234 | if (nested_svm_exit_handled(svm, true)) { | 2324 | |
2235 | nested_svm_vmexit(svm); | 2325 | vmexit = nested_svm_exit_special(svm); |
2236 | nsvm_printk("-> #VMEXIT\n"); | 2326 | |
2327 | if (vmexit == NESTED_EXIT_CONTINUE) | ||
2328 | vmexit = nested_svm_exit_handled(svm); | ||
2329 | |||
2330 | if (vmexit == NESTED_EXIT_DONE) | ||
2237 | return 1; | 2331 | return 1; |
2238 | } | ||
2239 | } | 2332 | } |
2240 | 2333 | ||
2334 | svm_complete_interrupts(svm); | ||
2335 | |||
2241 | if (npt_enabled) { | 2336 | if (npt_enabled) { |
2242 | int mmu_reload = 0; | 2337 | int mmu_reload = 0; |
2243 | if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { | 2338 | if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { |
@@ -2246,12 +2341,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2246 | } | 2341 | } |
2247 | vcpu->arch.cr0 = svm->vmcb->save.cr0; | 2342 | vcpu->arch.cr0 = svm->vmcb->save.cr0; |
2248 | vcpu->arch.cr3 = svm->vmcb->save.cr3; | 2343 | vcpu->arch.cr3 = svm->vmcb->save.cr3; |
2249 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
2250 | if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
2251 | kvm_inject_gp(vcpu, 0); | ||
2252 | return 1; | ||
2253 | } | ||
2254 | } | ||
2255 | if (mmu_reload) { | 2344 | if (mmu_reload) { |
2256 | kvm_mmu_reset_context(vcpu); | 2345 | kvm_mmu_reset_context(vcpu); |
2257 | kvm_mmu_load(vcpu); | 2346 | kvm_mmu_load(vcpu); |
@@ -2319,7 +2408,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
2319 | { | 2408 | { |
2320 | struct vmcb_control_area *control; | 2409 | struct vmcb_control_area *control; |
2321 | 2410 | ||
2322 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); | 2411 | trace_kvm_inj_virq(irq); |
2323 | 2412 | ||
2324 | ++svm->vcpu.stat.irq_injections; | 2413 | ++svm->vcpu.stat.irq_injections; |
2325 | control = &svm->vmcb->control; | 2414 | control = &svm->vmcb->control; |
@@ -2329,21 +2418,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
2329 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | 2418 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
2330 | } | 2419 | } |
2331 | 2420 | ||
2332 | static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) | ||
2333 | { | ||
2334 | struct vcpu_svm *svm = to_svm(vcpu); | ||
2335 | |||
2336 | svm->vmcb->control.event_inj = nr | | ||
2337 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; | ||
2338 | } | ||
2339 | |||
2340 | static void svm_set_irq(struct kvm_vcpu *vcpu) | 2421 | static void svm_set_irq(struct kvm_vcpu *vcpu) |
2341 | { | 2422 | { |
2342 | struct vcpu_svm *svm = to_svm(vcpu); | 2423 | struct vcpu_svm *svm = to_svm(vcpu); |
2343 | 2424 | ||
2344 | nested_svm_intr(svm); | 2425 | BUG_ON(!(gif_set(svm))); |
2345 | 2426 | ||
2346 | svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); | 2427 | svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | |
2428 | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; | ||
2347 | } | 2429 | } |
2348 | 2430 | ||
2349 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | 2431 | static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) |
@@ -2371,13 +2453,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
2371 | struct vmcb *vmcb = svm->vmcb; | 2453 | struct vmcb *vmcb = svm->vmcb; |
2372 | return (vmcb->save.rflags & X86_EFLAGS_IF) && | 2454 | return (vmcb->save.rflags & X86_EFLAGS_IF) && |
2373 | !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && | 2455 | !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && |
2374 | (svm->vcpu.arch.hflags & HF_GIF_MASK); | 2456 | gif_set(svm) && |
2457 | !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); | ||
2375 | } | 2458 | } |
2376 | 2459 | ||
2377 | static void enable_irq_window(struct kvm_vcpu *vcpu) | 2460 | static void enable_irq_window(struct kvm_vcpu *vcpu) |
2378 | { | 2461 | { |
2379 | svm_set_vintr(to_svm(vcpu)); | 2462 | struct vcpu_svm *svm = to_svm(vcpu); |
2380 | svm_inject_irq(to_svm(vcpu), 0x0); | 2463 | nsvm_printk("Trying to open IRQ window\n"); |
2464 | |||
2465 | nested_svm_intr(svm); | ||
2466 | |||
2467 | /* In case GIF=0 we can't rely on the CPU to tell us when | ||
2468 | * GIF becomes 1, because that's a separate STGI/VMRUN intercept. | ||
2469 | * The next time we get that intercept, this function will be | ||
2470 | * called again though and we'll get the vintr intercept. */ | ||
2471 | if (gif_set(svm)) { | ||
2472 | svm_set_vintr(svm); | ||
2473 | svm_inject_irq(svm, 0x0); | ||
2474 | } | ||
2381 | } | 2475 | } |
2382 | 2476 | ||
2383 | static void enable_nmi_window(struct kvm_vcpu *vcpu) | 2477 | static void enable_nmi_window(struct kvm_vcpu *vcpu) |
@@ -2456,6 +2550,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) | |||
2456 | case SVM_EXITINTINFO_TYPE_EXEPT: | 2550 | case SVM_EXITINTINFO_TYPE_EXEPT: |
2457 | /* In case of software exception do not reinject an exception | 2551 | /* In case of software exception do not reinject an exception |
2458 | vector, but re-execute an instruction instead */ | 2552 | vector, but re-execute an instruction instead */ |
2553 | if (is_nested(svm)) | ||
2554 | break; | ||
2459 | if (kvm_exception_is_soft(vector)) | 2555 | if (kvm_exception_is_soft(vector)) |
2460 | break; | 2556 | break; |
2461 | if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { | 2557 | if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { |
@@ -2498,9 +2594,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2498 | fs_selector = kvm_read_fs(); | 2594 | fs_selector = kvm_read_fs(); |
2499 | gs_selector = kvm_read_gs(); | 2595 | gs_selector = kvm_read_gs(); |
2500 | ldt_selector = kvm_read_ldt(); | 2596 | ldt_selector = kvm_read_ldt(); |
2501 | svm->host_cr2 = kvm_read_cr2(); | 2597 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
2502 | if (!is_nested(svm)) | ||
2503 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | ||
2504 | /* required for live migration with NPT */ | 2598 | /* required for live migration with NPT */ |
2505 | if (npt_enabled) | 2599 | if (npt_enabled) |
2506 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | 2600 | svm->vmcb->save.cr3 = vcpu->arch.cr3; |
@@ -2585,8 +2679,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2585 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | 2679 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; |
2586 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | 2680 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; |
2587 | 2681 | ||
2588 | kvm_write_cr2(svm->host_cr2); | ||
2589 | |||
2590 | kvm_load_fs(fs_selector); | 2682 | kvm_load_fs(fs_selector); |
2591 | kvm_load_gs(gs_selector); | 2683 | kvm_load_gs(gs_selector); |
2592 | kvm_load_ldt(ldt_selector); | 2684 | kvm_load_ldt(ldt_selector); |
@@ -2602,7 +2694,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2602 | 2694 | ||
2603 | svm->next_rip = 0; | 2695 | svm->next_rip = 0; |
2604 | 2696 | ||
2605 | svm_complete_interrupts(svm); | 2697 | if (npt_enabled) { |
2698 | vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); | ||
2699 | vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); | ||
2700 | } | ||
2606 | } | 2701 | } |
2607 | 2702 | ||
2608 | #undef R | 2703 | #undef R |
@@ -2673,6 +2768,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | |||
2673 | return 0; | 2768 | return 0; |
2674 | } | 2769 | } |
2675 | 2770 | ||
2771 | static const struct trace_print_flags svm_exit_reasons_str[] = { | ||
2772 | { SVM_EXIT_READ_CR0, "read_cr0" }, | ||
2773 | { SVM_EXIT_READ_CR3, "read_cr3" }, | ||
2774 | { SVM_EXIT_READ_CR4, "read_cr4" }, | ||
2775 | { SVM_EXIT_READ_CR8, "read_cr8" }, | ||
2776 | { SVM_EXIT_WRITE_CR0, "write_cr0" }, | ||
2777 | { SVM_EXIT_WRITE_CR3, "write_cr3" }, | ||
2778 | { SVM_EXIT_WRITE_CR4, "write_cr4" }, | ||
2779 | { SVM_EXIT_WRITE_CR8, "write_cr8" }, | ||
2780 | { SVM_EXIT_READ_DR0, "read_dr0" }, | ||
2781 | { SVM_EXIT_READ_DR1, "read_dr1" }, | ||
2782 | { SVM_EXIT_READ_DR2, "read_dr2" }, | ||
2783 | { SVM_EXIT_READ_DR3, "read_dr3" }, | ||
2784 | { SVM_EXIT_WRITE_DR0, "write_dr0" }, | ||
2785 | { SVM_EXIT_WRITE_DR1, "write_dr1" }, | ||
2786 | { SVM_EXIT_WRITE_DR2, "write_dr2" }, | ||
2787 | { SVM_EXIT_WRITE_DR3, "write_dr3" }, | ||
2788 | { SVM_EXIT_WRITE_DR5, "write_dr5" }, | ||
2789 | { SVM_EXIT_WRITE_DR7, "write_dr7" }, | ||
2790 | { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, | ||
2791 | { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, | ||
2792 | { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, | ||
2793 | { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, | ||
2794 | { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, | ||
2795 | { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, | ||
2796 | { SVM_EXIT_INTR, "interrupt" }, | ||
2797 | { SVM_EXIT_NMI, "nmi" }, | ||
2798 | { SVM_EXIT_SMI, "smi" }, | ||
2799 | { SVM_EXIT_INIT, "init" }, | ||
2800 | { SVM_EXIT_VINTR, "vintr" }, | ||
2801 | { SVM_EXIT_CPUID, "cpuid" }, | ||
2802 | { SVM_EXIT_INVD, "invd" }, | ||
2803 | { SVM_EXIT_HLT, "hlt" }, | ||
2804 | { SVM_EXIT_INVLPG, "invlpg" }, | ||
2805 | { SVM_EXIT_INVLPGA, "invlpga" }, | ||
2806 | { SVM_EXIT_IOIO, "io" }, | ||
2807 | { SVM_EXIT_MSR, "msr" }, | ||
2808 | { SVM_EXIT_TASK_SWITCH, "task_switch" }, | ||
2809 | { SVM_EXIT_SHUTDOWN, "shutdown" }, | ||
2810 | { SVM_EXIT_VMRUN, "vmrun" }, | ||
2811 | { SVM_EXIT_VMMCALL, "hypercall" }, | ||
2812 | { SVM_EXIT_VMLOAD, "vmload" }, | ||
2813 | { SVM_EXIT_VMSAVE, "vmsave" }, | ||
2814 | { SVM_EXIT_STGI, "stgi" }, | ||
2815 | { SVM_EXIT_CLGI, "clgi" }, | ||
2816 | { SVM_EXIT_SKINIT, "skinit" }, | ||
2817 | { SVM_EXIT_WBINVD, "wbinvd" }, | ||
2818 | { SVM_EXIT_MONITOR, "monitor" }, | ||
2819 | { SVM_EXIT_MWAIT, "mwait" }, | ||
2820 | { SVM_EXIT_NPF, "npf" }, | ||
2821 | { -1, NULL } | ||
2822 | }; | ||
2823 | |||
2824 | static bool svm_gb_page_enable(void) | ||
2825 | { | ||
2826 | return true; | ||
2827 | } | ||
2828 | |||
2676 | static struct kvm_x86_ops svm_x86_ops = { | 2829 | static struct kvm_x86_ops svm_x86_ops = { |
2677 | .cpu_has_kvm_support = has_svm, | 2830 | .cpu_has_kvm_support = has_svm, |
2678 | .disabled_by_bios = is_disabled, | 2831 | .disabled_by_bios = is_disabled, |
@@ -2710,6 +2863,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
2710 | .set_gdt = svm_set_gdt, | 2863 | .set_gdt = svm_set_gdt, |
2711 | .get_dr = svm_get_dr, | 2864 | .get_dr = svm_get_dr, |
2712 | .set_dr = svm_set_dr, | 2865 | .set_dr = svm_set_dr, |
2866 | .cache_reg = svm_cache_reg, | ||
2713 | .get_rflags = svm_get_rflags, | 2867 | .get_rflags = svm_get_rflags, |
2714 | .set_rflags = svm_set_rflags, | 2868 | .set_rflags = svm_set_rflags, |
2715 | 2869 | ||
@@ -2733,6 +2887,9 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
2733 | .set_tss_addr = svm_set_tss_addr, | 2887 | .set_tss_addr = svm_set_tss_addr, |
2734 | .get_tdp_level = get_npt_level, | 2888 | .get_tdp_level = get_npt_level, |
2735 | .get_mt_mask = svm_get_mt_mask, | 2889 | .get_mt_mask = svm_get_mt_mask, |
2890 | |||
2891 | .exit_reasons_str = svm_exit_reasons_str, | ||
2892 | .gb_page_enable = svm_gb_page_enable, | ||
2736 | }; | 2893 | }; |
2737 | 2894 | ||
2738 | static int __init svm_init(void) | 2895 | static int __init svm_init(void) |
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 86dbac072d0..eea40439066 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c | |||
@@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) | |||
9 | int restart_timer = 0; | 9 | int restart_timer = 0; |
10 | wait_queue_head_t *q = &vcpu->wq; | 10 | wait_queue_head_t *q = &vcpu->wq; |
11 | 11 | ||
12 | /* FIXME: this code should not know anything about vcpus */ | 12 | /* |
13 | if (!atomic_inc_and_test(&ktimer->pending)) | 13 | * There is a race window between reading and incrementing, but we do |
14 | * not care about potentially losing timer events in the !reinject | ||
15 | * case anyway. | ||
16 | */ | ||
17 | if (ktimer->reinject || !atomic_read(&ktimer->pending)) { | ||
18 | atomic_inc(&ktimer->pending); | ||
19 | /* FIXME: this code should not know anything about vcpus */ | ||
14 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); | 20 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); |
15 | 21 | } | |
16 | if (!ktimer->reinject) | ||
17 | atomic_set(&ktimer->pending, 1); | ||
18 | 22 | ||
19 | if (waitqueue_active(q)) | 23 | if (waitqueue_active(q)) |
20 | wake_up_interruptible(q); | 24 | wake_up_interruptible(q); |
@@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) | |||
33 | struct kvm_vcpu *vcpu; | 37 | struct kvm_vcpu *vcpu; |
34 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); | 38 | struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); |
35 | 39 | ||
36 | vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; | 40 | vcpu = ktimer->vcpu; |
37 | if (!vcpu) | 41 | if (!vcpu) |
38 | return HRTIMER_NORESTART; | 42 | return HRTIMER_NORESTART; |
39 | 43 | ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h new file mode 100644 index 00000000000..0d480e77eac --- /dev/null +++ b/arch/x86/kvm/trace.h | |||
@@ -0,0 +1,355 @@ | |||
1 | #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) | ||
2 | #define _TRACE_KVM_H | ||
3 | |||
4 | #include <linux/tracepoint.h> | ||
5 | |||
6 | #undef TRACE_SYSTEM | ||
7 | #define TRACE_SYSTEM kvm | ||
8 | #define TRACE_INCLUDE_PATH arch/x86/kvm | ||
9 | #define TRACE_INCLUDE_FILE trace | ||
10 | |||
11 | /* | ||
12 | * Tracepoint for guest mode entry. | ||
13 | */ | ||
14 | TRACE_EVENT(kvm_entry, | ||
15 | TP_PROTO(unsigned int vcpu_id), | ||
16 | TP_ARGS(vcpu_id), | ||
17 | |||
18 | TP_STRUCT__entry( | ||
19 | __field( unsigned int, vcpu_id ) | ||
20 | ), | ||
21 | |||
22 | TP_fast_assign( | ||
23 | __entry->vcpu_id = vcpu_id; | ||
24 | ), | ||
25 | |||
26 | TP_printk("vcpu %u", __entry->vcpu_id) | ||
27 | ); | ||
28 | |||
29 | /* | ||
30 | * Tracepoint for hypercall. | ||
31 | */ | ||
32 | TRACE_EVENT(kvm_hypercall, | ||
33 | TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, | ||
34 | unsigned long a2, unsigned long a3), | ||
35 | TP_ARGS(nr, a0, a1, a2, a3), | ||
36 | |||
37 | TP_STRUCT__entry( | ||
38 | __field( unsigned long, nr ) | ||
39 | __field( unsigned long, a0 ) | ||
40 | __field( unsigned long, a1 ) | ||
41 | __field( unsigned long, a2 ) | ||
42 | __field( unsigned long, a3 ) | ||
43 | ), | ||
44 | |||
45 | TP_fast_assign( | ||
46 | __entry->nr = nr; | ||
47 | __entry->a0 = a0; | ||
48 | __entry->a1 = a1; | ||
49 | __entry->a2 = a2; | ||
50 | __entry->a3 = a3; | ||
51 | ), | ||
52 | |||
53 | TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", | ||
54 | __entry->nr, __entry->a0, __entry->a1, __entry->a2, | ||
55 | __entry->a3) | ||
56 | ); | ||
57 | |||
58 | /* | ||
59 | * Tracepoint for PIO. | ||
60 | */ | ||
61 | TRACE_EVENT(kvm_pio, | ||
62 | TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, | ||
63 | unsigned int count), | ||
64 | TP_ARGS(rw, port, size, count), | ||
65 | |||
66 | TP_STRUCT__entry( | ||
67 | __field( unsigned int, rw ) | ||
68 | __field( unsigned int, port ) | ||
69 | __field( unsigned int, size ) | ||
70 | __field( unsigned int, count ) | ||
71 | ), | ||
72 | |||
73 | TP_fast_assign( | ||
74 | __entry->rw = rw; | ||
75 | __entry->port = port; | ||
76 | __entry->size = size; | ||
77 | __entry->count = count; | ||
78 | ), | ||
79 | |||
80 | TP_printk("pio_%s at 0x%x size %d count %d", | ||
81 | __entry->rw ? "write" : "read", | ||
82 | __entry->port, __entry->size, __entry->count) | ||
83 | ); | ||
84 | |||
85 | /* | ||
86 | * Tracepoint for cpuid. | ||
87 | */ | ||
88 | TRACE_EVENT(kvm_cpuid, | ||
89 | TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, | ||
90 | unsigned long rcx, unsigned long rdx), | ||
91 | TP_ARGS(function, rax, rbx, rcx, rdx), | ||
92 | |||
93 | TP_STRUCT__entry( | ||
94 | __field( unsigned int, function ) | ||
95 | __field( unsigned long, rax ) | ||
96 | __field( unsigned long, rbx ) | ||
97 | __field( unsigned long, rcx ) | ||
98 | __field( unsigned long, rdx ) | ||
99 | ), | ||
100 | |||
101 | TP_fast_assign( | ||
102 | __entry->function = function; | ||
103 | __entry->rax = rax; | ||
104 | __entry->rbx = rbx; | ||
105 | __entry->rcx = rcx; | ||
106 | __entry->rdx = rdx; | ||
107 | ), | ||
108 | |||
109 | TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", | ||
110 | __entry->function, __entry->rax, | ||
111 | __entry->rbx, __entry->rcx, __entry->rdx) | ||
112 | ); | ||
113 | |||
114 | #define AREG(x) { APIC_##x, "APIC_" #x } | ||
115 | |||
116 | #define kvm_trace_symbol_apic \ | ||
117 | AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ | ||
118 | AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ | ||
119 | AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ | ||
120 | AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ | ||
121 | AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ | ||
122 | AREG(ECTRL) | ||
123 | /* | ||
124 | * Tracepoint for apic access. | ||
125 | */ | ||
126 | TRACE_EVENT(kvm_apic, | ||
127 | TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), | ||
128 | TP_ARGS(rw, reg, val), | ||
129 | |||
130 | TP_STRUCT__entry( | ||
131 | __field( unsigned int, rw ) | ||
132 | __field( unsigned int, reg ) | ||
133 | __field( unsigned int, val ) | ||
134 | ), | ||
135 | |||
136 | TP_fast_assign( | ||
137 | __entry->rw = rw; | ||
138 | __entry->reg = reg; | ||
139 | __entry->val = val; | ||
140 | ), | ||
141 | |||
142 | TP_printk("apic_%s %s = 0x%x", | ||
143 | __entry->rw ? "write" : "read", | ||
144 | __print_symbolic(__entry->reg, kvm_trace_symbol_apic), | ||
145 | __entry->val) | ||
146 | ); | ||
147 | |||
148 | #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) | ||
149 | #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) | ||
150 | |||
151 | /* | ||
152 | * Tracepoint for kvm guest exit: | ||
153 | */ | ||
154 | TRACE_EVENT(kvm_exit, | ||
155 | TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), | ||
156 | TP_ARGS(exit_reason, guest_rip), | ||
157 | |||
158 | TP_STRUCT__entry( | ||
159 | __field( unsigned int, exit_reason ) | ||
160 | __field( unsigned long, guest_rip ) | ||
161 | ), | ||
162 | |||
163 | TP_fast_assign( | ||
164 | __entry->exit_reason = exit_reason; | ||
165 | __entry->guest_rip = guest_rip; | ||
166 | ), | ||
167 | |||
168 | TP_printk("reason %s rip 0x%lx", | ||
169 | ftrace_print_symbols_seq(p, __entry->exit_reason, | ||
170 | kvm_x86_ops->exit_reasons_str), | ||
171 | __entry->guest_rip) | ||
172 | ); | ||
173 | |||
174 | /* | ||
175 | * Tracepoint for kvm interrupt injection: | ||
176 | */ | ||
177 | TRACE_EVENT(kvm_inj_virq, | ||
178 | TP_PROTO(unsigned int irq), | ||
179 | TP_ARGS(irq), | ||
180 | |||
181 | TP_STRUCT__entry( | ||
182 | __field( unsigned int, irq ) | ||
183 | ), | ||
184 | |||
185 | TP_fast_assign( | ||
186 | __entry->irq = irq; | ||
187 | ), | ||
188 | |||
189 | TP_printk("irq %u", __entry->irq) | ||
190 | ); | ||
191 | |||
192 | /* | ||
193 | * Tracepoint for page fault. | ||
194 | */ | ||
195 | TRACE_EVENT(kvm_page_fault, | ||
196 | TP_PROTO(unsigned long fault_address, unsigned int error_code), | ||
197 | TP_ARGS(fault_address, error_code), | ||
198 | |||
199 | TP_STRUCT__entry( | ||
200 | __field( unsigned long, fault_address ) | ||
201 | __field( unsigned int, error_code ) | ||
202 | ), | ||
203 | |||
204 | TP_fast_assign( | ||
205 | __entry->fault_address = fault_address; | ||
206 | __entry->error_code = error_code; | ||
207 | ), | ||
208 | |||
209 | TP_printk("address %lx error_code %x", | ||
210 | __entry->fault_address, __entry->error_code) | ||
211 | ); | ||
212 | |||
213 | /* | ||
214 | * Tracepoint for guest MSR access. | ||
215 | */ | ||
216 | TRACE_EVENT(kvm_msr, | ||
217 | TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), | ||
218 | TP_ARGS(rw, ecx, data), | ||
219 | |||
220 | TP_STRUCT__entry( | ||
221 | __field( unsigned int, rw ) | ||
222 | __field( unsigned int, ecx ) | ||
223 | __field( unsigned long, data ) | ||
224 | ), | ||
225 | |||
226 | TP_fast_assign( | ||
227 | __entry->rw = rw; | ||
228 | __entry->ecx = ecx; | ||
229 | __entry->data = data; | ||
230 | ), | ||
231 | |||
232 | TP_printk("msr_%s %x = 0x%lx", | ||
233 | __entry->rw ? "write" : "read", | ||
234 | __entry->ecx, __entry->data) | ||
235 | ); | ||
236 | |||
237 | #define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) | ||
238 | #define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) | ||
239 | |||
240 | /* | ||
241 | * Tracepoint for guest CR access. | ||
242 | */ | ||
243 | TRACE_EVENT(kvm_cr, | ||
244 | TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), | ||
245 | TP_ARGS(rw, cr, val), | ||
246 | |||
247 | TP_STRUCT__entry( | ||
248 | __field( unsigned int, rw ) | ||
249 | __field( unsigned int, cr ) | ||
250 | __field( unsigned long, val ) | ||
251 | ), | ||
252 | |||
253 | TP_fast_assign( | ||
254 | __entry->rw = rw; | ||
255 | __entry->cr = cr; | ||
256 | __entry->val = val; | ||
257 | ), | ||
258 | |||
259 | TP_printk("cr_%s %x = 0x%lx", | ||
260 | __entry->rw ? "write" : "read", | ||
261 | __entry->cr, __entry->val) | ||
262 | ); | ||
263 | |||
264 | #define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) | ||
265 | #define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) | ||
266 | |||
267 | TRACE_EVENT(kvm_pic_set_irq, | ||
268 | TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), | ||
269 | TP_ARGS(chip, pin, elcr, imr, coalesced), | ||
270 | |||
271 | TP_STRUCT__entry( | ||
272 | __field( __u8, chip ) | ||
273 | __field( __u8, pin ) | ||
274 | __field( __u8, elcr ) | ||
275 | __field( __u8, imr ) | ||
276 | __field( bool, coalesced ) | ||
277 | ), | ||
278 | |||
279 | TP_fast_assign( | ||
280 | __entry->chip = chip; | ||
281 | __entry->pin = pin; | ||
282 | __entry->elcr = elcr; | ||
283 | __entry->imr = imr; | ||
284 | __entry->coalesced = coalesced; | ||
285 | ), | ||
286 | |||
287 | TP_printk("chip %u pin %u (%s%s)%s", | ||
288 | __entry->chip, __entry->pin, | ||
289 | (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", | ||
290 | (__entry->imr & (1 << __entry->pin)) ? "|masked":"", | ||
291 | __entry->coalesced ? " (coalesced)" : "") | ||
292 | ); | ||
293 | |||
294 | #define kvm_apic_dst_shorthand \ | ||
295 | {0x0, "dst"}, \ | ||
296 | {0x1, "self"}, \ | ||
297 | {0x2, "all"}, \ | ||
298 | {0x3, "all-but-self"} | ||
299 | |||
300 | TRACE_EVENT(kvm_apic_ipi, | ||
301 | TP_PROTO(__u32 icr_low, __u32 dest_id), | ||
302 | TP_ARGS(icr_low, dest_id), | ||
303 | |||
304 | TP_STRUCT__entry( | ||
305 | __field( __u32, icr_low ) | ||
306 | __field( __u32, dest_id ) | ||
307 | ), | ||
308 | |||
309 | TP_fast_assign( | ||
310 | __entry->icr_low = icr_low; | ||
311 | __entry->dest_id = dest_id; | ||
312 | ), | ||
313 | |||
314 | TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", | ||
315 | __entry->dest_id, (u8)__entry->icr_low, | ||
316 | __print_symbolic((__entry->icr_low >> 8 & 0x7), | ||
317 | kvm_deliver_mode), | ||
318 | (__entry->icr_low & (1<<11)) ? "logical" : "physical", | ||
319 | (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", | ||
320 | (__entry->icr_low & (1<<15)) ? "level" : "edge", | ||
321 | __print_symbolic((__entry->icr_low >> 18 & 0x3), | ||
322 | kvm_apic_dst_shorthand)) | ||
323 | ); | ||
324 | |||
325 | TRACE_EVENT(kvm_apic_accept_irq, | ||
326 | TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), | ||
327 | TP_ARGS(apicid, dm, tm, vec, coalesced), | ||
328 | |||
329 | TP_STRUCT__entry( | ||
330 | __field( __u32, apicid ) | ||
331 | __field( __u16, dm ) | ||
332 | __field( __u8, tm ) | ||
333 | __field( __u8, vec ) | ||
334 | __field( bool, coalesced ) | ||
335 | ), | ||
336 | |||
337 | TP_fast_assign( | ||
338 | __entry->apicid = apicid; | ||
339 | __entry->dm = dm; | ||
340 | __entry->tm = tm; | ||
341 | __entry->vec = vec; | ||
342 | __entry->coalesced = coalesced; | ||
343 | ), | ||
344 | |||
345 | TP_printk("apicid %x vec %u (%s|%s)%s", | ||
346 | __entry->apicid, __entry->vec, | ||
347 | __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), | ||
348 | __entry->tm ? "level" : "edge", | ||
349 | __entry->coalesced ? " (coalesced)" : "") | ||
350 | ); | ||
351 | |||
352 | #endif /* _TRACE_KVM_H */ | ||
353 | |||
354 | /* This part must be outside protection */ | ||
355 | #include <trace/define_trace.h> | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f912927a5..f3812014bd0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/highmem.h> | 25 | #include <linux/highmem.h> |
26 | #include <linux/sched.h> | 26 | #include <linux/sched.h> |
27 | #include <linux/moduleparam.h> | 27 | #include <linux/moduleparam.h> |
28 | #include <linux/ftrace_event.h> | ||
28 | #include "kvm_cache_regs.h" | 29 | #include "kvm_cache_regs.h" |
29 | #include "x86.h" | 30 | #include "x86.h" |
30 | 31 | ||
@@ -34,6 +35,8 @@ | |||
34 | #include <asm/virtext.h> | 35 | #include <asm/virtext.h> |
35 | #include <asm/mce.h> | 36 | #include <asm/mce.h> |
36 | 37 | ||
38 | #include "trace.h" | ||
39 | |||
37 | #define __ex(x) __kvm_handle_fault_on_reboot(x) | 40 | #define __ex(x) __kvm_handle_fault_on_reboot(x) |
38 | 41 | ||
39 | MODULE_AUTHOR("Qumranet"); | 42 | MODULE_AUTHOR("Qumranet"); |
@@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); | |||
51 | static int __read_mostly enable_ept = 1; | 54 | static int __read_mostly enable_ept = 1; |
52 | module_param_named(ept, enable_ept, bool, S_IRUGO); | 55 | module_param_named(ept, enable_ept, bool, S_IRUGO); |
53 | 56 | ||
57 | static int __read_mostly enable_unrestricted_guest = 1; | ||
58 | module_param_named(unrestricted_guest, | ||
59 | enable_unrestricted_guest, bool, S_IRUGO); | ||
60 | |||
54 | static int __read_mostly emulate_invalid_guest_state = 0; | 61 | static int __read_mostly emulate_invalid_guest_state = 0; |
55 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); | 62 | module_param(emulate_invalid_guest_state, bool, S_IRUGO); |
56 | 63 | ||
@@ -84,6 +91,14 @@ struct vcpu_vmx { | |||
84 | int guest_efer_loaded; | 91 | int guest_efer_loaded; |
85 | } host_state; | 92 | } host_state; |
86 | struct { | 93 | struct { |
94 | int vm86_active; | ||
95 | u8 save_iopl; | ||
96 | struct kvm_save_segment { | ||
97 | u16 selector; | ||
98 | unsigned long base; | ||
99 | u32 limit; | ||
100 | u32 ar; | ||
101 | } tr, es, ds, fs, gs; | ||
87 | struct { | 102 | struct { |
88 | bool pending; | 103 | bool pending; |
89 | u8 vector; | 104 | u8 vector; |
@@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field { | |||
161 | VMX_SEGMENT_FIELD(LDTR), | 176 | VMX_SEGMENT_FIELD(LDTR), |
162 | }; | 177 | }; |
163 | 178 | ||
179 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu); | ||
180 | |||
164 | /* | 181 | /* |
165 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it | 182 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it |
166 | * away by decrementing the array size. | 183 | * away by decrementing the array size. |
@@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void) | |||
256 | cpu_has_vmx_virtualize_apic_accesses(); | 273 | cpu_has_vmx_virtualize_apic_accesses(); |
257 | } | 274 | } |
258 | 275 | ||
276 | static inline bool cpu_has_vmx_ept_execute_only(void) | ||
277 | { | ||
278 | return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); | ||
279 | } | ||
280 | |||
281 | static inline bool cpu_has_vmx_eptp_uncacheable(void) | ||
282 | { | ||
283 | return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); | ||
284 | } | ||
285 | |||
286 | static inline bool cpu_has_vmx_eptp_writeback(void) | ||
287 | { | ||
288 | return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); | ||
289 | } | ||
290 | |||
291 | static inline bool cpu_has_vmx_ept_2m_page(void) | ||
292 | { | ||
293 | return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); | ||
294 | } | ||
295 | |||
259 | static inline int cpu_has_vmx_invept_individual_addr(void) | 296 | static inline int cpu_has_vmx_invept_individual_addr(void) |
260 | { | 297 | { |
261 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); | 298 | return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); |
@@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void) | |||
277 | SECONDARY_EXEC_ENABLE_EPT; | 314 | SECONDARY_EXEC_ENABLE_EPT; |
278 | } | 315 | } |
279 | 316 | ||
317 | static inline int cpu_has_vmx_unrestricted_guest(void) | ||
318 | { | ||
319 | return vmcs_config.cpu_based_2nd_exec_ctrl & | ||
320 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
321 | } | ||
322 | |||
280 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | 323 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) |
281 | { | 324 | { |
282 | return flexpriority_enabled && | 325 | return flexpriority_enabled && |
@@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
497 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); | 540 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); |
498 | if (!vcpu->fpu_active) | 541 | if (!vcpu->fpu_active) |
499 | eb |= 1u << NM_VECTOR; | 542 | eb |= 1u << NM_VECTOR; |
543 | /* | ||
544 | * Unconditionally intercept #DB so we can maintain dr6 without | ||
545 | * reading it every exit. | ||
546 | */ | ||
547 | eb |= 1u << DB_VECTOR; | ||
500 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { | 548 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
501 | if (vcpu->guest_debug & | ||
502 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) | ||
503 | eb |= 1u << DB_VECTOR; | ||
504 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) | 549 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
505 | eb |= 1u << BP_VECTOR; | 550 | eb |= 1u << BP_VECTOR; |
506 | } | 551 | } |
507 | if (vcpu->arch.rmode.vm86_active) | 552 | if (to_vmx(vcpu)->rmode.vm86_active) |
508 | eb = ~0; | 553 | eb = ~0; |
509 | if (enable_ept) | 554 | if (enable_ept) |
510 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ | 555 | eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ |
@@ -528,12 +573,15 @@ static void reload_tss(void) | |||
528 | static void load_transition_efer(struct vcpu_vmx *vmx) | 573 | static void load_transition_efer(struct vcpu_vmx *vmx) |
529 | { | 574 | { |
530 | int efer_offset = vmx->msr_offset_efer; | 575 | int efer_offset = vmx->msr_offset_efer; |
531 | u64 host_efer = vmx->host_msrs[efer_offset].data; | 576 | u64 host_efer; |
532 | u64 guest_efer = vmx->guest_msrs[efer_offset].data; | 577 | u64 guest_efer; |
533 | u64 ignore_bits; | 578 | u64 ignore_bits; |
534 | 579 | ||
535 | if (efer_offset < 0) | 580 | if (efer_offset < 0) |
536 | return; | 581 | return; |
582 | host_efer = vmx->host_msrs[efer_offset].data; | ||
583 | guest_efer = vmx->guest_msrs[efer_offset].data; | ||
584 | |||
537 | /* | 585 | /* |
538 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless | 586 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless |
539 | * outside long mode | 587 | * outside long mode |
@@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | |||
735 | 783 | ||
736 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | 784 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) |
737 | { | 785 | { |
738 | return vmcs_readl(GUEST_RFLAGS); | 786 | unsigned long rflags; |
787 | |||
788 | rflags = vmcs_readl(GUEST_RFLAGS); | ||
789 | if (to_vmx(vcpu)->rmode.vm86_active) | ||
790 | rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | ||
791 | return rflags; | ||
739 | } | 792 | } |
740 | 793 | ||
741 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 794 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
742 | { | 795 | { |
743 | if (vcpu->arch.rmode.vm86_active) | 796 | if (to_vmx(vcpu)->rmode.vm86_active) |
744 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 797 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
745 | vmcs_writel(GUEST_RFLAGS, rflags); | 798 | vmcs_writel(GUEST_RFLAGS, rflags); |
746 | } | 799 | } |
@@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
797 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; | 850 | intr_info |= INTR_INFO_DELIVER_CODE_MASK; |
798 | } | 851 | } |
799 | 852 | ||
800 | if (vcpu->arch.rmode.vm86_active) { | 853 | if (vmx->rmode.vm86_active) { |
801 | vmx->rmode.irq.pending = true; | 854 | vmx->rmode.irq.pending = true; |
802 | vmx->rmode.irq.vector = nr; | 855 | vmx->rmode.irq.vector = nr; |
803 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 856 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
804 | if (nr == BP_VECTOR || nr == OF_VECTOR) | 857 | if (kvm_exception_is_soft(nr)) |
805 | vmx->rmode.irq.rip++; | 858 | vmx->rmode.irq.rip += |
859 | vmx->vcpu.arch.event_exit_inst_len; | ||
806 | intr_info |= INTR_TYPE_SOFT_INTR; | 860 | intr_info |= INTR_TYPE_SOFT_INTR; |
807 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); | 861 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); |
808 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 862 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
@@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
940 | case MSR_EFER: | 994 | case MSR_EFER: |
941 | return kvm_get_msr_common(vcpu, msr_index, pdata); | 995 | return kvm_get_msr_common(vcpu, msr_index, pdata); |
942 | #endif | 996 | #endif |
943 | case MSR_IA32_TIME_STAMP_COUNTER: | 997 | case MSR_IA32_TSC: |
944 | data = guest_read_tsc(); | 998 | data = guest_read_tsc(); |
945 | break; | 999 | break; |
946 | case MSR_IA32_SYSENTER_CS: | 1000 | case MSR_IA32_SYSENTER_CS: |
@@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | |||
953 | data = vmcs_readl(GUEST_SYSENTER_ESP); | 1007 | data = vmcs_readl(GUEST_SYSENTER_ESP); |
954 | break; | 1008 | break; |
955 | default: | 1009 | default: |
956 | vmx_load_host_state(to_vmx(vcpu)); | ||
957 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | 1010 | msr = find_msr_entry(to_vmx(vcpu), msr_index); |
958 | if (msr) { | 1011 | if (msr) { |
1012 | vmx_load_host_state(to_vmx(vcpu)); | ||
959 | data = msr->data; | 1013 | data = msr->data; |
960 | break; | 1014 | break; |
961 | } | 1015 | } |
@@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1000 | case MSR_IA32_SYSENTER_ESP: | 1054 | case MSR_IA32_SYSENTER_ESP: |
1001 | vmcs_writel(GUEST_SYSENTER_ESP, data); | 1055 | vmcs_writel(GUEST_SYSENTER_ESP, data); |
1002 | break; | 1056 | break; |
1003 | case MSR_IA32_TIME_STAMP_COUNTER: | 1057 | case MSR_IA32_TSC: |
1004 | rdtscll(host_tsc); | 1058 | rdtscll(host_tsc); |
1005 | guest_write_tsc(data, host_tsc); | 1059 | guest_write_tsc(data, host_tsc); |
1006 | break; | 1060 | break; |
1007 | case MSR_P6_PERFCTR0: | ||
1008 | case MSR_P6_PERFCTR1: | ||
1009 | case MSR_P6_EVNTSEL0: | ||
1010 | case MSR_P6_EVNTSEL1: | ||
1011 | /* | ||
1012 | * Just discard all writes to the performance counters; this | ||
1013 | * should keep both older linux and windows 64-bit guests | ||
1014 | * happy | ||
1015 | */ | ||
1016 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); | ||
1017 | |||
1018 | break; | ||
1019 | case MSR_IA32_CR_PAT: | 1061 | case MSR_IA32_CR_PAT: |
1020 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { | 1062 | if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { |
1021 | vmcs_write64(GUEST_IA32_PAT, data); | 1063 | vmcs_write64(GUEST_IA32_PAT, data); |
@@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
1024 | } | 1066 | } |
1025 | /* Otherwise falls through to kvm_set_msr_common */ | 1067 | /* Otherwise falls through to kvm_set_msr_common */ |
1026 | default: | 1068 | default: |
1027 | vmx_load_host_state(vmx); | ||
1028 | msr = find_msr_entry(vmx, msr_index); | 1069 | msr = find_msr_entry(vmx, msr_index); |
1029 | if (msr) { | 1070 | if (msr) { |
1071 | vmx_load_host_state(vmx); | ||
1030 | msr->data = data; | 1072 | msr->data = data; |
1031 | break; | 1073 | break; |
1032 | } | 1074 | } |
@@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | |||
1046 | case VCPU_REGS_RIP: | 1088 | case VCPU_REGS_RIP: |
1047 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | 1089 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); |
1048 | break; | 1090 | break; |
1091 | case VCPU_EXREG_PDPTR: | ||
1092 | if (enable_ept) | ||
1093 | ept_save_pdptrs(vcpu); | ||
1094 | break; | ||
1049 | default: | 1095 | default: |
1050 | break; | 1096 | break; |
1051 | } | 1097 | } |
@@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1203 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 1249 | opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
1204 | SECONDARY_EXEC_WBINVD_EXITING | | 1250 | SECONDARY_EXEC_WBINVD_EXITING | |
1205 | SECONDARY_EXEC_ENABLE_VPID | | 1251 | SECONDARY_EXEC_ENABLE_VPID | |
1206 | SECONDARY_EXEC_ENABLE_EPT; | 1252 | SECONDARY_EXEC_ENABLE_EPT | |
1253 | SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
1207 | if (adjust_vmx_controls(min2, opt2, | 1254 | if (adjust_vmx_controls(min2, opt2, |
1208 | MSR_IA32_VMX_PROCBASED_CTLS2, | 1255 | MSR_IA32_VMX_PROCBASED_CTLS2, |
1209 | &_cpu_based_2nd_exec_control) < 0) | 1256 | &_cpu_based_2nd_exec_control) < 0) |
@@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1217 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | 1264 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { |
1218 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT | 1265 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT |
1219 | enabled */ | 1266 | enabled */ |
1220 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | | 1267 | _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | |
1221 | CPU_BASED_CR3_STORE_EXITING | | 1268 | CPU_BASED_CR3_STORE_EXITING | |
1222 | CPU_BASED_INVLPG_EXITING); | 1269 | CPU_BASED_INVLPG_EXITING); |
1223 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
1224 | &_cpu_based_exec_control) < 0) | ||
1225 | return -EIO; | ||
1226 | rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, | 1270 | rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, |
1227 | vmx_capability.ept, vmx_capability.vpid); | 1271 | vmx_capability.ept, vmx_capability.vpid); |
1228 | } | 1272 | } |
@@ -1333,8 +1377,13 @@ static __init int hardware_setup(void) | |||
1333 | if (!cpu_has_vmx_vpid()) | 1377 | if (!cpu_has_vmx_vpid()) |
1334 | enable_vpid = 0; | 1378 | enable_vpid = 0; |
1335 | 1379 | ||
1336 | if (!cpu_has_vmx_ept()) | 1380 | if (!cpu_has_vmx_ept()) { |
1337 | enable_ept = 0; | 1381 | enable_ept = 0; |
1382 | enable_unrestricted_guest = 0; | ||
1383 | } | ||
1384 | |||
1385 | if (!cpu_has_vmx_unrestricted_guest()) | ||
1386 | enable_unrestricted_guest = 0; | ||
1338 | 1387 | ||
1339 | if (!cpu_has_vmx_flexpriority()) | 1388 | if (!cpu_has_vmx_flexpriority()) |
1340 | flexpriority_enabled = 0; | 1389 | flexpriority_enabled = 0; |
@@ -1342,6 +1391,9 @@ static __init int hardware_setup(void) | |||
1342 | if (!cpu_has_vmx_tpr_shadow()) | 1391 | if (!cpu_has_vmx_tpr_shadow()) |
1343 | kvm_x86_ops->update_cr8_intercept = NULL; | 1392 | kvm_x86_ops->update_cr8_intercept = NULL; |
1344 | 1393 | ||
1394 | if (enable_ept && !cpu_has_vmx_ept_2m_page()) | ||
1395 | kvm_disable_largepages(); | ||
1396 | |||
1345 | return alloc_kvm_area(); | 1397 | return alloc_kvm_area(); |
1346 | } | 1398 | } |
1347 | 1399 | ||
@@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1372 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1424 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1373 | 1425 | ||
1374 | vmx->emulation_required = 1; | 1426 | vmx->emulation_required = 1; |
1375 | vcpu->arch.rmode.vm86_active = 0; | 1427 | vmx->rmode.vm86_active = 0; |
1376 | 1428 | ||
1377 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | 1429 | vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); |
1378 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); | 1430 | vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); |
1379 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); | 1431 | vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); |
1380 | 1432 | ||
1381 | flags = vmcs_readl(GUEST_RFLAGS); | 1433 | flags = vmcs_readl(GUEST_RFLAGS); |
1382 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | 1434 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); |
1383 | flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); | 1435 | flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); |
1384 | vmcs_writel(GUEST_RFLAGS, flags); | 1436 | vmcs_writel(GUEST_RFLAGS, flags); |
1385 | 1437 | ||
1386 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | 1438 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | |
@@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1391 | if (emulate_invalid_guest_state) | 1443 | if (emulate_invalid_guest_state) |
1392 | return; | 1444 | return; |
1393 | 1445 | ||
1394 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1446 | fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); |
1395 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1447 | fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); |
1396 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1448 | fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); |
1397 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1449 | fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); |
1398 | 1450 | ||
1399 | vmcs_write16(GUEST_SS_SELECTOR, 0); | 1451 | vmcs_write16(GUEST_SS_SELECTOR, 0); |
1400 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | 1452 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); |
@@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1433 | unsigned long flags; | 1485 | unsigned long flags; |
1434 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1486 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1435 | 1487 | ||
1488 | if (enable_unrestricted_guest) | ||
1489 | return; | ||
1490 | |||
1436 | vmx->emulation_required = 1; | 1491 | vmx->emulation_required = 1; |
1437 | vcpu->arch.rmode.vm86_active = 1; | 1492 | vmx->rmode.vm86_active = 1; |
1438 | 1493 | ||
1439 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1494 | vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
1440 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 1495 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
1441 | 1496 | ||
1442 | vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | 1497 | vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); |
1443 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | 1498 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); |
1444 | 1499 | ||
1445 | vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | 1500 | vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); |
1446 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | 1501 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
1447 | 1502 | ||
1448 | flags = vmcs_readl(GUEST_RFLAGS); | 1503 | flags = vmcs_readl(GUEST_RFLAGS); |
1449 | vcpu->arch.rmode.save_iopl | 1504 | vmx->rmode.save_iopl |
1450 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1505 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; |
1451 | 1506 | ||
1452 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 1507 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
@@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1468 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | 1523 | vmcs_writel(GUEST_CS_BASE, 0xf0000); |
1469 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | 1524 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); |
1470 | 1525 | ||
1471 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1526 | fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); |
1472 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1527 | fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); |
1473 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1528 | fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); |
1474 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1529 | fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); |
1475 | 1530 | ||
1476 | continue_rmode: | 1531 | continue_rmode: |
1477 | kvm_mmu_reset_context(vcpu); | 1532 | kvm_mmu_reset_context(vcpu); |
@@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | |||
1545 | 1600 | ||
1546 | static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | 1601 | static void ept_load_pdptrs(struct kvm_vcpu *vcpu) |
1547 | { | 1602 | { |
1603 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
1604 | (unsigned long *)&vcpu->arch.regs_dirty)) | ||
1605 | return; | ||
1606 | |||
1548 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | 1607 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { |
1549 | if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
1550 | printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); | ||
1551 | return; | ||
1552 | } | ||
1553 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); | 1608 | vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); |
1554 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); | 1609 | vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); |
1555 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); | 1610 | vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); |
@@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) | |||
1557 | } | 1612 | } |
1558 | } | 1613 | } |
1559 | 1614 | ||
1615 | static void ept_save_pdptrs(struct kvm_vcpu *vcpu) | ||
1616 | { | ||
1617 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
1618 | vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); | ||
1619 | vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); | ||
1620 | vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); | ||
1621 | vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); | ||
1622 | } | ||
1623 | |||
1624 | __set_bit(VCPU_EXREG_PDPTR, | ||
1625 | (unsigned long *)&vcpu->arch.regs_avail); | ||
1626 | __set_bit(VCPU_EXREG_PDPTR, | ||
1627 | (unsigned long *)&vcpu->arch.regs_dirty); | ||
1628 | } | ||
1629 | |||
1560 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | 1630 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); |
1561 | 1631 | ||
1562 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | 1632 | static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, |
@@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
1571 | CPU_BASED_CR3_STORE_EXITING)); | 1641 | CPU_BASED_CR3_STORE_EXITING)); |
1572 | vcpu->arch.cr0 = cr0; | 1642 | vcpu->arch.cr0 = cr0; |
1573 | vmx_set_cr4(vcpu, vcpu->arch.cr4); | 1643 | vmx_set_cr4(vcpu, vcpu->arch.cr4); |
1574 | *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; | ||
1575 | *hw_cr0 &= ~X86_CR0_WP; | ||
1576 | } else if (!is_paging(vcpu)) { | 1644 | } else if (!is_paging(vcpu)) { |
1577 | /* From nonpaging to paging */ | 1645 | /* From nonpaging to paging */ |
1578 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | 1646 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, |
@@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, | |||
1581 | CPU_BASED_CR3_STORE_EXITING)); | 1649 | CPU_BASED_CR3_STORE_EXITING)); |
1582 | vcpu->arch.cr0 = cr0; | 1650 | vcpu->arch.cr0 = cr0; |
1583 | vmx_set_cr4(vcpu, vcpu->arch.cr4); | 1651 | vmx_set_cr4(vcpu, vcpu->arch.cr4); |
1584 | if (!(vcpu->arch.cr0 & X86_CR0_WP)) | ||
1585 | *hw_cr0 &= ~X86_CR0_WP; | ||
1586 | } | 1652 | } |
1653 | |||
1654 | if (!(cr0 & X86_CR0_WP)) | ||
1655 | *hw_cr0 &= ~X86_CR0_WP; | ||
1587 | } | 1656 | } |
1588 | 1657 | ||
1589 | static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, | 1658 | static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, |
@@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, | |||
1598 | 1667 | ||
1599 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 1668 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
1600 | { | 1669 | { |
1601 | unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | | 1670 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1602 | KVM_VM_CR0_ALWAYS_ON; | 1671 | unsigned long hw_cr0; |
1672 | |||
1673 | if (enable_unrestricted_guest) | ||
1674 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) | ||
1675 | | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; | ||
1676 | else | ||
1677 | hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; | ||
1603 | 1678 | ||
1604 | vmx_fpu_deactivate(vcpu); | 1679 | vmx_fpu_deactivate(vcpu); |
1605 | 1680 | ||
1606 | if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) | 1681 | if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) |
1607 | enter_pmode(vcpu); | 1682 | enter_pmode(vcpu); |
1608 | 1683 | ||
1609 | if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) | 1684 | if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) |
1610 | enter_rmode(vcpu); | 1685 | enter_rmode(vcpu); |
1611 | 1686 | ||
1612 | #ifdef CONFIG_X86_64 | 1687 | #ifdef CONFIG_X86_64 |
@@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1650 | if (enable_ept) { | 1725 | if (enable_ept) { |
1651 | eptp = construct_eptp(cr3); | 1726 | eptp = construct_eptp(cr3); |
1652 | vmcs_write64(EPT_POINTER, eptp); | 1727 | vmcs_write64(EPT_POINTER, eptp); |
1653 | ept_sync_context(eptp); | ||
1654 | ept_load_pdptrs(vcpu); | ||
1655 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : | 1728 | guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : |
1656 | VMX_EPT_IDENTITY_PAGETABLE_ADDR; | 1729 | vcpu->kvm->arch.ept_identity_map_addr; |
1657 | } | 1730 | } |
1658 | 1731 | ||
1659 | vmx_flush_tlb(vcpu); | 1732 | vmx_flush_tlb(vcpu); |
@@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
1664 | 1737 | ||
1665 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1738 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1666 | { | 1739 | { |
1667 | unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? | 1740 | unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? |
1668 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); | 1741 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); |
1669 | 1742 | ||
1670 | vcpu->arch.cr4 = cr4; | 1743 | vcpu->arch.cr4 = cr4; |
@@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
1707 | 1780 | ||
1708 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | 1781 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) |
1709 | { | 1782 | { |
1710 | struct kvm_segment kvm_seg; | ||
1711 | |||
1712 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ | 1783 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ |
1713 | return 0; | 1784 | return 0; |
1714 | 1785 | ||
1715 | if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ | 1786 | if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ |
1716 | return 3; | 1787 | return 3; |
1717 | 1788 | ||
1718 | vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); | 1789 | return vmcs_read16(GUEST_CS_SELECTOR) & 3; |
1719 | return kvm_seg.selector & 3; | ||
1720 | } | 1790 | } |
1721 | 1791 | ||
1722 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | 1792 | static u32 vmx_segment_access_rights(struct kvm_segment *var) |
@@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) | |||
1744 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | 1814 | static void vmx_set_segment(struct kvm_vcpu *vcpu, |
1745 | struct kvm_segment *var, int seg) | 1815 | struct kvm_segment *var, int seg) |
1746 | { | 1816 | { |
1817 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1747 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 1818 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
1748 | u32 ar; | 1819 | u32 ar; |
1749 | 1820 | ||
1750 | if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { | 1821 | if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { |
1751 | vcpu->arch.rmode.tr.selector = var->selector; | 1822 | vmx->rmode.tr.selector = var->selector; |
1752 | vcpu->arch.rmode.tr.base = var->base; | 1823 | vmx->rmode.tr.base = var->base; |
1753 | vcpu->arch.rmode.tr.limit = var->limit; | 1824 | vmx->rmode.tr.limit = var->limit; |
1754 | vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); | 1825 | vmx->rmode.tr.ar = vmx_segment_access_rights(var); |
1755 | return; | 1826 | return; |
1756 | } | 1827 | } |
1757 | vmcs_writel(sf->base, var->base); | 1828 | vmcs_writel(sf->base, var->base); |
1758 | vmcs_write32(sf->limit, var->limit); | 1829 | vmcs_write32(sf->limit, var->limit); |
1759 | vmcs_write16(sf->selector, var->selector); | 1830 | vmcs_write16(sf->selector, var->selector); |
1760 | if (vcpu->arch.rmode.vm86_active && var->s) { | 1831 | if (vmx->rmode.vm86_active && var->s) { |
1761 | /* | 1832 | /* |
1762 | * Hack real-mode segments into vm86 compatibility. | 1833 | * Hack real-mode segments into vm86 compatibility. |
1763 | */ | 1834 | */ |
@@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
1766 | ar = 0xf3; | 1837 | ar = 0xf3; |
1767 | } else | 1838 | } else |
1768 | ar = vmx_segment_access_rights(var); | 1839 | ar = vmx_segment_access_rights(var); |
1840 | |||
1841 | /* | ||
1842 | * Fix the "Accessed" bit in AR field of segment registers for older | ||
1843 | * qemu binaries. | ||
1844 | * IA32 arch specifies that at the time of processor reset the | ||
1845 | * "Accessed" bit in the AR field of segment registers is 1. And qemu | ||
1846 | * is setting it to 0 in the usedland code. This causes invalid guest | ||
1847 | * state vmexit when "unrestricted guest" mode is turned on. | ||
1848 | * Fix for this setup issue in cpu_reset is being pushed in the qemu | ||
1849 | * tree. Newer qemu binaries with that qemu fix would not need this | ||
1850 | * kvm hack. | ||
1851 | */ | ||
1852 | if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) | ||
1853 | ar |= 0x1; /* Accessed */ | ||
1854 | |||
1769 | vmcs_write32(sf->ar_bytes, ar); | 1855 | vmcs_write32(sf->ar_bytes, ar); |
1770 | } | 1856 | } |
1771 | 1857 | ||
@@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm) | |||
2040 | if (likely(kvm->arch.ept_identity_pagetable_done)) | 2126 | if (likely(kvm->arch.ept_identity_pagetable_done)) |
2041 | return 1; | 2127 | return 1; |
2042 | ret = 0; | 2128 | ret = 0; |
2043 | identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; | 2129 | identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; |
2044 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); | 2130 | r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); |
2045 | if (r < 0) | 2131 | if (r < 0) |
2046 | goto out; | 2132 | goto out; |
@@ -2062,11 +2148,19 @@ out: | |||
2062 | static void seg_setup(int seg) | 2148 | static void seg_setup(int seg) |
2063 | { | 2149 | { |
2064 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 2150 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
2151 | unsigned int ar; | ||
2065 | 2152 | ||
2066 | vmcs_write16(sf->selector, 0); | 2153 | vmcs_write16(sf->selector, 0); |
2067 | vmcs_writel(sf->base, 0); | 2154 | vmcs_writel(sf->base, 0); |
2068 | vmcs_write32(sf->limit, 0xffff); | 2155 | vmcs_write32(sf->limit, 0xffff); |
2069 | vmcs_write32(sf->ar_bytes, 0xf3); | 2156 | if (enable_unrestricted_guest) { |
2157 | ar = 0x93; | ||
2158 | if (seg == VCPU_SREG_CS) | ||
2159 | ar |= 0x08; /* code segment */ | ||
2160 | } else | ||
2161 | ar = 0xf3; | ||
2162 | |||
2163 | vmcs_write32(sf->ar_bytes, ar); | ||
2070 | } | 2164 | } |
2071 | 2165 | ||
2072 | static int alloc_apic_access_page(struct kvm *kvm) | 2166 | static int alloc_apic_access_page(struct kvm *kvm) |
@@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) | |||
2101 | goto out; | 2195 | goto out; |
2102 | kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; | 2196 | kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; |
2103 | kvm_userspace_mem.flags = 0; | 2197 | kvm_userspace_mem.flags = 0; |
2104 | kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; | 2198 | kvm_userspace_mem.guest_phys_addr = |
2199 | kvm->arch.ept_identity_map_addr; | ||
2105 | kvm_userspace_mem.memory_size = PAGE_SIZE; | 2200 | kvm_userspace_mem.memory_size = PAGE_SIZE; |
2106 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | 2201 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); |
2107 | if (r) | 2202 | if (r) |
2108 | goto out; | 2203 | goto out; |
2109 | 2204 | ||
2110 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, | 2205 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, |
2111 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); | 2206 | kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); |
2112 | out: | 2207 | out: |
2113 | up_write(&kvm->slots_lock); | 2208 | up_write(&kvm->slots_lock); |
2114 | return r; | 2209 | return r; |
@@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
2209 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | 2304 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; |
2210 | if (!enable_ept) | 2305 | if (!enable_ept) |
2211 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; | 2306 | exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; |
2307 | if (!enable_unrestricted_guest) | ||
2308 | exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; | ||
2212 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 2309 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
2213 | } | 2310 | } |
2214 | 2311 | ||
@@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2326 | goto out; | 2423 | goto out; |
2327 | } | 2424 | } |
2328 | 2425 | ||
2329 | vmx->vcpu.arch.rmode.vm86_active = 0; | 2426 | vmx->rmode.vm86_active = 0; |
2330 | 2427 | ||
2331 | vmx->soft_vnmi_blocked = 0; | 2428 | vmx->soft_vnmi_blocked = 0; |
2332 | 2429 | ||
2333 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | 2430 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
2334 | kvm_set_cr8(&vmx->vcpu, 0); | 2431 | kvm_set_cr8(&vmx->vcpu, 0); |
2335 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 2432 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
2336 | if (vmx->vcpu.vcpu_id == 0) | 2433 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) |
2337 | msr |= MSR_IA32_APICBASE_BSP; | 2434 | msr |= MSR_IA32_APICBASE_BSP; |
2338 | kvm_set_apic_base(&vmx->vcpu, msr); | 2435 | kvm_set_apic_base(&vmx->vcpu, msr); |
2339 | 2436 | ||
@@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2344 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | 2441 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
2345 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | 2442 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. |
2346 | */ | 2443 | */ |
2347 | if (vmx->vcpu.vcpu_id == 0) { | 2444 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) { |
2348 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | 2445 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); |
2349 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | 2446 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); |
2350 | } else { | 2447 | } else { |
@@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2373 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | 2470 | vmcs_writel(GUEST_SYSENTER_EIP, 0); |
2374 | 2471 | ||
2375 | vmcs_writel(GUEST_RFLAGS, 0x02); | 2472 | vmcs_writel(GUEST_RFLAGS, 0x02); |
2376 | if (vmx->vcpu.vcpu_id == 0) | 2473 | if (kvm_vcpu_is_bsp(&vmx->vcpu)) |
2377 | kvm_rip_write(vcpu, 0xfff0); | 2474 | kvm_rip_write(vcpu, 0xfff0); |
2378 | else | 2475 | else |
2379 | kvm_rip_write(vcpu, 0); | 2476 | kvm_rip_write(vcpu, 0); |
@@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) | |||
2461 | uint32_t intr; | 2558 | uint32_t intr; |
2462 | int irq = vcpu->arch.interrupt.nr; | 2559 | int irq = vcpu->arch.interrupt.nr; |
2463 | 2560 | ||
2464 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | 2561 | trace_kvm_inj_virq(irq); |
2465 | 2562 | ||
2466 | ++vcpu->stat.irq_injections; | 2563 | ++vcpu->stat.irq_injections; |
2467 | if (vcpu->arch.rmode.vm86_active) { | 2564 | if (vmx->rmode.vm86_active) { |
2468 | vmx->rmode.irq.pending = true; | 2565 | vmx->rmode.irq.pending = true; |
2469 | vmx->rmode.irq.vector = irq; | 2566 | vmx->rmode.irq.vector = irq; |
2470 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 2567 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
2568 | if (vcpu->arch.interrupt.soft) | ||
2569 | vmx->rmode.irq.rip += | ||
2570 | vmx->vcpu.arch.event_exit_inst_len; | ||
2471 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2571 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2472 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | 2572 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); |
2473 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 2573 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
@@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2502 | } | 2602 | } |
2503 | 2603 | ||
2504 | ++vcpu->stat.nmi_injections; | 2604 | ++vcpu->stat.nmi_injections; |
2505 | if (vcpu->arch.rmode.vm86_active) { | 2605 | if (vmx->rmode.vm86_active) { |
2506 | vmx->rmode.irq.pending = true; | 2606 | vmx->rmode.irq.pending = true; |
2507 | vmx->rmode.irq.vector = NMI_VECTOR; | 2607 | vmx->rmode.irq.vector = NMI_VECTOR; |
2508 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | 2608 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
@@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2659 | if (enable_ept) | 2759 | if (enable_ept) |
2660 | BUG(); | 2760 | BUG(); |
2661 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 2761 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
2662 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | 2762 | trace_kvm_page_fault(cr2, error_code); |
2663 | (u32)((u64)cr2 >> 32), handler); | 2763 | |
2664 | if (kvm_event_needs_reinjection(vcpu)) | 2764 | if (kvm_event_needs_reinjection(vcpu)) |
2665 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 2765 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
2666 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2766 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
2667 | } | 2767 | } |
2668 | 2768 | ||
2669 | if (vcpu->arch.rmode.vm86_active && | 2769 | if (vmx->rmode.vm86_active && |
2670 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | 2770 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, |
2671 | error_code)) { | 2771 | error_code)) { |
2672 | if (vcpu->arch.halt_request) { | 2772 | if (vcpu->arch.halt_request) { |
@@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, | |||
2707 | struct kvm_run *kvm_run) | 2807 | struct kvm_run *kvm_run) |
2708 | { | 2808 | { |
2709 | ++vcpu->stat.irq_exits; | 2809 | ++vcpu->stat.irq_exits; |
2710 | KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); | ||
2711 | return 1; | 2810 | return 1; |
2712 | } | 2811 | } |
2713 | 2812 | ||
@@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
2755 | 2854 | ||
2756 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2855 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2757 | { | 2856 | { |
2758 | unsigned long exit_qualification; | 2857 | unsigned long exit_qualification, val; |
2759 | int cr; | 2858 | int cr; |
2760 | int reg; | 2859 | int reg; |
2761 | 2860 | ||
@@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2764 | reg = (exit_qualification >> 8) & 15; | 2863 | reg = (exit_qualification >> 8) & 15; |
2765 | switch ((exit_qualification >> 4) & 3) { | 2864 | switch ((exit_qualification >> 4) & 3) { |
2766 | case 0: /* mov to cr */ | 2865 | case 0: /* mov to cr */ |
2767 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, | 2866 | val = kvm_register_read(vcpu, reg); |
2768 | (u32)kvm_register_read(vcpu, reg), | 2867 | trace_kvm_cr_write(cr, val); |
2769 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
2770 | handler); | ||
2771 | switch (cr) { | 2868 | switch (cr) { |
2772 | case 0: | 2869 | case 0: |
2773 | kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); | 2870 | kvm_set_cr0(vcpu, val); |
2774 | skip_emulated_instruction(vcpu); | 2871 | skip_emulated_instruction(vcpu); |
2775 | return 1; | 2872 | return 1; |
2776 | case 3: | 2873 | case 3: |
2777 | kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); | 2874 | kvm_set_cr3(vcpu, val); |
2778 | skip_emulated_instruction(vcpu); | 2875 | skip_emulated_instruction(vcpu); |
2779 | return 1; | 2876 | return 1; |
2780 | case 4: | 2877 | case 4: |
2781 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); | 2878 | kvm_set_cr4(vcpu, val); |
2782 | skip_emulated_instruction(vcpu); | 2879 | skip_emulated_instruction(vcpu); |
2783 | return 1; | 2880 | return 1; |
2784 | case 8: { | 2881 | case 8: { |
@@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2800 | vcpu->arch.cr0 &= ~X86_CR0_TS; | 2897 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
2801 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 2898 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
2802 | vmx_fpu_activate(vcpu); | 2899 | vmx_fpu_activate(vcpu); |
2803 | KVMTRACE_0D(CLTS, vcpu, handler); | ||
2804 | skip_emulated_instruction(vcpu); | 2900 | skip_emulated_instruction(vcpu); |
2805 | return 1; | 2901 | return 1; |
2806 | case 1: /*mov from cr*/ | 2902 | case 1: /*mov from cr*/ |
2807 | switch (cr) { | 2903 | switch (cr) { |
2808 | case 3: | 2904 | case 3: |
2809 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); | 2905 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); |
2810 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, | 2906 | trace_kvm_cr_read(cr, vcpu->arch.cr3); |
2811 | (u32)kvm_register_read(vcpu, reg), | ||
2812 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
2813 | handler); | ||
2814 | skip_emulated_instruction(vcpu); | 2907 | skip_emulated_instruction(vcpu); |
2815 | return 1; | 2908 | return 1; |
2816 | case 8: | 2909 | case 8: |
2817 | kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); | 2910 | val = kvm_get_cr8(vcpu); |
2818 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, | 2911 | kvm_register_write(vcpu, reg, val); |
2819 | (u32)kvm_register_read(vcpu, reg), handler); | 2912 | trace_kvm_cr_read(cr, val); |
2820 | skip_emulated_instruction(vcpu); | 2913 | skip_emulated_instruction(vcpu); |
2821 | return 1; | 2914 | return 1; |
2822 | } | 2915 | } |
@@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2841 | unsigned long val; | 2934 | unsigned long val; |
2842 | int dr, reg; | 2935 | int dr, reg; |
2843 | 2936 | ||
2937 | if (!kvm_require_cpl(vcpu, 0)) | ||
2938 | return 1; | ||
2844 | dr = vmcs_readl(GUEST_DR7); | 2939 | dr = vmcs_readl(GUEST_DR7); |
2845 | if (dr & DR7_GD) { | 2940 | if (dr & DR7_GD) { |
2846 | /* | 2941 | /* |
@@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2884 | val = 0; | 2979 | val = 0; |
2885 | } | 2980 | } |
2886 | kvm_register_write(vcpu, reg, val); | 2981 | kvm_register_write(vcpu, reg, val); |
2887 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | ||
2888 | } else { | 2982 | } else { |
2889 | val = vcpu->arch.regs[reg]; | 2983 | val = vcpu->arch.regs[reg]; |
2890 | switch (dr) { | 2984 | switch (dr) { |
@@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2917 | } | 3011 | } |
2918 | break; | 3012 | break; |
2919 | } | 3013 | } |
2920 | KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); | ||
2921 | } | 3014 | } |
2922 | skip_emulated_instruction(vcpu); | 3015 | skip_emulated_instruction(vcpu); |
2923 | return 1; | 3016 | return 1; |
@@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2939 | return 1; | 3032 | return 1; |
2940 | } | 3033 | } |
2941 | 3034 | ||
2942 | KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), | 3035 | trace_kvm_msr_read(ecx, data); |
2943 | handler); | ||
2944 | 3036 | ||
2945 | /* FIXME: handling of bits 32:63 of rax, rdx */ | 3037 | /* FIXME: handling of bits 32:63 of rax, rdx */ |
2946 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; | 3038 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; |
@@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2955 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | 3047 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) |
2956 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 3048 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
2957 | 3049 | ||
2958 | KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), | 3050 | trace_kvm_msr_write(ecx, data); |
2959 | handler); | ||
2960 | 3051 | ||
2961 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | 3052 | if (vmx_set_msr(vcpu, ecx, data) != 0) { |
2962 | kvm_inject_gp(vcpu, 0); | 3053 | kvm_inject_gp(vcpu, 0); |
@@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
2983 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | 3074 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; |
2984 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 3075 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
2985 | 3076 | ||
2986 | KVMTRACE_0D(PEND_INTR, vcpu, handler); | ||
2987 | ++vcpu->stat.irq_window_exits; | 3077 | ++vcpu->stat.irq_window_exits; |
2988 | 3078 | ||
2989 | /* | 3079 | /* |
@@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3049 | printk(KERN_ERR | 3139 | printk(KERN_ERR |
3050 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | 3140 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", |
3051 | offset); | 3141 | offset); |
3052 | return -ENOTSUPP; | 3142 | return -ENOEXEC; |
3053 | } | 3143 | } |
3054 | return 1; | 3144 | return 1; |
3055 | } | 3145 | } |
@@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3118 | 3208 | ||
3119 | if (exit_qualification & (1 << 6)) { | 3209 | if (exit_qualification & (1 << 6)) { |
3120 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); | 3210 | printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); |
3121 | return -ENOTSUPP; | 3211 | return -EINVAL; |
3122 | } | 3212 | } |
3123 | 3213 | ||
3124 | gla_validity = (exit_qualification >> 7) & 0x3; | 3214 | gla_validity = (exit_qualification >> 7) & 0x3; |
@@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3130 | printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", | 3220 | printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", |
3131 | (long unsigned int)exit_qualification); | 3221 | (long unsigned int)exit_qualification); |
3132 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 3222 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; |
3133 | kvm_run->hw.hardware_exit_reason = 0; | 3223 | kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; |
3134 | return -ENOTSUPP; | 3224 | return 0; |
3135 | } | 3225 | } |
3136 | 3226 | ||
3137 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | 3227 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); |
3228 | trace_kvm_page_fault(gpa, exit_qualification); | ||
3138 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); | 3229 | return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); |
3139 | } | 3230 | } |
3140 | 3231 | ||
3232 | static u64 ept_rsvd_mask(u64 spte, int level) | ||
3233 | { | ||
3234 | int i; | ||
3235 | u64 mask = 0; | ||
3236 | |||
3237 | for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) | ||
3238 | mask |= (1ULL << i); | ||
3239 | |||
3240 | if (level > 2) | ||
3241 | /* bits 7:3 reserved */ | ||
3242 | mask |= 0xf8; | ||
3243 | else if (level == 2) { | ||
3244 | if (spte & (1ULL << 7)) | ||
3245 | /* 2MB ref, bits 20:12 reserved */ | ||
3246 | mask |= 0x1ff000; | ||
3247 | else | ||
3248 | /* bits 6:3 reserved */ | ||
3249 | mask |= 0x78; | ||
3250 | } | ||
3251 | |||
3252 | return mask; | ||
3253 | } | ||
3254 | |||
3255 | static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, | ||
3256 | int level) | ||
3257 | { | ||
3258 | printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); | ||
3259 | |||
3260 | /* 010b (write-only) */ | ||
3261 | WARN_ON((spte & 0x7) == 0x2); | ||
3262 | |||
3263 | /* 110b (write/execute) */ | ||
3264 | WARN_ON((spte & 0x7) == 0x6); | ||
3265 | |||
3266 | /* 100b (execute-only) and value not supported by logical processor */ | ||
3267 | if (!cpu_has_vmx_ept_execute_only()) | ||
3268 | WARN_ON((spte & 0x7) == 0x4); | ||
3269 | |||
3270 | /* not 000b */ | ||
3271 | if ((spte & 0x7)) { | ||
3272 | u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); | ||
3273 | |||
3274 | if (rsvd_bits != 0) { | ||
3275 | printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", | ||
3276 | __func__, rsvd_bits); | ||
3277 | WARN_ON(1); | ||
3278 | } | ||
3279 | |||
3280 | if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { | ||
3281 | u64 ept_mem_type = (spte & 0x38) >> 3; | ||
3282 | |||
3283 | if (ept_mem_type == 2 || ept_mem_type == 3 || | ||
3284 | ept_mem_type == 7) { | ||
3285 | printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", | ||
3286 | __func__, ept_mem_type); | ||
3287 | WARN_ON(1); | ||
3288 | } | ||
3289 | } | ||
3290 | } | ||
3291 | } | ||
3292 | |||
3293 | static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
3294 | { | ||
3295 | u64 sptes[4]; | ||
3296 | int nr_sptes, i; | ||
3297 | gpa_t gpa; | ||
3298 | |||
3299 | gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); | ||
3300 | |||
3301 | printk(KERN_ERR "EPT: Misconfiguration.\n"); | ||
3302 | printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); | ||
3303 | |||
3304 | nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); | ||
3305 | |||
3306 | for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) | ||
3307 | ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); | ||
3308 | |||
3309 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
3310 | kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; | ||
3311 | |||
3312 | return 0; | ||
3313 | } | ||
3314 | |||
3141 | static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3315 | static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3142 | { | 3316 | { |
3143 | u32 cpu_based_vm_exec_control; | 3317 | u32 cpu_based_vm_exec_control; |
@@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
3217 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 3391 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
3218 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 3392 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
3219 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | 3393 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, |
3220 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | ||
3221 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, | 3394 | [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, |
3395 | [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, | ||
3396 | [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, | ||
3222 | }; | 3397 | }; |
3223 | 3398 | ||
3224 | static const int kvm_vmx_max_exit_handlers = | 3399 | static const int kvm_vmx_max_exit_handlers = |
@@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3234 | u32 exit_reason = vmx->exit_reason; | 3409 | u32 exit_reason = vmx->exit_reason; |
3235 | u32 vectoring_info = vmx->idt_vectoring_info; | 3410 | u32 vectoring_info = vmx->idt_vectoring_info; |
3236 | 3411 | ||
3237 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), | 3412 | trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); |
3238 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); | ||
3239 | 3413 | ||
3240 | /* If we need to emulate an MMIO from handle_invalid_guest_state | 3414 | /* If we need to emulate an MMIO from handle_invalid_guest_state |
3241 | * we just return 0 */ | 3415 | * we just return 0 */ |
@@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3247 | 3421 | ||
3248 | /* Access CR3 don't cause VMExit in paging mode, so we need | 3422 | /* Access CR3 don't cause VMExit in paging mode, so we need |
3249 | * to sync with guest real CR3. */ | 3423 | * to sync with guest real CR3. */ |
3250 | if (enable_ept && is_paging(vcpu)) { | 3424 | if (enable_ept && is_paging(vcpu)) |
3251 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); | 3425 | vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); |
3252 | ept_load_pdptrs(vcpu); | ||
3253 | } | ||
3254 | 3426 | ||
3255 | if (unlikely(vmx->fail)) { | 3427 | if (unlikely(vmx->fail)) { |
3256 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 3428 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
@@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) | |||
3326 | 3498 | ||
3327 | /* We need to handle NMIs before interrupts are enabled */ | 3499 | /* We need to handle NMIs before interrupts are enabled */ |
3328 | if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && | 3500 | if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && |
3329 | (exit_intr_info & INTR_INFO_VALID_MASK)) { | 3501 | (exit_intr_info & INTR_INFO_VALID_MASK)) |
3330 | KVMTRACE_0D(NMI, &vmx->vcpu, handler); | ||
3331 | asm("int $2"); | 3502 | asm("int $2"); |
3332 | } | ||
3333 | 3503 | ||
3334 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | 3504 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; |
3335 | 3505 | ||
@@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3434 | { | 3604 | { |
3435 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3605 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
3436 | 3606 | ||
3607 | if (enable_ept && is_paging(vcpu)) { | ||
3608 | vmcs_writel(GUEST_CR3, vcpu->arch.cr3); | ||
3609 | ept_load_pdptrs(vcpu); | ||
3610 | } | ||
3437 | /* Record the guest's net vcpu time for enforced NMI injections. */ | 3611 | /* Record the guest's net vcpu time for enforced NMI injections. */ |
3438 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) | 3612 | if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) |
3439 | vmx->entry_time = ktime_get(); | 3613 | vmx->entry_time = ktime_get(); |
@@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3449 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | 3623 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) |
3450 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | 3624 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); |
3451 | 3625 | ||
3626 | /* When single-stepping over STI and MOV SS, we must clear the | ||
3627 | * corresponding interruptibility bits in the guest state. Otherwise | ||
3628 | * vmentry fails as it then expects bit 14 (BS) in pending debug | ||
3629 | * exceptions being set, but that's not correct for the guest debugging | ||
3630 | * case. */ | ||
3631 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
3632 | vmx_set_interrupt_shadow(vcpu, 0); | ||
3633 | |||
3452 | /* | 3634 | /* |
3453 | * Loading guest fpu may have cleared host cr0.ts | 3635 | * Loading guest fpu may have cleared host cr0.ts |
3454 | */ | 3636 | */ |
3455 | vmcs_writel(HOST_CR0, read_cr0()); | 3637 | vmcs_writel(HOST_CR0, read_cr0()); |
3456 | 3638 | ||
3457 | set_debugreg(vcpu->arch.dr6, 6); | 3639 | if (vcpu->arch.switch_db_regs) |
3640 | set_debugreg(vcpu->arch.dr6, 6); | ||
3458 | 3641 | ||
3459 | asm( | 3642 | asm( |
3460 | /* Store host registers */ | 3643 | /* Store host registers */ |
@@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3465 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" | 3648 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" |
3466 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" | 3649 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" |
3467 | "1: \n\t" | 3650 | "1: \n\t" |
3651 | /* Reload cr2 if changed */ | ||
3652 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
3653 | "mov %%cr2, %%"R"dx \n\t" | ||
3654 | "cmp %%"R"ax, %%"R"dx \n\t" | ||
3655 | "je 2f \n\t" | ||
3656 | "mov %%"R"ax, %%cr2 \n\t" | ||
3657 | "2: \n\t" | ||
3468 | /* Check if vmlaunch of vmresume is needed */ | 3658 | /* Check if vmlaunch of vmresume is needed */ |
3469 | "cmpl $0, %c[launched](%0) \n\t" | 3659 | "cmpl $0, %c[launched](%0) \n\t" |
3470 | /* Load guest registers. Don't clobber flags. */ | 3660 | /* Load guest registers. Don't clobber flags. */ |
3471 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
3472 | "mov %%"R"ax, %%cr2 \n\t" | ||
3473 | "mov %c[rax](%0), %%"R"ax \n\t" | 3661 | "mov %c[rax](%0), %%"R"ax \n\t" |
3474 | "mov %c[rbx](%0), %%"R"bx \n\t" | 3662 | "mov %c[rbx](%0), %%"R"bx \n\t" |
3475 | "mov %c[rdx](%0), %%"R"dx \n\t" | 3663 | "mov %c[rdx](%0), %%"R"dx \n\t" |
@@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3547 | #endif | 3735 | #endif |
3548 | ); | 3736 | ); |
3549 | 3737 | ||
3550 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | 3738 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) |
3739 | | (1 << VCPU_EXREG_PDPTR)); | ||
3551 | vcpu->arch.regs_dirty = 0; | 3740 | vcpu->arch.regs_dirty = 0; |
3552 | 3741 | ||
3553 | get_debugreg(vcpu->arch.dr6, 6); | 3742 | if (vcpu->arch.switch_db_regs) |
3743 | get_debugreg(vcpu->arch.dr6, 6); | ||
3554 | 3744 | ||
3555 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 3745 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
3556 | if (vmx->rmode.irq.pending) | 3746 | if (vmx->rmode.irq.pending) |
@@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
3633 | if (alloc_apic_access_page(kvm) != 0) | 3823 | if (alloc_apic_access_page(kvm) != 0) |
3634 | goto free_vmcs; | 3824 | goto free_vmcs; |
3635 | 3825 | ||
3636 | if (enable_ept) | 3826 | if (enable_ept) { |
3827 | if (!kvm->arch.ept_identity_map_addr) | ||
3828 | kvm->arch.ept_identity_map_addr = | ||
3829 | VMX_EPT_IDENTITY_PAGETABLE_ADDR; | ||
3637 | if (alloc_identity_pagetable(kvm) != 0) | 3830 | if (alloc_identity_pagetable(kvm) != 0) |
3638 | goto free_vmcs; | 3831 | goto free_vmcs; |
3832 | } | ||
3639 | 3833 | ||
3640 | return &vmx->vcpu; | 3834 | return &vmx->vcpu; |
3641 | 3835 | ||
@@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | |||
3699 | return ret; | 3893 | return ret; |
3700 | } | 3894 | } |
3701 | 3895 | ||
3896 | static const struct trace_print_flags vmx_exit_reasons_str[] = { | ||
3897 | { EXIT_REASON_EXCEPTION_NMI, "exception" }, | ||
3898 | { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, | ||
3899 | { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, | ||
3900 | { EXIT_REASON_NMI_WINDOW, "nmi_window" }, | ||
3901 | { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, | ||
3902 | { EXIT_REASON_CR_ACCESS, "cr_access" }, | ||
3903 | { EXIT_REASON_DR_ACCESS, "dr_access" }, | ||
3904 | { EXIT_REASON_CPUID, "cpuid" }, | ||
3905 | { EXIT_REASON_MSR_READ, "rdmsr" }, | ||
3906 | { EXIT_REASON_MSR_WRITE, "wrmsr" }, | ||
3907 | { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, | ||
3908 | { EXIT_REASON_HLT, "halt" }, | ||
3909 | { EXIT_REASON_INVLPG, "invlpg" }, | ||
3910 | { EXIT_REASON_VMCALL, "hypercall" }, | ||
3911 | { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, | ||
3912 | { EXIT_REASON_APIC_ACCESS, "apic_access" }, | ||
3913 | { EXIT_REASON_WBINVD, "wbinvd" }, | ||
3914 | { EXIT_REASON_TASK_SWITCH, "task_switch" }, | ||
3915 | { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, | ||
3916 | { -1, NULL } | ||
3917 | }; | ||
3918 | |||
3919 | static bool vmx_gb_page_enable(void) | ||
3920 | { | ||
3921 | return false; | ||
3922 | } | ||
3923 | |||
3702 | static struct kvm_x86_ops vmx_x86_ops = { | 3924 | static struct kvm_x86_ops vmx_x86_ops = { |
3703 | .cpu_has_kvm_support = cpu_has_kvm_support, | 3925 | .cpu_has_kvm_support = cpu_has_kvm_support, |
3704 | .disabled_by_bios = vmx_disabled_by_bios, | 3926 | .disabled_by_bios = vmx_disabled_by_bios, |
@@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3758 | .set_tss_addr = vmx_set_tss_addr, | 3980 | .set_tss_addr = vmx_set_tss_addr, |
3759 | .get_tdp_level = get_ept_level, | 3981 | .get_tdp_level = get_ept_level, |
3760 | .get_mt_mask = vmx_get_mt_mask, | 3982 | .get_mt_mask = vmx_get_mt_mask, |
3983 | |||
3984 | .exit_reasons_str = vmx_exit_reasons_str, | ||
3985 | .gb_page_enable = vmx_gb_page_enable, | ||
3761 | }; | 3986 | }; |
3762 | 3987 | ||
3763 | static int __init vmx_init(void) | 3988 | static int __init vmx_init(void) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d452901182..be451ee4424 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -37,11 +37,16 @@ | |||
37 | #include <linux/iommu.h> | 37 | #include <linux/iommu.h> |
38 | #include <linux/intel-iommu.h> | 38 | #include <linux/intel-iommu.h> |
39 | #include <linux/cpufreq.h> | 39 | #include <linux/cpufreq.h> |
40 | #include <trace/events/kvm.h> | ||
41 | #undef TRACE_INCLUDE_FILE | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include "trace.h" | ||
40 | 44 | ||
41 | #include <asm/uaccess.h> | 45 | #include <asm/uaccess.h> |
42 | #include <asm/msr.h> | 46 | #include <asm/msr.h> |
43 | #include <asm/desc.h> | 47 | #include <asm/desc.h> |
44 | #include <asm/mtrr.h> | 48 | #include <asm/mtrr.h> |
49 | #include <asm/mce.h> | ||
45 | 50 | ||
46 | #define MAX_IO_MSRS 256 | 51 | #define MAX_IO_MSRS 256 |
47 | #define CR0_RESERVED_BITS \ | 52 | #define CR0_RESERVED_BITS \ |
@@ -55,6 +60,10 @@ | |||
55 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 60 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
56 | 61 | ||
57 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 62 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
63 | |||
64 | #define KVM_MAX_MCE_BANKS 32 | ||
65 | #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P | ||
66 | |||
58 | /* EFER defaults: | 67 | /* EFER defaults: |
59 | * - enable syscall per default because its emulated by KVM | 68 | * - enable syscall per default because its emulated by KVM |
60 | * - enable LME and LMA per default on 64 bit KVM | 69 | * - enable LME and LMA per default on 64 bit KVM |
@@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; | |||
68 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM | 77 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM |
69 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | 78 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
70 | 79 | ||
80 | static void update_cr8_intercept(struct kvm_vcpu *vcpu); | ||
71 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | 81 | static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, |
72 | struct kvm_cpuid_entry2 __user *entries); | 82 | struct kvm_cpuid_entry2 __user *entries); |
73 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
74 | u32 function, u32 index); | ||
75 | 83 | ||
76 | struct kvm_x86_ops *kvm_x86_ops; | 84 | struct kvm_x86_ops *kvm_x86_ops; |
77 | EXPORT_SYMBOL_GPL(kvm_x86_ops); | 85 | EXPORT_SYMBOL_GPL(kvm_x86_ops); |
78 | 86 | ||
87 | int ignore_msrs = 0; | ||
88 | module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); | ||
89 | |||
79 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 90 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
80 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | 91 | { "pf_fixed", VCPU_STAT(pf_fixed) }, |
81 | { "pf_guest", VCPU_STAT(pf_guest) }, | 92 | { "pf_guest", VCPU_STAT(pf_guest) }, |
@@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector) | |||
122 | if (selector == 0) | 133 | if (selector == 0) |
123 | return 0; | 134 | return 0; |
124 | 135 | ||
125 | asm("sgdt %0" : "=m"(gdt)); | 136 | kvm_get_gdt(&gdt); |
126 | table_base = gdt.base; | 137 | table_base = gdt.base; |
127 | 138 | ||
128 | if (selector & 4) { /* from ldt */ | 139 | if (selector & 4) { /* from ldt */ |
129 | u16 ldt_selector; | 140 | u16 ldt_selector = kvm_read_ldt(); |
130 | 141 | ||
131 | asm("sldt %0" : "=g"(ldt_selector)); | ||
132 | table_base = segment_base(ldt_selector); | 142 | table_base = segment_base(ldt_selector); |
133 | } | 143 | } |
134 | d = (struct desc_struct *)(table_base + (selector & ~7)); | 144 | d = (struct desc_struct *)(table_base + (selector & ~7)); |
135 | v = d->base0 | ((unsigned long)d->base1 << 16) | | 145 | v = get_desc_base(d); |
136 | ((unsigned long)d->base2 << 24); | ||
137 | #ifdef CONFIG_X86_64 | 146 | #ifdef CONFIG_X86_64 |
138 | if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) | 147 | if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) |
139 | v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; | 148 | v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; |
@@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, | |||
176 | ++vcpu->stat.pf_guest; | 185 | ++vcpu->stat.pf_guest; |
177 | 186 | ||
178 | if (vcpu->arch.exception.pending) { | 187 | if (vcpu->arch.exception.pending) { |
179 | if (vcpu->arch.exception.nr == PF_VECTOR) { | 188 | switch(vcpu->arch.exception.nr) { |
180 | printk(KERN_DEBUG "kvm: inject_page_fault:" | 189 | case DF_VECTOR: |
181 | " double fault 0x%lx\n", addr); | ||
182 | vcpu->arch.exception.nr = DF_VECTOR; | ||
183 | vcpu->arch.exception.error_code = 0; | ||
184 | } else if (vcpu->arch.exception.nr == DF_VECTOR) { | ||
185 | /* triple fault -> shutdown */ | 190 | /* triple fault -> shutdown */ |
186 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | 191 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); |
192 | return; | ||
193 | case PF_VECTOR: | ||
194 | vcpu->arch.exception.nr = DF_VECTOR; | ||
195 | vcpu->arch.exception.error_code = 0; | ||
196 | return; | ||
197 | default: | ||
198 | /* replace previous exception with a new one in a hope | ||
199 | that instruction re-execution will regenerate lost | ||
200 | exception */ | ||
201 | vcpu->arch.exception.pending = false; | ||
202 | break; | ||
187 | } | 203 | } |
188 | return; | ||
189 | } | 204 | } |
190 | vcpu->arch.cr2 = addr; | 205 | vcpu->arch.cr2 = addr; |
191 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | 206 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); |
@@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) | |||
207 | } | 222 | } |
208 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); | 223 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); |
209 | 224 | ||
210 | static void __queue_exception(struct kvm_vcpu *vcpu) | 225 | /* |
226 | * Checks if cpl <= required_cpl; if true, return true. Otherwise queue | ||
227 | * a #GP and return false. | ||
228 | */ | ||
229 | bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) | ||
211 | { | 230 | { |
212 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, | 231 | if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) |
213 | vcpu->arch.exception.has_error_code, | 232 | return true; |
214 | vcpu->arch.exception.error_code); | 233 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
234 | return false; | ||
215 | } | 235 | } |
236 | EXPORT_SYMBOL_GPL(kvm_require_cpl); | ||
216 | 237 | ||
217 | /* | 238 | /* |
218 | * Load the pae pdptrs. Return true is they are all valid. | 239 | * Load the pae pdptrs. Return true is they are all valid. |
@@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
232 | goto out; | 253 | goto out; |
233 | } | 254 | } |
234 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | 255 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
235 | if (is_present_pte(pdpte[i]) && | 256 | if (is_present_gpte(pdpte[i]) && |
236 | (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { | 257 | (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { |
237 | ret = 0; | 258 | ret = 0; |
238 | goto out; | 259 | goto out; |
@@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
241 | ret = 1; | 262 | ret = 1; |
242 | 263 | ||
243 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); | 264 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); |
265 | __set_bit(VCPU_EXREG_PDPTR, | ||
266 | (unsigned long *)&vcpu->arch.regs_avail); | ||
267 | __set_bit(VCPU_EXREG_PDPTR, | ||
268 | (unsigned long *)&vcpu->arch.regs_dirty); | ||
244 | out: | 269 | out: |
245 | 270 | ||
246 | return ret; | 271 | return ret; |
@@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
256 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | 281 | if (is_long_mode(vcpu) || !is_pae(vcpu)) |
257 | return false; | 282 | return false; |
258 | 283 | ||
284 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
285 | (unsigned long *)&vcpu->arch.regs_avail)) | ||
286 | return true; | ||
287 | |||
259 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | 288 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); |
260 | if (r < 0) | 289 | if (r < 0) |
261 | goto out; | 290 | goto out; |
@@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); | |||
328 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | 357 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) |
329 | { | 358 | { |
330 | kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); | 359 | kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); |
331 | KVMTRACE_1D(LMSW, vcpu, | ||
332 | (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), | ||
333 | handler); | ||
334 | } | 360 | } |
335 | EXPORT_SYMBOL_GPL(kvm_lmsw); | 361 | EXPORT_SYMBOL_GPL(kvm_lmsw); |
336 | 362 | ||
@@ -466,7 +492,7 @@ static u32 msrs_to_save[] = { | |||
466 | #ifdef CONFIG_X86_64 | 492 | #ifdef CONFIG_X86_64 |
467 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 493 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
468 | #endif | 494 | #endif |
469 | MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, | 495 | MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
470 | MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA | 496 | MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA |
471 | }; | 497 | }; |
472 | 498 | ||
@@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) | |||
644 | 670 | ||
645 | /* Keep irq disabled to prevent changes to the clock */ | 671 | /* Keep irq disabled to prevent changes to the clock */ |
646 | local_irq_save(flags); | 672 | local_irq_save(flags); |
647 | kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, | 673 | kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); |
648 | &vcpu->hv_clock.tsc_timestamp); | ||
649 | ktime_get_ts(&ts); | 674 | ktime_get_ts(&ts); |
650 | local_irq_restore(flags); | 675 | local_irq_restore(flags); |
651 | 676 | ||
@@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
778 | return 0; | 803 | return 0; |
779 | } | 804 | } |
780 | 805 | ||
806 | static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
807 | { | ||
808 | u64 mcg_cap = vcpu->arch.mcg_cap; | ||
809 | unsigned bank_num = mcg_cap & 0xff; | ||
810 | |||
811 | switch (msr) { | ||
812 | case MSR_IA32_MCG_STATUS: | ||
813 | vcpu->arch.mcg_status = data; | ||
814 | break; | ||
815 | case MSR_IA32_MCG_CTL: | ||
816 | if (!(mcg_cap & MCG_CTL_P)) | ||
817 | return 1; | ||
818 | if (data != 0 && data != ~(u64)0) | ||
819 | return -1; | ||
820 | vcpu->arch.mcg_ctl = data; | ||
821 | break; | ||
822 | default: | ||
823 | if (msr >= MSR_IA32_MC0_CTL && | ||
824 | msr < MSR_IA32_MC0_CTL + 4 * bank_num) { | ||
825 | u32 offset = msr - MSR_IA32_MC0_CTL; | ||
826 | /* only 0 or all 1s can be written to IA32_MCi_CTL */ | ||
827 | if ((offset & 0x3) == 0 && | ||
828 | data != 0 && data != ~(u64)0) | ||
829 | return -1; | ||
830 | vcpu->arch.mce_banks[offset] = data; | ||
831 | break; | ||
832 | } | ||
833 | return 1; | ||
834 | } | ||
835 | return 0; | ||
836 | } | ||
837 | |||
781 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 838 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
782 | { | 839 | { |
783 | switch (msr) { | 840 | switch (msr) { |
784 | case MSR_EFER: | 841 | case MSR_EFER: |
785 | set_efer(vcpu, data); | 842 | set_efer(vcpu, data); |
786 | break; | 843 | break; |
787 | case MSR_IA32_MC0_STATUS: | 844 | case MSR_K7_HWCR: |
788 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | 845 | data &= ~(u64)0x40; /* ignore flush filter disable */ |
789 | __func__, data); | 846 | if (data != 0) { |
847 | pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", | ||
848 | data); | ||
849 | return 1; | ||
850 | } | ||
790 | break; | 851 | break; |
791 | case MSR_IA32_MCG_STATUS: | 852 | case MSR_FAM10H_MMIO_CONF_BASE: |
792 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | 853 | if (data != 0) { |
793 | __func__, data); | 854 | pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " |
855 | "0x%llx\n", data); | ||
856 | return 1; | ||
857 | } | ||
794 | break; | 858 | break; |
795 | case MSR_IA32_MCG_CTL: | 859 | case MSR_AMD64_NB_CFG: |
796 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", | ||
797 | __func__, data); | ||
798 | break; | 860 | break; |
799 | case MSR_IA32_DEBUGCTLMSR: | 861 | case MSR_IA32_DEBUGCTLMSR: |
800 | if (!data) { | 862 | if (!data) { |
@@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
811 | case MSR_IA32_UCODE_REV: | 873 | case MSR_IA32_UCODE_REV: |
812 | case MSR_IA32_UCODE_WRITE: | 874 | case MSR_IA32_UCODE_WRITE: |
813 | case MSR_VM_HSAVE_PA: | 875 | case MSR_VM_HSAVE_PA: |
876 | case MSR_AMD64_PATCH_LOADER: | ||
814 | break; | 877 | break; |
815 | case 0x200 ... 0x2ff: | 878 | case 0x200 ... 0x2ff: |
816 | return set_msr_mtrr(vcpu, msr, data); | 879 | return set_msr_mtrr(vcpu, msr, data); |
817 | case MSR_IA32_APICBASE: | 880 | case MSR_IA32_APICBASE: |
818 | kvm_set_apic_base(vcpu, data); | 881 | kvm_set_apic_base(vcpu, data); |
819 | break; | 882 | break; |
883 | case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: | ||
884 | return kvm_x2apic_msr_write(vcpu, msr, data); | ||
820 | case MSR_IA32_MISC_ENABLE: | 885 | case MSR_IA32_MISC_ENABLE: |
821 | vcpu->arch.ia32_misc_enable_msr = data; | 886 | vcpu->arch.ia32_misc_enable_msr = data; |
822 | break; | 887 | break; |
@@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
850 | kvm_request_guest_time_update(vcpu); | 915 | kvm_request_guest_time_update(vcpu); |
851 | break; | 916 | break; |
852 | } | 917 | } |
918 | case MSR_IA32_MCG_CTL: | ||
919 | case MSR_IA32_MCG_STATUS: | ||
920 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | ||
921 | return set_msr_mce(vcpu, msr, data); | ||
922 | |||
923 | /* Performance counters are not protected by a CPUID bit, | ||
924 | * so we should check all of them in the generic path for the sake of | ||
925 | * cross vendor migration. | ||
926 | * Writing a zero into the event select MSRs disables them, | ||
927 | * which we perfectly emulate ;-). Any other value should be at least | ||
928 | * reported, some guests depend on them. | ||
929 | */ | ||
930 | case MSR_P6_EVNTSEL0: | ||
931 | case MSR_P6_EVNTSEL1: | ||
932 | case MSR_K7_EVNTSEL0: | ||
933 | case MSR_K7_EVNTSEL1: | ||
934 | case MSR_K7_EVNTSEL2: | ||
935 | case MSR_K7_EVNTSEL3: | ||
936 | if (data != 0) | ||
937 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | ||
938 | "0x%x data 0x%llx\n", msr, data); | ||
939 | break; | ||
940 | /* at least RHEL 4 unconditionally writes to the perfctr registers, | ||
941 | * so we ignore writes to make it happy. | ||
942 | */ | ||
943 | case MSR_P6_PERFCTR0: | ||
944 | case MSR_P6_PERFCTR1: | ||
945 | case MSR_K7_PERFCTR0: | ||
946 | case MSR_K7_PERFCTR1: | ||
947 | case MSR_K7_PERFCTR2: | ||
948 | case MSR_K7_PERFCTR3: | ||
949 | pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " | ||
950 | "0x%x data 0x%llx\n", msr, data); | ||
951 | break; | ||
853 | default: | 952 | default: |
854 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); | 953 | if (!ignore_msrs) { |
855 | return 1; | 954 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", |
955 | msr, data); | ||
956 | return 1; | ||
957 | } else { | ||
958 | pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", | ||
959 | msr, data); | ||
960 | break; | ||
961 | } | ||
856 | } | 962 | } |
857 | return 0; | 963 | return 0; |
858 | } | 964 | } |
@@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
905 | return 0; | 1011 | return 0; |
906 | } | 1012 | } |
907 | 1013 | ||
908 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | 1014 | static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) |
909 | { | 1015 | { |
910 | u64 data; | 1016 | u64 data; |
1017 | u64 mcg_cap = vcpu->arch.mcg_cap; | ||
1018 | unsigned bank_num = mcg_cap & 0xff; | ||
911 | 1019 | ||
912 | switch (msr) { | 1020 | switch (msr) { |
913 | case 0xc0010010: /* SYSCFG */ | ||
914 | case 0xc0010015: /* HWCR */ | ||
915 | case MSR_IA32_PLATFORM_ID: | ||
916 | case MSR_IA32_P5_MC_ADDR: | 1021 | case MSR_IA32_P5_MC_ADDR: |
917 | case MSR_IA32_P5_MC_TYPE: | 1022 | case MSR_IA32_P5_MC_TYPE: |
918 | case MSR_IA32_MC0_CTL: | 1023 | data = 0; |
919 | case MSR_IA32_MCG_STATUS: | 1024 | break; |
920 | case MSR_IA32_MCG_CAP: | 1025 | case MSR_IA32_MCG_CAP: |
1026 | data = vcpu->arch.mcg_cap; | ||
1027 | break; | ||
921 | case MSR_IA32_MCG_CTL: | 1028 | case MSR_IA32_MCG_CTL: |
922 | case MSR_IA32_MC0_MISC: | 1029 | if (!(mcg_cap & MCG_CTL_P)) |
923 | case MSR_IA32_MC0_MISC+4: | 1030 | return 1; |
924 | case MSR_IA32_MC0_MISC+8: | 1031 | data = vcpu->arch.mcg_ctl; |
925 | case MSR_IA32_MC0_MISC+12: | 1032 | break; |
926 | case MSR_IA32_MC0_MISC+16: | 1033 | case MSR_IA32_MCG_STATUS: |
927 | case MSR_IA32_MC0_MISC+20: | 1034 | data = vcpu->arch.mcg_status; |
1035 | break; | ||
1036 | default: | ||
1037 | if (msr >= MSR_IA32_MC0_CTL && | ||
1038 | msr < MSR_IA32_MC0_CTL + 4 * bank_num) { | ||
1039 | u32 offset = msr - MSR_IA32_MC0_CTL; | ||
1040 | data = vcpu->arch.mce_banks[offset]; | ||
1041 | break; | ||
1042 | } | ||
1043 | return 1; | ||
1044 | } | ||
1045 | *pdata = data; | ||
1046 | return 0; | ||
1047 | } | ||
1048 | |||
1049 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
1050 | { | ||
1051 | u64 data; | ||
1052 | |||
1053 | switch (msr) { | ||
1054 | case MSR_IA32_PLATFORM_ID: | ||
928 | case MSR_IA32_UCODE_REV: | 1055 | case MSR_IA32_UCODE_REV: |
929 | case MSR_IA32_EBL_CR_POWERON: | 1056 | case MSR_IA32_EBL_CR_POWERON: |
930 | case MSR_IA32_DEBUGCTLMSR: | 1057 | case MSR_IA32_DEBUGCTLMSR: |
@@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
932 | case MSR_IA32_LASTBRANCHTOIP: | 1059 | case MSR_IA32_LASTBRANCHTOIP: |
933 | case MSR_IA32_LASTINTFROMIP: | 1060 | case MSR_IA32_LASTINTFROMIP: |
934 | case MSR_IA32_LASTINTTOIP: | 1061 | case MSR_IA32_LASTINTTOIP: |
1062 | case MSR_K8_SYSCFG: | ||
1063 | case MSR_K7_HWCR: | ||
935 | case MSR_VM_HSAVE_PA: | 1064 | case MSR_VM_HSAVE_PA: |
1065 | case MSR_P6_PERFCTR0: | ||
1066 | case MSR_P6_PERFCTR1: | ||
936 | case MSR_P6_EVNTSEL0: | 1067 | case MSR_P6_EVNTSEL0: |
937 | case MSR_P6_EVNTSEL1: | 1068 | case MSR_P6_EVNTSEL1: |
938 | case MSR_K7_EVNTSEL0: | 1069 | case MSR_K7_EVNTSEL0: |
1070 | case MSR_K7_PERFCTR0: | ||
1071 | case MSR_K8_INT_PENDING_MSG: | ||
1072 | case MSR_AMD64_NB_CFG: | ||
1073 | case MSR_FAM10H_MMIO_CONF_BASE: | ||
939 | data = 0; | 1074 | data = 0; |
940 | break; | 1075 | break; |
941 | case MSR_MTRRcap: | 1076 | case MSR_MTRRcap: |
@@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
949 | case MSR_IA32_APICBASE: | 1084 | case MSR_IA32_APICBASE: |
950 | data = kvm_get_apic_base(vcpu); | 1085 | data = kvm_get_apic_base(vcpu); |
951 | break; | 1086 | break; |
1087 | case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: | ||
1088 | return kvm_x2apic_msr_read(vcpu, msr, pdata); | ||
1089 | break; | ||
952 | case MSR_IA32_MISC_ENABLE: | 1090 | case MSR_IA32_MISC_ENABLE: |
953 | data = vcpu->arch.ia32_misc_enable_msr; | 1091 | data = vcpu->arch.ia32_misc_enable_msr; |
954 | break; | 1092 | break; |
@@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
967 | case MSR_KVM_SYSTEM_TIME: | 1105 | case MSR_KVM_SYSTEM_TIME: |
968 | data = vcpu->arch.time; | 1106 | data = vcpu->arch.time; |
969 | break; | 1107 | break; |
1108 | case MSR_IA32_P5_MC_ADDR: | ||
1109 | case MSR_IA32_P5_MC_TYPE: | ||
1110 | case MSR_IA32_MCG_CAP: | ||
1111 | case MSR_IA32_MCG_CTL: | ||
1112 | case MSR_IA32_MCG_STATUS: | ||
1113 | case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: | ||
1114 | return get_msr_mce(vcpu, msr, pdata); | ||
970 | default: | 1115 | default: |
971 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 1116 | if (!ignore_msrs) { |
972 | return 1; | 1117 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
1118 | return 1; | ||
1119 | } else { | ||
1120 | pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); | ||
1121 | data = 0; | ||
1122 | } | ||
1123 | break; | ||
973 | } | 1124 | } |
974 | *pdata = data; | 1125 | *pdata = data; |
975 | return 0; | 1126 | return 0; |
@@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1068 | case KVM_CAP_REINJECT_CONTROL: | 1219 | case KVM_CAP_REINJECT_CONTROL: |
1069 | case KVM_CAP_IRQ_INJECT_STATUS: | 1220 | case KVM_CAP_IRQ_INJECT_STATUS: |
1070 | case KVM_CAP_ASSIGN_DEV_IRQ: | 1221 | case KVM_CAP_ASSIGN_DEV_IRQ: |
1222 | case KVM_CAP_IRQFD: | ||
1223 | case KVM_CAP_IOEVENTFD: | ||
1224 | case KVM_CAP_PIT2: | ||
1225 | case KVM_CAP_PIT_STATE2: | ||
1226 | case KVM_CAP_SET_IDENTITY_MAP_ADDR: | ||
1071 | r = 1; | 1227 | r = 1; |
1072 | break; | 1228 | break; |
1073 | case KVM_CAP_COALESCED_MMIO: | 1229 | case KVM_CAP_COALESCED_MMIO: |
@@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
1088 | case KVM_CAP_IOMMU: | 1244 | case KVM_CAP_IOMMU: |
1089 | r = iommu_found(); | 1245 | r = iommu_found(); |
1090 | break; | 1246 | break; |
1247 | case KVM_CAP_MCE: | ||
1248 | r = KVM_MAX_MCE_BANKS; | ||
1249 | break; | ||
1091 | default: | 1250 | default: |
1092 | r = 0; | 1251 | r = 0; |
1093 | break; | 1252 | break; |
@@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
1147 | r = 0; | 1306 | r = 0; |
1148 | break; | 1307 | break; |
1149 | } | 1308 | } |
1309 | case KVM_X86_GET_MCE_CAP_SUPPORTED: { | ||
1310 | u64 mce_cap; | ||
1311 | |||
1312 | mce_cap = KVM_MCE_CAP_SUPPORTED; | ||
1313 | r = -EFAULT; | ||
1314 | if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) | ||
1315 | goto out; | ||
1316 | r = 0; | ||
1317 | break; | ||
1318 | } | ||
1150 | default: | 1319 | default: |
1151 | r = -EINVAL; | 1320 | r = -EINVAL; |
1152 | } | 1321 | } |
@@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | |||
1227 | vcpu->arch.cpuid_nent = cpuid->nent; | 1396 | vcpu->arch.cpuid_nent = cpuid->nent; |
1228 | cpuid_fix_nx_cap(vcpu); | 1397 | cpuid_fix_nx_cap(vcpu); |
1229 | r = 0; | 1398 | r = 0; |
1399 | kvm_apic_set_version(vcpu); | ||
1230 | 1400 | ||
1231 | out_free: | 1401 | out_free: |
1232 | vfree(cpuid_entries); | 1402 | vfree(cpuid_entries); |
@@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | |||
1248 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | 1418 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) |
1249 | goto out; | 1419 | goto out; |
1250 | vcpu->arch.cpuid_nent = cpuid->nent; | 1420 | vcpu->arch.cpuid_nent = cpuid->nent; |
1421 | kvm_apic_set_version(vcpu); | ||
1251 | return 0; | 1422 | return 0; |
1252 | 1423 | ||
1253 | out: | 1424 | out: |
@@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1290 | u32 index, int *nent, int maxnent) | 1461 | u32 index, int *nent, int maxnent) |
1291 | { | 1462 | { |
1292 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; | 1463 | unsigned f_nx = is_efer_nx() ? F(NX) : 0; |
1464 | unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; | ||
1293 | #ifdef CONFIG_X86_64 | 1465 | #ifdef CONFIG_X86_64 |
1294 | unsigned f_lm = F(LM); | 1466 | unsigned f_lm = F(LM); |
1295 | #else | 1467 | #else |
@@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1314 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | | 1486 | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | |
1315 | F(PAT) | F(PSE36) | 0 /* Reserved */ | | 1487 | F(PAT) | F(PSE36) | 0 /* Reserved */ | |
1316 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | | 1488 | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | |
1317 | F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | | 1489 | F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | |
1318 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); | 1490 | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); |
1319 | /* cpuid 1.ecx */ | 1491 | /* cpuid 1.ecx */ |
1320 | const u32 kvm_supported_word4_x86_features = | 1492 | const u32 kvm_supported_word4_x86_features = |
@@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1323 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | | 1495 | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | |
1324 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | | 1496 | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | |
1325 | 0 /* Reserved, DCA */ | F(XMM4_1) | | 1497 | 0 /* Reserved, DCA */ | F(XMM4_1) | |
1326 | F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | | 1498 | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | |
1327 | 0 /* Reserved, XSAVE, OSXSAVE */; | 1499 | 0 /* Reserved, XSAVE, OSXSAVE */; |
1328 | /* cpuid 0x80000001.ecx */ | 1500 | /* cpuid 0x80000001.ecx */ |
1329 | const u32 kvm_supported_word6_x86_features = | 1501 | const u32 kvm_supported_word6_x86_features = |
@@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
1344 | case 1: | 1516 | case 1: |
1345 | entry->edx &= kvm_supported_word0_x86_features; | 1517 | entry->edx &= kvm_supported_word0_x86_features; |
1346 | entry->ecx &= kvm_supported_word4_x86_features; | 1518 | entry->ecx &= kvm_supported_word4_x86_features; |
1519 | /* we support x2apic emulation even if host does not support | ||
1520 | * it since we emulate x2apic in software */ | ||
1521 | entry->ecx |= F(X2APIC); | ||
1347 | break; | 1522 | break; |
1348 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | 1523 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands |
1349 | * may return different values. This forces us to get_cpu() before | 1524 | * may return different values. This forces us to get_cpu() before |
@@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
1435 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) | 1610 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) |
1436 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | 1611 | do_cpuid_ent(&cpuid_entries[nent], func, 0, |
1437 | &nent, cpuid->nent); | 1612 | &nent, cpuid->nent); |
1613 | r = -E2BIG; | ||
1614 | if (nent >= cpuid->nent) | ||
1615 | goto out_free; | ||
1616 | |||
1438 | r = -EFAULT; | 1617 | r = -EFAULT; |
1439 | if (copy_to_user(entries, cpuid_entries, | 1618 | if (copy_to_user(entries, cpuid_entries, |
1440 | nent * sizeof(struct kvm_cpuid_entry2))) | 1619 | nent * sizeof(struct kvm_cpuid_entry2))) |
@@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | |||
1464 | vcpu_load(vcpu); | 1643 | vcpu_load(vcpu); |
1465 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | 1644 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); |
1466 | kvm_apic_post_state_restore(vcpu); | 1645 | kvm_apic_post_state_restore(vcpu); |
1646 | update_cr8_intercept(vcpu); | ||
1467 | vcpu_put(vcpu); | 1647 | vcpu_put(vcpu); |
1468 | 1648 | ||
1469 | return 0; | 1649 | return 0; |
@@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, | |||
1503 | return 0; | 1683 | return 0; |
1504 | } | 1684 | } |
1505 | 1685 | ||
1686 | static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, | ||
1687 | u64 mcg_cap) | ||
1688 | { | ||
1689 | int r; | ||
1690 | unsigned bank_num = mcg_cap & 0xff, bank; | ||
1691 | |||
1692 | r = -EINVAL; | ||
1693 | if (!bank_num) | ||
1694 | goto out; | ||
1695 | if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) | ||
1696 | goto out; | ||
1697 | r = 0; | ||
1698 | vcpu->arch.mcg_cap = mcg_cap; | ||
1699 | /* Init IA32_MCG_CTL to all 1s */ | ||
1700 | if (mcg_cap & MCG_CTL_P) | ||
1701 | vcpu->arch.mcg_ctl = ~(u64)0; | ||
1702 | /* Init IA32_MCi_CTL to all 1s */ | ||
1703 | for (bank = 0; bank < bank_num; bank++) | ||
1704 | vcpu->arch.mce_banks[bank*4] = ~(u64)0; | ||
1705 | out: | ||
1706 | return r; | ||
1707 | } | ||
1708 | |||
1709 | static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, | ||
1710 | struct kvm_x86_mce *mce) | ||
1711 | { | ||
1712 | u64 mcg_cap = vcpu->arch.mcg_cap; | ||
1713 | unsigned bank_num = mcg_cap & 0xff; | ||
1714 | u64 *banks = vcpu->arch.mce_banks; | ||
1715 | |||
1716 | if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) | ||
1717 | return -EINVAL; | ||
1718 | /* | ||
1719 | * if IA32_MCG_CTL is not all 1s, the uncorrected error | ||
1720 | * reporting is disabled | ||
1721 | */ | ||
1722 | if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && | ||
1723 | vcpu->arch.mcg_ctl != ~(u64)0) | ||
1724 | return 0; | ||
1725 | banks += 4 * mce->bank; | ||
1726 | /* | ||
1727 | * if IA32_MCi_CTL is not all 1s, the uncorrected error | ||
1728 | * reporting is disabled for the bank | ||
1729 | */ | ||
1730 | if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) | ||
1731 | return 0; | ||
1732 | if (mce->status & MCI_STATUS_UC) { | ||
1733 | if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || | ||
1734 | !(vcpu->arch.cr4 & X86_CR4_MCE)) { | ||
1735 | printk(KERN_DEBUG "kvm: set_mce: " | ||
1736 | "injects mce exception while " | ||
1737 | "previous one is in progress!\n"); | ||
1738 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
1739 | return 0; | ||
1740 | } | ||
1741 | if (banks[1] & MCI_STATUS_VAL) | ||
1742 | mce->status |= MCI_STATUS_OVER; | ||
1743 | banks[2] = mce->addr; | ||
1744 | banks[3] = mce->misc; | ||
1745 | vcpu->arch.mcg_status = mce->mcg_status; | ||
1746 | banks[1] = mce->status; | ||
1747 | kvm_queue_exception(vcpu, MC_VECTOR); | ||
1748 | } else if (!(banks[1] & MCI_STATUS_VAL) | ||
1749 | || !(banks[1] & MCI_STATUS_UC)) { | ||
1750 | if (banks[1] & MCI_STATUS_VAL) | ||
1751 | mce->status |= MCI_STATUS_OVER; | ||
1752 | banks[2] = mce->addr; | ||
1753 | banks[3] = mce->misc; | ||
1754 | banks[1] = mce->status; | ||
1755 | } else | ||
1756 | banks[1] |= MCI_STATUS_OVER; | ||
1757 | return 0; | ||
1758 | } | ||
1759 | |||
1506 | long kvm_arch_vcpu_ioctl(struct file *filp, | 1760 | long kvm_arch_vcpu_ioctl(struct file *filp, |
1507 | unsigned int ioctl, unsigned long arg) | 1761 | unsigned int ioctl, unsigned long arg) |
1508 | { | 1762 | { |
@@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1636 | kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); | 1890 | kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); |
1637 | break; | 1891 | break; |
1638 | } | 1892 | } |
1893 | case KVM_X86_SETUP_MCE: { | ||
1894 | u64 mcg_cap; | ||
1895 | |||
1896 | r = -EFAULT; | ||
1897 | if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) | ||
1898 | goto out; | ||
1899 | r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); | ||
1900 | break; | ||
1901 | } | ||
1902 | case KVM_X86_SET_MCE: { | ||
1903 | struct kvm_x86_mce mce; | ||
1904 | |||
1905 | r = -EFAULT; | ||
1906 | if (copy_from_user(&mce, argp, sizeof mce)) | ||
1907 | goto out; | ||
1908 | r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); | ||
1909 | break; | ||
1910 | } | ||
1639 | default: | 1911 | default: |
1640 | r = -EINVAL; | 1912 | r = -EINVAL; |
1641 | } | 1913 | } |
@@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | |||
1654 | return ret; | 1926 | return ret; |
1655 | } | 1927 | } |
1656 | 1928 | ||
1929 | static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, | ||
1930 | u64 ident_addr) | ||
1931 | { | ||
1932 | kvm->arch.ept_identity_map_addr = ident_addr; | ||
1933 | return 0; | ||
1934 | } | ||
1935 | |||
1657 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | 1936 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, |
1658 | u32 kvm_nr_mmu_pages) | 1937 | u32 kvm_nr_mmu_pages) |
1659 | { | 1938 | { |
@@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
1775 | r = 0; | 2054 | r = 0; |
1776 | switch (chip->chip_id) { | 2055 | switch (chip->chip_id) { |
1777 | case KVM_IRQCHIP_PIC_MASTER: | 2056 | case KVM_IRQCHIP_PIC_MASTER: |
2057 | spin_lock(&pic_irqchip(kvm)->lock); | ||
1778 | memcpy(&pic_irqchip(kvm)->pics[0], | 2058 | memcpy(&pic_irqchip(kvm)->pics[0], |
1779 | &chip->chip.pic, | 2059 | &chip->chip.pic, |
1780 | sizeof(struct kvm_pic_state)); | 2060 | sizeof(struct kvm_pic_state)); |
2061 | spin_unlock(&pic_irqchip(kvm)->lock); | ||
1781 | break; | 2062 | break; |
1782 | case KVM_IRQCHIP_PIC_SLAVE: | 2063 | case KVM_IRQCHIP_PIC_SLAVE: |
2064 | spin_lock(&pic_irqchip(kvm)->lock); | ||
1783 | memcpy(&pic_irqchip(kvm)->pics[1], | 2065 | memcpy(&pic_irqchip(kvm)->pics[1], |
1784 | &chip->chip.pic, | 2066 | &chip->chip.pic, |
1785 | sizeof(struct kvm_pic_state)); | 2067 | sizeof(struct kvm_pic_state)); |
2068 | spin_unlock(&pic_irqchip(kvm)->lock); | ||
1786 | break; | 2069 | break; |
1787 | case KVM_IRQCHIP_IOAPIC: | 2070 | case KVM_IRQCHIP_IOAPIC: |
2071 | mutex_lock(&kvm->irq_lock); | ||
1788 | memcpy(ioapic_irqchip(kvm), | 2072 | memcpy(ioapic_irqchip(kvm), |
1789 | &chip->chip.ioapic, | 2073 | &chip->chip.ioapic, |
1790 | sizeof(struct kvm_ioapic_state)); | 2074 | sizeof(struct kvm_ioapic_state)); |
2075 | mutex_unlock(&kvm->irq_lock); | ||
1791 | break; | 2076 | break; |
1792 | default: | 2077 | default: |
1793 | r = -EINVAL; | 2078 | r = -EINVAL; |
@@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) | |||
1801 | { | 2086 | { |
1802 | int r = 0; | 2087 | int r = 0; |
1803 | 2088 | ||
2089 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
1804 | memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); | 2090 | memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); |
2091 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
1805 | return r; | 2092 | return r; |
1806 | } | 2093 | } |
1807 | 2094 | ||
@@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) | |||
1809 | { | 2096 | { |
1810 | int r = 0; | 2097 | int r = 0; |
1811 | 2098 | ||
2099 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
1812 | memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); | 2100 | memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); |
1813 | kvm_pit_load_count(kvm, 0, ps->channels[0].count); | 2101 | kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); |
2102 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
2103 | return r; | ||
2104 | } | ||
2105 | |||
2106 | static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) | ||
2107 | { | ||
2108 | int r = 0; | ||
2109 | |||
2110 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
2111 | memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, | ||
2112 | sizeof(ps->channels)); | ||
2113 | ps->flags = kvm->arch.vpit->pit_state.flags; | ||
2114 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
2115 | return r; | ||
2116 | } | ||
2117 | |||
2118 | static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) | ||
2119 | { | ||
2120 | int r = 0, start = 0; | ||
2121 | u32 prev_legacy, cur_legacy; | ||
2122 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
2123 | prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; | ||
2124 | cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; | ||
2125 | if (!prev_legacy && cur_legacy) | ||
2126 | start = 1; | ||
2127 | memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, | ||
2128 | sizeof(kvm->arch.vpit->pit_state.channels)); | ||
2129 | kvm->arch.vpit->pit_state.flags = ps->flags; | ||
2130 | kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); | ||
2131 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
1814 | return r; | 2132 | return r; |
1815 | } | 2133 | } |
1816 | 2134 | ||
@@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, | |||
1819 | { | 2137 | { |
1820 | if (!kvm->arch.vpit) | 2138 | if (!kvm->arch.vpit) |
1821 | return -ENXIO; | 2139 | return -ENXIO; |
2140 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
1822 | kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; | 2141 | kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; |
2142 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
1823 | return 0; | 2143 | return 0; |
1824 | } | 2144 | } |
1825 | 2145 | ||
@@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | |||
1845 | spin_lock(&kvm->mmu_lock); | 2165 | spin_lock(&kvm->mmu_lock); |
1846 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 2166 | kvm_mmu_slot_remove_write_access(kvm, log->slot); |
1847 | spin_unlock(&kvm->mmu_lock); | 2167 | spin_unlock(&kvm->mmu_lock); |
1848 | kvm_flush_remote_tlbs(kvm); | ||
1849 | memslot = &kvm->memslots[log->slot]; | 2168 | memslot = &kvm->memslots[log->slot]; |
1850 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | 2169 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; |
1851 | memset(memslot->dirty_bitmap, 0, n); | 2170 | memset(memslot->dirty_bitmap, 0, n); |
@@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1869 | */ | 2188 | */ |
1870 | union { | 2189 | union { |
1871 | struct kvm_pit_state ps; | 2190 | struct kvm_pit_state ps; |
2191 | struct kvm_pit_state2 ps2; | ||
1872 | struct kvm_memory_alias alias; | 2192 | struct kvm_memory_alias alias; |
2193 | struct kvm_pit_config pit_config; | ||
1873 | } u; | 2194 | } u; |
1874 | 2195 | ||
1875 | switch (ioctl) { | 2196 | switch (ioctl) { |
@@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1878 | if (r < 0) | 2199 | if (r < 0) |
1879 | goto out; | 2200 | goto out; |
1880 | break; | 2201 | break; |
2202 | case KVM_SET_IDENTITY_MAP_ADDR: { | ||
2203 | u64 ident_addr; | ||
2204 | |||
2205 | r = -EFAULT; | ||
2206 | if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) | ||
2207 | goto out; | ||
2208 | r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); | ||
2209 | if (r < 0) | ||
2210 | goto out; | ||
2211 | break; | ||
2212 | } | ||
1881 | case KVM_SET_MEMORY_REGION: { | 2213 | case KVM_SET_MEMORY_REGION: { |
1882 | struct kvm_memory_region kvm_mem; | 2214 | struct kvm_memory_region kvm_mem; |
1883 | struct kvm_userspace_memory_region kvm_userspace_mem; | 2215 | struct kvm_userspace_memory_region kvm_userspace_mem; |
@@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1930 | } | 2262 | } |
1931 | break; | 2263 | break; |
1932 | case KVM_CREATE_PIT: | 2264 | case KVM_CREATE_PIT: |
1933 | mutex_lock(&kvm->lock); | 2265 | u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; |
2266 | goto create_pit; | ||
2267 | case KVM_CREATE_PIT2: | ||
2268 | r = -EFAULT; | ||
2269 | if (copy_from_user(&u.pit_config, argp, | ||
2270 | sizeof(struct kvm_pit_config))) | ||
2271 | goto out; | ||
2272 | create_pit: | ||
2273 | down_write(&kvm->slots_lock); | ||
1934 | r = -EEXIST; | 2274 | r = -EEXIST; |
1935 | if (kvm->arch.vpit) | 2275 | if (kvm->arch.vpit) |
1936 | goto create_pit_unlock; | 2276 | goto create_pit_unlock; |
1937 | r = -ENOMEM; | 2277 | r = -ENOMEM; |
1938 | kvm->arch.vpit = kvm_create_pit(kvm); | 2278 | kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); |
1939 | if (kvm->arch.vpit) | 2279 | if (kvm->arch.vpit) |
1940 | r = 0; | 2280 | r = 0; |
1941 | create_pit_unlock: | 2281 | create_pit_unlock: |
1942 | mutex_unlock(&kvm->lock); | 2282 | up_write(&kvm->slots_lock); |
1943 | break; | 2283 | break; |
1944 | case KVM_IRQ_LINE_STATUS: | 2284 | case KVM_IRQ_LINE_STATUS: |
1945 | case KVM_IRQ_LINE: { | 2285 | case KVM_IRQ_LINE: { |
@@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1950 | goto out; | 2290 | goto out; |
1951 | if (irqchip_in_kernel(kvm)) { | 2291 | if (irqchip_in_kernel(kvm)) { |
1952 | __s32 status; | 2292 | __s32 status; |
1953 | mutex_lock(&kvm->lock); | 2293 | mutex_lock(&kvm->irq_lock); |
1954 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, | 2294 | status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, |
1955 | irq_event.irq, irq_event.level); | 2295 | irq_event.irq, irq_event.level); |
1956 | mutex_unlock(&kvm->lock); | 2296 | mutex_unlock(&kvm->irq_lock); |
1957 | if (ioctl == KVM_IRQ_LINE_STATUS) { | 2297 | if (ioctl == KVM_IRQ_LINE_STATUS) { |
1958 | irq_event.status = status; | 2298 | irq_event.status = status; |
1959 | if (copy_to_user(argp, &irq_event, | 2299 | if (copy_to_user(argp, &irq_event, |
@@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
2042 | r = 0; | 2382 | r = 0; |
2043 | break; | 2383 | break; |
2044 | } | 2384 | } |
2385 | case KVM_GET_PIT2: { | ||
2386 | r = -ENXIO; | ||
2387 | if (!kvm->arch.vpit) | ||
2388 | goto out; | ||
2389 | r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); | ||
2390 | if (r) | ||
2391 | goto out; | ||
2392 | r = -EFAULT; | ||
2393 | if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) | ||
2394 | goto out; | ||
2395 | r = 0; | ||
2396 | break; | ||
2397 | } | ||
2398 | case KVM_SET_PIT2: { | ||
2399 | r = -EFAULT; | ||
2400 | if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) | ||
2401 | goto out; | ||
2402 | r = -ENXIO; | ||
2403 | if (!kvm->arch.vpit) | ||
2404 | goto out; | ||
2405 | r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); | ||
2406 | if (r) | ||
2407 | goto out; | ||
2408 | r = 0; | ||
2409 | break; | ||
2410 | } | ||
2045 | case KVM_REINJECT_CONTROL: { | 2411 | case KVM_REINJECT_CONTROL: { |
2046 | struct kvm_reinject_control control; | 2412 | struct kvm_reinject_control control; |
2047 | r = -EFAULT; | 2413 | r = -EFAULT; |
@@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void) | |||
2075 | num_msrs_to_save = j; | 2441 | num_msrs_to_save = j; |
2076 | } | 2442 | } |
2077 | 2443 | ||
2078 | /* | 2444 | static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, |
2079 | * Only apic need an MMIO device hook, so shortcut now.. | 2445 | const void *v) |
2080 | */ | ||
2081 | static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | ||
2082 | gpa_t addr, int len, | ||
2083 | int is_write) | ||
2084 | { | 2446 | { |
2085 | struct kvm_io_device *dev; | 2447 | if (vcpu->arch.apic && |
2448 | !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) | ||
2449 | return 0; | ||
2086 | 2450 | ||
2087 | if (vcpu->arch.apic) { | 2451 | return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); |
2088 | dev = &vcpu->arch.apic->dev; | ||
2089 | if (dev->in_range(dev, addr, len, is_write)) | ||
2090 | return dev; | ||
2091 | } | ||
2092 | return NULL; | ||
2093 | } | 2452 | } |
2094 | 2453 | ||
2095 | 2454 | static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) | |
2096 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | ||
2097 | gpa_t addr, int len, | ||
2098 | int is_write) | ||
2099 | { | 2455 | { |
2100 | struct kvm_io_device *dev; | 2456 | if (vcpu->arch.apic && |
2457 | !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) | ||
2458 | return 0; | ||
2101 | 2459 | ||
2102 | dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); | 2460 | return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); |
2103 | if (dev == NULL) | ||
2104 | dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, | ||
2105 | is_write); | ||
2106 | return dev; | ||
2107 | } | 2461 | } |
2108 | 2462 | ||
2109 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, | 2463 | static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, |
@@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr, | |||
2172 | unsigned int bytes, | 2526 | unsigned int bytes, |
2173 | struct kvm_vcpu *vcpu) | 2527 | struct kvm_vcpu *vcpu) |
2174 | { | 2528 | { |
2175 | struct kvm_io_device *mmio_dev; | ||
2176 | gpa_t gpa; | 2529 | gpa_t gpa; |
2177 | 2530 | ||
2178 | if (vcpu->mmio_read_completed) { | 2531 | if (vcpu->mmio_read_completed) { |
2179 | memcpy(val, vcpu->mmio_data, bytes); | 2532 | memcpy(val, vcpu->mmio_data, bytes); |
2533 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, | ||
2534 | vcpu->mmio_phys_addr, *(u64 *)val); | ||
2180 | vcpu->mmio_read_completed = 0; | 2535 | vcpu->mmio_read_completed = 0; |
2181 | return X86EMUL_CONTINUE; | 2536 | return X86EMUL_CONTINUE; |
2182 | } | 2537 | } |
@@ -2197,14 +2552,12 @@ mmio: | |||
2197 | /* | 2552 | /* |
2198 | * Is this MMIO handled locally? | 2553 | * Is this MMIO handled locally? |
2199 | */ | 2554 | */ |
2200 | mutex_lock(&vcpu->kvm->lock); | 2555 | if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { |
2201 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); | 2556 | trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); |
2202 | if (mmio_dev) { | ||
2203 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); | ||
2204 | mutex_unlock(&vcpu->kvm->lock); | ||
2205 | return X86EMUL_CONTINUE; | 2557 | return X86EMUL_CONTINUE; |
2206 | } | 2558 | } |
2207 | mutex_unlock(&vcpu->kvm->lock); | 2559 | |
2560 | trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); | ||
2208 | 2561 | ||
2209 | vcpu->mmio_needed = 1; | 2562 | vcpu->mmio_needed = 1; |
2210 | vcpu->mmio_phys_addr = gpa; | 2563 | vcpu->mmio_phys_addr = gpa; |
@@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
2231 | unsigned int bytes, | 2584 | unsigned int bytes, |
2232 | struct kvm_vcpu *vcpu) | 2585 | struct kvm_vcpu *vcpu) |
2233 | { | 2586 | { |
2234 | struct kvm_io_device *mmio_dev; | ||
2235 | gpa_t gpa; | 2587 | gpa_t gpa; |
2236 | 2588 | ||
2237 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | 2589 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); |
@@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
2249 | return X86EMUL_CONTINUE; | 2601 | return X86EMUL_CONTINUE; |
2250 | 2602 | ||
2251 | mmio: | 2603 | mmio: |
2604 | trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); | ||
2252 | /* | 2605 | /* |
2253 | * Is this MMIO handled locally? | 2606 | * Is this MMIO handled locally? |
2254 | */ | 2607 | */ |
2255 | mutex_lock(&vcpu->kvm->lock); | 2608 | if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) |
2256 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); | ||
2257 | if (mmio_dev) { | ||
2258 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); | ||
2259 | mutex_unlock(&vcpu->kvm->lock); | ||
2260 | return X86EMUL_CONTINUE; | 2609 | return X86EMUL_CONTINUE; |
2261 | } | ||
2262 | mutex_unlock(&vcpu->kvm->lock); | ||
2263 | 2610 | ||
2264 | vcpu->mmio_needed = 1; | 2611 | vcpu->mmio_needed = 1; |
2265 | vcpu->mmio_phys_addr = gpa; | 2612 | vcpu->mmio_phys_addr = gpa; |
@@ -2297,12 +2644,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
2297 | unsigned int bytes, | 2644 | unsigned int bytes, |
2298 | struct kvm_vcpu *vcpu) | 2645 | struct kvm_vcpu *vcpu) |
2299 | { | 2646 | { |
2300 | static int reported; | 2647 | printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); |
2301 | |||
2302 | if (!reported) { | ||
2303 | reported = 1; | ||
2304 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); | ||
2305 | } | ||
2306 | #ifndef CONFIG_X86_64 | 2648 | #ifndef CONFIG_X86_64 |
2307 | /* guests cmpxchg8b have to be emulated atomically */ | 2649 | /* guests cmpxchg8b have to be emulated atomically */ |
2308 | if (bytes == 8) { | 2650 | if (bytes == 8) { |
@@ -2348,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | |||
2348 | 2690 | ||
2349 | int emulate_clts(struct kvm_vcpu *vcpu) | 2691 | int emulate_clts(struct kvm_vcpu *vcpu) |
2350 | { | 2692 | { |
2351 | KVMTRACE_0D(CLTS, vcpu, handler); | ||
2352 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); | 2693 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); |
2353 | return X86EMUL_CONTINUE; | 2694 | return X86EMUL_CONTINUE; |
2354 | } | 2695 | } |
@@ -2425,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2425 | kvm_clear_exception_queue(vcpu); | 2766 | kvm_clear_exception_queue(vcpu); |
2426 | vcpu->arch.mmio_fault_cr2 = cr2; | 2767 | vcpu->arch.mmio_fault_cr2 = cr2; |
2427 | /* | 2768 | /* |
2428 | * TODO: fix x86_emulate.c to use guest_read/write_register | 2769 | * TODO: fix emulate.c to use guest_read/write_register |
2429 | * instead of direct ->regs accesses, can save hundred cycles | 2770 | * instead of direct ->regs accesses, can save hundred cycles |
2430 | * on Intel for instructions that don't read/change RSP, for | 2771 | * on Intel for instructions that don't read/change RSP, for |
2431 | * for example. | 2772 | * for example. |
@@ -2449,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2449 | 2790 | ||
2450 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | 2791 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
2451 | 2792 | ||
2452 | /* Reject the instructions other than VMCALL/VMMCALL when | 2793 | /* Only allow emulation of specific instructions on #UD |
2453 | * try to emulate invalid opcode */ | 2794 | * (namely VMMCALL, sysenter, sysexit, syscall)*/ |
2454 | c = &vcpu->arch.emulate_ctxt.decode; | 2795 | c = &vcpu->arch.emulate_ctxt.decode; |
2455 | if ((emulation_type & EMULTYPE_TRAP_UD) && | 2796 | if (emulation_type & EMULTYPE_TRAP_UD) { |
2456 | (!(c->twobyte && c->b == 0x01 && | 2797 | if (!c->twobyte) |
2457 | (c->modrm_reg == 0 || c->modrm_reg == 3) && | 2798 | return EMULATE_FAIL; |
2458 | c->modrm_mod == 3 && c->modrm_rm == 1))) | 2799 | switch (c->b) { |
2459 | return EMULATE_FAIL; | 2800 | case 0x01: /* VMMCALL */ |
2801 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
2802 | return EMULATE_FAIL; | ||
2803 | break; | ||
2804 | case 0x34: /* sysenter */ | ||
2805 | case 0x35: /* sysexit */ | ||
2806 | if (c->modrm_mod != 0 || c->modrm_rm != 0) | ||
2807 | return EMULATE_FAIL; | ||
2808 | break; | ||
2809 | case 0x05: /* syscall */ | ||
2810 | if (c->modrm_mod != 0 || c->modrm_rm != 0) | ||
2811 | return EMULATE_FAIL; | ||
2812 | break; | ||
2813 | default: | ||
2814 | return EMULATE_FAIL; | ||
2815 | } | ||
2816 | |||
2817 | if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) | ||
2818 | return EMULATE_FAIL; | ||
2819 | } | ||
2460 | 2820 | ||
2461 | ++vcpu->stat.insn_emulation; | 2821 | ++vcpu->stat.insn_emulation; |
2462 | if (r) { | 2822 | if (r) { |
@@ -2576,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2576 | return 0; | 2936 | return 0; |
2577 | } | 2937 | } |
2578 | 2938 | ||
2579 | static void kernel_pio(struct kvm_io_device *pio_dev, | 2939 | static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) |
2580 | struct kvm_vcpu *vcpu, | ||
2581 | void *pd) | ||
2582 | { | 2940 | { |
2583 | /* TODO: String I/O for in kernel device */ | 2941 | /* TODO: String I/O for in kernel device */ |
2942 | int r; | ||
2584 | 2943 | ||
2585 | mutex_lock(&vcpu->kvm->lock); | ||
2586 | if (vcpu->arch.pio.in) | 2944 | if (vcpu->arch.pio.in) |
2587 | kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, | 2945 | r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, |
2588 | vcpu->arch.pio.size, | 2946 | vcpu->arch.pio.size, pd); |
2589 | pd); | ||
2590 | else | 2947 | else |
2591 | kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, | 2948 | r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, |
2592 | vcpu->arch.pio.size, | 2949 | vcpu->arch.pio.size, pd); |
2593 | pd); | 2950 | return r; |
2594 | mutex_unlock(&vcpu->kvm->lock); | ||
2595 | } | 2951 | } |
2596 | 2952 | ||
2597 | static void pio_string_write(struct kvm_io_device *pio_dev, | 2953 | static int pio_string_write(struct kvm_vcpu *vcpu) |
2598 | struct kvm_vcpu *vcpu) | ||
2599 | { | 2954 | { |
2600 | struct kvm_pio_request *io = &vcpu->arch.pio; | 2955 | struct kvm_pio_request *io = &vcpu->arch.pio; |
2601 | void *pd = vcpu->arch.pio_data; | 2956 | void *pd = vcpu->arch.pio_data; |
2602 | int i; | 2957 | int i, r = 0; |
2603 | 2958 | ||
2604 | mutex_lock(&vcpu->kvm->lock); | ||
2605 | for (i = 0; i < io->cur_count; i++) { | 2959 | for (i = 0; i < io->cur_count; i++) { |
2606 | kvm_iodevice_write(pio_dev, io->port, | 2960 | if (kvm_io_bus_write(&vcpu->kvm->pio_bus, |
2607 | io->size, | 2961 | io->port, io->size, pd)) { |
2608 | pd); | 2962 | r = -EOPNOTSUPP; |
2963 | break; | ||
2964 | } | ||
2609 | pd += io->size; | 2965 | pd += io->size; |
2610 | } | 2966 | } |
2611 | mutex_unlock(&vcpu->kvm->lock); | 2967 | return r; |
2612 | } | ||
2613 | |||
2614 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, | ||
2615 | gpa_t addr, int len, | ||
2616 | int is_write) | ||
2617 | { | ||
2618 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); | ||
2619 | } | 2968 | } |
2620 | 2969 | ||
2621 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 2970 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, |
2622 | int size, unsigned port) | 2971 | int size, unsigned port) |
2623 | { | 2972 | { |
2624 | struct kvm_io_device *pio_dev; | ||
2625 | unsigned long val; | 2973 | unsigned long val; |
2626 | 2974 | ||
2627 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2975 | vcpu->run->exit_reason = KVM_EXIT_IO; |
@@ -2635,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2635 | vcpu->arch.pio.down = 0; | 2983 | vcpu->arch.pio.down = 0; |
2636 | vcpu->arch.pio.rep = 0; | 2984 | vcpu->arch.pio.rep = 0; |
2637 | 2985 | ||
2638 | if (vcpu->run->io.direction == KVM_EXIT_IO_IN) | 2986 | trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, |
2639 | KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, | 2987 | size, 1); |
2640 | handler); | ||
2641 | else | ||
2642 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | ||
2643 | handler); | ||
2644 | 2988 | ||
2645 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); | 2989 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2646 | memcpy(vcpu->arch.pio_data, &val, 4); | 2990 | memcpy(vcpu->arch.pio_data, &val, 4); |
2647 | 2991 | ||
2648 | pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); | 2992 | if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { |
2649 | if (pio_dev) { | ||
2650 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); | ||
2651 | complete_pio(vcpu); | 2993 | complete_pio(vcpu); |
2652 | return 1; | 2994 | return 1; |
2653 | } | 2995 | } |
@@ -2661,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2661 | { | 3003 | { |
2662 | unsigned now, in_page; | 3004 | unsigned now, in_page; |
2663 | int ret = 0; | 3005 | int ret = 0; |
2664 | struct kvm_io_device *pio_dev; | ||
2665 | 3006 | ||
2666 | vcpu->run->exit_reason = KVM_EXIT_IO; | 3007 | vcpu->run->exit_reason = KVM_EXIT_IO; |
2667 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 3008 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
@@ -2674,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2674 | vcpu->arch.pio.down = down; | 3015 | vcpu->arch.pio.down = down; |
2675 | vcpu->arch.pio.rep = rep; | 3016 | vcpu->arch.pio.rep = rep; |
2676 | 3017 | ||
2677 | if (vcpu->run->io.direction == KVM_EXIT_IO_IN) | 3018 | trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, |
2678 | KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, | 3019 | size, count); |
2679 | handler); | ||
2680 | else | ||
2681 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | ||
2682 | handler); | ||
2683 | 3020 | ||
2684 | if (!count) { | 3021 | if (!count) { |
2685 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3022 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
@@ -2709,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2709 | 3046 | ||
2710 | vcpu->arch.pio.guest_gva = address; | 3047 | vcpu->arch.pio.guest_gva = address; |
2711 | 3048 | ||
2712 | pio_dev = vcpu_find_pio_dev(vcpu, port, | ||
2713 | vcpu->arch.pio.cur_count, | ||
2714 | !vcpu->arch.pio.in); | ||
2715 | if (!vcpu->arch.pio.in) { | 3049 | if (!vcpu->arch.pio.in) { |
2716 | /* string PIO write */ | 3050 | /* string PIO write */ |
2717 | ret = pio_copy_data(vcpu); | 3051 | ret = pio_copy_data(vcpu); |
@@ -2719,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2719 | kvm_inject_gp(vcpu, 0); | 3053 | kvm_inject_gp(vcpu, 0); |
2720 | return 1; | 3054 | return 1; |
2721 | } | 3055 | } |
2722 | if (ret == 0 && pio_dev) { | 3056 | if (ret == 0 && !pio_string_write(vcpu)) { |
2723 | pio_string_write(pio_dev, vcpu); | ||
2724 | complete_pio(vcpu); | 3057 | complete_pio(vcpu); |
2725 | if (vcpu->arch.pio.count == 0) | 3058 | if (vcpu->arch.pio.count == 0) |
2726 | ret = 1; | 3059 | ret = 1; |
2727 | } | 3060 | } |
2728 | } else if (pio_dev) | 3061 | } |
2729 | pr_unimpl(vcpu, "no string pio read support yet, " | 3062 | /* no string PIO read support yet */ |
2730 | "port %x size %d count %ld\n", | ||
2731 | port, size, count); | ||
2732 | 3063 | ||
2733 | return ret; | 3064 | return ret; |
2734 | } | 3065 | } |
@@ -2761,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
2761 | 3092 | ||
2762 | spin_lock(&kvm_lock); | 3093 | spin_lock(&kvm_lock); |
2763 | list_for_each_entry(kvm, &vm_list, vm_list) { | 3094 | list_for_each_entry(kvm, &vm_list, vm_list) { |
2764 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 3095 | kvm_for_each_vcpu(i, vcpu, kvm) { |
2765 | vcpu = kvm->vcpus[i]; | ||
2766 | if (!vcpu) | ||
2767 | continue; | ||
2768 | if (vcpu->cpu != freq->cpu) | 3096 | if (vcpu->cpu != freq->cpu) |
2769 | continue; | 3097 | continue; |
2770 | if (!kvm_request_guest_time_update(vcpu)) | 3098 | if (!kvm_request_guest_time_update(vcpu)) |
@@ -2857,7 +3185,6 @@ void kvm_arch_exit(void) | |||
2857 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | 3185 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) |
2858 | { | 3186 | { |
2859 | ++vcpu->stat.halt_exits; | 3187 | ++vcpu->stat.halt_exits; |
2860 | KVMTRACE_0D(HLT, vcpu, handler); | ||
2861 | if (irqchip_in_kernel(vcpu->kvm)) { | 3188 | if (irqchip_in_kernel(vcpu->kvm)) { |
2862 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; | 3189 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; |
2863 | return 1; | 3190 | return 1; |
@@ -2888,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2888 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); | 3215 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); |
2889 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); | 3216 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); |
2890 | 3217 | ||
2891 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); | 3218 | trace_kvm_hypercall(nr, a0, a1, a2, a3); |
2892 | 3219 | ||
2893 | if (!is_long_mode(vcpu)) { | 3220 | if (!is_long_mode(vcpu)) { |
2894 | nr &= 0xFFFFFFFF; | 3221 | nr &= 0xFFFFFFFF; |
@@ -2898,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2898 | a3 &= 0xFFFFFFFF; | 3225 | a3 &= 0xFFFFFFFF; |
2899 | } | 3226 | } |
2900 | 3227 | ||
3228 | if (kvm_x86_ops->get_cpl(vcpu) != 0) { | ||
3229 | ret = -KVM_EPERM; | ||
3230 | goto out; | ||
3231 | } | ||
3232 | |||
2901 | switch (nr) { | 3233 | switch (nr) { |
2902 | case KVM_HC_VAPIC_POLL_IRQ: | 3234 | case KVM_HC_VAPIC_POLL_IRQ: |
2903 | ret = 0; | 3235 | ret = 0; |
@@ -2909,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2909 | ret = -KVM_ENOSYS; | 3241 | ret = -KVM_ENOSYS; |
2910 | break; | 3242 | break; |
2911 | } | 3243 | } |
3244 | out: | ||
2912 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); | 3245 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); |
2913 | ++vcpu->stat.hypercalls; | 3246 | ++vcpu->stat.hypercalls; |
2914 | return r; | 3247 | return r; |
@@ -2988,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | |||
2988 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); | 3321 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
2989 | return 0; | 3322 | return 0; |
2990 | } | 3323 | } |
2991 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, | ||
2992 | (u32)((u64)value >> 32), handler); | ||
2993 | 3324 | ||
2994 | return value; | 3325 | return value; |
2995 | } | 3326 | } |
@@ -2997,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | |||
2997 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | 3328 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, |
2998 | unsigned long *rflags) | 3329 | unsigned long *rflags) |
2999 | { | 3330 | { |
3000 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, | ||
3001 | (u32)((u64)val >> 32), handler); | ||
3002 | |||
3003 | switch (cr) { | 3331 | switch (cr) { |
3004 | case 0: | 3332 | case 0: |
3005 | kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); | 3333 | kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); |
@@ -3109,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
3109 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); | 3437 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); |
3110 | } | 3438 | } |
3111 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3439 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
3112 | KVMTRACE_5D(CPUID, vcpu, function, | 3440 | trace_kvm_cpuid(function, |
3113 | (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), | 3441 | kvm_register_read(vcpu, VCPU_REGS_RAX), |
3114 | (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), | 3442 | kvm_register_read(vcpu, VCPU_REGS_RBX), |
3115 | (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), | 3443 | kvm_register_read(vcpu, VCPU_REGS_RCX), |
3116 | (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); | 3444 | kvm_register_read(vcpu, VCPU_REGS_RDX)); |
3117 | } | 3445 | } |
3118 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 3446 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
3119 | 3447 | ||
@@ -3179,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) | |||
3179 | if (!kvm_x86_ops->update_cr8_intercept) | 3507 | if (!kvm_x86_ops->update_cr8_intercept) |
3180 | return; | 3508 | return; |
3181 | 3509 | ||
3510 | if (!vcpu->arch.apic) | ||
3511 | return; | ||
3512 | |||
3182 | if (!vcpu->arch.apic->vapic_addr) | 3513 | if (!vcpu->arch.apic->vapic_addr) |
3183 | max_irr = kvm_lapic_find_highest_irr(vcpu); | 3514 | max_irr = kvm_lapic_find_highest_irr(vcpu); |
3184 | else | 3515 | else |
@@ -3192,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) | |||
3192 | kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); | 3523 | kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); |
3193 | } | 3524 | } |
3194 | 3525 | ||
3195 | static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3526 | static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
3196 | { | 3527 | { |
3197 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) | ||
3198 | kvm_x86_ops->set_interrupt_shadow(vcpu, 0); | ||
3199 | |||
3200 | /* try to reinject previous events if any */ | 3528 | /* try to reinject previous events if any */ |
3529 | if (vcpu->arch.exception.pending) { | ||
3530 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, | ||
3531 | vcpu->arch.exception.has_error_code, | ||
3532 | vcpu->arch.exception.error_code); | ||
3533 | return; | ||
3534 | } | ||
3535 | |||
3201 | if (vcpu->arch.nmi_injected) { | 3536 | if (vcpu->arch.nmi_injected) { |
3202 | kvm_x86_ops->set_nmi(vcpu); | 3537 | kvm_x86_ops->set_nmi(vcpu); |
3203 | return; | 3538 | return; |
@@ -3271,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3271 | smp_mb__after_clear_bit(); | 3606 | smp_mb__after_clear_bit(); |
3272 | 3607 | ||
3273 | if (vcpu->requests || need_resched() || signal_pending(current)) { | 3608 | if (vcpu->requests || need_resched() || signal_pending(current)) { |
3609 | set_bit(KVM_REQ_KICK, &vcpu->requests); | ||
3274 | local_irq_enable(); | 3610 | local_irq_enable(); |
3275 | preempt_enable(); | 3611 | preempt_enable(); |
3276 | r = 1; | 3612 | r = 1; |
3277 | goto out; | 3613 | goto out; |
3278 | } | 3614 | } |
3279 | 3615 | ||
3280 | if (vcpu->arch.exception.pending) | 3616 | inject_pending_event(vcpu, kvm_run); |
3281 | __queue_exception(vcpu); | ||
3282 | else | ||
3283 | inject_pending_irq(vcpu, kvm_run); | ||
3284 | 3617 | ||
3285 | /* enable NMI/IRQ window open exits if needed */ | 3618 | /* enable NMI/IRQ window open exits if needed */ |
3286 | if (vcpu->arch.nmi_pending) | 3619 | if (vcpu->arch.nmi_pending) |
@@ -3297,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3297 | 3630 | ||
3298 | kvm_guest_enter(); | 3631 | kvm_guest_enter(); |
3299 | 3632 | ||
3300 | get_debugreg(vcpu->arch.host_dr6, 6); | ||
3301 | get_debugreg(vcpu->arch.host_dr7, 7); | ||
3302 | if (unlikely(vcpu->arch.switch_db_regs)) { | 3633 | if (unlikely(vcpu->arch.switch_db_regs)) { |
3303 | get_debugreg(vcpu->arch.host_db[0], 0); | ||
3304 | get_debugreg(vcpu->arch.host_db[1], 1); | ||
3305 | get_debugreg(vcpu->arch.host_db[2], 2); | ||
3306 | get_debugreg(vcpu->arch.host_db[3], 3); | ||
3307 | |||
3308 | set_debugreg(0, 7); | 3634 | set_debugreg(0, 7); |
3309 | set_debugreg(vcpu->arch.eff_db[0], 0); | 3635 | set_debugreg(vcpu->arch.eff_db[0], 0); |
3310 | set_debugreg(vcpu->arch.eff_db[1], 1); | 3636 | set_debugreg(vcpu->arch.eff_db[1], 1); |
@@ -3312,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3312 | set_debugreg(vcpu->arch.eff_db[3], 3); | 3638 | set_debugreg(vcpu->arch.eff_db[3], 3); |
3313 | } | 3639 | } |
3314 | 3640 | ||
3315 | KVMTRACE_0D(VMENTRY, vcpu, entryexit); | 3641 | trace_kvm_entry(vcpu->vcpu_id); |
3316 | kvm_x86_ops->run(vcpu, kvm_run); | 3642 | kvm_x86_ops->run(vcpu, kvm_run); |
3317 | 3643 | ||
3318 | if (unlikely(vcpu->arch.switch_db_regs)) { | 3644 | if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { |
3319 | set_debugreg(0, 7); | 3645 | set_debugreg(current->thread.debugreg0, 0); |
3320 | set_debugreg(vcpu->arch.host_db[0], 0); | 3646 | set_debugreg(current->thread.debugreg1, 1); |
3321 | set_debugreg(vcpu->arch.host_db[1], 1); | 3647 | set_debugreg(current->thread.debugreg2, 2); |
3322 | set_debugreg(vcpu->arch.host_db[2], 2); | 3648 | set_debugreg(current->thread.debugreg3, 3); |
3323 | set_debugreg(vcpu->arch.host_db[3], 3); | 3649 | set_debugreg(current->thread.debugreg6, 6); |
3650 | set_debugreg(current->thread.debugreg7, 7); | ||
3324 | } | 3651 | } |
3325 | set_debugreg(vcpu->arch.host_dr6, 6); | ||
3326 | set_debugreg(vcpu->arch.host_dr7, 7); | ||
3327 | 3652 | ||
3328 | set_bit(KVM_REQ_KICK, &vcpu->requests); | 3653 | set_bit(KVM_REQ_KICK, &vcpu->requests); |
3329 | local_irq_enable(); | 3654 | local_irq_enable(); |
@@ -3653,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu, | |||
3653 | static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, | 3978 | static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, |
3654 | struct kvm_segment *kvm_desct) | 3979 | struct kvm_segment *kvm_desct) |
3655 | { | 3980 | { |
3656 | kvm_desct->base = seg_desc->base0; | 3981 | kvm_desct->base = get_desc_base(seg_desc); |
3657 | kvm_desct->base |= seg_desc->base1 << 16; | 3982 | kvm_desct->limit = get_desc_limit(seg_desc); |
3658 | kvm_desct->base |= seg_desc->base2 << 24; | ||
3659 | kvm_desct->limit = seg_desc->limit0; | ||
3660 | kvm_desct->limit |= seg_desc->limit << 16; | ||
3661 | if (seg_desc->g) { | 3983 | if (seg_desc->g) { |
3662 | kvm_desct->limit <<= 12; | 3984 | kvm_desct->limit <<= 12; |
3663 | kvm_desct->limit |= 0xfff; | 3985 | kvm_desct->limit |= 0xfff; |
@@ -3701,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, | |||
3701 | static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 4023 | static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
3702 | struct desc_struct *seg_desc) | 4024 | struct desc_struct *seg_desc) |
3703 | { | 4025 | { |
3704 | gpa_t gpa; | ||
3705 | struct descriptor_table dtable; | 4026 | struct descriptor_table dtable; |
3706 | u16 index = selector >> 3; | 4027 | u16 index = selector >> 3; |
3707 | 4028 | ||
@@ -3711,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | |||
3711 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); | 4032 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); |
3712 | return 1; | 4033 | return 1; |
3713 | } | 4034 | } |
3714 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); | 4035 | return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); |
3715 | gpa += index * 8; | ||
3716 | return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); | ||
3717 | } | 4036 | } |
3718 | 4037 | ||
3719 | /* allowed just for 8 bytes segments */ | 4038 | /* allowed just for 8 bytes segments */ |
3720 | static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 4039 | static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
3721 | struct desc_struct *seg_desc) | 4040 | struct desc_struct *seg_desc) |
3722 | { | 4041 | { |
3723 | gpa_t gpa; | ||
3724 | struct descriptor_table dtable; | 4042 | struct descriptor_table dtable; |
3725 | u16 index = selector >> 3; | 4043 | u16 index = selector >> 3; |
3726 | 4044 | ||
@@ -3728,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | |||
3728 | 4046 | ||
3729 | if (dtable.limit < index * 8 + 7) | 4047 | if (dtable.limit < index * 8 + 7) |
3730 | return 1; | 4048 | return 1; |
3731 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); | 4049 | return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); |
3732 | gpa += index * 8; | ||
3733 | return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); | ||
3734 | } | 4050 | } |
3735 | 4051 | ||
3736 | static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, | 4052 | static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, |
3737 | struct desc_struct *seg_desc) | 4053 | struct desc_struct *seg_desc) |
3738 | { | 4054 | { |
3739 | u32 base_addr; | 4055 | u32 base_addr = get_desc_base(seg_desc); |
3740 | |||
3741 | base_addr = seg_desc->base0; | ||
3742 | base_addr |= (seg_desc->base1 << 16); | ||
3743 | base_addr |= (seg_desc->base2 << 24); | ||
3744 | 4056 | ||
3745 | return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); | 4057 | return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); |
3746 | } | 4058 | } |
@@ -3785,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se | |||
3785 | return 0; | 4097 | return 0; |
3786 | } | 4098 | } |
3787 | 4099 | ||
4100 | static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) | ||
4101 | { | ||
4102 | return (seg != VCPU_SREG_LDTR) && | ||
4103 | (seg != VCPU_SREG_TR) && | ||
4104 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); | ||
4105 | } | ||
4106 | |||
3788 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 4107 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
3789 | int type_bits, int seg) | 4108 | int type_bits, int seg) |
3790 | { | 4109 | { |
3791 | struct kvm_segment kvm_seg; | 4110 | struct kvm_segment kvm_seg; |
3792 | 4111 | ||
3793 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) | 4112 | if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) |
3794 | return kvm_load_realmode_segment(vcpu, selector, seg); | 4113 | return kvm_load_realmode_segment(vcpu, selector, seg); |
3795 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) | 4114 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) |
3796 | return 1; | 4115 | return 1; |
@@ -4029,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
4029 | } | 4348 | } |
4030 | } | 4349 | } |
4031 | 4350 | ||
4032 | if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { | 4351 | if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { |
4033 | kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); | 4352 | kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); |
4034 | return 1; | 4353 | return 1; |
4035 | } | 4354 | } |
@@ -4099,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
4099 | 4418 | ||
4100 | vcpu->arch.cr2 = sregs->cr2; | 4419 | vcpu->arch.cr2 = sregs->cr2; |
4101 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 4420 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; |
4102 | 4421 | vcpu->arch.cr3 = sregs->cr3; | |
4103 | down_read(&vcpu->kvm->slots_lock); | ||
4104 | if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) | ||
4105 | vcpu->arch.cr3 = sregs->cr3; | ||
4106 | else | ||
4107 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
4108 | up_read(&vcpu->kvm->slots_lock); | ||
4109 | 4422 | ||
4110 | kvm_set_cr8(vcpu, sregs->cr8); | 4423 | kvm_set_cr8(vcpu, sregs->cr8); |
4111 | 4424 | ||
@@ -4147,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
4147 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | 4460 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); |
4148 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | 4461 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); |
4149 | 4462 | ||
4463 | update_cr8_intercept(vcpu); | ||
4464 | |||
4150 | /* Older userspace won't unhalt the vcpu on reset. */ | 4465 | /* Older userspace won't unhalt the vcpu on reset. */ |
4151 | if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && | 4466 | if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && |
4152 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && | 4467 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && |
4153 | !(vcpu->arch.cr0 & X86_CR0_PE)) | 4468 | !(vcpu->arch.cr0 & X86_CR0_PE)) |
4154 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 4469 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
@@ -4419,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
4419 | kvm = vcpu->kvm; | 4734 | kvm = vcpu->kvm; |
4420 | 4735 | ||
4421 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 4736 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
4422 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) | 4737 | if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) |
4423 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 4738 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
4424 | else | 4739 | else |
4425 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; | 4740 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; |
@@ -4441,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
4441 | goto fail_mmu_destroy; | 4756 | goto fail_mmu_destroy; |
4442 | } | 4757 | } |
4443 | 4758 | ||
4759 | vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, | ||
4760 | GFP_KERNEL); | ||
4761 | if (!vcpu->arch.mce_banks) { | ||
4762 | r = -ENOMEM; | ||
4763 | goto fail_mmu_destroy; | ||
4764 | } | ||
4765 | vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; | ||
4766 | |||
4444 | return 0; | 4767 | return 0; |
4445 | 4768 | ||
4446 | fail_mmu_destroy: | 4769 | fail_mmu_destroy: |
@@ -4488,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | |||
4488 | static void kvm_free_vcpus(struct kvm *kvm) | 4811 | static void kvm_free_vcpus(struct kvm *kvm) |
4489 | { | 4812 | { |
4490 | unsigned int i; | 4813 | unsigned int i; |
4814 | struct kvm_vcpu *vcpu; | ||
4491 | 4815 | ||
4492 | /* | 4816 | /* |
4493 | * Unpin any mmu pages first. | 4817 | * Unpin any mmu pages first. |
4494 | */ | 4818 | */ |
4495 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 4819 | kvm_for_each_vcpu(i, vcpu, kvm) |
4496 | if (kvm->vcpus[i]) | 4820 | kvm_unload_vcpu_mmu(vcpu); |
4497 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | 4821 | kvm_for_each_vcpu(i, vcpu, kvm) |
4498 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | 4822 | kvm_arch_vcpu_free(vcpu); |
4499 | if (kvm->vcpus[i]) { | 4823 | |
4500 | kvm_arch_vcpu_free(kvm->vcpus[i]); | 4824 | mutex_lock(&kvm->lock); |
4501 | kvm->vcpus[i] = NULL; | 4825 | for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) |
4502 | } | 4826 | kvm->vcpus[i] = NULL; |
4503 | } | ||
4504 | 4827 | ||
4828 | atomic_set(&kvm->online_vcpus, 0); | ||
4829 | mutex_unlock(&kvm->lock); | ||
4505 | } | 4830 | } |
4506 | 4831 | ||
4507 | void kvm_arch_sync_events(struct kvm *kvm) | 4832 | void kvm_arch_sync_events(struct kvm *kvm) |
@@ -4578,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
4578 | 4903 | ||
4579 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 4904 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
4580 | spin_unlock(&kvm->mmu_lock); | 4905 | spin_unlock(&kvm->mmu_lock); |
4581 | kvm_flush_remote_tlbs(kvm); | ||
4582 | 4906 | ||
4583 | return 0; | 4907 | return 0; |
4584 | } | 4908 | } |
@@ -4592,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm) | |||
4592 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 4916 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
4593 | { | 4917 | { |
4594 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE | 4918 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE |
4595 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED | 4919 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED |
4596 | || vcpu->arch.nmi_pending; | 4920 | || vcpu->arch.nmi_pending || |
4921 | (kvm_arch_interrupt_allowed(vcpu) && | ||
4922 | kvm_cpu_has_interrupt(vcpu)); | ||
4597 | } | 4923 | } |
4598 | 4924 | ||
4599 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | 4925 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) |
@@ -4617,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) | |||
4617 | { | 4943 | { |
4618 | return kvm_x86_ops->interrupt_allowed(vcpu); | 4944 | return kvm_x86_ops->interrupt_allowed(vcpu); |
4619 | } | 4945 | } |
4946 | |||
4947 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); | ||
4948 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); | ||
4949 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); | ||
4950 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); | ||
4951 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4c8e10af78e..5eadea585d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr) | |||
31 | { | 31 | { |
32 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); | 32 | return (nr == BP_VECTOR) || (nr == OF_VECTOR); |
33 | } | 33 | } |
34 | |||
35 | struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, | ||
36 | u32 function, u32 index); | ||
37 | |||
34 | #endif | 38 | #endif |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d677fa9ca65..7e59dc1d3fc 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -1135,11 +1135,6 @@ static struct notifier_block paniced = { | |||
1135 | /* Setting up memory is fairly easy. */ | 1135 | /* Setting up memory is fairly easy. */ |
1136 | static __init char *lguest_memory_setup(void) | 1136 | static __init char *lguest_memory_setup(void) |
1137 | { | 1137 | { |
1138 | /* We do this here and not earlier because lockcheck used to barf if we | ||
1139 | * did it before start_kernel(). I think we fixed that, so it'd be | ||
1140 | * nice to move it back to lguest_init. Patch welcome... */ | ||
1141 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | ||
1142 | |||
1143 | /* | 1138 | /* |
1144 | *The Linux bootloader header contains an "e820" memory map: the | 1139 | *The Linux bootloader header contains an "e820" memory map: the |
1145 | * Launcher populated the first entry with our memory limit. | 1140 | * Launcher populated the first entry with our memory limit. |
@@ -1262,7 +1257,6 @@ __init void lguest_init(void) | |||
1262 | */ | 1257 | */ |
1263 | 1258 | ||
1264 | /* Interrupt-related operations */ | 1259 | /* Interrupt-related operations */ |
1265 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | ||
1266 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1260 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
1267 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); | 1261 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
1268 | pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); | 1262 | pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); |
@@ -1270,7 +1264,6 @@ __init void lguest_init(void) | |||
1270 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1264 | pv_irq_ops.safe_halt = lguest_safe_halt; |
1271 | 1265 | ||
1272 | /* Setup operations */ | 1266 | /* Setup operations */ |
1273 | pv_init_ops.memory_setup = lguest_memory_setup; | ||
1274 | pv_init_ops.patch = lguest_patch; | 1267 | pv_init_ops.patch = lguest_patch; |
1275 | 1268 | ||
1276 | /* Intercepts of various CPU instructions */ | 1269 | /* Intercepts of various CPU instructions */ |
@@ -1320,10 +1313,11 @@ __init void lguest_init(void) | |||
1320 | set_lguest_basic_apic_ops(); | 1313 | set_lguest_basic_apic_ops(); |
1321 | #endif | 1314 | #endif |
1322 | 1315 | ||
1323 | /* Time operations */ | 1316 | x86_init.resources.memory_setup = lguest_memory_setup; |
1324 | pv_time_ops.get_wallclock = lguest_get_wallclock; | 1317 | x86_init.irqs.intr_init = lguest_init_IRQ; |
1325 | pv_time_ops.time_init = lguest_time_init; | 1318 | x86_init.timers.timer_init = lguest_time_init; |
1326 | pv_time_ops.get_tsc_khz = lguest_tsc_khz; | 1319 | x86_platform.calibrate_tsc = lguest_tsc_khz; |
1320 | x86_platform.get_wallclock = lguest_get_wallclock; | ||
1327 | 1321 | ||
1328 | /* | 1322 | /* |
1329 | * Now is a good time to look at the implementations of these functions | 1323 | * Now is a good time to look at the implementations of these functions |
@@ -1365,10 +1359,13 @@ __init void lguest_init(void) | |||
1365 | 1359 | ||
1366 | /* | 1360 | /* |
1367 | * If we don't initialize the lock dependency checker now, it crashes | 1361 | * If we don't initialize the lock dependency checker now, it crashes |
1368 | * paravirt_disable_iospace. | 1362 | * atomic_notifier_chain_register, then paravirt_disable_iospace. |
1369 | */ | 1363 | */ |
1370 | lockdep_init(); | 1364 | lockdep_init(); |
1371 | 1365 | ||
1366 | /* Hook in our special panic hypercall code. */ | ||
1367 | atomic_notifier_chain_register(&panic_notifier_list, &paniced); | ||
1368 | |||
1372 | /* | 1369 | /* |
1373 | * The IDE code spends about 3 seconds probing for disks: if we reserve | 1370 | * The IDE code spends about 3 seconds probing for disks: if we reserve |
1374 | * all the I/O ports up front it can't get them and so doesn't probe. | 1371 | * all the I/O ports up front it can't get them and so doesn't probe. |
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 07c31899c9c..9e609206fac 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile | |||
@@ -9,6 +9,8 @@ lib-y += thunk_$(BITS).o | |||
9 | lib-y += usercopy_$(BITS).o getuser.o putuser.o | 9 | lib-y += usercopy_$(BITS).o getuser.o putuser.o |
10 | lib-y += memcpy_$(BITS).o | 10 | lib-y += memcpy_$(BITS).o |
11 | 11 | ||
12 | obj-y += msr-reg.o msr-reg-export.o | ||
13 | |||
12 | ifeq ($(CONFIG_X86_32),y) | 14 | ifeq ($(CONFIG_X86_32),y) |
13 | obj-y += atomic64_32.o | 15 | obj-y += atomic64_32.o |
14 | lib-y += checksum_32.o | 16 | lib-y += checksum_32.o |
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c new file mode 100644 index 00000000000..a311cc59b65 --- /dev/null +++ b/arch/x86/lib/msr-reg-export.c | |||
@@ -0,0 +1,5 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <asm/msr.h> | ||
3 | |||
4 | EXPORT_SYMBOL(native_rdmsr_safe_regs); | ||
5 | EXPORT_SYMBOL(native_wrmsr_safe_regs); | ||
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S new file mode 100644 index 00000000000..69fa10623f2 --- /dev/null +++ b/arch/x86/lib/msr-reg.S | |||
@@ -0,0 +1,102 @@ | |||
1 | #include <linux/linkage.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <asm/dwarf2.h> | ||
4 | #include <asm/asm.h> | ||
5 | #include <asm/msr.h> | ||
6 | |||
7 | #ifdef CONFIG_X86_64 | ||
8 | /* | ||
9 | * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); | ||
10 | * | ||
11 | * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] | ||
12 | * | ||
13 | */ | ||
14 | .macro op_safe_regs op | ||
15 | ENTRY(native_\op\()_safe_regs) | ||
16 | CFI_STARTPROC | ||
17 | pushq_cfi %rbx | ||
18 | pushq_cfi %rbp | ||
19 | movq %rdi, %r10 /* Save pointer */ | ||
20 | xorl %r11d, %r11d /* Return value */ | ||
21 | movl (%rdi), %eax | ||
22 | movl 4(%rdi), %ecx | ||
23 | movl 8(%rdi), %edx | ||
24 | movl 12(%rdi), %ebx | ||
25 | movl 20(%rdi), %ebp | ||
26 | movl 24(%rdi), %esi | ||
27 | movl 28(%rdi), %edi | ||
28 | CFI_REMEMBER_STATE | ||
29 | 1: \op | ||
30 | 2: movl %eax, (%r10) | ||
31 | movl %r11d, %eax /* Return value */ | ||
32 | movl %ecx, 4(%r10) | ||
33 | movl %edx, 8(%r10) | ||
34 | movl %ebx, 12(%r10) | ||
35 | movl %ebp, 20(%r10) | ||
36 | movl %esi, 24(%r10) | ||
37 | movl %edi, 28(%r10) | ||
38 | popq_cfi %rbp | ||
39 | popq_cfi %rbx | ||
40 | ret | ||
41 | 3: | ||
42 | CFI_RESTORE_STATE | ||
43 | movl $-EIO, %r11d | ||
44 | jmp 2b | ||
45 | |||
46 | _ASM_EXTABLE(1b, 3b) | ||
47 | CFI_ENDPROC | ||
48 | ENDPROC(native_\op\()_safe_regs) | ||
49 | .endm | ||
50 | |||
51 | #else /* X86_32 */ | ||
52 | |||
53 | .macro op_safe_regs op | ||
54 | ENTRY(native_\op\()_safe_regs) | ||
55 | CFI_STARTPROC | ||
56 | pushl_cfi %ebx | ||
57 | pushl_cfi %ebp | ||
58 | pushl_cfi %esi | ||
59 | pushl_cfi %edi | ||
60 | pushl_cfi $0 /* Return value */ | ||
61 | pushl_cfi %eax | ||
62 | movl 4(%eax), %ecx | ||
63 | movl 8(%eax), %edx | ||
64 | movl 12(%eax), %ebx | ||
65 | movl 20(%eax), %ebp | ||
66 | movl 24(%eax), %esi | ||
67 | movl 28(%eax), %edi | ||
68 | movl (%eax), %eax | ||
69 | CFI_REMEMBER_STATE | ||
70 | 1: \op | ||
71 | 2: pushl_cfi %eax | ||
72 | movl 4(%esp), %eax | ||
73 | popl_cfi (%eax) | ||
74 | addl $4, %esp | ||
75 | CFI_ADJUST_CFA_OFFSET -4 | ||
76 | movl %ecx, 4(%eax) | ||
77 | movl %edx, 8(%eax) | ||
78 | movl %ebx, 12(%eax) | ||
79 | movl %ebp, 20(%eax) | ||
80 | movl %esi, 24(%eax) | ||
81 | movl %edi, 28(%eax) | ||
82 | popl_cfi %eax | ||
83 | popl_cfi %edi | ||
84 | popl_cfi %esi | ||
85 | popl_cfi %ebp | ||
86 | popl_cfi %ebx | ||
87 | ret | ||
88 | 3: | ||
89 | CFI_RESTORE_STATE | ||
90 | movl $-EIO, 4(%esp) | ||
91 | jmp 2b | ||
92 | |||
93 | _ASM_EXTABLE(1b, 3b) | ||
94 | CFI_ENDPROC | ||
95 | ENDPROC(native_\op\()_safe_regs) | ||
96 | .endm | ||
97 | |||
98 | #endif | ||
99 | |||
100 | op_safe_regs rdmsr | ||
101 | op_safe_regs wrmsr | ||
102 | |||
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index caa24aca811..33a1e3ca22d 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c | |||
@@ -175,3 +175,52 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | |||
175 | return err ? err : rv.err; | 175 | return err ? err : rv.err; |
176 | } | 176 | } |
177 | EXPORT_SYMBOL(wrmsr_safe_on_cpu); | 177 | EXPORT_SYMBOL(wrmsr_safe_on_cpu); |
178 | |||
179 | /* | ||
180 | * These variants are significantly slower, but allows control over | ||
181 | * the entire 32-bit GPR set. | ||
182 | */ | ||
183 | struct msr_regs_info { | ||
184 | u32 *regs; | ||
185 | int err; | ||
186 | }; | ||
187 | |||
188 | static void __rdmsr_safe_regs_on_cpu(void *info) | ||
189 | { | ||
190 | struct msr_regs_info *rv = info; | ||
191 | |||
192 | rv->err = rdmsr_safe_regs(rv->regs); | ||
193 | } | ||
194 | |||
195 | static void __wrmsr_safe_regs_on_cpu(void *info) | ||
196 | { | ||
197 | struct msr_regs_info *rv = info; | ||
198 | |||
199 | rv->err = wrmsr_safe_regs(rv->regs); | ||
200 | } | ||
201 | |||
202 | int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) | ||
203 | { | ||
204 | int err; | ||
205 | struct msr_regs_info rv; | ||
206 | |||
207 | rv.regs = regs; | ||
208 | rv.err = -EIO; | ||
209 | err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); | ||
210 | |||
211 | return err ? err : rv.err; | ||
212 | } | ||
213 | EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); | ||
214 | |||
215 | int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) | ||
216 | { | ||
217 | int err; | ||
218 | struct msr_regs_info rv; | ||
219 | |||
220 | rv.regs = regs; | ||
221 | rv.err = -EIO; | ||
222 | err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); | ||
223 | |||
224 | return err ? err : rv.err; | ||
225 | } | ||
226 | EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); | ||
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index eefdeee8a87..06630d26e56 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -1,5 +1,10 @@ | |||
1 | obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ | 1 | obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ |
2 | pat.o pgtable.o gup.o | 2 | pat.o pgtable.o physaddr.o gup.o setup_nx.o |
3 | |||
4 | # Make sure __phys_addr has no stackprotector | ||
5 | nostackp := $(call cc-option, -fno-stack-protector) | ||
6 | CFLAGS_physaddr.o := $(nostackp) | ||
7 | CFLAGS_setup_nx.o := $(nostackp) | ||
3 | 8 | ||
4 | obj-$(CONFIG_SMP) += tlb.o | 9 | obj-$(CONFIG_SMP) += tlb.o |
5 | 10 | ||
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bfae139182f..f4cee9028cf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -10,7 +10,7 @@ | |||
10 | #include <linux/bootmem.h> /* max_low_pfn */ | 10 | #include <linux/bootmem.h> /* max_low_pfn */ |
11 | #include <linux/kprobes.h> /* __kprobes, ... */ | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
13 | #include <linux/perf_counter.h> /* perf_swcounter_event */ | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
14 | 14 | ||
15 | #include <asm/traps.h> /* dotraplinkage, ... */ | 15 | #include <asm/traps.h> /* dotraplinkage, ... */ |
16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
@@ -167,6 +167,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, | |||
167 | info.si_errno = 0; | 167 | info.si_errno = 0; |
168 | info.si_code = si_code; | 168 | info.si_code = si_code; |
169 | info.si_addr = (void __user *)address; | 169 | info.si_addr = (void __user *)address; |
170 | info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; | ||
170 | 171 | ||
171 | force_sig_info(si_signo, &info, tsk); | 172 | force_sig_info(si_signo, &info, tsk); |
172 | } | 173 | } |
@@ -285,26 +286,25 @@ check_v8086_mode(struct pt_regs *regs, unsigned long address, | |||
285 | tsk->thread.screen_bitmap |= 1 << bit; | 286 | tsk->thread.screen_bitmap |= 1 << bit; |
286 | } | 287 | } |
287 | 288 | ||
288 | static void dump_pagetable(unsigned long address) | 289 | static bool low_pfn(unsigned long pfn) |
289 | { | 290 | { |
290 | __typeof__(pte_val(__pte(0))) page; | 291 | return pfn < max_low_pfn; |
292 | } | ||
291 | 293 | ||
292 | page = read_cr3(); | 294 | static void dump_pagetable(unsigned long address) |
293 | page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; | 295 | { |
296 | pgd_t *base = __va(read_cr3()); | ||
297 | pgd_t *pgd = &base[pgd_index(address)]; | ||
298 | pmd_t *pmd; | ||
299 | pte_t *pte; | ||
294 | 300 | ||
295 | #ifdef CONFIG_X86_PAE | 301 | #ifdef CONFIG_X86_PAE |
296 | printk("*pdpt = %016Lx ", page); | 302 | printk("*pdpt = %016Lx ", pgd_val(*pgd)); |
297 | if ((page >> PAGE_SHIFT) < max_low_pfn | 303 | if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) |
298 | && page & _PAGE_PRESENT) { | 304 | goto out; |
299 | page &= PAGE_MASK; | ||
300 | page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) | ||
301 | & (PTRS_PER_PMD - 1)]; | ||
302 | printk(KERN_CONT "*pde = %016Lx ", page); | ||
303 | page &= ~_PAGE_NX; | ||
304 | } | ||
305 | #else | ||
306 | printk("*pde = %08lx ", page); | ||
307 | #endif | 305 | #endif |
306 | pmd = pmd_offset(pud_offset(pgd, address), address); | ||
307 | printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); | ||
308 | 308 | ||
309 | /* | 309 | /* |
310 | * We must not directly access the pte in the highpte | 310 | * We must not directly access the pte in the highpte |
@@ -312,16 +312,12 @@ static void dump_pagetable(unsigned long address) | |||
312 | * And let's rather not kmap-atomic the pte, just in case | 312 | * And let's rather not kmap-atomic the pte, just in case |
313 | * it's allocated already: | 313 | * it's allocated already: |
314 | */ | 314 | */ |
315 | if ((page >> PAGE_SHIFT) < max_low_pfn | 315 | if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) |
316 | && (page & _PAGE_PRESENT) | 316 | goto out; |
317 | && !(page & _PAGE_PSE)) { | ||
318 | |||
319 | page &= PAGE_MASK; | ||
320 | page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) | ||
321 | & (PTRS_PER_PTE - 1)]; | ||
322 | printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); | ||
323 | } | ||
324 | 317 | ||
318 | pte = pte_offset_kernel(pmd, address); | ||
319 | printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); | ||
320 | out: | ||
325 | printk("\n"); | 321 | printk("\n"); |
326 | } | 322 | } |
327 | 323 | ||
@@ -450,16 +446,12 @@ static int bad_address(void *p) | |||
450 | 446 | ||
451 | static void dump_pagetable(unsigned long address) | 447 | static void dump_pagetable(unsigned long address) |
452 | { | 448 | { |
453 | pgd_t *pgd; | 449 | pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); |
450 | pgd_t *pgd = base + pgd_index(address); | ||
454 | pud_t *pud; | 451 | pud_t *pud; |
455 | pmd_t *pmd; | 452 | pmd_t *pmd; |
456 | pte_t *pte; | 453 | pte_t *pte; |
457 | 454 | ||
458 | pgd = (pgd_t *)read_cr3(); | ||
459 | |||
460 | pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); | ||
461 | |||
462 | pgd += pgd_index(address); | ||
463 | if (bad_address(pgd)) | 455 | if (bad_address(pgd)) |
464 | goto bad; | 456 | goto bad; |
465 | 457 | ||
@@ -799,10 +791,12 @@ out_of_memory(struct pt_regs *regs, unsigned long error_code, | |||
799 | } | 791 | } |
800 | 792 | ||
801 | static void | 793 | static void |
802 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) | 794 | do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, |
795 | unsigned int fault) | ||
803 | { | 796 | { |
804 | struct task_struct *tsk = current; | 797 | struct task_struct *tsk = current; |
805 | struct mm_struct *mm = tsk->mm; | 798 | struct mm_struct *mm = tsk->mm; |
799 | int code = BUS_ADRERR; | ||
806 | 800 | ||
807 | up_read(&mm->mmap_sem); | 801 | up_read(&mm->mmap_sem); |
808 | 802 | ||
@@ -818,7 +812,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) | |||
818 | tsk->thread.error_code = error_code; | 812 | tsk->thread.error_code = error_code; |
819 | tsk->thread.trap_no = 14; | 813 | tsk->thread.trap_no = 14; |
820 | 814 | ||
821 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | 815 | #ifdef CONFIG_MEMORY_FAILURE |
816 | if (fault & VM_FAULT_HWPOISON) { | ||
817 | printk(KERN_ERR | ||
818 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", | ||
819 | tsk->comm, tsk->pid, address); | ||
820 | code = BUS_MCEERR_AR; | ||
821 | } | ||
822 | #endif | ||
823 | force_sig_info_fault(SIGBUS, code, address, tsk); | ||
822 | } | 824 | } |
823 | 825 | ||
824 | static noinline void | 826 | static noinline void |
@@ -828,8 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, | |||
828 | if (fault & VM_FAULT_OOM) { | 830 | if (fault & VM_FAULT_OOM) { |
829 | out_of_memory(regs, error_code, address); | 831 | out_of_memory(regs, error_code, address); |
830 | } else { | 832 | } else { |
831 | if (fault & VM_FAULT_SIGBUS) | 833 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) |
832 | do_sigbus(regs, error_code, address); | 834 | do_sigbus(regs, error_code, address, fault); |
833 | else | 835 | else |
834 | BUG(); | 836 | BUG(); |
835 | } | 837 | } |
@@ -1026,7 +1028,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1026 | if (unlikely(error_code & PF_RSVD)) | 1028 | if (unlikely(error_code & PF_RSVD)) |
1027 | pgtable_bad(regs, error_code, address); | 1029 | pgtable_bad(regs, error_code, address); |
1028 | 1030 | ||
1029 | perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); | 1031 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); |
1030 | 1032 | ||
1031 | /* | 1033 | /* |
1032 | * If we're in an interrupt, have no user context or are running | 1034 | * If we're in an interrupt, have no user context or are running |
@@ -1123,11 +1125,11 @@ good_area: | |||
1123 | 1125 | ||
1124 | if (fault & VM_FAULT_MAJOR) { | 1126 | if (fault & VM_FAULT_MAJOR) { |
1125 | tsk->maj_flt++; | 1127 | tsk->maj_flt++; |
1126 | perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | 1128 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, |
1127 | regs, address); | 1129 | regs, address); |
1128 | } else { | 1130 | } else { |
1129 | tsk->min_flt++; | 1131 | tsk->min_flt++; |
1130 | perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | 1132 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, |
1131 | regs, address); | 1133 | regs, address); |
1132 | } | 1134 | } |
1133 | 1135 | ||
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2112ed55e7e..63a6ba66cbe 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -24,7 +24,7 @@ void kunmap(struct page *page) | |||
24 | * no global lock is needed and because the kmap code must perform a global TLB | 24 | * no global lock is needed and because the kmap code must perform a global TLB |
25 | * invalidation when the kmap pool wraps. | 25 | * invalidation when the kmap pool wraps. |
26 | * | 26 | * |
27 | * However when holding an atomic kmap is is not legal to sleep, so atomic | 27 | * However when holding an atomic kmap it is not legal to sleep, so atomic |
28 | * kmaps are appropriate for short, tight code paths only. | 28 | * kmaps are appropriate for short, tight code paths only. |
29 | */ | 29 | */ |
30 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | 30 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) |
@@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap); | |||
104 | EXPORT_SYMBOL(kmap_atomic); | 104 | EXPORT_SYMBOL(kmap_atomic); |
105 | EXPORT_SYMBOL(kunmap_atomic); | 105 | EXPORT_SYMBOL(kunmap_atomic); |
106 | EXPORT_SYMBOL(kmap_atomic_prot); | 106 | EXPORT_SYMBOL(kmap_atomic_prot); |
107 | EXPORT_SYMBOL(kmap_atomic_to_page); | ||
107 | 108 | ||
108 | void __init set_highmem_pages_init(void) | 109 | void __init set_highmem_pages_init(void) |
109 | { | 110 | { |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0607119cef9..73ffd5536f6 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -28,69 +28,6 @@ int direct_gbpages | |||
28 | #endif | 28 | #endif |
29 | ; | 29 | ; |
30 | 30 | ||
31 | int nx_enabled; | ||
32 | |||
33 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | ||
34 | static int disable_nx __cpuinitdata; | ||
35 | |||
36 | /* | ||
37 | * noexec = on|off | ||
38 | * | ||
39 | * Control non-executable mappings for processes. | ||
40 | * | ||
41 | * on Enable | ||
42 | * off Disable | ||
43 | */ | ||
44 | static int __init noexec_setup(char *str) | ||
45 | { | ||
46 | if (!str) | ||
47 | return -EINVAL; | ||
48 | if (!strncmp(str, "on", 2)) { | ||
49 | __supported_pte_mask |= _PAGE_NX; | ||
50 | disable_nx = 0; | ||
51 | } else if (!strncmp(str, "off", 3)) { | ||
52 | disable_nx = 1; | ||
53 | __supported_pte_mask &= ~_PAGE_NX; | ||
54 | } | ||
55 | return 0; | ||
56 | } | ||
57 | early_param("noexec", noexec_setup); | ||
58 | #endif | ||
59 | |||
60 | #ifdef CONFIG_X86_PAE | ||
61 | static void __init set_nx(void) | ||
62 | { | ||
63 | unsigned int v[4], l, h; | ||
64 | |||
65 | if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { | ||
66 | cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); | ||
67 | |||
68 | if ((v[3] & (1 << 20)) && !disable_nx) { | ||
69 | rdmsr(MSR_EFER, l, h); | ||
70 | l |= EFER_NX; | ||
71 | wrmsr(MSR_EFER, l, h); | ||
72 | nx_enabled = 1; | ||
73 | __supported_pte_mask |= _PAGE_NX; | ||
74 | } | ||
75 | } | ||
76 | } | ||
77 | #else | ||
78 | static inline void set_nx(void) | ||
79 | { | ||
80 | } | ||
81 | #endif | ||
82 | |||
83 | #ifdef CONFIG_X86_64 | ||
84 | void __cpuinit check_efer(void) | ||
85 | { | ||
86 | unsigned long efer; | ||
87 | |||
88 | rdmsrl(MSR_EFER, efer); | ||
89 | if (!(efer & EFER_NX) || disable_nx) | ||
90 | __supported_pte_mask &= ~_PAGE_NX; | ||
91 | } | ||
92 | #endif | ||
93 | |||
94 | static void __init find_early_table_space(unsigned long end, int use_pse, | 31 | static void __init find_early_table_space(unsigned long end, int use_pse, |
95 | int use_gbpages) | 32 | int use_gbpages) |
96 | { | 33 | { |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3cd7711bb94..30938c1d8d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) | |||
84 | #ifdef CONFIG_X86_PAE | 84 | #ifdef CONFIG_X86_PAE |
85 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { | 85 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { |
86 | if (after_bootmem) | 86 | if (after_bootmem) |
87 | pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); | 87 | pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); |
88 | else | 88 | else |
89 | pmd_table = (pmd_t *)alloc_low_page(); | 89 | pmd_table = (pmd_t *)alloc_low_page(); |
90 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); | 90 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); |
@@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
116 | #endif | 116 | #endif |
117 | if (!page_table) | 117 | if (!page_table) |
118 | page_table = | 118 | page_table = |
119 | (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); | 119 | (pte_t *)alloc_bootmem_pages(PAGE_SIZE); |
120 | } else | 120 | } else |
121 | page_table = (pte_t *)alloc_low_page(); | 121 | page_table = (pte_t *)alloc_low_page(); |
122 | 122 | ||
@@ -857,8 +857,6 @@ static void __init test_wp_bit(void) | |||
857 | } | 857 | } |
858 | } | 858 | } |
859 | 859 | ||
860 | static struct kcore_list kcore_mem, kcore_vmalloc; | ||
861 | |||
862 | void __init mem_init(void) | 860 | void __init mem_init(void) |
863 | { | 861 | { |
864 | int codesize, reservedpages, datasize, initsize; | 862 | int codesize, reservedpages, datasize, initsize; |
@@ -886,13 +884,9 @@ void __init mem_init(void) | |||
886 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | 884 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; |
887 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | 885 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; |
888 | 886 | ||
889 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
890 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
891 | VMALLOC_END-VMALLOC_START); | ||
892 | |||
893 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " | 887 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " |
894 | "%dk reserved, %dk data, %dk init, %ldk highmem)\n", | 888 | "%dk reserved, %dk data, %dk init, %ldk highmem)\n", |
895 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | 889 | nr_free_pages() << (PAGE_SHIFT-10), |
896 | num_physpages << (PAGE_SHIFT-10), | 890 | num_physpages << (PAGE_SHIFT-10), |
897 | codesize >> 10, | 891 | codesize >> 10, |
898 | reservedpages << (PAGE_SHIFT-10), | 892 | reservedpages << (PAGE_SHIFT-10), |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6176fe8f29e..5a4398a6006 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -647,8 +647,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | |||
647 | 647 | ||
648 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 648 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
649 | 649 | ||
650 | static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, | 650 | static struct kcore_list kcore_vsyscall; |
651 | kcore_modules, kcore_vsyscall; | ||
652 | 651 | ||
653 | void __init mem_init(void) | 652 | void __init mem_init(void) |
654 | { | 653 | { |
@@ -677,17 +676,12 @@ void __init mem_init(void) | |||
677 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | 676 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; |
678 | 677 | ||
679 | /* Register memory areas for /proc/kcore */ | 678 | /* Register memory areas for /proc/kcore */ |
680 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
681 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
682 | VMALLOC_END-VMALLOC_START); | ||
683 | kclist_add(&kcore_kernel, &_stext, _end - _stext); | ||
684 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); | ||
685 | kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, | 679 | kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, |
686 | VSYSCALL_END - VSYSCALL_START); | 680 | VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); |
687 | 681 | ||
688 | printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " | 682 | printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " |
689 | "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", | 683 | "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", |
690 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | 684 | nr_free_pages() << (PAGE_SHIFT-10), |
691 | max_pfn << (PAGE_SHIFT-10), | 685 | max_pfn << (PAGE_SHIFT-10), |
692 | codesize >> 10, | 686 | codesize >> 10, |
693 | absent_pages << (PAGE_SHIFT-10), | 687 | absent_pages << (PAGE_SHIFT-10), |
@@ -796,7 +790,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | |||
796 | return ret; | 790 | return ret; |
797 | 791 | ||
798 | #else | 792 | #else |
799 | reserve_bootmem(phys, len, BOOTMEM_DEFAULT); | 793 | reserve_bootmem(phys, len, flags); |
800 | #endif | 794 | #endif |
801 | 795 | ||
802 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | 796 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { |
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121..84e236ce76b 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c | |||
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/highmem.h> | 22 | #include <linux/highmem.h> |
23 | 23 | ||
24 | int is_io_mapping_possible(resource_size_t base, unsigned long size) | 24 | static int is_io_mapping_possible(resource_size_t base, unsigned long size) |
25 | { | 25 | { |
26 | #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) | 26 | #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) |
27 | /* There is no way to map greater than 1 << 32 address without PAE */ | 27 | /* There is no way to map greater than 1 << 32 address without PAE */ |
@@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) | |||
30 | #endif | 30 | #endif |
31 | return 1; | 31 | return 1; |
32 | } | 32 | } |
33 | EXPORT_SYMBOL_GPL(is_io_mapping_possible); | 33 | |
34 | int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) | ||
35 | { | ||
36 | unsigned long flag = _PAGE_CACHE_WC; | ||
37 | int ret; | ||
38 | |||
39 | if (!is_io_mapping_possible(base, size)) | ||
40 | return -EINVAL; | ||
41 | |||
42 | ret = io_reserve_memtype(base, base + size, &flag); | ||
43 | if (ret) | ||
44 | return ret; | ||
45 | |||
46 | *prot = __pgprot(__PAGE_KERNEL | flag); | ||
47 | return 0; | ||
48 | } | ||
49 | EXPORT_SYMBOL_GPL(iomap_create_wc); | ||
50 | |||
51 | void | ||
52 | iomap_free(resource_size_t base, unsigned long size) | ||
53 | { | ||
54 | io_free_memtype(base, base + size); | ||
55 | } | ||
56 | EXPORT_SYMBOL_GPL(iomap_free); | ||
34 | 57 | ||
35 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | 58 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) |
36 | { | 59 | { |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8a450930834..334e63ca7b2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -22,77 +22,7 @@ | |||
22 | #include <asm/pgalloc.h> | 22 | #include <asm/pgalloc.h> |
23 | #include <asm/pat.h> | 23 | #include <asm/pat.h> |
24 | 24 | ||
25 | static inline int phys_addr_valid(resource_size_t addr) | 25 | #include "physaddr.h" |
26 | { | ||
27 | #ifdef CONFIG_PHYS_ADDR_T_64BIT | ||
28 | return !(addr >> boot_cpu_data.x86_phys_bits); | ||
29 | #else | ||
30 | return 1; | ||
31 | #endif | ||
32 | } | ||
33 | |||
34 | #ifdef CONFIG_X86_64 | ||
35 | |||
36 | unsigned long __phys_addr(unsigned long x) | ||
37 | { | ||
38 | if (x >= __START_KERNEL_map) { | ||
39 | x -= __START_KERNEL_map; | ||
40 | VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); | ||
41 | x += phys_base; | ||
42 | } else { | ||
43 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | ||
44 | x -= PAGE_OFFSET; | ||
45 | VIRTUAL_BUG_ON(!phys_addr_valid(x)); | ||
46 | } | ||
47 | return x; | ||
48 | } | ||
49 | EXPORT_SYMBOL(__phys_addr); | ||
50 | |||
51 | bool __virt_addr_valid(unsigned long x) | ||
52 | { | ||
53 | if (x >= __START_KERNEL_map) { | ||
54 | x -= __START_KERNEL_map; | ||
55 | if (x >= KERNEL_IMAGE_SIZE) | ||
56 | return false; | ||
57 | x += phys_base; | ||
58 | } else { | ||
59 | if (x < PAGE_OFFSET) | ||
60 | return false; | ||
61 | x -= PAGE_OFFSET; | ||
62 | if (!phys_addr_valid(x)) | ||
63 | return false; | ||
64 | } | ||
65 | |||
66 | return pfn_valid(x >> PAGE_SHIFT); | ||
67 | } | ||
68 | EXPORT_SYMBOL(__virt_addr_valid); | ||
69 | |||
70 | #else | ||
71 | |||
72 | #ifdef CONFIG_DEBUG_VIRTUAL | ||
73 | unsigned long __phys_addr(unsigned long x) | ||
74 | { | ||
75 | /* VMALLOC_* aren't constants */ | ||
76 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | ||
77 | VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); | ||
78 | return x - PAGE_OFFSET; | ||
79 | } | ||
80 | EXPORT_SYMBOL(__phys_addr); | ||
81 | #endif | ||
82 | |||
83 | bool __virt_addr_valid(unsigned long x) | ||
84 | { | ||
85 | if (x < PAGE_OFFSET) | ||
86 | return false; | ||
87 | if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) | ||
88 | return false; | ||
89 | if (x >= FIXADDR_START) | ||
90 | return false; | ||
91 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); | ||
92 | } | ||
93 | EXPORT_SYMBOL(__virt_addr_valid); | ||
94 | |||
95 | #endif | ||
96 | 26 | ||
97 | int page_is_ram(unsigned long pagenr) | 27 | int page_is_ram(unsigned long pagenr) |
98 | { | 28 | { |
@@ -228,24 +158,14 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
228 | retval = reserve_memtype(phys_addr, (u64)phys_addr + size, | 158 | retval = reserve_memtype(phys_addr, (u64)phys_addr + size, |
229 | prot_val, &new_prot_val); | 159 | prot_val, &new_prot_val); |
230 | if (retval) { | 160 | if (retval) { |
231 | pr_debug("Warning: reserve_memtype returned %d\n", retval); | 161 | printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); |
232 | return NULL; | 162 | return NULL; |
233 | } | 163 | } |
234 | 164 | ||
235 | if (prot_val != new_prot_val) { | 165 | if (prot_val != new_prot_val) { |
236 | /* | 166 | if (!is_new_memtype_allowed(phys_addr, size, |
237 | * Do not fallback to certain memory types with certain | 167 | prot_val, new_prot_val)) { |
238 | * requested type: | 168 | printk(KERN_ERR |
239 | * - request is uc-, return cannot be write-back | ||
240 | * - request is uc-, return cannot be write-combine | ||
241 | * - request is write-combine, return cannot be write-back | ||
242 | */ | ||
243 | if ((prot_val == _PAGE_CACHE_UC_MINUS && | ||
244 | (new_prot_val == _PAGE_CACHE_WB || | ||
245 | new_prot_val == _PAGE_CACHE_WC)) || | ||
246 | (prot_val == _PAGE_CACHE_WC && | ||
247 | new_prot_val == _PAGE_CACHE_WB)) { | ||
248 | pr_debug( | ||
249 | "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", | 169 | "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", |
250 | (unsigned long long)phys_addr, | 170 | (unsigned long long)phys_addr, |
251 | (unsigned long long)(phys_addr + size), | 171 | (unsigned long long)(phys_addr + size), |
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 2c55ed09865..8cc18334414 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c | |||
@@ -225,9 +225,6 @@ void kmemcheck_hide(struct pt_regs *regs) | |||
225 | 225 | ||
226 | BUG_ON(!irqs_disabled()); | 226 | BUG_ON(!irqs_disabled()); |
227 | 227 | ||
228 | if (data->balance == 0) | ||
229 | return; | ||
230 | |||
231 | if (unlikely(data->balance != 1)) { | 228 | if (unlikely(data->balance != 1)) { |
232 | kmemcheck_show_all(); | 229 | kmemcheck_show_all(); |
233 | kmemcheck_error_save_bug(regs); | 230 | kmemcheck_error_save_bug(regs); |
@@ -331,6 +328,20 @@ static void kmemcheck_read_strict(struct pt_regs *regs, | |||
331 | kmemcheck_shadow_set(shadow, size); | 328 | kmemcheck_shadow_set(shadow, size); |
332 | } | 329 | } |
333 | 330 | ||
331 | bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) | ||
332 | { | ||
333 | enum kmemcheck_shadow status; | ||
334 | void *shadow; | ||
335 | |||
336 | shadow = kmemcheck_shadow_lookup(addr); | ||
337 | if (!shadow) | ||
338 | return true; | ||
339 | |||
340 | status = kmemcheck_shadow_test(shadow, size); | ||
341 | |||
342 | return status == KMEMCHECK_SHADOW_INITIALIZED; | ||
343 | } | ||
344 | |||
334 | /* Access may cross page boundary */ | 345 | /* Access may cross page boundary */ |
335 | static void kmemcheck_read(struct pt_regs *regs, | 346 | static void kmemcheck_read(struct pt_regs *regs, |
336 | unsigned long addr, unsigned int size) | 347 | unsigned long addr, unsigned int size) |
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index e773b6bd007..3f66b82076a 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c | |||
@@ -1,7 +1,6 @@ | |||
1 | #include <linux/kmemcheck.h> | 1 | #include <linux/kmemcheck.h> |
2 | #include <linux/module.h> | 2 | #include <linux/module.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/module.h> | ||
5 | 4 | ||
6 | #include <asm/page.h> | 5 | #include <asm/page.h> |
7 | #include <asm/pgtable.h> | 6 | #include <asm/pgtable.h> |
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 16582960056..c8191defc38 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c | |||
@@ -29,13 +29,26 @@ | |||
29 | #include <linux/random.h> | 29 | #include <linux/random.h> |
30 | #include <linux/limits.h> | 30 | #include <linux/limits.h> |
31 | #include <linux/sched.h> | 31 | #include <linux/sched.h> |
32 | #include <asm/elf.h> | ||
33 | |||
34 | static unsigned int stack_maxrandom_size(void) | ||
35 | { | ||
36 | unsigned int max = 0; | ||
37 | if ((current->flags & PF_RANDOMIZE) && | ||
38 | !(current->personality & ADDR_NO_RANDOMIZE)) { | ||
39 | max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; | ||
40 | } | ||
41 | |||
42 | return max; | ||
43 | } | ||
44 | |||
32 | 45 | ||
33 | /* | 46 | /* |
34 | * Top of mmap area (just below the process stack). | 47 | * Top of mmap area (just below the process stack). |
35 | * | 48 | * |
36 | * Leave an at least ~128 MB hole. | 49 | * Leave an at least ~128 MB hole with possible stack randomization. |
37 | */ | 50 | */ |
38 | #define MIN_GAP (128*1024*1024) | 51 | #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) |
39 | #define MAX_GAP (TASK_SIZE/6*5) | 52 | #define MAX_GAP (TASK_SIZE/6*5) |
40 | 53 | ||
41 | /* | 54 | /* |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7e600c1962d..dd38bfbefd1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
14 | #include <linux/pfn.h> | 14 | #include <linux/pfn.h> |
15 | #include <linux/percpu.h> | ||
15 | 16 | ||
16 | #include <asm/e820.h> | 17 | #include <asm/e820.h> |
17 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
@@ -143,6 +144,7 @@ void clflush_cache_range(void *vaddr, unsigned int size) | |||
143 | 144 | ||
144 | mb(); | 145 | mb(); |
145 | } | 146 | } |
147 | EXPORT_SYMBOL_GPL(clflush_cache_range); | ||
146 | 148 | ||
147 | static void __cpa_flush_all(void *arg) | 149 | static void __cpa_flush_all(void *arg) |
148 | { | 150 | { |
@@ -686,7 +688,7 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
686 | { | 688 | { |
687 | struct cpa_data alias_cpa; | 689 | struct cpa_data alias_cpa; |
688 | unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); | 690 | unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); |
689 | unsigned long vaddr, remapped; | 691 | unsigned long vaddr; |
690 | int ret; | 692 | int ret; |
691 | 693 | ||
692 | if (cpa->pfn >= max_pfn_mapped) | 694 | if (cpa->pfn >= max_pfn_mapped) |
@@ -744,24 +746,6 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
744 | } | 746 | } |
745 | #endif | 747 | #endif |
746 | 748 | ||
747 | /* | ||
748 | * If the PMD page was partially used for per-cpu remapping, | ||
749 | * the recycled area needs to be split and modified. Because | ||
750 | * the area is always proper subset of a PMD page | ||
751 | * cpa->numpages is guaranteed to be 1 for these areas, so | ||
752 | * there's no need to loop over and check for further remaps. | ||
753 | */ | ||
754 | remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); | ||
755 | if (remapped) { | ||
756 | WARN_ON(cpa->numpages > 1); | ||
757 | alias_cpa = *cpa; | ||
758 | alias_cpa.vaddr = &remapped; | ||
759 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | ||
760 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | ||
761 | if (ret) | ||
762 | return ret; | ||
763 | } | ||
764 | |||
765 | return 0; | 749 | return 0; |
766 | } | 750 | } |
767 | 751 | ||
@@ -822,6 +806,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, | |||
822 | { | 806 | { |
823 | struct cpa_data cpa; | 807 | struct cpa_data cpa; |
824 | int ret, cache, checkalias; | 808 | int ret, cache, checkalias; |
809 | unsigned long baddr = 0; | ||
825 | 810 | ||
826 | /* | 811 | /* |
827 | * Check, if we are requested to change a not supported | 812 | * Check, if we are requested to change a not supported |
@@ -853,6 +838,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, | |||
853 | */ | 838 | */ |
854 | WARN_ON_ONCE(1); | 839 | WARN_ON_ONCE(1); |
855 | } | 840 | } |
841 | /* | ||
842 | * Save address for cache flush. *addr is modified in the call | ||
843 | * to __change_page_attr_set_clr() below. | ||
844 | */ | ||
845 | baddr = *addr; | ||
856 | } | 846 | } |
857 | 847 | ||
858 | /* Must avoid aliasing mappings in the highmem code */ | 848 | /* Must avoid aliasing mappings in the highmem code */ |
@@ -900,7 +890,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, | |||
900 | cpa_flush_array(addr, numpages, cache, | 890 | cpa_flush_array(addr, numpages, cache, |
901 | cpa.flags, pages); | 891 | cpa.flags, pages); |
902 | } else | 892 | } else |
903 | cpa_flush_range(*addr, numpages, cache); | 893 | cpa_flush_range(baddr, numpages, cache); |
904 | } else | 894 | } else |
905 | cpa_flush_all(cache); | 895 | cpa_flush_all(cache); |
906 | 896 | ||
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb2806..e78cd0ec2bc 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/gfp.h> | 15 | #include <linux/gfp.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/rbtree.h> | ||
18 | 19 | ||
19 | #include <asm/cacheflush.h> | 20 | #include <asm/cacheflush.h> |
20 | #include <asm/processor.h> | 21 | #include <asm/processor.h> |
@@ -80,6 +81,7 @@ enum { | |||
80 | void pat_init(void) | 81 | void pat_init(void) |
81 | { | 82 | { |
82 | u64 pat; | 83 | u64 pat; |
84 | bool boot_cpu = !boot_pat_state; | ||
83 | 85 | ||
84 | if (!pat_enabled) | 86 | if (!pat_enabled) |
85 | return; | 87 | return; |
@@ -121,8 +123,10 @@ void pat_init(void) | |||
121 | rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); | 123 | rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); |
122 | 124 | ||
123 | wrmsrl(MSR_IA32_CR_PAT, pat); | 125 | wrmsrl(MSR_IA32_CR_PAT, pat); |
124 | printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", | 126 | |
125 | smp_processor_id(), boot_pat_state, pat); | 127 | if (boot_cpu) |
128 | printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", | ||
129 | smp_processor_id(), boot_pat_state, pat); | ||
126 | } | 130 | } |
127 | 131 | ||
128 | #undef PAT | 132 | #undef PAT |
@@ -148,11 +152,10 @@ static char *cattr_name(unsigned long flags) | |||
148 | * areas). All the aliases have the same cache attributes of course. | 152 | * areas). All the aliases have the same cache attributes of course. |
149 | * Zero attributes are represented as holes. | 153 | * Zero attributes are represented as holes. |
150 | * | 154 | * |
151 | * Currently the data structure is a list because the number of mappings | 155 | * The data structure is a list that is also organized as an rbtree |
152 | * are expected to be relatively small. If this should be a problem | 156 | * sorted on the start address of memtype range. |
153 | * it could be changed to a rbtree or similar. | ||
154 | * | 157 | * |
155 | * memtype_lock protects the whole list. | 158 | * memtype_lock protects both the linear list and rbtree. |
156 | */ | 159 | */ |
157 | 160 | ||
158 | struct memtype { | 161 | struct memtype { |
@@ -160,11 +163,53 @@ struct memtype { | |||
160 | u64 end; | 163 | u64 end; |
161 | unsigned long type; | 164 | unsigned long type; |
162 | struct list_head nd; | 165 | struct list_head nd; |
166 | struct rb_node rb; | ||
163 | }; | 167 | }; |
164 | 168 | ||
169 | static struct rb_root memtype_rbroot = RB_ROOT; | ||
165 | static LIST_HEAD(memtype_list); | 170 | static LIST_HEAD(memtype_list); |
166 | static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ | 171 | static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ |
167 | 172 | ||
173 | static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) | ||
174 | { | ||
175 | struct rb_node *node = root->rb_node; | ||
176 | struct memtype *last_lower = NULL; | ||
177 | |||
178 | while (node) { | ||
179 | struct memtype *data = container_of(node, struct memtype, rb); | ||
180 | |||
181 | if (data->start < start) { | ||
182 | last_lower = data; | ||
183 | node = node->rb_right; | ||
184 | } else if (data->start > start) { | ||
185 | node = node->rb_left; | ||
186 | } else | ||
187 | return data; | ||
188 | } | ||
189 | |||
190 | /* Will return NULL if there is no entry with its start <= start */ | ||
191 | return last_lower; | ||
192 | } | ||
193 | |||
194 | static void memtype_rb_insert(struct rb_root *root, struct memtype *data) | ||
195 | { | ||
196 | struct rb_node **new = &(root->rb_node); | ||
197 | struct rb_node *parent = NULL; | ||
198 | |||
199 | while (*new) { | ||
200 | struct memtype *this = container_of(*new, struct memtype, rb); | ||
201 | |||
202 | parent = *new; | ||
203 | if (data->start <= this->start) | ||
204 | new = &((*new)->rb_left); | ||
205 | else if (data->start > this->start) | ||
206 | new = &((*new)->rb_right); | ||
207 | } | ||
208 | |||
209 | rb_link_node(&data->rb, parent, new); | ||
210 | rb_insert_color(&data->rb, root); | ||
211 | } | ||
212 | |||
168 | /* | 213 | /* |
169 | * Does intersection of PAT memory type and MTRR memory type and returns | 214 | * Does intersection of PAT memory type and MTRR memory type and returns |
170 | * the resulting memory type as PAT understands it. | 215 | * the resulting memory type as PAT understands it. |
@@ -218,9 +263,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) | |||
218 | return -EBUSY; | 263 | return -EBUSY; |
219 | } | 264 | } |
220 | 265 | ||
221 | static struct memtype *cached_entry; | ||
222 | static u64 cached_start; | ||
223 | |||
224 | static int pat_pagerange_is_ram(unsigned long start, unsigned long end) | 266 | static int pat_pagerange_is_ram(unsigned long start, unsigned long end) |
225 | { | 267 | { |
226 | int ram_page = 0, not_rampage = 0; | 268 | int ram_page = 0, not_rampage = 0; |
@@ -249,63 +291,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) | |||
249 | } | 291 | } |
250 | 292 | ||
251 | /* | 293 | /* |
252 | * For RAM pages, mark the pages as non WB memory type using | 294 | * For RAM pages, we use page flags to mark the pages with appropriate type. |
253 | * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or | 295 | * Here we do two pass: |
254 | * set_memory_wc() on a RAM page at a time before marking it as WB again. | 296 | * - Find the memtype of all the pages in the range, look for any conflicts |
255 | * This is ok, because only one driver will be owning the page and | 297 | * - In case of no conflicts, set the new memtype for pages in the range |
256 | * doing set_memory_*() calls. | ||
257 | * | 298 | * |
258 | * For now, we use PageNonWB to track that the RAM page is being mapped | 299 | * Caller must hold memtype_lock for atomicity. |
259 | * as non WB. In future, we will have to use one more flag | ||
260 | * (or some other mechanism in page_struct) to distinguish between | ||
261 | * UC and WC mapping. | ||
262 | */ | 300 | */ |
263 | static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, | 301 | static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, |
264 | unsigned long *new_type) | 302 | unsigned long *new_type) |
265 | { | 303 | { |
266 | struct page *page; | 304 | struct page *page; |
267 | u64 pfn, end_pfn; | 305 | u64 pfn; |
306 | |||
307 | if (req_type == _PAGE_CACHE_UC) { | ||
308 | /* We do not support strong UC */ | ||
309 | WARN_ON_ONCE(1); | ||
310 | req_type = _PAGE_CACHE_UC_MINUS; | ||
311 | } | ||
268 | 312 | ||
269 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | 313 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { |
270 | page = pfn_to_page(pfn); | 314 | unsigned long type; |
271 | if (page_mapped(page) || PageNonWB(page)) | ||
272 | goto out; | ||
273 | 315 | ||
274 | SetPageNonWB(page); | 316 | page = pfn_to_page(pfn); |
317 | type = get_page_memtype(page); | ||
318 | if (type != -1) { | ||
319 | printk(KERN_INFO "reserve_ram_pages_type failed " | ||
320 | "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", | ||
321 | start, end, type, req_type); | ||
322 | if (new_type) | ||
323 | *new_type = type; | ||
324 | |||
325 | return -EBUSY; | ||
326 | } | ||
275 | } | 327 | } |
276 | return 0; | ||
277 | 328 | ||
278 | out: | 329 | if (new_type) |
279 | end_pfn = pfn; | 330 | *new_type = req_type; |
280 | for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { | 331 | |
332 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | ||
281 | page = pfn_to_page(pfn); | 333 | page = pfn_to_page(pfn); |
282 | ClearPageNonWB(page); | 334 | set_page_memtype(page, req_type); |
283 | } | 335 | } |
284 | 336 | return 0; | |
285 | return -EINVAL; | ||
286 | } | 337 | } |
287 | 338 | ||
288 | static int free_ram_pages_type(u64 start, u64 end) | 339 | static int free_ram_pages_type(u64 start, u64 end) |
289 | { | 340 | { |
290 | struct page *page; | 341 | struct page *page; |
291 | u64 pfn, end_pfn; | 342 | u64 pfn; |
292 | 343 | ||
293 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | 344 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { |
294 | page = pfn_to_page(pfn); | 345 | page = pfn_to_page(pfn); |
295 | if (page_mapped(page) || !PageNonWB(page)) | 346 | set_page_memtype(page, -1); |
296 | goto out; | ||
297 | |||
298 | ClearPageNonWB(page); | ||
299 | } | 347 | } |
300 | return 0; | 348 | return 0; |
301 | |||
302 | out: | ||
303 | end_pfn = pfn; | ||
304 | for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { | ||
305 | page = pfn_to_page(pfn); | ||
306 | SetPageNonWB(page); | ||
307 | } | ||
308 | return -EINVAL; | ||
309 | } | 349 | } |
310 | 350 | ||
311 | /* | 351 | /* |
@@ -339,6 +379,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
339 | if (new_type) { | 379 | if (new_type) { |
340 | if (req_type == -1) | 380 | if (req_type == -1) |
341 | *new_type = _PAGE_CACHE_WB; | 381 | *new_type = _PAGE_CACHE_WB; |
382 | else if (req_type == _PAGE_CACHE_WC) | ||
383 | *new_type = _PAGE_CACHE_UC_MINUS; | ||
342 | else | 384 | else |
343 | *new_type = req_type & _PAGE_CACHE_MASK; | 385 | *new_type = req_type & _PAGE_CACHE_MASK; |
344 | } | 386 | } |
@@ -364,11 +406,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
364 | *new_type = actual_type; | 406 | *new_type = actual_type; |
365 | 407 | ||
366 | is_range_ram = pat_pagerange_is_ram(start, end); | 408 | is_range_ram = pat_pagerange_is_ram(start, end); |
367 | if (is_range_ram == 1) | 409 | if (is_range_ram == 1) { |
368 | return reserve_ram_pages_type(start, end, req_type, | 410 | |
369 | new_type); | 411 | spin_lock(&memtype_lock); |
370 | else if (is_range_ram < 0) | 412 | err = reserve_ram_pages_type(start, end, req_type, new_type); |
413 | spin_unlock(&memtype_lock); | ||
414 | |||
415 | return err; | ||
416 | } else if (is_range_ram < 0) { | ||
371 | return -EINVAL; | 417 | return -EINVAL; |
418 | } | ||
372 | 419 | ||
373 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); | 420 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); |
374 | if (!new) | 421 | if (!new) |
@@ -380,17 +427,11 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
380 | 427 | ||
381 | spin_lock(&memtype_lock); | 428 | spin_lock(&memtype_lock); |
382 | 429 | ||
383 | if (cached_entry && start >= cached_start) | ||
384 | entry = cached_entry; | ||
385 | else | ||
386 | entry = list_entry(&memtype_list, struct memtype, nd); | ||
387 | |||
388 | /* Search for existing mapping that overlaps the current range */ | 430 | /* Search for existing mapping that overlaps the current range */ |
389 | where = NULL; | 431 | where = NULL; |
390 | list_for_each_entry_continue(entry, &memtype_list, nd) { | 432 | list_for_each_entry(entry, &memtype_list, nd) { |
391 | if (end <= entry->start) { | 433 | if (end <= entry->start) { |
392 | where = entry->nd.prev; | 434 | where = entry->nd.prev; |
393 | cached_entry = list_entry(where, struct memtype, nd); | ||
394 | break; | 435 | break; |
395 | } else if (start <= entry->start) { /* end > entry->start */ | 436 | } else if (start <= entry->start) { /* end > entry->start */ |
396 | err = chk_conflict(new, entry, new_type); | 437 | err = chk_conflict(new, entry, new_type); |
@@ -398,8 +439,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
398 | dprintk("Overlap at 0x%Lx-0x%Lx\n", | 439 | dprintk("Overlap at 0x%Lx-0x%Lx\n", |
399 | entry->start, entry->end); | 440 | entry->start, entry->end); |
400 | where = entry->nd.prev; | 441 | where = entry->nd.prev; |
401 | cached_entry = list_entry(where, | ||
402 | struct memtype, nd); | ||
403 | } | 442 | } |
404 | break; | 443 | break; |
405 | } else if (start < entry->end) { /* start > entry->start */ | 444 | } else if (start < entry->end) { /* start > entry->start */ |
@@ -407,8 +446,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
407 | if (!err) { | 446 | if (!err) { |
408 | dprintk("Overlap at 0x%Lx-0x%Lx\n", | 447 | dprintk("Overlap at 0x%Lx-0x%Lx\n", |
409 | entry->start, entry->end); | 448 | entry->start, entry->end); |
410 | cached_entry = list_entry(entry->nd.prev, | ||
411 | struct memtype, nd); | ||
412 | 449 | ||
413 | /* | 450 | /* |
414 | * Move to right position in the linked | 451 | * Move to right position in the linked |
@@ -436,13 +473,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
436 | return err; | 473 | return err; |
437 | } | 474 | } |
438 | 475 | ||
439 | cached_start = start; | ||
440 | |||
441 | if (where) | 476 | if (where) |
442 | list_add(&new->nd, where); | 477 | list_add(&new->nd, where); |
443 | else | 478 | else |
444 | list_add_tail(&new->nd, &memtype_list); | 479 | list_add_tail(&new->nd, &memtype_list); |
445 | 480 | ||
481 | memtype_rb_insert(&memtype_rbroot, new); | ||
482 | |||
446 | spin_unlock(&memtype_lock); | 483 | spin_unlock(&memtype_lock); |
447 | 484 | ||
448 | dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", | 485 | dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", |
@@ -454,7 +491,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
454 | 491 | ||
455 | int free_memtype(u64 start, u64 end) | 492 | int free_memtype(u64 start, u64 end) |
456 | { | 493 | { |
457 | struct memtype *entry; | 494 | struct memtype *entry, *saved_entry; |
458 | int err = -EINVAL; | 495 | int err = -EINVAL; |
459 | int is_range_ram; | 496 | int is_range_ram; |
460 | 497 | ||
@@ -466,23 +503,58 @@ int free_memtype(u64 start, u64 end) | |||
466 | return 0; | 503 | return 0; |
467 | 504 | ||
468 | is_range_ram = pat_pagerange_is_ram(start, end); | 505 | is_range_ram = pat_pagerange_is_ram(start, end); |
469 | if (is_range_ram == 1) | 506 | if (is_range_ram == 1) { |
470 | return free_ram_pages_type(start, end); | 507 | |
471 | else if (is_range_ram < 0) | 508 | spin_lock(&memtype_lock); |
509 | err = free_ram_pages_type(start, end); | ||
510 | spin_unlock(&memtype_lock); | ||
511 | |||
512 | return err; | ||
513 | } else if (is_range_ram < 0) { | ||
472 | return -EINVAL; | 514 | return -EINVAL; |
515 | } | ||
473 | 516 | ||
474 | spin_lock(&memtype_lock); | 517 | spin_lock(&memtype_lock); |
475 | list_for_each_entry(entry, &memtype_list, nd) { | 518 | |
519 | entry = memtype_rb_search(&memtype_rbroot, start); | ||
520 | if (unlikely(entry == NULL)) | ||
521 | goto unlock_ret; | ||
522 | |||
523 | /* | ||
524 | * Saved entry points to an entry with start same or less than what | ||
525 | * we searched for. Now go through the list in both directions to look | ||
526 | * for the entry that matches with both start and end, with list stored | ||
527 | * in sorted start address | ||
528 | */ | ||
529 | saved_entry = entry; | ||
530 | list_for_each_entry_from(entry, &memtype_list, nd) { | ||
476 | if (entry->start == start && entry->end == end) { | 531 | if (entry->start == start && entry->end == end) { |
477 | if (cached_entry == entry || cached_start == start) | 532 | rb_erase(&entry->rb, &memtype_rbroot); |
478 | cached_entry = NULL; | 533 | list_del(&entry->nd); |
534 | kfree(entry); | ||
535 | err = 0; | ||
536 | break; | ||
537 | } else if (entry->start > start) { | ||
538 | break; | ||
539 | } | ||
540 | } | ||
479 | 541 | ||
542 | if (!err) | ||
543 | goto unlock_ret; | ||
544 | |||
545 | entry = saved_entry; | ||
546 | list_for_each_entry_reverse(entry, &memtype_list, nd) { | ||
547 | if (entry->start == start && entry->end == end) { | ||
548 | rb_erase(&entry->rb, &memtype_rbroot); | ||
480 | list_del(&entry->nd); | 549 | list_del(&entry->nd); |
481 | kfree(entry); | 550 | kfree(entry); |
482 | err = 0; | 551 | err = 0; |
483 | break; | 552 | break; |
553 | } else if (entry->start < start) { | ||
554 | break; | ||
484 | } | 555 | } |
485 | } | 556 | } |
557 | unlock_ret: | ||
486 | spin_unlock(&memtype_lock); | 558 | spin_unlock(&memtype_lock); |
487 | 559 | ||
488 | if (err) { | 560 | if (err) { |
@@ -496,6 +568,101 @@ int free_memtype(u64 start, u64 end) | |||
496 | } | 568 | } |
497 | 569 | ||
498 | 570 | ||
571 | /** | ||
572 | * lookup_memtype - Looksup the memory type for a physical address | ||
573 | * @paddr: physical address of which memory type needs to be looked up | ||
574 | * | ||
575 | * Only to be called when PAT is enabled | ||
576 | * | ||
577 | * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or | ||
578 | * _PAGE_CACHE_UC | ||
579 | */ | ||
580 | static unsigned long lookup_memtype(u64 paddr) | ||
581 | { | ||
582 | int rettype = _PAGE_CACHE_WB; | ||
583 | struct memtype *entry; | ||
584 | |||
585 | if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) | ||
586 | return rettype; | ||
587 | |||
588 | if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { | ||
589 | struct page *page; | ||
590 | spin_lock(&memtype_lock); | ||
591 | page = pfn_to_page(paddr >> PAGE_SHIFT); | ||
592 | rettype = get_page_memtype(page); | ||
593 | spin_unlock(&memtype_lock); | ||
594 | /* | ||
595 | * -1 from get_page_memtype() implies RAM page is in its | ||
596 | * default state and not reserved, and hence of type WB | ||
597 | */ | ||
598 | if (rettype == -1) | ||
599 | rettype = _PAGE_CACHE_WB; | ||
600 | |||
601 | return rettype; | ||
602 | } | ||
603 | |||
604 | spin_lock(&memtype_lock); | ||
605 | |||
606 | entry = memtype_rb_search(&memtype_rbroot, paddr); | ||
607 | if (entry != NULL) | ||
608 | rettype = entry->type; | ||
609 | else | ||
610 | rettype = _PAGE_CACHE_UC_MINUS; | ||
611 | |||
612 | spin_unlock(&memtype_lock); | ||
613 | return rettype; | ||
614 | } | ||
615 | |||
616 | /** | ||
617 | * io_reserve_memtype - Request a memory type mapping for a region of memory | ||
618 | * @start: start (physical address) of the region | ||
619 | * @end: end (physical address) of the region | ||
620 | * @type: A pointer to memtype, with requested type. On success, requested | ||
621 | * or any other compatible type that was available for the region is returned | ||
622 | * | ||
623 | * On success, returns 0 | ||
624 | * On failure, returns non-zero | ||
625 | */ | ||
626 | int io_reserve_memtype(resource_size_t start, resource_size_t end, | ||
627 | unsigned long *type) | ||
628 | { | ||
629 | resource_size_t size = end - start; | ||
630 | unsigned long req_type = *type; | ||
631 | unsigned long new_type; | ||
632 | int ret; | ||
633 | |||
634 | WARN_ON_ONCE(iomem_map_sanity_check(start, size)); | ||
635 | |||
636 | ret = reserve_memtype(start, end, req_type, &new_type); | ||
637 | if (ret) | ||
638 | goto out_err; | ||
639 | |||
640 | if (!is_new_memtype_allowed(start, size, req_type, new_type)) | ||
641 | goto out_free; | ||
642 | |||
643 | if (kernel_map_sync_memtype(start, size, new_type) < 0) | ||
644 | goto out_free; | ||
645 | |||
646 | *type = new_type; | ||
647 | return 0; | ||
648 | |||
649 | out_free: | ||
650 | free_memtype(start, end); | ||
651 | ret = -EBUSY; | ||
652 | out_err: | ||
653 | return ret; | ||
654 | } | ||
655 | |||
656 | /** | ||
657 | * io_free_memtype - Release a memory type mapping for a region of memory | ||
658 | * @start: start (physical address) of the region | ||
659 | * @end: end (physical address) of the region | ||
660 | */ | ||
661 | void io_free_memtype(resource_size_t start, resource_size_t end) | ||
662 | { | ||
663 | free_memtype(start, end); | ||
664 | } | ||
665 | |||
499 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | 666 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, |
500 | unsigned long size, pgprot_t vma_prot) | 667 | unsigned long size, pgprot_t vma_prot) |
501 | { | 668 | { |
@@ -577,7 +744,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) | |||
577 | { | 744 | { |
578 | unsigned long id_sz; | 745 | unsigned long id_sz; |
579 | 746 | ||
580 | if (!pat_enabled || base >= __pa(high_memory)) | 747 | if (base >= __pa(high_memory)) |
581 | return 0; | 748 | return 0; |
582 | 749 | ||
583 | id_sz = (__pa(high_memory) < base + size) ? | 750 | id_sz = (__pa(high_memory) < base + size) ? |
@@ -612,18 +779,37 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, | |||
612 | is_ram = pat_pagerange_is_ram(paddr, paddr + size); | 779 | is_ram = pat_pagerange_is_ram(paddr, paddr + size); |
613 | 780 | ||
614 | /* | 781 | /* |
615 | * reserve_pfn_range() doesn't support RAM pages. Maintain the current | 782 | * reserve_pfn_range() for RAM pages. We do not refcount to keep |
616 | * behavior with RAM pages by returning success. | 783 | * track of number of mappings of RAM pages. We can assert that |
784 | * the type requested matches the type of first page in the range. | ||
617 | */ | 785 | */ |
618 | if (is_ram != 0) | 786 | if (is_ram) { |
787 | if (!pat_enabled) | ||
788 | return 0; | ||
789 | |||
790 | flags = lookup_memtype(paddr); | ||
791 | if (want_flags != flags) { | ||
792 | printk(KERN_WARNING | ||
793 | "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", | ||
794 | current->comm, current->pid, | ||
795 | cattr_name(want_flags), | ||
796 | (unsigned long long)paddr, | ||
797 | (unsigned long long)(paddr + size), | ||
798 | cattr_name(flags)); | ||
799 | *vma_prot = __pgprot((pgprot_val(*vma_prot) & | ||
800 | (~_PAGE_CACHE_MASK)) | | ||
801 | flags); | ||
802 | } | ||
619 | return 0; | 803 | return 0; |
804 | } | ||
620 | 805 | ||
621 | ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); | 806 | ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); |
622 | if (ret) | 807 | if (ret) |
623 | return ret; | 808 | return ret; |
624 | 809 | ||
625 | if (flags != want_flags) { | 810 | if (flags != want_flags) { |
626 | if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { | 811 | if (strict_prot || |
812 | !is_new_memtype_allowed(paddr, size, want_flags, flags)) { | ||
627 | free_memtype(paddr, paddr + size); | 813 | free_memtype(paddr, paddr + size); |
628 | printk(KERN_ERR "%s:%d map pfn expected mapping type %s" | 814 | printk(KERN_ERR "%s:%d map pfn expected mapping type %s" |
629 | " for %Lx-%Lx, got %s\n", | 815 | " for %Lx-%Lx, got %s\n", |
@@ -677,14 +863,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) | |||
677 | unsigned long vma_size = vma->vm_end - vma->vm_start; | 863 | unsigned long vma_size = vma->vm_end - vma->vm_start; |
678 | pgprot_t pgprot; | 864 | pgprot_t pgprot; |
679 | 865 | ||
680 | if (!pat_enabled) | ||
681 | return 0; | ||
682 | |||
683 | /* | ||
684 | * For now, only handle remap_pfn_range() vmas where | ||
685 | * is_linear_pfn_mapping() == TRUE. Handling of | ||
686 | * vm_insert_pfn() is TBD. | ||
687 | */ | ||
688 | if (is_linear_pfn_mapping(vma)) { | 866 | if (is_linear_pfn_mapping(vma)) { |
689 | /* | 867 | /* |
690 | * reserve the whole chunk covered by vma. We need the | 868 | * reserve the whole chunk covered by vma. We need the |
@@ -712,23 +890,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) | |||
712 | int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, | 890 | int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, |
713 | unsigned long pfn, unsigned long size) | 891 | unsigned long pfn, unsigned long size) |
714 | { | 892 | { |
893 | unsigned long flags; | ||
715 | resource_size_t paddr; | 894 | resource_size_t paddr; |
716 | unsigned long vma_size = vma->vm_end - vma->vm_start; | 895 | unsigned long vma_size = vma->vm_end - vma->vm_start; |
717 | 896 | ||
718 | if (!pat_enabled) | ||
719 | return 0; | ||
720 | |||
721 | /* | ||
722 | * For now, only handle remap_pfn_range() vmas where | ||
723 | * is_linear_pfn_mapping() == TRUE. Handling of | ||
724 | * vm_insert_pfn() is TBD. | ||
725 | */ | ||
726 | if (is_linear_pfn_mapping(vma)) { | 897 | if (is_linear_pfn_mapping(vma)) { |
727 | /* reserve the whole chunk starting from vm_pgoff */ | 898 | /* reserve the whole chunk starting from vm_pgoff */ |
728 | paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; | 899 | paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; |
729 | return reserve_pfn_range(paddr, vma_size, prot, 0); | 900 | return reserve_pfn_range(paddr, vma_size, prot, 0); |
730 | } | 901 | } |
731 | 902 | ||
903 | if (!pat_enabled) | ||
904 | return 0; | ||
905 | |||
906 | /* for vm_insert_pfn and friends, we set prot based on lookup */ | ||
907 | flags = lookup_memtype(pfn << PAGE_SHIFT); | ||
908 | *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | | ||
909 | flags); | ||
910 | |||
732 | return 0; | 911 | return 0; |
733 | } | 912 | } |
734 | 913 | ||
@@ -743,14 +922,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, | |||
743 | resource_size_t paddr; | 922 | resource_size_t paddr; |
744 | unsigned long vma_size = vma->vm_end - vma->vm_start; | 923 | unsigned long vma_size = vma->vm_end - vma->vm_start; |
745 | 924 | ||
746 | if (!pat_enabled) | ||
747 | return; | ||
748 | |||
749 | /* | ||
750 | * For now, only handle remap_pfn_range() vmas where | ||
751 | * is_linear_pfn_mapping() == TRUE. Handling of | ||
752 | * vm_insert_pfn() is TBD. | ||
753 | */ | ||
754 | if (is_linear_pfn_mapping(vma)) { | 925 | if (is_linear_pfn_mapping(vma)) { |
755 | /* free the whole chunk starting from vm_pgoff */ | 926 | /* free the whole chunk starting from vm_pgoff */ |
756 | paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; | 927 | paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; |
@@ -826,7 +997,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v) | |||
826 | return 0; | 997 | return 0; |
827 | } | 998 | } |
828 | 999 | ||
829 | static struct seq_operations memtype_seq_ops = { | 1000 | static const struct seq_operations memtype_seq_ops = { |
830 | .start = memtype_seq_start, | 1001 | .start = memtype_seq_start, |
831 | .next = memtype_seq_next, | 1002 | .next = memtype_seq_next, |
832 | .stop = memtype_seq_stop, | 1003 | .stop = memtype_seq_stop, |
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c new file mode 100644 index 00000000000..d2e2735327b --- /dev/null +++ b/arch/x86/mm/physaddr.c | |||
@@ -0,0 +1,70 @@ | |||
1 | #include <linux/mmdebug.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/mm.h> | ||
4 | |||
5 | #include <asm/page.h> | ||
6 | |||
7 | #include "physaddr.h" | ||
8 | |||
9 | #ifdef CONFIG_X86_64 | ||
10 | |||
11 | unsigned long __phys_addr(unsigned long x) | ||
12 | { | ||
13 | if (x >= __START_KERNEL_map) { | ||
14 | x -= __START_KERNEL_map; | ||
15 | VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); | ||
16 | x += phys_base; | ||
17 | } else { | ||
18 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | ||
19 | x -= PAGE_OFFSET; | ||
20 | VIRTUAL_BUG_ON(!phys_addr_valid(x)); | ||
21 | } | ||
22 | return x; | ||
23 | } | ||
24 | EXPORT_SYMBOL(__phys_addr); | ||
25 | |||
26 | bool __virt_addr_valid(unsigned long x) | ||
27 | { | ||
28 | if (x >= __START_KERNEL_map) { | ||
29 | x -= __START_KERNEL_map; | ||
30 | if (x >= KERNEL_IMAGE_SIZE) | ||
31 | return false; | ||
32 | x += phys_base; | ||
33 | } else { | ||
34 | if (x < PAGE_OFFSET) | ||
35 | return false; | ||
36 | x -= PAGE_OFFSET; | ||
37 | if (!phys_addr_valid(x)) | ||
38 | return false; | ||
39 | } | ||
40 | |||
41 | return pfn_valid(x >> PAGE_SHIFT); | ||
42 | } | ||
43 | EXPORT_SYMBOL(__virt_addr_valid); | ||
44 | |||
45 | #else | ||
46 | |||
47 | #ifdef CONFIG_DEBUG_VIRTUAL | ||
48 | unsigned long __phys_addr(unsigned long x) | ||
49 | { | ||
50 | /* VMALLOC_* aren't constants */ | ||
51 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | ||
52 | VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); | ||
53 | return x - PAGE_OFFSET; | ||
54 | } | ||
55 | EXPORT_SYMBOL(__phys_addr); | ||
56 | #endif | ||
57 | |||
58 | bool __virt_addr_valid(unsigned long x) | ||
59 | { | ||
60 | if (x < PAGE_OFFSET) | ||
61 | return false; | ||
62 | if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) | ||
63 | return false; | ||
64 | if (x >= FIXADDR_START) | ||
65 | return false; | ||
66 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); | ||
67 | } | ||
68 | EXPORT_SYMBOL(__virt_addr_valid); | ||
69 | |||
70 | #endif /* CONFIG_X86_64 */ | ||
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h new file mode 100644 index 00000000000..a3cd5a0c97b --- /dev/null +++ b/arch/x86/mm/physaddr.h | |||
@@ -0,0 +1,10 @@ | |||
1 | #include <asm/processor.h> | ||
2 | |||
3 | static inline int phys_addr_valid(resource_size_t addr) | ||
4 | { | ||
5 | #ifdef CONFIG_PHYS_ADDR_T_64BIT | ||
6 | return !(addr >> boot_cpu_data.x86_phys_bits); | ||
7 | #else | ||
8 | return 1; | ||
9 | #endif | ||
10 | } | ||
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c new file mode 100644 index 00000000000..513d8ed5d2e --- /dev/null +++ b/arch/x86/mm/setup_nx.c | |||
@@ -0,0 +1,69 @@ | |||
1 | #include <linux/spinlock.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/init.h> | ||
4 | |||
5 | #include <asm/pgtable.h> | ||
6 | |||
7 | int nx_enabled; | ||
8 | |||
9 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | ||
10 | static int disable_nx __cpuinitdata; | ||
11 | |||
12 | /* | ||
13 | * noexec = on|off | ||
14 | * | ||
15 | * Control non-executable mappings for processes. | ||
16 | * | ||
17 | * on Enable | ||
18 | * off Disable | ||
19 | */ | ||
20 | static int __init noexec_setup(char *str) | ||
21 | { | ||
22 | if (!str) | ||
23 | return -EINVAL; | ||
24 | if (!strncmp(str, "on", 2)) { | ||
25 | __supported_pte_mask |= _PAGE_NX; | ||
26 | disable_nx = 0; | ||
27 | } else if (!strncmp(str, "off", 3)) { | ||
28 | disable_nx = 1; | ||
29 | __supported_pte_mask &= ~_PAGE_NX; | ||
30 | } | ||
31 | return 0; | ||
32 | } | ||
33 | early_param("noexec", noexec_setup); | ||
34 | #endif | ||
35 | |||
36 | #ifdef CONFIG_X86_PAE | ||
37 | void __init set_nx(void) | ||
38 | { | ||
39 | unsigned int v[4], l, h; | ||
40 | |||
41 | if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { | ||
42 | cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); | ||
43 | |||
44 | if ((v[3] & (1 << 20)) && !disable_nx) { | ||
45 | rdmsr(MSR_EFER, l, h); | ||
46 | l |= EFER_NX; | ||
47 | wrmsr(MSR_EFER, l, h); | ||
48 | nx_enabled = 1; | ||
49 | __supported_pte_mask |= _PAGE_NX; | ||
50 | } | ||
51 | } | ||
52 | } | ||
53 | #else | ||
54 | void set_nx(void) | ||
55 | { | ||
56 | } | ||
57 | #endif | ||
58 | |||
59 | #ifdef CONFIG_X86_64 | ||
60 | void __cpuinit check_efer(void) | ||
61 | { | ||
62 | unsigned long efer; | ||
63 | |||
64 | rdmsrl(MSR_EFER, efer); | ||
65 | if (!(efer & EFER_NX) || disable_nx) | ||
66 | __supported_pte_mask &= ~_PAGE_NX; | ||
67 | } | ||
68 | #endif | ||
69 | |||
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 29a0e37114f..6f8aa33031c 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c | |||
@@ -215,7 +215,7 @@ int __init get_memcfg_from_srat(void) | |||
215 | goto out_fail; | 215 | goto out_fail; |
216 | 216 | ||
217 | if (num_memory_chunks == 0) { | 217 | if (num_memory_chunks == 0) { |
218 | printk(KERN_WARNING | 218 | printk(KERN_DEBUG |
219 | "could not find any ACPI SRAT memory areas.\n"); | 219 | "could not find any ACPI SRAT memory areas.\n"); |
220 | goto out_fail; | 220 | goto out_fail; |
221 | } | 221 | } |
@@ -277,7 +277,7 @@ int __init get_memcfg_from_srat(void) | |||
277 | } | 277 | } |
278 | return 1; | 278 | return 1; |
279 | out_fail: | 279 | out_fail: |
280 | printk(KERN_ERR "failed to get NUMA memory information from SRAT" | 280 | printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" |
281 | " table\n"); | 281 | " table\n"); |
282 | return 0; | 282 | return 0; |
283 | } | 283 | } |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 821e97017e9..36fe08eeb5c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -59,7 +59,8 @@ void leave_mm(int cpu) | |||
59 | { | 59 | { |
60 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) | 60 | if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) |
61 | BUG(); | 61 | BUG(); |
62 | cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); | 62 | cpumask_clear_cpu(cpu, |
63 | mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); | ||
63 | load_cr3(swapper_pg_dir); | 64 | load_cr3(swapper_pg_dir); |
64 | } | 65 | } |
65 | EXPORT_SYMBOL_GPL(leave_mm); | 66 | EXPORT_SYMBOL_GPL(leave_mm); |
@@ -183,18 +184,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
183 | 184 | ||
184 | f->flush_mm = mm; | 185 | f->flush_mm = mm; |
185 | f->flush_va = va; | 186 | f->flush_va = va; |
186 | cpumask_andnot(to_cpumask(f->flush_cpumask), | 187 | if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { |
187 | cpumask, cpumask_of(smp_processor_id())); | 188 | /* |
188 | 189 | * We have to send the IPI only to | |
189 | /* | 190 | * CPUs affected. |
190 | * We have to send the IPI only to | 191 | */ |
191 | * CPUs affected. | 192 | apic->send_IPI_mask(to_cpumask(f->flush_cpumask), |
192 | */ | 193 | INVALIDATE_TLB_VECTOR_START + sender); |
193 | apic->send_IPI_mask(to_cpumask(f->flush_cpumask), | ||
194 | INVALIDATE_TLB_VECTOR_START + sender); | ||
195 | 194 | ||
196 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) | 195 | while (!cpumask_empty(to_cpumask(f->flush_cpumask))) |
197 | cpu_relax(); | 196 | cpu_relax(); |
197 | } | ||
198 | 198 | ||
199 | f->flush_mm = NULL; | 199 | f->flush_mm = NULL; |
200 | f->flush_va = 0; | 200 | f->flush_va = 0; |
@@ -235,8 +235,8 @@ void flush_tlb_current_task(void) | |||
235 | preempt_disable(); | 235 | preempt_disable(); |
236 | 236 | ||
237 | local_flush_tlb(); | 237 | local_flush_tlb(); |
238 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | 238 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
239 | flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); | 239 | flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); |
240 | preempt_enable(); | 240 | preempt_enable(); |
241 | } | 241 | } |
242 | 242 | ||
@@ -250,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm) | |||
250 | else | 250 | else |
251 | leave_mm(smp_processor_id()); | 251 | leave_mm(smp_processor_id()); |
252 | } | 252 | } |
253 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | 253 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
254 | flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); | 254 | flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); |
255 | 255 | ||
256 | preempt_enable(); | 256 | preempt_enable(); |
257 | } | 257 | } |
@@ -269,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) | |||
269 | leave_mm(smp_processor_id()); | 269 | leave_mm(smp_processor_id()); |
270 | } | 270 | } |
271 | 271 | ||
272 | if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) | 272 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
273 | flush_tlb_others(&mm->cpu_vm_mask, mm, va); | 273 | flush_tlb_others(mm_cpumask(mm), mm, va); |
274 | 274 | ||
275 | preempt_enable(); | 275 | preempt_enable(); |
276 | } | 276 | } |
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 89b9a5cd63d..cb88b1a0bd5 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -1,11 +1,14 @@ | |||
1 | /** | 1 | /** |
2 | * @file nmi_int.c | 2 | * @file nmi_int.c |
3 | * | 3 | * |
4 | * @remark Copyright 2002-2008 OProfile authors | 4 | * @remark Copyright 2002-2009 OProfile authors |
5 | * @remark Read the file COPYING | 5 | * @remark Read the file COPYING |
6 | * | 6 | * |
7 | * @author John Levon <levon@movementarian.org> | 7 | * @author John Levon <levon@movementarian.org> |
8 | * @author Robert Richter <robert.richter@amd.com> | 8 | * @author Robert Richter <robert.richter@amd.com> |
9 | * @author Barry Kasindorf <barry.kasindorf@amd.com> | ||
10 | * @author Jason Yeh <jason.yeh@amd.com> | ||
11 | * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> | ||
9 | */ | 12 | */ |
10 | 13 | ||
11 | #include <linux/init.h> | 14 | #include <linux/init.h> |
@@ -24,13 +27,35 @@ | |||
24 | #include "op_counter.h" | 27 | #include "op_counter.h" |
25 | #include "op_x86_model.h" | 28 | #include "op_x86_model.h" |
26 | 29 | ||
27 | static struct op_x86_model_spec const *model; | 30 | static struct op_x86_model_spec *model; |
28 | static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); | 31 | static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); |
29 | static DEFINE_PER_CPU(unsigned long, saved_lvtpc); | 32 | static DEFINE_PER_CPU(unsigned long, saved_lvtpc); |
30 | 33 | ||
31 | /* 0 == registered but off, 1 == registered and on */ | 34 | /* 0 == registered but off, 1 == registered and on */ |
32 | static int nmi_enabled = 0; | 35 | static int nmi_enabled = 0; |
33 | 36 | ||
37 | struct op_counter_config counter_config[OP_MAX_COUNTER]; | ||
38 | |||
39 | /* common functions */ | ||
40 | |||
41 | u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, | ||
42 | struct op_counter_config *counter_config) | ||
43 | { /* assemble the event-select control value for one counter from its counter_config entry */ | ||
44 | u64 val = 0; | ||
45 | u16 event = (u16)counter_config->event; | ||
46 | |||
47 | val |= ARCH_PERFMON_EVENTSEL_INT; | ||
48 | val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; | ||
49 | val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; | ||
50 | val |= (counter_config->unit_mask & 0xFF) << 8; | ||
51 | event &= model->event_mask ? model->event_mask : 0xFF; /* models without an event_mask only get the low 8 event bits */ | ||
52 | val |= event & 0xFF; | ||
53 | val |= (u64)(event & 0x0F00) << 24; /* widen first: event bits 8-11 go to bits 32-35, which a 32-bit int shift would lose */ | ||
54 | |||
55 | return val; | ||
56 | } | ||
57 | |||
58 | |||
34 | static int profile_exceptions_notify(struct notifier_block *self, | 59 | static int profile_exceptions_notify(struct notifier_block *self, |
35 | unsigned long val, void *data) | 60 | unsigned long val, void *data) |
36 | { | 61 | { |
@@ -52,36 +77,214 @@ static int profile_exceptions_notify(struct notifier_block *self, | |||
52 | 77 | ||
53 | static void nmi_cpu_save_registers(struct op_msrs *msrs) | 78 | static void nmi_cpu_save_registers(struct op_msrs *msrs) |
54 | { | 79 | { |
55 | unsigned int const nr_ctrs = model->num_counters; | ||
56 | unsigned int const nr_ctrls = model->num_controls; | ||
57 | struct op_msr *counters = msrs->counters; | 80 | struct op_msr *counters = msrs->counters; |
58 | struct op_msr *controls = msrs->controls; | 81 | struct op_msr *controls = msrs->controls; |
59 | unsigned int i; | 82 | unsigned int i; |
60 | 83 | ||
61 | for (i = 0; i < nr_ctrs; ++i) { | 84 | for (i = 0; i < model->num_counters; ++i) { |
62 | if (counters[i].addr) { | 85 | if (counters[i].addr) |
63 | rdmsr(counters[i].addr, | 86 | rdmsrl(counters[i].addr, counters[i].saved); |
64 | counters[i].saved.low, | 87 | } |
65 | counters[i].saved.high); | 88 | |
66 | } | 89 | for (i = 0; i < model->num_controls; ++i) { |
90 | if (controls[i].addr) | ||
91 | rdmsrl(controls[i].addr, controls[i].saved); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | static void nmi_cpu_start(void *dummy) | ||
96 | { | ||
97 | struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); | ||
98 | model->start(msrs); | ||
99 | } | ||
100 | |||
101 | static int nmi_start(void) | ||
102 | { | ||
103 | on_each_cpu(nmi_cpu_start, NULL, 1); | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | static void nmi_cpu_stop(void *dummy) | ||
108 | { | ||
109 | struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); | ||
110 | model->stop(msrs); | ||
111 | } | ||
112 | |||
113 | static void nmi_stop(void) | ||
114 | { | ||
115 | on_each_cpu(nmi_cpu_stop, NULL, 1); | ||
116 | } | ||
117 | |||
118 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | ||
119 | |||
120 | static DEFINE_PER_CPU(int, switch_index); | ||
121 | |||
122 | static inline int has_mux(void) | ||
123 | { | ||
124 | return !!model->switch_ctrl; | ||
125 | } | ||
126 | |||
127 | inline int op_x86_phys_to_virt(int phys) | ||
128 | { | ||
129 | return __get_cpu_var(switch_index) + phys; | ||
130 | } | ||
131 | |||
132 | inline int op_x86_virt_to_phys(int virt) | ||
133 | { | ||
134 | return virt % model->num_counters; | ||
135 | } | ||
136 | |||
137 | static void nmi_shutdown_mux(void) | ||
138 | { | ||
139 | int i; | ||
140 | |||
141 | if (!has_mux()) | ||
142 | return; | ||
143 | |||
144 | for_each_possible_cpu(i) { | ||
145 | kfree(per_cpu(cpu_msrs, i).multiplex); | ||
146 | per_cpu(cpu_msrs, i).multiplex = NULL; | ||
147 | per_cpu(switch_index, i) = 0; | ||
67 | } | 148 | } |
149 | } | ||
150 | |||
151 | static int nmi_setup_mux(void) | ||
152 | { | ||
153 | size_t multiplex_size = | ||
154 | sizeof(struct op_msr) * model->num_virt_counters; | ||
155 | int i; | ||
156 | |||
157 | if (!has_mux()) | ||
158 | return 1; | ||
159 | |||
160 | for_each_possible_cpu(i) { | ||
161 | per_cpu(cpu_msrs, i).multiplex = | ||
162 | kmalloc(multiplex_size, GFP_KERNEL); | ||
163 | if (!per_cpu(cpu_msrs, i).multiplex) | ||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | return 1; | ||
168 | } | ||
169 | |||
170 | static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) | ||
171 | { | ||
172 | int i; | ||
173 | struct op_msr *multiplex = msrs->multiplex; | ||
174 | |||
175 | if (!has_mux()) | ||
176 | return; | ||
68 | 177 | ||
69 | for (i = 0; i < nr_ctrls; ++i) { | 178 | for (i = 0; i < model->num_virt_counters; ++i) { |
70 | if (controls[i].addr) { | 179 | if (counter_config[i].enabled) { |
71 | rdmsr(controls[i].addr, | 180 | multiplex[i].saved = -(u64)counter_config[i].count; |
72 | controls[i].saved.low, | 181 | } else { |
73 | controls[i].saved.high); | 182 | multiplex[i].addr = 0; |
183 | multiplex[i].saved = 0; | ||
74 | } | 184 | } |
75 | } | 185 | } |
186 | |||
187 | per_cpu(switch_index, cpu) = 0; | ||
188 | } | ||
189 | |||
190 | static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) | ||
191 | { | ||
192 | struct op_msr *multiplex = msrs->multiplex; | ||
193 | int i; | ||
194 | |||
195 | for (i = 0; i < model->num_counters; ++i) { | ||
196 | int virt = op_x86_phys_to_virt(i); | ||
197 | if (multiplex[virt].addr) | ||
198 | rdmsrl(multiplex[virt].addr, multiplex[virt].saved); | ||
199 | } | ||
200 | } | ||
201 | |||
202 | static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) | ||
203 | { | ||
204 | struct op_msr *multiplex = msrs->multiplex; | ||
205 | int i; | ||
206 | |||
207 | for (i = 0; i < model->num_counters; ++i) { | ||
208 | int virt = op_x86_phys_to_virt(i); | ||
209 | if (multiplex[virt].addr) | ||
210 | wrmsrl(multiplex[virt].addr, multiplex[virt].saved); | ||
211 | } | ||
76 | } | 212 | } |
77 | 213 | ||
78 | static void nmi_save_registers(void *dummy) | 214 | static void nmi_cpu_switch(void *dummy) |
79 | { | 215 | { |
80 | int cpu = smp_processor_id(); | 216 | int cpu = smp_processor_id(); |
217 | int si = per_cpu(switch_index, cpu); | ||
81 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); | 218 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); |
82 | nmi_cpu_save_registers(msrs); | 219 | |
220 | nmi_cpu_stop(NULL); | ||
221 | nmi_cpu_save_mpx_registers(msrs); | ||
222 | |||
223 | /* move to next set; wrap to set 0 when si reaches the end of the virtual counters or an unconfigured counter */ | ||
224 | si += model->num_counters; | ||
225 | if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) /* >= so counter_config[] is never indexed at num_virt_counters */ | ||
226 | per_cpu(switch_index, cpu) = 0; | ||
227 | else | ||
228 | per_cpu(switch_index, cpu) = si; | ||
229 | |||
230 | model->switch_ctrl(model, msrs); | ||
231 | nmi_cpu_restore_mpx_registers(msrs); | ||
232 | |||
233 | nmi_cpu_start(NULL); | ||
234 | } | ||
235 | |||
236 | |||
237 | /* | ||
238 | * Quick check to see if multiplexing is necessary. | ||
239 | * The check should be sufficient since counters are used | ||
240 | * in order. | ||
241 | */ | ||
242 | static int nmi_multiplex_on(void) | ||
243 | { | ||
244 | return counter_config[model->num_counters].count ? 0 : -EINVAL; | ||
245 | } | ||
246 | |||
247 | static int nmi_switch_event(void) | ||
248 | { | ||
249 | if (!has_mux()) | ||
250 | return -ENOSYS; /* not implemented */ | ||
251 | if (nmi_multiplex_on() < 0) | ||
252 | return -EINVAL; /* not necessary */ | ||
253 | |||
254 | on_each_cpu(nmi_cpu_switch, NULL, 1); | ||
255 | |||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static inline void mux_init(struct oprofile_operations *ops) | ||
260 | { | ||
261 | if (has_mux()) | ||
262 | ops->switch_events = nmi_switch_event; | ||
263 | } | ||
264 | |||
265 | static void mux_clone(int cpu) | ||
266 | { | ||
267 | if (!has_mux()) | ||
268 | return; | ||
269 | |||
270 | memcpy(per_cpu(cpu_msrs, cpu).multiplex, | ||
271 | per_cpu(cpu_msrs, 0).multiplex, | ||
272 | sizeof(struct op_msr) * model->num_virt_counters); | ||
83 | } | 273 | } |
84 | 274 | ||
275 | #else | ||
276 | |||
277 | inline int op_x86_phys_to_virt(int phys) { return phys; } | ||
278 | inline int op_x86_virt_to_phys(int virt) { return virt; } | ||
279 | static inline void nmi_shutdown_mux(void) { } | ||
280 | static inline int nmi_setup_mux(void) { return 1; } | ||
281 | static inline void | ||
282 | nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } | ||
283 | static inline void mux_init(struct oprofile_operations *ops) { } | ||
284 | static void mux_clone(int cpu) { } | ||
285 | |||
286 | #endif | ||
287 | |||
85 | static void free_msrs(void) | 288 | static void free_msrs(void) |
86 | { | 289 | { |
87 | int i; | 290 | int i; |
@@ -95,38 +298,32 @@ static void free_msrs(void) | |||
95 | 298 | ||
96 | static int allocate_msrs(void) | 299 | static int allocate_msrs(void) |
97 | { | 300 | { |
98 | int success = 1; | ||
99 | size_t controls_size = sizeof(struct op_msr) * model->num_controls; | 301 | size_t controls_size = sizeof(struct op_msr) * model->num_controls; |
100 | size_t counters_size = sizeof(struct op_msr) * model->num_counters; | 302 | size_t counters_size = sizeof(struct op_msr) * model->num_counters; |
101 | 303 | ||
102 | int i; | 304 | int i; |
103 | for_each_possible_cpu(i) { | 305 | for_each_possible_cpu(i) { |
104 | per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, | 306 | per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, |
105 | GFP_KERNEL); | 307 | GFP_KERNEL); |
106 | if (!per_cpu(cpu_msrs, i).counters) { | 308 | if (!per_cpu(cpu_msrs, i).counters) |
107 | success = 0; | 309 | return 0; |
108 | break; | ||
109 | } | ||
110 | per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, | 310 | per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, |
111 | GFP_KERNEL); | 311 | GFP_KERNEL); |
112 | if (!per_cpu(cpu_msrs, i).controls) { | 312 | if (!per_cpu(cpu_msrs, i).controls) |
113 | success = 0; | 313 | return 0; |
114 | break; | ||
115 | } | ||
116 | } | 314 | } |
117 | 315 | ||
118 | if (!success) | 316 | return 1; |
119 | free_msrs(); | ||
120 | |||
121 | return success; | ||
122 | } | 317 | } |
123 | 318 | ||
124 | static void nmi_cpu_setup(void *dummy) | 319 | static void nmi_cpu_setup(void *dummy) |
125 | { | 320 | { |
126 | int cpu = smp_processor_id(); | 321 | int cpu = smp_processor_id(); |
127 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); | 322 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); |
323 | nmi_cpu_save_registers(msrs); | ||
128 | spin_lock(&oprofilefs_lock); | 324 | spin_lock(&oprofilefs_lock); |
129 | model->setup_ctrs(msrs); | 325 | model->setup_ctrs(model, msrs); |
326 | nmi_cpu_setup_mux(cpu, msrs); | ||
130 | spin_unlock(&oprofilefs_lock); | 327 | spin_unlock(&oprofilefs_lock); |
131 | per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); | 328 | per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); |
132 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 329 | apic_write(APIC_LVTPC, APIC_DM_NMI); |
@@ -144,11 +341,15 @@ static int nmi_setup(void) | |||
144 | int cpu; | 341 | int cpu; |
145 | 342 | ||
146 | if (!allocate_msrs()) | 343 | if (!allocate_msrs()) |
147 | return -ENOMEM; | 344 | err = -ENOMEM; |
345 | else if (!nmi_setup_mux()) | ||
346 | err = -ENOMEM; | ||
347 | else | ||
348 | err = register_die_notifier(&profile_exceptions_nb); | ||
148 | 349 | ||
149 | err = register_die_notifier(&profile_exceptions_nb); | ||
150 | if (err) { | 350 | if (err) { |
151 | free_msrs(); | 351 | free_msrs(); |
352 | nmi_shutdown_mux(); | ||
152 | return err; | 353 | return err; |
153 | } | 354 | } |
154 | 355 | ||
@@ -159,45 +360,38 @@ static int nmi_setup(void) | |||
159 | /* Assume saved/restored counters are the same on all CPUs */ | 360 | /* Assume saved/restored counters are the same on all CPUs */ |
160 | model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); | 361 | model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); |
161 | for_each_possible_cpu(cpu) { | 362 | for_each_possible_cpu(cpu) { |
162 | if (cpu != 0) { | 363 | if (!cpu) |
163 | memcpy(per_cpu(cpu_msrs, cpu).counters, | 364 | continue; |
164 | per_cpu(cpu_msrs, 0).counters, | 365 | |
165 | sizeof(struct op_msr) * model->num_counters); | 366 | memcpy(per_cpu(cpu_msrs, cpu).counters, |
166 | 367 | per_cpu(cpu_msrs, 0).counters, | |
167 | memcpy(per_cpu(cpu_msrs, cpu).controls, | 368 | sizeof(struct op_msr) * model->num_counters); |
168 | per_cpu(cpu_msrs, 0).controls, | 369 | |
169 | sizeof(struct op_msr) * model->num_controls); | 370 | memcpy(per_cpu(cpu_msrs, cpu).controls, |
170 | } | 371 | per_cpu(cpu_msrs, 0).controls, |
372 | sizeof(struct op_msr) * model->num_controls); | ||
171 | 373 | ||
374 | mux_clone(cpu); | ||
172 | } | 375 | } |
173 | on_each_cpu(nmi_save_registers, NULL, 1); | ||
174 | on_each_cpu(nmi_cpu_setup, NULL, 1); | 376 | on_each_cpu(nmi_cpu_setup, NULL, 1); |
175 | nmi_enabled = 1; | 377 | nmi_enabled = 1; |
176 | return 0; | 378 | return 0; |
177 | } | 379 | } |
178 | 380 | ||
179 | static void nmi_restore_registers(struct op_msrs *msrs) | 381 | static void nmi_cpu_restore_registers(struct op_msrs *msrs) |
180 | { | 382 | { |
181 | unsigned int const nr_ctrs = model->num_counters; | ||
182 | unsigned int const nr_ctrls = model->num_controls; | ||
183 | struct op_msr *counters = msrs->counters; | 383 | struct op_msr *counters = msrs->counters; |
184 | struct op_msr *controls = msrs->controls; | 384 | struct op_msr *controls = msrs->controls; |
185 | unsigned int i; | 385 | unsigned int i; |
186 | 386 | ||
187 | for (i = 0; i < nr_ctrls; ++i) { | 387 | for (i = 0; i < model->num_controls; ++i) { |
188 | if (controls[i].addr) { | 388 | if (controls[i].addr) |
189 | wrmsr(controls[i].addr, | 389 | wrmsrl(controls[i].addr, controls[i].saved); |
190 | controls[i].saved.low, | ||
191 | controls[i].saved.high); | ||
192 | } | ||
193 | } | 390 | } |
194 | 391 | ||
195 | for (i = 0; i < nr_ctrs; ++i) { | 392 | for (i = 0; i < model->num_counters; ++i) { |
196 | if (counters[i].addr) { | 393 | if (counters[i].addr) |
197 | wrmsr(counters[i].addr, | 394 | wrmsrl(counters[i].addr, counters[i].saved); |
198 | counters[i].saved.low, | ||
199 | counters[i].saved.high); | ||
200 | } | ||
201 | } | 395 | } |
202 | } | 396 | } |
203 | 397 | ||
@@ -205,7 +399,7 @@ static void nmi_cpu_shutdown(void *dummy) | |||
205 | { | 399 | { |
206 | unsigned int v; | 400 | unsigned int v; |
207 | int cpu = smp_processor_id(); | 401 | int cpu = smp_processor_id(); |
208 | struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); | 402 | struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); |
209 | 403 | ||
210 | /* restoring APIC_LVTPC can trigger an apic error because the delivery | 404 | /* restoring APIC_LVTPC can trigger an apic error because the delivery |
211 | * mode and vector nr combination can be illegal. That's by design: on | 405 | * mode and vector nr combination can be illegal. That's by design: on |
@@ -216,7 +410,7 @@ static void nmi_cpu_shutdown(void *dummy) | |||
216 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); | 410 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); |
217 | apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); | 411 | apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); |
218 | apic_write(APIC_LVTERR, v); | 412 | apic_write(APIC_LVTERR, v); |
219 | nmi_restore_registers(msrs); | 413 | nmi_cpu_restore_registers(msrs); |
220 | } | 414 | } |
221 | 415 | ||
222 | static void nmi_shutdown(void) | 416 | static void nmi_shutdown(void) |
@@ -226,42 +420,18 @@ static void nmi_shutdown(void) | |||
226 | nmi_enabled = 0; | 420 | nmi_enabled = 0; |
227 | on_each_cpu(nmi_cpu_shutdown, NULL, 1); | 421 | on_each_cpu(nmi_cpu_shutdown, NULL, 1); |
228 | unregister_die_notifier(&profile_exceptions_nb); | 422 | unregister_die_notifier(&profile_exceptions_nb); |
423 | nmi_shutdown_mux(); | ||
229 | msrs = &get_cpu_var(cpu_msrs); | 424 | msrs = &get_cpu_var(cpu_msrs); |
230 | model->shutdown(msrs); | 425 | model->shutdown(msrs); |
231 | free_msrs(); | 426 | free_msrs(); |
232 | put_cpu_var(cpu_msrs); | 427 | put_cpu_var(cpu_msrs); |
233 | } | 428 | } |
234 | 429 | ||
235 | static void nmi_cpu_start(void *dummy) | ||
236 | { | ||
237 | struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); | ||
238 | model->start(msrs); | ||
239 | } | ||
240 | |||
241 | static int nmi_start(void) | ||
242 | { | ||
243 | on_each_cpu(nmi_cpu_start, NULL, 1); | ||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | static void nmi_cpu_stop(void *dummy) | ||
248 | { | ||
249 | struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); | ||
250 | model->stop(msrs); | ||
251 | } | ||
252 | |||
253 | static void nmi_stop(void) | ||
254 | { | ||
255 | on_each_cpu(nmi_cpu_stop, NULL, 1); | ||
256 | } | ||
257 | |||
258 | struct op_counter_config counter_config[OP_MAX_COUNTER]; | ||
259 | |||
260 | static int nmi_create_files(struct super_block *sb, struct dentry *root) | 430 | static int nmi_create_files(struct super_block *sb, struct dentry *root) |
261 | { | 431 | { |
262 | unsigned int i; | 432 | unsigned int i; |
263 | 433 | ||
264 | for (i = 0; i < model->num_counters; ++i) { | 434 | for (i = 0; i < model->num_virt_counters; ++i) { |
265 | struct dentry *dir; | 435 | struct dentry *dir; |
266 | char buf[4]; | 436 | char buf[4]; |
267 | 437 | ||
@@ -270,7 +440,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) | |||
270 | * NOTE: assumes 1:1 mapping here (that counters are organized | 440 | * NOTE: assumes 1:1 mapping here (that counters are organized |
271 | * sequentially in their struct assignment). | 441 | * sequentially in their struct assignment). |
272 | */ | 442 | */ |
273 | if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) | 443 | if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) |
274 | continue; | 444 | continue; |
275 | 445 | ||
276 | snprintf(buf, sizeof(buf), "%d", i); | 446 | snprintf(buf, sizeof(buf), "%d", i); |
@@ -402,6 +572,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); | |||
402 | static int __init ppro_init(char **cpu_type) | 572 | static int __init ppro_init(char **cpu_type) |
403 | { | 573 | { |
404 | __u8 cpu_model = boot_cpu_data.x86_model; | 574 | __u8 cpu_model = boot_cpu_data.x86_model; |
575 | struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ | ||
405 | 576 | ||
406 | if (force_arch_perfmon && cpu_has_arch_perfmon) | 577 | if (force_arch_perfmon && cpu_has_arch_perfmon) |
407 | return 0; | 578 | return 0; |
@@ -428,7 +599,7 @@ static int __init ppro_init(char **cpu_type) | |||
428 | *cpu_type = "i386/core_2"; | 599 | *cpu_type = "i386/core_2"; |
429 | break; | 600 | break; |
430 | case 26: | 601 | case 26: |
431 | arch_perfmon_setup_counters(); | 602 | spec = &op_arch_perfmon_spec; |
432 | *cpu_type = "i386/core_i7"; | 603 | *cpu_type = "i386/core_i7"; |
433 | break; | 604 | break; |
434 | case 28: | 605 | case 28: |
@@ -439,17 +610,7 @@ static int __init ppro_init(char **cpu_type) | |||
439 | return 0; | 610 | return 0; |
440 | } | 611 | } |
441 | 612 | ||
442 | model = &op_ppro_spec; | 613 | model = spec; |
443 | return 1; | ||
444 | } | ||
445 | |||
446 | static int __init arch_perfmon_init(char **cpu_type) | ||
447 | { | ||
448 | if (!cpu_has_arch_perfmon) | ||
449 | return 0; | ||
450 | *cpu_type = "i386/arch_perfmon"; | ||
451 | model = &op_arch_perfmon_spec; | ||
452 | arch_perfmon_setup_counters(); | ||
453 | return 1; | 614 | return 1; |
454 | } | 615 | } |
455 | 616 | ||
@@ -471,27 +632,26 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
471 | /* Needs to be at least an Athlon (or hammer in 32bit mode) */ | 632 | /* Needs to be at least an Athlon (or hammer in 32bit mode) */ |
472 | 633 | ||
473 | switch (family) { | 634 | switch (family) { |
474 | default: | ||
475 | return -ENODEV; | ||
476 | case 6: | 635 | case 6: |
477 | model = &op_amd_spec; | ||
478 | cpu_type = "i386/athlon"; | 636 | cpu_type = "i386/athlon"; |
479 | break; | 637 | break; |
480 | case 0xf: | 638 | case 0xf: |
481 | model = &op_amd_spec; | 639 | /* |
482 | /* Actually it could be i386/hammer too, but give | 640 | * Actually it could be i386/hammer too, but |
483 | user space an consistent name. */ | 641 | * give user space a consistent name. |
642 | */ | ||
484 | cpu_type = "x86-64/hammer"; | 643 | cpu_type = "x86-64/hammer"; |
485 | break; | 644 | break; |
486 | case 0x10: | 645 | case 0x10: |
487 | model = &op_amd_spec; | ||
488 | cpu_type = "x86-64/family10"; | 646 | cpu_type = "x86-64/family10"; |
489 | break; | 647 | break; |
490 | case 0x11: | 648 | case 0x11: |
491 | model = &op_amd_spec; | ||
492 | cpu_type = "x86-64/family11h"; | 649 | cpu_type = "x86-64/family11h"; |
493 | break; | 650 | break; |
651 | default: | ||
652 | return -ENODEV; | ||
494 | } | 653 | } |
654 | model = &op_amd_spec; | ||
495 | break; | 655 | break; |
496 | 656 | ||
497 | case X86_VENDOR_INTEL: | 657 | case X86_VENDOR_INTEL: |
@@ -510,8 +670,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
510 | break; | 670 | break; |
511 | } | 671 | } |
512 | 672 | ||
513 | if (!cpu_type && !arch_perfmon_init(&cpu_type)) | 673 | if (cpu_type) |
674 | break; | ||
675 | |||
676 | if (!cpu_has_arch_perfmon) | ||
514 | return -ENODEV; | 677 | return -ENODEV; |
678 | |||
679 | /* use arch perfmon as fallback */ | ||
680 | cpu_type = "i386/arch_perfmon"; | ||
681 | model = &op_arch_perfmon_spec; | ||
515 | break; | 682 | break; |
516 | 683 | ||
517 | default: | 684 | default: |
@@ -522,18 +689,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
522 | register_cpu_notifier(&oprofile_cpu_nb); | 689 | register_cpu_notifier(&oprofile_cpu_nb); |
523 | #endif | 690 | #endif |
524 | /* default values, can be overwritten by model */ | 691 | /* default values, can be overwritten by model */ |
525 | ops->create_files = nmi_create_files; | 692 | ops->create_files = nmi_create_files; |
526 | ops->setup = nmi_setup; | 693 | ops->setup = nmi_setup; |
527 | ops->shutdown = nmi_shutdown; | 694 | ops->shutdown = nmi_shutdown; |
528 | ops->start = nmi_start; | 695 | ops->start = nmi_start; |
529 | ops->stop = nmi_stop; | 696 | ops->stop = nmi_stop; |
530 | ops->cpu_type = cpu_type; | 697 | ops->cpu_type = cpu_type; |
531 | 698 | ||
532 | if (model->init) | 699 | if (model->init) |
533 | ret = model->init(ops); | 700 | ret = model->init(ops); |
534 | if (ret) | 701 | if (ret) |
535 | return ret; | 702 | return ret; |
536 | 703 | ||
704 | if (!model->num_virt_counters) | ||
705 | model->num_virt_counters = model->num_counters; | ||
706 | |||
707 | mux_init(ops); | ||
708 | |||
537 | init_sysfs(); | 709 | init_sysfs(); |
538 | using_nmi = 1; | 710 | using_nmi = 1; |
539 | printk(KERN_INFO "oprofile: using NMI interrupt.\n"); | 711 | printk(KERN_INFO "oprofile: using NMI interrupt.\n"); |
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 91b6a116165..e28398df0df 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h | |||
@@ -10,7 +10,7 @@ | |||
10 | #ifndef OP_COUNTER_H | 10 | #ifndef OP_COUNTER_H |
11 | #define OP_COUNTER_H | 11 | #define OP_COUNTER_H |
12 | 12 | ||
13 | #define OP_MAX_COUNTER 8 | 13 | #define OP_MAX_COUNTER 32 |
14 | 14 | ||
15 | /* Per-perfctr configuration as set via | 15 | /* Per-perfctr configuration as set via |
16 | * oprofilefs. | 16 | * oprofilefs. |
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8fdf06e4edf..39686c29f03 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c | |||
@@ -9,12 +9,15 @@ | |||
9 | * @author Philippe Elie | 9 | * @author Philippe Elie |
10 | * @author Graydon Hoare | 10 | * @author Graydon Hoare |
11 | * @author Robert Richter <robert.richter@amd.com> | 11 | * @author Robert Richter <robert.richter@amd.com> |
12 | * @author Barry Kasindorf | 12 | * @author Barry Kasindorf <barry.kasindorf@amd.com> |
13 | * @author Jason Yeh <jason.yeh@amd.com> | ||
14 | * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> | ||
13 | */ | 15 | */ |
14 | 16 | ||
15 | #include <linux/oprofile.h> | 17 | #include <linux/oprofile.h> |
16 | #include <linux/device.h> | 18 | #include <linux/device.h> |
17 | #include <linux/pci.h> | 19 | #include <linux/pci.h> |
20 | #include <linux/percpu.h> | ||
18 | 21 | ||
19 | #include <asm/ptrace.h> | 22 | #include <asm/ptrace.h> |
20 | #include <asm/msr.h> | 23 | #include <asm/msr.h> |
@@ -25,43 +28,36 @@ | |||
25 | 28 | ||
26 | #define NUM_COUNTERS 4 | 29 | #define NUM_COUNTERS 4 |
27 | #define NUM_CONTROLS 4 | 30 | #define NUM_CONTROLS 4 |
31 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | ||
32 | #define NUM_VIRT_COUNTERS 32 | ||
33 | #define NUM_VIRT_CONTROLS 32 | ||
34 | #else | ||
35 | #define NUM_VIRT_COUNTERS NUM_COUNTERS | ||
36 | #define NUM_VIRT_CONTROLS NUM_CONTROLS | ||
37 | #endif | ||
38 | |||
39 | #define OP_EVENT_MASK 0x0FFF | ||
40 | #define OP_CTR_OVERFLOW (1ULL<<31) | ||
28 | 41 | ||
29 | #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) | 42 | #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) |
30 | #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) | 43 | |
31 | #define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) | 44 | static unsigned long reset_value[NUM_VIRT_COUNTERS]; |
32 | #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) | ||
33 | |||
34 | #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) | ||
35 | #define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) | ||
36 | #define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) | ||
37 | #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) | ||
38 | #define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) | ||
39 | #define CTRL_CLEAR_LO(x) (x &= (1<<21)) | ||
40 | #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) | ||
41 | #define CTRL_SET_ENABLE(val) (val |= 1<<20) | ||
42 | #define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) | ||
43 | #define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) | ||
44 | #define CTRL_SET_UM(val, m) (val |= (m << 8)) | ||
45 | #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) | ||
46 | #define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) | ||
47 | #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) | ||
48 | #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) | ||
49 | |||
50 | static unsigned long reset_value[NUM_COUNTERS]; | ||
51 | 45 | ||
52 | #ifdef CONFIG_OPROFILE_IBS | 46 | #ifdef CONFIG_OPROFILE_IBS |
53 | 47 | ||
54 | /* IbsFetchCtl bits/masks */ | 48 | /* IbsFetchCtl bits/masks */ |
55 | #define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ | 49 | #define IBS_FETCH_RAND_EN (1ULL<<57) |
56 | #define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ | 50 | #define IBS_FETCH_VAL (1ULL<<49) |
57 | #define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ | 51 | #define IBS_FETCH_ENABLE (1ULL<<48) |
52 | #define IBS_FETCH_CNT_MASK 0xFFFF0000ULL | ||
58 | 53 | ||
59 | /*IbsOpCtl bits */ | 54 | /*IbsOpCtl bits */ |
60 | #define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ | 55 | #define IBS_OP_CNT_CTL (1ULL<<19) |
61 | #define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ | 56 | #define IBS_OP_VAL (1ULL<<18) |
57 | #define IBS_OP_ENABLE (1ULL<<17) | ||
62 | 58 | ||
63 | #define IBS_FETCH_SIZE 6 | 59 | #define IBS_FETCH_SIZE 6 |
64 | #define IBS_OP_SIZE 12 | 60 | #define IBS_OP_SIZE 12 |
65 | 61 | ||
66 | static int has_ibs; /* AMD Family10h and later */ | 62 | static int has_ibs; /* AMD Family10h and later */ |
67 | 63 | ||
@@ -78,6 +74,45 @@ static struct op_ibs_config ibs_config; | |||
78 | 74 | ||
79 | #endif | 75 | #endif |
80 | 76 | ||
77 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | ||
78 | |||
79 | static void op_mux_fill_in_addresses(struct op_msrs * const msrs) | ||
80 | { | ||
81 | int i; | ||
82 | |||
83 | for (i = 0; i < NUM_VIRT_COUNTERS; i++) { | ||
84 | int hw_counter = op_x86_virt_to_phys(i); | ||
85 | if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) | ||
86 | msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; | ||
87 | else | ||
88 | msrs->multiplex[i].addr = 0; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, | ||
93 | struct op_msrs const * const msrs) | ||
94 | { | ||
95 | u64 val; | ||
96 | int i; | ||
97 | |||
98 | /* enable active counters */ | ||
99 | for (i = 0; i < NUM_COUNTERS; ++i) { | ||
100 | int virt = op_x86_phys_to_virt(i); | ||
101 | if (!counter_config[virt].enabled) | ||
102 | continue; | ||
103 | rdmsrl(msrs->controls[i].addr, val); | ||
104 | val &= model->reserved; | ||
105 | val |= op_x86_get_ctrl(model, &counter_config[virt]); | ||
106 | wrmsrl(msrs->controls[i].addr, val); | ||
107 | } | ||
108 | } | ||
109 | |||
110 | #else | ||
111 | |||
112 | static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } | ||
113 | |||
114 | #endif | ||
115 | |||
81 | /* functions for op_amd_spec */ | 116 | /* functions for op_amd_spec */ |
82 | 117 | ||
83 | static void op_amd_fill_in_addresses(struct op_msrs * const msrs) | 118 | static void op_amd_fill_in_addresses(struct op_msrs * const msrs) |
@@ -97,150 +132,174 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) | |||
97 | else | 132 | else |
98 | msrs->controls[i].addr = 0; | 133 | msrs->controls[i].addr = 0; |
99 | } | 134 | } |
100 | } | ||
101 | 135 | ||
136 | op_mux_fill_in_addresses(msrs); | ||
137 | } | ||
102 | 138 | ||
103 | static void op_amd_setup_ctrs(struct op_msrs const * const msrs) | 139 | static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, |
140 | struct op_msrs const * const msrs) | ||
104 | { | 141 | { |
105 | unsigned int low, high; | 142 | u64 val; |
106 | int i; | 143 | int i; |
107 | 144 | ||
145 | /* setup reset_value */ | ||
146 | for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { | ||
147 | if (counter_config[i].enabled) | ||
148 | reset_value[i] = counter_config[i].count; | ||
149 | else | ||
150 | reset_value[i] = 0; | ||
151 | } | ||
152 | |||
108 | /* clear all counters */ | 153 | /* clear all counters */ |
109 | for (i = 0 ; i < NUM_CONTROLS; ++i) { | 154 | for (i = 0; i < NUM_CONTROLS; ++i) { |
110 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) | 155 | if (unlikely(!msrs->controls[i].addr)) |
111 | continue; | 156 | continue; |
112 | CTRL_READ(low, high, msrs, i); | 157 | rdmsrl(msrs->controls[i].addr, val); |
113 | CTRL_CLEAR_LO(low); | 158 | val &= model->reserved; |
114 | CTRL_CLEAR_HI(high); | 159 | wrmsrl(msrs->controls[i].addr, val); |
115 | CTRL_WRITE(low, high, msrs, i); | ||
116 | } | 160 | } |
117 | 161 | ||
118 | /* avoid a false detection of ctr overflows in NMI handler */ | 162 | /* avoid a false detection of ctr overflows in NMI handler */ |
119 | for (i = 0; i < NUM_COUNTERS; ++i) { | 163 | for (i = 0; i < NUM_COUNTERS; ++i) { |
120 | if (unlikely(!CTR_IS_RESERVED(msrs, i))) | 164 | if (unlikely(!msrs->counters[i].addr)) |
121 | continue; | 165 | continue; |
122 | CTR_WRITE(1, msrs, i); | 166 | wrmsrl(msrs->counters[i].addr, -1LL); |
123 | } | 167 | } |
124 | 168 | ||
125 | /* enable active counters */ | 169 | /* enable active counters */ |
126 | for (i = 0; i < NUM_COUNTERS; ++i) { | 170 | for (i = 0; i < NUM_COUNTERS; ++i) { |
127 | if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { | 171 | int virt = op_x86_phys_to_virt(i); |
128 | reset_value[i] = counter_config[i].count; | 172 | if (!counter_config[virt].enabled) |
173 | continue; | ||
174 | if (!msrs->counters[i].addr) | ||
175 | continue; | ||
129 | 176 | ||
130 | CTR_WRITE(counter_config[i].count, msrs, i); | 177 | /* setup counter registers */ |
131 | 178 | wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); | |
132 | CTRL_READ(low, high, msrs, i); | 179 | |
133 | CTRL_CLEAR_LO(low); | 180 | /* setup control registers */ |
134 | CTRL_CLEAR_HI(high); | 181 | rdmsrl(msrs->controls[i].addr, val); |
135 | CTRL_SET_ENABLE(low); | 182 | val &= model->reserved; |
136 | CTRL_SET_USR(low, counter_config[i].user); | 183 | val |= op_x86_get_ctrl(model, &counter_config[virt]); |
137 | CTRL_SET_KERN(low, counter_config[i].kernel); | 184 | wrmsrl(msrs->controls[i].addr, val); |
138 | CTRL_SET_UM(low, counter_config[i].unit_mask); | ||
139 | CTRL_SET_EVENT_LOW(low, counter_config[i].event); | ||
140 | CTRL_SET_EVENT_HIGH(high, counter_config[i].event); | ||
141 | CTRL_SET_HOST_ONLY(high, 0); | ||
142 | CTRL_SET_GUEST_ONLY(high, 0); | ||
143 | |||
144 | CTRL_WRITE(low, high, msrs, i); | ||
145 | } else { | ||
146 | reset_value[i] = 0; | ||
147 | } | ||
148 | } | 185 | } |
149 | } | 186 | } |
150 | 187 | ||
151 | #ifdef CONFIG_OPROFILE_IBS | 188 | #ifdef CONFIG_OPROFILE_IBS |
152 | 189 | ||
153 | static inline int | 190 | static inline void |
154 | op_amd_handle_ibs(struct pt_regs * const regs, | 191 | op_amd_handle_ibs(struct pt_regs * const regs, |
155 | struct op_msrs const * const msrs) | 192 | struct op_msrs const * const msrs) |
156 | { | 193 | { |
157 | u32 low, high; | 194 | u64 val, ctl; |
158 | u64 msr; | ||
159 | struct op_entry entry; | 195 | struct op_entry entry; |
160 | 196 | ||
161 | if (!has_ibs) | 197 | if (!has_ibs) |
162 | return 1; | 198 | return; |
163 | 199 | ||
164 | if (ibs_config.fetch_enabled) { | 200 | if (ibs_config.fetch_enabled) { |
165 | rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); | 201 | rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); |
166 | if (high & IBS_FETCH_HIGH_VALID_BIT) { | 202 | if (ctl & IBS_FETCH_VAL) { |
167 | rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); | 203 | rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); |
168 | oprofile_write_reserve(&entry, regs, msr, | 204 | oprofile_write_reserve(&entry, regs, val, |
169 | IBS_FETCH_CODE, IBS_FETCH_SIZE); | 205 | IBS_FETCH_CODE, IBS_FETCH_SIZE); |
170 | oprofile_add_data(&entry, (u32)msr); | 206 | oprofile_add_data64(&entry, val); |
171 | oprofile_add_data(&entry, (u32)(msr >> 32)); | 207 | oprofile_add_data64(&entry, ctl); |
172 | oprofile_add_data(&entry, low); | 208 | rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); |
173 | oprofile_add_data(&entry, high); | 209 | oprofile_add_data64(&entry, val); |
174 | rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); | ||
175 | oprofile_add_data(&entry, (u32)msr); | ||
176 | oprofile_add_data(&entry, (u32)(msr >> 32)); | ||
177 | oprofile_write_commit(&entry); | 210 | oprofile_write_commit(&entry); |
178 | 211 | ||
179 | /* reenable the IRQ */ | 212 | /* reenable the IRQ */ |
180 | high &= ~IBS_FETCH_HIGH_VALID_BIT; | 213 | ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); |
181 | high |= IBS_FETCH_HIGH_ENABLE; | 214 | ctl |= IBS_FETCH_ENABLE; |
182 | low &= IBS_FETCH_LOW_MAX_CNT_MASK; | 215 | wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); |
183 | wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
184 | } | 216 | } |
185 | } | 217 | } |
186 | 218 | ||
187 | if (ibs_config.op_enabled) { | 219 | if (ibs_config.op_enabled) { |
188 | rdmsr(MSR_AMD64_IBSOPCTL, low, high); | 220 | rdmsrl(MSR_AMD64_IBSOPCTL, ctl); |
189 | if (low & IBS_OP_LOW_VALID_BIT) { | 221 | if (ctl & IBS_OP_VAL) { |
190 | rdmsrl(MSR_AMD64_IBSOPRIP, msr); | 222 | rdmsrl(MSR_AMD64_IBSOPRIP, val); |
191 | oprofile_write_reserve(&entry, regs, msr, | 223 | oprofile_write_reserve(&entry, regs, val, |
192 | IBS_OP_CODE, IBS_OP_SIZE); | 224 | IBS_OP_CODE, IBS_OP_SIZE); |
193 | oprofile_add_data(&entry, (u32)msr); | 225 | oprofile_add_data64(&entry, val); |
194 | oprofile_add_data(&entry, (u32)(msr >> 32)); | 226 | rdmsrl(MSR_AMD64_IBSOPDATA, val); |
195 | rdmsrl(MSR_AMD64_IBSOPDATA, msr); | 227 | oprofile_add_data64(&entry, val); |
196 | oprofile_add_data(&entry, (u32)msr); | 228 | rdmsrl(MSR_AMD64_IBSOPDATA2, val); |
197 | oprofile_add_data(&entry, (u32)(msr >> 32)); | 229 | oprofile_add_data64(&entry, val); |
198 | rdmsrl(MSR_AMD64_IBSOPDATA2, msr); | 230 | rdmsrl(MSR_AMD64_IBSOPDATA3, val); |
199 | oprofile_add_data(&entry, (u32)msr); | 231 | oprofile_add_data64(&entry, val); |
200 | oprofile_add_data(&entry, (u32)(msr >> 32)); | 232 | rdmsrl(MSR_AMD64_IBSDCLINAD, val); |
201 | rdmsrl(MSR_AMD64_IBSOPDATA3, msr); | 233 | oprofile_add_data64(&entry, val); |
202 | oprofile_add_data(&entry, (u32)msr); | 234 | rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); |
203 | oprofile_add_data(&entry, (u32)(msr >> 32)); | 235 | oprofile_add_data64(&entry, val); |
204 | rdmsrl(MSR_AMD64_IBSDCLINAD, msr); | ||
205 | oprofile_add_data(&entry, (u32)msr); | ||
206 | oprofile_add_data(&entry, (u32)(msr >> 32)); | ||
207 | rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); | ||
208 | oprofile_add_data(&entry, (u32)msr); | ||
209 | oprofile_add_data(&entry, (u32)(msr >> 32)); | ||
210 | oprofile_write_commit(&entry); | 236 | oprofile_write_commit(&entry); |
211 | 237 | ||
212 | /* reenable the IRQ */ | 238 | /* reenable the IRQ */ |
213 | high = 0; | 239 | ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; |
214 | low &= ~IBS_OP_LOW_VALID_BIT; | 240 | ctl |= IBS_OP_ENABLE; |
215 | low |= IBS_OP_LOW_ENABLE; | 241 | wrmsrl(MSR_AMD64_IBSOPCTL, ctl); |
216 | wrmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
217 | } | 242 | } |
218 | } | 243 | } |
244 | } | ||
219 | 245 | ||
220 | return 1; | 246 | static inline void op_amd_start_ibs(void) |
247 | { | ||
248 | u64 val; | ||
249 | if (has_ibs && ibs_config.fetch_enabled) { | ||
250 | val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; | ||
251 | val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; | ||
252 | val |= IBS_FETCH_ENABLE; | ||
253 | wrmsrl(MSR_AMD64_IBSFETCHCTL, val); | ||
254 | } | ||
255 | |||
256 | if (has_ibs && ibs_config.op_enabled) { | ||
257 | val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; | ||
258 | val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; | ||
259 | val |= IBS_OP_ENABLE; | ||
260 | wrmsrl(MSR_AMD64_IBSOPCTL, val); | ||
261 | } | ||
262 | } | ||
263 | |||
264 | static void op_amd_stop_ibs(void) | ||
265 | { | ||
266 | if (has_ibs && ibs_config.fetch_enabled) | ||
267 | /* clear max count and enable */ | ||
268 | wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); | ||
269 | |||
270 | if (has_ibs && ibs_config.op_enabled) | ||
271 | /* clear max count and enable */ | ||
272 | wrmsrl(MSR_AMD64_IBSOPCTL, 0); | ||
221 | } | 273 | } |
222 | 274 | ||
275 | #else | ||
276 | |||
277 | static inline void op_amd_handle_ibs(struct pt_regs * const regs, | ||
278 | struct op_msrs const * const msrs) { } | ||
279 | static inline void op_amd_start_ibs(void) { } | ||
280 | static inline void op_amd_stop_ibs(void) { } | ||
281 | |||
223 | #endif | 282 | #endif |
224 | 283 | ||
225 | static int op_amd_check_ctrs(struct pt_regs * const regs, | 284 | static int op_amd_check_ctrs(struct pt_regs * const regs, |
226 | struct op_msrs const * const msrs) | 285 | struct op_msrs const * const msrs) |
227 | { | 286 | { |
228 | unsigned int low, high; | 287 | u64 val; |
229 | int i; | 288 | int i; |
230 | 289 | ||
231 | for (i = 0 ; i < NUM_COUNTERS; ++i) { | 290 | for (i = 0; i < NUM_COUNTERS; ++i) { |
232 | if (!reset_value[i]) | 291 | int virt = op_x86_phys_to_virt(i); |
292 | if (!reset_value[virt]) | ||
233 | continue; | 293 | continue; |
234 | CTR_READ(low, high, msrs, i); | 294 | rdmsrl(msrs->counters[i].addr, val); |
235 | if (CTR_OVERFLOWED(low)) { | 295 | /* bit is clear if overflowed: */ |
236 | oprofile_add_sample(regs, i); | 296 | if (val & OP_CTR_OVERFLOW) |
237 | CTR_WRITE(reset_value[i], msrs, i); | 297 | continue; |
238 | } | 298 | oprofile_add_sample(regs, virt); |
299 | wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); | ||
239 | } | 300 | } |
240 | 301 | ||
241 | #ifdef CONFIG_OPROFILE_IBS | ||
242 | op_amd_handle_ibs(regs, msrs); | 302 | op_amd_handle_ibs(regs, msrs); |
243 | #endif | ||
244 | 303 | ||
245 | /* See op_model_ppro.c */ | 304 | /* See op_model_ppro.c */ |
246 | return 1; | 305 | return 1; |
@@ -248,79 +307,50 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, | |||
248 | 307 | ||
249 | static void op_amd_start(struct op_msrs const * const msrs) | 308 | static void op_amd_start(struct op_msrs const * const msrs) |
250 | { | 309 | { |
251 | unsigned int low, high; | 310 | u64 val; |
252 | int i; | 311 | int i; |
253 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
254 | if (reset_value[i]) { | ||
255 | CTRL_READ(low, high, msrs, i); | ||
256 | CTRL_SET_ACTIVE(low); | ||
257 | CTRL_WRITE(low, high, msrs, i); | ||
258 | } | ||
259 | } | ||
260 | 312 | ||
261 | #ifdef CONFIG_OPROFILE_IBS | 313 | for (i = 0; i < NUM_COUNTERS; ++i) { |
262 | if (has_ibs && ibs_config.fetch_enabled) { | 314 | if (!reset_value[op_x86_phys_to_virt(i)]) |
263 | low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; | 315 | continue; |
264 | high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ | 316 | rdmsrl(msrs->controls[i].addr, val); |
265 | + IBS_FETCH_HIGH_ENABLE; | 317 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; |
266 | wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); | 318 | wrmsrl(msrs->controls[i].addr, val); |
267 | } | 319 | } |
268 | 320 | ||
269 | if (has_ibs && ibs_config.op_enabled) { | 321 | op_amd_start_ibs(); |
270 | low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) | ||
271 | + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ | ||
272 | + IBS_OP_LOW_ENABLE; | ||
273 | high = 0; | ||
274 | wrmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
275 | } | ||
276 | #endif | ||
277 | } | 322 | } |
278 | 323 | ||
279 | |||
280 | static void op_amd_stop(struct op_msrs const * const msrs) | 324 | static void op_amd_stop(struct op_msrs const * const msrs) |
281 | { | 325 | { |
282 | unsigned int low, high; | 326 | u64 val; |
283 | int i; | 327 | int i; |
284 | 328 | ||
285 | /* | 329 | /* |
286 | * Subtle: stop on all counters to avoid race with setting our | 330 | * Subtle: stop on all counters to avoid race with setting our |
287 | * pm callback | 331 | * pm callback |
288 | */ | 332 | */ |
289 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | 333 | for (i = 0; i < NUM_COUNTERS; ++i) { |
290 | if (!reset_value[i]) | 334 | if (!reset_value[op_x86_phys_to_virt(i)]) |
291 | continue; | 335 | continue; |
292 | CTRL_READ(low, high, msrs, i); | 336 | rdmsrl(msrs->controls[i].addr, val); |
293 | CTRL_SET_INACTIVE(low); | 337 | val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; |
294 | CTRL_WRITE(low, high, msrs, i); | 338 | wrmsrl(msrs->controls[i].addr, val); |
295 | } | ||
296 | |||
297 | #ifdef CONFIG_OPROFILE_IBS | ||
298 | if (has_ibs && ibs_config.fetch_enabled) { | ||
299 | /* clear max count and enable */ | ||
300 | low = 0; | ||
301 | high = 0; | ||
302 | wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
303 | } | 339 | } |
304 | 340 | ||
305 | if (has_ibs && ibs_config.op_enabled) { | 341 | op_amd_stop_ibs(); |
306 | /* clear max count and enable */ | ||
307 | low = 0; | ||
308 | high = 0; | ||
309 | wrmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
310 | } | ||
311 | #endif | ||
312 | } | 342 | } |
313 | 343 | ||
314 | static void op_amd_shutdown(struct op_msrs const * const msrs) | 344 | static void op_amd_shutdown(struct op_msrs const * const msrs) |
315 | { | 345 | { |
316 | int i; | 346 | int i; |
317 | 347 | ||
318 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | 348 | for (i = 0; i < NUM_COUNTERS; ++i) { |
319 | if (CTR_IS_RESERVED(msrs, i)) | 349 | if (msrs->counters[i].addr) |
320 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | 350 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); |
321 | } | 351 | } |
322 | for (i = 0 ; i < NUM_CONTROLS ; ++i) { | 352 | for (i = 0; i < NUM_CONTROLS; ++i) { |
323 | if (CTRL_IS_RESERVED(msrs, i)) | 353 | if (msrs->controls[i].addr) |
324 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | 354 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); |
325 | } | 355 | } |
326 | } | 356 | } |
@@ -490,15 +520,21 @@ static void op_amd_exit(void) {} | |||
490 | 520 | ||
491 | #endif /* CONFIG_OPROFILE_IBS */ | 521 | #endif /* CONFIG_OPROFILE_IBS */ |
492 | 522 | ||
493 | struct op_x86_model_spec const op_amd_spec = { | 523 | struct op_x86_model_spec op_amd_spec = { |
494 | .init = op_amd_init, | ||
495 | .exit = op_amd_exit, | ||
496 | .num_counters = NUM_COUNTERS, | 524 | .num_counters = NUM_COUNTERS, |
497 | .num_controls = NUM_CONTROLS, | 525 | .num_controls = NUM_CONTROLS, |
526 | .num_virt_counters = NUM_VIRT_COUNTERS, | ||
527 | .reserved = MSR_AMD_EVENTSEL_RESERVED, | ||
528 | .event_mask = OP_EVENT_MASK, | ||
529 | .init = op_amd_init, | ||
530 | .exit = op_amd_exit, | ||
498 | .fill_in_addresses = &op_amd_fill_in_addresses, | 531 | .fill_in_addresses = &op_amd_fill_in_addresses, |
499 | .setup_ctrs = &op_amd_setup_ctrs, | 532 | .setup_ctrs = &op_amd_setup_ctrs, |
500 | .check_ctrs = &op_amd_check_ctrs, | 533 | .check_ctrs = &op_amd_check_ctrs, |
501 | .start = &op_amd_start, | 534 | .start = &op_amd_start, |
502 | .stop = &op_amd_stop, | 535 | .stop = &op_amd_stop, |
503 | .shutdown = &op_amd_shutdown | 536 | .shutdown = &op_amd_shutdown, |
537 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | ||
538 | .switch_ctrl = &op_mux_switch_ctrl, | ||
539 | #endif | ||
504 | }; | 540 | }; |
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 819b131fd75..ac6b354becd 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c | |||
@@ -32,6 +32,8 @@ | |||
32 | #define NUM_CCCRS_HT2 9 | 32 | #define NUM_CCCRS_HT2 9 |
33 | #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) | 33 | #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) |
34 | 34 | ||
35 | #define OP_CTR_OVERFLOW (1ULL<<31) | ||
36 | |||
35 | static unsigned int num_counters = NUM_COUNTERS_NON_HT; | 37 | static unsigned int num_counters = NUM_COUNTERS_NON_HT; |
36 | static unsigned int num_controls = NUM_CONTROLS_NON_HT; | 38 | static unsigned int num_controls = NUM_CONTROLS_NON_HT; |
37 | 39 | ||
@@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
350 | #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) | 352 | #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) |
351 | #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) | 353 | #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) |
352 | #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) | 354 | #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) |
353 | #define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) | ||
354 | #define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) | ||
355 | 355 | ||
356 | #define CCCR_RESERVED_BITS 0x38030FFF | 356 | #define CCCR_RESERVED_BITS 0x38030FFF |
357 | #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) | 357 | #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) |
@@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
361 | #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) | 361 | #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) |
362 | #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) | 362 | #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) |
363 | #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) | 363 | #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) |
364 | #define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) | ||
365 | #define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) | ||
366 | #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) | 364 | #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) |
367 | #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) | 365 | #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) |
368 | 366 | ||
369 | #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) | ||
370 | #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) | ||
371 | #define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) | ||
372 | #define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) | ||
373 | #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) | ||
374 | |||
375 | 367 | ||
376 | /* this assigns a "stagger" to the current CPU, which is used throughout | 368 | /* this assigns a "stagger" to the current CPU, which is used throughout |
377 | the code in this module as an extra array offset, to select the "even" | 369 | the code in this module as an extra array offset, to select the "even" |
@@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) | |||
515 | if (ev->bindings[i].virt_counter & counter_bit) { | 507 | if (ev->bindings[i].virt_counter & counter_bit) { |
516 | 508 | ||
517 | /* modify ESCR */ | 509 | /* modify ESCR */ |
518 | ESCR_READ(escr, high, ev, i); | 510 | rdmsr(ev->bindings[i].escr_address, escr, high); |
519 | ESCR_CLEAR(escr); | 511 | ESCR_CLEAR(escr); |
520 | if (stag == 0) { | 512 | if (stag == 0) { |
521 | ESCR_SET_USR_0(escr, counter_config[ctr].user); | 513 | ESCR_SET_USR_0(escr, counter_config[ctr].user); |
@@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) | |||
526 | } | 518 | } |
527 | ESCR_SET_EVENT_SELECT(escr, ev->event_select); | 519 | ESCR_SET_EVENT_SELECT(escr, ev->event_select); |
528 | ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); | 520 | ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); |
529 | ESCR_WRITE(escr, high, ev, i); | 521 | wrmsr(ev->bindings[i].escr_address, escr, high); |
530 | 522 | ||
531 | /* modify CCCR */ | 523 | /* modify CCCR */ |
532 | CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); | 524 | rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, |
525 | cccr, high); | ||
533 | CCCR_CLEAR(cccr); | 526 | CCCR_CLEAR(cccr); |
534 | CCCR_SET_REQUIRED_BITS(cccr); | 527 | CCCR_SET_REQUIRED_BITS(cccr); |
535 | CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); | 528 | CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); |
@@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) | |||
537 | CCCR_SET_PMI_OVF_0(cccr); | 530 | CCCR_SET_PMI_OVF_0(cccr); |
538 | else | 531 | else |
539 | CCCR_SET_PMI_OVF_1(cccr); | 532 | CCCR_SET_PMI_OVF_1(cccr); |
540 | CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); | 533 | wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, |
534 | cccr, high); | ||
541 | return; | 535 | return; |
542 | } | 536 | } |
543 | } | 537 | } |
@@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) | |||
548 | } | 542 | } |
549 | 543 | ||
550 | 544 | ||
551 | static void p4_setup_ctrs(struct op_msrs const * const msrs) | 545 | static void p4_setup_ctrs(struct op_x86_model_spec const *model, |
546 | struct op_msrs const * const msrs) | ||
552 | { | 547 | { |
553 | unsigned int i; | 548 | unsigned int i; |
554 | unsigned int low, high; | 549 | unsigned int low, high; |
@@ -563,8 +558,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) | |||
563 | } | 558 | } |
564 | 559 | ||
565 | /* clear the cccrs we will use */ | 560 | /* clear the cccrs we will use */ |
566 | for (i = 0 ; i < num_counters ; i++) { | 561 | for (i = 0; i < num_counters; i++) { |
567 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) | 562 | if (unlikely(!msrs->controls[i].addr)) |
568 | continue; | 563 | continue; |
569 | rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); | 564 | rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); |
570 | CCCR_CLEAR(low); | 565 | CCCR_CLEAR(low); |
@@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) | |||
574 | 569 | ||
575 | /* clear all escrs (including those outside our concern) */ | 570 | /* clear all escrs (including those outside our concern) */ |
576 | for (i = num_counters; i < num_controls; i++) { | 571 | for (i = num_counters; i < num_controls; i++) { |
577 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) | 572 | if (unlikely(!msrs->controls[i].addr)) |
578 | continue; | 573 | continue; |
579 | wrmsr(msrs->controls[i].addr, 0, 0); | 574 | wrmsr(msrs->controls[i].addr, 0, 0); |
580 | } | 575 | } |
581 | 576 | ||
582 | /* setup all counters */ | 577 | /* setup all counters */ |
583 | for (i = 0 ; i < num_counters ; ++i) { | 578 | for (i = 0; i < num_counters; ++i) { |
584 | if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { | 579 | if (counter_config[i].enabled && msrs->controls[i].addr) { |
585 | reset_value[i] = counter_config[i].count; | 580 | reset_value[i] = counter_config[i].count; |
586 | pmc_setup_one_p4_counter(i); | 581 | pmc_setup_one_p4_counter(i); |
587 | CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); | 582 | wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, |
583 | -(u64)counter_config[i].count); | ||
588 | } else { | 584 | } else { |
589 | reset_value[i] = 0; | 585 | reset_value[i] = 0; |
590 | } | 586 | } |
@@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs, | |||
624 | 620 | ||
625 | real = VIRT_CTR(stag, i); | 621 | real = VIRT_CTR(stag, i); |
626 | 622 | ||
627 | CCCR_READ(low, high, real); | 623 | rdmsr(p4_counters[real].cccr_address, low, high); |
628 | CTR_READ(ctr, high, real); | 624 | rdmsr(p4_counters[real].counter_address, ctr, high); |
629 | if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { | 625 | if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { |
630 | oprofile_add_sample(regs, i); | 626 | oprofile_add_sample(regs, i); |
631 | CTR_WRITE(reset_value[i], real); | 627 | wrmsrl(p4_counters[real].counter_address, |
628 | -(u64)reset_value[i]); | ||
632 | CCCR_CLEAR_OVF(low); | 629 | CCCR_CLEAR_OVF(low); |
633 | CCCR_WRITE(low, high, real); | 630 | wrmsr(p4_counters[real].cccr_address, low, high); |
634 | CTR_WRITE(reset_value[i], real); | 631 | wrmsrl(p4_counters[real].counter_address, |
632 | -(u64)reset_value[i]); | ||
635 | } | 633 | } |
636 | } | 634 | } |
637 | 635 | ||
@@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs) | |||
653 | for (i = 0; i < num_counters; ++i) { | 651 | for (i = 0; i < num_counters; ++i) { |
654 | if (!reset_value[i]) | 652 | if (!reset_value[i]) |
655 | continue; | 653 | continue; |
656 | CCCR_READ(low, high, VIRT_CTR(stag, i)); | 654 | rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); |
657 | CCCR_SET_ENABLE(low); | 655 | CCCR_SET_ENABLE(low); |
658 | CCCR_WRITE(low, high, VIRT_CTR(stag, i)); | 656 | wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); |
659 | } | 657 | } |
660 | } | 658 | } |
661 | 659 | ||
@@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs) | |||
670 | for (i = 0; i < num_counters; ++i) { | 668 | for (i = 0; i < num_counters; ++i) { |
671 | if (!reset_value[i]) | 669 | if (!reset_value[i]) |
672 | continue; | 670 | continue; |
673 | CCCR_READ(low, high, VIRT_CTR(stag, i)); | 671 | rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); |
674 | CCCR_SET_DISABLE(low); | 672 | CCCR_SET_DISABLE(low); |
675 | CCCR_WRITE(low, high, VIRT_CTR(stag, i)); | 673 | wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); |
676 | } | 674 | } |
677 | } | 675 | } |
678 | 676 | ||
@@ -680,8 +678,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) | |||
680 | { | 678 | { |
681 | int i; | 679 | int i; |
682 | 680 | ||
683 | for (i = 0 ; i < num_counters ; ++i) { | 681 | for (i = 0; i < num_counters; ++i) { |
684 | if (CTR_IS_RESERVED(msrs, i)) | 682 | if (msrs->counters[i].addr) |
685 | release_perfctr_nmi(msrs->counters[i].addr); | 683 | release_perfctr_nmi(msrs->counters[i].addr); |
686 | } | 684 | } |
687 | /* | 685 | /* |
@@ -689,15 +687,15 @@ static void p4_shutdown(struct op_msrs const * const msrs) | |||
689 | * conjunction with the counter registers (hence the starting offset). | 687 | * conjunction with the counter registers (hence the starting offset). |
690 | * This saves a few bits. | 688 | * This saves a few bits. |
691 | */ | 689 | */ |
692 | for (i = num_counters ; i < num_controls ; ++i) { | 690 | for (i = num_counters; i < num_controls; ++i) { |
693 | if (CTRL_IS_RESERVED(msrs, i)) | 691 | if (msrs->controls[i].addr) |
694 | release_evntsel_nmi(msrs->controls[i].addr); | 692 | release_evntsel_nmi(msrs->controls[i].addr); |
695 | } | 693 | } |
696 | } | 694 | } |
697 | 695 | ||
698 | 696 | ||
699 | #ifdef CONFIG_SMP | 697 | #ifdef CONFIG_SMP |
700 | struct op_x86_model_spec const op_p4_ht2_spec = { | 698 | struct op_x86_model_spec op_p4_ht2_spec = { |
701 | .num_counters = NUM_COUNTERS_HT2, | 699 | .num_counters = NUM_COUNTERS_HT2, |
702 | .num_controls = NUM_CONTROLS_HT2, | 700 | .num_controls = NUM_CONTROLS_HT2, |
703 | .fill_in_addresses = &p4_fill_in_addresses, | 701 | .fill_in_addresses = &p4_fill_in_addresses, |
@@ -709,7 +707,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = { | |||
709 | }; | 707 | }; |
710 | #endif | 708 | #endif |
711 | 709 | ||
712 | struct op_x86_model_spec const op_p4_spec = { | 710 | struct op_x86_model_spec op_p4_spec = { |
713 | .num_counters = NUM_COUNTERS_NON_HT, | 711 | .num_counters = NUM_COUNTERS_NON_HT, |
714 | .num_controls = NUM_CONTROLS_NON_HT, | 712 | .num_controls = NUM_CONTROLS_NON_HT, |
715 | .fill_in_addresses = &p4_fill_in_addresses, | 713 | .fill_in_addresses = &p4_fill_in_addresses, |
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4da7230b3d1..8eb05878554 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c | |||
@@ -10,6 +10,7 @@ | |||
10 | * @author Philippe Elie | 10 | * @author Philippe Elie |
11 | * @author Graydon Hoare | 11 | * @author Graydon Hoare |
12 | * @author Andi Kleen | 12 | * @author Andi Kleen |
13 | * @author Robert Richter <robert.richter@amd.com> | ||
13 | */ | 14 | */ |
14 | 15 | ||
15 | #include <linux/oprofile.h> | 16 | #include <linux/oprofile.h> |
@@ -18,7 +19,6 @@ | |||
18 | #include <asm/msr.h> | 19 | #include <asm/msr.h> |
19 | #include <asm/apic.h> | 20 | #include <asm/apic.h> |
20 | #include <asm/nmi.h> | 21 | #include <asm/nmi.h> |
21 | #include <asm/perf_counter.h> | ||
22 | 22 | ||
23 | #include "op_x86_model.h" | 23 | #include "op_x86_model.h" |
24 | #include "op_counter.h" | 24 | #include "op_counter.h" |
@@ -26,20 +26,7 @@ | |||
26 | static int num_counters = 2; | 26 | static int num_counters = 2; |
27 | static int counter_width = 32; | 27 | static int counter_width = 32; |
28 | 28 | ||
29 | #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) | 29 | #define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) |
30 | #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) | ||
31 | |||
32 | #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) | ||
33 | #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) | ||
34 | #define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) | ||
35 | #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) | ||
36 | #define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) | ||
37 | #define CTRL_CLEAR(x) (x &= (1<<21)) | ||
38 | #define CTRL_SET_ENABLE(val) (val |= 1<<20) | ||
39 | #define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) | ||
40 | #define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) | ||
41 | #define CTRL_SET_UM(val, m) (val |= (m << 8)) | ||
42 | #define CTRL_SET_EVENT(val, e) (val |= e) | ||
43 | 30 | ||
44 | static u64 *reset_value; | 31 | static u64 *reset_value; |
45 | 32 | ||
@@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) | |||
63 | } | 50 | } |
64 | 51 | ||
65 | 52 | ||
66 | static void ppro_setup_ctrs(struct op_msrs const * const msrs) | 53 | static void ppro_setup_ctrs(struct op_x86_model_spec const *model, |
54 | struct op_msrs const * const msrs) | ||
67 | { | 55 | { |
68 | unsigned int low, high; | 56 | u64 val; |
69 | int i; | 57 | int i; |
70 | 58 | ||
71 | if (!reset_value) { | 59 | if (!reset_value) { |
@@ -93,36 +81,30 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) | |||
93 | } | 81 | } |
94 | 82 | ||
95 | /* clear all counters */ | 83 | /* clear all counters */ |
96 | for (i = 0 ; i < num_counters; ++i) { | 84 | for (i = 0; i < num_counters; ++i) { |
97 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) | 85 | if (unlikely(!msrs->controls[i].addr)) |
98 | continue; | 86 | continue; |
99 | CTRL_READ(low, high, msrs, i); | 87 | rdmsrl(msrs->controls[i].addr, val); |
100 | CTRL_CLEAR(low); | 88 | val &= model->reserved; |
101 | CTRL_WRITE(low, high, msrs, i); | 89 | wrmsrl(msrs->controls[i].addr, val); |
102 | } | 90 | } |
103 | 91 | ||
104 | /* avoid a false detection of ctr overflows in NMI handler */ | 92 | /* avoid a false detection of ctr overflows in NMI handler */ |
105 | for (i = 0; i < num_counters; ++i) { | 93 | for (i = 0; i < num_counters; ++i) { |
106 | if (unlikely(!CTR_IS_RESERVED(msrs, i))) | 94 | if (unlikely(!msrs->counters[i].addr)) |
107 | continue; | 95 | continue; |
108 | wrmsrl(msrs->counters[i].addr, -1LL); | 96 | wrmsrl(msrs->counters[i].addr, -1LL); |
109 | } | 97 | } |
110 | 98 | ||
111 | /* enable active counters */ | 99 | /* enable active counters */ |
112 | for (i = 0; i < num_counters; ++i) { | 100 | for (i = 0; i < num_counters; ++i) { |
113 | if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { | 101 | if (counter_config[i].enabled && msrs->counters[i].addr) { |
114 | reset_value[i] = counter_config[i].count; | 102 | reset_value[i] = counter_config[i].count; |
115 | |||
116 | wrmsrl(msrs->counters[i].addr, -reset_value[i]); | 103 | wrmsrl(msrs->counters[i].addr, -reset_value[i]); |
117 | 104 | rdmsrl(msrs->controls[i].addr, val); | |
118 | CTRL_READ(low, high, msrs, i); | 105 | val &= model->reserved; |
119 | CTRL_CLEAR(low); | 106 | val |= op_x86_get_ctrl(model, &counter_config[i]); |
120 | CTRL_SET_ENABLE(low); | 107 | wrmsrl(msrs->controls[i].addr, val); |
121 | CTRL_SET_USR(low, counter_config[i].user); | ||
122 | CTRL_SET_KERN(low, counter_config[i].kernel); | ||
123 | CTRL_SET_UM(low, counter_config[i].unit_mask); | ||
124 | CTRL_SET_EVENT(low, counter_config[i].event); | ||
125 | CTRL_WRITE(low, high, msrs, i); | ||
126 | } else { | 108 | } else { |
127 | reset_value[i] = 0; | 109 | reset_value[i] = 0; |
128 | } | 110 | } |
@@ -143,14 +125,14 @@ static int ppro_check_ctrs(struct pt_regs * const regs, | |||
143 | if (unlikely(!reset_value)) | 125 | if (unlikely(!reset_value)) |
144 | goto out; | 126 | goto out; |
145 | 127 | ||
146 | for (i = 0 ; i < num_counters; ++i) { | 128 | for (i = 0; i < num_counters; ++i) { |
147 | if (!reset_value[i]) | 129 | if (!reset_value[i]) |
148 | continue; | 130 | continue; |
149 | rdmsrl(msrs->counters[i].addr, val); | 131 | rdmsrl(msrs->counters[i].addr, val); |
150 | if (CTR_OVERFLOWED(val)) { | 132 | if (val & (1ULL << (counter_width - 1))) |
151 | oprofile_add_sample(regs, i); | 133 | continue; |
152 | wrmsrl(msrs->counters[i].addr, -reset_value[i]); | 134 | oprofile_add_sample(regs, i); |
153 | } | 135 | wrmsrl(msrs->counters[i].addr, -reset_value[i]); |
154 | } | 136 | } |
155 | 137 | ||
156 | out: | 138 | out: |
@@ -171,16 +153,16 @@ out: | |||
171 | 153 | ||
172 | static void ppro_start(struct op_msrs const * const msrs) | 154 | static void ppro_start(struct op_msrs const * const msrs) |
173 | { | 155 | { |
174 | unsigned int low, high; | 156 | u64 val; |
175 | int i; | 157 | int i; |
176 | 158 | ||
177 | if (!reset_value) | 159 | if (!reset_value) |
178 | return; | 160 | return; |
179 | for (i = 0; i < num_counters; ++i) { | 161 | for (i = 0; i < num_counters; ++i) { |
180 | if (reset_value[i]) { | 162 | if (reset_value[i]) { |
181 | CTRL_READ(low, high, msrs, i); | 163 | rdmsrl(msrs->controls[i].addr, val); |
182 | CTRL_SET_ACTIVE(low); | 164 | val |= ARCH_PERFMON_EVENTSEL0_ENABLE; |
183 | CTRL_WRITE(low, high, msrs, i); | 165 | wrmsrl(msrs->controls[i].addr, val); |
184 | } | 166 | } |
185 | } | 167 | } |
186 | } | 168 | } |
@@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs) | |||
188 | 170 | ||
189 | static void ppro_stop(struct op_msrs const * const msrs) | 171 | static void ppro_stop(struct op_msrs const * const msrs) |
190 | { | 172 | { |
191 | unsigned int low, high; | 173 | u64 val; |
192 | int i; | 174 | int i; |
193 | 175 | ||
194 | if (!reset_value) | 176 | if (!reset_value) |
@@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs) | |||
196 | for (i = 0; i < num_counters; ++i) { | 178 | for (i = 0; i < num_counters; ++i) { |
197 | if (!reset_value[i]) | 179 | if (!reset_value[i]) |
198 | continue; | 180 | continue; |
199 | CTRL_READ(low, high, msrs, i); | 181 | rdmsrl(msrs->controls[i].addr, val); |
200 | CTRL_SET_INACTIVE(low); | 182 | val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; |
201 | CTRL_WRITE(low, high, msrs, i); | 183 | wrmsrl(msrs->controls[i].addr, val); |
202 | } | 184 | } |
203 | } | 185 | } |
204 | 186 | ||
@@ -206,12 +188,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs) | |||
206 | { | 188 | { |
207 | int i; | 189 | int i; |
208 | 190 | ||
209 | for (i = 0 ; i < num_counters ; ++i) { | 191 | for (i = 0; i < num_counters; ++i) { |
210 | if (CTR_IS_RESERVED(msrs, i)) | 192 | if (msrs->counters[i].addr) |
211 | release_perfctr_nmi(MSR_P6_PERFCTR0 + i); | 193 | release_perfctr_nmi(MSR_P6_PERFCTR0 + i); |
212 | } | 194 | } |
213 | for (i = 0 ; i < num_counters ; ++i) { | 195 | for (i = 0; i < num_counters; ++i) { |
214 | if (CTRL_IS_RESERVED(msrs, i)) | 196 | if (msrs->controls[i].addr) |
215 | release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); | 197 | release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); |
216 | } | 198 | } |
217 | if (reset_value) { | 199 | if (reset_value) { |
@@ -222,8 +204,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) | |||
222 | 204 | ||
223 | 205 | ||
224 | struct op_x86_model_spec op_ppro_spec = { | 206 | struct op_x86_model_spec op_ppro_spec = { |
225 | .num_counters = 2, /* can be overriden */ | 207 | .num_counters = 2, |
226 | .num_controls = 2, /* dito */ | 208 | .num_controls = 2, |
209 | .reserved = MSR_PPRO_EVENTSEL_RESERVED, | ||
227 | .fill_in_addresses = &ppro_fill_in_addresses, | 210 | .fill_in_addresses = &ppro_fill_in_addresses, |
228 | .setup_ctrs = &ppro_setup_ctrs, | 211 | .setup_ctrs = &ppro_setup_ctrs, |
229 | .check_ctrs = &ppro_check_ctrs, | 212 | .check_ctrs = &ppro_check_ctrs, |
@@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = { | |||
241 | * the specific CPU. | 224 | * the specific CPU. |
242 | */ | 225 | */ |
243 | 226 | ||
244 | void arch_perfmon_setup_counters(void) | 227 | static void arch_perfmon_setup_counters(void) |
245 | { | 228 | { |
246 | union cpuid10_eax eax; | 229 | union cpuid10_eax eax; |
247 | 230 | ||
@@ -251,19 +234,25 @@ void arch_perfmon_setup_counters(void) | |||
251 | if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && | 234 | if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && |
252 | current_cpu_data.x86_model == 15) { | 235 | current_cpu_data.x86_model == 15) { |
253 | eax.split.version_id = 2; | 236 | eax.split.version_id = 2; |
254 | eax.split.num_counters = 2; | 237 | eax.split.num_events = 2; |
255 | eax.split.bit_width = 40; | 238 | eax.split.bit_width = 40; |
256 | } | 239 | } |
257 | 240 | ||
258 | num_counters = eax.split.num_counters; | 241 | num_counters = eax.split.num_events; |
259 | 242 | ||
260 | op_arch_perfmon_spec.num_counters = num_counters; | 243 | op_arch_perfmon_spec.num_counters = num_counters; |
261 | op_arch_perfmon_spec.num_controls = num_counters; | 244 | op_arch_perfmon_spec.num_controls = num_counters; |
262 | op_ppro_spec.num_counters = num_counters; | 245 | } |
263 | op_ppro_spec.num_controls = num_counters; | 246 | |
247 | static int arch_perfmon_init(struct oprofile_operations *ignore) | ||
248 | { | ||
249 | arch_perfmon_setup_counters(); | ||
250 | return 0; | ||
264 | } | 251 | } |
265 | 252 | ||
266 | struct op_x86_model_spec op_arch_perfmon_spec = { | 253 | struct op_x86_model_spec op_arch_perfmon_spec = { |
254 | .reserved = MSR_PPRO_EVENTSEL_RESERVED, | ||
255 | .init = &arch_perfmon_init, | ||
267 | /* num_counters/num_controls filled in at runtime */ | 256 | /* num_counters/num_controls filled in at runtime */ |
268 | .fill_in_addresses = &ppro_fill_in_addresses, | 257 | .fill_in_addresses = &ppro_fill_in_addresses, |
269 | /* user space does the cpuid check for available events */ | 258 | /* user space does the cpuid check for available events */ |
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 825e79064d6..7b8e75d1608 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h | |||
@@ -6,51 +6,66 @@ | |||
6 | * @remark Read the file COPYING | 6 | * @remark Read the file COPYING |
7 | * | 7 | * |
8 | * @author Graydon Hoare | 8 | * @author Graydon Hoare |
9 | * @author Robert Richter <robert.richter@amd.com> | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #ifndef OP_X86_MODEL_H | 12 | #ifndef OP_X86_MODEL_H |
12 | #define OP_X86_MODEL_H | 13 | #define OP_X86_MODEL_H |
13 | 14 | ||
14 | struct op_saved_msr { | 15 | #include <asm/types.h> |
15 | unsigned int high; | 16 | #include <asm/perf_event.h> |
16 | unsigned int low; | ||
17 | }; | ||
18 | 17 | ||
19 | struct op_msr { | 18 | struct op_msr { |
20 | unsigned long addr; | 19 | unsigned long addr; |
21 | struct op_saved_msr saved; | 20 | u64 saved; |
22 | }; | 21 | }; |
23 | 22 | ||
24 | struct op_msrs { | 23 | struct op_msrs { |
25 | struct op_msr *counters; | 24 | struct op_msr *counters; |
26 | struct op_msr *controls; | 25 | struct op_msr *controls; |
26 | struct op_msr *multiplex; | ||
27 | }; | 27 | }; |
28 | 28 | ||
29 | struct pt_regs; | 29 | struct pt_regs; |
30 | 30 | ||
31 | struct oprofile_operations; | ||
32 | |||
31 | /* The model vtable abstracts the differences between | 33 | /* The model vtable abstracts the differences between |
32 | * various x86 CPU models' perfctr support. | 34 | * various x86 CPU models' perfctr support. |
33 | */ | 35 | */ |
34 | struct op_x86_model_spec { | 36 | struct op_x86_model_spec { |
35 | int (*init)(struct oprofile_operations *ops); | 37 | unsigned int num_counters; |
36 | void (*exit)(void); | 38 | unsigned int num_controls; |
37 | unsigned int num_counters; | 39 | unsigned int num_virt_counters; |
38 | unsigned int num_controls; | 40 | u64 reserved; |
39 | void (*fill_in_addresses)(struct op_msrs * const msrs); | 41 | u16 event_mask; |
40 | void (*setup_ctrs)(struct op_msrs const * const msrs); | 42 | int (*init)(struct oprofile_operations *ops); |
41 | int (*check_ctrs)(struct pt_regs * const regs, | 43 | void (*exit)(void); |
42 | struct op_msrs const * const msrs); | 44 | void (*fill_in_addresses)(struct op_msrs * const msrs); |
43 | void (*start)(struct op_msrs const * const msrs); | 45 | void (*setup_ctrs)(struct op_x86_model_spec const *model, |
44 | void (*stop)(struct op_msrs const * const msrs); | 46 | struct op_msrs const * const msrs); |
45 | void (*shutdown)(struct op_msrs const * const msrs); | 47 | int (*check_ctrs)(struct pt_regs * const regs, |
48 | struct op_msrs const * const msrs); | ||
49 | void (*start)(struct op_msrs const * const msrs); | ||
50 | void (*stop)(struct op_msrs const * const msrs); | ||
51 | void (*shutdown)(struct op_msrs const * const msrs); | ||
52 | #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX | ||
53 | void (*switch_ctrl)(struct op_x86_model_spec const *model, | ||
54 | struct op_msrs const * const msrs); | ||
55 | #endif | ||
46 | }; | 56 | }; |
47 | 57 | ||
58 | struct op_counter_config; | ||
59 | |||
60 | extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, | ||
61 | struct op_counter_config *counter_config); | ||
62 | extern int op_x86_phys_to_virt(int phys); | ||
63 | extern int op_x86_virt_to_phys(int virt); | ||
64 | |||
48 | extern struct op_x86_model_spec op_ppro_spec; | 65 | extern struct op_x86_model_spec op_ppro_spec; |
49 | extern struct op_x86_model_spec const op_p4_spec; | 66 | extern struct op_x86_model_spec op_p4_spec; |
50 | extern struct op_x86_model_spec const op_p4_ht2_spec; | 67 | extern struct op_x86_model_spec op_p4_ht2_spec; |
51 | extern struct op_x86_model_spec const op_amd_spec; | 68 | extern struct op_x86_model_spec op_amd_spec; |
52 | extern struct op_x86_model_spec op_arch_perfmon_spec; | 69 | extern struct op_x86_model_spec op_arch_perfmon_spec; |
53 | 70 | ||
54 | extern void arch_perfmon_setup_counters(void); | ||
55 | |||
56 | #endif /* OP_X86_MODEL_H */ | 71 | #endif /* OP_X86_MODEL_H */ |
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 3ffa10df20b..572ee9782f2 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c | |||
@@ -15,63 +15,6 @@ | |||
15 | * also get peer root bus resource for io,mmio | 15 | * also get peer root bus resource for io,mmio |
16 | */ | 16 | */ |
17 | 17 | ||
18 | #ifdef CONFIG_NUMA | ||
19 | |||
20 | #define BUS_NR 256 | ||
21 | |||
22 | #ifdef CONFIG_X86_64 | ||
23 | |||
24 | static int mp_bus_to_node[BUS_NR]; | ||
25 | |||
26 | void set_mp_bus_to_node(int busnum, int node) | ||
27 | { | ||
28 | if (busnum >= 0 && busnum < BUS_NR) | ||
29 | mp_bus_to_node[busnum] = node; | ||
30 | } | ||
31 | |||
32 | int get_mp_bus_to_node(int busnum) | ||
33 | { | ||
34 | int node = -1; | ||
35 | |||
36 | if (busnum < 0 || busnum > (BUS_NR - 1)) | ||
37 | return node; | ||
38 | |||
39 | node = mp_bus_to_node[busnum]; | ||
40 | |||
41 | /* | ||
42 | * let numa_node_id to decide it later in dma_alloc_pages | ||
43 | * if there is no ram on that node | ||
44 | */ | ||
45 | if (node != -1 && !node_online(node)) | ||
46 | node = -1; | ||
47 | |||
48 | return node; | ||
49 | } | ||
50 | |||
51 | #else /* CONFIG_X86_32 */ | ||
52 | |||
53 | static unsigned char mp_bus_to_node[BUS_NR]; | ||
54 | |||
55 | void set_mp_bus_to_node(int busnum, int node) | ||
56 | { | ||
57 | if (busnum >= 0 && busnum < BUS_NR) | ||
58 | mp_bus_to_node[busnum] = (unsigned char) node; | ||
59 | } | ||
60 | |||
61 | int get_mp_bus_to_node(int busnum) | ||
62 | { | ||
63 | int node; | ||
64 | |||
65 | if (busnum < 0 || busnum > (BUS_NR - 1)) | ||
66 | return 0; | ||
67 | node = mp_bus_to_node[busnum]; | ||
68 | return node; | ||
69 | } | ||
70 | |||
71 | #endif /* CONFIG_X86_32 */ | ||
72 | |||
73 | #endif /* CONFIG_NUMA */ | ||
74 | |||
75 | #ifdef CONFIG_X86_64 | 18 | #ifdef CONFIG_X86_64 |
76 | 19 | ||
77 | /* | 20 | /* |
@@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void) | |||
301 | u64 val; | 244 | u64 val; |
302 | u32 address; | 245 | u32 address; |
303 | 246 | ||
304 | #ifdef CONFIG_NUMA | ||
305 | for (i = 0; i < BUS_NR; i++) | ||
306 | mp_bus_to_node[i] = -1; | ||
307 | #endif | ||
308 | |||
309 | if (!early_pci_allowed()) | 247 | if (!early_pci_allowed()) |
310 | return -1; | 248 | return -1; |
311 | 249 | ||
@@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void) | |||
346 | node = (reg >> 4) & 0x07; | 284 | node = (reg >> 4) & 0x07; |
347 | #ifdef CONFIG_NUMA | 285 | #ifdef CONFIG_NUMA |
348 | for (j = min_bus; j <= max_bus; j++) | 286 | for (j = min_bus; j <= max_bus; j++) |
349 | mp_bus_to_node[j] = (unsigned char) node; | 287 | set_mp_bus_to_node(j, node); |
350 | #endif | 288 | #endif |
351 | link = (reg >> 8) & 0x03; | 289 | link = (reg >> 8) & 0x03; |
352 | 290 | ||
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2202b6257b8..1331fcf2614 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c | |||
@@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) | |||
600 | { | 600 | { |
601 | return pci_scan_bus_on_node(busno, &pci_root_ops, -1); | 601 | return pci_scan_bus_on_node(busno, &pci_root_ops, -1); |
602 | } | 602 | } |
603 | |||
604 | /* | ||
605 | * NUMA info for PCI busses | ||
606 | * | ||
607 | * Early arch code is responsible for filling in reasonable values here. | ||
608 | * A node id of "-1" means "use current node". In other words, if a bus | ||
609 | * has a -1 node id, it's not tightly coupled to any particular chunk | ||
610 | * of memory (as is the case on some Nehalem systems). | ||
611 | */ | ||
612 | #ifdef CONFIG_NUMA | ||
613 | |||
614 | #define BUS_NR 256 | ||
615 | |||
616 | #ifdef CONFIG_X86_64 | ||
617 | |||
618 | static int mp_bus_to_node[BUS_NR] = { | ||
619 | [0 ... BUS_NR - 1] = -1 | ||
620 | }; | ||
621 | |||
622 | void set_mp_bus_to_node(int busnum, int node) | ||
623 | { | ||
624 | if (busnum >= 0 && busnum < BUS_NR) | ||
625 | mp_bus_to_node[busnum] = node; | ||
626 | } | ||
627 | |||
628 | int get_mp_bus_to_node(int busnum) | ||
629 | { | ||
630 | int node = -1; | ||
631 | |||
632 | if (busnum < 0 || busnum > (BUS_NR - 1)) | ||
633 | return node; | ||
634 | |||
635 | node = mp_bus_to_node[busnum]; | ||
636 | |||
637 | /* | ||
638 | * let numa_node_id to decide it later in dma_alloc_pages | ||
639 | * if there is no ram on that node | ||
640 | */ | ||
641 | if (node != -1 && !node_online(node)) | ||
642 | node = -1; | ||
643 | |||
644 | return node; | ||
645 | } | ||
646 | |||
647 | #else /* CONFIG_X86_32 */ | ||
648 | |||
649 | static int mp_bus_to_node[BUS_NR] = { | ||
650 | [0 ... BUS_NR - 1] = -1 | ||
651 | }; | ||
652 | |||
653 | void set_mp_bus_to_node(int busnum, int node) | ||
654 | { | ||
655 | if (busnum >= 0 && busnum < BUS_NR) | ||
656 | mp_bus_to_node[busnum] = (unsigned char) node; | ||
657 | } | ||
658 | |||
659 | int get_mp_bus_to_node(int busnum) | ||
660 | { | ||
661 | int node; | ||
662 | |||
663 | if (busnum < 0 || busnum > (BUS_NR - 1)) | ||
664 | return 0; | ||
665 | node = mp_bus_to_node[busnum]; | ||
666 | return node; | ||
667 | } | ||
668 | |||
669 | #endif /* CONFIG_X86_32 */ | ||
670 | |||
671 | #endif /* CONFIG_NUMA */ | ||
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index bd13c3e4c6d..347d882b3bb 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c | |||
@@ -192,13 +192,14 @@ struct pci_raw_ops pci_direct_conf2 = { | |||
192 | static int __init pci_sanity_check(struct pci_raw_ops *o) | 192 | static int __init pci_sanity_check(struct pci_raw_ops *o) |
193 | { | 193 | { |
194 | u32 x = 0; | 194 | u32 x = 0; |
195 | int devfn; | 195 | int year, devfn; |
196 | 196 | ||
197 | if (pci_probe & PCI_NO_CHECKS) | 197 | if (pci_probe & PCI_NO_CHECKS) |
198 | return 1; | 198 | return 1; |
199 | /* Assume Type 1 works for newer systems. | 199 | /* Assume Type 1 works for newer systems. |
200 | This handles machines that don't have anything on PCI Bus 0. */ | 200 | This handles machines that don't have anything on PCI Bus 0. */ |
201 | if (dmi_get_year(DMI_BIOS_DATE) >= 2001) | 201 | dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL); |
202 | if (year >= 2001) | ||
202 | return 1; | 203 | return 1; |
203 | 204 | ||
204 | for (devfn = 0; devfn < 0x100; devfn++) { | 205 | for (devfn = 0; devfn < 0x100; devfn++) { |
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 712443ec6d4..602c172d3bd 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c | |||
@@ -13,10 +13,14 @@ | |||
13 | #include <linux/pci.h> | 13 | #include <linux/pci.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/acpi.h> | 15 | #include <linux/acpi.h> |
16 | #include <linux/sfi_acpi.h> | ||
16 | #include <linux/bitmap.h> | 17 | #include <linux/bitmap.h> |
17 | #include <linux/sort.h> | 18 | #include <linux/sort.h> |
18 | #include <asm/e820.h> | 19 | #include <asm/e820.h> |
19 | #include <asm/pci_x86.h> | 20 | #include <asm/pci_x86.h> |
21 | #include <asm/acpi.h> | ||
22 | |||
23 | #define PREFIX "PCI: " | ||
20 | 24 | ||
21 | /* aperture is up to 256MB but BIOS may reserve less */ | 25 | /* aperture is up to 256MB but BIOS may reserve less */ |
22 | #define MMCONFIG_APER_MIN (2 * 1024*1024) | 26 | #define MMCONFIG_APER_MIN (2 * 1024*1024) |
@@ -491,7 +495,7 @@ static void __init pci_mmcfg_reject_broken(int early) | |||
491 | (unsigned int)cfg->start_bus_number, | 495 | (unsigned int)cfg->start_bus_number, |
492 | (unsigned int)cfg->end_bus_number); | 496 | (unsigned int)cfg->end_bus_number); |
493 | 497 | ||
494 | if (!early) | 498 | if (!early && !acpi_disabled) |
495 | valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); | 499 | valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); |
496 | 500 | ||
497 | if (valid) | 501 | if (valid) |
@@ -606,7 +610,7 @@ static void __init __pci_mmcfg_init(int early) | |||
606 | } | 610 | } |
607 | 611 | ||
608 | if (!known_bridge) | 612 | if (!known_bridge) |
609 | acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); | 613 | acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg); |
610 | 614 | ||
611 | pci_mmcfg_reject_broken(early); | 615 | pci_mmcfg_reject_broken(early); |
612 | 616 | ||
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 8b2d561046a..f10a7e94a84 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c | |||
@@ -11,9 +11,9 @@ | |||
11 | 11 | ||
12 | #include <linux/pci.h> | 12 | #include <linux/pci.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/acpi.h> | ||
15 | #include <asm/e820.h> | 14 | #include <asm/e820.h> |
16 | #include <asm/pci_x86.h> | 15 | #include <asm/pci_x86.h> |
16 | #include <acpi/acpi.h> | ||
17 | 17 | ||
18 | /* Assume systems with more busses have correct MCFG */ | 18 | /* Assume systems with more busses have correct MCFG */ |
19 | #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) | 19 | #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) |
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..8aa85f17667 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c | |||
@@ -242,11 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
242 | fix_processor_context(); | 242 | fix_processor_context(); |
243 | 243 | ||
244 | do_fpu_end(); | 244 | do_fpu_end(); |
245 | mtrr_ap_init(); | 245 | mtrr_bp_restore(); |
246 | |||
247 | #ifdef CONFIG_X86_OLD_MCE | ||
248 | mcheck_init(&boot_cpu_data); | ||
249 | #endif | ||
250 | } | 246 | } |
251 | 247 | ||
252 | /* Needed by apm.c */ | 248 | /* Needed by apm.c */ |
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 88112b49f02..6b4ffedb93c 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile | |||
@@ -122,7 +122,7 @@ quiet_cmd_vdso = VDSO $@ | |||
122 | $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ | 122 | $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ |
123 | -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) | 123 | -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) |
124 | 124 | ||
125 | VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) | 125 | VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) |
126 | GCOV_PROFILE := n | 126 | GCOV_PROFILE := n |
127 | 127 | ||
128 | # | 128 | # |
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 172438f86a0..3bb4fc21f4f 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -5,6 +5,11 @@ CFLAGS_REMOVE_time.o = -pg | |||
5 | CFLAGS_REMOVE_irq.o = -pg | 5 | CFLAGS_REMOVE_irq.o = -pg |
6 | endif | 6 | endif |
7 | 7 | ||
8 | # Make sure early boot has no stackprotector | ||
9 | nostackp := $(call cc-option, -fno-stack-protector) | ||
10 | CFLAGS_enlighten.o := $(nostackp) | ||
11 | CFLAGS_mmu.o := $(nostackp) | ||
12 | |||
8 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | 13 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ |
9 | time.o xen-asm.o xen-asm_$(BITS).o \ | 14 | time.o xen-asm.o xen-asm_$(BITS).o \ |
10 | grant-table.o suspend.o | 15 | grant-table.o suspend.o |
@@ -12,3 +17,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | |||
12 | obj-$(CONFIG_SMP) += smp.o | 17 | obj-$(CONFIG_SMP) += smp.o |
13 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o | 18 | obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o |
14 | obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o | 19 | obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o |
20 | |||
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0a1700a2be9..3439616d69f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <asm/pgtable.h> | 51 | #include <asm/pgtable.h> |
52 | #include <asm/tlbflush.h> | 52 | #include <asm/tlbflush.h> |
53 | #include <asm/reboot.h> | 53 | #include <asm/reboot.h> |
54 | #include <asm/stackprotector.h> | ||
54 | 55 | ||
55 | #include "xen-ops.h" | 56 | #include "xen-ops.h" |
56 | #include "mmu.h" | 57 | #include "mmu.h" |
@@ -215,6 +216,7 @@ static __init void xen_init_cpuid_mask(void) | |||
215 | (1 << X86_FEATURE_ACPI)); /* disable ACPI */ | 216 | (1 << X86_FEATURE_ACPI)); /* disable ACPI */ |
216 | 217 | ||
217 | ax = 1; | 218 | ax = 1; |
219 | cx = 0; | ||
218 | xen_cpuid(&ax, &bx, &cx, &dx); | 220 | xen_cpuid(&ax, &bx, &cx, &dx); |
219 | 221 | ||
220 | /* cpuid claims we support xsave; try enabling it to see what happens */ | 222 | /* cpuid claims we support xsave; try enabling it to see what happens */ |
@@ -329,18 +331,28 @@ static void xen_load_gdt(const struct desc_ptr *dtr) | |||
329 | unsigned long frames[pages]; | 331 | unsigned long frames[pages]; |
330 | int f; | 332 | int f; |
331 | 333 | ||
332 | /* A GDT can be up to 64k in size, which corresponds to 8192 | 334 | /* |
333 | 8-byte entries, or 16 4k pages.. */ | 335 | * A GDT can be up to 64k in size, which corresponds to 8192 |
336 | * 8-byte entries, or 16 4k pages.. | ||
337 | */ | ||
334 | 338 | ||
335 | BUG_ON(size > 65536); | 339 | BUG_ON(size > 65536); |
336 | BUG_ON(va & ~PAGE_MASK); | 340 | BUG_ON(va & ~PAGE_MASK); |
337 | 341 | ||
338 | for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { | 342 | for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { |
339 | int level; | 343 | int level; |
340 | pte_t *ptep = lookup_address(va, &level); | 344 | pte_t *ptep; |
341 | unsigned long pfn, mfn; | 345 | unsigned long pfn, mfn; |
342 | void *virt; | 346 | void *virt; |
343 | 347 | ||
348 | /* | ||
349 | * The GDT is per-cpu and is in the percpu data area. | ||
350 | * That can be virtually mapped, so we need to do a | ||
351 | * page-walk to get the underlying MFN for the | ||
352 | * hypercall. The page can also be in the kernel's | ||
353 | * linear range, so we need to RO that mapping too. | ||
354 | */ | ||
355 | ptep = lookup_address(va, &level); | ||
344 | BUG_ON(ptep == NULL); | 356 | BUG_ON(ptep == NULL); |
345 | 357 | ||
346 | pfn = pte_pfn(*ptep); | 358 | pfn = pte_pfn(*ptep); |
@@ -357,6 +369,44 @@ static void xen_load_gdt(const struct desc_ptr *dtr) | |||
357 | BUG(); | 369 | BUG(); |
358 | } | 370 | } |
359 | 371 | ||
372 | /* | ||
373 | * load_gdt for early boot, when the gdt is only mapped once | ||
374 | */ | ||
375 | static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) | ||
376 | { | ||
377 | unsigned long va = dtr->address; | ||
378 | unsigned int size = dtr->size + 1; | ||
379 | unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; | ||
380 | unsigned long frames[pages]; | ||
381 | int f; | ||
382 | |||
383 | /* | ||
384 | * A GDT can be up to 64k in size, which corresponds to 8192 | ||
385 | * 8-byte entries, or 16 4k pages.. | ||
386 | */ | ||
387 | |||
388 | BUG_ON(size > 65536); | ||
389 | BUG_ON(va & ~PAGE_MASK); | ||
390 | |||
391 | for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { | ||
392 | pte_t pte; | ||
393 | unsigned long pfn, mfn; | ||
394 | |||
395 | pfn = virt_to_pfn(va); | ||
396 | mfn = pfn_to_mfn(pfn); | ||
397 | |||
398 | pte = pfn_pte(pfn, PAGE_KERNEL_RO); | ||
399 | |||
400 | if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) | ||
401 | BUG(); | ||
402 | |||
403 | frames[f] = mfn; | ||
404 | } | ||
405 | |||
406 | if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) | ||
407 | BUG(); | ||
408 | } | ||
409 | |||
360 | static void load_TLS_descriptor(struct thread_struct *t, | 410 | static void load_TLS_descriptor(struct thread_struct *t, |
361 | unsigned int cpu, unsigned int i) | 411 | unsigned int cpu, unsigned int i) |
362 | { | 412 | { |
@@ -580,6 +630,29 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, | |||
580 | preempt_enable(); | 630 | preempt_enable(); |
581 | } | 631 | } |
582 | 632 | ||
633 | /* | ||
634 | * Version of write_gdt_entry for use at early boot-time needed to | ||
635 | * update an entry as simply as possible. | ||
636 | */ | ||
637 | static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, | ||
638 | const void *desc, int type) | ||
639 | { | ||
640 | switch (type) { | ||
641 | case DESC_LDT: | ||
642 | case DESC_TSS: | ||
643 | /* ignore */ | ||
644 | break; | ||
645 | |||
646 | default: { | ||
647 | xmaddr_t maddr = virt_to_machine(&dt[entry]); | ||
648 | |||
649 | if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) | ||
650 | dt[entry] = *(struct desc_struct *)desc; | ||
651 | } | ||
652 | |||
653 | } | ||
654 | } | ||
655 | |||
583 | static void xen_load_sp0(struct tss_struct *tss, | 656 | static void xen_load_sp0(struct tss_struct *tss, |
584 | struct thread_struct *thread) | 657 | struct thread_struct *thread) |
585 | { | 658 | { |
@@ -713,7 +786,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) | |||
713 | set: | 786 | set: |
714 | base = ((u64)high << 32) | low; | 787 | base = ((u64)high << 32) | low; |
715 | if (HYPERVISOR_set_segment_base(which, base) != 0) | 788 | if (HYPERVISOR_set_segment_base(which, base) != 0) |
716 | ret = -EFAULT; | 789 | ret = -EIO; |
717 | break; | 790 | break; |
718 | #endif | 791 | #endif |
719 | 792 | ||
@@ -839,19 +912,9 @@ static const struct pv_info xen_info __initdata = { | |||
839 | 912 | ||
840 | static const struct pv_init_ops xen_init_ops __initdata = { | 913 | static const struct pv_init_ops xen_init_ops __initdata = { |
841 | .patch = xen_patch, | 914 | .patch = xen_patch, |
842 | |||
843 | .banner = xen_banner, | ||
844 | .memory_setup = xen_memory_setup, | ||
845 | .arch_setup = xen_arch_setup, | ||
846 | .post_allocator_init = xen_post_allocator_init, | ||
847 | }; | 915 | }; |
848 | 916 | ||
849 | static const struct pv_time_ops xen_time_ops __initdata = { | 917 | static const struct pv_time_ops xen_time_ops __initdata = { |
850 | .time_init = xen_time_init, | ||
851 | |||
852 | .set_wallclock = xen_set_wallclock, | ||
853 | .get_wallclock = xen_get_wallclock, | ||
854 | .get_tsc_khz = xen_tsc_khz, | ||
855 | .sched_clock = xen_sched_clock, | 918 | .sched_clock = xen_sched_clock, |
856 | }; | 919 | }; |
857 | 920 | ||
@@ -917,8 +980,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { | |||
917 | 980 | ||
918 | static const struct pv_apic_ops xen_apic_ops __initdata = { | 981 | static const struct pv_apic_ops xen_apic_ops __initdata = { |
919 | #ifdef CONFIG_X86_LOCAL_APIC | 982 | #ifdef CONFIG_X86_LOCAL_APIC |
920 | .setup_boot_clock = paravirt_nop, | ||
921 | .setup_secondary_clock = paravirt_nop, | ||
922 | .startup_ipi_hook = paravirt_nop, | 983 | .startup_ipi_hook = paravirt_nop, |
923 | #endif | 984 | #endif |
924 | }; | 985 | }; |
@@ -964,6 +1025,23 @@ static const struct machine_ops __initdata xen_machine_ops = { | |||
964 | .emergency_restart = xen_emergency_restart, | 1025 | .emergency_restart = xen_emergency_restart, |
965 | }; | 1026 | }; |
966 | 1027 | ||
1028 | /* | ||
1029 | * Set up the GDT and segment registers for -fstack-protector. Until | ||
1030 | * we do this, we have to be careful not to call any stack-protected | ||
1031 | * function, which is most of the kernel. | ||
1032 | */ | ||
1033 | static void __init xen_setup_stackprotector(void) | ||
1034 | { | ||
1035 | pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; | ||
1036 | pv_cpu_ops.load_gdt = xen_load_gdt_boot; | ||
1037 | |||
1038 | setup_stack_canary_segment(0); | ||
1039 | switch_to_new_gdt(0); | ||
1040 | |||
1041 | pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; | ||
1042 | pv_cpu_ops.load_gdt = xen_load_gdt; | ||
1043 | } | ||
1044 | |||
967 | /* First C function to be called on Xen boot */ | 1045 | /* First C function to be called on Xen boot */ |
968 | asmlinkage void __init xen_start_kernel(void) | 1046 | asmlinkage void __init xen_start_kernel(void) |
969 | { | 1047 | { |
@@ -974,20 +1052,55 @@ asmlinkage void __init xen_start_kernel(void) | |||
974 | 1052 | ||
975 | xen_domain_type = XEN_PV_DOMAIN; | 1053 | xen_domain_type = XEN_PV_DOMAIN; |
976 | 1054 | ||
977 | BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); | ||
978 | |||
979 | xen_setup_features(); | ||
980 | |||
981 | /* Install Xen paravirt ops */ | 1055 | /* Install Xen paravirt ops */ |
982 | pv_info = xen_info; | 1056 | pv_info = xen_info; |
983 | pv_init_ops = xen_init_ops; | 1057 | pv_init_ops = xen_init_ops; |
984 | pv_time_ops = xen_time_ops; | 1058 | pv_time_ops = xen_time_ops; |
985 | pv_cpu_ops = xen_cpu_ops; | 1059 | pv_cpu_ops = xen_cpu_ops; |
986 | pv_apic_ops = xen_apic_ops; | 1060 | pv_apic_ops = xen_apic_ops; |
987 | pv_mmu_ops = xen_mmu_ops; | ||
988 | 1061 | ||
989 | xen_init_irq_ops(); | 1062 | x86_init.resources.memory_setup = xen_memory_setup; |
1063 | x86_init.oem.arch_setup = xen_arch_setup; | ||
1064 | x86_init.oem.banner = xen_banner; | ||
1065 | |||
1066 | x86_init.timers.timer_init = xen_time_init; | ||
1067 | x86_init.timers.setup_percpu_clockev = x86_init_noop; | ||
1068 | x86_cpuinit.setup_percpu_clockev = x86_init_noop; | ||
1069 | |||
1070 | x86_platform.calibrate_tsc = xen_tsc_khz; | ||
1071 | x86_platform.get_wallclock = xen_get_wallclock; | ||
1072 | x86_platform.set_wallclock = xen_set_wallclock; | ||
1073 | |||
1074 | /* | ||
1075 | * Set up some pagetable state before starting to set any ptes. | ||
1076 | */ | ||
1077 | |||
1078 | /* Prevent unwanted bits from being set in PTEs. */ | ||
1079 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
1080 | if (!xen_initial_domain()) | ||
1081 | __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); | ||
1082 | |||
1083 | __supported_pte_mask |= _PAGE_IOMAP; | ||
1084 | |||
1085 | #ifdef CONFIG_X86_64 | ||
1086 | /* Work out if we support NX */ | ||
1087 | check_efer(); | ||
1088 | #endif | ||
1089 | |||
1090 | xen_setup_features(); | ||
990 | 1091 | ||
1092 | /* Get mfn list */ | ||
1093 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
1094 | xen_build_dynamic_phys_to_machine(); | ||
1095 | |||
1096 | /* | ||
1097 | * Set up kernel GDT and segment registers, mainly so that | ||
1098 | * -fstack-protector code can be executed. | ||
1099 | */ | ||
1100 | xen_setup_stackprotector(); | ||
1101 | |||
1102 | xen_init_mmu_ops(); | ||
1103 | xen_init_irq_ops(); | ||
991 | xen_init_cpuid_mask(); | 1104 | xen_init_cpuid_mask(); |
992 | 1105 | ||
993 | #ifdef CONFIG_X86_LOCAL_APIC | 1106 | #ifdef CONFIG_X86_LOCAL_APIC |
@@ -1004,13 +1117,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1004 | 1117 | ||
1005 | machine_ops = xen_machine_ops; | 1118 | machine_ops = xen_machine_ops; |
1006 | 1119 | ||
1007 | #ifdef CONFIG_X86_64 | ||
1008 | /* | ||
1009 | * Setup percpu state. We only need to do this for 64-bit | ||
1010 | * because 32-bit already has %fs set properly. | ||
1011 | */ | ||
1012 | load_percpu_segment(0); | ||
1013 | #endif | ||
1014 | /* | 1120 | /* |
1015 | * The only reliable way to retain the initial address of the | 1121 | * The only reliable way to retain the initial address of the |
1016 | * percpu gdt_page is to remember it here, so we can go and | 1122 | * percpu gdt_page is to remember it here, so we can go and |
@@ -1020,22 +1126,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
1020 | 1126 | ||
1021 | xen_smp_init(); | 1127 | xen_smp_init(); |
1022 | 1128 | ||
1023 | /* Get mfn list */ | ||
1024 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
1025 | xen_build_dynamic_phys_to_machine(); | ||
1026 | |||
1027 | pgd = (pgd_t *)xen_start_info->pt_base; | 1129 | pgd = (pgd_t *)xen_start_info->pt_base; |
1028 | 1130 | ||
1029 | /* Prevent unwanted bits from being set in PTEs. */ | ||
1030 | __supported_pte_mask &= ~_PAGE_GLOBAL; | ||
1031 | if (!xen_initial_domain()) | ||
1032 | __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); | ||
1033 | |||
1034 | #ifdef CONFIG_X86_64 | ||
1035 | /* Work out if we support NX */ | ||
1036 | check_efer(); | ||
1037 | #endif | ||
1038 | |||
1039 | /* Don't do the full vcpu_info placement stuff until we have a | 1131 | /* Don't do the full vcpu_info placement stuff until we have a |
1040 | possible map and a non-dummy shared_info. */ | 1132 | possible map and a non-dummy shared_info. */ |
1041 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; | 1133 | per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; |
@@ -1061,6 +1153,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1061 | /* set up basic CPUID stuff */ | 1153 | /* set up basic CPUID stuff */ |
1062 | cpu_detect(&new_cpu_data); | 1154 | cpu_detect(&new_cpu_data); |
1063 | new_cpu_data.hard_math = 1; | 1155 | new_cpu_data.hard_math = 1; |
1156 | new_cpu_data.wp_works_ok = 1; | ||
1064 | new_cpu_data.x86_capability[0] = cpuid_edx(1); | 1157 | new_cpu_data.x86_capability[0] = cpuid_edx(1); |
1065 | #endif | 1158 | #endif |
1066 | 1159 | ||
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index cfd17799bd6..9d30105a0c4 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c | |||
@@ -1,5 +1,7 @@ | |||
1 | #include <linux/hardirq.h> | 1 | #include <linux/hardirq.h> |
2 | 2 | ||
3 | #include <asm/x86_init.h> | ||
4 | |||
3 | #include <xen/interface/xen.h> | 5 | #include <xen/interface/xen.h> |
4 | #include <xen/interface/sched.h> | 6 | #include <xen/interface/sched.h> |
5 | #include <xen/interface/vcpu.h> | 7 | #include <xen/interface/vcpu.h> |
@@ -112,8 +114,6 @@ static void xen_halt(void) | |||
112 | } | 114 | } |
113 | 115 | ||
114 | static const struct pv_irq_ops xen_irq_ops __initdata = { | 116 | static const struct pv_irq_ops xen_irq_ops __initdata = { |
115 | .init_IRQ = xen_init_IRQ, | ||
116 | |||
117 | .save_fl = PV_CALLEE_SAVE(xen_save_fl), | 117 | .save_fl = PV_CALLEE_SAVE(xen_save_fl), |
118 | .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), | 118 | .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), |
119 | .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), | 119 | .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), |
@@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { | |||
129 | void __init xen_init_irq_ops() | 129 | void __init xen_init_irq_ops() |
130 | { | 130 | { |
131 | pv_irq_ops = xen_irq_ops; | 131 | pv_irq_ops = xen_irq_ops; |
132 | x86_init.irqs.intr_init = xen_init_IRQ; | ||
132 | } | 133 | } |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ceb2858165..3bf7b1d250c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm) | |||
1165 | /* Get the "official" set of cpus referring to our pagetable. */ | 1165 | /* Get the "official" set of cpus referring to our pagetable. */ |
1166 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { | 1166 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { |
1167 | for_each_online_cpu(cpu) { | 1167 | for_each_online_cpu(cpu) { |
1168 | if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) | 1168 | if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) |
1169 | && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) | 1169 | && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) |
1170 | continue; | 1170 | continue; |
1171 | smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); | 1171 | smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); |
1172 | } | 1172 | } |
1173 | return; | 1173 | return; |
1174 | } | 1174 | } |
1175 | cpumask_copy(mask, &mm->cpu_vm_mask); | 1175 | cpumask_copy(mask, mm_cpumask(mm)); |
1176 | 1176 | ||
1177 | /* It's possible that a vcpu may have a stale reference to our | 1177 | /* It's possible that a vcpu may have a stale reference to our |
1178 | cr3, because it's in lazy mode, and it hasn't yet flushed | 1178 | cr3, because it's in lazy mode, and it hasn't yet flushed |
@@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base) | |||
1229 | { | 1229 | { |
1230 | } | 1230 | } |
1231 | 1231 | ||
1232 | static void xen_post_allocator_init(void); | ||
1233 | |||
1232 | static __init void xen_pagetable_setup_done(pgd_t *base) | 1234 | static __init void xen_pagetable_setup_done(pgd_t *base) |
1233 | { | 1235 | { |
1234 | xen_setup_shared_info(); | 1236 | xen_setup_shared_info(); |
1237 | xen_post_allocator_init(); | ||
1235 | } | 1238 | } |
1236 | 1239 | ||
1237 | static void xen_write_cr2(unsigned long cr2) | 1240 | static void xen_write_cr2(unsigned long cr2) |
@@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | |||
1841 | #endif | 1844 | #endif |
1842 | } | 1845 | } |
1843 | 1846 | ||
1844 | __init void xen_post_allocator_init(void) | 1847 | static __init void xen_post_allocator_init(void) |
1845 | { | 1848 | { |
1846 | pv_mmu_ops.set_pte = xen_set_pte; | 1849 | pv_mmu_ops.set_pte = xen_set_pte; |
1847 | pv_mmu_ops.set_pmd = xen_set_pmd; | 1850 | pv_mmu_ops.set_pmd = xen_set_pmd; |
@@ -1875,10 +1878,7 @@ static void xen_leave_lazy_mmu(void) | |||
1875 | preempt_enable(); | 1878 | preempt_enable(); |
1876 | } | 1879 | } |
1877 | 1880 | ||
1878 | const struct pv_mmu_ops xen_mmu_ops __initdata = { | 1881 | static const struct pv_mmu_ops xen_mmu_ops __initdata = { |
1879 | .pagetable_setup_start = xen_pagetable_setup_start, | ||
1880 | .pagetable_setup_done = xen_pagetable_setup_done, | ||
1881 | |||
1882 | .read_cr2 = xen_read_cr2, | 1882 | .read_cr2 = xen_read_cr2, |
1883 | .write_cr2 = xen_write_cr2, | 1883 | .write_cr2 = xen_write_cr2, |
1884 | 1884 | ||
@@ -1954,6 +1954,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
1954 | .set_fixmap = xen_set_fixmap, | 1954 | .set_fixmap = xen_set_fixmap, |
1955 | }; | 1955 | }; |
1956 | 1956 | ||
1957 | void __init xen_init_mmu_ops(void) | ||
1958 | { | ||
1959 | x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; | ||
1960 | x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; | ||
1961 | pv_mmu_ops = xen_mmu_ops; | ||
1962 | } | ||
1957 | 1963 | ||
1958 | #ifdef CONFIG_XEN_DEBUG_FS | 1964 | #ifdef CONFIG_XEN_DEBUG_FS |
1959 | 1965 | ||
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index da730262489..5fe6bc7f5ec 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | |||
59 | 59 | ||
60 | unsigned long xen_read_cr2_direct(void); | 60 | unsigned long xen_read_cr2_direct(void); |
61 | 61 | ||
62 | extern const struct pv_mmu_ops xen_mmu_ops; | 62 | extern void xen_init_mmu_ops(void); |
63 | #endif /* _XEN_MMU_H */ | 63 | #endif /* _XEN_MMU_H */ |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 429834ec168..fe03eeed7b4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -236,6 +236,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) | |||
236 | ctxt->user_regs.ss = __KERNEL_DS; | 236 | ctxt->user_regs.ss = __KERNEL_DS; |
237 | #ifdef CONFIG_X86_32 | 237 | #ifdef CONFIG_X86_32 |
238 | ctxt->user_regs.fs = __KERNEL_PERCPU; | 238 | ctxt->user_regs.fs = __KERNEL_PERCPU; |
239 | ctxt->user_regs.gs = __KERNEL_STACK_CANARY; | ||
239 | #else | 240 | #else |
240 | ctxt->gs_base_kernel = per_cpu_offset(cpu); | 241 | ctxt->gs_base_kernel = per_cpu_offset(cpu); |
241 | #endif | 242 | #endif |
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 5601506f2dd..36a5141108d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c | |||
@@ -187,7 +187,6 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl | |||
187 | struct xen_spinlock *prev; | 187 | struct xen_spinlock *prev; |
188 | int irq = __get_cpu_var(lock_kicker_irq); | 188 | int irq = __get_cpu_var(lock_kicker_irq); |
189 | int ret; | 189 | int ret; |
190 | unsigned long flags; | ||
191 | u64 start; | 190 | u64 start; |
192 | 191 | ||
193 | /* If kicker interrupts not initialized yet, just spin */ | 192 | /* If kicker interrupts not initialized yet, just spin */ |
@@ -199,16 +198,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl | |||
199 | /* announce we're spinning */ | 198 | /* announce we're spinning */ |
200 | prev = spinning_lock(xl); | 199 | prev = spinning_lock(xl); |
201 | 200 | ||
202 | flags = __raw_local_save_flags(); | ||
203 | if (irq_enable) { | ||
204 | ADD_STATS(taken_slow_irqenable, 1); | ||
205 | raw_local_irq_enable(); | ||
206 | } | ||
207 | |||
208 | ADD_STATS(taken_slow, 1); | 201 | ADD_STATS(taken_slow, 1); |
209 | ADD_STATS(taken_slow_nested, prev != NULL); | 202 | ADD_STATS(taken_slow_nested, prev != NULL); |
210 | 203 | ||
211 | do { | 204 | do { |
205 | unsigned long flags; | ||
206 | |||
212 | /* clear pending */ | 207 | /* clear pending */ |
213 | xen_clear_irq_pending(irq); | 208 | xen_clear_irq_pending(irq); |
214 | 209 | ||
@@ -228,6 +223,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl | |||
228 | goto out; | 223 | goto out; |
229 | } | 224 | } |
230 | 225 | ||
226 | flags = __raw_local_save_flags(); | ||
227 | if (irq_enable) { | ||
228 | ADD_STATS(taken_slow_irqenable, 1); | ||
229 | raw_local_irq_enable(); | ||
230 | } | ||
231 | |||
231 | /* | 232 | /* |
232 | * Block until irq becomes pending. If we're | 233 | * Block until irq becomes pending. If we're |
233 | * interrupted at this point (after the trylock but | 234 | * interrupted at this point (after the trylock but |
@@ -238,13 +239,15 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl | |||
238 | * pending. | 239 | * pending. |
239 | */ | 240 | */ |
240 | xen_poll_irq(irq); | 241 | xen_poll_irq(irq); |
242 | |||
243 | raw_local_irq_restore(flags); | ||
244 | |||
241 | ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); | 245 | ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); |
242 | } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ | 246 | } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ |
243 | 247 | ||
244 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); | 248 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); |
245 | 249 | ||
246 | out: | 250 | out: |
247 | raw_local_irq_restore(flags); | ||
248 | unspinning_lock(xl, prev); | 251 | unspinning_lock(xl, prev); |
249 | spin_time_accum_blocked(start); | 252 | spin_time_accum_blocked(start); |
250 | 253 | ||
@@ -323,8 +326,13 @@ static void xen_spin_unlock(struct raw_spinlock *lock) | |||
323 | smp_wmb(); /* make sure no writes get moved after unlock */ | 326 | smp_wmb(); /* make sure no writes get moved after unlock */ |
324 | xl->lock = 0; /* release lock */ | 327 | xl->lock = 0; /* release lock */ |
325 | 328 | ||
326 | /* make sure unlock happens before kick */ | 329 | /* |
327 | barrier(); | 330 | * Make sure unlock happens before checking for waiting |
331 | * spinners. We need a strong barrier to enforce the | ||
332 | * write-read ordering to different memory locations, as the | ||
333 | * CPU makes no implied guarantees about their ordering. | ||
334 | */ | ||
335 | mb(); | ||
328 | 336 | ||
329 | if (unlikely(xl->spinners)) | 337 | if (unlikely(xl->spinners)) |
330 | xen_spin_unlock_slow(xl); | 338 | xen_spin_unlock_slow(xl); |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 22494fd4c9b..355fa6b99c9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); | |||
30 | void xen_ident_map_ISA(void); | 30 | void xen_ident_map_ISA(void); |
31 | void xen_reserve_top(void); | 31 | void xen_reserve_top(void); |
32 | 32 | ||
33 | void xen_post_allocator_init(void); | ||
34 | |||
35 | char * __init xen_memory_setup(void); | 33 | char * __init xen_memory_setup(void); |
36 | void __init xen_arch_setup(void); | 34 | void __init xen_arch_setup(void); |
37 | void __init xen_init_IRQ(void); | 35 | void __init xen_init_IRQ(void); |