diff options
author | Arjan van de Ven <arjan@linux.intel.com> | 2008-10-17 12:20:26 -0400 |
---|---|---|
committer | Arjan van de Ven <arjan@linux.intel.com> | 2008-10-17 12:20:26 -0400 |
commit | 651dab4264e4ba0e563f5ff56f748127246e9065 (patch) | |
tree | 016630974bdcb00fe529b673f96d389e0fd6dc94 /arch/x86 | |
parent | 40b8606253552109815786e5d4b0de98782d31f5 (diff) | |
parent | 2e532d68a2b3e2aa6b19731501222069735c741c (diff) |
Merge commit 'linus/master' into merge-linus
Conflicts:
arch/x86/kvm/i8254.c
Diffstat (limited to 'arch/x86')
229 files changed, 17032 insertions, 9459 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..bd3c2c53873e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -18,6 +18,7 @@ config X86_64 | |||
18 | ### Arch settings | 18 | ### Arch settings |
19 | config X86 | 19 | config X86 |
20 | def_bool y | 20 | def_bool y |
21 | select HAVE_AOUT if X86_32 | ||
21 | select HAVE_UNSTABLE_SCHED_CLOCK | 22 | select HAVE_UNSTABLE_SCHED_CLOCK |
22 | select HAVE_IDE | 23 | select HAVE_IDE |
23 | select HAVE_OPROFILE | 24 | select HAVE_OPROFILE |
@@ -29,6 +30,7 @@ config X86 | |||
29 | select HAVE_FTRACE | 30 | select HAVE_FTRACE |
30 | select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) | 31 | select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) |
31 | select HAVE_ARCH_KGDB if !X86_VOYAGER | 32 | select HAVE_ARCH_KGDB if !X86_VOYAGER |
33 | select HAVE_ARCH_TRACEHOOK | ||
32 | select HAVE_GENERIC_DMA_COHERENT if X86_32 | 34 | select HAVE_GENERIC_DMA_COHERENT if X86_32 |
33 | select HAVE_EFFICIENT_UNALIGNED_ACCESS | 35 | select HAVE_EFFICIENT_UNALIGNED_ACCESS |
34 | 36 | ||
@@ -37,10 +39,6 @@ config ARCH_DEFCONFIG | |||
37 | default "arch/x86/configs/i386_defconfig" if X86_32 | 39 | default "arch/x86/configs/i386_defconfig" if X86_32 |
38 | default "arch/x86/configs/x86_64_defconfig" if X86_64 | 40 | default "arch/x86/configs/x86_64_defconfig" if X86_64 |
39 | 41 | ||
40 | |||
41 | config GENERIC_LOCKBREAK | ||
42 | def_bool n | ||
43 | |||
44 | config GENERIC_TIME | 42 | config GENERIC_TIME |
45 | def_bool y | 43 | def_bool y |
46 | 44 | ||
@@ -93,7 +91,7 @@ config GENERIC_HWEIGHT | |||
93 | def_bool y | 91 | def_bool y |
94 | 92 | ||
95 | config GENERIC_GPIO | 93 | config GENERIC_GPIO |
96 | def_bool n | 94 | bool |
97 | 95 | ||
98 | config ARCH_MAY_HAVE_PC_FDC | 96 | config ARCH_MAY_HAVE_PC_FDC |
99 | def_bool y | 97 | def_bool y |
@@ -104,12 +102,6 @@ config RWSEM_GENERIC_SPINLOCK | |||
104 | config RWSEM_XCHGADD_ALGORITHM | 102 | config RWSEM_XCHGADD_ALGORITHM |
105 | def_bool X86_XADD | 103 | def_bool X86_XADD |
106 | 104 | ||
107 | config ARCH_HAS_ILOG2_U32 | ||
108 | def_bool n | ||
109 | |||
110 | config ARCH_HAS_ILOG2_U64 | ||
111 | def_bool n | ||
112 | |||
113 | config ARCH_HAS_CPU_IDLE_WAIT | 105 | config ARCH_HAS_CPU_IDLE_WAIT |
114 | def_bool y | 106 | def_bool y |
115 | 107 | ||
@@ -151,9 +143,6 @@ config AUDIT_ARCH | |||
151 | bool | 143 | bool |
152 | default X86_64 | 144 | default X86_64 |
153 | 145 | ||
154 | config ARCH_SUPPORTS_AOUT | ||
155 | def_bool y | ||
156 | |||
157 | config ARCH_SUPPORTS_OPTIMIZED_INLINING | 146 | config ARCH_SUPPORTS_OPTIMIZED_INLINING |
158 | def_bool y | 147 | def_bool y |
159 | 148 | ||
@@ -553,6 +542,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT | |||
553 | config AMD_IOMMU | 542 | config AMD_IOMMU |
554 | bool "AMD IOMMU support" | 543 | bool "AMD IOMMU support" |
555 | select SWIOTLB | 544 | select SWIOTLB |
545 | select PCI_MSI | ||
556 | depends on X86_64 && PCI && ACPI | 546 | depends on X86_64 && PCI && ACPI |
557 | help | 547 | help |
558 | With this option you can enable support for AMD IOMMU hardware in | 548 | With this option you can enable support for AMD IOMMU hardware in |
@@ -758,9 +748,8 @@ config I8K | |||
758 | Say N otherwise. | 748 | Say N otherwise. |
759 | 749 | ||
760 | config X86_REBOOTFIXUPS | 750 | config X86_REBOOTFIXUPS |
761 | def_bool n | 751 | bool "Enable X86 board specific fixups for reboot" |
762 | prompt "Enable X86 board specific fixups for reboot" | 752 | depends on X86_32 |
763 | depends on X86_32 && X86 | ||
764 | ---help--- | 753 | ---help--- |
765 | This enables chipset and/or board specific fixups to be done | 754 | This enables chipset and/or board specific fixups to be done |
766 | in order to get reboot to work correctly. This is only needed on | 755 | in order to get reboot to work correctly. This is only needed on |
@@ -776,23 +765,45 @@ config X86_REBOOTFIXUPS | |||
776 | Say N otherwise. | 765 | Say N otherwise. |
777 | 766 | ||
778 | config MICROCODE | 767 | config MICROCODE |
779 | tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" | 768 | tristate "/dev/cpu/microcode - microcode support" |
780 | select FW_LOADER | 769 | select FW_LOADER |
781 | ---help--- | 770 | ---help--- |
782 | If you say Y here, you will be able to update the microcode on | 771 | If you say Y here, you will be able to update the microcode on |
783 | Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, | 772 | certain Intel and AMD processors. The Intel support is for the |
784 | Pentium III, Pentium 4, Xeon etc. You will obviously need the | 773 | IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, |
785 | actual microcode binary data itself which is not shipped with the | 774 | Pentium 4, Xeon etc. The AMD support is for family 0x10 and |
786 | Linux kernel. | 775 | 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. |
776 | You will obviously need the actual microcode binary data itself | ||
777 | which is not shipped with the Linux kernel. | ||
787 | 778 | ||
788 | For latest news and information on obtaining all the required | 779 | This option selects the general module only, you need to select |
789 | ingredients for this driver, check: | 780 | at least one vendor specific module as well. |
790 | <http://www.urbanmyth.org/microcode/>. | ||
791 | 781 | ||
792 | To compile this driver as a module, choose M here: the | 782 | To compile this driver as a module, choose M here: the |
793 | module will be called microcode. | 783 | module will be called microcode. |
794 | 784 | ||
795 | config MICROCODE_OLD_INTERFACE | 785 | config MICROCODE_INTEL |
786 | bool "Intel microcode patch loading support" | ||
787 | depends on MICROCODE | ||
788 | default MICROCODE | ||
789 | select FW_LOADER | ||
790 | ---help--- | ||
791 | This option enables microcode patch loading support for Intel | ||
792 | processors. | ||
793 | |||
794 | For latest news and information on obtaining all the required | ||
795 | Intel ingredients for this driver, check: | ||
796 | <http://www.urbanmyth.org/microcode/>. | ||
797 | |||
798 | config MICROCODE_AMD | ||
799 | bool "AMD microcode patch loading support" | ||
800 | depends on MICROCODE | ||
801 | select FW_LOADER | ||
802 | ---help--- | ||
803 | If you select this option, microcode patch loading support for AMD | ||
804 | processors will be enabled. | ||
805 | |||
806 | config MICROCODE_OLD_INTERFACE | ||
796 | def_bool y | 807 | def_bool y |
797 | depends on MICROCODE | 808 | depends on MICROCODE |
798 | 809 | ||
@@ -922,16 +933,17 @@ config HIGHMEM | |||
922 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) | 933 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) |
923 | 934 | ||
924 | config X86_PAE | 935 | config X86_PAE |
925 | def_bool n | 936 | bool "PAE (Physical Address Extension) Support" |
926 | prompt "PAE (Physical Address Extension) Support" | ||
927 | depends on X86_32 && !HIGHMEM4G | 937 | depends on X86_32 && !HIGHMEM4G |
928 | select RESOURCES_64BIT | ||
929 | help | 938 | help |
930 | PAE is required for NX support, and furthermore enables | 939 | PAE is required for NX support, and furthermore enables |
931 | larger swapspace support for non-overcommit purposes. It | 940 | larger swapspace support for non-overcommit purposes. It |
932 | has the cost of more pagetable lookup overhead, and also | 941 | has the cost of more pagetable lookup overhead, and also |
933 | consumes more pagetable space per process. | 942 | consumes more pagetable space per process. |
934 | 943 | ||
944 | config ARCH_PHYS_ADDR_T_64BIT | ||
945 | def_bool X86_64 || X86_PAE | ||
946 | |||
935 | # Common NUMA Features | 947 | # Common NUMA Features |
936 | config NUMA | 948 | config NUMA |
937 | bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" | 949 | bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" |
@@ -1020,7 +1032,7 @@ config HAVE_ARCH_ALLOC_REMAP | |||
1020 | 1032 | ||
1021 | config ARCH_FLATMEM_ENABLE | 1033 | config ARCH_FLATMEM_ENABLE |
1022 | def_bool y | 1034 | def_bool y |
1023 | depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA | 1035 | depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA |
1024 | 1036 | ||
1025 | config ARCH_DISCONTIGMEM_ENABLE | 1037 | config ARCH_DISCONTIGMEM_ENABLE |
1026 | def_bool y | 1038 | def_bool y |
@@ -1036,7 +1048,7 @@ config ARCH_SPARSEMEM_DEFAULT | |||
1036 | 1048 | ||
1037 | config ARCH_SPARSEMEM_ENABLE | 1049 | config ARCH_SPARSEMEM_ENABLE |
1038 | def_bool y | 1050 | def_bool y |
1039 | depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) | 1051 | depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH |
1040 | select SPARSEMEM_STATIC if X86_32 | 1052 | select SPARSEMEM_STATIC if X86_32 |
1041 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 | 1053 | select SPARSEMEM_VMEMMAP_ENABLE if X86_64 |
1042 | 1054 | ||
@@ -1059,6 +1071,56 @@ config HIGHPTE | |||
1059 | low memory. Setting this option will put user-space page table | 1071 | low memory. Setting this option will put user-space page table |
1060 | entries in high memory. | 1072 | entries in high memory. |
1061 | 1073 | ||
1074 | config X86_CHECK_BIOS_CORRUPTION | ||
1075 | bool "Check for low memory corruption" | ||
1076 | help | ||
1077 | Periodically check for memory corruption in low memory, which | ||
1078 | is suspected to be caused by BIOS. Even when enabled in the | ||
1079 | configuration, it is disabled at runtime. Enable it by | ||
1080 | setting "memory_corruption_check=1" on the kernel command | ||
1081 | line. By default it scans the low 64k of memory every 60 | ||
1082 | seconds; see the memory_corruption_check_size and | ||
1083 | memory_corruption_check_period parameters in | ||
1084 | Documentation/kernel-parameters.txt to adjust this. | ||
1085 | |||
1086 | When enabled with the default parameters, this option has | ||
1087 | almost no overhead, as it reserves a relatively small amount | ||
1088 | of memory and scans it infrequently. It both detects corruption | ||
1089 | and prevents it from affecting the running system. | ||
1090 | |||
1091 | It is, however, intended as a diagnostic tool; if repeatable | ||
1092 | BIOS-originated corruption always affects the same memory, | ||
1093 | you can use memmap= to prevent the kernel from using that | ||
1094 | memory. | ||
1095 | |||
1096 | config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK | ||
1097 | bool "Set the default setting of memory_corruption_check" | ||
1098 | depends on X86_CHECK_BIOS_CORRUPTION | ||
1099 | default y | ||
1100 | help | ||
1101 | Set whether the default state of memory_corruption_check is | ||
1102 | on or off. | ||
1103 | |||
1104 | config X86_RESERVE_LOW_64K | ||
1105 | bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" | ||
1106 | default y | ||
1107 | help | ||
1108 | Reserve the first 64K of physical RAM on BIOSes that are known | ||
1109 | to potentially corrupt that memory range. A number of BIOSes are | ||
1110 | known to utilize this area during suspend/resume, so it must not | ||
1111 | be used by the kernel. | ||
1112 | |||
1113 | Set this to N if you are absolutely sure that you trust the BIOS | ||
1114 | to get all its memory reservations and usages right. | ||
1115 | |||
1116 | If you have doubts about the BIOS (e.g. suspend/resume does not | ||
1117 | work or there are kernel crashes after certain hardware hotplug | ||
1118 | events) and it's not AMI or Phoenix, then you might want to enable | ||
1119 | X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical | ||
1120 | corruption patterns. | ||
1121 | |||
1122 | Say Y if unsure. | ||
1123 | |||
1062 | config MATH_EMULATION | 1124 | config MATH_EMULATION |
1063 | bool | 1125 | bool |
1064 | prompt "Math emulation" if X86_32 | 1126 | prompt "Math emulation" if X86_32 |
@@ -1117,10 +1179,10 @@ config MTRR | |||
1117 | You can safely say Y even if your machine doesn't have MTRRs, you'll | 1179 | You can safely say Y even if your machine doesn't have MTRRs, you'll |
1118 | just add about 9 KB to your kernel. | 1180 | just add about 9 KB to your kernel. |
1119 | 1181 | ||
1120 | See <file:Documentation/mtrr.txt> for more information. | 1182 | See <file:Documentation/x86/mtrr.txt> for more information. |
1121 | 1183 | ||
1122 | config MTRR_SANITIZER | 1184 | config MTRR_SANITIZER |
1123 | bool | 1185 | def_bool y |
1124 | prompt "MTRR cleanup support" | 1186 | prompt "MTRR cleanup support" |
1125 | depends on MTRR | 1187 | depends on MTRR |
1126 | help | 1188 | help |
@@ -1131,7 +1193,7 @@ config MTRR_SANITIZER | |||
1131 | The largest mtrr entry size for a continuous block can be set with | 1193 | The largest mtrr entry size for a continuous block can be set with |
1132 | mtrr_chunk_size. | 1194 | mtrr_chunk_size. |
1133 | 1195 | ||
1134 | If unsure, say N. | 1196 | If unsure, say Y. |
1135 | 1197 | ||
1136 | config MTRR_SANITIZER_ENABLE_DEFAULT | 1198 | config MTRR_SANITIZER_ENABLE_DEFAULT |
1137 | int "MTRR cleanup enable value (0-1)" | 1199 | int "MTRR cleanup enable value (0-1)" |
@@ -1166,8 +1228,7 @@ config X86_PAT | |||
1166 | If unsure, say Y. | 1228 | If unsure, say Y. |
1167 | 1229 | ||
1168 | config EFI | 1230 | config EFI |
1169 | def_bool n | 1231 | bool "EFI runtime service support" |
1170 | prompt "EFI runtime service support" | ||
1171 | depends on ACPI | 1232 | depends on ACPI |
1172 | ---help--- | 1233 | ---help--- |
1173 | This enables the kernel to use EFI runtime services that are | 1234 | This enables the kernel to use EFI runtime services that are |
@@ -1191,7 +1252,6 @@ config IRQBALANCE | |||
1191 | config SECCOMP | 1252 | config SECCOMP |
1192 | def_bool y | 1253 | def_bool y |
1193 | prompt "Enable seccomp to safely compute untrusted bytecode" | 1254 | prompt "Enable seccomp to safely compute untrusted bytecode" |
1194 | depends on PROC_FS | ||
1195 | help | 1255 | help |
1196 | This kernel feature is useful for number crunching applications | 1256 | This kernel feature is useful for number crunching applications |
1197 | that may need to compute untrusted bytecode during their | 1257 | that may need to compute untrusted bytecode during their |
@@ -1199,7 +1259,7 @@ config SECCOMP | |||
1199 | the process as file descriptors supporting the read/write | 1259 | the process as file descriptors supporting the read/write |
1200 | syscalls, it's possible to isolate those applications in | 1260 | syscalls, it's possible to isolate those applications in |
1201 | their own address space using seccomp. Once seccomp is | 1261 | their own address space using seccomp. Once seccomp is |
1202 | enabled via /proc/<pid>/seccomp, it cannot be disabled | 1262 | enabled via prctl(PR_SET_SECCOMP), it cannot be disabled |
1203 | and the task is only allowed to execute a few safe syscalls | 1263 | and the task is only allowed to execute a few safe syscalls |
1204 | defined by each seccomp mode. | 1264 | defined by each seccomp mode. |
1205 | 1265 | ||
@@ -1356,14 +1416,14 @@ config PHYSICAL_ALIGN | |||
1356 | Don't change this unless you know what you are doing. | 1416 | Don't change this unless you know what you are doing. |
1357 | 1417 | ||
1358 | config HOTPLUG_CPU | 1418 | config HOTPLUG_CPU |
1359 | bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" | 1419 | bool "Support for hot-pluggable CPUs" |
1360 | depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER | 1420 | depends on SMP && HOTPLUG && !X86_VOYAGER |
1361 | ---help--- | 1421 | ---help--- |
1362 | Say Y here to experiment with turning CPUs off and on, and to | 1422 | Say Y here to allow turning CPUs off and on. CPUs can be |
1363 | enable suspend on SMP systems. CPUs can be controlled through | 1423 | controlled through /sys/devices/system/cpu. |
1364 | /sys/devices/system/cpu. | 1424 | ( Note: power management support will enable this option |
1365 | Say N if you want to disable CPU hotplug and don't need to | 1425 | automatically on SMP systems. ) |
1366 | suspend. | 1426 | Say N if you want to disable CPU hotplug. |
1367 | 1427 | ||
1368 | config COMPAT_VDSO | 1428 | config COMPAT_VDSO |
1369 | def_bool y | 1429 | def_bool y |
@@ -1378,6 +1438,51 @@ config COMPAT_VDSO | |||
1378 | 1438 | ||
1379 | If unsure, say Y. | 1439 | If unsure, say Y. |
1380 | 1440 | ||
1441 | config CMDLINE_BOOL | ||
1442 | bool "Built-in kernel command line" | ||
1443 | default n | ||
1444 | help | ||
1445 | Allow for specifying boot arguments to the kernel at | ||
1446 | build time. On some systems (e.g. embedded ones), it is | ||
1447 | necessary or convenient to provide some or all of the | ||
1448 | kernel boot arguments with the kernel itself (that is, | ||
1449 | to not rely on the boot loader to provide them.) | ||
1450 | |||
1451 | To compile command line arguments into the kernel, | ||
1452 | set this option to 'Y', then fill in the | ||
1453 | boot arguments in CONFIG_CMDLINE. | ||
1454 | |||
1455 | Systems with fully functional boot loaders (i.e. non-embedded) | ||
1456 | should leave this option set to 'N'. | ||
1457 | |||
1458 | config CMDLINE | ||
1459 | string "Built-in kernel command string" | ||
1460 | depends on CMDLINE_BOOL | ||
1461 | default "" | ||
1462 | help | ||
1463 | Enter arguments here that should be compiled into the kernel | ||
1464 | image and used at boot time. If the boot loader provides a | ||
1465 | command line at boot time, it is appended to this string to | ||
1466 | form the full kernel command line, when the system boots. | ||
1467 | |||
1468 | However, you can use the CONFIG_CMDLINE_OVERRIDE option to | ||
1469 | change this behavior. | ||
1470 | |||
1471 | In most cases, the command line (whether built-in or provided | ||
1472 | by the boot loader) should specify the device for the root | ||
1473 | file system. | ||
1474 | |||
1475 | config CMDLINE_OVERRIDE | ||
1476 | bool "Built-in command line overrides boot loader arguments" | ||
1477 | default n | ||
1478 | depends on CMDLINE_BOOL | ||
1479 | help | ||
1480 | Set this option to 'Y' to have the kernel ignore the boot loader | ||
1481 | command line, and use ONLY the built-in command line. | ||
1482 | |||
1483 | This is used to work around broken boot loaders. This should | ||
1484 | be set to 'N' under normal conditions. | ||
1485 | |||
1381 | endmenu | 1486 | endmenu |
1382 | 1487 | ||
1383 | config ARCH_ENABLE_MEMORY_HOTPLUG | 1488 | config ARCH_ENABLE_MEMORY_HOTPLUG |
@@ -1643,6 +1748,14 @@ config DMAR_FLOPPY_WA | |||
1643 | workaround will setup a 1:1 mapping for the first | 1748 | workaround will setup a 1:1 mapping for the first |
1644 | 16M to make floppy (an ISA device) work. | 1749 | 16M to make floppy (an ISA device) work. |
1645 | 1750 | ||
1751 | config INTR_REMAP | ||
1752 | bool "Support for Interrupt Remapping (EXPERIMENTAL)" | ||
1753 | depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL | ||
1754 | help | ||
1755 | Supports Interrupt remapping for IO-APIC and MSI devices. | ||
1756 | To use x2apic mode in CPUs which support x2APIC enhancements or | ||
1757 | to support platforms with CPUs having > 8 bit APIC ID, say Y. | ||
1758 | |||
1646 | source "drivers/pci/pcie/Kconfig" | 1759 | source "drivers/pci/pcie/Kconfig" |
1647 | 1760 | ||
1648 | source "drivers/pci/Kconfig" | 1761 | source "drivers/pci/Kconfig" |
@@ -1759,7 +1872,7 @@ config IA32_EMULATION | |||
1759 | 1872 | ||
1760 | config IA32_AOUT | 1873 | config IA32_AOUT |
1761 | tristate "IA32 a.out support" | 1874 | tristate "IA32 a.out support" |
1762 | depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT | 1875 | depends on IA32_EMULATION |
1763 | help | 1876 | help |
1764 | Support old a.out binaries in the 32bit emulation. | 1877 | Support old a.out binaries in the 32bit emulation. |
1765 | 1878 | ||
@@ -1773,7 +1886,7 @@ config COMPAT_FOR_U64_ALIGNMENT | |||
1773 | 1886 | ||
1774 | config SYSVIPC_COMPAT | 1887 | config SYSVIPC_COMPAT |
1775 | def_bool y | 1888 | def_bool y |
1776 | depends on X86_64 && COMPAT && SYSVIPC | 1889 | depends on COMPAT && SYSVIPC |
1777 | 1890 | ||
1778 | endmenu | 1891 | endmenu |
1779 | 1892 | ||
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2c518fbc52ec..0b7c4a3f0651 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu | |||
@@ -38,8 +38,7 @@ config M386 | |||
38 | - "Crusoe" for the Transmeta Crusoe series. | 38 | - "Crusoe" for the Transmeta Crusoe series. |
39 | - "Efficeon" for the Transmeta Efficeon series. | 39 | - "Efficeon" for the Transmeta Efficeon series. |
40 | - "Winchip-C6" for original IDT Winchip. | 40 | - "Winchip-C6" for original IDT Winchip. |
41 | - "Winchip-2" for IDT Winchip 2. | 41 | - "Winchip-2" for IDT Winchips with 3dNow! capabilities. |
42 | - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. | ||
43 | - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). | 42 | - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). |
44 | - "Geode GX/LX" For AMD Geode GX and LX processors. | 43 | - "Geode GX/LX" For AMD Geode GX and LX processors. |
45 | - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. | 44 | - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. |
@@ -194,19 +193,11 @@ config MWINCHIPC6 | |||
194 | treat this chip as a 586TSC with some extended instructions | 193 | treat this chip as a 586TSC with some extended instructions |
195 | and alignment requirements. | 194 | and alignment requirements. |
196 | 195 | ||
197 | config MWINCHIP2 | ||
198 | bool "Winchip-2" | ||
199 | depends on X86_32 | ||
200 | help | ||
201 | Select this for an IDT Winchip-2. Linux and GCC | ||
202 | treat this chip as a 586TSC with some extended instructions | ||
203 | and alignment requirements. | ||
204 | |||
205 | config MWINCHIP3D | 196 | config MWINCHIP3D |
206 | bool "Winchip-2A/Winchip-3" | 197 | bool "Winchip-2/Winchip-2A/Winchip-3" |
207 | depends on X86_32 | 198 | depends on X86_32 |
208 | help | 199 | help |
209 | Select this for an IDT Winchip-2A or 3. Linux and GCC | 200 | Select this for an IDT Winchip-2, 2A or 3. Linux and GCC |
210 | treat this chip as a 586TSC with some extended instructions | 201 | treat this chip as a 586TSC with some extended instructions |
211 | and alignment requirements. Also enable out of order memory | 202 | and alignment requirements. Also enable out of order memory |
212 | stores for this CPU, which can increase performance of some | 203 | stores for this CPU, which can increase performance of some |
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT | |||
318 | int | 309 | int |
319 | default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC | 310 | default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC |
320 | default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 | 311 | default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 |
321 | default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX | 312 | default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX |
322 | default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 | 313 | default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 |
323 | 314 | ||
324 | config X86_XADD | 315 | config X86_XADD |
@@ -360,7 +351,7 @@ config X86_POPAD_OK | |||
360 | 351 | ||
361 | config X86_ALIGNMENT_16 | 352 | config X86_ALIGNMENT_16 |
362 | def_bool y | 353 | def_bool y |
363 | depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 | 354 | depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 |
364 | 355 | ||
365 | config X86_INTEL_USERCOPY | 356 | config X86_INTEL_USERCOPY |
366 | def_bool y | 357 | def_bool y |
@@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY | |||
368 | 359 | ||
369 | config X86_USE_PPRO_CHECKSUM | 360 | config X86_USE_PPRO_CHECKSUM |
370 | def_bool y | 361 | def_bool y |
371 | depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 | 362 | depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 |
372 | 363 | ||
373 | config X86_USE_3DNOW | 364 | config X86_USE_3DNOW |
374 | def_bool y | 365 | def_bool y |
@@ -376,24 +367,27 @@ config X86_USE_3DNOW | |||
376 | 367 | ||
377 | config X86_OOSTORE | 368 | config X86_OOSTORE |
378 | def_bool y | 369 | def_bool y |
379 | depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR | 370 | depends on (MWINCHIP3D || MWINCHIPC6) && MTRR |
380 | 371 | ||
381 | # | 372 | # |
382 | # P6_NOPs are a relatively minor optimization that require a family >= | 373 | # P6_NOPs are a relatively minor optimization that require a family >= |
383 | # 6 processor, except that it is broken on certain VIA chips. | 374 | # 6 processor, except that it is broken on certain VIA chips. |
384 | # Furthermore, AMD chips prefer a totally different sequence of NOPs | 375 | # Furthermore, AMD chips prefer a totally different sequence of NOPs |
385 | # (which work on all CPUs). As a result, disallow these if we're | 376 | # (which work on all CPUs). In addition, it looks like Virtual PC |
386 | # compiling X86_GENERIC but not X86_64 (these NOPs do work on all | 377 | # does not understand them. |
387 | # x86-64 capable chips); the list of processors in the right-hand clause | 378 | # |
388 | # are the cores that benefit from this optimization. | 379 | # As a result, disallow these if we're not compiling for X86_64 (these |
380 | # NOPs do work on all x86-64 capable chips); the list of processors in | ||
381 | # the right-hand clause are the cores that benefit from this optimization. | ||
389 | # | 382 | # |
390 | config X86_P6_NOP | 383 | config X86_P6_NOP |
391 | def_bool y | 384 | def_bool y |
392 | depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) | 385 | depends on X86_64 |
386 | depends on (MCORE2 || MPENTIUM4 || MPSC) | ||
393 | 387 | ||
394 | config X86_TSC | 388 | config X86_TSC |
395 | def_bool y | 389 | def_bool y |
396 | depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 | 390 | depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 |
397 | 391 | ||
398 | config X86_CMPXCHG64 | 392 | config X86_CMPXCHG64 |
399 | def_bool y | 393 | def_bool y |
@@ -403,7 +397,7 @@ config X86_CMPXCHG64 | |||
403 | # generates cmov. | 397 | # generates cmov. |
404 | config X86_CMOV | 398 | config X86_CMOV |
405 | def_bool y | 399 | def_bool y |
406 | depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) | 400 | depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) |
407 | 401 | ||
408 | config X86_MINIMUM_CPU_FAMILY | 402 | config X86_MINIMUM_CPU_FAMILY |
409 | int | 403 | int |
@@ -414,4 +408,124 @@ config X86_MINIMUM_CPU_FAMILY | |||
414 | 408 | ||
415 | config X86_DEBUGCTLMSR | 409 | config X86_DEBUGCTLMSR |
416 | def_bool y | 410 | def_bool y |
417 | depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) | 411 | depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) |
412 | |||
413 | menuconfig PROCESSOR_SELECT | ||
414 | bool "Supported processor vendors" if EMBEDDED | ||
415 | help | ||
416 | This lets you choose what x86 vendor support code your kernel | ||
417 | will include. | ||
418 | |||
419 | config CPU_SUP_INTEL | ||
420 | default y | ||
421 | bool "Support Intel processors" if PROCESSOR_SELECT | ||
422 | help | ||
423 | This enables detection, tunings and quirks for Intel processors | ||
424 | |||
425 | You need this enabled if you want your kernel to run on an | ||
426 | Intel CPU. Disabling this option on other types of CPUs | ||
427 | makes the kernel a tiny bit smaller. Disabling it on an Intel | ||
428 | CPU might render the kernel unbootable. | ||
429 | |||
430 | If unsure, say N. | ||
431 | |||
432 | config CPU_SUP_CYRIX_32 | ||
433 | default y | ||
434 | bool "Support Cyrix processors" if PROCESSOR_SELECT | ||
435 | depends on !64BIT | ||
436 | help | ||
437 | This enables detection, tunings and quirks for Cyrix processors | ||
438 | |||
439 | You need this enabled if you want your kernel to run on a | ||
440 | Cyrix CPU. Disabling this option on other types of CPUs | ||
441 | makes the kernel a tiny bit smaller. Disabling it on a Cyrix | ||
442 | CPU might render the kernel unbootable. | ||
443 | |||
444 | If unsure, say N. | ||
445 | |||
446 | config CPU_SUP_AMD | ||
447 | default y | ||
448 | bool "Support AMD processors" if PROCESSOR_SELECT | ||
449 | help | ||
450 | This enables detection, tunings and quirks for AMD processors | ||
451 | |||
452 | You need this enabled if you want your kernel to run on an | ||
453 | AMD CPU. Disabling this option on other types of CPUs | ||
454 | makes the kernel a tiny bit smaller. Disabling it on an AMD | ||
455 | CPU might render the kernel unbootable. | ||
456 | |||
457 | If unsure, say N. | ||
458 | |||
459 | config CPU_SUP_CENTAUR_32 | ||
460 | default y | ||
461 | bool "Support Centaur processors" if PROCESSOR_SELECT | ||
462 | depends on !64BIT | ||
463 | help | ||
464 | This enables detection, tunings and quirks for Centaur processors | ||
465 | |||
466 | You need this enabled if you want your kernel to run on a | ||
467 | Centaur CPU. Disabling this option on other types of CPUs | ||
468 | makes the kernel a tiny bit smaller. Disabling it on a Centaur | ||
469 | CPU might render the kernel unbootable. | ||
470 | |||
471 | If unsure, say N. | ||
472 | |||
473 | config CPU_SUP_CENTAUR_64 | ||
474 | default y | ||
475 | bool "Support Centaur processors" if PROCESSOR_SELECT | ||
476 | depends on 64BIT | ||
477 | help | ||
478 | This enables detection, tunings and quirks for Centaur processors | ||
479 | |||
480 | You need this enabled if you want your kernel to run on a | ||
481 | Centaur CPU. Disabling this option on other types of CPUs | ||
482 | makes the kernel a tiny bit smaller. Disabling it on a Centaur | ||
483 | CPU might render the kernel unbootable. | ||
484 | |||
485 | If unsure, say N. | ||
486 | |||
487 | config CPU_SUP_TRANSMETA_32 | ||
488 | default y | ||
489 | bool "Support Transmeta processors" if PROCESSOR_SELECT | ||
490 | depends on !64BIT | ||
491 | help | ||
492 | This enables detection, tunings and quirks for Transmeta processors | ||
493 | |||
494 | You need this enabled if you want your kernel to run on a | ||
495 | Transmeta CPU. Disabling this option on other types of CPUs | ||
496 | makes the kernel a tiny bit smaller. Disabling it on a Transmeta | ||
497 | CPU might render the kernel unbootable. | ||
498 | |||
499 | If unsure, say N. | ||
500 | |||
501 | config CPU_SUP_UMC_32 | ||
502 | default y | ||
503 | bool "Support UMC processors" if PROCESSOR_SELECT | ||
504 | depends on !64BIT | ||
505 | help | ||
506 | This enables detection, tunings and quirks for UMC processors | ||
507 | |||
508 | You need this enabled if you want your kernel to run on a | ||
509 | UMC CPU. Disabling this option on other types of CPUs | ||
510 | makes the kernel a tiny bit smaller. Disabling it on a UMC | ||
511 | CPU might render the kernel unbootable. | ||
512 | |||
513 | If unsure, say N. | ||
514 | |||
515 | config X86_DS | ||
516 | bool "Debug Store support" | ||
517 | default y | ||
518 | help | ||
519 | Add support for Debug Store. | ||
520 | This allows the kernel to provide a memory buffer to the hardware | ||
521 | to store various profiling and tracing events. | ||
522 | |||
523 | config X86_PTRACE_BTS | ||
524 | bool "ptrace interface to Branch Trace Store" | ||
525 | default y | ||
526 | depends on (X86_DS && X86_DEBUGCTLMSR) | ||
527 | help | ||
528 | Add a ptrace interface to allow collecting an execution trace | ||
529 | of the traced task. | ||
530 | This collects control flow changes in a (cyclic) buffer and allows | ||
531 | debuggers to fill in the gaps and show an execution trace of the debuggee. | ||
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 092f019e033a..2a3dfbd5e677 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug | |||
@@ -43,6 +43,19 @@ config EARLY_PRINTK | |||
43 | with klogd/syslogd or the X server. You should normally say N here, | 43 | with klogd/syslogd or the X server. You should normally say N here, |
44 | unless you want to debug such a crash. | 44 | unless you want to debug such a crash. |
45 | 45 | ||
46 | config EARLY_PRINTK_DBGP | ||
47 | bool "Early printk via EHCI debug port" | ||
48 | default n | ||
49 | depends on EARLY_PRINTK && PCI | ||
50 | help | ||
51 | Write kernel log output directly into the EHCI debug port. | ||
52 | |||
53 | This is useful for kernel debugging when your machine crashes very | ||
54 | early before the console code is initialized. For normal operation | ||
55 | it is not recommended because it looks ugly and doesn't cooperate | ||
56 | with klogd/syslogd or the X server. You should normally say N here, | ||
57 | unless you want to debug such a crash. You need a USB debug device. | ||
58 | |||
46 | config DEBUG_STACKOVERFLOW | 59 | config DEBUG_STACKOVERFLOW |
47 | bool "Check for stack overflows" | 60 | bool "Check for stack overflows" |
48 | depends on DEBUG_KERNEL | 61 | depends on DEBUG_KERNEL |
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index e372b584e919..80177ec052f0 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu | |||
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) | |||
28 | cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 | 28 | cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 |
29 | cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 | 29 | cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 |
30 | cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) | 30 | cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) |
31 | cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) | ||
32 | cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) | 31 | cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) |
33 | cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 | 32 | cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 |
34 | cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) | 33 | cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) |
@@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx | |||
45 | # cpu entries | 44 | # cpu entries |
46 | cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) | 45 | cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) |
47 | 46 | ||
47 | # Bug fix for binutils: this option is required in order to keep | ||
48 | # binutils from generating NOPL instructions against our will. | ||
49 | ifneq ($(CONFIG_X86_P6_NOP),y) | ||
50 | cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,) | ||
51 | endif | ||
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 7ee102f9c4f8..cd48c7210016 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile | |||
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ | |||
72 | KBUILD_CFLAGS += $(call cc-option,-m32) | 72 | KBUILD_CFLAGS += $(call cc-option,-m32) |
73 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ | 73 | KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ |
74 | 74 | ||
75 | $(obj)/zImage: IMAGE_OFFSET := 0x1000 | ||
76 | $(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) | 75 | $(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) |
77 | $(obj)/bzImage: IMAGE_OFFSET := 0x100000 | ||
78 | $(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ | 76 | $(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ |
79 | $(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ | 77 | $(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ |
80 | $(obj)/bzImage: BUILDFLAGS := -b | 78 | $(obj)/bzImage: BUILDFLAGS := -b |
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE | |||
117 | $(call if_changed,objcopy) | 115 | $(call if_changed,objcopy) |
118 | 116 | ||
119 | $(obj)/compressed/vmlinux: FORCE | 117 | $(obj)/compressed/vmlinux: FORCE |
120 | $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ | 118 | $(Q)$(MAKE) $(build)=$(obj)/compressed $@ |
121 | 119 | ||
122 | # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel | 120 | # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel |
123 | FDARGS = | 121 | FDARGS = |
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE) | |||
181 | mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ | 179 | mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ |
182 | -no-emul-boot -boot-load-size 4 -boot-info-table \ | 180 | -no-emul-boot -boot-load-size 4 -boot-info-table \ |
183 | $(obj)/isoimage | 181 | $(obj)/isoimage |
182 | isohybrid $(obj)/image.iso 2>/dev/null || true | ||
184 | rm -rf $(obj)/isoimage | 183 | rm -rf $(obj)/isoimage |
185 | 184 | ||
186 | zlilo: $(BOOTIMAGE) | 185 | zlilo: $(BOOTIMAGE) |
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 92fdd35bd93e..1771c804e02f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile | |||
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE | |||
27 | $(call if_changed,objcopy) | 27 | $(call if_changed,objcopy) |
28 | 28 | ||
29 | 29 | ||
30 | ifeq ($(CONFIG_X86_32),y) | 30 | targets += vmlinux.bin.all vmlinux.relocs relocs |
31 | targets += vmlinux.bin.all vmlinux.relocs | 31 | hostprogs-$(CONFIG_X86_32) += relocs |
32 | hostprogs-y := relocs | ||
33 | 32 | ||
34 | quiet_cmd_relocs = RELOCS $@ | 33 | quiet_cmd_relocs = RELOCS $@ |
35 | cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< | 34 | cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< |
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@ | |||
43 | $(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE | 42 | $(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE |
44 | $(call if_changed,relocbin) | 43 | $(call if_changed,relocbin) |
45 | 44 | ||
45 | ifeq ($(CONFIG_X86_32),y) | ||
46 | |||
46 | ifdef CONFIG_RELOCATABLE | 47 | ifdef CONFIG_RELOCATABLE |
47 | $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE | 48 | $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE |
48 | $(call if_changed,gzip) | 49 | $(call if_changed,gzip) |
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE | |||
59 | LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T | 60 | LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T |
60 | endif | 61 | endif |
61 | 62 | ||
62 | |||
63 | $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE | 63 | $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE |
64 | $(call if_changed,ld) | 64 | $(call if_changed,ld) |
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index ba7736cf2ec7..29c5fbf08392 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S | |||
@@ -137,14 +137,15 @@ relocated: | |||
137 | */ | 137 | */ |
138 | movl output_len(%ebx), %eax | 138 | movl output_len(%ebx), %eax |
139 | pushl %eax | 139 | pushl %eax |
140 | # push arguments for decompress_kernel: | ||
140 | pushl %ebp # output address | 141 | pushl %ebp # output address |
141 | movl input_len(%ebx), %eax | 142 | movl input_len(%ebx), %eax |
142 | pushl %eax # input_len | 143 | pushl %eax # input_len |
143 | leal input_data(%ebx), %eax | 144 | leal input_data(%ebx), %eax |
144 | pushl %eax # input_data | 145 | pushl %eax # input_data |
145 | leal boot_heap(%ebx), %eax | 146 | leal boot_heap(%ebx), %eax |
146 | pushl %eax # heap area as third argument | 147 | pushl %eax # heap area |
147 | pushl %esi # real mode pointer as second arg | 148 | pushl %esi # real mode pointer |
148 | call decompress_kernel | 149 | call decompress_kernel |
149 | addl $20, %esp | 150 | addl $20, %esp |
150 | popl %ecx | 151 | popl %ecx |
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 9fea73706479..5780d361105b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c | |||
@@ -16,7 +16,7 @@ | |||
16 | */ | 16 | */ |
17 | #undef CONFIG_PARAVIRT | 17 | #undef CONFIG_PARAVIRT |
18 | #ifdef CONFIG_X86_32 | 18 | #ifdef CONFIG_X86_32 |
19 | #define _ASM_DESC_H_ 1 | 19 | #define ASM_X86__DESC_H 1 |
20 | #endif | 20 | #endif |
21 | 21 | ||
22 | #ifdef CONFIG_X86_64 | 22 | #ifdef CONFIG_X86_64 |
@@ -27,7 +27,7 @@ | |||
27 | #include <linux/linkage.h> | 27 | #include <linux/linkage.h> |
28 | #include <linux/screen_info.h> | 28 | #include <linux/screen_info.h> |
29 | #include <linux/elf.h> | 29 | #include <linux/elf.h> |
30 | #include <asm/io.h> | 30 | #include <linux/io.h> |
31 | #include <asm/page.h> | 31 | #include <asm/page.h> |
32 | #include <asm/boot.h> | 32 | #include <asm/boot.h> |
33 | #include <asm/bootparam.h> | 33 | #include <asm/bootparam.h> |
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s) | |||
251 | y--; | 251 | y--; |
252 | } | 252 | } |
253 | } else { | 253 | } else { |
254 | vidmem [(x + cols * y) * 2] = c; | 254 | vidmem[(x + cols * y) * 2] = c; |
255 | if (++x >= cols) { | 255 | if (++x >= cols) { |
256 | x = 0; | 256 | x = 0; |
257 | if (++y >= lines) { | 257 | if (++y >= lines) { |
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n) | |||
277 | int i; | 277 | int i; |
278 | char *ss = s; | 278 | char *ss = s; |
279 | 279 | ||
280 | for (i = 0; i < n; i++) ss[i] = c; | 280 | for (i = 0; i < n; i++) |
281 | ss[i] = c; | ||
281 | return s; | 282 | return s; |
282 | } | 283 | } |
283 | 284 | ||
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n) | |||
287 | const char *s = src; | 288 | const char *s = src; |
288 | char *d = dest; | 289 | char *d = dest; |
289 | 290 | ||
290 | for (i = 0; i < n; i++) d[i] = s[i]; | 291 | for (i = 0; i < n; i++) |
292 | d[i] = s[i]; | ||
291 | return dest; | 293 | return dest; |
292 | } | 294 | } |
293 | 295 | ||
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index a1310c52fc0c..857e492c571e 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c | |||
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) | |||
492 | continue; | 492 | continue; |
493 | } | 493 | } |
494 | sh_symtab = sec_symtab->symtab; | 494 | sh_symtab = sec_symtab->symtab; |
495 | sym_strtab = sec->link->strtab; | 495 | sym_strtab = sec_symtab->link->strtab; |
496 | for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { | 496 | for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { |
497 | Elf32_Rel *rel; | 497 | Elf32_Rel *rel; |
498 | Elf32_Sym *sym; | 498 | Elf32_Sym *sym; |
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 75298fe2edca..6ec6bb6e9957 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c | |||
@@ -59,17 +59,18 @@ int validate_cpu(void) | |||
59 | u32 e = err_flags[i]; | 59 | u32 e = err_flags[i]; |
60 | 60 | ||
61 | for (j = 0; j < 32; j++) { | 61 | for (j = 0; j < 32; j++) { |
62 | int n = (i << 5)+j; | 62 | if (msg_strs[0] < i || |
63 | if (*msg_strs < n) { | 63 | (msg_strs[0] == i && msg_strs[1] < j)) { |
64 | /* Skip to the next string */ | 64 | /* Skip to the next string */ |
65 | do { | 65 | msg_strs += 2; |
66 | msg_strs++; | 66 | while (*msg_strs++) |
67 | } while (*msg_strs); | 67 | ; |
68 | msg_strs++; | ||
69 | } | 68 | } |
70 | if (e & 1) { | 69 | if (e & 1) { |
71 | if (*msg_strs == n && msg_strs[1]) | 70 | if (msg_strs[0] == i && |
72 | printf("%s ", msg_strs+1); | 71 | msg_strs[1] == j && |
72 | msg_strs[2]) | ||
73 | printf("%s ", msg_strs+2); | ||
73 | else | 74 | else |
74 | printf("%d:%d ", i, j); | 75 | printf("%d:%d ", i, j); |
75 | } | 76 | } |
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4b9ae7c56748..4d3ff037201f 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c | |||
@@ -38,12 +38,12 @@ static const u32 req_flags[NCAPINTS] = | |||
38 | { | 38 | { |
39 | REQUIRED_MASK0, | 39 | REQUIRED_MASK0, |
40 | REQUIRED_MASK1, | 40 | REQUIRED_MASK1, |
41 | REQUIRED_MASK2, | 41 | 0, /* REQUIRED_MASK2 not implemented in this file */ |
42 | REQUIRED_MASK3, | 42 | 0, /* REQUIRED_MASK3 not implemented in this file */ |
43 | REQUIRED_MASK4, | 43 | REQUIRED_MASK4, |
44 | REQUIRED_MASK5, | 44 | 0, /* REQUIRED_MASK5 not implemented in this file */ |
45 | REQUIRED_MASK6, | 45 | REQUIRED_MASK6, |
46 | REQUIRED_MASK7, | 46 | 0, /* REQUIRED_MASK7 not implemented in this file */ |
47 | }; | 47 | }; |
48 | 48 | ||
49 | #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) | 49 | #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) |
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index d93cbc6464d0..1aae8f3e5ca1 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c | |||
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) | |||
41 | char *mbrbuf_ptr, *mbrbuf_end; | 41 | char *mbrbuf_ptr, *mbrbuf_end; |
42 | u32 buf_base, mbr_base; | 42 | u32 buf_base, mbr_base; |
43 | extern char _end[]; | 43 | extern char _end[]; |
44 | u16 mbr_magic; | ||
44 | 45 | ||
45 | sector_size = ei->params.bytes_per_sector; | 46 | sector_size = ei->params.bytes_per_sector; |
46 | if (!sector_size) | 47 | if (!sector_size) |
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) | |||
58 | if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) | 59 | if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) |
59 | return -1; | 60 | return -1; |
60 | 61 | ||
62 | memset(mbrbuf_ptr, 0, sector_size); | ||
61 | if (read_mbr(devno, mbrbuf_ptr)) | 63 | if (read_mbr(devno, mbrbuf_ptr)) |
62 | return -1; | 64 | return -1; |
63 | 65 | ||
64 | *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; | 66 | *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; |
65 | return 0; | 67 | mbr_magic = *(u16 *)&mbrbuf_ptr[510]; |
68 | |||
69 | /* check for valid MBR magic */ | ||
70 | return mbr_magic == 0xAA55 ? 0 : -1; | ||
66 | } | 71 | } |
67 | 72 | ||
68 | static int get_edd_info(u8 devno, struct edd_info *ei) | 73 | static int get_edd_info(u8 devno, struct edd_info *ei) |
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index af86e431acfa..b993062e9a5f 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S | |||
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ | |||
30 | SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ | 30 | SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ |
31 | /* to be loaded */ | 31 | /* to be loaded */ |
32 | ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ | 32 | ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ |
33 | SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ | ||
34 | 33 | ||
35 | #ifndef SVGA_MODE | 34 | #ifndef SVGA_MODE |
36 | #define SVGA_MODE ASK_VGA | 35 | #define SVGA_MODE ASK_VGA |
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index bbe76953bae9..8ef60f20b371 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c | |||
@@ -15,33 +15,33 @@ | |||
15 | 15 | ||
16 | #include <stdio.h> | 16 | #include <stdio.h> |
17 | 17 | ||
18 | #include "../kernel/cpu/feature_names.c" | 18 | #include "../kernel/cpu/capflags.c" |
19 | |||
20 | #if NCAPFLAGS > 8 | ||
21 | # error "Need to adjust the boot code handling of CPUID strings" | ||
22 | #endif | ||
23 | 19 | ||
24 | int main(void) | 20 | int main(void) |
25 | { | 21 | { |
26 | int i; | 22 | int i, j; |
27 | const char *str; | 23 | const char *str; |
28 | 24 | ||
29 | printf("static const char x86_cap_strs[] = \n"); | 25 | printf("static const char x86_cap_strs[] = \n"); |
30 | 26 | ||
31 | for (i = 0; i < NCAPINTS*32; i++) { | 27 | for (i = 0; i < NCAPINTS; i++) { |
32 | str = x86_cap_flags[i]; | 28 | for (j = 0; j < 32; j++) { |
33 | 29 | str = x86_cap_flags[i*32+j]; | |
34 | if (i == NCAPINTS*32-1) { | 30 | |
35 | /* The last entry must be unconditional; this | 31 | if (i == NCAPINTS-1 && j == 31) { |
36 | also consumes the compiler-added null character */ | 32 | /* The last entry must be unconditional; this |
37 | if (!str) | 33 | also consumes the compiler-added null |
38 | str = ""; | 34 | character */ |
39 | printf("\t\"\\x%02x\"\"%s\"\n", i, str); | 35 | if (!str) |
40 | } else if (str) { | 36 | str = ""; |
41 | printf("#if REQUIRED_MASK%d & (1 << %d)\n" | 37 | printf("\t\"\\x%02x\\x%02x\"\"%s\"\n", |
42 | "\t\"\\x%02x\"\"%s\\0\"\n" | 38 | i, j, str); |
43 | "#endif\n", | 39 | } else if (str) { |
44 | i >> 5, i & 31, i, str); | 40 | printf("#if REQUIRED_MASK%d & (1 << %d)\n" |
41 | "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n" | ||
42 | "#endif\n", | ||
43 | i, j, i, j, str); | ||
44 | } | ||
45 | } | 45 | } |
46 | } | 46 | } |
47 | printf("\t;\n"); | 47 | printf("\t;\n"); |
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 401ad998ad08..99b3079dc6ab 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c | |||
@@ -88,14 +88,11 @@ static int vesa_probe(void) | |||
88 | (vminfo.memory_layout == 4 || | 88 | (vminfo.memory_layout == 4 || |
89 | vminfo.memory_layout == 6) && | 89 | vminfo.memory_layout == 6) && |
90 | vminfo.memory_planes == 1) { | 90 | vminfo.memory_planes == 1) { |
91 | #ifdef CONFIG_FB | 91 | #ifdef CONFIG_FB_BOOT_VESA_SUPPORT |
92 | /* Graphics mode, color, linear frame buffer | 92 | /* Graphics mode, color, linear frame buffer |
93 | supported. Only register the mode if | 93 | supported. Only register the mode if |
94 | if framebuffer is configured, however, | 94 | if framebuffer is configured, however, |
95 | otherwise the user will be left without a screen. | 95 | otherwise the user will be left without a screen. */ |
96 | We don't require CONFIG_FB_VESA, however, since | ||
97 | some of the other framebuffer drivers can use | ||
98 | this mode-setting, too. */ | ||
99 | mi = GET_HEAP(struct mode_info, 1); | 96 | mi = GET_HEAP(struct mode_info, 1); |
100 | mi->mode = mode + VIDEO_FIRST_VESA; | 97 | mi->mode = mode + VIDEO_FIRST_VESA; |
101 | mi->depth = vminfo.bpp; | 98 | mi->depth = vminfo.bpp; |
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode) | |||
133 | if ((vminfo.mode_attr & 0x15) == 0x05) { | 130 | if ((vminfo.mode_attr & 0x15) == 0x05) { |
134 | /* It's a supported text mode */ | 131 | /* It's a supported text mode */ |
135 | is_graphic = 0; | 132 | is_graphic = 0; |
133 | #ifdef CONFIG_FB_BOOT_VESA_SUPPORT | ||
136 | } else if ((vminfo.mode_attr & 0x99) == 0x99) { | 134 | } else if ((vminfo.mode_attr & 0x99) == 0x99) { |
137 | /* It's a graphics mode with linear frame buffer */ | 135 | /* It's a graphics mode with linear frame buffer */ |
138 | is_graphic = 1; | 136 | is_graphic = 1; |
139 | vesa_mode |= 0x4000; /* Request linear frame buffer */ | 137 | vesa_mode |= 0x4000; /* Request linear frame buffer */ |
138 | #endif | ||
140 | } else { | 139 | } else { |
141 | return -1; /* Invalid mode */ | 140 | return -1; /* Invalid mode */ |
142 | } | 141 | } |
@@ -224,7 +223,7 @@ static void vesa_store_pm_info(void) | |||
224 | static void vesa_store_mode_params_graphics(void) | 223 | static void vesa_store_mode_params_graphics(void) |
225 | { | 224 | { |
226 | /* Tell the kernel we're in VESA graphics mode */ | 225 | /* Tell the kernel we're in VESA graphics mode */ |
227 | boot_params.screen_info.orig_video_isVGA = 0x23; | 226 | boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; |
228 | 227 | ||
229 | /* Mode parameters */ | 228 | /* Mode parameters */ |
230 | boot_params.screen_info.vesa_attributes = vminfo.mode_attr; | 229 | boot_params.screen_info.vesa_attributes = vminfo.mode_attr; |
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 104275e191a8..52d0359719d7 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig | |||
@@ -1,7 +1,7 @@ | |||
1 | # | 1 | # |
2 | # Automatically generated make config: don't edit | 2 | # Automatically generated make config: don't edit |
3 | # Linux kernel version: 2.6.27-rc4 | 3 | # Linux kernel version: 2.6.27-rc5 |
4 | # Mon Aug 25 15:04:00 2008 | 4 | # Wed Sep 3 17:23:09 2008 |
5 | # | 5 | # |
6 | # CONFIG_64BIT is not set | 6 | # CONFIG_64BIT is not set |
7 | CONFIG_X86_32=y | 7 | CONFIG_X86_32=y |
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y | |||
202 | # CONFIG_M586 is not set | 202 | # CONFIG_M586 is not set |
203 | # CONFIG_M586TSC is not set | 203 | # CONFIG_M586TSC is not set |
204 | # CONFIG_M586MMX is not set | 204 | # CONFIG_M586MMX is not set |
205 | # CONFIG_M686 is not set | 205 | CONFIG_M686=y |
206 | # CONFIG_MPENTIUMII is not set | 206 | # CONFIG_MPENTIUMII is not set |
207 | # CONFIG_MPENTIUMIII is not set | 207 | # CONFIG_MPENTIUMIII is not set |
208 | # CONFIG_MPENTIUMM is not set | 208 | # CONFIG_MPENTIUMM is not set |
@@ -213,7 +213,6 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y | |||
213 | # CONFIG_MCRUSOE is not set | 213 | # CONFIG_MCRUSOE is not set |
214 | # CONFIG_MEFFICEON is not set | 214 | # CONFIG_MEFFICEON is not set |
215 | # CONFIG_MWINCHIPC6 is not set | 215 | # CONFIG_MWINCHIPC6 is not set |
216 | # CONFIG_MWINCHIP2 is not set | ||
217 | # CONFIG_MWINCHIP3D is not set | 216 | # CONFIG_MWINCHIP3D is not set |
218 | # CONFIG_MGEODEGX1 is not set | 217 | # CONFIG_MGEODEGX1 is not set |
219 | # CONFIG_MGEODE_LX is not set | 218 | # CONFIG_MGEODE_LX is not set |
@@ -221,13 +220,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y | |||
221 | # CONFIG_MVIAC3_2 is not set | 220 | # CONFIG_MVIAC3_2 is not set |
222 | # CONFIG_MVIAC7 is not set | 221 | # CONFIG_MVIAC7 is not set |
223 | # CONFIG_MPSC is not set | 222 | # CONFIG_MPSC is not set |
224 | CONFIG_MCORE2=y | 223 | # CONFIG_MCORE2 is not set |
225 | # CONFIG_GENERIC_CPU is not set | 224 | # CONFIG_GENERIC_CPU is not set |
226 | CONFIG_X86_GENERIC=y | 225 | CONFIG_X86_GENERIC=y |
227 | CONFIG_X86_CPU=y | 226 | CONFIG_X86_CPU=y |
228 | CONFIG_X86_CMPXCHG=y | 227 | CONFIG_X86_CMPXCHG=y |
229 | CONFIG_X86_L1_CACHE_SHIFT=7 | 228 | CONFIG_X86_L1_CACHE_SHIFT=7 |
230 | CONFIG_X86_XADD=y | 229 | CONFIG_X86_XADD=y |
230 | # CONFIG_X86_PPRO_FENCE is not set | ||
231 | CONFIG_X86_WP_WORKS_OK=y | 231 | CONFIG_X86_WP_WORKS_OK=y |
232 | CONFIG_X86_INVLPG=y | 232 | CONFIG_X86_INVLPG=y |
233 | CONFIG_X86_BSWAP=y | 233 | CONFIG_X86_BSWAP=y |
@@ -235,14 +235,15 @@ CONFIG_X86_POPAD_OK=y | |||
235 | CONFIG_X86_INTEL_USERCOPY=y | 235 | CONFIG_X86_INTEL_USERCOPY=y |
236 | CONFIG_X86_USE_PPRO_CHECKSUM=y | 236 | CONFIG_X86_USE_PPRO_CHECKSUM=y |
237 | CONFIG_X86_TSC=y | 237 | CONFIG_X86_TSC=y |
238 | CONFIG_X86_CMOV=y | ||
238 | CONFIG_X86_MINIMUM_CPU_FAMILY=4 | 239 | CONFIG_X86_MINIMUM_CPU_FAMILY=4 |
239 | CONFIG_X86_DEBUGCTLMSR=y | 240 | CONFIG_X86_DEBUGCTLMSR=y |
240 | CONFIG_HPET_TIMER=y | 241 | CONFIG_HPET_TIMER=y |
241 | CONFIG_HPET_EMULATE_RTC=y | 242 | CONFIG_HPET_EMULATE_RTC=y |
242 | CONFIG_DMI=y | 243 | CONFIG_DMI=y |
243 | # CONFIG_IOMMU_HELPER is not set | 244 | # CONFIG_IOMMU_HELPER is not set |
244 | CONFIG_NR_CPUS=4 | 245 | CONFIG_NR_CPUS=64 |
245 | # CONFIG_SCHED_SMT is not set | 246 | CONFIG_SCHED_SMT=y |
246 | CONFIG_SCHED_MC=y | 247 | CONFIG_SCHED_MC=y |
247 | # CONFIG_PREEMPT_NONE is not set | 248 | # CONFIG_PREEMPT_NONE is not set |
248 | CONFIG_PREEMPT_VOLUNTARY=y | 249 | CONFIG_PREEMPT_VOLUNTARY=y |
@@ -254,7 +255,8 @@ CONFIG_VM86=y | |||
254 | # CONFIG_TOSHIBA is not set | 255 | # CONFIG_TOSHIBA is not set |
255 | # CONFIG_I8K is not set | 256 | # CONFIG_I8K is not set |
256 | CONFIG_X86_REBOOTFIXUPS=y | 257 | CONFIG_X86_REBOOTFIXUPS=y |
257 | # CONFIG_MICROCODE is not set | 258 | CONFIG_MICROCODE=y |
259 | CONFIG_MICROCODE_OLD_INTERFACE=y | ||
258 | CONFIG_X86_MSR=y | 260 | CONFIG_X86_MSR=y |
259 | CONFIG_X86_CPUID=y | 261 | CONFIG_X86_CPUID=y |
260 | # CONFIG_NOHIGHMEM is not set | 262 | # CONFIG_NOHIGHMEM is not set |
@@ -1532,7 +1534,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y | |||
1532 | CONFIG_VGA_CONSOLE=y | 1534 | CONFIG_VGA_CONSOLE=y |
1533 | CONFIG_VGACON_SOFT_SCROLLBACK=y | 1535 | CONFIG_VGACON_SOFT_SCROLLBACK=y |
1534 | CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 | 1536 | CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 |
1535 | CONFIG_VIDEO_SELECT=y | ||
1536 | CONFIG_DUMMY_CONSOLE=y | 1537 | CONFIG_DUMMY_CONSOLE=y |
1537 | # CONFIG_FRAMEBUFFER_CONSOLE is not set | 1538 | # CONFIG_FRAMEBUFFER_CONSOLE is not set |
1538 | CONFIG_LOGO=y | 1539 | CONFIG_LOGO=y |
@@ -2115,7 +2116,7 @@ CONFIG_IO_DELAY_0X80=y | |||
2115 | CONFIG_DEFAULT_IO_DELAY_TYPE=0 | 2116 | CONFIG_DEFAULT_IO_DELAY_TYPE=0 |
2116 | CONFIG_DEBUG_BOOT_PARAMS=y | 2117 | CONFIG_DEBUG_BOOT_PARAMS=y |
2117 | # CONFIG_CPA_DEBUG is not set | 2118 | # CONFIG_CPA_DEBUG is not set |
2118 | # CONFIG_OPTIMIZE_INLINING is not set | 2119 | CONFIG_OPTIMIZE_INLINING=y |
2119 | 2120 | ||
2120 | # | 2121 | # |
2121 | # Security options | 2122 | # Security options |
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 678c8acefe04..f0a03d7a7d63 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig | |||
@@ -1,7 +1,7 @@ | |||
1 | # | 1 | # |
2 | # Automatically generated make config: don't edit | 2 | # Automatically generated make config: don't edit |
3 | # Linux kernel version: 2.6.27-rc4 | 3 | # Linux kernel version: 2.6.27-rc5 |
4 | # Mon Aug 25 14:40:46 2008 | 4 | # Wed Sep 3 17:13:39 2008 |
5 | # | 5 | # |
6 | CONFIG_64BIT=y | 6 | CONFIG_64BIT=y |
7 | # CONFIG_X86_32 is not set | 7 | # CONFIG_X86_32 is not set |
@@ -210,7 +210,6 @@ CONFIG_X86_PC=y | |||
210 | # CONFIG_MCRUSOE is not set | 210 | # CONFIG_MCRUSOE is not set |
211 | # CONFIG_MEFFICEON is not set | 211 | # CONFIG_MEFFICEON is not set |
212 | # CONFIG_MWINCHIPC6 is not set | 212 | # CONFIG_MWINCHIPC6 is not set |
213 | # CONFIG_MWINCHIP2 is not set | ||
214 | # CONFIG_MWINCHIP3D is not set | 213 | # CONFIG_MWINCHIP3D is not set |
215 | # CONFIG_MGEODEGX1 is not set | 214 | # CONFIG_MGEODEGX1 is not set |
216 | # CONFIG_MGEODE_LX is not set | 215 | # CONFIG_MGEODE_LX is not set |
@@ -218,17 +217,14 @@ CONFIG_X86_PC=y | |||
218 | # CONFIG_MVIAC3_2 is not set | 217 | # CONFIG_MVIAC3_2 is not set |
219 | # CONFIG_MVIAC7 is not set | 218 | # CONFIG_MVIAC7 is not set |
220 | # CONFIG_MPSC is not set | 219 | # CONFIG_MPSC is not set |
221 | CONFIG_MCORE2=y | 220 | # CONFIG_MCORE2 is not set |
222 | # CONFIG_GENERIC_CPU is not set | 221 | CONFIG_GENERIC_CPU=y |
223 | CONFIG_X86_CPU=y | 222 | CONFIG_X86_CPU=y |
224 | CONFIG_X86_L1_CACHE_BYTES=64 | 223 | CONFIG_X86_L1_CACHE_BYTES=128 |
225 | CONFIG_X86_INTERNODE_CACHE_BYTES=64 | 224 | CONFIG_X86_INTERNODE_CACHE_BYTES=128 |
226 | CONFIG_X86_CMPXCHG=y | 225 | CONFIG_X86_CMPXCHG=y |
227 | CONFIG_X86_L1_CACHE_SHIFT=6 | 226 | CONFIG_X86_L1_CACHE_SHIFT=7 |
228 | CONFIG_X86_WP_WORKS_OK=y | 227 | CONFIG_X86_WP_WORKS_OK=y |
229 | CONFIG_X86_INTEL_USERCOPY=y | ||
230 | CONFIG_X86_USE_PPRO_CHECKSUM=y | ||
231 | CONFIG_X86_P6_NOP=y | ||
232 | CONFIG_X86_TSC=y | 228 | CONFIG_X86_TSC=y |
233 | CONFIG_X86_CMPXCHG64=y | 229 | CONFIG_X86_CMPXCHG64=y |
234 | CONFIG_X86_CMOV=y | 230 | CONFIG_X86_CMOV=y |
@@ -243,9 +239,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y | |||
243 | CONFIG_AMD_IOMMU=y | 239 | CONFIG_AMD_IOMMU=y |
244 | CONFIG_SWIOTLB=y | 240 | CONFIG_SWIOTLB=y |
245 | CONFIG_IOMMU_HELPER=y | 241 | CONFIG_IOMMU_HELPER=y |
246 | # CONFIG_MAXSMP is not set | 242 | CONFIG_NR_CPUS=64 |
247 | CONFIG_NR_CPUS=4 | 243 | CONFIG_SCHED_SMT=y |
248 | # CONFIG_SCHED_SMT is not set | ||
249 | CONFIG_SCHED_MC=y | 244 | CONFIG_SCHED_MC=y |
250 | # CONFIG_PREEMPT_NONE is not set | 245 | # CONFIG_PREEMPT_NONE is not set |
251 | CONFIG_PREEMPT_VOLUNTARY=y | 246 | CONFIG_PREEMPT_VOLUNTARY=y |
@@ -254,7 +249,8 @@ CONFIG_X86_LOCAL_APIC=y | |||
254 | CONFIG_X86_IO_APIC=y | 249 | CONFIG_X86_IO_APIC=y |
255 | # CONFIG_X86_MCE is not set | 250 | # CONFIG_X86_MCE is not set |
256 | # CONFIG_I8K is not set | 251 | # CONFIG_I8K is not set |
257 | # CONFIG_MICROCODE is not set | 252 | CONFIG_MICROCODE=y |
253 | CONFIG_MICROCODE_OLD_INTERFACE=y | ||
258 | CONFIG_X86_MSR=y | 254 | CONFIG_X86_MSR=y |
259 | CONFIG_X86_CPUID=y | 255 | CONFIG_X86_CPUID=y |
260 | CONFIG_NUMA=y | 256 | CONFIG_NUMA=y |
@@ -290,7 +286,7 @@ CONFIG_BOUNCE=y | |||
290 | CONFIG_VIRT_TO_BUS=y | 286 | CONFIG_VIRT_TO_BUS=y |
291 | CONFIG_MTRR=y | 287 | CONFIG_MTRR=y |
292 | # CONFIG_MTRR_SANITIZER is not set | 288 | # CONFIG_MTRR_SANITIZER is not set |
293 | # CONFIG_X86_PAT is not set | 289 | CONFIG_X86_PAT=y |
294 | CONFIG_EFI=y | 290 | CONFIG_EFI=y |
295 | CONFIG_SECCOMP=y | 291 | CONFIG_SECCOMP=y |
296 | # CONFIG_HZ_100 is not set | 292 | # CONFIG_HZ_100 is not set |
@@ -1508,7 +1504,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y | |||
1508 | CONFIG_VGA_CONSOLE=y | 1504 | CONFIG_VGA_CONSOLE=y |
1509 | CONFIG_VGACON_SOFT_SCROLLBACK=y | 1505 | CONFIG_VGACON_SOFT_SCROLLBACK=y |
1510 | CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 | 1506 | CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 |
1511 | CONFIG_VIDEO_SELECT=y | ||
1512 | CONFIG_DUMMY_CONSOLE=y | 1507 | CONFIG_DUMMY_CONSOLE=y |
1513 | # CONFIG_FRAMEBUFFER_CONSOLE is not set | 1508 | # CONFIG_FRAMEBUFFER_CONSOLE is not set |
1514 | CONFIG_LOGO=y | 1509 | CONFIG_LOGO=y |
@@ -2089,7 +2084,7 @@ CONFIG_IO_DELAY_0X80=y | |||
2089 | CONFIG_DEFAULT_IO_DELAY_TYPE=0 | 2084 | CONFIG_DEFAULT_IO_DELAY_TYPE=0 |
2090 | CONFIG_DEBUG_BOOT_PARAMS=y | 2085 | CONFIG_DEBUG_BOOT_PARAMS=y |
2091 | # CONFIG_CPA_DEBUG is not set | 2086 | # CONFIG_CPA_DEBUG is not set |
2092 | # CONFIG_OPTIMIZE_INLINING is not set | 2087 | CONFIG_OPTIMIZE_INLINING=y |
2093 | 2088 | ||
2094 | # | 2089 | # |
2095 | # Security options | 2090 | # Security options |
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3874c2de5403..903de4aa5094 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile | |||
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o | |||
10 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o | 10 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o |
11 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o | 11 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o |
12 | 12 | ||
13 | obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o | ||
14 | |||
13 | aes-i586-y := aes-i586-asm_32.o aes_glue.o | 15 | aes-i586-y := aes-i586-asm_32.o aes_glue.o |
14 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o | 16 | twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o |
15 | salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o | 17 | salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o |
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c new file mode 100644 index 000000000000..070afc5b6c94 --- /dev/null +++ b/arch/x86/crypto/crc32c-intel.c | |||
@@ -0,0 +1,197 @@ | |||
1 | /* | ||
2 | * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. | ||
3 | * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) | ||
4 | * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: | ||
5 | * http://www.intel.com/products/processor/manuals/ | ||
6 | * Intel(R) 64 and IA-32 Architectures Software Developer's Manual | ||
7 | * Volume 2A: Instruction Set Reference, A-M | ||
8 | * | ||
9 | * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com> | ||
10 | * Copyright (c) 2008 Kent Liu <kent.liu@intel.com> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify it | ||
13 | * under the terms of the GNU General Public License as published by the Free | ||
14 | * Software Foundation; either version 2 of the License, or (at your option) | ||
15 | * any later version. | ||
16 | * | ||
17 | */ | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <crypto/internal/hash.h> | ||
23 | |||
24 | #include <asm/cpufeature.h> | ||
25 | |||
26 | #define CHKSUM_BLOCK_SIZE 1 | ||
27 | #define CHKSUM_DIGEST_SIZE 4 | ||
28 | |||
29 | #define SCALE_F sizeof(unsigned long) | ||
30 | |||
31 | #ifdef CONFIG_X86_64 | ||
32 | #define REX_PRE "0x48, " | ||
33 | #else | ||
34 | #define REX_PRE | ||
35 | #endif | ||
36 | |||
37 | static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) | ||
38 | { | ||
39 | while (length--) { | ||
40 | __asm__ __volatile__( | ||
41 | ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" | ||
42 | :"=S"(crc) | ||
43 | :"0"(crc), "c"(*data) | ||
44 | ); | ||
45 | data++; | ||
46 | } | ||
47 | |||
48 | return crc; | ||
49 | } | ||
50 | |||
51 | static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) | ||
52 | { | ||
53 | unsigned int iquotient = len / SCALE_F; | ||
54 | unsigned int iremainder = len % SCALE_F; | ||
55 | unsigned long *ptmp = (unsigned long *)p; | ||
56 | |||
57 | while (iquotient--) { | ||
58 | __asm__ __volatile__( | ||
59 | ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" | ||
60 | :"=S"(crc) | ||
61 | :"0"(crc), "c"(*ptmp) | ||
62 | ); | ||
63 | ptmp++; | ||
64 | } | ||
65 | |||
66 | if (iremainder) | ||
67 | crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, | ||
68 | iremainder); | ||
69 | |||
70 | return crc; | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * Setting the seed allows arbitrary accumulators and flexible XOR policy | ||
75 | * If your algorithm starts with ~0, then XOR with ~0 before you set | ||
76 | * the seed. | ||
77 | */ | ||
78 | static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, | ||
79 | unsigned int keylen) | ||
80 | { | ||
81 | u32 *mctx = crypto_ahash_ctx(hash); | ||
82 | |||
83 | if (keylen != sizeof(u32)) { | ||
84 | crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
85 | return -EINVAL; | ||
86 | } | ||
87 | *mctx = le32_to_cpup((__le32 *)key); | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static int crc32c_intel_init(struct ahash_request *req) | ||
92 | { | ||
93 | u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); | ||
94 | u32 *crcp = ahash_request_ctx(req); | ||
95 | |||
96 | *crcp = *mctx; | ||
97 | |||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | static int crc32c_intel_update(struct ahash_request *req) | ||
102 | { | ||
103 | struct crypto_hash_walk walk; | ||
104 | u32 *crcp = ahash_request_ctx(req); | ||
105 | u32 crc = *crcp; | ||
106 | int nbytes; | ||
107 | |||
108 | for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; | ||
109 | nbytes = crypto_hash_walk_done(&walk, 0)) | ||
110 | crc = crc32c_intel_le_hw(crc, walk.data, nbytes); | ||
111 | |||
112 | *crcp = crc; | ||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static int crc32c_intel_final(struct ahash_request *req) | ||
117 | { | ||
118 | u32 *crcp = ahash_request_ctx(req); | ||
119 | |||
120 | *(__le32 *)req->result = ~cpu_to_le32p(crcp); | ||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | static int crc32c_intel_digest(struct ahash_request *req) | ||
125 | { | ||
126 | struct crypto_hash_walk walk; | ||
127 | u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); | ||
128 | u32 crc = *mctx; | ||
129 | int nbytes; | ||
130 | |||
131 | for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; | ||
132 | nbytes = crypto_hash_walk_done(&walk, 0)) | ||
133 | crc = crc32c_intel_le_hw(crc, walk.data, nbytes); | ||
134 | |||
135 | *(__le32 *)req->result = ~cpu_to_le32(crc); | ||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | static int crc32c_intel_cra_init(struct crypto_tfm *tfm) | ||
140 | { | ||
141 | u32 *key = crypto_tfm_ctx(tfm); | ||
142 | |||
143 | *key = ~0; | ||
144 | |||
145 | tfm->crt_ahash.reqsize = sizeof(u32); | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | static struct crypto_alg alg = { | ||
151 | .cra_name = "crc32c", | ||
152 | .cra_driver_name = "crc32c-intel", | ||
153 | .cra_priority = 200, | ||
154 | .cra_flags = CRYPTO_ALG_TYPE_AHASH, | ||
155 | .cra_blocksize = CHKSUM_BLOCK_SIZE, | ||
156 | .cra_alignmask = 3, | ||
157 | .cra_ctxsize = sizeof(u32), | ||
158 | .cra_module = THIS_MODULE, | ||
159 | .cra_list = LIST_HEAD_INIT(alg.cra_list), | ||
160 | .cra_init = crc32c_intel_cra_init, | ||
161 | .cra_type = &crypto_ahash_type, | ||
162 | .cra_u = { | ||
163 | .ahash = { | ||
164 | .digestsize = CHKSUM_DIGEST_SIZE, | ||
165 | .setkey = crc32c_intel_setkey, | ||
166 | .init = crc32c_intel_init, | ||
167 | .update = crc32c_intel_update, | ||
168 | .final = crc32c_intel_final, | ||
169 | .digest = crc32c_intel_digest, | ||
170 | } | ||
171 | } | ||
172 | }; | ||
173 | |||
174 | |||
175 | static int __init crc32c_intel_mod_init(void) | ||
176 | { | ||
177 | if (cpu_has_xmm4_2) | ||
178 | return crypto_register_alg(&alg); | ||
179 | else | ||
180 | return -ENODEV; | ||
181 | } | ||
182 | |||
183 | static void __exit crc32c_intel_mod_fini(void) | ||
184 | { | ||
185 | crypto_unregister_alg(&alg); | ||
186 | } | ||
187 | |||
188 | module_init(crc32c_intel_mod_init); | ||
189 | module_exit(crc32c_intel_mod_fini); | ||
190 | |||
191 | MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>"); | ||
192 | MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); | ||
193 | MODULE_LICENSE("GPL"); | ||
194 | |||
195 | MODULE_ALIAS("crc32c"); | ||
196 | MODULE_ALIAS("crc32c-intel"); | ||
197 | |||
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index a0e1dbe67dc1..127ec3f07214 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c | |||
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump) | |||
85 | dump->regs.ax = regs->ax; | 85 | dump->regs.ax = regs->ax; |
86 | dump->regs.ds = current->thread.ds; | 86 | dump->regs.ds = current->thread.ds; |
87 | dump->regs.es = current->thread.es; | 87 | dump->regs.es = current->thread.es; |
88 | asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; | 88 | savesegment(fs, fs); |
89 | asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; | 89 | dump->regs.fs = fs; |
90 | savesegment(gs, gs); | ||
91 | dump->regs.gs = gs; | ||
90 | dump->regs.orig_ax = regs->orig_ax; | 92 | dump->regs.orig_ax = regs->orig_ax; |
91 | dump->regs.ip = regs->ip; | 93 | dump->regs.ip = regs->ip; |
92 | dump->regs.cs = regs->cs; | 94 | dump->regs.cs = regs->cs; |
@@ -430,8 +432,9 @@ beyond_if: | |||
430 | current->mm->start_stack = | 432 | current->mm->start_stack = |
431 | (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); | 433 | (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); |
432 | /* start thread */ | 434 | /* start thread */ |
433 | asm volatile("movl %0,%%fs" :: "r" (0)); \ | 435 | loadsegment(fs, 0); |
434 | asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); | 436 | loadsegment(ds, __USER32_DS); |
437 | loadsegment(es, __USER32_DS); | ||
435 | load_gs_index(0); | 438 | load_gs_index(0); |
436 | (regs)->ip = ex.a_entry; | 439 | (regs)->ip = ex.a_entry; |
437 | (regs)->sp = current->mm->start_stack; | 440 | (regs)->sp = current->mm->start_stack; |
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 20af4c79579a..4bc02b23674b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c | |||
@@ -179,9 +179,10 @@ struct sigframe | |||
179 | u32 pretcode; | 179 | u32 pretcode; |
180 | int sig; | 180 | int sig; |
181 | struct sigcontext_ia32 sc; | 181 | struct sigcontext_ia32 sc; |
182 | struct _fpstate_ia32 fpstate; | 182 | struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */ |
183 | unsigned int extramask[_COMPAT_NSIG_WORDS-1]; | 183 | unsigned int extramask[_COMPAT_NSIG_WORDS-1]; |
184 | char retcode[8]; | 184 | char retcode[8]; |
185 | /* fp state follows here */ | ||
185 | }; | 186 | }; |
186 | 187 | ||
187 | struct rt_sigframe | 188 | struct rt_sigframe |
@@ -192,8 +193,8 @@ struct rt_sigframe | |||
192 | u32 puc; | 193 | u32 puc; |
193 | compat_siginfo_t info; | 194 | compat_siginfo_t info; |
194 | struct ucontext_ia32 uc; | 195 | struct ucontext_ia32 uc; |
195 | struct _fpstate_ia32 fpstate; | ||
196 | char retcode[8]; | 196 | char retcode[8]; |
197 | /* fp state follows here */ | ||
197 | }; | 198 | }; |
198 | 199 | ||
199 | #define COPY(x) { \ | 200 | #define COPY(x) { \ |
@@ -206,7 +207,7 @@ struct rt_sigframe | |||
206 | { unsigned int cur; \ | 207 | { unsigned int cur; \ |
207 | unsigned short pre; \ | 208 | unsigned short pre; \ |
208 | err |= __get_user(pre, &sc->seg); \ | 209 | err |= __get_user(pre, &sc->seg); \ |
209 | asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ | 210 | savesegment(seg, cur); \ |
210 | pre |= mask; \ | 211 | pre |= mask; \ |
211 | if (pre != cur) loadsegment(seg, pre); } | 212 | if (pre != cur) loadsegment(seg, pre); } |
212 | 213 | ||
@@ -215,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, | |||
215 | unsigned int *peax) | 216 | unsigned int *peax) |
216 | { | 217 | { |
217 | unsigned int tmpflags, gs, oldgs, err = 0; | 218 | unsigned int tmpflags, gs, oldgs, err = 0; |
218 | struct _fpstate_ia32 __user *buf; | 219 | void __user *buf; |
219 | u32 tmp; | 220 | u32 tmp; |
220 | 221 | ||
221 | /* Always make any pending restarted system calls return -EINTR */ | 222 | /* Always make any pending restarted system calls return -EINTR */ |
@@ -235,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, | |||
235 | */ | 236 | */ |
236 | err |= __get_user(gs, &sc->gs); | 237 | err |= __get_user(gs, &sc->gs); |
237 | gs |= 3; | 238 | gs |= 3; |
238 | asm("movl %%gs,%0" : "=r" (oldgs)); | 239 | savesegment(gs, oldgs); |
239 | if (gs != oldgs) | 240 | if (gs != oldgs) |
240 | load_gs_index(gs); | 241 | load_gs_index(gs); |
241 | 242 | ||
@@ -259,26 +260,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, | |||
259 | 260 | ||
260 | err |= __get_user(tmp, &sc->fpstate); | 261 | err |= __get_user(tmp, &sc->fpstate); |
261 | buf = compat_ptr(tmp); | 262 | buf = compat_ptr(tmp); |
262 | if (buf) { | 263 | err |= restore_i387_xstate_ia32(buf); |
263 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
264 | goto badframe; | ||
265 | err |= restore_i387_ia32(buf); | ||
266 | } else { | ||
267 | struct task_struct *me = current; | ||
268 | |||
269 | if (used_math()) { | ||
270 | clear_fpu(me); | ||
271 | clear_used_math(); | ||
272 | } | ||
273 | } | ||
274 | 264 | ||
275 | err |= __get_user(tmp, &sc->ax); | 265 | err |= __get_user(tmp, &sc->ax); |
276 | *peax = tmp; | 266 | *peax = tmp; |
277 | 267 | ||
278 | return err; | 268 | return err; |
279 | |||
280 | badframe: | ||
281 | return 1; | ||
282 | } | 269 | } |
283 | 270 | ||
284 | asmlinkage long sys32_sigreturn(struct pt_regs *regs) | 271 | asmlinkage long sys32_sigreturn(struct pt_regs *regs) |
@@ -350,46 +337,42 @@ badframe: | |||
350 | */ | 337 | */ |
351 | 338 | ||
352 | static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, | 339 | static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, |
353 | struct _fpstate_ia32 __user *fpstate, | 340 | void __user *fpstate, |
354 | struct pt_regs *regs, unsigned int mask) | 341 | struct pt_regs *regs, unsigned int mask) |
355 | { | 342 | { |
356 | int tmp, err = 0; | 343 | int tmp, err = 0; |
357 | 344 | ||
358 | tmp = 0; | 345 | savesegment(gs, tmp); |
359 | __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); | ||
360 | err |= __put_user(tmp, (unsigned int __user *)&sc->gs); | 346 | err |= __put_user(tmp, (unsigned int __user *)&sc->gs); |
361 | __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); | 347 | savesegment(fs, tmp); |
362 | err |= __put_user(tmp, (unsigned int __user *)&sc->fs); | 348 | err |= __put_user(tmp, (unsigned int __user *)&sc->fs); |
363 | __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); | 349 | savesegment(ds, tmp); |
364 | err |= __put_user(tmp, (unsigned int __user *)&sc->ds); | 350 | err |= __put_user(tmp, (unsigned int __user *)&sc->ds); |
365 | __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); | 351 | savesegment(es, tmp); |
366 | err |= __put_user(tmp, (unsigned int __user *)&sc->es); | 352 | err |= __put_user(tmp, (unsigned int __user *)&sc->es); |
367 | 353 | ||
368 | err |= __put_user((u32)regs->di, &sc->di); | 354 | err |= __put_user(regs->di, &sc->di); |
369 | err |= __put_user((u32)regs->si, &sc->si); | 355 | err |= __put_user(regs->si, &sc->si); |
370 | err |= __put_user((u32)regs->bp, &sc->bp); | 356 | err |= __put_user(regs->bp, &sc->bp); |
371 | err |= __put_user((u32)regs->sp, &sc->sp); | 357 | err |= __put_user(regs->sp, &sc->sp); |
372 | err |= __put_user((u32)regs->bx, &sc->bx); | 358 | err |= __put_user(regs->bx, &sc->bx); |
373 | err |= __put_user((u32)regs->dx, &sc->dx); | 359 | err |= __put_user(regs->dx, &sc->dx); |
374 | err |= __put_user((u32)regs->cx, &sc->cx); | 360 | err |= __put_user(regs->cx, &sc->cx); |
375 | err |= __put_user((u32)regs->ax, &sc->ax); | 361 | err |= __put_user(regs->ax, &sc->ax); |
376 | err |= __put_user((u32)regs->cs, &sc->cs); | 362 | err |= __put_user(regs->cs, &sc->cs); |
377 | err |= __put_user((u32)regs->ss, &sc->ss); | 363 | err |= __put_user(regs->ss, &sc->ss); |
378 | err |= __put_user(current->thread.trap_no, &sc->trapno); | 364 | err |= __put_user(current->thread.trap_no, &sc->trapno); |
379 | err |= __put_user(current->thread.error_code, &sc->err); | 365 | err |= __put_user(current->thread.error_code, &sc->err); |
380 | err |= __put_user((u32)regs->ip, &sc->ip); | 366 | err |= __put_user(regs->ip, &sc->ip); |
381 | err |= __put_user((u32)regs->flags, &sc->flags); | 367 | err |= __put_user(regs->flags, &sc->flags); |
382 | err |= __put_user((u32)regs->sp, &sc->sp_at_signal); | 368 | err |= __put_user(regs->sp, &sc->sp_at_signal); |
383 | 369 | ||
384 | tmp = save_i387_ia32(fpstate); | 370 | tmp = save_i387_xstate_ia32(fpstate); |
385 | if (tmp < 0) | 371 | if (tmp < 0) |
386 | err = -EFAULT; | 372 | err = -EFAULT; |
387 | else { | 373 | else |
388 | clear_used_math(); | ||
389 | stts(); | ||
390 | err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), | 374 | err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), |
391 | &sc->fpstate); | 375 | &sc->fpstate); |
392 | } | ||
393 | 376 | ||
394 | /* non-iBCS2 extensions.. */ | 377 | /* non-iBCS2 extensions.. */ |
395 | err |= __put_user(mask, &sc->oldmask); | 378 | err |= __put_user(mask, &sc->oldmask); |
@@ -402,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, | |||
402 | * Determine which stack to use.. | 385 | * Determine which stack to use.. |
403 | */ | 386 | */ |
404 | static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, | 387 | static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, |
405 | size_t frame_size) | 388 | size_t frame_size, |
389 | void **fpstate) | ||
406 | { | 390 | { |
407 | unsigned long sp; | 391 | unsigned long sp; |
408 | 392 | ||
@@ -421,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, | |||
421 | ka->sa.sa_restorer) | 405 | ka->sa.sa_restorer) |
422 | sp = (unsigned long) ka->sa.sa_restorer; | 406 | sp = (unsigned long) ka->sa.sa_restorer; |
423 | 407 | ||
408 | if (used_math()) { | ||
409 | sp = sp - sig_xstate_ia32_size; | ||
410 | *fpstate = (struct _fpstate_ia32 *) sp; | ||
411 | } | ||
412 | |||
424 | sp -= frame_size; | 413 | sp -= frame_size; |
425 | /* Align the stack pointer according to the i386 ABI, | 414 | /* Align the stack pointer according to the i386 ABI, |
426 | * i.e. so that on function entry ((sp + 4) & 15) == 0. */ | 415 | * i.e. so that on function entry ((sp + 4) & 15) == 0. */ |
@@ -434,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, | |||
434 | struct sigframe __user *frame; | 423 | struct sigframe __user *frame; |
435 | void __user *restorer; | 424 | void __user *restorer; |
436 | int err = 0; | 425 | int err = 0; |
426 | void __user *fpstate = NULL; | ||
437 | 427 | ||
438 | /* copy_to_user optimizes that into a single 8 byte store */ | 428 | /* copy_to_user optimizes that into a single 8 byte store */ |
439 | static const struct { | 429 | static const struct { |
@@ -448,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, | |||
448 | 0, | 438 | 0, |
449 | }; | 439 | }; |
450 | 440 | ||
451 | frame = get_sigframe(ka, regs, sizeof(*frame)); | 441 | frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); |
452 | 442 | ||
453 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | 443 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) |
454 | goto give_sigsegv; | 444 | return -EFAULT; |
455 | 445 | ||
456 | err |= __put_user(sig, &frame->sig); | 446 | if (__put_user(sig, &frame->sig)) |
457 | if (err) | 447 | return -EFAULT; |
458 | goto give_sigsegv; | ||
459 | 448 | ||
460 | err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, | 449 | if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) |
461 | set->sig[0]); | 450 | return -EFAULT; |
462 | if (err) | ||
463 | goto give_sigsegv; | ||
464 | 451 | ||
465 | if (_COMPAT_NSIG_WORDS > 1) { | 452 | if (_COMPAT_NSIG_WORDS > 1) { |
466 | err |= __copy_to_user(frame->extramask, &set->sig[1], | 453 | if (__copy_to_user(frame->extramask, &set->sig[1], |
467 | sizeof(frame->extramask)); | 454 | sizeof(frame->extramask))) |
468 | if (err) | 455 | return -EFAULT; |
469 | goto give_sigsegv; | ||
470 | } | 456 | } |
471 | 457 | ||
472 | if (ka->sa.sa_flags & SA_RESTORER) { | 458 | if (ka->sa.sa_flags & SA_RESTORER) { |
@@ -487,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, | |||
487 | */ | 473 | */ |
488 | err |= __copy_to_user(frame->retcode, &code, 8); | 474 | err |= __copy_to_user(frame->retcode, &code, 8); |
489 | if (err) | 475 | if (err) |
490 | goto give_sigsegv; | 476 | return -EFAULT; |
491 | 477 | ||
492 | /* Set up registers for signal handler */ | 478 | /* Set up registers for signal handler */ |
493 | regs->sp = (unsigned long) frame; | 479 | regs->sp = (unsigned long) frame; |
@@ -498,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, | |||
498 | regs->dx = 0; | 484 | regs->dx = 0; |
499 | regs->cx = 0; | 485 | regs->cx = 0; |
500 | 486 | ||
501 | asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); | 487 | loadsegment(ds, __USER32_DS); |
502 | asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); | 488 | loadsegment(es, __USER32_DS); |
503 | 489 | ||
504 | regs->cs = __USER32_CS; | 490 | regs->cs = __USER32_CS; |
505 | regs->ss = __USER32_DS; | 491 | regs->ss = __USER32_DS; |
@@ -510,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, | |||
510 | #endif | 496 | #endif |
511 | 497 | ||
512 | return 0; | 498 | return 0; |
513 | |||
514 | give_sigsegv: | ||
515 | force_sigsegv(sig, current); | ||
516 | return -EFAULT; | ||
517 | } | 499 | } |
518 | 500 | ||
519 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | 501 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, |
@@ -522,6 +504,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
522 | struct rt_sigframe __user *frame; | 504 | struct rt_sigframe __user *frame; |
523 | void __user *restorer; | 505 | void __user *restorer; |
524 | int err = 0; | 506 | int err = 0; |
507 | void __user *fpstate = NULL; | ||
525 | 508 | ||
526 | /* __copy_to_user optimizes that into a single 8 byte store */ | 509 | /* __copy_to_user optimizes that into a single 8 byte store */ |
527 | static const struct { | 510 | static const struct { |
@@ -537,30 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
537 | 0, | 520 | 0, |
538 | }; | 521 | }; |
539 | 522 | ||
540 | frame = get_sigframe(ka, regs, sizeof(*frame)); | 523 | frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); |
541 | 524 | ||
542 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | 525 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) |
543 | goto give_sigsegv; | 526 | return -EFAULT; |
544 | 527 | ||
545 | err |= __put_user(sig, &frame->sig); | 528 | err |= __put_user(sig, &frame->sig); |
546 | err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); | 529 | err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); |
547 | err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); | 530 | err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); |
548 | err |= copy_siginfo_to_user32(&frame->info, info); | 531 | err |= copy_siginfo_to_user32(&frame->info, info); |
549 | if (err) | 532 | if (err) |
550 | goto give_sigsegv; | 533 | return -EFAULT; |
551 | 534 | ||
552 | /* Create the ucontext. */ | 535 | /* Create the ucontext. */ |
553 | err |= __put_user(0, &frame->uc.uc_flags); | 536 | if (cpu_has_xsave) |
537 | err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); | ||
538 | else | ||
539 | err |= __put_user(0, &frame->uc.uc_flags); | ||
554 | err |= __put_user(0, &frame->uc.uc_link); | 540 | err |= __put_user(0, &frame->uc.uc_link); |
555 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | 541 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); |
556 | err |= __put_user(sas_ss_flags(regs->sp), | 542 | err |= __put_user(sas_ss_flags(regs->sp), |
557 | &frame->uc.uc_stack.ss_flags); | 543 | &frame->uc.uc_stack.ss_flags); |
558 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | 544 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); |
559 | err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, | 545 | err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, |
560 | regs, set->sig[0]); | 546 | regs, set->sig[0]); |
561 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | 547 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); |
562 | if (err) | 548 | if (err) |
563 | goto give_sigsegv; | 549 | return -EFAULT; |
564 | 550 | ||
565 | if (ka->sa.sa_flags & SA_RESTORER) | 551 | if (ka->sa.sa_flags & SA_RESTORER) |
566 | restorer = ka->sa.sa_restorer; | 552 | restorer = ka->sa.sa_restorer; |
@@ -575,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
575 | */ | 561 | */ |
576 | err |= __copy_to_user(frame->retcode, &code, 8); | 562 | err |= __copy_to_user(frame->retcode, &code, 8); |
577 | if (err) | 563 | if (err) |
578 | goto give_sigsegv; | 564 | return -EFAULT; |
579 | 565 | ||
580 | /* Set up registers for signal handler */ | 566 | /* Set up registers for signal handler */ |
581 | regs->sp = (unsigned long) frame; | 567 | regs->sp = (unsigned long) frame; |
@@ -591,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
591 | regs->dx = (unsigned long) &frame->info; | 577 | regs->dx = (unsigned long) &frame->info; |
592 | regs->cx = (unsigned long) &frame->uc; | 578 | regs->cx = (unsigned long) &frame->uc; |
593 | 579 | ||
594 | asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); | 580 | loadsegment(ds, __USER32_DS); |
595 | asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); | 581 | loadsegment(es, __USER32_DS); |
596 | 582 | ||
597 | regs->cs = __USER32_CS; | 583 | regs->cs = __USER32_CS; |
598 | regs->ss = __USER32_DS; | 584 | regs->ss = __USER32_DS; |
@@ -603,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
603 | #endif | 589 | #endif |
604 | 590 | ||
605 | return 0; | 591 | return 0; |
606 | |||
607 | give_sigsegv: | ||
608 | force_sigsegv(sig, current); | ||
609 | return -EFAULT; | ||
610 | } | 592 | } |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffc1bb4fed7d..256b00b61892 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
@@ -39,11 +39,11 @@ | |||
39 | .endm | 39 | .endm |
40 | 40 | ||
41 | /* clobbers %eax */ | 41 | /* clobbers %eax */ |
42 | .macro CLEAR_RREGS | 42 | .macro CLEAR_RREGS _r9=rax |
43 | xorl %eax,%eax | 43 | xorl %eax,%eax |
44 | movq %rax,R11(%rsp) | 44 | movq %rax,R11(%rsp) |
45 | movq %rax,R10(%rsp) | 45 | movq %rax,R10(%rsp) |
46 | movq %rax,R9(%rsp) | 46 | movq %\_r9,R9(%rsp) |
47 | movq %rax,R8(%rsp) | 47 | movq %rax,R8(%rsp) |
48 | .endm | 48 | .endm |
49 | 49 | ||
@@ -52,11 +52,10 @@ | |||
52 | * We don't reload %eax because syscall_trace_enter() returned | 52 | * We don't reload %eax because syscall_trace_enter() returned |
53 | * the value it wants us to use in the table lookup. | 53 | * the value it wants us to use in the table lookup. |
54 | */ | 54 | */ |
55 | .macro LOAD_ARGS32 offset | 55 | .macro LOAD_ARGS32 offset, _r9=0 |
56 | movl \offset(%rsp),%r11d | 56 | .if \_r9 |
57 | movl \offset+8(%rsp),%r10d | ||
58 | movl \offset+16(%rsp),%r9d | 57 | movl \offset+16(%rsp),%r9d |
59 | movl \offset+24(%rsp),%r8d | 58 | .endif |
60 | movl \offset+40(%rsp),%ecx | 59 | movl \offset+40(%rsp),%ecx |
61 | movl \offset+48(%rsp),%edx | 60 | movl \offset+48(%rsp),%edx |
62 | movl \offset+56(%rsp),%esi | 61 | movl \offset+56(%rsp),%esi |
@@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target) | |||
145 | SAVE_ARGS 0,0,1 | 144 | SAVE_ARGS 0,0,1 |
146 | /* no need to do an access_ok check here because rbp has been | 145 | /* no need to do an access_ok check here because rbp has been |
147 | 32bit zero extended */ | 146 | 32bit zero extended */ |
148 | 1: movl (%rbp),%r9d | 147 | 1: movl (%rbp),%ebp |
149 | .section __ex_table,"a" | 148 | .section __ex_table,"a" |
150 | .quad 1b,ia32_badarg | 149 | .quad 1b,ia32_badarg |
151 | .previous | 150 | .previous |
@@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target) | |||
157 | cmpl $(IA32_NR_syscalls-1),%eax | 156 | cmpl $(IA32_NR_syscalls-1),%eax |
158 | ja ia32_badsys | 157 | ja ia32_badsys |
159 | sysenter_do_call: | 158 | sysenter_do_call: |
160 | IA32_ARG_FIXUP 1 | 159 | IA32_ARG_FIXUP |
161 | sysenter_dispatch: | 160 | sysenter_dispatch: |
162 | call *ia32_sys_call_table(,%rax,8) | 161 | call *ia32_sys_call_table(,%rax,8) |
163 | movq %rax,RAX-ARGOFFSET(%rsp) | 162 | movq %rax,RAX-ARGOFFSET(%rsp) |
@@ -234,20 +233,17 @@ sysexit_audit: | |||
234 | #endif | 233 | #endif |
235 | 234 | ||
236 | sysenter_tracesys: | 235 | sysenter_tracesys: |
237 | xchgl %r9d,%ebp | ||
238 | #ifdef CONFIG_AUDITSYSCALL | 236 | #ifdef CONFIG_AUDITSYSCALL |
239 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) | 237 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) |
240 | jz sysenter_auditsys | 238 | jz sysenter_auditsys |
241 | #endif | 239 | #endif |
242 | SAVE_REST | 240 | SAVE_REST |
243 | CLEAR_RREGS | 241 | CLEAR_RREGS |
244 | movq %r9,R9(%rsp) | ||
245 | movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ | 242 | movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ |
246 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 243 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
247 | call syscall_trace_enter | 244 | call syscall_trace_enter |
248 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | 245 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ |
249 | RESTORE_REST | 246 | RESTORE_REST |
250 | xchgl %ebp,%r9d | ||
251 | cmpl $(IA32_NR_syscalls-1),%eax | 247 | cmpl $(IA32_NR_syscalls-1),%eax |
252 | ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ | 248 | ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ |
253 | jmp sysenter_do_call | 249 | jmp sysenter_do_call |
@@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target) | |||
314 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) | 310 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) |
315 | CFI_REMEMBER_STATE | 311 | CFI_REMEMBER_STATE |
316 | jnz cstar_tracesys | 312 | jnz cstar_tracesys |
317 | cstar_do_call: | ||
318 | cmpl $IA32_NR_syscalls-1,%eax | 313 | cmpl $IA32_NR_syscalls-1,%eax |
319 | ja ia32_badsys | 314 | ja ia32_badsys |
315 | cstar_do_call: | ||
320 | IA32_ARG_FIXUP 1 | 316 | IA32_ARG_FIXUP 1 |
321 | cstar_dispatch: | 317 | cstar_dispatch: |
322 | call *ia32_sys_call_table(,%rax,8) | 318 | call *ia32_sys_call_table(,%rax,8) |
@@ -357,15 +353,13 @@ cstar_tracesys: | |||
357 | #endif | 353 | #endif |
358 | xchgl %r9d,%ebp | 354 | xchgl %r9d,%ebp |
359 | SAVE_REST | 355 | SAVE_REST |
360 | CLEAR_RREGS | 356 | CLEAR_RREGS r9 |
361 | movq %r9,R9(%rsp) | ||
362 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ | 357 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ |
363 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 358 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
364 | call syscall_trace_enter | 359 | call syscall_trace_enter |
365 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | 360 | LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ |
366 | RESTORE_REST | 361 | RESTORE_REST |
367 | xchgl %ebp,%r9d | 362 | xchgl %ebp,%r9d |
368 | movl RSP-ARGOFFSET(%rsp), %r8d | ||
369 | cmpl $(IA32_NR_syscalls-1),%eax | 363 | cmpl $(IA32_NR_syscalls-1),%eax |
370 | ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ | 364 | ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ |
371 | jmp cstar_do_call | 365 | jmp cstar_do_call |
@@ -577,8 +571,8 @@ ia32_sys_call_table: | |||
577 | .quad compat_sys_setrlimit /* 75 */ | 571 | .quad compat_sys_setrlimit /* 75 */ |
578 | .quad compat_sys_old_getrlimit /* old_getrlimit */ | 572 | .quad compat_sys_old_getrlimit /* old_getrlimit */ |
579 | .quad compat_sys_getrusage | 573 | .quad compat_sys_getrusage |
580 | .quad sys32_gettimeofday | 574 | .quad compat_sys_gettimeofday |
581 | .quad sys32_settimeofday | 575 | .quad compat_sys_settimeofday |
582 | .quad sys_getgroups16 /* 80 */ | 576 | .quad sys_getgroups16 /* 80 */ |
583 | .quad sys_setgroups16 | 577 | .quad sys_setgroups16 |
584 | .quad sys32_old_select | 578 | .quad sys32_old_select |
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index d3c64088b981..2e09dcd3c0a6 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c | |||
@@ -49,41 +49,6 @@ | |||
49 | 49 | ||
50 | #define AA(__x) ((unsigned long)(__x)) | 50 | #define AA(__x) ((unsigned long)(__x)) |
51 | 51 | ||
52 | int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) | ||
53 | { | ||
54 | compat_ino_t ino; | ||
55 | |||
56 | typeof(ubuf->st_uid) uid = 0; | ||
57 | typeof(ubuf->st_gid) gid = 0; | ||
58 | SET_UID(uid, kbuf->uid); | ||
59 | SET_GID(gid, kbuf->gid); | ||
60 | if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) | ||
61 | return -EOVERFLOW; | ||
62 | if (kbuf->size >= 0x7fffffff) | ||
63 | return -EOVERFLOW; | ||
64 | ino = kbuf->ino; | ||
65 | if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) | ||
66 | return -EOVERFLOW; | ||
67 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || | ||
68 | __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || | ||
69 | __put_user(ino, &ubuf->st_ino) || | ||
70 | __put_user(kbuf->mode, &ubuf->st_mode) || | ||
71 | __put_user(kbuf->nlink, &ubuf->st_nlink) || | ||
72 | __put_user(uid, &ubuf->st_uid) || | ||
73 | __put_user(gid, &ubuf->st_gid) || | ||
74 | __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || | ||
75 | __put_user(kbuf->size, &ubuf->st_size) || | ||
76 | __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || | ||
77 | __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
78 | __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || | ||
79 | __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
80 | __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || | ||
81 | __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
82 | __put_user(kbuf->blksize, &ubuf->st_blksize) || | ||
83 | __put_user(kbuf->blocks, &ubuf->st_blocks)) | ||
84 | return -EFAULT; | ||
85 | return 0; | ||
86 | } | ||
87 | 52 | ||
88 | asmlinkage long sys32_truncate64(char __user *filename, | 53 | asmlinkage long sys32_truncate64(char __user *filename, |
89 | unsigned long offset_low, | 54 | unsigned long offset_low, |
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, | |||
402 | return 0; | 367 | return 0; |
403 | } | 368 | } |
404 | 369 | ||
405 | static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) | ||
406 | { | ||
407 | int err = -EFAULT; | ||
408 | |||
409 | if (access_ok(VERIFY_READ, i, sizeof(*i))) { | ||
410 | err = __get_user(o->tv_sec, &i->tv_sec); | ||
411 | err |= __get_user(o->tv_usec, &i->tv_usec); | ||
412 | } | ||
413 | return err; | ||
414 | } | ||
415 | |||
416 | static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) | ||
417 | { | ||
418 | int err = -EFAULT; | ||
419 | |||
420 | if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { | ||
421 | err = __put_user(i->tv_sec, &o->tv_sec); | ||
422 | err |= __put_user(i->tv_usec, &o->tv_usec); | ||
423 | } | ||
424 | return err; | ||
425 | } | ||
426 | |||
427 | asmlinkage long sys32_alarm(unsigned int seconds) | 370 | asmlinkage long sys32_alarm(unsigned int seconds) |
428 | { | 371 | { |
429 | return alarm_setitimer(seconds); | 372 | return alarm_setitimer(seconds); |
430 | } | 373 | } |
431 | 374 | ||
432 | /* | ||
433 | * Translations due to time_t size differences. Which affects all | ||
434 | * sorts of things, like timeval and itimerval. | ||
435 | */ | ||
436 | asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, | ||
437 | struct timezone __user *tz) | ||
438 | { | ||
439 | if (tv) { | ||
440 | struct timeval ktv; | ||
441 | |||
442 | do_gettimeofday(&ktv); | ||
443 | if (put_tv32(tv, &ktv)) | ||
444 | return -EFAULT; | ||
445 | } | ||
446 | if (tz) { | ||
447 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
448 | return -EFAULT; | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, | ||
454 | struct timezone __user *tz) | ||
455 | { | ||
456 | struct timeval ktv; | ||
457 | struct timespec kts; | ||
458 | struct timezone ktz; | ||
459 | |||
460 | if (tv) { | ||
461 | if (get_tv32(&ktv, tv)) | ||
462 | return -EFAULT; | ||
463 | kts.tv_sec = ktv.tv_sec; | ||
464 | kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; | ||
465 | } | ||
466 | if (tz) { | ||
467 | if (copy_from_user(&ktz, tz, sizeof(ktz))) | ||
468 | return -EFAULT; | ||
469 | } | ||
470 | |||
471 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); | ||
472 | } | ||
473 | |||
474 | struct sel_arg_struct { | 375 | struct sel_arg_struct { |
475 | unsigned int n; | 376 | unsigned int n; |
476 | unsigned int inp; | 377 | unsigned int inp; |
@@ -556,15 +457,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, | |||
556 | return ret; | 457 | return ret; |
557 | } | 458 | } |
558 | 459 | ||
559 | /* These are here just in case some old ia32 binary calls it. */ | ||
560 | asmlinkage long sys32_pause(void) | ||
561 | { | ||
562 | current->state = TASK_INTERRUPTIBLE; | ||
563 | schedule(); | ||
564 | return -ERESTARTNOHAND; | ||
565 | } | ||
566 | |||
567 | |||
568 | #ifdef CONFIG_SYSCTL_SYSCALL | 460 | #ifdef CONFIG_SYSCTL_SYSCALL |
569 | struct sysctl_ia32 { | 461 | struct sysctl_ia32 { |
570 | unsigned int name; | 462 | unsigned int name; |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3db651fc8ec5..0d41f0343dc0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE | |||
10 | # Do not profile debug and lowlevel utilities | 10 | # Do not profile debug and lowlevel utilities |
11 | CFLAGS_REMOVE_tsc.o = -pg | 11 | CFLAGS_REMOVE_tsc.o = -pg |
12 | CFLAGS_REMOVE_rtc.o = -pg | 12 | CFLAGS_REMOVE_rtc.o = -pg |
13 | CFLAGS_REMOVE_paravirt.o = -pg | 13 | CFLAGS_REMOVE_paravirt-spinlocks.o = -pg |
14 | endif | 14 | endif |
15 | 15 | ||
16 | # | 16 | # |
@@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) | |||
23 | CFLAGS_tsc.o := $(nostackp) | 23 | CFLAGS_tsc.o := $(nostackp) |
24 | 24 | ||
25 | obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o | 25 | obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o |
26 | obj-y += traps_$(BITS).o irq_$(BITS).o | 26 | obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o |
27 | obj-y += time_$(BITS).o ioport.o ldt.o | 27 | obj-y += time_$(BITS).o ioport.o ldt.o |
28 | obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o | 28 | obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o |
29 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o | 29 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o |
@@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o | |||
38 | 38 | ||
39 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o | 39 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o |
40 | obj-y += process.o | 40 | obj-y += process.o |
41 | obj-y += i387.o | 41 | obj-y += i387.o xsave.o |
42 | obj-y += ptrace.o | 42 | obj-y += ptrace.o |
43 | obj-y += ds.o | 43 | obj-y += ds.o |
44 | obj-$(CONFIG_X86_32) += tls.o | 44 | obj-$(CONFIG_X86_32) += tls.o |
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o | |||
51 | obj-$(CONFIG_MCA) += mca_32.o | 51 | obj-$(CONFIG_MCA) += mca_32.o |
52 | obj-$(CONFIG_X86_MSR) += msr.o | 52 | obj-$(CONFIG_X86_MSR) += msr.o |
53 | obj-$(CONFIG_X86_CPUID) += cpuid.o | 53 | obj-$(CONFIG_X86_CPUID) += cpuid.o |
54 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
55 | obj-$(CONFIG_PCI) += early-quirks.o | 54 | obj-$(CONFIG_PCI) += early-quirks.o |
56 | apm-y := apm_32.o | 55 | apm-y := apm_32.o |
57 | obj-$(CONFIG_APM) += apm.o | 56 | obj-$(CONFIG_APM) += apm.o |
@@ -69,6 +68,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o | |||
69 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o | 68 | obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o |
70 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o | 69 | obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o |
71 | obj-$(CONFIG_X86_NUMAQ) += numaq_32.o | 70 | obj-$(CONFIG_X86_NUMAQ) += numaq_32.o |
71 | obj-$(CONFIG_X86_ES7000) += es7000_32.o | ||
72 | obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o | 72 | obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o |
73 | obj-y += vsmp_64.o | 73 | obj-y += vsmp_64.o |
74 | obj-$(CONFIG_KPROBES) += kprobes.o | 74 | obj-$(CONFIG_KPROBES) += kprobes.o |
@@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o | |||
89 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | 89 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o |
90 | obj-$(CONFIG_KVM_GUEST) += kvm.o | 90 | obj-$(CONFIG_KVM_GUEST) += kvm.o |
91 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | 91 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o |
92 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 92 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o |
93 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | 93 | obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o |
94 | 94 | ||
95 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o | 95 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o |
@@ -99,11 +99,18 @@ scx200-y += scx200_32.o | |||
99 | 99 | ||
100 | obj-$(CONFIG_OLPC) += olpc.o | 100 | obj-$(CONFIG_OLPC) += olpc.o |
101 | 101 | ||
102 | microcode-y := microcode_core.o | ||
103 | microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o | ||
104 | microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o | ||
105 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
106 | |||
102 | ### | 107 | ### |
103 | # 64 bit specific files | 108 | # 64 bit specific files |
104 | ifeq ($(CONFIG_X86_64),y) | 109 | ifeq ($(CONFIG_X86_64),y) |
105 | obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o | 110 | obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o |
106 | obj-y += bios_uv.o | 111 | obj-y += bios_uv.o |
112 | obj-y += genx2apic_cluster.o | ||
113 | obj-y += genx2apic_phys.o | ||
107 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o | 114 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o |
108 | obj-$(CONFIG_AUDIT) += audit_64.o | 115 | obj-$(CONFIG_AUDIT) += audit_64.o |
109 | 116 | ||
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index bfd10fd211cd..eb875cdc7367 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled); | |||
58 | #ifdef CONFIG_X86_64 | 58 | #ifdef CONFIG_X86_64 |
59 | 59 | ||
60 | #include <asm/proto.h> | 60 | #include <asm/proto.h> |
61 | #include <asm/genapic.h> | ||
62 | 61 | ||
63 | #else /* X86 */ | 62 | #else /* X86 */ |
64 | 63 | ||
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; | |||
97 | #warning ACPI uses CMPXCHG, i486 and later hardware | 96 | #warning ACPI uses CMPXCHG, i486 and later hardware |
98 | #endif | 97 | #endif |
99 | 98 | ||
100 | static int acpi_mcfg_64bit_base_addr __initdata = FALSE; | ||
101 | |||
102 | /* -------------------------------------------------------------------------- | 99 | /* -------------------------------------------------------------------------- |
103 | Boot-time Configuration | 100 | Boot-time Configuration |
104 | -------------------------------------------------------------------------- */ | 101 | -------------------------------------------------------------------------- */ |
@@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) | |||
160 | struct acpi_mcfg_allocation *pci_mmcfg_config; | 157 | struct acpi_mcfg_allocation *pci_mmcfg_config; |
161 | int pci_mmcfg_config_num; | 158 | int pci_mmcfg_config_num; |
162 | 159 | ||
160 | static int acpi_mcfg_64bit_base_addr __initdata = FALSE; | ||
161 | |||
163 | static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) | 162 | static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) |
164 | { | 163 | { |
165 | if (!strcmp(mcfg->header.oem_id, "SGI")) | 164 | if (!strcmp(mcfg->header.oem_id, "SGI")) |
@@ -253,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) | |||
253 | return; | 252 | return; |
254 | } | 253 | } |
255 | 254 | ||
256 | #ifdef CONFIG_X86_32 | ||
257 | if (boot_cpu_physical_apicid != -1U) | 255 | if (boot_cpu_physical_apicid != -1U) |
258 | ver = apic_version[boot_cpu_physical_apicid]; | 256 | ver = apic_version[boot_cpu_physical_apicid]; |
259 | #endif | ||
260 | 257 | ||
261 | generic_processor_info(id, ver); | 258 | generic_processor_info(id, ver); |
262 | } | 259 | } |
@@ -775,11 +772,9 @@ static void __init acpi_register_lapic_address(unsigned long address) | |||
775 | 772 | ||
776 | set_fixmap_nocache(FIX_APIC_BASE, address); | 773 | set_fixmap_nocache(FIX_APIC_BASE, address); |
777 | if (boot_cpu_physical_apicid == -1U) { | 774 | if (boot_cpu_physical_apicid == -1U) { |
778 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 775 | boot_cpu_physical_apicid = read_apic_id(); |
779 | #ifdef CONFIG_X86_32 | ||
780 | apic_version[boot_cpu_physical_apicid] = | 776 | apic_version[boot_cpu_physical_apicid] = |
781 | GET_APIC_VERSION(apic_read(APIC_LVR)); | 777 | GET_APIC_VERSION(apic_read(APIC_LVR)); |
782 | #endif | ||
783 | } | 778 | } |
784 | } | 779 | } |
785 | 780 | ||
@@ -1351,7 +1346,9 @@ static void __init acpi_process_madt(void) | |||
1351 | acpi_ioapic = 1; | 1346 | acpi_ioapic = 1; |
1352 | 1347 | ||
1353 | smp_found_config = 1; | 1348 | smp_found_config = 1; |
1349 | #ifdef CONFIG_X86_32 | ||
1354 | setup_apic_routing(); | 1350 | setup_apic_routing(); |
1351 | #endif | ||
1355 | } | 1352 | } |
1356 | } | 1353 | } |
1357 | if (error == -EINVAL) { | 1354 | if (error == -EINVAL) { |
@@ -1421,8 +1418,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) | |||
1421 | */ | 1418 | */ |
1422 | static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) | 1419 | static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) |
1423 | { | 1420 | { |
1424 | pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); | 1421 | /* |
1425 | acpi_skip_timer_override = 1; | 1422 | * The ati_ixp4x0_rev() early PCI quirk should have set |
1423 | * the acpi_skip_timer_override flag already: | ||
1424 | */ | ||
1425 | if (!acpi_skip_timer_override) { | ||
1426 | WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); | ||
1427 | pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", | ||
1428 | d->ident); | ||
1429 | acpi_skip_timer_override = 1; | ||
1430 | } | ||
1426 | return 0; | 1431 | return 0; |
1427 | } | 1432 | } |
1428 | 1433 | ||
@@ -1605,6 +1610,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { | |||
1605 | */ | 1610 | */ |
1606 | { | 1611 | { |
1607 | .callback = dmi_ignore_irq0_timer_override, | 1612 | .callback = dmi_ignore_irq0_timer_override, |
1613 | .ident = "HP nx6115 laptop", | ||
1614 | .matches = { | ||
1615 | DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), | ||
1616 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), | ||
1617 | }, | ||
1618 | }, | ||
1619 | { | ||
1620 | .callback = dmi_ignore_irq0_timer_override, | ||
1608 | .ident = "HP NX6125 laptop", | 1621 | .ident = "HP NX6125 laptop", |
1609 | .matches = { | 1622 | .matches = { |
1610 | DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), | 1623 | DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), |
@@ -1619,6 +1632,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { | |||
1619 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), | 1632 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), |
1620 | }, | 1633 | }, |
1621 | }, | 1634 | }, |
1635 | { | ||
1636 | .callback = dmi_ignore_irq0_timer_override, | ||
1637 | .ident = "HP 6715b laptop", | ||
1638 | .matches = { | ||
1639 | DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), | ||
1640 | DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), | ||
1641 | }, | ||
1642 | }, | ||
1622 | {} | 1643 | {} |
1623 | }; | 1644 | }; |
1624 | 1645 | ||
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b553..a84ac7b570e6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c | |||
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { | |||
145 | extern char __vsyscall_0; | 145 | extern char __vsyscall_0; |
146 | const unsigned char *const *find_nop_table(void) | 146 | const unsigned char *const *find_nop_table(void) |
147 | { | 147 | { |
148 | return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || | 148 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && |
149 | boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; | 149 | boot_cpu_has(X86_FEATURE_NOPL)) |
150 | return p6_nops; | ||
151 | else | ||
152 | return k8_nops; | ||
150 | } | 153 | } |
151 | 154 | ||
152 | #else /* CONFIG_X86_64 */ | 155 | #else /* CONFIG_X86_64 */ |
153 | 156 | ||
154 | static const struct nop { | ||
155 | int cpuid; | ||
156 | const unsigned char *const *noptable; | ||
157 | } noptypes[] = { | ||
158 | { X86_FEATURE_K8, k8_nops }, | ||
159 | { X86_FEATURE_K7, k7_nops }, | ||
160 | { X86_FEATURE_P4, p6_nops }, | ||
161 | { X86_FEATURE_P3, p6_nops }, | ||
162 | { -1, NULL } | ||
163 | }; | ||
164 | |||
165 | const unsigned char *const *find_nop_table(void) | 157 | const unsigned char *const *find_nop_table(void) |
166 | { | 158 | { |
167 | const unsigned char *const *noptable = intel_nops; | 159 | if (boot_cpu_has(X86_FEATURE_K8)) |
168 | int i; | 160 | return k8_nops; |
169 | 161 | else if (boot_cpu_has(X86_FEATURE_K7)) | |
170 | for (i = 0; noptypes[i].cpuid >= 0; i++) { | 162 | return k7_nops; |
171 | if (boot_cpu_has(noptypes[i].cpuid)) { | 163 | else if (boot_cpu_has(X86_FEATURE_NOPL)) |
172 | noptable = noptypes[i].noptable; | 164 | return p6_nops; |
173 | break; | 165 | else |
174 | } | 166 | return intel_nops; |
175 | } | ||
176 | return noptable; | ||
177 | } | 167 | } |
178 | 168 | ||
179 | #endif /* CONFIG_X86_64 */ | 169 | #endif /* CONFIG_X86_64 */ |
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) | |||
241 | continue; | 231 | continue; |
242 | if (*ptr > text_end) | 232 | if (*ptr > text_end) |
243 | continue; | 233 | continue; |
244 | text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ | 234 | /* turn DS segment override prefix into lock prefix */ |
235 | text_poke(*ptr, ((unsigned char []){0xf0}), 1); | ||
245 | }; | 236 | }; |
246 | } | 237 | } |
247 | 238 | ||
248 | static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) | 239 | static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) |
249 | { | 240 | { |
250 | u8 **ptr; | 241 | u8 **ptr; |
251 | char insn[1]; | ||
252 | 242 | ||
253 | if (noreplace_smp) | 243 | if (noreplace_smp) |
254 | return; | 244 | return; |
255 | 245 | ||
256 | add_nops(insn, 1); | ||
257 | for (ptr = start; ptr < end; ptr++) { | 246 | for (ptr = start; ptr < end; ptr++) { |
258 | if (*ptr < text) | 247 | if (*ptr < text) |
259 | continue; | 248 | continue; |
260 | if (*ptr > text_end) | 249 | if (*ptr > text_end) |
261 | continue; | 250 | continue; |
262 | text_poke(*ptr, insn, 1); | 251 | /* turn lock prefix into DS segment override prefix */ |
252 | text_poke(*ptr, ((unsigned char []){0x3E}), 1); | ||
263 | }; | 253 | }; |
264 | } | 254 | } |
265 | 255 | ||
@@ -454,7 +444,7 @@ void __init alternative_instructions(void) | |||
454 | _text, _etext); | 444 | _text, _etext); |
455 | 445 | ||
456 | /* Only switch to UP mode if we don't immediately boot others */ | 446 | /* Only switch to UP mode if we don't immediately boot others */ |
457 | if (num_possible_cpus() == 1 || setup_max_cpus <= 1) | 447 | if (num_present_cpus() == 1 || setup_max_cpus <= 1) |
458 | alternatives_smp_switch(0); | 448 | alternatives_smp_switch(0); |
459 | } | 449 | } |
460 | #endif | 450 | #endif |
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 69b4d060b21c..a8fd9ebdc8e2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c | |||
@@ -33,6 +33,10 @@ | |||
33 | 33 | ||
34 | static DEFINE_RWLOCK(amd_iommu_devtable_lock); | 34 | static DEFINE_RWLOCK(amd_iommu_devtable_lock); |
35 | 35 | ||
36 | /* A list of preallocated protection domains */ | ||
37 | static LIST_HEAD(iommu_pd_list); | ||
38 | static DEFINE_SPINLOCK(iommu_pd_list_lock); | ||
39 | |||
36 | /* | 40 | /* |
37 | * general struct to manage commands send to an IOMMU | 41 | * general struct to manage commands send to an IOMMU |
38 | */ | 42 | */ |
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu) | |||
51 | 55 | ||
52 | /**************************************************************************** | 56 | /**************************************************************************** |
53 | * | 57 | * |
58 | * Interrupt handling functions | ||
59 | * | ||
60 | ****************************************************************************/ | ||
61 | |||
62 | static void iommu_print_event(void *__evt) | ||
63 | { | ||
64 | u32 *event = __evt; | ||
65 | int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; | ||
66 | int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; | ||
67 | int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; | ||
68 | int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; | ||
69 | u64 address = (u64)(((u64)event[3]) << 32) | event[2]; | ||
70 | |||
71 | printk(KERN_ERR "AMD IOMMU: Event logged ["); | ||
72 | |||
73 | switch (type) { | ||
74 | case EVENT_TYPE_ILL_DEV: | ||
75 | printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " | ||
76 | "address=0x%016llx flags=0x%04x]\n", | ||
77 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | ||
78 | address, flags); | ||
79 | break; | ||
80 | case EVENT_TYPE_IO_FAULT: | ||
81 | printk("IO_PAGE_FAULT device=%02x:%02x.%x " | ||
82 | "domain=0x%04x address=0x%016llx flags=0x%04x]\n", | ||
83 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | ||
84 | domid, address, flags); | ||
85 | break; | ||
86 | case EVENT_TYPE_DEV_TAB_ERR: | ||
87 | printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " | ||
88 | "address=0x%016llx flags=0x%04x]\n", | ||
89 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | ||
90 | address, flags); | ||
91 | break; | ||
92 | case EVENT_TYPE_PAGE_TAB_ERR: | ||
93 | printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " | ||
94 | "domain=0x%04x address=0x%016llx flags=0x%04x]\n", | ||
95 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | ||
96 | domid, address, flags); | ||
97 | break; | ||
98 | case EVENT_TYPE_ILL_CMD: | ||
99 | printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); | ||
100 | break; | ||
101 | case EVENT_TYPE_CMD_HARD_ERR: | ||
102 | printk("COMMAND_HARDWARE_ERROR address=0x%016llx " | ||
103 | "flags=0x%04x]\n", address, flags); | ||
104 | break; | ||
105 | case EVENT_TYPE_IOTLB_INV_TO: | ||
106 | printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " | ||
107 | "address=0x%016llx]\n", | ||
108 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | ||
109 | address); | ||
110 | break; | ||
111 | case EVENT_TYPE_INV_DEV_REQ: | ||
112 | printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " | ||
113 | "address=0x%016llx flags=0x%04x]\n", | ||
114 | PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), | ||
115 | address, flags); | ||
116 | break; | ||
117 | default: | ||
118 | printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static void iommu_poll_events(struct amd_iommu *iommu) | ||
123 | { | ||
124 | u32 head, tail; | ||
125 | unsigned long flags; | ||
126 | |||
127 | spin_lock_irqsave(&iommu->lock, flags); | ||
128 | |||
129 | head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); | ||
130 | tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); | ||
131 | |||
132 | while (head != tail) { | ||
133 | iommu_print_event(iommu->evt_buf + head); | ||
134 | head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; | ||
135 | } | ||
136 | |||
137 | writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); | ||
138 | |||
139 | spin_unlock_irqrestore(&iommu->lock, flags); | ||
140 | } | ||
141 | |||
142 | irqreturn_t amd_iommu_int_handler(int irq, void *data) | ||
143 | { | ||
144 | struct amd_iommu *iommu; | ||
145 | |||
146 | list_for_each_entry(iommu, &amd_iommu_list, list) | ||
147 | iommu_poll_events(iommu); | ||
148 | |||
149 | return IRQ_HANDLED; | ||
150 | } | ||
151 | |||
152 | /**************************************************************************** | ||
153 | * | ||
54 | * IOMMU command queuing functions | 154 | * IOMMU command queuing functions |
55 | * | 155 | * |
56 | ****************************************************************************/ | 156 | ****************************************************************************/ |
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) | |||
101 | */ | 201 | */ |
102 | static int iommu_completion_wait(struct amd_iommu *iommu) | 202 | static int iommu_completion_wait(struct amd_iommu *iommu) |
103 | { | 203 | { |
104 | int ret, ready = 0; | 204 | int ret = 0, ready = 0; |
105 | unsigned status = 0; | 205 | unsigned status = 0; |
106 | struct iommu_cmd cmd; | 206 | struct iommu_cmd cmd; |
107 | unsigned long i = 0; | 207 | unsigned long flags, i = 0; |
108 | 208 | ||
109 | memset(&cmd, 0, sizeof(cmd)); | 209 | memset(&cmd, 0, sizeof(cmd)); |
110 | cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; | 210 | cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; |
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu) | |||
112 | 212 | ||
113 | iommu->need_sync = 0; | 213 | iommu->need_sync = 0; |
114 | 214 | ||
115 | ret = iommu_queue_command(iommu, &cmd); | 215 | spin_lock_irqsave(&iommu->lock, flags); |
216 | |||
217 | ret = __iommu_queue_command(iommu, &cmd); | ||
116 | 218 | ||
117 | if (ret) | 219 | if (ret) |
118 | return ret; | 220 | goto out; |
119 | 221 | ||
120 | while (!ready && (i < EXIT_LOOP_COUNT)) { | 222 | while (!ready && (i < EXIT_LOOP_COUNT)) { |
121 | ++i; | 223 | ++i; |
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu) | |||
130 | 232 | ||
131 | if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) | 233 | if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) |
132 | printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); | 234 | printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); |
235 | out: | ||
236 | spin_unlock_irqrestore(&iommu->lock, flags); | ||
133 | 237 | ||
134 | return 0; | 238 | return 0; |
135 | } | 239 | } |
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) | |||
140 | static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) | 244 | static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) |
141 | { | 245 | { |
142 | struct iommu_cmd cmd; | 246 | struct iommu_cmd cmd; |
247 | int ret; | ||
143 | 248 | ||
144 | BUG_ON(iommu == NULL); | 249 | BUG_ON(iommu == NULL); |
145 | 250 | ||
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) | |||
147 | CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); | 252 | CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); |
148 | cmd.data[0] = devid; | 253 | cmd.data[0] = devid; |
149 | 254 | ||
255 | ret = iommu_queue_command(iommu, &cmd); | ||
256 | |||
150 | iommu->need_sync = 1; | 257 | iommu->need_sync = 1; |
151 | 258 | ||
152 | return iommu_queue_command(iommu, &cmd); | 259 | return ret; |
153 | } | 260 | } |
154 | 261 | ||
155 | /* | 262 | /* |
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, | |||
159 | u64 address, u16 domid, int pde, int s) | 266 | u64 address, u16 domid, int pde, int s) |
160 | { | 267 | { |
161 | struct iommu_cmd cmd; | 268 | struct iommu_cmd cmd; |
269 | int ret; | ||
162 | 270 | ||
163 | memset(&cmd, 0, sizeof(cmd)); | 271 | memset(&cmd, 0, sizeof(cmd)); |
164 | address &= PAGE_MASK; | 272 | address &= PAGE_MASK; |
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, | |||
171 | if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ | 279 | if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ |
172 | cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; | 280 | cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; |
173 | 281 | ||
282 | ret = iommu_queue_command(iommu, &cmd); | ||
283 | |||
174 | iommu->need_sync = 1; | 284 | iommu->need_sync = 1; |
175 | 285 | ||
176 | return iommu_queue_command(iommu, &cmd); | 286 | return ret; |
177 | } | 287 | } |
178 | 288 | ||
179 | /* | 289 | /* |
@@ -185,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, | |||
185 | u64 address, size_t size) | 295 | u64 address, size_t size) |
186 | { | 296 | { |
187 | int s = 0; | 297 | int s = 0; |
188 | unsigned pages = iommu_num_pages(address, size); | 298 | unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); |
189 | 299 | ||
190 | address &= PAGE_MASK; | 300 | address &= PAGE_MASK; |
191 | 301 | ||
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, | |||
203 | return 0; | 313 | return 0; |
204 | } | 314 | } |
205 | 315 | ||
316 | /* Flush the whole IO/TLB for a given protection domain */ | ||
317 | static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) | ||
318 | { | ||
319 | u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; | ||
320 | |||
321 | iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); | ||
322 | } | ||
323 | |||
206 | /**************************************************************************** | 324 | /**************************************************************************** |
207 | * | 325 | * |
208 | * The functions below are used the create the page table mappings for | 326 | * The functions below are used the create the page table mappings for |
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, | |||
362 | * efficient allocator. | 480 | * efficient allocator. |
363 | * | 481 | * |
364 | ****************************************************************************/ | 482 | ****************************************************************************/ |
365 | static unsigned long dma_mask_to_pages(unsigned long mask) | ||
366 | { | ||
367 | return (mask >> PAGE_SHIFT) + | ||
368 | (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); | ||
369 | } | ||
370 | 483 | ||
371 | /* | 484 | /* |
372 | * The address allocator core function. | 485 | * The address allocator core function. |
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask) | |||
375 | */ | 488 | */ |
376 | static unsigned long dma_ops_alloc_addresses(struct device *dev, | 489 | static unsigned long dma_ops_alloc_addresses(struct device *dev, |
377 | struct dma_ops_domain *dom, | 490 | struct dma_ops_domain *dom, |
378 | unsigned int pages) | 491 | unsigned int pages, |
492 | unsigned long align_mask, | ||
493 | u64 dma_mask) | ||
379 | { | 494 | { |
380 | unsigned long limit = dma_mask_to_pages(*dev->dma_mask); | 495 | unsigned long limit; |
381 | unsigned long address; | 496 | unsigned long address; |
382 | unsigned long size = dom->aperture_size >> PAGE_SHIFT; | ||
383 | unsigned long boundary_size; | 497 | unsigned long boundary_size; |
384 | 498 | ||
385 | boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, | 499 | boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, |
386 | PAGE_SIZE) >> PAGE_SHIFT; | 500 | PAGE_SIZE) >> PAGE_SHIFT; |
387 | limit = limit < size ? limit : size; | 501 | limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, |
502 | dma_mask >> PAGE_SHIFT); | ||
388 | 503 | ||
389 | if (dom->next_bit >= limit) | 504 | if (dom->next_bit >= limit) { |
390 | dom->next_bit = 0; | 505 | dom->next_bit = 0; |
506 | dom->need_flush = true; | ||
507 | } | ||
391 | 508 | ||
392 | address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, | 509 | address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, |
393 | 0 , boundary_size, 0); | 510 | 0 , boundary_size, align_mask); |
394 | if (address == -1) | 511 | if (address == -1) { |
395 | address = iommu_area_alloc(dom->bitmap, limit, 0, pages, | 512 | address = iommu_area_alloc(dom->bitmap, limit, 0, pages, |
396 | 0, boundary_size, 0); | 513 | 0, boundary_size, align_mask); |
514 | dom->need_flush = true; | ||
515 | } | ||
397 | 516 | ||
398 | if (likely(address != -1)) { | 517 | if (likely(address != -1)) { |
399 | dom->next_bit = address + pages; | 518 | dom->next_bit = address + pages; |
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, | |||
459 | if (start_page + pages > last_page) | 578 | if (start_page + pages > last_page) |
460 | pages = last_page - start_page; | 579 | pages = last_page - start_page; |
461 | 580 | ||
462 | set_bit_string(dom->bitmap, start_page, pages); | 581 | iommu_area_reserve(dom->bitmap, start_page, pages); |
463 | } | 582 | } |
464 | 583 | ||
465 | static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) | 584 | static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) |
@@ -553,12 +672,16 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, | |||
553 | dma_dom->bitmap[0] = 1; | 672 | dma_dom->bitmap[0] = 1; |
554 | dma_dom->next_bit = 0; | 673 | dma_dom->next_bit = 0; |
555 | 674 | ||
675 | dma_dom->need_flush = false; | ||
676 | dma_dom->target_dev = 0xffff; | ||
677 | |||
556 | /* Intialize the exclusion range if necessary */ | 678 | /* Intialize the exclusion range if necessary */ |
557 | if (iommu->exclusion_start && | 679 | if (iommu->exclusion_start && |
558 | iommu->exclusion_start < dma_dom->aperture_size) { | 680 | iommu->exclusion_start < dma_dom->aperture_size) { |
559 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; | 681 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; |
560 | int pages = iommu_num_pages(iommu->exclusion_start, | 682 | int pages = iommu_num_pages(iommu->exclusion_start, |
561 | iommu->exclusion_length); | 683 | iommu->exclusion_length, |
684 | PAGE_SIZE); | ||
562 | dma_ops_reserve_addresses(dma_dom, startpage, pages); | 685 | dma_ops_reserve_addresses(dma_dom, startpage, pages); |
563 | } | 686 | } |
564 | 687 | ||
@@ -623,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu, | |||
623 | 746 | ||
624 | u64 pte_root = virt_to_phys(domain->pt_root); | 747 | u64 pte_root = virt_to_phys(domain->pt_root); |
625 | 748 | ||
626 | pte_root |= (domain->mode & 0x07) << 9; | 749 | pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) |
627 | pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; | 750 | << DEV_ENTRY_MODE_SHIFT; |
751 | pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; | ||
628 | 752 | ||
629 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); | 753 | write_lock_irqsave(&amd_iommu_devtable_lock, flags); |
630 | amd_iommu_dev_table[devid].data[0] = pte_root; | 754 | amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); |
631 | amd_iommu_dev_table[devid].data[1] = pte_root >> 32; | 755 | amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); |
632 | amd_iommu_dev_table[devid].data[2] = domain->id; | 756 | amd_iommu_dev_table[devid].data[2] = domain->id; |
633 | 757 | ||
634 | amd_iommu_pd_table[devid] = domain; | 758 | amd_iommu_pd_table[devid] = domain; |
@@ -646,6 +770,45 @@ static void set_device_domain(struct amd_iommu *iommu, | |||
646 | *****************************************************************************/ | 770 | *****************************************************************************/ |
647 | 771 | ||
648 | /* | 772 | /* |
773 | * This function checks if the driver got a valid device from the caller to | ||
774 | * avoid dereferencing invalid pointers. | ||
775 | */ | ||
776 | static bool check_device(struct device *dev) | ||
777 | { | ||
778 | if (!dev || !dev->dma_mask) | ||
779 | return false; | ||
780 | |||
781 | return true; | ||
782 | } | ||
783 | |||
784 | /* | ||
785 | * In this function the list of preallocated protection domains is traversed to | ||
786 | * find the domain for a specific device | ||
787 | */ | ||
788 | static struct dma_ops_domain *find_protection_domain(u16 devid) | ||
789 | { | ||
790 | struct dma_ops_domain *entry, *ret = NULL; | ||
791 | unsigned long flags; | ||
792 | |||
793 | if (list_empty(&iommu_pd_list)) | ||
794 | return NULL; | ||
795 | |||
796 | spin_lock_irqsave(&iommu_pd_list_lock, flags); | ||
797 | |||
798 | list_for_each_entry(entry, &iommu_pd_list, list) { | ||
799 | if (entry->target_dev == devid) { | ||
800 | ret = entry; | ||
801 | list_del(&ret->list); | ||
802 | break; | ||
803 | } | ||
804 | } | ||
805 | |||
806 | spin_unlock_irqrestore(&iommu_pd_list_lock, flags); | ||
807 | |||
808 | return ret; | ||
809 | } | ||
810 | |||
811 | /* | ||
649 | * In the dma_ops path we only have the struct device. This function | 812 | * In the dma_ops path we only have the struct device. This function |
650 | * finds the corresponding IOMMU, the protection domain and the | 813 | * finds the corresponding IOMMU, the protection domain and the |
651 | * requestor id for a given device. | 814 | * requestor id for a given device. |
@@ -661,27 +824,30 @@ static int get_device_resources(struct device *dev, | |||
661 | struct pci_dev *pcidev; | 824 | struct pci_dev *pcidev; |
662 | u16 _bdf; | 825 | u16 _bdf; |
663 | 826 | ||
664 | BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); | 827 | *iommu = NULL; |
828 | *domain = NULL; | ||
829 | *bdf = 0xffff; | ||
830 | |||
831 | if (dev->bus != &pci_bus_type) | ||
832 | return 0; | ||
665 | 833 | ||
666 | pcidev = to_pci_dev(dev); | 834 | pcidev = to_pci_dev(dev); |
667 | _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); | 835 | _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); |
668 | 836 | ||
669 | /* device not translated by any IOMMU in the system? */ | 837 | /* device not translated by any IOMMU in the system? */ |
670 | if (_bdf > amd_iommu_last_bdf) { | 838 | if (_bdf > amd_iommu_last_bdf) |
671 | *iommu = NULL; | ||
672 | *domain = NULL; | ||
673 | *bdf = 0xffff; | ||
674 | return 0; | 839 | return 0; |
675 | } | ||
676 | 840 | ||
677 | *bdf = amd_iommu_alias_table[_bdf]; | 841 | *bdf = amd_iommu_alias_table[_bdf]; |
678 | 842 | ||
679 | *iommu = amd_iommu_rlookup_table[*bdf]; | 843 | *iommu = amd_iommu_rlookup_table[*bdf]; |
680 | if (*iommu == NULL) | 844 | if (*iommu == NULL) |
681 | return 0; | 845 | return 0; |
682 | dma_dom = (*iommu)->default_dom; | ||
683 | *domain = domain_for_device(*bdf); | 846 | *domain = domain_for_device(*bdf); |
684 | if (*domain == NULL) { | 847 | if (*domain == NULL) { |
848 | dma_dom = find_protection_domain(*bdf); | ||
849 | if (!dma_dom) | ||
850 | dma_dom = (*iommu)->default_dom; | ||
685 | *domain = &dma_dom->domain; | 851 | *domain = &dma_dom->domain; |
686 | set_device_domain(*iommu, *domain, *bdf); | 852 | set_device_domain(*iommu, *domain, *bdf); |
687 | printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " | 853 | printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " |
@@ -760,17 +926,24 @@ static dma_addr_t __map_single(struct device *dev, | |||
760 | struct dma_ops_domain *dma_dom, | 926 | struct dma_ops_domain *dma_dom, |
761 | phys_addr_t paddr, | 927 | phys_addr_t paddr, |
762 | size_t size, | 928 | size_t size, |
763 | int dir) | 929 | int dir, |
930 | bool align, | ||
931 | u64 dma_mask) | ||
764 | { | 932 | { |
765 | dma_addr_t offset = paddr & ~PAGE_MASK; | 933 | dma_addr_t offset = paddr & ~PAGE_MASK; |
766 | dma_addr_t address, start; | 934 | dma_addr_t address, start; |
767 | unsigned int pages; | 935 | unsigned int pages; |
936 | unsigned long align_mask = 0; | ||
768 | int i; | 937 | int i; |
769 | 938 | ||
770 | pages = iommu_num_pages(paddr, size); | 939 | pages = iommu_num_pages(paddr, size, PAGE_SIZE); |
771 | paddr &= PAGE_MASK; | 940 | paddr &= PAGE_MASK; |
772 | 941 | ||
773 | address = dma_ops_alloc_addresses(dev, dma_dom, pages); | 942 | if (align) |
943 | align_mask = (1UL << get_order(size)) - 1; | ||
944 | |||
945 | address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, | ||
946 | dma_mask); | ||
774 | if (unlikely(address == bad_dma_address)) | 947 | if (unlikely(address == bad_dma_address)) |
775 | goto out; | 948 | goto out; |
776 | 949 | ||
@@ -782,6 +955,12 @@ static dma_addr_t __map_single(struct device *dev, | |||
782 | } | 955 | } |
783 | address += offset; | 956 | address += offset; |
784 | 957 | ||
958 | if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { | ||
959 | iommu_flush_tlb(iommu, dma_dom->domain.id); | ||
960 | dma_dom->need_flush = false; | ||
961 | } else if (unlikely(iommu_has_npcache(iommu))) | ||
962 | iommu_flush_pages(iommu, dma_dom->domain.id, address, size); | ||
963 | |||
785 | out: | 964 | out: |
786 | return address; | 965 | return address; |
787 | } | 966 | } |
@@ -802,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, | |||
802 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) | 981 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) |
803 | return; | 982 | return; |
804 | 983 | ||
805 | pages = iommu_num_pages(dma_addr, size); | 984 | pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
806 | dma_addr &= PAGE_MASK; | 985 | dma_addr &= PAGE_MASK; |
807 | start = dma_addr; | 986 | start = dma_addr; |
808 | 987 | ||
@@ -812,6 +991,9 @@ static void __unmap_single(struct amd_iommu *iommu, | |||
812 | } | 991 | } |
813 | 992 | ||
814 | dma_ops_free_addresses(dma_dom, dma_addr, pages); | 993 | dma_ops_free_addresses(dma_dom, dma_addr, pages); |
994 | |||
995 | if (amd_iommu_unmap_flush) | ||
996 | iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); | ||
815 | } | 997 | } |
816 | 998 | ||
817 | /* | 999 | /* |
@@ -825,6 +1007,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, | |||
825 | struct protection_domain *domain; | 1007 | struct protection_domain *domain; |
826 | u16 devid; | 1008 | u16 devid; |
827 | dma_addr_t addr; | 1009 | dma_addr_t addr; |
1010 | u64 dma_mask; | ||
1011 | |||
1012 | if (!check_device(dev)) | ||
1013 | return bad_dma_address; | ||
1014 | |||
1015 | dma_mask = *dev->dma_mask; | ||
828 | 1016 | ||
829 | get_device_resources(dev, &iommu, &domain, &devid); | 1017 | get_device_resources(dev, &iommu, &domain, &devid); |
830 | 1018 | ||
@@ -833,14 +1021,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, | |||
833 | return (dma_addr_t)paddr; | 1021 | return (dma_addr_t)paddr; |
834 | 1022 | ||
835 | spin_lock_irqsave(&domain->lock, flags); | 1023 | spin_lock_irqsave(&domain->lock, flags); |
836 | addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); | 1024 | addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, |
1025 | dma_mask); | ||
837 | if (addr == bad_dma_address) | 1026 | if (addr == bad_dma_address) |
838 | goto out; | 1027 | goto out; |
839 | 1028 | ||
840 | if (iommu_has_npcache(iommu)) | 1029 | if (unlikely(iommu->need_sync)) |
841 | iommu_flush_pages(iommu, domain->id, addr, size); | ||
842 | |||
843 | if (iommu->need_sync) | ||
844 | iommu_completion_wait(iommu); | 1030 | iommu_completion_wait(iommu); |
845 | 1031 | ||
846 | out: | 1032 | out: |
@@ -860,7 +1046,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, | |||
860 | struct protection_domain *domain; | 1046 | struct protection_domain *domain; |
861 | u16 devid; | 1047 | u16 devid; |
862 | 1048 | ||
863 | if (!get_device_resources(dev, &iommu, &domain, &devid)) | 1049 | if (!check_device(dev) || |
1050 | !get_device_resources(dev, &iommu, &domain, &devid)) | ||
864 | /* device not handled by any AMD IOMMU */ | 1051 | /* device not handled by any AMD IOMMU */ |
865 | return; | 1052 | return; |
866 | 1053 | ||
@@ -868,9 +1055,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, | |||
868 | 1055 | ||
869 | __unmap_single(iommu, domain->priv, dma_addr, size, dir); | 1056 | __unmap_single(iommu, domain->priv, dma_addr, size, dir); |
870 | 1057 | ||
871 | iommu_flush_pages(iommu, domain->id, dma_addr, size); | 1058 | if (unlikely(iommu->need_sync)) |
872 | |||
873 | if (iommu->need_sync) | ||
874 | iommu_completion_wait(iommu); | 1059 | iommu_completion_wait(iommu); |
875 | 1060 | ||
876 | spin_unlock_irqrestore(&domain->lock, flags); | 1061 | spin_unlock_irqrestore(&domain->lock, flags); |
@@ -909,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, | |||
909 | struct scatterlist *s; | 1094 | struct scatterlist *s; |
910 | phys_addr_t paddr; | 1095 | phys_addr_t paddr; |
911 | int mapped_elems = 0; | 1096 | int mapped_elems = 0; |
1097 | u64 dma_mask; | ||
1098 | |||
1099 | if (!check_device(dev)) | ||
1100 | return 0; | ||
1101 | |||
1102 | dma_mask = *dev->dma_mask; | ||
912 | 1103 | ||
913 | get_device_resources(dev, &iommu, &domain, &devid); | 1104 | get_device_resources(dev, &iommu, &domain, &devid); |
914 | 1105 | ||
@@ -921,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, | |||
921 | paddr = sg_phys(s); | 1112 | paddr = sg_phys(s); |
922 | 1113 | ||
923 | s->dma_address = __map_single(dev, iommu, domain->priv, | 1114 | s->dma_address = __map_single(dev, iommu, domain->priv, |
924 | paddr, s->length, dir); | 1115 | paddr, s->length, dir, false, |
1116 | dma_mask); | ||
925 | 1117 | ||
926 | if (s->dma_address) { | 1118 | if (s->dma_address) { |
927 | s->dma_length = s->length; | 1119 | s->dma_length = s->length; |
928 | mapped_elems++; | 1120 | mapped_elems++; |
929 | } else | 1121 | } else |
930 | goto unmap; | 1122 | goto unmap; |
931 | if (iommu_has_npcache(iommu)) | ||
932 | iommu_flush_pages(iommu, domain->id, s->dma_address, | ||
933 | s->dma_length); | ||
934 | } | 1123 | } |
935 | 1124 | ||
936 | if (iommu->need_sync) | 1125 | if (unlikely(iommu->need_sync)) |
937 | iommu_completion_wait(iommu); | 1126 | iommu_completion_wait(iommu); |
938 | 1127 | ||
939 | out: | 1128 | out: |
@@ -967,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
967 | u16 devid; | 1156 | u16 devid; |
968 | int i; | 1157 | int i; |
969 | 1158 | ||
970 | if (!get_device_resources(dev, &iommu, &domain, &devid)) | 1159 | if (!check_device(dev) || |
1160 | !get_device_resources(dev, &iommu, &domain, &devid)) | ||
971 | return; | 1161 | return; |
972 | 1162 | ||
973 | spin_lock_irqsave(&domain->lock, flags); | 1163 | spin_lock_irqsave(&domain->lock, flags); |
@@ -975,12 +1165,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
975 | for_each_sg(sglist, s, nelems, i) { | 1165 | for_each_sg(sglist, s, nelems, i) { |
976 | __unmap_single(iommu, domain->priv, s->dma_address, | 1166 | __unmap_single(iommu, domain->priv, s->dma_address, |
977 | s->dma_length, dir); | 1167 | s->dma_length, dir); |
978 | iommu_flush_pages(iommu, domain->id, s->dma_address, | ||
979 | s->dma_length); | ||
980 | s->dma_address = s->dma_length = 0; | 1168 | s->dma_address = s->dma_length = 0; |
981 | } | 1169 | } |
982 | 1170 | ||
983 | if (iommu->need_sync) | 1171 | if (unlikely(iommu->need_sync)) |
984 | iommu_completion_wait(iommu); | 1172 | iommu_completion_wait(iommu); |
985 | 1173 | ||
986 | spin_unlock_irqrestore(&domain->lock, flags); | 1174 | spin_unlock_irqrestore(&domain->lock, flags); |
@@ -998,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size, | |||
998 | struct protection_domain *domain; | 1186 | struct protection_domain *domain; |
999 | u16 devid; | 1187 | u16 devid; |
1000 | phys_addr_t paddr; | 1188 | phys_addr_t paddr; |
1189 | u64 dma_mask = dev->coherent_dma_mask; | ||
1190 | |||
1191 | if (!check_device(dev)) | ||
1192 | return NULL; | ||
1001 | 1193 | ||
1194 | if (!get_device_resources(dev, &iommu, &domain, &devid)) | ||
1195 | flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); | ||
1196 | |||
1197 | flag |= __GFP_ZERO; | ||
1002 | virt_addr = (void *)__get_free_pages(flag, get_order(size)); | 1198 | virt_addr = (void *)__get_free_pages(flag, get_order(size)); |
1003 | if (!virt_addr) | 1199 | if (!virt_addr) |
1004 | return 0; | 1200 | return 0; |
1005 | 1201 | ||
1006 | memset(virt_addr, 0, size); | ||
1007 | paddr = virt_to_phys(virt_addr); | 1202 | paddr = virt_to_phys(virt_addr); |
1008 | 1203 | ||
1009 | get_device_resources(dev, &iommu, &domain, &devid); | ||
1010 | |||
1011 | if (!iommu || !domain) { | 1204 | if (!iommu || !domain) { |
1012 | *dma_addr = (dma_addr_t)paddr; | 1205 | *dma_addr = (dma_addr_t)paddr; |
1013 | return virt_addr; | 1206 | return virt_addr; |
1014 | } | 1207 | } |
1015 | 1208 | ||
1209 | if (!dma_mask) | ||
1210 | dma_mask = *dev->dma_mask; | ||
1211 | |||
1016 | spin_lock_irqsave(&domain->lock, flags); | 1212 | spin_lock_irqsave(&domain->lock, flags); |
1017 | 1213 | ||
1018 | *dma_addr = __map_single(dev, iommu, domain->priv, paddr, | 1214 | *dma_addr = __map_single(dev, iommu, domain->priv, paddr, |
1019 | size, DMA_BIDIRECTIONAL); | 1215 | size, DMA_BIDIRECTIONAL, true, dma_mask); |
1020 | 1216 | ||
1021 | if (*dma_addr == bad_dma_address) { | 1217 | if (*dma_addr == bad_dma_address) { |
1022 | free_pages((unsigned long)virt_addr, get_order(size)); | 1218 | free_pages((unsigned long)virt_addr, get_order(size)); |
@@ -1024,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size, | |||
1024 | goto out; | 1220 | goto out; |
1025 | } | 1221 | } |
1026 | 1222 | ||
1027 | if (iommu_has_npcache(iommu)) | 1223 | if (unlikely(iommu->need_sync)) |
1028 | iommu_flush_pages(iommu, domain->id, *dma_addr, size); | ||
1029 | |||
1030 | if (iommu->need_sync) | ||
1031 | iommu_completion_wait(iommu); | 1224 | iommu_completion_wait(iommu); |
1032 | 1225 | ||
1033 | out: | 1226 | out: |
@@ -1038,8 +1231,6 @@ out: | |||
1038 | 1231 | ||
1039 | /* | 1232 | /* |
1040 | * The exported free_coherent function for dma_ops. | 1233 | * The exported free_coherent function for dma_ops. |
1041 | * FIXME: fix the generic x86 DMA layer so that it actually calls that | ||
1042 | * function. | ||
1043 | */ | 1234 | */ |
1044 | static void free_coherent(struct device *dev, size_t size, | 1235 | static void free_coherent(struct device *dev, size_t size, |
1045 | void *virt_addr, dma_addr_t dma_addr) | 1236 | void *virt_addr, dma_addr_t dma_addr) |
@@ -1049,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size, | |||
1049 | struct protection_domain *domain; | 1240 | struct protection_domain *domain; |
1050 | u16 devid; | 1241 | u16 devid; |
1051 | 1242 | ||
1243 | if (!check_device(dev)) | ||
1244 | return; | ||
1245 | |||
1052 | get_device_resources(dev, &iommu, &domain, &devid); | 1246 | get_device_resources(dev, &iommu, &domain, &devid); |
1053 | 1247 | ||
1054 | if (!iommu || !domain) | 1248 | if (!iommu || !domain) |
@@ -1057,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size, | |||
1057 | spin_lock_irqsave(&domain->lock, flags); | 1251 | spin_lock_irqsave(&domain->lock, flags); |
1058 | 1252 | ||
1059 | __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); | 1253 | __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); |
1060 | iommu_flush_pages(iommu, domain->id, dma_addr, size); | ||
1061 | 1254 | ||
1062 | if (iommu->need_sync) | 1255 | if (unlikely(iommu->need_sync)) |
1063 | iommu_completion_wait(iommu); | 1256 | iommu_completion_wait(iommu); |
1064 | 1257 | ||
1065 | spin_unlock_irqrestore(&domain->lock, flags); | 1258 | spin_unlock_irqrestore(&domain->lock, flags); |
@@ -1069,6 +1262,30 @@ free_mem: | |||
1069 | } | 1262 | } |
1070 | 1263 | ||
1071 | /* | 1264 | /* |
1265 | * This function is called by the DMA layer to find out if we can handle a | ||
1266 | * particular device. It is part of the dma_ops. | ||
1267 | */ | ||
1268 | static int amd_iommu_dma_supported(struct device *dev, u64 mask) | ||
1269 | { | ||
1270 | u16 bdf; | ||
1271 | struct pci_dev *pcidev; | ||
1272 | |||
1273 | /* No device or no PCI device */ | ||
1274 | if (!dev || dev->bus != &pci_bus_type) | ||
1275 | return 0; | ||
1276 | |||
1277 | pcidev = to_pci_dev(dev); | ||
1278 | |||
1279 | bdf = calc_devid(pcidev->bus->number, pcidev->devfn); | ||
1280 | |||
1281 | /* Out of our scope? */ | ||
1282 | if (bdf > amd_iommu_last_bdf) | ||
1283 | return 0; | ||
1284 | |||
1285 | return 1; | ||
1286 | } | ||
1287 | |||
1288 | /* | ||
1072 | * The function for pre-allocating protection domains. | 1289 | * The function for pre-allocating protection domains. |
1073 | * | 1290 | * |
1074 | * If the driver core informs the DMA layer if a driver grabs a device | 1291 | * If the driver core informs the DMA layer if a driver grabs a device |
@@ -1097,10 +1314,9 @@ void prealloc_protection_domains(void) | |||
1097 | if (!dma_dom) | 1314 | if (!dma_dom) |
1098 | continue; | 1315 | continue; |
1099 | init_unity_mappings_for_device(dma_dom, devid); | 1316 | init_unity_mappings_for_device(dma_dom, devid); |
1100 | set_device_domain(iommu, &dma_dom->domain, devid); | 1317 | dma_dom->target_dev = devid; |
1101 | printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", | 1318 | |
1102 | dma_dom->domain.id); | 1319 | list_add_tail(&dma_dom->list, &iommu_pd_list); |
1103 | print_devid(devid, 1); | ||
1104 | } | 1320 | } |
1105 | } | 1321 | } |
1106 | 1322 | ||
@@ -1111,6 +1327,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { | |||
1111 | .unmap_single = unmap_single, | 1327 | .unmap_single = unmap_single, |
1112 | .map_sg = map_sg, | 1328 | .map_sg = map_sg, |
1113 | .unmap_sg = unmap_sg, | 1329 | .unmap_sg = unmap_sg, |
1330 | .dma_supported = amd_iommu_dma_supported, | ||
1114 | }; | 1331 | }; |
1115 | 1332 | ||
1116 | /* | 1333 | /* |
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index a69cc0f52042..4cd8083c58be 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/gfp.h> | 22 | #include <linux/gfp.h> |
23 | #include <linux/list.h> | 23 | #include <linux/list.h> |
24 | #include <linux/sysdev.h> | 24 | #include <linux/sysdev.h> |
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/msi.h> | ||
25 | #include <asm/pci-direct.h> | 27 | #include <asm/pci-direct.h> |
26 | #include <asm/amd_iommu_types.h> | 28 | #include <asm/amd_iommu_types.h> |
27 | #include <asm/amd_iommu.h> | 29 | #include <asm/amd_iommu.h> |
@@ -30,7 +32,6 @@ | |||
30 | /* | 32 | /* |
31 | * definitions for the ACPI scanning code | 33 | * definitions for the ACPI scanning code |
32 | */ | 34 | */ |
33 | #define PCI_BUS(x) (((x) >> 8) & 0xff) | ||
34 | #define IVRS_HEADER_LENGTH 48 | 35 | #define IVRS_HEADER_LENGTH 48 |
35 | 36 | ||
36 | #define ACPI_IVHD_TYPE 0x10 | 37 | #define ACPI_IVHD_TYPE 0x10 |
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings | |||
121 | we find in ACPI */ | 122 | we find in ACPI */ |
122 | unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ | 123 | unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ |
123 | int amd_iommu_isolate; /* if 1, device isolation is enabled */ | 124 | int amd_iommu_isolate; /* if 1, device isolation is enabled */ |
125 | bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ | ||
124 | 126 | ||
125 | LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the | 127 | LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the |
126 | system */ | 128 | system */ |
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) | |||
234 | { | 236 | { |
235 | u32 ctrl; | 237 | u32 ctrl; |
236 | 238 | ||
237 | ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); | 239 | ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); |
238 | ctrl &= ~(1 << bit); | 240 | ctrl &= ~(1 << bit); |
239 | writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); | 241 | writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); |
240 | } | 242 | } |
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) | |||
242 | /* Function to enable the hardware */ | 244 | /* Function to enable the hardware */ |
243 | void __init iommu_enable(struct amd_iommu *iommu) | 245 | void __init iommu_enable(struct amd_iommu *iommu) |
244 | { | 246 | { |
245 | printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); | 247 | printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " |
246 | print_devid(iommu->devid, 0); | 248 | "at %02x:%02x.%x cap 0x%hx\n", |
247 | printk(" cap 0x%hx\n", iommu->cap_ptr); | 249 | iommu->dev->bus->number, |
250 | PCI_SLOT(iommu->dev->devfn), | ||
251 | PCI_FUNC(iommu->dev->devfn), | ||
252 | iommu->cap_ptr); | ||
248 | 253 | ||
249 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); | 254 | iommu_feature_enable(iommu, CONTROL_IOMMU_EN); |
250 | } | 255 | } |
251 | 256 | ||
257 | /* Function to enable IOMMU event logging and event interrupts */ | ||
258 | void __init iommu_enable_event_logging(struct amd_iommu *iommu) | ||
259 | { | ||
260 | iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); | ||
261 | iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); | ||
262 | } | ||
263 | |||
252 | /* | 264 | /* |
253 | * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in | 265 | * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in |
254 | * the system has one. | 266 | * the system has one. |
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) | |||
286 | ****************************************************************************/ | 298 | ****************************************************************************/ |
287 | 299 | ||
288 | /* | 300 | /* |
301 | * This function calculates the length of a given IVHD entry | ||
302 | */ | ||
303 | static inline int ivhd_entry_length(u8 *ivhd) | ||
304 | { | ||
305 | return 0x04 << (*ivhd >> 6); | ||
306 | } | ||
307 | |||
308 | /* | ||
289 | * This function reads the last device id the IOMMU has to handle from the PCI | 309 | * This function reads the last device id the IOMMU has to handle from the PCI |
290 | * capability header for this IOMMU | 310 | * capability header for this IOMMU |
291 | */ | 311 | */ |
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) | |||
329 | default: | 349 | default: |
330 | break; | 350 | break; |
331 | } | 351 | } |
332 | p += 0x04 << (*p >> 6); | 352 | p += ivhd_entry_length(p); |
333 | } | 353 | } |
334 | 354 | ||
335 | WARN_ON(p != end); | 355 | WARN_ON(p != end); |
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) | |||
414 | 434 | ||
415 | static void __init free_command_buffer(struct amd_iommu *iommu) | 435 | static void __init free_command_buffer(struct amd_iommu *iommu) |
416 | { | 436 | { |
417 | free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); | 437 | free_pages((unsigned long)iommu->cmd_buf, |
438 | get_order(iommu->cmd_buf_size)); | ||
439 | } | ||
440 | |||
441 | /* allocates the memory where the IOMMU will log its events to */ | ||
442 | static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) | ||
443 | { | ||
444 | u64 entry; | ||
445 | iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, | ||
446 | get_order(EVT_BUFFER_SIZE)); | ||
447 | |||
448 | if (iommu->evt_buf == NULL) | ||
449 | return NULL; | ||
450 | |||
451 | entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; | ||
452 | memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, | ||
453 | &entry, sizeof(entry)); | ||
454 | |||
455 | iommu->evt_buf_size = EVT_BUFFER_SIZE; | ||
456 | |||
457 | return iommu->evt_buf; | ||
458 | } | ||
459 | |||
460 | static void __init free_event_buffer(struct amd_iommu *iommu) | ||
461 | { | ||
462 | free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); | ||
418 | } | 463 | } |
419 | 464 | ||
420 | /* sets a specific bit in the device table entry. */ | 465 | /* sets a specific bit in the device table entry. */ |
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) | |||
487 | */ | 532 | */ |
488 | static void __init init_iommu_from_pci(struct amd_iommu *iommu) | 533 | static void __init init_iommu_from_pci(struct amd_iommu *iommu) |
489 | { | 534 | { |
490 | int bus = PCI_BUS(iommu->devid); | ||
491 | int dev = PCI_SLOT(iommu->devid); | ||
492 | int fn = PCI_FUNC(iommu->devid); | ||
493 | int cap_ptr = iommu->cap_ptr; | 535 | int cap_ptr = iommu->cap_ptr; |
494 | u32 range; | 536 | u32 range, misc; |
495 | 537 | ||
496 | iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); | 538 | pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, |
539 | &iommu->cap); | ||
540 | pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, | ||
541 | &range); | ||
542 | pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, | ||
543 | &misc); | ||
497 | 544 | ||
498 | range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); | ||
499 | iommu->first_device = calc_devid(MMIO_GET_BUS(range), | 545 | iommu->first_device = calc_devid(MMIO_GET_BUS(range), |
500 | MMIO_GET_FD(range)); | 546 | MMIO_GET_FD(range)); |
501 | iommu->last_device = calc_devid(MMIO_GET_BUS(range), | 547 | iommu->last_device = calc_devid(MMIO_GET_BUS(range), |
502 | MMIO_GET_LD(range)); | 548 | MMIO_GET_LD(range)); |
549 | iommu->evt_msi_num = MMIO_MSI_NUM(misc); | ||
503 | } | 550 | } |
504 | 551 | ||
505 | /* | 552 | /* |
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, | |||
604 | break; | 651 | break; |
605 | } | 652 | } |
606 | 653 | ||
607 | p += 0x04 << (e->type >> 6); | 654 | p += ivhd_entry_length(p); |
608 | } | 655 | } |
609 | } | 656 | } |
610 | 657 | ||
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu) | |||
622 | static void __init free_iommu_one(struct amd_iommu *iommu) | 669 | static void __init free_iommu_one(struct amd_iommu *iommu) |
623 | { | 670 | { |
624 | free_command_buffer(iommu); | 671 | free_command_buffer(iommu); |
672 | free_event_buffer(iommu); | ||
625 | iommu_unmap_mmio_space(iommu); | 673 | iommu_unmap_mmio_space(iommu); |
626 | } | 674 | } |
627 | 675 | ||
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) | |||
649 | /* | 697 | /* |
650 | * Copy data from ACPI table entry to the iommu struct | 698 | * Copy data from ACPI table entry to the iommu struct |
651 | */ | 699 | */ |
652 | iommu->devid = h->devid; | 700 | iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff); |
701 | if (!iommu->dev) | ||
702 | return 1; | ||
703 | |||
653 | iommu->cap_ptr = h->cap_ptr; | 704 | iommu->cap_ptr = h->cap_ptr; |
705 | iommu->pci_seg = h->pci_seg; | ||
654 | iommu->mmio_phys = h->mmio_phys; | 706 | iommu->mmio_phys = h->mmio_phys; |
655 | iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); | 707 | iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); |
656 | if (!iommu->mmio_base) | 708 | if (!iommu->mmio_base) |
@@ -661,11 +713,17 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) | |||
661 | if (!iommu->cmd_buf) | 713 | if (!iommu->cmd_buf) |
662 | return -ENOMEM; | 714 | return -ENOMEM; |
663 | 715 | ||
716 | iommu->evt_buf = alloc_event_buffer(iommu); | ||
717 | if (!iommu->evt_buf) | ||
718 | return -ENOMEM; | ||
719 | |||
720 | iommu->int_enabled = false; | ||
721 | |||
664 | init_iommu_from_pci(iommu); | 722 | init_iommu_from_pci(iommu); |
665 | init_iommu_from_acpi(iommu, h); | 723 | init_iommu_from_acpi(iommu, h); |
666 | init_iommu_devices(iommu); | 724 | init_iommu_devices(iommu); |
667 | 725 | ||
668 | return 0; | 726 | return pci_enable_device(iommu->dev); |
669 | } | 727 | } |
670 | 728 | ||
671 | /* | 729 | /* |
@@ -706,6 +764,95 @@ static int __init init_iommu_all(struct acpi_table_header *table) | |||
706 | 764 | ||
707 | /**************************************************************************** | 765 | /**************************************************************************** |
708 | * | 766 | * |
767 | * The following functions initialize the MSI interrupts for all IOMMUs | ||
768 | * in the system. Its a bit challenging because there could be multiple | ||
769 | * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per | ||
770 | * pci_dev. | ||
771 | * | ||
772 | ****************************************************************************/ | ||
773 | |||
774 | static int __init iommu_setup_msix(struct amd_iommu *iommu) | ||
775 | { | ||
776 | struct amd_iommu *curr; | ||
777 | struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ | ||
778 | int nvec = 0, i; | ||
779 | |||
780 | list_for_each_entry(curr, &amd_iommu_list, list) { | ||
781 | if (curr->dev == iommu->dev) { | ||
782 | entries[nvec].entry = curr->evt_msi_num; | ||
783 | entries[nvec].vector = 0; | ||
784 | curr->int_enabled = true; | ||
785 | nvec++; | ||
786 | } | ||
787 | } | ||
788 | |||
789 | if (pci_enable_msix(iommu->dev, entries, nvec)) { | ||
790 | pci_disable_msix(iommu->dev); | ||
791 | return 1; | ||
792 | } | ||
793 | |||
794 | for (i = 0; i < nvec; ++i) { | ||
795 | int r = request_irq(entries->vector, amd_iommu_int_handler, | ||
796 | IRQF_SAMPLE_RANDOM, | ||
797 | "AMD IOMMU", | ||
798 | NULL); | ||
799 | if (r) | ||
800 | goto out_free; | ||
801 | } | ||
802 | |||
803 | return 0; | ||
804 | |||
805 | out_free: | ||
806 | for (i -= 1; i >= 0; --i) | ||
807 | free_irq(entries->vector, NULL); | ||
808 | |||
809 | pci_disable_msix(iommu->dev); | ||
810 | |||
811 | return 1; | ||
812 | } | ||
813 | |||
814 | static int __init iommu_setup_msi(struct amd_iommu *iommu) | ||
815 | { | ||
816 | int r; | ||
817 | struct amd_iommu *curr; | ||
818 | |||
819 | list_for_each_entry(curr, &amd_iommu_list, list) { | ||
820 | if (curr->dev == iommu->dev) | ||
821 | curr->int_enabled = true; | ||
822 | } | ||
823 | |||
824 | |||
825 | if (pci_enable_msi(iommu->dev)) | ||
826 | return 1; | ||
827 | |||
828 | r = request_irq(iommu->dev->irq, amd_iommu_int_handler, | ||
829 | IRQF_SAMPLE_RANDOM, | ||
830 | "AMD IOMMU", | ||
831 | NULL); | ||
832 | |||
833 | if (r) { | ||
834 | pci_disable_msi(iommu->dev); | ||
835 | return 1; | ||
836 | } | ||
837 | |||
838 | return 0; | ||
839 | } | ||
840 | |||
841 | static int __init iommu_init_msi(struct amd_iommu *iommu) | ||
842 | { | ||
843 | if (iommu->int_enabled) | ||
844 | return 0; | ||
845 | |||
846 | if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) | ||
847 | return iommu_setup_msix(iommu); | ||
848 | else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) | ||
849 | return iommu_setup_msi(iommu); | ||
850 | |||
851 | return 1; | ||
852 | } | ||
853 | |||
854 | /**************************************************************************** | ||
855 | * | ||
709 | * The next functions belong to the third pass of parsing the ACPI | 856 | * The next functions belong to the third pass of parsing the ACPI |
710 | * table. In this last pass the memory mapping requirements are | 857 | * table. In this last pass the memory mapping requirements are |
711 | * gathered (like exclusion and unity mapping reanges). | 858 | * gathered (like exclusion and unity mapping reanges). |
@@ -811,7 +958,6 @@ static void init_device_table(void) | |||
811 | for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { | 958 | for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { |
812 | set_dev_entry_bit(devid, DEV_ENTRY_VALID); | 959 | set_dev_entry_bit(devid, DEV_ENTRY_VALID); |
813 | set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); | 960 | set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); |
814 | set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT); | ||
815 | } | 961 | } |
816 | } | 962 | } |
817 | 963 | ||
@@ -825,6 +971,8 @@ static void __init enable_iommus(void) | |||
825 | 971 | ||
826 | list_for_each_entry(iommu, &amd_iommu_list, list) { | 972 | list_for_each_entry(iommu, &amd_iommu_list, list) { |
827 | iommu_set_exclusion_range(iommu); | 973 | iommu_set_exclusion_range(iommu); |
974 | iommu_init_msi(iommu); | ||
975 | iommu_enable_event_logging(iommu); | ||
828 | iommu_enable(iommu); | 976 | iommu_enable(iommu); |
829 | } | 977 | } |
830 | } | 978 | } |
@@ -995,11 +1143,17 @@ int __init amd_iommu_init(void) | |||
995 | else | 1143 | else |
996 | printk("disabled\n"); | 1144 | printk("disabled\n"); |
997 | 1145 | ||
1146 | if (amd_iommu_unmap_flush) | ||
1147 | printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); | ||
1148 | else | ||
1149 | printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); | ||
1150 | |||
998 | out: | 1151 | out: |
999 | return ret; | 1152 | return ret; |
1000 | 1153 | ||
1001 | free: | 1154 | free: |
1002 | free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); | 1155 | free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, |
1156 | get_order(MAX_DOMAIN_ID/8)); | ||
1003 | 1157 | ||
1004 | free_pages((unsigned long)amd_iommu_pd_table, | 1158 | free_pages((unsigned long)amd_iommu_pd_table, |
1005 | get_order(rlookup_table_size)); | 1159 | get_order(rlookup_table_size)); |
@@ -1057,8 +1211,10 @@ void __init amd_iommu_detect(void) | |||
1057 | static int __init parse_amd_iommu_options(char *str) | 1211 | static int __init parse_amd_iommu_options(char *str) |
1058 | { | 1212 | { |
1059 | for (; *str; ++str) { | 1213 | for (; *str; ++str) { |
1060 | if (strcmp(str, "isolate") == 0) | 1214 | if (strncmp(str, "isolate", 7) == 0) |
1061 | amd_iommu_isolate = 1; | 1215 | amd_iommu_isolate = 1; |
1216 | if (strncmp(str, "fullflush", 11) == 0) | ||
1217 | amd_iommu_unmap_flush = true; | ||
1062 | } | 1218 | } |
1063 | 1219 | ||
1064 | return 1; | 1220 | return 1; |
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 44e21826db11..9a32b37ee2ee 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c | |||
@@ -455,11 +455,11 @@ out: | |||
455 | force_iommu || | 455 | force_iommu || |
456 | valid_agp || | 456 | valid_agp || |
457 | fallback_aper_force) { | 457 | fallback_aper_force) { |
458 | printk(KERN_ERR | 458 | printk(KERN_INFO |
459 | "Your BIOS doesn't leave a aperture memory hole\n"); | 459 | "Your BIOS doesn't leave a aperture memory hole\n"); |
460 | printk(KERN_ERR | 460 | printk(KERN_INFO |
461 | "Please enable the IOMMU option in the BIOS setup\n"); | 461 | "Please enable the IOMMU option in the BIOS setup\n"); |
462 | printk(KERN_ERR | 462 | printk(KERN_INFO |
463 | "This costs you %d MB of RAM\n", | 463 | "This costs you %d MB of RAM\n", |
464 | 32 << fallback_aper_order); | 464 | 32 << fallback_aper_order); |
465 | 465 | ||
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f88bd0d982b0..21c831d96af3 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c | |||
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr; | |||
60 | static int force_enable_local_apic; | 60 | static int force_enable_local_apic; |
61 | int disable_apic; | 61 | int disable_apic; |
62 | 62 | ||
63 | /* Local APIC timer verification ok */ | ||
64 | static int local_apic_timer_verify_ok; | ||
65 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ | 63 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ |
66 | static int local_apic_timer_disabled; | 64 | static int disable_apic_timer __cpuinitdata; |
67 | /* Local APIC timer works in C2 */ | 65 | /* Local APIC timer works in C2 */ |
68 | int local_apic_timer_c2_ok; | 66 | int local_apic_timer_c2_ok; |
69 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | 67 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); |
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void) | |||
130 | */ | 128 | */ |
131 | static inline int lapic_is_integrated(void) | 129 | static inline int lapic_is_integrated(void) |
132 | { | 130 | { |
131 | #ifdef CONFIG_X86_64 | ||
132 | return 1; | ||
133 | #else | ||
133 | return APIC_INTEGRATED(lapic_get_version()); | 134 | return APIC_INTEGRATED(lapic_get_version()); |
135 | #endif | ||
134 | } | 136 | } |
135 | 137 | ||
136 | /* | 138 | /* |
@@ -145,13 +147,18 @@ static int modern_apic(void) | |||
145 | return lapic_get_version() >= 0x14; | 147 | return lapic_get_version() >= 0x14; |
146 | } | 148 | } |
147 | 149 | ||
148 | void apic_wait_icr_idle(void) | 150 | /* |
151 | * Paravirt kernels also might be using these below ops. So we still | ||
152 | * use generic apic_read()/apic_write(), which might be pointing to different | ||
153 | * ops in PARAVIRT case. | ||
154 | */ | ||
155 | void xapic_wait_icr_idle(void) | ||
149 | { | 156 | { |
150 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | 157 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) |
151 | cpu_relax(); | 158 | cpu_relax(); |
152 | } | 159 | } |
153 | 160 | ||
154 | u32 safe_apic_wait_icr_idle(void) | 161 | u32 safe_xapic_wait_icr_idle(void) |
155 | { | 162 | { |
156 | u32 send_status; | 163 | u32 send_status; |
157 | int timeout; | 164 | int timeout; |
@@ -167,16 +174,48 @@ u32 safe_apic_wait_icr_idle(void) | |||
167 | return send_status; | 174 | return send_status; |
168 | } | 175 | } |
169 | 176 | ||
177 | void xapic_icr_write(u32 low, u32 id) | ||
178 | { | ||
179 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); | ||
180 | apic_write(APIC_ICR, low); | ||
181 | } | ||
182 | |||
183 | u64 xapic_icr_read(void) | ||
184 | { | ||
185 | u32 icr1, icr2; | ||
186 | |||
187 | icr2 = apic_read(APIC_ICR2); | ||
188 | icr1 = apic_read(APIC_ICR); | ||
189 | |||
190 | return icr1 | ((u64)icr2 << 32); | ||
191 | } | ||
192 | |||
193 | static struct apic_ops xapic_ops = { | ||
194 | .read = native_apic_mem_read, | ||
195 | .write = native_apic_mem_write, | ||
196 | .icr_read = xapic_icr_read, | ||
197 | .icr_write = xapic_icr_write, | ||
198 | .wait_icr_idle = xapic_wait_icr_idle, | ||
199 | .safe_wait_icr_idle = safe_xapic_wait_icr_idle, | ||
200 | }; | ||
201 | |||
202 | struct apic_ops __read_mostly *apic_ops = &xapic_ops; | ||
203 | EXPORT_SYMBOL_GPL(apic_ops); | ||
204 | |||
170 | /** | 205 | /** |
171 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 | 206 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 |
172 | */ | 207 | */ |
173 | void __cpuinit enable_NMI_through_LVT0(void) | 208 | void __cpuinit enable_NMI_through_LVT0(void) |
174 | { | 209 | { |
175 | unsigned int v = APIC_DM_NMI; | 210 | unsigned int v; |
176 | 211 | ||
177 | /* Level triggered for 82489DX */ | 212 | /* unmask and set to NMI */ |
213 | v = APIC_DM_NMI; | ||
214 | |||
215 | /* Level triggered for 82489DX (32bit mode) */ | ||
178 | if (!lapic_is_integrated()) | 216 | if (!lapic_is_integrated()) |
179 | v |= APIC_LVT_LEVEL_TRIGGER; | 217 | v |= APIC_LVT_LEVEL_TRIGGER; |
218 | |||
180 | apic_write(APIC_LVT0, v); | 219 | apic_write(APIC_LVT0, v); |
181 | } | 220 | } |
182 | 221 | ||
@@ -193,9 +232,13 @@ int get_physical_broadcast(void) | |||
193 | */ | 232 | */ |
194 | int lapic_get_maxlvt(void) | 233 | int lapic_get_maxlvt(void) |
195 | { | 234 | { |
196 | unsigned int v = apic_read(APIC_LVR); | 235 | unsigned int v; |
197 | 236 | ||
198 | /* 82489DXs do not report # of LVT entries. */ | 237 | v = apic_read(APIC_LVR); |
238 | /* | ||
239 | * - we always have APIC integrated on 64bit mode | ||
240 | * - 82489DXs do not report # of LVT entries | ||
241 | */ | ||
199 | return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; | 242 | return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; |
200 | } | 243 | } |
201 | 244 | ||
@@ -203,8 +246,12 @@ int lapic_get_maxlvt(void) | |||
203 | * Local APIC timer | 246 | * Local APIC timer |
204 | */ | 247 | */ |
205 | 248 | ||
206 | /* Clock divisor is set to 16 */ | 249 | /* Clock divisor */ |
250 | #ifdef CONFG_X86_64 | ||
251 | #define APIC_DIVISOR 1 | ||
252 | #else | ||
207 | #define APIC_DIVISOR 16 | 253 | #define APIC_DIVISOR 16 |
254 | #endif | ||
208 | 255 | ||
209 | /* | 256 | /* |
210 | * This function sets up the local APIC timer, with a timeout of | 257 | * This function sets up the local APIC timer, with a timeout of |
@@ -212,6 +259,9 @@ int lapic_get_maxlvt(void) | |||
212 | * this function twice on the boot CPU, once with a bogus timeout | 259 | * this function twice on the boot CPU, once with a bogus timeout |
213 | * value, second time for real. The other (noncalibrating) CPUs | 260 | * value, second time for real. The other (noncalibrating) CPUs |
214 | * call this function only once, with the real, calibrated value. | 261 | * call this function only once, with the real, calibrated value. |
262 | * | ||
263 | * We do reads before writes even if unnecessary, to get around the | ||
264 | * P5 APIC double write bug. | ||
215 | */ | 265 | */ |
216 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | 266 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) |
217 | { | 267 | { |
@@ -233,14 +283,48 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
233 | */ | 283 | */ |
234 | tmp_value = apic_read(APIC_TDCR); | 284 | tmp_value = apic_read(APIC_TDCR); |
235 | apic_write(APIC_TDCR, | 285 | apic_write(APIC_TDCR, |
236 | (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | | 286 | (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | |
237 | APIC_TDR_DIV_16); | 287 | APIC_TDR_DIV_16); |
238 | 288 | ||
239 | if (!oneshot) | 289 | if (!oneshot) |
240 | apic_write(APIC_TMICT, clocks / APIC_DIVISOR); | 290 | apic_write(APIC_TMICT, clocks / APIC_DIVISOR); |
241 | } | 291 | } |
242 | 292 | ||
243 | /* | 293 | /* |
294 | * Setup extended LVT, AMD specific (K8, family 10h) | ||
295 | * | ||
296 | * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and | ||
297 | * MCE interrupts are supported. Thus MCE offset must be set to 0. | ||
298 | * | ||
299 | * If mask=1, the LVT entry does not generate interrupts while mask=0 | ||
300 | * enables the vector. See also the BKDGs. | ||
301 | */ | ||
302 | |||
303 | #define APIC_EILVT_LVTOFF_MCE 0 | ||
304 | #define APIC_EILVT_LVTOFF_IBS 1 | ||
305 | |||
306 | static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) | ||
307 | { | ||
308 | unsigned long reg = (lvt_off << 4) + APIC_EILVT0; | ||
309 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; | ||
310 | |||
311 | apic_write(reg, v); | ||
312 | } | ||
313 | |||
314 | u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) | ||
315 | { | ||
316 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); | ||
317 | return APIC_EILVT_LVTOFF_MCE; | ||
318 | } | ||
319 | |||
320 | u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) | ||
321 | { | ||
322 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); | ||
323 | return APIC_EILVT_LVTOFF_IBS; | ||
324 | } | ||
325 | EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); | ||
326 | |||
327 | /* | ||
244 | * Program the next event, relative to now | 328 | * Program the next event, relative to now |
245 | */ | 329 | */ |
246 | static int lapic_next_event(unsigned long delta, | 330 | static int lapic_next_event(unsigned long delta, |
@@ -259,8 +343,8 @@ static void lapic_timer_setup(enum clock_event_mode mode, | |||
259 | unsigned long flags; | 343 | unsigned long flags; |
260 | unsigned int v; | 344 | unsigned int v; |
261 | 345 | ||
262 | /* Lapic used for broadcast ? */ | 346 | /* Lapic used as dummy for broadcast ? */ |
263 | if (!local_apic_timer_verify_ok) | 347 | if (evt->features & CLOCK_EVT_FEAT_DUMMY) |
264 | return; | 348 | return; |
265 | 349 | ||
266 | local_irq_save(flags); | 350 | local_irq_save(flags); |
@@ -473,7 +557,7 @@ static int __init calibrate_APIC_clock(void) | |||
473 | return -1; | 557 | return -1; |
474 | } | 558 | } |
475 | 559 | ||
476 | local_apic_timer_verify_ok = 1; | 560 | levt->features &= ~CLOCK_EVT_FEAT_DUMMY; |
477 | 561 | ||
478 | /* We trust the pm timer based calibration */ | 562 | /* We trust the pm timer based calibration */ |
479 | if (!pm_referenced) { | 563 | if (!pm_referenced) { |
@@ -507,11 +591,11 @@ static int __init calibrate_APIC_clock(void) | |||
507 | if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) | 591 | if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) |
508 | apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); | 592 | apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); |
509 | else | 593 | else |
510 | local_apic_timer_verify_ok = 0; | 594 | levt->features |= CLOCK_EVT_FEAT_DUMMY; |
511 | } else | 595 | } else |
512 | local_irq_enable(); | 596 | local_irq_enable(); |
513 | 597 | ||
514 | if (!local_apic_timer_verify_ok) { | 598 | if (levt->features & CLOCK_EVT_FEAT_DUMMY) { |
515 | printk(KERN_WARNING | 599 | printk(KERN_WARNING |
516 | "APIC timer disabled due to verification failure.\n"); | 600 | "APIC timer disabled due to verification failure.\n"); |
517 | return -1; | 601 | return -1; |
@@ -533,7 +617,8 @@ void __init setup_boot_APIC_clock(void) | |||
533 | * timer as a dummy clock event source on SMP systems, so the | 617 | * timer as a dummy clock event source on SMP systems, so the |
534 | * broadcast mechanism is used. On UP systems simply ignore it. | 618 | * broadcast mechanism is used. On UP systems simply ignore it. |
535 | */ | 619 | */ |
536 | if (local_apic_timer_disabled) { | 620 | if (disable_apic_timer) { |
621 | printk(KERN_INFO "Disabling APIC timer\n"); | ||
537 | /* No broadcast on UP ! */ | 622 | /* No broadcast on UP ! */ |
538 | if (num_possible_cpus() > 1) { | 623 | if (num_possible_cpus() > 1) { |
539 | lapic_clockevent.mult = 1; | 624 | lapic_clockevent.mult = 1; |
@@ -602,7 +687,11 @@ static void local_apic_timer_interrupt(void) | |||
602 | /* | 687 | /* |
603 | * the NMI deadlock-detector uses this. | 688 | * the NMI deadlock-detector uses this. |
604 | */ | 689 | */ |
690 | #ifdef CONFIG_X86_64 | ||
691 | add_pda(apic_timer_irqs, 1); | ||
692 | #else | ||
605 | per_cpu(irq_stat, cpu).apic_timer_irqs++; | 693 | per_cpu(irq_stat, cpu).apic_timer_irqs++; |
694 | #endif | ||
606 | 695 | ||
607 | evt->event_handler(evt); | 696 | evt->event_handler(evt); |
608 | } | 697 | } |
@@ -642,35 +731,6 @@ int setup_profiling_timer(unsigned int multiplier) | |||
642 | } | 731 | } |
643 | 732 | ||
644 | /* | 733 | /* |
645 | * Setup extended LVT, AMD specific (K8, family 10h) | ||
646 | * | ||
647 | * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and | ||
648 | * MCE interrupts are supported. Thus MCE offset must be set to 0. | ||
649 | */ | ||
650 | |||
651 | #define APIC_EILVT_LVTOFF_MCE 0 | ||
652 | #define APIC_EILVT_LVTOFF_IBS 1 | ||
653 | |||
654 | static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) | ||
655 | { | ||
656 | unsigned long reg = (lvt_off << 4) + APIC_EILVT0; | ||
657 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; | ||
658 | apic_write(reg, v); | ||
659 | } | ||
660 | |||
661 | u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) | ||
662 | { | ||
663 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); | ||
664 | return APIC_EILVT_LVTOFF_MCE; | ||
665 | } | ||
666 | |||
667 | u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) | ||
668 | { | ||
669 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); | ||
670 | return APIC_EILVT_LVTOFF_IBS; | ||
671 | } | ||
672 | |||
673 | /* | ||
674 | * Local APIC start and shutdown | 734 | * Local APIC start and shutdown |
675 | */ | 735 | */ |
676 | 736 | ||
@@ -715,7 +775,7 @@ void clear_local_APIC(void) | |||
715 | } | 775 | } |
716 | 776 | ||
717 | /* lets not touch this if we didn't frob it */ | 777 | /* lets not touch this if we didn't frob it */ |
718 | #ifdef CONFIG_X86_MCE_P4THERMAL | 778 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) |
719 | if (maxlvt >= 5) { | 779 | if (maxlvt >= 5) { |
720 | v = apic_read(APIC_LVTTHMR); | 780 | v = apic_read(APIC_LVTTHMR); |
721 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | 781 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); |
@@ -732,10 +792,6 @@ void clear_local_APIC(void) | |||
732 | if (maxlvt >= 4) | 792 | if (maxlvt >= 4) |
733 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); | 793 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); |
734 | 794 | ||
735 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
736 | if (maxlvt >= 5) | ||
737 | apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); | ||
738 | #endif | ||
739 | /* Integrated APIC (!82489DX) ? */ | 795 | /* Integrated APIC (!82489DX) ? */ |
740 | if (lapic_is_integrated()) { | 796 | if (lapic_is_integrated()) { |
741 | if (maxlvt > 3) | 797 | if (maxlvt > 3) |
@@ -750,7 +806,7 @@ void clear_local_APIC(void) | |||
750 | */ | 806 | */ |
751 | void disable_local_APIC(void) | 807 | void disable_local_APIC(void) |
752 | { | 808 | { |
753 | unsigned long value; | 809 | unsigned int value; |
754 | 810 | ||
755 | clear_local_APIC(); | 811 | clear_local_APIC(); |
756 | 812 | ||
@@ -762,6 +818,7 @@ void disable_local_APIC(void) | |||
762 | value &= ~APIC_SPIV_APIC_ENABLED; | 818 | value &= ~APIC_SPIV_APIC_ENABLED; |
763 | apic_write(APIC_SPIV, value); | 819 | apic_write(APIC_SPIV, value); |
764 | 820 | ||
821 | #ifdef CONFIG_X86_32 | ||
765 | /* | 822 | /* |
766 | * When LAPIC was disabled by the BIOS and enabled by the kernel, | 823 | * When LAPIC was disabled by the BIOS and enabled by the kernel, |
767 | * restore the disabled state. | 824 | * restore the disabled state. |
@@ -773,6 +830,7 @@ void disable_local_APIC(void) | |||
773 | l &= ~MSR_IA32_APICBASE_ENABLE; | 830 | l &= ~MSR_IA32_APICBASE_ENABLE; |
774 | wrmsr(MSR_IA32_APICBASE, l, h); | 831 | wrmsr(MSR_IA32_APICBASE, l, h); |
775 | } | 832 | } |
833 | #endif | ||
776 | } | 834 | } |
777 | 835 | ||
778 | /* | 836 | /* |
@@ -789,11 +847,15 @@ void lapic_shutdown(void) | |||
789 | return; | 847 | return; |
790 | 848 | ||
791 | local_irq_save(flags); | 849 | local_irq_save(flags); |
792 | clear_local_APIC(); | ||
793 | 850 | ||
794 | if (enabled_via_apicbase) | 851 | #ifdef CONFIG_X86_32 |
852 | if (!enabled_via_apicbase) | ||
853 | clear_local_APIC(); | ||
854 | else | ||
855 | #endif | ||
795 | disable_local_APIC(); | 856 | disable_local_APIC(); |
796 | 857 | ||
858 | |||
797 | local_irq_restore(flags); | 859 | local_irq_restore(flags); |
798 | } | 860 | } |
799 | 861 | ||
@@ -838,6 +900,12 @@ int __init verify_local_APIC(void) | |||
838 | */ | 900 | */ |
839 | reg0 = apic_read(APIC_ID); | 901 | reg0 = apic_read(APIC_ID); |
840 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | 902 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); |
903 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | ||
904 | reg1 = apic_read(APIC_ID); | ||
905 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
906 | apic_write(APIC_ID, reg0); | ||
907 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | ||
908 | return 0; | ||
841 | 909 | ||
842 | /* | 910 | /* |
843 | * The next two are just to see if we have sane values. | 911 | * The next two are just to see if we have sane values. |
@@ -863,14 +931,15 @@ void __init sync_Arb_IDs(void) | |||
863 | */ | 931 | */ |
864 | if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | 932 | if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) |
865 | return; | 933 | return; |
934 | |||
866 | /* | 935 | /* |
867 | * Wait for idle. | 936 | * Wait for idle. |
868 | */ | 937 | */ |
869 | apic_wait_icr_idle(); | 938 | apic_wait_icr_idle(); |
870 | 939 | ||
871 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | 940 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); |
872 | apic_write(APIC_ICR, | 941 | apic_write(APIC_ICR, APIC_DEST_ALLINC | |
873 | APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); | 942 | APIC_INT_LEVELTRIG | APIC_DM_INIT); |
874 | } | 943 | } |
875 | 944 | ||
876 | /* | 945 | /* |
@@ -878,7 +947,7 @@ void __init sync_Arb_IDs(void) | |||
878 | */ | 947 | */ |
879 | void __init init_bsp_APIC(void) | 948 | void __init init_bsp_APIC(void) |
880 | { | 949 | { |
881 | unsigned long value; | 950 | unsigned int value; |
882 | 951 | ||
883 | /* | 952 | /* |
884 | * Don't do the setup now if we have a SMP BIOS as the | 953 | * Don't do the setup now if we have a SMP BIOS as the |
@@ -899,11 +968,13 @@ void __init init_bsp_APIC(void) | |||
899 | value &= ~APIC_VECTOR_MASK; | 968 | value &= ~APIC_VECTOR_MASK; |
900 | value |= APIC_SPIV_APIC_ENABLED; | 969 | value |= APIC_SPIV_APIC_ENABLED; |
901 | 970 | ||
971 | #ifdef CONFIG_X86_32 | ||
902 | /* This bit is reserved on P4/Xeon and should be cleared */ | 972 | /* This bit is reserved on P4/Xeon and should be cleared */ |
903 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | 973 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && |
904 | (boot_cpu_data.x86 == 15)) | 974 | (boot_cpu_data.x86 == 15)) |
905 | value &= ~APIC_SPIV_FOCUS_DISABLED; | 975 | value &= ~APIC_SPIV_FOCUS_DISABLED; |
906 | else | 976 | else |
977 | #endif | ||
907 | value |= APIC_SPIV_FOCUS_DISABLED; | 978 | value |= APIC_SPIV_FOCUS_DISABLED; |
908 | value |= SPURIOUS_APIC_VECTOR; | 979 | value |= SPURIOUS_APIC_VECTOR; |
909 | apic_write(APIC_SPIV, value); | 980 | apic_write(APIC_SPIV, value); |
@@ -922,6 +993,16 @@ static void __cpuinit lapic_setup_esr(void) | |||
922 | { | 993 | { |
923 | unsigned long oldvalue, value, maxlvt; | 994 | unsigned long oldvalue, value, maxlvt; |
924 | if (lapic_is_integrated() && !esr_disable) { | 995 | if (lapic_is_integrated() && !esr_disable) { |
996 | if (esr_disable) { | ||
997 | /* | ||
998 | * Something untraceable is creating bad interrupts on | ||
999 | * secondary quads ... for the moment, just leave the | ||
1000 | * ESR disabled - we can't do anything useful with the | ||
1001 | * errors anyway - mbligh | ||
1002 | */ | ||
1003 | printk(KERN_INFO "Leaving ESR disabled.\n"); | ||
1004 | return; | ||
1005 | } | ||
925 | /* !82489DX */ | 1006 | /* !82489DX */ |
926 | maxlvt = lapic_get_maxlvt(); | 1007 | maxlvt = lapic_get_maxlvt(); |
927 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | 1008 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
@@ -942,16 +1023,7 @@ static void __cpuinit lapic_setup_esr(void) | |||
942 | "vector: 0x%08lx after: 0x%08lx\n", | 1023 | "vector: 0x%08lx after: 0x%08lx\n", |
943 | oldvalue, value); | 1024 | oldvalue, value); |
944 | } else { | 1025 | } else { |
945 | if (esr_disable) | 1026 | printk(KERN_INFO "No ESR for 82489DX.\n"); |
946 | /* | ||
947 | * Something untraceable is creating bad interrupts on | ||
948 | * secondary quads ... for the moment, just leave the | ||
949 | * ESR disabled - we can't do anything useful with the | ||
950 | * errors anyway - mbligh | ||
951 | */ | ||
952 | printk(KERN_INFO "Leaving ESR disabled.\n"); | ||
953 | else | ||
954 | printk(KERN_INFO "No ESR for 82489DX.\n"); | ||
955 | } | 1027 | } |
956 | } | 1028 | } |
957 | 1029 | ||
@@ -1089,13 +1161,17 @@ void __cpuinit setup_local_APIC(void) | |||
1089 | 1161 | ||
1090 | void __cpuinit end_local_APIC_setup(void) | 1162 | void __cpuinit end_local_APIC_setup(void) |
1091 | { | 1163 | { |
1092 | unsigned long value; | ||
1093 | |||
1094 | lapic_setup_esr(); | 1164 | lapic_setup_esr(); |
1095 | /* Disable the local apic timer */ | 1165 | |
1096 | value = apic_read(APIC_LVTT); | 1166 | #ifdef CONFIG_X86_32 |
1097 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | 1167 | { |
1098 | apic_write(APIC_LVTT, value); | 1168 | unsigned int value; |
1169 | /* Disable the local apic timer */ | ||
1170 | value = apic_read(APIC_LVTT); | ||
1171 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
1172 | apic_write(APIC_LVTT, value); | ||
1173 | } | ||
1174 | #endif | ||
1099 | 1175 | ||
1100 | setup_apic_nmi_watchdog(NULL); | 1176 | setup_apic_nmi_watchdog(NULL); |
1101 | apic_pm_activate(); | 1177 | apic_pm_activate(); |
@@ -1205,7 +1281,7 @@ void __init init_apic_mappings(void) | |||
1205 | * default configuration (or the MP table is broken). | 1281 | * default configuration (or the MP table is broken). |
1206 | */ | 1282 | */ |
1207 | if (boot_cpu_physical_apicid == -1U) | 1283 | if (boot_cpu_physical_apicid == -1U) |
1208 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 1284 | boot_cpu_physical_apicid = read_apic_id(); |
1209 | 1285 | ||
1210 | } | 1286 | } |
1211 | 1287 | ||
@@ -1242,7 +1318,7 @@ int __init APIC_init_uniprocessor(void) | |||
1242 | * might be zero if read from MP tables. Get it from LAPIC. | 1318 | * might be zero if read from MP tables. Get it from LAPIC. |
1243 | */ | 1319 | */ |
1244 | #ifdef CONFIG_CRASH_DUMP | 1320 | #ifdef CONFIG_CRASH_DUMP |
1245 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 1321 | boot_cpu_physical_apicid = read_apic_id(); |
1246 | #endif | 1322 | #endif |
1247 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); | 1323 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); |
1248 | 1324 | ||
@@ -1321,59 +1397,12 @@ void smp_error_interrupt(struct pt_regs *regs) | |||
1321 | irq_exit(); | 1397 | irq_exit(); |
1322 | } | 1398 | } |
1323 | 1399 | ||
1324 | #ifdef CONFIG_SMP | ||
1325 | void __init smp_intr_init(void) | ||
1326 | { | ||
1327 | /* | ||
1328 | * IRQ0 must be given a fixed assignment and initialized, | ||
1329 | * because it's used before the IO-APIC is set up. | ||
1330 | */ | ||
1331 | set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); | ||
1332 | |||
1333 | /* | ||
1334 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
1335 | * IPI, driven by wakeup. | ||
1336 | */ | ||
1337 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
1338 | |||
1339 | /* IPI for invalidation */ | ||
1340 | alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | ||
1341 | |||
1342 | /* IPI for generic function call */ | ||
1343 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
1344 | |||
1345 | /* IPI for single call function */ | ||
1346 | set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, | ||
1347 | call_function_single_interrupt); | ||
1348 | } | ||
1349 | #endif | ||
1350 | |||
1351 | /* | ||
1352 | * Initialize APIC interrupts | ||
1353 | */ | ||
1354 | void __init apic_intr_init(void) | ||
1355 | { | ||
1356 | #ifdef CONFIG_SMP | ||
1357 | smp_intr_init(); | ||
1358 | #endif | ||
1359 | /* self generated IPI for local APIC timer */ | ||
1360 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
1361 | |||
1362 | /* IPI vectors for APIC spurious and error interrupts */ | ||
1363 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
1364 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
1365 | |||
1366 | /* thermal monitor LVT interrupt */ | ||
1367 | #ifdef CONFIG_X86_MCE_P4THERMAL | ||
1368 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
1369 | #endif | ||
1370 | } | ||
1371 | |||
1372 | /** | 1400 | /** |
1373 | * connect_bsp_APIC - attach the APIC to the interrupt system | 1401 | * connect_bsp_APIC - attach the APIC to the interrupt system |
1374 | */ | 1402 | */ |
1375 | void __init connect_bsp_APIC(void) | 1403 | void __init connect_bsp_APIC(void) |
1376 | { | 1404 | { |
1405 | #ifdef CONFIG_X86_32 | ||
1377 | if (pic_mode) { | 1406 | if (pic_mode) { |
1378 | /* | 1407 | /* |
1379 | * Do not trust the local APIC being empty at bootup. | 1408 | * Do not trust the local APIC being empty at bootup. |
@@ -1388,6 +1417,7 @@ void __init connect_bsp_APIC(void) | |||
1388 | outb(0x70, 0x22); | 1417 | outb(0x70, 0x22); |
1389 | outb(0x01, 0x23); | 1418 | outb(0x01, 0x23); |
1390 | } | 1419 | } |
1420 | #endif | ||
1391 | enable_apic_mode(); | 1421 | enable_apic_mode(); |
1392 | } | 1422 | } |
1393 | 1423 | ||
@@ -1400,6 +1430,9 @@ void __init connect_bsp_APIC(void) | |||
1400 | */ | 1430 | */ |
1401 | void disconnect_bsp_APIC(int virt_wire_setup) | 1431 | void disconnect_bsp_APIC(int virt_wire_setup) |
1402 | { | 1432 | { |
1433 | unsigned int value; | ||
1434 | |||
1435 | #ifdef CONFIG_X86_32 | ||
1403 | if (pic_mode) { | 1436 | if (pic_mode) { |
1404 | /* | 1437 | /* |
1405 | * Put the board back into PIC mode (has an effect only on | 1438 | * Put the board back into PIC mode (has an effect only on |
@@ -1411,54 +1444,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) | |||
1411 | "entering PIC mode.\n"); | 1444 | "entering PIC mode.\n"); |
1412 | outb(0x70, 0x22); | 1445 | outb(0x70, 0x22); |
1413 | outb(0x00, 0x23); | 1446 | outb(0x00, 0x23); |
1414 | } else { | 1447 | return; |
1415 | /* Go back to Virtual Wire compatibility mode */ | 1448 | } |
1416 | unsigned long value; | 1449 | #endif |
1417 | 1450 | ||
1418 | /* For the spurious interrupt use vector F, and enable it */ | 1451 | /* Go back to Virtual Wire compatibility mode */ |
1419 | value = apic_read(APIC_SPIV); | ||
1420 | value &= ~APIC_VECTOR_MASK; | ||
1421 | value |= APIC_SPIV_APIC_ENABLED; | ||
1422 | value |= 0xf; | ||
1423 | apic_write(APIC_SPIV, value); | ||
1424 | 1452 | ||
1425 | if (!virt_wire_setup) { | 1453 | /* For the spurious interrupt use vector F, and enable it */ |
1426 | /* | 1454 | value = apic_read(APIC_SPIV); |
1427 | * For LVT0 make it edge triggered, active high, | 1455 | value &= ~APIC_VECTOR_MASK; |
1428 | * external and enabled | 1456 | value |= APIC_SPIV_APIC_ENABLED; |
1429 | */ | 1457 | value |= 0xf; |
1430 | value = apic_read(APIC_LVT0); | 1458 | apic_write(APIC_SPIV, value); |
1431 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1432 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
1433 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
1434 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
1435 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | ||
1436 | apic_write(APIC_LVT0, value); | ||
1437 | } else { | ||
1438 | /* Disable LVT0 */ | ||
1439 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
1440 | } | ||
1441 | 1459 | ||
1460 | if (!virt_wire_setup) { | ||
1442 | /* | 1461 | /* |
1443 | * For LVT1 make it edge triggered, active high, nmi and | 1462 | * For LVT0 make it edge triggered, active high, |
1444 | * enabled | 1463 | * external and enabled |
1445 | */ | 1464 | */ |
1446 | value = apic_read(APIC_LVT1); | 1465 | value = apic_read(APIC_LVT0); |
1447 | value &= ~( | 1466 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | |
1448 | APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1449 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | 1467 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | |
1450 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | 1468 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); |
1451 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | 1469 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; |
1452 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | 1470 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); |
1453 | apic_write(APIC_LVT1, value); | 1471 | apic_write(APIC_LVT0, value); |
1472 | } else { | ||
1473 | /* Disable LVT0 */ | ||
1474 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
1454 | } | 1475 | } |
1476 | |||
1477 | /* | ||
1478 | * For LVT1 make it edge triggered, active high, | ||
1479 | * nmi and enabled | ||
1480 | */ | ||
1481 | value = apic_read(APIC_LVT1); | ||
1482 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1483 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
1484 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
1485 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
1486 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | ||
1487 | apic_write(APIC_LVT1, value); | ||
1455 | } | 1488 | } |
1456 | 1489 | ||
1457 | void __cpuinit generic_processor_info(int apicid, int version) | 1490 | void __cpuinit generic_processor_info(int apicid, int version) |
1458 | { | 1491 | { |
1459 | int cpu; | 1492 | int cpu; |
1460 | cpumask_t tmp_map; | 1493 | cpumask_t tmp_map; |
1461 | physid_mask_t phys_cpu; | ||
1462 | 1494 | ||
1463 | /* | 1495 | /* |
1464 | * Validate version | 1496 | * Validate version |
@@ -1471,9 +1503,6 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1471 | } | 1503 | } |
1472 | apic_version[apicid] = version; | 1504 | apic_version[apicid] = version; |
1473 | 1505 | ||
1474 | phys_cpu = apicid_to_cpu_present(apicid); | ||
1475 | physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); | ||
1476 | |||
1477 | if (num_processors >= NR_CPUS) { | 1506 | if (num_processors >= NR_CPUS) { |
1478 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | 1507 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." |
1479 | " Processor ignored.\n", NR_CPUS); | 1508 | " Processor ignored.\n", NR_CPUS); |
@@ -1484,17 +1513,19 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1484 | cpus_complement(tmp_map, cpu_present_map); | 1513 | cpus_complement(tmp_map, cpu_present_map); |
1485 | cpu = first_cpu(tmp_map); | 1514 | cpu = first_cpu(tmp_map); |
1486 | 1515 | ||
1487 | if (apicid == boot_cpu_physical_apicid) | 1516 | physid_set(apicid, phys_cpu_present_map); |
1517 | if (apicid == boot_cpu_physical_apicid) { | ||
1488 | /* | 1518 | /* |
1489 | * x86_bios_cpu_apicid is required to have processors listed | 1519 | * x86_bios_cpu_apicid is required to have processors listed |
1490 | * in same order as logical cpu numbers. Hence the first | 1520 | * in same order as logical cpu numbers. Hence the first |
1491 | * entry is BSP, and so on. | 1521 | * entry is BSP, and so on. |
1492 | */ | 1522 | */ |
1493 | cpu = 0; | 1523 | cpu = 0; |
1494 | 1524 | } | |
1495 | if (apicid > max_physical_apicid) | 1525 | if (apicid > max_physical_apicid) |
1496 | max_physical_apicid = apicid; | 1526 | max_physical_apicid = apicid; |
1497 | 1527 | ||
1528 | #ifdef CONFIG_X86_32 | ||
1498 | /* | 1529 | /* |
1499 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | 1530 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y |
1500 | * but we need to work other dependencies like SMP_SUSPEND etc | 1531 | * but we need to work other dependencies like SMP_SUSPEND etc |
@@ -1514,7 +1545,9 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1514 | def_to_bigsmp = 1; | 1545 | def_to_bigsmp = 1; |
1515 | } | 1546 | } |
1516 | } | 1547 | } |
1517 | #ifdef CONFIG_SMP | 1548 | #endif |
1549 | |||
1550 | #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) | ||
1518 | /* are we being called early in kernel startup? */ | 1551 | /* are we being called early in kernel startup? */ |
1519 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { | 1552 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { |
1520 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | 1553 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); |
@@ -1527,6 +1560,7 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1527 | per_cpu(x86_bios_cpu_apicid, cpu) = apicid; | 1560 | per_cpu(x86_bios_cpu_apicid, cpu) = apicid; |
1528 | } | 1561 | } |
1529 | #endif | 1562 | #endif |
1563 | |||
1530 | cpu_set(cpu, cpu_possible_map); | 1564 | cpu_set(cpu, cpu_possible_map); |
1531 | cpu_set(cpu, cpu_present_map); | 1565 | cpu_set(cpu, cpu_present_map); |
1532 | } | 1566 | } |
@@ -1537,6 +1571,11 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1537 | #ifdef CONFIG_PM | 1571 | #ifdef CONFIG_PM |
1538 | 1572 | ||
1539 | static struct { | 1573 | static struct { |
1574 | /* | ||
1575 | * 'active' is true if the local APIC was enabled by us and | ||
1576 | * not the BIOS; this signifies that we are also responsible | ||
1577 | * for disabling it before entering apm/acpi suspend | ||
1578 | */ | ||
1540 | int active; | 1579 | int active; |
1541 | /* r/w apic fields */ | 1580 | /* r/w apic fields */ |
1542 | unsigned int apic_id; | 1581 | unsigned int apic_id; |
@@ -1577,7 +1616,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) | |||
1577 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | 1616 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); |
1578 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | 1617 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); |
1579 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | 1618 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); |
1580 | #ifdef CONFIG_X86_MCE_P4THERMAL | 1619 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) |
1581 | if (maxlvt >= 5) | 1620 | if (maxlvt >= 5) |
1582 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | 1621 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); |
1583 | #endif | 1622 | #endif |
@@ -1601,16 +1640,23 @@ static int lapic_resume(struct sys_device *dev) | |||
1601 | 1640 | ||
1602 | local_irq_save(flags); | 1641 | local_irq_save(flags); |
1603 | 1642 | ||
1604 | /* | 1643 | #ifdef CONFIG_X86_64 |
1605 | * Make sure the APICBASE points to the right address | 1644 | if (x2apic) |
1606 | * | 1645 | enable_x2apic(); |
1607 | * FIXME! This will be wrong if we ever support suspend on | 1646 | else |
1608 | * SMP! We'll need to do this as part of the CPU restore! | 1647 | #endif |
1609 | */ | 1648 | { |
1610 | rdmsr(MSR_IA32_APICBASE, l, h); | 1649 | /* |
1611 | l &= ~MSR_IA32_APICBASE_BASE; | 1650 | * Make sure the APICBASE points to the right address |
1612 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | 1651 | * |
1613 | wrmsr(MSR_IA32_APICBASE, l, h); | 1652 | * FIXME! This will be wrong if we ever support suspend on |
1653 | * SMP! We'll need to do this as part of the CPU restore! | ||
1654 | */ | ||
1655 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1656 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1657 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | ||
1658 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1659 | } | ||
1614 | 1660 | ||
1615 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | 1661 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); |
1616 | apic_write(APIC_ID, apic_pm_state.apic_id); | 1662 | apic_write(APIC_ID, apic_pm_state.apic_id); |
@@ -1620,7 +1666,7 @@ static int lapic_resume(struct sys_device *dev) | |||
1620 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | 1666 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); |
1621 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | 1667 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); |
1622 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | 1668 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); |
1623 | #ifdef CONFIG_X86_MCE_P4THERMAL | 1669 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) |
1624 | if (maxlvt >= 5) | 1670 | if (maxlvt >= 5) |
1625 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | 1671 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); |
1626 | #endif | 1672 | #endif |
@@ -1634,7 +1680,9 @@ static int lapic_resume(struct sys_device *dev) | |||
1634 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | 1680 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); |
1635 | apic_write(APIC_ESR, 0); | 1681 | apic_write(APIC_ESR, 0); |
1636 | apic_read(APIC_ESR); | 1682 | apic_read(APIC_ESR); |
1683 | |||
1637 | local_irq_restore(flags); | 1684 | local_irq_restore(flags); |
1685 | |||
1638 | return 0; | 1686 | return 0; |
1639 | } | 1687 | } |
1640 | 1688 | ||
@@ -1690,20 +1738,20 @@ static int __init parse_lapic(char *arg) | |||
1690 | } | 1738 | } |
1691 | early_param("lapic", parse_lapic); | 1739 | early_param("lapic", parse_lapic); |
1692 | 1740 | ||
1693 | static int __init parse_nolapic(char *arg) | 1741 | static int __init setup_disableapic(char *arg) |
1694 | { | 1742 | { |
1695 | disable_apic = 1; | 1743 | disable_apic = 1; |
1696 | setup_clear_cpu_cap(X86_FEATURE_APIC); | 1744 | setup_clear_cpu_cap(X86_FEATURE_APIC); |
1697 | return 0; | 1745 | return 0; |
1698 | } | 1746 | } |
1699 | early_param("nolapic", parse_nolapic); | 1747 | early_param("disableapic", setup_disableapic); |
1700 | 1748 | ||
1701 | static int __init parse_disable_lapic_timer(char *arg) | 1749 | /* same as disableapic, for compatibility */ |
1750 | static int __init setup_nolapic(char *arg) | ||
1702 | { | 1751 | { |
1703 | local_apic_timer_disabled = 1; | 1752 | return setup_disableapic(arg); |
1704 | return 0; | ||
1705 | } | 1753 | } |
1706 | early_param("nolapic_timer", parse_disable_lapic_timer); | 1754 | early_param("nolapic", setup_nolapic); |
1707 | 1755 | ||
1708 | static int __init parse_lapic_timer_c2_ok(char *arg) | 1756 | static int __init parse_lapic_timer_c2_ok(char *arg) |
1709 | { | 1757 | { |
@@ -1712,15 +1760,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg) | |||
1712 | } | 1760 | } |
1713 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); | 1761 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); |
1714 | 1762 | ||
1763 | static int __init parse_disable_apic_timer(char *arg) | ||
1764 | { | ||
1765 | disable_apic_timer = 1; | ||
1766 | return 0; | ||
1767 | } | ||
1768 | early_param("noapictimer", parse_disable_apic_timer); | ||
1769 | |||
1770 | static int __init parse_nolapic_timer(char *arg) | ||
1771 | { | ||
1772 | disable_apic_timer = 1; | ||
1773 | return 0; | ||
1774 | } | ||
1775 | early_param("nolapic_timer", parse_nolapic_timer); | ||
1776 | |||
1715 | static int __init apic_set_verbosity(char *arg) | 1777 | static int __init apic_set_verbosity(char *arg) |
1716 | { | 1778 | { |
1717 | if (!arg) | 1779 | if (!arg) { |
1780 | #ifdef CONFIG_X86_64 | ||
1781 | skip_ioapic_setup = 0; | ||
1782 | ioapic_force = 1; | ||
1783 | return 0; | ||
1784 | #endif | ||
1718 | return -EINVAL; | 1785 | return -EINVAL; |
1786 | } | ||
1719 | 1787 | ||
1720 | if (strcmp(arg, "debug") == 0) | 1788 | if (strcmp("debug", arg) == 0) |
1721 | apic_verbosity = APIC_DEBUG; | 1789 | apic_verbosity = APIC_DEBUG; |
1722 | else if (strcmp(arg, "verbose") == 0) | 1790 | else if (strcmp("verbose", arg) == 0) |
1723 | apic_verbosity = APIC_VERBOSE; | 1791 | apic_verbosity = APIC_VERBOSE; |
1792 | else { | ||
1793 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
1794 | " use apic=verbose or apic=debug\n", arg); | ||
1795 | return -EINVAL; | ||
1796 | } | ||
1724 | 1797 | ||
1725 | return 0; | 1798 | return 0; |
1726 | } | 1799 | } |
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 446c062e831c..94ddb69ae15e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/clockchips.h> | 27 | #include <linux/clockchips.h> |
28 | #include <linux/acpi_pmtmr.h> | 28 | #include <linux/acpi_pmtmr.h> |
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/dmar.h> | ||
30 | 31 | ||
31 | #include <asm/atomic.h> | 32 | #include <asm/atomic.h> |
32 | #include <asm/smp.h> | 33 | #include <asm/smp.h> |
@@ -39,13 +40,20 @@ | |||
39 | #include <asm/proto.h> | 40 | #include <asm/proto.h> |
40 | #include <asm/timex.h> | 41 | #include <asm/timex.h> |
41 | #include <asm/apic.h> | 42 | #include <asm/apic.h> |
43 | #include <asm/i8259.h> | ||
42 | 44 | ||
43 | #include <mach_ipi.h> | 45 | #include <mach_ipi.h> |
44 | #include <mach_apic.h> | 46 | #include <mach_apic.h> |
45 | 47 | ||
48 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ | ||
46 | static int disable_apic_timer __cpuinitdata; | 49 | static int disable_apic_timer __cpuinitdata; |
47 | static int apic_calibrate_pmtmr __initdata; | 50 | static int apic_calibrate_pmtmr __initdata; |
48 | int disable_apic; | 51 | int disable_apic; |
52 | int disable_x2apic; | ||
53 | int x2apic; | ||
54 | |||
55 | /* x2apic enabled before OS handover */ | ||
56 | int x2apic_preenabled; | ||
49 | 57 | ||
50 | /* Local APIC timer works in C2 */ | 58 | /* Local APIC timer works in C2 */ |
51 | int local_apic_timer_c2_ok; | 59 | int local_apic_timer_c2_ok; |
@@ -73,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, | |||
73 | static void lapic_timer_broadcast(cpumask_t mask); | 81 | static void lapic_timer_broadcast(cpumask_t mask); |
74 | static void apic_pm_activate(void); | 82 | static void apic_pm_activate(void); |
75 | 83 | ||
84 | /* | ||
85 | * The local apic timer can be used for any function which is CPU local. | ||
86 | */ | ||
76 | static struct clock_event_device lapic_clockevent = { | 87 | static struct clock_event_device lapic_clockevent = { |
77 | .name = "lapic", | 88 | .name = "lapic", |
78 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | 89 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |
@@ -99,11 +110,15 @@ static inline int lapic_get_version(void) | |||
99 | } | 110 | } |
100 | 111 | ||
101 | /* | 112 | /* |
102 | * Check, if the APIC is integrated or a seperate chip | 113 | * Check, if the APIC is integrated or a separate chip |
103 | */ | 114 | */ |
104 | static inline int lapic_is_integrated(void) | 115 | static inline int lapic_is_integrated(void) |
105 | { | 116 | { |
117 | #ifdef CONFIG_X86_64 | ||
106 | return 1; | 118 | return 1; |
119 | #else | ||
120 | return APIC_INTEGRATED(lapic_get_version()); | ||
121 | #endif | ||
107 | } | 122 | } |
108 | 123 | ||
109 | /* | 124 | /* |
@@ -118,13 +133,18 @@ static int modern_apic(void) | |||
118 | return lapic_get_version() >= 0x14; | 133 | return lapic_get_version() >= 0x14; |
119 | } | 134 | } |
120 | 135 | ||
121 | void apic_wait_icr_idle(void) | 136 | /* |
137 | * Paravirt kernels also might be using these below ops. So we still | ||
138 | * use generic apic_read()/apic_write(), which might be pointing to different | ||
139 | * ops in PARAVIRT case. | ||
140 | */ | ||
141 | void xapic_wait_icr_idle(void) | ||
122 | { | 142 | { |
123 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | 143 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) |
124 | cpu_relax(); | 144 | cpu_relax(); |
125 | } | 145 | } |
126 | 146 | ||
127 | u32 safe_apic_wait_icr_idle(void) | 147 | u32 safe_xapic_wait_icr_idle(void) |
128 | { | 148 | { |
129 | u32 send_status; | 149 | u32 send_status; |
130 | int timeout; | 150 | int timeout; |
@@ -140,6 +160,68 @@ u32 safe_apic_wait_icr_idle(void) | |||
140 | return send_status; | 160 | return send_status; |
141 | } | 161 | } |
142 | 162 | ||
163 | void xapic_icr_write(u32 low, u32 id) | ||
164 | { | ||
165 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); | ||
166 | apic_write(APIC_ICR, low); | ||
167 | } | ||
168 | |||
169 | u64 xapic_icr_read(void) | ||
170 | { | ||
171 | u32 icr1, icr2; | ||
172 | |||
173 | icr2 = apic_read(APIC_ICR2); | ||
174 | icr1 = apic_read(APIC_ICR); | ||
175 | |||
176 | return icr1 | ((u64)icr2 << 32); | ||
177 | } | ||
178 | |||
179 | static struct apic_ops xapic_ops = { | ||
180 | .read = native_apic_mem_read, | ||
181 | .write = native_apic_mem_write, | ||
182 | .icr_read = xapic_icr_read, | ||
183 | .icr_write = xapic_icr_write, | ||
184 | .wait_icr_idle = xapic_wait_icr_idle, | ||
185 | .safe_wait_icr_idle = safe_xapic_wait_icr_idle, | ||
186 | }; | ||
187 | |||
188 | struct apic_ops __read_mostly *apic_ops = &xapic_ops; | ||
189 | EXPORT_SYMBOL_GPL(apic_ops); | ||
190 | |||
191 | static void x2apic_wait_icr_idle(void) | ||
192 | { | ||
193 | /* no need to wait for icr idle in x2apic */ | ||
194 | return; | ||
195 | } | ||
196 | |||
197 | static u32 safe_x2apic_wait_icr_idle(void) | ||
198 | { | ||
199 | /* no need to wait for icr idle in x2apic */ | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | void x2apic_icr_write(u32 low, u32 id) | ||
204 | { | ||
205 | wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); | ||
206 | } | ||
207 | |||
208 | u64 x2apic_icr_read(void) | ||
209 | { | ||
210 | unsigned long val; | ||
211 | |||
212 | rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); | ||
213 | return val; | ||
214 | } | ||
215 | |||
216 | static struct apic_ops x2apic_ops = { | ||
217 | .read = native_apic_msr_read, | ||
218 | .write = native_apic_msr_write, | ||
219 | .icr_read = x2apic_icr_read, | ||
220 | .icr_write = x2apic_icr_write, | ||
221 | .wait_icr_idle = x2apic_wait_icr_idle, | ||
222 | .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, | ||
223 | }; | ||
224 | |||
143 | /** | 225 | /** |
144 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 | 226 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 |
145 | */ | 227 | */ |
@@ -149,6 +231,11 @@ void __cpuinit enable_NMI_through_LVT0(void) | |||
149 | 231 | ||
150 | /* unmask and set to NMI */ | 232 | /* unmask and set to NMI */ |
151 | v = APIC_DM_NMI; | 233 | v = APIC_DM_NMI; |
234 | |||
235 | /* Level triggered for 82489DX (32bit mode) */ | ||
236 | if (!lapic_is_integrated()) | ||
237 | v |= APIC_LVT_LEVEL_TRIGGER; | ||
238 | |||
152 | apic_write(APIC_LVT0, v); | 239 | apic_write(APIC_LVT0, v); |
153 | } | 240 | } |
154 | 241 | ||
@@ -157,14 +244,28 @@ void __cpuinit enable_NMI_through_LVT0(void) | |||
157 | */ | 244 | */ |
158 | int lapic_get_maxlvt(void) | 245 | int lapic_get_maxlvt(void) |
159 | { | 246 | { |
160 | unsigned int v, maxlvt; | 247 | unsigned int v; |
161 | 248 | ||
162 | v = apic_read(APIC_LVR); | 249 | v = apic_read(APIC_LVR); |
163 | maxlvt = GET_APIC_MAXLVT(v); | 250 | /* |
164 | return maxlvt; | 251 | * - we always have APIC integrated on 64bit mode |
252 | * - 82489DXs do not report # of LVT entries | ||
253 | */ | ||
254 | return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; | ||
165 | } | 255 | } |
166 | 256 | ||
167 | /* | 257 | /* |
258 | * Local APIC timer | ||
259 | */ | ||
260 | |||
261 | /* Clock divisor */ | ||
262 | #ifdef CONFG_X86_64 | ||
263 | #define APIC_DIVISOR 1 | ||
264 | #else | ||
265 | #define APIC_DIVISOR 16 | ||
266 | #endif | ||
267 | |||
268 | /* | ||
168 | * This function sets up the local APIC timer, with a timeout of | 269 | * This function sets up the local APIC timer, with a timeout of |
169 | * 'clocks' APIC bus clock. During calibration we actually call | 270 | * 'clocks' APIC bus clock. During calibration we actually call |
170 | * this function twice on the boot CPU, once with a bogus timeout | 271 | * this function twice on the boot CPU, once with a bogus timeout |
@@ -174,7 +275,6 @@ int lapic_get_maxlvt(void) | |||
174 | * We do reads before writes even if unnecessary, to get around the | 275 | * We do reads before writes even if unnecessary, to get around the |
175 | * P5 APIC double write bug. | 276 | * P5 APIC double write bug. |
176 | */ | 277 | */ |
177 | |||
178 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | 278 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) |
179 | { | 279 | { |
180 | unsigned int lvtt_value, tmp_value; | 280 | unsigned int lvtt_value, tmp_value; |
@@ -182,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
182 | lvtt_value = LOCAL_TIMER_VECTOR; | 282 | lvtt_value = LOCAL_TIMER_VECTOR; |
183 | if (!oneshot) | 283 | if (!oneshot) |
184 | lvtt_value |= APIC_LVT_TIMER_PERIODIC; | 284 | lvtt_value |= APIC_LVT_TIMER_PERIODIC; |
285 | if (!lapic_is_integrated()) | ||
286 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); | ||
287 | |||
185 | if (!irqen) | 288 | if (!irqen) |
186 | lvtt_value |= APIC_LVT_MASKED; | 289 | lvtt_value |= APIC_LVT_MASKED; |
187 | 290 | ||
@@ -191,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
191 | * Divide PICLK by 16 | 294 | * Divide PICLK by 16 |
192 | */ | 295 | */ |
193 | tmp_value = apic_read(APIC_TDCR); | 296 | tmp_value = apic_read(APIC_TDCR); |
194 | apic_write(APIC_TDCR, (tmp_value | 297 | apic_write(APIC_TDCR, |
195 | & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 298 | (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | |
196 | | APIC_TDR_DIV_16); | 299 | APIC_TDR_DIV_16); |
197 | 300 | ||
198 | if (!oneshot) | 301 | if (!oneshot) |
199 | apic_write(APIC_TMICT, clocks); | 302 | apic_write(APIC_TMICT, clocks / APIC_DIVISOR); |
200 | } | 303 | } |
201 | 304 | ||
202 | /* | 305 | /* |
@@ -204,6 +307,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | |||
204 | * | 307 | * |
205 | * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and | 308 | * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and |
206 | * MCE interrupts are supported. Thus MCE offset must be set to 0. | 309 | * MCE interrupts are supported. Thus MCE offset must be set to 0. |
310 | * | ||
311 | * If mask=1, the LVT entry does not generate interrupts while mask=0 | ||
312 | * enables the vector. See also the BKDGs. | ||
207 | */ | 313 | */ |
208 | 314 | ||
209 | #define APIC_EILVT_LVTOFF_MCE 0 | 315 | #define APIC_EILVT_LVTOFF_MCE 0 |
@@ -228,6 +334,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) | |||
228 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); | 334 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); |
229 | return APIC_EILVT_LVTOFF_IBS; | 335 | return APIC_EILVT_LVTOFF_IBS; |
230 | } | 336 | } |
337 | EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); | ||
231 | 338 | ||
232 | /* | 339 | /* |
233 | * Program the next event, relative to now | 340 | * Program the next event, relative to now |
@@ -366,7 +473,7 @@ static int __init calibrate_APIC_clock(void) | |||
366 | lapic_clockevent.min_delta_ns = | 473 | lapic_clockevent.min_delta_ns = |
367 | clockevent_delta2ns(0xF, &lapic_clockevent); | 474 | clockevent_delta2ns(0xF, &lapic_clockevent); |
368 | 475 | ||
369 | calibration_result = result / HZ; | 476 | calibration_result = (result * APIC_DIVISOR) / HZ; |
370 | 477 | ||
371 | /* | 478 | /* |
372 | * Do a sanity check on the APIC calibration result | 479 | * Do a sanity check on the APIC calibration result |
@@ -388,10 +495,10 @@ static int __init calibrate_APIC_clock(void) | |||
388 | void __init setup_boot_APIC_clock(void) | 495 | void __init setup_boot_APIC_clock(void) |
389 | { | 496 | { |
390 | /* | 497 | /* |
391 | * The local apic timer can be disabled via the kernel commandline. | 498 | * The local apic timer can be disabled via the kernel |
392 | * Register the lapic timer as a dummy clock event source on SMP | 499 | * commandline or from the CPU detection code. Register the lapic |
393 | * systems, so the broadcast mechanism is used. On UP systems simply | 500 | * timer as a dummy clock event source on SMP systems, so the |
394 | * ignore it. | 501 | * broadcast mechanism is used. On UP systems simply ignore it. |
395 | */ | 502 | */ |
396 | if (disable_apic_timer) { | 503 | if (disable_apic_timer) { |
397 | printk(KERN_INFO "Disabling APIC timer\n"); | 504 | printk(KERN_INFO "Disabling APIC timer\n"); |
@@ -403,7 +510,9 @@ void __init setup_boot_APIC_clock(void) | |||
403 | return; | 510 | return; |
404 | } | 511 | } |
405 | 512 | ||
406 | printk(KERN_INFO "Using local APIC timer interrupts.\n"); | 513 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" |
514 | "calibrating APIC timer ...\n"); | ||
515 | |||
407 | if (calibrate_APIC_clock()) { | 516 | if (calibrate_APIC_clock()) { |
408 | /* No broadcast on UP ! */ | 517 | /* No broadcast on UP ! */ |
409 | if (num_possible_cpus() > 1) | 518 | if (num_possible_cpus() > 1) |
@@ -422,6 +531,7 @@ void __init setup_boot_APIC_clock(void) | |||
422 | printk(KERN_WARNING "APIC timer registered as dummy," | 531 | printk(KERN_WARNING "APIC timer registered as dummy," |
423 | " due to nmi_watchdog=%d!\n", nmi_watchdog); | 532 | " due to nmi_watchdog=%d!\n", nmi_watchdog); |
424 | 533 | ||
534 | /* Setup the lapic or request the broadcast */ | ||
425 | setup_APIC_timer(); | 535 | setup_APIC_timer(); |
426 | } | 536 | } |
427 | 537 | ||
@@ -460,7 +570,11 @@ static void local_apic_timer_interrupt(void) | |||
460 | /* | 570 | /* |
461 | * the NMI deadlock-detector uses this. | 571 | * the NMI deadlock-detector uses this. |
462 | */ | 572 | */ |
573 | #ifdef CONFIG_X86_64 | ||
463 | add_pda(apic_timer_irqs, 1); | 574 | add_pda(apic_timer_irqs, 1); |
575 | #else | ||
576 | per_cpu(irq_stat, cpu).apic_timer_irqs++; | ||
577 | #endif | ||
464 | 578 | ||
465 | evt->event_handler(evt); | 579 | evt->event_handler(evt); |
466 | } | 580 | } |
@@ -491,6 +605,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) | |||
491 | irq_enter(); | 605 | irq_enter(); |
492 | local_apic_timer_interrupt(); | 606 | local_apic_timer_interrupt(); |
493 | irq_exit(); | 607 | irq_exit(); |
608 | |||
494 | set_irq_regs(old_regs); | 609 | set_irq_regs(old_regs); |
495 | } | 610 | } |
496 | 611 | ||
@@ -544,6 +659,13 @@ void clear_local_APIC(void) | |||
544 | apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); | 659 | apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); |
545 | } | 660 | } |
546 | 661 | ||
662 | /* lets not touch this if we didn't frob it */ | ||
663 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) | ||
664 | if (maxlvt >= 5) { | ||
665 | v = apic_read(APIC_LVTTHMR); | ||
666 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | ||
667 | } | ||
668 | #endif | ||
547 | /* | 669 | /* |
548 | * Clean APIC state for other OSs: | 670 | * Clean APIC state for other OSs: |
549 | */ | 671 | */ |
@@ -554,8 +676,14 @@ void clear_local_APIC(void) | |||
554 | apic_write(APIC_LVTERR, APIC_LVT_MASKED); | 676 | apic_write(APIC_LVTERR, APIC_LVT_MASKED); |
555 | if (maxlvt >= 4) | 677 | if (maxlvt >= 4) |
556 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); | 678 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); |
557 | apic_write(APIC_ESR, 0); | 679 | |
558 | apic_read(APIC_ESR); | 680 | /* Integrated APIC (!82489DX) ? */ |
681 | if (lapic_is_integrated()) { | ||
682 | if (maxlvt > 3) | ||
683 | /* Clear ESR due to Pentium errata 3AP and 11AP */ | ||
684 | apic_write(APIC_ESR, 0); | ||
685 | apic_read(APIC_ESR); | ||
686 | } | ||
559 | } | 687 | } |
560 | 688 | ||
561 | /** | 689 | /** |
@@ -574,8 +702,28 @@ void disable_local_APIC(void) | |||
574 | value = apic_read(APIC_SPIV); | 702 | value = apic_read(APIC_SPIV); |
575 | value &= ~APIC_SPIV_APIC_ENABLED; | 703 | value &= ~APIC_SPIV_APIC_ENABLED; |
576 | apic_write(APIC_SPIV, value); | 704 | apic_write(APIC_SPIV, value); |
705 | |||
706 | #ifdef CONFIG_X86_32 | ||
707 | /* | ||
708 | * When LAPIC was disabled by the BIOS and enabled by the kernel, | ||
709 | * restore the disabled state. | ||
710 | */ | ||
711 | if (enabled_via_apicbase) { | ||
712 | unsigned int l, h; | ||
713 | |||
714 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
715 | l &= ~MSR_IA32_APICBASE_ENABLE; | ||
716 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
717 | } | ||
718 | #endif | ||
577 | } | 719 | } |
578 | 720 | ||
721 | /* | ||
722 | * If Linux enabled the LAPIC against the BIOS default disable it down before | ||
723 | * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and | ||
724 | * not power-off. Additionally clear all LVT entries before disable_local_APIC | ||
725 | * for the case where Linux didn't enable the LAPIC. | ||
726 | */ | ||
579 | void lapic_shutdown(void) | 727 | void lapic_shutdown(void) |
580 | { | 728 | { |
581 | unsigned long flags; | 729 | unsigned long flags; |
@@ -585,7 +733,13 @@ void lapic_shutdown(void) | |||
585 | 733 | ||
586 | local_irq_save(flags); | 734 | local_irq_save(flags); |
587 | 735 | ||
588 | disable_local_APIC(); | 736 | #ifdef CONFIG_X86_32 |
737 | if (!enabled_via_apicbase) | ||
738 | clear_local_APIC(); | ||
739 | else | ||
740 | #endif | ||
741 | disable_local_APIC(); | ||
742 | |||
589 | 743 | ||
590 | local_irq_restore(flags); | 744 | local_irq_restore(flags); |
591 | } | 745 | } |
@@ -629,10 +783,10 @@ int __init verify_local_APIC(void) | |||
629 | /* | 783 | /* |
630 | * The ID register is read/write in a real APIC. | 784 | * The ID register is read/write in a real APIC. |
631 | */ | 785 | */ |
632 | reg0 = read_apic_id(); | 786 | reg0 = apic_read(APIC_ID); |
633 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | 787 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); |
634 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | 788 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); |
635 | reg1 = read_apic_id(); | 789 | reg1 = apic_read(APIC_ID); |
636 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | 790 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); |
637 | apic_write(APIC_ID, reg0); | 791 | apic_write(APIC_ID, reg0); |
638 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | 792 | if (reg1 != (reg0 ^ APIC_ID_MASK)) |
@@ -656,8 +810,11 @@ int __init verify_local_APIC(void) | |||
656 | */ | 810 | */ |
657 | void __init sync_Arb_IDs(void) | 811 | void __init sync_Arb_IDs(void) |
658 | { | 812 | { |
659 | /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ | 813 | /* |
660 | if (modern_apic()) | 814 | * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not |
815 | * needed on AMD. | ||
816 | */ | ||
817 | if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
661 | return; | 818 | return; |
662 | 819 | ||
663 | /* | 820 | /* |
@@ -666,8 +823,8 @@ void __init sync_Arb_IDs(void) | |||
666 | apic_wait_icr_idle(); | 823 | apic_wait_icr_idle(); |
667 | 824 | ||
668 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | 825 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); |
669 | apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | 826 | apic_write(APIC_ICR, APIC_DEST_ALLINC | |
670 | | APIC_DM_INIT); | 827 | APIC_INT_LEVELTRIG | APIC_DM_INIT); |
671 | } | 828 | } |
672 | 829 | ||
673 | /* | 830 | /* |
@@ -684,8 +841,6 @@ void __init init_bsp_APIC(void) | |||
684 | if (smp_found_config || !cpu_has_apic) | 841 | if (smp_found_config || !cpu_has_apic) |
685 | return; | 842 | return; |
686 | 843 | ||
687 | value = apic_read(APIC_LVR); | ||
688 | |||
689 | /* | 844 | /* |
690 | * Do not trust the local APIC being empty at bootup. | 845 | * Do not trust the local APIC being empty at bootup. |
691 | */ | 846 | */ |
@@ -697,7 +852,15 @@ void __init init_bsp_APIC(void) | |||
697 | value = apic_read(APIC_SPIV); | 852 | value = apic_read(APIC_SPIV); |
698 | value &= ~APIC_VECTOR_MASK; | 853 | value &= ~APIC_VECTOR_MASK; |
699 | value |= APIC_SPIV_APIC_ENABLED; | 854 | value |= APIC_SPIV_APIC_ENABLED; |
700 | value |= APIC_SPIV_FOCUS_DISABLED; | 855 | |
856 | #ifdef CONFIG_X86_32 | ||
857 | /* This bit is reserved on P4/Xeon and should be cleared */ | ||
858 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | ||
859 | (boot_cpu_data.x86 == 15)) | ||
860 | value &= ~APIC_SPIV_FOCUS_DISABLED; | ||
861 | else | ||
862 | #endif | ||
863 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
701 | value |= SPURIOUS_APIC_VECTOR; | 864 | value |= SPURIOUS_APIC_VECTOR; |
702 | apic_write(APIC_SPIV, value); | 865 | apic_write(APIC_SPIV, value); |
703 | 866 | ||
@@ -706,9 +869,50 @@ void __init init_bsp_APIC(void) | |||
706 | */ | 869 | */ |
707 | apic_write(APIC_LVT0, APIC_DM_EXTINT); | 870 | apic_write(APIC_LVT0, APIC_DM_EXTINT); |
708 | value = APIC_DM_NMI; | 871 | value = APIC_DM_NMI; |
872 | if (!lapic_is_integrated()) /* 82489DX */ | ||
873 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
709 | apic_write(APIC_LVT1, value); | 874 | apic_write(APIC_LVT1, value); |
710 | } | 875 | } |
711 | 876 | ||
877 | static void __cpuinit lapic_setup_esr(void) | ||
878 | { | ||
879 | unsigned long oldvalue, value, maxlvt; | ||
880 | if (lapic_is_integrated() && !esr_disable) { | ||
881 | if (esr_disable) { | ||
882 | /* | ||
883 | * Something untraceable is creating bad interrupts on | ||
884 | * secondary quads ... for the moment, just leave the | ||
885 | * ESR disabled - we can't do anything useful with the | ||
886 | * errors anyway - mbligh | ||
887 | */ | ||
888 | printk(KERN_INFO "Leaving ESR disabled.\n"); | ||
889 | return; | ||
890 | } | ||
891 | /* !82489DX */ | ||
892 | maxlvt = lapic_get_maxlvt(); | ||
893 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
894 | apic_write(APIC_ESR, 0); | ||
895 | oldvalue = apic_read(APIC_ESR); | ||
896 | |||
897 | /* enables sending errors */ | ||
898 | value = ERROR_APIC_VECTOR; | ||
899 | apic_write(APIC_LVTERR, value); | ||
900 | /* | ||
901 | * spec says clear errors after enabling vector. | ||
902 | */ | ||
903 | if (maxlvt > 3) | ||
904 | apic_write(APIC_ESR, 0); | ||
905 | value = apic_read(APIC_ESR); | ||
906 | if (value != oldvalue) | ||
907 | apic_printk(APIC_VERBOSE, "ESR value before enabling " | ||
908 | "vector: 0x%08lx after: 0x%08lx\n", | ||
909 | oldvalue, value); | ||
910 | } else { | ||
911 | printk(KERN_INFO "No ESR for 82489DX.\n"); | ||
912 | } | ||
913 | } | ||
914 | |||
915 | |||
712 | /** | 916 | /** |
713 | * setup_local_APIC - setup the local APIC | 917 | * setup_local_APIC - setup the local APIC |
714 | */ | 918 | */ |
@@ -814,25 +1018,143 @@ void __cpuinit setup_local_APIC(void) | |||
814 | preempt_enable(); | 1018 | preempt_enable(); |
815 | } | 1019 | } |
816 | 1020 | ||
817 | static void __cpuinit lapic_setup_esr(void) | ||
818 | { | ||
819 | unsigned maxlvt = lapic_get_maxlvt(); | ||
820 | |||
821 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR); | ||
822 | /* | ||
823 | * spec says clear errors after enabling vector. | ||
824 | */ | ||
825 | if (maxlvt > 3) | ||
826 | apic_write(APIC_ESR, 0); | ||
827 | } | ||
828 | |||
829 | void __cpuinit end_local_APIC_setup(void) | 1021 | void __cpuinit end_local_APIC_setup(void) |
830 | { | 1022 | { |
831 | lapic_setup_esr(); | 1023 | lapic_setup_esr(); |
1024 | |||
1025 | #ifdef CONFIG_X86_32 | ||
1026 | { | ||
1027 | unsigned int value; | ||
1028 | /* Disable the local apic timer */ | ||
1029 | value = apic_read(APIC_LVTT); | ||
1030 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
1031 | apic_write(APIC_LVTT, value); | ||
1032 | } | ||
1033 | #endif | ||
1034 | |||
832 | setup_apic_nmi_watchdog(NULL); | 1035 | setup_apic_nmi_watchdog(NULL); |
833 | apic_pm_activate(); | 1036 | apic_pm_activate(); |
834 | } | 1037 | } |
835 | 1038 | ||
1039 | void check_x2apic(void) | ||
1040 | { | ||
1041 | int msr, msr2; | ||
1042 | |||
1043 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | ||
1044 | |||
1045 | if (msr & X2APIC_ENABLE) { | ||
1046 | printk("x2apic enabled by BIOS, switching to x2apic ops\n"); | ||
1047 | x2apic_preenabled = x2apic = 1; | ||
1048 | apic_ops = &x2apic_ops; | ||
1049 | } | ||
1050 | } | ||
1051 | |||
1052 | void enable_x2apic(void) | ||
1053 | { | ||
1054 | int msr, msr2; | ||
1055 | |||
1056 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | ||
1057 | if (!(msr & X2APIC_ENABLE)) { | ||
1058 | printk("Enabling x2apic\n"); | ||
1059 | wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); | ||
1060 | } | ||
1061 | } | ||
1062 | |||
1063 | void enable_IR_x2apic(void) | ||
1064 | { | ||
1065 | #ifdef CONFIG_INTR_REMAP | ||
1066 | int ret; | ||
1067 | unsigned long flags; | ||
1068 | |||
1069 | if (!cpu_has_x2apic) | ||
1070 | return; | ||
1071 | |||
1072 | if (!x2apic_preenabled && disable_x2apic) { | ||
1073 | printk(KERN_INFO | ||
1074 | "Skipped enabling x2apic and Interrupt-remapping " | ||
1075 | "because of nox2apic\n"); | ||
1076 | return; | ||
1077 | } | ||
1078 | |||
1079 | if (x2apic_preenabled && disable_x2apic) | ||
1080 | panic("Bios already enabled x2apic, can't enforce nox2apic"); | ||
1081 | |||
1082 | if (!x2apic_preenabled && skip_ioapic_setup) { | ||
1083 | printk(KERN_INFO | ||
1084 | "Skipped enabling x2apic and Interrupt-remapping " | ||
1085 | "because of skipping io-apic setup\n"); | ||
1086 | return; | ||
1087 | } | ||
1088 | |||
1089 | ret = dmar_table_init(); | ||
1090 | if (ret) { | ||
1091 | printk(KERN_INFO | ||
1092 | "dmar_table_init() failed with %d:\n", ret); | ||
1093 | |||
1094 | if (x2apic_preenabled) | ||
1095 | panic("x2apic enabled by bios. But IR enabling failed"); | ||
1096 | else | ||
1097 | printk(KERN_INFO | ||
1098 | "Not enabling x2apic,Intr-remapping\n"); | ||
1099 | return; | ||
1100 | } | ||
1101 | |||
1102 | local_irq_save(flags); | ||
1103 | mask_8259A(); | ||
1104 | save_mask_IO_APIC_setup(); | ||
1105 | |||
1106 | ret = enable_intr_remapping(1); | ||
1107 | |||
1108 | if (ret && x2apic_preenabled) { | ||
1109 | local_irq_restore(flags); | ||
1110 | panic("x2apic enabled by bios. But IR enabling failed"); | ||
1111 | } | ||
1112 | |||
1113 | if (ret) | ||
1114 | goto end; | ||
1115 | |||
1116 | if (!x2apic) { | ||
1117 | x2apic = 1; | ||
1118 | apic_ops = &x2apic_ops; | ||
1119 | enable_x2apic(); | ||
1120 | } | ||
1121 | end: | ||
1122 | if (ret) | ||
1123 | /* | ||
1124 | * IR enabling failed | ||
1125 | */ | ||
1126 | restore_IO_APIC_setup(); | ||
1127 | else | ||
1128 | reinit_intr_remapped_IO_APIC(x2apic_preenabled); | ||
1129 | |||
1130 | unmask_8259A(); | ||
1131 | local_irq_restore(flags); | ||
1132 | |||
1133 | if (!ret) { | ||
1134 | if (!x2apic_preenabled) | ||
1135 | printk(KERN_INFO | ||
1136 | "Enabled x2apic and interrupt-remapping\n"); | ||
1137 | else | ||
1138 | printk(KERN_INFO | ||
1139 | "Enabled Interrupt-remapping\n"); | ||
1140 | } else | ||
1141 | printk(KERN_ERR | ||
1142 | "Failed to enable Interrupt-remapping and x2apic\n"); | ||
1143 | #else | ||
1144 | if (!cpu_has_x2apic) | ||
1145 | return; | ||
1146 | |||
1147 | if (x2apic_preenabled) | ||
1148 | panic("x2apic enabled prior OS handover," | ||
1149 | " enable CONFIG_INTR_REMAP"); | ||
1150 | |||
1151 | printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " | ||
1152 | " and x2apic\n"); | ||
1153 | #endif | ||
1154 | |||
1155 | return; | ||
1156 | } | ||
1157 | |||
836 | /* | 1158 | /* |
837 | * Detect and enable local APICs on non-SMP boards. | 1159 | * Detect and enable local APICs on non-SMP boards. |
838 | * Original code written by Keir Fraser. | 1160 | * Original code written by Keir Fraser. |
@@ -872,7 +1194,7 @@ void __init early_init_lapic_mapping(void) | |||
872 | * Fetch the APIC ID of the BSP in case we have a | 1194 | * Fetch the APIC ID of the BSP in case we have a |
873 | * default configuration (or the MP table is broken). | 1195 | * default configuration (or the MP table is broken). |
874 | */ | 1196 | */ |
875 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 1197 | boot_cpu_physical_apicid = read_apic_id(); |
876 | } | 1198 | } |
877 | 1199 | ||
878 | /** | 1200 | /** |
@@ -880,6 +1202,11 @@ void __init early_init_lapic_mapping(void) | |||
880 | */ | 1202 | */ |
881 | void __init init_apic_mappings(void) | 1203 | void __init init_apic_mappings(void) |
882 | { | 1204 | { |
1205 | if (x2apic) { | ||
1206 | boot_cpu_physical_apicid = read_apic_id(); | ||
1207 | return; | ||
1208 | } | ||
1209 | |||
883 | /* | 1210 | /* |
884 | * If no local APIC can be found then set up a fake all | 1211 | * If no local APIC can be found then set up a fake all |
885 | * zeroes page to simulate the local APIC and another | 1212 | * zeroes page to simulate the local APIC and another |
@@ -899,13 +1226,15 @@ void __init init_apic_mappings(void) | |||
899 | * Fetch the APIC ID of the BSP in case we have a | 1226 | * Fetch the APIC ID of the BSP in case we have a |
900 | * default configuration (or the MP table is broken). | 1227 | * default configuration (or the MP table is broken). |
901 | */ | 1228 | */ |
902 | boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); | 1229 | boot_cpu_physical_apicid = read_apic_id(); |
903 | } | 1230 | } |
904 | 1231 | ||
905 | /* | 1232 | /* |
906 | * This initializes the IO-APIC and APIC hardware if this is | 1233 | * This initializes the IO-APIC and APIC hardware if this is |
907 | * a UP kernel. | 1234 | * a UP kernel. |
908 | */ | 1235 | */ |
1236 | int apic_version[MAX_APICS]; | ||
1237 | |||
909 | int __init APIC_init_uniprocessor(void) | 1238 | int __init APIC_init_uniprocessor(void) |
910 | { | 1239 | { |
911 | if (disable_apic) { | 1240 | if (disable_apic) { |
@@ -918,6 +1247,9 @@ int __init APIC_init_uniprocessor(void) | |||
918 | return -1; | 1247 | return -1; |
919 | } | 1248 | } |
920 | 1249 | ||
1250 | enable_IR_x2apic(); | ||
1251 | setup_apic_routing(); | ||
1252 | |||
921 | verify_local_APIC(); | 1253 | verify_local_APIC(); |
922 | 1254 | ||
923 | connect_bsp_APIC(); | 1255 | connect_bsp_APIC(); |
@@ -1004,17 +1336,57 @@ asmlinkage void smp_error_interrupt(void) | |||
1004 | } | 1336 | } |
1005 | 1337 | ||
1006 | /** | 1338 | /** |
1007 | * * connect_bsp_APIC - attach the APIC to the interrupt system | 1339 | * connect_bsp_APIC - attach the APIC to the interrupt system |
1008 | * */ | 1340 | */ |
1009 | void __init connect_bsp_APIC(void) | 1341 | void __init connect_bsp_APIC(void) |
1010 | { | 1342 | { |
1343 | #ifdef CONFIG_X86_32 | ||
1344 | if (pic_mode) { | ||
1345 | /* | ||
1346 | * Do not trust the local APIC being empty at bootup. | ||
1347 | */ | ||
1348 | clear_local_APIC(); | ||
1349 | /* | ||
1350 | * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's | ||
1351 | * local APIC to INT and NMI lines. | ||
1352 | */ | ||
1353 | apic_printk(APIC_VERBOSE, "leaving PIC mode, " | ||
1354 | "enabling APIC mode.\n"); | ||
1355 | outb(0x70, 0x22); | ||
1356 | outb(0x01, 0x23); | ||
1357 | } | ||
1358 | #endif | ||
1011 | enable_apic_mode(); | 1359 | enable_apic_mode(); |
1012 | } | 1360 | } |
1013 | 1361 | ||
1362 | /** | ||
1363 | * disconnect_bsp_APIC - detach the APIC from the interrupt system | ||
1364 | * @virt_wire_setup: indicates whether virtual wire mode is selected | ||
1365 | * | ||
1366 | * Virtual wire mode is necessary to deliver legacy interrupts even when the | ||
1367 | * APIC is disabled. | ||
1368 | */ | ||
1014 | void disconnect_bsp_APIC(int virt_wire_setup) | 1369 | void disconnect_bsp_APIC(int virt_wire_setup) |
1015 | { | 1370 | { |
1371 | unsigned int value; | ||
1372 | |||
1373 | #ifdef CONFIG_X86_32 | ||
1374 | if (pic_mode) { | ||
1375 | /* | ||
1376 | * Put the board back into PIC mode (has an effect only on | ||
1377 | * certain older boards). Note that APIC interrupts, including | ||
1378 | * IPIs, won't work beyond this point! The only exception are | ||
1379 | * INIT IPIs. | ||
1380 | */ | ||
1381 | apic_printk(APIC_VERBOSE, "disabling APIC mode, " | ||
1382 | "entering PIC mode.\n"); | ||
1383 | outb(0x70, 0x22); | ||
1384 | outb(0x00, 0x23); | ||
1385 | return; | ||
1386 | } | ||
1387 | #endif | ||
1388 | |||
1016 | /* Go back to Virtual Wire compatibility mode */ | 1389 | /* Go back to Virtual Wire compatibility mode */ |
1017 | unsigned long value; | ||
1018 | 1390 | ||
1019 | /* For the spurious interrupt use vector F, and enable it */ | 1391 | /* For the spurious interrupt use vector F, and enable it */ |
1020 | value = apic_read(APIC_SPIV); | 1392 | value = apic_read(APIC_SPIV); |
@@ -1040,7 +1412,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) | |||
1040 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | 1412 | apic_write(APIC_LVT0, APIC_LVT_MASKED); |
1041 | } | 1413 | } |
1042 | 1414 | ||
1043 | /* For LVT1 make it edge triggered, active high, nmi and enabled */ | 1415 | /* |
1416 | * For LVT1 make it edge triggered, active high, | ||
1417 | * nmi and enabled | ||
1418 | */ | ||
1044 | value = apic_read(APIC_LVT1); | 1419 | value = apic_read(APIC_LVT1); |
1045 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | 1420 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | |
1046 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | 1421 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | |
@@ -1055,9 +1430,20 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1055 | int cpu; | 1430 | int cpu; |
1056 | cpumask_t tmp_map; | 1431 | cpumask_t tmp_map; |
1057 | 1432 | ||
1433 | /* | ||
1434 | * Validate version | ||
1435 | */ | ||
1436 | if (version == 0x0) { | ||
1437 | printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " | ||
1438 | "fixing up to 0x10. (tell your hw vendor)\n", | ||
1439 | version); | ||
1440 | version = 0x10; | ||
1441 | } | ||
1442 | apic_version[apicid] = version; | ||
1443 | |||
1058 | if (num_processors >= NR_CPUS) { | 1444 | if (num_processors >= NR_CPUS) { |
1059 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | 1445 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." |
1060 | " Processor ignored.\n", NR_CPUS); | 1446 | " Processor ignored.\n", NR_CPUS); |
1061 | return; | 1447 | return; |
1062 | } | 1448 | } |
1063 | 1449 | ||
@@ -1077,6 +1463,29 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1077 | if (apicid > max_physical_apicid) | 1463 | if (apicid > max_physical_apicid) |
1078 | max_physical_apicid = apicid; | 1464 | max_physical_apicid = apicid; |
1079 | 1465 | ||
1466 | #ifdef CONFIG_X86_32 | ||
1467 | /* | ||
1468 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | ||
1469 | * but we need to work out other dependencies like SMP_SUSPEND etc. | ||
1470 | * before this can be done without some confusion. | ||
1471 | * if (CPU_HOTPLUG_ENABLED || num_processors > 8) | ||
1472 | * - Ashok Raj <ashok.raj@intel.com> | ||
1473 | */ | ||
1474 | if (max_physical_apicid >= 8) { | ||
1475 | switch (boot_cpu_data.x86_vendor) { | ||
1476 | case X86_VENDOR_INTEL: | ||
1477 | if (!APIC_XAPIC(version)) { | ||
1478 | def_to_bigsmp = 0; | ||
1479 | break; | ||
1480 | } | ||
1481 | /* If P4 and above fall through */ | ||
1482 | case X86_VENDOR_AMD: | ||
1483 | def_to_bigsmp = 1; | ||
1484 | } | ||
1485 | } | ||
1486 | #endif | ||
1487 | |||
1488 | #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) | ||
1080 | /* are we being called early in kernel startup? */ | 1489 | /* are we being called early in kernel startup? */ |
1081 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { | 1490 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { |
1082 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | 1491 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); |
@@ -1088,20 +1497,28 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1088 | per_cpu(x86_cpu_to_apicid, cpu) = apicid; | 1497 | per_cpu(x86_cpu_to_apicid, cpu) = apicid; |
1089 | per_cpu(x86_bios_cpu_apicid, cpu) = apicid; | 1498 | per_cpu(x86_bios_cpu_apicid, cpu) = apicid; |
1090 | } | 1499 | } |
1500 | #endif | ||
1091 | 1501 | ||
1092 | cpu_set(cpu, cpu_possible_map); | 1502 | cpu_set(cpu, cpu_possible_map); |
1093 | cpu_set(cpu, cpu_present_map); | 1503 | cpu_set(cpu, cpu_present_map); |
1094 | } | 1504 | } |
1095 | 1505 | ||
1506 | int hard_smp_processor_id(void) | ||
1507 | { | ||
1508 | return read_apic_id(); | ||
1509 | } | ||
1510 | |||
1096 | /* | 1511 | /* |
1097 | * Power management | 1512 | * Power management |
1098 | */ | 1513 | */ |
1099 | #ifdef CONFIG_PM | 1514 | #ifdef CONFIG_PM |
1100 | 1515 | ||
1101 | static struct { | 1516 | static struct { |
1102 | /* 'active' is true if the local APIC was enabled by us and | 1517 | /* |
1103 | not the BIOS; this signifies that we are also responsible | 1518 | * 'active' is true if the local APIC was enabled by us and |
1104 | for disabling it before entering apm/acpi suspend */ | 1519 | * not the BIOS; this signifies that we are also responsible |
1520 | * for disabling it before entering apm/acpi suspend | ||
1521 | */ | ||
1105 | int active; | 1522 | int active; |
1106 | /* r/w apic fields */ | 1523 | /* r/w apic fields */ |
1107 | unsigned int apic_id; | 1524 | unsigned int apic_id; |
@@ -1129,7 +1546,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) | |||
1129 | 1546 | ||
1130 | maxlvt = lapic_get_maxlvt(); | 1547 | maxlvt = lapic_get_maxlvt(); |
1131 | 1548 | ||
1132 | apic_pm_state.apic_id = read_apic_id(); | 1549 | apic_pm_state.apic_id = apic_read(APIC_ID); |
1133 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | 1550 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); |
1134 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | 1551 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); |
1135 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | 1552 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); |
@@ -1142,10 +1559,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) | |||
1142 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | 1559 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); |
1143 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | 1560 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); |
1144 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | 1561 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); |
1145 | #ifdef CONFIG_X86_MCE_INTEL | 1562 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) |
1146 | if (maxlvt >= 5) | 1563 | if (maxlvt >= 5) |
1147 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | 1564 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); |
1148 | #endif | 1565 | #endif |
1566 | |||
1149 | local_irq_save(flags); | 1567 | local_irq_save(flags); |
1150 | disable_local_APIC(); | 1568 | disable_local_APIC(); |
1151 | local_irq_restore(flags); | 1569 | local_irq_restore(flags); |
@@ -1164,10 +1582,25 @@ static int lapic_resume(struct sys_device *dev) | |||
1164 | maxlvt = lapic_get_maxlvt(); | 1582 | maxlvt = lapic_get_maxlvt(); |
1165 | 1583 | ||
1166 | local_irq_save(flags); | 1584 | local_irq_save(flags); |
1167 | rdmsr(MSR_IA32_APICBASE, l, h); | 1585 | |
1168 | l &= ~MSR_IA32_APICBASE_BASE; | 1586 | #ifdef CONFIG_X86_64 |
1169 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | 1587 | if (x2apic) |
1170 | wrmsr(MSR_IA32_APICBASE, l, h); | 1588 | enable_x2apic(); |
1589 | else | ||
1590 | #endif | ||
1591 | { | ||
1592 | /* | ||
1593 | * Make sure the APICBASE points to the right address | ||
1594 | * | ||
1595 | * FIXME! This will be wrong if we ever support suspend on | ||
1596 | * SMP! We'll need to do this as part of the CPU restore! | ||
1597 | */ | ||
1598 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1599 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1600 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | ||
1601 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1602 | } | ||
1603 | |||
1171 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | 1604 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); |
1172 | apic_write(APIC_ID, apic_pm_state.apic_id); | 1605 | apic_write(APIC_ID, apic_pm_state.apic_id); |
1173 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | 1606 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); |
@@ -1176,7 +1609,7 @@ static int lapic_resume(struct sys_device *dev) | |||
1176 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | 1609 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); |
1177 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | 1610 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); |
1178 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | 1611 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); |
1179 | #ifdef CONFIG_X86_MCE_INTEL | 1612 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) |
1180 | if (maxlvt >= 5) | 1613 | if (maxlvt >= 5) |
1181 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | 1614 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); |
1182 | #endif | 1615 | #endif |
@@ -1190,10 +1623,17 @@ static int lapic_resume(struct sys_device *dev) | |||
1190 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | 1623 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); |
1191 | apic_write(APIC_ESR, 0); | 1624 | apic_write(APIC_ESR, 0); |
1192 | apic_read(APIC_ESR); | 1625 | apic_read(APIC_ESR); |
1626 | |||
1193 | local_irq_restore(flags); | 1627 | local_irq_restore(flags); |
1628 | |||
1194 | return 0; | 1629 | return 0; |
1195 | } | 1630 | } |
1196 | 1631 | ||
1632 | /* | ||
1633 | * This device has no shutdown method - fully functioning local APICs | ||
1634 | * are needed on every CPU up until machine_halt/restart/poweroff. | ||
1635 | */ | ||
1636 | |||
1197 | static struct sysdev_class lapic_sysclass = { | 1637 | static struct sysdev_class lapic_sysclass = { |
1198 | .name = "lapic", | 1638 | .name = "lapic", |
1199 | .resume = lapic_resume, | 1639 | .resume = lapic_resume, |
@@ -1307,31 +1747,19 @@ __cpuinit int apic_is_clustered_box(void) | |||
1307 | return (clusters > 2); | 1747 | return (clusters > 2); |
1308 | } | 1748 | } |
1309 | 1749 | ||
1310 | /* | 1750 | static __init int setup_nox2apic(char *str) |
1311 | * APIC command line parameters | ||
1312 | */ | ||
1313 | static int __init apic_set_verbosity(char *str) | ||
1314 | { | 1751 | { |
1315 | if (str == NULL) { | 1752 | disable_x2apic = 1; |
1316 | skip_ioapic_setup = 0; | 1753 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); |
1317 | ioapic_force = 1; | ||
1318 | return 0; | ||
1319 | } | ||
1320 | if (strcmp("debug", str) == 0) | ||
1321 | apic_verbosity = APIC_DEBUG; | ||
1322 | else if (strcmp("verbose", str) == 0) | ||
1323 | apic_verbosity = APIC_VERBOSE; | ||
1324 | else { | ||
1325 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
1326 | " use apic=verbose or apic=debug\n", str); | ||
1327 | return -EINVAL; | ||
1328 | } | ||
1329 | |||
1330 | return 0; | 1754 | return 0; |
1331 | } | 1755 | } |
1332 | early_param("apic", apic_set_verbosity); | 1756 | early_param("nox2apic", setup_nox2apic); |
1757 | |||
1333 | 1758 | ||
1334 | static __init int setup_disableapic(char *str) | 1759 | /* |
1760 | * APIC command line parameters | ||
1761 | */ | ||
1762 | static int __init setup_disableapic(char *arg) | ||
1335 | { | 1763 | { |
1336 | disable_apic = 1; | 1764 | disable_apic = 1; |
1337 | setup_clear_cpu_cap(X86_FEATURE_APIC); | 1765 | setup_clear_cpu_cap(X86_FEATURE_APIC); |
@@ -1340,9 +1768,9 @@ static __init int setup_disableapic(char *str) | |||
1340 | early_param("disableapic", setup_disableapic); | 1768 | early_param("disableapic", setup_disableapic); |
1341 | 1769 | ||
1342 | /* same as disableapic, for compatibility */ | 1770 | /* same as disableapic, for compatibility */ |
1343 | static __init int setup_nolapic(char *str) | 1771 | static int __init setup_nolapic(char *arg) |
1344 | { | 1772 | { |
1345 | return setup_disableapic(str); | 1773 | return setup_disableapic(arg); |
1346 | } | 1774 | } |
1347 | early_param("nolapic", setup_nolapic); | 1775 | early_param("nolapic", setup_nolapic); |
1348 | 1776 | ||
@@ -1353,14 +1781,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg) | |||
1353 | } | 1781 | } |
1354 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); | 1782 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); |
1355 | 1783 | ||
1356 | static __init int setup_noapictimer(char *str) | 1784 | static int __init parse_disable_apic_timer(char *arg) |
1357 | { | 1785 | { |
1358 | if (str[0] != ' ' && str[0] != 0) | ||
1359 | return 0; | ||
1360 | disable_apic_timer = 1; | 1786 | disable_apic_timer = 1; |
1361 | return 1; | 1787 | return 0; |
1362 | } | 1788 | } |
1363 | __setup("noapictimer", setup_noapictimer); | 1789 | early_param("noapictimer", parse_disable_apic_timer); |
1790 | |||
1791 | static int __init parse_nolapic_timer(char *arg) | ||
1792 | { | ||
1793 | disable_apic_timer = 1; | ||
1794 | return 0; | ||
1795 | } | ||
1796 | early_param("nolapic_timer", parse_nolapic_timer); | ||
1364 | 1797 | ||
1365 | static __init int setup_apicpmtimer(char *s) | 1798 | static __init int setup_apicpmtimer(char *s) |
1366 | { | 1799 | { |
@@ -1370,6 +1803,31 @@ static __init int setup_apicpmtimer(char *s) | |||
1370 | } | 1803 | } |
1371 | __setup("apicpmtimer", setup_apicpmtimer); | 1804 | __setup("apicpmtimer", setup_apicpmtimer); |
1372 | 1805 | ||
1806 | static int __init apic_set_verbosity(char *arg) | ||
1807 | { | ||
1808 | if (!arg) { | ||
1809 | #ifdef CONFIG_X86_64 | ||
1810 | skip_ioapic_setup = 0; | ||
1811 | ioapic_force = 1; | ||
1812 | return 0; | ||
1813 | #endif | ||
1814 | return -EINVAL; | ||
1815 | } | ||
1816 | |||
1817 | if (strcmp("debug", arg) == 0) | ||
1818 | apic_verbosity = APIC_DEBUG; | ||
1819 | else if (strcmp("verbose", arg) == 0) | ||
1820 | apic_verbosity = APIC_VERBOSE; | ||
1821 | else { | ||
1822 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
1823 | " use apic=verbose or apic=debug\n", arg); | ||
1824 | return -EINVAL; | ||
1825 | } | ||
1826 | |||
1827 | return 0; | ||
1828 | } | ||
1829 | early_param("apic", apic_set_verbosity); | ||
1830 | |||
1373 | static int __init lapic_insert_resource(void) | 1831 | static int __init lapic_insert_resource(void) |
1374 | { | 1832 | { |
1375 | if (!apic_phys) | 1833 | if (!apic_phys) |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9ee24e6bc4b0..5145a6e72bbb 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -228,12 +228,12 @@ | |||
228 | #include <linux/suspend.h> | 228 | #include <linux/suspend.h> |
229 | #include <linux/kthread.h> | 229 | #include <linux/kthread.h> |
230 | #include <linux/jiffies.h> | 230 | #include <linux/jiffies.h> |
231 | #include <linux/smp_lock.h> | ||
232 | 231 | ||
233 | #include <asm/system.h> | 232 | #include <asm/system.h> |
234 | #include <asm/uaccess.h> | 233 | #include <asm/uaccess.h> |
235 | #include <asm/desc.h> | 234 | #include <asm/desc.h> |
236 | #include <asm/i8253.h> | 235 | #include <asm/i8253.h> |
236 | #include <asm/olpc.h> | ||
237 | #include <asm/paravirt.h> | 237 | #include <asm/paravirt.h> |
238 | #include <asm/reboot.h> | 238 | #include <asm/reboot.h> |
239 | 239 | ||
@@ -2217,7 +2217,7 @@ static int __init apm_init(void) | |||
2217 | 2217 | ||
2218 | dmi_check_system(apm_dmi_table); | 2218 | dmi_check_system(apm_dmi_table); |
2219 | 2219 | ||
2220 | if (apm_info.bios.version == 0 || paravirt_enabled()) { | 2220 | if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { |
2221 | printk(KERN_INFO "apm: BIOS not found.\n"); | 2221 | printk(KERN_INFO "apm: BIOS not found.\n"); |
2222 | return -ENODEV; | 2222 | return -ENODEV; |
2223 | } | 2223 | } |
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index aa89387006fe..505543a75a56 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c | |||
@@ -22,7 +22,7 @@ | |||
22 | 22 | ||
23 | #define __NO_STUBS 1 | 23 | #define __NO_STUBS 1 |
24 | #undef __SYSCALL | 24 | #undef __SYSCALL |
25 | #undef _ASM_X86_64_UNISTD_H_ | 25 | #undef ASM_X86__UNISTD_64_H |
26 | #define __SYSCALL(nr, sym) [nr] = 1, | 26 | #define __SYSCALL(nr, sym) [nr] = 1, |
27 | static char syscalls[] = { | 27 | static char syscalls[] = { |
28 | #include <asm/unistd.h> | 28 | #include <asm/unistd.h> |
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index c639bd55391c..fdd585f9c53d 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c | |||
@@ -25,11 +25,11 @@ x86_bios_strerror(long status) | |||
25 | { | 25 | { |
26 | const char *str; | 26 | const char *str; |
27 | switch (status) { | 27 | switch (status) { |
28 | case 0: str = "Call completed without error"; break; | 28 | case 0: str = "Call completed without error"; break; |
29 | case -1: str = "Not implemented"; break; | 29 | case -1: str = "Not implemented"; break; |
30 | case -2: str = "Invalid argument"; break; | 30 | case -2: str = "Invalid argument"; break; |
31 | case -3: str = "Call completed with error"; break; | 31 | case -3: str = "Call completed with error"; break; |
32 | default: str = "Unknown BIOS status code"; break; | 32 | default: str = "Unknown BIOS status code"; break; |
33 | } | 33 | } |
34 | return str; | 34 | return str; |
35 | } | 35 | } |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ee76eaad3001..7f0b45a5d788 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -3,22 +3,30 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y := intel_cacheinfo.o addon_cpuid_features.o | 5 | obj-y := intel_cacheinfo.o addon_cpuid_features.o |
6 | obj-y += proc.o feature_names.o | 6 | obj-y += proc.o capflags.o powerflags.o common.o |
7 | 7 | ||
8 | obj-$(CONFIG_X86_32) += common.o bugs.o | 8 | obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o |
9 | obj-$(CONFIG_X86_64) += common_64.o bugs_64.o | 9 | obj-$(CONFIG_X86_64) += bugs_64.o |
10 | obj-$(CONFIG_X86_32) += amd.o | 10 | |
11 | obj-$(CONFIG_X86_64) += amd_64.o | 11 | obj-$(CONFIG_CPU_SUP_INTEL) += intel.o |
12 | obj-$(CONFIG_X86_32) += cyrix.o | 12 | obj-$(CONFIG_CPU_SUP_AMD) += amd.o |
13 | obj-$(CONFIG_X86_32) += centaur.o | 13 | obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o |
14 | obj-$(CONFIG_X86_64) += centaur_64.o | 14 | obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o |
15 | obj-$(CONFIG_X86_32) += transmeta.o | 15 | obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o |
16 | obj-$(CONFIG_X86_32) += intel.o | 16 | obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o |
17 | obj-$(CONFIG_X86_64) += intel_64.o | 17 | obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o |
18 | obj-$(CONFIG_X86_32) += umc.o | ||
19 | 18 | ||
20 | obj-$(CONFIG_X86_MCE) += mcheck/ | 19 | obj-$(CONFIG_X86_MCE) += mcheck/ |
21 | obj-$(CONFIG_MTRR) += mtrr/ | 20 | obj-$(CONFIG_MTRR) += mtrr/ |
22 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ | 21 | obj-$(CONFIG_CPU_FREQ) += cpufreq/ |
23 | 22 | ||
24 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o | 23 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o |
24 | |||
25 | quiet_cmd_mkcapflags = MKCAP $@ | ||
26 | cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ | ||
27 | |||
28 | cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h | ||
29 | |||
30 | targets += capflags.c | ||
31 | $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE | ||
32 | $(call if_changed,mkcapflags) | ||
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index a6ef672adbba..0d9c993aa93e 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c | |||
@@ -7,6 +7,8 @@ | |||
7 | #include <asm/pat.h> | 7 | #include <asm/pat.h> |
8 | #include <asm/processor.h> | 8 | #include <asm/processor.h> |
9 | 9 | ||
10 | #include <mach_apic.h> | ||
11 | |||
10 | struct cpuid_bit { | 12 | struct cpuid_bit { |
11 | u16 feature; | 13 | u16 feature; |
12 | u8 reg; | 14 | u8 reg; |
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) | |||
48 | } | 50 | } |
49 | } | 51 | } |
50 | 52 | ||
53 | /* leaf 0xb SMT level */ | ||
54 | #define SMT_LEVEL 0 | ||
55 | |||
56 | /* leaf 0xb sub-leaf types */ | ||
57 | #define INVALID_TYPE 0 | ||
58 | #define SMT_TYPE 1 | ||
59 | #define CORE_TYPE 2 | ||
60 | |||
61 | #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) | ||
62 | #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) | ||
63 | #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) | ||
64 | |||
65 | /* | ||
66 | * Check for extended topology enumeration cpuid leaf 0xb and if it | ||
67 | * exists, use it for populating initial_apicid and cpu topology | ||
68 | * detection. | ||
69 | */ | ||
70 | void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) | ||
71 | { | ||
72 | #ifdef CONFIG_SMP | ||
73 | unsigned int eax, ebx, ecx, edx, sub_index; | ||
74 | unsigned int ht_mask_width, core_plus_mask_width; | ||
75 | unsigned int core_select_mask, core_level_siblings; | ||
76 | |||
77 | if (c->cpuid_level < 0xb) | ||
78 | return; | ||
79 | |||
80 | cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); | ||
81 | |||
82 | /* | ||
83 | * check if the cpuid leaf 0xb is actually implemented. | ||
84 | */ | ||
85 | if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) | ||
86 | return; | ||
87 | |||
88 | set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); | ||
89 | |||
90 | /* | ||
91 | * initial apic id, which also represents 32-bit extended x2apic id. | ||
92 | */ | ||
93 | c->initial_apicid = edx; | ||
94 | |||
95 | /* | ||
96 | * Populate HT related information from sub-leaf level 0. | ||
97 | */ | ||
98 | core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); | ||
99 | core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); | ||
100 | |||
101 | sub_index = 1; | ||
102 | do { | ||
103 | cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); | ||
104 | |||
105 | /* | ||
106 | * Check for the Core type in the implemented sub leaves. | ||
107 | */ | ||
108 | if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { | ||
109 | core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); | ||
110 | core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); | ||
111 | break; | ||
112 | } | ||
113 | |||
114 | sub_index++; | ||
115 | } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); | ||
116 | |||
117 | core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; | ||
118 | |||
119 | #ifdef CONFIG_X86_32 | ||
120 | c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) | ||
121 | & core_select_mask; | ||
122 | c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); | ||
123 | #else | ||
124 | c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; | ||
125 | c->phys_proc_id = phys_pkg_id(core_plus_mask_width); | ||
126 | #endif | ||
127 | c->x86_max_cores = (core_level_siblings / smp_num_siblings); | ||
128 | |||
129 | |||
130 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
131 | c->phys_proc_id); | ||
132 | if (c->x86_max_cores > 1) | ||
133 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", | ||
134 | c->cpu_core_id); | ||
135 | return; | ||
136 | #endif | ||
137 | } | ||
138 | |||
51 | #ifdef CONFIG_X86_PAT | 139 | #ifdef CONFIG_X86_PAT |
52 | void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) | 140 | void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) |
53 | { | 141 | { |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc3031..32e73520adf7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -1,13 +1,22 @@ | |||
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <linux/bitops.h> | 2 | #include <linux/bitops.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | |||
4 | #include <asm/io.h> | 5 | #include <asm/io.h> |
5 | #include <asm/processor.h> | 6 | #include <asm/processor.h> |
6 | #include <asm/apic.h> | 7 | #include <asm/apic.h> |
7 | 8 | ||
9 | #ifdef CONFIG_X86_64 | ||
10 | # include <asm/numa_64.h> | ||
11 | # include <asm/mmconfig.h> | ||
12 | # include <asm/cacheflush.h> | ||
13 | #endif | ||
14 | |||
8 | #include <mach_apic.h> | 15 | #include <mach_apic.h> |
16 | |||
9 | #include "cpu.h" | 17 | #include "cpu.h" |
10 | 18 | ||
19 | #ifdef CONFIG_X86_32 | ||
11 | /* | 20 | /* |
12 | * B step AMD K6 before B 9730xxxx have hardware bugs that can cause | 21 | * B step AMD K6 before B 9730xxxx have hardware bugs that can cause |
13 | * misexecution of code under Linux. Owners of such processors should | 22 | * misexecution of code under Linux. Owners of such processors should |
@@ -24,21 +33,273 @@ | |||
24 | extern void vide(void); | 33 | extern void vide(void); |
25 | __asm__(".align 4\nvide: ret"); | 34 | __asm__(".align 4\nvide: ret"); |
26 | 35 | ||
27 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | 36 | static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) |
28 | { | 37 | { |
29 | if (cpuid_eax(0x80000000) >= 0x80000007) { | 38 | /* |
30 | c->x86_power = cpuid_edx(0x80000007); | 39 | * General Systems BIOSen alias the cpu frequency registers |
31 | if (c->x86_power & (1<<8)) | 40 | * of the Elan at 0x000df000. Unfortuantly, one of the Linux |
32 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 41 | * drivers subsequently pokes it, and changes the CPU speed. |
42 | * Workaround : Remove the unneeded alias. | ||
43 | */ | ||
44 | #define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ | ||
45 | #define CBAR_ENB (0x80000000) | ||
46 | #define CBAR_KEY (0X000000CB) | ||
47 | if (c->x86_model == 9 || c->x86_model == 10) { | ||
48 | if (inl (CBAR) & CBAR_ENB) | ||
49 | outl (0 | CBAR_KEY, CBAR); | ||
33 | } | 50 | } |
34 | } | 51 | } |
35 | 52 | ||
36 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | 53 | |
54 | static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) | ||
37 | { | 55 | { |
38 | u32 l, h; | 56 | u32 l, h; |
39 | int mbytes = num_physpages >> (20-PAGE_SHIFT); | 57 | int mbytes = num_physpages >> (20-PAGE_SHIFT); |
40 | int r; | ||
41 | 58 | ||
59 | if (c->x86_model < 6) { | ||
60 | /* Based on AMD doc 20734R - June 2000 */ | ||
61 | if (c->x86_model == 0) { | ||
62 | clear_cpu_cap(c, X86_FEATURE_APIC); | ||
63 | set_cpu_cap(c, X86_FEATURE_PGE); | ||
64 | } | ||
65 | return; | ||
66 | } | ||
67 | |||
68 | if (c->x86_model == 6 && c->x86_mask == 1) { | ||
69 | const int K6_BUG_LOOP = 1000000; | ||
70 | int n; | ||
71 | void (*f_vide)(void); | ||
72 | unsigned long d, d2; | ||
73 | |||
74 | printk(KERN_INFO "AMD K6 stepping B detected - "); | ||
75 | |||
76 | /* | ||
77 | * It looks like AMD fixed the 2.6.2 bug and improved indirect | ||
78 | * calls at the same time. | ||
79 | */ | ||
80 | |||
81 | n = K6_BUG_LOOP; | ||
82 | f_vide = vide; | ||
83 | rdtscl(d); | ||
84 | while (n--) | ||
85 | f_vide(); | ||
86 | rdtscl(d2); | ||
87 | d = d2-d; | ||
88 | |||
89 | if (d > 20*K6_BUG_LOOP) | ||
90 | printk("system stability may be impaired when more than 32 MB are used.\n"); | ||
91 | else | ||
92 | printk("probably OK (after B9730xxxx).\n"); | ||
93 | printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); | ||
94 | } | ||
95 | |||
96 | /* K6 with old style WHCR */ | ||
97 | if (c->x86_model < 8 || | ||
98 | (c->x86_model == 8 && c->x86_mask < 8)) { | ||
99 | /* We can only write allocate on the low 508Mb */ | ||
100 | if (mbytes > 508) | ||
101 | mbytes = 508; | ||
102 | |||
103 | rdmsr(MSR_K6_WHCR, l, h); | ||
104 | if ((l&0x0000FFFF) == 0) { | ||
105 | unsigned long flags; | ||
106 | l = (1<<0)|((mbytes/4)<<1); | ||
107 | local_irq_save(flags); | ||
108 | wbinvd(); | ||
109 | wrmsr(MSR_K6_WHCR, l, h); | ||
110 | local_irq_restore(flags); | ||
111 | printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", | ||
112 | mbytes); | ||
113 | } | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | if ((c->x86_model == 8 && c->x86_mask > 7) || | ||
118 | c->x86_model == 9 || c->x86_model == 13) { | ||
119 | /* The more serious chips .. */ | ||
120 | |||
121 | if (mbytes > 4092) | ||
122 | mbytes = 4092; | ||
123 | |||
124 | rdmsr(MSR_K6_WHCR, l, h); | ||
125 | if ((l&0xFFFF0000) == 0) { | ||
126 | unsigned long flags; | ||
127 | l = ((mbytes>>2)<<22)|(1<<16); | ||
128 | local_irq_save(flags); | ||
129 | wbinvd(); | ||
130 | wrmsr(MSR_K6_WHCR, l, h); | ||
131 | local_irq_restore(flags); | ||
132 | printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", | ||
133 | mbytes); | ||
134 | } | ||
135 | |||
136 | return; | ||
137 | } | ||
138 | |||
139 | if (c->x86_model == 10) { | ||
140 | /* AMD Geode LX is model 10 */ | ||
141 | /* placeholder for any needed mods */ | ||
142 | return; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) | ||
147 | { | ||
148 | u32 l, h; | ||
149 | |||
150 | /* | ||
151 | * Bit 15 of Athlon specific MSR 15, needs to be 0 | ||
152 | * to enable SSE on Palomino/Morgan/Barton CPU's. | ||
153 | * If the BIOS didn't enable it already, enable it here. | ||
154 | */ | ||
155 | if (c->x86_model >= 6 && c->x86_model <= 10) { | ||
156 | if (!cpu_has(c, X86_FEATURE_XMM)) { | ||
157 | printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); | ||
158 | rdmsr(MSR_K7_HWCR, l, h); | ||
159 | l &= ~0x00008000; | ||
160 | wrmsr(MSR_K7_HWCR, l, h); | ||
161 | set_cpu_cap(c, X86_FEATURE_XMM); | ||
162 | } | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * It's been determined by AMD that Athlons since model 8 stepping 1 | ||
167 | * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx | ||
168 | * As per AMD technical note 27212 0.2 | ||
169 | */ | ||
170 | if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { | ||
171 | rdmsr(MSR_K7_CLK_CTL, l, h); | ||
172 | if ((l & 0xfff00000) != 0x20000000) { | ||
173 | printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, | ||
174 | ((l & 0x000fffff)|0x20000000)); | ||
175 | wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | set_cpu_cap(c, X86_FEATURE_K7); | ||
180 | } | ||
181 | #endif | ||
182 | |||
183 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | ||
184 | static int __cpuinit nearby_node(int apicid) | ||
185 | { | ||
186 | int i, node; | ||
187 | |||
188 | for (i = apicid - 1; i >= 0; i--) { | ||
189 | node = apicid_to_node[i]; | ||
190 | if (node != NUMA_NO_NODE && node_online(node)) | ||
191 | return node; | ||
192 | } | ||
193 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | ||
194 | node = apicid_to_node[i]; | ||
195 | if (node != NUMA_NO_NODE && node_online(node)) | ||
196 | return node; | ||
197 | } | ||
198 | return first_node(node_online_map); /* Shouldn't happen */ | ||
199 | } | ||
200 | #endif | ||
201 | |||
202 | /* | ||
203 | * On a AMD dual core setup the lower bits of the APIC id distingush the cores. | ||
204 | * Assumes number of cores is a power of two. | ||
205 | */ | ||
206 | static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | ||
207 | { | ||
208 | #ifdef CONFIG_X86_HT | ||
209 | unsigned bits; | ||
210 | |||
211 | bits = c->x86_coreid_bits; | ||
212 | |||
213 | /* Low order bits define the core id (index of core in socket) */ | ||
214 | c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); | ||
215 | /* Convert the initial APIC ID into the socket ID */ | ||
216 | c->phys_proc_id = c->initial_apicid >> bits; | ||
217 | #endif | ||
218 | } | ||
219 | |||
220 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | ||
221 | { | ||
222 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | ||
223 | int cpu = smp_processor_id(); | ||
224 | int node; | ||
225 | unsigned apicid = hard_smp_processor_id(); | ||
226 | |||
227 | node = c->phys_proc_id; | ||
228 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | ||
229 | node = apicid_to_node[apicid]; | ||
230 | if (!node_online(node)) { | ||
231 | /* Two possibilities here: | ||
232 | - The CPU is missing memory and no node was created. | ||
233 | In that case try picking one from a nearby CPU | ||
234 | - The APIC IDs differ from the HyperTransport node IDs | ||
235 | which the K8 northbridge parsing fills in. | ||
236 | Assume they are all increased by a constant offset, | ||
237 | but in the same order as the HT nodeids. | ||
238 | If that doesn't result in a usable node fall back to the | ||
239 | path for the previous case. */ | ||
240 | |||
241 | int ht_nodeid = c->initial_apicid; | ||
242 | |||
243 | if (ht_nodeid >= 0 && | ||
244 | apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | ||
245 | node = apicid_to_node[ht_nodeid]; | ||
246 | /* Pick a nearby node */ | ||
247 | if (!node_online(node)) | ||
248 | node = nearby_node(apicid); | ||
249 | } | ||
250 | numa_set_node(cpu, node); | ||
251 | |||
252 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
253 | #endif | ||
254 | } | ||
255 | |||
256 | static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) | ||
257 | { | ||
258 | #ifdef CONFIG_X86_HT | ||
259 | unsigned bits, ecx; | ||
260 | |||
261 | /* Multi core CPU? */ | ||
262 | if (c->extended_cpuid_level < 0x80000008) | ||
263 | return; | ||
264 | |||
265 | ecx = cpuid_ecx(0x80000008); | ||
266 | |||
267 | c->x86_max_cores = (ecx & 0xff) + 1; | ||
268 | |||
269 | /* CPU telling us the core id bits shift? */ | ||
270 | bits = (ecx >> 12) & 0xF; | ||
271 | |||
272 | /* Otherwise recompute */ | ||
273 | if (bits == 0) { | ||
274 | while ((1 << bits) < c->x86_max_cores) | ||
275 | bits++; | ||
276 | } | ||
277 | |||
278 | c->x86_coreid_bits = bits; | ||
279 | #endif | ||
280 | } | ||
281 | |||
282 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | ||
283 | { | ||
284 | early_init_amd_mc(c); | ||
285 | |||
286 | /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | ||
287 | if (c->x86_power & (1<<8)) | ||
288 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
289 | |||
290 | #ifdef CONFIG_X86_64 | ||
291 | set_cpu_cap(c, X86_FEATURE_SYSCALL32); | ||
292 | #else | ||
293 | /* Set MTRR capability flag if appropriate */ | ||
294 | if (c->x86 == 5) | ||
295 | if (c->x86_model == 13 || c->x86_model == 9 || | ||
296 | (c->x86_model == 8 && c->x86_mask >= 8)) | ||
297 | set_cpu_cap(c, X86_FEATURE_K6_MTRR); | ||
298 | #endif | ||
299 | } | ||
300 | |||
301 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | ||
302 | { | ||
42 | #ifdef CONFIG_SMP | 303 | #ifdef CONFIG_SMP |
43 | unsigned long long value; | 304 | unsigned long long value; |
44 | 305 | ||
@@ -49,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
49 | * Errata 63 for SH-B3 steppings | 310 | * Errata 63 for SH-B3 steppings |
50 | * Errata 122 for all steppings (F+ have it disabled by default) | 311 | * Errata 122 for all steppings (F+ have it disabled by default) |
51 | */ | 312 | */ |
52 | if (c->x86 == 15) { | 313 | if (c->x86 == 0xf) { |
53 | rdmsrl(MSR_K7_HWCR, value); | 314 | rdmsrl(MSR_K7_HWCR, value); |
54 | value |= 1 << 6; | 315 | value |= 1 << 6; |
55 | wrmsrl(MSR_K7_HWCR, value); | 316 | wrmsrl(MSR_K7_HWCR, value); |
@@ -59,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) | |||
59 | early_init_amd(c); | 320 | early_init_amd(c); |
60 | 321 | ||
61 | /* | 322 | /* |
62 | * FIXME: We should handle the K5 here. Set up the write | ||
63 | * range and also turn on MSR 83 bits 4 and 31 (write alloc, | ||
64 | * no bus pipeline) | ||
65 | */ | ||
66 | |||
67 | /* | ||
68 | * Bit 31 in normal CPUID used for nonstandard 3DNow ID; | 323 | * Bit 31 in normal CPUID used for nonstandard 3DNow ID; |
69 | * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway | 324 | * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway |
70 | */ | 325 | */ |
71 | clear_cpu_cap(c, 0*32+31); | 326 | clear_cpu_cap(c, 0*32+31); |
72 | 327 | ||
73 | r = get_model_name(c); | 328 | #ifdef CONFIG_X86_64 |
329 | /* On C+ stepping K8 rep microcode works well for copy/memset */ | ||
330 | if (c->x86 == 0xf) { | ||
331 | u32 level; | ||
74 | 332 | ||
75 | switch (c->x86) { | 333 | level = cpuid_eax(1); |
76 | case 4: | 334 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) |
77 | /* | 335 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
78 | * General Systems BIOSen alias the cpu frequency registers | ||
79 | * of the Elan at 0x000df000. Unfortuantly, one of the Linux | ||
80 | * drivers subsequently pokes it, and changes the CPU speed. | ||
81 | * Workaround : Remove the unneeded alias. | ||
82 | */ | ||
83 | #define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ | ||
84 | #define CBAR_ENB (0x80000000) | ||
85 | #define CBAR_KEY (0X000000CB) | ||
86 | if (c->x86_model == 9 || c->x86_model == 10) { | ||
87 | if (inl (CBAR) & CBAR_ENB) | ||
88 | outl (0 | CBAR_KEY, CBAR); | ||
89 | } | ||
90 | break; | ||
91 | case 5: | ||
92 | if (c->x86_model < 6) { | ||
93 | /* Based on AMD doc 20734R - June 2000 */ | ||
94 | if (c->x86_model == 0) { | ||
95 | clear_cpu_cap(c, X86_FEATURE_APIC); | ||
96 | set_cpu_cap(c, X86_FEATURE_PGE); | ||
97 | } | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | if (c->x86_model == 6 && c->x86_mask == 1) { | ||
102 | const int K6_BUG_LOOP = 1000000; | ||
103 | int n; | ||
104 | void (*f_vide)(void); | ||
105 | unsigned long d, d2; | ||
106 | |||
107 | printk(KERN_INFO "AMD K6 stepping B detected - "); | ||
108 | |||
109 | /* | ||
110 | * It looks like AMD fixed the 2.6.2 bug and improved indirect | ||
111 | * calls at the same time. | ||
112 | */ | ||
113 | |||
114 | n = K6_BUG_LOOP; | ||
115 | f_vide = vide; | ||
116 | rdtscl(d); | ||
117 | while (n--) | ||
118 | f_vide(); | ||
119 | rdtscl(d2); | ||
120 | d = d2-d; | ||
121 | |||
122 | if (d > 20*K6_BUG_LOOP) | ||
123 | printk("system stability may be impaired when more than 32 MB are used.\n"); | ||
124 | else | ||
125 | printk("probably OK (after B9730xxxx).\n"); | ||
126 | printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); | ||
127 | } | ||
128 | |||
129 | /* K6 with old style WHCR */ | ||
130 | if (c->x86_model < 8 || | ||
131 | (c->x86_model == 8 && c->x86_mask < 8)) { | ||
132 | /* We can only write allocate on the low 508Mb */ | ||
133 | if (mbytes > 508) | ||
134 | mbytes = 508; | ||
135 | |||
136 | rdmsr(MSR_K6_WHCR, l, h); | ||
137 | if ((l&0x0000FFFF) == 0) { | ||
138 | unsigned long flags; | ||
139 | l = (1<<0)|((mbytes/4)<<1); | ||
140 | local_irq_save(flags); | ||
141 | wbinvd(); | ||
142 | wrmsr(MSR_K6_WHCR, l, h); | ||
143 | local_irq_restore(flags); | ||
144 | printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", | ||
145 | mbytes); | ||
146 | } | ||
147 | break; | ||
148 | } | ||
149 | |||
150 | if ((c->x86_model == 8 && c->x86_mask > 7) || | ||
151 | c->x86_model == 9 || c->x86_model == 13) { | ||
152 | /* The more serious chips .. */ | ||
153 | |||
154 | if (mbytes > 4092) | ||
155 | mbytes = 4092; | ||
156 | |||
157 | rdmsr(MSR_K6_WHCR, l, h); | ||
158 | if ((l&0xFFFF0000) == 0) { | ||
159 | unsigned long flags; | ||
160 | l = ((mbytes>>2)<<22)|(1<<16); | ||
161 | local_irq_save(flags); | ||
162 | wbinvd(); | ||
163 | wrmsr(MSR_K6_WHCR, l, h); | ||
164 | local_irq_restore(flags); | ||
165 | printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", | ||
166 | mbytes); | ||
167 | } | ||
168 | |||
169 | /* Set MTRR capability flag if appropriate */ | ||
170 | if (c->x86_model == 13 || c->x86_model == 9 || | ||
171 | (c->x86_model == 8 && c->x86_mask >= 8)) | ||
172 | set_cpu_cap(c, X86_FEATURE_K6_MTRR); | ||
173 | break; | ||
174 | } | ||
175 | |||
176 | if (c->x86_model == 10) { | ||
177 | /* AMD Geode LX is model 10 */ | ||
178 | /* placeholder for any needed mods */ | ||
179 | break; | ||
180 | } | ||
181 | break; | ||
182 | case 6: /* An Athlon/Duron */ | ||
183 | |||
184 | /* | ||
185 | * Bit 15 of Athlon specific MSR 15, needs to be 0 | ||
186 | * to enable SSE on Palomino/Morgan/Barton CPU's. | ||
187 | * If the BIOS didn't enable it already, enable it here. | ||
188 | */ | ||
189 | if (c->x86_model >= 6 && c->x86_model <= 10) { | ||
190 | if (!cpu_has(c, X86_FEATURE_XMM)) { | ||
191 | printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); | ||
192 | rdmsr(MSR_K7_HWCR, l, h); | ||
193 | l &= ~0x00008000; | ||
194 | wrmsr(MSR_K7_HWCR, l, h); | ||
195 | set_cpu_cap(c, X86_FEATURE_XMM); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * It's been determined by AMD that Athlons since model 8 stepping 1 | ||
201 | * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx | ||
202 | * As per AMD technical note 27212 0.2 | ||
203 | */ | ||
204 | if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { | ||
205 | rdmsr(MSR_K7_CLK_CTL, l, h); | ||
206 | if ((l & 0xfff00000) != 0x20000000) { | ||
207 | printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, | ||
208 | ((l & 0x000fffff)|0x20000000)); | ||
209 | wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); | ||
210 | } | ||
211 | } | ||
212 | break; | ||
213 | } | 336 | } |
337 | if (c->x86 == 0x10 || c->x86 == 0x11) | ||
338 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
339 | #else | ||
340 | |||
341 | /* | ||
342 | * FIXME: We should handle the K5 here. Set up the write | ||
343 | * range and also turn on MSR 83 bits 4 and 31 (write alloc, | ||
344 | * no bus pipeline) | ||
345 | */ | ||
214 | 346 | ||
215 | switch (c->x86) { | 347 | switch (c->x86) { |
216 | case 15: | 348 | case 4: |
217 | /* Use K8 tuning for Fam10h and Fam11h */ | 349 | init_amd_k5(c); |
218 | case 0x10: | ||
219 | case 0x11: | ||
220 | set_cpu_cap(c, X86_FEATURE_K8); | ||
221 | break; | 350 | break; |
222 | case 6: | 351 | case 5: |
223 | set_cpu_cap(c, X86_FEATURE_K7); | 352 | init_amd_k6(c); |
353 | break; | ||
354 | case 6: /* An Athlon/Duron */ | ||
355 | init_amd_k7(c); | ||
224 | break; | 356 | break; |
225 | } | 357 | } |
358 | |||
359 | /* K6s reports MCEs but don't actually have all the MSRs */ | ||
360 | if (c->x86 < 6) | ||
361 | clear_cpu_cap(c, X86_FEATURE_MCE); | ||
362 | #endif | ||
363 | |||
364 | /* Enable workaround for FXSAVE leak */ | ||
226 | if (c->x86 >= 6) | 365 | if (c->x86 >= 6) |
227 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); | 366 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); |
228 | 367 | ||
229 | display_cacheinfo(c); | 368 | if (!c->x86_model_id[0]) { |
230 | 369 | switch (c->x86) { | |
231 | if (cpuid_eax(0x80000000) >= 0x80000008) | 370 | case 0xf: |
232 | c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; | 371 | /* Should distinguish Models here, but this is only |
372 | a fallback anyways. */ | ||
373 | strcpy(c->x86_model_id, "Hammer"); | ||
374 | break; | ||
375 | } | ||
376 | } | ||
233 | 377 | ||
234 | #ifdef CONFIG_X86_HT | 378 | display_cacheinfo(c); |
235 | /* | ||
236 | * On a AMD multi core setup the lower bits of the APIC id | ||
237 | * distinguish the cores. | ||
238 | */ | ||
239 | if (c->x86_max_cores > 1) { | ||
240 | int cpu = smp_processor_id(); | ||
241 | unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; | ||
242 | 379 | ||
243 | if (bits == 0) { | 380 | /* Multi core CPU? */ |
244 | while ((1 << bits) < c->x86_max_cores) | 381 | if (c->extended_cpuid_level >= 0x80000008) { |
245 | bits++; | 382 | amd_detect_cmp(c); |
246 | } | 383 | srat_detect_node(c); |
247 | c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1); | ||
248 | c->phys_proc_id >>= bits; | ||
249 | printk(KERN_INFO "CPU %d(%d) -> Core %d\n", | ||
250 | cpu, c->x86_max_cores, c->cpu_core_id); | ||
251 | } | 384 | } |
385 | |||
386 | #ifdef CONFIG_X86_32 | ||
387 | detect_ht(c); | ||
252 | #endif | 388 | #endif |
253 | 389 | ||
254 | if (cpuid_eax(0x80000000) >= 0x80000006) { | 390 | if (c->extended_cpuid_level >= 0x80000006) { |
255 | if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) | 391 | if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) |
256 | num_cache_leaves = 4; | 392 | num_cache_leaves = 4; |
257 | else | 393 | else |
258 | num_cache_leaves = 3; | 394 | num_cache_leaves = 3; |
259 | } | 395 | } |
260 | 396 | ||
261 | /* K6s reports MCEs but don't actually have all the MSRs */ | 397 | if (c->x86 >= 0xf && c->x86 <= 0x11) |
262 | if (c->x86 < 6) | 398 | set_cpu_cap(c, X86_FEATURE_K8); |
263 | clear_cpu_cap(c, X86_FEATURE_MCE); | ||
264 | 399 | ||
265 | if (cpu_has_xmm2) | 400 | if (cpu_has_xmm2) { |
401 | /* MFENCE stops RDTSC speculation */ | ||
266 | set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); | 402 | set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); |
403 | } | ||
404 | |||
405 | #ifdef CONFIG_X86_64 | ||
406 | if (c->x86 == 0x10) { | ||
407 | /* do this for boot cpu */ | ||
408 | if (c == &boot_cpu_data) | ||
409 | check_enable_amd_mmconf_dmi(); | ||
410 | |||
411 | fam10h_check_enable_mmcfg(); | ||
412 | } | ||
413 | |||
414 | if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { | ||
415 | unsigned long long tseg; | ||
416 | |||
417 | /* | ||
418 | * Split up direct mapping around the TSEG SMM area. | ||
419 | * Don't do it for gbpages because there seems very little | ||
420 | * benefit in doing so. | ||
421 | */ | ||
422 | if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { | ||
423 | printk(KERN_DEBUG "tseg: %010llx\n", tseg); | ||
424 | if ((tseg>>PMD_SHIFT) < | ||
425 | (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || | ||
426 | ((tseg>>PMD_SHIFT) < | ||
427 | (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && | ||
428 | (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) | ||
429 | set_memory_4k((unsigned long)__va(tseg), 1); | ||
430 | } | ||
431 | } | ||
432 | #endif | ||
267 | } | 433 | } |
268 | 434 | ||
435 | #ifdef CONFIG_X86_32 | ||
269 | static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) | 436 | static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) |
270 | { | 437 | { |
271 | /* AMD errata T13 (order #21922) */ | 438 | /* AMD errata T13 (order #21922) */ |
@@ -278,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int | |||
278 | } | 445 | } |
279 | return size; | 446 | return size; |
280 | } | 447 | } |
448 | #endif | ||
281 | 449 | ||
282 | static struct cpu_dev amd_cpu_dev __cpuinitdata = { | 450 | static struct cpu_dev amd_cpu_dev __cpuinitdata = { |
283 | .c_vendor = "AMD", | 451 | .c_vendor = "AMD", |
284 | .c_ident = { "AuthenticAMD" }, | 452 | .c_ident = { "AuthenticAMD" }, |
453 | #ifdef CONFIG_X86_32 | ||
285 | .c_models = { | 454 | .c_models = { |
286 | { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = | 455 | { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = |
287 | { | 456 | { |
@@ -294,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { | |||
294 | } | 463 | } |
295 | }, | 464 | }, |
296 | }, | 465 | }, |
466 | .c_size_cache = amd_size_cache, | ||
467 | #endif | ||
297 | .c_early_init = early_init_amd, | 468 | .c_early_init = early_init_amd, |
298 | .c_init = init_amd, | 469 | .c_init = init_amd, |
299 | .c_size_cache = amd_size_cache, | 470 | .c_x86_vendor = X86_VENDOR_AMD, |
300 | }; | 471 | }; |
301 | 472 | ||
302 | cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); | 473 | cpu_dev_register(amd_cpu_dev); |
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c deleted file mode 100644 index d1692b2a41ff..000000000000 --- a/arch/x86/kernel/cpu/amd_64.c +++ /dev/null | |||
@@ -1,224 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/mm.h> | ||
3 | |||
4 | #include <asm/numa_64.h> | ||
5 | #include <asm/mmconfig.h> | ||
6 | #include <asm/cacheflush.h> | ||
7 | |||
8 | #include <mach_apic.h> | ||
9 | |||
10 | #include "cpu.h" | ||
11 | |||
12 | int force_mwait __cpuinitdata; | ||
13 | |||
14 | #ifdef CONFIG_NUMA | ||
15 | static int __cpuinit nearby_node(int apicid) | ||
16 | { | ||
17 | int i, node; | ||
18 | |||
19 | for (i = apicid - 1; i >= 0; i--) { | ||
20 | node = apicid_to_node[i]; | ||
21 | if (node != NUMA_NO_NODE && node_online(node)) | ||
22 | return node; | ||
23 | } | ||
24 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | ||
25 | node = apicid_to_node[i]; | ||
26 | if (node != NUMA_NO_NODE && node_online(node)) | ||
27 | return node; | ||
28 | } | ||
29 | return first_node(node_online_map); /* Shouldn't happen */ | ||
30 | } | ||
31 | #endif | ||
32 | |||
33 | /* | ||
34 | * On a AMD dual core setup the lower bits of the APIC id distingush the cores. | ||
35 | * Assumes number of cores is a power of two. | ||
36 | */ | ||
37 | static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) | ||
38 | { | ||
39 | #ifdef CONFIG_SMP | ||
40 | unsigned bits; | ||
41 | #ifdef CONFIG_NUMA | ||
42 | int cpu = smp_processor_id(); | ||
43 | int node = 0; | ||
44 | unsigned apicid = hard_smp_processor_id(); | ||
45 | #endif | ||
46 | bits = c->x86_coreid_bits; | ||
47 | |||
48 | /* Low order bits define the core id (index of core in socket) */ | ||
49 | c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); | ||
50 | /* Convert the initial APIC ID into the socket ID */ | ||
51 | c->phys_proc_id = c->initial_apicid >> bits; | ||
52 | |||
53 | #ifdef CONFIG_NUMA | ||
54 | node = c->phys_proc_id; | ||
55 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | ||
56 | node = apicid_to_node[apicid]; | ||
57 | if (!node_online(node)) { | ||
58 | /* Two possibilities here: | ||
59 | - The CPU is missing memory and no node was created. | ||
60 | In that case try picking one from a nearby CPU | ||
61 | - The APIC IDs differ from the HyperTransport node IDs | ||
62 | which the K8 northbridge parsing fills in. | ||
63 | Assume they are all increased by a constant offset, | ||
64 | but in the same order as the HT nodeids. | ||
65 | If that doesn't result in a usable node fall back to the | ||
66 | path for the previous case. */ | ||
67 | |||
68 | int ht_nodeid = c->initial_apicid; | ||
69 | |||
70 | if (ht_nodeid >= 0 && | ||
71 | apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | ||
72 | node = apicid_to_node[ht_nodeid]; | ||
73 | /* Pick a nearby node */ | ||
74 | if (!node_online(node)) | ||
75 | node = nearby_node(apicid); | ||
76 | } | ||
77 | numa_set_node(cpu, node); | ||
78 | |||
79 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
80 | #endif | ||
81 | #endif | ||
82 | } | ||
83 | |||
84 | static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) | ||
85 | { | ||
86 | #ifdef CONFIG_SMP | ||
87 | unsigned bits, ecx; | ||
88 | |||
89 | /* Multi core CPU? */ | ||
90 | if (c->extended_cpuid_level < 0x80000008) | ||
91 | return; | ||
92 | |||
93 | ecx = cpuid_ecx(0x80000008); | ||
94 | |||
95 | c->x86_max_cores = (ecx & 0xff) + 1; | ||
96 | |||
97 | /* CPU telling us the core id bits shift? */ | ||
98 | bits = (ecx >> 12) & 0xF; | ||
99 | |||
100 | /* Otherwise recompute */ | ||
101 | if (bits == 0) { | ||
102 | while ((1 << bits) < c->x86_max_cores) | ||
103 | bits++; | ||
104 | } | ||
105 | |||
106 | c->x86_coreid_bits = bits; | ||
107 | |||
108 | #endif | ||
109 | } | ||
110 | |||
111 | static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) | ||
112 | { | ||
113 | early_init_amd_mc(c); | ||
114 | |||
115 | /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ | ||
116 | if (c->x86_power & (1<<8)) | ||
117 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
118 | |||
119 | set_cpu_cap(c, X86_FEATURE_SYSCALL32); | ||
120 | } | ||
121 | |||
122 | static void __cpuinit init_amd(struct cpuinfo_x86 *c) | ||
123 | { | ||
124 | unsigned level; | ||
125 | |||
126 | #ifdef CONFIG_SMP | ||
127 | unsigned long value; | ||
128 | |||
129 | /* | ||
130 | * Disable TLB flush filter by setting HWCR.FFDIS on K8 | ||
131 | * bit 6 of msr C001_0015 | ||
132 | * | ||
133 | * Errata 63 for SH-B3 steppings | ||
134 | * Errata 122 for all steppings (F+ have it disabled by default) | ||
135 | */ | ||
136 | if (c->x86 == 0xf) { | ||
137 | rdmsrl(MSR_K8_HWCR, value); | ||
138 | value |= 1 << 6; | ||
139 | wrmsrl(MSR_K8_HWCR, value); | ||
140 | } | ||
141 | #endif | ||
142 | |||
143 | /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; | ||
144 | 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ | ||
145 | clear_cpu_cap(c, 0*32+31); | ||
146 | |||
147 | /* On C+ stepping K8 rep microcode works well for copy/memset */ | ||
148 | if (c->x86 == 0xf) { | ||
149 | level = cpuid_eax(1); | ||
150 | if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) | ||
151 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
152 | } | ||
153 | if (c->x86 == 0x10 || c->x86 == 0x11) | ||
154 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
155 | |||
156 | /* Enable workaround for FXSAVE leak */ | ||
157 | if (c->x86 >= 6) | ||
158 | set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); | ||
159 | |||
160 | level = get_model_name(c); | ||
161 | if (!level) { | ||
162 | switch (c->x86) { | ||
163 | case 0xf: | ||
164 | /* Should distinguish Models here, but this is only | ||
165 | a fallback anyways. */ | ||
166 | strcpy(c->x86_model_id, "Hammer"); | ||
167 | break; | ||
168 | } | ||
169 | } | ||
170 | display_cacheinfo(c); | ||
171 | |||
172 | /* Multi core CPU? */ | ||
173 | if (c->extended_cpuid_level >= 0x80000008) | ||
174 | amd_detect_cmp(c); | ||
175 | |||
176 | if (c->extended_cpuid_level >= 0x80000006 && | ||
177 | (cpuid_edx(0x80000006) & 0xf000)) | ||
178 | num_cache_leaves = 4; | ||
179 | else | ||
180 | num_cache_leaves = 3; | ||
181 | |||
182 | if (c->x86 >= 0xf && c->x86 <= 0x11) | ||
183 | set_cpu_cap(c, X86_FEATURE_K8); | ||
184 | |||
185 | /* MFENCE stops RDTSC speculation */ | ||
186 | set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); | ||
187 | |||
188 | if (c->x86 == 0x10) { | ||
189 | /* do this for boot cpu */ | ||
190 | if (c == &boot_cpu_data) | ||
191 | check_enable_amd_mmconf_dmi(); | ||
192 | |||
193 | fam10h_check_enable_mmcfg(); | ||
194 | } | ||
195 | |||
196 | if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { | ||
197 | unsigned long long tseg; | ||
198 | |||
199 | /* | ||
200 | * Split up direct mapping around the TSEG SMM area. | ||
201 | * Don't do it for gbpages because there seems very little | ||
202 | * benefit in doing so. | ||
203 | */ | ||
204 | if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { | ||
205 | printk(KERN_DEBUG "tseg: %010llx\n", tseg); | ||
206 | if ((tseg>>PMD_SHIFT) < | ||
207 | (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || | ||
208 | ((tseg>>PMD_SHIFT) < | ||
209 | (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && | ||
210 | (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) | ||
211 | set_memory_4k((unsigned long)__va(tseg), 1); | ||
212 | } | ||
213 | } | ||
214 | } | ||
215 | |||
216 | static struct cpu_dev amd_cpu_dev __cpuinitdata = { | ||
217 | .c_vendor = "AMD", | ||
218 | .c_ident = { "AuthenticAMD" }, | ||
219 | .c_early_init = early_init_amd, | ||
220 | .c_init = init_amd, | ||
221 | }; | ||
222 | |||
223 | cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); | ||
224 | |||
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a55..89bfdd9cacc6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c | |||
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) | |||
289 | if (c->x86_model >= 6 && c->x86_model < 9) | 289 | if (c->x86_model >= 6 && c->x86_model < 9) |
290 | set_cpu_cap(c, X86_FEATURE_3DNOW); | 290 | set_cpu_cap(c, X86_FEATURE_3DNOW); |
291 | 291 | ||
292 | get_model_name(c); | ||
293 | display_cacheinfo(c); | 292 | display_cacheinfo(c); |
294 | } | 293 | } |
295 | 294 | ||
@@ -314,6 +313,16 @@ enum { | |||
314 | EAMD3D = 1<<20, | 313 | EAMD3D = 1<<20, |
315 | }; | 314 | }; |
316 | 315 | ||
316 | static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) | ||
317 | { | ||
318 | switch (c->x86) { | ||
319 | case 5: | ||
320 | /* Emulate MTRRs using Centaur's MCR. */ | ||
321 | set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); | ||
322 | break; | ||
323 | } | ||
324 | } | ||
325 | |||
317 | static void __cpuinit init_centaur(struct cpuinfo_x86 *c) | 326 | static void __cpuinit init_centaur(struct cpuinfo_x86 *c) |
318 | { | 327 | { |
319 | 328 | ||
@@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) | |||
462 | static struct cpu_dev centaur_cpu_dev __cpuinitdata = { | 471 | static struct cpu_dev centaur_cpu_dev __cpuinitdata = { |
463 | .c_vendor = "Centaur", | 472 | .c_vendor = "Centaur", |
464 | .c_ident = { "CentaurHauls" }, | 473 | .c_ident = { "CentaurHauls" }, |
474 | .c_early_init = early_init_centaur, | ||
465 | .c_init = init_centaur, | 475 | .c_init = init_centaur, |
466 | .c_size_cache = centaur_size_cache, | 476 | .c_size_cache = centaur_size_cache, |
477 | .c_x86_vendor = X86_VENDOR_CENTAUR, | ||
467 | }; | 478 | }; |
468 | 479 | ||
469 | cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); | 480 | cpu_dev_register(centaur_cpu_dev); |
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 1d181c40e2e1..a1625f5a1e78 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c | |||
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) | |||
16 | 16 | ||
17 | static void __cpuinit init_centaur(struct cpuinfo_x86 *c) | 17 | static void __cpuinit init_centaur(struct cpuinfo_x86 *c) |
18 | { | 18 | { |
19 | early_init_centaur(c); | ||
20 | |||
19 | if (c->x86 == 0x6 && c->x86_model >= 0xf) { | 21 | if (c->x86 == 0x6 && c->x86_model >= 0xf) { |
20 | c->x86_cache_alignment = c->x86_clflush_size * 2; | 22 | c->x86_cache_alignment = c->x86_clflush_size * 2; |
21 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
22 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | 23 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); |
23 | } | 24 | } |
24 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | 25 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); |
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = { | |||
29 | .c_ident = { "CentaurHauls" }, | 30 | .c_ident = { "CentaurHauls" }, |
30 | .c_early_init = early_init_centaur, | 31 | .c_early_init = early_init_centaur, |
31 | .c_init = init_centaur, | 32 | .c_init = init_centaur, |
33 | .c_x86_vendor = X86_VENDOR_CENTAUR, | ||
32 | }; | 34 | }; |
33 | 35 | ||
34 | cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); | 36 | cpu_dev_register(centaur_cpu_dev); |
35 | 37 | ||
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c new file mode 100644 index 000000000000..2056ccf572cc --- /dev/null +++ b/arch/x86/kernel/cpu/cmpxchg.c | |||
@@ -0,0 +1,72 @@ | |||
1 | /* | ||
2 | * cmpxchg*() fallbacks for CPU not supporting these instructions | ||
3 | */ | ||
4 | |||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/smp.h> | ||
7 | #include <linux/module.h> | ||
8 | |||
9 | #ifndef CONFIG_X86_CMPXCHG | ||
10 | unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) | ||
11 | { | ||
12 | u8 prev; | ||
13 | unsigned long flags; | ||
14 | |||
15 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
16 | local_irq_save(flags); | ||
17 | prev = *(u8 *)ptr; | ||
18 | if (prev == old) | ||
19 | *(u8 *)ptr = new; | ||
20 | local_irq_restore(flags); | ||
21 | return prev; | ||
22 | } | ||
23 | EXPORT_SYMBOL(cmpxchg_386_u8); | ||
24 | |||
25 | unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) | ||
26 | { | ||
27 | u16 prev; | ||
28 | unsigned long flags; | ||
29 | |||
30 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
31 | local_irq_save(flags); | ||
32 | prev = *(u16 *)ptr; | ||
33 | if (prev == old) | ||
34 | *(u16 *)ptr = new; | ||
35 | local_irq_restore(flags); | ||
36 | return prev; | ||
37 | } | ||
38 | EXPORT_SYMBOL(cmpxchg_386_u16); | ||
39 | |||
40 | unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) | ||
41 | { | ||
42 | u32 prev; | ||
43 | unsigned long flags; | ||
44 | |||
45 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
46 | local_irq_save(flags); | ||
47 | prev = *(u32 *)ptr; | ||
48 | if (prev == old) | ||
49 | *(u32 *)ptr = new; | ||
50 | local_irq_restore(flags); | ||
51 | return prev; | ||
52 | } | ||
53 | EXPORT_SYMBOL(cmpxchg_386_u32); | ||
54 | #endif | ||
55 | |||
56 | #ifndef CONFIG_X86_CMPXCHG64 | ||
57 | unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) | ||
58 | { | ||
59 | u64 prev; | ||
60 | unsigned long flags; | ||
61 | |||
62 | /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ | ||
63 | local_irq_save(flags); | ||
64 | prev = *(u64 *)ptr; | ||
65 | if (prev == old) | ||
66 | *(u64 *)ptr = new; | ||
67 | local_irq_restore(flags); | ||
68 | return prev; | ||
69 | } | ||
70 | EXPORT_SYMBOL(cmpxchg_486_u64); | ||
71 | #endif | ||
72 | |||
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..25581dcb280e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -1,27 +1,62 @@ | |||
1 | #include <linux/init.h> | 1 | #include <linux/init.h> |
2 | #include <linux/kernel.h> | ||
3 | #include <linux/sched.h> | ||
2 | #include <linux/string.h> | 4 | #include <linux/string.h> |
5 | #include <linux/bootmem.h> | ||
6 | #include <linux/bitops.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/kgdb.h> | ||
9 | #include <linux/topology.h> | ||
3 | #include <linux/delay.h> | 10 | #include <linux/delay.h> |
4 | #include <linux/smp.h> | 11 | #include <linux/smp.h> |
5 | #include <linux/module.h> | ||
6 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
7 | #include <linux/bootmem.h> | ||
8 | #include <asm/processor.h> | ||
9 | #include <asm/i387.h> | 13 | #include <asm/i387.h> |
10 | #include <asm/msr.h> | 14 | #include <asm/msr.h> |
11 | #include <asm/io.h> | 15 | #include <asm/io.h> |
16 | #include <asm/linkage.h> | ||
12 | #include <asm/mmu_context.h> | 17 | #include <asm/mmu_context.h> |
13 | #include <asm/mtrr.h> | 18 | #include <asm/mtrr.h> |
14 | #include <asm/mce.h> | 19 | #include <asm/mce.h> |
15 | #include <asm/pat.h> | 20 | #include <asm/pat.h> |
21 | #include <asm/asm.h> | ||
22 | #include <asm/numa.h> | ||
16 | #ifdef CONFIG_X86_LOCAL_APIC | 23 | #ifdef CONFIG_X86_LOCAL_APIC |
17 | #include <asm/mpspec.h> | 24 | #include <asm/mpspec.h> |
18 | #include <asm/apic.h> | 25 | #include <asm/apic.h> |
19 | #include <mach_apic.h> | 26 | #include <mach_apic.h> |
27 | #include <asm/genapic.h> | ||
20 | #endif | 28 | #endif |
21 | 29 | ||
30 | #include <asm/pda.h> | ||
31 | #include <asm/pgtable.h> | ||
32 | #include <asm/processor.h> | ||
33 | #include <asm/desc.h> | ||
34 | #include <asm/atomic.h> | ||
35 | #include <asm/proto.h> | ||
36 | #include <asm/sections.h> | ||
37 | #include <asm/setup.h> | ||
38 | |||
22 | #include "cpu.h" | 39 | #include "cpu.h" |
23 | 40 | ||
41 | static struct cpu_dev *this_cpu __cpuinitdata; | ||
42 | |||
43 | #ifdef CONFIG_X86_64 | ||
44 | /* We need valid kernel segments for data and code in long mode too | ||
45 | * IRET will check the segment types kkeil 2000/10/28 | ||
46 | * Also sysret mandates a special GDT layout | ||
47 | */ | ||
48 | /* The TLS descriptors are currently at a different place compared to i386. | ||
49 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
24 | DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { | 50 | DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { |
51 | [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, | ||
52 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, | ||
53 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, | ||
54 | [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, | ||
55 | [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, | ||
56 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, | ||
57 | } }; | ||
58 | #else | ||
59 | DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { | ||
25 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, | 60 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, |
26 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, | 61 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, |
27 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, | 62 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, |
@@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { | |||
55 | [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, | 90 | [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, |
56 | [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, | 91 | [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, |
57 | } }; | 92 | } }; |
93 | #endif | ||
58 | EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); | 94 | EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); |
59 | 95 | ||
60 | __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | 96 | #ifdef CONFIG_X86_32 |
61 | |||
62 | static int cachesize_override __cpuinitdata = -1; | 97 | static int cachesize_override __cpuinitdata = -1; |
63 | static int disable_x86_serial_nr __cpuinitdata = 1; | 98 | static int disable_x86_serial_nr __cpuinitdata = 1; |
64 | 99 | ||
65 | struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; | 100 | static int __init cachesize_setup(char *str) |
101 | { | ||
102 | get_option(&str, &cachesize_override); | ||
103 | return 1; | ||
104 | } | ||
105 | __setup("cachesize=", cachesize_setup); | ||
106 | |||
107 | static int __init x86_fxsr_setup(char *s) | ||
108 | { | ||
109 | setup_clear_cpu_cap(X86_FEATURE_FXSR); | ||
110 | setup_clear_cpu_cap(X86_FEATURE_XMM); | ||
111 | return 1; | ||
112 | } | ||
113 | __setup("nofxsr", x86_fxsr_setup); | ||
114 | |||
115 | static int __init x86_sep_setup(char *s) | ||
116 | { | ||
117 | setup_clear_cpu_cap(X86_FEATURE_SEP); | ||
118 | return 1; | ||
119 | } | ||
120 | __setup("nosep", x86_sep_setup); | ||
121 | |||
122 | /* Standard macro to see if a specific flag is changeable */ | ||
123 | static inline int flag_is_changeable_p(u32 flag) | ||
124 | { | ||
125 | u32 f1, f2; | ||
126 | |||
127 | /* | ||
128 | * Cyrix and IDT cpus allow disabling of CPUID | ||
129 | * so the code below may return different results | ||
130 | * when it is executed before and after enabling | ||
131 | * the CPUID. Add "volatile" to not allow gcc to | ||
132 | * optimize the subsequent calls to this function. | ||
133 | */ | ||
134 | asm volatile ("pushfl\n\t" | ||
135 | "pushfl\n\t" | ||
136 | "popl %0\n\t" | ||
137 | "movl %0,%1\n\t" | ||
138 | "xorl %2,%0\n\t" | ||
139 | "pushl %0\n\t" | ||
140 | "popfl\n\t" | ||
141 | "pushfl\n\t" | ||
142 | "popl %0\n\t" | ||
143 | "popfl\n\t" | ||
144 | : "=&r" (f1), "=&r" (f2) | ||
145 | : "ir" (flag)); | ||
146 | |||
147 | return ((f1^f2) & flag) != 0; | ||
148 | } | ||
149 | |||
150 | /* Probe for the CPUID instruction */ | ||
151 | static int __cpuinit have_cpuid_p(void) | ||
152 | { | ||
153 | return flag_is_changeable_p(X86_EFLAGS_ID); | ||
154 | } | ||
155 | |||
156 | static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) | ||
157 | { | ||
158 | if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { | ||
159 | /* Disable processor serial number */ | ||
160 | unsigned long lo, hi; | ||
161 | rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); | ||
162 | lo |= 0x200000; | ||
163 | wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); | ||
164 | printk(KERN_NOTICE "CPU serial number disabled.\n"); | ||
165 | clear_cpu_cap(c, X86_FEATURE_PN); | ||
166 | |||
167 | /* Disabling the serial number may affect the cpuid level */ | ||
168 | c->cpuid_level = cpuid_eax(0); | ||
169 | } | ||
170 | } | ||
171 | |||
172 | static int __init x86_serial_nr_setup(char *s) | ||
173 | { | ||
174 | disable_x86_serial_nr = 0; | ||
175 | return 1; | ||
176 | } | ||
177 | __setup("serialnumber", x86_serial_nr_setup); | ||
178 | #else | ||
179 | static inline int flag_is_changeable_p(u32 flag) | ||
180 | { | ||
181 | return 1; | ||
182 | } | ||
183 | /* Probe for the CPUID instruction */ | ||
184 | static inline int have_cpuid_p(void) | ||
185 | { | ||
186 | return 1; | ||
187 | } | ||
188 | static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) | ||
189 | { | ||
190 | } | ||
191 | #endif | ||
192 | |||
193 | /* | ||
194 | * Naming convention should be: <Name> [(<Codename>)] | ||
195 | * This table only is used unless init_<vendor>() below doesn't set it; | ||
196 | * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used | ||
197 | * | ||
198 | */ | ||
199 | |||
200 | /* Look up CPU names by table lookup. */ | ||
201 | static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) | ||
202 | { | ||
203 | struct cpu_model_info *info; | ||
204 | |||
205 | if (c->x86_model >= 16) | ||
206 | return NULL; /* Range check */ | ||
207 | |||
208 | if (!this_cpu) | ||
209 | return NULL; | ||
210 | |||
211 | info = this_cpu->c_models; | ||
212 | |||
213 | while (info && info->family) { | ||
214 | if (info->family == c->x86) | ||
215 | return info->model_names[c->x86_model]; | ||
216 | info++; | ||
217 | } | ||
218 | return NULL; /* Not found */ | ||
219 | } | ||
220 | |||
221 | __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | ||
222 | |||
223 | /* Current gdt points %fs at the "master" per-cpu area: after this, | ||
224 | * it's on the real one. */ | ||
225 | void switch_to_new_gdt(void) | ||
226 | { | ||
227 | struct desc_ptr gdt_descr; | ||
228 | |||
229 | gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); | ||
230 | gdt_descr.size = GDT_SIZE - 1; | ||
231 | load_gdt(&gdt_descr); | ||
232 | #ifdef CONFIG_X86_32 | ||
233 | asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); | ||
234 | #endif | ||
235 | } | ||
236 | |||
237 | static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; | ||
66 | 238 | ||
67 | static void __cpuinit default_init(struct cpuinfo_x86 *c) | 239 | static void __cpuinit default_init(struct cpuinfo_x86 *c) |
68 | { | 240 | { |
241 | #ifdef CONFIG_X86_64 | ||
242 | display_cacheinfo(c); | ||
243 | #else | ||
69 | /* Not much we can do here... */ | 244 | /* Not much we can do here... */ |
70 | /* Check if at least it has cpuid */ | 245 | /* Check if at least it has cpuid */ |
71 | if (c->cpuid_level == -1) { | 246 | if (c->cpuid_level == -1) { |
@@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) | |||
75 | else if (c->x86 == 3) | 250 | else if (c->x86 == 3) |
76 | strcpy(c->x86_model_id, "386"); | 251 | strcpy(c->x86_model_id, "386"); |
77 | } | 252 | } |
253 | #endif | ||
78 | } | 254 | } |
79 | 255 | ||
80 | static struct cpu_dev __cpuinitdata default_cpu = { | 256 | static struct cpu_dev __cpuinitdata default_cpu = { |
81 | .c_init = default_init, | 257 | .c_init = default_init, |
82 | .c_vendor = "Unknown", | 258 | .c_vendor = "Unknown", |
259 | .c_x86_vendor = X86_VENDOR_UNKNOWN, | ||
83 | }; | 260 | }; |
84 | static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; | ||
85 | |||
86 | static int __init cachesize_setup(char *str) | ||
87 | { | ||
88 | get_option(&str, &cachesize_override); | ||
89 | return 1; | ||
90 | } | ||
91 | __setup("cachesize=", cachesize_setup); | ||
92 | 261 | ||
93 | int __cpuinit get_model_name(struct cpuinfo_x86 *c) | 262 | static void __cpuinit get_model_name(struct cpuinfo_x86 *c) |
94 | { | 263 | { |
95 | unsigned int *v; | 264 | unsigned int *v; |
96 | char *p, *q; | 265 | char *p, *q; |
97 | 266 | ||
98 | if (cpuid_eax(0x80000000) < 0x80000004) | 267 | if (c->extended_cpuid_level < 0x80000004) |
99 | return 0; | 268 | return; |
100 | 269 | ||
101 | v = (unsigned int *) c->x86_model_id; | 270 | v = (unsigned int *) c->x86_model_id; |
102 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | 271 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); |
@@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) | |||
115 | while (q <= &c->x86_model_id[48]) | 284 | while (q <= &c->x86_model_id[48]) |
116 | *q++ = '\0'; /* Zero-pad the rest */ | 285 | *q++ = '\0'; /* Zero-pad the rest */ |
117 | } | 286 | } |
118 | |||
119 | return 1; | ||
120 | } | 287 | } |
121 | 288 | ||
122 | |||
123 | void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | 289 | void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) |
124 | { | 290 | { |
125 | unsigned int n, dummy, ecx, edx, l2size; | 291 | unsigned int n, dummy, ebx, ecx, edx, l2size; |
126 | 292 | ||
127 | n = cpuid_eax(0x80000000); | 293 | n = c->extended_cpuid_level; |
128 | 294 | ||
129 | if (n >= 0x80000005) { | 295 | if (n >= 0x80000005) { |
130 | cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); | 296 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); |
131 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", | 297 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", |
132 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | 298 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); |
133 | c->x86_cache_size = (ecx>>24)+(edx>>24); | 299 | c->x86_cache_size = (ecx>>24) + (edx>>24); |
300 | #ifdef CONFIG_X86_64 | ||
301 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
302 | c->x86_tlbsize = 0; | ||
303 | #endif | ||
134 | } | 304 | } |
135 | 305 | ||
136 | if (n < 0x80000006) /* Some chips just has a large L1. */ | 306 | if (n < 0x80000006) /* Some chips just has a large L1. */ |
137 | return; | 307 | return; |
138 | 308 | ||
139 | ecx = cpuid_ecx(0x80000006); | 309 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); |
140 | l2size = ecx >> 16; | 310 | l2size = ecx >> 16; |
141 | 311 | ||
312 | #ifdef CONFIG_X86_64 | ||
313 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
314 | #else | ||
142 | /* do processor-specific cache resizing */ | 315 | /* do processor-specific cache resizing */ |
143 | if (this_cpu->c_size_cache) | 316 | if (this_cpu->c_size_cache) |
144 | l2size = this_cpu->c_size_cache(c, l2size); | 317 | l2size = this_cpu->c_size_cache(c, l2size); |
@@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | |||
149 | 322 | ||
150 | if (l2size == 0) | 323 | if (l2size == 0) |
151 | return; /* Again, no L2 cache is possible */ | 324 | return; /* Again, no L2 cache is possible */ |
325 | #endif | ||
152 | 326 | ||
153 | c->x86_cache_size = l2size; | 327 | c->x86_cache_size = l2size; |
154 | 328 | ||
155 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | 329 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", |
156 | l2size, ecx & 0xFF); | 330 | l2size, ecx & 0xFF); |
157 | } | 331 | } |
158 | 332 | ||
159 | /* | 333 | void __cpuinit detect_ht(struct cpuinfo_x86 *c) |
160 | * Naming convention should be: <Name> [(<Codename>)] | ||
161 | * This table only is used unless init_<vendor>() below doesn't set it; | ||
162 | * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used | ||
163 | * | ||
164 | */ | ||
165 | |||
166 | /* Look up CPU names by table lookup. */ | ||
167 | static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) | ||
168 | { | 334 | { |
169 | struct cpu_model_info *info; | 335 | #ifdef CONFIG_X86_HT |
336 | u32 eax, ebx, ecx, edx; | ||
337 | int index_msb, core_bits; | ||
170 | 338 | ||
171 | if (c->x86_model >= 16) | 339 | if (!cpu_has(c, X86_FEATURE_HT)) |
172 | return NULL; /* Range check */ | 340 | return; |
173 | 341 | ||
174 | if (!this_cpu) | 342 | if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) |
175 | return NULL; | 343 | goto out; |
176 | 344 | ||
177 | info = this_cpu->c_models; | 345 | if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) |
346 | return; | ||
178 | 347 | ||
179 | while (info && info->family) { | 348 | cpuid(1, &eax, &ebx, &ecx, &edx); |
180 | if (info->family == c->x86) | 349 | |
181 | return info->model_names[c->x86_model]; | 350 | smp_num_siblings = (ebx & 0xff0000) >> 16; |
182 | info++; | 351 | |
352 | if (smp_num_siblings == 1) { | ||
353 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
354 | } else if (smp_num_siblings > 1) { | ||
355 | |||
356 | if (smp_num_siblings > NR_CPUS) { | ||
357 | printk(KERN_WARNING "CPU: Unsupported number of siblings %d", | ||
358 | smp_num_siblings); | ||
359 | smp_num_siblings = 1; | ||
360 | return; | ||
361 | } | ||
362 | |||
363 | index_msb = get_count_order(smp_num_siblings); | ||
364 | #ifdef CONFIG_X86_64 | ||
365 | c->phys_proc_id = phys_pkg_id(index_msb); | ||
366 | #else | ||
367 | c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); | ||
368 | #endif | ||
369 | |||
370 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | ||
371 | |||
372 | index_msb = get_count_order(smp_num_siblings); | ||
373 | |||
374 | core_bits = get_count_order(c->x86_max_cores); | ||
375 | |||
376 | #ifdef CONFIG_X86_64 | ||
377 | c->cpu_core_id = phys_pkg_id(index_msb) & | ||
378 | ((1 << core_bits) - 1); | ||
379 | #else | ||
380 | c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & | ||
381 | ((1 << core_bits) - 1); | ||
382 | #endif | ||
183 | } | 383 | } |
184 | return NULL; /* Not found */ | ||
185 | } | ||
186 | 384 | ||
385 | out: | ||
386 | if ((c->x86_max_cores * smp_num_siblings) > 1) { | ||
387 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
388 | c->phys_proc_id); | ||
389 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", | ||
390 | c->cpu_core_id); | ||
391 | } | ||
392 | #endif | ||
393 | } | ||
187 | 394 | ||
188 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) | 395 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) |
189 | { | 396 | { |
190 | char *v = c->x86_vendor_id; | 397 | char *v = c->x86_vendor_id; |
191 | int i; | 398 | int i; |
192 | static int printed; | 399 | static int printed; |
193 | 400 | ||
194 | for (i = 0; i < X86_VENDOR_NUM; i++) { | 401 | for (i = 0; i < X86_VENDOR_NUM; i++) { |
195 | if (cpu_devs[i]) { | 402 | if (!cpu_devs[i]) |
196 | if (!strcmp(v, cpu_devs[i]->c_ident[0]) || | 403 | break; |
197 | (cpu_devs[i]->c_ident[1] && | 404 | |
198 | !strcmp(v, cpu_devs[i]->c_ident[1]))) { | 405 | if (!strcmp(v, cpu_devs[i]->c_ident[0]) || |
199 | c->x86_vendor = i; | 406 | (cpu_devs[i]->c_ident[1] && |
200 | if (!early) | 407 | !strcmp(v, cpu_devs[i]->c_ident[1]))) { |
201 | this_cpu = cpu_devs[i]; | 408 | this_cpu = cpu_devs[i]; |
202 | return; | 409 | c->x86_vendor = this_cpu->c_x86_vendor; |
203 | } | 410 | return; |
204 | } | 411 | } |
205 | } | 412 | } |
413 | |||
206 | if (!printed) { | 414 | if (!printed) { |
207 | printed++; | 415 | printed++; |
208 | printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); | 416 | printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); |
209 | printk(KERN_ERR "CPU: Your system may be unstable.\n"); | 417 | printk(KERN_ERR "CPU: Your system may be unstable.\n"); |
210 | } | 418 | } |
419 | |||
211 | c->x86_vendor = X86_VENDOR_UNKNOWN; | 420 | c->x86_vendor = X86_VENDOR_UNKNOWN; |
212 | this_cpu = &default_cpu; | 421 | this_cpu = &default_cpu; |
213 | } | 422 | } |
214 | 423 | ||
215 | 424 | void __cpuinit cpu_detect(struct cpuinfo_x86 *c) | |
216 | static int __init x86_fxsr_setup(char *s) | ||
217 | { | ||
218 | setup_clear_cpu_cap(X86_FEATURE_FXSR); | ||
219 | setup_clear_cpu_cap(X86_FEATURE_XMM); | ||
220 | return 1; | ||
221 | } | ||
222 | __setup("nofxsr", x86_fxsr_setup); | ||
223 | |||
224 | |||
225 | static int __init x86_sep_setup(char *s) | ||
226 | { | ||
227 | setup_clear_cpu_cap(X86_FEATURE_SEP); | ||
228 | return 1; | ||
229 | } | ||
230 | __setup("nosep", x86_sep_setup); | ||
231 | |||
232 | |||
233 | /* Standard macro to see if a specific flag is changeable */ | ||
234 | static inline int flag_is_changeable_p(u32 flag) | ||
235 | { | ||
236 | u32 f1, f2; | ||
237 | |||
238 | asm("pushfl\n\t" | ||
239 | "pushfl\n\t" | ||
240 | "popl %0\n\t" | ||
241 | "movl %0,%1\n\t" | ||
242 | "xorl %2,%0\n\t" | ||
243 | "pushl %0\n\t" | ||
244 | "popfl\n\t" | ||
245 | "pushfl\n\t" | ||
246 | "popl %0\n\t" | ||
247 | "popfl\n\t" | ||
248 | : "=&r" (f1), "=&r" (f2) | ||
249 | : "ir" (flag)); | ||
250 | |||
251 | return ((f1^f2) & flag) != 0; | ||
252 | } | ||
253 | |||
254 | |||
255 | /* Probe for the CPUID instruction */ | ||
256 | static int __cpuinit have_cpuid_p(void) | ||
257 | { | ||
258 | return flag_is_changeable_p(X86_EFLAGS_ID); | ||
259 | } | ||
260 | |||
261 | void __init cpu_detect(struct cpuinfo_x86 *c) | ||
262 | { | 425 | { |
263 | /* Get vendor name */ | 426 | /* Get vendor name */ |
264 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | 427 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, |
@@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c) | |||
267 | (unsigned int *)&c->x86_vendor_id[4]); | 430 | (unsigned int *)&c->x86_vendor_id[4]); |
268 | 431 | ||
269 | c->x86 = 4; | 432 | c->x86 = 4; |
433 | /* Intel-defined flags: level 0x00000001 */ | ||
270 | if (c->cpuid_level >= 0x00000001) { | 434 | if (c->cpuid_level >= 0x00000001) { |
271 | u32 junk, tfms, cap0, misc; | 435 | u32 junk, tfms, cap0, misc; |
272 | cpuid(0x00000001, &tfms, &misc, &junk, &cap0); | 436 | cpuid(0x00000001, &tfms, &misc, &junk, &cap0); |
273 | c->x86 = (tfms >> 8) & 15; | 437 | c->x86 = (tfms >> 8) & 0xf; |
274 | c->x86_model = (tfms >> 4) & 15; | 438 | c->x86_model = (tfms >> 4) & 0xf; |
439 | c->x86_mask = tfms & 0xf; | ||
275 | if (c->x86 == 0xf) | 440 | if (c->x86 == 0xf) |
276 | c->x86 += (tfms >> 20) & 0xff; | 441 | c->x86 += (tfms >> 20) & 0xff; |
277 | if (c->x86 >= 0x6) | 442 | if (c->x86 >= 0x6) |
278 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | 443 | c->x86_model += ((tfms >> 16) & 0xf) << 4; |
279 | c->x86_mask = tfms & 15; | ||
280 | if (cap0 & (1<<19)) { | 444 | if (cap0 & (1<<19)) { |
281 | c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; | ||
282 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | 445 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; |
446 | c->x86_cache_alignment = c->x86_clflush_size; | ||
283 | } | 447 | } |
284 | } | 448 | } |
285 | } | 449 | } |
286 | static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) | 450 | |
451 | static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) | ||
287 | { | 452 | { |
288 | u32 tfms, xlvl; | 453 | u32 tfms, xlvl; |
289 | unsigned int ebx; | 454 | u32 ebx; |
290 | 455 | ||
291 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | 456 | /* Intel-defined flags: level 0x00000001 */ |
292 | if (have_cpuid_p()) { | 457 | if (c->cpuid_level >= 0x00000001) { |
293 | /* Intel-defined flags: level 0x00000001 */ | 458 | u32 capability, excap; |
294 | if (c->cpuid_level >= 0x00000001) { | 459 | cpuid(0x00000001, &tfms, &ebx, &excap, &capability); |
295 | u32 capability, excap; | 460 | c->x86_capability[0] = capability; |
296 | cpuid(0x00000001, &tfms, &ebx, &excap, &capability); | 461 | c->x86_capability[4] = excap; |
297 | c->x86_capability[0] = capability; | 462 | } |
298 | c->x86_capability[4] = excap; | ||
299 | } | ||
300 | 463 | ||
301 | /* AMD-defined flags: level 0x80000001 */ | 464 | /* AMD-defined flags: level 0x80000001 */ |
302 | xlvl = cpuid_eax(0x80000000); | 465 | xlvl = cpuid_eax(0x80000000); |
303 | if ((xlvl & 0xffff0000) == 0x80000000) { | 466 | c->extended_cpuid_level = xlvl; |
304 | if (xlvl >= 0x80000001) { | 467 | if ((xlvl & 0xffff0000) == 0x80000000) { |
305 | c->x86_capability[1] = cpuid_edx(0x80000001); | 468 | if (xlvl >= 0x80000001) { |
306 | c->x86_capability[6] = cpuid_ecx(0x80000001); | 469 | c->x86_capability[1] = cpuid_edx(0x80000001); |
307 | } | 470 | c->x86_capability[6] = cpuid_ecx(0x80000001); |
308 | } | 471 | } |
472 | } | ||
473 | |||
474 | #ifdef CONFIG_X86_64 | ||
475 | if (c->extended_cpuid_level >= 0x80000008) { | ||
476 | u32 eax = cpuid_eax(0x80000008); | ||
309 | 477 | ||
478 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
479 | c->x86_phys_bits = eax & 0xff; | ||
310 | } | 480 | } |
481 | #endif | ||
482 | |||
483 | if (c->extended_cpuid_level >= 0x80000007) | ||
484 | c->x86_power = cpuid_edx(0x80000007); | ||
311 | 485 | ||
312 | } | 486 | } |
313 | 487 | ||
488 | static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) | ||
489 | { | ||
490 | #ifdef CONFIG_X86_32 | ||
491 | int i; | ||
492 | |||
493 | /* | ||
494 | * First of all, decide if this is a 486 or higher | ||
495 | * It's a 486 if we can modify the AC flag | ||
496 | */ | ||
497 | if (flag_is_changeable_p(X86_EFLAGS_AC)) | ||
498 | c->x86 = 4; | ||
499 | else | ||
500 | c->x86 = 3; | ||
501 | |||
502 | for (i = 0; i < X86_VENDOR_NUM; i++) | ||
503 | if (cpu_devs[i] && cpu_devs[i]->c_identify) { | ||
504 | c->x86_vendor_id[0] = 0; | ||
505 | cpu_devs[i]->c_identify(c); | ||
506 | if (c->x86_vendor_id[0]) { | ||
507 | get_cpu_vendor(c); | ||
508 | break; | ||
509 | } | ||
510 | } | ||
511 | #endif | ||
512 | } | ||
513 | |||
314 | /* | 514 | /* |
315 | * Do minimum CPU detection early. | 515 | * Do minimum CPU detection early. |
316 | * Fields really needed: vendor, cpuid_level, family, model, mask, | 516 | * Fields really needed: vendor, cpuid_level, family, model, mask, |
@@ -320,109 +520,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) | |||
320 | * WARNING: this function is only called on the BP. Don't add code here | 520 | * WARNING: this function is only called on the BP. Don't add code here |
321 | * that is supposed to run on all CPUs. | 521 | * that is supposed to run on all CPUs. |
322 | */ | 522 | */ |
323 | static void __init early_cpu_detect(void) | 523 | static void __init early_identify_cpu(struct cpuinfo_x86 *c) |
324 | { | 524 | { |
325 | struct cpuinfo_x86 *c = &boot_cpu_data; | 525 | #ifdef CONFIG_X86_64 |
326 | 526 | c->x86_clflush_size = 64; | |
327 | c->x86_cache_alignment = 32; | 527 | #else |
328 | c->x86_clflush_size = 32; | 528 | c->x86_clflush_size = 32; |
529 | #endif | ||
530 | c->x86_cache_alignment = c->x86_clflush_size; | ||
531 | |||
532 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
533 | c->extended_cpuid_level = 0; | ||
534 | |||
535 | if (!have_cpuid_p()) | ||
536 | identify_cpu_without_cpuid(c); | ||
329 | 537 | ||
538 | /* cyrix could have cpuid enabled via c_identify()*/ | ||
330 | if (!have_cpuid_p()) | 539 | if (!have_cpuid_p()) |
331 | return; | 540 | return; |
332 | 541 | ||
333 | cpu_detect(c); | 542 | cpu_detect(c); |
334 | 543 | ||
335 | get_cpu_vendor(c, 1); | 544 | get_cpu_vendor(c); |
545 | |||
546 | get_cpu_cap(c); | ||
336 | 547 | ||
337 | if (c->x86_vendor != X86_VENDOR_UNKNOWN && | 548 | if (this_cpu->c_early_init) |
338 | cpu_devs[c->x86_vendor]->c_early_init) | 549 | this_cpu->c_early_init(c); |
339 | cpu_devs[c->x86_vendor]->c_early_init(c); | ||
340 | 550 | ||
341 | early_get_cap(c); | 551 | validate_pat_support(c); |
342 | } | 552 | } |
343 | 553 | ||
344 | static void __cpuinit generic_identify(struct cpuinfo_x86 *c) | 554 | void __init early_cpu_init(void) |
345 | { | 555 | { |
346 | u32 tfms, xlvl; | 556 | struct cpu_dev **cdev; |
347 | unsigned int ebx; | 557 | int count = 0; |
348 | 558 | ||
349 | if (have_cpuid_p()) { | 559 | printk("KERNEL supported cpus:\n"); |
350 | /* Get vendor name */ | 560 | for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { |
351 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | 561 | struct cpu_dev *cpudev = *cdev; |
352 | (unsigned int *)&c->x86_vendor_id[0], | 562 | unsigned int j; |
353 | (unsigned int *)&c->x86_vendor_id[8], | 563 | |
354 | (unsigned int *)&c->x86_vendor_id[4]); | 564 | if (count >= X86_VENDOR_NUM) |
355 | 565 | break; | |
356 | get_cpu_vendor(c, 0); | 566 | cpu_devs[count] = cpudev; |
357 | /* Initialize the standard set of capabilities */ | 567 | count++; |
358 | /* Note that the vendor-specific code below might override */ | 568 | |
359 | /* Intel-defined flags: level 0x00000001 */ | 569 | for (j = 0; j < 2; j++) { |
360 | if (c->cpuid_level >= 0x00000001) { | 570 | if (!cpudev->c_ident[j]) |
361 | u32 capability, excap; | 571 | continue; |
362 | cpuid(0x00000001, &tfms, &ebx, &excap, &capability); | 572 | printk(" %s %s\n", cpudev->c_vendor, |
363 | c->x86_capability[0] = capability; | 573 | cpudev->c_ident[j]); |
364 | c->x86_capability[4] = excap; | ||
365 | c->x86 = (tfms >> 8) & 15; | ||
366 | c->x86_model = (tfms >> 4) & 15; | ||
367 | if (c->x86 == 0xf) | ||
368 | c->x86 += (tfms >> 20) & 0xff; | ||
369 | if (c->x86 >= 0x6) | ||
370 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
371 | c->x86_mask = tfms & 15; | ||
372 | c->initial_apicid = (ebx >> 24) & 0xFF; | ||
373 | #ifdef CONFIG_X86_HT | ||
374 | c->apicid = phys_pkg_id(c->initial_apicid, 0); | ||
375 | c->phys_proc_id = c->initial_apicid; | ||
376 | #else | ||
377 | c->apicid = c->initial_apicid; | ||
378 | #endif | ||
379 | if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) | ||
380 | c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; | ||
381 | } else { | ||
382 | /* Have CPUID level 0 only - unheard of */ | ||
383 | c->x86 = 4; | ||
384 | } | ||
385 | |||
386 | /* AMD-defined flags: level 0x80000001 */ | ||
387 | xlvl = cpuid_eax(0x80000000); | ||
388 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
389 | if (xlvl >= 0x80000001) { | ||
390 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
391 | c->x86_capability[6] = cpuid_ecx(0x80000001); | ||
392 | } | ||
393 | if (xlvl >= 0x80000004) | ||
394 | get_model_name(c); /* Default name */ | ||
395 | } | 574 | } |
396 | |||
397 | init_scattered_cpuid_features(c); | ||
398 | } | 575 | } |
399 | 576 | ||
577 | early_identify_cpu(&boot_cpu_data); | ||
400 | } | 578 | } |
401 | 579 | ||
402 | static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) | 580 | /* |
581 | * The NOPL instruction is supposed to exist on all CPUs with | ||
582 | * family >= 6; unfortunately, that's not true in practice because | ||
583 | * of early VIA chips and (more importantly) broken virtualizers that | ||
584 | * are not easy to detect. In the latter case it doesn't even *fail* | ||
585 | * reliably, so probing for it doesn't even work. Disable it completely | ||
586 | * unless we can find a reliable way to detect all the broken cases. | ||
587 | */ | ||
588 | static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) | ||
403 | { | 589 | { |
404 | if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { | 590 | clear_cpu_cap(c, X86_FEATURE_NOPL); |
405 | /* Disable processor serial number */ | ||
406 | unsigned long lo, hi; | ||
407 | rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); | ||
408 | lo |= 0x200000; | ||
409 | wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); | ||
410 | printk(KERN_NOTICE "CPU serial number disabled.\n"); | ||
411 | clear_cpu_cap(c, X86_FEATURE_PN); | ||
412 | |||
413 | /* Disabling the serial number may affect the cpuid level */ | ||
414 | c->cpuid_level = cpuid_eax(0); | ||
415 | } | ||
416 | } | 591 | } |
417 | 592 | ||
418 | static int __init x86_serial_nr_setup(char *s) | 593 | static void __cpuinit generic_identify(struct cpuinfo_x86 *c) |
419 | { | 594 | { |
420 | disable_x86_serial_nr = 0; | 595 | c->extended_cpuid_level = 0; |
421 | return 1; | 596 | |
422 | } | 597 | if (!have_cpuid_p()) |
423 | __setup("serialnumber", x86_serial_nr_setup); | 598 | identify_cpu_without_cpuid(c); |
599 | |||
600 | /* cyrix could have cpuid enabled via c_identify()*/ | ||
601 | if (!have_cpuid_p()) | ||
602 | return; | ||
603 | |||
604 | cpu_detect(c); | ||
605 | |||
606 | get_cpu_vendor(c); | ||
424 | 607 | ||
608 | get_cpu_cap(c); | ||
425 | 609 | ||
610 | if (c->cpuid_level >= 0x00000001) { | ||
611 | c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; | ||
612 | #ifdef CONFIG_X86_32 | ||
613 | # ifdef CONFIG_X86_HT | ||
614 | c->apicid = phys_pkg_id(c->initial_apicid, 0); | ||
615 | # else | ||
616 | c->apicid = c->initial_apicid; | ||
617 | # endif | ||
618 | #endif | ||
619 | |||
620 | #ifdef CONFIG_X86_HT | ||
621 | c->phys_proc_id = c->initial_apicid; | ||
622 | #endif | ||
623 | } | ||
624 | |||
625 | get_model_name(c); /* Default name */ | ||
626 | |||
627 | init_scattered_cpuid_features(c); | ||
628 | detect_nopl(c); | ||
629 | } | ||
426 | 630 | ||
427 | /* | 631 | /* |
428 | * This does the hard work of actually picking apart the CPU stuff... | 632 | * This does the hard work of actually picking apart the CPU stuff... |
@@ -434,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
434 | c->loops_per_jiffy = loops_per_jiffy; | 638 | c->loops_per_jiffy = loops_per_jiffy; |
435 | c->x86_cache_size = -1; | 639 | c->x86_cache_size = -1; |
436 | c->x86_vendor = X86_VENDOR_UNKNOWN; | 640 | c->x86_vendor = X86_VENDOR_UNKNOWN; |
437 | c->cpuid_level = -1; /* CPUID not detected */ | ||
438 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | 641 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ |
439 | c->x86_vendor_id[0] = '\0'; /* Unset */ | 642 | c->x86_vendor_id[0] = '\0'; /* Unset */ |
440 | c->x86_model_id[0] = '\0'; /* Unset */ | 643 | c->x86_model_id[0] = '\0'; /* Unset */ |
441 | c->x86_max_cores = 1; | 644 | c->x86_max_cores = 1; |
645 | c->x86_coreid_bits = 0; | ||
646 | #ifdef CONFIG_X86_64 | ||
647 | c->x86_clflush_size = 64; | ||
648 | #else | ||
649 | c->cpuid_level = -1; /* CPUID not detected */ | ||
442 | c->x86_clflush_size = 32; | 650 | c->x86_clflush_size = 32; |
651 | #endif | ||
652 | c->x86_cache_alignment = c->x86_clflush_size; | ||
443 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | 653 | memset(&c->x86_capability, 0, sizeof c->x86_capability); |
444 | 654 | ||
445 | if (!have_cpuid_p()) { | ||
446 | /* | ||
447 | * First of all, decide if this is a 486 or higher | ||
448 | * It's a 486 if we can modify the AC flag | ||
449 | */ | ||
450 | if (flag_is_changeable_p(X86_EFLAGS_AC)) | ||
451 | c->x86 = 4; | ||
452 | else | ||
453 | c->x86 = 3; | ||
454 | } | ||
455 | |||
456 | generic_identify(c); | 655 | generic_identify(c); |
457 | 656 | ||
458 | if (this_cpu->c_identify) | 657 | if (this_cpu->c_identify) |
459 | this_cpu->c_identify(c); | 658 | this_cpu->c_identify(c); |
460 | 659 | ||
660 | #ifdef CONFIG_X86_64 | ||
661 | c->apicid = phys_pkg_id(0); | ||
662 | #endif | ||
663 | |||
461 | /* | 664 | /* |
462 | * Vendor-specific initialization. In this section we | 665 | * Vendor-specific initialization. In this section we |
463 | * canonicalize the feature flags, meaning if there are | 666 | * canonicalize the feature flags, meaning if there are |
@@ -491,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
491 | c->x86, c->x86_model); | 694 | c->x86, c->x86_model); |
492 | } | 695 | } |
493 | 696 | ||
697 | #ifdef CONFIG_X86_64 | ||
698 | detect_ht(c); | ||
699 | #endif | ||
700 | |||
494 | /* | 701 | /* |
495 | * On SMP, boot_cpu_data holds the common feature set between | 702 | * On SMP, boot_cpu_data holds the common feature set between |
496 | * all CPUs; so make sure that we indicate which features are | 703 | * all CPUs; so make sure that we indicate which features are |
@@ -499,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
499 | */ | 706 | */ |
500 | if (c != &boot_cpu_data) { | 707 | if (c != &boot_cpu_data) { |
501 | /* AND the already accumulated flags with these */ | 708 | /* AND the already accumulated flags with these */ |
502 | for (i = 0 ; i < NCAPINTS ; i++) | 709 | for (i = 0; i < NCAPINTS; i++) |
503 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | 710 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; |
504 | } | 711 | } |
505 | 712 | ||
@@ -507,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
507 | for (i = 0; i < NCAPINTS; i++) | 714 | for (i = 0; i < NCAPINTS; i++) |
508 | c->x86_capability[i] &= ~cleared_cpu_caps[i]; | 715 | c->x86_capability[i] &= ~cleared_cpu_caps[i]; |
509 | 716 | ||
717 | #ifdef CONFIG_X86_MCE | ||
510 | /* Init Machine Check Exception if available. */ | 718 | /* Init Machine Check Exception if available. */ |
511 | mcheck_init(c); | 719 | mcheck_init(c); |
720 | #endif | ||
512 | 721 | ||
513 | select_idle_routine(c); | 722 | select_idle_routine(c); |
723 | |||
724 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | ||
725 | numa_add_cpu(smp_processor_id()); | ||
726 | #endif | ||
514 | } | 727 | } |
515 | 728 | ||
729 | #ifdef CONFIG_X86_64 | ||
730 | static void vgetcpu_set_mode(void) | ||
731 | { | ||
732 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
733 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
734 | else | ||
735 | vgetcpu_mode = VGETCPU_LSL; | ||
736 | } | ||
737 | #endif | ||
738 | |||
516 | void __init identify_boot_cpu(void) | 739 | void __init identify_boot_cpu(void) |
517 | { | 740 | { |
518 | identify_cpu(&boot_cpu_data); | 741 | identify_cpu(&boot_cpu_data); |
742 | #ifdef CONFIG_X86_32 | ||
519 | sysenter_setup(); | 743 | sysenter_setup(); |
520 | enable_sep_cpu(); | 744 | enable_sep_cpu(); |
745 | #else | ||
746 | vgetcpu_set_mode(); | ||
747 | #endif | ||
521 | } | 748 | } |
522 | 749 | ||
523 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | 750 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) |
524 | { | 751 | { |
525 | BUG_ON(c == &boot_cpu_data); | 752 | BUG_ON(c == &boot_cpu_data); |
526 | identify_cpu(c); | 753 | identify_cpu(c); |
754 | #ifdef CONFIG_X86_32 | ||
527 | enable_sep_cpu(); | 755 | enable_sep_cpu(); |
756 | #endif | ||
528 | mtrr_ap_init(); | 757 | mtrr_ap_init(); |
529 | } | 758 | } |
530 | 759 | ||
531 | #ifdef CONFIG_X86_HT | 760 | struct msr_range { |
532 | void __cpuinit detect_ht(struct cpuinfo_x86 *c) | 761 | unsigned min; |
533 | { | 762 | unsigned max; |
534 | u32 eax, ebx, ecx, edx; | 763 | }; |
535 | int index_msb, core_bits; | ||
536 | |||
537 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
538 | |||
539 | if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
540 | return; | ||
541 | |||
542 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
543 | 764 | ||
544 | if (smp_num_siblings == 1) { | 765 | static struct msr_range msr_range_array[] __cpuinitdata = { |
545 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | 766 | { 0x00000000, 0x00000418}, |
546 | } else if (smp_num_siblings > 1) { | 767 | { 0xc0000000, 0xc000040b}, |
768 | { 0xc0010000, 0xc0010142}, | ||
769 | { 0xc0011000, 0xc001103b}, | ||
770 | }; | ||
547 | 771 | ||
548 | if (smp_num_siblings > NR_CPUS) { | 772 | static void __cpuinit print_cpu_msr(void) |
549 | printk(KERN_WARNING "CPU: Unsupported number of the " | 773 | { |
550 | "siblings %d", smp_num_siblings); | 774 | unsigned index; |
551 | smp_num_siblings = 1; | 775 | u64 val; |
552 | return; | 776 | int i; |
777 | unsigned index_min, index_max; | ||
778 | |||
779 | for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { | ||
780 | index_min = msr_range_array[i].min; | ||
781 | index_max = msr_range_array[i].max; | ||
782 | for (index = index_min; index < index_max; index++) { | ||
783 | if (rdmsrl_amd_safe(index, &val)) | ||
784 | continue; | ||
785 | printk(KERN_INFO " MSR%08x: %016llx\n", index, val); | ||
553 | } | 786 | } |
787 | } | ||
788 | } | ||
554 | 789 | ||
555 | index_msb = get_count_order(smp_num_siblings); | 790 | static int show_msr __cpuinitdata; |
556 | c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); | 791 | static __init int setup_show_msr(char *arg) |
557 | 792 | { | |
558 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | 793 | int num; |
559 | c->phys_proc_id); | ||
560 | |||
561 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | ||
562 | |||
563 | index_msb = get_count_order(smp_num_siblings) ; | ||
564 | |||
565 | core_bits = get_count_order(c->x86_max_cores); | ||
566 | 794 | ||
567 | c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & | 795 | get_option(&arg, &num); |
568 | ((1 << core_bits) - 1); | ||
569 | 796 | ||
570 | if (c->x86_max_cores > 1) | 797 | if (num > 0) |
571 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", | 798 | show_msr = num; |
572 | c->cpu_core_id); | 799 | return 1; |
573 | } | ||
574 | } | 800 | } |
575 | #endif | 801 | __setup("show_msr=", setup_show_msr); |
576 | 802 | ||
577 | static __init int setup_noclflush(char *arg) | 803 | static __init int setup_noclflush(char *arg) |
578 | { | 804 | { |
@@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | |||
590 | else if (c->cpuid_level >= 0) | 816 | else if (c->cpuid_level >= 0) |
591 | vendor = c->x86_vendor_id; | 817 | vendor = c->x86_vendor_id; |
592 | 818 | ||
593 | if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) | 819 | if (vendor && !strstr(c->x86_model_id, vendor)) |
594 | printk("%s ", vendor); | 820 | printk(KERN_CONT "%s ", vendor); |
595 | 821 | ||
596 | if (!c->x86_model_id[0]) | 822 | if (c->x86_model_id[0]) |
597 | printk("%d86", c->x86); | 823 | printk(KERN_CONT "%s", c->x86_model_id); |
598 | else | 824 | else |
599 | printk("%s", c->x86_model_id); | 825 | printk(KERN_CONT "%d86", c->x86); |
600 | 826 | ||
601 | if (c->x86_mask || c->cpuid_level >= 0) | 827 | if (c->x86_mask || c->cpuid_level >= 0) |
602 | printk(" stepping %02x\n", c->x86_mask); | 828 | printk(KERN_CONT " stepping %02x\n", c->x86_mask); |
603 | else | 829 | else |
604 | printk("\n"); | 830 | printk(KERN_CONT "\n"); |
831 | |||
832 | #ifdef CONFIG_SMP | ||
833 | if (c->cpu_index < show_msr) | ||
834 | print_cpu_msr(); | ||
835 | #else | ||
836 | if (show_msr) | ||
837 | print_cpu_msr(); | ||
838 | #endif | ||
605 | } | 839 | } |
606 | 840 | ||
607 | static __init int setup_disablecpuid(char *arg) | 841 | static __init int setup_disablecpuid(char *arg) |
@@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid); | |||
617 | 851 | ||
618 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | 852 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; |
619 | 853 | ||
620 | void __init early_cpu_init(void) | 854 | #ifdef CONFIG_X86_64 |
855 | struct x8664_pda **_cpu_pda __read_mostly; | ||
856 | EXPORT_SYMBOL(_cpu_pda); | ||
857 | |||
858 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | ||
859 | |||
860 | char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; | ||
861 | |||
862 | void __cpuinit pda_init(int cpu) | ||
621 | { | 863 | { |
622 | struct cpu_vendor_dev *cvdev; | 864 | struct x8664_pda *pda = cpu_pda(cpu); |
865 | |||
866 | /* Setup up data that may be needed in __get_free_pages early */ | ||
867 | loadsegment(fs, 0); | ||
868 | loadsegment(gs, 0); | ||
869 | /* Memory clobbers used to order PDA accessed */ | ||
870 | mb(); | ||
871 | wrmsrl(MSR_GS_BASE, pda); | ||
872 | mb(); | ||
873 | |||
874 | pda->cpunumber = cpu; | ||
875 | pda->irqcount = -1; | ||
876 | pda->kernelstack = (unsigned long)stack_thread_info() - | ||
877 | PDA_STACKOFFSET + THREAD_SIZE; | ||
878 | pda->active_mm = &init_mm; | ||
879 | pda->mmu_state = 0; | ||
880 | |||
881 | if (cpu == 0) { | ||
882 | /* others are initialized in smpboot.c */ | ||
883 | pda->pcurrent = &init_task; | ||
884 | pda->irqstackptr = boot_cpu_stack; | ||
885 | pda->irqstackptr += IRQSTACKSIZE - 64; | ||
886 | } else { | ||
887 | if (!pda->irqstackptr) { | ||
888 | pda->irqstackptr = (char *) | ||
889 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
890 | if (!pda->irqstackptr) | ||
891 | panic("cannot allocate irqstack for cpu %d", | ||
892 | cpu); | ||
893 | pda->irqstackptr += IRQSTACKSIZE - 64; | ||
894 | } | ||
895 | |||
896 | if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) | ||
897 | pda->nodenumber = cpu_to_node(cpu); | ||
898 | } | ||
899 | } | ||
900 | |||
901 | char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + | ||
902 | DEBUG_STKSZ] __page_aligned_bss; | ||
623 | 903 | ||
624 | for (cvdev = __x86cpuvendor_start ; | 904 | extern asmlinkage void ignore_sysret(void); |
625 | cvdev < __x86cpuvendor_end ; | ||
626 | cvdev++) | ||
627 | cpu_devs[cvdev->vendor] = cvdev->cpu_dev; | ||
628 | 905 | ||
629 | early_cpu_detect(); | 906 | /* May not be marked __init: used by software suspend */ |
630 | validate_pat_support(&boot_cpu_data); | 907 | void syscall_init(void) |
908 | { | ||
909 | /* | ||
910 | * LSTAR and STAR live in a bit strange symbiosis. | ||
911 | * They both write to the same internal register. STAR allows to | ||
912 | * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. | ||
913 | */ | ||
914 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
915 | wrmsrl(MSR_LSTAR, system_call); | ||
916 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
917 | |||
918 | #ifdef CONFIG_IA32_EMULATION | ||
919 | syscall32_cpu_init(); | ||
920 | #endif | ||
921 | |||
922 | /* Flags to clear on syscall */ | ||
923 | wrmsrl(MSR_SYSCALL_MASK, | ||
924 | X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); | ||
631 | } | 925 | } |
632 | 926 | ||
927 | unsigned long kernel_eflags; | ||
928 | |||
929 | /* | ||
930 | * Copies of the original ist values from the tss are only accessed during | ||
931 | * debugging, no special alignment required. | ||
932 | */ | ||
933 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | ||
934 | |||
935 | #else | ||
936 | |||
633 | /* Make sure %fs is initialized properly in idle threads */ | 937 | /* Make sure %fs is initialized properly in idle threads */ |
634 | struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) | 938 | struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) |
635 | { | 939 | { |
@@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) | |||
637 | regs->fs = __KERNEL_PERCPU; | 941 | regs->fs = __KERNEL_PERCPU; |
638 | return regs; | 942 | return regs; |
639 | } | 943 | } |
640 | 944 | #endif | |
641 | /* Current gdt points %fs at the "master" per-cpu area: after this, | ||
642 | * it's on the real one. */ | ||
643 | void switch_to_new_gdt(void) | ||
644 | { | ||
645 | struct desc_ptr gdt_descr; | ||
646 | |||
647 | gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); | ||
648 | gdt_descr.size = GDT_SIZE - 1; | ||
649 | load_gdt(&gdt_descr); | ||
650 | asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); | ||
651 | } | ||
652 | 945 | ||
653 | /* | 946 | /* |
654 | * cpu_init() initializes state that is per-CPU. Some data is already | 947 | * cpu_init() initializes state that is per-CPU. Some data is already |
655 | * initialized (naturally) in the bootstrap process, such as the GDT | 948 | * initialized (naturally) in the bootstrap process, such as the GDT |
656 | * and IDT. We reload them nevertheless, this function acts as a | 949 | * and IDT. We reload them nevertheless, this function acts as a |
657 | * 'CPU state barrier', nothing should get across. | 950 | * 'CPU state barrier', nothing should get across. |
951 | * A lot of state is already set up in PDA init for 64 bit | ||
658 | */ | 952 | */ |
953 | #ifdef CONFIG_X86_64 | ||
954 | void __cpuinit cpu_init(void) | ||
955 | { | ||
956 | int cpu = stack_smp_processor_id(); | ||
957 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
958 | struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | ||
959 | unsigned long v; | ||
960 | char *estacks = NULL; | ||
961 | struct task_struct *me; | ||
962 | int i; | ||
963 | |||
964 | /* CPU 0 is initialised in head64.c */ | ||
965 | if (cpu != 0) | ||
966 | pda_init(cpu); | ||
967 | else | ||
968 | estacks = boot_exception_stacks; | ||
969 | |||
970 | me = current; | ||
971 | |||
972 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
973 | panic("CPU#%d already initialized!\n", cpu); | ||
974 | |||
975 | printk(KERN_INFO "Initializing CPU#%d\n", cpu); | ||
976 | |||
977 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
978 | |||
979 | /* | ||
980 | * Initialize the per-CPU GDT with the boot GDT, | ||
981 | * and set up the GDT descriptor: | ||
982 | */ | ||
983 | |||
984 | switch_to_new_gdt(); | ||
985 | load_idt((const struct desc_ptr *)&idt_descr); | ||
986 | |||
987 | memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | ||
988 | syscall_init(); | ||
989 | |||
990 | wrmsrl(MSR_FS_BASE, 0); | ||
991 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
992 | barrier(); | ||
993 | |||
994 | check_efer(); | ||
995 | if (cpu != 0 && x2apic) | ||
996 | enable_x2apic(); | ||
997 | |||
998 | /* | ||
999 | * set up and load the per-CPU TSS | ||
1000 | */ | ||
1001 | if (!orig_ist->ist[0]) { | ||
1002 | static const unsigned int order[N_EXCEPTION_STACKS] = { | ||
1003 | [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | ||
1004 | [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | ||
1005 | }; | ||
1006 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
1007 | if (cpu) { | ||
1008 | estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | ||
1009 | if (!estacks) | ||
1010 | panic("Cannot allocate exception " | ||
1011 | "stack %ld %d\n", v, cpu); | ||
1012 | } | ||
1013 | estacks += PAGE_SIZE << order[v]; | ||
1014 | orig_ist->ist[v] = t->x86_tss.ist[v] = | ||
1015 | (unsigned long)estacks; | ||
1016 | } | ||
1017 | } | ||
1018 | |||
1019 | t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
1020 | /* | ||
1021 | * <= is required because the CPU will access up to | ||
1022 | * 8 bits beyond the end of the IO permission bitmap. | ||
1023 | */ | ||
1024 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
1025 | t->io_bitmap[i] = ~0UL; | ||
1026 | |||
1027 | atomic_inc(&init_mm.mm_count); | ||
1028 | me->active_mm = &init_mm; | ||
1029 | if (me->mm) | ||
1030 | BUG(); | ||
1031 | enter_lazy_tlb(&init_mm, me); | ||
1032 | |||
1033 | load_sp0(t, &current->thread); | ||
1034 | set_tss_desc(cpu, t); | ||
1035 | load_TR_desc(); | ||
1036 | load_LDT(&init_mm.context); | ||
1037 | |||
1038 | #ifdef CONFIG_KGDB | ||
1039 | /* | ||
1040 | * If the kgdb is connected no debug regs should be altered. This | ||
1041 | * is only applicable when KGDB and a KGDB I/O module are built | ||
1042 | * into the kernel and you are using early debugging with | ||
1043 | * kgdbwait. KGDB will control the kernel HW breakpoint registers. | ||
1044 | */ | ||
1045 | if (kgdb_connected && arch_kgdb_ops.correct_hw_break) | ||
1046 | arch_kgdb_ops.correct_hw_break(); | ||
1047 | else { | ||
1048 | #endif | ||
1049 | /* | ||
1050 | * Clear all 6 debug registers: | ||
1051 | */ | ||
1052 | |||
1053 | set_debugreg(0UL, 0); | ||
1054 | set_debugreg(0UL, 1); | ||
1055 | set_debugreg(0UL, 2); | ||
1056 | set_debugreg(0UL, 3); | ||
1057 | set_debugreg(0UL, 6); | ||
1058 | set_debugreg(0UL, 7); | ||
1059 | #ifdef CONFIG_KGDB | ||
1060 | /* If the kgdb is connected no debug regs should be altered. */ | ||
1061 | } | ||
1062 | #endif | ||
1063 | |||
1064 | fpu_init(); | ||
1065 | |||
1066 | raw_local_save_flags(kernel_eflags); | ||
1067 | |||
1068 | if (is_uv_system()) | ||
1069 | uv_cpu_init(); | ||
1070 | } | ||
1071 | |||
1072 | #else | ||
1073 | |||
659 | void __cpuinit cpu_init(void) | 1074 | void __cpuinit cpu_init(void) |
660 | { | 1075 | { |
661 | int cpu = smp_processor_id(); | 1076 | int cpu = smp_processor_id(); |
@@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void) | |||
709 | /* | 1124 | /* |
710 | * Force FPU initialization: | 1125 | * Force FPU initialization: |
711 | */ | 1126 | */ |
712 | current_thread_info()->status = 0; | 1127 | if (cpu_has_xsave) |
1128 | current_thread_info()->status = TS_XSAVE; | ||
1129 | else | ||
1130 | current_thread_info()->status = 0; | ||
713 | clear_used_math(); | 1131 | clear_used_math(); |
714 | mxcsr_feature_mask_init(); | 1132 | mxcsr_feature_mask_init(); |
715 | } | ||
716 | 1133 | ||
717 | #ifdef CONFIG_HOTPLUG_CPU | 1134 | /* |
718 | void __cpuinit cpu_uninit(void) | 1135 | * Boot processor to setup the FP and extended state context info. |
719 | { | 1136 | */ |
720 | int cpu = raw_smp_processor_id(); | 1137 | if (!smp_processor_id()) |
721 | cpu_clear(cpu, cpu_initialized); | 1138 | init_thread_xstate(); |
722 | 1139 | ||
723 | /* lazy TLB state */ | 1140 | xsave_init(); |
724 | per_cpu(cpu_tlbstate, cpu).state = 0; | ||
725 | per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; | ||
726 | } | 1141 | } |
1142 | |||
1143 | |||
727 | #endif | 1144 | #endif |
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c deleted file mode 100644 index dd6e3f15017e..000000000000 --- a/arch/x86/kernel/cpu/common_64.c +++ /dev/null | |||
@@ -1,670 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/kernel.h> | ||
3 | #include <linux/sched.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/bootmem.h> | ||
6 | #include <linux/bitops.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/kgdb.h> | ||
9 | #include <linux/topology.h> | ||
10 | #include <linux/delay.h> | ||
11 | #include <linux/smp.h> | ||
12 | #include <linux/percpu.h> | ||
13 | #include <asm/i387.h> | ||
14 | #include <asm/msr.h> | ||
15 | #include <asm/io.h> | ||
16 | #include <asm/linkage.h> | ||
17 | #include <asm/mmu_context.h> | ||
18 | #include <asm/mtrr.h> | ||
19 | #include <asm/mce.h> | ||
20 | #include <asm/pat.h> | ||
21 | #include <asm/numa.h> | ||
22 | #ifdef CONFIG_X86_LOCAL_APIC | ||
23 | #include <asm/mpspec.h> | ||
24 | #include <asm/apic.h> | ||
25 | #include <mach_apic.h> | ||
26 | #endif | ||
27 | #include <asm/pda.h> | ||
28 | #include <asm/pgtable.h> | ||
29 | #include <asm/processor.h> | ||
30 | #include <asm/desc.h> | ||
31 | #include <asm/atomic.h> | ||
32 | #include <asm/proto.h> | ||
33 | #include <asm/sections.h> | ||
34 | #include <asm/setup.h> | ||
35 | #include <asm/genapic.h> | ||
36 | |||
37 | #include "cpu.h" | ||
38 | |||
39 | /* We need valid kernel segments for data and code in long mode too | ||
40 | * IRET will check the segment types kkeil 2000/10/28 | ||
41 | * Also sysret mandates a special GDT layout | ||
42 | */ | ||
43 | /* The TLS descriptors are currently at a different place compared to i386. | ||
44 | Hopefully nobody expects them at a fixed place (Wine?) */ | ||
45 | DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { | ||
46 | [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, | ||
47 | [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, | ||
48 | [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, | ||
49 | [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, | ||
50 | [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, | ||
51 | [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, | ||
52 | } }; | ||
53 | EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); | ||
54 | |||
55 | __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; | ||
56 | |||
57 | /* Current gdt points %fs at the "master" per-cpu area: after this, | ||
58 | * it's on the real one. */ | ||
59 | void switch_to_new_gdt(void) | ||
60 | { | ||
61 | struct desc_ptr gdt_descr; | ||
62 | |||
63 | gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); | ||
64 | gdt_descr.size = GDT_SIZE - 1; | ||
65 | load_gdt(&gdt_descr); | ||
66 | } | ||
67 | |||
68 | struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; | ||
69 | |||
70 | static void __cpuinit default_init(struct cpuinfo_x86 *c) | ||
71 | { | ||
72 | display_cacheinfo(c); | ||
73 | } | ||
74 | |||
75 | static struct cpu_dev __cpuinitdata default_cpu = { | ||
76 | .c_init = default_init, | ||
77 | .c_vendor = "Unknown", | ||
78 | }; | ||
79 | static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; | ||
80 | |||
81 | int __cpuinit get_model_name(struct cpuinfo_x86 *c) | ||
82 | { | ||
83 | unsigned int *v; | ||
84 | |||
85 | if (c->extended_cpuid_level < 0x80000004) | ||
86 | return 0; | ||
87 | |||
88 | v = (unsigned int *) c->x86_model_id; | ||
89 | cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); | ||
90 | cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); | ||
91 | cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); | ||
92 | c->x86_model_id[48] = 0; | ||
93 | return 1; | ||
94 | } | ||
95 | |||
96 | |||
97 | void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) | ||
98 | { | ||
99 | unsigned int n, dummy, ebx, ecx, edx; | ||
100 | |||
101 | n = c->extended_cpuid_level; | ||
102 | |||
103 | if (n >= 0x80000005) { | ||
104 | cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); | ||
105 | printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " | ||
106 | "D cache %dK (%d bytes/line)\n", | ||
107 | edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); | ||
108 | c->x86_cache_size = (ecx>>24) + (edx>>24); | ||
109 | /* On K8 L1 TLB is inclusive, so don't count it */ | ||
110 | c->x86_tlbsize = 0; | ||
111 | } | ||
112 | |||
113 | if (n >= 0x80000006) { | ||
114 | cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); | ||
115 | ecx = cpuid_ecx(0x80000006); | ||
116 | c->x86_cache_size = ecx >> 16; | ||
117 | c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); | ||
118 | |||
119 | printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", | ||
120 | c->x86_cache_size, ecx & 0xFF); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | void __cpuinit detect_ht(struct cpuinfo_x86 *c) | ||
125 | { | ||
126 | #ifdef CONFIG_SMP | ||
127 | u32 eax, ebx, ecx, edx; | ||
128 | int index_msb, core_bits; | ||
129 | |||
130 | cpuid(1, &eax, &ebx, &ecx, &edx); | ||
131 | |||
132 | |||
133 | if (!cpu_has(c, X86_FEATURE_HT)) | ||
134 | return; | ||
135 | if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) | ||
136 | goto out; | ||
137 | |||
138 | smp_num_siblings = (ebx & 0xff0000) >> 16; | ||
139 | |||
140 | if (smp_num_siblings == 1) { | ||
141 | printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); | ||
142 | } else if (smp_num_siblings > 1) { | ||
143 | |||
144 | if (smp_num_siblings > NR_CPUS) { | ||
145 | printk(KERN_WARNING "CPU: Unsupported number of " | ||
146 | "siblings %d", smp_num_siblings); | ||
147 | smp_num_siblings = 1; | ||
148 | return; | ||
149 | } | ||
150 | |||
151 | index_msb = get_count_order(smp_num_siblings); | ||
152 | c->phys_proc_id = phys_pkg_id(index_msb); | ||
153 | |||
154 | smp_num_siblings = smp_num_siblings / c->x86_max_cores; | ||
155 | |||
156 | index_msb = get_count_order(smp_num_siblings); | ||
157 | |||
158 | core_bits = get_count_order(c->x86_max_cores); | ||
159 | |||
160 | c->cpu_core_id = phys_pkg_id(index_msb) & | ||
161 | ((1 << core_bits) - 1); | ||
162 | } | ||
163 | out: | ||
164 | if ((c->x86_max_cores * smp_num_siblings) > 1) { | ||
165 | printk(KERN_INFO "CPU: Physical Processor ID: %d\n", | ||
166 | c->phys_proc_id); | ||
167 | printk(KERN_INFO "CPU: Processor Core ID: %d\n", | ||
168 | c->cpu_core_id); | ||
169 | } | ||
170 | |||
171 | #endif | ||
172 | } | ||
173 | |||
174 | static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) | ||
175 | { | ||
176 | char *v = c->x86_vendor_id; | ||
177 | int i; | ||
178 | static int printed; | ||
179 | |||
180 | for (i = 0; i < X86_VENDOR_NUM; i++) { | ||
181 | if (cpu_devs[i]) { | ||
182 | if (!strcmp(v, cpu_devs[i]->c_ident[0]) || | ||
183 | (cpu_devs[i]->c_ident[1] && | ||
184 | !strcmp(v, cpu_devs[i]->c_ident[1]))) { | ||
185 | c->x86_vendor = i; | ||
186 | this_cpu = cpu_devs[i]; | ||
187 | return; | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | if (!printed) { | ||
192 | printed++; | ||
193 | printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); | ||
194 | printk(KERN_ERR "CPU: Your system may be unstable.\n"); | ||
195 | } | ||
196 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
197 | } | ||
198 | |||
199 | static void __init early_cpu_support_print(void) | ||
200 | { | ||
201 | int i,j; | ||
202 | struct cpu_dev *cpu_devx; | ||
203 | |||
204 | printk("KERNEL supported cpus:\n"); | ||
205 | for (i = 0; i < X86_VENDOR_NUM; i++) { | ||
206 | cpu_devx = cpu_devs[i]; | ||
207 | if (!cpu_devx) | ||
208 | continue; | ||
209 | for (j = 0; j < 2; j++) { | ||
210 | if (!cpu_devx->c_ident[j]) | ||
211 | continue; | ||
212 | printk(" %s %s\n", cpu_devx->c_vendor, | ||
213 | cpu_devx->c_ident[j]); | ||
214 | } | ||
215 | } | ||
216 | } | ||
217 | |||
218 | static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); | ||
219 | |||
220 | void __init early_cpu_init(void) | ||
221 | { | ||
222 | struct cpu_vendor_dev *cvdev; | ||
223 | |||
224 | for (cvdev = __x86cpuvendor_start ; | ||
225 | cvdev < __x86cpuvendor_end ; | ||
226 | cvdev++) | ||
227 | cpu_devs[cvdev->vendor] = cvdev->cpu_dev; | ||
228 | early_cpu_support_print(); | ||
229 | early_identify_cpu(&boot_cpu_data); | ||
230 | } | ||
231 | |||
232 | /* Do some early cpuid on the boot CPU to get some parameter that are | ||
233 | needed before check_bugs. Everything advanced is in identify_cpu | ||
234 | below. */ | ||
235 | static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) | ||
236 | { | ||
237 | u32 tfms, xlvl; | ||
238 | |||
239 | c->loops_per_jiffy = loops_per_jiffy; | ||
240 | c->x86_cache_size = -1; | ||
241 | c->x86_vendor = X86_VENDOR_UNKNOWN; | ||
242 | c->x86_model = c->x86_mask = 0; /* So far unknown... */ | ||
243 | c->x86_vendor_id[0] = '\0'; /* Unset */ | ||
244 | c->x86_model_id[0] = '\0'; /* Unset */ | ||
245 | c->x86_clflush_size = 64; | ||
246 | c->x86_cache_alignment = c->x86_clflush_size; | ||
247 | c->x86_max_cores = 1; | ||
248 | c->x86_coreid_bits = 0; | ||
249 | c->extended_cpuid_level = 0; | ||
250 | memset(&c->x86_capability, 0, sizeof c->x86_capability); | ||
251 | |||
252 | /* Get vendor name */ | ||
253 | cpuid(0x00000000, (unsigned int *)&c->cpuid_level, | ||
254 | (unsigned int *)&c->x86_vendor_id[0], | ||
255 | (unsigned int *)&c->x86_vendor_id[8], | ||
256 | (unsigned int *)&c->x86_vendor_id[4]); | ||
257 | |||
258 | get_cpu_vendor(c); | ||
259 | |||
260 | /* Initialize the standard set of capabilities */ | ||
261 | /* Note that the vendor-specific code below might override */ | ||
262 | |||
263 | /* Intel-defined flags: level 0x00000001 */ | ||
264 | if (c->cpuid_level >= 0x00000001) { | ||
265 | __u32 misc; | ||
266 | cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], | ||
267 | &c->x86_capability[0]); | ||
268 | c->x86 = (tfms >> 8) & 0xf; | ||
269 | c->x86_model = (tfms >> 4) & 0xf; | ||
270 | c->x86_mask = tfms & 0xf; | ||
271 | if (c->x86 == 0xf) | ||
272 | c->x86 += (tfms >> 20) & 0xff; | ||
273 | if (c->x86 >= 0x6) | ||
274 | c->x86_model += ((tfms >> 16) & 0xF) << 4; | ||
275 | if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) | ||
276 | c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; | ||
277 | } else { | ||
278 | /* Have CPUID level 0 only - unheard of */ | ||
279 | c->x86 = 4; | ||
280 | } | ||
281 | |||
282 | c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; | ||
283 | #ifdef CONFIG_SMP | ||
284 | c->phys_proc_id = c->initial_apicid; | ||
285 | #endif | ||
286 | /* AMD-defined flags: level 0x80000001 */ | ||
287 | xlvl = cpuid_eax(0x80000000); | ||
288 | c->extended_cpuid_level = xlvl; | ||
289 | if ((xlvl & 0xffff0000) == 0x80000000) { | ||
290 | if (xlvl >= 0x80000001) { | ||
291 | c->x86_capability[1] = cpuid_edx(0x80000001); | ||
292 | c->x86_capability[6] = cpuid_ecx(0x80000001); | ||
293 | } | ||
294 | if (xlvl >= 0x80000004) | ||
295 | get_model_name(c); /* Default name */ | ||
296 | } | ||
297 | |||
298 | /* Transmeta-defined flags: level 0x80860001 */ | ||
299 | xlvl = cpuid_eax(0x80860000); | ||
300 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
301 | /* Don't set x86_cpuid_level here for now to not confuse. */ | ||
302 | if (xlvl >= 0x80860001) | ||
303 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
304 | } | ||
305 | |||
306 | if (c->extended_cpuid_level >= 0x80000007) | ||
307 | c->x86_power = cpuid_edx(0x80000007); | ||
308 | |||
309 | if (c->extended_cpuid_level >= 0x80000008) { | ||
310 | u32 eax = cpuid_eax(0x80000008); | ||
311 | |||
312 | c->x86_virt_bits = (eax >> 8) & 0xff; | ||
313 | c->x86_phys_bits = eax & 0xff; | ||
314 | } | ||
315 | |||
316 | if (c->x86_vendor != X86_VENDOR_UNKNOWN && | ||
317 | cpu_devs[c->x86_vendor]->c_early_init) | ||
318 | cpu_devs[c->x86_vendor]->c_early_init(c); | ||
319 | |||
320 | validate_pat_support(c); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * This does the hard work of actually picking apart the CPU stuff... | ||
325 | */ | ||
326 | static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | ||
327 | { | ||
328 | int i; | ||
329 | |||
330 | early_identify_cpu(c); | ||
331 | |||
332 | init_scattered_cpuid_features(c); | ||
333 | |||
334 | c->apicid = phys_pkg_id(0); | ||
335 | |||
336 | /* | ||
337 | * Vendor-specific initialization. In this section we | ||
338 | * canonicalize the feature flags, meaning if there are | ||
339 | * features a certain CPU supports which CPUID doesn't | ||
340 | * tell us, CPUID claiming incorrect flags, or other bugs, | ||
341 | * we handle them here. | ||
342 | * | ||
343 | * At the end of this section, c->x86_capability better | ||
344 | * indicate the features this CPU genuinely supports! | ||
345 | */ | ||
346 | if (this_cpu->c_init) | ||
347 | this_cpu->c_init(c); | ||
348 | |||
349 | detect_ht(c); | ||
350 | |||
351 | /* | ||
352 | * On SMP, boot_cpu_data holds the common feature set between | ||
353 | * all CPUs; so make sure that we indicate which features are | ||
354 | * common between the CPUs. The first time this routine gets | ||
355 | * executed, c == &boot_cpu_data. | ||
356 | */ | ||
357 | if (c != &boot_cpu_data) { | ||
358 | /* AND the already accumulated flags with these */ | ||
359 | for (i = 0; i < NCAPINTS; i++) | ||
360 | boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; | ||
361 | } | ||
362 | |||
363 | /* Clear all flags overriden by options */ | ||
364 | for (i = 0; i < NCAPINTS; i++) | ||
365 | c->x86_capability[i] &= ~cleared_cpu_caps[i]; | ||
366 | |||
367 | #ifdef CONFIG_X86_MCE | ||
368 | mcheck_init(c); | ||
369 | #endif | ||
370 | select_idle_routine(c); | ||
371 | |||
372 | #ifdef CONFIG_NUMA | ||
373 | numa_add_cpu(smp_processor_id()); | ||
374 | #endif | ||
375 | |||
376 | } | ||
377 | |||
378 | void __cpuinit identify_boot_cpu(void) | ||
379 | { | ||
380 | identify_cpu(&boot_cpu_data); | ||
381 | } | ||
382 | |||
383 | void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) | ||
384 | { | ||
385 | BUG_ON(c == &boot_cpu_data); | ||
386 | identify_cpu(c); | ||
387 | mtrr_ap_init(); | ||
388 | } | ||
389 | |||
390 | static __init int setup_noclflush(char *arg) | ||
391 | { | ||
392 | setup_clear_cpu_cap(X86_FEATURE_CLFLSH); | ||
393 | return 1; | ||
394 | } | ||
395 | __setup("noclflush", setup_noclflush); | ||
396 | |||
397 | void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | ||
398 | { | ||
399 | if (c->x86_model_id[0]) | ||
400 | printk(KERN_CONT "%s", c->x86_model_id); | ||
401 | |||
402 | if (c->x86_mask || c->cpuid_level >= 0) | ||
403 | printk(KERN_CONT " stepping %02x\n", c->x86_mask); | ||
404 | else | ||
405 | printk(KERN_CONT "\n"); | ||
406 | } | ||
407 | |||
408 | static __init int setup_disablecpuid(char *arg) | ||
409 | { | ||
410 | int bit; | ||
411 | if (get_option(&arg, &bit) && bit < NCAPINTS*32) | ||
412 | setup_clear_cpu_cap(bit); | ||
413 | else | ||
414 | return 0; | ||
415 | return 1; | ||
416 | } | ||
417 | __setup("clearcpuid=", setup_disablecpuid); | ||
418 | |||
419 | cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; | ||
420 | |||
421 | struct x8664_pda **_cpu_pda __read_mostly; | ||
422 | EXPORT_SYMBOL(_cpu_pda); | ||
423 | |||
424 | struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; | ||
425 | |||
426 | char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; | ||
427 | |||
428 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | ||
429 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | ||
430 | |||
431 | static int do_not_nx __cpuinitdata; | ||
432 | |||
433 | /* noexec=on|off | ||
434 | Control non executable mappings for 64bit processes. | ||
435 | |||
436 | on Enable(default) | ||
437 | off Disable | ||
438 | */ | ||
439 | static int __init nonx_setup(char *str) | ||
440 | { | ||
441 | if (!str) | ||
442 | return -EINVAL; | ||
443 | if (!strncmp(str, "on", 2)) { | ||
444 | __supported_pte_mask |= _PAGE_NX; | ||
445 | do_not_nx = 0; | ||
446 | } else if (!strncmp(str, "off", 3)) { | ||
447 | do_not_nx = 1; | ||
448 | __supported_pte_mask &= ~_PAGE_NX; | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
452 | early_param("noexec", nonx_setup); | ||
453 | |||
454 | int force_personality32; | ||
455 | |||
456 | /* noexec32=on|off | ||
457 | Control non executable heap for 32bit processes. | ||
458 | To control the stack too use noexec=off | ||
459 | |||
460 | on PROT_READ does not imply PROT_EXEC for 32bit processes (default) | ||
461 | off PROT_READ implies PROT_EXEC | ||
462 | */ | ||
463 | static int __init nonx32_setup(char *str) | ||
464 | { | ||
465 | if (!strcmp(str, "on")) | ||
466 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
467 | else if (!strcmp(str, "off")) | ||
468 | force_personality32 |= READ_IMPLIES_EXEC; | ||
469 | return 1; | ||
470 | } | ||
471 | __setup("noexec32=", nonx32_setup); | ||
472 | |||
473 | void pda_init(int cpu) | ||
474 | { | ||
475 | struct x8664_pda *pda = cpu_pda(cpu); | ||
476 | |||
477 | /* Setup up data that may be needed in __get_free_pages early */ | ||
478 | loadsegment(fs, 0); | ||
479 | loadsegment(gs, 0); | ||
480 | /* Memory clobbers used to order PDA accessed */ | ||
481 | mb(); | ||
482 | wrmsrl(MSR_GS_BASE, pda); | ||
483 | mb(); | ||
484 | |||
485 | pda->cpunumber = cpu; | ||
486 | pda->irqcount = -1; | ||
487 | pda->kernelstack = (unsigned long)stack_thread_info() - | ||
488 | PDA_STACKOFFSET + THREAD_SIZE; | ||
489 | pda->active_mm = &init_mm; | ||
490 | pda->mmu_state = 0; | ||
491 | |||
492 | if (cpu == 0) { | ||
493 | /* others are initialized in smpboot.c */ | ||
494 | pda->pcurrent = &init_task; | ||
495 | pda->irqstackptr = boot_cpu_stack; | ||
496 | } else { | ||
497 | pda->irqstackptr = (char *) | ||
498 | __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); | ||
499 | if (!pda->irqstackptr) | ||
500 | panic("cannot allocate irqstack for cpu %d", cpu); | ||
501 | |||
502 | if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) | ||
503 | pda->nodenumber = cpu_to_node(cpu); | ||
504 | } | ||
505 | |||
506 | pda->irqstackptr += IRQSTACKSIZE-64; | ||
507 | } | ||
508 | |||
509 | char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + | ||
510 | DEBUG_STKSZ] __page_aligned_bss; | ||
511 | |||
512 | extern asmlinkage void ignore_sysret(void); | ||
513 | |||
514 | /* May not be marked __init: used by software suspend */ | ||
515 | void syscall_init(void) | ||
516 | { | ||
517 | /* | ||
518 | * LSTAR and STAR live in a bit strange symbiosis. | ||
519 | * They both write to the same internal register. STAR allows to | ||
520 | * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. | ||
521 | */ | ||
522 | wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); | ||
523 | wrmsrl(MSR_LSTAR, system_call); | ||
524 | wrmsrl(MSR_CSTAR, ignore_sysret); | ||
525 | |||
526 | #ifdef CONFIG_IA32_EMULATION | ||
527 | syscall32_cpu_init(); | ||
528 | #endif | ||
529 | |||
530 | /* Flags to clear on syscall */ | ||
531 | wrmsrl(MSR_SYSCALL_MASK, | ||
532 | X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); | ||
533 | } | ||
534 | |||
535 | void __cpuinit check_efer(void) | ||
536 | { | ||
537 | unsigned long efer; | ||
538 | |||
539 | rdmsrl(MSR_EFER, efer); | ||
540 | if (!(efer & EFER_NX) || do_not_nx) | ||
541 | __supported_pte_mask &= ~_PAGE_NX; | ||
542 | } | ||
543 | |||
544 | unsigned long kernel_eflags; | ||
545 | |||
546 | /* | ||
547 | * Copies of the original ist values from the tss are only accessed during | ||
548 | * debugging, no special alignment required. | ||
549 | */ | ||
550 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | ||
551 | |||
552 | /* | ||
553 | * cpu_init() initializes state that is per-CPU. Some data is already | ||
554 | * initialized (naturally) in the bootstrap process, such as the GDT | ||
555 | * and IDT. We reload them nevertheless, this function acts as a | ||
556 | * 'CPU state barrier', nothing should get across. | ||
557 | * A lot of state is already set up in PDA init. | ||
558 | */ | ||
559 | void __cpuinit cpu_init(void) | ||
560 | { | ||
561 | int cpu = stack_smp_processor_id(); | ||
562 | struct tss_struct *t = &per_cpu(init_tss, cpu); | ||
563 | struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); | ||
564 | unsigned long v; | ||
565 | char *estacks = NULL; | ||
566 | struct task_struct *me; | ||
567 | int i; | ||
568 | |||
569 | /* CPU 0 is initialised in head64.c */ | ||
570 | if (cpu != 0) | ||
571 | pda_init(cpu); | ||
572 | else | ||
573 | estacks = boot_exception_stacks; | ||
574 | |||
575 | me = current; | ||
576 | |||
577 | if (cpu_test_and_set(cpu, cpu_initialized)) | ||
578 | panic("CPU#%d already initialized!\n", cpu); | ||
579 | |||
580 | printk(KERN_INFO "Initializing CPU#%d\n", cpu); | ||
581 | |||
582 | clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); | ||
583 | |||
584 | /* | ||
585 | * Initialize the per-CPU GDT with the boot GDT, | ||
586 | * and set up the GDT descriptor: | ||
587 | */ | ||
588 | |||
589 | switch_to_new_gdt(); | ||
590 | load_idt((const struct desc_ptr *)&idt_descr); | ||
591 | |||
592 | memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); | ||
593 | syscall_init(); | ||
594 | |||
595 | wrmsrl(MSR_FS_BASE, 0); | ||
596 | wrmsrl(MSR_KERNEL_GS_BASE, 0); | ||
597 | barrier(); | ||
598 | |||
599 | check_efer(); | ||
600 | |||
601 | /* | ||
602 | * set up and load the per-CPU TSS | ||
603 | */ | ||
604 | for (v = 0; v < N_EXCEPTION_STACKS; v++) { | ||
605 | static const unsigned int order[N_EXCEPTION_STACKS] = { | ||
606 | [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, | ||
607 | [DEBUG_STACK - 1] = DEBUG_STACK_ORDER | ||
608 | }; | ||
609 | if (cpu) { | ||
610 | estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); | ||
611 | if (!estacks) | ||
612 | panic("Cannot allocate exception stack %ld %d\n", | ||
613 | v, cpu); | ||
614 | } | ||
615 | estacks += PAGE_SIZE << order[v]; | ||
616 | orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; | ||
617 | } | ||
618 | |||
619 | t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); | ||
620 | /* | ||
621 | * <= is required because the CPU will access up to | ||
622 | * 8 bits beyond the end of the IO permission bitmap. | ||
623 | */ | ||
624 | for (i = 0; i <= IO_BITMAP_LONGS; i++) | ||
625 | t->io_bitmap[i] = ~0UL; | ||
626 | |||
627 | atomic_inc(&init_mm.mm_count); | ||
628 | me->active_mm = &init_mm; | ||
629 | if (me->mm) | ||
630 | BUG(); | ||
631 | enter_lazy_tlb(&init_mm, me); | ||
632 | |||
633 | load_sp0(t, ¤t->thread); | ||
634 | set_tss_desc(cpu, t); | ||
635 | load_TR_desc(); | ||
636 | load_LDT(&init_mm.context); | ||
637 | |||
638 | #ifdef CONFIG_KGDB | ||
639 | /* | ||
640 | * If the kgdb is connected no debug regs should be altered. This | ||
641 | * is only applicable when KGDB and a KGDB I/O module are built | ||
642 | * into the kernel and you are using early debugging with | ||
643 | * kgdbwait. KGDB will control the kernel HW breakpoint registers. | ||
644 | */ | ||
645 | if (kgdb_connected && arch_kgdb_ops.correct_hw_break) | ||
646 | arch_kgdb_ops.correct_hw_break(); | ||
647 | else { | ||
648 | #endif | ||
649 | /* | ||
650 | * Clear all 6 debug registers: | ||
651 | */ | ||
652 | |||
653 | set_debugreg(0UL, 0); | ||
654 | set_debugreg(0UL, 1); | ||
655 | set_debugreg(0UL, 2); | ||
656 | set_debugreg(0UL, 3); | ||
657 | set_debugreg(0UL, 6); | ||
658 | set_debugreg(0UL, 7); | ||
659 | #ifdef CONFIG_KGDB | ||
660 | /* If the kgdb is connected no debug regs should be altered. */ | ||
661 | } | ||
662 | #endif | ||
663 | |||
664 | fpu_init(); | ||
665 | |||
666 | raw_local_save_flags(kernel_eflags); | ||
667 | |||
668 | if (is_uv_system()) | ||
669 | uv_cpu_init(); | ||
670 | } | ||
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4d894e8565fe..de4094a39210 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h | |||
@@ -21,23 +21,16 @@ struct cpu_dev { | |||
21 | void (*c_init)(struct cpuinfo_x86 * c); | 21 | void (*c_init)(struct cpuinfo_x86 * c); |
22 | void (*c_identify)(struct cpuinfo_x86 * c); | 22 | void (*c_identify)(struct cpuinfo_x86 * c); |
23 | unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); | 23 | unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); |
24 | int c_x86_vendor; | ||
24 | }; | 25 | }; |
25 | 26 | ||
26 | extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; | 27 | #define cpu_dev_register(cpu_devX) \ |
28 | static struct cpu_dev *__cpu_dev_##cpu_devX __used \ | ||
29 | __attribute__((__section__(".x86_cpu_dev.init"))) = \ | ||
30 | &cpu_devX; | ||
27 | 31 | ||
28 | struct cpu_vendor_dev { | 32 | extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; |
29 | int vendor; | ||
30 | struct cpu_dev *cpu_dev; | ||
31 | }; | ||
32 | |||
33 | #define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \ | ||
34 | static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \ | ||
35 | __attribute__((__section__(".x86cpuvendor.init"))) = \ | ||
36 | { cpu_vendor_id, cpu_dev } | ||
37 | |||
38 | extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; | ||
39 | 33 | ||
40 | extern int get_model_name(struct cpuinfo_x86 *c); | ||
41 | extern void display_cacheinfo(struct cpuinfo_x86 *c); | 34 | extern void display_cacheinfo(struct cpuinfo_x86 *c); |
42 | 35 | ||
43 | #endif | 36 | #endif |
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b835839..c24c4a487b7c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | |||
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask) | |||
256 | * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and | 256 | * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and |
257 | * no meaning should be associated with absolute values of these MSRs. | 257 | * no meaning should be associated with absolute values of these MSRs. |
258 | */ | 258 | */ |
259 | static unsigned int get_measured_perf(unsigned int cpu) | 259 | static unsigned int get_measured_perf(struct cpufreq_policy *policy, |
260 | unsigned int cpu) | ||
260 | { | 261 | { |
261 | union { | 262 | union { |
262 | struct { | 263 | struct { |
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu) | |||
326 | 327 | ||
327 | #endif | 328 | #endif |
328 | 329 | ||
329 | retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; | 330 | retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; |
330 | 331 | ||
331 | put_cpu(); | 332 | put_cpu(); |
332 | set_cpus_allowed_ptr(current, &saved_mask); | 333 | set_cpus_allowed_ptr(current, &saved_mask); |
@@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void) | |||
785 | if (ret) | 786 | if (ret) |
786 | return ret; | 787 | return ret; |
787 | 788 | ||
788 | return cpufreq_register_driver(&acpi_cpufreq_driver); | 789 | ret = cpufreq_register_driver(&acpi_cpufreq_driver); |
790 | if (ret) | ||
791 | free_percpu(acpi_perf_data); | ||
792 | |||
793 | return ret; | ||
789 | } | 794 | } |
790 | 795 | ||
791 | static void __exit acpi_cpufreq_exit(void) | 796 | static void __exit acpi_cpufreq_exit(void) |
@@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void) | |||
795 | cpufreq_unregister_driver(&acpi_cpufreq_driver); | 800 | cpufreq_unregister_driver(&acpi_cpufreq_driver); |
796 | 801 | ||
797 | free_percpu(acpi_perf_data); | 802 | free_percpu(acpi_perf_data); |
798 | |||
799 | return; | ||
800 | } | 803 | } |
801 | 804 | ||
802 | module_param(acpi_pstate_strict, uint, 0644); | 805 | module_param(acpi_pstate_strict, uint, 0644); |
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index e4a4bf870e94..fe613c93b366 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c | |||
@@ -25,8 +25,8 @@ | |||
25 | #include <linux/cpufreq.h> | 25 | #include <linux/cpufreq.h> |
26 | 26 | ||
27 | #include <asm/msr.h> | 27 | #include <asm/msr.h> |
28 | #include <asm/timex.h> | 28 | #include <linux/timex.h> |
29 | #include <asm/io.h> | 29 | #include <linux/io.h> |
30 | 30 | ||
31 | #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ | 31 | #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ |
32 | #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ | 32 | #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ |
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) | |||
82 | u8 clockspeed_reg; /* Clock Speed Register */ | 82 | u8 clockspeed_reg; /* Clock Speed Register */ |
83 | 83 | ||
84 | local_irq_disable(); | 84 | local_irq_disable(); |
85 | outb_p(0x80,REG_CSCIR); | 85 | outb_p(0x80, REG_CSCIR); |
86 | clockspeed_reg = inb_p(REG_CSCDR); | 86 | clockspeed_reg = inb_p(REG_CSCDR); |
87 | local_irq_enable(); | 87 | local_irq_enable(); |
88 | 88 | ||
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) | |||
98 | } | 98 | } |
99 | 99 | ||
100 | /* 33 MHz is not 32 MHz... */ | 100 | /* 33 MHz is not 32 MHz... */ |
101 | if ((clockspeed_reg & 0xE0)==0xA0) | 101 | if ((clockspeed_reg & 0xE0) == 0xA0) |
102 | return 33000; | 102 | return 33000; |
103 | 103 | ||
104 | return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); | 104 | return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; |
105 | } | 105 | } |
106 | 106 | ||
107 | 107 | ||
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) | |||
117 | * There is no return value. | 117 | * There is no return value. |
118 | */ | 118 | */ |
119 | 119 | ||
120 | static void elanfreq_set_cpu_state (unsigned int state) | 120 | static void elanfreq_set_cpu_state(unsigned int state) |
121 | { | 121 | { |
122 | struct cpufreq_freqs freqs; | 122 | struct cpufreq_freqs freqs; |
123 | 123 | ||
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state) | |||
144 | */ | 144 | */ |
145 | 145 | ||
146 | local_irq_disable(); | 146 | local_irq_disable(); |
147 | outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ | 147 | outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ |
148 | outb_p(0x00,REG_CSCDR); | 148 | outb_p(0x00, REG_CSCDR); |
149 | local_irq_enable(); /* wait till internal pipelines and */ | 149 | local_irq_enable(); /* wait till internal pipelines and */ |
150 | udelay(1000); /* buffers have cleaned up */ | 150 | udelay(1000); /* buffers have cleaned up */ |
151 | 151 | ||
152 | local_irq_disable(); | 152 | local_irq_disable(); |
153 | 153 | ||
154 | /* now, set the CPU clock speed register (0x80) */ | 154 | /* now, set the CPU clock speed register (0x80) */ |
155 | outb_p(0x80,REG_CSCIR); | 155 | outb_p(0x80, REG_CSCIR); |
156 | outb_p(elan_multiplier[state].val80h,REG_CSCDR); | 156 | outb_p(elan_multiplier[state].val80h, REG_CSCDR); |
157 | 157 | ||
158 | /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ | 158 | /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ |
159 | outb_p(0x40,REG_CSCIR); | 159 | outb_p(0x40, REG_CSCIR); |
160 | outb_p(elan_multiplier[state].val40h,REG_CSCDR); | 160 | outb_p(elan_multiplier[state].val40h, REG_CSCDR); |
161 | udelay(10000); | 161 | udelay(10000); |
162 | local_irq_enable(); | 162 | local_irq_enable(); |
163 | 163 | ||
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state) | |||
173 | * for the hardware supported by the driver. | 173 | * for the hardware supported by the driver. |
174 | */ | 174 | */ |
175 | 175 | ||
176 | static int elanfreq_verify (struct cpufreq_policy *policy) | 176 | static int elanfreq_verify(struct cpufreq_policy *policy) |
177 | { | 177 | { |
178 | return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); | 178 | return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); |
179 | } | 179 | } |
180 | 180 | ||
181 | static int elanfreq_target (struct cpufreq_policy *policy, | 181 | static int elanfreq_target(struct cpufreq_policy *policy, |
182 | unsigned int target_freq, | 182 | unsigned int target_freq, |
183 | unsigned int relation) | 183 | unsigned int relation) |
184 | { | 184 | { |
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) | |||
205 | 205 | ||
206 | /* capability check */ | 206 | /* capability check */ |
207 | if ((c->x86_vendor != X86_VENDOR_AMD) || | 207 | if ((c->x86_vendor != X86_VENDOR_AMD) || |
208 | (c->x86 != 4) || (c->x86_model!=10)) | 208 | (c->x86 != 4) || (c->x86_model != 10)) |
209 | return -ENODEV; | 209 | return -ENODEV; |
210 | 210 | ||
211 | /* max freq */ | 211 | /* max freq */ |
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) | |||
213 | max_freq = elanfreq_get_cpu_frequency(0); | 213 | max_freq = elanfreq_get_cpu_frequency(0); |
214 | 214 | ||
215 | /* table init */ | 215 | /* table init */ |
216 | for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { | 216 | for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { |
217 | if (elanfreq_table[i].frequency > max_freq) | 217 | if (elanfreq_table[i].frequency > max_freq) |
218 | elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; | 218 | elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; |
219 | } | 219 | } |
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) | |||
224 | 224 | ||
225 | result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); | 225 | result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); |
226 | if (result) | 226 | if (result) |
227 | return (result); | 227 | return result; |
228 | 228 | ||
229 | cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); | 229 | cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); |
230 | return 0; | 230 | return 0; |
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup); | |||
260 | #endif | 260 | #endif |
261 | 261 | ||
262 | 262 | ||
263 | static struct freq_attr* elanfreq_attr[] = { | 263 | static struct freq_attr *elanfreq_attr[] = { |
264 | &cpufreq_freq_attr_scaling_available_freqs, | 264 | &cpufreq_freq_attr_scaling_available_freqs, |
265 | NULL, | 265 | NULL, |
266 | }; | 266 | }; |
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void) | |||
284 | 284 | ||
285 | /* Test if we have the right hardware */ | 285 | /* Test if we have the right hardware */ |
286 | if ((c->x86_vendor != X86_VENDOR_AMD) || | 286 | if ((c->x86_vendor != X86_VENDOR_AMD) || |
287 | (c->x86 != 4) || (c->x86_model!=10)) { | 287 | (c->x86 != 4) || (c->x86_model != 10)) { |
288 | printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); | 288 | printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); |
289 | return -ENODEV; | 289 | return -ENODEV; |
290 | } | 290 | } |
291 | return cpufreq_register_driver(&elanfreq_driver); | 291 | return cpufreq_register_driver(&elanfreq_driver); |
292 | } | 292 | } |
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void) | |||
298 | } | 298 | } |
299 | 299 | ||
300 | 300 | ||
301 | module_param (max_freq, int, 0444); | 301 | module_param(max_freq, int, 0444); |
302 | 302 | ||
303 | MODULE_LICENSE("GPL"); | 303 | MODULE_LICENSE("GPL"); |
304 | MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); | 304 | MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); |
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index f1685fb91fbd..b8e05ee4f736 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | |||
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) | |||
171 | } | 171 | } |
172 | 172 | ||
173 | if (c->x86 != 0xF) { | 173 | if (c->x86 != 0xF) { |
174 | printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); | 174 | printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n"); |
175 | return 0; | 175 | return 0; |
176 | } | 176 | } |
177 | 177 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index eb9b62b0830c..b5ced806a316 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c | |||
@@ -15,12 +15,11 @@ | |||
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | 16 | ||
17 | #include <asm/msr.h> | 17 | #include <asm/msr.h> |
18 | #include <asm/timex.h> | 18 | #include <linux/timex.h> |
19 | #include <asm/io.h> | 19 | #include <linux/io.h> |
20 | 20 | ||
21 | 21 | #define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long | |
22 | #define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long | 22 | as it is unused */ |
23 | as it is unused */ | ||
24 | 23 | ||
25 | static unsigned int busfreq; /* FSB, in 10 kHz */ | 24 | static unsigned int busfreq; /* FSB, in 10 kHz */ |
26 | static unsigned int max_multiplier; | 25 | static unsigned int max_multiplier; |
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void) | |||
53 | 52 | ||
54 | msrval = POWERNOW_IOPORT + 0x1; | 53 | msrval = POWERNOW_IOPORT + 0x1; |
55 | wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ | 54 | wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ |
56 | invalue=inl(POWERNOW_IOPORT + 0x8); | 55 | invalue = inl(POWERNOW_IOPORT + 0x8); |
57 | msrval = POWERNOW_IOPORT + 0x0; | 56 | msrval = POWERNOW_IOPORT + 0x0; |
58 | wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ | 57 | wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ |
59 | 58 | ||
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void) | |||
67 | * | 66 | * |
68 | * Tries to change the PowerNow! multiplier | 67 | * Tries to change the PowerNow! multiplier |
69 | */ | 68 | */ |
70 | static void powernow_k6_set_state (unsigned int best_i) | 69 | static void powernow_k6_set_state(unsigned int best_i) |
71 | { | 70 | { |
72 | unsigned long outvalue=0, invalue=0; | 71 | unsigned long outvalue = 0, invalue = 0; |
73 | unsigned long msrval; | 72 | unsigned long msrval; |
74 | struct cpufreq_freqs freqs; | 73 | struct cpufreq_freqs freqs; |
75 | 74 | ||
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i) | |||
90 | 89 | ||
91 | msrval = POWERNOW_IOPORT + 0x1; | 90 | msrval = POWERNOW_IOPORT + 0x1; |
92 | wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ | 91 | wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ |
93 | invalue=inl(POWERNOW_IOPORT + 0x8); | 92 | invalue = inl(POWERNOW_IOPORT + 0x8); |
94 | invalue = invalue & 0xf; | 93 | invalue = invalue & 0xf; |
95 | outvalue = outvalue | invalue; | 94 | outvalue = outvalue | invalue; |
96 | outl(outvalue ,(POWERNOW_IOPORT + 0x8)); | 95 | outl(outvalue , (POWERNOW_IOPORT + 0x8)); |
97 | msrval = POWERNOW_IOPORT + 0x0; | 96 | msrval = POWERNOW_IOPORT + 0x0; |
98 | wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ | 97 | wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ |
99 | 98 | ||
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy) | |||
124 | * | 123 | * |
125 | * sets a new CPUFreq policy | 124 | * sets a new CPUFreq policy |
126 | */ | 125 | */ |
127 | static int powernow_k6_target (struct cpufreq_policy *policy, | 126 | static int powernow_k6_target(struct cpufreq_policy *policy, |
128 | unsigned int target_freq, | 127 | unsigned int target_freq, |
129 | unsigned int relation) | 128 | unsigned int relation) |
130 | { | 129 | { |
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) | |||
152 | busfreq = cpu_khz / max_multiplier; | 151 | busfreq = cpu_khz / max_multiplier; |
153 | 152 | ||
154 | /* table init */ | 153 | /* table init */ |
155 | for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { | 154 | for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { |
156 | if (clock_ratio[i].index > max_multiplier) | 155 | if (clock_ratio[i].index > max_multiplier) |
157 | clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; | 156 | clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; |
158 | else | 157 | else |
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) | |||
165 | 164 | ||
166 | result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); | 165 | result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); |
167 | if (result) | 166 | if (result) |
168 | return (result); | 167 | return result; |
169 | 168 | ||
170 | cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); | 169 | cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); |
171 | 170 | ||
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) | |||
176 | static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) | 175 | static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) |
177 | { | 176 | { |
178 | unsigned int i; | 177 | unsigned int i; |
179 | for (i=0; i<8; i++) { | 178 | for (i = 0; i < 8; i++) { |
180 | if (i==max_multiplier) | 179 | if (i == max_multiplier) |
181 | powernow_k6_set_state(i); | 180 | powernow_k6_set_state(i); |
182 | } | 181 | } |
183 | cpufreq_frequency_table_put_attr(policy->cpu); | 182 | cpufreq_frequency_table_put_attr(policy->cpu); |
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu) | |||
189 | return busfreq * powernow_k6_get_cpu_multiplier(); | 188 | return busfreq * powernow_k6_get_cpu_multiplier(); |
190 | } | 189 | } |
191 | 190 | ||
192 | static struct freq_attr* powernow_k6_attr[] = { | 191 | static struct freq_attr *powernow_k6_attr[] = { |
193 | &cpufreq_freq_attr_scaling_available_freqs, | 192 | &cpufreq_freq_attr_scaling_available_freqs, |
194 | NULL, | 193 | NULL, |
195 | }; | 194 | }; |
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void) | |||
227 | } | 226 | } |
228 | 227 | ||
229 | if (cpufreq_register_driver(&powernow_k6_driver)) { | 228 | if (cpufreq_register_driver(&powernow_k6_driver)) { |
230 | release_region (POWERNOW_IOPORT, 16); | 229 | release_region(POWERNOW_IOPORT, 16); |
231 | return -EINVAL; | 230 | return -EINVAL; |
232 | } | 231 | } |
233 | 232 | ||
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void) | |||
243 | static void __exit powernow_k6_exit(void) | 242 | static void __exit powernow_k6_exit(void) |
244 | { | 243 | { |
245 | cpufreq_unregister_driver(&powernow_k6_driver); | 244 | cpufreq_unregister_driver(&powernow_k6_driver); |
246 | release_region (POWERNOW_IOPORT, 16); | 245 | release_region(POWERNOW_IOPORT, 16); |
247 | } | 246 | } |
248 | 247 | ||
249 | 248 | ||
250 | MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); | 249 | MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); |
251 | MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); | 250 | MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); |
252 | MODULE_LICENSE ("GPL"); | 251 | MODULE_LICENSE("GPL"); |
253 | 252 | ||
254 | module_init(powernow_k6_init); | 253 | module_init(powernow_k6_init); |
255 | module_exit(powernow_k6_exit); | 254 | module_exit(powernow_k6_exit); |
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 15e13c01cc36..3b5f06423e77 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <asm/cpufeature.h> | 26 | #include <asm/cpufeature.h> |
27 | 27 | ||
28 | #define PFX "speedstep-centrino: " | 28 | #define PFX "speedstep-centrino: " |
29 | #define MAINTAINER "cpufreq@lists.linux.org.uk" | 29 | #define MAINTAINER "cpufreq@vger.kernel.org" |
30 | 30 | ||
31 | #define dprintk(msg...) \ | 31 | #define dprintk(msg...) \ |
32 | cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) | 32 | cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) |
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index e710a21bb6e8..ffd0f5ed071a 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c | |||
@@ -15,13 +15,11 @@ | |||
15 | /* | 15 | /* |
16 | * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU | 16 | * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU |
17 | */ | 17 | */ |
18 | static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) | 18 | static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) |
19 | { | 19 | { |
20 | unsigned char ccr2, ccr3; | 20 | unsigned char ccr2, ccr3; |
21 | unsigned long flags; | ||
22 | 21 | ||
23 | /* we test for DEVID by checking whether CCR3 is writable */ | 22 | /* we test for DEVID by checking whether CCR3 is writable */ |
24 | local_irq_save(flags); | ||
25 | ccr3 = getCx86(CX86_CCR3); | 23 | ccr3 = getCx86(CX86_CCR3); |
26 | setCx86(CX86_CCR3, ccr3 ^ 0x80); | 24 | setCx86(CX86_CCR3, ccr3 ^ 0x80); |
27 | getCx86(0xc0); /* dummy to change bus */ | 25 | getCx86(0xc0); /* dummy to change bus */ |
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) | |||
44 | *dir0 = getCx86(CX86_DIR0); | 42 | *dir0 = getCx86(CX86_DIR0); |
45 | *dir1 = getCx86(CX86_DIR1); | 43 | *dir1 = getCx86(CX86_DIR1); |
46 | } | 44 | } |
47 | local_irq_restore(flags); | ||
48 | } | 45 | } |
49 | 46 | ||
47 | static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) | ||
48 | { | ||
49 | unsigned long flags; | ||
50 | |||
51 | local_irq_save(flags); | ||
52 | __do_cyrix_devid(dir0, dir1); | ||
53 | local_irq_restore(flags); | ||
54 | } | ||
50 | /* | 55 | /* |
51 | * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in | 56 | * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in |
52 | * order to identify the Cyrix CPU model after we're out of setup.c | 57 | * order to identify the Cyrix CPU model after we're out of setup.c |
@@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void) | |||
116 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ | 121 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ |
117 | 122 | ||
118 | /* Load/Store Serialize to mem access disable (=reorder it) */ | 123 | /* Load/Store Serialize to mem access disable (=reorder it) */ |
119 | setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); | 124 | setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); |
120 | /* set load/store serialize from 1GB to 4GB */ | 125 | /* set load/store serialize from 1GB to 4GB */ |
121 | ccr3 |= 0xe0; | 126 | ccr3 |= 0xe0; |
122 | setCx86(CX86_CCR3, ccr3); | 127 | setCx86(CX86_CCR3, ccr3); |
@@ -127,11 +132,11 @@ static void __cpuinit set_cx86_memwb(void) | |||
127 | printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); | 132 | printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); |
128 | 133 | ||
129 | /* CCR2 bit 2: unlock NW bit */ | 134 | /* CCR2 bit 2: unlock NW bit */ |
130 | setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); | 135 | setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); |
131 | /* set 'Not Write-through' */ | 136 | /* set 'Not Write-through' */ |
132 | write_cr0(read_cr0() | X86_CR0_NW); | 137 | write_cr0(read_cr0() | X86_CR0_NW); |
133 | /* CCR2 bit 2: lock NW bit and set WT1 */ | 138 | /* CCR2 bit 2: lock NW bit and set WT1 */ |
134 | setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); | 139 | setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); |
135 | } | 140 | } |
136 | 141 | ||
137 | /* | 142 | /* |
@@ -145,14 +150,14 @@ static void __cpuinit geode_configure(void) | |||
145 | local_irq_save(flags); | 150 | local_irq_save(flags); |
146 | 151 | ||
147 | /* Suspend on halt power saving and enable #SUSP pin */ | 152 | /* Suspend on halt power saving and enable #SUSP pin */ |
148 | setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); | 153 | setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); |
149 | 154 | ||
150 | ccr3 = getCx86(CX86_CCR3); | 155 | ccr3 = getCx86(CX86_CCR3); |
151 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ | 156 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ |
152 | 157 | ||
153 | 158 | ||
154 | /* FPU fast, DTE cache, Mem bypass */ | 159 | /* FPU fast, DTE cache, Mem bypass */ |
155 | setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); | 160 | setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); |
156 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ | 161 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ |
157 | 162 | ||
158 | set_cx86_memwb(); | 163 | set_cx86_memwb(); |
@@ -161,6 +166,24 @@ static void __cpuinit geode_configure(void) | |||
161 | local_irq_restore(flags); | 166 | local_irq_restore(flags); |
162 | } | 167 | } |
163 | 168 | ||
169 | static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) | ||
170 | { | ||
171 | unsigned char dir0, dir0_msn, dir1 = 0; | ||
172 | |||
173 | __do_cyrix_devid(&dir0, &dir1); | ||
174 | dir0_msn = dir0 >> 4; /* identifies CPU "family" */ | ||
175 | |||
176 | switch (dir0_msn) { | ||
177 | case 3: /* 6x86/6x86L */ | ||
178 | /* Emulate MTRRs using Cyrix's ARRs. */ | ||
179 | set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); | ||
180 | break; | ||
181 | case 5: /* 6x86MX/M II */ | ||
182 | /* Emulate MTRRs using Cyrix's ARRs. */ | ||
183 | set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); | ||
184 | break; | ||
185 | } | ||
186 | } | ||
164 | 187 | ||
165 | static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) | 188 | static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) |
166 | { | 189 | { |
@@ -268,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) | |||
268 | /* GXm supports extended cpuid levels 'ala' AMD */ | 291 | /* GXm supports extended cpuid levels 'ala' AMD */ |
269 | if (c->cpuid_level == 2) { | 292 | if (c->cpuid_level == 2) { |
270 | /* Enable cxMMX extensions (GX1 Datasheet 54) */ | 293 | /* Enable cxMMX extensions (GX1 Datasheet 54) */ |
271 | setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); | 294 | setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); |
272 | 295 | ||
273 | /* | 296 | /* |
274 | * GXm : 0x30 ... 0x5f GXm datasheet 51 | 297 | * GXm : 0x30 ... 0x5f GXm datasheet 51 |
@@ -278,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) | |||
278 | */ | 301 | */ |
279 | if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) | 302 | if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) |
280 | geode_configure(); | 303 | geode_configure(); |
281 | get_model_name(c); /* get CPU marketing name */ | ||
282 | return; | 304 | return; |
283 | } else { /* MediaGX */ | 305 | } else { /* MediaGX */ |
284 | Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; | 306 | Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; |
@@ -291,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) | |||
291 | if (dir1 > 7) { | 313 | if (dir1 > 7) { |
292 | dir0_msn++; /* M II */ | 314 | dir0_msn++; /* M II */ |
293 | /* Enable MMX extensions (App note 108) */ | 315 | /* Enable MMX extensions (App note 108) */ |
294 | setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); | 316 | setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); |
295 | } else { | 317 | } else { |
296 | c->coma_bug = 1; /* 6x86MX, it has the bug. */ | 318 | c->coma_bug = 1; /* 6x86MX, it has the bug. */ |
297 | } | 319 | } |
@@ -406,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) | |||
406 | local_irq_save(flags); | 428 | local_irq_save(flags); |
407 | ccr3 = getCx86(CX86_CCR3); | 429 | ccr3 = getCx86(CX86_CCR3); |
408 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ | 430 | setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ |
409 | setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ | 431 | setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ |
410 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ | 432 | setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ |
411 | local_irq_restore(flags); | 433 | local_irq_restore(flags); |
412 | } | 434 | } |
@@ -416,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) | |||
416 | static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { | 438 | static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { |
417 | .c_vendor = "Cyrix", | 439 | .c_vendor = "Cyrix", |
418 | .c_ident = { "CyrixInstead" }, | 440 | .c_ident = { "CyrixInstead" }, |
441 | .c_early_init = early_init_cyrix, | ||
419 | .c_init = init_cyrix, | 442 | .c_init = init_cyrix, |
420 | .c_identify = cyrix_identify, | 443 | .c_identify = cyrix_identify, |
444 | .c_x86_vendor = X86_VENDOR_CYRIX, | ||
421 | }; | 445 | }; |
422 | 446 | ||
423 | cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); | 447 | cpu_dev_register(cyrix_cpu_dev); |
424 | 448 | ||
425 | static struct cpu_dev nsc_cpu_dev __cpuinitdata = { | 449 | static struct cpu_dev nsc_cpu_dev __cpuinitdata = { |
426 | .c_vendor = "NSC", | 450 | .c_vendor = "NSC", |
427 | .c_ident = { "Geode by NSC" }, | 451 | .c_ident = { "Geode by NSC" }, |
428 | .c_init = init_nsc, | 452 | .c_init = init_nsc, |
453 | .c_x86_vendor = X86_VENDOR_NSC, | ||
429 | }; | 454 | }; |
430 | 455 | ||
431 | cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); | 456 | cpu_dev_register(nsc_cpu_dev); |
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c deleted file mode 100644 index e43ad4ad4cba..000000000000 --- a/arch/x86/kernel/cpu/feature_names.c +++ /dev/null | |||
@@ -1,83 +0,0 @@ | |||
1 | /* | ||
2 | * Strings for the various x86 capability flags. | ||
3 | * | ||
4 | * This file must not contain any executable code. | ||
5 | */ | ||
6 | |||
7 | #include <asm/cpufeature.h> | ||
8 | |||
9 | /* | ||
10 | * These flag bits must match the definitions in <asm/cpufeature.h>. | ||
11 | * NULL means this bit is undefined or reserved; either way it doesn't | ||
12 | * have meaning as far as Linux is concerned. Note that it's important | ||
13 | * to realize there is a difference between this table and CPUID -- if | ||
14 | * applications want to get the raw CPUID data, they should access | ||
15 | * /dev/cpu/<cpu_nr>/cpuid instead. | ||
16 | */ | ||
17 | const char * const x86_cap_flags[NCAPINTS*32] = { | ||
18 | /* Intel-defined */ | ||
19 | "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", | ||
20 | "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", | ||
21 | "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", | ||
22 | "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", | ||
23 | |||
24 | /* AMD-defined */ | ||
25 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
26 | NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, | ||
27 | NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, | ||
28 | NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", | ||
29 | "3dnowext", "3dnow", | ||
30 | |||
31 | /* Transmeta-defined */ | ||
32 | "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, | ||
33 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
34 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
35 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
36 | |||
37 | /* Other (Linux-defined) */ | ||
38 | "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", | ||
39 | NULL, NULL, NULL, NULL, | ||
40 | "constant_tsc", "up", NULL, "arch_perfmon", | ||
41 | "pebs", "bts", NULL, NULL, | ||
42 | "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
43 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
44 | |||
45 | /* Intel-defined (#2) */ | ||
46 | "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", | ||
47 | "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, | ||
48 | NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", | ||
49 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
50 | |||
51 | /* VIA/Cyrix/Centaur-defined */ | ||
52 | NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", | ||
53 | "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, | ||
54 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
55 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
56 | |||
57 | /* AMD-defined (#2) */ | ||
58 | "lahf_lm", "cmp_legacy", "svm", "extapic", | ||
59 | "cr8_legacy", "abm", "sse4a", "misalignsse", | ||
60 | "3dnowprefetch", "osvw", "ibs", "sse5", | ||
61 | "skinit", "wdt", NULL, NULL, | ||
62 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
63 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
64 | |||
65 | /* Auxiliary (Linux-defined) */ | ||
66 | "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
67 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
68 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
69 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
70 | }; | ||
71 | |||
72 | const char *const x86_power_flags[32] = { | ||
73 | "ts", /* temperature sensor */ | ||
74 | "fid", /* frequency id control */ | ||
75 | "vid", /* voltage id control */ | ||
76 | "ttp", /* thermal trip */ | ||
77 | "tm", | ||
78 | "stc", | ||
79 | "100mhzsteps", | ||
80 | "hwpstate", | ||
81 | "", /* tsc invariant mapped to constant_tsc */ | ||
82 | /* nothing */ | ||
83 | }; | ||
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b75f2569b8f8..99468dbd08da 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -15,6 +15,11 @@ | |||
15 | #include <asm/ds.h> | 15 | #include <asm/ds.h> |
16 | #include <asm/bugs.h> | 16 | #include <asm/bugs.h> |
17 | 17 | ||
18 | #ifdef CONFIG_X86_64 | ||
19 | #include <asm/topology.h> | ||
20 | #include <asm/numa_64.h> | ||
21 | #endif | ||
22 | |||
18 | #include "cpu.h" | 23 | #include "cpu.h" |
19 | 24 | ||
20 | #ifdef CONFIG_X86_LOCAL_APIC | 25 | #ifdef CONFIG_X86_LOCAL_APIC |
@@ -23,23 +28,22 @@ | |||
23 | #include <mach_apic.h> | 28 | #include <mach_apic.h> |
24 | #endif | 29 | #endif |
25 | 30 | ||
26 | #ifdef CONFIG_X86_INTEL_USERCOPY | ||
27 | /* | ||
28 | * Alignment at which movsl is preferred for bulk memory copies. | ||
29 | */ | ||
30 | struct movsl_mask movsl_mask __read_mostly; | ||
31 | #endif | ||
32 | |||
33 | static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | 31 | static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) |
34 | { | 32 | { |
35 | /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ | ||
36 | if (c->x86 == 15 && c->x86_cache_alignment == 64) | ||
37 | c->x86_cache_alignment = 128; | ||
38 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || | 33 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || |
39 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | 34 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) |
40 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 35 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
36 | |||
37 | #ifdef CONFIG_X86_64 | ||
38 | set_cpu_cap(c, X86_FEATURE_SYSENTER32); | ||
39 | #else | ||
40 | /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ | ||
41 | if (c->x86 == 15 && c->x86_cache_alignment == 64) | ||
42 | c->x86_cache_alignment = 128; | ||
43 | #endif | ||
41 | } | 44 | } |
42 | 45 | ||
46 | #ifdef CONFIG_X86_32 | ||
43 | /* | 47 | /* |
44 | * Early probe support logic for ppro memory erratum #50 | 48 | * Early probe support logic for ppro memory erratum #50 |
45 | * | 49 | * |
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void) | |||
59 | return 0; | 63 | return 0; |
60 | } | 64 | } |
61 | 65 | ||
66 | #ifdef CONFIG_X86_F00F_BUG | ||
67 | static void __cpuinit trap_init_f00f_bug(void) | ||
68 | { | ||
69 | __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); | ||
62 | 70 | ||
63 | /* | 71 | /* |
64 | * P4 Xeon errata 037 workaround. | 72 | * Update the IDT descriptor and reload the IDT so that |
65 | * Hardware prefetcher may cause stale data to be loaded into the cache. | 73 | * it uses the read-only mapped virtual address. |
66 | */ | 74 | */ |
67 | static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) | 75 | idt_descr.address = fix_to_virt(FIX_F00F_IDT); |
76 | load_idt(&idt_descr); | ||
77 | } | ||
78 | #endif | ||
79 | |||
80 | static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | ||
68 | { | 81 | { |
69 | unsigned long lo, hi; | 82 | unsigned long lo, hi; |
70 | 83 | ||
84 | #ifdef CONFIG_X86_F00F_BUG | ||
85 | /* | ||
86 | * All current models of Pentium and Pentium with MMX technology CPUs | ||
87 | * have the F0 0F bug, which lets nonprivileged users lock up the system. | ||
88 | * Note that the workaround only should be initialized once... | ||
89 | */ | ||
90 | c->f00f_bug = 0; | ||
91 | if (!paravirt_enabled() && c->x86 == 5) { | ||
92 | static int f00f_workaround_enabled; | ||
93 | |||
94 | c->f00f_bug = 1; | ||
95 | if (!f00f_workaround_enabled) { | ||
96 | trap_init_f00f_bug(); | ||
97 | printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); | ||
98 | f00f_workaround_enabled = 1; | ||
99 | } | ||
100 | } | ||
101 | #endif | ||
102 | |||
103 | /* | ||
104 | * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until | ||
105 | * model 3 mask 3 | ||
106 | */ | ||
107 | if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) | ||
108 | clear_cpu_cap(c, X86_FEATURE_SEP); | ||
109 | |||
110 | /* | ||
111 | * P4 Xeon errata 037 workaround. | ||
112 | * Hardware prefetcher may cause stale data to be loaded into the cache. | ||
113 | */ | ||
71 | if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { | 114 | if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { |
72 | rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); | 115 | rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); |
73 | if ((lo & (1<<9)) == 0) { | 116 | if ((lo & (1<<9)) == 0) { |
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) | |||
77 | wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); | 120 | wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); |
78 | } | 121 | } |
79 | } | 122 | } |
123 | |||
124 | /* | ||
125 | * See if we have a good local APIC by checking for buggy Pentia, | ||
126 | * i.e. all B steppings and the C2 stepping of P54C when using their | ||
127 | * integrated APIC (see 11AP erratum in "Pentium Processor | ||
128 | * Specification Update"). | ||
129 | */ | ||
130 | if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && | ||
131 | (c->x86_mask < 0x6 || c->x86_mask == 0xb)) | ||
132 | set_cpu_cap(c, X86_FEATURE_11AP); | ||
133 | |||
134 | |||
135 | #ifdef CONFIG_X86_INTEL_USERCOPY | ||
136 | /* | ||
137 | * Set up the preferred alignment for movsl bulk memory moves | ||
138 | */ | ||
139 | switch (c->x86) { | ||
140 | case 4: /* 486: untested */ | ||
141 | break; | ||
142 | case 5: /* Old Pentia: untested */ | ||
143 | break; | ||
144 | case 6: /* PII/PIII only like movsl with 8-byte alignment */ | ||
145 | movsl_mask.mask = 7; | ||
146 | break; | ||
147 | case 15: /* P4 is OK down to 8-byte alignment */ | ||
148 | movsl_mask.mask = 7; | ||
149 | break; | ||
150 | } | ||
151 | #endif | ||
152 | |||
153 | #ifdef CONFIG_X86_NUMAQ | ||
154 | numaq_tsc_disable(); | ||
155 | #endif | ||
80 | } | 156 | } |
157 | #else | ||
158 | static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | ||
159 | { | ||
160 | } | ||
161 | #endif | ||
81 | 162 | ||
163 | static void __cpuinit srat_detect_node(void) | ||
164 | { | ||
165 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | ||
166 | unsigned node; | ||
167 | int cpu = smp_processor_id(); | ||
168 | int apicid = hard_smp_processor_id(); | ||
169 | |||
170 | /* Don't do the funky fallback heuristics the AMD version employs | ||
171 | for now. */ | ||
172 | node = apicid_to_node[apicid]; | ||
173 | if (node == NUMA_NO_NODE || !node_online(node)) | ||
174 | node = first_node(node_online_map); | ||
175 | numa_set_node(cpu, node); | ||
176 | |||
177 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
178 | #endif | ||
179 | } | ||
82 | 180 | ||
83 | /* | 181 | /* |
84 | * find out the number of processor cores on the die | 182 | * find out the number of processor cores on the die |
85 | */ | 183 | */ |
86 | static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) | 184 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) |
87 | { | 185 | { |
88 | unsigned int eax, ebx, ecx, edx; | 186 | unsigned int eax, ebx, ecx, edx; |
89 | 187 | ||
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) | |||
98 | return 1; | 196 | return 1; |
99 | } | 197 | } |
100 | 198 | ||
101 | #ifdef CONFIG_X86_F00F_BUG | 199 | static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) |
102 | static void __cpuinit trap_init_f00f_bug(void) | ||
103 | { | 200 | { |
104 | __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); | 201 | /* Intel VMX MSR indicated features */ |
105 | 202 | #define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 | |
106 | /* | 203 | #define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 |
107 | * Update the IDT descriptor and reload the IDT so that | 204 | #define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 |
108 | * it uses the read-only mapped virtual address. | 205 | #define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 |
109 | */ | 206 | #define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 |
110 | idt_descr.address = fix_to_virt(FIX_F00F_IDT); | 207 | #define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 |
111 | load_idt(&idt_descr); | 208 | |
209 | u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; | ||
210 | |||
211 | clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); | ||
212 | clear_cpu_cap(c, X86_FEATURE_VNMI); | ||
213 | clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); | ||
214 | clear_cpu_cap(c, X86_FEATURE_EPT); | ||
215 | clear_cpu_cap(c, X86_FEATURE_VPID); | ||
216 | |||
217 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); | ||
218 | msr_ctl = vmx_msr_high | vmx_msr_low; | ||
219 | if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) | ||
220 | set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); | ||
221 | if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) | ||
222 | set_cpu_cap(c, X86_FEATURE_VNMI); | ||
223 | if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { | ||
224 | rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, | ||
225 | vmx_msr_low, vmx_msr_high); | ||
226 | msr_ctl2 = vmx_msr_high | vmx_msr_low; | ||
227 | if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && | ||
228 | (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) | ||
229 | set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); | ||
230 | if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) | ||
231 | set_cpu_cap(c, X86_FEATURE_EPT); | ||
232 | if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) | ||
233 | set_cpu_cap(c, X86_FEATURE_VPID); | ||
234 | } | ||
112 | } | 235 | } |
113 | #endif | ||
114 | 236 | ||
115 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) | 237 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) |
116 | { | 238 | { |
117 | unsigned int l2 = 0; | 239 | unsigned int l2 = 0; |
118 | char *p = NULL; | ||
119 | 240 | ||
120 | early_init_intel(c); | 241 | early_init_intel(c); |
121 | 242 | ||
122 | #ifdef CONFIG_X86_F00F_BUG | 243 | intel_workarounds(c); |
123 | /* | ||
124 | * All current models of Pentium and Pentium with MMX technology CPUs | ||
125 | * have the F0 0F bug, which lets nonprivileged users lock up the system. | ||
126 | * Note that the workaround only should be initialized once... | ||
127 | */ | ||
128 | c->f00f_bug = 0; | ||
129 | if (!paravirt_enabled() && c->x86 == 5) { | ||
130 | static int f00f_workaround_enabled; | ||
131 | |||
132 | c->f00f_bug = 1; | ||
133 | if (!f00f_workaround_enabled) { | ||
134 | trap_init_f00f_bug(); | ||
135 | printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); | ||
136 | f00f_workaround_enabled = 1; | ||
137 | } | ||
138 | } | ||
139 | #endif | ||
140 | 244 | ||
141 | l2 = init_intel_cacheinfo(c); | 245 | l2 = init_intel_cacheinfo(c); |
142 | if (c->cpuid_level > 9) { | 246 | if (c->cpuid_level > 9) { |
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
146 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | 250 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); |
147 | } | 251 | } |
148 | 252 | ||
149 | /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ | 253 | if (cpu_has_xmm2) |
150 | if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) | 254 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); |
151 | clear_cpu_cap(c, X86_FEATURE_SEP); | 255 | if (cpu_has_ds) { |
256 | unsigned int l1; | ||
257 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | ||
258 | if (!(l1 & (1<<11))) | ||
259 | set_cpu_cap(c, X86_FEATURE_BTS); | ||
260 | if (!(l1 & (1<<12))) | ||
261 | set_cpu_cap(c, X86_FEATURE_PEBS); | ||
262 | ds_init_intel(c); | ||
263 | } | ||
152 | 264 | ||
265 | #ifdef CONFIG_X86_64 | ||
266 | if (c->x86 == 15) | ||
267 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
268 | if (c->x86 == 6) | ||
269 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
270 | #else | ||
153 | /* | 271 | /* |
154 | * Names for the Pentium II/Celeron processors | 272 | * Names for the Pentium II/Celeron processors |
155 | * detectable only by also checking the cache size. | 273 | * detectable only by also checking the cache size. |
156 | * Dixon is NOT a Celeron. | 274 | * Dixon is NOT a Celeron. |
157 | */ | 275 | */ |
158 | if (c->x86 == 6) { | 276 | if (c->x86 == 6) { |
277 | char *p = NULL; | ||
278 | |||
159 | switch (c->x86_model) { | 279 | switch (c->x86_model) { |
160 | case 5: | 280 | case 5: |
161 | if (c->x86_mask == 0) { | 281 | if (c->x86_mask == 0) { |
@@ -178,70 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) | |||
178 | p = "Celeron (Coppermine)"; | 298 | p = "Celeron (Coppermine)"; |
179 | break; | 299 | break; |
180 | } | 300 | } |
181 | } | ||
182 | |||
183 | if (p) | ||
184 | strcpy(c->x86_model_id, p); | ||
185 | |||
186 | c->x86_max_cores = num_cpu_cores(c); | ||
187 | |||
188 | detect_ht(c); | ||
189 | 301 | ||
190 | /* Work around errata */ | 302 | if (p) |
191 | Intel_errata_workarounds(c); | 303 | strcpy(c->x86_model_id, p); |
192 | |||
193 | #ifdef CONFIG_X86_INTEL_USERCOPY | ||
194 | /* | ||
195 | * Set up the preferred alignment for movsl bulk memory moves | ||
196 | */ | ||
197 | switch (c->x86) { | ||
198 | case 4: /* 486: untested */ | ||
199 | break; | ||
200 | case 5: /* Old Pentia: untested */ | ||
201 | break; | ||
202 | case 6: /* PII/PIII only like movsl with 8-byte alignment */ | ||
203 | movsl_mask.mask = 7; | ||
204 | break; | ||
205 | case 15: /* P4 is OK down to 8-byte alignment */ | ||
206 | movsl_mask.mask = 7; | ||
207 | break; | ||
208 | } | 304 | } |
209 | #endif | ||
210 | 305 | ||
211 | if (cpu_has_xmm2) | 306 | if (c->x86 == 15) |
212 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | ||
213 | if (c->x86 == 15) { | ||
214 | set_cpu_cap(c, X86_FEATURE_P4); | 307 | set_cpu_cap(c, X86_FEATURE_P4); |
215 | } | ||
216 | if (c->x86 == 6) | 308 | if (c->x86 == 6) |
217 | set_cpu_cap(c, X86_FEATURE_P3); | 309 | set_cpu_cap(c, X86_FEATURE_P3); |
218 | if (cpu_has_ds) { | ||
219 | unsigned int l1; | ||
220 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | ||
221 | if (!(l1 & (1<<11))) | ||
222 | set_cpu_cap(c, X86_FEATURE_BTS); | ||
223 | if (!(l1 & (1<<12))) | ||
224 | set_cpu_cap(c, X86_FEATURE_PEBS); | ||
225 | } | ||
226 | 310 | ||
227 | if (cpu_has_bts) | 311 | if (cpu_has_bts) |
228 | ds_init_intel(c); | 312 | ptrace_bts_init_intel(c); |
229 | 313 | ||
230 | /* | 314 | #endif |
231 | * See if we have a good local APIC by checking for buggy Pentia, | ||
232 | * i.e. all B steppings and the C2 stepping of P54C when using their | ||
233 | * integrated APIC (see 11AP erratum in "Pentium Processor | ||
234 | * Specification Update"). | ||
235 | */ | ||
236 | if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && | ||
237 | (c->x86_mask < 0x6 || c->x86_mask == 0xb)) | ||
238 | set_cpu_cap(c, X86_FEATURE_11AP); | ||
239 | 315 | ||
240 | #ifdef CONFIG_X86_NUMAQ | 316 | detect_extended_topology(c); |
241 | numaq_tsc_disable(); | 317 | if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { |
318 | /* | ||
319 | * let's use the legacy cpuid vector 0x1 and 0x4 for topology | ||
320 | * detection. | ||
321 | */ | ||
322 | c->x86_max_cores = intel_num_cpu_cores(c); | ||
323 | #ifdef CONFIG_X86_32 | ||
324 | detect_ht(c); | ||
242 | #endif | 325 | #endif |
326 | } | ||
327 | |||
328 | /* Work around errata */ | ||
329 | srat_detect_node(); | ||
330 | |||
331 | if (cpu_has(c, X86_FEATURE_VMX)) | ||
332 | detect_vmx_virtcap(c); | ||
243 | } | 333 | } |
244 | 334 | ||
335 | #ifdef CONFIG_X86_32 | ||
245 | static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) | 336 | static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) |
246 | { | 337 | { |
247 | /* | 338 | /* |
@@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i | |||
254 | size = 256; | 345 | size = 256; |
255 | return size; | 346 | return size; |
256 | } | 347 | } |
348 | #endif | ||
257 | 349 | ||
258 | static struct cpu_dev intel_cpu_dev __cpuinitdata = { | 350 | static struct cpu_dev intel_cpu_dev __cpuinitdata = { |
259 | .c_vendor = "Intel", | 351 | .c_vendor = "Intel", |
260 | .c_ident = { "GenuineIntel" }, | 352 | .c_ident = { "GenuineIntel" }, |
353 | #ifdef CONFIG_X86_32 | ||
261 | .c_models = { | 354 | .c_models = { |
262 | { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = | 355 | { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = |
263 | { | 356 | { |
@@ -307,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { | |||
307 | } | 400 | } |
308 | }, | 401 | }, |
309 | }, | 402 | }, |
403 | .c_size_cache = intel_size_cache, | ||
404 | #endif | ||
310 | .c_early_init = early_init_intel, | 405 | .c_early_init = early_init_intel, |
311 | .c_init = init_intel, | 406 | .c_init = init_intel, |
312 | .c_size_cache = intel_size_cache, | 407 | .c_x86_vendor = X86_VENDOR_INTEL, |
313 | }; | 408 | }; |
314 | 409 | ||
315 | cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); | 410 | cpu_dev_register(intel_cpu_dev); |
316 | |||
317 | #ifndef CONFIG_X86_CMPXCHG | ||
318 | unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) | ||
319 | { | ||
320 | u8 prev; | ||
321 | unsigned long flags; | ||
322 | |||
323 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
324 | local_irq_save(flags); | ||
325 | prev = *(u8 *)ptr; | ||
326 | if (prev == old) | ||
327 | *(u8 *)ptr = new; | ||
328 | local_irq_restore(flags); | ||
329 | return prev; | ||
330 | } | ||
331 | EXPORT_SYMBOL(cmpxchg_386_u8); | ||
332 | |||
333 | unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) | ||
334 | { | ||
335 | u16 prev; | ||
336 | unsigned long flags; | ||
337 | |||
338 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
339 | local_irq_save(flags); | ||
340 | prev = *(u16 *)ptr; | ||
341 | if (prev == old) | ||
342 | *(u16 *)ptr = new; | ||
343 | local_irq_restore(flags); | ||
344 | return prev; | ||
345 | } | ||
346 | EXPORT_SYMBOL(cmpxchg_386_u16); | ||
347 | |||
348 | unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) | ||
349 | { | ||
350 | u32 prev; | ||
351 | unsigned long flags; | ||
352 | |||
353 | /* Poor man's cmpxchg for 386. Unsuitable for SMP */ | ||
354 | local_irq_save(flags); | ||
355 | prev = *(u32 *)ptr; | ||
356 | if (prev == old) | ||
357 | *(u32 *)ptr = new; | ||
358 | local_irq_restore(flags); | ||
359 | return prev; | ||
360 | } | ||
361 | EXPORT_SYMBOL(cmpxchg_386_u32); | ||
362 | #endif | ||
363 | |||
364 | #ifndef CONFIG_X86_CMPXCHG64 | ||
365 | unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) | ||
366 | { | ||
367 | u64 prev; | ||
368 | unsigned long flags; | ||
369 | |||
370 | /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ | ||
371 | local_irq_save(flags); | ||
372 | prev = *(u64 *)ptr; | ||
373 | if (prev == old) | ||
374 | *(u64 *)ptr = new; | ||
375 | local_irq_restore(flags); | ||
376 | return prev; | ||
377 | } | ||
378 | EXPORT_SYMBOL(cmpxchg_486_u64); | ||
379 | #endif | ||
380 | |||
381 | /* arch_initcall(intel_cpu_init); */ | ||
382 | 411 | ||
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c deleted file mode 100644 index 1019c58d39f0..000000000000 --- a/arch/x86/kernel/cpu/intel_64.c +++ /dev/null | |||
@@ -1,95 +0,0 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/smp.h> | ||
3 | #include <asm/processor.h> | ||
4 | #include <asm/ptrace.h> | ||
5 | #include <asm/topology.h> | ||
6 | #include <asm/numa_64.h> | ||
7 | |||
8 | #include "cpu.h" | ||
9 | |||
10 | static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) | ||
11 | { | ||
12 | if ((c->x86 == 0xf && c->x86_model >= 0x03) || | ||
13 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | ||
14 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
15 | |||
16 | set_cpu_cap(c, X86_FEATURE_SYSENTER32); | ||
17 | } | ||
18 | |||
19 | /* | ||
20 | * find out the number of processor cores on the die | ||
21 | */ | ||
22 | static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) | ||
23 | { | ||
24 | unsigned int eax, t; | ||
25 | |||
26 | if (c->cpuid_level < 4) | ||
27 | return 1; | ||
28 | |||
29 | cpuid_count(4, 0, &eax, &t, &t, &t); | ||
30 | |||
31 | if (eax & 0x1f) | ||
32 | return ((eax >> 26) + 1); | ||
33 | else | ||
34 | return 1; | ||
35 | } | ||
36 | |||
37 | static void __cpuinit srat_detect_node(void) | ||
38 | { | ||
39 | #ifdef CONFIG_NUMA | ||
40 | unsigned node; | ||
41 | int cpu = smp_processor_id(); | ||
42 | int apicid = hard_smp_processor_id(); | ||
43 | |||
44 | /* Don't do the funky fallback heuristics the AMD version employs | ||
45 | for now. */ | ||
46 | node = apicid_to_node[apicid]; | ||
47 | if (node == NUMA_NO_NODE || !node_online(node)) | ||
48 | node = first_node(node_online_map); | ||
49 | numa_set_node(cpu, node); | ||
50 | |||
51 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | ||
52 | #endif | ||
53 | } | ||
54 | |||
55 | static void __cpuinit init_intel(struct cpuinfo_x86 *c) | ||
56 | { | ||
57 | init_intel_cacheinfo(c); | ||
58 | if (c->cpuid_level > 9) { | ||
59 | unsigned eax = cpuid_eax(10); | ||
60 | /* Check for version and the number of counters */ | ||
61 | if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) | ||
62 | set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); | ||
63 | } | ||
64 | |||
65 | if (cpu_has_ds) { | ||
66 | unsigned int l1, l2; | ||
67 | rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); | ||
68 | if (!(l1 & (1<<11))) | ||
69 | set_cpu_cap(c, X86_FEATURE_BTS); | ||
70 | if (!(l1 & (1<<12))) | ||
71 | set_cpu_cap(c, X86_FEATURE_PEBS); | ||
72 | } | ||
73 | |||
74 | |||
75 | if (cpu_has_bts) | ||
76 | ds_init_intel(c); | ||
77 | |||
78 | if (c->x86 == 15) | ||
79 | c->x86_cache_alignment = c->x86_clflush_size * 2; | ||
80 | if (c->x86 == 6) | ||
81 | set_cpu_cap(c, X86_FEATURE_REP_GOOD); | ||
82 | set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); | ||
83 | c->x86_max_cores = intel_num_cpu_cores(c); | ||
84 | |||
85 | srat_detect_node(); | ||
86 | } | ||
87 | |||
88 | static struct cpu_dev intel_cpu_dev __cpuinitdata = { | ||
89 | .c_vendor = "Intel", | ||
90 | .c_ident = { "GenuineIntel" }, | ||
91 | .c_early_init = early_init_intel, | ||
92 | .c_init = init_intel, | ||
93 | }; | ||
94 | cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); | ||
95 | |||
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b0a10b002f1..3f46afbb1cf1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * Routines to indentify caches on Intel CPU. | 2 | * Routines to indentify caches on Intel CPU. |
3 | * | 3 | * |
4 | * Changes: | 4 | * Changes: |
5 | * Venkatesh Pallipadi : Adding cache identification through cpuid(4) | 5 | * Venkatesh Pallipadi : Adding cache identification through cpuid(4) |
6 | * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. | 6 | * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. |
7 | * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. | 7 | * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. |
8 | */ | 8 | */ |
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/pci.h> | ||
16 | 17 | ||
17 | #include <asm/processor.h> | 18 | #include <asm/processor.h> |
18 | #include <asm/smp.h> | 19 | #include <asm/smp.h> |
@@ -130,9 +131,18 @@ struct _cpuid4_info { | |||
130 | union _cpuid4_leaf_ebx ebx; | 131 | union _cpuid4_leaf_ebx ebx; |
131 | union _cpuid4_leaf_ecx ecx; | 132 | union _cpuid4_leaf_ecx ecx; |
132 | unsigned long size; | 133 | unsigned long size; |
134 | unsigned long can_disable; | ||
133 | cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ | 135 | cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ |
134 | }; | 136 | }; |
135 | 137 | ||
138 | #ifdef CONFIG_PCI | ||
139 | static struct pci_device_id k8_nb_id[] = { | ||
140 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, | ||
141 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, | ||
142 | {} | ||
143 | }; | ||
144 | #endif | ||
145 | |||
136 | unsigned short num_cache_leaves; | 146 | unsigned short num_cache_leaves; |
137 | 147 | ||
138 | /* AMD doesn't have CPUID4. Emulate it here to report the same | 148 | /* AMD doesn't have CPUID4. Emulate it here to report the same |
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = { | |||
182 | static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; | 192 | static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; |
183 | static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; | 193 | static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; |
184 | 194 | ||
185 | static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | 195 | static void __cpuinit |
186 | union _cpuid4_leaf_ebx *ebx, | 196 | amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, |
187 | union _cpuid4_leaf_ecx *ecx) | 197 | union _cpuid4_leaf_ebx *ebx, |
198 | union _cpuid4_leaf_ecx *ecx) | ||
188 | { | 199 | { |
189 | unsigned dummy; | 200 | unsigned dummy; |
190 | unsigned line_size, lines_per_tag, assoc, size_in_kb; | 201 | unsigned line_size, lines_per_tag, assoc, size_in_kb; |
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
251 | (ebx->split.ways_of_associativity + 1) - 1; | 262 | (ebx->split.ways_of_associativity + 1) - 1; |
252 | } | 263 | } |
253 | 264 | ||
254 | static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) | 265 | static void __cpuinit |
266 | amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) | ||
267 | { | ||
268 | if (index < 3) | ||
269 | return; | ||
270 | this_leaf->can_disable = 1; | ||
271 | } | ||
272 | |||
273 | static int | ||
274 | __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) | ||
255 | { | 275 | { |
256 | union _cpuid4_leaf_eax eax; | 276 | union _cpuid4_leaf_eax eax; |
257 | union _cpuid4_leaf_ebx ebx; | 277 | union _cpuid4_leaf_ebx ebx; |
258 | union _cpuid4_leaf_ecx ecx; | 278 | union _cpuid4_leaf_ecx ecx; |
259 | unsigned edx; | 279 | unsigned edx; |
260 | 280 | ||
261 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | 281 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { |
262 | amd_cpuid4(index, &eax, &ebx, &ecx); | 282 | amd_cpuid4(index, &eax, &ebx, &ecx); |
263 | else | 283 | if (boot_cpu_data.x86 >= 0x10) |
264 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | 284 | amd_check_l3_disable(index, this_leaf); |
285 | } else { | ||
286 | cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); | ||
287 | } | ||
288 | |||
265 | if (eax.split.type == CACHE_TYPE_NULL) | 289 | if (eax.split.type == CACHE_TYPE_NULL) |
266 | return -EIO; /* better error ? */ | 290 | return -EIO; /* better error ? */ |
267 | 291 | ||
268 | this_leaf->eax = eax; | 292 | this_leaf->eax = eax; |
269 | this_leaf->ebx = ebx; | 293 | this_leaf->ebx = ebx; |
270 | this_leaf->ecx = ecx; | 294 | this_leaf->ecx = ecx; |
271 | this_leaf->size = (ecx.split.number_of_sets + 1) * | 295 | this_leaf->size = (ecx.split.number_of_sets + 1) * |
272 | (ebx.split.coherency_line_size + 1) * | 296 | (ebx.split.coherency_line_size + 1) * |
273 | (ebx.split.physical_line_partition + 1) * | 297 | (ebx.split.physical_line_partition + 1) * |
274 | (ebx.split.ways_of_associativity + 1); | 298 | (ebx.split.ways_of_associativity + 1); |
275 | return 0; | 299 | return 0; |
276 | } | 300 | } |
277 | 301 | ||
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) | |||
453 | 477 | ||
454 | /* pointer to _cpuid4_info array (for each cache leaf) */ | 478 | /* pointer to _cpuid4_info array (for each cache leaf) */ |
455 | static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); | 479 | static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); |
456 | #define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) | 480 | #define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) |
457 | 481 | ||
458 | #ifdef CONFIG_SMP | 482 | #ifdef CONFIG_SMP |
459 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) | 483 | static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) |
@@ -490,7 +514,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) | |||
490 | 514 | ||
491 | this_leaf = CPUID4_INFO_IDX(cpu, index); | 515 | this_leaf = CPUID4_INFO_IDX(cpu, index); |
492 | for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { | 516 | for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { |
493 | sibling_leaf = CPUID4_INFO_IDX(sibling, index); | 517 | sibling_leaf = CPUID4_INFO_IDX(sibling, index); |
494 | cpu_clear(cpu, sibling_leaf->shared_cpu_map); | 518 | cpu_clear(cpu, sibling_leaf->shared_cpu_map); |
495 | } | 519 | } |
496 | } | 520 | } |
@@ -572,7 +596,7 @@ struct _index_kobject { | |||
572 | 596 | ||
573 | /* pointer to array of kobjects for cpuX/cache/indexY */ | 597 | /* pointer to array of kobjects for cpuX/cache/indexY */ |
574 | static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); | 598 | static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); |
575 | #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) | 599 | #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) |
576 | 600 | ||
577 | #define show_one_plus(file_name, object, val) \ | 601 | #define show_one_plus(file_name, object, val) \ |
578 | static ssize_t show_##file_name \ | 602 | static ssize_t show_##file_name \ |
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { | |||
637 | } | 661 | } |
638 | } | 662 | } |
639 | 663 | ||
664 | #define to_object(k) container_of(k, struct _index_kobject, kobj) | ||
665 | #define to_attr(a) container_of(a, struct _cache_attr, attr) | ||
666 | |||
667 | #ifdef CONFIG_PCI | ||
668 | static struct pci_dev *get_k8_northbridge(int node) | ||
669 | { | ||
670 | struct pci_dev *dev = NULL; | ||
671 | int i; | ||
672 | |||
673 | for (i = 0; i <= node; i++) { | ||
674 | do { | ||
675 | dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); | ||
676 | if (!dev) | ||
677 | break; | ||
678 | } while (!pci_match_id(&k8_nb_id[0], dev)); | ||
679 | if (!dev) | ||
680 | break; | ||
681 | } | ||
682 | return dev; | ||
683 | } | ||
684 | #else | ||
685 | static struct pci_dev *get_k8_northbridge(int node) | ||
686 | { | ||
687 | return NULL; | ||
688 | } | ||
689 | #endif | ||
690 | |||
691 | static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) | ||
692 | { | ||
693 | int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); | ||
694 | struct pci_dev *dev = NULL; | ||
695 | ssize_t ret = 0; | ||
696 | int i; | ||
697 | |||
698 | if (!this_leaf->can_disable) | ||
699 | return sprintf(buf, "Feature not enabled\n"); | ||
700 | |||
701 | dev = get_k8_northbridge(node); | ||
702 | if (!dev) { | ||
703 | printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); | ||
704 | return -EINVAL; | ||
705 | } | ||
706 | |||
707 | for (i = 0; i < 2; i++) { | ||
708 | unsigned int reg; | ||
709 | |||
710 | pci_read_config_dword(dev, 0x1BC + i * 4, &reg); | ||
711 | |||
712 | ret += sprintf(buf, "%sEntry: %d\n", buf, i); | ||
713 | ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", | ||
714 | buf, | ||
715 | reg & 0x80000000 ? "Disabled" : "Allowed", | ||
716 | reg & 0x40000000 ? "Disabled" : "Allowed"); | ||
717 | ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", | ||
718 | buf, (reg & 0x30000) >> 16, reg & 0xfff); | ||
719 | } | ||
720 | return ret; | ||
721 | } | ||
722 | |||
723 | static ssize_t | ||
724 | store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, | ||
725 | size_t count) | ||
726 | { | ||
727 | int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); | ||
728 | struct pci_dev *dev = NULL; | ||
729 | unsigned int ret, index, val; | ||
730 | |||
731 | if (!this_leaf->can_disable) | ||
732 | return 0; | ||
733 | |||
734 | if (strlen(buf) > 15) | ||
735 | return -EINVAL; | ||
736 | |||
737 | ret = sscanf(buf, "%x %x", &index, &val); | ||
738 | if (ret != 2) | ||
739 | return -EINVAL; | ||
740 | if (index > 1) | ||
741 | return -EINVAL; | ||
742 | |||
743 | val |= 0xc0000000; | ||
744 | dev = get_k8_northbridge(node); | ||
745 | if (!dev) { | ||
746 | printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); | ||
747 | return -EINVAL; | ||
748 | } | ||
749 | |||
750 | pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); | ||
751 | wbinvd(); | ||
752 | pci_write_config_dword(dev, 0x1BC + index * 4, val); | ||
753 | |||
754 | return 1; | ||
755 | } | ||
756 | |||
640 | struct _cache_attr { | 757 | struct _cache_attr { |
641 | struct attribute attr; | 758 | struct attribute attr; |
642 | ssize_t (*show)(struct _cpuid4_info *, char *); | 759 | ssize_t (*show)(struct _cpuid4_info *, char *); |
@@ -657,6 +774,8 @@ define_one_ro(size); | |||
657 | define_one_ro(shared_cpu_map); | 774 | define_one_ro(shared_cpu_map); |
658 | define_one_ro(shared_cpu_list); | 775 | define_one_ro(shared_cpu_list); |
659 | 776 | ||
777 | static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); | ||
778 | |||
660 | static struct attribute * default_attrs[] = { | 779 | static struct attribute * default_attrs[] = { |
661 | &type.attr, | 780 | &type.attr, |
662 | &level.attr, | 781 | &level.attr, |
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = { | |||
667 | &size.attr, | 786 | &size.attr, |
668 | &shared_cpu_map.attr, | 787 | &shared_cpu_map.attr, |
669 | &shared_cpu_list.attr, | 788 | &shared_cpu_list.attr, |
789 | &cache_disable.attr, | ||
670 | NULL | 790 | NULL |
671 | }; | 791 | }; |
672 | 792 | ||
673 | #define to_object(k) container_of(k, struct _index_kobject, kobj) | ||
674 | #define to_attr(a) container_of(a, struct _cache_attr, attr) | ||
675 | |||
676 | static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) | 793 | static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) |
677 | { | 794 | { |
678 | struct _cache_attr *fattr = to_attr(attr); | 795 | struct _cache_attr *fattr = to_attr(attr); |
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) | |||
682 | ret = fattr->show ? | 799 | ret = fattr->show ? |
683 | fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), | 800 | fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), |
684 | buf) : | 801 | buf) : |
685 | 0; | 802 | 0; |
686 | return ret; | 803 | return ret; |
687 | } | 804 | } |
688 | 805 | ||
689 | static ssize_t store(struct kobject * kobj, struct attribute * attr, | 806 | static ssize_t store(struct kobject * kobj, struct attribute * attr, |
690 | const char * buf, size_t count) | 807 | const char * buf, size_t count) |
691 | { | 808 | { |
692 | return 0; | 809 | struct _cache_attr *fattr = to_attr(attr); |
810 | struct _index_kobject *this_leaf = to_object(kobj); | ||
811 | ssize_t ret; | ||
812 | |||
813 | ret = fattr->store ? | ||
814 | fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), | ||
815 | buf, count) : | ||
816 | 0; | ||
817 | return ret; | ||
693 | } | 818 | } |
694 | 819 | ||
695 | static struct sysfs_ops sysfs_ops = { | 820 | static struct sysfs_ops sysfs_ops = { |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 726a5fcdf341..4b031a4ac856 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c | |||
@@ -860,7 +860,7 @@ error: | |||
860 | return err; | 860 | return err; |
861 | } | 861 | } |
862 | 862 | ||
863 | static void mce_remove_device(unsigned int cpu) | 863 | static __cpuinit void mce_remove_device(unsigned int cpu) |
864 | { | 864 | { |
865 | int i; | 865 | int i; |
866 | 866 | ||
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl new file mode 100644 index 000000000000..dfea390e1608 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.pl | |||
@@ -0,0 +1,32 @@ | |||
1 | #!/usr/bin/perl | ||
2 | # | ||
3 | # Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h | ||
4 | # | ||
5 | |||
6 | ($in, $out) = @ARGV; | ||
7 | |||
8 | open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; | ||
9 | open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; | ||
10 | |||
11 | print OUT "#include <asm/cpufeature.h>\n\n"; | ||
12 | print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; | ||
13 | |||
14 | while (defined($line = <IN>)) { | ||
15 | if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { | ||
16 | $macro = $1; | ||
17 | $feature = $2; | ||
18 | $tail = $3; | ||
19 | if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { | ||
20 | $feature = $1; | ||
21 | } | ||
22 | |||
23 | if ($feature ne '') { | ||
24 | printf OUT "\t%-32s = \"%s\",\n", | ||
25 | "[$macro]", "\L$feature"; | ||
26 | } | ||
27 | } | ||
28 | } | ||
29 | print OUT "};\n"; | ||
30 | |||
31 | close(IN); | ||
32 | close(OUT); | ||
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index cb7d3b6a80eb..4e8d77f01eeb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c | |||
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, | |||
401 | tmp |= ~((1<<(hi - 1)) - 1); | 401 | tmp |= ~((1<<(hi - 1)) - 1); |
402 | 402 | ||
403 | if (tmp != mask_lo) { | 403 | if (tmp != mask_lo) { |
404 | static int once = 1; | 404 | WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); |
405 | |||
406 | if (once) { | ||
407 | printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); | ||
408 | once = 0; | ||
409 | } | ||
410 | mask_lo = tmp; | 405 | mask_lo = tmp; |
411 | } | 406 | } |
412 | } | 407 | } |
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 84c480bb3715..4c4214690dd1 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c | |||
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) | |||
405 | } | 405 | } |
406 | /* RED-PEN: base can be > 32bit */ | 406 | /* RED-PEN: base can be > 32bit */ |
407 | len += seq_printf(seq, | 407 | len += seq_printf(seq, |
408 | "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", | 408 | "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", |
409 | i, base, base >> (20 - PAGE_SHIFT), size, factor, | 409 | i, base, base >> (20 - PAGE_SHIFT), size, factor, |
410 | mtrr_attrib_to_str(type), mtrr_usage_table[i]); | 410 | mtrr_usage_table[i], mtrr_attrib_to_str(type)); |
411 | } | 411 | } |
412 | } | 412 | } |
413 | return 0; | 413 | return 0; |
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b117d7f8a564..c78c04821ea1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c | |||
@@ -729,7 +729,7 @@ struct var_mtrr_range_state { | |||
729 | mtrr_type type; | 729 | mtrr_type type; |
730 | }; | 730 | }; |
731 | 731 | ||
732 | struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; | 732 | static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; |
733 | static int __initdata debug_print; | 733 | static int __initdata debug_print; |
734 | 734 | ||
735 | static int __init | 735 | static int __init |
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, | |||
759 | /* take out UC ranges */ | 759 | /* take out UC ranges */ |
760 | for (i = 0; i < num_var_ranges; i++) { | 760 | for (i = 0; i < num_var_ranges; i++) { |
761 | type = range_state[i].type; | 761 | type = range_state[i].type; |
762 | if (type != MTRR_TYPE_UNCACHABLE) | 762 | if (type != MTRR_TYPE_UNCACHABLE && |
763 | type != MTRR_TYPE_WRPROT) | ||
763 | continue; | 764 | continue; |
764 | size = range_state[i].size_pfn; | 765 | size = range_state[i].size_pfn; |
765 | if (!size) | 766 | if (!size) |
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str) | |||
834 | enable_mtrr_cleanup = 1; | 835 | enable_mtrr_cleanup = 1; |
835 | return 0; | 836 | return 0; |
836 | } | 837 | } |
837 | early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); | 838 | early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); |
839 | |||
840 | static int __init mtrr_cleanup_debug_setup(char *str) | ||
841 | { | ||
842 | debug_print = 1; | ||
843 | return 0; | ||
844 | } | ||
845 | early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); | ||
838 | 846 | ||
839 | struct var_mtrr_state { | 847 | struct var_mtrr_state { |
840 | unsigned long range_startk; | 848 | unsigned long range_startk; |
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits) | |||
898 | } | 906 | } |
899 | } | 907 | } |
900 | 908 | ||
909 | static unsigned long to_size_factor(unsigned long sizek, char *factorp) | ||
910 | { | ||
911 | char factor; | ||
912 | unsigned long base = sizek; | ||
913 | |||
914 | if (base & ((1<<10) - 1)) { | ||
915 | /* not MB alignment */ | ||
916 | factor = 'K'; | ||
917 | } else if (base & ((1<<20) - 1)){ | ||
918 | factor = 'M'; | ||
919 | base >>= 10; | ||
920 | } else { | ||
921 | factor = 'G'; | ||
922 | base >>= 20; | ||
923 | } | ||
924 | |||
925 | *factorp = factor; | ||
926 | |||
927 | return base; | ||
928 | } | ||
929 | |||
901 | static unsigned int __init | 930 | static unsigned int __init |
902 | range_to_mtrr(unsigned int reg, unsigned long range_startk, | 931 | range_to_mtrr(unsigned int reg, unsigned long range_startk, |
903 | unsigned long range_sizek, unsigned char type) | 932 | unsigned long range_sizek, unsigned char type) |
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, | |||
919 | align = max_align; | 948 | align = max_align; |
920 | 949 | ||
921 | sizek = 1 << align; | 950 | sizek = 1 << align; |
922 | if (debug_print) | 951 | if (debug_print) { |
952 | char start_factor = 'K', size_factor = 'K'; | ||
953 | unsigned long start_base, size_base; | ||
954 | |||
955 | start_base = to_size_factor(range_startk, &start_factor), | ||
956 | size_base = to_size_factor(sizek, &size_factor), | ||
957 | |||
923 | printk(KERN_DEBUG "Setting variable MTRR %d, " | 958 | printk(KERN_DEBUG "Setting variable MTRR %d, " |
924 | "base: %ldMB, range: %ldMB, type %s\n", | 959 | "base: %ld%cB, range: %ld%cB, type %s\n", |
925 | reg, range_startk >> 10, sizek >> 10, | 960 | reg, start_base, start_factor, |
961 | size_base, size_factor, | ||
926 | (type == MTRR_TYPE_UNCACHABLE)?"UC": | 962 | (type == MTRR_TYPE_UNCACHABLE)?"UC": |
927 | ((type == MTRR_TYPE_WRBACK)?"WB":"Other") | 963 | ((type == MTRR_TYPE_WRBACK)?"WB":"Other") |
928 | ); | 964 | ); |
965 | } | ||
929 | save_var_mtrr(reg++, range_startk, sizek, type); | 966 | save_var_mtrr(reg++, range_startk, sizek, type); |
930 | range_startk += sizek; | 967 | range_startk += sizek; |
931 | range_sizek -= sizek; | 968 | range_sizek -= sizek; |
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | |||
970 | /* try to append some small hole */ | 1007 | /* try to append some small hole */ |
971 | range0_basek = state->range_startk; | 1008 | range0_basek = state->range_startk; |
972 | range0_sizek = ALIGN(state->range_sizek, chunk_sizek); | 1009 | range0_sizek = ALIGN(state->range_sizek, chunk_sizek); |
1010 | |||
1011 | /* no increase */ | ||
973 | if (range0_sizek == state->range_sizek) { | 1012 | if (range0_sizek == state->range_sizek) { |
974 | if (debug_print) | 1013 | if (debug_print) |
975 | printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", | 1014 | printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", |
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | |||
980 | return 0; | 1019 | return 0; |
981 | } | 1020 | } |
982 | 1021 | ||
983 | range0_sizek -= chunk_sizek; | 1022 | /* only cut back, when it is not the last */ |
984 | if (range0_sizek && sizek) { | 1023 | if (sizek) { |
985 | while (range0_basek + range0_sizek > (basek + sizek)) { | 1024 | while (range0_basek + range0_sizek > (basek + sizek)) { |
986 | range0_sizek -= chunk_sizek; | 1025 | if (range0_sizek >= chunk_sizek) |
987 | if (!range0_sizek) | 1026 | range0_sizek -= chunk_sizek; |
988 | break; | 1027 | else |
989 | } | 1028 | range0_sizek = 0; |
1029 | |||
1030 | if (!range0_sizek) | ||
1031 | break; | ||
1032 | } | ||
1033 | } | ||
1034 | |||
1035 | second_try: | ||
1036 | range_basek = range0_basek + range0_sizek; | ||
1037 | |||
1038 | /* one hole in the middle */ | ||
1039 | if (range_basek > basek && range_basek <= (basek + sizek)) | ||
1040 | second_sizek = range_basek - basek; | ||
1041 | |||
1042 | if (range0_sizek > state->range_sizek) { | ||
1043 | |||
1044 | /* one hole in middle or at end */ | ||
1045 | hole_sizek = range0_sizek - state->range_sizek - second_sizek; | ||
1046 | |||
1047 | /* hole size should be less than half of range0 size */ | ||
1048 | if (hole_sizek >= (range0_sizek >> 1) && | ||
1049 | range0_sizek >= chunk_sizek) { | ||
1050 | range0_sizek -= chunk_sizek; | ||
1051 | second_sizek = 0; | ||
1052 | hole_sizek = 0; | ||
1053 | |||
1054 | goto second_try; | ||
1055 | } | ||
990 | } | 1056 | } |
991 | 1057 | ||
992 | if (range0_sizek) { | 1058 | if (range0_sizek) { |
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, | |||
996 | (range0_basek + range0_sizek)<<10); | 1062 | (range0_basek + range0_sizek)<<10); |
997 | state->reg = range_to_mtrr(state->reg, range0_basek, | 1063 | state->reg = range_to_mtrr(state->reg, range0_basek, |
998 | range0_sizek, MTRR_TYPE_WRBACK); | 1064 | range0_sizek, MTRR_TYPE_WRBACK); |
999 | |||
1000 | } | ||
1001 | |||
1002 | range_basek = range0_basek + range0_sizek; | ||
1003 | range_sizek = chunk_sizek; | ||
1004 | |||
1005 | if (range_basek + range_sizek > basek && | ||
1006 | range_basek + range_sizek <= (basek + sizek)) { | ||
1007 | /* one hole */ | ||
1008 | second_basek = basek; | ||
1009 | second_sizek = range_basek + range_sizek - basek; | ||
1010 | } | 1065 | } |
1011 | 1066 | ||
1012 | /* if last piece, only could one hole near end */ | 1067 | if (range0_sizek < state->range_sizek) { |
1013 | if ((second_basek || !basek) && | 1068 | /* need to handle left over */ |
1014 | range_sizek - (state->range_sizek - range0_sizek) - second_sizek < | ||
1015 | (chunk_sizek >> 1)) { | ||
1016 | /* | ||
1017 | * one hole in middle (second_sizek is 0) or at end | ||
1018 | * (second_sizek is 0 ) | ||
1019 | */ | ||
1020 | hole_sizek = range_sizek - (state->range_sizek - range0_sizek) | ||
1021 | - second_sizek; | ||
1022 | hole_basek = range_basek + range_sizek - hole_sizek | ||
1023 | - second_sizek; | ||
1024 | } else { | ||
1025 | /* fallback for big hole, or several holes */ | ||
1026 | range_sizek = state->range_sizek - range0_sizek; | 1069 | range_sizek = state->range_sizek - range0_sizek; |
1027 | second_basek = 0; | 1070 | |
1028 | second_sizek = 0; | 1071 | if (debug_print) |
1072 | printk(KERN_DEBUG "range: %016lx - %016lx\n", | ||
1073 | range_basek<<10, | ||
1074 | (range_basek + range_sizek)<<10); | ||
1075 | state->reg = range_to_mtrr(state->reg, range_basek, | ||
1076 | range_sizek, MTRR_TYPE_WRBACK); | ||
1029 | } | 1077 | } |
1030 | 1078 | ||
1031 | if (debug_print) | ||
1032 | printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, | ||
1033 | (range_basek + range_sizek)<<10); | ||
1034 | state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, | ||
1035 | MTRR_TYPE_WRBACK); | ||
1036 | if (hole_sizek) { | 1079 | if (hole_sizek) { |
1080 | hole_basek = range_basek - hole_sizek - second_sizek; | ||
1037 | if (debug_print) | 1081 | if (debug_print) |
1038 | printk(KERN_DEBUG "hole: %016lx - %016lx\n", | 1082 | printk(KERN_DEBUG "hole: %016lx - %016lx\n", |
1039 | hole_basek<<10, (hole_basek + hole_sizek)<<10); | 1083 | hole_basek<<10, |
1040 | state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, | 1084 | (hole_basek + hole_sizek)<<10); |
1041 | MTRR_TYPE_UNCACHABLE); | 1085 | state->reg = range_to_mtrr(state->reg, hole_basek, |
1042 | 1086 | hole_sizek, MTRR_TYPE_UNCACHABLE); | |
1043 | } | 1087 | } |
1044 | 1088 | ||
1045 | return second_sizek; | 1089 | return second_sizek; |
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result { | |||
1154 | }; | 1198 | }; |
1155 | 1199 | ||
1156 | /* | 1200 | /* |
1157 | * gran_size: 1M, 2M, ..., 2G | 1201 | * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G |
1158 | * chunk size: gran_size, ..., 4G | 1202 | * chunk size: gran_size, ..., 2G |
1159 | * so we need (2+13)*6 | 1203 | * so we need (1+16)*8 |
1160 | */ | 1204 | */ |
1161 | #define NUM_RESULT 90 | 1205 | #define NUM_RESULT 136 |
1162 | #define PSHIFT (PAGE_SHIFT - 10) | 1206 | #define PSHIFT (PAGE_SHIFT - 10) |
1163 | 1207 | ||
1164 | static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; | 1208 | static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; |
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; | |||
1168 | static int __init mtrr_cleanup(unsigned address_bits) | 1212 | static int __init mtrr_cleanup(unsigned address_bits) |
1169 | { | 1213 | { |
1170 | unsigned long extra_remove_base, extra_remove_size; | 1214 | unsigned long extra_remove_base, extra_remove_size; |
1171 | unsigned long i, base, size, def, dummy; | 1215 | unsigned long base, size, def, dummy; |
1172 | mtrr_type type; | 1216 | mtrr_type type; |
1173 | int nr_range, nr_range_new; | 1217 | int nr_range, nr_range_new; |
1174 | u64 chunk_size, gran_size; | 1218 | u64 chunk_size, gran_size; |
1175 | unsigned long range_sums, range_sums_new; | 1219 | unsigned long range_sums, range_sums_new; |
1176 | int index_good; | 1220 | int index_good; |
1177 | int num_reg_good; | 1221 | int num_reg_good; |
1222 | int i; | ||
1178 | 1223 | ||
1179 | /* extra one for all 0 */ | 1224 | /* extra one for all 0 */ |
1180 | int num[MTRR_NUM_TYPES + 1]; | 1225 | int num[MTRR_NUM_TYPES + 1]; |
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits) | |||
1204 | continue; | 1249 | continue; |
1205 | if (!size) | 1250 | if (!size) |
1206 | type = MTRR_NUM_TYPES; | 1251 | type = MTRR_NUM_TYPES; |
1252 | if (type == MTRR_TYPE_WRPROT) | ||
1253 | type = MTRR_TYPE_UNCACHABLE; | ||
1207 | num[type]++; | 1254 | num[type]++; |
1208 | } | 1255 | } |
1209 | 1256 | ||
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits) | |||
1216 | num_var_ranges - num[MTRR_NUM_TYPES]) | 1263 | num_var_ranges - num[MTRR_NUM_TYPES]) |
1217 | return 0; | 1264 | return 0; |
1218 | 1265 | ||
1266 | /* print original var MTRRs at first, for debugging: */ | ||
1267 | printk(KERN_DEBUG "original variable MTRRs\n"); | ||
1268 | for (i = 0; i < num_var_ranges; i++) { | ||
1269 | char start_factor = 'K', size_factor = 'K'; | ||
1270 | unsigned long start_base, size_base; | ||
1271 | |||
1272 | size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); | ||
1273 | if (!size_base) | ||
1274 | continue; | ||
1275 | |||
1276 | size_base = to_size_factor(size_base, &size_factor), | ||
1277 | start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); | ||
1278 | start_base = to_size_factor(start_base, &start_factor), | ||
1279 | type = range_state[i].type; | ||
1280 | |||
1281 | printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", | ||
1282 | i, start_base, start_factor, | ||
1283 | size_base, size_factor, | ||
1284 | (type == MTRR_TYPE_UNCACHABLE) ? "UC" : | ||
1285 | ((type == MTRR_TYPE_WRPROT) ? "WP" : | ||
1286 | ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) | ||
1287 | ); | ||
1288 | } | ||
1289 | |||
1219 | memset(range, 0, sizeof(range)); | 1290 | memset(range, 0, sizeof(range)); |
1220 | extra_remove_size = 0; | 1291 | extra_remove_size = 0; |
1221 | if (mtrr_tom2) { | 1292 | extra_remove_base = 1 << (32 - PAGE_SHIFT); |
1222 | extra_remove_base = 1 << (32 - PAGE_SHIFT); | 1293 | if (mtrr_tom2) |
1223 | extra_remove_size = | 1294 | extra_remove_size = |
1224 | (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; | 1295 | (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; |
1225 | } | ||
1226 | nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, | 1296 | nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, |
1227 | extra_remove_size); | 1297 | extra_remove_size); |
1298 | /* | ||
1299 | * [0, 1M) should always be coverred by var mtrr with WB | ||
1300 | * and fixed mtrrs should take effective before var mtrr for it | ||
1301 | */ | ||
1302 | nr_range = add_range_with_merge(range, nr_range, 0, | ||
1303 | (1ULL<<(20 - PAGE_SHIFT)) - 1); | ||
1304 | /* sort the ranges */ | ||
1305 | sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); | ||
1306 | |||
1228 | range_sums = sum_ranges(range, nr_range); | 1307 | range_sums = sum_ranges(range, nr_range); |
1229 | printk(KERN_INFO "total RAM coverred: %ldM\n", | 1308 | printk(KERN_INFO "total RAM coverred: %ldM\n", |
1230 | range_sums >> (20 - PAGE_SHIFT)); | 1309 | range_sums >> (20 - PAGE_SHIFT)); |
1231 | 1310 | ||
1232 | if (mtrr_chunk_size && mtrr_gran_size) { | 1311 | if (mtrr_chunk_size && mtrr_gran_size) { |
1233 | int num_reg; | 1312 | int num_reg; |
1313 | char gran_factor, chunk_factor, lose_factor; | ||
1314 | unsigned long gran_base, chunk_base, lose_base; | ||
1234 | 1315 | ||
1235 | debug_print = 1; | 1316 | debug_print++; |
1236 | /* convert ranges to var ranges state */ | 1317 | /* convert ranges to var ranges state */ |
1237 | num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, | 1318 | num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, |
1238 | mtrr_gran_size); | 1319 | mtrr_gran_size); |
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits) | |||
1256 | result[i].lose_cover_sizek = | 1337 | result[i].lose_cover_sizek = |
1257 | (range_sums - range_sums_new) << PSHIFT; | 1338 | (range_sums - range_sums_new) << PSHIFT; |
1258 | 1339 | ||
1259 | printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", | 1340 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), |
1260 | result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, | 1341 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), |
1261 | result[i].chunk_sizek >> 10); | 1342 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), |
1262 | printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", | 1343 | printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", |
1344 | result[i].bad?"*BAD*":" ", | ||
1345 | gran_base, gran_factor, chunk_base, chunk_factor); | ||
1346 | printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", | ||
1263 | result[i].num_reg, result[i].bad?"-":"", | 1347 | result[i].num_reg, result[i].bad?"-":"", |
1264 | result[i].lose_cover_sizek >> 10); | 1348 | lose_base, lose_factor); |
1265 | if (!result[i].bad) { | 1349 | if (!result[i].bad) { |
1266 | set_var_mtrr_all(address_bits); | 1350 | set_var_mtrr_all(address_bits); |
1267 | return 1; | 1351 | return 1; |
1268 | } | 1352 | } |
1269 | printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " | 1353 | printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " |
1270 | "will find optimal one\n"); | 1354 | "will find optimal one\n"); |
1271 | debug_print = 0; | 1355 | debug_print--; |
1272 | memset(result, 0, sizeof(result[0])); | 1356 | memset(result, 0, sizeof(result[0])); |
1273 | } | 1357 | } |
1274 | 1358 | ||
1275 | i = 0; | 1359 | i = 0; |
1276 | memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); | 1360 | memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); |
1277 | memset(result, 0, sizeof(result)); | 1361 | memset(result, 0, sizeof(result)); |
1278 | for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { | 1362 | for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { |
1279 | for (chunk_size = gran_size; chunk_size < (1ULL<<33); | 1363 | char gran_factor; |
1364 | unsigned long gran_base; | ||
1365 | |||
1366 | if (debug_print) | ||
1367 | gran_base = to_size_factor(gran_size >> 10, &gran_factor); | ||
1368 | |||
1369 | for (chunk_size = gran_size; chunk_size < (1ULL<<32); | ||
1280 | chunk_size <<= 1) { | 1370 | chunk_size <<= 1) { |
1281 | int num_reg; | 1371 | int num_reg; |
1282 | 1372 | ||
1283 | if (debug_print) | 1373 | if (debug_print) { |
1284 | printk(KERN_INFO | 1374 | char chunk_factor; |
1285 | "\ngran_size: %lldM chunk_size_size: %lldM\n", | 1375 | unsigned long chunk_base; |
1286 | gran_size >> 20, chunk_size >> 20); | 1376 | |
1377 | chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), | ||
1378 | printk(KERN_INFO "\n"); | ||
1379 | printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", | ||
1380 | gran_base, gran_factor, chunk_base, chunk_factor); | ||
1381 | } | ||
1287 | if (i >= NUM_RESULT) | 1382 | if (i >= NUM_RESULT) |
1288 | continue; | 1383 | continue; |
1289 | 1384 | ||
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits) | |||
1326 | 1421 | ||
1327 | /* print out all */ | 1422 | /* print out all */ |
1328 | for (i = 0; i < NUM_RESULT; i++) { | 1423 | for (i = 0; i < NUM_RESULT; i++) { |
1329 | printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", | 1424 | char gran_factor, chunk_factor, lose_factor; |
1330 | result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, | 1425 | unsigned long gran_base, chunk_base, lose_base; |
1331 | result[i].chunk_sizek >> 10); | 1426 | |
1332 | printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", | 1427 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), |
1333 | result[i].num_reg, result[i].bad?"-":"", | 1428 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), |
1334 | result[i].lose_cover_sizek >> 10); | 1429 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), |
1430 | printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", | ||
1431 | result[i].bad?"*BAD*":" ", | ||
1432 | gran_base, gran_factor, chunk_base, chunk_factor); | ||
1433 | printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", | ||
1434 | result[i].num_reg, result[i].bad?"-":"", | ||
1435 | lose_base, lose_factor); | ||
1335 | } | 1436 | } |
1336 | 1437 | ||
1337 | /* try to find the optimal index */ | 1438 | /* try to find the optimal index */ |
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits) | |||
1339 | nr_mtrr_spare_reg = num_var_ranges - 1; | 1440 | nr_mtrr_spare_reg = num_var_ranges - 1; |
1340 | num_reg_good = -1; | 1441 | num_reg_good = -1; |
1341 | for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { | 1442 | for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { |
1342 | if (!min_loss_pfn[i]) { | 1443 | if (!min_loss_pfn[i]) |
1343 | num_reg_good = i; | 1444 | num_reg_good = i; |
1344 | break; | ||
1345 | } | ||
1346 | } | 1445 | } |
1347 | 1446 | ||
1348 | index_good = -1; | 1447 | index_good = -1; |
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits) | |||
1358 | } | 1457 | } |
1359 | 1458 | ||
1360 | if (index_good != -1) { | 1459 | if (index_good != -1) { |
1460 | char gran_factor, chunk_factor, lose_factor; | ||
1461 | unsigned long gran_base, chunk_base, lose_base; | ||
1462 | |||
1361 | printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); | 1463 | printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); |
1362 | i = index_good; | 1464 | i = index_good; |
1363 | printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", | 1465 | gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), |
1364 | result[i].gran_sizek >> 10, | 1466 | chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), |
1365 | result[i].chunk_sizek >> 10); | 1467 | lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), |
1366 | printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", | 1468 | printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", |
1367 | result[i].num_reg, | 1469 | gran_base, gran_factor, chunk_base, chunk_factor); |
1368 | result[i].lose_cover_sizek >> 10); | 1470 | printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", |
1471 | result[i].num_reg, lose_base, lose_factor); | ||
1369 | /* convert ranges to var ranges state */ | 1472 | /* convert ranges to var ranges state */ |
1370 | chunk_size = result[i].chunk_sizek; | 1473 | chunk_size = result[i].chunk_sizek; |
1371 | chunk_size <<= 10; | 1474 | chunk_size <<= 10; |
1372 | gran_size = result[i].gran_sizek; | 1475 | gran_size = result[i].gran_sizek; |
1373 | gran_size <<= 10; | 1476 | gran_size <<= 10; |
1374 | debug_print = 1; | 1477 | debug_print++; |
1375 | x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); | 1478 | x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); |
1479 | debug_print--; | ||
1376 | set_var_mtrr_all(address_bits); | 1480 | set_var_mtrr_all(address_bits); |
1377 | return 1; | 1481 | return 1; |
1378 | } | 1482 | } |
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 05cc22dbd4ff..6bff382094f5 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz) | |||
295 | /* setup the timer */ | 295 | /* setup the timer */ |
296 | wrmsr(evntsel_msr, evntsel, 0); | 296 | wrmsr(evntsel_msr, evntsel, 0); |
297 | write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); | 297 | write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); |
298 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
299 | evntsel |= K7_EVNTSEL_ENABLE; | ||
300 | wrmsr(evntsel_msr, evntsel, 0); | ||
301 | 298 | ||
299 | /* initialize the wd struct before enabling */ | ||
302 | wd->perfctr_msr = perfctr_msr; | 300 | wd->perfctr_msr = perfctr_msr; |
303 | wd->evntsel_msr = evntsel_msr; | 301 | wd->evntsel_msr = evntsel_msr; |
304 | wd->cccr_msr = 0; /* unused */ | 302 | wd->cccr_msr = 0; /* unused */ |
303 | |||
304 | /* ok, everything is initialized, announce that we're set */ | ||
305 | cpu_nmi_set_wd_enabled(); | ||
306 | |||
307 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
308 | evntsel |= K7_EVNTSEL_ENABLE; | ||
309 | wrmsr(evntsel_msr, evntsel, 0); | ||
310 | |||
305 | return 1; | 311 | return 1; |
306 | } | 312 | } |
307 | 313 | ||
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz) | |||
379 | wrmsr(evntsel_msr, evntsel, 0); | 385 | wrmsr(evntsel_msr, evntsel, 0); |
380 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); | 386 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); |
381 | write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); | 387 | write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); |
382 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
383 | evntsel |= P6_EVNTSEL0_ENABLE; | ||
384 | wrmsr(evntsel_msr, evntsel, 0); | ||
385 | 388 | ||
389 | /* initialize the wd struct before enabling */ | ||
386 | wd->perfctr_msr = perfctr_msr; | 390 | wd->perfctr_msr = perfctr_msr; |
387 | wd->evntsel_msr = evntsel_msr; | 391 | wd->evntsel_msr = evntsel_msr; |
388 | wd->cccr_msr = 0; /* unused */ | 392 | wd->cccr_msr = 0; /* unused */ |
393 | |||
394 | /* ok, everything is initialized, announce that we're set */ | ||
395 | cpu_nmi_set_wd_enabled(); | ||
396 | |||
397 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
398 | evntsel |= P6_EVNTSEL0_ENABLE; | ||
399 | wrmsr(evntsel_msr, evntsel, 0); | ||
400 | |||
389 | return 1; | 401 | return 1; |
390 | } | 402 | } |
391 | 403 | ||
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = { | |||
432 | #define P4_CCCR_ENABLE (1 << 12) | 444 | #define P4_CCCR_ENABLE (1 << 12) |
433 | #define P4_CCCR_OVF (1 << 31) | 445 | #define P4_CCCR_OVF (1 << 31) |
434 | 446 | ||
447 | #define P4_CONTROLS 18 | ||
448 | static unsigned int p4_controls[18] = { | ||
449 | MSR_P4_BPU_CCCR0, | ||
450 | MSR_P4_BPU_CCCR1, | ||
451 | MSR_P4_BPU_CCCR2, | ||
452 | MSR_P4_BPU_CCCR3, | ||
453 | MSR_P4_MS_CCCR0, | ||
454 | MSR_P4_MS_CCCR1, | ||
455 | MSR_P4_MS_CCCR2, | ||
456 | MSR_P4_MS_CCCR3, | ||
457 | MSR_P4_FLAME_CCCR0, | ||
458 | MSR_P4_FLAME_CCCR1, | ||
459 | MSR_P4_FLAME_CCCR2, | ||
460 | MSR_P4_FLAME_CCCR3, | ||
461 | MSR_P4_IQ_CCCR0, | ||
462 | MSR_P4_IQ_CCCR1, | ||
463 | MSR_P4_IQ_CCCR2, | ||
464 | MSR_P4_IQ_CCCR3, | ||
465 | MSR_P4_IQ_CCCR4, | ||
466 | MSR_P4_IQ_CCCR5, | ||
467 | }; | ||
435 | /* | 468 | /* |
436 | * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter | 469 | * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter |
437 | * CRU_ESCR0 (with any non-null event selector) through a complemented | 470 | * CRU_ESCR0 (with any non-null event selector) through a complemented |
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz) | |||
473 | evntsel_msr = MSR_P4_CRU_ESCR0; | 506 | evntsel_msr = MSR_P4_CRU_ESCR0; |
474 | cccr_msr = MSR_P4_IQ_CCCR0; | 507 | cccr_msr = MSR_P4_IQ_CCCR0; |
475 | cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); | 508 | cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); |
509 | |||
510 | /* | ||
511 | * If we're on the kdump kernel or other situation, we may | ||
512 | * still have other performance counter registers set to | ||
513 | * interrupt and they'll keep interrupting forever because | ||
514 | * of the P4_CCCR_OVF quirk. So we need to ACK all the | ||
515 | * pending interrupts and disable all the registers here, | ||
516 | * before reenabling the NMI delivery. Refer to p4_rearm() | ||
517 | * about the P4_CCCR_OVF quirk. | ||
518 | */ | ||
519 | if (reset_devices) { | ||
520 | unsigned int low, high; | ||
521 | int i; | ||
522 | |||
523 | for (i = 0; i < P4_CONTROLS; i++) { | ||
524 | rdmsr(p4_controls[i], low, high); | ||
525 | low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); | ||
526 | wrmsr(p4_controls[i], low, high); | ||
527 | } | ||
528 | } | ||
476 | } else { | 529 | } else { |
477 | /* logical cpu 1 */ | 530 | /* logical cpu 1 */ |
478 | perfctr_msr = MSR_P4_IQ_PERFCTR1; | 531 | perfctr_msr = MSR_P4_IQ_PERFCTR1; |
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz) | |||
499 | wrmsr(evntsel_msr, evntsel, 0); | 552 | wrmsr(evntsel_msr, evntsel, 0); |
500 | wrmsr(cccr_msr, cccr_val, 0); | 553 | wrmsr(cccr_msr, cccr_val, 0); |
501 | write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); | 554 | write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); |
502 | apic_write(APIC_LVTPC, APIC_DM_NMI); | 555 | |
503 | cccr_val |= P4_CCCR_ENABLE; | ||
504 | wrmsr(cccr_msr, cccr_val, 0); | ||
505 | wd->perfctr_msr = perfctr_msr; | 556 | wd->perfctr_msr = perfctr_msr; |
506 | wd->evntsel_msr = evntsel_msr; | 557 | wd->evntsel_msr = evntsel_msr; |
507 | wd->cccr_msr = cccr_msr; | 558 | wd->cccr_msr = cccr_msr; |
559 | |||
560 | /* ok, everything is initialized, announce that we're set */ | ||
561 | cpu_nmi_set_wd_enabled(); | ||
562 | |||
563 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
564 | cccr_val |= P4_CCCR_ENABLE; | ||
565 | wrmsr(cccr_msr, cccr_val, 0); | ||
508 | return 1; | 566 | return 1; |
509 | } | 567 | } |
510 | 568 | ||
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) | |||
620 | wrmsr(evntsel_msr, evntsel, 0); | 678 | wrmsr(evntsel_msr, evntsel, 0); |
621 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); | 679 | nmi_hz = adjust_for_32bit_ctr(nmi_hz); |
622 | write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); | 680 | write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); |
623 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
624 | evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
625 | wrmsr(evntsel_msr, evntsel, 0); | ||
626 | 681 | ||
627 | wd->perfctr_msr = perfctr_msr; | 682 | wd->perfctr_msr = perfctr_msr; |
628 | wd->evntsel_msr = evntsel_msr; | 683 | wd->evntsel_msr = evntsel_msr; |
629 | wd->cccr_msr = 0; /* unused */ | 684 | wd->cccr_msr = 0; /* unused */ |
685 | |||
686 | /* ok, everything is initialized, announce that we're set */ | ||
687 | cpu_nmi_set_wd_enabled(); | ||
688 | |||
689 | apic_write(APIC_LVTPC, APIC_DM_NMI); | ||
690 | evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; | ||
691 | wrmsr(evntsel_msr, evntsel, 0); | ||
630 | intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); | 692 | intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); |
631 | return 1; | 693 | return 1; |
632 | } | 694 | } |
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 000000000000..5abbea297e0c --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c | |||
@@ -0,0 +1,20 @@ | |||
1 | /* | ||
2 | * Strings for the various x86 power flags | ||
3 | * | ||
4 | * This file must not contain any executable code. | ||
5 | */ | ||
6 | |||
7 | #include <asm/cpufeature.h> | ||
8 | |||
9 | const char *const x86_power_flags[32] = { | ||
10 | "ts", /* temperature sensor */ | ||
11 | "fid", /* frequency id control */ | ||
12 | "vid", /* voltage id control */ | ||
13 | "ttp", /* thermal trip */ | ||
14 | "tm", | ||
15 | "stc", | ||
16 | "100mhzsteps", | ||
17 | "hwpstate", | ||
18 | "", /* tsc invariant mapped to constant_tsc */ | ||
19 | /* nothing */ | ||
20 | }; | ||
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index b911a2c61b8f..52b3fefbd5af 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c | |||
@@ -5,6 +5,18 @@ | |||
5 | #include <asm/msr.h> | 5 | #include <asm/msr.h> |
6 | #include "cpu.h" | 6 | #include "cpu.h" |
7 | 7 | ||
8 | static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) | ||
9 | { | ||
10 | u32 xlvl; | ||
11 | |||
12 | /* Transmeta-defined flags: level 0x80860001 */ | ||
13 | xlvl = cpuid_eax(0x80860000); | ||
14 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
15 | if (xlvl >= 0x80860001) | ||
16 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
17 | } | ||
18 | } | ||
19 | |||
8 | static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) | 20 | static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) |
9 | { | 21 | { |
10 | unsigned int cap_mask, uk, max, dummy; | 22 | unsigned int cap_mask, uk, max, dummy; |
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) | |||
12 | unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; | 24 | unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; |
13 | char cpu_info[65]; | 25 | char cpu_info[65]; |
14 | 26 | ||
15 | get_model_name(c); /* Same as AMD/Cyrix */ | 27 | early_init_transmeta(c); |
28 | |||
16 | display_cacheinfo(c); | 29 | display_cacheinfo(c); |
17 | 30 | ||
18 | /* Print CMS and CPU revision */ | 31 | /* Print CMS and CPU revision */ |
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) | |||
85 | #endif | 98 | #endif |
86 | } | 99 | } |
87 | 100 | ||
88 | static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c) | ||
89 | { | ||
90 | u32 xlvl; | ||
91 | |||
92 | /* Transmeta-defined flags: level 0x80860001 */ | ||
93 | xlvl = cpuid_eax(0x80860000); | ||
94 | if ((xlvl & 0xffff0000) == 0x80860000) { | ||
95 | if (xlvl >= 0x80860001) | ||
96 | c->x86_capability[2] = cpuid_edx(0x80860001); | ||
97 | } | ||
98 | } | ||
99 | |||
100 | static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { | 101 | static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { |
101 | .c_vendor = "Transmeta", | 102 | .c_vendor = "Transmeta", |
102 | .c_ident = { "GenuineTMx86", "TransmetaCPU" }, | 103 | .c_ident = { "GenuineTMx86", "TransmetaCPU" }, |
104 | .c_early_init = early_init_transmeta, | ||
103 | .c_init = init_transmeta, | 105 | .c_init = init_transmeta, |
104 | .c_identify = transmeta_identify, | 106 | .c_x86_vendor = X86_VENDOR_TRANSMETA, |
105 | }; | 107 | }; |
106 | 108 | ||
107 | cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); | 109 | cpu_dev_register(transmeta_cpu_dev); |
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index b1fc90989d75..e777f79e0960 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c | |||
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = { | |||
19 | } | 19 | } |
20 | }, | 20 | }, |
21 | }, | 21 | }, |
22 | .c_x86_vendor = X86_VENDOR_UMC, | ||
22 | }; | 23 | }; |
23 | 24 | ||
24 | cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); | 25 | cpu_dev_register(umc_cpu_dev); |
25 | 26 | ||
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 8e9cd6a8ec12..72cefd1e649b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -36,7 +36,6 @@ | |||
36 | #include <linux/smp_lock.h> | 36 | #include <linux/smp_lock.h> |
37 | #include <linux/major.h> | 37 | #include <linux/major.h> |
38 | #include <linux/fs.h> | 38 | #include <linux/fs.h> |
39 | #include <linux/smp_lock.h> | ||
40 | #include <linux/device.h> | 39 | #include <linux/device.h> |
41 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
@@ -148,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu) | |||
148 | { | 147 | { |
149 | struct device *dev; | 148 | struct device *dev; |
150 | 149 | ||
151 | dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), | 150 | dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, |
152 | NULL, "cpu%d", cpu); | 151 | "cpu%d", cpu); |
153 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 152 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
154 | } | 153 | } |
155 | 154 | ||
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 15e6c6bc4a46..e90a60ef10c2 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c | |||
@@ -7,9 +7,8 @@ | |||
7 | 7 | ||
8 | #include <linux/errno.h> | 8 | #include <linux/errno.h> |
9 | #include <linux/crash_dump.h> | 9 | #include <linux/crash_dump.h> |
10 | 10 | #include <linux/uaccess.h> | |
11 | #include <asm/uaccess.h> | 11 | #include <linux/io.h> |
12 | #include <asm/io.h> | ||
13 | 12 | ||
14 | /** | 13 | /** |
15 | * copy_oldmem_page - copy one page from "oldmem" | 14 | * copy_oldmem_page - copy one page from "oldmem" |
@@ -25,7 +24,7 @@ | |||
25 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | 24 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. |
26 | */ | 25 | */ |
27 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | 26 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, |
28 | size_t csize, unsigned long offset, int userbuf) | 27 | size_t csize, unsigned long offset, int userbuf) |
29 | { | 28 | { |
30 | void *vaddr; | 29 | void *vaddr; |
31 | 30 | ||
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | |||
33 | return 0; | 32 | return 0; |
34 | 33 | ||
35 | vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); | 34 | vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); |
35 | if (!vaddr) | ||
36 | return -ENOMEM; | ||
36 | 37 | ||
37 | if (userbuf) { | 38 | if (userbuf) { |
38 | if (copy_to_user(buf, (vaddr + offset), csize)) { | 39 | if (copy_to_user(buf, vaddr + offset, csize)) { |
39 | iounmap(vaddr); | 40 | iounmap(vaddr); |
40 | return -EFAULT; | 41 | return -EFAULT; |
41 | } | 42 | } |
42 | } else | 43 | } else |
43 | memcpy(buf, (vaddr + offset), csize); | 44 | memcpy(buf, vaddr + offset, csize); |
44 | 45 | ||
45 | iounmap(vaddr); | 46 | iounmap(vaddr); |
46 | return csize; | 47 | return csize; |
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index a47798b59f07..b4f14c6c09d9 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c | |||
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { | |||
66 | .ds = __USER_DS, | 66 | .ds = __USER_DS, |
67 | .fs = __KERNEL_PERCPU, | 67 | .fs = __KERNEL_PERCPU, |
68 | 68 | ||
69 | .__cr3 = __pa(swapper_pg_dir) | 69 | .__cr3 = __pa_nodebug(swapper_pg_dir), |
70 | } | 70 | } |
71 | }; | 71 | }; |
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 11c11b8ec48d..2b69994fd3a8 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c | |||
@@ -2,26 +2,49 @@ | |||
2 | * Debug Store support | 2 | * Debug Store support |
3 | * | 3 | * |
4 | * This provides a low-level interface to the hardware's Debug Store | 4 | * This provides a low-level interface to the hardware's Debug Store |
5 | * feature that is used for last branch recording (LBR) and | 5 | * feature that is used for branch trace store (BTS) and |
6 | * precise-event based sampling (PEBS). | 6 | * precise-event based sampling (PEBS). |
7 | * | 7 | * |
8 | * Different architectures use a different DS layout/pointer size. | 8 | * It manages: |
9 | * The below functions therefore work on a void*. | 9 | * - per-thread and per-cpu allocation of BTS and PEBS |
10 | * - buffer memory allocation (optional) | ||
11 | * - buffer overflow handling | ||
12 | * - buffer access | ||
10 | * | 13 | * |
14 | * It assumes: | ||
15 | * - get_task_struct on all parameter tasks | ||
16 | * - current is allowed to trace parameter tasks | ||
11 | * | 17 | * |
12 | * Since there is no user for PEBS, yet, only LBR (or branch | ||
13 | * trace store, BTS) is supported. | ||
14 | * | 18 | * |
15 | * | 19 | * Copyright (C) 2007-2008 Intel Corporation. |
16 | * Copyright (C) 2007 Intel Corporation. | 20 | * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 |
17 | * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 | ||
18 | */ | 21 | */ |
19 | 22 | ||
23 | |||
24 | #ifdef CONFIG_X86_DS | ||
25 | |||
20 | #include <asm/ds.h> | 26 | #include <asm/ds.h> |
21 | 27 | ||
22 | #include <linux/errno.h> | 28 | #include <linux/errno.h> |
23 | #include <linux/string.h> | 29 | #include <linux/string.h> |
24 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/sched.h> | ||
32 | #include <linux/mm.h> | ||
33 | |||
34 | |||
35 | /* | ||
36 | * The configuration for a particular DS hardware implementation. | ||
37 | */ | ||
38 | struct ds_configuration { | ||
39 | /* the size of the DS structure in bytes */ | ||
40 | unsigned char sizeof_ds; | ||
41 | /* the size of one pointer-typed field in the DS structure in bytes; | ||
42 | this covers the first 8 fields related to buffer management. */ | ||
43 | unsigned char sizeof_field; | ||
44 | /* the size of a BTS/PEBS record in bytes */ | ||
45 | unsigned char sizeof_rec[2]; | ||
46 | }; | ||
47 | static struct ds_configuration ds_cfg; | ||
25 | 48 | ||
26 | 49 | ||
27 | /* | 50 | /* |
@@ -44,378 +67,747 @@ | |||
44 | * (interrupt occurs when write pointer passes interrupt pointer) | 67 | * (interrupt occurs when write pointer passes interrupt pointer) |
45 | * - value to which counter is reset following counter overflow | 68 | * - value to which counter is reset following counter overflow |
46 | * | 69 | * |
47 | * On later architectures, the last branch recording hardware uses | 70 | * Later architectures use 64bit pointers throughout, whereas earlier |
48 | * 64bit pointers even in 32bit mode. | 71 | * architectures use 32bit pointers in 32bit mode. |
49 | * | ||
50 | * | ||
51 | * Branch Trace Store (BTS) records store information about control | ||
52 | * flow changes. They at least provide the following information: | ||
53 | * - source linear address | ||
54 | * - destination linear address | ||
55 | * | 72 | * |
56 | * Netburst supported a predicated bit that had been dropped in later | ||
57 | * architectures. We do not suppor it. | ||
58 | * | 73 | * |
74 | * We compute the base address for the first 8 fields based on: | ||
75 | * - the field size stored in the DS configuration | ||
76 | * - the relative field position | ||
77 | * - an offset giving the start of the respective region | ||
59 | * | 78 | * |
60 | * In order to abstract from the actual DS and BTS layout, we describe | 79 | * This offset is further used to index various arrays holding |
61 | * the access to the relevant fields. | 80 | * information for BTS and PEBS at the respective index. |
62 | * Thanks to Andi Kleen for proposing this design. | ||
63 | * | 81 | * |
64 | * The implementation, however, is not as general as it might seem. In | 82 | * On later 32bit processors, we only access the lower 32bit of the |
65 | * order to stay somewhat simple and efficient, we assume an | 83 | * 64bit pointer fields. The upper halves will be zeroed out. |
66 | * underlying unsigned type (mostly a pointer type) and we expect the | ||
67 | * field to be at least as big as that type. | ||
68 | */ | 84 | */ |
69 | 85 | ||
70 | /* | 86 | enum ds_field { |
71 | * A special from_ip address to indicate that the BTS record is an | 87 | ds_buffer_base = 0, |
72 | * info record that needs to be interpreted or skipped. | 88 | ds_index, |
73 | */ | 89 | ds_absolute_maximum, |
74 | #define BTS_ESCAPE_ADDRESS (-1) | 90 | ds_interrupt_threshold, |
91 | }; | ||
75 | 92 | ||
76 | /* | 93 | enum ds_qualifier { |
77 | * A field access descriptor | 94 | ds_bts = 0, |
78 | */ | 95 | ds_pebs |
79 | struct access_desc { | ||
80 | unsigned char offset; | ||
81 | unsigned char size; | ||
82 | }; | 96 | }; |
83 | 97 | ||
98 | static inline unsigned long ds_get(const unsigned char *base, | ||
99 | enum ds_qualifier qual, enum ds_field field) | ||
100 | { | ||
101 | base += (ds_cfg.sizeof_field * (field + (4 * qual))); | ||
102 | return *(unsigned long *)base; | ||
103 | } | ||
104 | |||
105 | static inline void ds_set(unsigned char *base, enum ds_qualifier qual, | ||
106 | enum ds_field field, unsigned long value) | ||
107 | { | ||
108 | base += (ds_cfg.sizeof_field * (field + (4 * qual))); | ||
109 | (*(unsigned long *)base) = value; | ||
110 | } | ||
111 | |||
112 | |||
84 | /* | 113 | /* |
85 | * The configuration for a particular DS/BTS hardware implementation. | 114 | * Locking is done only for allocating BTS or PEBS resources and for |
115 | * guarding context and buffer memory allocation. | ||
116 | * | ||
117 | * Most functions require the current task to own the ds context part | ||
118 | * they are going to access. All the locking is done when validating | ||
119 | * access to the context. | ||
86 | */ | 120 | */ |
87 | struct ds_configuration { | 121 | static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); |
88 | /* the DS configuration */ | ||
89 | unsigned char sizeof_ds; | ||
90 | struct access_desc bts_buffer_base; | ||
91 | struct access_desc bts_index; | ||
92 | struct access_desc bts_absolute_maximum; | ||
93 | struct access_desc bts_interrupt_threshold; | ||
94 | /* the BTS configuration */ | ||
95 | unsigned char sizeof_bts; | ||
96 | struct access_desc from_ip; | ||
97 | struct access_desc to_ip; | ||
98 | /* BTS variants used to store additional information like | ||
99 | timestamps */ | ||
100 | struct access_desc info_type; | ||
101 | struct access_desc info_data; | ||
102 | unsigned long debugctl_mask; | ||
103 | }; | ||
104 | 122 | ||
105 | /* | 123 | /* |
106 | * The global configuration used by the below accessor functions | 124 | * Validate that the current task is allowed to access the BTS/PEBS |
125 | * buffer of the parameter task. | ||
126 | * | ||
127 | * Returns 0, if access is granted; -Eerrno, otherwise. | ||
107 | */ | 128 | */ |
108 | static struct ds_configuration ds_cfg; | 129 | static inline int ds_validate_access(struct ds_context *context, |
130 | enum ds_qualifier qual) | ||
131 | { | ||
132 | if (!context) | ||
133 | return -EPERM; | ||
134 | |||
135 | if (context->owner[qual] == current) | ||
136 | return 0; | ||
137 | |||
138 | return -EPERM; | ||
139 | } | ||
140 | |||
109 | 141 | ||
110 | /* | 142 | /* |
111 | * Accessor functions for some DS and BTS fields using the above | 143 | * We either support (system-wide) per-cpu or per-thread allocation. |
112 | * global ptrace_bts_cfg. | 144 | * We distinguish the two based on the task_struct pointer, where a |
145 | * NULL pointer indicates per-cpu allocation for the current cpu. | ||
146 | * | ||
147 | * Allocations are use-counted. As soon as resources are allocated, | ||
148 | * further allocations must be of the same type (per-cpu or | ||
149 | * per-thread). We model this by counting allocations (i.e. the number | ||
150 | * of tracers of a certain type) for one type negatively: | ||
151 | * =0 no tracers | ||
152 | * >0 number of per-thread tracers | ||
153 | * <0 number of per-cpu tracers | ||
154 | * | ||
155 | * The below functions to get and put tracers and to check the | ||
156 | * allocation type require the ds_lock to be held by the caller. | ||
157 | * | ||
158 | * Tracers essentially gives the number of ds contexts for a certain | ||
159 | * type of allocation. | ||
113 | */ | 160 | */ |
114 | static inline unsigned long get_bts_buffer_base(char *base) | 161 | static long tracers; |
162 | |||
163 | static inline void get_tracer(struct task_struct *task) | ||
115 | { | 164 | { |
116 | return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); | 165 | tracers += (task ? 1 : -1); |
117 | } | 166 | } |
118 | static inline void set_bts_buffer_base(char *base, unsigned long value) | 167 | |
168 | static inline void put_tracer(struct task_struct *task) | ||
119 | { | 169 | { |
120 | (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; | 170 | tracers -= (task ? 1 : -1); |
121 | } | 171 | } |
122 | static inline unsigned long get_bts_index(char *base) | 172 | |
173 | static inline int check_tracer(struct task_struct *task) | ||
123 | { | 174 | { |
124 | return *(unsigned long *)(base + ds_cfg.bts_index.offset); | 175 | return (task ? (tracers >= 0) : (tracers <= 0)); |
125 | } | 176 | } |
126 | static inline void set_bts_index(char *base, unsigned long value) | 177 | |
178 | |||
179 | /* | ||
180 | * The DS context is either attached to a thread or to a cpu: | ||
181 | * - in the former case, the thread_struct contains a pointer to the | ||
182 | * attached context. | ||
183 | * - in the latter case, we use a static array of per-cpu context | ||
184 | * pointers. | ||
185 | * | ||
186 | * Contexts are use-counted. They are allocated on first access and | ||
187 | * deallocated when the last user puts the context. | ||
188 | * | ||
189 | * We distinguish between an allocating and a non-allocating get of a | ||
190 | * context: | ||
191 | * - the allocating get is used for requesting BTS/PEBS resources. It | ||
192 | * requires the caller to hold the global ds_lock. | ||
193 | * - the non-allocating get is used for all other cases. A | ||
194 | * non-existing context indicates an error. It acquires and releases | ||
195 | * the ds_lock itself for obtaining the context. | ||
196 | * | ||
197 | * A context and its DS configuration are allocated and deallocated | ||
198 | * together. A context always has a DS configuration of the | ||
199 | * appropriate size. | ||
200 | */ | ||
201 | static DEFINE_PER_CPU(struct ds_context *, system_context); | ||
202 | |||
203 | #define this_system_context per_cpu(system_context, smp_processor_id()) | ||
204 | |||
205 | /* | ||
206 | * Returns the pointer to the parameter task's context or to the | ||
207 | * system-wide context, if task is NULL. | ||
208 | * | ||
209 | * Increases the use count of the returned context, if not NULL. | ||
210 | */ | ||
211 | static inline struct ds_context *ds_get_context(struct task_struct *task) | ||
127 | { | 212 | { |
128 | (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; | 213 | struct ds_context *context; |
214 | |||
215 | spin_lock(&ds_lock); | ||
216 | |||
217 | context = (task ? task->thread.ds_ctx : this_system_context); | ||
218 | if (context) | ||
219 | context->count++; | ||
220 | |||
221 | spin_unlock(&ds_lock); | ||
222 | |||
223 | return context; | ||
129 | } | 224 | } |
130 | static inline unsigned long get_bts_absolute_maximum(char *base) | 225 | |
226 | /* | ||
227 | * Same as ds_get_context, but allocates the context and its DS | ||
228 | * structure, if necessary; returns NULL, if out of memory. | ||
229 | * | ||
230 | * pre: requires ds_lock to be held | ||
231 | */ | ||
232 | static inline struct ds_context *ds_alloc_context(struct task_struct *task) | ||
131 | { | 233 | { |
132 | return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); | 234 | struct ds_context **p_context = |
235 | (task ? &task->thread.ds_ctx : &this_system_context); | ||
236 | struct ds_context *context = *p_context; | ||
237 | |||
238 | if (!context) { | ||
239 | context = kzalloc(sizeof(*context), GFP_KERNEL); | ||
240 | |||
241 | if (!context) | ||
242 | return NULL; | ||
243 | |||
244 | context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); | ||
245 | if (!context->ds) { | ||
246 | kfree(context); | ||
247 | return NULL; | ||
248 | } | ||
249 | |||
250 | *p_context = context; | ||
251 | |||
252 | context->this = p_context; | ||
253 | context->task = task; | ||
254 | |||
255 | if (task) | ||
256 | set_tsk_thread_flag(task, TIF_DS_AREA_MSR); | ||
257 | |||
258 | if (!task || (task == current)) | ||
259 | wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); | ||
260 | |||
261 | get_tracer(task); | ||
262 | } | ||
263 | |||
264 | context->count++; | ||
265 | |||
266 | return context; | ||
133 | } | 267 | } |
134 | static inline void set_bts_absolute_maximum(char *base, unsigned long value) | 268 | |
269 | /* | ||
270 | * Decreases the use count of the parameter context, if not NULL. | ||
271 | * Deallocates the context, if the use count reaches zero. | ||
272 | */ | ||
273 | static inline void ds_put_context(struct ds_context *context) | ||
135 | { | 274 | { |
136 | (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; | 275 | if (!context) |
276 | return; | ||
277 | |||
278 | spin_lock(&ds_lock); | ||
279 | |||
280 | if (--context->count) | ||
281 | goto out; | ||
282 | |||
283 | *(context->this) = NULL; | ||
284 | |||
285 | if (context->task) | ||
286 | clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); | ||
287 | |||
288 | if (!context->task || (context->task == current)) | ||
289 | wrmsrl(MSR_IA32_DS_AREA, 0); | ||
290 | |||
291 | put_tracer(context->task); | ||
292 | |||
293 | /* free any leftover buffers from tracers that did not | ||
294 | * deallocate them properly. */ | ||
295 | kfree(context->buffer[ds_bts]); | ||
296 | kfree(context->buffer[ds_pebs]); | ||
297 | kfree(context->ds); | ||
298 | kfree(context); | ||
299 | out: | ||
300 | spin_unlock(&ds_lock); | ||
137 | } | 301 | } |
138 | static inline unsigned long get_bts_interrupt_threshold(char *base) | 302 | |
303 | |||
304 | /* | ||
305 | * Handle a buffer overflow | ||
306 | * | ||
307 | * task: the task whose buffers are overflowing; | ||
308 | * NULL for a buffer overflow on the current cpu | ||
309 | * context: the ds context | ||
310 | * qual: the buffer type | ||
311 | */ | ||
312 | static void ds_overflow(struct task_struct *task, struct ds_context *context, | ||
313 | enum ds_qualifier qual) | ||
139 | { | 314 | { |
140 | return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); | 315 | if (!context) |
316 | return; | ||
317 | |||
318 | if (context->callback[qual]) | ||
319 | (*context->callback[qual])(task); | ||
320 | |||
321 | /* todo: do some more overflow handling */ | ||
141 | } | 322 | } |
142 | static inline void set_bts_interrupt_threshold(char *base, unsigned long value) | 323 | |
324 | |||
325 | /* | ||
326 | * Allocate a non-pageable buffer of the parameter size. | ||
327 | * Checks the memory and the locked memory rlimit. | ||
328 | * | ||
329 | * Returns the buffer, if successful; | ||
330 | * NULL, if out of memory or rlimit exceeded. | ||
331 | * | ||
332 | * size: the requested buffer size in bytes | ||
333 | * pages (out): if not NULL, contains the number of pages reserved | ||
334 | */ | ||
335 | static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) | ||
143 | { | 336 | { |
144 | (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; | 337 | unsigned long rlim, vm, pgsz; |
338 | void *buffer; | ||
339 | |||
340 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
341 | |||
342 | rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | ||
343 | vm = current->mm->total_vm + pgsz; | ||
344 | if (rlim < vm) | ||
345 | return NULL; | ||
346 | |||
347 | rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | ||
348 | vm = current->mm->locked_vm + pgsz; | ||
349 | if (rlim < vm) | ||
350 | return NULL; | ||
351 | |||
352 | buffer = kzalloc(size, GFP_KERNEL); | ||
353 | if (!buffer) | ||
354 | return NULL; | ||
355 | |||
356 | current->mm->total_vm += pgsz; | ||
357 | current->mm->locked_vm += pgsz; | ||
358 | |||
359 | if (pages) | ||
360 | *pages = pgsz; | ||
361 | |||
362 | return buffer; | ||
145 | } | 363 | } |
146 | static inline unsigned long get_from_ip(char *base) | 364 | |
365 | static int ds_request(struct task_struct *task, void *base, size_t size, | ||
366 | ds_ovfl_callback_t ovfl, enum ds_qualifier qual) | ||
147 | { | 367 | { |
148 | return *(unsigned long *)(base + ds_cfg.from_ip.offset); | 368 | struct ds_context *context; |
369 | unsigned long buffer, adj; | ||
370 | const unsigned long alignment = (1 << 3); | ||
371 | int error = 0; | ||
372 | |||
373 | if (!ds_cfg.sizeof_ds) | ||
374 | return -EOPNOTSUPP; | ||
375 | |||
376 | /* we require some space to do alignment adjustments below */ | ||
377 | if (size < (alignment + ds_cfg.sizeof_rec[qual])) | ||
378 | return -EINVAL; | ||
379 | |||
380 | /* buffer overflow notification is not yet implemented */ | ||
381 | if (ovfl) | ||
382 | return -EOPNOTSUPP; | ||
383 | |||
384 | |||
385 | spin_lock(&ds_lock); | ||
386 | |||
387 | if (!check_tracer(task)) | ||
388 | return -EPERM; | ||
389 | |||
390 | error = -ENOMEM; | ||
391 | context = ds_alloc_context(task); | ||
392 | if (!context) | ||
393 | goto out_unlock; | ||
394 | |||
395 | error = -EALREADY; | ||
396 | if (context->owner[qual] == current) | ||
397 | goto out_unlock; | ||
398 | error = -EPERM; | ||
399 | if (context->owner[qual] != NULL) | ||
400 | goto out_unlock; | ||
401 | context->owner[qual] = current; | ||
402 | |||
403 | spin_unlock(&ds_lock); | ||
404 | |||
405 | |||
406 | error = -ENOMEM; | ||
407 | if (!base) { | ||
408 | base = ds_allocate_buffer(size, &context->pages[qual]); | ||
409 | if (!base) | ||
410 | goto out_release; | ||
411 | |||
412 | context->buffer[qual] = base; | ||
413 | } | ||
414 | error = 0; | ||
415 | |||
416 | context->callback[qual] = ovfl; | ||
417 | |||
418 | /* adjust the buffer address and size to meet alignment | ||
419 | * constraints: | ||
420 | * - buffer is double-word aligned | ||
421 | * - size is multiple of record size | ||
422 | * | ||
423 | * We checked the size at the very beginning; we have enough | ||
424 | * space to do the adjustment. | ||
425 | */ | ||
426 | buffer = (unsigned long)base; | ||
427 | |||
428 | adj = ALIGN(buffer, alignment) - buffer; | ||
429 | buffer += adj; | ||
430 | size -= adj; | ||
431 | |||
432 | size /= ds_cfg.sizeof_rec[qual]; | ||
433 | size *= ds_cfg.sizeof_rec[qual]; | ||
434 | |||
435 | ds_set(context->ds, qual, ds_buffer_base, buffer); | ||
436 | ds_set(context->ds, qual, ds_index, buffer); | ||
437 | ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); | ||
438 | |||
439 | if (ovfl) { | ||
440 | /* todo: select a suitable interrupt threshold */ | ||
441 | } else | ||
442 | ds_set(context->ds, qual, | ||
443 | ds_interrupt_threshold, buffer + size + 1); | ||
444 | |||
445 | /* we keep the context until ds_release */ | ||
446 | return error; | ||
447 | |||
448 | out_release: | ||
449 | context->owner[qual] = NULL; | ||
450 | ds_put_context(context); | ||
451 | return error; | ||
452 | |||
453 | out_unlock: | ||
454 | spin_unlock(&ds_lock); | ||
455 | ds_put_context(context); | ||
456 | return error; | ||
149 | } | 457 | } |
150 | static inline void set_from_ip(char *base, unsigned long value) | 458 | |
459 | int ds_request_bts(struct task_struct *task, void *base, size_t size, | ||
460 | ds_ovfl_callback_t ovfl) | ||
151 | { | 461 | { |
152 | (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; | 462 | return ds_request(task, base, size, ovfl, ds_bts); |
153 | } | 463 | } |
154 | static inline unsigned long get_to_ip(char *base) | 464 | |
465 | int ds_request_pebs(struct task_struct *task, void *base, size_t size, | ||
466 | ds_ovfl_callback_t ovfl) | ||
155 | { | 467 | { |
156 | return *(unsigned long *)(base + ds_cfg.to_ip.offset); | 468 | return ds_request(task, base, size, ovfl, ds_pebs); |
157 | } | 469 | } |
158 | static inline void set_to_ip(char *base, unsigned long value) | 470 | |
471 | static int ds_release(struct task_struct *task, enum ds_qualifier qual) | ||
159 | { | 472 | { |
160 | (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; | 473 | struct ds_context *context; |
474 | int error; | ||
475 | |||
476 | context = ds_get_context(task); | ||
477 | error = ds_validate_access(context, qual); | ||
478 | if (error < 0) | ||
479 | goto out; | ||
480 | |||
481 | kfree(context->buffer[qual]); | ||
482 | context->buffer[qual] = NULL; | ||
483 | |||
484 | current->mm->total_vm -= context->pages[qual]; | ||
485 | current->mm->locked_vm -= context->pages[qual]; | ||
486 | context->pages[qual] = 0; | ||
487 | context->owner[qual] = NULL; | ||
488 | |||
489 | /* | ||
490 | * we put the context twice: | ||
491 | * once for the ds_get_context | ||
492 | * once for the corresponding ds_request | ||
493 | */ | ||
494 | ds_put_context(context); | ||
495 | out: | ||
496 | ds_put_context(context); | ||
497 | return error; | ||
161 | } | 498 | } |
162 | static inline unsigned char get_info_type(char *base) | 499 | |
500 | int ds_release_bts(struct task_struct *task) | ||
163 | { | 501 | { |
164 | return *(unsigned char *)(base + ds_cfg.info_type.offset); | 502 | return ds_release(task, ds_bts); |
165 | } | 503 | } |
166 | static inline void set_info_type(char *base, unsigned char value) | 504 | |
505 | int ds_release_pebs(struct task_struct *task) | ||
167 | { | 506 | { |
168 | (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; | 507 | return ds_release(task, ds_pebs); |
169 | } | 508 | } |
170 | static inline unsigned long get_info_data(char *base) | 509 | |
510 | static int ds_get_index(struct task_struct *task, size_t *pos, | ||
511 | enum ds_qualifier qual) | ||
171 | { | 512 | { |
172 | return *(unsigned long *)(base + ds_cfg.info_data.offset); | 513 | struct ds_context *context; |
514 | unsigned long base, index; | ||
515 | int error; | ||
516 | |||
517 | context = ds_get_context(task); | ||
518 | error = ds_validate_access(context, qual); | ||
519 | if (error < 0) | ||
520 | goto out; | ||
521 | |||
522 | base = ds_get(context->ds, qual, ds_buffer_base); | ||
523 | index = ds_get(context->ds, qual, ds_index); | ||
524 | |||
525 | error = ((index - base) / ds_cfg.sizeof_rec[qual]); | ||
526 | if (pos) | ||
527 | *pos = error; | ||
528 | out: | ||
529 | ds_put_context(context); | ||
530 | return error; | ||
173 | } | 531 | } |
174 | static inline void set_info_data(char *base, unsigned long value) | 532 | |
533 | int ds_get_bts_index(struct task_struct *task, size_t *pos) | ||
175 | { | 534 | { |
176 | (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; | 535 | return ds_get_index(task, pos, ds_bts); |
177 | } | 536 | } |
178 | 537 | ||
538 | int ds_get_pebs_index(struct task_struct *task, size_t *pos) | ||
539 | { | ||
540 | return ds_get_index(task, pos, ds_pebs); | ||
541 | } | ||
179 | 542 | ||
180 | int ds_allocate(void **dsp, size_t bts_size_in_bytes) | 543 | static int ds_get_end(struct task_struct *task, size_t *pos, |
544 | enum ds_qualifier qual) | ||
181 | { | 545 | { |
182 | size_t bts_size_in_records; | 546 | struct ds_context *context; |
183 | unsigned long bts; | 547 | unsigned long base, end; |
184 | void *ds; | 548 | int error; |
549 | |||
550 | context = ds_get_context(task); | ||
551 | error = ds_validate_access(context, qual); | ||
552 | if (error < 0) | ||
553 | goto out; | ||
554 | |||
555 | base = ds_get(context->ds, qual, ds_buffer_base); | ||
556 | end = ds_get(context->ds, qual, ds_absolute_maximum); | ||
557 | |||
558 | error = ((end - base) / ds_cfg.sizeof_rec[qual]); | ||
559 | if (pos) | ||
560 | *pos = error; | ||
561 | out: | ||
562 | ds_put_context(context); | ||
563 | return error; | ||
564 | } | ||
185 | 565 | ||
186 | if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) | 566 | int ds_get_bts_end(struct task_struct *task, size_t *pos) |
187 | return -EOPNOTSUPP; | 567 | { |
568 | return ds_get_end(task, pos, ds_bts); | ||
569 | } | ||
188 | 570 | ||
189 | if (bts_size_in_bytes < 0) | 571 | int ds_get_pebs_end(struct task_struct *task, size_t *pos) |
190 | return -EINVAL; | 572 | { |
573 | return ds_get_end(task, pos, ds_pebs); | ||
574 | } | ||
191 | 575 | ||
192 | bts_size_in_records = | 576 | static int ds_access(struct task_struct *task, size_t index, |
193 | bts_size_in_bytes / ds_cfg.sizeof_bts; | 577 | const void **record, enum ds_qualifier qual) |
194 | bts_size_in_bytes = | 578 | { |
195 | bts_size_in_records * ds_cfg.sizeof_bts; | 579 | struct ds_context *context; |
580 | unsigned long base, idx; | ||
581 | int error; | ||
196 | 582 | ||
197 | if (bts_size_in_bytes <= 0) | 583 | if (!record) |
198 | return -EINVAL; | 584 | return -EINVAL; |
199 | 585 | ||
200 | bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); | 586 | context = ds_get_context(task); |
201 | 587 | error = ds_validate_access(context, qual); | |
202 | if (!bts) | 588 | if (error < 0) |
203 | return -ENOMEM; | 589 | goto out; |
204 | 590 | ||
205 | ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); | 591 | base = ds_get(context->ds, qual, ds_buffer_base); |
592 | idx = base + (index * ds_cfg.sizeof_rec[qual]); | ||
206 | 593 | ||
207 | if (!ds) { | 594 | error = -EINVAL; |
208 | kfree((void *)bts); | 595 | if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) |
209 | return -ENOMEM; | 596 | goto out; |
210 | } | ||
211 | |||
212 | set_bts_buffer_base(ds, bts); | ||
213 | set_bts_index(ds, bts); | ||
214 | set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); | ||
215 | set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); | ||
216 | 597 | ||
217 | *dsp = ds; | 598 | *record = (const void *)idx; |
218 | return 0; | 599 | error = ds_cfg.sizeof_rec[qual]; |
600 | out: | ||
601 | ds_put_context(context); | ||
602 | return error; | ||
219 | } | 603 | } |
220 | 604 | ||
221 | int ds_free(void **dsp) | 605 | int ds_access_bts(struct task_struct *task, size_t index, const void **record) |
222 | { | 606 | { |
223 | if (*dsp) { | 607 | return ds_access(task, index, record, ds_bts); |
224 | kfree((void *)get_bts_buffer_base(*dsp)); | ||
225 | kfree(*dsp); | ||
226 | *dsp = NULL; | ||
227 | } | ||
228 | return 0; | ||
229 | } | 608 | } |
230 | 609 | ||
231 | int ds_get_bts_size(void *ds) | 610 | int ds_access_pebs(struct task_struct *task, size_t index, const void **record) |
232 | { | 611 | { |
233 | int size_in_bytes; | 612 | return ds_access(task, index, record, ds_pebs); |
234 | |||
235 | if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) | ||
236 | return -EOPNOTSUPP; | ||
237 | |||
238 | if (!ds) | ||
239 | return 0; | ||
240 | |||
241 | size_in_bytes = | ||
242 | get_bts_absolute_maximum(ds) - | ||
243 | get_bts_buffer_base(ds); | ||
244 | return size_in_bytes; | ||
245 | } | 613 | } |
246 | 614 | ||
247 | int ds_get_bts_end(void *ds) | 615 | static int ds_write(struct task_struct *task, const void *record, size_t size, |
616 | enum ds_qualifier qual, int force) | ||
248 | { | 617 | { |
249 | int size_in_bytes = ds_get_bts_size(ds); | 618 | struct ds_context *context; |
250 | 619 | int error; | |
251 | if (size_in_bytes <= 0) | ||
252 | return size_in_bytes; | ||
253 | 620 | ||
254 | return size_in_bytes / ds_cfg.sizeof_bts; | 621 | if (!record) |
255 | } | 622 | return -EINVAL; |
256 | 623 | ||
257 | int ds_get_bts_index(void *ds) | 624 | error = -EPERM; |
258 | { | 625 | context = ds_get_context(task); |
259 | int index_offset_in_bytes; | 626 | if (!context) |
627 | goto out; | ||
260 | 628 | ||
261 | if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) | 629 | if (!force) { |
262 | return -EOPNOTSUPP; | 630 | error = ds_validate_access(context, qual); |
631 | if (error < 0) | ||
632 | goto out; | ||
633 | } | ||
263 | 634 | ||
264 | index_offset_in_bytes = | 635 | error = 0; |
265 | get_bts_index(ds) - | 636 | while (size) { |
266 | get_bts_buffer_base(ds); | 637 | unsigned long base, index, end, write_end, int_th; |
638 | unsigned long write_size, adj_write_size; | ||
639 | |||
640 | /* | ||
641 | * write as much as possible without producing an | ||
642 | * overflow interrupt. | ||
643 | * | ||
644 | * interrupt_threshold must either be | ||
645 | * - bigger than absolute_maximum or | ||
646 | * - point to a record between buffer_base and absolute_maximum | ||
647 | * | ||
648 | * index points to a valid record. | ||
649 | */ | ||
650 | base = ds_get(context->ds, qual, ds_buffer_base); | ||
651 | index = ds_get(context->ds, qual, ds_index); | ||
652 | end = ds_get(context->ds, qual, ds_absolute_maximum); | ||
653 | int_th = ds_get(context->ds, qual, ds_interrupt_threshold); | ||
654 | |||
655 | write_end = min(end, int_th); | ||
656 | |||
657 | /* if we are already beyond the interrupt threshold, | ||
658 | * we fill the entire buffer */ | ||
659 | if (write_end <= index) | ||
660 | write_end = end; | ||
661 | |||
662 | if (write_end <= index) | ||
663 | goto out; | ||
664 | |||
665 | write_size = min((unsigned long) size, write_end - index); | ||
666 | memcpy((void *)index, record, write_size); | ||
667 | |||
668 | record = (const char *)record + write_size; | ||
669 | size -= write_size; | ||
670 | error += write_size; | ||
671 | |||
672 | adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; | ||
673 | adj_write_size *= ds_cfg.sizeof_rec[qual]; | ||
674 | |||
675 | /* zero out trailing bytes */ | ||
676 | memset((char *)index + write_size, 0, | ||
677 | adj_write_size - write_size); | ||
678 | index += adj_write_size; | ||
679 | |||
680 | if (index >= end) | ||
681 | index = base; | ||
682 | ds_set(context->ds, qual, ds_index, index); | ||
683 | |||
684 | if (index >= int_th) | ||
685 | ds_overflow(task, context, qual); | ||
686 | } | ||
267 | 687 | ||
268 | return index_offset_in_bytes / ds_cfg.sizeof_bts; | 688 | out: |
689 | ds_put_context(context); | ||
690 | return error; | ||
269 | } | 691 | } |
270 | 692 | ||
271 | int ds_set_overflow(void *ds, int method) | 693 | int ds_write_bts(struct task_struct *task, const void *record, size_t size) |
272 | { | 694 | { |
273 | switch (method) { | 695 | return ds_write(task, record, size, ds_bts, /* force = */ 0); |
274 | case DS_O_SIGNAL: | ||
275 | return -EOPNOTSUPP; | ||
276 | case DS_O_WRAP: | ||
277 | return 0; | ||
278 | default: | ||
279 | return -EINVAL; | ||
280 | } | ||
281 | } | 696 | } |
282 | 697 | ||
283 | int ds_get_overflow(void *ds) | 698 | int ds_write_pebs(struct task_struct *task, const void *record, size_t size) |
284 | { | 699 | { |
285 | return DS_O_WRAP; | 700 | return ds_write(task, record, size, ds_pebs, /* force = */ 0); |
286 | } | 701 | } |
287 | 702 | ||
288 | int ds_clear(void *ds) | 703 | int ds_unchecked_write_bts(struct task_struct *task, |
704 | const void *record, size_t size) | ||
289 | { | 705 | { |
290 | int bts_size = ds_get_bts_size(ds); | 706 | return ds_write(task, record, size, ds_bts, /* force = */ 1); |
291 | unsigned long bts_base; | ||
292 | |||
293 | if (bts_size <= 0) | ||
294 | return bts_size; | ||
295 | |||
296 | bts_base = get_bts_buffer_base(ds); | ||
297 | memset((void *)bts_base, 0, bts_size); | ||
298 | |||
299 | set_bts_index(ds, bts_base); | ||
300 | return 0; | ||
301 | } | 707 | } |
302 | 708 | ||
303 | int ds_read_bts(void *ds, int index, struct bts_struct *out) | 709 | int ds_unchecked_write_pebs(struct task_struct *task, |
710 | const void *record, size_t size) | ||
304 | { | 711 | { |
305 | void *bts; | 712 | return ds_write(task, record, size, ds_pebs, /* force = */ 1); |
713 | } | ||
306 | 714 | ||
307 | if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) | 715 | static int ds_reset_or_clear(struct task_struct *task, |
308 | return -EOPNOTSUPP; | 716 | enum ds_qualifier qual, int clear) |
717 | { | ||
718 | struct ds_context *context; | ||
719 | unsigned long base, end; | ||
720 | int error; | ||
309 | 721 | ||
310 | if (index < 0) | 722 | context = ds_get_context(task); |
311 | return -EINVAL; | 723 | error = ds_validate_access(context, qual); |
724 | if (error < 0) | ||
725 | goto out; | ||
312 | 726 | ||
313 | if (index >= ds_get_bts_size(ds)) | 727 | base = ds_get(context->ds, qual, ds_buffer_base); |
314 | return -EINVAL; | 728 | end = ds_get(context->ds, qual, ds_absolute_maximum); |
315 | 729 | ||
316 | bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); | 730 | if (clear) |
731 | memset((void *)base, 0, end - base); | ||
317 | 732 | ||
318 | memset(out, 0, sizeof(*out)); | 733 | ds_set(context->ds, qual, ds_index, base); |
319 | if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { | ||
320 | out->qualifier = get_info_type(bts); | ||
321 | out->variant.jiffies = get_info_data(bts); | ||
322 | } else { | ||
323 | out->qualifier = BTS_BRANCH; | ||
324 | out->variant.lbr.from_ip = get_from_ip(bts); | ||
325 | out->variant.lbr.to_ip = get_to_ip(bts); | ||
326 | } | ||
327 | 734 | ||
328 | return sizeof(*out);; | 735 | error = 0; |
736 | out: | ||
737 | ds_put_context(context); | ||
738 | return error; | ||
329 | } | 739 | } |
330 | 740 | ||
331 | int ds_write_bts(void *ds, const struct bts_struct *in) | 741 | int ds_reset_bts(struct task_struct *task) |
332 | { | 742 | { |
333 | unsigned long bts; | 743 | return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); |
334 | 744 | } | |
335 | if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) | ||
336 | return -EOPNOTSUPP; | ||
337 | |||
338 | if (ds_get_bts_size(ds) <= 0) | ||
339 | return -ENXIO; | ||
340 | 745 | ||
341 | bts = get_bts_index(ds); | 746 | int ds_reset_pebs(struct task_struct *task) |
747 | { | ||
748 | return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); | ||
749 | } | ||
342 | 750 | ||
343 | memset((void *)bts, 0, ds_cfg.sizeof_bts); | 751 | int ds_clear_bts(struct task_struct *task) |
344 | switch (in->qualifier) { | 752 | { |
345 | case BTS_INVALID: | 753 | return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); |
346 | break; | 754 | } |
347 | 755 | ||
348 | case BTS_BRANCH: | 756 | int ds_clear_pebs(struct task_struct *task) |
349 | set_from_ip((void *)bts, in->variant.lbr.from_ip); | 757 | { |
350 | set_to_ip((void *)bts, in->variant.lbr.to_ip); | 758 | return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); |
351 | break; | 759 | } |
352 | 760 | ||
353 | case BTS_TASK_ARRIVES: | 761 | int ds_get_pebs_reset(struct task_struct *task, u64 *value) |
354 | case BTS_TASK_DEPARTS: | 762 | { |
355 | set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); | 763 | struct ds_context *context; |
356 | set_info_type((void *)bts, in->qualifier); | 764 | int error; |
357 | set_info_data((void *)bts, in->variant.jiffies); | ||
358 | break; | ||
359 | 765 | ||
360 | default: | 766 | if (!value) |
361 | return -EINVAL; | 767 | return -EINVAL; |
362 | } | ||
363 | 768 | ||
364 | bts = bts + ds_cfg.sizeof_bts; | 769 | context = ds_get_context(task); |
365 | if (bts >= get_bts_absolute_maximum(ds)) | 770 | error = ds_validate_access(context, ds_pebs); |
366 | bts = get_bts_buffer_base(ds); | 771 | if (error < 0) |
367 | set_bts_index(ds, bts); | 772 | goto out; |
368 | 773 | ||
369 | return ds_cfg.sizeof_bts; | 774 | *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); |
775 | |||
776 | error = 0; | ||
777 | out: | ||
778 | ds_put_context(context); | ||
779 | return error; | ||
370 | } | 780 | } |
371 | 781 | ||
372 | unsigned long ds_debugctl_mask(void) | 782 | int ds_set_pebs_reset(struct task_struct *task, u64 value) |
373 | { | 783 | { |
374 | return ds_cfg.debugctl_mask; | 784 | struct ds_context *context; |
375 | } | 785 | int error; |
376 | 786 | ||
377 | #ifdef __i386__ | 787 | context = ds_get_context(task); |
378 | static const struct ds_configuration ds_cfg_netburst = { | 788 | error = ds_validate_access(context, ds_pebs); |
379 | .sizeof_ds = 9 * 4, | 789 | if (error < 0) |
380 | .bts_buffer_base = { 0, 4 }, | 790 | goto out; |
381 | .bts_index = { 4, 4 }, | ||
382 | .bts_absolute_maximum = { 8, 4 }, | ||
383 | .bts_interrupt_threshold = { 12, 4 }, | ||
384 | .sizeof_bts = 3 * 4, | ||
385 | .from_ip = { 0, 4 }, | ||
386 | .to_ip = { 4, 4 }, | ||
387 | .info_type = { 4, 1 }, | ||
388 | .info_data = { 8, 4 }, | ||
389 | .debugctl_mask = (1<<2)|(1<<3) | ||
390 | }; | ||
391 | 791 | ||
392 | static const struct ds_configuration ds_cfg_pentium_m = { | 792 | *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; |
393 | .sizeof_ds = 9 * 4, | 793 | |
394 | .bts_buffer_base = { 0, 4 }, | 794 | error = 0; |
395 | .bts_index = { 4, 4 }, | 795 | out: |
396 | .bts_absolute_maximum = { 8, 4 }, | 796 | ds_put_context(context); |
397 | .bts_interrupt_threshold = { 12, 4 }, | 797 | return error; |
398 | .sizeof_bts = 3 * 4, | 798 | } |
399 | .from_ip = { 0, 4 }, | 799 | |
400 | .to_ip = { 4, 4 }, | 800 | static const struct ds_configuration ds_cfg_var = { |
401 | .info_type = { 4, 1 }, | 801 | .sizeof_ds = sizeof(long) * 12, |
402 | .info_data = { 8, 4 }, | 802 | .sizeof_field = sizeof(long), |
403 | .debugctl_mask = (1<<6)|(1<<7) | 803 | .sizeof_rec[ds_bts] = sizeof(long) * 3, |
804 | .sizeof_rec[ds_pebs] = sizeof(long) * 10 | ||
404 | }; | 805 | }; |
405 | #endif /* _i386_ */ | 806 | static const struct ds_configuration ds_cfg_64 = { |
406 | 807 | .sizeof_ds = 8 * 12, | |
407 | static const struct ds_configuration ds_cfg_core2 = { | 808 | .sizeof_field = 8, |
408 | .sizeof_ds = 9 * 8, | 809 | .sizeof_rec[ds_bts] = 8 * 3, |
409 | .bts_buffer_base = { 0, 8 }, | 810 | .sizeof_rec[ds_pebs] = 8 * 10 |
410 | .bts_index = { 8, 8 }, | ||
411 | .bts_absolute_maximum = { 16, 8 }, | ||
412 | .bts_interrupt_threshold = { 24, 8 }, | ||
413 | .sizeof_bts = 3 * 8, | ||
414 | .from_ip = { 0, 8 }, | ||
415 | .to_ip = { 8, 8 }, | ||
416 | .info_type = { 8, 1 }, | ||
417 | .info_data = { 16, 8 }, | ||
418 | .debugctl_mask = (1<<6)|(1<<7)|(1<<9) | ||
419 | }; | 811 | }; |
420 | 812 | ||
421 | static inline void | 813 | static inline void |
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) | |||
429 | switch (c->x86) { | 821 | switch (c->x86) { |
430 | case 0x6: | 822 | case 0x6: |
431 | switch (c->x86_model) { | 823 | switch (c->x86_model) { |
432 | #ifdef __i386__ | ||
433 | case 0xD: | 824 | case 0xD: |
434 | case 0xE: /* Pentium M */ | 825 | case 0xE: /* Pentium M */ |
435 | ds_configure(&ds_cfg_pentium_m); | 826 | ds_configure(&ds_cfg_var); |
436 | break; | 827 | break; |
437 | #endif /* _i386_ */ | ||
438 | case 0xF: /* Core2 */ | 828 | case 0xF: /* Core2 */ |
439 | ds_configure(&ds_cfg_core2); | 829 | case 0x1C: /* Atom */ |
830 | ds_configure(&ds_cfg_64); | ||
440 | break; | 831 | break; |
441 | default: | 832 | default: |
442 | /* sorry, don't know about them */ | 833 | /* sorry, don't know about them */ |
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) | |||
445 | break; | 836 | break; |
446 | case 0xF: | 837 | case 0xF: |
447 | switch (c->x86_model) { | 838 | switch (c->x86_model) { |
448 | #ifdef __i386__ | ||
449 | case 0x0: | 839 | case 0x0: |
450 | case 0x1: | 840 | case 0x1: |
451 | case 0x2: /* Netburst */ | 841 | case 0x2: /* Netburst */ |
452 | ds_configure(&ds_cfg_netburst); | 842 | ds_configure(&ds_cfg_var); |
453 | break; | 843 | break; |
454 | #endif /* _i386_ */ | ||
455 | default: | 844 | default: |
456 | /* sorry, don't know about them */ | 845 | /* sorry, don't know about them */ |
457 | break; | 846 | break; |
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) | |||
462 | break; | 851 | break; |
463 | } | 852 | } |
464 | } | 853 | } |
854 | |||
855 | void ds_free(struct ds_context *context) | ||
856 | { | ||
857 | /* This is called when the task owning the parameter context | ||
858 | * is dying. There should not be any user of that context left | ||
859 | * to disturb us, anymore. */ | ||
860 | unsigned long leftovers = context->count; | ||
861 | while (leftovers--) | ||
862 | ds_put_context(context); | ||
863 | } | ||
864 | #endif /* CONFIG_X86_DS */ | ||
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 000000000000..1a78180f08d3 --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -0,0 +1,449 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/kprobes.h> | ||
7 | #include <linux/uaccess.h> | ||
8 | #include <linux/utsname.h> | ||
9 | #include <linux/hardirq.h> | ||
10 | #include <linux/kdebug.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/ptrace.h> | ||
13 | #include <linux/kexec.h> | ||
14 | #include <linux/bug.h> | ||
15 | #include <linux/nmi.h> | ||
16 | #include <linux/sysfs.h> | ||
17 | |||
18 | #include <asm/stacktrace.h> | ||
19 | |||
/* Number of stack words printed on one line of the raw stack dump. */
#define STACKSLOTS_PER_LINE 8
/* Copy the current value of the %ebp register into bp. */
#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)

int panic_on_unrecovered_nmi;
/*
 * Upper bound on the number of raw stack words show_stack_log_lvl()
 * prints; three full lines by default, overridable with "kstack".
 */
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
/* Bytes of code dumped by show_registers(); set by "code_bytes=". */
static unsigned int code_bytes = 64;
/* Incremented by __die() and printed as the "[#N]" oops number. */
static int die_counter;
27 | |||
/*
 * Print one backtrace entry: the raw address in brackets followed by
 * its symbolic form (%pS).  Entries flagged as not reliable are
 * prefixed with "? ".
 */
void printk_address(unsigned long address, int reliable)
{
	void *ptr = (void *)address;
	const char *marker = "? ";

	if (reliable)
		marker = "";

	printk(" [<%p>] %s%pS\n", ptr, marker, ptr);
}
33 | |||
34 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
35 | void *p, unsigned int size, void *end) | ||
36 | { | ||
37 | void *t = tinfo; | ||
38 | if (end) { | ||
39 | if (p < end && p >= (end-THREAD_SIZE)) | ||
40 | return 1; | ||
41 | else | ||
42 | return 0; | ||
43 | } | ||
44 | return p > t && p < t + THREAD_SIZE - size; | ||
45 | } | ||
46 | |||
47 | /* The form of the top of the frame on the stack */ | ||
48 | struct stack_frame { | ||
49 | struct stack_frame *next_frame; | ||
50 | unsigned long return_address; | ||
51 | }; | ||
52 | |||
53 | static inline unsigned long | ||
54 | print_context_stack(struct thread_info *tinfo, | ||
55 | unsigned long *stack, unsigned long bp, | ||
56 | const struct stacktrace_ops *ops, void *data, | ||
57 | unsigned long *end) | ||
58 | { | ||
59 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
60 | |||
61 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | ||
62 | unsigned long addr; | ||
63 | |||
64 | addr = *stack; | ||
65 | if (__kernel_text_address(addr)) { | ||
66 | if ((unsigned long) stack == bp + sizeof(long)) { | ||
67 | ops->address(data, addr, 1); | ||
68 | frame = frame->next_frame; | ||
69 | bp = (unsigned long) frame; | ||
70 | } else { | ||
71 | ops->address(data, addr, bp == 0); | ||
72 | } | ||
73 | } | ||
74 | stack++; | ||
75 | } | ||
76 | return bp; | ||
77 | } | ||
78 | |||
79 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
80 | unsigned long *stack, unsigned long bp, | ||
81 | const struct stacktrace_ops *ops, void *data) | ||
82 | { | ||
83 | if (!task) | ||
84 | task = current; | ||
85 | |||
86 | if (!stack) { | ||
87 | unsigned long dummy; | ||
88 | stack = &dummy; | ||
89 | if (task && task != current) | ||
90 | stack = (unsigned long *)task->thread.sp; | ||
91 | } | ||
92 | |||
93 | #ifdef CONFIG_FRAME_POINTER | ||
94 | if (!bp) { | ||
95 | if (task == current) { | ||
96 | /* Grab bp right from our regs */ | ||
97 | get_bp(bp); | ||
98 | } else { | ||
99 | /* bp is the last reg pushed by switch_to */ | ||
100 | bp = *(unsigned long *) task->thread.sp; | ||
101 | } | ||
102 | } | ||
103 | #endif | ||
104 | |||
105 | for (;;) { | ||
106 | struct thread_info *context; | ||
107 | |||
108 | context = (struct thread_info *) | ||
109 | ((unsigned long)stack & (~(THREAD_SIZE - 1))); | ||
110 | bp = print_context_stack(context, stack, bp, ops, data, NULL); | ||
111 | |||
112 | stack = (unsigned long *)context->previous_esp; | ||
113 | if (!stack) | ||
114 | break; | ||
115 | if (ops->stack(data, "IRQ") < 0) | ||
116 | break; | ||
117 | touch_nmi_watchdog(); | ||
118 | } | ||
119 | } | ||
120 | EXPORT_SYMBOL(dump_trace); | ||
121 | |||
/*
 * stacktrace_ops.warning_symbol callback: print the prefix passed in
 * @data, then @msg expanded by print_symbol() for @symbol, then a
 * newline.
 *
 * The prefix is printed through "%s" rather than being used as the
 * format string itself, so any '%' in it is output literally.  This
 * matches how print_trace_warning() and print_trace_stack() treat @data.
 */
static void
print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	printk("%s", (char *)data);
	print_symbol(msg, symbol);
	printk("\n");
}
129 | |||
/*
 * stacktrace_ops.warning callback: print @msg on its own line,
 * preceded by the prefix string passed in @data.
 */
static void print_trace_warning(void *data, char *msg)
{
	char *prefix = data;

	printk("%s%s\n", prefix, msg);
}
134 | |||
/*
 * stacktrace_ops.stack callback: print the name of the stack being
 * switched to in angle brackets, preceded by the prefix in @data.
 * Always returns 0.
 */
static int print_trace_stack(void *data, char *name)
{
	char *prefix = data;

	printk("%s <%s> ", prefix, name);

	return 0;
}
140 | |||
/*
 * stacktrace_ops.address callback: print one address/symbol entry per
 * line, preceded by the prefix string passed in @data.
 *
 * The prefix is printed through "%s" rather than being used as the
 * format string itself, so any '%' in it is output literally.  The NMI
 * watchdog is touched first, before anything is printed.
 */
static void print_trace_address(void *data, unsigned long addr, int reliable)
{
	touch_nmi_watchdog();
	printk("%s", (char *)data);
	printk_address(addr, reliable);
}
150 | |||
151 | static const struct stacktrace_ops print_trace_ops = { | ||
152 | .warning = print_trace_warning, | ||
153 | .warning_symbol = print_trace_warning_symbol, | ||
154 | .stack = print_trace_stack, | ||
155 | .address = print_trace_address, | ||
156 | }; | ||
157 | |||
158 | static void | ||
159 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
160 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
161 | { | ||
162 | printk("%sCall Trace:\n", log_lvl); | ||
163 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | ||
164 | } | ||
165 | |||
/* Print a call trace with an empty prefix string. */
void show_trace(struct task_struct *task, struct pt_regs *regs,
		unsigned long *stack, unsigned long bp)
{
	char *no_prefix = "";

	show_trace_log_lvl(task, regs, stack, bp, no_prefix);
}
171 | |||
172 | static void | ||
173 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
174 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
175 | { | ||
176 | unsigned long *stack; | ||
177 | int i; | ||
178 | |||
179 | if (sp == NULL) { | ||
180 | if (task) | ||
181 | sp = (unsigned long *)task->thread.sp; | ||
182 | else | ||
183 | sp = (unsigned long *)&sp; | ||
184 | } | ||
185 | |||
186 | stack = sp; | ||
187 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
188 | if (kstack_end(stack)) | ||
189 | break; | ||
190 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | ||
191 | printk("\n%s", log_lvl); | ||
192 | printk(" %08lx", *stack++); | ||
193 | touch_nmi_watchdog(); | ||
194 | } | ||
195 | printk("\n"); | ||
196 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
197 | } | ||
198 | |||
199 | void show_stack(struct task_struct *task, unsigned long *sp) | ||
200 | { | ||
201 | show_stack_log_lvl(task, NULL, sp, 0, ""); | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * The architecture-independent dump_stack generator | ||
206 | */ | ||
207 | void dump_stack(void) | ||
208 | { | ||
209 | unsigned long bp = 0; | ||
210 | unsigned long stack; | ||
211 | |||
212 | #ifdef CONFIG_FRAME_POINTER | ||
213 | if (!bp) | ||
214 | get_bp(bp); | ||
215 | #endif | ||
216 | |||
217 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
218 | current->pid, current->comm, print_tainted(), | ||
219 | init_utsname()->release, | ||
220 | (int)strcspn(init_utsname()->version, " "), | ||
221 | init_utsname()->version); | ||
222 | show_trace(NULL, NULL, &stack, bp); | ||
223 | } | ||
224 | |||
225 | EXPORT_SYMBOL(dump_stack); | ||
226 | |||
227 | void show_registers(struct pt_regs *regs) | ||
228 | { | ||
229 | int i; | ||
230 | |||
231 | print_modules(); | ||
232 | __show_regs(regs, 0); | ||
233 | |||
234 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", | ||
235 | TASK_COMM_LEN, current->comm, task_pid_nr(current), | ||
236 | current_thread_info(), current, task_thread_info(current)); | ||
237 | /* | ||
238 | * When in-kernel, we also print out the stack and code at the | ||
239 | * time of the fault.. | ||
240 | */ | ||
241 | if (!user_mode_vm(regs)) { | ||
242 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
243 | unsigned int code_len = code_bytes; | ||
244 | unsigned char c; | ||
245 | u8 *ip; | ||
246 | |||
247 | printk(KERN_EMERG "Stack:\n"); | ||
248 | show_stack_log_lvl(NULL, regs, ®s->sp, | ||
249 | 0, KERN_EMERG); | ||
250 | |||
251 | printk(KERN_EMERG "Code: "); | ||
252 | |||
253 | ip = (u8 *)regs->ip - code_prologue; | ||
254 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
255 | /* try starting at IP */ | ||
256 | ip = (u8 *)regs->ip; | ||
257 | code_len = code_len - code_prologue + 1; | ||
258 | } | ||
259 | for (i = 0; i < code_len; i++, ip++) { | ||
260 | if (ip < (u8 *)PAGE_OFFSET || | ||
261 | probe_kernel_address(ip, c)) { | ||
262 | printk(" Bad EIP value."); | ||
263 | break; | ||
264 | } | ||
265 | if (ip == (u8 *)regs->ip) | ||
266 | printk("<%02x> ", c); | ||
267 | else | ||
268 | printk("%02x ", c); | ||
269 | } | ||
270 | } | ||
271 | printk("\n"); | ||
272 | } | ||
273 | |||
274 | int is_valid_bugaddr(unsigned long ip) | ||
275 | { | ||
276 | unsigned short ud2; | ||
277 | |||
278 | if (ip < PAGE_OFFSET) | ||
279 | return 0; | ||
280 | if (probe_kernel_address((unsigned short *)ip, ud2)) | ||
281 | return 0; | ||
282 | |||
283 | return ud2 == 0x0b0f; | ||
284 | } | ||
285 | |||
286 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
287 | static int die_owner = -1; | ||
288 | static unsigned int die_nest_count; | ||
289 | |||
290 | unsigned __kprobes long oops_begin(void) | ||
291 | { | ||
292 | unsigned long flags; | ||
293 | |||
294 | oops_enter(); | ||
295 | |||
296 | if (die_owner != raw_smp_processor_id()) { | ||
297 | console_verbose(); | ||
298 | raw_local_irq_save(flags); | ||
299 | __raw_spin_lock(&die_lock); | ||
300 | die_owner = smp_processor_id(); | ||
301 | die_nest_count = 0; | ||
302 | bust_spinlocks(1); | ||
303 | } else { | ||
304 | raw_local_irq_save(flags); | ||
305 | } | ||
306 | die_nest_count++; | ||
307 | return flags; | ||
308 | } | ||
309 | |||
310 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
311 | { | ||
312 | bust_spinlocks(0); | ||
313 | die_owner = -1; | ||
314 | add_taint(TAINT_DIE); | ||
315 | __raw_spin_unlock(&die_lock); | ||
316 | raw_local_irq_restore(flags); | ||
317 | |||
318 | if (!regs) | ||
319 | return; | ||
320 | |||
321 | if (kexec_should_crash(current)) | ||
322 | crash_kexec(regs); | ||
323 | if (in_interrupt()) | ||
324 | panic("Fatal exception in interrupt"); | ||
325 | if (panic_on_oops) | ||
326 | panic("Fatal exception"); | ||
327 | oops_exit(); | ||
328 | do_exit(signr); | ||
329 | } | ||
330 | |||
331 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
332 | { | ||
333 | unsigned short ss; | ||
334 | unsigned long sp; | ||
335 | |||
336 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
337 | #ifdef CONFIG_PREEMPT | ||
338 | printk("PREEMPT "); | ||
339 | #endif | ||
340 | #ifdef CONFIG_SMP | ||
341 | printk("SMP "); | ||
342 | #endif | ||
343 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
344 | printk("DEBUG_PAGEALLOC"); | ||
345 | #endif | ||
346 | printk("\n"); | ||
347 | sysfs_printk_last_file(); | ||
348 | if (notify_die(DIE_OOPS, str, regs, err, | ||
349 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
350 | return 1; | ||
351 | |||
352 | show_registers(regs); | ||
353 | /* Executive summary in case the oops scrolled away */ | ||
354 | sp = (unsigned long) (®s->sp); | ||
355 | savesegment(ss, ss); | ||
356 | if (user_mode(regs)) { | ||
357 | sp = regs->sp; | ||
358 | ss = regs->ss & 0xffff; | ||
359 | } | ||
360 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | ||
361 | print_symbol("%s", regs->ip); | ||
362 | printk(" SS:ESP %04x:%08lx\n", ss, sp); | ||
363 | return 0; | ||
364 | } | ||
365 | |||
366 | /* | ||
367 | * This is gone through when something in the kernel has done something bad | ||
368 | * and is about to be terminated: | ||
369 | */ | ||
370 | void die(const char *str, struct pt_regs *regs, long err) | ||
371 | { | ||
372 | unsigned long flags = oops_begin(); | ||
373 | |||
374 | if (die_nest_count < 3) { | ||
375 | report_bug(regs->ip, regs); | ||
376 | |||
377 | if (__die(str, regs, err)) | ||
378 | regs = NULL; | ||
379 | } else { | ||
380 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | ||
381 | } | ||
382 | |||
383 | oops_end(flags, regs, SIGSEGV); | ||
384 | } | ||
385 | |||
386 | static DEFINE_SPINLOCK(nmi_print_lock); | ||
387 | |||
388 | void notrace __kprobes | ||
389 | die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
390 | { | ||
391 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
392 | return; | ||
393 | |||
394 | spin_lock(&nmi_print_lock); | ||
395 | /* | ||
396 | * We are in trouble anyway, lets at least try | ||
397 | * to get a message out: | ||
398 | */ | ||
399 | bust_spinlocks(1); | ||
400 | printk(KERN_EMERG "%s", str); | ||
401 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
402 | smp_processor_id(), regs->ip); | ||
403 | show_registers(regs); | ||
404 | if (do_panic) | ||
405 | panic("Non maskable interrupt"); | ||
406 | console_silent(); | ||
407 | spin_unlock(&nmi_print_lock); | ||
408 | bust_spinlocks(0); | ||
409 | |||
410 | /* | ||
411 | * If we are in kernel we are probably nested up pretty bad | ||
412 | * and might aswell get out now while we still can: | ||
413 | */ | ||
414 | if (!user_mode_vm(regs)) { | ||
415 | current->thread.trap_no = 2; | ||
416 | crash_kexec(regs); | ||
417 | } | ||
418 | |||
419 | do_exit(SIGSEGV); | ||
420 | } | ||
421 | |||
422 | static int __init oops_setup(char *s) | ||
423 | { | ||
424 | if (!s) | ||
425 | return -EINVAL; | ||
426 | if (!strcmp(s, "panic")) | ||
427 | panic_on_oops = 1; | ||
428 | return 0; | ||
429 | } | ||
430 | early_param("oops", oops_setup); | ||
431 | |||
432 | static int __init kstack_setup(char *s) | ||
433 | { | ||
434 | if (!s) | ||
435 | return -EINVAL; | ||
436 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
437 | return 0; | ||
438 | } | ||
439 | early_param("kstack", kstack_setup); | ||
440 | |||
441 | static int __init code_bytes_setup(char *s) | ||
442 | { | ||
443 | code_bytes = simple_strtoul(s, NULL, 0); | ||
444 | if (code_bytes > 8192) | ||
445 | code_bytes = 8192; | ||
446 | |||
447 | return 1; | ||
448 | } | ||
449 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 000000000000..96a5db7da8a7 --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/kprobes.h> | ||
7 | #include <linux/uaccess.h> | ||
8 | #include <linux/utsname.h> | ||
9 | #include <linux/hardirq.h> | ||
10 | #include <linux/kdebug.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/ptrace.h> | ||
13 | #include <linux/kexec.h> | ||
14 | #include <linux/bug.h> | ||
15 | #include <linux/nmi.h> | ||
16 | #include <linux/sysfs.h> | ||
17 | |||
18 | #include <asm/stacktrace.h> | ||
19 | |||
/* Number of stack words show_stack_log_lvl() prints before starting a new line. */
20 | #define STACKSLOTS_PER_LINE 4 | ||
/* Copy the current value of %rbp into the lvalue 'bp'. */
21 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | ||
22 | |||
/* NOTE(review): not referenced in this file's visible code; presumably read by the NMI handling code -- confirm. */
23 | int panic_on_unrecovered_nmi; | ||
/* Number of raw stack words dumped by show_stack_log_lvl(); set by the "kstack" parameter. */
24 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; | ||
/* Number of code bytes dumped by show_registers(); set by "code_bytes=" (capped at 8192). */
25 | static unsigned int code_bytes = 64; | ||
/* Incremented by __die() and printed as the "[#N]" part of the oops banner. */
26 | static int die_counter; | ||
27 | |||
/*
 * Print one trace entry: the raw address in brackets followed by its
 * symbolic form (%pS).  Entries that are not 'reliable' are prefixed
 * with "? ".
 */
void printk_address(unsigned long address, int reliable)
{
	void *ptr = (void *)address;
	const char *mark = reliable ? "" : "? ";

	printk(" [<%p>] %s%pS\n", ptr, mark, ptr);
}
33 | |||
/*
 * Find the exception stack of 'cpu' that contains the address 'stack'.
 *
 * On a match the stack is marked in the '*usedp' bitmask, a short label
 * for it is stored in '*idp', and the end (upper bound) of the stack is
 * returned.  Returns NULL when 'stack' lies in none of the exception
 * stacks, or when the matching stack is already marked in '*usedp'.
 */
34 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
35 | unsigned *usedp, char **idp) | ||
36 | { | ||
/* Labels indexed like orig_ist.ist[]; the "#DB[?]" entries are patched at run time below. */
37 | static char ids[][8] = { | ||
38 | [DEBUG_STACK - 1] = "#DB", | ||
39 | [NMI_STACK - 1] = "NMI", | ||
40 | [DOUBLEFAULT_STACK - 1] = "#DF", | ||
41 | [STACKFAULT_STACK - 1] = "#SS", | ||
42 | [MCE_STACK - 1] = "#MC", | ||
43 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
44 | [N_EXCEPTION_STACKS ... | ||
45 | N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | ||
46 | #endif | ||
47 | }; | ||
48 | unsigned k; | ||
49 | |||
50 | /* | ||
51 | * Iterate over all exception stacks, and figure out whether | ||
52 | * 'stack' is in one of them: | ||
53 | */ | ||
54 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
55 | unsigned long end = per_cpu(orig_ist, cpu).ist[k]; | ||
56 | /* | ||
57 | * Is 'stack' above this exception frame's end? | ||
58 | * If yes then skip to the next frame. | ||
59 | */ | ||
60 | if (stack >= end) | ||
61 | continue; | ||
62 | /* | ||
63 | * Is 'stack' above this exception frame's start address? | ||
64 | * If yes then we found the right frame. | ||
65 | */ | ||
66 | if (stack >= end - EXCEPTION_STKSZ) { | ||
67 | /* | ||
68 | * Make sure we only iterate through an exception | ||
69 | * stack once. If it comes up for the second time | ||
70 | * then there's something wrong going on - just | ||
71 | * break out and return NULL: | ||
72 | */ | ||
73 | if (*usedp & (1U << k)) | ||
74 | break; | ||
75 | *usedp |= 1U << k; | ||
76 | *idp = ids[k]; | ||
77 | return (unsigned long *)end; | ||
78 | } | ||
79 | /* | ||
80 | * If this is a debug stack, and if it has a larger size than | ||
81 | * the usual exception stacks, then 'stack' might still | ||
82 | * be within the lower portion of the debug stack: | ||
83 | */ | ||
84 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
85 | if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | ||
86 | unsigned j = N_EXCEPTION_STACKS - 1; | ||
87 | |||
88 | /* | ||
89 | * Black magic. A large debug stack is composed of | ||
90 | * multiple exception stack entries, which we | ||
91 | * iterate through now. Dont look: | ||
92 | */ | ||
/*
 * Each pass moves 'end' down by one EXCEPTION_STKSZ slice and overwrites
 * the '?' of "#DB[?]" (character index 4) with the slice number, starting
 * at '1'.  The loop stops at the slice that contains 'stack'.
 */
93 | do { | ||
94 | ++j; | ||
95 | end -= EXCEPTION_STKSZ; | ||
96 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | ||
97 | } while (stack < end - EXCEPTION_STKSZ); | ||
/* Same visit-once rule as above, tracked per slice via bit 'j'. */
98 | if (*usedp & (1U << j)) | ||
99 | break; | ||
100 | *usedp |= 1U << j; | ||
101 | *idp = ids[j]; | ||
102 | return (unsigned long *)end; | ||
103 | } | ||
104 | #endif | ||
105 | } | ||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * x86-64 can have up to three kernel stacks: | ||
111 | * process stack | ||
112 | * interrupt stack | ||
113 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | ||
114 | */ | ||
115 | |||
116 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
117 | void *p, unsigned int size, void *end) | ||
118 | { | ||
119 | void *t = tinfo; | ||
120 | if (end) { | ||
121 | if (p < end && p >= (end-THREAD_SIZE)) | ||
122 | return 1; | ||
123 | else | ||
124 | return 0; | ||
125 | } | ||
126 | return p > t && p < t + THREAD_SIZE - size; | ||
127 | } | ||
128 | |||
/*
 * Layout of the two words at the top of a frame, as walked by
 * print_context_stack(): the link to the caller's frame, followed by
 * the return address into the caller.
 */
struct stack_frame {
	struct stack_frame *next_frame;	/* saved frame pointer of the caller */
	unsigned long return_address;	/* address execution resumes at */
};
134 | |||
135 | static inline unsigned long | ||
136 | print_context_stack(struct thread_info *tinfo, | ||
137 | unsigned long *stack, unsigned long bp, | ||
138 | const struct stacktrace_ops *ops, void *data, | ||
139 | unsigned long *end) | ||
140 | { | ||
141 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
142 | |||
143 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | ||
144 | unsigned long addr; | ||
145 | |||
146 | addr = *stack; | ||
147 | if (__kernel_text_address(addr)) { | ||
148 | if ((unsigned long) stack == bp + sizeof(long)) { | ||
149 | ops->address(data, addr, 1); | ||
150 | frame = frame->next_frame; | ||
151 | bp = (unsigned long) frame; | ||
152 | } else { | ||
153 | ops->address(data, addr, bp == 0); | ||
154 | } | ||
155 | } | ||
156 | stack++; | ||
157 | } | ||
158 | return bp; | ||
159 | } | ||
160 | |||
/*
 * Walk the kernel stacks of 'task' (current when NULL), starting at
 * 'stack', and report what is found through the 'ops' callbacks, each of
 * which receives 'data'.
 *
 * Exception stacks and this CPU's IRQ stack are walked first, each
 * announced through ops->stack(); the walk then ends on the process
 * stack.  A NULL 'stack' means: start at a local variable of this
 * function for the current task, or at task->thread.sp for another task.
 * 'regs' is not used by this function.
 */
161 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
162 | unsigned long *stack, unsigned long bp, | ||
163 | const struct stacktrace_ops *ops, void *data) | ||
164 | { | ||
/* get_cpu() is paired with the put_cpu() at the end of the function. */
165 | const unsigned cpu = get_cpu(); | ||
166 | unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; | ||
/* Bitmask of exception stacks already visited, maintained by in_exception_stack(). */
167 | unsigned used = 0; | ||
168 | struct thread_info *tinfo; | ||
169 | |||
170 | if (!task) | ||
171 | task = current; | ||
172 | |||
173 | if (!stack) { | ||
174 | unsigned long dummy; | ||
175 | stack = &dummy; | ||
176 | if (task && task != current) | ||
177 | stack = (unsigned long *)task->thread.sp; | ||
178 | } | ||
179 | |||
180 | #ifdef CONFIG_FRAME_POINTER | ||
181 | if (!bp) { | ||
182 | if (task == current) { | ||
183 | /* Grab bp right from our regs */ | ||
184 | get_bp(bp); | ||
185 | } else { | ||
186 | /* bp is the last reg pushed by switch_to */ | ||
187 | bp = *(unsigned long *) task->thread.sp; | ||
188 | } | ||
189 | } | ||
190 | #endif | ||
191 | |||
192 | /* | ||
193 | * Print function call entries in all stacks, starting at the | ||
194 | * current stack address. If the stacks consist of nested | ||
195 | * exceptions, follow the link stored in each stack to the next one. | ||
196 | */ | ||
197 | tinfo = task_thread_info(task); | ||
198 | for (;;) { | ||
199 | char *id; | ||
200 | unsigned long *estack_end; | ||
201 | estack_end = in_exception_stack(cpu, (unsigned long)stack, | ||
202 | &used, &id); | ||
203 | |||
204 | if (estack_end) { | ||
/* A negative return from ops->stack() stops the walk over the special stacks. */
205 | if (ops->stack(data, id) < 0) | ||
206 | break; | ||
207 | |||
208 | bp = print_context_stack(tinfo, stack, bp, ops, | ||
209 | data, estack_end); | ||
210 | ops->stack(data, "<EOE>"); | ||
211 | /* | ||
212 | * We link to the next stack via the | ||
213 | * second-to-last pointer (index -2 to end) in the | ||
214 | * exception stack: | ||
215 | */ | ||
216 | stack = (unsigned long *) estack_end[-2]; | ||
217 | continue; | ||
218 | } | ||
219 | if (irqstack_end) { | ||
220 | unsigned long *irqstack; | ||
221 | irqstack = irqstack_end - | ||
222 | (IRQSTACKSIZE - 64) / sizeof(*irqstack); | ||
223 | |||
224 | if (stack >= irqstack && stack < irqstack_end) { | ||
225 | if (ops->stack(data, "IRQ") < 0) | ||
226 | break; | ||
227 | bp = print_context_stack(tinfo, stack, bp, | ||
228 | ops, data, irqstack_end); | ||
229 | /* | ||
230 | * We link to the next stack (which would be | ||
231 | * the process stack normally) the last | ||
232 | * pointer (index -1 to end) in the IRQ stack: | ||
233 | */ | ||
234 | stack = (unsigned long *) (irqstack_end[-1]); | ||
/* Clearing irqstack_end ensures the IRQ stack is walked at most once. */
235 | irqstack_end = NULL; | ||
236 | ops->stack(data, "EOI"); | ||
237 | continue; | ||
238 | } | ||
239 | } | ||
240 | break; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * This handles the process stack: | ||
245 | */ | ||
246 | bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); | ||
247 | put_cpu(); | ||
248 | } | ||
249 | EXPORT_SYMBOL(dump_trace); | ||
250 | |||
/*
 * stacktrace_ops ->warning_symbol callback: print the prefix passed in
 * 'data' (the log-level string handed to dump_trace()), then 'msg' with
 * 'symbol' resolved by print_symbol(), then a newline.
 */
static void
print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	/* 'data' is text, not a format: never let a '%' in it be interpreted. */
	printk("%s", (char *)data);
	print_symbol(msg, symbol);
	printk("\n");
}
258 | |||
/*
 * stacktrace_ops ->warning callback: print 'msg' on its own line,
 * preceded by the prefix string passed in 'data'.
 */
static void print_trace_warning(void *data, char *msg)
{
	char *prefix = data;

	printk("%s%s\n", prefix, msg);
}
263 | |||
/*
 * stacktrace_ops ->stack callback: print the stack label 'name' in
 * angle brackets, preceded by the prefix string passed in 'data'.
 * Always returns 0, so the stack walk is never cut short.
 */
static int print_trace_stack(void *data, char *name)
{
	char *prefix = data;

	printk("%s <%s> ", prefix, name);
	return 0;
}
269 | |||
/*
 * stacktrace_ops ->address callback: print one address/symbol entry per
 * line, preceded by the prefix passed in 'data' (the log-level string
 * handed to dump_trace()).  The NMI watchdog is touched for every entry.
 */
static void print_trace_address(void *data, unsigned long addr, int reliable)
{
	touch_nmi_watchdog();
	/* 'data' is text, not a format: never let a '%' in it be interpreted. */
	printk("%s", (char *)data);
	printk_address(addr, reliable);
}
279 | |||
280 | static const struct stacktrace_ops print_trace_ops = { | ||
281 | .warning = print_trace_warning, | ||
282 | .warning_symbol = print_trace_warning_symbol, | ||
283 | .stack = print_trace_stack, | ||
284 | .address = print_trace_address, | ||
285 | }; | ||
286 | |||
287 | static void | ||
288 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
289 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
290 | { | ||
291 | printk("%sCall Trace:\n", log_lvl); | ||
292 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | ||
293 | } | ||
294 | |||
/* Print a call trace with an empty log-level prefix. */
void show_trace(struct task_struct *task, struct pt_regs *regs,
		unsigned long *stack, unsigned long bp)
{
	char *no_prefix = "";

	show_trace_log_lvl(task, regs, stack, bp, no_prefix);
}
300 | |||
301 | static void | ||
302 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
303 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
304 | { | ||
305 | unsigned long *stack; | ||
306 | int i; | ||
307 | const int cpu = smp_processor_id(); | ||
308 | unsigned long *irqstack_end = | ||
309 | (unsigned long *) (cpu_pda(cpu)->irqstackptr); | ||
310 | unsigned long *irqstack = | ||
311 | (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | ||
312 | |||
313 | /* | ||
314 | * debugging aid: "show_stack(NULL, NULL);" prints the | ||
315 | * back trace for this cpu. | ||
316 | */ | ||
317 | |||
318 | if (sp == NULL) { | ||
319 | if (task) | ||
320 | sp = (unsigned long *)task->thread.sp; | ||
321 | else | ||
322 | sp = (unsigned long *)&sp; | ||
323 | } | ||
324 | |||
325 | stack = sp; | ||
326 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
327 | if (stack >= irqstack && stack <= irqstack_end) { | ||
328 | if (stack == irqstack_end) { | ||
329 | stack = (unsigned long *) (irqstack_end[-1]); | ||
330 | printk(" <EOI> "); | ||
331 | } | ||
332 | } else { | ||
333 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
334 | break; | ||
335 | } | ||
336 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | ||
337 | printk("\n%s", log_lvl); | ||
338 | printk(" %016lx", *stack++); | ||
339 | touch_nmi_watchdog(); | ||
340 | } | ||
341 | printk("\n"); | ||
342 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
343 | } | ||
344 | |||
345 | void show_stack(struct task_struct *task, unsigned long *sp) | ||
346 | { | ||
347 | show_stack_log_lvl(task, NULL, sp, 0, ""); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * The architecture-independent dump_stack generator | ||
352 | */ | ||
353 | void dump_stack(void) | ||
354 | { | ||
355 | unsigned long bp = 0; | ||
356 | unsigned long stack; | ||
357 | |||
358 | #ifdef CONFIG_FRAME_POINTER | ||
359 | if (!bp) | ||
360 | get_bp(bp); | ||
361 | #endif | ||
362 | |||
363 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
364 | current->pid, current->comm, print_tainted(), | ||
365 | init_utsname()->release, | ||
366 | (int)strcspn(init_utsname()->version, " "), | ||
367 | init_utsname()->version); | ||
368 | show_trace(NULL, NULL, &stack, bp); | ||
369 | } | ||
370 | EXPORT_SYMBOL(dump_stack); | ||
371 | |||
/*
 * Print the register state in 'regs' together with the name, pid,
 * thread_info and task pointer of the task recorded in this CPU's PDA.
 *
 * When 'regs' is not user mode, additionally dump the stack and
 * 'code_bytes' bytes of code around regs->ip; the byte at regs->ip is
 * printed inside <>.
 */
372 | void show_registers(struct pt_regs *regs) | ||
373 | { | ||
374 | int i; | ||
375 | unsigned long sp; | ||
376 | const int cpu = smp_processor_id(); | ||
377 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | ||
378 | |||
379 | sp = regs->sp; | ||
380 | printk("CPU %d ", cpu); | ||
381 | __show_regs(regs, 1); | ||
382 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
383 | cur->comm, cur->pid, task_thread_info(cur), cur); | ||
384 | |||
385 | /* | ||
386 | * When in-kernel, we also print out the stack and code at the | ||
387 | * time of the fault.. | ||
388 | */ | ||
389 | if (!user_mode(regs)) { | ||
/* 43/64 of the dumped bytes are taken from before regs->ip. */
390 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
391 | unsigned int code_len = code_bytes; | ||
392 | unsigned char c; | ||
393 | u8 *ip; | ||
394 | |||
395 | printk(KERN_EMERG "Stack:\n"); | ||
396 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, | ||
397 | regs->bp, KERN_EMERG); | ||
398 | |||
399 | printk(KERN_EMERG "Code: "); | ||
400 | |||
401 | ip = (u8 *)regs->ip - code_prologue; | ||
/* If the prologue start is unreadable, dump only the part from regs->ip onwards. */
402 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
403 | /* try starting at IP */ | ||
404 | ip = (u8 *)regs->ip; | ||
405 | code_len = code_len - code_prologue + 1; | ||
406 | } | ||
/* Every byte is probed before printing; the dump stops at the first unreadable one. */
407 | for (i = 0; i < code_len; i++, ip++) { | ||
408 | if (ip < (u8 *)PAGE_OFFSET || | ||
409 | probe_kernel_address(ip, c)) { | ||
410 | printk(" Bad RIP value."); | ||
411 | break; | ||
412 | } | ||
413 | if (ip == (u8 *)regs->ip) | ||
414 | printk("<%02x> ", c); | ||
415 | else | ||
416 | printk("%02x ", c); | ||
417 | } | ||
418 | } | ||
419 | printk("\n"); | ||
420 | } | ||
421 | |||
422 | int is_valid_bugaddr(unsigned long ip) | ||
423 | { | ||
424 | unsigned short ud2; | ||
425 | |||
426 | if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) | ||
427 | return 0; | ||
428 | |||
429 | return ud2 == 0x0b0f; | ||
430 | } | ||
431 | |||
/*
 * State used to serialise oops output: die_lock is taken in oops_begin(),
 * die_owner records the CPU that took it (reset to -1 by oops_end()), and
 * die_nest_count counts oops_begin() calls not yet matched by oops_end().
 */
432 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
433 | static int die_owner = -1; | ||
434 | static unsigned int die_nest_count; | ||
435 | |||
/*
 * Start an oops report: disable local interrupts, take die_lock (without
 * blocking when this CPU already owns it, i.e. on a nested oops), make the
 * console verbose and call bust_spinlocks(1).
 *
 * Returns the saved interrupt flags, to be passed to oops_end().
 */
436 | unsigned __kprobes long oops_begin(void) | ||
437 | { | ||
438 | int cpu; | ||
439 | unsigned long flags; | ||
440 | |||
441 | oops_enter(); | ||
442 | |||
443 | /* racy, but better than risking deadlock. */ | ||
444 | raw_local_irq_save(flags); | ||
445 | cpu = smp_processor_id(); | ||
446 | if (!__raw_spin_trylock(&die_lock)) { | ||
447 | if (cpu == die_owner) | ||
448 | /* nested oops. should stop eventually */; | ||
449 | else | ||
450 | __raw_spin_lock(&die_lock); | ||
451 | } | ||
452 | die_nest_count++; | ||
453 | die_owner = cpu; | ||
454 | console_verbose(); | ||
455 | bust_spinlocks(1); | ||
456 | return flags; | ||
457 | } | ||
458 | |||
/*
 * Counterpart of oops_begin(): drop one nesting level (die_lock itself
 * is released once die_nest_count reaches zero) and restore the
 * interrupt flags saved by oops_begin().
 *
 * With regs == NULL the function returns to the caller after
 * oops_exit().  Otherwise it does not return: it panics when called in
 * interrupt context or when panic_on_oops is set, and ends the current
 * task with do_exit(signr) in all other cases.
 */
459 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
460 | { | ||
461 | die_owner = -1; | ||
462 | bust_spinlocks(0); | ||
463 | die_nest_count--; | ||
464 | if (!die_nest_count) | ||
465 | /* Nest count reaches zero, release the lock. */ | ||
466 | __raw_spin_unlock(&die_lock); | ||
467 | raw_local_irq_restore(flags); | ||
/* NULL regs: the caller wants control back instead of killing the task. */
468 | if (!regs) { | ||
469 | oops_exit(); | ||
470 | return; | ||
471 | } | ||
472 | if (in_interrupt()) | ||
473 | panic("Fatal exception in interrupt"); | ||
474 | if (panic_on_oops) | ||
475 | panic("Fatal exception"); | ||
476 | oops_exit(); | ||
477 | do_exit(signr); | ||
478 | } | ||
479 | |||
/*
 * Print the oops banner ("str: err [#count]" followed by tags for the
 * PREEMPT, SMP and DEBUG_PAGEALLOC build options), then run the DIE_OOPS
 * notifier chain.  Unless a notifier answers NOTIFY_STOP, dump the
 * registers, taint the kernel and print a short RIP/RSP summary.
 *
 * Returns 1 when a notifier stopped the report, 0 otherwise.  Calls
 * crash_kexec() when kexec_should_crash(current) is true.
 */
480 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
481 | { | ||
/* Only the low 16 bits of 'err' are printed; die_counter numbers the oopses. */
482 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
483 | #ifdef CONFIG_PREEMPT | ||
484 | printk("PREEMPT "); | ||
485 | #endif | ||
486 | #ifdef CONFIG_SMP | ||
487 | printk("SMP "); | ||
488 | #endif | ||
489 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
490 | printk("DEBUG_PAGEALLOC"); | ||
491 | #endif | ||
492 | printk("\n"); | ||
493 | sysfs_printk_last_file(); | ||
494 | if (notify_die(DIE_OOPS, str, regs, err, | ||
495 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
496 | return 1; | ||
497 | |||
498 | show_registers(regs); | ||
499 | add_taint(TAINT_DIE); | ||
500 | /* Executive summary in case the oops scrolled away */ | ||
501 | printk(KERN_ALERT "RIP "); | ||
502 | printk_address(regs->ip, 1); | ||
503 | printk(" RSP <%016lx>\n", regs->sp); | ||
504 | if (kexec_should_crash(current)) | ||
505 | crash_kexec(regs); | ||
506 | return 0; | ||
507 | } | ||
508 | |||
509 | void die(const char *str, struct pt_regs *regs, long err) | ||
510 | { | ||
511 | unsigned long flags = oops_begin(); | ||
512 | |||
513 | if (!user_mode(regs)) | ||
514 | report_bug(regs->ip, regs); | ||
515 | |||
516 | if (__die(str, regs, err)) | ||
517 | regs = NULL; | ||
518 | oops_end(flags, regs, SIGSEGV); | ||
519 | } | ||
520 | |||
/*
 * Report a fatal condition detected from NMI context.
 *
 * Returns without printing anything when a DIE_NMIWATCHDOG notifier
 * answers NOTIFY_STOP.  Otherwise the message and registers are printed
 * and the function does not return: it panics when 'do_panic' or
 * panic_on_oops is set, and else leaves NMI context, enables interrupts
 * and ends the current task with do_exit(SIGBUS).  Calls crash_kexec()
 * first when kexec_should_crash(current) is true.
 */
521 | notrace __kprobes void | ||
522 | die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
523 | { | ||
524 | unsigned long flags; | ||
525 | |||
526 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
527 | return; | ||
528 | |||
529 | flags = oops_begin(); | ||
530 | /* | ||
531 | * We are in trouble anyway, lets at least try | ||
532 | * to get a message out. | ||
533 | */ | ||
534 | printk(KERN_EMERG "%s", str); | ||
535 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
536 | smp_processor_id(), regs->ip); | ||
537 | show_registers(regs); | ||
538 | if (kexec_should_crash(current)) | ||
539 | crash_kexec(regs); | ||
540 | if (do_panic || panic_on_oops) | ||
541 | panic("Non maskable interrupt"); | ||
/* NULL regs makes oops_end() return here so the NMI exit sequence below can run. */
542 | oops_end(flags, NULL, SIGBUS); | ||
543 | nmi_exit(); | ||
544 | local_irq_enable(); | ||
545 | do_exit(SIGBUS); | ||
546 | } | ||
547 | |||
548 | static int __init oops_setup(char *s) | ||
549 | { | ||
550 | if (!s) | ||
551 | return -EINVAL; | ||
552 | if (!strcmp(s, "panic")) | ||
553 | panic_on_oops = 1; | ||
554 | return 0; | ||
555 | } | ||
556 | early_param("oops", oops_setup); | ||
557 | |||
558 | static int __init kstack_setup(char *s) | ||
559 | { | ||
560 | if (!s) | ||
561 | return -EINVAL; | ||
562 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
563 | return 0; | ||
564 | } | ||
565 | early_param("kstack", kstack_setup); | ||
566 | |||
567 | static int __init code_bytes_setup(char *s) | ||
568 | { | ||
569 | code_bytes = simple_strtoul(s, NULL, 0); | ||
570 | if (code_bytes > 8192) | ||
571 | code_bytes = 8192; | ||
572 | |||
573 | return 1; | ||
574 | } | ||
575 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7bb..ce97bf3bed12 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who) | |||
148 | case E820_NVS: | 148 | case E820_NVS: |
149 | printk(KERN_CONT "(ACPI NVS)\n"); | 149 | printk(KERN_CONT "(ACPI NVS)\n"); |
150 | break; | 150 | break; |
151 | case E820_UNUSABLE: | ||
152 | printk("(unusable)\n"); | ||
153 | break; | ||
151 | default: | 154 | default: |
152 | printk(KERN_CONT "type %u\n", e820.map[i].type); | 155 | printk(KERN_CONT "type %u\n", e820.map[i].type); |
153 | break; | 156 | break; |
@@ -1203,7 +1206,7 @@ static int __init parse_memmap_opt(char *p) | |||
1203 | if (!p) | 1206 | if (!p) |
1204 | return -EINVAL; | 1207 | return -EINVAL; |
1205 | 1208 | ||
1206 | if (!strcmp(p, "exactmap")) { | 1209 | if (!strncmp(p, "exactmap", 8)) { |
1207 | #ifdef CONFIG_CRASH_DUMP | 1210 | #ifdef CONFIG_CRASH_DUMP |
1208 | /* | 1211 | /* |
1209 | * If we are doing a crash dump, we still need to know | 1212 | * If we are doing a crash dump, we still need to know |
@@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type) | |||
1260 | case E820_RAM: return "System RAM"; | 1263 | case E820_RAM: return "System RAM"; |
1261 | case E820_ACPI: return "ACPI Tables"; | 1264 | case E820_ACPI: return "ACPI Tables"; |
1262 | case E820_NVS: return "ACPI Non-volatile Storage"; | 1265 | case E820_NVS: return "ACPI Non-volatile Storage"; |
1266 | case E820_UNUSABLE: return "Unusable memory"; | ||
1263 | default: return "reserved"; | 1267 | default: return "reserved"; |
1264 | } | 1268 | } |
1265 | } | 1269 | } |
@@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type) | |||
1267 | /* | 1271 | /* |
1268 | * Mark e820 reserved areas as busy for the resource manager. | 1272 | * Mark e820 reserved areas as busy for the resource manager. |
1269 | */ | 1273 | */ |
1274 | static struct resource __initdata *e820_res; | ||
1270 | void __init e820_reserve_resources(void) | 1275 | void __init e820_reserve_resources(void) |
1271 | { | 1276 | { |
1272 | int i; | 1277 | int i; |
@@ -1274,20 +1279,26 @@ void __init e820_reserve_resources(void) | |||
1274 | u64 end; | 1279 | u64 end; |
1275 | 1280 | ||
1276 | res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); | 1281 | res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); |
1282 | e820_res = res; | ||
1277 | for (i = 0; i < e820.nr_map; i++) { | 1283 | for (i = 0; i < e820.nr_map; i++) { |
1278 | end = e820.map[i].addr + e820.map[i].size - 1; | 1284 | end = e820.map[i].addr + e820.map[i].size - 1; |
1279 | #ifndef CONFIG_RESOURCES_64BIT | 1285 | if (end != (resource_size_t)end) { |
1280 | if (end > 0x100000000ULL) { | ||
1281 | res++; | 1286 | res++; |
1282 | continue; | 1287 | continue; |
1283 | } | 1288 | } |
1284 | #endif | ||
1285 | res->name = e820_type_to_string(e820.map[i].type); | 1289 | res->name = e820_type_to_string(e820.map[i].type); |
1286 | res->start = e820.map[i].addr; | 1290 | res->start = e820.map[i].addr; |
1287 | res->end = end; | 1291 | res->end = end; |
1288 | 1292 | ||
1289 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 1293 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
1290 | insert_resource(&iomem_resource, res); | 1294 | |
1295 | /* | ||
1296 | * don't register the region that could be conflicted with | ||
1297 | * pci device BAR resource and insert them later in | ||
1298 | * pcibios_resource_survey() | ||
1299 | */ | ||
1300 | if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) | ||
1301 | insert_resource(&iomem_resource, res); | ||
1291 | res++; | 1302 | res++; |
1292 | } | 1303 | } |
1293 | 1304 | ||
@@ -1299,6 +1310,19 @@ void __init e820_reserve_resources(void) | |||
1299 | } | 1310 | } |
1300 | } | 1311 | } |
1301 | 1312 | ||
1313 | void __init e820_reserve_resources_late(void) | ||
1314 | { | ||
1315 | int i; | ||
1316 | struct resource *res; | ||
1317 | |||
1318 | res = e820_res; | ||
1319 | for (i = 0; i < e820.nr_map; i++) { | ||
1320 | if (!res->parent && res->end) | ||
1321 | reserve_region_with_split(&iomem_resource, res->start, res->end, res->name); | ||
1322 | res++; | ||
1323 | } | ||
1324 | } | ||
1325 | |||
1302 | char *__init default_machine_specific_memory_setup(void) | 1326 | char *__init default_machine_specific_memory_setup(void) |
1303 | { | 1327 | { |
1304 | char *who = "BIOS-e820"; | 1328 | char *who = "BIOS-e820"; |
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 4353cf5e6fac..733c4f8d42ea 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c | |||
@@ -95,6 +95,66 @@ static void __init nvidia_bugs(int num, int slot, int func) | |||
95 | 95 | ||
96 | } | 96 | } |
97 | 97 | ||
98 | static u32 ati_ixp4x0_rev(int num, int slot, int func) | ||
99 | { | ||
100 | u32 d; | ||
101 | u8 b; | ||
102 | |||
103 | b = read_pci_config_byte(num, slot, func, 0xac); | ||
104 | b &= ~(1<<5); | ||
105 | write_pci_config_byte(num, slot, func, 0xac, b); | ||
106 | |||
107 | d = read_pci_config(num, slot, func, 0x70); | ||
108 | d |= 1<<8; | ||
109 | write_pci_config(num, slot, func, 0x70, d); | ||
110 | |||
111 | d = read_pci_config(num, slot, func, 0x8); | ||
112 | d &= 0xff; | ||
113 | return d; | ||
114 | } | ||
115 | |||
116 | static void __init ati_bugs(int num, int slot, int func) | ||
117 | { | ||
118 | #if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) | ||
119 | u32 d; | ||
120 | u8 b; | ||
121 | |||
122 | if (acpi_use_timer_override) | ||
123 | return; | ||
124 | |||
125 | d = ati_ixp4x0_rev(num, slot, func); | ||
126 | if (d < 0x82) | ||
127 | acpi_skip_timer_override = 1; | ||
128 | else { | ||
129 | /* check for IRQ0 interrupt swap */ | ||
130 | outb(0x72, 0xcd6); b = inb(0xcd7); | ||
131 | if (!(b & 0x2)) | ||
132 | acpi_skip_timer_override = 1; | ||
133 | } | ||
134 | |||
135 | if (acpi_skip_timer_override) { | ||
136 | printk(KERN_INFO "SB4X0 revision 0x%x\n", d); | ||
137 | printk(KERN_INFO "Ignoring ACPI timer override.\n"); | ||
138 | printk(KERN_INFO "If you got timer trouble " | ||
139 | "try acpi_use_timer_override\n"); | ||
140 | } | ||
141 | #endif | ||
142 | } | ||
143 | |||
#ifdef CONFIG_DMAR
/*
 * Early quirk for the Intel host bridge matched in early_qrk[]: if the
 * ACPI tables contain a DMAR table, log a BIOS bug and disable DMAR.
 */
static void __init intel_g33_dmar(int num, int slot, int func)
{
	struct acpi_table_header *dmar_tbl;
	acpi_status status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);

	if (!ACPI_SUCCESS(status))
		return;

	printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
	dmar_disabled = 1;
}
#endif
157 | |||
98 | #define QFLAG_APPLY_ONCE 0x1 | 158 | #define QFLAG_APPLY_ONCE 0x1 |
99 | #define QFLAG_APPLIED 0x2 | 159 | #define QFLAG_APPLIED 0x2 |
100 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) | 160 | #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) |
@@ -114,6 +174,12 @@ static struct chipset early_qrk[] __initdata = { | |||
114 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, | 174 | PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, |
115 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, | 175 | { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, |
116 | PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, | 176 | PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, |
177 | { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, | ||
178 | PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, | ||
179 | #ifdef CONFIG_DMAR | ||
180 | { PCI_VENDOR_ID_INTEL, 0x29c0, | ||
181 | PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, | ||
182 | #endif | ||
117 | {} | 183 | {} |
118 | }; | 184 | }; |
119 | 185 | ||
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index ff9e7350da54..34ad997d3834 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c | |||
@@ -3,11 +3,19 @@ | |||
3 | #include <linux/init.h> | 3 | #include <linux/init.h> |
4 | #include <linux/string.h> | 4 | #include <linux/string.h> |
5 | #include <linux/screen_info.h> | 5 | #include <linux/screen_info.h> |
6 | #include <linux/usb/ch9.h> | ||
7 | #include <linux/pci_regs.h> | ||
8 | #include <linux/pci_ids.h> | ||
9 | #include <linux/errno.h> | ||
6 | #include <asm/io.h> | 10 | #include <asm/io.h> |
7 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
8 | #include <asm/fcntl.h> | 12 | #include <asm/fcntl.h> |
9 | #include <asm/setup.h> | 13 | #include <asm/setup.h> |
10 | #include <xen/hvc-console.h> | 14 | #include <xen/hvc-console.h> |
15 | #include <asm/pci-direct.h> | ||
16 | #include <asm/pgtable.h> | ||
17 | #include <asm/fixmap.h> | ||
18 | #include <linux/usb/ehci_def.h> | ||
11 | 19 | ||
12 | /* Simple VGA output */ | 20 | /* Simple VGA output */ |
13 | #define VGABASE (__ISA_IO_base + 0xb8000) | 21 | #define VGABASE (__ISA_IO_base + 0xb8000) |
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ | |||
78 | static int early_serial_putc(unsigned char ch) | 86 | static int early_serial_putc(unsigned char ch) |
79 | { | 87 | { |
80 | unsigned timeout = 0xffff; | 88 | unsigned timeout = 0xffff; |
89 | |||
81 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) | 90 | while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) |
82 | cpu_relax(); | 91 | cpu_relax(); |
83 | outb(ch, early_serial_base + TXR); | 92 | outb(ch, early_serial_base + TXR); |
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s) | |||
111 | if (!strncmp(s, "0x", 2)) { | 120 | if (!strncmp(s, "0x", 2)) { |
112 | early_serial_base = simple_strtoul(s, &e, 16); | 121 | early_serial_base = simple_strtoul(s, &e, 16); |
113 | } else { | 122 | } else { |
114 | static int bases[] = { 0x3f8, 0x2f8 }; | 123 | static const int __initconst bases[] = { 0x3f8, 0x2f8 }; |
115 | 124 | ||
116 | if (!strncmp(s, "ttyS", 4)) | 125 | if (!strncmp(s, "ttyS", 4)) |
117 | s += 4; | 126 | s += 4; |
@@ -151,6 +160,721 @@ static struct console early_serial_console = { | |||
151 | .index = -1, | 160 | .index = -1, |
152 | }; | 161 | }; |
153 | 162 | ||
163 | #ifdef CONFIG_EARLY_PRINTK_DBGP | ||
164 | |||
165 | static struct ehci_caps __iomem *ehci_caps; | ||
166 | static struct ehci_regs __iomem *ehci_regs; | ||
167 | static struct ehci_dbg_port __iomem *ehci_debug; | ||
168 | static unsigned int dbgp_endpoint_out; | ||
169 | |||
170 | struct ehci_dev { | ||
171 | u32 bus; | ||
172 | u32 slot; | ||
173 | u32 func; | ||
174 | }; | ||
175 | |||
176 | static struct ehci_dev ehci_dev; | ||
177 | |||
178 | #define USB_DEBUG_DEVNUM 127 | ||
179 | |||
180 | #define DBGP_DATA_TOGGLE 0x8800 | ||
181 | |||
182 | static inline u32 dbgp_pid_update(u32 x, u32 tok) | ||
183 | { | ||
184 | return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); | ||
185 | } | ||
186 | |||
187 | static inline u32 dbgp_len_update(u32 x, u32 len) | ||
188 | { | ||
189 | return (x & ~0x0f) | (len & 0x0f); | ||
190 | } | ||
191 | |||
/*
 * USB Packet IDs (PIDs)
 */

/* Token packets */
#define USB_PID_OUT		0xe1
#define USB_PID_IN		0x69
#define USB_PID_SOF		0xa5
#define USB_PID_SETUP		0x2d

/* Handshake packets */
#define USB_PID_ACK		0xd2
#define USB_PID_NAK		0x5a
#define USB_PID_STALL		0x1e
#define USB_PID_NYET		0x96

/* Data packets */
#define USB_PID_DATA0		0xc3
#define USB_PID_DATA1		0x4b
#define USB_PID_DATA2		0x87
#define USB_PID_MDATA		0x0f

/* Special packets; PREAMBLE and ERR are defined to the same value */
#define USB_PID_PREAMBLE	0x3c
#define USB_PID_ERR		0x3c
#define USB_PID_SPLIT		0x78
#define USB_PID_PING		0xb4
#define USB_PID_UNDEF_0		0xf0

#define USB_PID_DATA_TOGGLE	0x88

/* Control bits that must all be set for the debug port to be ours */
#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)

/* PCI capability ID searched for by __find_dbgp() */
#define PCI_CAP_ID_EHCI_DEBUG	0xa

/* Port reset timing; all values are in msec */
#define HUB_ROOT_RESET_TIME	50
#define HUB_SHORT_RESET_TIME	10
#define HUB_LONG_RESET_TIME	200
#define HUB_RESET_TIMEOUT	500

/* Largest payload, in bytes, accepted by the transfer helpers */
#define DBGP_MAX_PACKET		8
229 | |||
230 | static int dbgp_wait_until_complete(void) | ||
231 | { | ||
232 | u32 ctrl; | ||
233 | int loop = 0x100000; | ||
234 | |||
235 | do { | ||
236 | ctrl = readl(&ehci_debug->control); | ||
237 | /* Stop when the transaction is finished */ | ||
238 | if (ctrl & DBGP_DONE) | ||
239 | break; | ||
240 | } while (--loop > 0); | ||
241 | |||
242 | if (!loop) | ||
243 | return -1; | ||
244 | |||
245 | /* | ||
246 | * Now that we have observed the completed transaction, | ||
247 | * clear the done bit. | ||
248 | */ | ||
249 | writel(ctrl | DBGP_DONE, &ehci_debug->control); | ||
250 | return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); | ||
251 | } | ||
252 | |||
/*
 * Crude busy-wait: issue 1000 writes to I/O port 0x80 for every
 * requested unit of @ms.
 */
static void dbgp_mdelay(int ms)
{
	int n;

	for (; ms; ms--) {
		n = 0;
		while (n < 1000) {
			outb(0x1, 0x80);
			n++;
		}
	}
}
262 | |||
/*
 * Hook called when the device answers NAK or NYET, meant as a pause
 * that lets the debug port catch up. Currently a no-op.
 */
static void dbgp_breath(void)
{
}
267 | |||
268 | static int dbgp_wait_until_done(unsigned ctrl) | ||
269 | { | ||
270 | u32 pids, lpid; | ||
271 | int ret; | ||
272 | int loop = 3; | ||
273 | |||
274 | retry: | ||
275 | writel(ctrl | DBGP_GO, &ehci_debug->control); | ||
276 | ret = dbgp_wait_until_complete(); | ||
277 | pids = readl(&ehci_debug->pids); | ||
278 | lpid = DBGP_PID_GET(pids); | ||
279 | |||
280 | if (ret < 0) | ||
281 | return ret; | ||
282 | |||
283 | /* | ||
284 | * If the port is getting full or it has dropped data | ||
285 | * start pacing ourselves, not necessary but it's friendly. | ||
286 | */ | ||
287 | if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) | ||
288 | dbgp_breath(); | ||
289 | |||
290 | /* If I get a NACK reissue the transmission */ | ||
291 | if (lpid == USB_PID_NAK) { | ||
292 | if (--loop > 0) | ||
293 | goto retry; | ||
294 | } | ||
295 | |||
296 | return ret; | ||
297 | } | ||
298 | |||
299 | static void dbgp_set_data(const void *buf, int size) | ||
300 | { | ||
301 | const unsigned char *bytes = buf; | ||
302 | u32 lo, hi; | ||
303 | int i; | ||
304 | |||
305 | lo = hi = 0; | ||
306 | for (i = 0; i < 4 && i < size; i++) | ||
307 | lo |= bytes[i] << (8*i); | ||
308 | for (; i < 8 && i < size; i++) | ||
309 | hi |= bytes[i] << (8*(i - 4)); | ||
310 | writel(lo, &ehci_debug->data03); | ||
311 | writel(hi, &ehci_debug->data47); | ||
312 | } | ||
313 | |||
314 | static void dbgp_get_data(void *buf, int size) | ||
315 | { | ||
316 | unsigned char *bytes = buf; | ||
317 | u32 lo, hi; | ||
318 | int i; | ||
319 | |||
320 | lo = readl(&ehci_debug->data03); | ||
321 | hi = readl(&ehci_debug->data47); | ||
322 | for (i = 0; i < 4 && i < size; i++) | ||
323 | bytes[i] = (lo >> (8*i)) & 0xff; | ||
324 | for (; i < 8 && i < size; i++) | ||
325 | bytes[i] = (hi >> (8*(i - 4))) & 0xff; | ||
326 | } | ||
327 | |||
328 | static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, | ||
329 | const char *bytes, int size) | ||
330 | { | ||
331 | u32 pids, addr, ctrl; | ||
332 | int ret; | ||
333 | |||
334 | if (size > DBGP_MAX_PACKET) | ||
335 | return -1; | ||
336 | |||
337 | addr = DBGP_EPADDR(devnum, endpoint); | ||
338 | |||
339 | pids = readl(&ehci_debug->pids); | ||
340 | pids = dbgp_pid_update(pids, USB_PID_OUT); | ||
341 | |||
342 | ctrl = readl(&ehci_debug->control); | ||
343 | ctrl = dbgp_len_update(ctrl, size); | ||
344 | ctrl |= DBGP_OUT; | ||
345 | ctrl |= DBGP_GO; | ||
346 | |||
347 | dbgp_set_data(bytes, size); | ||
348 | writel(addr, &ehci_debug->address); | ||
349 | writel(pids, &ehci_debug->pids); | ||
350 | |||
351 | ret = dbgp_wait_until_done(ctrl); | ||
352 | if (ret < 0) | ||
353 | return ret; | ||
354 | |||
355 | return ret; | ||
356 | } | ||
357 | |||
358 | static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, | ||
359 | int size) | ||
360 | { | ||
361 | u32 pids, addr, ctrl; | ||
362 | int ret; | ||
363 | |||
364 | if (size > DBGP_MAX_PACKET) | ||
365 | return -1; | ||
366 | |||
367 | addr = DBGP_EPADDR(devnum, endpoint); | ||
368 | |||
369 | pids = readl(&ehci_debug->pids); | ||
370 | pids = dbgp_pid_update(pids, USB_PID_IN); | ||
371 | |||
372 | ctrl = readl(&ehci_debug->control); | ||
373 | ctrl = dbgp_len_update(ctrl, size); | ||
374 | ctrl &= ~DBGP_OUT; | ||
375 | ctrl |= DBGP_GO; | ||
376 | |||
377 | writel(addr, &ehci_debug->address); | ||
378 | writel(pids, &ehci_debug->pids); | ||
379 | ret = dbgp_wait_until_done(ctrl); | ||
380 | if (ret < 0) | ||
381 | return ret; | ||
382 | |||
383 | if (size > ret) | ||
384 | size = ret; | ||
385 | dbgp_get_data(data, size); | ||
386 | return ret; | ||
387 | } | ||
388 | |||
389 | static int dbgp_control_msg(unsigned devnum, int requesttype, int request, | ||
390 | int value, int index, void *data, int size) | ||
391 | { | ||
392 | u32 pids, addr, ctrl; | ||
393 | struct usb_ctrlrequest req; | ||
394 | int read; | ||
395 | int ret; | ||
396 | |||
397 | read = (requesttype & USB_DIR_IN) != 0; | ||
398 | if (size > (read ? DBGP_MAX_PACKET:0)) | ||
399 | return -1; | ||
400 | |||
401 | /* Compute the control message */ | ||
402 | req.bRequestType = requesttype; | ||
403 | req.bRequest = request; | ||
404 | req.wValue = cpu_to_le16(value); | ||
405 | req.wIndex = cpu_to_le16(index); | ||
406 | req.wLength = cpu_to_le16(size); | ||
407 | |||
408 | pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); | ||
409 | addr = DBGP_EPADDR(devnum, 0); | ||
410 | |||
411 | ctrl = readl(&ehci_debug->control); | ||
412 | ctrl = dbgp_len_update(ctrl, sizeof(req)); | ||
413 | ctrl |= DBGP_OUT; | ||
414 | ctrl |= DBGP_GO; | ||
415 | |||
416 | /* Send the setup message */ | ||
417 | dbgp_set_data(&req, sizeof(req)); | ||
418 | writel(addr, &ehci_debug->address); | ||
419 | writel(pids, &ehci_debug->pids); | ||
420 | ret = dbgp_wait_until_done(ctrl); | ||
421 | if (ret < 0) | ||
422 | return ret; | ||
423 | |||
424 | /* Read the result */ | ||
425 | return dbgp_bulk_read(devnum, 0, data, size); | ||
426 | } | ||
427 | |||
428 | |||
429 | /* Find a PCI capability */ | ||
430 | static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) | ||
431 | { | ||
432 | u8 pos; | ||
433 | int bytes; | ||
434 | |||
435 | if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & | ||
436 | PCI_STATUS_CAP_LIST)) | ||
437 | return 0; | ||
438 | |||
439 | pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); | ||
440 | for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { | ||
441 | u8 id; | ||
442 | |||
443 | pos &= ~3; | ||
444 | id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); | ||
445 | if (id == 0xff) | ||
446 | break; | ||
447 | if (id == cap) | ||
448 | return pos; | ||
449 | |||
450 | pos = read_pci_config_byte(num, slot, func, | ||
451 | pos+PCI_CAP_LIST_NEXT); | ||
452 | } | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) | ||
457 | { | ||
458 | u32 class; | ||
459 | |||
460 | class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); | ||
461 | if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) | ||
462 | return 0; | ||
463 | |||
464 | return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); | ||
465 | } | ||
466 | |||
467 | static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) | ||
468 | { | ||
469 | u32 bus, slot, func; | ||
470 | |||
471 | for (bus = 0; bus < 256; bus++) { | ||
472 | for (slot = 0; slot < 32; slot++) { | ||
473 | for (func = 0; func < 8; func++) { | ||
474 | unsigned cap; | ||
475 | |||
476 | cap = __find_dbgp(bus, slot, func); | ||
477 | |||
478 | if (!cap) | ||
479 | continue; | ||
480 | if (ehci_num-- != 0) | ||
481 | continue; | ||
482 | *rbus = bus; | ||
483 | *rslot = slot; | ||
484 | *rfunc = func; | ||
485 | return cap; | ||
486 | } | ||
487 | } | ||
488 | } | ||
489 | return 0; | ||
490 | } | ||
491 | |||
492 | static int ehci_reset_port(int port) | ||
493 | { | ||
494 | u32 portsc; | ||
495 | u32 delay_time, delay; | ||
496 | int loop; | ||
497 | |||
498 | /* Reset the usb debug port */ | ||
499 | portsc = readl(&ehci_regs->port_status[port - 1]); | ||
500 | portsc &= ~PORT_PE; | ||
501 | portsc |= PORT_RESET; | ||
502 | writel(portsc, &ehci_regs->port_status[port - 1]); | ||
503 | |||
504 | delay = HUB_ROOT_RESET_TIME; | ||
505 | for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; | ||
506 | delay_time += delay) { | ||
507 | dbgp_mdelay(delay); | ||
508 | |||
509 | portsc = readl(&ehci_regs->port_status[port - 1]); | ||
510 | if (portsc & PORT_RESET) { | ||
511 | /* force reset to complete */ | ||
512 | loop = 2; | ||
513 | writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), | ||
514 | &ehci_regs->port_status[port - 1]); | ||
515 | do { | ||
516 | portsc = readl(&ehci_regs->port_status[port-1]); | ||
517 | } while ((portsc & PORT_RESET) && (--loop > 0)); | ||
518 | } | ||
519 | |||
520 | /* Device went away? */ | ||
521 | if (!(portsc & PORT_CONNECT)) | ||
522 | return -ENOTCONN; | ||
523 | |||
524 | /* bomb out completely if something weird happend */ | ||
525 | if ((portsc & PORT_CSC)) | ||
526 | return -EINVAL; | ||
527 | |||
528 | /* If we've finished resetting, then break out of the loop */ | ||
529 | if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) | ||
530 | return 0; | ||
531 | } | ||
532 | return -EBUSY; | ||
533 | } | ||
534 | |||
535 | static int ehci_wait_for_port(int port) | ||
536 | { | ||
537 | u32 status; | ||
538 | int ret, reps; | ||
539 | |||
540 | for (reps = 0; reps < 3; reps++) { | ||
541 | dbgp_mdelay(100); | ||
542 | status = readl(&ehci_regs->status); | ||
543 | if (status & STS_PCD) { | ||
544 | ret = ehci_reset_port(port); | ||
545 | if (ret == 0) | ||
546 | return 0; | ||
547 | } | ||
548 | } | ||
549 | return -ENOTCONN; | ||
550 | } | ||
551 | |||
/*
 * Tracing for the debug port code: forwarded to early_printk() when
 * DBGP_DEBUG is defined, an empty function otherwise.
 */
#ifndef DBGP_DEBUG
static inline void dbgp_printk(const char *fmt, ...)
{
}
#else
# define dbgp_printk early_printk
#endif
557 | |||
/* Hook used to make the controller use a different debug port */
typedef void (*set_debug_port_t)(int port);

/* Default hook: does nothing */
static void default_set_debug_port(int port)
{
}

/* Active hook; may be replaced by detect_set_debug_port() */
static set_debug_port_t set_debug_port = default_set_debug_port;
565 | |||
566 | static void nvidia_set_debug_port(int port) | ||
567 | { | ||
568 | u32 dword; | ||
569 | dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, | ||
570 | 0x74); | ||
571 | dword &= ~(0x0f<<12); | ||
572 | dword |= ((port & 0x0f)<<12); | ||
573 | write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, | ||
574 | dword); | ||
575 | dbgp_printk("set debug port to %d\n", port); | ||
576 | } | ||
577 | |||
578 | static void __init detect_set_debug_port(void) | ||
579 | { | ||
580 | u32 vendorid; | ||
581 | |||
582 | vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, | ||
583 | 0x00); | ||
584 | |||
585 | if ((vendorid & 0xffff) == 0x10de) { | ||
586 | dbgp_printk("using nvidia set_debug_port\n"); | ||
587 | set_debug_port = nvidia_set_debug_port; | ||
588 | } | ||
589 | } | ||
590 | |||
591 | static int __init ehci_setup(void) | ||
592 | { | ||
593 | struct usb_debug_descriptor dbgp_desc; | ||
594 | u32 cmd, ctrl, status, portsc, hcs_params; | ||
595 | u32 debug_port, new_debug_port = 0, n_ports; | ||
596 | u32 devnum; | ||
597 | int ret, i; | ||
598 | int loop; | ||
599 | int port_map_tried; | ||
600 | int playtimes = 3; | ||
601 | |||
602 | try_next_time: | ||
603 | port_map_tried = 0; | ||
604 | |||
605 | try_next_port: | ||
606 | |||
607 | hcs_params = readl(&ehci_caps->hcs_params); | ||
608 | debug_port = HCS_DEBUG_PORT(hcs_params); | ||
609 | n_ports = HCS_N_PORTS(hcs_params); | ||
610 | |||
611 | dbgp_printk("debug_port: %d\n", debug_port); | ||
612 | dbgp_printk("n_ports: %d\n", n_ports); | ||
613 | |||
614 | for (i = 1; i <= n_ports; i++) { | ||
615 | portsc = readl(&ehci_regs->port_status[i-1]); | ||
616 | dbgp_printk("portstatus%d: %08x\n", i, portsc); | ||
617 | } | ||
618 | |||
619 | if (port_map_tried && (new_debug_port != debug_port)) { | ||
620 | if (--playtimes) { | ||
621 | set_debug_port(new_debug_port); | ||
622 | goto try_next_time; | ||
623 | } | ||
624 | return -1; | ||
625 | } | ||
626 | |||
627 | loop = 10; | ||
628 | /* Reset the EHCI controller */ | ||
629 | cmd = readl(&ehci_regs->command); | ||
630 | cmd |= CMD_RESET; | ||
631 | writel(cmd, &ehci_regs->command); | ||
632 | do { | ||
633 | cmd = readl(&ehci_regs->command); | ||
634 | } while ((cmd & CMD_RESET) && (--loop > 0)); | ||
635 | |||
636 | if (!loop) { | ||
637 | dbgp_printk("can not reset ehci\n"); | ||
638 | return -1; | ||
639 | } | ||
640 | dbgp_printk("ehci reset done\n"); | ||
641 | |||
642 | /* Claim ownership, but do not enable yet */ | ||
643 | ctrl = readl(&ehci_debug->control); | ||
644 | ctrl |= DBGP_OWNER; | ||
645 | ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); | ||
646 | writel(ctrl, &ehci_debug->control); | ||
647 | |||
648 | /* Start the ehci running */ | ||
649 | cmd = readl(&ehci_regs->command); | ||
650 | cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); | ||
651 | cmd |= CMD_RUN; | ||
652 | writel(cmd, &ehci_regs->command); | ||
653 | |||
654 | /* Ensure everything is routed to the EHCI */ | ||
655 | writel(FLAG_CF, &ehci_regs->configured_flag); | ||
656 | |||
657 | /* Wait until the controller is no longer halted */ | ||
658 | loop = 10; | ||
659 | do { | ||
660 | status = readl(&ehci_regs->status); | ||
661 | } while ((status & STS_HALT) && (--loop > 0)); | ||
662 | |||
663 | if (!loop) { | ||
664 | dbgp_printk("ehci can be started\n"); | ||
665 | return -1; | ||
666 | } | ||
667 | dbgp_printk("ehci started\n"); | ||
668 | |||
669 | /* Wait for a device to show up in the debug port */ | ||
670 | ret = ehci_wait_for_port(debug_port); | ||
671 | if (ret < 0) { | ||
672 | dbgp_printk("No device found in debug port\n"); | ||
673 | goto next_debug_port; | ||
674 | } | ||
675 | dbgp_printk("ehci wait for port done\n"); | ||
676 | |||
677 | /* Enable the debug port */ | ||
678 | ctrl = readl(&ehci_debug->control); | ||
679 | ctrl |= DBGP_CLAIM; | ||
680 | writel(ctrl, &ehci_debug->control); | ||
681 | ctrl = readl(&ehci_debug->control); | ||
682 | if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { | ||
683 | dbgp_printk("No device in debug port\n"); | ||
684 | writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); | ||
685 | goto err; | ||
686 | } | ||
687 | dbgp_printk("debug ported enabled\n"); | ||
688 | |||
689 | /* Completely transfer the debug device to the debug controller */ | ||
690 | portsc = readl(&ehci_regs->port_status[debug_port - 1]); | ||
691 | portsc &= ~PORT_PE; | ||
692 | writel(portsc, &ehci_regs->port_status[debug_port - 1]); | ||
693 | |||
694 | dbgp_mdelay(100); | ||
695 | |||
696 | /* Find the debug device and make it device number 127 */ | ||
697 | for (devnum = 0; devnum <= 127; devnum++) { | ||
698 | ret = dbgp_control_msg(devnum, | ||
699 | USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, | ||
700 | USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, | ||
701 | &dbgp_desc, sizeof(dbgp_desc)); | ||
702 | if (ret > 0) | ||
703 | break; | ||
704 | } | ||
705 | if (devnum > 127) { | ||
706 | dbgp_printk("Could not find attached debug device\n"); | ||
707 | goto err; | ||
708 | } | ||
709 | if (ret < 0) { | ||
710 | dbgp_printk("Attached device is not a debug device\n"); | ||
711 | goto err; | ||
712 | } | ||
713 | dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; | ||
714 | |||
715 | /* Move the device to 127 if it isn't already there */ | ||
716 | if (devnum != USB_DEBUG_DEVNUM) { | ||
717 | ret = dbgp_control_msg(devnum, | ||
718 | USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, | ||
719 | USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); | ||
720 | if (ret < 0) { | ||
721 | dbgp_printk("Could not move attached device to %d\n", | ||
722 | USB_DEBUG_DEVNUM); | ||
723 | goto err; | ||
724 | } | ||
725 | devnum = USB_DEBUG_DEVNUM; | ||
726 | dbgp_printk("debug device renamed to 127\n"); | ||
727 | } | ||
728 | |||
729 | /* Enable the debug interface */ | ||
730 | ret = dbgp_control_msg(USB_DEBUG_DEVNUM, | ||
731 | USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, | ||
732 | USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); | ||
733 | if (ret < 0) { | ||
734 | dbgp_printk(" Could not enable the debug device\n"); | ||
735 | goto err; | ||
736 | } | ||
737 | dbgp_printk("debug interface enabled\n"); | ||
738 | |||
739 | /* Perform a small write to get the even/odd data state in sync | ||
740 | */ | ||
741 | ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); | ||
742 | if (ret < 0) { | ||
743 | dbgp_printk("dbgp_bulk_write failed: %d\n", ret); | ||
744 | goto err; | ||
745 | } | ||
746 | dbgp_printk("small write doned\n"); | ||
747 | |||
748 | return 0; | ||
749 | err: | ||
750 | /* Things didn't work so remove my claim */ | ||
751 | ctrl = readl(&ehci_debug->control); | ||
752 | ctrl &= ~(DBGP_CLAIM | DBGP_OUT); | ||
753 | writel(ctrl, &ehci_debug->control); | ||
754 | return -1; | ||
755 | |||
756 | next_debug_port: | ||
757 | port_map_tried |= (1<<(debug_port - 1)); | ||
758 | new_debug_port = ((debug_port-1+1)%n_ports) + 1; | ||
759 | if (port_map_tried != ((1<<n_ports) - 1)) { | ||
760 | set_debug_port(new_debug_port); | ||
761 | goto try_next_port; | ||
762 | } | ||
763 | if (--playtimes) { | ||
764 | set_debug_port(new_debug_port); | ||
765 | goto try_next_time; | ||
766 | } | ||
767 | |||
768 | return -1; | ||
769 | } | ||
770 | |||
771 | static int __init early_dbgp_init(char *s) | ||
772 | { | ||
773 | u32 debug_port, bar, offset; | ||
774 | u32 bus, slot, func, cap; | ||
775 | void __iomem *ehci_bar; | ||
776 | u32 dbgp_num; | ||
777 | u32 bar_val; | ||
778 | char *e; | ||
779 | int ret; | ||
780 | u8 byte; | ||
781 | |||
782 | if (!early_pci_allowed()) | ||
783 | return -1; | ||
784 | |||
785 | dbgp_num = 0; | ||
786 | if (*s) | ||
787 | dbgp_num = simple_strtoul(s, &e, 10); | ||
788 | dbgp_printk("dbgp_num: %d\n", dbgp_num); | ||
789 | |||
790 | cap = find_dbgp(dbgp_num, &bus, &slot, &func); | ||
791 | if (!cap) | ||
792 | return -1; | ||
793 | |||
794 | dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot, | ||
795 | func); | ||
796 | |||
797 | debug_port = read_pci_config(bus, slot, func, cap); | ||
798 | bar = (debug_port >> 29) & 0x7; | ||
799 | bar = (bar * 4) + 0xc; | ||
800 | offset = (debug_port >> 16) & 0xfff; | ||
801 | dbgp_printk("bar: %02x offset: %03x\n", bar, offset); | ||
802 | if (bar != PCI_BASE_ADDRESS_0) { | ||
803 | dbgp_printk("only debug ports on bar 1 handled.\n"); | ||
804 | |||
805 | return -1; | ||
806 | } | ||
807 | |||
808 | bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); | ||
809 | dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); | ||
810 | if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { | ||
811 | dbgp_printk("only simple 32bit mmio bars supported\n"); | ||
812 | |||
813 | return -1; | ||
814 | } | ||
815 | |||
816 | /* double check if the mem space is enabled */ | ||
817 | byte = read_pci_config_byte(bus, slot, func, 0x04); | ||
818 | if (!(byte & 0x2)) { | ||
819 | byte |= 0x02; | ||
820 | write_pci_config_byte(bus, slot, func, 0x04, byte); | ||
821 | dbgp_printk("mmio for ehci enabled\n"); | ||
822 | } | ||
823 | |||
824 | /* | ||
825 | * FIXME I don't have the bar size so just guess PAGE_SIZE is more | ||
826 | * than enough. 1K is the biggest I have seen. | ||
827 | */ | ||
828 | set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); | ||
829 | ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); | ||
830 | ehci_bar += bar_val & ~PAGE_MASK; | ||
831 | dbgp_printk("ehci_bar: %p\n", ehci_bar); | ||
832 | |||
833 | ehci_caps = ehci_bar; | ||
834 | ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); | ||
835 | ehci_debug = ehci_bar + offset; | ||
836 | ehci_dev.bus = bus; | ||
837 | ehci_dev.slot = slot; | ||
838 | ehci_dev.func = func; | ||
839 | |||
840 | detect_set_debug_port(); | ||
841 | |||
842 | ret = ehci_setup(); | ||
843 | if (ret < 0) { | ||
844 | dbgp_printk("ehci_setup failed\n"); | ||
845 | ehci_debug = NULL; | ||
846 | |||
847 | return -1; | ||
848 | } | ||
849 | |||
850 | return 0; | ||
851 | } | ||
852 | |||
853 | static void early_dbgp_write(struct console *con, const char *str, u32 n) | ||
854 | { | ||
855 | int chunk, ret; | ||
856 | |||
857 | if (!ehci_debug) | ||
858 | return; | ||
859 | while (n > 0) { | ||
860 | chunk = n; | ||
861 | if (chunk > DBGP_MAX_PACKET) | ||
862 | chunk = DBGP_MAX_PACKET; | ||
863 | ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, | ||
864 | dbgp_endpoint_out, str, chunk); | ||
865 | str += chunk; | ||
866 | n -= chunk; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | static struct console early_dbgp_console = { | ||
871 | .name = "earlydbg", | ||
872 | .write = early_dbgp_write, | ||
873 | .flags = CON_PRINTBUFFER, | ||
874 | .index = -1, | ||
875 | }; | ||
876 | #endif | ||
877 | |||
154 | /* Console interface to a host file on AMD's SimNow! */ | 878 | /* Console interface to a host file on AMD's SimNow! */ |
155 | 879 | ||
156 | static int simnow_fd; | 880 | static int simnow_fd; |
@@ -165,6 +889,7 @@ enum { | |||
165 | static noinline long simnow(long cmd, long a, long b, long c) | 889 | static noinline long simnow(long cmd, long a, long b, long c) |
166 | { | 890 | { |
167 | long ret; | 891 | long ret; |
892 | |||
168 | asm volatile("cpuid" : | 893 | asm volatile("cpuid" : |
169 | "=a" (ret) : | 894 | "=a" (ret) : |
170 | "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); | 895 | "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); |
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c) | |||
174 | static void __init simnow_init(char *str) | 899 | static void __init simnow_init(char *str) |
175 | { | 900 | { |
176 | char *fn = "klog"; | 901 | char *fn = "klog"; |
902 | |||
177 | if (*str == '=') | 903 | if (*str == '=') |
178 | fn = ++str; | 904 | fn = ++str; |
179 | /* error ignored */ | 905 | /* error ignored */ |
@@ -194,7 +920,7 @@ static struct console simnow_console = { | |||
194 | 920 | ||
195 | /* Direct interface for emergencies */ | 921 | /* Direct interface for emergencies */ |
196 | static struct console *early_console = &early_vga_console; | 922 | static struct console *early_console = &early_vga_console; |
197 | static int early_console_initialized; | 923 | static int __initdata early_console_initialized; |
198 | 924 | ||
199 | asmlinkage void early_printk(const char *fmt, ...) | 925 | asmlinkage void early_printk(const char *fmt, ...) |
200 | { | 926 | { |
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...) | |||
208 | va_end(ap); | 934 | va_end(ap); |
209 | } | 935 | } |
210 | 936 | ||
211 | static int __initdata keep_early; | ||
212 | 937 | ||
213 | static int __init setup_early_printk(char *buf) | 938 | static int __init setup_early_printk(char *buf) |
214 | { | 939 | { |
940 | int keep_early; | ||
941 | |||
215 | if (!buf) | 942 | if (!buf) |
216 | return 0; | 943 | return 0; |
217 | 944 | ||
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf) | |||
219 | return 0; | 946 | return 0; |
220 | early_console_initialized = 1; | 947 | early_console_initialized = 1; |
221 | 948 | ||
222 | if (strstr(buf, "keep")) | 949 | keep_early = (strstr(buf, "keep") != NULL); |
223 | keep_early = 1; | ||
224 | 950 | ||
225 | if (!strncmp(buf, "serial", 6)) { | 951 | if (!strncmp(buf, "serial", 6)) { |
226 | early_serial_init(buf + 6); | 952 | early_serial_init(buf + 6); |
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf) | |||
238 | simnow_init(buf + 6); | 964 | simnow_init(buf + 6); |
239 | early_console = &simnow_console; | 965 | early_console = &simnow_console; |
240 | keep_early = 1; | 966 | keep_early = 1; |
967 | #ifdef CONFIG_EARLY_PRINTK_DBGP | ||
968 | } else if (!strncmp(buf, "dbgp", 4)) { | ||
969 | if (early_dbgp_init(buf+4) < 0) | ||
970 | return 0; | ||
971 | early_console = &early_dbgp_console; | ||
972 | /* | ||
973 | * usb subsys will reset ehci controller, so don't keep | ||
974 | * that early console | ||
975 | */ | ||
976 | keep_early = 0; | ||
977 | #endif | ||
241 | #ifdef CONFIG_HVC_XEN | 978 | #ifdef CONFIG_HVC_XEN |
242 | } else if (!strncmp(buf, "xen", 3)) { | 979 | } else if (!strncmp(buf, "xen", 3)) { |
243 | early_console = &xenboot_console; | 980 | early_console = &xenboot_console; |
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf) | |||
251 | register_console(early_console); | 988 | register_console(early_console); |
252 | return 0; | 989 | return 0; |
253 | } | 990 | } |
991 | |||
254 | early_param("earlyprintk", setup_early_printk); | 992 | early_param("earlyprintk", setup_early_printk); |
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 06cc8d4254b1..945a31cdd81f 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
@@ -414,9 +414,11 @@ void __init efi_init(void) | |||
414 | if (memmap.map == NULL) | 414 | if (memmap.map == NULL) |
415 | printk(KERN_ERR "Could not map the EFI memory map!\n"); | 415 | printk(KERN_ERR "Could not map the EFI memory map!\n"); |
416 | memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); | 416 | memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); |
417 | |||
417 | if (memmap.desc_size != sizeof(efi_memory_desc_t)) | 418 | if (memmap.desc_size != sizeof(efi_memory_desc_t)) |
418 | printk(KERN_WARNING "Kernel-defined memdesc" | 419 | printk(KERN_WARNING |
419 | "doesn't match the one from EFI!\n"); | 420 | "Kernel-defined memdesc doesn't match the one from EFI!\n"); |
421 | |||
420 | if (add_efi_memmap) | 422 | if (add_efi_memmap) |
421 | do_add_efi_memmap(); | 423 | do_add_efi_memmap(); |
422 | 424 | ||
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792bc7cfa..b21fbfaffe39 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -730,6 +730,7 @@ error_code: | |||
730 | movl $(__USER_DS), %ecx | 730 | movl $(__USER_DS), %ecx |
731 | movl %ecx, %ds | 731 | movl %ecx, %ds |
732 | movl %ecx, %es | 732 | movl %ecx, %es |
733 | TRACE_IRQS_OFF | ||
733 | movl %esp,%eax # pt_regs pointer | 734 | movl %esp,%eax # pt_regs pointer |
734 | call *%edi | 735 | call *%edi |
735 | jmp ret_from_exception | 736 | jmp ret_from_exception |
@@ -760,20 +761,9 @@ ENTRY(device_not_available) | |||
760 | RING0_INT_FRAME | 761 | RING0_INT_FRAME |
761 | pushl $-1 # mark this as an int | 762 | pushl $-1 # mark this as an int |
762 | CFI_ADJUST_CFA_OFFSET 4 | 763 | CFI_ADJUST_CFA_OFFSET 4 |
763 | SAVE_ALL | 764 | pushl $do_device_not_available |
764 | GET_CR0_INTO_EAX | ||
765 | testl $0x4, %eax # EM (math emulation bit) | ||
766 | jne device_not_available_emulate | ||
767 | preempt_stop(CLBR_ANY) | ||
768 | call math_state_restore | ||
769 | jmp ret_from_exception | ||
770 | device_not_available_emulate: | ||
771 | pushl $0 # temporary storage for ORIG_EIP | ||
772 | CFI_ADJUST_CFA_OFFSET 4 | 765 | CFI_ADJUST_CFA_OFFSET 4 |
773 | call math_emulate | 766 | jmp error_code |
774 | addl $4, %esp | ||
775 | CFI_ADJUST_CFA_OFFSET -4 | ||
776 | jmp ret_from_exception | ||
777 | CFI_ENDPROC | 767 | CFI_ENDPROC |
778 | END(device_not_available) | 768 | END(device_not_available) |
779 | 769 | ||
@@ -814,6 +804,7 @@ debug_stack_correct: | |||
814 | pushl $-1 # mark this as an int | 804 | pushl $-1 # mark this as an int |
815 | CFI_ADJUST_CFA_OFFSET 4 | 805 | CFI_ADJUST_CFA_OFFSET 4 |
816 | SAVE_ALL | 806 | SAVE_ALL |
807 | TRACE_IRQS_OFF | ||
817 | xorl %edx,%edx # error code 0 | 808 | xorl %edx,%edx # error code 0 |
818 | movl %esp,%eax # pt_regs pointer | 809 | movl %esp,%eax # pt_regs pointer |
819 | call do_debug | 810 | call do_debug |
@@ -858,6 +849,7 @@ nmi_stack_correct: | |||
858 | pushl %eax | 849 | pushl %eax |
859 | CFI_ADJUST_CFA_OFFSET 4 | 850 | CFI_ADJUST_CFA_OFFSET 4 |
860 | SAVE_ALL | 851 | SAVE_ALL |
852 | TRACE_IRQS_OFF | ||
861 | xorl %edx,%edx # zero error code | 853 | xorl %edx,%edx # zero error code |
862 | movl %esp,%eax # pt_regs pointer | 854 | movl %esp,%eax # pt_regs pointer |
863 | call do_nmi | 855 | call do_nmi |
@@ -898,6 +890,7 @@ nmi_espfix_stack: | |||
898 | pushl %eax | 890 | pushl %eax |
899 | CFI_ADJUST_CFA_OFFSET 4 | 891 | CFI_ADJUST_CFA_OFFSET 4 |
900 | SAVE_ALL | 892 | SAVE_ALL |
893 | TRACE_IRQS_OFF | ||
901 | FIXUP_ESPFIX_STACK # %eax == %esp | 894 | FIXUP_ESPFIX_STACK # %eax == %esp |
902 | xorl %edx,%edx # zero error code | 895 | xorl %edx,%edx # zero error code |
903 | call do_nmi | 896 | call do_nmi |
@@ -928,6 +921,7 @@ KPROBE_ENTRY(int3) | |||
928 | pushl $-1 # mark this as an int | 921 | pushl $-1 # mark this as an int |
929 | CFI_ADJUST_CFA_OFFSET 4 | 922 | CFI_ADJUST_CFA_OFFSET 4 |
930 | SAVE_ALL | 923 | SAVE_ALL |
924 | TRACE_IRQS_OFF | ||
931 | xorl %edx,%edx # zero error code | 925 | xorl %edx,%edx # zero error code |
932 | movl %esp,%eax # pt_regs pointer | 926 | movl %esp,%eax # pt_regs pointer |
933 | call do_int3 | 927 | call do_int3 |
@@ -1030,7 +1024,7 @@ ENTRY(machine_check) | |||
1030 | RING0_INT_FRAME | 1024 | RING0_INT_FRAME |
1031 | pushl $0 | 1025 | pushl $0 |
1032 | CFI_ADJUST_CFA_OFFSET 4 | 1026 | CFI_ADJUST_CFA_OFFSET 4 |
1033 | pushl machine_check_vector | 1027 | pushl $do_machine_check |
1034 | CFI_ADJUST_CFA_OFFSET 4 | 1028 | CFI_ADJUST_CFA_OFFSET 4 |
1035 | jmp error_code | 1029 | jmp error_code |
1036 | CFI_ENDPROC | 1030 | CFI_ENDPROC |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89434d439605..1db6ce4314e1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64) | |||
275 | ENTRY(ret_from_fork) | 275 | ENTRY(ret_from_fork) |
276 | CFI_DEFAULT_STACK | 276 | CFI_DEFAULT_STACK |
277 | push kernel_eflags(%rip) | 277 | push kernel_eflags(%rip) |
278 | CFI_ADJUST_CFA_OFFSET 4 | 278 | CFI_ADJUST_CFA_OFFSET 8 |
279 | popf # reset kernel eflags | 279 | popf # reset kernel eflags |
280 | CFI_ADJUST_CFA_OFFSET -4 | 280 | CFI_ADJUST_CFA_OFFSET -8 |
281 | call schedule_tail | 281 | call schedule_tail |
282 | GET_THREAD_INFO(%rcx) | 282 | GET_THREAD_INFO(%rcx) |
283 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) | 283 | testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) |
@@ -667,6 +667,13 @@ END(stub_rt_sigreturn) | |||
667 | SAVE_ARGS | 667 | SAVE_ARGS |
668 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler | 668 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler |
669 | pushq %rbp | 669 | pushq %rbp |
670 | /* | ||
671 | * Save rbp twice: One is for marking the stack frame, as usual, and the | ||
672 | * other, to fill pt_regs properly. This is because bx comes right | ||
673 | * before the last saved register in that structure, and not bp. If the | ||
674 | * base pointer were in the place bx is today, this would not be needed. | ||
675 | */ | ||
676 | movq %rbp, -8(%rsp) | ||
670 | CFI_ADJUST_CFA_OFFSET 8 | 677 | CFI_ADJUST_CFA_OFFSET 8 |
671 | CFI_REL_OFFSET rbp, 0 | 678 | CFI_REL_OFFSET rbp, 0 |
672 | movq %rsp,%rbp | 679 | movq %rsp,%rbp |
@@ -932,6 +939,9 @@ END(spurious_interrupt) | |||
932 | .if \ist | 939 | .if \ist |
933 | movq %gs:pda_data_offset, %rbp | 940 | movq %gs:pda_data_offset, %rbp |
934 | .endif | 941 | .endif |
942 | .if \irqtrace | ||
943 | TRACE_IRQS_OFF | ||
944 | .endif | ||
935 | movq %rsp,%rdi | 945 | movq %rsp,%rdi |
936 | movq ORIG_RAX(%rsp),%rsi | 946 | movq ORIG_RAX(%rsp),%rsi |
937 | movq $-1,ORIG_RAX(%rsp) | 947 | movq $-1,ORIG_RAX(%rsp) |
@@ -1058,7 +1068,8 @@ KPROBE_ENTRY(error_entry) | |||
1058 | je error_kernelspace | 1068 | je error_kernelspace |
1059 | error_swapgs: | 1069 | error_swapgs: |
1060 | SWAPGS | 1070 | SWAPGS |
1061 | error_sti: | 1071 | error_sti: |
1072 | TRACE_IRQS_OFF | ||
1062 | movq %rdi,RDI(%rsp) | 1073 | movq %rdi,RDI(%rsp) |
1063 | CFI_REL_OFFSET rdi,RDI | 1074 | CFI_REL_OFFSET rdi,RDI |
1064 | movq %rsp,%rdi | 1075 | movq %rsp,%rdi |
@@ -1232,7 +1243,7 @@ ENTRY(simd_coprocessor_error) | |||
1232 | END(simd_coprocessor_error) | 1243 | END(simd_coprocessor_error) |
1233 | 1244 | ||
1234 | ENTRY(device_not_available) | 1245 | ENTRY(device_not_available) |
1235 | zeroentry math_state_restore | 1246 | zeroentry do_device_not_available |
1236 | END(device_not_available) | 1247 | END(device_not_available) |
1237 | 1248 | ||
1238 | /* runs on exception stack */ | 1249 | /* runs on exception stack */ |
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c index 50189af14b85..f454c78fcef6 100644 --- a/arch/x86/mach-es7000/es7000plat.c +++ b/arch/x86/kernel/es7000_32.c | |||
@@ -39,10 +39,94 @@ | |||
39 | #include <asm/nmi.h> | 39 | #include <asm/nmi.h> |
40 | #include <asm/smp.h> | 40 | #include <asm/smp.h> |
41 | #include <asm/apicdef.h> | 41 | #include <asm/apicdef.h> |
42 | #include "es7000.h" | ||
43 | #include <mach_mpparse.h> | 42 | #include <mach_mpparse.h> |
44 | 43 | ||
45 | /* | 44 | /* |
45 | * ES7000 chipsets | ||
46 | */ | ||
47 | |||
48 | #define NON_UNISYS 0 | ||
49 | #define ES7000_CLASSIC 1 | ||
50 | #define ES7000_ZORRO 2 | ||
51 | |||
52 | |||
53 | #define MIP_REG 1 | ||
54 | #define MIP_PSAI_REG 4 | ||
55 | |||
56 | #define MIP_BUSY 1 | ||
57 | #define MIP_SPIN 0xf0000 | ||
58 | #define MIP_VALID 0x0100000000000000ULL | ||
59 | #define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) | ||
60 | |||
61 | #define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) | ||
62 | |||
63 | struct mip_reg_info { | ||
64 | unsigned long long mip_info; | ||
65 | unsigned long long delivery_info; | ||
66 | unsigned long long host_reg; | ||
67 | unsigned long long mip_reg; | ||
68 | }; | ||
69 | |||
70 | struct part_info { | ||
71 | unsigned char type; | ||
72 | unsigned char length; | ||
73 | unsigned char part_id; | ||
74 | unsigned char apic_mode; | ||
75 | unsigned long snum; | ||
76 | char ptype[16]; | ||
77 | char sname[64]; | ||
78 | char pname[64]; | ||
79 | }; | ||
80 | |||
81 | struct psai { | ||
82 | unsigned long long entry_type; | ||
83 | unsigned long long addr; | ||
84 | unsigned long long bep_addr; | ||
85 | }; | ||
86 | |||
87 | struct es7000_mem_info { | ||
88 | unsigned char type; | ||
89 | unsigned char length; | ||
90 | unsigned char resv[6]; | ||
91 | unsigned long long start; | ||
92 | unsigned long long size; | ||
93 | }; | ||
94 | |||
95 | struct es7000_oem_table { | ||
96 | unsigned long long hdr; | ||
97 | struct mip_reg_info mip; | ||
98 | struct part_info pif; | ||
99 | struct es7000_mem_info shm; | ||
100 | struct psai psai; | ||
101 | }; | ||
102 | |||
103 | #ifdef CONFIG_ACPI | ||
104 | |||
105 | struct oem_table { | ||
106 | struct acpi_table_header Header; | ||
107 | u32 OEMTableAddr; | ||
108 | u32 OEMTableSize; | ||
109 | }; | ||
110 | |||
111 | extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); | ||
112 | extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); | ||
113 | #endif | ||
114 | |||
115 | struct mip_reg { | ||
116 | unsigned long long off_0; | ||
117 | unsigned long long off_8; | ||
118 | unsigned long long off_10; | ||
119 | unsigned long long off_18; | ||
120 | unsigned long long off_20; | ||
121 | unsigned long long off_28; | ||
122 | unsigned long long off_30; | ||
123 | unsigned long long off_38; | ||
124 | }; | ||
125 | |||
126 | #define MIP_SW_APIC 0x1020b | ||
127 | #define MIP_FUNC(VALUE) (VALUE & 0xff) | ||
128 | |||
129 | /* | ||
46 | * ES7000 Globals | 130 | * ES7000 Globals |
47 | */ | 131 | */ |
48 | 132 | ||
@@ -72,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi) | |||
72 | base += nr_ioapic_registers[i]; | 156 | base += nr_ioapic_registers[i]; |
73 | } | 157 | } |
74 | 158 | ||
75 | if (!ioapic && (gsi < 16)) | 159 | if (!ioapic && (gsi < 16)) |
76 | gsi += base; | 160 | gsi += base; |
77 | return gsi; | 161 | return gsi; |
78 | } | 162 | } |
@@ -160,21 +244,38 @@ parse_unisys_oem (char *oemptr) | |||
160 | } | 244 | } |
161 | 245 | ||
162 | #ifdef CONFIG_ACPI | 246 | #ifdef CONFIG_ACPI |
163 | int __init | 247 | static unsigned long oem_addrX; |
164 | find_unisys_acpi_oem_table(unsigned long *oem_addr) | 248 | static unsigned long oem_size; |
249 | int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) | ||
165 | { | 250 | { |
166 | struct acpi_table_header *header = NULL; | 251 | struct acpi_table_header *header = NULL; |
167 | int i = 0; | 252 | int i = 0; |
168 | while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { | 253 | acpi_size tbl_size; |
254 | |||
255 | while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) { | ||
169 | if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { | 256 | if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { |
170 | struct oem_table *t = (struct oem_table *)header; | 257 | struct oem_table *t = (struct oem_table *)header; |
171 | *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, | 258 | |
172 | t->OEMTableSize); | 259 | oem_addrX = t->OEMTableAddr; |
260 | oem_size = t->OEMTableSize; | ||
261 | early_acpi_os_unmap_memory(header, tbl_size); | ||
262 | |||
263 | *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, | ||
264 | oem_size); | ||
173 | return 0; | 265 | return 0; |
174 | } | 266 | } |
267 | early_acpi_os_unmap_memory(header, tbl_size); | ||
175 | } | 268 | } |
176 | return -1; | 269 | return -1; |
177 | } | 270 | } |
271 | |||
272 | void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) | ||
273 | { | ||
274 | if (!oem_addr) | ||
275 | return; | ||
276 | |||
277 | __acpi_unmap_table((char *)oem_addr, oem_size); | ||
278 | } | ||
178 | #endif | 279 | #endif |
179 | 280 | ||
180 | static void | 281 | static void |
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index eaff0bbb1444..6c9bfc9e1e95 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c | |||
@@ -16,87 +16,63 @@ | |||
16 | #include <linux/ctype.h> | 16 | #include <linux/ctype.h> |
17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
18 | #include <linux/hardirq.h> | 18 | #include <linux/hardirq.h> |
19 | #include <linux/dmar.h> | ||
19 | 20 | ||
20 | #include <asm/smp.h> | 21 | #include <asm/smp.h> |
21 | #include <asm/ipi.h> | 22 | #include <asm/ipi.h> |
22 | #include <asm/genapic.h> | 23 | #include <asm/genapic.h> |
23 | 24 | ||
24 | #ifdef CONFIG_ACPI | 25 | extern struct genapic apic_flat; |
25 | #include <acpi/acpi_bus.h> | 26 | extern struct genapic apic_physflat; |
26 | #endif | 27 | extern struct genapic apic_x2xpic_uv_x; |
27 | 28 | extern struct genapic apic_x2apic_phys; | |
28 | DEFINE_PER_CPU(int, x2apic_extra_bits); | 29 | extern struct genapic apic_x2apic_cluster; |
29 | 30 | ||
30 | struct genapic __read_mostly *genapic = &apic_flat; | 31 | struct genapic __read_mostly *genapic = &apic_flat; |
31 | 32 | ||
32 | static enum uv_system_type uv_system_type; | 33 | static struct genapic *apic_probe[] __initdata = { |
34 | &apic_x2apic_uv_x, | ||
35 | &apic_x2apic_phys, | ||
36 | &apic_x2apic_cluster, | ||
37 | &apic_physflat, | ||
38 | NULL, | ||
39 | }; | ||
33 | 40 | ||
34 | /* | 41 | /* |
35 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. | 42 | * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. |
36 | */ | 43 | */ |
37 | void __init setup_apic_routing(void) | 44 | void __init setup_apic_routing(void) |
38 | { | 45 | { |
39 | if (uv_system_type == UV_NON_UNIQUE_APIC) | 46 | if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { |
40 | genapic = &apic_x2apic_uv_x; | 47 | if (!intr_remapping_enabled) |
41 | else | 48 | genapic = &apic_flat; |
42 | #ifdef CONFIG_ACPI | 49 | } |
43 | /* | ||
44 | * Quirk: some x86_64 machines can only use physical APIC mode | ||
45 | * regardless of how many processors are present (x86_64 ES7000 | ||
46 | * is an example). | ||
47 | */ | ||
48 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && | ||
49 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) | ||
50 | genapic = &apic_physflat; | ||
51 | else | ||
52 | #endif | ||
53 | |||
54 | if (max_physical_apicid < 8) | ||
55 | genapic = &apic_flat; | ||
56 | else | ||
57 | genapic = &apic_physflat; | ||
58 | 50 | ||
59 | printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | 51 | if (genapic == &apic_flat) { |
52 | if (max_physical_apicid >= 8) | ||
53 | genapic = &apic_physflat; | ||
54 | printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); | ||
55 | } | ||
60 | } | 56 | } |
61 | 57 | ||
62 | /* Same for both flat and physical. */ | 58 | /* Same for both flat and physical. */ |
63 | 59 | ||
64 | void send_IPI_self(int vector) | 60 | void apic_send_IPI_self(int vector) |
65 | { | 61 | { |
66 | __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); | 62 | __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); |
67 | } | 63 | } |
68 | 64 | ||
69 | int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 65 | int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
70 | { | 66 | { |
71 | if (!strcmp(oem_id, "SGI")) { | 67 | int i; |
72 | if (!strcmp(oem_table_id, "UVL")) | 68 | |
73 | uv_system_type = UV_LEGACY_APIC; | 69 | for (i = 0; apic_probe[i]; ++i) { |
74 | else if (!strcmp(oem_table_id, "UVX")) | 70 | if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { |
75 | uv_system_type = UV_X2APIC; | 71 | genapic = apic_probe[i]; |
76 | else if (!strcmp(oem_table_id, "UVH")) | 72 | printk(KERN_INFO "Setting APIC routing to %s.\n", |
77 | uv_system_type = UV_NON_UNIQUE_APIC; | 73 | genapic->name); |
74 | return 1; | ||
75 | } | ||
78 | } | 76 | } |
79 | return 0; | 77 | return 0; |
80 | } | 78 | } |
81 | |||
82 | unsigned int read_apic_id(void) | ||
83 | { | ||
84 | unsigned int id; | ||
85 | |||
86 | WARN_ON(preemptible() && num_online_cpus() > 1); | ||
87 | id = apic_read(APIC_ID); | ||
88 | if (uv_system_type >= UV_X2APIC) | ||
89 | id |= __get_cpu_var(x2apic_extra_bits); | ||
90 | return id; | ||
91 | } | ||
92 | |||
93 | enum uv_system_type get_uv_system_type(void) | ||
94 | { | ||
95 | return uv_system_type; | ||
96 | } | ||
97 | |||
98 | int is_uv_system(void) | ||
99 | { | ||
100 | return uv_system_type != UV_NONE; | ||
101 | } | ||
102 | EXPORT_SYMBOL_GPL(is_uv_system); | ||
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 786548a62d38..9eca5ba7a6b1 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c | |||
@@ -15,9 +15,20 @@ | |||
15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
16 | #include <linux/ctype.h> | 16 | #include <linux/ctype.h> |
17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
18 | #include <linux/hardirq.h> | ||
18 | #include <asm/smp.h> | 19 | #include <asm/smp.h> |
19 | #include <asm/ipi.h> | 20 | #include <asm/ipi.h> |
20 | #include <asm/genapic.h> | 21 | #include <asm/genapic.h> |
22 | #include <mach_apicdef.h> | ||
23 | |||
24 | #ifdef CONFIG_ACPI | ||
25 | #include <acpi/acpi_bus.h> | ||
26 | #endif | ||
27 | |||
28 | static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | ||
29 | { | ||
30 | return 1; | ||
31 | } | ||
21 | 32 | ||
22 | static cpumask_t flat_target_cpus(void) | 33 | static cpumask_t flat_target_cpus(void) |
23 | { | 34 | { |
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector) | |||
95 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); | 106 | __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); |
96 | } | 107 | } |
97 | 108 | ||
109 | static unsigned int get_apic_id(unsigned long x) | ||
110 | { | ||
111 | unsigned int id; | ||
112 | |||
113 | id = (((x)>>24) & 0xFFu); | ||
114 | return id; | ||
115 | } | ||
116 | |||
117 | static unsigned long set_apic_id(unsigned int id) | ||
118 | { | ||
119 | unsigned long x; | ||
120 | |||
121 | x = ((id & 0xFFu)<<24); | ||
122 | return x; | ||
123 | } | ||
124 | |||
125 | static unsigned int read_xapic_id(void) | ||
126 | { | ||
127 | unsigned int id; | ||
128 | |||
129 | id = get_apic_id(apic_read(APIC_ID)); | ||
130 | return id; | ||
131 | } | ||
132 | |||
98 | static int flat_apic_id_registered(void) | 133 | static int flat_apic_id_registered(void) |
99 | { | 134 | { |
100 | return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); | 135 | return physid_isset(read_xapic_id(), phys_cpu_present_map); |
101 | } | 136 | } |
102 | 137 | ||
103 | static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) | 138 | static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) |
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb) | |||
112 | 147 | ||
113 | struct genapic apic_flat = { | 148 | struct genapic apic_flat = { |
114 | .name = "flat", | 149 | .name = "flat", |
150 | .acpi_madt_oem_check = flat_acpi_madt_oem_check, | ||
115 | .int_delivery_mode = dest_LowestPrio, | 151 | .int_delivery_mode = dest_LowestPrio, |
116 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), | 152 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), |
117 | .target_cpus = flat_target_cpus, | 153 | .target_cpus = flat_target_cpus, |
@@ -121,8 +157,12 @@ struct genapic apic_flat = { | |||
121 | .send_IPI_all = flat_send_IPI_all, | 157 | .send_IPI_all = flat_send_IPI_all, |
122 | .send_IPI_allbutself = flat_send_IPI_allbutself, | 158 | .send_IPI_allbutself = flat_send_IPI_allbutself, |
123 | .send_IPI_mask = flat_send_IPI_mask, | 159 | .send_IPI_mask = flat_send_IPI_mask, |
160 | .send_IPI_self = apic_send_IPI_self, | ||
124 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, | 161 | .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, |
125 | .phys_pkg_id = phys_pkg_id, | 162 | .phys_pkg_id = phys_pkg_id, |
163 | .get_apic_id = get_apic_id, | ||
164 | .set_apic_id = set_apic_id, | ||
165 | .apic_id_mask = (0xFFu<<24), | ||
126 | }; | 166 | }; |
127 | 167 | ||
128 | /* | 168 | /* |
@@ -130,6 +170,21 @@ struct genapic apic_flat = { | |||
130 | * We cannot use logical delivery in this case because the mask | 170 | * We cannot use logical delivery in this case because the mask |
131 | * overflows, so use physical mode. | 171 | * overflows, so use physical mode. |
132 | */ | 172 | */ |
173 | static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | ||
174 | { | ||
175 | #ifdef CONFIG_ACPI | ||
176 | /* | ||
177 | * Quirk: some x86_64 machines can only use physical APIC mode | ||
178 | * regardless of how many processors are present (x86_64 ES7000 | ||
179 | * is an example). | ||
180 | */ | ||
181 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && | ||
182 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) | ||
183 | return 1; | ||
184 | #endif | ||
185 | |||
186 | return 0; | ||
187 | } | ||
133 | 188 | ||
134 | static cpumask_t physflat_target_cpus(void) | 189 | static cpumask_t physflat_target_cpus(void) |
135 | { | 190 | { |
@@ -176,6 +231,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) | |||
176 | 231 | ||
177 | struct genapic apic_physflat = { | 232 | struct genapic apic_physflat = { |
178 | .name = "physical flat", | 233 | .name = "physical flat", |
234 | .acpi_madt_oem_check = physflat_acpi_madt_oem_check, | ||
179 | .int_delivery_mode = dest_Fixed, | 235 | .int_delivery_mode = dest_Fixed, |
180 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | 236 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), |
181 | .target_cpus = physflat_target_cpus, | 237 | .target_cpus = physflat_target_cpus, |
@@ -185,6 +241,10 @@ struct genapic apic_physflat = { | |||
185 | .send_IPI_all = physflat_send_IPI_all, | 241 | .send_IPI_all = physflat_send_IPI_all, |
186 | .send_IPI_allbutself = physflat_send_IPI_allbutself, | 242 | .send_IPI_allbutself = physflat_send_IPI_allbutself, |
187 | .send_IPI_mask = physflat_send_IPI_mask, | 243 | .send_IPI_mask = physflat_send_IPI_mask, |
244 | .send_IPI_self = apic_send_IPI_self, | ||
188 | .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, | 245 | .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, |
189 | .phys_pkg_id = phys_pkg_id, | 246 | .phys_pkg_id = phys_pkg_id, |
247 | .get_apic_id = get_apic_id, | ||
248 | .set_apic_id = set_apic_id, | ||
249 | .apic_id_mask = (0xFFu<<24), | ||
190 | }; | 250 | }; |
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c new file mode 100644 index 000000000000..e4bf2cc0d743 --- /dev/null +++ b/arch/x86/kernel/genx2apic_cluster.c | |||
@@ -0,0 +1,159 @@ | |||
1 | #include <linux/threads.h> | ||
2 | #include <linux/cpumask.h> | ||
3 | #include <linux/string.h> | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/ctype.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/dmar.h> | ||
8 | |||
9 | #include <asm/smp.h> | ||
10 | #include <asm/ipi.h> | ||
11 | #include <asm/genapic.h> | ||
12 | |||
13 | DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); | ||
14 | |||
15 | static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | ||
16 | { | ||
17 | if (cpu_has_x2apic) | ||
18 | return 1; | ||
19 | |||
20 | return 0; | ||
21 | } | ||
22 | |||
23 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | ||
24 | |||
25 | static cpumask_t x2apic_target_cpus(void) | ||
26 | { | ||
27 | return cpumask_of_cpu(0); | ||
28 | } | ||
29 | |||
30 | /* | ||
31 | * for now each logical cpu is in its own vector allocation domain. | ||
32 | */ | ||
33 | static cpumask_t x2apic_vector_allocation_domain(int cpu) | ||
34 | { | ||
35 | cpumask_t domain = CPU_MASK_NONE; | ||
36 | cpu_set(cpu, domain); | ||
37 | return domain; | ||
38 | } | ||
39 | |||
40 | static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, | ||
41 | unsigned int dest) | ||
42 | { | ||
43 | unsigned long cfg; | ||
44 | |||
45 | cfg = __prepare_ICR(0, vector, dest); | ||
46 | |||
47 | /* | ||
48 | * send the IPI. | ||
49 | */ | ||
50 | x2apic_icr_write(cfg, apicid); | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * For now, we send the IPIs one by one to each cpu in the cpumask. | ||
55 | * TBD: Based on the cpu mask, we can send the IPIs to the cluster group | ||
56 | * at once. We have 16 cpus in a cluster. This will minimize IPI register | ||
57 | * writes. | ||
58 | */ | ||
59 | static void x2apic_send_IPI_mask(cpumask_t mask, int vector) | ||
60 | { | ||
61 | unsigned long flags; | ||
62 | unsigned long query_cpu; | ||
63 | |||
64 | local_irq_save(flags); | ||
65 | for_each_cpu_mask(query_cpu, mask) { | ||
66 | __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), | ||
67 | vector, APIC_DEST_LOGICAL); | ||
68 | } | ||
69 | local_irq_restore(flags); | ||
70 | } | ||
71 | |||
72 | static void x2apic_send_IPI_allbutself(int vector) | ||
73 | { | ||
74 | cpumask_t mask = cpu_online_map; | ||
75 | |||
76 | cpu_clear(smp_processor_id(), mask); | ||
77 | |||
78 | if (!cpus_empty(mask)) | ||
79 | x2apic_send_IPI_mask(mask, vector); | ||
80 | } | ||
81 | |||
82 | static void x2apic_send_IPI_all(int vector) | ||
83 | { | ||
84 | x2apic_send_IPI_mask(cpu_online_map, vector); | ||
85 | } | ||
86 | |||
87 | static int x2apic_apic_id_registered(void) | ||
88 | { | ||
89 | return 1; | ||
90 | } | ||
91 | |||
92 | static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) | ||
93 | { | ||
94 | int cpu; | ||
95 | |||
96 | /* | ||
97 | * Only a single destination is returned: the logical APIC ID of the | ||
98 | * first cpu in the mask. | ||
99 | */ | ||
100 | cpu = first_cpu(cpumask); | ||
101 | if ((unsigned)cpu < NR_CPUS) | ||
102 | return per_cpu(x86_cpu_to_logical_apicid, cpu); | ||
103 | else | ||
104 | return BAD_APICID; | ||
105 | } | ||
106 | |||
107 | static unsigned int get_apic_id(unsigned long x) | ||
108 | { | ||
109 | unsigned int id; | ||
110 | |||
111 | id = x; | ||
112 | return id; | ||
113 | } | ||
114 | |||
115 | static unsigned long set_apic_id(unsigned int id) | ||
116 | { | ||
117 | unsigned long x; | ||
118 | |||
119 | x = id; | ||
120 | return x; | ||
121 | } | ||
122 | |||
123 | static unsigned int phys_pkg_id(int index_msb) | ||
124 | { | ||
125 | return current_cpu_data.initial_apicid >> index_msb; | ||
126 | } | ||
127 | |||
128 | static void x2apic_send_IPI_self(int vector) | ||
129 | { | ||
130 | apic_write(APIC_SELF_IPI, vector); | ||
131 | } | ||
132 | |||
133 | static void init_x2apic_ldr(void) | ||
134 | { | ||
135 | int cpu = smp_processor_id(); | ||
136 | |||
137 | per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); | ||
138 | return; | ||
139 | } | ||
140 | |||
141 | struct genapic apic_x2apic_cluster = { | ||
142 | .name = "cluster x2apic", | ||
143 | .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, | ||
144 | .int_delivery_mode = dest_LowestPrio, | ||
145 | .int_dest_mode = (APIC_DEST_LOGICAL != 0), | ||
146 | .target_cpus = x2apic_target_cpus, | ||
147 | .vector_allocation_domain = x2apic_vector_allocation_domain, | ||
148 | .apic_id_registered = x2apic_apic_id_registered, | ||
149 | .init_apic_ldr = init_x2apic_ldr, | ||
150 | .send_IPI_all = x2apic_send_IPI_all, | ||
151 | .send_IPI_allbutself = x2apic_send_IPI_allbutself, | ||
152 | .send_IPI_mask = x2apic_send_IPI_mask, | ||
153 | .send_IPI_self = x2apic_send_IPI_self, | ||
154 | .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, | ||
155 | .phys_pkg_id = phys_pkg_id, | ||
156 | .get_apic_id = get_apic_id, | ||
157 | .set_apic_id = set_apic_id, | ||
158 | .apic_id_mask = (0xFFFFFFFFu), | ||
159 | }; | ||
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c new file mode 100644 index 000000000000..8f1343df2627 --- /dev/null +++ b/arch/x86/kernel/genx2apic_phys.c | |||
@@ -0,0 +1,154 @@ | |||
1 | #include <linux/threads.h> | ||
2 | #include <linux/cpumask.h> | ||
3 | #include <linux/string.h> | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/ctype.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/dmar.h> | ||
8 | |||
9 | #include <asm/smp.h> | ||
10 | #include <asm/ipi.h> | ||
11 | #include <asm/genapic.h> | ||
12 | |||
13 | static int x2apic_phys; | ||
14 | |||
15 | static int set_x2apic_phys_mode(char *arg) | ||
16 | { | ||
17 | x2apic_phys = 1; | ||
18 | return 0; | ||
19 | } | ||
20 | early_param("x2apic_phys", set_x2apic_phys_mode); | ||
21 | |||
22 | static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | ||
23 | { | ||
24 | if (cpu_has_x2apic && x2apic_phys) | ||
25 | return 1; | ||
26 | |||
27 | return 0; | ||
28 | } | ||
29 | |||
30 | /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ | ||
31 | |||
32 | static cpumask_t x2apic_target_cpus(void) | ||
33 | { | ||
34 | return cpumask_of_cpu(0); | ||
35 | } | ||
36 | |||
37 | static cpumask_t x2apic_vector_allocation_domain(int cpu) | ||
38 | { | ||
39 | cpumask_t domain = CPU_MASK_NONE; | ||
40 | cpu_set(cpu, domain); | ||
41 | return domain; | ||
42 | } | ||
43 | |||
44 | static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, | ||
45 | unsigned int dest) | ||
46 | { | ||
47 | unsigned long cfg; | ||
48 | |||
49 | cfg = __prepare_ICR(0, vector, dest); | ||
50 | |||
51 | /* | ||
52 | * send the IPI. | ||
53 | */ | ||
54 | x2apic_icr_write(cfg, apicid); | ||
55 | } | ||
56 | |||
57 | static void x2apic_send_IPI_mask(cpumask_t mask, int vector) | ||
58 | { | ||
59 | unsigned long flags; | ||
60 | unsigned long query_cpu; | ||
61 | |||
62 | local_irq_save(flags); | ||
63 | for_each_cpu_mask(query_cpu, mask) { | ||
64 | __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), | ||
65 | vector, APIC_DEST_PHYSICAL); | ||
66 | } | ||
67 | local_irq_restore(flags); | ||
68 | } | ||
69 | |||
70 | static void x2apic_send_IPI_allbutself(int vector) | ||
71 | { | ||
72 | cpumask_t mask = cpu_online_map; | ||
73 | |||
74 | cpu_clear(smp_processor_id(), mask); | ||
75 | |||
76 | if (!cpus_empty(mask)) | ||
77 | x2apic_send_IPI_mask(mask, vector); | ||
78 | } | ||
79 | |||
80 | static void x2apic_send_IPI_all(int vector) | ||
81 | { | ||
82 | x2apic_send_IPI_mask(cpu_online_map, vector); | ||
83 | } | ||
84 | |||
85 | static int x2apic_apic_id_registered(void) | ||
86 | { | ||
87 | return 1; | ||
88 | } | ||
89 | |||
90 | static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) | ||
91 | { | ||
92 | int cpu; | ||
93 | |||
94 | /* | ||
95 | * We're using fixed IRQ delivery, can only return one phys APIC ID. | ||
96 | * May as well be the first. | ||
97 | */ | ||
98 | cpu = first_cpu(cpumask); | ||
99 | if ((unsigned)cpu < NR_CPUS) | ||
100 | return per_cpu(x86_cpu_to_apicid, cpu); | ||
101 | else | ||
102 | return BAD_APICID; | ||
103 | } | ||
104 | |||
105 | static unsigned int get_apic_id(unsigned long x) | ||
106 | { | ||
107 | unsigned int id; | ||
108 | |||
109 | id = x; | ||
110 | return id; | ||
111 | } | ||
112 | |||
113 | static unsigned long set_apic_id(unsigned int id) | ||
114 | { | ||
115 | unsigned long x; | ||
116 | |||
117 | x = id; | ||
118 | return x; | ||
119 | } | ||
120 | |||
121 | static unsigned int phys_pkg_id(int index_msb) | ||
122 | { | ||
123 | return current_cpu_data.initial_apicid >> index_msb; | ||
124 | } | ||
125 | |||
126 | void x2apic_send_IPI_self(int vector) | ||
127 | { | ||
128 | apic_write(APIC_SELF_IPI, vector); | ||
129 | } | ||
130 | |||
131 | void init_x2apic_ldr(void) | ||
132 | { | ||
133 | return; | ||
134 | } | ||
135 | |||
136 | struct genapic apic_x2apic_phys = { | ||
137 | .name = "physical x2apic", | ||
138 | .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, | ||
139 | .int_delivery_mode = dest_Fixed, | ||
140 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | ||
141 | .target_cpus = x2apic_target_cpus, | ||
142 | .vector_allocation_domain = x2apic_vector_allocation_domain, | ||
143 | .apic_id_registered = x2apic_apic_id_registered, | ||
144 | .init_apic_ldr = init_x2apic_ldr, | ||
145 | .send_IPI_all = x2apic_send_IPI_all, | ||
146 | .send_IPI_allbutself = x2apic_send_IPI_allbutself, | ||
147 | .send_IPI_mask = x2apic_send_IPI_mask, | ||
148 | .send_IPI_self = x2apic_send_IPI_self, | ||
149 | .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, | ||
150 | .phys_pkg_id = phys_pkg_id, | ||
151 | .get_apic_id = get_apic_id, | ||
152 | .set_apic_id = set_apic_id, | ||
153 | .apic_id_mask = (0xFFFFFFFFu), | ||
154 | }; | ||
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index bfa837cb16be..33581d94a90e 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c | |||
@@ -12,12 +12,12 @@ | |||
12 | #include <linux/threads.h> | 12 | #include <linux/threads.h> |
13 | #include <linux/cpumask.h> | 13 | #include <linux/cpumask.h> |
14 | #include <linux/string.h> | 14 | #include <linux/string.h> |
15 | #include <linux/kernel.h> | ||
16 | #include <linux/ctype.h> | 15 | #include <linux/ctype.h> |
17 | #include <linux/init.h> | 16 | #include <linux/init.h> |
18 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
20 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/hardirq.h> | ||
21 | #include <asm/smp.h> | 21 | #include <asm/smp.h> |
22 | #include <asm/ipi.h> | 22 | #include <asm/ipi.h> |
23 | #include <asm/genapic.h> | 23 | #include <asm/genapic.h> |
@@ -26,6 +26,36 @@ | |||
26 | #include <asm/uv/uv_hub.h> | 26 | #include <asm/uv/uv_hub.h> |
27 | #include <asm/uv/bios.h> | 27 | #include <asm/uv/bios.h> |
28 | 28 | ||
29 | DEFINE_PER_CPU(int, x2apic_extra_bits); | ||
30 | |||
31 | static enum uv_system_type uv_system_type; | ||
32 | |||
33 | static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | ||
34 | { | ||
35 | if (!strcmp(oem_id, "SGI")) { | ||
36 | if (!strcmp(oem_table_id, "UVL")) | ||
37 | uv_system_type = UV_LEGACY_APIC; | ||
38 | else if (!strcmp(oem_table_id, "UVX")) | ||
39 | uv_system_type = UV_X2APIC; | ||
40 | else if (!strcmp(oem_table_id, "UVH")) { | ||
41 | uv_system_type = UV_NON_UNIQUE_APIC; | ||
42 | return 1; | ||
43 | } | ||
44 | } | ||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | enum uv_system_type get_uv_system_type(void) | ||
49 | { | ||
50 | return uv_system_type; | ||
51 | } | ||
52 | |||
53 | int is_uv_system(void) | ||
54 | { | ||
55 | return uv_system_type != UV_NONE; | ||
56 | } | ||
57 | EXPORT_SYMBOL_GPL(is_uv_system); | ||
58 | |||
29 | DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); | 59 | DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); |
30 | EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); | 60 | EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); |
31 | 61 | ||
@@ -84,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector) | |||
84 | unsigned long val, apicid, lapicid; | 114 | unsigned long val, apicid, lapicid; |
85 | int pnode; | 115 | int pnode; |
86 | 116 | ||
87 | apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ | 117 | apicid = per_cpu(x86_cpu_to_apicid, cpu); |
88 | lapicid = apicid & 0x3f; /* ZZZ macro needed */ | 118 | lapicid = apicid & 0x3f; /* ZZZ macro needed */ |
89 | pnode = uv_apicid_to_pnode(apicid); | 119 | pnode = uv_apicid_to_pnode(apicid); |
90 | val = | 120 | val = |
@@ -123,6 +153,10 @@ static int uv_apic_id_registered(void) | |||
123 | return 1; | 153 | return 1; |
124 | } | 154 | } |
125 | 155 | ||
156 | static void uv_init_apic_ldr(void) | ||
157 | { | ||
158 | } | ||
159 | |||
126 | static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) | 160 | static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) |
127 | { | 161 | { |
128 | int cpu; | 162 | int cpu; |
@@ -138,31 +172,59 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) | |||
138 | return BAD_APICID; | 172 | return BAD_APICID; |
139 | } | 173 | } |
140 | 174 | ||
175 | static unsigned int get_apic_id(unsigned long x) | ||
176 | { | ||
177 | unsigned int id; | ||
178 | |||
179 | WARN_ON(preemptible() && num_online_cpus() > 1); | ||
180 | id = x | __get_cpu_var(x2apic_extra_bits); | ||
181 | |||
182 | return id; | ||
183 | } | ||
184 | |||
185 | static unsigned long set_apic_id(unsigned int id) | ||
186 | { | ||
187 | unsigned long x; | ||
188 | |||
189 | /* maskout x2apic_extra_bits ? */ | ||
190 | x = id; | ||
191 | return x; | ||
192 | } | ||
193 | |||
194 | static unsigned int uv_read_apic_id(void) | ||
195 | { | ||
196 | |||
197 | return get_apic_id(apic_read(APIC_ID)); | ||
198 | } | ||
199 | |||
141 | static unsigned int phys_pkg_id(int index_msb) | 200 | static unsigned int phys_pkg_id(int index_msb) |
142 | { | 201 | { |
143 | return GET_APIC_ID(read_apic_id()) >> index_msb; | 202 | return uv_read_apic_id() >> index_msb; |
144 | } | 203 | } |
145 | 204 | ||
146 | #ifdef ZZZ /* Needs x2apic patch */ | ||
147 | static void uv_send_IPI_self(int vector) | 205 | static void uv_send_IPI_self(int vector) |
148 | { | 206 | { |
149 | apic_write(APIC_SELF_IPI, vector); | 207 | apic_write(APIC_SELF_IPI, vector); |
150 | } | 208 | } |
151 | #endif | ||
152 | 209 | ||
153 | struct genapic apic_x2apic_uv_x = { | 210 | struct genapic apic_x2apic_uv_x = { |
154 | .name = "UV large system", | 211 | .name = "UV large system", |
212 | .acpi_madt_oem_check = uv_acpi_madt_oem_check, | ||
155 | .int_delivery_mode = dest_Fixed, | 213 | .int_delivery_mode = dest_Fixed, |
156 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | 214 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), |
157 | .target_cpus = uv_target_cpus, | 215 | .target_cpus = uv_target_cpus, |
158 | .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ | 216 | .vector_allocation_domain = uv_vector_allocation_domain, |
159 | .apic_id_registered = uv_apic_id_registered, | 217 | .apic_id_registered = uv_apic_id_registered, |
218 | .init_apic_ldr = uv_init_apic_ldr, | ||
160 | .send_IPI_all = uv_send_IPI_all, | 219 | .send_IPI_all = uv_send_IPI_all, |
161 | .send_IPI_allbutself = uv_send_IPI_allbutself, | 220 | .send_IPI_allbutself = uv_send_IPI_allbutself, |
162 | .send_IPI_mask = uv_send_IPI_mask, | 221 | .send_IPI_mask = uv_send_IPI_mask, |
163 | /* ZZZ.send_IPI_self = uv_send_IPI_self, */ | 222 | .send_IPI_self = uv_send_IPI_self, |
164 | .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, | 223 | .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, |
165 | .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ | 224 | .phys_pkg_id = phys_pkg_id, |
225 | .get_apic_id = get_apic_id, | ||
226 | .set_apic_id = set_apic_id, | ||
227 | .apic_id_mask = (0xFFFFFFFFu), | ||
166 | }; | 228 | }; |
167 | 229 | ||
168 | static __cpuinit void set_x2apic_extra_bits(int pnode) | 230 | static __cpuinit void set_x2apic_extra_bits(int pnode) |
@@ -222,12 +284,13 @@ static __init void map_low_mmrs(void) | |||
222 | 284 | ||
223 | enum map_type {map_wb, map_uc}; | 285 | enum map_type {map_wb, map_uc}; |
224 | 286 | ||
225 | static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) | 287 | static __init void map_high(char *id, unsigned long base, int shift, |
288 | int max_pnode, enum map_type map_type) | ||
226 | { | 289 | { |
227 | unsigned long bytes, paddr; | 290 | unsigned long bytes, paddr; |
228 | 291 | ||
229 | paddr = base << shift; | 292 | paddr = base << shift; |
230 | bytes = (1UL << shift); | 293 | bytes = (1UL << shift) * (max_pnode + 1); |
231 | printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, | 294 | printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, |
232 | paddr + bytes); | 295 | paddr + bytes); |
233 | if (map_type == map_uc) | 296 | if (map_type == map_uc) |
@@ -243,7 +306,7 @@ static __init void map_gru_high(int max_pnode) | |||
243 | 306 | ||
244 | gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); | 307 | gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); |
245 | if (gru.s.enable) | 308 | if (gru.s.enable) |
246 | map_high("GRU", gru.s.base, shift, map_wb); | 309 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); |
247 | } | 310 | } |
248 | 311 | ||
249 | static __init void map_config_high(int max_pnode) | 312 | static __init void map_config_high(int max_pnode) |
@@ -253,7 +316,7 @@ static __init void map_config_high(int max_pnode) | |||
253 | 316 | ||
254 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); | 317 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); |
255 | if (cfg.s.enable) | 318 | if (cfg.s.enable) |
256 | map_high("CONFIG", cfg.s.base, shift, map_uc); | 319 | map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); |
257 | } | 320 | } |
258 | 321 | ||
259 | static __init void map_mmr_high(int max_pnode) | 322 | static __init void map_mmr_high(int max_pnode) |
@@ -263,7 +326,7 @@ static __init void map_mmr_high(int max_pnode) | |||
263 | 326 | ||
264 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); | 327 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); |
265 | if (mmr.s.enable) | 328 | if (mmr.s.enable) |
266 | map_high("MMR", mmr.s.base, shift, map_uc); | 329 | map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); |
267 | } | 330 | } |
268 | 331 | ||
269 | static __init void map_mmioh_high(int max_pnode) | 332 | static __init void map_mmioh_high(int max_pnode) |
@@ -273,7 +336,7 @@ static __init void map_mmioh_high(int max_pnode) | |||
273 | 336 | ||
274 | mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); | 337 | mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); |
275 | if (mmioh.s.enable) | 338 | if (mmioh.s.enable) |
276 | map_high("MMIOH", mmioh.s.base, shift, map_uc); | 339 | map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); |
277 | } | 340 | } |
278 | 341 | ||
279 | static __init void uv_rtc_init(void) | 342 | static __init void uv_rtc_init(void) |
@@ -401,3 +464,5 @@ void __cpuinit uv_cpu_init(void) | |||
401 | if (get_uv_system_type() == UV_NON_UNIQUE_APIC) | 464 | if (get_uv_system_type() == UV_NON_UNIQUE_APIC) |
402 | set_x2apic_extra_bits(uv_hub_info->pnode); | 465 | set_x2apic_extra_bits(uv_hub_info->pnode); |
403 | } | 466 | } |
467 | |||
468 | |||
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9d..1dcb0f13897e 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c | |||
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void) | |||
35 | 35 | ||
36 | /* start of EBDA area */ | 36 | /* start of EBDA area */ |
37 | ebda_addr = get_bios_ebda(); | 37 | ebda_addr = get_bios_ebda(); |
38 | printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); | ||
38 | 39 | ||
39 | /* Fixup: bios puts an EBDA in the top 64K segment */ | 40 | /* Fixup: bios puts an EBDA in the top 64K segment */ |
40 | /* of conventional memory, but does not adjust lowmem. */ | 41 | /* of conventional memory, but does not adjust lowmem. */ |
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9bfc4d72fb2e..d16084f90649 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c | |||
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data) | |||
108 | } | 108 | } |
109 | load_idt((const struct desc_ptr *)&idt_descr); | 109 | load_idt((const struct desc_ptr *)&idt_descr); |
110 | 110 | ||
111 | early_printk("Kernel alive\n"); | 111 | if (console_loglevel == 10) |
112 | early_printk("Kernel alive\n"); | ||
112 | 113 | ||
113 | x86_64_init_pda(); | 114 | x86_64_init_pda(); |
114 | 115 | ||
115 | early_printk("Kernel really alive\n"); | ||
116 | |||
117 | x86_64_start_reservations(real_mode_data); | 116 | x86_64_start_reservations(real_mode_data); |
118 | } | 117 | } |
119 | 118 | ||
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index a7010c3a377a..e835b4eea70b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4 | |||
172 | * | 172 | * |
173 | * Note that the stack is not yet set up! | 173 | * Note that the stack is not yet set up! |
174 | */ | 174 | */ |
175 | #define PTE_ATTR 0x007 /* PRESENT+RW+USER */ | ||
176 | #define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ | ||
177 | #define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ | ||
178 | |||
179 | default_entry: | 175 | default_entry: |
180 | #ifdef CONFIG_X86_PAE | 176 | #ifdef CONFIG_X86_PAE |
181 | 177 | ||
@@ -196,9 +192,9 @@ default_entry: | |||
196 | movl $pa(pg0), %edi | 192 | movl $pa(pg0), %edi |
197 | movl %edi, pa(init_pg_tables_start) | 193 | movl %edi, pa(init_pg_tables_start) |
198 | movl $pa(swapper_pg_pmd), %edx | 194 | movl $pa(swapper_pg_pmd), %edx |
199 | movl $PTE_ATTR, %eax | 195 | movl $PTE_IDENT_ATTR, %eax |
200 | 10: | 196 | 10: |
201 | leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ | 197 | leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ |
202 | movl %ecx,(%edx) /* Store PMD entry */ | 198 | movl %ecx,(%edx) /* Store PMD entry */ |
203 | /* Upper half already zero */ | 199 | /* Upper half already zero */ |
204 | addl $8,%edx | 200 | addl $8,%edx |
@@ -215,7 +211,7 @@ default_entry: | |||
215 | * End condition: we must map up to and including INIT_MAP_BEYOND_END | 211 | * End condition: we must map up to and including INIT_MAP_BEYOND_END |
216 | * bytes beyond the end of our own page tables. | 212 | * bytes beyond the end of our own page tables. |
217 | */ | 213 | */ |
218 | leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp | 214 | leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp |
219 | cmpl %ebp,%eax | 215 | cmpl %ebp,%eax |
220 | jb 10b | 216 | jb 10b |
221 | 1: | 217 | 1: |
@@ -224,7 +220,7 @@ default_entry: | |||
224 | movl %eax, pa(max_pfn_mapped) | 220 | movl %eax, pa(max_pfn_mapped) |
225 | 221 | ||
226 | /* Do early initialization of the fixmap area */ | 222 | /* Do early initialization of the fixmap area */ |
227 | movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax | 223 | movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax |
228 | movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) | 224 | movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) |
229 | #else /* Not PAE */ | 225 | #else /* Not PAE */ |
230 | 226 | ||
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
233 | movl $pa(pg0), %edi | 229 | movl $pa(pg0), %edi |
234 | movl %edi, pa(init_pg_tables_start) | 230 | movl %edi, pa(init_pg_tables_start) |
235 | movl $pa(swapper_pg_dir), %edx | 231 | movl $pa(swapper_pg_dir), %edx |
236 | movl $PTE_ATTR, %eax | 232 | movl $PTE_IDENT_ATTR, %eax |
237 | 10: | 233 | 10: |
238 | leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ | 234 | leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ |
239 | movl %ecx,(%edx) /* Store identity PDE entry */ | 235 | movl %ecx,(%edx) /* Store identity PDE entry */ |
240 | movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ | 236 | movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ |
241 | addl $4,%edx | 237 | addl $4,%edx |
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
249 | * bytes beyond the end of our own page tables; the +0x007 is | 245 | * bytes beyond the end of our own page tables; the +0x007 is |
250 | * the attribute bits | 246 | * the attribute bits |
251 | */ | 247 | */ |
252 | leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp | 248 | leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp |
253 | cmpl %ebp,%eax | 249 | cmpl %ebp,%eax |
254 | jb 10b | 250 | jb 10b |
255 | movl %edi,pa(init_pg_tables_end) | 251 | movl %edi,pa(init_pg_tables_end) |
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); | |||
257 | movl %eax, pa(max_pfn_mapped) | 253 | movl %eax, pa(max_pfn_mapped) |
258 | 254 | ||
259 | /* Do early initialization of the fixmap area */ | 255 | /* Do early initialization of the fixmap area */ |
260 | movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax | 256 | movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax |
261 | movl %eax,pa(swapper_pg_dir+0xffc) | 257 | movl %eax,pa(swapper_pg_dir+0xffc) |
262 | #endif | 258 | #endif |
263 | jmp 3f | 259 | jmp 3f |
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page) | |||
634 | /* Page-aligned for the benefit of paravirt? */ | 630 | /* Page-aligned for the benefit of paravirt? */ |
635 | .align PAGE_SIZE_asm | 631 | .align PAGE_SIZE_asm |
636 | ENTRY(swapper_pg_dir) | 632 | ENTRY(swapper_pg_dir) |
637 | .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ | 633 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ |
638 | # if KPMDS == 3 | 634 | # if KPMDS == 3 |
639 | .long pa(swapper_pg_pmd+PGD_ATTR),0 | 635 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 |
640 | .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 | 636 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 |
641 | .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 | 637 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 |
642 | # elif KPMDS == 2 | 638 | # elif KPMDS == 2 |
643 | .long 0,0 | 639 | .long 0,0 |
644 | .long pa(swapper_pg_pmd+PGD_ATTR),0 | 640 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 |
645 | .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 | 641 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 |
646 | # elif KPMDS == 1 | 642 | # elif KPMDS == 1 |
647 | .long 0,0 | 643 | .long 0,0 |
648 | .long 0,0 | 644 | .long 0,0 |
649 | .long pa(swapper_pg_pmd+PGD_ATTR),0 | 645 | .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 |
650 | # else | 646 | # else |
651 | # error "Kernel PMDs should be 1, 2 or 3" | 647 | # error "Kernel PMDs should be 1, 2 or 3" |
652 | # endif | 648 | # endif |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index db3280afe886..26cfdc1d7c7f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -110,7 +110,7 @@ startup_64: | |||
110 | movq %rdi, %rax | 110 | movq %rdi, %rax |
111 | shrq $PMD_SHIFT, %rax | 111 | shrq $PMD_SHIFT, %rax |
112 | andq $(PTRS_PER_PMD - 1), %rax | 112 | andq $(PTRS_PER_PMD - 1), %rax |
113 | leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx | 113 | leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx |
114 | leaq level2_spare_pgt(%rip), %rbx | 114 | leaq level2_spare_pgt(%rip), %rbx |
115 | movq %rdx, 0(%rbx, %rax, 8) | 115 | movq %rdx, 0(%rbx, %rax, 8) |
116 | ident_complete: | 116 | ident_complete: |
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) | |||
374 | /* Since I easily can, map the first 1G. | 374 | /* Since I easily can, map the first 1G. |
375 | * Don't set NX because code runs from these pages. | 375 | * Don't set NX because code runs from these pages. |
376 | */ | 376 | */ |
377 | PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) | 377 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
378 | 378 | ||
379 | NEXT_PAGE(level2_kernel_pgt) | 379 | NEXT_PAGE(level2_kernel_pgt) |
380 | /* | 380 | /* |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 59fd3b6b1303..acf62fc233da 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -115,13 +115,17 @@ static void hpet_reserve_platform_timers(unsigned long id) | |||
115 | hd.hd_phys_address = hpet_address; | 115 | hd.hd_phys_address = hpet_address; |
116 | hd.hd_address = hpet; | 116 | hd.hd_address = hpet; |
117 | hd.hd_nirqs = nrtimers; | 117 | hd.hd_nirqs = nrtimers; |
118 | hd.hd_flags = HPET_DATA_PLATFORM; | ||
119 | hpet_reserve_timer(&hd, 0); | 118 | hpet_reserve_timer(&hd, 0); |
120 | 119 | ||
121 | #ifdef CONFIG_HPET_EMULATE_RTC | 120 | #ifdef CONFIG_HPET_EMULATE_RTC |
122 | hpet_reserve_timer(&hd, 1); | 121 | hpet_reserve_timer(&hd, 1); |
123 | #endif | 122 | #endif |
124 | 123 | ||
124 | /* | ||
125 | * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 | ||
126 | * is wrong for i8259!) not the output IRQ. Many BIOS writers | ||
127 | * don't bother configuring *any* comparator interrupts. | ||
128 | */ | ||
125 | hd.hd_irq[0] = HPET_LEGACY_8254; | 129 | hd.hd_irq[0] = HPET_LEGACY_8254; |
126 | hd.hd_irq[1] = HPET_LEGACY_RTC; | 130 | hd.hd_irq[1] = HPET_LEGACY_RTC; |
127 | 131 | ||
@@ -210,8 +214,8 @@ static void hpet_legacy_clockevent_register(void) | |||
210 | /* Calculate the min / max delta */ | 214 | /* Calculate the min / max delta */ |
211 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, | 215 | hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, |
212 | &hpet_clockevent); | 216 | &hpet_clockevent); |
213 | hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, | 217 | /* 5 usec minimum reprogramming delta. */ |
214 | &hpet_clockevent); | 218 | hpet_clockevent.min_delta_ns = 5000; |
215 | 219 | ||
216 | /* | 220 | /* |
217 | * Start hpet with the boot cpu mask and make it | 221 | * Start hpet with the boot cpu mask and make it |
@@ -270,15 +274,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, | |||
270 | } | 274 | } |
271 | 275 | ||
272 | static int hpet_legacy_next_event(unsigned long delta, | 276 | static int hpet_legacy_next_event(unsigned long delta, |
273 | struct clock_event_device *evt) | 277 | struct clock_event_device *evt) |
274 | { | 278 | { |
275 | unsigned long cnt; | 279 | u32 cnt; |
276 | 280 | ||
277 | cnt = hpet_readl(HPET_COUNTER); | 281 | cnt = hpet_readl(HPET_COUNTER); |
278 | cnt += delta; | 282 | cnt += (u32) delta; |
279 | hpet_writel(cnt, HPET_T0_CMP); | 283 | hpet_writel(cnt, HPET_T0_CMP); |
280 | 284 | ||
281 | return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; | 285 | /* |
286 | * We need to read back the CMP register to make sure that | ||
287 | * what we wrote hit the chip before we compare it to the | ||
288 | * counter. | ||
289 | */ | ||
290 | WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); | ||
291 | |||
292 | return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; | ||
282 | } | 293 | } |
283 | 294 | ||
284 | /* | 295 | /* |
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8efb82..1f20608d4ca8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c | |||
@@ -21,9 +21,12 @@ | |||
21 | # include <asm/sigcontext32.h> | 21 | # include <asm/sigcontext32.h> |
22 | # include <asm/user32.h> | 22 | # include <asm/user32.h> |
23 | #else | 23 | #else |
24 | # define save_i387_ia32 save_i387 | 24 | # define save_i387_xstate_ia32 save_i387_xstate |
25 | # define restore_i387_ia32 restore_i387 | 25 | # define restore_i387_xstate_ia32 restore_i387_xstate |
26 | # define _fpstate_ia32 _fpstate | 26 | # define _fpstate_ia32 _fpstate |
27 | # define _xstate_ia32 _xstate | ||
28 | # define sig_xstate_ia32_size sig_xstate_size | ||
29 | # define fx_sw_reserved_ia32 fx_sw_reserved | ||
27 | # define user_i387_ia32_struct user_i387_struct | 30 | # define user_i387_ia32_struct user_i387_struct |
28 | # define user32_fxsr_struct user_fxsr_struct | 31 | # define user32_fxsr_struct user_fxsr_struct |
29 | #endif | 32 | #endif |
@@ -36,6 +39,7 @@ | |||
36 | 39 | ||
37 | static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; | 40 | static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; |
38 | unsigned int xstate_size; | 41 | unsigned int xstate_size; |
42 | unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); | ||
39 | static struct i387_fxsave_struct fx_scratch __cpuinitdata; | 43 | static struct i387_fxsave_struct fx_scratch __cpuinitdata; |
40 | 44 | ||
41 | void __cpuinit mxcsr_feature_mask_init(void) | 45 | void __cpuinit mxcsr_feature_mask_init(void) |
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void) | |||
61 | return; | 65 | return; |
62 | } | 66 | } |
63 | 67 | ||
68 | if (cpu_has_xsave) { | ||
69 | xsave_cntxt_init(); | ||
70 | return; | ||
71 | } | ||
72 | |||
64 | if (cpu_has_fxsr) | 73 | if (cpu_has_fxsr) |
65 | xstate_size = sizeof(struct i387_fxsave_struct); | 74 | xstate_size = sizeof(struct i387_fxsave_struct); |
66 | #ifdef CONFIG_X86_32 | 75 | #ifdef CONFIG_X86_32 |
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void) | |||
83 | 92 | ||
84 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ | 93 | write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ |
85 | 94 | ||
95 | /* | ||
96 | * Boot processor to setup the FP and extended state context info. | ||
97 | */ | ||
98 | if (!smp_processor_id()) | ||
99 | init_thread_xstate(); | ||
100 | xsave_init(); | ||
101 | |||
86 | mxcsr_feature_mask_init(); | 102 | mxcsr_feature_mask_init(); |
87 | /* clean state in init */ | 103 | /* clean state in init */ |
88 | current_thread_info()->status = 0; | 104 | if (cpu_has_xsave) |
105 | current_thread_info()->status = TS_XSAVE; | ||
106 | else | ||
107 | current_thread_info()->status = 0; | ||
89 | clear_used_math(); | 108 | clear_used_math(); |
90 | } | 109 | } |
91 | #endif /* CONFIG_X86_64 */ | 110 | #endif /* CONFIG_X86_64 */ |
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
195 | */ | 214 | */ |
196 | target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; | 215 | target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; |
197 | 216 | ||
217 | /* | ||
218 | * update the header bits in the xsave header, indicating the | ||
219 | * presence of FP and SSE state. | ||
220 | */ | ||
221 | if (cpu_has_xsave) | ||
222 | target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; | ||
223 | |||
198 | return ret; | 224 | return ret; |
199 | } | 225 | } |
200 | 226 | ||
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, | |||
395 | if (!ret) | 421 | if (!ret) |
396 | convert_to_fxsr(target, &env); | 422 | convert_to_fxsr(target, &env); |
397 | 423 | ||
424 | /* | ||
425 | * update the header bit in the xsave header, indicating the | ||
426 | * presence of FP. | ||
427 | */ | ||
428 | if (cpu_has_xsave) | ||
429 | target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; | ||
398 | return ret; | 430 | return ret; |
399 | } | 431 | } |
400 | 432 | ||
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) | |||
407 | struct task_struct *tsk = current; | 439 | struct task_struct *tsk = current; |
408 | struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; | 440 | struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; |
409 | 441 | ||
410 | unlazy_fpu(tsk); | ||
411 | fp->status = fp->swd; | 442 | fp->status = fp->swd; |
412 | if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) | 443 | if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) |
413 | return -1; | 444 | return -1; |
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) | |||
421 | struct user_i387_ia32_struct env; | 452 | struct user_i387_ia32_struct env; |
422 | int err = 0; | 453 | int err = 0; |
423 | 454 | ||
424 | unlazy_fpu(tsk); | ||
425 | |||
426 | convert_from_fxsr(&env, tsk); | 455 | convert_from_fxsr(&env, tsk); |
427 | if (__copy_to_user(buf, &env, sizeof(env))) | 456 | if (__copy_to_user(buf, &env, sizeof(env))) |
428 | return -1; | 457 | return -1; |
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) | |||
432 | if (err) | 461 | if (err) |
433 | return -1; | 462 | return -1; |
434 | 463 | ||
435 | if (__copy_to_user(&buf->_fxsr_env[0], fx, | 464 | if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) |
436 | sizeof(struct i387_fxsave_struct))) | 465 | return -1; |
466 | return 1; | ||
467 | } | ||
468 | |||
469 | static int save_i387_xsave(void __user *buf) | ||
470 | { | ||
471 | struct task_struct *tsk = current; | ||
472 | struct _fpstate_ia32 __user *fx = buf; | ||
473 | int err = 0; | ||
474 | |||
475 | /* | ||
476 | * For legacy compatible, we always set FP/SSE bits in the bit | ||
477 | * vector while saving the state to the user context. | ||
478 | * This will enable us capturing any changes(during sigreturn) to | ||
479 | * the FP/SSE bits by the legacy applications which don't touch | ||
480 | * xstate_bv in the xsave header. | ||
481 | * | ||
482 | * xsave aware applications can change the xstate_bv in the xsave | ||
483 | * header as well as change any contents in the memory layout. | ||
484 | * xrestore as part of sigreturn will capture all the changes. | ||
485 | */ | ||
486 | tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; | ||
487 | |||
488 | if (save_i387_fxsave(fx) < 0) | ||
489 | return -1; | ||
490 | |||
491 | err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, | ||
492 | sizeof(struct _fpx_sw_bytes)); | ||
493 | err |= __put_user(FP_XSTATE_MAGIC2, | ||
494 | (__u32 __user *) (buf + sig_xstate_ia32_size | ||
495 | - FP_XSTATE_MAGIC2_SIZE)); | ||
496 | if (err) | ||
437 | return -1; | 497 | return -1; |
498 | |||
438 | return 1; | 499 | return 1; |
439 | } | 500 | } |
440 | 501 | ||
441 | int save_i387_ia32(struct _fpstate_ia32 __user *buf) | 502 | int save_i387_xstate_ia32(void __user *buf) |
442 | { | 503 | { |
504 | struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; | ||
505 | struct task_struct *tsk = current; | ||
506 | |||
443 | if (!used_math()) | 507 | if (!used_math()) |
444 | return 0; | 508 | return 0; |
509 | |||
510 | if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) | ||
511 | return -EACCES; | ||
445 | /* | 512 | /* |
446 | * This will cause a "finit" to be triggered by the next | 513 | * This will cause a "finit" to be triggered by the next |
447 | * attempted FPU operation by the 'current' process. | 514 | * attempted FPU operation by the 'current' process. |
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf) | |||
451 | if (!HAVE_HWFP) { | 518 | if (!HAVE_HWFP) { |
452 | return fpregs_soft_get(current, NULL, | 519 | return fpregs_soft_get(current, NULL, |
453 | 0, sizeof(struct user_i387_ia32_struct), | 520 | 0, sizeof(struct user_i387_ia32_struct), |
454 | NULL, buf) ? -1 : 1; | 521 | NULL, fp) ? -1 : 1; |
455 | } | 522 | } |
456 | 523 | ||
524 | unlazy_fpu(tsk); | ||
525 | |||
526 | if (cpu_has_xsave) | ||
527 | return save_i387_xsave(fp); | ||
457 | if (cpu_has_fxsr) | 528 | if (cpu_has_fxsr) |
458 | return save_i387_fxsave(buf); | 529 | return save_i387_fxsave(fp); |
459 | else | 530 | else |
460 | return save_i387_fsave(buf); | 531 | return save_i387_fsave(fp); |
461 | } | 532 | } |
462 | 533 | ||
463 | static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) | 534 | static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) |
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) | |||
468 | sizeof(struct i387_fsave_struct)); | 539 | sizeof(struct i387_fsave_struct)); |
469 | } | 540 | } |
470 | 541 | ||
471 | static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) | 542 | static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, |
543 | unsigned int size) | ||
472 | { | 544 | { |
473 | struct task_struct *tsk = current; | 545 | struct task_struct *tsk = current; |
474 | struct user_i387_ia32_struct env; | 546 | struct user_i387_ia32_struct env; |
475 | int err; | 547 | int err; |
476 | 548 | ||
477 | err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], | 549 | err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], |
478 | sizeof(struct i387_fxsave_struct)); | 550 | size); |
479 | /* mxcsr reserved bits must be masked to zero for security reasons */ | 551 | /* mxcsr reserved bits must be masked to zero for security reasons */ |
480 | tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; | 552 | tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; |
481 | if (err || __copy_from_user(&env, buf, sizeof(env))) | 553 | if (err || __copy_from_user(&env, buf, sizeof(env))) |
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) | |||
485 | return 0; | 557 | return 0; |
486 | } | 558 | } |
487 | 559 | ||
488 | int restore_i387_ia32(struct _fpstate_ia32 __user *buf) | 560 | static int restore_i387_xsave(void __user *buf) |
561 | { | ||
562 | struct _fpx_sw_bytes fx_sw_user; | ||
563 | struct _fpstate_ia32 __user *fx_user = | ||
564 | ((struct _fpstate_ia32 __user *) buf); | ||
565 | struct i387_fxsave_struct __user *fx = | ||
566 | (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; | ||
567 | struct xsave_hdr_struct *xsave_hdr = | ||
568 | &current->thread.xstate->xsave.xsave_hdr; | ||
569 | u64 mask; | ||
570 | int err; | ||
571 | |||
572 | if (check_for_xstate(fx, buf, &fx_sw_user)) | ||
573 | goto fx_only; | ||
574 | |||
575 | mask = fx_sw_user.xstate_bv; | ||
576 | |||
577 | err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); | ||
578 | |||
579 | xsave_hdr->xstate_bv &= pcntxt_mask; | ||
580 | /* | ||
581 | * These bits must be zero. | ||
582 | */ | ||
583 | xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; | ||
584 | |||
585 | /* | ||
586 | * Init the state that is not present in the memory layout | ||
587 | * and enabled by the OS. | ||
588 | */ | ||
589 | mask = ~(pcntxt_mask & ~mask); | ||
590 | xsave_hdr->xstate_bv &= mask; | ||
591 | |||
592 | return err; | ||
593 | fx_only: | ||
594 | /* | ||
595 | * Couldn't find the extended state information in the memory | ||
596 | * layout. Restore the FP/SSE and init the other extended state | ||
597 | * enabled by the OS. | ||
598 | */ | ||
599 | xsave_hdr->xstate_bv = XSTATE_FPSSE; | ||
600 | return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); | ||
601 | } | ||
602 | |||
603 | int restore_i387_xstate_ia32(void __user *buf) | ||
489 | { | 604 | { |
490 | int err; | 605 | int err; |
491 | struct task_struct *tsk = current; | 606 | struct task_struct *tsk = current; |
607 | struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; | ||
492 | 608 | ||
493 | if (HAVE_HWFP) | 609 | if (HAVE_HWFP) |
494 | clear_fpu(tsk); | 610 | clear_fpu(tsk); |
495 | 611 | ||
612 | if (!buf) { | ||
613 | if (used_math()) { | ||
614 | clear_fpu(tsk); | ||
615 | clear_used_math(); | ||
616 | } | ||
617 | |||
618 | return 0; | ||
619 | } else | ||
620 | if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) | ||
621 | return -EACCES; | ||
622 | |||
496 | if (!used_math()) { | 623 | if (!used_math()) { |
497 | err = init_fpu(tsk); | 624 | err = init_fpu(tsk); |
498 | if (err) | 625 | if (err) |
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf) | |||
500 | } | 627 | } |
501 | 628 | ||
502 | if (HAVE_HWFP) { | 629 | if (HAVE_HWFP) { |
503 | if (cpu_has_fxsr) | 630 | if (cpu_has_xsave) |
504 | err = restore_i387_fxsave(buf); | 631 | err = restore_i387_xsave(buf); |
632 | else if (cpu_has_fxsr) | ||
633 | err = restore_i387_fxsave(fp, sizeof(struct | ||
634 | i387_fxsave_struct)); | ||
505 | else | 635 | else |
506 | err = restore_i387_fsave(buf); | 636 | err = restore_i387_fsave(fp); |
507 | } else { | 637 | } else { |
508 | err = fpregs_soft_set(current, NULL, | 638 | err = fpregs_soft_set(current, NULL, |
509 | 0, sizeof(struct user_i387_ia32_struct), | 639 | 0, sizeof(struct user_i387_ia32_struct), |
510 | NULL, buf) != 0; | 640 | NULL, fp) != 0; |
511 | } | 641 | } |
512 | set_used_math(); | 642 | set_used_math(); |
513 | 643 | ||
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index dc92b49d9204..4b8a53d841f7 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c | |||
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void) | |||
282 | 282 | ||
283 | device_initcall(i8259A_init_sysfs); | 283 | device_initcall(i8259A_init_sysfs); |
284 | 284 | ||
285 | void mask_8259A(void) | ||
286 | { | ||
287 | unsigned long flags; | ||
288 | |||
289 | spin_lock_irqsave(&i8259A_lock, flags); | ||
290 | |||
291 | outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ | ||
292 | outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ | ||
293 | |||
294 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
295 | } | ||
296 | |||
297 | void unmask_8259A(void) | ||
298 | { | ||
299 | unsigned long flags; | ||
300 | |||
301 | spin_lock_irqsave(&i8259A_lock, flags); | ||
302 | |||
303 | outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ | ||
304 | outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ | ||
305 | |||
306 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
307 | } | ||
308 | |||
285 | void init_8259A(int auto_eoi) | 309 | void init_8259A(int auto_eoi) |
286 | { | 310 | { |
287 | unsigned long flags; | 311 | unsigned long flags; |
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 09cddb57bec4..e710289f673e 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c | |||
@@ -46,10 +46,13 @@ | |||
46 | #include <asm/nmi.h> | 46 | #include <asm/nmi.h> |
47 | #include <asm/msidef.h> | 47 | #include <asm/msidef.h> |
48 | #include <asm/hypertransport.h> | 48 | #include <asm/hypertransport.h> |
49 | #include <asm/setup.h> | ||
49 | 50 | ||
50 | #include <mach_apic.h> | 51 | #include <mach_apic.h> |
51 | #include <mach_apicdef.h> | 52 | #include <mach_apicdef.h> |
52 | 53 | ||
54 | #define __apicdebuginit(type) static type __init | ||
55 | |||
53 | int (*ioapic_renumber_irq)(int ioapic, int irq); | 56 | int (*ioapic_renumber_irq)(int ioapic, int irq); |
54 | atomic_t irq_mis_count; | 57 | atomic_t irq_mis_count; |
55 | 58 | ||
@@ -1341,7 +1344,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, | |||
1341 | ioapic_write_entry(apic, pin, entry); | 1344 | ioapic_write_entry(apic, pin, entry); |
1342 | } | 1345 | } |
1343 | 1346 | ||
1344 | void __init print_IO_APIC(void) | 1347 | |
1348 | __apicdebuginit(void) print_IO_APIC(void) | ||
1345 | { | 1349 | { |
1346 | int apic, i; | 1350 | int apic, i; |
1347 | union IO_APIC_reg_00 reg_00; | 1351 | union IO_APIC_reg_00 reg_00; |
@@ -1456,9 +1460,7 @@ void __init print_IO_APIC(void) | |||
1456 | return; | 1460 | return; |
1457 | } | 1461 | } |
1458 | 1462 | ||
1459 | #if 0 | 1463 | __apicdebuginit(void) print_APIC_bitfield(int base) |
1460 | |||
1461 | static void print_APIC_bitfield(int base) | ||
1462 | { | 1464 | { |
1463 | unsigned int v; | 1465 | unsigned int v; |
1464 | int i, j; | 1466 | int i, j; |
@@ -1479,9 +1481,10 @@ static void print_APIC_bitfield(int base) | |||
1479 | } | 1481 | } |
1480 | } | 1482 | } |
1481 | 1483 | ||
1482 | void /*__init*/ print_local_APIC(void *dummy) | 1484 | __apicdebuginit(void) print_local_APIC(void *dummy) |
1483 | { | 1485 | { |
1484 | unsigned int v, ver, maxlvt; | 1486 | unsigned int v, ver, maxlvt; |
1487 | u64 icr; | ||
1485 | 1488 | ||
1486 | if (apic_verbosity == APIC_QUIET) | 1489 | if (apic_verbosity == APIC_QUIET) |
1487 | return; | 1490 | return; |
@@ -1490,7 +1493,7 @@ void /*__init*/ print_local_APIC(void *dummy) | |||
1490 | smp_processor_id(), hard_smp_processor_id()); | 1493 | smp_processor_id(), hard_smp_processor_id()); |
1491 | v = apic_read(APIC_ID); | 1494 | v = apic_read(APIC_ID); |
1492 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, | 1495 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, |
1493 | GET_APIC_ID(read_apic_id())); | 1496 | GET_APIC_ID(v)); |
1494 | v = apic_read(APIC_LVR); | 1497 | v = apic_read(APIC_LVR); |
1495 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | 1498 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); |
1496 | ver = GET_APIC_VERSION(v); | 1499 | ver = GET_APIC_VERSION(v); |
@@ -1532,10 +1535,9 @@ void /*__init*/ print_local_APIC(void *dummy) | |||
1532 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | 1535 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); |
1533 | } | 1536 | } |
1534 | 1537 | ||
1535 | v = apic_read(APIC_ICR); | 1538 | icr = apic_icr_read(); |
1536 | printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | 1539 | printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); |
1537 | v = apic_read(APIC_ICR2); | 1540 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); |
1538 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | ||
1539 | 1541 | ||
1540 | v = apic_read(APIC_LVTT); | 1542 | v = apic_read(APIC_LVTT); |
1541 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | 1543 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); |
@@ -1563,12 +1565,12 @@ void /*__init*/ print_local_APIC(void *dummy) | |||
1563 | printk("\n"); | 1565 | printk("\n"); |
1564 | } | 1566 | } |
1565 | 1567 | ||
1566 | void print_all_local_APICs(void) | 1568 | __apicdebuginit(void) print_all_local_APICs(void) |
1567 | { | 1569 | { |
1568 | on_each_cpu(print_local_APIC, NULL, 1); | 1570 | on_each_cpu(print_local_APIC, NULL, 1); |
1569 | } | 1571 | } |
1570 | 1572 | ||
1571 | void /*__init*/ print_PIC(void) | 1573 | __apicdebuginit(void) print_PIC(void) |
1572 | { | 1574 | { |
1573 | unsigned int v; | 1575 | unsigned int v; |
1574 | unsigned long flags; | 1576 | unsigned long flags; |
@@ -1600,7 +1602,17 @@ void /*__init*/ print_PIC(void) | |||
1600 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | 1602 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); |
1601 | } | 1603 | } |
1602 | 1604 | ||
1603 | #endif /* 0 */ | 1605 | __apicdebuginit(int) print_all_ICs(void) |
1606 | { | ||
1607 | print_PIC(); | ||
1608 | print_all_local_APICs(); | ||
1609 | print_IO_APIC(); | ||
1610 | |||
1611 | return 0; | ||
1612 | } | ||
1613 | |||
1614 | fs_initcall(print_all_ICs); | ||
1615 | |||
1604 | 1616 | ||
1605 | static void __init enable_IO_APIC(void) | 1617 | static void __init enable_IO_APIC(void) |
1606 | { | 1618 | { |
@@ -1698,8 +1710,7 @@ void disable_IO_APIC(void) | |||
1698 | entry.dest_mode = 0; /* Physical */ | 1710 | entry.dest_mode = 0; /* Physical */ |
1699 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ | 1711 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ |
1700 | entry.vector = 0; | 1712 | entry.vector = 0; |
1701 | entry.dest.physical.physical_dest = | 1713 | entry.dest.physical.physical_dest = read_apic_id(); |
1702 | GET_APIC_ID(read_apic_id()); | ||
1703 | 1714 | ||
1704 | /* | 1715 | /* |
1705 | * Add it to the IO-APIC irq-routing table: | 1716 | * Add it to the IO-APIC irq-routing table: |
@@ -1725,10 +1736,8 @@ static void __init setup_ioapic_ids_from_mpc(void) | |||
1725 | unsigned char old_id; | 1736 | unsigned char old_id; |
1726 | unsigned long flags; | 1737 | unsigned long flags; |
1727 | 1738 | ||
1728 | #ifdef CONFIG_X86_NUMAQ | 1739 | if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) |
1729 | if (found_numaq) | ||
1730 | return; | 1740 | return; |
1731 | #endif | ||
1732 | 1741 | ||
1733 | /* | 1742 | /* |
1734 | * Don't check I/O APIC IDs for xAPIC systems. They have | 1743 | * Don't check I/O APIC IDs for xAPIC systems. They have |
@@ -2329,8 +2338,6 @@ void __init setup_IO_APIC(void) | |||
2329 | setup_IO_APIC_irqs(); | 2338 | setup_IO_APIC_irqs(); |
2330 | init_IO_APIC_traps(); | 2339 | init_IO_APIC_traps(); |
2331 | check_timer(); | 2340 | check_timer(); |
2332 | if (!acpi_ioapic) | ||
2333 | print_IO_APIC(); | ||
2334 | } | 2341 | } |
2335 | 2342 | ||
2336 | /* | 2343 | /* |
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 61a83b70c18f..02063ae042f7 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <acpi/acpi_bus.h> | 37 | #include <acpi/acpi_bus.h> |
38 | #endif | 38 | #endif |
39 | #include <linux/bootmem.h> | 39 | #include <linux/bootmem.h> |
40 | #include <linux/dmar.h> | ||
40 | 41 | ||
41 | #include <asm/idle.h> | 42 | #include <asm/idle.h> |
42 | #include <asm/io.h> | 43 | #include <asm/io.h> |
@@ -49,10 +50,13 @@ | |||
49 | #include <asm/nmi.h> | 50 | #include <asm/nmi.h> |
50 | #include <asm/msidef.h> | 51 | #include <asm/msidef.h> |
51 | #include <asm/hypertransport.h> | 52 | #include <asm/hypertransport.h> |
53 | #include <asm/irq_remapping.h> | ||
52 | 54 | ||
53 | #include <mach_ipi.h> | 55 | #include <mach_ipi.h> |
54 | #include <mach_apic.h> | 56 | #include <mach_apic.h> |
55 | 57 | ||
58 | #define __apicdebuginit(type) static type __init | ||
59 | |||
56 | struct irq_cfg { | 60 | struct irq_cfg { |
57 | cpumask_t domain; | 61 | cpumask_t domain; |
58 | cpumask_t old_domain; | 62 | cpumask_t old_domain; |
@@ -87,8 +91,6 @@ int first_system_vector = 0xfe; | |||
87 | 91 | ||
88 | char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; | 92 | char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; |
89 | 93 | ||
90 | #define __apicdebuginit __init | ||
91 | |||
92 | int sis_apic_bug; /* not actually supported, dummy for compile */ | 94 | int sis_apic_bug; /* not actually supported, dummy for compile */ |
93 | 95 | ||
94 | static int no_timer_check; | 96 | static int no_timer_check; |
@@ -108,6 +110,9 @@ static DEFINE_SPINLOCK(vector_lock); | |||
108 | */ | 110 | */ |
109 | int nr_ioapic_registers[MAX_IO_APICS]; | 111 | int nr_ioapic_registers[MAX_IO_APICS]; |
110 | 112 | ||
113 | /* I/O APIC RTE contents at the OS boot up */ | ||
114 | struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; | ||
115 | |||
111 | /* I/O APIC entries */ | 116 | /* I/O APIC entries */ |
112 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; | 117 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; |
113 | int nr_ioapics; | 118 | int nr_ioapics; |
@@ -303,7 +308,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) | |||
303 | pin = entry->pin; | 308 | pin = entry->pin; |
304 | if (pin == -1) | 309 | if (pin == -1) |
305 | break; | 310 | break; |
306 | io_apic_write(apic, 0x11 + pin*2, dest); | 311 | /* |
312 | * With interrupt-remapping, destination information comes | ||
313 | * from interrupt-remapping table entry. | ||
314 | */ | ||
315 | if (!irq_remapped(irq)) | ||
316 | io_apic_write(apic, 0x11 + pin*2, dest); | ||
307 | reg = io_apic_read(apic, 0x10 + pin*2); | 317 | reg = io_apic_read(apic, 0x10 + pin*2); |
308 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; | 318 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; |
309 | reg |= vector; | 319 | reg |= vector; |
@@ -440,6 +450,69 @@ static void clear_IO_APIC (void) | |||
440 | clear_IO_APIC_pin(apic, pin); | 450 | clear_IO_APIC_pin(apic, pin); |
441 | } | 451 | } |
442 | 452 | ||
453 | /* | ||
454 | * Saves and masks all the unmasked IO-APIC RTE's | ||
455 | */ | ||
456 | int save_mask_IO_APIC_setup(void) | ||
457 | { | ||
458 | union IO_APIC_reg_01 reg_01; | ||
459 | unsigned long flags; | ||
460 | int apic, pin; | ||
461 | |||
462 | /* | ||
463 | * The number of IO-APIC IRQ registers (== #pins): | ||
464 | */ | ||
465 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
466 | spin_lock_irqsave(&ioapic_lock, flags); | ||
467 | reg_01.raw = io_apic_read(apic, 1); | ||
468 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
469 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | ||
470 | } | ||
471 | |||
472 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
473 | early_ioapic_entries[apic] = | ||
474 | kzalloc(sizeof(struct IO_APIC_route_entry) * | ||
475 | nr_ioapic_registers[apic], GFP_KERNEL); | ||
476 | if (!early_ioapic_entries[apic]) | ||
477 | return -ENOMEM; | ||
478 | } | ||
479 | |||
480 | for (apic = 0; apic < nr_ioapics; apic++) | ||
481 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
482 | struct IO_APIC_route_entry entry; | ||
483 | |||
484 | entry = early_ioapic_entries[apic][pin] = | ||
485 | ioapic_read_entry(apic, pin); | ||
486 | if (!entry.mask) { | ||
487 | entry.mask = 1; | ||
488 | ioapic_write_entry(apic, pin, entry); | ||
489 | } | ||
490 | } | ||
491 | return 0; | ||
492 | } | ||
493 | |||
494 | void restore_IO_APIC_setup(void) | ||
495 | { | ||
496 | int apic, pin; | ||
497 | |||
498 | for (apic = 0; apic < nr_ioapics; apic++) | ||
499 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
500 | ioapic_write_entry(apic, pin, | ||
501 | early_ioapic_entries[apic][pin]); | ||
502 | } | ||
503 | |||
/*
 * Re-initialize the IO-APIC RTEs.
 *
 * For now this is a plain restore of the previously saved settings and
 * the intr_remapping argument is not looked at.
 * TBD: when the OS enables interrupt-remapping, the IO-APIC RTE's need to
 * be set up to point to interrupt-remapping table entries.  Until then,
 * restore the saved entries and leave the proper initialization to
 * setup_IO_APIC_irqs().
 */
void reinit_intr_remapped_IO_APIC(int intr_remapping)
{
	restore_IO_APIC_setup();
}
515 | |||
443 | int skip_ioapic_setup; | 516 | int skip_ioapic_setup; |
444 | int ioapic_force; | 517 | int ioapic_force; |
445 | 518 | ||
@@ -839,18 +912,98 @@ void __setup_vector_irq(int cpu) | |||
839 | } | 912 | } |
840 | 913 | ||
841 | static struct irq_chip ioapic_chip; | 914 | static struct irq_chip ioapic_chip; |
915 | #ifdef CONFIG_INTR_REMAP | ||
916 | static struct irq_chip ir_ioapic_chip; | ||
917 | #endif | ||
842 | 918 | ||
843 | static void ioapic_register_intr(int irq, unsigned long trigger) | 919 | static void ioapic_register_intr(int irq, unsigned long trigger) |
844 | { | 920 | { |
845 | if (trigger) { | 921 | if (trigger) |
846 | irq_desc[irq].status |= IRQ_LEVEL; | 922 | irq_desc[irq].status |= IRQ_LEVEL; |
847 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | 923 | else |
848 | handle_fasteoi_irq, "fasteoi"); | ||
849 | } else { | ||
850 | irq_desc[irq].status &= ~IRQ_LEVEL; | 924 | irq_desc[irq].status &= ~IRQ_LEVEL; |
925 | |||
926 | #ifdef CONFIG_INTR_REMAP | ||
927 | if (irq_remapped(irq)) { | ||
928 | irq_desc[irq].status |= IRQ_MOVE_PCNTXT; | ||
929 | if (trigger) | ||
930 | set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, | ||
931 | handle_fasteoi_irq, | ||
932 | "fasteoi"); | ||
933 | else | ||
934 | set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, | ||
935 | handle_edge_irq, "edge"); | ||
936 | return; | ||
937 | } | ||
938 | #endif | ||
939 | if (trigger) | ||
940 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
941 | handle_fasteoi_irq, | ||
942 | "fasteoi"); | ||
943 | else | ||
851 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | 944 | set_irq_chip_and_handler_name(irq, &ioapic_chip, |
852 | handle_edge_irq, "edge"); | 945 | handle_edge_irq, "edge"); |
946 | } | ||
947 | |||
948 | static int setup_ioapic_entry(int apic, int irq, | ||
949 | struct IO_APIC_route_entry *entry, | ||
950 | unsigned int destination, int trigger, | ||
951 | int polarity, int vector) | ||
952 | { | ||
953 | /* | ||
954 | * add it to the IO-APIC irq-routing table: | ||
955 | */ | ||
956 | memset(entry,0,sizeof(*entry)); | ||
957 | |||
958 | #ifdef CONFIG_INTR_REMAP | ||
959 | if (intr_remapping_enabled) { | ||
960 | struct intel_iommu *iommu = map_ioapic_to_ir(apic); | ||
961 | struct irte irte; | ||
962 | struct IR_IO_APIC_route_entry *ir_entry = | ||
963 | (struct IR_IO_APIC_route_entry *) entry; | ||
964 | int index; | ||
965 | |||
966 | if (!iommu) | ||
967 | panic("No mapping iommu for ioapic %d\n", apic); | ||
968 | |||
969 | index = alloc_irte(iommu, irq, 1); | ||
970 | if (index < 0) | ||
971 | panic("Failed to allocate IRTE for ioapic %d\n", apic); | ||
972 | |||
973 | memset(&irte, 0, sizeof(irte)); | ||
974 | |||
975 | irte.present = 1; | ||
976 | irte.dst_mode = INT_DEST_MODE; | ||
977 | irte.trigger_mode = trigger; | ||
978 | irte.dlvry_mode = INT_DELIVERY_MODE; | ||
979 | irte.vector = vector; | ||
980 | irte.dest_id = IRTE_DEST(destination); | ||
981 | |||
982 | modify_irte(irq, &irte); | ||
983 | |||
984 | ir_entry->index2 = (index >> 15) & 0x1; | ||
985 | ir_entry->zero = 0; | ||
986 | ir_entry->format = 1; | ||
987 | ir_entry->index = (index & 0x7fff); | ||
988 | } else | ||
989 | #endif | ||
990 | { | ||
991 | entry->delivery_mode = INT_DELIVERY_MODE; | ||
992 | entry->dest_mode = INT_DEST_MODE; | ||
993 | entry->dest = destination; | ||
853 | } | 994 | } |
995 | |||
996 | entry->mask = 0; /* enable IRQ */ | ||
997 | entry->trigger = trigger; | ||
998 | entry->polarity = polarity; | ||
999 | entry->vector = vector; | ||
1000 | |||
1001 | /* Mask level triggered irqs. | ||
1002 | * Use IRQ_DELAYED_DISABLE for edge triggered irqs. | ||
1003 | */ | ||
1004 | if (trigger) | ||
1005 | entry->mask = 1; | ||
1006 | return 0; | ||
854 | } | 1007 | } |
855 | 1008 | ||
856 | static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | 1009 | static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, |
@@ -875,24 +1028,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | |||
875 | apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, | 1028 | apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, |
876 | irq, trigger, polarity); | 1029 | irq, trigger, polarity); |
877 | 1030 | ||
878 | /* | ||
879 | * add it to the IO-APIC irq-routing table: | ||
880 | */ | ||
881 | memset(&entry,0,sizeof(entry)); | ||
882 | |||
883 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
884 | entry.dest_mode = INT_DEST_MODE; | ||
885 | entry.dest = cpu_mask_to_apicid(mask); | ||
886 | entry.mask = 0; /* enable IRQ */ | ||
887 | entry.trigger = trigger; | ||
888 | entry.polarity = polarity; | ||
889 | entry.vector = cfg->vector; | ||
890 | 1031 | ||
891 | /* Mask level triggered irqs. | 1032 | if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, |
892 | * Use IRQ_DELAYED_DISABLE for edge triggered irqs. | 1033 | cpu_mask_to_apicid(mask), trigger, polarity, |
893 | */ | 1034 | cfg->vector)) { |
894 | if (trigger) | 1035 | printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", |
895 | entry.mask = 1; | 1036 | mp_ioapics[apic].mp_apicid, pin); |
1037 | __clear_irq_vector(irq); | ||
1038 | return; | ||
1039 | } | ||
896 | 1040 | ||
897 | ioapic_register_intr(irq, trigger); | 1041 | ioapic_register_intr(irq, trigger); |
898 | if (irq < 16) | 1042 | if (irq < 16) |
@@ -944,6 +1088,9 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, | |||
944 | { | 1088 | { |
945 | struct IO_APIC_route_entry entry; | 1089 | struct IO_APIC_route_entry entry; |
946 | 1090 | ||
1091 | if (intr_remapping_enabled) | ||
1092 | return; | ||
1093 | |||
947 | memset(&entry, 0, sizeof(entry)); | 1094 | memset(&entry, 0, sizeof(entry)); |
948 | 1095 | ||
949 | /* | 1096 | /* |
@@ -970,7 +1117,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, | |||
970 | ioapic_write_entry(apic, pin, entry); | 1117 | ioapic_write_entry(apic, pin, entry); |
971 | } | 1118 | } |
972 | 1119 | ||
973 | void __apicdebuginit print_IO_APIC(void) | 1120 | |
1121 | __apicdebuginit(void) print_IO_APIC(void) | ||
974 | { | 1122 | { |
975 | int apic, i; | 1123 | int apic, i; |
976 | union IO_APIC_reg_00 reg_00; | 1124 | union IO_APIC_reg_00 reg_00; |
@@ -1064,9 +1212,7 @@ void __apicdebuginit print_IO_APIC(void) | |||
1064 | return; | 1212 | return; |
1065 | } | 1213 | } |
1066 | 1214 | ||
1067 | #if 0 | 1215 | __apicdebuginit(void) print_APIC_bitfield(int base) |
1068 | |||
1069 | static __apicdebuginit void print_APIC_bitfield (int base) | ||
1070 | { | 1216 | { |
1071 | unsigned int v; | 1217 | unsigned int v; |
1072 | int i, j; | 1218 | int i, j; |
@@ -1087,9 +1233,10 @@ static __apicdebuginit void print_APIC_bitfield (int base) | |||
1087 | } | 1233 | } |
1088 | } | 1234 | } |
1089 | 1235 | ||
1090 | void __apicdebuginit print_local_APIC(void * dummy) | 1236 | __apicdebuginit(void) print_local_APIC(void *dummy) |
1091 | { | 1237 | { |
1092 | unsigned int v, ver, maxlvt; | 1238 | unsigned int v, ver, maxlvt; |
1239 | unsigned long icr; | ||
1093 | 1240 | ||
1094 | if (apic_verbosity == APIC_QUIET) | 1241 | if (apic_verbosity == APIC_QUIET) |
1095 | return; | 1242 | return; |
@@ -1097,7 +1244,7 @@ void __apicdebuginit print_local_APIC(void * dummy) | |||
1097 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | 1244 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", |
1098 | smp_processor_id(), hard_smp_processor_id()); | 1245 | smp_processor_id(), hard_smp_processor_id()); |
1099 | v = apic_read(APIC_ID); | 1246 | v = apic_read(APIC_ID); |
1100 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); | 1247 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); |
1101 | v = apic_read(APIC_LVR); | 1248 | v = apic_read(APIC_LVR); |
1102 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | 1249 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); |
1103 | ver = GET_APIC_VERSION(v); | 1250 | ver = GET_APIC_VERSION(v); |
@@ -1133,10 +1280,9 @@ void __apicdebuginit print_local_APIC(void * dummy) | |||
1133 | v = apic_read(APIC_ESR); | 1280 | v = apic_read(APIC_ESR); |
1134 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | 1281 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); |
1135 | 1282 | ||
1136 | v = apic_read(APIC_ICR); | 1283 | icr = apic_icr_read(); |
1137 | printk(KERN_DEBUG "... APIC ICR: %08x\n", v); | 1284 | printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); |
1138 | v = apic_read(APIC_ICR2); | 1285 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); |
1139 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); | ||
1140 | 1286 | ||
1141 | v = apic_read(APIC_LVTT); | 1287 | v = apic_read(APIC_LVTT); |
1142 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | 1288 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); |
@@ -1164,12 +1310,12 @@ void __apicdebuginit print_local_APIC(void * dummy) | |||
1164 | printk("\n"); | 1310 | printk("\n"); |
1165 | } | 1311 | } |
1166 | 1312 | ||
1167 | void print_all_local_APICs (void) | 1313 | __apicdebuginit(void) print_all_local_APICs(void) |
1168 | { | 1314 | { |
1169 | on_each_cpu(print_local_APIC, NULL, 1); | 1315 | on_each_cpu(print_local_APIC, NULL, 1); |
1170 | } | 1316 | } |
1171 | 1317 | ||
1172 | void __apicdebuginit print_PIC(void) | 1318 | __apicdebuginit(void) print_PIC(void) |
1173 | { | 1319 | { |
1174 | unsigned int v; | 1320 | unsigned int v; |
1175 | unsigned long flags; | 1321 | unsigned long flags; |
@@ -1201,7 +1347,17 @@ void __apicdebuginit print_PIC(void) | |||
1201 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | 1347 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); |
1202 | } | 1348 | } |
1203 | 1349 | ||
1204 | #endif /* 0 */ | 1350 | __apicdebuginit(int) print_all_ICs(void) |
1351 | { | ||
1352 | print_PIC(); | ||
1353 | print_all_local_APICs(); | ||
1354 | print_IO_APIC(); | ||
1355 | |||
1356 | return 0; | ||
1357 | } | ||
1358 | |||
1359 | fs_initcall(print_all_ICs); | ||
1360 | |||
1205 | 1361 | ||
1206 | void __init enable_IO_APIC(void) | 1362 | void __init enable_IO_APIC(void) |
1207 | { | 1363 | { |
@@ -1291,7 +1447,7 @@ void disable_IO_APIC(void) | |||
1291 | entry.dest_mode = 0; /* Physical */ | 1447 | entry.dest_mode = 0; /* Physical */ |
1292 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ | 1448 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ |
1293 | entry.vector = 0; | 1449 | entry.vector = 0; |
1294 | entry.dest = GET_APIC_ID(read_apic_id()); | 1450 | entry.dest = read_apic_id(); |
1295 | 1451 | ||
1296 | /* | 1452 | /* |
1297 | * Add it to the IO-APIC irq-routing table: | 1453 | * Add it to the IO-APIC irq-routing table: |
@@ -1397,6 +1553,147 @@ static int ioapic_retrigger_irq(unsigned int irq) | |||
1397 | */ | 1553 | */ |
1398 | 1554 | ||
1399 | #ifdef CONFIG_SMP | 1555 | #ifdef CONFIG_SMP |
1556 | |||
1557 | #ifdef CONFIG_INTR_REMAP | ||
1558 | static void ir_irq_migration(struct work_struct *work); | ||
1559 | |||
1560 | static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); | ||
1561 | |||
1562 | /* | ||
1563 | * Migrate the IO-APIC irq in the presence of intr-remapping. | ||
1564 | * | ||
1565 | * For edge triggered, irq migration is a simple atomic update(of vector | ||
1566 | * and cpu destination) of IRTE and flush the hardware cache. | ||
1567 | * | ||
1568 | * For level triggered, we need to modify the io-apic RTE aswell with the update | ||
1569 | * vector information, along with modifying IRTE with vector and destination. | ||
1570 | * So irq migration for level triggered is little bit more complex compared to | ||
1571 | * edge triggered migration. But the good news is, we use the same algorithm | ||
1572 | * for level triggered migration as we have today, only difference being, | ||
1573 | * we now initiate the irq migration from process context instead of the | ||
1574 | * interrupt context. | ||
1575 | * | ||
1576 | * In future, when we do a directed EOI (combined with cpu EOI broadcast | ||
1577 | * suppression) to the IO-APIC, level triggered irq migration will also be | ||
1578 | * as simple as edge triggered migration and we can do the irq migration | ||
1579 | * with a simple atomic update to IO-APIC RTE. | ||
1580 | */ | ||
1581 | static void migrate_ioapic_irq(int irq, cpumask_t mask) | ||
1582 | { | ||
1583 | struct irq_cfg *cfg = irq_cfg + irq; | ||
1584 | struct irq_desc *desc = irq_desc + irq; | ||
1585 | cpumask_t tmp, cleanup_mask; | ||
1586 | struct irte irte; | ||
1587 | int modify_ioapic_rte = desc->status & IRQ_LEVEL; | ||
1588 | unsigned int dest; | ||
1589 | unsigned long flags; | ||
1590 | |||
1591 | cpus_and(tmp, mask, cpu_online_map); | ||
1592 | if (cpus_empty(tmp)) | ||
1593 | return; | ||
1594 | |||
1595 | if (get_irte(irq, &irte)) | ||
1596 | return; | ||
1597 | |||
1598 | if (assign_irq_vector(irq, mask)) | ||
1599 | return; | ||
1600 | |||
1601 | cpus_and(tmp, cfg->domain, mask); | ||
1602 | dest = cpu_mask_to_apicid(tmp); | ||
1603 | |||
1604 | if (modify_ioapic_rte) { | ||
1605 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1606 | __target_IO_APIC_irq(irq, dest, cfg->vector); | ||
1607 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1608 | } | ||
1609 | |||
1610 | irte.vector = cfg->vector; | ||
1611 | irte.dest_id = IRTE_DEST(dest); | ||
1612 | |||
1613 | /* | ||
1614 | * Modified the IRTE and flushes the Interrupt entry cache. | ||
1615 | */ | ||
1616 | modify_irte(irq, &irte); | ||
1617 | |||
1618 | if (cfg->move_in_progress) { | ||
1619 | cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); | ||
1620 | cfg->move_cleanup_count = cpus_weight(cleanup_mask); | ||
1621 | send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); | ||
1622 | cfg->move_in_progress = 0; | ||
1623 | } | ||
1624 | |||
1625 | irq_desc[irq].affinity = mask; | ||
1626 | } | ||
1627 | |||
1628 | static int migrate_irq_remapped_level(int irq) | ||
1629 | { | ||
1630 | int ret = -1; | ||
1631 | |||
1632 | mask_IO_APIC_irq(irq); | ||
1633 | |||
1634 | if (io_apic_level_ack_pending(irq)) { | ||
1635 | /* | ||
1636 | * Interrupt in progress. Migrating irq now will change the | ||
1637 | * vector information in the IO-APIC RTE and that will confuse | ||
1638 | * the EOI broadcast performed by cpu. | ||
1639 | * So, delay the irq migration to the next instance. | ||
1640 | */ | ||
1641 | schedule_delayed_work(&ir_migration_work, 1); | ||
1642 | goto unmask; | ||
1643 | } | ||
1644 | |||
1645 | /* everthing is clear. we have right of way */ | ||
1646 | migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); | ||
1647 | |||
1648 | ret = 0; | ||
1649 | irq_desc[irq].status &= ~IRQ_MOVE_PENDING; | ||
1650 | cpus_clear(irq_desc[irq].pending_mask); | ||
1651 | |||
1652 | unmask: | ||
1653 | unmask_IO_APIC_irq(irq); | ||
1654 | return ret; | ||
1655 | } | ||
1656 | |||
1657 | static void ir_irq_migration(struct work_struct *work) | ||
1658 | { | ||
1659 | int irq; | ||
1660 | |||
1661 | for (irq = 0; irq < NR_IRQS; irq++) { | ||
1662 | struct irq_desc *desc = irq_desc + irq; | ||
1663 | if (desc->status & IRQ_MOVE_PENDING) { | ||
1664 | unsigned long flags; | ||
1665 | |||
1666 | spin_lock_irqsave(&desc->lock, flags); | ||
1667 | if (!desc->chip->set_affinity || | ||
1668 | !(desc->status & IRQ_MOVE_PENDING)) { | ||
1669 | desc->status &= ~IRQ_MOVE_PENDING; | ||
1670 | spin_unlock_irqrestore(&desc->lock, flags); | ||
1671 | continue; | ||
1672 | } | ||
1673 | |||
1674 | desc->chip->set_affinity(irq, | ||
1675 | irq_desc[irq].pending_mask); | ||
1676 | spin_unlock_irqrestore(&desc->lock, flags); | ||
1677 | } | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | /* | ||
1682 | * Migrates the IRQ destination in the process context. | ||
1683 | */ | ||
1684 | static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | ||
1685 | { | ||
1686 | if (irq_desc[irq].status & IRQ_LEVEL) { | ||
1687 | irq_desc[irq].status |= IRQ_MOVE_PENDING; | ||
1688 | irq_desc[irq].pending_mask = mask; | ||
1689 | migrate_irq_remapped_level(irq); | ||
1690 | return; | ||
1691 | } | ||
1692 | |||
1693 | migrate_ioapic_irq(irq, mask); | ||
1694 | } | ||
1695 | #endif | ||
1696 | |||
1400 | asmlinkage void smp_irq_move_cleanup_interrupt(void) | 1697 | asmlinkage void smp_irq_move_cleanup_interrupt(void) |
1401 | { | 1698 | { |
1402 | unsigned vector, me; | 1699 | unsigned vector, me; |
@@ -1453,6 +1750,17 @@ static void irq_complete_move(unsigned int irq) | |||
1453 | #else | 1750 | #else |
1454 | static inline void irq_complete_move(unsigned int irq) {} | 1751 | static inline void irq_complete_move(unsigned int irq) {} |
1455 | #endif | 1752 | #endif |
1753 | #ifdef CONFIG_INTR_REMAP | ||
/*
 * Acknowledge a level triggered irq by calling ack_x2APIC_irq().
 * The irq number itself is not used.
 */
static void ack_x2apic_level(unsigned int irq)
{
	ack_x2APIC_irq();
}
1758 | |||
/*
 * Acknowledge an edge triggered irq by calling ack_x2APIC_irq().
 * The irq number itself is not used.
 */
static void ack_x2apic_edge(unsigned int irq)
{
	ack_x2APIC_irq();
}
1763 | #endif | ||
1456 | 1764 | ||
1457 | static void ack_apic_edge(unsigned int irq) | 1765 | static void ack_apic_edge(unsigned int irq) |
1458 | { | 1766 | { |
@@ -1527,6 +1835,21 @@ static struct irq_chip ioapic_chip __read_mostly = { | |||
1527 | .retrigger = ioapic_retrigger_irq, | 1835 | .retrigger = ioapic_retrigger_irq, |
1528 | }; | 1836 | }; |
1529 | 1837 | ||
1838 | #ifdef CONFIG_INTR_REMAP | ||
1839 | static struct irq_chip ir_ioapic_chip __read_mostly = { | ||
1840 | .name = "IR-IO-APIC", | ||
1841 | .startup = startup_ioapic_irq, | ||
1842 | .mask = mask_IO_APIC_irq, | ||
1843 | .unmask = unmask_IO_APIC_irq, | ||
1844 | .ack = ack_x2apic_edge, | ||
1845 | .eoi = ack_x2apic_level, | ||
1846 | #ifdef CONFIG_SMP | ||
1847 | .set_affinity = set_ir_ioapic_affinity_irq, | ||
1848 | #endif | ||
1849 | .retrigger = ioapic_retrigger_irq, | ||
1850 | }; | ||
1851 | #endif | ||
1852 | |||
1530 | static inline void init_IO_APIC_traps(void) | 1853 | static inline void init_IO_APIC_traps(void) |
1531 | { | 1854 | { |
1532 | int irq; | 1855 | int irq; |
@@ -1712,6 +2035,8 @@ static inline void __init check_timer(void) | |||
1712 | * 8259A. | 2035 | * 8259A. |
1713 | */ | 2036 | */ |
1714 | if (pin1 == -1) { | 2037 | if (pin1 == -1) { |
2038 | if (intr_remapping_enabled) | ||
2039 | panic("BIOS bug: timer not connected to IO-APIC"); | ||
1715 | pin1 = pin2; | 2040 | pin1 = pin2; |
1716 | apic1 = apic2; | 2041 | apic1 = apic2; |
1717 | no_pin1 = 1; | 2042 | no_pin1 = 1; |
@@ -1738,6 +2063,8 @@ static inline void __init check_timer(void) | |||
1738 | clear_IO_APIC_pin(0, pin1); | 2063 | clear_IO_APIC_pin(0, pin1); |
1739 | goto out; | 2064 | goto out; |
1740 | } | 2065 | } |
2066 | if (intr_remapping_enabled) | ||
2067 | panic("timer doesn't work through Interrupt-remapped IO-APIC"); | ||
1741 | clear_IO_APIC_pin(apic1, pin1); | 2068 | clear_IO_APIC_pin(apic1, pin1); |
1742 | if (!no_pin1) | 2069 | if (!no_pin1) |
1743 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " | 2070 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " |
@@ -1854,8 +2181,6 @@ void __init setup_IO_APIC(void) | |||
1854 | setup_IO_APIC_irqs(); | 2181 | setup_IO_APIC_irqs(); |
1855 | init_IO_APIC_traps(); | 2182 | init_IO_APIC_traps(); |
1856 | check_timer(); | 2183 | check_timer(); |
1857 | if (!acpi_ioapic) | ||
1858 | print_IO_APIC(); | ||
1859 | } | 2184 | } |
1860 | 2185 | ||
1861 | struct sysfs_ioapic_data { | 2186 | struct sysfs_ioapic_data { |
@@ -1977,6 +2302,9 @@ void destroy_irq(unsigned int irq) | |||
1977 | 2302 | ||
1978 | dynamic_irq_cleanup(irq); | 2303 | dynamic_irq_cleanup(irq); |
1979 | 2304 | ||
2305 | #ifdef CONFIG_INTR_REMAP | ||
2306 | free_irte(irq); | ||
2307 | #endif | ||
1980 | spin_lock_irqsave(&vector_lock, flags); | 2308 | spin_lock_irqsave(&vector_lock, flags); |
1981 | __clear_irq_vector(irq); | 2309 | __clear_irq_vector(irq); |
1982 | spin_unlock_irqrestore(&vector_lock, flags); | 2310 | spin_unlock_irqrestore(&vector_lock, flags); |
@@ -1995,11 +2323,42 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
1995 | 2323 | ||
1996 | tmp = TARGET_CPUS; | 2324 | tmp = TARGET_CPUS; |
1997 | err = assign_irq_vector(irq, tmp); | 2325 | err = assign_irq_vector(irq, tmp); |
1998 | if (!err) { | 2326 | if (err) |
1999 | cpus_and(tmp, cfg->domain, tmp); | 2327 | return err; |
2000 | dest = cpu_mask_to_apicid(tmp); | 2328 | |
2329 | cpus_and(tmp, cfg->domain, tmp); | ||
2330 | dest = cpu_mask_to_apicid(tmp); | ||
2331 | |||
2332 | #ifdef CONFIG_INTR_REMAP | ||
2333 | if (irq_remapped(irq)) { | ||
2334 | struct irte irte; | ||
2335 | int ir_index; | ||
2336 | u16 sub_handle; | ||
2337 | |||
2338 | ir_index = map_irq_to_irte_handle(irq, &sub_handle); | ||
2339 | BUG_ON(ir_index == -1); | ||
2340 | |||
2341 | memset (&irte, 0, sizeof(irte)); | ||
2342 | |||
2343 | irte.present = 1; | ||
2344 | irte.dst_mode = INT_DEST_MODE; | ||
2345 | irte.trigger_mode = 0; /* edge */ | ||
2346 | irte.dlvry_mode = INT_DELIVERY_MODE; | ||
2347 | irte.vector = cfg->vector; | ||
2348 | irte.dest_id = IRTE_DEST(dest); | ||
2349 | |||
2350 | modify_irte(irq, &irte); | ||
2001 | 2351 | ||
2002 | msg->address_hi = MSI_ADDR_BASE_HI; | 2352 | msg->address_hi = MSI_ADDR_BASE_HI; |
2353 | msg->data = sub_handle; | ||
2354 | msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | | ||
2355 | MSI_ADDR_IR_SHV | | ||
2356 | MSI_ADDR_IR_INDEX1(ir_index) | | ||
2357 | MSI_ADDR_IR_INDEX2(ir_index); | ||
2358 | } else | ||
2359 | #endif | ||
2360 | { | ||
2361 | msg->address_hi = MSI_ADDR_BASE_HI; | ||
2003 | msg->address_lo = | 2362 | msg->address_lo = |
2004 | MSI_ADDR_BASE_LO | | 2363 | MSI_ADDR_BASE_LO | |
2005 | ((INT_DEST_MODE == 0) ? | 2364 | ((INT_DEST_MODE == 0) ? |
@@ -2049,6 +2408,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2049 | write_msi_msg(irq, &msg); | 2408 | write_msi_msg(irq, &msg); |
2050 | irq_desc[irq].affinity = mask; | 2409 | irq_desc[irq].affinity = mask; |
2051 | } | 2410 | } |
2411 | |||
2412 | #ifdef CONFIG_INTR_REMAP | ||
2413 | /* | ||
2414 | * Migrate the MSI irq to another cpumask. This migration is | ||
2415 | * done in the process context using interrupt-remapping hardware. | ||
2416 | */ | ||
2417 | static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2418 | { | ||
2419 | struct irq_cfg *cfg = irq_cfg + irq; | ||
2420 | unsigned int dest; | ||
2421 | cpumask_t tmp, cleanup_mask; | ||
2422 | struct irte irte; | ||
2423 | |||
2424 | cpus_and(tmp, mask, cpu_online_map); | ||
2425 | if (cpus_empty(tmp)) | ||
2426 | return; | ||
2427 | |||
2428 | if (get_irte(irq, &irte)) | ||
2429 | return; | ||
2430 | |||
2431 | if (assign_irq_vector(irq, mask)) | ||
2432 | return; | ||
2433 | |||
2434 | cpus_and(tmp, cfg->domain, mask); | ||
2435 | dest = cpu_mask_to_apicid(tmp); | ||
2436 | |||
2437 | irte.vector = cfg->vector; | ||
2438 | irte.dest_id = IRTE_DEST(dest); | ||
2439 | |||
2440 | /* | ||
2441 | * atomically update the IRTE with the new destination and vector. | ||
2442 | */ | ||
2443 | modify_irte(irq, &irte); | ||
2444 | |||
2445 | /* | ||
2446 | * After this point, all the interrupts will start arriving | ||
2447 | * at the new destination. So, time to cleanup the previous | ||
2448 | * vector allocation. | ||
2449 | */ | ||
2450 | if (cfg->move_in_progress) { | ||
2451 | cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); | ||
2452 | cfg->move_cleanup_count = cpus_weight(cleanup_mask); | ||
2453 | send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); | ||
2454 | cfg->move_in_progress = 0; | ||
2455 | } | ||
2456 | |||
2457 | irq_desc[irq].affinity = mask; | ||
2458 | } | ||
2459 | #endif | ||
2052 | #endif /* CONFIG_SMP */ | 2460 | #endif /* CONFIG_SMP */ |
2053 | 2461 | ||
2054 | /* | 2462 | /* |
@@ -2066,26 +2474,157 @@ static struct irq_chip msi_chip = { | |||
2066 | .retrigger = ioapic_retrigger_irq, | 2474 | .retrigger = ioapic_retrigger_irq, |
2067 | }; | 2475 | }; |
2068 | 2476 | ||
2069 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | 2477 | #ifdef CONFIG_INTR_REMAP |
2478 | static struct irq_chip msi_ir_chip = { | ||
2479 | .name = "IR-PCI-MSI", | ||
2480 | .unmask = unmask_msi_irq, | ||
2481 | .mask = mask_msi_irq, | ||
2482 | .ack = ack_x2apic_edge, | ||
2483 | #ifdef CONFIG_SMP | ||
2484 | .set_affinity = ir_set_msi_irq_affinity, | ||
2485 | #endif | ||
2486 | .retrigger = ioapic_retrigger_irq, | ||
2487 | }; | ||
2488 | |||
2489 | /* | ||
2490 | * Map the PCI dev to the corresponding remapping hardware unit | ||
2491 | * and allocate 'nvec' consecutive interrupt-remapping table entries | ||
2492 | * in it. | ||
2493 | */ | ||
2494 | static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) | ||
2070 | { | 2495 | { |
2496 | struct intel_iommu *iommu; | ||
2497 | int index; | ||
2498 | |||
2499 | iommu = map_dev_to_ir(dev); | ||
2500 | if (!iommu) { | ||
2501 | printk(KERN_ERR | ||
2502 | "Unable to map PCI %s to iommu\n", pci_name(dev)); | ||
2503 | return -ENOENT; | ||
2504 | } | ||
2505 | |||
2506 | index = alloc_irte(iommu, irq, nvec); | ||
2507 | if (index < 0) { | ||
2508 | printk(KERN_ERR | ||
2509 | "Unable to allocate %d IRTE for PCI %s\n", nvec, | ||
2510 | pci_name(dev)); | ||
2511 | return -ENOSPC; | ||
2512 | } | ||
2513 | return index; | ||
2514 | } | ||
2515 | #endif | ||
2516 | |||
2517 | static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) | ||
2518 | { | ||
2519 | int ret; | ||
2071 | struct msi_msg msg; | 2520 | struct msi_msg msg; |
2521 | |||
2522 | ret = msi_compose_msg(dev, irq, &msg); | ||
2523 | if (ret < 0) | ||
2524 | return ret; | ||
2525 | |||
2526 | set_irq_msi(irq, desc); | ||
2527 | write_msi_msg(irq, &msg); | ||
2528 | |||
2529 | #ifdef CONFIG_INTR_REMAP | ||
2530 | if (irq_remapped(irq)) { | ||
2531 | struct irq_desc *desc = irq_desc + irq; | ||
2532 | /* | ||
2533 | * irq migration in process context | ||
2534 | */ | ||
2535 | desc->status |= IRQ_MOVE_PCNTXT; | ||
2536 | set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); | ||
2537 | } else | ||
2538 | #endif | ||
2539 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); | ||
2540 | |||
2541 | return 0; | ||
2542 | } | ||
2543 | |||
2544 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | ||
2545 | { | ||
2072 | int irq, ret; | 2546 | int irq, ret; |
2547 | |||
2073 | irq = create_irq(); | 2548 | irq = create_irq(); |
2074 | if (irq < 0) | 2549 | if (irq < 0) |
2075 | return irq; | 2550 | return irq; |
2076 | 2551 | ||
2077 | ret = msi_compose_msg(dev, irq, &msg); | 2552 | #ifdef CONFIG_INTR_REMAP |
2553 | if (!intr_remapping_enabled) | ||
2554 | goto no_ir; | ||
2555 | |||
2556 | ret = msi_alloc_irte(dev, irq, 1); | ||
2557 | if (ret < 0) | ||
2558 | goto error; | ||
2559 | no_ir: | ||
2560 | #endif | ||
2561 | ret = setup_msi_irq(dev, desc, irq); | ||
2078 | if (ret < 0) { | 2562 | if (ret < 0) { |
2079 | destroy_irq(irq); | 2563 | destroy_irq(irq); |
2080 | return ret; | 2564 | return ret; |
2081 | } | 2565 | } |
2566 | return 0; | ||
2082 | 2567 | ||
2083 | set_irq_msi(irq, desc); | 2568 | #ifdef CONFIG_INTR_REMAP |
2084 | write_msi_msg(irq, &msg); | 2569 | error: |
2570 | destroy_irq(irq); | ||
2571 | return ret; | ||
2572 | #endif | ||
2573 | } | ||
2085 | 2574 | ||
2086 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); | 2575 | int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) |
2576 | { | ||
2577 | int irq, ret, sub_handle; | ||
2578 | struct msi_desc *desc; | ||
2579 | #ifdef CONFIG_INTR_REMAP | ||
2580 | struct intel_iommu *iommu = 0; | ||
2581 | int index = 0; | ||
2582 | #endif | ||
2583 | |||
2584 | sub_handle = 0; | ||
2585 | list_for_each_entry(desc, &dev->msi_list, list) { | ||
2586 | irq = create_irq(); | ||
2587 | if (irq < 0) | ||
2588 | return irq; | ||
2589 | #ifdef CONFIG_INTR_REMAP | ||
2590 | if (!intr_remapping_enabled) | ||
2591 | goto no_ir; | ||
2087 | 2592 | ||
2593 | if (!sub_handle) { | ||
2594 | /* | ||
2595 | * allocate the consecutive block of IRTE's | ||
2596 | * for 'nvec' | ||
2597 | */ | ||
2598 | index = msi_alloc_irte(dev, irq, nvec); | ||
2599 | if (index < 0) { | ||
2600 | ret = index; | ||
2601 | goto error; | ||
2602 | } | ||
2603 | } else { | ||
2604 | iommu = map_dev_to_ir(dev); | ||
2605 | if (!iommu) { | ||
2606 | ret = -ENOENT; | ||
2607 | goto error; | ||
2608 | } | ||
2609 | /* | ||
2610 | * setup the mapping between the irq and the IRTE | ||
2611 | * base index, the sub_handle pointing to the | ||
2612 | * appropriate interrupt remap table entry. | ||
2613 | */ | ||
2614 | set_irte_irq(irq, iommu, index, sub_handle); | ||
2615 | } | ||
2616 | no_ir: | ||
2617 | #endif | ||
2618 | ret = setup_msi_irq(dev, desc, irq); | ||
2619 | if (ret < 0) | ||
2620 | goto error; | ||
2621 | sub_handle++; | ||
2622 | } | ||
2088 | return 0; | 2623 | return 0; |
2624 | |||
2625 | error: | ||
2626 | destroy_irq(irq); | ||
2627 | return ret; | ||
2089 | } | 2628 | } |
2090 | 2629 | ||
2091 | void arch_teardown_msi_irq(unsigned int irq) | 2630 | void arch_teardown_msi_irq(unsigned int irq) |
@@ -2333,6 +2872,10 @@ void __init setup_ioapic_dest(void) | |||
2333 | setup_IO_APIC_irq(ioapic, pin, irq, | 2872 | setup_IO_APIC_irq(ioapic, pin, irq, |
2334 | irq_trigger(irq_entry), | 2873 | irq_trigger(irq_entry), |
2335 | irq_polarity(irq_entry)); | 2874 | irq_polarity(irq_entry)); |
2875 | #ifdef CONFIG_INTR_REMAP | ||
2876 | else if (intr_remapping_enabled) | ||
2877 | set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); | ||
2878 | #endif | ||
2336 | else | 2879 | else |
2337 | set_ioapic_affinity_irq(irq, TARGET_CPUS); | 2880 | set_ioapic_affinity_irq(irq, TARGET_CPUS); |
2338 | } | 2881 | } |
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 50e5e4a31c85..191914302744 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/thread_info.h> | 15 | #include <linux/thread_info.h> |
16 | #include <linux/syscalls.h> | 16 | #include <linux/syscalls.h> |
17 | #include <asm/syscalls.h> | ||
17 | 18 | ||
18 | /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ | 19 | /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ |
19 | static void set_bitmap(unsigned long *bitmap, unsigned int base, | 20 | static void set_bitmap(unsigned long *bitmap, unsigned int base, |
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 3f7537b669d3..f1c688e46f35 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c | |||
@@ -20,6 +20,8 @@ | |||
20 | 20 | ||
21 | #ifdef CONFIG_X86_32 | 21 | #ifdef CONFIG_X86_32 |
22 | #include <mach_apic.h> | 22 | #include <mach_apic.h> |
23 | #include <mach_ipi.h> | ||
24 | |||
23 | /* | 25 | /* |
24 | * the following functions deal with sending IPIs between CPUs. | 26 | * the following functions deal with sending IPIs between CPUs. |
25 | * | 27 | * |
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) | |||
147 | } | 149 | } |
148 | 150 | ||
149 | /* must come after the send_IPI functions above for inlining */ | 151 | /* must come after the send_IPI functions above for inlining */ |
150 | #include <mach_ipi.h> | ||
151 | static int convert_apicid_to_cpu(int apic_id) | 152 | static int convert_apicid_to_cpu(int apic_id) |
152 | { | 153 | { |
153 | int i; | 154 | int i; |
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1cf8c1fcc088..b71e02d42f4f 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -325,7 +325,7 @@ skip: | |||
325 | for_each_online_cpu(j) | 325 | for_each_online_cpu(j) |
326 | seq_printf(p, "%10u ", | 326 | seq_printf(p, "%10u ", |
327 | per_cpu(irq_stat,j).irq_call_count); | 327 | per_cpu(irq_stat,j).irq_call_count); |
328 | seq_printf(p, " function call interrupts\n"); | 328 | seq_printf(p, " Function call interrupts\n"); |
329 | seq_printf(p, "TLB: "); | 329 | seq_printf(p, "TLB: "); |
330 | for_each_online_cpu(j) | 330 | for_each_online_cpu(j) |
331 | seq_printf(p, "%10u ", | 331 | seq_printf(p, "%10u ", |
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1f78b238d8d2..f065fe9071b9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -129,7 +129,7 @@ skip: | |||
129 | seq_printf(p, "CAL: "); | 129 | seq_printf(p, "CAL: "); |
130 | for_each_online_cpu(j) | 130 | for_each_online_cpu(j) |
131 | seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); | 131 | seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); |
132 | seq_printf(p, " function call interrupts\n"); | 132 | seq_printf(p, " Function call interrupts\n"); |
133 | seq_printf(p, "TLB: "); | 133 | seq_printf(p, "TLB: "); |
134 | for_each_online_cpu(j) | 134 | for_each_online_cpu(j) |
135 | seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); | 135 | seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); |
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index d66914287ee1..9200a1e2752d 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c | |||
@@ -74,6 +74,15 @@ void __init init_ISA_irqs (void) | |||
74 | } | 74 | } |
75 | } | 75 | } |
76 | 76 | ||
77 | /* | ||
78 | * IRQ2 is cascade interrupt to second interrupt controller | ||
79 | */ | ||
80 | static struct irqaction irq2 = { | ||
81 | .handler = no_action, | ||
82 | .mask = CPU_MASK_NONE, | ||
83 | .name = "cascade", | ||
84 | }; | ||
85 | |||
77 | /* Overridden in paravirt.c */ | 86 | /* Overridden in paravirt.c */ |
78 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | 87 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); |
79 | 88 | ||
@@ -98,6 +107,46 @@ void __init native_init_IRQ(void) | |||
98 | set_intr_gate(vector, interrupt[i]); | 107 | set_intr_gate(vector, interrupt[i]); |
99 | } | 108 | } |
100 | 109 | ||
110 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) | ||
111 | /* | ||
112 | * IRQ0 must be given a fixed assignment and initialized, | ||
113 | * because it's used before the IO-APIC is set up. | ||
114 | */ | ||
115 | set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); | ||
116 | |||
117 | /* | ||
118 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | ||
119 | * IPI, driven by wakeup. | ||
120 | */ | ||
121 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | ||
122 | |||
123 | /* IPI for invalidation */ | ||
124 | alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); | ||
125 | |||
126 | /* IPI for generic function call */ | ||
127 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | ||
128 | |||
129 | /* IPI for single call function */ | ||
130 | set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); | ||
131 | #endif | ||
132 | |||
133 | #ifdef CONFIG_X86_LOCAL_APIC | ||
134 | /* self generated IPI for local APIC timer */ | ||
135 | alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); | ||
136 | |||
137 | /* IPI vectors for APIC spurious and error interrupts */ | ||
138 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | ||
139 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | ||
140 | #endif | ||
141 | |||
142 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) | ||
143 | /* thermal monitor LVT interrupt */ | ||
144 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | ||
145 | #endif | ||
146 | |||
147 | if (!acpi_ioapic) | ||
148 | setup_irq(2, &irq2); | ||
149 | |||
101 | /* setup after call gates are initialised (usually add in | 150 | /* setup after call gates are initialised (usually add in |
102 | * the architecture specific gates) | 151 | * the architecture specific gates) |
103 | */ | 152 | */ |
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1f26fd9ec4f4..5b5be9d43c2a 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c | |||
@@ -135,7 +135,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | |||
135 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | 135 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 |
136 | }; | 136 | }; |
137 | 137 | ||
138 | static void __init init_ISA_irqs (void) | 138 | void __init init_ISA_irqs(void) |
139 | { | 139 | { |
140 | int i; | 140 | int i; |
141 | 141 | ||
@@ -164,22 +164,8 @@ static void __init init_ISA_irqs (void) | |||
164 | 164 | ||
165 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | 165 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); |
166 | 166 | ||
167 | void __init native_init_IRQ(void) | 167 | static void __init smp_intr_init(void) |
168 | { | 168 | { |
169 | int i; | ||
170 | |||
171 | init_ISA_irqs(); | ||
172 | /* | ||
173 | * Cover the whole vector space, no vector can escape | ||
174 | * us. (some of these will be overridden and become | ||
175 | * 'special' SMP interrupts) | ||
176 | */ | ||
177 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
178 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
179 | if (vector != IA32_SYSCALL_VECTOR) | ||
180 | set_intr_gate(vector, interrupt[i]); | ||
181 | } | ||
182 | |||
183 | #ifdef CONFIG_SMP | 169 | #ifdef CONFIG_SMP |
184 | /* | 170 | /* |
185 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | 171 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper |
@@ -207,6 +193,12 @@ void __init native_init_IRQ(void) | |||
207 | /* Low priority IPI to cleanup after moving an irq */ | 193 | /* Low priority IPI to cleanup after moving an irq */ |
208 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | 194 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); |
209 | #endif | 195 | #endif |
196 | } | ||
197 | |||
198 | static void __init apic_intr_init(void) | ||
199 | { | ||
200 | smp_intr_init(); | ||
201 | |||
210 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 202 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); |
211 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | 203 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); |
212 | 204 | ||
@@ -216,6 +208,25 @@ void __init native_init_IRQ(void) | |||
216 | /* IPI vectors for APIC spurious and error interrupts */ | 208 | /* IPI vectors for APIC spurious and error interrupts */ |
217 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 209 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
218 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 210 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
211 | } | ||
212 | |||
213 | void __init native_init_IRQ(void) | ||
214 | { | ||
215 | int i; | ||
216 | |||
217 | init_ISA_irqs(); | ||
218 | /* | ||
219 | * Cover the whole vector space, no vector can escape | ||
220 | * us. (some of these will be overridden and become | ||
221 | * 'special' SMP interrupts) | ||
222 | */ | ||
223 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
224 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
225 | if (vector != IA32_SYSCALL_VECTOR) | ||
226 | set_intr_gate(vector, interrupt[i]); | ||
227 | } | ||
228 | |||
229 | apic_intr_init(); | ||
219 | 230 | ||
220 | if (!acpi_ioapic) | 231 | if (!acpi_ioapic) |
221 | setup_irq(2, &irq2); | 232 | setup_irq(2, &irq2); |
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 7377ccb21335..304d8bad6559 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c | |||
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges); | |||
16 | static u32 *flush_words; | 16 | static u32 *flush_words; |
17 | 17 | ||
18 | struct pci_device_id k8_nb_ids[] = { | 18 | struct pci_device_id k8_nb_ids[] = { |
19 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, | 19 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, |
20 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, | 20 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, |
21 | { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, | ||
21 | {} | 22 | {} |
22 | }; | 23 | }; |
23 | EXPORT_SYMBOL(k8_nb_ids); | 24 | EXPORT_SYMBOL(k8_nb_ids); |
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index f2d43bc75514..ff7d3b0124f1 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c | |||
@@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) | |||
139 | if (PageHighMem(pg)) { | 139 | if (PageHighMem(pg)) { |
140 | data = ioremap_cache(pa_data, sizeof(*data)); | 140 | data = ioremap_cache(pa_data, sizeof(*data)); |
141 | if (!data) { | 141 | if (!data) { |
142 | kfree(node); | ||
142 | error = -ENXIO; | 143 | error = -ENXIO; |
143 | goto err_dir; | 144 | goto err_dir; |
144 | } | 145 | } |
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f47f0eb886b8..10435a120d22 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1; | |||
69 | */ | 69 | */ |
70 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | 70 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) |
71 | { | 71 | { |
72 | #ifndef CONFIG_X86_32 | ||
73 | u32 *gdb_regs32 = (u32 *)gdb_regs; | ||
74 | #endif | ||
72 | gdb_regs[GDB_AX] = regs->ax; | 75 | gdb_regs[GDB_AX] = regs->ax; |
73 | gdb_regs[GDB_BX] = regs->bx; | 76 | gdb_regs[GDB_BX] = regs->bx; |
74 | gdb_regs[GDB_CX] = regs->cx; | 77 | gdb_regs[GDB_CX] = regs->cx; |
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | |||
76 | gdb_regs[GDB_SI] = regs->si; | 79 | gdb_regs[GDB_SI] = regs->si; |
77 | gdb_regs[GDB_DI] = regs->di; | 80 | gdb_regs[GDB_DI] = regs->di; |
78 | gdb_regs[GDB_BP] = regs->bp; | 81 | gdb_regs[GDB_BP] = regs->bp; |
79 | gdb_regs[GDB_PS] = regs->flags; | ||
80 | gdb_regs[GDB_PC] = regs->ip; | 82 | gdb_regs[GDB_PC] = regs->ip; |
81 | #ifdef CONFIG_X86_32 | 83 | #ifdef CONFIG_X86_32 |
84 | gdb_regs[GDB_PS] = regs->flags; | ||
82 | gdb_regs[GDB_DS] = regs->ds; | 85 | gdb_regs[GDB_DS] = regs->ds; |
83 | gdb_regs[GDB_ES] = regs->es; | 86 | gdb_regs[GDB_ES] = regs->es; |
84 | gdb_regs[GDB_CS] = regs->cs; | 87 | gdb_regs[GDB_CS] = regs->cs; |
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | |||
94 | gdb_regs[GDB_R13] = regs->r13; | 97 | gdb_regs[GDB_R13] = regs->r13; |
95 | gdb_regs[GDB_R14] = regs->r14; | 98 | gdb_regs[GDB_R14] = regs->r14; |
96 | gdb_regs[GDB_R15] = regs->r15; | 99 | gdb_regs[GDB_R15] = regs->r15; |
100 | gdb_regs32[GDB_PS] = regs->flags; | ||
101 | gdb_regs32[GDB_CS] = regs->cs; | ||
102 | gdb_regs32[GDB_SS] = regs->ss; | ||
97 | #endif | 103 | #endif |
98 | gdb_regs[GDB_SP] = regs->sp; | 104 | gdb_regs[GDB_SP] = regs->sp; |
99 | } | 105 | } |
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | |||
112 | */ | 118 | */ |
113 | void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | 119 | void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) |
114 | { | 120 | { |
121 | #ifndef CONFIG_X86_32 | ||
122 | u32 *gdb_regs32 = (u32 *)gdb_regs; | ||
123 | #endif | ||
115 | gdb_regs[GDB_AX] = 0; | 124 | gdb_regs[GDB_AX] = 0; |
116 | gdb_regs[GDB_BX] = 0; | 125 | gdb_regs[GDB_BX] = 0; |
117 | gdb_regs[GDB_CX] = 0; | 126 | gdb_regs[GDB_CX] = 0; |
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | |||
129 | gdb_regs[GDB_FS] = 0xFFFF; | 138 | gdb_regs[GDB_FS] = 0xFFFF; |
130 | gdb_regs[GDB_GS] = 0xFFFF; | 139 | gdb_regs[GDB_GS] = 0xFFFF; |
131 | #else | 140 | #else |
132 | gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); | 141 | gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); |
133 | gdb_regs[GDB_PC] = 0; | 142 | gdb_regs32[GDB_CS] = __KERNEL_CS; |
143 | gdb_regs32[GDB_SS] = __KERNEL_DS; | ||
144 | gdb_regs[GDB_PC] = p->thread.ip; | ||
134 | gdb_regs[GDB_R8] = 0; | 145 | gdb_regs[GDB_R8] = 0; |
135 | gdb_regs[GDB_R9] = 0; | 146 | gdb_regs[GDB_R9] = 0; |
136 | gdb_regs[GDB_R10] = 0; | 147 | gdb_regs[GDB_R10] = 0; |
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) | |||
153 | */ | 164 | */ |
154 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | 165 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) |
155 | { | 166 | { |
167 | #ifndef CONFIG_X86_32 | ||
168 | u32 *gdb_regs32 = (u32 *)gdb_regs; | ||
169 | #endif | ||
156 | regs->ax = gdb_regs[GDB_AX]; | 170 | regs->ax = gdb_regs[GDB_AX]; |
157 | regs->bx = gdb_regs[GDB_BX]; | 171 | regs->bx = gdb_regs[GDB_BX]; |
158 | regs->cx = gdb_regs[GDB_CX]; | 172 | regs->cx = gdb_regs[GDB_CX]; |
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | |||
160 | regs->si = gdb_regs[GDB_SI]; | 174 | regs->si = gdb_regs[GDB_SI]; |
161 | regs->di = gdb_regs[GDB_DI]; | 175 | regs->di = gdb_regs[GDB_DI]; |
162 | regs->bp = gdb_regs[GDB_BP]; | 176 | regs->bp = gdb_regs[GDB_BP]; |
163 | regs->flags = gdb_regs[GDB_PS]; | ||
164 | regs->ip = gdb_regs[GDB_PC]; | 177 | regs->ip = gdb_regs[GDB_PC]; |
165 | #ifdef CONFIG_X86_32 | 178 | #ifdef CONFIG_X86_32 |
179 | regs->flags = gdb_regs[GDB_PS]; | ||
166 | regs->ds = gdb_regs[GDB_DS]; | 180 | regs->ds = gdb_regs[GDB_DS]; |
167 | regs->es = gdb_regs[GDB_ES]; | 181 | regs->es = gdb_regs[GDB_ES]; |
168 | regs->cs = gdb_regs[GDB_CS]; | 182 | regs->cs = gdb_regs[GDB_CS]; |
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | |||
175 | regs->r13 = gdb_regs[GDB_R13]; | 189 | regs->r13 = gdb_regs[GDB_R13]; |
176 | regs->r14 = gdb_regs[GDB_R14]; | 190 | regs->r14 = gdb_regs[GDB_R14]; |
177 | regs->r15 = gdb_regs[GDB_R15]; | 191 | regs->r15 = gdb_regs[GDB_R15]; |
192 | regs->flags = gdb_regs32[GDB_PS]; | ||
193 | regs->cs = gdb_regs32[GDB_CS]; | ||
194 | regs->ss = gdb_regs32[GDB_SS]; | ||
178 | #endif | 195 | #endif |
179 | } | 196 | } |
180 | 197 | ||
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, | |||
378 | if (remcomInBuffer[0] == 's') { | 395 | if (remcomInBuffer[0] == 's') { |
379 | linux_regs->flags |= X86_EFLAGS_TF; | 396 | linux_regs->flags |= X86_EFLAGS_TF; |
380 | kgdb_single_step = 1; | 397 | kgdb_single_step = 1; |
381 | if (kgdb_contthread) { | 398 | atomic_set(&kgdb_cpu_doing_single_step, |
382 | atomic_set(&kgdb_cpu_doing_single_step, | 399 | raw_smp_processor_id()); |
383 | raw_smp_processor_id()); | ||
384 | } | ||
385 | } | 400 | } |
386 | 401 | ||
387 | get_debugreg(dr6, 6); | 402 | get_debugreg(dr6, 6); |
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) | |||
440 | return NOTIFY_DONE; | 455 | return NOTIFY_DONE; |
441 | 456 | ||
442 | case DIE_NMI_IPI: | 457 | case DIE_NMI_IPI: |
443 | if (atomic_read(&kgdb_active) != -1) { | 458 | /* Just ignore, we will handle the roundup on DIE_NMI. */ |
444 | /* KGDB CPU roundup */ | ||
445 | kgdb_nmicallback(raw_smp_processor_id(), regs); | ||
446 | was_in_debug_nmi[raw_smp_processor_id()] = 1; | ||
447 | touch_nmi_watchdog(); | ||
448 | } | ||
449 | return NOTIFY_DONE; | 459 | return NOTIFY_DONE; |
450 | 460 | ||
451 | case DIE_NMIUNKNOWN: | 461 | case DIE_NMIUNKNOWN: |
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) | |||
466 | 476 | ||
467 | case DIE_DEBUG: | 477 | case DIE_DEBUG: |
468 | if (atomic_read(&kgdb_cpu_doing_single_step) == | 478 | if (atomic_read(&kgdb_cpu_doing_single_step) == |
469 | raw_smp_processor_id() && | 479 | raw_smp_processor_id()) { |
470 | user_mode(regs)) | 480 | if (user_mode(regs)) |
471 | return single_step_cont(regs, args); | 481 | return single_step_cont(regs, args); |
482 | break; | ||
483 | } else if (test_thread_flag(TIF_SINGLESTEP)) | ||
484 | /* This means a user thread is single stepping | ||
485 | * a system call which should be ignored | ||
486 | */ | ||
487 | return NOTIFY_DONE; | ||
472 | /* fall through */ | 488 | /* fall through */ |
473 | default: | 489 | default: |
474 | if (user_mode(regs)) | 490 | if (user_mode(regs)) |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8b7a3cf37d2b..478bca986eca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c | |||
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void) | |||
178 | kvm_deferred_mmu_op(&ftlb, sizeof ftlb); | 178 | kvm_deferred_mmu_op(&ftlb, sizeof ftlb); |
179 | } | 179 | } |
180 | 180 | ||
181 | static void kvm_release_pt(u32 pfn) | 181 | static void kvm_release_pt(unsigned long pfn) |
182 | { | 182 | { |
183 | struct kvm_mmu_op_release_pt rpt = { | 183 | struct kvm_mmu_op_release_pt rpt = { |
184 | .header.op = KVM_MMU_OP_RELEASE_PT, | 184 | .header.op = KVM_MMU_OP_RELEASE_PT, |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def06ca91..774ac4991568 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) | |||
78 | return ret; | 78 | return ret; |
79 | } | 79 | } |
80 | 80 | ||
81 | /* | ||
82 | * If we don't do that, there is the possibility that the guest | ||
83 | * will calibrate under heavy load - thus, getting a lower lpj - | ||
84 | * and execute the delays themselves without load. This is wrong, | ||
85 | * because no delay loop can finish beforehand. | ||
86 | * Any heuristics is subject to fail, because ultimately, a large | ||
87 | * poll of guests can be running and trouble each other. So we preset | ||
88 | * lpj here | ||
89 | */ | ||
90 | static unsigned long kvm_get_tsc_khz(void) | ||
91 | { | ||
92 | return preset_lpj; | ||
93 | } | ||
94 | |||
95 | static void kvm_get_preset_lpj(void) | ||
96 | { | ||
97 | struct pvclock_vcpu_time_info *src; | ||
98 | unsigned long khz; | ||
99 | u64 lpj; | ||
100 | |||
101 | src = &per_cpu(hv_clock, 0); | ||
102 | khz = pvclock_tsc_khz(src); | ||
103 | |||
104 | lpj = ((u64)khz * 1000); | ||
105 | do_div(lpj, HZ); | ||
106 | preset_lpj = lpj; | ||
107 | } | ||
108 | |||
81 | static struct clocksource kvm_clock = { | 109 | static struct clocksource kvm_clock = { |
82 | .name = "kvm-clock", | 110 | .name = "kvm-clock", |
83 | .read = kvm_clock_read, | 111 | .read = kvm_clock_read, |
@@ -153,6 +181,7 @@ void __init kvmclock_init(void) | |||
153 | pv_time_ops.get_wallclock = kvm_get_wallclock; | 181 | pv_time_ops.get_wallclock = kvm_get_wallclock; |
154 | pv_time_ops.set_wallclock = kvm_set_wallclock; | 182 | pv_time_ops.set_wallclock = kvm_set_wallclock; |
155 | pv_time_ops.sched_clock = kvm_clock_read; | 183 | pv_time_ops.sched_clock = kvm_clock_read; |
184 | pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; | ||
156 | #ifdef CONFIG_X86_LOCAL_APIC | 185 | #ifdef CONFIG_X86_LOCAL_APIC |
157 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | 186 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; |
158 | #endif | 187 | #endif |
@@ -163,6 +192,7 @@ void __init kvmclock_init(void) | |||
163 | #ifdef CONFIG_KEXEC | 192 | #ifdef CONFIG_KEXEC |
164 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 193 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
165 | #endif | 194 | #endif |
195 | kvm_get_preset_lpj(); | ||
166 | clocksource_register(&kvm_clock); | 196 | clocksource_register(&kvm_clock); |
167 | } | 197 | } |
168 | } | 198 | } |
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b68e21f06f4f..eee32b43fee3 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/ldt.h> | 18 | #include <asm/ldt.h> |
19 | #include <asm/desc.h> | 19 | #include <asm/desc.h> |
20 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
21 | #include <asm/syscalls.h> | ||
21 | 22 | ||
22 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
23 | static void flush_ldt(void *current_mm) | 24 | static void flush_ldt(void *current_mm) |
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |||
51 | memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, | 52 | memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, |
52 | (mincount - oldsize) * LDT_ENTRY_SIZE); | 53 | (mincount - oldsize) * LDT_ENTRY_SIZE); |
53 | 54 | ||
55 | paravirt_alloc_ldt(newldt, mincount); | ||
56 | |||
54 | #ifdef CONFIG_X86_64 | 57 | #ifdef CONFIG_X86_64 |
55 | /* CHECKME: Do we really need this ? */ | 58 | /* CHECKME: Do we really need this ? */ |
56 | wmb(); | 59 | wmb(); |
@@ -73,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |||
73 | #endif | 76 | #endif |
74 | } | 77 | } |
75 | if (oldsize) { | 78 | if (oldsize) { |
79 | paravirt_free_ldt(oldldt, oldsize); | ||
76 | if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) | 80 | if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) |
77 | vfree(oldldt); | 81 | vfree(oldldt); |
78 | else | 82 | else |
@@ -84,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) | |||
84 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) | 88 | static inline int copy_ldt(mm_context_t *new, mm_context_t *old) |
85 | { | 89 | { |
86 | int err = alloc_ldt(new, old->size, 0); | 90 | int err = alloc_ldt(new, old->size, 0); |
91 | int i; | ||
87 | 92 | ||
88 | if (err < 0) | 93 | if (err < 0) |
89 | return err; | 94 | return err; |
90 | memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); | 95 | |
96 | for(i = 0; i < old->size; i++) | ||
97 | write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); | ||
91 | return 0; | 98 | return 0; |
92 | } | 99 | } |
93 | 100 | ||
@@ -124,6 +131,7 @@ void destroy_context(struct mm_struct *mm) | |||
124 | if (mm == current->active_mm) | 131 | if (mm == current->active_mm) |
125 | clear_LDT(); | 132 | clear_LDT(); |
126 | #endif | 133 | #endif |
134 | paravirt_free_ldt(mm->context.ldt, mm->context.size); | ||
127 | if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) | 135 | if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) |
128 | vfree(mm->context.ldt); | 136 | vfree(mm->context.ldt); |
129 | else | 137 | else |
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c deleted file mode 100644 index 652fa5c38ebe..000000000000 --- a/arch/x86/kernel/microcode.c +++ /dev/null | |||
@@ -1,853 +0,0 @@ | |||
1 | /* | ||
2 | * Intel CPU Microcode Update Driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> | ||
5 | * 2006 Shaohua Li <shaohua.li@intel.com> | ||
6 | * | ||
7 | * This driver allows to upgrade microcode on Intel processors | ||
8 | * belonging to IA-32 family - PentiumPro, Pentium II, | ||
9 | * Pentium III, Xeon, Pentium 4, etc. | ||
10 | * | ||
11 | * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture | ||
12 | * Software Developer's Manual | ||
13 | * Order Number 253668 or free download from: | ||
14 | * | ||
15 | * http://developer.intel.com/design/pentium4/manuals/253668.htm | ||
16 | * | ||
17 | * For more information, go to http://www.urbanmyth.org/microcode | ||
18 | * | ||
19 | * This program is free software; you can redistribute it and/or | ||
20 | * modify it under the terms of the GNU General Public License | ||
21 | * as published by the Free Software Foundation; either version | ||
22 | * 2 of the License, or (at your option) any later version. | ||
23 | * | ||
24 | * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
25 | * Initial release. | ||
26 | * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
27 | * Added read() support + cleanups. | ||
28 | * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
29 | * Added 'device trimming' support. open(O_WRONLY) zeroes | ||
30 | * and frees the saved copy of applied microcode. | ||
31 | * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
32 | * Made to use devfs (/dev/cpu/microcode) + cleanups. | ||
33 | * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
34 | * Added misc device support (now uses both devfs and misc). | ||
35 | * Added MICROCODE_IOCFREE ioctl to clear memory. | ||
36 | * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
37 | * Messages for error cases (non Intel & no suitable microcode). | ||
38 | * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> | ||
39 | * Removed ->release(). Removed exclusive open and status bitmap. | ||
40 | * Added microcode_rwsem to serialize read()/write()/ioctl(). | ||
41 | * Removed global kernel lock usage. | ||
42 | * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> | ||
43 | * Write 0 to 0x8B msr and then cpuid before reading revision, | ||
44 | * so that it works even if there were no update done by the | ||
45 | * BIOS. Otherwise, reading from 0x8B gives junk (which happened | ||
46 | * to be 0 on my machine which is why it worked even when I | ||
47 | * disabled update by the BIOS) | ||
48 | * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. | ||
49 | * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and | ||
50 | * Tigran Aivazian <tigran@veritas.com> | ||
51 | * Intel Pentium 4 processor support and bugfixes. | ||
52 | * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> | ||
53 | * Bugfix for HT (Hyper-Threading) enabled processors | ||
54 | * whereby processor resources are shared by all logical processors | ||
55 | * in a single CPU package. | ||
56 | * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and | ||
57 | * Tigran Aivazian <tigran@veritas.com>, | ||
58 | * Serialize updates as required on HT processors due to speculative | ||
59 | * nature of implementation. | ||
60 | * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> | ||
61 | * Fix the panic when writing zero-length microcode chunk. | ||
62 | * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, | ||
63 | * Jun Nakajima <jun.nakajima@intel.com> | ||
64 | * Support for the microcode updates in the new format. | ||
65 | * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> | ||
66 | * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl | ||
67 | * because we no longer hold a copy of applied microcode | ||
68 | * in kernel memory. | ||
69 | * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> | ||
70 | * Fix sigmatch() macro to handle old CPUs with pf == 0. | ||
71 | * Thanks to Stuart Swales for pointing out this bug. | ||
72 | */ | ||
73 | |||
74 | //#define DEBUG /* pr_debug */ | ||
75 | #include <linux/capability.h> | ||
76 | #include <linux/kernel.h> | ||
77 | #include <linux/init.h> | ||
78 | #include <linux/sched.h> | ||
79 | #include <linux/smp_lock.h> | ||
80 | #include <linux/cpumask.h> | ||
81 | #include <linux/module.h> | ||
82 | #include <linux/slab.h> | ||
83 | #include <linux/vmalloc.h> | ||
84 | #include <linux/miscdevice.h> | ||
85 | #include <linux/spinlock.h> | ||
86 | #include <linux/mm.h> | ||
87 | #include <linux/fs.h> | ||
88 | #include <linux/mutex.h> | ||
89 | #include <linux/cpu.h> | ||
90 | #include <linux/firmware.h> | ||
91 | #include <linux/platform_device.h> | ||
92 | |||
93 | #include <asm/msr.h> | ||
94 | #include <asm/uaccess.h> | ||
95 | #include <asm/processor.h> | ||
96 | |||
97 | MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); | ||
98 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); | ||
99 | MODULE_LICENSE("GPL"); | ||
100 | |||
101 | #define MICROCODE_VERSION "1.14a" | ||
102 | |||
103 | #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ | ||
104 | #define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ | ||
105 | #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ | ||
106 | #define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ | ||
107 | #define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ | ||
108 | #define DWSIZE (sizeof (u32)) | ||
109 | #define get_totalsize(mc) \ | ||
110 | (((microcode_t *)mc)->hdr.totalsize ? \ | ||
111 | ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) | ||
112 | #define get_datasize(mc) \ | ||
113 | (((microcode_t *)mc)->hdr.datasize ? \ | ||
114 | ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) | ||
115 | |||
116 | #define sigmatch(s1, s2, p1, p2) \ | ||
117 | (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) | ||
118 | |||
119 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) | ||
120 | |||
121 | /* serialize access to the physical write to MSR 0x79 */ | ||
122 | static DEFINE_SPINLOCK(microcode_update_lock); | ||
123 | |||
124 | /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ | ||
125 | static DEFINE_MUTEX(microcode_mutex); | ||
126 | |||
127 | static struct ucode_cpu_info { | ||
128 | int valid; | ||
129 | unsigned int sig; | ||
130 | unsigned int pf; | ||
131 | unsigned int rev; | ||
132 | microcode_t *mc; | ||
133 | } ucode_cpu_info[NR_CPUS]; | ||
134 | |||
135 | static void collect_cpu_info(int cpu_num) | ||
136 | { | ||
137 | struct cpuinfo_x86 *c = &cpu_data(cpu_num); | ||
138 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
139 | unsigned int val[2]; | ||
140 | |||
141 | /* We should bind the task to the CPU */ | ||
142 | BUG_ON(raw_smp_processor_id() != cpu_num); | ||
143 | uci->pf = uci->rev = 0; | ||
144 | uci->mc = NULL; | ||
145 | uci->valid = 1; | ||
146 | |||
147 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
148 | cpu_has(c, X86_FEATURE_IA64)) { | ||
149 | printk(KERN_ERR "microcode: CPU%d not a capable Intel " | ||
150 | "processor\n", cpu_num); | ||
151 | uci->valid = 0; | ||
152 | return; | ||
153 | } | ||
154 | |||
155 | uci->sig = cpuid_eax(0x00000001); | ||
156 | |||
157 | if ((c->x86_model >= 5) || (c->x86 > 6)) { | ||
158 | /* get processor flags from MSR 0x17 */ | ||
159 | rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | ||
160 | uci->pf = 1 << ((val[1] >> 18) & 7); | ||
161 | } | ||
162 | |||
163 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
164 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
165 | sync_core(); | ||
166 | /* get the current revision from MSR 0x8B */ | ||
167 | rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); | ||
168 | pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", | ||
169 | uci->sig, uci->pf, uci->rev); | ||
170 | } | ||
171 | |||
172 | static inline int microcode_update_match(int cpu_num, | ||
173 | microcode_header_t *mc_header, int sig, int pf) | ||
174 | { | ||
175 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
176 | |||
177 | if (!sigmatch(sig, uci->sig, pf, uci->pf) | ||
178 | || mc_header->rev <= uci->rev) | ||
179 | return 0; | ||
180 | return 1; | ||
181 | } | ||
182 | |||
183 | static int microcode_sanity_check(void *mc) | ||
184 | { | ||
185 | microcode_header_t *mc_header = mc; | ||
186 | struct extended_sigtable *ext_header = NULL; | ||
187 | struct extended_signature *ext_sig; | ||
188 | unsigned long total_size, data_size, ext_table_size; | ||
189 | int sum, orig_sum, ext_sigcount = 0, i; | ||
190 | |||
191 | total_size = get_totalsize(mc_header); | ||
192 | data_size = get_datasize(mc_header); | ||
193 | if (data_size + MC_HEADER_SIZE > total_size) { | ||
194 | printk(KERN_ERR "microcode: error! " | ||
195 | "Bad data size in microcode data file\n"); | ||
196 | return -EINVAL; | ||
197 | } | ||
198 | |||
199 | if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { | ||
200 | printk(KERN_ERR "microcode: error! " | ||
201 | "Unknown microcode update format\n"); | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | ext_table_size = total_size - (MC_HEADER_SIZE + data_size); | ||
205 | if (ext_table_size) { | ||
206 | if ((ext_table_size < EXT_HEADER_SIZE) | ||
207 | || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { | ||
208 | printk(KERN_ERR "microcode: error! " | ||
209 | "Small exttable size in microcode data file\n"); | ||
210 | return -EINVAL; | ||
211 | } | ||
212 | ext_header = mc + MC_HEADER_SIZE + data_size; | ||
213 | if (ext_table_size != exttable_size(ext_header)) { | ||
214 | printk(KERN_ERR "microcode: error! " | ||
215 | "Bad exttable size in microcode data file\n"); | ||
216 | return -EFAULT; | ||
217 | } | ||
218 | ext_sigcount = ext_header->count; | ||
219 | } | ||
220 | |||
221 | /* check extended table checksum */ | ||
222 | if (ext_table_size) { | ||
223 | int ext_table_sum = 0; | ||
224 | int *ext_tablep = (int *)ext_header; | ||
225 | |||
226 | i = ext_table_size / DWSIZE; | ||
227 | while (i--) | ||
228 | ext_table_sum += ext_tablep[i]; | ||
229 | if (ext_table_sum) { | ||
230 | printk(KERN_WARNING "microcode: aborting, " | ||
231 | "bad extended signature table checksum\n"); | ||
232 | return -EINVAL; | ||
233 | } | ||
234 | } | ||
235 | |||
236 | /* calculate the checksum */ | ||
237 | orig_sum = 0; | ||
238 | i = (MC_HEADER_SIZE + data_size) / DWSIZE; | ||
239 | while (i--) | ||
240 | orig_sum += ((int *)mc)[i]; | ||
241 | if (orig_sum) { | ||
242 | printk(KERN_ERR "microcode: aborting, bad checksum\n"); | ||
243 | return -EINVAL; | ||
244 | } | ||
245 | if (!ext_table_size) | ||
246 | return 0; | ||
247 | /* check extended signature checksum */ | ||
248 | for (i = 0; i < ext_sigcount; i++) { | ||
249 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE + | ||
250 | EXT_SIGNATURE_SIZE * i; | ||
251 | sum = orig_sum | ||
252 | - (mc_header->sig + mc_header->pf + mc_header->cksum) | ||
253 | + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); | ||
254 | if (sum) { | ||
255 | printk(KERN_ERR "microcode: aborting, bad checksum\n"); | ||
256 | return -EINVAL; | ||
257 | } | ||
258 | } | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | /* | ||
263 | * return 0 - no update found | ||
264 | * return 1 - found update | ||
265 | * return < 0 - error | ||
266 | */ | ||
267 | static int get_maching_microcode(void *mc, int cpu) | ||
268 | { | ||
269 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
270 | microcode_header_t *mc_header = mc; | ||
271 | struct extended_sigtable *ext_header; | ||
272 | unsigned long total_size = get_totalsize(mc_header); | ||
273 | int ext_sigcount, i; | ||
274 | struct extended_signature *ext_sig; | ||
275 | void *new_mc; | ||
276 | |||
277 | if (microcode_update_match(cpu, mc_header, | ||
278 | mc_header->sig, mc_header->pf)) | ||
279 | goto find; | ||
280 | |||
281 | if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) | ||
282 | return 0; | ||
283 | |||
284 | ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; | ||
285 | ext_sigcount = ext_header->count; | ||
286 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE; | ||
287 | for (i = 0; i < ext_sigcount; i++) { | ||
288 | if (microcode_update_match(cpu, mc_header, | ||
289 | ext_sig->sig, ext_sig->pf)) | ||
290 | goto find; | ||
291 | ext_sig++; | ||
292 | } | ||
293 | return 0; | ||
294 | find: | ||
295 | pr_debug("microcode: CPU%d found a matching microcode update with" | ||
296 | " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); | ||
297 | new_mc = vmalloc(total_size); | ||
298 | if (!new_mc) { | ||
299 | printk(KERN_ERR "microcode: error! Can not allocate memory\n"); | ||
300 | return -ENOMEM; | ||
301 | } | ||
302 | |||
303 | /* free previous update file */ | ||
304 | vfree(uci->mc); | ||
305 | |||
306 | memcpy(new_mc, mc, total_size); | ||
307 | uci->mc = new_mc; | ||
308 | return 1; | ||
309 | } | ||
310 | |||
311 | static void apply_microcode(int cpu) | ||
312 | { | ||
313 | unsigned long flags; | ||
314 | unsigned int val[2]; | ||
315 | int cpu_num = raw_smp_processor_id(); | ||
316 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
317 | |||
318 | /* We should bind the task to the CPU */ | ||
319 | BUG_ON(cpu_num != cpu); | ||
320 | |||
321 | if (uci->mc == NULL) | ||
322 | return; | ||
323 | |||
324 | /* serialize access to the physical write to MSR 0x79 */ | ||
325 | spin_lock_irqsave(&microcode_update_lock, flags); | ||
326 | |||
327 | /* write microcode via MSR 0x79 */ | ||
328 | wrmsr(MSR_IA32_UCODE_WRITE, | ||
329 | (unsigned long) uci->mc->bits, | ||
330 | (unsigned long) uci->mc->bits >> 16 >> 16); | ||
331 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
332 | |||
333 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
334 | sync_core(); | ||
335 | |||
336 | /* get the current revision from MSR 0x8B */ | ||
337 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
338 | |||
339 | spin_unlock_irqrestore(&microcode_update_lock, flags); | ||
340 | if (val[1] != uci->mc->hdr.rev) { | ||
341 | printk(KERN_ERR "microcode: CPU%d update from revision " | ||
342 | "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); | ||
343 | return; | ||
344 | } | ||
345 | printk(KERN_INFO "microcode: CPU%d updated from revision " | ||
346 | "0x%x to 0x%x, date = %08x \n", | ||
347 | cpu_num, uci->rev, val[1], uci->mc->hdr.date); | ||
348 | uci->rev = val[1]; | ||
349 | } | ||
350 | |||
351 | #ifdef CONFIG_MICROCODE_OLD_INTERFACE | ||
352 | static void __user *user_buffer; /* user area microcode data buffer */ | ||
353 | static unsigned int user_buffer_size; /* it's size */ | ||
354 | |||
355 | static long get_next_ucode(void **mc, long offset) | ||
356 | { | ||
357 | microcode_header_t mc_header; | ||
358 | unsigned long total_size; | ||
359 | |||
360 | /* No more data */ | ||
361 | if (offset >= user_buffer_size) | ||
362 | return 0; | ||
363 | if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { | ||
364 | printk(KERN_ERR "microcode: error! Can not read user data\n"); | ||
365 | return -EFAULT; | ||
366 | } | ||
367 | total_size = get_totalsize(&mc_header); | ||
368 | if (offset + total_size > user_buffer_size) { | ||
369 | printk(KERN_ERR "microcode: error! Bad total size in microcode " | ||
370 | "data file\n"); | ||
371 | return -EINVAL; | ||
372 | } | ||
373 | *mc = vmalloc(total_size); | ||
374 | if (!*mc) | ||
375 | return -ENOMEM; | ||
376 | if (copy_from_user(*mc, user_buffer + offset, total_size)) { | ||
377 | printk(KERN_ERR "microcode: error! Can not read user data\n"); | ||
378 | vfree(*mc); | ||
379 | return -EFAULT; | ||
380 | } | ||
381 | return offset + total_size; | ||
382 | } | ||
383 | |||
384 | static int do_microcode_update (void) | ||
385 | { | ||
386 | long cursor = 0; | ||
387 | int error = 0; | ||
388 | void *new_mc = NULL; | ||
389 | int cpu; | ||
390 | cpumask_t old; | ||
391 | |||
392 | old = current->cpus_allowed; | ||
393 | |||
394 | while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { | ||
395 | error = microcode_sanity_check(new_mc); | ||
396 | if (error) | ||
397 | goto out; | ||
398 | /* | ||
399 | * It's possible the data file has multiple matching ucode, | ||
400 | * lets keep searching till the latest version | ||
401 | */ | ||
402 | for_each_online_cpu(cpu) { | ||
403 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
404 | |||
405 | if (!uci->valid) | ||
406 | continue; | ||
407 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
408 | error = get_maching_microcode(new_mc, cpu); | ||
409 | if (error < 0) | ||
410 | goto out; | ||
411 | if (error == 1) | ||
412 | apply_microcode(cpu); | ||
413 | } | ||
414 | vfree(new_mc); | ||
415 | } | ||
416 | out: | ||
417 | if (cursor > 0) | ||
418 | vfree(new_mc); | ||
419 | if (cursor < 0) | ||
420 | error = cursor; | ||
421 | set_cpus_allowed_ptr(current, &old); | ||
422 | return error; | ||
423 | } | ||
424 | |||
425 | static int microcode_open (struct inode *unused1, struct file *unused2) | ||
426 | { | ||
427 | cycle_kernel_lock(); | ||
428 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | ||
429 | } | ||
430 | |||
431 | static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) | ||
432 | { | ||
433 | ssize_t ret; | ||
434 | |||
435 | if ((len >> PAGE_SHIFT) > num_physpages) { | ||
436 | printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | get_online_cpus(); | ||
441 | mutex_lock(&microcode_mutex); | ||
442 | |||
443 | user_buffer = (void __user *) buf; | ||
444 | user_buffer_size = (int) len; | ||
445 | |||
446 | ret = do_microcode_update(); | ||
447 | if (!ret) | ||
448 | ret = (ssize_t)len; | ||
449 | |||
450 | mutex_unlock(&microcode_mutex); | ||
451 | put_online_cpus(); | ||
452 | |||
453 | return ret; | ||
454 | } | ||
455 | |||
456 | static const struct file_operations microcode_fops = { | ||
457 | .owner = THIS_MODULE, | ||
458 | .write = microcode_write, | ||
459 | .open = microcode_open, | ||
460 | }; | ||
461 | |||
462 | static struct miscdevice microcode_dev = { | ||
463 | .minor = MICROCODE_MINOR, | ||
464 | .name = "microcode", | ||
465 | .fops = &microcode_fops, | ||
466 | }; | ||
467 | |||
468 | static int __init microcode_dev_init (void) | ||
469 | { | ||
470 | int error; | ||
471 | |||
472 | error = misc_register(&microcode_dev); | ||
473 | if (error) { | ||
474 | printk(KERN_ERR | ||
475 | "microcode: can't misc_register on minor=%d\n", | ||
476 | MICROCODE_MINOR); | ||
477 | return error; | ||
478 | } | ||
479 | |||
480 | return 0; | ||
481 | } | ||
482 | |||
483 | static void microcode_dev_exit (void) | ||
484 | { | ||
485 | misc_deregister(&microcode_dev); | ||
486 | } | ||
487 | |||
488 | MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | ||
489 | #else | ||
490 | #define microcode_dev_init() 0 | ||
491 | #define microcode_dev_exit() do { } while(0) | ||
492 | #endif | ||
493 | |||
494 | static long get_next_ucode_from_buffer(void **mc, const u8 *buf, | ||
495 | unsigned long size, long offset) | ||
496 | { | ||
497 | microcode_header_t *mc_header; | ||
498 | unsigned long total_size; | ||
499 | |||
500 | /* No more data */ | ||
501 | if (offset >= size) | ||
502 | return 0; | ||
503 | mc_header = (microcode_header_t *)(buf + offset); | ||
504 | total_size = get_totalsize(mc_header); | ||
505 | |||
506 | if (offset + total_size > size) { | ||
507 | printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); | ||
508 | return -EINVAL; | ||
509 | } | ||
510 | |||
511 | *mc = vmalloc(total_size); | ||
512 | if (!*mc) { | ||
513 | printk(KERN_ERR "microcode: error! Can not allocate memory\n"); | ||
514 | return -ENOMEM; | ||
515 | } | ||
516 | memcpy(*mc, buf + offset, total_size); | ||
517 | return offset + total_size; | ||
518 | } | ||
519 | |||
520 | /* fake device for request_firmware */ | ||
521 | static struct platform_device *microcode_pdev; | ||
522 | |||
523 | static int cpu_request_microcode(int cpu) | ||
524 | { | ||
525 | char name[30]; | ||
526 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
527 | const struct firmware *firmware; | ||
528 | const u8 *buf; | ||
529 | unsigned long size; | ||
530 | long offset = 0; | ||
531 | int error; | ||
532 | void *mc; | ||
533 | |||
534 | /* We should bind the task to the CPU */ | ||
535 | BUG_ON(cpu != raw_smp_processor_id()); | ||
536 | sprintf(name,"intel-ucode/%02x-%02x-%02x", | ||
537 | c->x86, c->x86_model, c->x86_mask); | ||
538 | error = request_firmware(&firmware, name, &microcode_pdev->dev); | ||
539 | if (error) { | ||
540 | pr_debug("microcode: data file %s load failed\n", name); | ||
541 | return error; | ||
542 | } | ||
543 | buf = firmware->data; | ||
544 | size = firmware->size; | ||
545 | while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) | ||
546 | > 0) { | ||
547 | error = microcode_sanity_check(mc); | ||
548 | if (error) | ||
549 | break; | ||
550 | error = get_maching_microcode(mc, cpu); | ||
551 | if (error < 0) | ||
552 | break; | ||
553 | /* | ||
554 | * It's possible the data file has multiple matching ucode, | ||
555 | * lets keep searching till the latest version | ||
556 | */ | ||
557 | if (error == 1) { | ||
558 | apply_microcode(cpu); | ||
559 | error = 0; | ||
560 | } | ||
561 | vfree(mc); | ||
562 | } | ||
563 | if (offset > 0) | ||
564 | vfree(mc); | ||
565 | if (offset < 0) | ||
566 | error = offset; | ||
567 | release_firmware(firmware); | ||
568 | |||
569 | return error; | ||
570 | } | ||
571 | |||
572 | static int apply_microcode_check_cpu(int cpu) | ||
573 | { | ||
574 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
575 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
576 | cpumask_t old; | ||
577 | unsigned int val[2]; | ||
578 | int err = 0; | ||
579 | |||
580 | /* Check if the microcode is available */ | ||
581 | if (!uci->mc) | ||
582 | return 0; | ||
583 | |||
584 | old = current->cpus_allowed; | ||
585 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
586 | |||
587 | /* Check if the microcode we have in memory matches the CPU */ | ||
588 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
589 | cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) | ||
590 | err = -EINVAL; | ||
591 | |||
592 | if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { | ||
593 | /* get processor flags from MSR 0x17 */ | ||
594 | rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | ||
595 | if (uci->pf != (1 << ((val[1] >> 18) & 7))) | ||
596 | err = -EINVAL; | ||
597 | } | ||
598 | |||
599 | if (!err) { | ||
600 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
601 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
602 | sync_core(); | ||
603 | /* get the current revision from MSR 0x8B */ | ||
604 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
605 | if (uci->rev != val[1]) | ||
606 | err = -EINVAL; | ||
607 | } | ||
608 | |||
609 | if (!err) | ||
610 | apply_microcode(cpu); | ||
611 | else | ||
612 | printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" | ||
613 | " sig=0x%x, pf=0x%x, rev=0x%x\n", | ||
614 | cpu, uci->sig, uci->pf, uci->rev); | ||
615 | |||
616 | set_cpus_allowed_ptr(current, &old); | ||
617 | return err; | ||
618 | } | ||
619 | |||
620 | static void microcode_init_cpu(int cpu, int resume) | ||
621 | { | ||
622 | cpumask_t old; | ||
623 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
624 | |||
625 | old = current->cpus_allowed; | ||
626 | |||
627 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
628 | mutex_lock(&microcode_mutex); | ||
629 | collect_cpu_info(cpu); | ||
630 | if (uci->valid && system_state == SYSTEM_RUNNING && !resume) | ||
631 | cpu_request_microcode(cpu); | ||
632 | mutex_unlock(&microcode_mutex); | ||
633 | set_cpus_allowed_ptr(current, &old); | ||
634 | } | ||
635 | |||
636 | static void microcode_fini_cpu(int cpu) | ||
637 | { | ||
638 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
639 | |||
640 | mutex_lock(&microcode_mutex); | ||
641 | uci->valid = 0; | ||
642 | vfree(uci->mc); | ||
643 | uci->mc = NULL; | ||
644 | mutex_unlock(&microcode_mutex); | ||
645 | } | ||
646 | |||
647 | static ssize_t reload_store(struct sys_device *dev, | ||
648 | struct sysdev_attribute *attr, | ||
649 | const char *buf, size_t sz) | ||
650 | { | ||
651 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
652 | char *end; | ||
653 | unsigned long val = simple_strtoul(buf, &end, 0); | ||
654 | int err = 0; | ||
655 | int cpu = dev->id; | ||
656 | |||
657 | if (end == buf) | ||
658 | return -EINVAL; | ||
659 | if (val == 1) { | ||
660 | cpumask_t old = current->cpus_allowed; | ||
661 | |||
662 | get_online_cpus(); | ||
663 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
664 | |||
665 | mutex_lock(&microcode_mutex); | ||
666 | if (uci->valid) | ||
667 | err = cpu_request_microcode(cpu); | ||
668 | mutex_unlock(&microcode_mutex); | ||
669 | put_online_cpus(); | ||
670 | set_cpus_allowed_ptr(current, &old); | ||
671 | } | ||
672 | if (err) | ||
673 | return err; | ||
674 | return sz; | ||
675 | } | ||
676 | |||
677 | static ssize_t version_show(struct sys_device *dev, | ||
678 | struct sysdev_attribute *attr, char *buf) | ||
679 | { | ||
680 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
681 | |||
682 | return sprintf(buf, "0x%x\n", uci->rev); | ||
683 | } | ||
684 | |||
685 | static ssize_t pf_show(struct sys_device *dev, | ||
686 | struct sysdev_attribute *attr, char *buf) | ||
687 | { | ||
688 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
689 | |||
690 | return sprintf(buf, "0x%x\n", uci->pf); | ||
691 | } | ||
692 | |||
693 | static SYSDEV_ATTR(reload, 0200, NULL, reload_store); | ||
694 | static SYSDEV_ATTR(version, 0400, version_show, NULL); | ||
695 | static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); | ||
696 | |||
697 | static struct attribute *mc_default_attrs[] = { | ||
698 | &attr_reload.attr, | ||
699 | &attr_version.attr, | ||
700 | &attr_processor_flags.attr, | ||
701 | NULL | ||
702 | }; | ||
703 | |||
704 | static struct attribute_group mc_attr_group = { | ||
705 | .attrs = mc_default_attrs, | ||
706 | .name = "microcode", | ||
707 | }; | ||
708 | |||
709 | static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) | ||
710 | { | ||
711 | int err, cpu = sys_dev->id; | ||
712 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
713 | |||
714 | if (!cpu_online(cpu)) | ||
715 | return 0; | ||
716 | |||
717 | pr_debug("microcode: CPU%d added\n", cpu); | ||
718 | memset(uci, 0, sizeof(*uci)); | ||
719 | |||
720 | err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); | ||
721 | if (err) | ||
722 | return err; | ||
723 | |||
724 | microcode_init_cpu(cpu, resume); | ||
725 | |||
726 | return 0; | ||
727 | } | ||
728 | |||
729 | static int mc_sysdev_add(struct sys_device *sys_dev) | ||
730 | { | ||
731 | return __mc_sysdev_add(sys_dev, 0); | ||
732 | } | ||
733 | |||
734 | static int mc_sysdev_remove(struct sys_device *sys_dev) | ||
735 | { | ||
736 | int cpu = sys_dev->id; | ||
737 | |||
738 | if (!cpu_online(cpu)) | ||
739 | return 0; | ||
740 | |||
741 | pr_debug("microcode: CPU%d removed\n", cpu); | ||
742 | microcode_fini_cpu(cpu); | ||
743 | sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); | ||
744 | return 0; | ||
745 | } | ||
746 | |||
747 | static int mc_sysdev_resume(struct sys_device *dev) | ||
748 | { | ||
749 | int cpu = dev->id; | ||
750 | |||
751 | if (!cpu_online(cpu)) | ||
752 | return 0; | ||
753 | pr_debug("microcode: CPU%d resumed\n", cpu); | ||
754 | /* only CPU 0 will apply ucode here */ | ||
755 | apply_microcode(0); | ||
756 | return 0; | ||
757 | } | ||
758 | |||
759 | static struct sysdev_driver mc_sysdev_driver = { | ||
760 | .add = mc_sysdev_add, | ||
761 | .remove = mc_sysdev_remove, | ||
762 | .resume = mc_sysdev_resume, | ||
763 | }; | ||
764 | |||
765 | static __cpuinit int | ||
766 | mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) | ||
767 | { | ||
768 | unsigned int cpu = (unsigned long)hcpu; | ||
769 | struct sys_device *sys_dev; | ||
770 | |||
771 | sys_dev = get_cpu_sysdev(cpu); | ||
772 | switch (action) { | ||
773 | case CPU_UP_CANCELED_FROZEN: | ||
774 | /* The CPU refused to come up during a system resume */ | ||
775 | microcode_fini_cpu(cpu); | ||
776 | break; | ||
777 | case CPU_ONLINE: | ||
778 | case CPU_DOWN_FAILED: | ||
779 | mc_sysdev_add(sys_dev); | ||
780 | break; | ||
781 | case CPU_ONLINE_FROZEN: | ||
782 | /* System-wide resume is in progress, try to apply microcode */ | ||
783 | if (apply_microcode_check_cpu(cpu)) { | ||
784 | /* The application of microcode failed */ | ||
785 | microcode_fini_cpu(cpu); | ||
786 | __mc_sysdev_add(sys_dev, 1); | ||
787 | break; | ||
788 | } | ||
789 | case CPU_DOWN_FAILED_FROZEN: | ||
790 | if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) | ||
791 | printk(KERN_ERR "microcode: Failed to create the sysfs " | ||
792 | "group for CPU%d\n", cpu); | ||
793 | break; | ||
794 | case CPU_DOWN_PREPARE: | ||
795 | mc_sysdev_remove(sys_dev); | ||
796 | break; | ||
797 | case CPU_DOWN_PREPARE_FROZEN: | ||
798 | /* Suspend is in progress, only remove the interface */ | ||
799 | sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); | ||
800 | break; | ||
801 | } | ||
802 | return NOTIFY_OK; | ||
803 | } | ||
804 | |||
805 | static struct notifier_block __refdata mc_cpu_notifier = { | ||
806 | .notifier_call = mc_cpu_callback, | ||
807 | }; | ||
808 | |||
809 | static int __init microcode_init (void) | ||
810 | { | ||
811 | int error; | ||
812 | |||
813 | printk(KERN_INFO | ||
814 | "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n"); | ||
815 | |||
816 | error = microcode_dev_init(); | ||
817 | if (error) | ||
818 | return error; | ||
819 | microcode_pdev = platform_device_register_simple("microcode", -1, | ||
820 | NULL, 0); | ||
821 | if (IS_ERR(microcode_pdev)) { | ||
822 | microcode_dev_exit(); | ||
823 | return PTR_ERR(microcode_pdev); | ||
824 | } | ||
825 | |||
826 | get_online_cpus(); | ||
827 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); | ||
828 | put_online_cpus(); | ||
829 | if (error) { | ||
830 | microcode_dev_exit(); | ||
831 | platform_device_unregister(microcode_pdev); | ||
832 | return error; | ||
833 | } | ||
834 | |||
835 | register_hotcpu_notifier(&mc_cpu_notifier); | ||
836 | return 0; | ||
837 | } | ||
838 | |||
839 | static void __exit microcode_exit (void) | ||
840 | { | ||
841 | microcode_dev_exit(); | ||
842 | |||
843 | unregister_hotcpu_notifier(&mc_cpu_notifier); | ||
844 | |||
845 | get_online_cpus(); | ||
846 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); | ||
847 | put_online_cpus(); | ||
848 | |||
849 | platform_device_unregister(microcode_pdev); | ||
850 | } | ||
851 | |||
852 | module_init(microcode_init) | ||
853 | module_exit(microcode_exit) | ||
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c new file mode 100644 index 000000000000..7a1f8eeac2c7 --- /dev/null +++ b/arch/x86/kernel/microcode_amd.c | |||
@@ -0,0 +1,435 @@ | |||
1 | /* | ||
2 | * AMD CPU Microcode Update Driver for Linux | ||
3 | * Copyright (C) 2008 Advanced Micro Devices Inc. | ||
4 | * | ||
5 | * Author: Peter Oruba <peter.oruba@amd.com> | ||
6 | * | ||
7 | * Based on work by: | ||
8 | * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> | ||
9 | * | ||
10 | * This driver allows to upgrade microcode on AMD | ||
11 | * family 0x10 and 0x11 processors. | ||
12 | * | ||
13 | * Licensed under the terms of the GNU General Public | ||
14 | * License version 2. See file COPYING for details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/capability.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/cpumask.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/vmalloc.h> | ||
25 | #include <linux/miscdevice.h> | ||
26 | #include <linux/spinlock.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/fs.h> | ||
29 | #include <linux/mutex.h> | ||
30 | #include <linux/cpu.h> | ||
31 | #include <linux/firmware.h> | ||
32 | #include <linux/platform_device.h> | ||
33 | #include <linux/pci.h> | ||
34 | #include <linux/pci_ids.h> | ||
35 | |||
36 | #include <asm/msr.h> | ||
37 | #include <asm/uaccess.h> | ||
38 | #include <asm/processor.h> | ||
39 | #include <asm/microcode.h> | ||
40 | |||
41 | MODULE_DESCRIPTION("AMD Microcode Update Driver"); | ||
42 | MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>"); | ||
43 | MODULE_LICENSE("GPL v2"); | ||
44 | |||
45 | #define UCODE_MAGIC 0x00414d44 | ||
46 | #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 | ||
47 | #define UCODE_UCODE_TYPE 0x00000001 | ||
48 | |||
/*
 * One row of the equivalent-CPU table loaded from the firmware container.
 * get_matching_microcode() compares installed_cpu with CPUID(1).EAX and,
 * on a hit, uses equiv_cpu to match a patch header.  A row whose
 * installed_cpu is 0 ends the table.
 */
struct equiv_cpu_entry {
	unsigned int installed_cpu;
	unsigned int fixed_errata_mask;
	unsigned int fixed_errata_compare;
	unsigned int equiv_cpu;
};
55 | |||
/*
 * Header found at the start of every microcode patch in the container.
 *
 * Fields read by this driver: patch_id (compared with the CPU's current
 * revision), processor_rev_id[] (matched against the equivalent CPU id),
 * nb_dev_id/nb_rev_id and sb_dev_id/sb_rev_id (optional bridge checks),
 * and mc_patch_data_len (used by get_totalsize()).  The address of
 * data_code is what gets written to the patch loader MSR.
 */
struct microcode_header_amd {
	unsigned int  data_code;
	unsigned int  patch_id;
	unsigned char mc_patch_data_id[2];
	unsigned char mc_patch_data_len;
	unsigned char init_flag;
	unsigned int  mc_patch_data_checksum;
	unsigned int  nb_dev_id;
	unsigned int  sb_dev_id;
	unsigned char processor_rev_id[2];
	unsigned char nb_rev_id;
	unsigned char sb_rev_id;
	unsigned char bios_api_rev;
	unsigned char reserved1[3];
	unsigned int  match_reg[8];
};
72 | |||
73 | struct microcode_amd { | ||
74 | struct microcode_header_amd hdr; | ||
75 | unsigned int mpb[0]; | ||
76 | }; | ||
77 | |||
78 | #define UCODE_MAX_SIZE (2048) | ||
79 | #define DEFAULT_UCODE_DATASIZE (896) | ||
80 | #define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) | ||
81 | #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) | ||
82 | #define DWSIZE (sizeof(u32)) | ||
83 | /* For now we support a fixed ucode total size only */ | ||
84 | #define get_totalsize(mc) \ | ||
85 | ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ | ||
86 | + MC_HEADER_SIZE) | ||
87 | |||
88 | /* serialize access to the physical write */ | ||
89 | static DEFINE_SPINLOCK(microcode_update_lock); | ||
90 | |||
91 | static struct equiv_cpu_entry *equiv_cpu_table; | ||
92 | |||
93 | static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) | ||
94 | { | ||
95 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
96 | |||
97 | memset(csig, 0, sizeof(*csig)); | ||
98 | |||
99 | if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { | ||
100 | printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", | ||
101 | cpu); | ||
102 | return -1; | ||
103 | } | ||
104 | |||
105 | asm volatile("movl %1, %%ecx; rdmsr" | ||
106 | : "=a" (csig->rev) | ||
107 | : "i" (0x0000008B) : "ecx"); | ||
108 | |||
109 | printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", | ||
110 | csig->rev); | ||
111 | |||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | static int get_matching_microcode(int cpu, void *mc, int rev) | ||
116 | { | ||
117 | struct microcode_header_amd *mc_header = mc; | ||
118 | struct pci_dev *nb_pci_dev, *sb_pci_dev; | ||
119 | unsigned int current_cpu_id; | ||
120 | unsigned int equiv_cpu_id = 0x00; | ||
121 | unsigned int i = 0; | ||
122 | |||
123 | BUG_ON(equiv_cpu_table == NULL); | ||
124 | current_cpu_id = cpuid_eax(0x00000001); | ||
125 | |||
126 | while (equiv_cpu_table[i].installed_cpu != 0) { | ||
127 | if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { | ||
128 | equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; | ||
129 | break; | ||
130 | } | ||
131 | i++; | ||
132 | } | ||
133 | |||
134 | if (!equiv_cpu_id) { | ||
135 | printk(KERN_ERR "microcode: CPU%d cpu_id " | ||
136 | "not found in equivalent cpu table \n", cpu); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { | ||
141 | printk(KERN_ERR | ||
142 | "microcode: CPU%d patch does not match " | ||
143 | "(patch is %x, cpu extended is %x) \n", | ||
144 | cpu, mc_header->processor_rev_id[0], | ||
145 | (equiv_cpu_id & 0xff)); | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { | ||
150 | printk(KERN_ERR "microcode: CPU%d patch does not match " | ||
151 | "(patch is %x, cpu base id is %x) \n", | ||
152 | cpu, mc_header->processor_rev_id[1], | ||
153 | ((equiv_cpu_id >> 16) & 0xff)); | ||
154 | |||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | /* ucode may be northbridge specific */ | ||
159 | if (mc_header->nb_dev_id) { | ||
160 | nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, | ||
161 | (mc_header->nb_dev_id & 0xff), | ||
162 | NULL); | ||
163 | if ((!nb_pci_dev) || | ||
164 | (mc_header->nb_rev_id != nb_pci_dev->revision)) { | ||
165 | printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); | ||
166 | pci_dev_put(nb_pci_dev); | ||
167 | return 0; | ||
168 | } | ||
169 | pci_dev_put(nb_pci_dev); | ||
170 | } | ||
171 | |||
172 | /* ucode may be southbridge specific */ | ||
173 | if (mc_header->sb_dev_id) { | ||
174 | sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, | ||
175 | (mc_header->sb_dev_id & 0xff), | ||
176 | NULL); | ||
177 | if ((!sb_pci_dev) || | ||
178 | (mc_header->sb_rev_id != sb_pci_dev->revision)) { | ||
179 | printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); | ||
180 | pci_dev_put(sb_pci_dev); | ||
181 | return 0; | ||
182 | } | ||
183 | pci_dev_put(sb_pci_dev); | ||
184 | } | ||
185 | |||
186 | if (mc_header->patch_id <= rev) | ||
187 | return 0; | ||
188 | |||
189 | return 1; | ||
190 | } | ||
191 | |||
192 | static void apply_microcode_amd(int cpu) | ||
193 | { | ||
194 | unsigned long flags; | ||
195 | unsigned int eax, edx; | ||
196 | unsigned int rev; | ||
197 | int cpu_num = raw_smp_processor_id(); | ||
198 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; | ||
199 | struct microcode_amd *mc_amd = uci->mc; | ||
200 | unsigned long addr; | ||
201 | |||
202 | /* We should bind the task to the CPU */ | ||
203 | BUG_ON(cpu_num != cpu); | ||
204 | |||
205 | if (mc_amd == NULL) | ||
206 | return; | ||
207 | |||
208 | spin_lock_irqsave(µcode_update_lock, flags); | ||
209 | |||
210 | addr = (unsigned long)&mc_amd->hdr.data_code; | ||
211 | edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); | ||
212 | eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); | ||
213 | |||
214 | asm volatile("movl %0, %%ecx; wrmsr" : | ||
215 | : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); | ||
216 | |||
217 | /* get patch id after patching */ | ||
218 | asm volatile("movl %1, %%ecx; rdmsr" | ||
219 | : "=a" (rev) | ||
220 | : "i" (0x0000008B) : "ecx"); | ||
221 | |||
222 | spin_unlock_irqrestore(µcode_update_lock, flags); | ||
223 | |||
224 | /* check current patch id and patch's id for match */ | ||
225 | if (rev != mc_amd->hdr.patch_id) { | ||
226 | printk(KERN_ERR "microcode: CPU%d update from revision " | ||
227 | "0x%x to 0x%x failed\n", cpu_num, | ||
228 | mc_amd->hdr.patch_id, rev); | ||
229 | return; | ||
230 | } | ||
231 | |||
232 | printk(KERN_INFO "microcode: CPU%d updated from revision " | ||
233 | "0x%x to 0x%x \n", | ||
234 | cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); | ||
235 | |||
236 | uci->cpu_sig.rev = rev; | ||
237 | } | ||
238 | |||
239 | static void * get_next_ucode(u8 *buf, unsigned int size, | ||
240 | int (*get_ucode_data)(void *, const void *, size_t), | ||
241 | unsigned int *mc_size) | ||
242 | { | ||
243 | unsigned int total_size; | ||
244 | #define UCODE_CONTAINER_SECTION_HDR 8 | ||
245 | u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; | ||
246 | void *mc; | ||
247 | |||
248 | if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) | ||
249 | return NULL; | ||
250 | |||
251 | if (section_hdr[0] != UCODE_UCODE_TYPE) { | ||
252 | printk(KERN_ERR "microcode: error! " | ||
253 | "Wrong microcode payload type field\n"); | ||
254 | return NULL; | ||
255 | } | ||
256 | |||
257 | total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); | ||
258 | |||
259 | printk(KERN_INFO "microcode: size %u, total_size %u\n", | ||
260 | size, total_size); | ||
261 | |||
262 | if (total_size > size || total_size > UCODE_MAX_SIZE) { | ||
263 | printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); | ||
264 | return NULL; | ||
265 | } | ||
266 | |||
267 | mc = vmalloc(UCODE_MAX_SIZE); | ||
268 | if (mc) { | ||
269 | memset(mc, 0, UCODE_MAX_SIZE); | ||
270 | if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { | ||
271 | vfree(mc); | ||
272 | mc = NULL; | ||
273 | } else | ||
274 | *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; | ||
275 | } | ||
276 | #undef UCODE_CONTAINER_SECTION_HDR | ||
277 | return mc; | ||
278 | } | ||
279 | |||
280 | |||
281 | static int install_equiv_cpu_table(u8 *buf, | ||
282 | int (*get_ucode_data)(void *, const void *, size_t)) | ||
283 | { | ||
284 | #define UCODE_CONTAINER_HEADER_SIZE 12 | ||
285 | u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; | ||
286 | unsigned int *buf_pos = (unsigned int *)container_hdr; | ||
287 | unsigned long size; | ||
288 | |||
289 | if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) | ||
290 | return 0; | ||
291 | |||
292 | size = buf_pos[2]; | ||
293 | |||
294 | if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { | ||
295 | printk(KERN_ERR "microcode: error! " | ||
296 | "Wrong microcode equivalnet cpu table\n"); | ||
297 | return 0; | ||
298 | } | ||
299 | |||
300 | equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); | ||
301 | if (!equiv_cpu_table) { | ||
302 | printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); | ||
303 | return 0; | ||
304 | } | ||
305 | |||
306 | buf += UCODE_CONTAINER_HEADER_SIZE; | ||
307 | if (get_ucode_data(equiv_cpu_table, buf, size)) { | ||
308 | vfree(equiv_cpu_table); | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ | ||
313 | #undef UCODE_CONTAINER_HEADER_SIZE | ||
314 | } | ||
315 | |||
316 | static void free_equiv_cpu_table(void) | ||
317 | { | ||
318 | if (equiv_cpu_table) { | ||
319 | vfree(equiv_cpu_table); | ||
320 | equiv_cpu_table = NULL; | ||
321 | } | ||
322 | } | ||
323 | |||
324 | static int generic_load_microcode(int cpu, void *data, size_t size, | ||
325 | int (*get_ucode_data)(void *, const void *, size_t)) | ||
326 | { | ||
327 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
328 | u8 *ucode_ptr = data, *new_mc = NULL, *mc; | ||
329 | int new_rev = uci->cpu_sig.rev; | ||
330 | unsigned int leftover; | ||
331 | unsigned long offset; | ||
332 | |||
333 | offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); | ||
334 | if (!offset) { | ||
335 | printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); | ||
336 | return -EINVAL; | ||
337 | } | ||
338 | |||
339 | ucode_ptr += offset; | ||
340 | leftover = size - offset; | ||
341 | |||
342 | while (leftover) { | ||
343 | unsigned int uninitialized_var(mc_size); | ||
344 | struct microcode_header_amd *mc_header; | ||
345 | |||
346 | mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); | ||
347 | if (!mc) | ||
348 | break; | ||
349 | |||
350 | mc_header = (struct microcode_header_amd *)mc; | ||
351 | if (get_matching_microcode(cpu, mc, new_rev)) { | ||
352 | if (new_mc) | ||
353 | vfree(new_mc); | ||
354 | new_rev = mc_header->patch_id; | ||
355 | new_mc = mc; | ||
356 | } else | ||
357 | vfree(mc); | ||
358 | |||
359 | ucode_ptr += mc_size; | ||
360 | leftover -= mc_size; | ||
361 | } | ||
362 | |||
363 | if (new_mc) { | ||
364 | if (!leftover) { | ||
365 | if (uci->mc) | ||
366 | vfree(uci->mc); | ||
367 | uci->mc = new_mc; | ||
368 | pr_debug("microcode: CPU%d found a matching microcode update with" | ||
369 | " version 0x%x (current=0x%x)\n", | ||
370 | cpu, new_rev, uci->cpu_sig.rev); | ||
371 | } else | ||
372 | vfree(new_mc); | ||
373 | } | ||
374 | |||
375 | free_equiv_cpu_table(); | ||
376 | |||
377 | return (int)leftover; | ||
378 | } | ||
379 | |||
/*
 * Copy routine for firmware already in kernel memory: a plain memcpy()
 * of @n bytes from @from to @to.  Always returns 0.
 */
static int get_ucode_fw(void *to, const void *from, size_t n)
{
	(void)memcpy(to, from, n);

	return 0;
}
385 | |||
386 | static int request_microcode_fw(int cpu, struct device *device) | ||
387 | { | ||
388 | const char *fw_name = "amd-ucode/microcode_amd.bin"; | ||
389 | const struct firmware *firmware; | ||
390 | int ret; | ||
391 | |||
392 | /* We should bind the task to the CPU */ | ||
393 | BUG_ON(cpu != raw_smp_processor_id()); | ||
394 | |||
395 | ret = request_firmware(&firmware, fw_name, device); | ||
396 | if (ret) { | ||
397 | printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); | ||
398 | return ret; | ||
399 | } | ||
400 | |||
401 | ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, | ||
402 | &get_ucode_fw); | ||
403 | |||
404 | release_firmware(firmware); | ||
405 | |||
406 | return ret; | ||
407 | } | ||
408 | |||
409 | static int request_microcode_user(int cpu, const void __user *buf, size_t size) | ||
410 | { | ||
411 | printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" | ||
412 | "is not supported\n"); | ||
413 | return -1; | ||
414 | } | ||
415 | |||
416 | static void microcode_fini_cpu_amd(int cpu) | ||
417 | { | ||
418 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
419 | |||
420 | vfree(uci->mc); | ||
421 | uci->mc = NULL; | ||
422 | } | ||
423 | |||
424 | static struct microcode_ops microcode_amd_ops = { | ||
425 | .request_microcode_user = request_microcode_user, | ||
426 | .request_microcode_fw = request_microcode_fw, | ||
427 | .collect_cpu_info = collect_cpu_info_amd, | ||
428 | .apply_microcode = apply_microcode_amd, | ||
429 | .microcode_fini_cpu = microcode_fini_cpu_amd, | ||
430 | }; | ||
431 | |||
432 | struct microcode_ops * __init init_amd_microcode(void) | ||
433 | { | ||
434 | return µcode_amd_ops; | ||
435 | } | ||
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c new file mode 100644 index 000000000000..936d8d55f230 --- /dev/null +++ b/arch/x86/kernel/microcode_core.c | |||
@@ -0,0 +1,508 @@ | |||
1 | /* | ||
2 | * Intel CPU Microcode Update Driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> | ||
5 | * 2006 Shaohua Li <shaohua.li@intel.com> | ||
6 | * | ||
7 | * This driver allows to upgrade microcode on Intel processors | ||
8 | * belonging to IA-32 family - PentiumPro, Pentium II, | ||
9 | * Pentium III, Xeon, Pentium 4, etc. | ||
10 | * | ||
11 | * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture | ||
12 | * Software Developer's Manual | ||
13 | * Order Number 253668 or free download from: | ||
14 | * | ||
15 | * http://developer.intel.com/design/pentium4/manuals/253668.htm | ||
16 | * | ||
17 | * For more information, go to http://www.urbanmyth.org/microcode | ||
18 | * | ||
19 | * This program is free software; you can redistribute it and/or | ||
20 | * modify it under the terms of the GNU General Public License | ||
21 | * as published by the Free Software Foundation; either version | ||
22 | * 2 of the License, or (at your option) any later version. | ||
23 | * | ||
24 | * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
25 | * Initial release. | ||
26 | * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
27 | * Added read() support + cleanups. | ||
28 | * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
29 | * Added 'device trimming' support. open(O_WRONLY) zeroes | ||
30 | * and frees the saved copy of applied microcode. | ||
31 | * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
32 | * Made to use devfs (/dev/cpu/microcode) + cleanups. | ||
33 | * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
34 | * Added misc device support (now uses both devfs and misc). | ||
35 | * Added MICROCODE_IOCFREE ioctl to clear memory. | ||
36 | * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
37 | * Messages for error cases (non Intel & no suitable microcode). | ||
38 | * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> | ||
39 | * Removed ->release(). Removed exclusive open and status bitmap. | ||
40 | * Added microcode_rwsem to serialize read()/write()/ioctl(). | ||
41 | * Removed global kernel lock usage. | ||
42 | * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> | ||
43 | * Write 0 to 0x8B msr and then cpuid before reading revision, | ||
44 | * so that it works even if there were no update done by the | ||
45 | * BIOS. Otherwise, reading from 0x8B gives junk (which happened | ||
46 | * to be 0 on my machine which is why it worked even when I | ||
47 | * disabled update by the BIOS) | ||
48 | * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. | ||
49 | * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and | ||
50 | * Tigran Aivazian <tigran@veritas.com> | ||
51 | * Intel Pentium 4 processor support and bugfixes. | ||
52 | * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> | ||
53 | * Bugfix for HT (Hyper-Threading) enabled processors | ||
54 | * whereby processor resources are shared by all logical processors | ||
55 | * in a single CPU package. | ||
56 | * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and | ||
57 | * Tigran Aivazian <tigran@veritas.com>, | ||
58 | * Serialize updates as required on HT processors due to | ||
59 | * speculative nature of implementation. | ||
60 | * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> | ||
61 | * Fix the panic when writing zero-length microcode chunk. | ||
62 | * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, | ||
63 | * Jun Nakajima <jun.nakajima@intel.com> | ||
64 | * Support for the microcode updates in the new format. | ||
65 | * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> | ||
66 | * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl | ||
67 | * because we no longer hold a copy of applied microcode | ||
68 | * in kernel memory. | ||
69 | * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> | ||
70 | * Fix sigmatch() macro to handle old CPUs with pf == 0. | ||
71 | * Thanks to Stuart Swales for pointing out this bug. | ||
72 | */ | ||
73 | #include <linux/capability.h> | ||
74 | #include <linux/kernel.h> | ||
75 | #include <linux/init.h> | ||
76 | #include <linux/sched.h> | ||
77 | #include <linux/smp_lock.h> | ||
78 | #include <linux/cpumask.h> | ||
79 | #include <linux/module.h> | ||
80 | #include <linux/slab.h> | ||
81 | #include <linux/vmalloc.h> | ||
82 | #include <linux/miscdevice.h> | ||
83 | #include <linux/spinlock.h> | ||
84 | #include <linux/mm.h> | ||
85 | #include <linux/fs.h> | ||
86 | #include <linux/mutex.h> | ||
87 | #include <linux/cpu.h> | ||
88 | #include <linux/firmware.h> | ||
89 | #include <linux/platform_device.h> | ||
90 | |||
91 | #include <asm/msr.h> | ||
92 | #include <asm/uaccess.h> | ||
93 | #include <asm/processor.h> | ||
94 | #include <asm/microcode.h> | ||
95 | |||
96 | MODULE_DESCRIPTION("Microcode Update Driver"); | ||
97 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); | ||
98 | MODULE_LICENSE("GPL"); | ||
99 | |||
100 | #define MICROCODE_VERSION "2.00" | ||
101 | |||
102 | struct microcode_ops *microcode_ops; | ||
103 | |||
104 | /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ | ||
105 | static DEFINE_MUTEX(microcode_mutex); | ||
106 | |||
107 | struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; | ||
108 | EXPORT_SYMBOL_GPL(ucode_cpu_info); | ||
109 | |||
110 | #ifdef CONFIG_MICROCODE_OLD_INTERFACE | ||
111 | static int do_microcode_update(const void __user *buf, size_t size) | ||
112 | { | ||
113 | cpumask_t old; | ||
114 | int error = 0; | ||
115 | int cpu; | ||
116 | |||
117 | old = current->cpus_allowed; | ||
118 | |||
119 | for_each_online_cpu(cpu) { | ||
120 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
121 | |||
122 | if (!uci->valid) | ||
123 | continue; | ||
124 | |||
125 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
126 | error = microcode_ops->request_microcode_user(cpu, buf, size); | ||
127 | if (error < 0) | ||
128 | goto out; | ||
129 | if (!error) | ||
130 | microcode_ops->apply_microcode(cpu); | ||
131 | } | ||
132 | out: | ||
133 | set_cpus_allowed_ptr(current, &old); | ||
134 | return error; | ||
135 | } | ||
136 | |||
137 | static int microcode_open(struct inode *unused1, struct file *unused2) | ||
138 | { | ||
139 | cycle_kernel_lock(); | ||
140 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | ||
141 | } | ||
142 | |||
143 | static ssize_t microcode_write(struct file *file, const char __user *buf, | ||
144 | size_t len, loff_t *ppos) | ||
145 | { | ||
146 | ssize_t ret; | ||
147 | |||
148 | if ((len >> PAGE_SHIFT) > num_physpages) { | ||
149 | printk(KERN_ERR "microcode: too much data (max %ld pages)\n", | ||
150 | num_physpages); | ||
151 | return -EINVAL; | ||
152 | } | ||
153 | |||
154 | get_online_cpus(); | ||
155 | mutex_lock(µcode_mutex); | ||
156 | |||
157 | ret = do_microcode_update(buf, len); | ||
158 | if (!ret) | ||
159 | ret = (ssize_t)len; | ||
160 | |||
161 | mutex_unlock(µcode_mutex); | ||
162 | put_online_cpus(); | ||
163 | |||
164 | return ret; | ||
165 | } | ||
166 | |||
167 | static const struct file_operations microcode_fops = { | ||
168 | .owner = THIS_MODULE, | ||
169 | .write = microcode_write, | ||
170 | .open = microcode_open, | ||
171 | }; | ||
172 | |||
173 | static struct miscdevice microcode_dev = { | ||
174 | .minor = MICROCODE_MINOR, | ||
175 | .name = "microcode", | ||
176 | .fops = µcode_fops, | ||
177 | }; | ||
178 | |||
179 | static int __init microcode_dev_init(void) | ||
180 | { | ||
181 | int error; | ||
182 | |||
183 | error = misc_register(µcode_dev); | ||
184 | if (error) { | ||
185 | printk(KERN_ERR | ||
186 | "microcode: can't misc_register on minor=%d\n", | ||
187 | MICROCODE_MINOR); | ||
188 | return error; | ||
189 | } | ||
190 | |||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | static void microcode_dev_exit(void) | ||
195 | { | ||
196 | misc_deregister(µcode_dev); | ||
197 | } | ||
198 | |||
199 | MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); | ||
200 | #else | ||
201 | #define microcode_dev_init() 0 | ||
202 | #define microcode_dev_exit() do { } while (0) | ||
203 | #endif | ||
204 | |||
205 | /* fake device for request_firmware */ | ||
206 | struct platform_device *microcode_pdev; | ||
207 | |||
208 | static ssize_t reload_store(struct sys_device *dev, | ||
209 | struct sysdev_attribute *attr, | ||
210 | const char *buf, size_t sz) | ||
211 | { | ||
212 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
213 | char *end; | ||
214 | unsigned long val = simple_strtoul(buf, &end, 0); | ||
215 | int err = 0; | ||
216 | int cpu = dev->id; | ||
217 | |||
218 | if (end == buf) | ||
219 | return -EINVAL; | ||
220 | if (val == 1) { | ||
221 | cpumask_t old = current->cpus_allowed; | ||
222 | |||
223 | get_online_cpus(); | ||
224 | if (cpu_online(cpu)) { | ||
225 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
226 | mutex_lock(µcode_mutex); | ||
227 | if (uci->valid) { | ||
228 | err = microcode_ops->request_microcode_fw(cpu, | ||
229 | µcode_pdev->dev); | ||
230 | if (!err) | ||
231 | microcode_ops->apply_microcode(cpu); | ||
232 | } | ||
233 | mutex_unlock(µcode_mutex); | ||
234 | set_cpus_allowed_ptr(current, &old); | ||
235 | } | ||
236 | put_online_cpus(); | ||
237 | } | ||
238 | if (err) | ||
239 | return err; | ||
240 | return sz; | ||
241 | } | ||
242 | |||
243 | static ssize_t version_show(struct sys_device *dev, | ||
244 | struct sysdev_attribute *attr, char *buf) | ||
245 | { | ||
246 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
247 | |||
248 | return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); | ||
249 | } | ||
250 | |||
251 | static ssize_t pf_show(struct sys_device *dev, | ||
252 | struct sysdev_attribute *attr, char *buf) | ||
253 | { | ||
254 | struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; | ||
255 | |||
256 | return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); | ||
257 | } | ||
258 | |||
259 | static SYSDEV_ATTR(reload, 0200, NULL, reload_store); | ||
260 | static SYSDEV_ATTR(version, 0400, version_show, NULL); | ||
261 | static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); | ||
262 | |||
263 | static struct attribute *mc_default_attrs[] = { | ||
264 | &attr_reload.attr, | ||
265 | &attr_version.attr, | ||
266 | &attr_processor_flags.attr, | ||
267 | NULL | ||
268 | }; | ||
269 | |||
270 | static struct attribute_group mc_attr_group = { | ||
271 | .attrs = mc_default_attrs, | ||
272 | .name = "microcode", | ||
273 | }; | ||
274 | |||
275 | static void microcode_fini_cpu(int cpu) | ||
276 | { | ||
277 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
278 | |||
279 | mutex_lock(µcode_mutex); | ||
280 | microcode_ops->microcode_fini_cpu(cpu); | ||
281 | uci->valid = 0; | ||
282 | mutex_unlock(µcode_mutex); | ||
283 | } | ||
284 | |||
285 | static void collect_cpu_info(int cpu) | ||
286 | { | ||
287 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
288 | |||
289 | memset(uci, 0, sizeof(*uci)); | ||
290 | if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) | ||
291 | uci->valid = 1; | ||
292 | } | ||
293 | |||
294 | static int microcode_resume_cpu(int cpu) | ||
295 | { | ||
296 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
297 | struct cpu_signature nsig; | ||
298 | |||
299 | pr_debug("microcode: CPU%d resumed\n", cpu); | ||
300 | |||
301 | if (!uci->mc) | ||
302 | return 1; | ||
303 | |||
304 | /* | ||
305 | * Let's verify that the 'cached' ucode does belong | ||
306 | * to this cpu (a bit of paranoia): | ||
307 | */ | ||
308 | if (microcode_ops->collect_cpu_info(cpu, &nsig)) { | ||
309 | microcode_fini_cpu(cpu); | ||
310 | return -1; | ||
311 | } | ||
312 | |||
313 | if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { | ||
314 | microcode_fini_cpu(cpu); | ||
315 | /* Should we look for a new ucode here? */ | ||
316 | return 1; | ||
317 | } | ||
318 | |||
319 | return 0; | ||
320 | } | ||
321 | |||
322 | void microcode_update_cpu(int cpu) | ||
323 | { | ||
324 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
325 | int err = 0; | ||
326 | |||
327 | /* | ||
328 | * Check if the system resume is in progress (uci->valid != NULL), | ||
329 | * otherwise just request a firmware: | ||
330 | */ | ||
331 | if (uci->valid) { | ||
332 | err = microcode_resume_cpu(cpu); | ||
333 | } else { | ||
334 | collect_cpu_info(cpu); | ||
335 | if (uci->valid && system_state == SYSTEM_RUNNING) | ||
336 | err = microcode_ops->request_microcode_fw(cpu, | ||
337 | µcode_pdev->dev); | ||
338 | } | ||
339 | if (!err) | ||
340 | microcode_ops->apply_microcode(cpu); | ||
341 | } | ||
342 | |||
343 | static void microcode_init_cpu(int cpu) | ||
344 | { | ||
345 | cpumask_t old = current->cpus_allowed; | ||
346 | |||
347 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); | ||
348 | /* We should bind the task to the CPU */ | ||
349 | BUG_ON(raw_smp_processor_id() != cpu); | ||
350 | |||
351 | mutex_lock(µcode_mutex); | ||
352 | microcode_update_cpu(cpu); | ||
353 | mutex_unlock(µcode_mutex); | ||
354 | |||
355 | set_cpus_allowed_ptr(current, &old); | ||
356 | } | ||
357 | |||
358 | static int mc_sysdev_add(struct sys_device *sys_dev) | ||
359 | { | ||
360 | int err, cpu = sys_dev->id; | ||
361 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
362 | |||
363 | if (!cpu_online(cpu)) | ||
364 | return 0; | ||
365 | |||
366 | pr_debug("microcode: CPU%d added\n", cpu); | ||
367 | memset(uci, 0, sizeof(*uci)); | ||
368 | |||
369 | err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); | ||
370 | if (err) | ||
371 | return err; | ||
372 | |||
373 | microcode_init_cpu(cpu); | ||
374 | return 0; | ||
375 | } | ||
376 | |||
377 | static int mc_sysdev_remove(struct sys_device *sys_dev) | ||
378 | { | ||
379 | int cpu = sys_dev->id; | ||
380 | |||
381 | if (!cpu_online(cpu)) | ||
382 | return 0; | ||
383 | |||
384 | pr_debug("microcode: CPU%d removed\n", cpu); | ||
385 | microcode_fini_cpu(cpu); | ||
386 | sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); | ||
387 | return 0; | ||
388 | } | ||
389 | |||
390 | static int mc_sysdev_resume(struct sys_device *dev) | ||
391 | { | ||
392 | int cpu = dev->id; | ||
393 | |||
394 | if (!cpu_online(cpu)) | ||
395 | return 0; | ||
396 | |||
397 | /* only CPU 0 will apply ucode here */ | ||
398 | microcode_update_cpu(0); | ||
399 | return 0; | ||
400 | } | ||
401 | |||
402 | static struct sysdev_driver mc_sysdev_driver = { | ||
403 | .add = mc_sysdev_add, | ||
404 | .remove = mc_sysdev_remove, | ||
405 | .resume = mc_sysdev_resume, | ||
406 | }; | ||
407 | |||
408 | static __cpuinit int | ||
409 | mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) | ||
410 | { | ||
411 | unsigned int cpu = (unsigned long)hcpu; | ||
412 | struct sys_device *sys_dev; | ||
413 | |||
414 | sys_dev = get_cpu_sysdev(cpu); | ||
415 | switch (action) { | ||
416 | case CPU_ONLINE: | ||
417 | case CPU_ONLINE_FROZEN: | ||
418 | microcode_init_cpu(cpu); | ||
419 | case CPU_DOWN_FAILED: | ||
420 | case CPU_DOWN_FAILED_FROZEN: | ||
421 | pr_debug("microcode: CPU%d added\n", cpu); | ||
422 | if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) | ||
423 | printk(KERN_ERR "microcode: Failed to create the sysfs " | ||
424 | "group for CPU%d\n", cpu); | ||
425 | break; | ||
426 | case CPU_DOWN_PREPARE: | ||
427 | case CPU_DOWN_PREPARE_FROZEN: | ||
428 | /* Suspend is in progress, only remove the interface */ | ||
429 | sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); | ||
430 | pr_debug("microcode: CPU%d removed\n", cpu); | ||
431 | break; | ||
432 | case CPU_DEAD: | ||
433 | case CPU_UP_CANCELED_FROZEN: | ||
434 | /* The CPU refused to come up during a system resume */ | ||
435 | microcode_fini_cpu(cpu); | ||
436 | break; | ||
437 | } | ||
438 | return NOTIFY_OK; | ||
439 | } | ||
440 | |||
441 | static struct notifier_block __refdata mc_cpu_notifier = { | ||
442 | .notifier_call = mc_cpu_callback, | ||
443 | }; | ||
444 | |||
445 | static int __init microcode_init(void) | ||
446 | { | ||
447 | struct cpuinfo_x86 *c = &cpu_data(0); | ||
448 | int error; | ||
449 | |||
450 | if (c->x86_vendor == X86_VENDOR_INTEL) | ||
451 | microcode_ops = init_intel_microcode(); | ||
452 | else if (c->x86_vendor == X86_VENDOR_AMD) | ||
453 | microcode_ops = init_amd_microcode(); | ||
454 | |||
455 | if (!microcode_ops) { | ||
456 | printk(KERN_ERR "microcode: no support for this CPU vendor\n"); | ||
457 | return -ENODEV; | ||
458 | } | ||
459 | |||
460 | error = microcode_dev_init(); | ||
461 | if (error) | ||
462 | return error; | ||
463 | microcode_pdev = platform_device_register_simple("microcode", -1, | ||
464 | NULL, 0); | ||
465 | if (IS_ERR(microcode_pdev)) { | ||
466 | microcode_dev_exit(); | ||
467 | return PTR_ERR(microcode_pdev); | ||
468 | } | ||
469 | |||
470 | get_online_cpus(); | ||
471 | error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); | ||
472 | put_online_cpus(); | ||
473 | if (error) { | ||
474 | microcode_dev_exit(); | ||
475 | platform_device_unregister(microcode_pdev); | ||
476 | return error; | ||
477 | } | ||
478 | |||
479 | register_hotcpu_notifier(&mc_cpu_notifier); | ||
480 | |||
481 | printk(KERN_INFO | ||
482 | "Microcode Update Driver: v" MICROCODE_VERSION | ||
483 | " <tigran@aivazian.fsnet.co.uk>" | ||
484 | " <peter.oruba@amd.com>\n"); | ||
485 | |||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | static void __exit microcode_exit(void) | ||
490 | { | ||
491 | microcode_dev_exit(); | ||
492 | |||
493 | unregister_hotcpu_notifier(&mc_cpu_notifier); | ||
494 | |||
495 | get_online_cpus(); | ||
496 | sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); | ||
497 | put_online_cpus(); | ||
498 | |||
499 | platform_device_unregister(microcode_pdev); | ||
500 | |||
501 | microcode_ops = NULL; | ||
502 | |||
503 | printk(KERN_INFO | ||
504 | "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); | ||
505 | } | ||
506 | |||
507 | module_init(microcode_init); | ||
508 | module_exit(microcode_exit); | ||
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c new file mode 100644 index 000000000000..622dc4a21784 --- /dev/null +++ b/arch/x86/kernel/microcode_intel.c | |||
@@ -0,0 +1,480 @@ | |||
1 | /* | ||
2 | * Intel CPU Microcode Update Driver for Linux | ||
3 | * | ||
4 | * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> | ||
5 | * 2006 Shaohua Li <shaohua.li@intel.com> | ||
6 | * | ||
7 | * This driver allows upgrading microcode on Intel processors | ||
8 | * belonging to IA-32 family - PentiumPro, Pentium II, | ||
9 | * Pentium III, Xeon, Pentium 4, etc. | ||
10 | * | ||
11 | * Reference: Section 8.11 of Volume 3a, IA-32 Intel(R) Architecture | ||
12 | * Software Developer's Manual | ||
13 | * Order Number 253668 or free download from: | ||
14 | * | ||
15 | * http://developer.intel.com/design/pentium4/manuals/253668.htm | ||
16 | * | ||
17 | * For more information, go to http://www.urbanmyth.org/microcode | ||
18 | * | ||
19 | * This program is free software; you can redistribute it and/or | ||
20 | * modify it under the terms of the GNU General Public License | ||
21 | * as published by the Free Software Foundation; either version | ||
22 | * 2 of the License, or (at your option) any later version. | ||
23 | * | ||
24 | * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
25 | * Initial release. | ||
26 | * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
27 | * Added read() support + cleanups. | ||
28 | * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
29 | * Added 'device trimming' support. open(O_WRONLY) zeroes | ||
30 | * and frees the saved copy of applied microcode. | ||
31 | * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> | ||
32 | * Made to use devfs (/dev/cpu/microcode) + cleanups. | ||
33 | * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
34 | * Added misc device support (now uses both devfs and misc). | ||
35 | * Added MICROCODE_IOCFREE ioctl to clear memory. | ||
36 | * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> | ||
37 | * Messages for error cases (non Intel & no suitable microcode). | ||
38 | * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> | ||
39 | * Removed ->release(). Removed exclusive open and status bitmap. | ||
40 | * Added microcode_rwsem to serialize read()/write()/ioctl(). | ||
41 | * Removed global kernel lock usage. | ||
42 | * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> | ||
43 | * Write 0 to 0x8B msr and then cpuid before reading revision, | ||
44 | * so that it works even if there were no update done by the | ||
45 | * BIOS. Otherwise, reading from 0x8B gives junk (which happened | ||
46 | * to be 0 on my machine which is why it worked even when I | ||
47 | * disabled update by the BIOS) | ||
48 | * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. | ||
49 | * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and | ||
50 | * Tigran Aivazian <tigran@veritas.com> | ||
51 | * Intel Pentium 4 processor support and bugfixes. | ||
52 | * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> | ||
53 | * Bugfix for HT (Hyper-Threading) enabled processors | ||
54 | * whereby processor resources are shared by all logical processors | ||
55 | * in a single CPU package. | ||
56 | * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and | ||
57 | * Tigran Aivazian <tigran@veritas.com>, | ||
58 | * Serialize updates as required on HT processors due to | ||
59 | * speculative nature of implementation. | ||
60 | * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> | ||
61 | * Fix the panic when writing zero-length microcode chunk. | ||
62 | * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, | ||
63 | * Jun Nakajima <jun.nakajima@intel.com> | ||
64 | * Support for the microcode updates in the new format. | ||
65 | * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> | ||
66 | * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl | ||
67 | * because we no longer hold a copy of applied microcode | ||
68 | * in kernel memory. | ||
69 | * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> | ||
70 | * Fix sigmatch() macro to handle old CPUs with pf == 0. | ||
71 | * Thanks to Stuart Swales for pointing out this bug. | ||
72 | */ | ||
73 | #include <linux/capability.h> | ||
74 | #include <linux/kernel.h> | ||
75 | #include <linux/init.h> | ||
76 | #include <linux/sched.h> | ||
77 | #include <linux/smp_lock.h> | ||
78 | #include <linux/cpumask.h> | ||
79 | #include <linux/module.h> | ||
80 | #include <linux/slab.h> | ||
81 | #include <linux/vmalloc.h> | ||
82 | #include <linux/miscdevice.h> | ||
83 | #include <linux/spinlock.h> | ||
84 | #include <linux/mm.h> | ||
85 | #include <linux/fs.h> | ||
86 | #include <linux/mutex.h> | ||
87 | #include <linux/cpu.h> | ||
88 | #include <linux/firmware.h> | ||
89 | #include <linux/platform_device.h> | ||
90 | |||
91 | #include <asm/msr.h> | ||
92 | #include <asm/uaccess.h> | ||
93 | #include <asm/processor.h> | ||
94 | #include <asm/microcode.h> | ||
95 | |||
96 | MODULE_DESCRIPTION("Microcode Update Driver"); | ||
97 | MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); | ||
98 | MODULE_LICENSE("GPL"); | ||
99 | |||
/*
 * Header at the start of every Intel microcode update image.
 * A zero datasize/totalsize means "use the default size"; see
 * get_datasize() and get_totalsize() below.
 */
struct microcode_header_intel {
	unsigned int hdrver;
	unsigned int rev;
	unsigned int date;
	unsigned int sig;
	unsigned int cksum;
	unsigned int ldrver;
	unsigned int pf;
	unsigned int datasize;
	unsigned int totalsize;
	unsigned int reserved[3];
};

/* One complete update: the header immediately followed by the data words. */
struct microcode_intel {
	struct microcode_header_intel hdr;
	unsigned int bits[0];
};

/* microcode format is extended from prescott processors */
struct extended_signature {
	unsigned int sig;
	unsigned int pf;
	unsigned int cksum;
};

/* Optional table of additional signatures that follows the update data. */
struct extended_sigtable {
	unsigned int count;
	unsigned int cksum;
	unsigned int reserved[3];
	struct extended_signature sigs[0];
};

#define DEFAULT_UCODE_DATASIZE	(2000)
#define MC_HEADER_SIZE		(sizeof(struct microcode_header_intel))
#define DEFAULT_UCODE_TOTALSIZE	(DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
#define EXT_HEADER_SIZE		(sizeof(struct extended_sigtable))
#define EXT_SIGNATURE_SIZE	(sizeof(struct extended_signature))
#define DWSIZE			(sizeof(u32))

/*
 * Total size of the update at @mc, falling back to the default when the
 * header field is zero.  The argument is parenthesized before the cast so
 * that expressions such as "ptr + offset" are converted as a whole.
 */
#define get_totalsize(mc) \
	(((struct microcode_intel *)(mc))->hdr.totalsize ? \
	 ((struct microcode_intel *)(mc))->hdr.totalsize : \
	 DEFAULT_UCODE_TOTALSIZE)

/* Size of the data part of the update at @mc, with the same zero fallback. */
#define get_datasize(mc) \
	(((struct microcode_intel *)(mc))->hdr.datasize ? \
	 ((struct microcode_intel *)(mc))->hdr.datasize : DEFAULT_UCODE_DATASIZE)

/*
 * Signatures must be equal, and the platform flag masks must either share
 * a bit or both be zero (old CPUs with pf == 0).
 */
#define sigmatch(s1, s2, p1, p2) \
	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))

/* Size in bytes of an extended signature table holding (et)->count entries. */
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151 | |||
152 | /* serialize access to the physical write to MSR 0x79 */ | ||
153 | static DEFINE_SPINLOCK(microcode_update_lock); | ||
154 | |||
155 | static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) | ||
156 | { | ||
157 | struct cpuinfo_x86 *c = &cpu_data(cpu_num); | ||
158 | unsigned int val[2]; | ||
159 | |||
160 | memset(csig, 0, sizeof(*csig)); | ||
161 | |||
162 | if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || | ||
163 | cpu_has(c, X86_FEATURE_IA64)) { | ||
164 | printk(KERN_ERR "microcode: CPU%d not a capable Intel " | ||
165 | "processor\n", cpu_num); | ||
166 | return -1; | ||
167 | } | ||
168 | |||
169 | csig->sig = cpuid_eax(0x00000001); | ||
170 | |||
171 | if ((c->x86_model >= 5) || (c->x86 > 6)) { | ||
172 | /* get processor flags from MSR 0x17 */ | ||
173 | rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | ||
174 | csig->pf = 1 << ((val[1] >> 18) & 7); | ||
175 | } | ||
176 | |||
177 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
178 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
179 | sync_core(); | ||
180 | /* get the current revision from MSR 0x8B */ | ||
181 | rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); | ||
182 | pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", | ||
183 | csig->sig, csig->pf, csig->rev); | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) | ||
189 | { | ||
190 | return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; | ||
191 | } | ||
192 | |||
193 | static inline int | ||
194 | update_match_revision(struct microcode_header_intel *mc_header, int rev) | ||
195 | { | ||
196 | return (mc_header->rev <= rev) ? 0 : 1; | ||
197 | } | ||
198 | |||
199 | static int microcode_sanity_check(void *mc) | ||
200 | { | ||
201 | struct microcode_header_intel *mc_header = mc; | ||
202 | struct extended_sigtable *ext_header = NULL; | ||
203 | struct extended_signature *ext_sig; | ||
204 | unsigned long total_size, data_size, ext_table_size; | ||
205 | int sum, orig_sum, ext_sigcount = 0, i; | ||
206 | |||
207 | total_size = get_totalsize(mc_header); | ||
208 | data_size = get_datasize(mc_header); | ||
209 | if (data_size + MC_HEADER_SIZE > total_size) { | ||
210 | printk(KERN_ERR "microcode: error! " | ||
211 | "Bad data size in microcode data file\n"); | ||
212 | return -EINVAL; | ||
213 | } | ||
214 | |||
215 | if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { | ||
216 | printk(KERN_ERR "microcode: error! " | ||
217 | "Unknown microcode update format\n"); | ||
218 | return -EINVAL; | ||
219 | } | ||
220 | ext_table_size = total_size - (MC_HEADER_SIZE + data_size); | ||
221 | if (ext_table_size) { | ||
222 | if ((ext_table_size < EXT_HEADER_SIZE) | ||
223 | || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { | ||
224 | printk(KERN_ERR "microcode: error! " | ||
225 | "Small exttable size in microcode data file\n"); | ||
226 | return -EINVAL; | ||
227 | } | ||
228 | ext_header = mc + MC_HEADER_SIZE + data_size; | ||
229 | if (ext_table_size != exttable_size(ext_header)) { | ||
230 | printk(KERN_ERR "microcode: error! " | ||
231 | "Bad exttable size in microcode data file\n"); | ||
232 | return -EFAULT; | ||
233 | } | ||
234 | ext_sigcount = ext_header->count; | ||
235 | } | ||
236 | |||
237 | /* check extended table checksum */ | ||
238 | if (ext_table_size) { | ||
239 | int ext_table_sum = 0; | ||
240 | int *ext_tablep = (int *)ext_header; | ||
241 | |||
242 | i = ext_table_size / DWSIZE; | ||
243 | while (i--) | ||
244 | ext_table_sum += ext_tablep[i]; | ||
245 | if (ext_table_sum) { | ||
246 | printk(KERN_WARNING "microcode: aborting, " | ||
247 | "bad extended signature table checksum\n"); | ||
248 | return -EINVAL; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | /* calculate the checksum */ | ||
253 | orig_sum = 0; | ||
254 | i = (MC_HEADER_SIZE + data_size) / DWSIZE; | ||
255 | while (i--) | ||
256 | orig_sum += ((int *)mc)[i]; | ||
257 | if (orig_sum) { | ||
258 | printk(KERN_ERR "microcode: aborting, bad checksum\n"); | ||
259 | return -EINVAL; | ||
260 | } | ||
261 | if (!ext_table_size) | ||
262 | return 0; | ||
263 | /* check extended signature checksum */ | ||
264 | for (i = 0; i < ext_sigcount; i++) { | ||
265 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE + | ||
266 | EXT_SIGNATURE_SIZE * i; | ||
267 | sum = orig_sum | ||
268 | - (mc_header->sig + mc_header->pf + mc_header->cksum) | ||
269 | + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); | ||
270 | if (sum) { | ||
271 | printk(KERN_ERR "microcode: aborting, bad checksum\n"); | ||
272 | return -EINVAL; | ||
273 | } | ||
274 | } | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * return 0 - no update found | ||
280 | * return 1 - found update | ||
281 | */ | ||
282 | static int | ||
283 | get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) | ||
284 | { | ||
285 | struct microcode_header_intel *mc_header = mc; | ||
286 | struct extended_sigtable *ext_header; | ||
287 | unsigned long total_size = get_totalsize(mc_header); | ||
288 | int ext_sigcount, i; | ||
289 | struct extended_signature *ext_sig; | ||
290 | |||
291 | if (!update_match_revision(mc_header, rev)) | ||
292 | return 0; | ||
293 | |||
294 | if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) | ||
295 | return 1; | ||
296 | |||
297 | /* Look for ext. headers: */ | ||
298 | if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) | ||
299 | return 0; | ||
300 | |||
301 | ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; | ||
302 | ext_sigcount = ext_header->count; | ||
303 | ext_sig = (void *)ext_header + EXT_HEADER_SIZE; | ||
304 | |||
305 | for (i = 0; i < ext_sigcount; i++) { | ||
306 | if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) | ||
307 | return 1; | ||
308 | ext_sig++; | ||
309 | } | ||
310 | return 0; | ||
311 | } | ||
312 | |||
313 | static void apply_microcode(int cpu) | ||
314 | { | ||
315 | unsigned long flags; | ||
316 | unsigned int val[2]; | ||
317 | int cpu_num = raw_smp_processor_id(); | ||
318 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
319 | struct microcode_intel *mc_intel = uci->mc; | ||
320 | |||
321 | /* We should bind the task to the CPU */ | ||
322 | BUG_ON(cpu_num != cpu); | ||
323 | |||
324 | if (mc_intel == NULL) | ||
325 | return; | ||
326 | |||
327 | /* serialize access to the physical write to MSR 0x79 */ | ||
328 | spin_lock_irqsave(µcode_update_lock, flags); | ||
329 | |||
330 | /* write microcode via MSR 0x79 */ | ||
331 | wrmsr(MSR_IA32_UCODE_WRITE, | ||
332 | (unsigned long) mc_intel->bits, | ||
333 | (unsigned long) mc_intel->bits >> 16 >> 16); | ||
334 | wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
335 | |||
336 | /* see notes above for revision 1.07. Apparent chip bug */ | ||
337 | sync_core(); | ||
338 | |||
339 | /* get the current revision from MSR 0x8B */ | ||
340 | rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
341 | |||
342 | spin_unlock_irqrestore(µcode_update_lock, flags); | ||
343 | if (val[1] != mc_intel->hdr.rev) { | ||
344 | printk(KERN_ERR "microcode: CPU%d update from revision " | ||
345 | "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); | ||
346 | return; | ||
347 | } | ||
348 | printk(KERN_INFO "microcode: CPU%d updated from revision " | ||
349 | "0x%x to 0x%x, date = %04x-%02x-%02x \n", | ||
350 | cpu_num, uci->cpu_sig.rev, val[1], | ||
351 | mc_intel->hdr.date & 0xffff, | ||
352 | mc_intel->hdr.date >> 24, | ||
353 | (mc_intel->hdr.date >> 16) & 0xff); | ||
354 | uci->cpu_sig.rev = val[1]; | ||
355 | } | ||
356 | |||
357 | static int generic_load_microcode(int cpu, void *data, size_t size, | ||
358 | int (*get_ucode_data)(void *, const void *, size_t)) | ||
359 | { | ||
360 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
361 | u8 *ucode_ptr = data, *new_mc = NULL, *mc; | ||
362 | int new_rev = uci->cpu_sig.rev; | ||
363 | unsigned int leftover = size; | ||
364 | |||
365 | while (leftover) { | ||
366 | struct microcode_header_intel mc_header; | ||
367 | unsigned int mc_size; | ||
368 | |||
369 | if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) | ||
370 | break; | ||
371 | |||
372 | mc_size = get_totalsize(&mc_header); | ||
373 | if (!mc_size || mc_size > leftover) { | ||
374 | printk(KERN_ERR "microcode: error!" | ||
375 | "Bad data in microcode data file\n"); | ||
376 | break; | ||
377 | } | ||
378 | |||
379 | mc = vmalloc(mc_size); | ||
380 | if (!mc) | ||
381 | break; | ||
382 | |||
383 | if (get_ucode_data(mc, ucode_ptr, mc_size) || | ||
384 | microcode_sanity_check(mc) < 0) { | ||
385 | vfree(mc); | ||
386 | break; | ||
387 | } | ||
388 | |||
389 | if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { | ||
390 | if (new_mc) | ||
391 | vfree(new_mc); | ||
392 | new_rev = mc_header.rev; | ||
393 | new_mc = mc; | ||
394 | } else | ||
395 | vfree(mc); | ||
396 | |||
397 | ucode_ptr += mc_size; | ||
398 | leftover -= mc_size; | ||
399 | } | ||
400 | |||
401 | if (new_mc) { | ||
402 | if (!leftover) { | ||
403 | if (uci->mc) | ||
404 | vfree(uci->mc); | ||
405 | uci->mc = (struct microcode_intel *)new_mc; | ||
406 | pr_debug("microcode: CPU%d found a matching microcode update with" | ||
407 | " version 0x%x (current=0x%x)\n", | ||
408 | cpu, new_rev, uci->cpu_sig.rev); | ||
409 | } else | ||
410 | vfree(new_mc); | ||
411 | } | ||
412 | |||
413 | return (int)leftover; | ||
414 | } | ||
415 | |||
/*
 * Copy callback for firmware-backed data: a plain memcpy of @n bytes
 * from @from to @to.  Always returns 0.
 */
static int get_ucode_fw(void *to, const void *from, size_t n)
{
	(void)memcpy(to, from, n);

	return 0;
}
421 | |||
422 | static int request_microcode_fw(int cpu, struct device *device) | ||
423 | { | ||
424 | char name[30]; | ||
425 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
426 | const struct firmware *firmware; | ||
427 | int ret; | ||
428 | |||
429 | /* We should bind the task to the CPU */ | ||
430 | BUG_ON(cpu != raw_smp_processor_id()); | ||
431 | sprintf(name, "intel-ucode/%02x-%02x-%02x", | ||
432 | c->x86, c->x86_model, c->x86_mask); | ||
433 | ret = request_firmware(&firmware, name, device); | ||
434 | if (ret) { | ||
435 | pr_debug("microcode: data file %s load failed\n", name); | ||
436 | return ret; | ||
437 | } | ||
438 | |||
439 | ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, | ||
440 | &get_ucode_fw); | ||
441 | |||
442 | release_firmware(firmware); | ||
443 | |||
444 | return ret; | ||
445 | } | ||
446 | |||
447 | static int get_ucode_user(void *to, const void *from, size_t n) | ||
448 | { | ||
449 | return copy_from_user(to, from, n); | ||
450 | } | ||
451 | |||
452 | static int request_microcode_user(int cpu, const void __user *buf, size_t size) | ||
453 | { | ||
454 | /* We should bind the task to the CPU */ | ||
455 | BUG_ON(cpu != raw_smp_processor_id()); | ||
456 | |||
457 | return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); | ||
458 | } | ||
459 | |||
460 | static void microcode_fini_cpu(int cpu) | ||
461 | { | ||
462 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | ||
463 | |||
464 | vfree(uci->mc); | ||
465 | uci->mc = NULL; | ||
466 | } | ||
467 | |||
468 | struct microcode_ops microcode_intel_ops = { | ||
469 | .request_microcode_user = request_microcode_user, | ||
470 | .request_microcode_fw = request_microcode_fw, | ||
471 | .collect_cpu_info = collect_cpu_info, | ||
472 | .apply_microcode = apply_microcode, | ||
473 | .microcode_fini_cpu = microcode_fini_cpu, | ||
474 | }; | ||
475 | |||
476 | struct microcode_ops * __init init_intel_microcode(void) | ||
477 | { | ||
478 | return µcode_intel_ops; | ||
479 | } | ||
480 | |||
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b3fb430725cb..f98f4e1dba09 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c | |||
@@ -397,7 +397,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) | |||
397 | generic_bigsmp_probe(); | 397 | generic_bigsmp_probe(); |
398 | #endif | 398 | #endif |
399 | 399 | ||
400 | #ifdef CONFIG_X86_32 | ||
400 | setup_apic_routing(); | 401 | setup_apic_routing(); |
402 | #endif | ||
401 | if (!num_processors) | 403 | if (!num_processors) |
402 | printk(KERN_ERR "MPTABLE: no processors registered!\n"); | 404 | printk(KERN_ERR "MPTABLE: no processors registered!\n"); |
403 | return num_processors; | 405 | return num_processors; |
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 2e2af5d18191..82a7c7ed6d45 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) | |||
163 | { | 163 | { |
164 | struct device *dev; | 164 | struct device *dev; |
165 | 165 | ||
166 | dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), | 166 | dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, |
167 | NULL, "msr%d", cpu); | 167 | "msr%d", cpu); |
168 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 168 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
169 | } | 169 | } |
170 | 170 | ||
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index abb78a2cc4ad..2c97f07f1c2c 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c | |||
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void) | |||
299 | on_each_cpu(__acpi_nmi_disable, NULL, 1); | 299 | on_each_cpu(__acpi_nmi_disable, NULL, 1); |
300 | } | 300 | } |
301 | 301 | ||
302 | /* | ||
303 | * This function is called as soon as the LAPIC NMI watchdog driver has everything | ||
304 | * in place and it's ready to check if the NMIs belong to the NMI watchdog | ||
305 | */ | ||
306 | void cpu_nmi_set_wd_enabled(void) | ||
307 | { | ||
308 | __get_cpu_var(wd_enabled) = 1; | ||
309 | } | ||
310 | |||
302 | void setup_apic_nmi_watchdog(void *unused) | 311 | void setup_apic_nmi_watchdog(void *unused) |
303 | { | 312 | { |
304 | if (__get_cpu_var(wd_enabled)) | 313 | if (__get_cpu_var(wd_enabled)) |
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused) | |||
311 | 320 | ||
312 | switch (nmi_watchdog) { | 321 | switch (nmi_watchdog) { |
313 | case NMI_LOCAL_APIC: | 322 | case NMI_LOCAL_APIC: |
314 | /* enable it before to avoid race with handler */ | ||
315 | __get_cpu_var(wd_enabled) = 1; | ||
316 | if (lapic_watchdog_init(nmi_hz) < 0) { | 323 | if (lapic_watchdog_init(nmi_hz) < 0) { |
317 | __get_cpu_var(wd_enabled) = 0; | 324 | __get_cpu_var(wd_enabled) = 0; |
318 | return; | 325 | return; |
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index eecc8c18f010..4caff39078e0 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c | |||
@@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, | |||
229 | } | 229 | } |
230 | } | 230 | } |
231 | 231 | ||
232 | static int __init numaq_setup_ioapic_ids(void) | ||
233 | { | ||
234 | /* so can skip it */ | ||
235 | return 1; | ||
236 | } | ||
237 | |||
232 | static struct x86_quirks numaq_x86_quirks __initdata = { | 238 | static struct x86_quirks numaq_x86_quirks __initdata = { |
233 | .arch_pre_time_init = numaq_pre_time_init, | 239 | .arch_pre_time_init = numaq_pre_time_init, |
234 | .arch_time_init = NULL, | 240 | .arch_time_init = NULL, |
@@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { | |||
243 | .mpc_oem_bus_info = mpc_oem_bus_info, | 249 | .mpc_oem_bus_info = mpc_oem_bus_info, |
244 | .mpc_oem_pci_bus = mpc_oem_pci_bus, | 250 | .mpc_oem_pci_bus = mpc_oem_pci_bus, |
245 | .smp_read_mpc_oem = smp_read_mpc_oem, | 251 | .smp_read_mpc_oem = smp_read_mpc_oem, |
252 | .setup_ioapic_ids = numaq_setup_ioapic_ids, | ||
246 | }; | 253 | }; |
247 | 254 | ||
248 | void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, | 255 | void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, |
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 3e6672274807..7a13fac63a1f 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c | |||
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd); | |||
190 | static void __init platform_detect(void) | 190 | static void __init platform_detect(void) |
191 | { | 191 | { |
192 | size_t propsize; | 192 | size_t propsize; |
193 | u32 rev; | 193 | __be32 rev; |
194 | 194 | ||
195 | if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, | 195 | if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, |
196 | &propsize) || propsize != 4) { | 196 | &propsize) || propsize != 4) { |
197 | printk(KERN_ERR "ofw: getprop call failed!\n"); | 197 | printk(KERN_ERR "ofw: getprop call failed!\n"); |
198 | rev = 0; | 198 | rev = cpu_to_be32(0); |
199 | } | 199 | } |
200 | olpc_platform_info.boardrev = be32_to_cpu(rev); | 200 | olpc_platform_info.boardrev = be32_to_cpu(rev); |
201 | } | 201 | } |
@@ -203,7 +203,7 @@ static void __init platform_detect(void) | |||
203 | static void __init platform_detect(void) | 203 | static void __init platform_detect(void) |
204 | { | 204 | { |
205 | /* stopgap until OFW support is added to the kernel */ | 205 | /* stopgap until OFW support is added to the kernel */ |
206 | olpc_platform_info.boardrev = be32_to_cpu(0xc2); | 206 | olpc_platform_info.boardrev = 0xc2; |
207 | } | 207 | } |
208 | #endif | 208 | #endif |
209 | 209 | ||
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 000000000000..0e9f1982b1dd --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c | |||
@@ -0,0 +1,37 @@ | |||
1 | /* | ||
2 | * Split spinlock implementation out into its own file, so it can be | ||
3 | * compiled in a FTRACE-compatible way. | ||
4 | */ | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/module.h> | ||
7 | |||
8 | #include <asm/paravirt.h> | ||
9 | |||
/*
 * Default spin_lock_flags hook: @flags is ignored and the lock is taken
 * with the plain __raw_spin_lock().
 */
static void default_spin_lock_flags(struct raw_spinlock *lock,
				    unsigned long flags)
{
	__raw_spin_lock(lock);
}
14 | |||
15 | struct pv_lock_ops pv_lock_ops = { | ||
16 | #ifdef CONFIG_SMP | ||
17 | .spin_is_locked = __ticket_spin_is_locked, | ||
18 | .spin_is_contended = __ticket_spin_is_contended, | ||
19 | |||
20 | .spin_lock = __ticket_spin_lock, | ||
21 | .spin_lock_flags = default_spin_lock_flags, | ||
22 | .spin_trylock = __ticket_spin_trylock, | ||
23 | .spin_unlock = __ticket_spin_unlock, | ||
24 | #endif | ||
25 | }; | ||
26 | EXPORT_SYMBOL(pv_lock_ops); | ||
27 | |||
28 | void __init paravirt_use_bytelocks(void) | ||
29 | { | ||
30 | #ifdef CONFIG_SMP | ||
31 | pv_lock_ops.spin_is_locked = __byte_spin_is_locked; | ||
32 | pv_lock_ops.spin_is_contended = __byte_spin_is_contended; | ||
33 | pv_lock_ops.spin_lock = __byte_spin_lock; | ||
34 | pv_lock_ops.spin_trylock = __byte_spin_trylock; | ||
35 | pv_lock_ops.spin_unlock = __byte_spin_unlock; | ||
36 | #endif | ||
37 | } | ||
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 300da17e61cb..e4c8fb608873 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c | |||
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) | |||
268 | return __get_cpu_var(paravirt_lazy_mode); | 268 | return __get_cpu_var(paravirt_lazy_mode); |
269 | } | 269 | } |
270 | 270 | ||
271 | void __init paravirt_use_bytelocks(void) | ||
272 | { | ||
273 | #ifdef CONFIG_SMP | ||
274 | pv_lock_ops.spin_is_locked = __byte_spin_is_locked; | ||
275 | pv_lock_ops.spin_is_contended = __byte_spin_is_contended; | ||
276 | pv_lock_ops.spin_lock = __byte_spin_lock; | ||
277 | pv_lock_ops.spin_trylock = __byte_spin_trylock; | ||
278 | pv_lock_ops.spin_unlock = __byte_spin_unlock; | ||
279 | #endif | ||
280 | } | ||
281 | |||
282 | struct pv_info pv_info = { | 271 | struct pv_info pv_info = { |
283 | .name = "bare hardware", | 272 | .name = "bare hardware", |
284 | .paravirt_enabled = 0, | 273 | .paravirt_enabled = 0, |
@@ -330,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
330 | #endif | 319 | #endif |
331 | .wbinvd = native_wbinvd, | 320 | .wbinvd = native_wbinvd, |
332 | .read_msr = native_read_msr_safe, | 321 | .read_msr = native_read_msr_safe, |
322 | .read_msr_amd = native_read_msr_amd_safe, | ||
333 | .write_msr = native_write_msr_safe, | 323 | .write_msr = native_write_msr_safe, |
334 | .read_tsc = native_read_tsc, | 324 | .read_tsc = native_read_tsc, |
335 | .read_pmc = native_read_pmc, | 325 | .read_pmc = native_read_pmc, |
@@ -348,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
348 | .write_ldt_entry = native_write_ldt_entry, | 338 | .write_ldt_entry = native_write_ldt_entry, |
349 | .write_gdt_entry = native_write_gdt_entry, | 339 | .write_gdt_entry = native_write_gdt_entry, |
350 | .write_idt_entry = native_write_idt_entry, | 340 | .write_idt_entry = native_write_idt_entry, |
341 | |||
342 | .alloc_ldt = paravirt_nop, | ||
343 | .free_ldt = paravirt_nop, | ||
344 | |||
351 | .load_sp0 = native_load_sp0, | 345 | .load_sp0 = native_load_sp0, |
352 | 346 | ||
353 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) | 347 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) |
@@ -373,8 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = { | |||
373 | 367 | ||
374 | struct pv_apic_ops pv_apic_ops = { | 368 | struct pv_apic_ops pv_apic_ops = { |
375 | #ifdef CONFIG_X86_LOCAL_APIC | 369 | #ifdef CONFIG_X86_LOCAL_APIC |
376 | .apic_write = native_apic_write, | ||
377 | .apic_read = native_apic_read, | ||
378 | .setup_boot_clock = setup_boot_APIC_clock, | 370 | .setup_boot_clock = setup_boot_APIC_clock, |
379 | .setup_secondary_clock = setup_secondary_APIC_clock, | 371 | .setup_secondary_clock = setup_secondary_APIC_clock, |
380 | .startup_ipi_hook = paravirt_nop, | 372 | .startup_ipi_hook = paravirt_nop, |
@@ -461,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = { | |||
461 | .set_fixmap = native_set_fixmap, | 453 | .set_fixmap = native_set_fixmap, |
462 | }; | 454 | }; |
463 | 455 | ||
464 | struct pv_lock_ops pv_lock_ops = { | ||
465 | #ifdef CONFIG_SMP | ||
466 | .spin_is_locked = __ticket_spin_is_locked, | ||
467 | .spin_is_contended = __ticket_spin_is_contended, | ||
468 | |||
469 | .spin_lock = __ticket_spin_lock, | ||
470 | .spin_trylock = __ticket_spin_trylock, | ||
471 | .spin_unlock = __ticket_spin_unlock, | ||
472 | #endif | ||
473 | }; | ||
474 | EXPORT_SYMBOL(pv_lock_ops); | ||
475 | |||
476 | EXPORT_SYMBOL_GPL(pv_time_ops); | 456 | EXPORT_SYMBOL_GPL(pv_time_ops); |
477 | EXPORT_SYMBOL (pv_cpu_ops); | 457 | EXPORT_SYMBOL (pv_cpu_ops); |
478 | EXPORT_SYMBOL (pv_mmu_ops); | 458 | EXPORT_SYMBOL (pv_mmu_ops); |
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 58262218781b..9fe644f4861d 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c | |||
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | |||
23 | start = start_##ops##_##x; \ | 23 | start = start_##ops##_##x; \ |
24 | end = end_##ops##_##x; \ | 24 | end = end_##ops##_##x; \ |
25 | goto patch_site | 25 | goto patch_site |
26 | switch(type) { | 26 | switch (type) { |
27 | PATCH_SITE(pv_irq_ops, irq_disable); | 27 | PATCH_SITE(pv_irq_ops, irq_disable); |
28 | PATCH_SITE(pv_irq_ops, irq_enable); | 28 | PATCH_SITE(pv_irq_ops, irq_enable); |
29 | PATCH_SITE(pv_irq_ops, restore_fl); | 29 | PATCH_SITE(pv_irq_ops, restore_fl); |
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index dcdac6c826e9..e1e731d78f38 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, | |||
217 | 217 | ||
218 | #endif /* CONFIG_IOMMU_DEBUG */ | 218 | #endif /* CONFIG_IOMMU_DEBUG */ |
219 | 219 | ||
220 | static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) | ||
221 | { | ||
222 | unsigned int npages; | ||
223 | |||
224 | npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); | ||
225 | npages >>= PAGE_SHIFT; | ||
226 | |||
227 | return npages; | ||
228 | } | ||
229 | |||
230 | static inline int translation_enabled(struct iommu_table *tbl) | 220 | static inline int translation_enabled(struct iommu_table *tbl) |
231 | { | 221 | { |
232 | /* only PHBs with translation enabled have an IOMMU table */ | 222 | /* only PHBs with translation enabled have an IOMMU table */ |
@@ -261,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, | |||
261 | badbit, tbl, start_addr, npages); | 251 | badbit, tbl, start_addr, npages); |
262 | } | 252 | } |
263 | 253 | ||
264 | set_bit_string(tbl->it_map, index, npages); | 254 | iommu_area_reserve(tbl->it_map, index, npages); |
265 | 255 | ||
266 | spin_unlock_irqrestore(&tbl->it_lock, flags); | 256 | spin_unlock_irqrestore(&tbl->it_lock, flags); |
267 | } | 257 | } |
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev, | |||
408 | if (dmalen == 0) | 398 | if (dmalen == 0) |
409 | break; | 399 | break; |
410 | 400 | ||
411 | npages = num_dma_pages(dma, dmalen); | 401 | npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); |
412 | iommu_free(tbl, dma, npages); | 402 | iommu_free(tbl, dma, npages); |
413 | } | 403 | } |
414 | } | 404 | } |
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | |||
427 | BUG_ON(!sg_page(s)); | 417 | BUG_ON(!sg_page(s)); |
428 | 418 | ||
429 | vaddr = (unsigned long) sg_virt(s); | 419 | vaddr = (unsigned long) sg_virt(s); |
430 | npages = num_dma_pages(vaddr, s->length); | 420 | npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); |
431 | 421 | ||
432 | entry = iommu_range_alloc(dev, tbl, npages); | 422 | entry = iommu_range_alloc(dev, tbl, npages); |
433 | if (entry == bad_dma_address) { | 423 | if (entry == bad_dma_address) { |
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, | |||
464 | struct iommu_table *tbl = find_iommu_table(dev); | 454 | struct iommu_table *tbl = find_iommu_table(dev); |
465 | 455 | ||
466 | uaddr = (unsigned long)vaddr; | 456 | uaddr = (unsigned long)vaddr; |
467 | npages = num_dma_pages(uaddr, size); | 457 | npages = iommu_num_pages(uaddr, size, PAGE_SIZE); |
468 | 458 | ||
469 | return iommu_alloc(dev, tbl, vaddr, npages, direction); | 459 | return iommu_alloc(dev, tbl, vaddr, npages, direction); |
470 | } | 460 | } |
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | |||
475 | struct iommu_table *tbl = find_iommu_table(dev); | 465 | struct iommu_table *tbl = find_iommu_table(dev); |
476 | unsigned int npages; | 466 | unsigned int npages; |
477 | 467 | ||
478 | npages = num_dma_pages(dma_handle, size); | 468 | npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); |
479 | iommu_free(tbl, dma_handle, npages); | 469 | iommu_free(tbl, dma_handle, npages); |
480 | } | 470 | } |
481 | 471 | ||
@@ -491,6 +481,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, | |||
491 | npages = size >> PAGE_SHIFT; | 481 | npages = size >> PAGE_SHIFT; |
492 | order = get_order(size); | 482 | order = get_order(size); |
493 | 483 | ||
484 | flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); | ||
485 | |||
494 | /* alloc enough pages (and possibly more) */ | 486 | /* alloc enough pages (and possibly more) */ |
495 | ret = (void *)__get_free_pages(flag, order); | 487 | ret = (void *)__get_free_pages(flag, order); |
496 | if (!ret) | 488 | if (!ret) |
@@ -510,8 +502,22 @@ error: | |||
510 | return ret; | 502 | return ret; |
511 | } | 503 | } |
512 | 504 | ||
505 | static void calgary_free_coherent(struct device *dev, size_t size, | ||
506 | void *vaddr, dma_addr_t dma_handle) | ||
507 | { | ||
508 | unsigned int npages; | ||
509 | struct iommu_table *tbl = find_iommu_table(dev); | ||
510 | |||
511 | size = PAGE_ALIGN(size); | ||
512 | npages = size >> PAGE_SHIFT; | ||
513 | |||
514 | iommu_free(tbl, dma_handle, npages); | ||
515 | free_pages((unsigned long)vaddr, get_order(size)); | ||
516 | } | ||
517 | |||
513 | static struct dma_mapping_ops calgary_dma_ops = { | 518 | static struct dma_mapping_ops calgary_dma_ops = { |
514 | .alloc_coherent = calgary_alloc_coherent, | 519 | .alloc_coherent = calgary_alloc_coherent, |
520 | .free_coherent = calgary_free_coherent, | ||
515 | .map_single = calgary_map_single, | 521 | .map_single = calgary_map_single, |
516 | .unmap_single = calgary_unmap_single, | 522 | .unmap_single = calgary_unmap_single, |
517 | .map_sg = calgary_map_sg, | 523 | .map_sg = calgary_map_sg, |
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 87d4d6964ec2..192624820217 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address); | |||
41 | /* Dummy device used for NULL arguments (normally ISA). Better would | 41 | /* Dummy device used for NULL arguments (normally ISA). Better would |
42 | be probably a smaller DMA mask, but this is bug-to-bug compatible | 42 | be probably a smaller DMA mask, but this is bug-to-bug compatible |
43 | to older i386. */ | 43 | to older i386. */ |
44 | struct device fallback_dev = { | 44 | struct device x86_dma_fallback_dev = { |
45 | .bus_id = "fallback device", | 45 | .bus_id = "fallback device", |
46 | .coherent_dma_mask = DMA_32BIT_MASK, | 46 | .coherent_dma_mask = DMA_32BIT_MASK, |
47 | .dma_mask = &fallback_dev.coherent_dma_mask, | 47 | .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, |
48 | }; | 48 | }; |
49 | EXPORT_SYMBOL(x86_dma_fallback_dev); | ||
49 | 50 | ||
50 | int dma_set_mask(struct device *dev, u64 mask) | 51 | int dma_set_mask(struct device *dev, u64 mask) |
51 | { | 52 | { |
@@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void) | |||
82 | * using 512M as goal | 83 | * using 512M as goal |
83 | */ | 84 | */ |
84 | align = 64ULL<<20; | 85 | align = 64ULL<<20; |
85 | size = round_up(dma32_bootmem_size, align); | 86 | size = roundup(dma32_bootmem_size, align); |
86 | dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, | 87 | dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, |
87 | 512ULL<<20); | 88 | 512ULL<<20); |
88 | if (dma32_bootmem_ptr) | 89 | if (dma32_bootmem_ptr) |
@@ -124,15 +125,46 @@ void __init pci_iommu_alloc(void) | |||
124 | pci_swiotlb_init(); | 125 | pci_swiotlb_init(); |
125 | } | 126 | } |
126 | 127 | ||
127 | unsigned long iommu_num_pages(unsigned long addr, unsigned long len) | 128 | unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) |
128 | { | 129 | { |
129 | unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); | 130 | unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); |
130 | 131 | ||
131 | return size >> PAGE_SHIFT; | 132 | return size >> PAGE_SHIFT; |
132 | } | 133 | } |
133 | EXPORT_SYMBOL(iommu_num_pages); | 134 | EXPORT_SYMBOL(iommu_nr_pages); |
134 | #endif | 135 | #endif |
135 | 136 | ||
137 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, | ||
138 | dma_addr_t *dma_addr, gfp_t flag) | ||
139 | { | ||
140 | unsigned long dma_mask; | ||
141 | struct page *page; | ||
142 | dma_addr_t addr; | ||
143 | |||
144 | dma_mask = dma_alloc_coherent_mask(dev, flag); | ||
145 | |||
146 | flag |= __GFP_ZERO; | ||
147 | again: | ||
148 | page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); | ||
149 | if (!page) | ||
150 | return NULL; | ||
151 | |||
152 | addr = page_to_phys(page); | ||
153 | if (!is_buffer_dma_capable(dma_mask, addr, size)) { | ||
154 | __free_pages(page, get_order(size)); | ||
155 | |||
156 | if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { | ||
157 | flag = (flag & ~GFP_DMA32) | GFP_DMA; | ||
158 | goto again; | ||
159 | } | ||
160 | |||
161 | return NULL; | ||
162 | } | ||
163 | |||
164 | *dma_addr = addr; | ||
165 | return page_address(page); | ||
166 | } | ||
167 | |||
136 | /* | 168 | /* |
137 | * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter | 169 | * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter |
138 | * documentation. | 170 | * documentation. |
@@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask) | |||
241 | } | 273 | } |
242 | EXPORT_SYMBOL(dma_supported); | 274 | EXPORT_SYMBOL(dma_supported); |
243 | 275 | ||
244 | /* Allocate DMA memory on node near device */ | ||
245 | static noinline struct page * | ||
246 | dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) | ||
247 | { | ||
248 | int node; | ||
249 | |||
250 | node = dev_to_node(dev); | ||
251 | |||
252 | return alloc_pages_node(node, gfp, order); | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Allocate memory for a coherent mapping. | ||
257 | */ | ||
258 | void * | ||
259 | dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, | ||
260 | gfp_t gfp) | ||
261 | { | ||
262 | struct dma_mapping_ops *ops = get_dma_ops(dev); | ||
263 | void *memory = NULL; | ||
264 | struct page *page; | ||
265 | unsigned long dma_mask = 0; | ||
266 | dma_addr_t bus; | ||
267 | int noretry = 0; | ||
268 | |||
269 | /* ignore region specifiers */ | ||
270 | gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); | ||
271 | |||
272 | if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) | ||
273 | return memory; | ||
274 | |||
275 | if (!dev) { | ||
276 | dev = &fallback_dev; | ||
277 | gfp |= GFP_DMA; | ||
278 | } | ||
279 | dma_mask = dev->coherent_dma_mask; | ||
280 | if (dma_mask == 0) | ||
281 | dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; | ||
282 | |||
283 | /* Device not DMA able */ | ||
284 | if (dev->dma_mask == NULL) | ||
285 | return NULL; | ||
286 | |||
287 | /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ | ||
288 | if (gfp & __GFP_DMA) | ||
289 | noretry = 1; | ||
290 | |||
291 | #ifdef CONFIG_X86_64 | ||
292 | /* Why <=? Even when the mask is smaller than 4GB it is often | ||
293 | larger than 16MB and in this case we have a chance of | ||
294 | finding fitting memory in the next higher zone first. If | ||
295 | not retry with true GFP_DMA. -AK */ | ||
296 | if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { | ||
297 | gfp |= GFP_DMA32; | ||
298 | if (dma_mask < DMA_32BIT_MASK) | ||
299 | noretry = 1; | ||
300 | } | ||
301 | #endif | ||
302 | |||
303 | again: | ||
304 | page = dma_alloc_pages(dev, | ||
305 | noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); | ||
306 | if (page == NULL) | ||
307 | return NULL; | ||
308 | |||
309 | { | ||
310 | int high, mmu; | ||
311 | bus = page_to_phys(page); | ||
312 | memory = page_address(page); | ||
313 | high = (bus + size) >= dma_mask; | ||
314 | mmu = high; | ||
315 | if (force_iommu && !(gfp & GFP_DMA)) | ||
316 | mmu = 1; | ||
317 | else if (high) { | ||
318 | free_pages((unsigned long)memory, | ||
319 | get_order(size)); | ||
320 | |||
321 | /* Don't use the 16MB ZONE_DMA unless absolutely | ||
322 | needed. It's better to use remapping first. */ | ||
323 | if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { | ||
324 | gfp = (gfp & ~GFP_DMA32) | GFP_DMA; | ||
325 | goto again; | ||
326 | } | ||
327 | |||
328 | /* Let low level make its own zone decisions */ | ||
329 | gfp &= ~(GFP_DMA32|GFP_DMA); | ||
330 | |||
331 | if (ops->alloc_coherent) | ||
332 | return ops->alloc_coherent(dev, size, | ||
333 | dma_handle, gfp); | ||
334 | return NULL; | ||
335 | } | ||
336 | |||
337 | memset(memory, 0, size); | ||
338 | if (!mmu) { | ||
339 | *dma_handle = bus; | ||
340 | return memory; | ||
341 | } | ||
342 | } | ||
343 | |||
344 | if (ops->alloc_coherent) { | ||
345 | free_pages((unsigned long)memory, get_order(size)); | ||
346 | gfp &= ~(GFP_DMA|GFP_DMA32); | ||
347 | return ops->alloc_coherent(dev, size, dma_handle, gfp); | ||
348 | } | ||
349 | |||
350 | if (ops->map_simple) { | ||
351 | *dma_handle = ops->map_simple(dev, virt_to_phys(memory), | ||
352 | size, | ||
353 | PCI_DMA_BIDIRECTIONAL); | ||
354 | if (*dma_handle != bad_dma_address) | ||
355 | return memory; | ||
356 | } | ||
357 | |||
358 | if (panic_on_overflow) | ||
359 | panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", | ||
360 | (unsigned long)size); | ||
361 | free_pages((unsigned long)memory, get_order(size)); | ||
362 | return NULL; | ||
363 | } | ||
364 | EXPORT_SYMBOL(dma_alloc_coherent); | ||
365 | |||
366 | /* | ||
367 | * Unmap coherent memory. | ||
368 | * The caller must ensure that the device has finished accessing the mapping. | ||
369 | */ | ||
370 | void dma_free_coherent(struct device *dev, size_t size, | ||
371 | void *vaddr, dma_addr_t bus) | ||
372 | { | ||
373 | struct dma_mapping_ops *ops = get_dma_ops(dev); | ||
374 | |||
375 | int order = get_order(size); | ||
376 | WARN_ON(irqs_disabled()); /* for portability */ | ||
377 | if (dma_release_from_coherent(dev, order, vaddr)) | ||
378 | return; | ||
379 | if (ops->unmap_single) | ||
380 | ops->unmap_single(dev, bus, size, 0); | ||
381 | free_pages((unsigned long)vaddr, order); | ||
382 | } | ||
383 | EXPORT_SYMBOL(dma_free_coherent); | ||
384 | |||
385 | static int __init pci_iommu_init(void) | 276 | static int __init pci_iommu_init(void) |
386 | { | 277 | { |
387 | calgary_iommu_init(); | 278 | calgary_iommu_init(); |
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 49285f8fd4d5..e3f75bbcedea 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -27,8 +27,8 @@ | |||
27 | #include <linux/scatterlist.h> | 27 | #include <linux/scatterlist.h> |
28 | #include <linux/iommu-helper.h> | 28 | #include <linux/iommu-helper.h> |
29 | #include <linux/sysdev.h> | 29 | #include <linux/sysdev.h> |
30 | #include <linux/io.h> | ||
30 | #include <asm/atomic.h> | 31 | #include <asm/atomic.h> |
31 | #include <asm/io.h> | ||
32 | #include <asm/mtrr.h> | 32 | #include <asm/mtrr.h> |
33 | #include <asm/pgtable.h> | 33 | #include <asm/pgtable.h> |
34 | #include <asm/proto.h> | 34 | #include <asm/proto.h> |
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved; | |||
80 | AGPEXTERN __u32 *agp_gatt_table; | 80 | AGPEXTERN __u32 *agp_gatt_table; |
81 | 81 | ||
82 | static unsigned long next_bit; /* protected by iommu_bitmap_lock */ | 82 | static unsigned long next_bit; /* protected by iommu_bitmap_lock */ |
83 | static int need_flush; /* global flush state. set for each gart wrap */ | 83 | static bool need_flush; /* global flush state. set for each gart wrap */ |
84 | 84 | ||
85 | static unsigned long alloc_iommu(struct device *dev, int size) | 85 | static unsigned long alloc_iommu(struct device *dev, int size, |
86 | unsigned long align_mask) | ||
86 | { | 87 | { |
87 | unsigned long offset, flags; | 88 | unsigned long offset, flags; |
88 | unsigned long boundary_size; | 89 | unsigned long boundary_size; |
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size) | |||
90 | 91 | ||
91 | base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), | 92 | base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), |
92 | PAGE_SIZE) >> PAGE_SHIFT; | 93 | PAGE_SIZE) >> PAGE_SHIFT; |
93 | boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, | 94 | boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, |
94 | PAGE_SIZE) >> PAGE_SHIFT; | 95 | PAGE_SIZE) >> PAGE_SHIFT; |
95 | 96 | ||
96 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | 97 | spin_lock_irqsave(&iommu_bitmap_lock, flags); |
97 | offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, | 98 | offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, |
98 | size, base_index, boundary_size, 0); | 99 | size, base_index, boundary_size, align_mask); |
99 | if (offset == -1) { | 100 | if (offset == -1) { |
100 | need_flush = 1; | 101 | need_flush = true; |
101 | offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, | 102 | offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, |
102 | size, base_index, boundary_size, 0); | 103 | size, base_index, boundary_size, |
104 | align_mask); | ||
103 | } | 105 | } |
104 | if (offset != -1) { | 106 | if (offset != -1) { |
105 | next_bit = offset+size; | 107 | next_bit = offset+size; |
106 | if (next_bit >= iommu_pages) { | 108 | if (next_bit >= iommu_pages) { |
107 | next_bit = 0; | 109 | next_bit = 0; |
108 | need_flush = 1; | 110 | need_flush = true; |
109 | } | 111 | } |
110 | } | 112 | } |
111 | if (iommu_fullflush) | 113 | if (iommu_fullflush) |
112 | need_flush = 1; | 114 | need_flush = true; |
113 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | 115 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); |
114 | 116 | ||
115 | return offset; | 117 | return offset; |
@@ -134,7 +136,7 @@ static void flush_gart(void) | |||
134 | spin_lock_irqsave(&iommu_bitmap_lock, flags); | 136 | spin_lock_irqsave(&iommu_bitmap_lock, flags); |
135 | if (need_flush) { | 137 | if (need_flush) { |
136 | k8_flush_garts(); | 138 | k8_flush_garts(); |
137 | need_flush = 0; | 139 | need_flush = false; |
138 | } | 140 | } |
139 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); | 141 | spin_unlock_irqrestore(&iommu_bitmap_lock, flags); |
140 | } | 142 | } |
@@ -173,7 +175,8 @@ static void dump_leak(void) | |||
173 | iommu_leak_pages); | 175 | iommu_leak_pages); |
174 | for (i = 0; i < iommu_leak_pages; i += 2) { | 176 | for (i = 0; i < iommu_leak_pages; i += 2) { |
175 | printk(KERN_DEBUG "%lu: ", iommu_pages-i); | 177 | printk(KERN_DEBUG "%lu: ", iommu_pages-i); |
176 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); | 178 | printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], |
179 | 0); | ||
177 | printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); | 180 | printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); |
178 | } | 181 | } |
179 | printk(KERN_DEBUG "\n"); | 182 | printk(KERN_DEBUG "\n"); |
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir) | |||
212 | static inline int | 215 | static inline int |
213 | need_iommu(struct device *dev, unsigned long addr, size_t size) | 216 | need_iommu(struct device *dev, unsigned long addr, size_t size) |
214 | { | 217 | { |
215 | u64 mask = *dev->dma_mask; | 218 | return force_iommu || |
216 | int high = addr + size > mask; | 219 | !is_buffer_dma_capable(*dev->dma_mask, addr, size); |
217 | int mmu = high; | ||
218 | |||
219 | if (force_iommu) | ||
220 | mmu = 1; | ||
221 | |||
222 | return mmu; | ||
223 | } | 220 | } |
224 | 221 | ||
225 | static inline int | 222 | static inline int |
226 | nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | 223 | nonforced_iommu(struct device *dev, unsigned long addr, size_t size) |
227 | { | 224 | { |
228 | u64 mask = *dev->dma_mask; | 225 | return !is_buffer_dma_capable(*dev->dma_mask, addr, size); |
229 | int high = addr + size > mask; | ||
230 | int mmu = high; | ||
231 | |||
232 | return mmu; | ||
233 | } | 226 | } |
234 | 227 | ||
235 | /* Map a single continuous physical area into the IOMMU. | 228 | /* Map a single continuous physical area into the IOMMU. |
236 | * Caller needs to check if the iommu is needed and flush. | 229 | * Caller needs to check if the iommu is needed and flush. |
237 | */ | 230 | */ |
238 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | 231 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, |
239 | size_t size, int dir) | 232 | size_t size, int dir, unsigned long align_mask) |
240 | { | 233 | { |
241 | unsigned long npages = iommu_num_pages(phys_mem, size); | 234 | unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); |
242 | unsigned long iommu_page = alloc_iommu(dev, npages); | 235 | unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); |
243 | int i; | 236 | int i; |
244 | 237 | ||
245 | if (iommu_page == -1) { | 238 | if (iommu_page == -1) { |
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | |||
259 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); | 252 | return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); |
260 | } | 253 | } |
261 | 254 | ||
262 | static dma_addr_t | ||
263 | gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir) | ||
264 | { | ||
265 | dma_addr_t map = dma_map_area(dev, paddr, size, dir); | ||
266 | |||
267 | flush_gart(); | ||
268 | |||
269 | return map; | ||
270 | } | ||
271 | |||
272 | /* Map a single area into the IOMMU */ | 255 | /* Map a single area into the IOMMU */ |
273 | static dma_addr_t | 256 | static dma_addr_t |
274 | gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) | 257 | gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) |
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) | |||
276 | unsigned long bus; | 259 | unsigned long bus; |
277 | 260 | ||
278 | if (!dev) | 261 | if (!dev) |
279 | dev = &fallback_dev; | 262 | dev = &x86_dma_fallback_dev; |
280 | 263 | ||
281 | if (!need_iommu(dev, paddr, size)) | 264 | if (!need_iommu(dev, paddr, size)) |
282 | return paddr; | 265 | return paddr; |
283 | 266 | ||
284 | bus = gart_map_simple(dev, paddr, size, dir); | 267 | bus = dma_map_area(dev, paddr, size, dir, 0); |
268 | flush_gart(); | ||
285 | 269 | ||
286 | return bus; | 270 | return bus; |
287 | } | 271 | } |
@@ -301,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, | |||
301 | return; | 285 | return; |
302 | 286 | ||
303 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | 287 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; |
304 | npages = iommu_num_pages(dma_addr, size); | 288 | npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
305 | for (i = 0; i < npages; i++) { | 289 | for (i = 0; i < npages; i++) { |
306 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | 290 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; |
307 | CLEAR_LEAK(iommu_page + i); | 291 | CLEAR_LEAK(iommu_page + i); |
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, | |||
340 | unsigned long addr = sg_phys(s); | 324 | unsigned long addr = sg_phys(s); |
341 | 325 | ||
342 | if (nonforced_iommu(dev, addr, s->length)) { | 326 | if (nonforced_iommu(dev, addr, s->length)) { |
343 | addr = dma_map_area(dev, addr, s->length, dir); | 327 | addr = dma_map_area(dev, addr, s->length, dir, 0); |
344 | if (addr == bad_dma_address) { | 328 | if (addr == bad_dma_address) { |
345 | if (i > 0) | 329 | if (i > 0) |
346 | gart_unmap_sg(dev, sg, i, dir); | 330 | gart_unmap_sg(dev, sg, i, dir); |
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, | |||
362 | int nelems, struct scatterlist *sout, | 346 | int nelems, struct scatterlist *sout, |
363 | unsigned long pages) | 347 | unsigned long pages) |
364 | { | 348 | { |
365 | unsigned long iommu_start = alloc_iommu(dev, pages); | 349 | unsigned long iommu_start = alloc_iommu(dev, pages, 0); |
366 | unsigned long iommu_page = iommu_start; | 350 | unsigned long iommu_page = iommu_start; |
367 | struct scatterlist *s; | 351 | struct scatterlist *s; |
368 | int i; | 352 | int i; |
@@ -384,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, | |||
384 | } | 368 | } |
385 | 369 | ||
386 | addr = phys_addr; | 370 | addr = phys_addr; |
387 | pages = iommu_num_pages(s->offset, s->length); | 371 | pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
388 | while (pages--) { | 372 | while (pages--) { |
389 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | 373 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); |
390 | SET_LEAK(iommu_page); | 374 | SET_LEAK(iommu_page); |
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | |||
427 | return 0; | 411 | return 0; |
428 | 412 | ||
429 | if (!dev) | 413 | if (!dev) |
430 | dev = &fallback_dev; | 414 | dev = &x86_dma_fallback_dev; |
431 | 415 | ||
432 | out = 0; | 416 | out = 0; |
433 | start = 0; | 417 | start = 0; |
@@ -467,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | |||
467 | 451 | ||
468 | seg_size += s->length; | 452 | seg_size += s->length; |
469 | need = nextneed; | 453 | need = nextneed; |
470 | pages += iommu_num_pages(s->offset, s->length); | 454 | pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
471 | ps = s; | 455 | ps = s; |
472 | } | 456 | } |
473 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) | 457 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) |
@@ -499,6 +483,46 @@ error: | |||
499 | return 0; | 483 | return 0; |
500 | } | 484 | } |
501 | 485 | ||
486 | /* allocate and map a coherent mapping */ | ||
487 | static void * | ||
488 | gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, | ||
489 | gfp_t flag) | ||
490 | { | ||
491 | dma_addr_t paddr; | ||
492 | unsigned long align_mask; | ||
493 | struct page *page; | ||
494 | |||
495 | if (force_iommu && !(flag & GFP_DMA)) { | ||
496 | flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); | ||
497 | page = alloc_pages(flag | __GFP_ZERO, get_order(size)); | ||
498 | if (!page) | ||
499 | return NULL; | ||
500 | |||
501 | align_mask = (1UL << get_order(size)) - 1; | ||
502 | paddr = dma_map_area(dev, page_to_phys(page), size, | ||
503 | DMA_BIDIRECTIONAL, align_mask); | ||
504 | |||
505 | flush_gart(); | ||
506 | if (paddr != bad_dma_address) { | ||
507 | *dma_addr = paddr; | ||
508 | return page_address(page); | ||
509 | } | ||
510 | __free_pages(page, get_order(size)); | ||
511 | } else | ||
512 | return dma_generic_alloc_coherent(dev, size, dma_addr, flag); | ||
513 | |||
514 | return NULL; | ||
515 | } | ||
516 | |||
517 | /* free a coherent mapping */ | ||
518 | static void | ||
519 | gart_free_coherent(struct device *dev, size_t size, void *vaddr, | ||
520 | dma_addr_t dma_addr) | ||
521 | { | ||
522 | gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); | ||
523 | free_pages((unsigned long)vaddr, get_order(size)); | ||
524 | } | ||
525 | |||
502 | static int no_agp; | 526 | static int no_agp; |
503 | 527 | ||
504 | static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) | 528 | static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) |
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
626 | struct pci_dev *dev; | 650 | struct pci_dev *dev; |
627 | void *gatt; | 651 | void *gatt; |
628 | int i, error; | 652 | int i, error; |
629 | unsigned long start_pfn, end_pfn; | ||
630 | 653 | ||
631 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); | 654 | printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); |
632 | aper_size = aper_base = info->aper_size = 0; | 655 | aper_size = aper_base = info->aper_size = 0; |
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
650 | info->aper_size = aper_size >> 20; | 673 | info->aper_size = aper_size >> 20; |
651 | 674 | ||
652 | gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); | 675 | gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); |
653 | gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); | 676 | gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, |
677 | get_order(gatt_size)); | ||
654 | if (!gatt) | 678 | if (!gatt) |
655 | panic("Cannot allocate GATT table"); | 679 | panic("Cannot allocate GATT table"); |
656 | if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) | 680 | if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) |
657 | panic("Could not set GART PTEs to uncacheable pages"); | 681 | panic("Could not set GART PTEs to uncacheable pages"); |
658 | 682 | ||
659 | memset(gatt, 0, gatt_size); | ||
660 | agp_gatt_table = gatt; | 683 | agp_gatt_table = gatt; |
661 | 684 | ||
662 | enable_gart_translations(); | 685 | enable_gart_translations(); |
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
665 | if (!error) | 688 | if (!error) |
666 | error = sysdev_register(&device_gart); | 689 | error = sysdev_register(&device_gart); |
667 | if (error) | 690 | if (error) |
668 | panic("Could not register gart_sysdev -- would corrupt data on next suspend"); | 691 | panic("Could not register gart_sysdev -- " |
692 | "would corrupt data on next suspend"); | ||
669 | 693 | ||
670 | flush_gart(); | 694 | flush_gart(); |
671 | 695 | ||
672 | printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", | 696 | printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", |
673 | aper_base, aper_size>>10); | 697 | aper_base, aper_size>>10); |
674 | 698 | ||
675 | /* need to map that range */ | ||
676 | end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); | ||
677 | if (end_pfn > max_low_pfn_mapped) { | ||
678 | start_pfn = (aper_base>>PAGE_SHIFT); | ||
679 | init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
680 | } | ||
681 | return 0; | 699 | return 0; |
682 | 700 | ||
683 | nommu: | 701 | nommu: |
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) | |||
687 | return -1; | 705 | return -1; |
688 | } | 706 | } |
689 | 707 | ||
690 | extern int agp_amd64_init(void); | ||
691 | |||
692 | static struct dma_mapping_ops gart_dma_ops = { | 708 | static struct dma_mapping_ops gart_dma_ops = { |
693 | .map_single = gart_map_single, | 709 | .map_single = gart_map_single, |
694 | .map_simple = gart_map_simple, | ||
695 | .unmap_single = gart_unmap_single, | 710 | .unmap_single = gart_unmap_single, |
696 | .sync_single_for_cpu = NULL, | ||
697 | .sync_single_for_device = NULL, | ||
698 | .sync_single_range_for_cpu = NULL, | ||
699 | .sync_single_range_for_device = NULL, | ||
700 | .sync_sg_for_cpu = NULL, | ||
701 | .sync_sg_for_device = NULL, | ||
702 | .map_sg = gart_map_sg, | 711 | .map_sg = gart_map_sg, |
703 | .unmap_sg = gart_unmap_sg, | 712 | .unmap_sg = gart_unmap_sg, |
713 | .alloc_coherent = gart_alloc_coherent, | ||
714 | .free_coherent = gart_free_coherent, | ||
704 | }; | 715 | }; |
705 | 716 | ||
706 | void gart_iommu_shutdown(void) | 717 | void gart_iommu_shutdown(void) |
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void) | |||
727 | { | 738 | { |
728 | struct agp_kern_info info; | 739 | struct agp_kern_info info; |
729 | unsigned long iommu_start; | 740 | unsigned long iommu_start; |
730 | unsigned long aper_size; | 741 | unsigned long aper_base, aper_size; |
742 | unsigned long start_pfn, end_pfn; | ||
731 | unsigned long scratch; | 743 | unsigned long scratch; |
732 | long i; | 744 | long i; |
733 | 745 | ||
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void) | |||
759 | (no_agp && init_k8_gatt(&info) < 0)) { | 771 | (no_agp && init_k8_gatt(&info) < 0)) { |
760 | if (max_pfn > MAX_DMA32_PFN) { | 772 | if (max_pfn > MAX_DMA32_PFN) { |
761 | printk(KERN_WARNING "More than 4GB of memory " | 773 | printk(KERN_WARNING "More than 4GB of memory " |
762 | "but GART IOMMU not available.\n" | 774 | "but GART IOMMU not available.\n"); |
763 | KERN_WARNING "falling back to iommu=soft.\n"); | 775 | printk(KERN_WARNING "falling back to iommu=soft.\n"); |
764 | } | 776 | } |
765 | return; | 777 | return; |
766 | } | 778 | } |
767 | 779 | ||
780 | /* need to map that range */ | ||
781 | aper_size = info.aper_size << 20; | ||
782 | aper_base = info.aper_base; | ||
783 | end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); | ||
784 | if (end_pfn > max_low_pfn_mapped) { | ||
785 | start_pfn = (aper_base>>PAGE_SHIFT); | ||
786 | init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
787 | } | ||
788 | |||
768 | printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); | 789 | printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); |
769 | aper_size = info.aper_size * 1024 * 1024; | ||
770 | iommu_size = check_iommu_size(info.aper_base, aper_size); | 790 | iommu_size = check_iommu_size(info.aper_base, aper_size); |
771 | iommu_pages = iommu_size >> PAGE_SHIFT; | 791 | iommu_pages = iommu_size >> PAGE_SHIFT; |
772 | 792 | ||
773 | iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, | 793 | iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, |
774 | get_order(iommu_pages/8)); | 794 | get_order(iommu_pages/8)); |
775 | if (!iommu_gart_bitmap) | 795 | if (!iommu_gart_bitmap) |
776 | panic("Cannot allocate iommu bitmap\n"); | 796 | panic("Cannot allocate iommu bitmap\n"); |
777 | memset(iommu_gart_bitmap, 0, iommu_pages/8); | ||
778 | 797 | ||
779 | #ifdef CONFIG_IOMMU_LEAK | 798 | #ifdef CONFIG_IOMMU_LEAK |
780 | if (leak_trace) { | 799 | if (leak_trace) { |
781 | iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, | 800 | iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, |
782 | get_order(iommu_pages*sizeof(void *))); | 801 | get_order(iommu_pages*sizeof(void *))); |
783 | if (iommu_leak_tab) | 802 | if (!iommu_leak_tab) |
784 | memset(iommu_leak_tab, 0, iommu_pages * 8); | ||
785 | else | ||
786 | printk(KERN_DEBUG | 803 | printk(KERN_DEBUG |
787 | "PCI-DMA: Cannot allocate leak trace area\n"); | 804 | "PCI-DMA: Cannot allocate leak trace area\n"); |
788 | } | 805 | } |
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void) | |||
792 | * Out of IOMMU space handling. | 809 | * Out of IOMMU space handling. |
793 | * Reserve some invalid pages at the beginning of the GART. | 810 | * Reserve some invalid pages at the beginning of the GART. |
794 | */ | 811 | */ |
795 | set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); | 812 | iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); |
796 | 813 | ||
797 | agp_memory_reserved = iommu_size; | 814 | agp_memory_reserved = iommu_size; |
798 | printk(KERN_INFO | 815 | printk(KERN_INFO |
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p) | |||
850 | if (!strncmp(p, "leak", 4)) { | 867 | if (!strncmp(p, "leak", 4)) { |
851 | leak_trace = 1; | 868 | leak_trace = 1; |
852 | p += 4; | 869 | p += 4; |
853 | if (*p == '=') ++p; | 870 | if (*p == '=') |
871 | ++p; | ||
854 | if (isdigit(*p) && get_option(&p, &arg)) | 872 | if (isdigit(*p) && get_option(&p, &arg)) |
855 | iommu_leak_pages = arg; | 873 | iommu_leak_pages = arg; |
856 | } | 874 | } |
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3f91f71cdc3e..c70ab5a5d4c8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c | |||
@@ -14,7 +14,7 @@ | |||
14 | static int | 14 | static int |
15 | check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) | 15 | check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) |
16 | { | 16 | { |
17 | if (hwdev && bus + size > *hwdev->dma_mask) { | 17 | if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { |
18 | if (*hwdev->dma_mask >= DMA_32BIT_MASK) | 18 | if (*hwdev->dma_mask >= DMA_32BIT_MASK) |
19 | printk(KERN_ERR | 19 | printk(KERN_ERR |
20 | "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", | 20 | "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", |
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, | |||
72 | return nents; | 72 | return nents; |
73 | } | 73 | } |
74 | 74 | ||
75 | static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, | ||
76 | dma_addr_t dma_addr) | ||
77 | { | ||
78 | free_pages((unsigned long)vaddr, get_order(size)); | ||
79 | } | ||
80 | |||
75 | struct dma_mapping_ops nommu_dma_ops = { | 81 | struct dma_mapping_ops nommu_dma_ops = { |
82 | .alloc_coherent = dma_generic_alloc_coherent, | ||
83 | .free_coherent = nommu_free_coherent, | ||
76 | .map_single = nommu_map_single, | 84 | .map_single = nommu_map_single, |
77 | .map_sg = nommu_map_sg, | 85 | .map_sg = nommu_map_sg, |
78 | .is_phys = 1, | 86 | .is_phys = 1, |
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index bc1f2d3ea277..a311ffcaad16 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c | |||
@@ -1,20 +1,13 @@ | |||
1 | #include <linux/platform_device.h> | 1 | #include <linux/platform_device.h> |
2 | #include <linux/errno.h> | 2 | #include <linux/err.h> |
3 | #include <linux/init.h> | 3 | #include <linux/init.h> |
4 | 4 | ||
5 | static __init int add_pcspkr(void) | 5 | static __init int add_pcspkr(void) |
6 | { | 6 | { |
7 | struct platform_device *pd; | 7 | struct platform_device *pd; |
8 | int ret; | ||
9 | 8 | ||
10 | pd = platform_device_alloc("pcspkr", -1); | 9 | pd = platform_device_register_simple("pcspkr", -1, NULL, 0); |
11 | if (!pd) | ||
12 | return -ENOMEM; | ||
13 | 10 | ||
14 | ret = platform_device_add(pd); | 11 | return IS_ERR(pd) ? PTR_ERR(pd) : 0; |
15 | if (ret) | ||
16 | platform_device_put(pd); | ||
17 | |||
18 | return ret; | ||
19 | } | 12 | } |
20 | device_initcall(add_pcspkr); | 13 | device_initcall(add_pcspkr); |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a0..c622772744d8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c | |||
@@ -15,7 +15,6 @@ unsigned long idle_nomwait; | |||
15 | EXPORT_SYMBOL(idle_nomwait); | 15 | EXPORT_SYMBOL(idle_nomwait); |
16 | 16 | ||
17 | struct kmem_cache *task_xstate_cachep; | 17 | struct kmem_cache *task_xstate_cachep; |
18 | static int force_mwait __cpuinitdata; | ||
19 | 18 | ||
20 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | 19 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) |
21 | { | 20 | { |
@@ -185,7 +184,8 @@ static void mwait_idle(void) | |||
185 | static void poll_idle(void) | 184 | static void poll_idle(void) |
186 | { | 185 | { |
187 | local_irq_enable(); | 186 | local_irq_enable(); |
188 | cpu_relax(); | 187 | while (!need_resched()) |
188 | cpu_relax(); | ||
189 | } | 189 | } |
190 | 190 | ||
191 | /* | 191 | /* |
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) | |||
246 | return 1; | 246 | return 1; |
247 | } | 247 | } |
248 | 248 | ||
249 | static cpumask_t c1e_mask = CPU_MASK_NONE; | ||
250 | static int c1e_detected; | ||
251 | |||
252 | void c1e_remove_cpu(int cpu) | ||
253 | { | ||
254 | cpu_clear(cpu, c1e_mask); | ||
255 | } | ||
256 | |||
249 | /* | 257 | /* |
250 | * C1E aware idle routine. We check for C1E active in the interrupt | 258 | * C1E aware idle routine. We check for C1E active in the interrupt |
251 | * pending message MSR. If we detect C1E, then we handle it the same | 259 | * pending message MSR. If we detect C1E, then we handle it the same |
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) | |||
253 | */ | 261 | */ |
254 | static void c1e_idle(void) | 262 | static void c1e_idle(void) |
255 | { | 263 | { |
256 | static cpumask_t c1e_mask = CPU_MASK_NONE; | ||
257 | static int c1e_detected; | ||
258 | |||
259 | if (need_resched()) | 264 | if (need_resched()) |
260 | return; | 265 | return; |
261 | 266 | ||
@@ -265,8 +270,10 @@ static void c1e_idle(void) | |||
265 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); | 270 | rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); |
266 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { | 271 | if (lo & K8_INTP_C1E_ACTIVE_MASK) { |
267 | c1e_detected = 1; | 272 | c1e_detected = 1; |
268 | mark_tsc_unstable("TSC halt in C1E"); | 273 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) |
269 | printk(KERN_INFO "System has C1E enabled\n"); | 274 | mark_tsc_unstable("TSC halt in AMD C1E"); |
275 | printk(KERN_INFO "System has AMD C1E enabled\n"); | ||
276 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); | ||
270 | } | 277 | } |
271 | } | 278 | } |
272 | 279 | ||
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0bc..0a1302fe6d45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/tick.h> | 37 | #include <linux/tick.h> |
38 | #include <linux/percpu.h> | 38 | #include <linux/percpu.h> |
39 | #include <linux/prctl.h> | 39 | #include <linux/prctl.h> |
40 | #include <linux/dmi.h> | ||
40 | 41 | ||
41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
42 | #include <asm/pgtable.h> | 43 | #include <asm/pgtable.h> |
@@ -55,6 +56,9 @@ | |||
55 | #include <asm/tlbflush.h> | 56 | #include <asm/tlbflush.h> |
56 | #include <asm/cpu.h> | 57 | #include <asm/cpu.h> |
57 | #include <asm/kdebug.h> | 58 | #include <asm/kdebug.h> |
59 | #include <asm/idle.h> | ||
60 | #include <asm/syscalls.h> | ||
61 | #include <asm/smp.h> | ||
58 | 62 | ||
59 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); | 63 | asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); |
60 | 64 | ||
@@ -72,46 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk) | |||
72 | return ((unsigned long *)tsk->thread.sp)[3]; | 76 | return ((unsigned long *)tsk->thread.sp)[3]; |
73 | } | 77 | } |
74 | 78 | ||
75 | #ifdef CONFIG_HOTPLUG_CPU | 79 | #ifndef CONFIG_SMP |
76 | #include <asm/nmi.h> | ||
77 | |||
78 | static void cpu_exit_clear(void) | ||
79 | { | ||
80 | int cpu = raw_smp_processor_id(); | ||
81 | |||
82 | idle_task_exit(); | ||
83 | |||
84 | cpu_uninit(); | ||
85 | irq_ctx_exit(cpu); | ||
86 | |||
87 | cpu_clear(cpu, cpu_callout_map); | ||
88 | cpu_clear(cpu, cpu_callin_map); | ||
89 | |||
90 | numa_remove_cpu(cpu); | ||
91 | } | ||
92 | |||
93 | /* We don't actually take CPU down, just spin without interrupts. */ | ||
94 | static inline void play_dead(void) | ||
95 | { | ||
96 | /* This must be done before dead CPU ack */ | ||
97 | cpu_exit_clear(); | ||
98 | mb(); | ||
99 | /* Ack it */ | ||
100 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
101 | |||
102 | /* | ||
103 | * With physical CPU hotplug, we should halt the cpu | ||
104 | */ | ||
105 | local_irq_disable(); | ||
106 | /* mask all interrupts, flush any and all caches, and halt */ | ||
107 | wbinvd_halt(); | ||
108 | } | ||
109 | #else | ||
110 | static inline void play_dead(void) | 80 | static inline void play_dead(void) |
111 | { | 81 | { |
112 | BUG(); | 82 | BUG(); |
113 | } | 83 | } |
114 | #endif /* CONFIG_HOTPLUG_CPU */ | 84 | #endif |
115 | 85 | ||
116 | /* | 86 | /* |
117 | * The idle thread. There's no useful work to be | 87 | * The idle thread. There's no useful work to be |
@@ -153,12 +123,13 @@ void cpu_idle(void) | |||
153 | } | 123 | } |
154 | } | 124 | } |
155 | 125 | ||
156 | void __show_registers(struct pt_regs *regs, int all) | 126 | void __show_regs(struct pt_regs *regs, int all) |
157 | { | 127 | { |
158 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | 128 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; |
159 | unsigned long d0, d1, d2, d3, d6, d7; | 129 | unsigned long d0, d1, d2, d3, d6, d7; |
160 | unsigned long sp; | 130 | unsigned long sp; |
161 | unsigned short ss, gs; | 131 | unsigned short ss, gs; |
132 | const char *board; | ||
162 | 133 | ||
163 | if (user_mode_vm(regs)) { | 134 | if (user_mode_vm(regs)) { |
164 | sp = regs->sp; | 135 | sp = regs->sp; |
@@ -171,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all) | |||
171 | } | 142 | } |
172 | 143 | ||
173 | printk("\n"); | 144 | printk("\n"); |
174 | printk("Pid: %d, comm: %s %s (%s %.*s)\n", | 145 | |
146 | board = dmi_get_system_info(DMI_PRODUCT_NAME); | ||
147 | if (!board) | ||
148 | board = ""; | ||
149 | printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", | ||
175 | task_pid_nr(current), current->comm, | 150 | task_pid_nr(current), current->comm, |
176 | print_tainted(), init_utsname()->release, | 151 | print_tainted(), init_utsname()->release, |
177 | (int)strcspn(init_utsname()->version, " "), | 152 | (int)strcspn(init_utsname()->version, " "), |
178 | init_utsname()->version); | 153 | init_utsname()->version, board); |
179 | 154 | ||
180 | printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", | 155 | printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", |
181 | (u16)regs->cs, regs->ip, regs->flags, | 156 | (u16)regs->cs, regs->ip, regs->flags, |
@@ -214,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all) | |||
214 | 189 | ||
215 | void show_regs(struct pt_regs *regs) | 190 | void show_regs(struct pt_regs *regs) |
216 | { | 191 | { |
217 | __show_registers(regs, 1); | 192 | __show_regs(regs, 1); |
218 | show_trace(NULL, regs, &regs->sp, regs->bp); | 193 | show_trace(NULL, regs, &regs->sp, regs->bp); |
219 | } | 194 | } |
220 | 195 | ||
@@ -275,6 +250,14 @@ void exit_thread(void) | |||
275 | tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; | 250 | tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; |
276 | put_cpu(); | 251 | put_cpu(); |
277 | } | 252 | } |
253 | #ifdef CONFIG_X86_DS | ||
254 | /* Free any DS contexts that have not been properly released. */ | ||
255 | if (unlikely(current->thread.ds_ctx)) { | ||
256 | /* we clear debugctl to make sure DS is not used. */ | ||
257 | update_debugctlmsr(0); | ||
258 | ds_free(current->thread.ds_ctx); | ||
259 | } | ||
260 | #endif /* CONFIG_X86_DS */ | ||
278 | } | 261 | } |
279 | 262 | ||
280 | void flush_thread(void) | 263 | void flush_thread(void) |
@@ -436,6 +419,35 @@ int set_tsc_mode(unsigned int val) | |||
436 | return 0; | 419 | return 0; |
437 | } | 420 | } |
438 | 421 | ||
422 | #ifdef CONFIG_X86_DS | ||
423 | static int update_debugctl(struct thread_struct *prev, | ||
424 | struct thread_struct *next, unsigned long debugctl) | ||
425 | { | ||
426 | unsigned long ds_prev = 0; | ||
427 | unsigned long ds_next = 0; | ||
428 | |||
429 | if (prev->ds_ctx) | ||
430 | ds_prev = (unsigned long)prev->ds_ctx->ds; | ||
431 | if (next->ds_ctx) | ||
432 | ds_next = (unsigned long)next->ds_ctx->ds; | ||
433 | |||
434 | if (ds_next != ds_prev) { | ||
435 | /* we clear debugctl to make sure DS | ||
436 | * is not in use when we change it */ | ||
437 | debugctl = 0; | ||
438 | update_debugctlmsr(0); | ||
439 | wrmsr(MSR_IA32_DS_AREA, ds_next, 0); | ||
440 | } | ||
441 | return debugctl; | ||
442 | } | ||
443 | #else | ||
444 | static int update_debugctl(struct thread_struct *prev, | ||
445 | struct thread_struct *next, unsigned long debugctl) | ||
446 | { | ||
447 | return debugctl; | ||
448 | } | ||
449 | #endif /* CONFIG_X86_DS */ | ||
450 | |||
439 | static noinline void | 451 | static noinline void |
440 | __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | 452 | __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, |
441 | struct tss_struct *tss) | 453 | struct tss_struct *tss) |
@@ -446,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | |||
446 | prev = &prev_p->thread; | 458 | prev = &prev_p->thread; |
447 | next = &next_p->thread; | 459 | next = &next_p->thread; |
448 | 460 | ||
449 | debugctl = prev->debugctlmsr; | 461 | debugctl = update_debugctl(prev, next, prev->debugctlmsr); |
450 | if (next->ds_area_msr != prev->ds_area_msr) { | ||
451 | /* we clear debugctl to make sure DS | ||
452 | * is not in use when we change it */ | ||
453 | debugctl = 0; | ||
454 | update_debugctlmsr(0); | ||
455 | wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); | ||
456 | } | ||
457 | 462 | ||
458 | if (next->debugctlmsr != debugctl) | 463 | if (next->debugctlmsr != debugctl) |
459 | update_debugctlmsr(next->debugctlmsr); | 464 | update_debugctlmsr(next->debugctlmsr); |
@@ -477,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | |||
477 | hard_enable_TSC(); | 482 | hard_enable_TSC(); |
478 | } | 483 | } |
479 | 484 | ||
480 | #ifdef X86_BTS | 485 | #ifdef CONFIG_X86_PTRACE_BTS |
481 | if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) | 486 | if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) |
482 | ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); | 487 | ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); |
483 | 488 | ||
484 | if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) | 489 | if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) |
485 | ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); | 490 | ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); |
486 | #endif | 491 | #endif /* CONFIG_X86_PTRACE_BTS */ |
487 | 492 | ||
488 | 493 | ||
489 | if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { | 494 | if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2a..cd8c0ed02b7e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -37,11 +37,11 @@ | |||
37 | #include <linux/kdebug.h> | 37 | #include <linux/kdebug.h> |
38 | #include <linux/tick.h> | 38 | #include <linux/tick.h> |
39 | #include <linux/prctl.h> | 39 | #include <linux/prctl.h> |
40 | #include <linux/uaccess.h> | ||
41 | #include <linux/io.h> | ||
40 | 42 | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/pgtable.h> | 43 | #include <asm/pgtable.h> |
43 | #include <asm/system.h> | 44 | #include <asm/system.h> |
44 | #include <asm/io.h> | ||
45 | #include <asm/processor.h> | 45 | #include <asm/processor.h> |
46 | #include <asm/i387.h> | 46 | #include <asm/i387.h> |
47 | #include <asm/mmu_context.h> | 47 | #include <asm/mmu_context.h> |
@@ -51,6 +51,7 @@ | |||
51 | #include <asm/proto.h> | 51 | #include <asm/proto.h> |
52 | #include <asm/ia32.h> | 52 | #include <asm/ia32.h> |
53 | #include <asm/idle.h> | 53 | #include <asm/idle.h> |
54 | #include <asm/syscalls.h> | ||
54 | 55 | ||
55 | asmlinkage extern void ret_from_fork(void); | 56 | asmlinkage extern void ret_from_fork(void); |
56 | 57 | ||
@@ -85,28 +86,12 @@ void exit_idle(void) | |||
85 | __exit_idle(); | 86 | __exit_idle(); |
86 | } | 87 | } |
87 | 88 | ||
88 | #ifdef CONFIG_HOTPLUG_CPU | 89 | #ifndef CONFIG_SMP |
89 | DECLARE_PER_CPU(int, cpu_state); | ||
90 | |||
91 | #include <asm/nmi.h> | ||
92 | /* We halt the CPU with physical CPU hotplug */ | ||
93 | static inline void play_dead(void) | ||
94 | { | ||
95 | idle_task_exit(); | ||
96 | mb(); | ||
97 | /* Ack it */ | ||
98 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
99 | |||
100 | local_irq_disable(); | ||
101 | /* mask all interrupts, flush any and all caches, and halt */ | ||
102 | wbinvd_halt(); | ||
103 | } | ||
104 | #else | ||
105 | static inline void play_dead(void) | 90 | static inline void play_dead(void) |
106 | { | 91 | { |
107 | BUG(); | 92 | BUG(); |
108 | } | 93 | } |
109 | #endif /* CONFIG_HOTPLUG_CPU */ | 94 | #endif |
110 | 95 | ||
111 | /* | 96 | /* |
112 | * The idle thread. There's no useful work to be | 97 | * The idle thread. There's no useful work to be |
@@ -151,7 +136,7 @@ void cpu_idle(void) | |||
151 | } | 136 | } |
152 | 137 | ||
153 | /* Prints also some state that isn't saved in the pt_regs */ | 138 | /* Prints also some state that isn't saved in the pt_regs */ |
154 | void __show_regs(struct pt_regs * regs) | 139 | void __show_regs(struct pt_regs *regs, int all) |
155 | { | 140 | { |
156 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | 141 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; |
157 | unsigned long d0, d1, d2, d3, d6, d7; | 142 | unsigned long d0, d1, d2, d3, d6, d7; |
@@ -160,60 +145,65 @@ void __show_regs(struct pt_regs * regs) | |||
160 | 145 | ||
161 | printk("\n"); | 146 | printk("\n"); |
162 | print_modules(); | 147 | print_modules(); |
163 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | 148 | printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", |
164 | current->pid, current->comm, print_tainted(), | 149 | current->pid, current->comm, print_tainted(), |
165 | init_utsname()->release, | 150 | init_utsname()->release, |
166 | (int)strcspn(init_utsname()->version, " "), | 151 | (int)strcspn(init_utsname()->version, " "), |
167 | init_utsname()->version); | 152 | init_utsname()->version); |
168 | printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); | 153 | printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); |
169 | printk_address(regs->ip, 1); | 154 | printk_address(regs->ip, 1); |
170 | printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, | 155 | printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, |
171 | regs->flags); | 156 | regs->sp, regs->flags); |
172 | printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", | 157 | printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", |
173 | regs->ax, regs->bx, regs->cx); | 158 | regs->ax, regs->bx, regs->cx); |
174 | printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", | 159 | printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", |
175 | regs->dx, regs->si, regs->di); | 160 | regs->dx, regs->si, regs->di); |
176 | printk("RBP: %016lx R08: %016lx R09: %016lx\n", | 161 | printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", |
177 | regs->bp, regs->r8, regs->r9); | 162 | regs->bp, regs->r8, regs->r9); |
178 | printk("R10: %016lx R11: %016lx R12: %016lx\n", | 163 | printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", |
179 | regs->r10, regs->r11, regs->r12); | 164 | regs->r10, regs->r11, regs->r12); |
180 | printk("R13: %016lx R14: %016lx R15: %016lx\n", | 165 | printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", |
181 | regs->r13, regs->r14, regs->r15); | 166 | regs->r13, regs->r14, regs->r15); |
182 | 167 | ||
183 | asm("movl %%ds,%0" : "=r" (ds)); | 168 | asm("movl %%ds,%0" : "=r" (ds)); |
184 | asm("movl %%cs,%0" : "=r" (cs)); | 169 | asm("movl %%cs,%0" : "=r" (cs)); |
185 | asm("movl %%es,%0" : "=r" (es)); | 170 | asm("movl %%es,%0" : "=r" (es)); |
186 | asm("movl %%fs,%0" : "=r" (fsindex)); | 171 | asm("movl %%fs,%0" : "=r" (fsindex)); |
187 | asm("movl %%gs,%0" : "=r" (gsindex)); | 172 | asm("movl %%gs,%0" : "=r" (gsindex)); |
188 | 173 | ||
189 | rdmsrl(MSR_FS_BASE, fs); | 174 | rdmsrl(MSR_FS_BASE, fs); |
190 | rdmsrl(MSR_GS_BASE, gs); | 175 | rdmsrl(MSR_GS_BASE, gs); |
191 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | 176 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); |
177 | |||
178 | if (!all) | ||
179 | return; | ||
192 | 180 | ||
193 | cr0 = read_cr0(); | 181 | cr0 = read_cr0(); |
194 | cr2 = read_cr2(); | 182 | cr2 = read_cr2(); |
195 | cr3 = read_cr3(); | 183 | cr3 = read_cr3(); |
196 | cr4 = read_cr4(); | 184 | cr4 = read_cr4(); |
197 | 185 | ||
198 | printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", | 186 | printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", |
199 | fs,fsindex,gs,gsindex,shadowgs); | 187 | fs, fsindex, gs, gsindex, shadowgs); |
200 | printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); | 188 | printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, |
201 | printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); | 189 | es, cr0); |
190 | printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, | ||
191 | cr4); | ||
202 | 192 | ||
203 | get_debugreg(d0, 0); | 193 | get_debugreg(d0, 0); |
204 | get_debugreg(d1, 1); | 194 | get_debugreg(d1, 1); |
205 | get_debugreg(d2, 2); | 195 | get_debugreg(d2, 2); |
206 | printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); | 196 | printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); |
207 | get_debugreg(d3, 3); | 197 | get_debugreg(d3, 3); |
208 | get_debugreg(d6, 6); | 198 | get_debugreg(d6, 6); |
209 | get_debugreg(d7, 7); | 199 | get_debugreg(d7, 7); |
210 | printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); | 200 | printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); |
211 | } | 201 | } |
212 | 202 | ||
213 | void show_regs(struct pt_regs *regs) | 203 | void show_regs(struct pt_regs *regs) |
214 | { | 204 | { |
215 | printk("CPU %d:", smp_processor_id()); | 205 | printk(KERN_INFO "CPU %d:", smp_processor_id()); |
216 | __show_regs(regs); | 206 | __show_regs(regs, 1); |
217 | show_trace(NULL, regs, (void *)(regs + 1), regs->bp); | 207 | show_trace(NULL, regs, (void *)(regs + 1), regs->bp); |
218 | } | 208 | } |
219 | 209 | ||
@@ -238,6 +228,14 @@ void exit_thread(void) | |||
238 | t->io_bitmap_max = 0; | 228 | t->io_bitmap_max = 0; |
239 | put_cpu(); | 229 | put_cpu(); |
240 | } | 230 | } |
231 | #ifdef CONFIG_X86_DS | ||
232 | /* Free any DS contexts that have not been properly released. */ | ||
233 | if (unlikely(t->ds_ctx)) { | ||
234 | /* we clear debugctl to make sure DS is not used. */ | ||
235 | update_debugctlmsr(0); | ||
236 | ds_free(t->ds_ctx); | ||
237 | } | ||
238 | #endif /* CONFIG_X86_DS */ | ||
241 | } | 239 | } |
242 | 240 | ||
243 | void flush_thread(void) | 241 | void flush_thread(void) |
@@ -313,10 +311,10 @@ void prepare_to_copy(struct task_struct *tsk) | |||
313 | 311 | ||
314 | int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, | 312 | int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, |
315 | unsigned long unused, | 313 | unsigned long unused, |
316 | struct task_struct * p, struct pt_regs * regs) | 314 | struct task_struct *p, struct pt_regs *regs) |
317 | { | 315 | { |
318 | int err; | 316 | int err; |
319 | struct pt_regs * childregs; | 317 | struct pt_regs *childregs; |
320 | struct task_struct *me = current; | 318 | struct task_struct *me = current; |
321 | 319 | ||
322 | childregs = ((struct pt_regs *) | 320 | childregs = ((struct pt_regs *) |
@@ -361,10 +359,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, | |||
361 | if (test_thread_flag(TIF_IA32)) | 359 | if (test_thread_flag(TIF_IA32)) |
362 | err = do_set_thread_area(p, -1, | 360 | err = do_set_thread_area(p, -1, |
363 | (struct user_desc __user *)childregs->si, 0); | 361 | (struct user_desc __user *)childregs->si, 0); |
364 | else | 362 | else |
365 | #endif | 363 | #endif |
366 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); | 364 | err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); |
367 | if (err) | 365 | if (err) |
368 | goto out; | 366 | goto out; |
369 | } | 367 | } |
370 | err = 0; | 368 | err = 0; |
@@ -471,13 +469,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, | |||
471 | next = &next_p->thread; | 469 | next = &next_p->thread; |
472 | 470 | ||
473 | debugctl = prev->debugctlmsr; | 471 | debugctl = prev->debugctlmsr; |
474 | if (next->ds_area_msr != prev->ds_area_msr) { | 472 | |
475 | /* we clear debugctl to make sure DS | 473 | #ifdef CONFIG_X86_DS |
476 | * is not in use when we change it */ | 474 | { |
477 | debugctl = 0; | 475 | unsigned long ds_prev = 0, ds_next = 0; |
478 | update_debugctlmsr(0); | 476 | |
479 | wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); | 477 | if (prev->ds_ctx) |
478 | ds_prev = (unsigned long)prev->ds_ctx->ds; | ||
479 | if (next->ds_ctx) | ||
480 | ds_next = (unsigned long)next->ds_ctx->ds; | ||
481 | |||
482 | if (ds_next != ds_prev) { | ||
483 | /* | ||
484 | * We clear debugctl to make sure DS | ||
485 | * is not in use when we change it: | ||
486 | */ | ||
487 | debugctl = 0; | ||
488 | update_debugctlmsr(0); | ||
489 | wrmsrl(MSR_IA32_DS_AREA, ds_next); | ||
490 | } | ||
480 | } | 491 | } |
492 | #endif /* CONFIG_X86_DS */ | ||
481 | 493 | ||
482 | if (next->debugctlmsr != debugctl) | 494 | if (next->debugctlmsr != debugctl) |
483 | update_debugctlmsr(next->debugctlmsr); | 495 | update_debugctlmsr(next->debugctlmsr); |
@@ -515,13 +527,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, | |||
515 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); | 527 | memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); |
516 | } | 528 | } |
517 | 529 | ||
518 | #ifdef X86_BTS | 530 | #ifdef CONFIG_X86_PTRACE_BTS |
519 | if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) | 531 | if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) |
520 | ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); | 532 | ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); |
521 | 533 | ||
522 | if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) | 534 | if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) |
523 | ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); | 535 | ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); |
524 | #endif | 536 | #endif /* CONFIG_X86_PTRACE_BTS */ |
525 | } | 537 | } |
526 | 538 | ||
527 | /* | 539 | /* |
@@ -543,7 +555,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
543 | unsigned fsindex, gsindex; | 555 | unsigned fsindex, gsindex; |
544 | 556 | ||
545 | /* we're going to use this soon, after a few expensive things */ | 557 | /* we're going to use this soon, after a few expensive things */ |
546 | if (next_p->fpu_counter>5) | 558 | if (next_p->fpu_counter > 5) |
547 | prefetch(next->xstate); | 559 | prefetch(next->xstate); |
548 | 560 | ||
549 | /* | 561 | /* |
@@ -551,13 +563,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
551 | */ | 563 | */ |
552 | load_sp0(tss, next); | 564 | load_sp0(tss, next); |
553 | 565 | ||
554 | /* | 566 | /* |
555 | * Switch DS and ES. | 567 | * Switch DS and ES. |
556 | * This won't pick up thread selector changes, but I guess that is ok. | 568 | * This won't pick up thread selector changes, but I guess that is ok. |
557 | */ | 569 | */ |
558 | savesegment(es, prev->es); | 570 | savesegment(es, prev->es); |
559 | if (unlikely(next->es | prev->es)) | 571 | if (unlikely(next->es | prev->es)) |
560 | loadsegment(es, next->es); | 572 | loadsegment(es, next->es); |
561 | 573 | ||
562 | savesegment(ds, prev->ds); | 574 | savesegment(ds, prev->ds); |
563 | if (unlikely(next->ds | prev->ds)) | 575 | if (unlikely(next->ds | prev->ds)) |
@@ -583,7 +595,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
583 | */ | 595 | */ |
584 | arch_leave_lazy_cpu_mode(); | 596 | arch_leave_lazy_cpu_mode(); |
585 | 597 | ||
586 | /* | 598 | /* |
587 | * Switch FS and GS. | 599 | * Switch FS and GS. |
588 | * | 600 | * |
589 | * Segment register != 0 always requires a reload. Also | 601 | * Segment register != 0 always requires a reload. Also |
@@ -592,13 +604,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
592 | */ | 604 | */ |
593 | if (unlikely(fsindex | next->fsindex | prev->fs)) { | 605 | if (unlikely(fsindex | next->fsindex | prev->fs)) { |
594 | loadsegment(fs, next->fsindex); | 606 | loadsegment(fs, next->fsindex); |
595 | /* | 607 | /* |
596 | * Check if the user used a selector != 0; if yes | 608 | * Check if the user used a selector != 0; if yes |
597 | * clear 64bit base, since overloaded base is always | 609 | * clear 64bit base, since overloaded base is always |
598 | * mapped to the Null selector | 610 | * mapped to the Null selector |
599 | */ | 611 | */ |
600 | if (fsindex) | 612 | if (fsindex) |
601 | prev->fs = 0; | 613 | prev->fs = 0; |
602 | } | 614 | } |
603 | /* when next process has a 64bit base use it */ | 615 | /* when next process has a 64bit base use it */ |
604 | if (next->fs) | 616 | if (next->fs) |
@@ -608,7 +620,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
608 | if (unlikely(gsindex | next->gsindex | prev->gs)) { | 620 | if (unlikely(gsindex | next->gsindex | prev->gs)) { |
609 | load_gs_index(next->gsindex); | 621 | load_gs_index(next->gsindex); |
610 | if (gsindex) | 622 | if (gsindex) |
611 | prev->gs = 0; | 623 | prev->gs = 0; |
612 | } | 624 | } |
613 | if (next->gs) | 625 | if (next->gs) |
614 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); | 626 | wrmsrl(MSR_KERNEL_GS_BASE, next->gs); |
@@ -617,12 +629,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
617 | /* Must be after DS reload */ | 629 | /* Must be after DS reload */ |
618 | unlazy_fpu(prev_p); | 630 | unlazy_fpu(prev_p); |
619 | 631 | ||
620 | /* | 632 | /* |
621 | * Switch the PDA and FPU contexts. | 633 | * Switch the PDA and FPU contexts. |
622 | */ | 634 | */ |
623 | prev->usersp = read_pda(oldrsp); | 635 | prev->usersp = read_pda(oldrsp); |
624 | write_pda(oldrsp, next->usersp); | 636 | write_pda(oldrsp, next->usersp); |
625 | write_pda(pcurrent, next_p); | 637 | write_pda(pcurrent, next_p); |
626 | 638 | ||
627 | write_pda(kernelstack, | 639 | write_pda(kernelstack, |
628 | (unsigned long)task_stack_page(next_p) + | 640 | (unsigned long)task_stack_page(next_p) + |
@@ -663,7 +675,7 @@ long sys_execve(char __user *name, char __user * __user *argv, | |||
663 | char __user * __user *envp, struct pt_regs *regs) | 675 | char __user * __user *envp, struct pt_regs *regs) |
664 | { | 676 | { |
665 | long error; | 677 | long error; |
666 | char * filename; | 678 | char *filename; |
667 | 679 | ||
668 | filename = getname(name); | 680 | filename = getname(name); |
669 | error = PTR_ERR(filename); | 681 | error = PTR_ERR(filename); |
@@ -721,55 +733,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs) | |||
721 | unsigned long get_wchan(struct task_struct *p) | 733 | unsigned long get_wchan(struct task_struct *p) |
722 | { | 734 | { |
723 | unsigned long stack; | 735 | unsigned long stack; |
724 | u64 fp,ip; | 736 | u64 fp, ip; |
725 | int count = 0; | 737 | int count = 0; |
726 | 738 | ||
727 | if (!p || p == current || p->state==TASK_RUNNING) | 739 | if (!p || p == current || p->state == TASK_RUNNING) |
728 | return 0; | 740 | return 0; |
729 | stack = (unsigned long)task_stack_page(p); | 741 | stack = (unsigned long)task_stack_page(p); |
730 | if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) | 742 | if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) |
731 | return 0; | 743 | return 0; |
732 | fp = *(u64 *)(p->thread.sp); | 744 | fp = *(u64 *)(p->thread.sp); |
733 | do { | 745 | do { |
734 | if (fp < (unsigned long)stack || | 746 | if (fp < (unsigned long)stack || |
735 | fp > (unsigned long)stack+THREAD_SIZE) | 747 | fp >= (unsigned long)stack+THREAD_SIZE) |
736 | return 0; | 748 | return 0; |
737 | ip = *(u64 *)(fp+8); | 749 | ip = *(u64 *)(fp+8); |
738 | if (!in_sched_functions(ip)) | 750 | if (!in_sched_functions(ip)) |
739 | return ip; | 751 | return ip; |
740 | fp = *(u64 *)fp; | 752 | fp = *(u64 *)fp; |
741 | } while (count++ < 16); | 753 | } while (count++ < 16); |
742 | return 0; | 754 | return 0; |
743 | } | 755 | } |
744 | 756 | ||
745 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | 757 | long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) |
746 | { | 758 | { |
747 | int ret = 0; | 759 | int ret = 0; |
748 | int doit = task == current; | 760 | int doit = task == current; |
749 | int cpu; | 761 | int cpu; |
750 | 762 | ||
751 | switch (code) { | 763 | switch (code) { |
752 | case ARCH_SET_GS: | 764 | case ARCH_SET_GS: |
753 | if (addr >= TASK_SIZE_OF(task)) | 765 | if (addr >= TASK_SIZE_OF(task)) |
754 | return -EPERM; | 766 | return -EPERM; |
755 | cpu = get_cpu(); | 767 | cpu = get_cpu(); |
756 | /* handle small bases via the GDT because that's faster to | 768 | /* handle small bases via the GDT because that's faster to |
757 | switch. */ | 769 | switch. */ |
758 | if (addr <= 0xffffffff) { | 770 | if (addr <= 0xffffffff) { |
759 | set_32bit_tls(task, GS_TLS, addr); | 771 | set_32bit_tls(task, GS_TLS, addr); |
760 | if (doit) { | 772 | if (doit) { |
761 | load_TLS(&task->thread, cpu); | 773 | load_TLS(&task->thread, cpu); |
762 | load_gs_index(GS_TLS_SEL); | 774 | load_gs_index(GS_TLS_SEL); |
763 | } | 775 | } |
764 | task->thread.gsindex = GS_TLS_SEL; | 776 | task->thread.gsindex = GS_TLS_SEL; |
765 | task->thread.gs = 0; | 777 | task->thread.gs = 0; |
766 | } else { | 778 | } else { |
767 | task->thread.gsindex = 0; | 779 | task->thread.gsindex = 0; |
768 | task->thread.gs = addr; | 780 | task->thread.gs = addr; |
769 | if (doit) { | 781 | if (doit) { |
770 | load_gs_index(0); | 782 | load_gs_index(0); |
771 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); | 783 | ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); |
772 | } | 784 | } |
773 | } | 785 | } |
774 | put_cpu(); | 786 | put_cpu(); |
775 | break; | 787 | break; |
@@ -823,8 +835,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) | |||
823 | rdmsrl(MSR_KERNEL_GS_BASE, base); | 835 | rdmsrl(MSR_KERNEL_GS_BASE, base); |
824 | else | 836 | else |
825 | base = task->thread.gs; | 837 | base = task->thread.gs; |
826 | } | 838 | } else |
827 | else | ||
828 | base = task->thread.gs; | 839 | base = task->thread.gs; |
829 | ret = put_user(base, (unsigned long __user *)addr); | 840 | ret = put_user(base, (unsigned long __user *)addr); |
830 | break; | 841 | break; |
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e37dccce85db..0a6d8c12e10d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/errno.h> | 14 | #include <linux/errno.h> |
15 | #include <linux/ptrace.h> | 15 | #include <linux/ptrace.h> |
16 | #include <linux/regset.h> | 16 | #include <linux/regset.h> |
17 | #include <linux/tracehook.h> | ||
17 | #include <linux/user.h> | 18 | #include <linux/user.h> |
18 | #include <linux/elf.h> | 19 | #include <linux/elf.h> |
19 | #include <linux/security.h> | 20 | #include <linux/security.h> |
@@ -39,7 +40,9 @@ enum x86_regset { | |||
39 | REGSET_GENERAL, | 40 | REGSET_GENERAL, |
40 | REGSET_FP, | 41 | REGSET_FP, |
41 | REGSET_XFP, | 42 | REGSET_XFP, |
43 | REGSET_IOPERM64 = REGSET_XFP, | ||
42 | REGSET_TLS, | 44 | REGSET_TLS, |
45 | REGSET_IOPERM32, | ||
43 | }; | 46 | }; |
44 | 47 | ||
45 | /* | 48 | /* |
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value) | |||
69 | 72 | ||
70 | #define FLAG_MASK FLAG_MASK_32 | 73 | #define FLAG_MASK FLAG_MASK_32 |
71 | 74 | ||
72 | static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) | 75 | static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) |
73 | { | 76 | { |
74 | BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); | 77 | BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); |
75 | regno >>= 2; | 78 | regno >>= 2; |
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child, | |||
554 | return 0; | 557 | return 0; |
555 | } | 558 | } |
556 | 559 | ||
557 | #ifdef X86_BTS | 560 | /* |
561 | * These access the current or another (stopped) task's io permission | ||
562 | * bitmap for debugging or core dump. | ||
563 | */ | ||
564 | static int ioperm_active(struct task_struct *target, | ||
565 | const struct user_regset *regset) | ||
566 | { | ||
567 | return target->thread.io_bitmap_max / regset->size; | ||
568 | } | ||
558 | 569 | ||
559 | static int ptrace_bts_get_size(struct task_struct *child) | 570 | static int ioperm_get(struct task_struct *target, |
571 | const struct user_regset *regset, | ||
572 | unsigned int pos, unsigned int count, | ||
573 | void *kbuf, void __user *ubuf) | ||
560 | { | 574 | { |
561 | if (!child->thread.ds_area_msr) | 575 | if (!target->thread.io_bitmap_ptr) |
562 | return -ENXIO; | 576 | return -ENXIO; |
563 | 577 | ||
564 | return ds_get_bts_index((void *)child->thread.ds_area_msr); | 578 | return user_regset_copyout(&pos, &count, &kbuf, &ubuf, |
579 | target->thread.io_bitmap_ptr, | ||
580 | 0, IO_BITMAP_BYTES); | ||
581 | } | ||
582 | |||
583 | #ifdef CONFIG_X86_PTRACE_BTS | ||
584 | /* | ||
585 | * The configuration for a particular BTS hardware implementation. | ||
586 | */ | ||
587 | struct bts_configuration { | ||
588 | /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ | ||
589 | unsigned char sizeof_bts; | ||
590 | /* the size of a field in the BTS record in bytes */ | ||
591 | unsigned char sizeof_field; | ||
592 | /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ | ||
593 | unsigned long debugctl_mask; | ||
594 | }; | ||
595 | static struct bts_configuration bts_cfg; | ||
596 | |||
597 | #define BTS_MAX_RECORD_SIZE (8 * 3) | ||
598 | |||
599 | |||
600 | /* | ||
601 | * Branch Trace Store (BTS) uses the following format. Different | ||
602 | * architectures vary in the size of those fields. | ||
603 | * - source linear address | ||
604 | * - destination linear address | ||
605 | * - flags | ||
606 | * | ||
607 | * Later architectures use 64bit pointers throughout, whereas earlier | ||
608 | * architectures use 32bit pointers in 32bit mode. | ||
609 | * | ||
610 | * We compute the base address for the first 8 fields based on: | ||
611 | * - the field size stored in the DS configuration | ||
612 | * - the relative field position | ||
613 | * | ||
614 | * In order to store additional information in the BTS buffer, we use | ||
615 | * a special source address to indicate that the record requires | ||
616 | * special interpretation. | ||
617 | * | ||
618 | * Netburst indicated via a bit in the flags field whether the branch | ||
619 | * was predicted; this is ignored. | ||
620 | */ | ||
621 | |||
622 | enum bts_field { | ||
623 | bts_from = 0, | ||
624 | bts_to, | ||
625 | bts_flags, | ||
626 | |||
627 | bts_escape = (unsigned long)-1, | ||
628 | bts_qual = bts_to, | ||
629 | bts_jiffies = bts_flags | ||
630 | }; | ||
631 | |||
632 | static inline unsigned long bts_get(const char *base, enum bts_field field) | ||
633 | { | ||
634 | base += (bts_cfg.sizeof_field * field); | ||
635 | return *(unsigned long *)base; | ||
636 | } | ||
637 | |||
638 | static inline void bts_set(char *base, enum bts_field field, unsigned long val) | ||
639 | { | ||
640 | base += (bts_cfg.sizeof_field * field);; | ||
641 | (*(unsigned long *)base) = val; | ||
642 | } | ||
643 | |||
644 | /* | ||
645 | * Translate a BTS record from the raw format into the bts_struct format | ||
646 | * | ||
647 | * out (out): bts_struct interpretation | ||
648 | * raw: raw BTS record | ||
649 | */ | ||
650 | static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) | ||
651 | { | ||
652 | memset(out, 0, sizeof(*out)); | ||
653 | if (bts_get(raw, bts_from) == bts_escape) { | ||
654 | out->qualifier = bts_get(raw, bts_qual); | ||
655 | out->variant.jiffies = bts_get(raw, bts_jiffies); | ||
656 | } else { | ||
657 | out->qualifier = BTS_BRANCH; | ||
658 | out->variant.lbr.from_ip = bts_get(raw, bts_from); | ||
659 | out->variant.lbr.to_ip = bts_get(raw, bts_to); | ||
660 | } | ||
565 | } | 661 | } |
566 | 662 | ||
567 | static int ptrace_bts_read_record(struct task_struct *child, | 663 | static int ptrace_bts_read_record(struct task_struct *child, size_t index, |
568 | long index, | ||
569 | struct bts_struct __user *out) | 664 | struct bts_struct __user *out) |
570 | { | 665 | { |
571 | struct bts_struct ret; | 666 | struct bts_struct ret; |
572 | int retval; | 667 | const void *bts_record; |
573 | int bts_end; | 668 | size_t bts_index, bts_end; |
574 | int bts_index; | 669 | int error; |
575 | |||
576 | if (!child->thread.ds_area_msr) | ||
577 | return -ENXIO; | ||
578 | 670 | ||
579 | if (index < 0) | 671 | error = ds_get_bts_end(child, &bts_end); |
580 | return -EINVAL; | 672 | if (error < 0) |
673 | return error; | ||
581 | 674 | ||
582 | bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); | ||
583 | if (bts_end <= index) | 675 | if (bts_end <= index) |
584 | return -EINVAL; | 676 | return -EINVAL; |
585 | 677 | ||
678 | error = ds_get_bts_index(child, &bts_index); | ||
679 | if (error < 0) | ||
680 | return error; | ||
681 | |||
586 | /* translate the ptrace bts index into the ds bts index */ | 682 | /* translate the ptrace bts index into the ds bts index */ |
587 | bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); | 683 | bts_index += bts_end - (index + 1); |
588 | bts_index -= (index + 1); | 684 | if (bts_end <= bts_index) |
589 | if (bts_index < 0) | 685 | bts_index -= bts_end; |
590 | bts_index += bts_end; | 686 | |
687 | error = ds_access_bts(child, bts_index, &bts_record); | ||
688 | if (error < 0) | ||
689 | return error; | ||
591 | 690 | ||
592 | retval = ds_read_bts((void *)child->thread.ds_area_msr, | 691 | ptrace_bts_translate_record(&ret, bts_record); |
593 | bts_index, &ret); | ||
594 | if (retval < 0) | ||
595 | return retval; | ||
596 | 692 | ||
597 | if (copy_to_user(out, &ret, sizeof(ret))) | 693 | if (copy_to_user(out, &ret, sizeof(ret))) |
598 | return -EFAULT; | 694 | return -EFAULT; |
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child, | |||
600 | return sizeof(ret); | 696 | return sizeof(ret); |
601 | } | 697 | } |
602 | 698 | ||
603 | static int ptrace_bts_clear(struct task_struct *child) | ||
604 | { | ||
605 | if (!child->thread.ds_area_msr) | ||
606 | return -ENXIO; | ||
607 | |||
608 | return ds_clear((void *)child->thread.ds_area_msr); | ||
609 | } | ||
610 | |||
611 | static int ptrace_bts_drain(struct task_struct *child, | 699 | static int ptrace_bts_drain(struct task_struct *child, |
612 | long size, | 700 | long size, |
613 | struct bts_struct __user *out) | 701 | struct bts_struct __user *out) |
614 | { | 702 | { |
615 | int end, i; | 703 | struct bts_struct ret; |
616 | void *ds = (void *)child->thread.ds_area_msr; | 704 | const unsigned char *raw; |
617 | 705 | size_t end, i; | |
618 | if (!ds) | 706 | int error; |
619 | return -ENXIO; | ||
620 | 707 | ||
621 | end = ds_get_bts_index(ds); | 708 | error = ds_get_bts_index(child, &end); |
622 | if (end <= 0) | 709 | if (error < 0) |
623 | return end; | 710 | return error; |
624 | 711 | ||
625 | if (size < (end * sizeof(struct bts_struct))) | 712 | if (size < (end * sizeof(struct bts_struct))) |
626 | return -EIO; | 713 | return -EIO; |
627 | 714 | ||
628 | for (i = 0; i < end; i++, out++) { | 715 | error = ds_access_bts(child, 0, (const void **)&raw); |
629 | struct bts_struct ret; | 716 | if (error < 0) |
630 | int retval; | 717 | return error; |
631 | 718 | ||
632 | retval = ds_read_bts(ds, i, &ret); | 719 | for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { |
633 | if (retval < 0) | 720 | ptrace_bts_translate_record(&ret, raw); |
634 | return retval; | ||
635 | 721 | ||
636 | if (copy_to_user(out, &ret, sizeof(ret))) | 722 | if (copy_to_user(out, &ret, sizeof(ret))) |
637 | return -EFAULT; | 723 | return -EFAULT; |
638 | } | 724 | } |
639 | 725 | ||
640 | ds_clear(ds); | 726 | error = ds_clear_bts(child); |
727 | if (error < 0) | ||
728 | return error; | ||
641 | 729 | ||
642 | return end; | 730 | return end; |
643 | } | 731 | } |
644 | 732 | ||
733 | static void ptrace_bts_ovfl(struct task_struct *child) | ||
734 | { | ||
735 | send_sig(child->thread.bts_ovfl_signal, child, 0); | ||
736 | } | ||
737 | |||
645 | static int ptrace_bts_config(struct task_struct *child, | 738 | static int ptrace_bts_config(struct task_struct *child, |
646 | long cfg_size, | 739 | long cfg_size, |
647 | const struct ptrace_bts_config __user *ucfg) | 740 | const struct ptrace_bts_config __user *ucfg) |
648 | { | 741 | { |
649 | struct ptrace_bts_config cfg; | 742 | struct ptrace_bts_config cfg; |
650 | int bts_size, ret = 0; | 743 | int error = 0; |
651 | void *ds; | ||
652 | 744 | ||
745 | error = -EOPNOTSUPP; | ||
746 | if (!bts_cfg.sizeof_bts) | ||
747 | goto errout; | ||
748 | |||
749 | error = -EIO; | ||
653 | if (cfg_size < sizeof(cfg)) | 750 | if (cfg_size < sizeof(cfg)) |
654 | return -EIO; | 751 | goto errout; |
655 | 752 | ||
753 | error = -EFAULT; | ||
656 | if (copy_from_user(&cfg, ucfg, sizeof(cfg))) | 754 | if (copy_from_user(&cfg, ucfg, sizeof(cfg))) |
657 | return -EFAULT; | 755 | goto errout; |
658 | 756 | ||
659 | if ((int)cfg.size < 0) | 757 | error = -EINVAL; |
660 | return -EINVAL; | 758 | if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && |
759 | !(cfg.flags & PTRACE_BTS_O_ALLOC)) | ||
760 | goto errout; | ||
661 | 761 | ||
662 | bts_size = 0; | 762 | if (cfg.flags & PTRACE_BTS_O_ALLOC) { |
663 | ds = (void *)child->thread.ds_area_msr; | 763 | ds_ovfl_callback_t ovfl = NULL; |
664 | if (ds) { | 764 | unsigned int sig = 0; |
665 | bts_size = ds_get_bts_size(ds); | 765 | |
666 | if (bts_size < 0) | 766 | /* we ignore the error in case we were not tracing child */ |
667 | return bts_size; | 767 | (void)ds_release_bts(child); |
668 | } | 768 | |
669 | cfg.size = PAGE_ALIGN(cfg.size); | 769 | if (cfg.flags & PTRACE_BTS_O_SIGNAL) { |
770 | if (!cfg.signal) | ||
771 | goto errout; | ||
772 | |||
773 | sig = cfg.signal; | ||
774 | ovfl = ptrace_bts_ovfl; | ||
775 | } | ||
670 | 776 | ||
671 | if (bts_size != cfg.size) { | 777 | error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); |
672 | ret = ptrace_bts_realloc(child, cfg.size, | 778 | if (error < 0) |
673 | cfg.flags & PTRACE_BTS_O_CUT_SIZE); | ||
674 | if (ret < 0) | ||
675 | goto errout; | 779 | goto errout; |
676 | 780 | ||
677 | ds = (void *)child->thread.ds_area_msr; | 781 | child->thread.bts_ovfl_signal = sig; |
678 | } | 782 | } |
679 | 783 | ||
680 | if (cfg.flags & PTRACE_BTS_O_SIGNAL) | 784 | error = -EINVAL; |
681 | ret = ds_set_overflow(ds, DS_O_SIGNAL); | 785 | if (!child->thread.ds_ctx && cfg.flags) |
682 | else | ||
683 | ret = ds_set_overflow(ds, DS_O_WRAP); | ||
684 | if (ret < 0) | ||
685 | goto errout; | 786 | goto errout; |
686 | 787 | ||
687 | if (cfg.flags & PTRACE_BTS_O_TRACE) | 788 | if (cfg.flags & PTRACE_BTS_O_TRACE) |
688 | child->thread.debugctlmsr |= ds_debugctl_mask(); | 789 | child->thread.debugctlmsr |= bts_cfg.debugctl_mask; |
689 | else | 790 | else |
690 | child->thread.debugctlmsr &= ~ds_debugctl_mask(); | 791 | child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; |
691 | 792 | ||
692 | if (cfg.flags & PTRACE_BTS_O_SCHED) | 793 | if (cfg.flags & PTRACE_BTS_O_SCHED) |
693 | set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); | 794 | set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); |
694 | else | 795 | else |
695 | clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); | 796 | clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); |
696 | 797 | ||
697 | ret = sizeof(cfg); | 798 | error = sizeof(cfg); |
698 | 799 | ||
699 | out: | 800 | out: |
700 | if (child->thread.debugctlmsr) | 801 | if (child->thread.debugctlmsr) |
@@ -702,10 +803,10 @@ out: | |||
702 | else | 803 | else |
703 | clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); | 804 | clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); |
704 | 805 | ||
705 | return ret; | 806 | return error; |
706 | 807 | ||
707 | errout: | 808 | errout: |
708 | child->thread.debugctlmsr &= ~ds_debugctl_mask(); | 809 | child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; |
709 | clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); | 810 | clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); |
710 | goto out; | 811 | goto out; |
711 | } | 812 | } |
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child, | |||
714 | long cfg_size, | 815 | long cfg_size, |
715 | struct ptrace_bts_config __user *ucfg) | 816 | struct ptrace_bts_config __user *ucfg) |
716 | { | 817 | { |
717 | void *ds = (void *)child->thread.ds_area_msr; | ||
718 | struct ptrace_bts_config cfg; | 818 | struct ptrace_bts_config cfg; |
819 | size_t end; | ||
820 | const void *base, *max; | ||
821 | int error; | ||
719 | 822 | ||
720 | if (cfg_size < sizeof(cfg)) | 823 | if (cfg_size < sizeof(cfg)) |
721 | return -EIO; | 824 | return -EIO; |
722 | 825 | ||
723 | memset(&cfg, 0, sizeof(cfg)); | 826 | error = ds_get_bts_end(child, &end); |
827 | if (error < 0) | ||
828 | return error; | ||
724 | 829 | ||
725 | if (ds) { | 830 | error = ds_access_bts(child, /* index = */ 0, &base); |
726 | cfg.size = ds_get_bts_size(ds); | 831 | if (error < 0) |
832 | return error; | ||
727 | 833 | ||
728 | if (ds_get_overflow(ds) == DS_O_SIGNAL) | 834 | error = ds_access_bts(child, /* index = */ end, &max); |
729 | cfg.flags |= PTRACE_BTS_O_SIGNAL; | 835 | if (error < 0) |
836 | return error; | ||
730 | 837 | ||
731 | if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && | 838 | memset(&cfg, 0, sizeof(cfg)); |
732 | child->thread.debugctlmsr & ds_debugctl_mask()) | 839 | cfg.size = (max - base); |
733 | cfg.flags |= PTRACE_BTS_O_TRACE; | 840 | cfg.signal = child->thread.bts_ovfl_signal; |
841 | cfg.bts_size = sizeof(struct bts_struct); | ||
734 | 842 | ||
735 | if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) | 843 | if (cfg.signal) |
736 | cfg.flags |= PTRACE_BTS_O_SCHED; | 844 | cfg.flags |= PTRACE_BTS_O_SIGNAL; |
737 | } | ||
738 | 845 | ||
739 | cfg.bts_size = sizeof(struct bts_struct); | 846 | if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && |
847 | child->thread.debugctlmsr & bts_cfg.debugctl_mask) | ||
848 | cfg.flags |= PTRACE_BTS_O_TRACE; | ||
849 | |||
850 | if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) | ||
851 | cfg.flags |= PTRACE_BTS_O_SCHED; | ||
740 | 852 | ||
741 | if (copy_to_user(ucfg, &cfg, sizeof(cfg))) | 853 | if (copy_to_user(ucfg, &cfg, sizeof(cfg))) |
742 | return -EFAULT; | 854 | return -EFAULT; |
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child, | |||
744 | return sizeof(cfg); | 856 | return sizeof(cfg); |
745 | } | 857 | } |
746 | 858 | ||
747 | |||
748 | static int ptrace_bts_write_record(struct task_struct *child, | 859 | static int ptrace_bts_write_record(struct task_struct *child, |
749 | const struct bts_struct *in) | 860 | const struct bts_struct *in) |
750 | { | 861 | { |
751 | int retval; | 862 | unsigned char bts_record[BTS_MAX_RECORD_SIZE]; |
752 | 863 | ||
753 | if (!child->thread.ds_area_msr) | 864 | BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); |
754 | return -ENXIO; | ||
755 | 865 | ||
756 | retval = ds_write_bts((void *)child->thread.ds_area_msr, in); | 866 | memset(bts_record, 0, bts_cfg.sizeof_bts); |
757 | if (retval) | 867 | switch (in->qualifier) { |
758 | return retval; | 868 | case BTS_INVALID: |
869 | break; | ||
759 | 870 | ||
760 | return sizeof(*in); | 871 | case BTS_BRANCH: |
761 | } | 872 | bts_set(bts_record, bts_from, in->variant.lbr.from_ip); |
873 | bts_set(bts_record, bts_to, in->variant.lbr.to_ip); | ||
874 | break; | ||
762 | 875 | ||
763 | static int ptrace_bts_realloc(struct task_struct *child, | 876 | case BTS_TASK_ARRIVES: |
764 | int size, int reduce_size) | 877 | case BTS_TASK_DEPARTS: |
765 | { | 878 | bts_set(bts_record, bts_from, bts_escape); |
766 | unsigned long rlim, vm; | 879 | bts_set(bts_record, bts_qual, in->qualifier); |
767 | int ret, old_size; | 880 | bts_set(bts_record, bts_jiffies, in->variant.jiffies); |
881 | break; | ||
768 | 882 | ||
769 | if (size < 0) | 883 | default: |
770 | return -EINVAL; | 884 | return -EINVAL; |
771 | |||
772 | old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); | ||
773 | if (old_size < 0) | ||
774 | return old_size; | ||
775 | |||
776 | ret = ds_free((void **)&child->thread.ds_area_msr); | ||
777 | if (ret < 0) | ||
778 | goto out; | ||
779 | |||
780 | size >>= PAGE_SHIFT; | ||
781 | old_size >>= PAGE_SHIFT; | ||
782 | |||
783 | current->mm->total_vm -= old_size; | ||
784 | current->mm->locked_vm -= old_size; | ||
785 | |||
786 | if (size == 0) | ||
787 | goto out; | ||
788 | |||
789 | rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | ||
790 | vm = current->mm->total_vm + size; | ||
791 | if (rlim < vm) { | ||
792 | ret = -ENOMEM; | ||
793 | |||
794 | if (!reduce_size) | ||
795 | goto out; | ||
796 | |||
797 | size = rlim - current->mm->total_vm; | ||
798 | if (size <= 0) | ||
799 | goto out; | ||
800 | } | ||
801 | |||
802 | rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | ||
803 | vm = current->mm->locked_vm + size; | ||
804 | if (rlim < vm) { | ||
805 | ret = -ENOMEM; | ||
806 | |||
807 | if (!reduce_size) | ||
808 | goto out; | ||
809 | |||
810 | size = rlim - current->mm->locked_vm; | ||
811 | if (size <= 0) | ||
812 | goto out; | ||
813 | } | 885 | } |
814 | 886 | ||
815 | ret = ds_allocate((void **)&child->thread.ds_area_msr, | 887 | /* The writing task will be the switched-to task on a context |
816 | size << PAGE_SHIFT); | 888 | * switch. It needs to write into the switched-from task's BTS |
817 | if (ret < 0) | 889 | * buffer. */ |
818 | goto out; | 890 | return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); |
819 | |||
820 | current->mm->total_vm += size; | ||
821 | current->mm->locked_vm += size; | ||
822 | |||
823 | out: | ||
824 | if (child->thread.ds_area_msr) | ||
825 | set_tsk_thread_flag(child, TIF_DS_AREA_MSR); | ||
826 | else | ||
827 | clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); | ||
828 | |||
829 | return ret; | ||
830 | } | 891 | } |
831 | 892 | ||
832 | void ptrace_bts_take_timestamp(struct task_struct *tsk, | 893 | void ptrace_bts_take_timestamp(struct task_struct *tsk, |
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk, | |||
839 | 900 | ||
840 | ptrace_bts_write_record(tsk, &rec); | 901 | ptrace_bts_write_record(tsk, &rec); |
841 | } | 902 | } |
842 | #endif /* X86_BTS */ | 903 | |
904 | static const struct bts_configuration bts_cfg_netburst = { | ||
905 | .sizeof_bts = sizeof(long) * 3, | ||
906 | .sizeof_field = sizeof(long), | ||
907 | .debugctl_mask = (1<<2)|(1<<3)|(1<<5) | ||
908 | }; | ||
909 | |||
910 | static const struct bts_configuration bts_cfg_pentium_m = { | ||
911 | .sizeof_bts = sizeof(long) * 3, | ||
912 | .sizeof_field = sizeof(long), | ||
913 | .debugctl_mask = (1<<6)|(1<<7) | ||
914 | }; | ||
915 | |||
916 | static const struct bts_configuration bts_cfg_core2 = { | ||
917 | .sizeof_bts = 8 * 3, | ||
918 | .sizeof_field = 8, | ||
919 | .debugctl_mask = (1<<6)|(1<<7)|(1<<9) | ||
920 | }; | ||
921 | |||
922 | static inline void bts_configure(const struct bts_configuration *cfg) | ||
923 | { | ||
924 | bts_cfg = *cfg; | ||
925 | } | ||
926 | |||
927 | void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) | ||
928 | { | ||
929 | switch (c->x86) { | ||
930 | case 0x6: | ||
931 | switch (c->x86_model) { | ||
932 | case 0xD: | ||
933 | case 0xE: /* Pentium M */ | ||
934 | bts_configure(&bts_cfg_pentium_m); | ||
935 | break; | ||
936 | case 0xF: /* Core2 */ | ||
937 | case 0x1C: /* Atom */ | ||
938 | bts_configure(&bts_cfg_core2); | ||
939 | break; | ||
940 | default: | ||
941 | /* sorry, don't know about them */ | ||
942 | break; | ||
943 | } | ||
944 | break; | ||
945 | case 0xF: | ||
946 | switch (c->x86_model) { | ||
947 | case 0x0: | ||
948 | case 0x1: | ||
949 | case 0x2: /* Netburst */ | ||
950 | bts_configure(&bts_cfg_netburst); | ||
951 | break; | ||
952 | default: | ||
953 | /* sorry, don't know about them */ | ||
954 | break; | ||
955 | } | ||
956 | break; | ||
957 | default: | ||
958 | /* sorry, don't know about them */ | ||
959 | break; | ||
960 | } | ||
961 | } | ||
962 | #endif /* CONFIG_X86_PTRACE_BTS */ | ||
843 | 963 | ||
844 | /* | 964 | /* |
845 | * Called by kernel/ptrace.c when detaching.. | 965 | * Called by kernel/ptrace.c when detaching.. |
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child) | |||
852 | #ifdef TIF_SYSCALL_EMU | 972 | #ifdef TIF_SYSCALL_EMU |
853 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); | 973 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); |
854 | #endif | 974 | #endif |
855 | if (child->thread.ds_area_msr) { | 975 | #ifdef CONFIG_X86_PTRACE_BTS |
856 | #ifdef X86_BTS | 976 | (void)ds_release_bts(child); |
857 | ptrace_bts_realloc(child, 0, 0); | 977 | |
858 | #endif | 978 | child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; |
859 | child->thread.debugctlmsr &= ~ds_debugctl_mask(); | 979 | if (!child->thread.debugctlmsr) |
860 | if (!child->thread.debugctlmsr) | 980 | clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); |
861 | clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); | 981 | |
862 | clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); | 982 | clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); |
863 | } | 983 | #endif /* CONFIG_X86_PTRACE_BTS */ |
864 | } | 984 | } |
865 | 985 | ||
866 | #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION | 986 | #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION |
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
980 | /* | 1100 | /* |
981 | * These bits need more cooking - not enabled yet: | 1101 | * These bits need more cooking - not enabled yet: |
982 | */ | 1102 | */ |
983 | #ifdef X86_BTS | 1103 | #ifdef CONFIG_X86_PTRACE_BTS |
984 | case PTRACE_BTS_CONFIG: | 1104 | case PTRACE_BTS_CONFIG: |
985 | ret = ptrace_bts_config | 1105 | ret = ptrace_bts_config |
986 | (child, data, (struct ptrace_bts_config __user *)addr); | 1106 | (child, data, (struct ptrace_bts_config __user *)addr); |
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
992 | break; | 1112 | break; |
993 | 1113 | ||
994 | case PTRACE_BTS_SIZE: | 1114 | case PTRACE_BTS_SIZE: |
995 | ret = ptrace_bts_get_size(child); | 1115 | ret = ds_get_bts_index(child, /* pos = */ NULL); |
996 | break; | 1116 | break; |
997 | 1117 | ||
998 | case PTRACE_BTS_GET: | 1118 | case PTRACE_BTS_GET: |
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) | |||
1001 | break; | 1121 | break; |
1002 | 1122 | ||
1003 | case PTRACE_BTS_CLEAR: | 1123 | case PTRACE_BTS_CLEAR: |
1004 | ret = ptrace_bts_clear(child); | 1124 | ret = ds_clear_bts(child); |
1005 | break; | 1125 | break; |
1006 | 1126 | ||
1007 | case PTRACE_BTS_DRAIN: | 1127 | case PTRACE_BTS_DRAIN: |
1008 | ret = ptrace_bts_drain | 1128 | ret = ptrace_bts_drain |
1009 | (child, data, (struct bts_struct __user *) addr); | 1129 | (child, data, (struct bts_struct __user *) addr); |
1010 | break; | 1130 | break; |
1011 | #endif | 1131 | #endif /* CONFIG_X86_PTRACE_BTS */ |
1012 | 1132 | ||
1013 | default: | 1133 | default: |
1014 | ret = ptrace_request(child, request, addr, data); | 1134 | ret = ptrace_request(child, request, addr, data); |
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = { | |||
1290 | .size = sizeof(long), .align = sizeof(long), | 1410 | .size = sizeof(long), .align = sizeof(long), |
1291 | .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set | 1411 | .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set |
1292 | }, | 1412 | }, |
1413 | [REGSET_IOPERM64] = { | ||
1414 | .core_note_type = NT_386_IOPERM, | ||
1415 | .n = IO_BITMAP_LONGS, | ||
1416 | .size = sizeof(long), .align = sizeof(long), | ||
1417 | .active = ioperm_active, .get = ioperm_get | ||
1418 | }, | ||
1293 | }; | 1419 | }; |
1294 | 1420 | ||
1295 | static const struct user_regset_view user_x86_64_view = { | 1421 | static const struct user_regset_view user_x86_64_view = { |
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = { | |||
1336 | .active = regset_tls_active, | 1462 | .active = regset_tls_active, |
1337 | .get = regset_tls_get, .set = regset_tls_set | 1463 | .get = regset_tls_get, .set = regset_tls_set |
1338 | }, | 1464 | }, |
1465 | [REGSET_IOPERM32] = { | ||
1466 | .core_note_type = NT_386_IOPERM, | ||
1467 | .n = IO_BITMAP_BYTES / sizeof(u32), | ||
1468 | .size = sizeof(u32), .align = sizeof(u32), | ||
1469 | .active = ioperm_active, .get = ioperm_get | ||
1470 | }, | ||
1339 | }; | 1471 | }; |
1340 | 1472 | ||
1341 | static const struct user_regset_view user_x86_32_view = { | 1473 | static const struct user_regset_view user_x86_32_view = { |
@@ -1357,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) | |||
1357 | #endif | 1489 | #endif |
1358 | } | 1490 | } |
1359 | 1491 | ||
1360 | void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) | 1492 | void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, |
1493 | int error_code, int si_code) | ||
1361 | { | 1494 | { |
1362 | struct siginfo info; | 1495 | struct siginfo info; |
1363 | 1496 | ||
@@ -1366,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) | |||
1366 | 1499 | ||
1367 | memset(&info, 0, sizeof(info)); | 1500 | memset(&info, 0, sizeof(info)); |
1368 | info.si_signo = SIGTRAP; | 1501 | info.si_signo = SIGTRAP; |
1369 | info.si_code = TRAP_BRKPT; | 1502 | info.si_code = si_code; |
1370 | 1503 | ||
1371 | /* User-mode ip? */ | 1504 | /* User-mode ip? */ |
1372 | info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; | 1505 | info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; |
@@ -1375,30 +1508,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) | |||
1375 | force_sig_info(SIGTRAP, &info, tsk); | 1508 | force_sig_info(SIGTRAP, &info, tsk); |
1376 | } | 1509 | } |
1377 | 1510 | ||
1378 | static void syscall_trace(struct pt_regs *regs) | ||
1379 | { | ||
1380 | if (!(current->ptrace & PT_PTRACED)) | ||
1381 | return; | ||
1382 | |||
1383 | #if 0 | ||
1384 | printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", | ||
1385 | current->comm, | ||
1386 | regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), | ||
1387 | current_thread_info()->flags, current->ptrace); | ||
1388 | #endif | ||
1389 | |||
1390 | ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) | ||
1391 | ? 0x80 : 0)); | ||
1392 | /* | ||
1393 | * this isn't the same as continuing with a signal, but it will do | ||
1394 | * for normal use. strace only continues with a signal if the | ||
1395 | * stopping signal is not SIGTRAP. -brl | ||
1396 | */ | ||
1397 | if (current->exit_code) { | ||
1398 | send_sig(current->exit_code, current, 1); | ||
1399 | current->exit_code = 0; | ||
1400 | } | ||
1401 | } | ||
1402 | 1511 | ||
1403 | #ifdef CONFIG_X86_32 | 1512 | #ifdef CONFIG_X86_32 |
1404 | # define IS_IA32 1 | 1513 | # define IS_IA32 1 |
@@ -1432,8 +1541,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) | |||
1432 | if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) | 1541 | if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) |
1433 | ret = -1L; | 1542 | ret = -1L; |
1434 | 1543 | ||
1435 | if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) | 1544 | if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && |
1436 | syscall_trace(regs); | 1545 | tracehook_report_syscall_entry(regs)) |
1546 | ret = -1L; | ||
1437 | 1547 | ||
1438 | if (unlikely(current->audit_context)) { | 1548 | if (unlikely(current->audit_context)) { |
1439 | if (IS_IA32) | 1549 | if (IS_IA32) |
@@ -1459,7 +1569,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) | |||
1459 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); | 1569 | audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); |
1460 | 1570 | ||
1461 | if (test_thread_flag(TIF_SYSCALL_TRACE)) | 1571 | if (test_thread_flag(TIF_SYSCALL_TRACE)) |
1462 | syscall_trace(regs); | 1572 | tracehook_report_syscall_exit(regs, 0); |
1463 | 1573 | ||
1464 | /* | 1574 | /* |
1465 | * If TIF_SYSCALL_EMU is set, we only get here because of | 1575 | * If TIF_SYSCALL_EMU is set, we only get here because of |
@@ -1475,6 +1585,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) | |||
1475 | * system call instruction. | 1585 | * system call instruction. |
1476 | */ | 1586 | */ |
1477 | if (test_thread_flag(TIF_SINGLESTEP) && | 1587 | if (test_thread_flag(TIF_SINGLESTEP) && |
1478 | (current->ptrace & PT_PTRACED)) | 1588 | tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) |
1479 | send_sigtrap(current, regs, 0); | 1589 | send_sigtrap(current, regs, 0, TRAP_BRKPT); |
1480 | } | 1590 | } |
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325a..4f9c55f3a7c0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | |||
97 | return dst->version; | 97 | return dst->version; |
98 | } | 98 | } |
99 | 99 | ||
100 | unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) | ||
101 | { | ||
102 | u64 pv_tsc_khz = 1000000ULL << 32; | ||
103 | |||
104 | do_div(pv_tsc_khz, src->tsc_to_system_mul); | ||
105 | if (src->tsc_shift < 0) | ||
106 | pv_tsc_khz <<= -src->tsc_shift; | ||
107 | else | ||
108 | pv_tsc_khz >>= src->tsc_shift; | ||
109 | return pv_tsc_khz; | ||
110 | } | ||
111 | |||
100 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | 112 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
101 | { | 113 | { |
102 | struct pvclock_shadow_time shadow; | 114 | struct pvclock_shadow_time shadow; |
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d13858818100..f6a11b9b1f98 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -354,9 +354,27 @@ static void ati_force_hpet_resume(void) | |||
354 | printk(KERN_DEBUG "Force enabled HPET at resume\n"); | 354 | printk(KERN_DEBUG "Force enabled HPET at resume\n"); |
355 | } | 355 | } |
356 | 356 | ||
357 | static u32 ati_ixp4x0_rev(struct pci_dev *dev) | ||
358 | { | ||
359 | u32 d; | ||
360 | u8 b; | ||
361 | |||
362 | pci_read_config_byte(dev, 0xac, &b); | ||
363 | b &= ~(1<<5); | ||
364 | pci_write_config_byte(dev, 0xac, b); | ||
365 | pci_read_config_dword(dev, 0x70, &d); | ||
366 | d |= 1<<8; | ||
367 | pci_write_config_dword(dev, 0x70, d); | ||
368 | pci_read_config_dword(dev, 0x8, &d); | ||
369 | d &= 0xff; | ||
370 | dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); | ||
371 | return d; | ||
372 | } | ||
373 | |||
357 | static void ati_force_enable_hpet(struct pci_dev *dev) | 374 | static void ati_force_enable_hpet(struct pci_dev *dev) |
358 | { | 375 | { |
359 | u32 uninitialized_var(val); | 376 | u32 d, val; |
377 | u8 b; | ||
360 | 378 | ||
361 | if (hpet_address || force_hpet_address) | 379 | if (hpet_address || force_hpet_address) |
362 | return; | 380 | return; |
@@ -366,14 +384,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev) | |||
366 | return; | 384 | return; |
367 | } | 385 | } |
368 | 386 | ||
387 | d = ati_ixp4x0_rev(dev); | ||
388 | if (d < 0x82) | ||
389 | return; | ||
390 | |||
391 | /* base address */ | ||
369 | pci_write_config_dword(dev, 0x14, 0xfed00000); | 392 | pci_write_config_dword(dev, 0x14, 0xfed00000); |
370 | pci_read_config_dword(dev, 0x14, &val); | 393 | pci_read_config_dword(dev, 0x14, &val); |
394 | |||
395 | /* enable interrupt */ | ||
396 | outb(0x72, 0xcd6); b = inb(0xcd7); | ||
397 | b |= 0x1; | ||
398 | outb(0x72, 0xcd6); outb(b, 0xcd7); | ||
399 | outb(0x72, 0xcd6); b = inb(0xcd7); | ||
400 | if (!(b & 0x1)) | ||
401 | return; | ||
402 | pci_read_config_dword(dev, 0x64, &d); | ||
403 | d |= (1<<10); | ||
404 | pci_write_config_dword(dev, 0x64, d); | ||
405 | pci_read_config_dword(dev, 0x64, &d); | ||
406 | if (!(d & (1<<10))) | ||
407 | return; | ||
408 | |||
371 | force_hpet_address = val; | 409 | force_hpet_address = val; |
372 | force_hpet_resume_type = ATI_FORCE_HPET_RESUME; | 410 | force_hpet_resume_type = ATI_FORCE_HPET_RESUME; |
373 | dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", | 411 | dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", |
374 | force_hpet_address); | 412 | force_hpet_address); |
375 | cached_dev = dev; | 413 | cached_dev = dev; |
376 | return; | ||
377 | } | 414 | } |
378 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, | 415 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, |
379 | ati_force_enable_hpet); | 416 | ati_force_enable_hpet); |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb9..f4c93f1cfc19 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off); | |||
29 | 29 | ||
30 | static const struct desc_ptr no_idt = {}; | 30 | static const struct desc_ptr no_idt = {}; |
31 | static int reboot_mode; | 31 | static int reboot_mode; |
32 | enum reboot_type reboot_type = BOOT_KBD; | 32 | /* |
33 | * Keyboard reset and triple fault may result in INIT, not RESET, which | ||
34 | * doesn't work when we're in vmx root mode. Try ACPI first. | ||
35 | */ | ||
36 | enum reboot_type reboot_type = BOOT_ACPI; | ||
33 | int reboot_force; | 37 | int reboot_force; |
34 | 38 | ||
35 | #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) | 39 | #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) |
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 05191bbc68b8..0a23b5795b25 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c | |||
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = { | |||
223 | static __init int add_rtc_cmos(void) | 223 | static __init int add_rtc_cmos(void) |
224 | { | 224 | { |
225 | #ifdef CONFIG_PNP | 225 | #ifdef CONFIG_PNP |
226 | if (!pnp_platform_devices) | 226 | static const char *ids[] __initconst = |
227 | platform_device_register(&rtc_device); | 227 | { "PNP0b00", "PNP0b01", "PNP0b02", }; |
228 | #else | 228 | struct pnp_dev *dev; |
229 | struct pnp_id *id; | ||
230 | int i; | ||
231 | |||
232 | pnp_for_each_dev(dev) { | ||
233 | for (id = dev->id; id; id = id->next) { | ||
234 | for (i = 0; i < ARRAY_SIZE(ids); i++) { | ||
235 | if (compare_pnp_id(id, ids[i]) != 0) | ||
236 | return 0; | ||
237 | } | ||
238 | } | ||
239 | } | ||
240 | #endif | ||
241 | |||
229 | platform_device_register(&rtc_device); | 242 | platform_device_register(&rtc_device); |
230 | #endif /* CONFIG_PNP */ | 243 | dev_info(&rtc_device.dev, |
244 | "registered platform RTC device (no PNP device found)\n"); | ||
231 | return 0; | 245 | return 0; |
232 | } | 246 | } |
233 | device_initcall(add_rtc_cmos); | 247 | device_initcall(add_rtc_cmos); |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 362d4e7f2d38..2255782e8d4b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -223,6 +223,9 @@ unsigned long saved_video_mode; | |||
223 | #define RAMDISK_LOAD_FLAG 0x4000 | 223 | #define RAMDISK_LOAD_FLAG 0x4000 |
224 | 224 | ||
225 | static char __initdata command_line[COMMAND_LINE_SIZE]; | 225 | static char __initdata command_line[COMMAND_LINE_SIZE]; |
226 | #ifdef CONFIG_CMDLINE_BOOL | ||
227 | static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; | ||
228 | #endif | ||
226 | 229 | ||
227 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) | 230 | #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) |
228 | struct edd edd; | 231 | struct edd edd; |
@@ -299,7 +302,7 @@ static void __init relocate_initrd(void) | |||
299 | if (clen > MAX_MAP_CHUNK-slop) | 302 | if (clen > MAX_MAP_CHUNK-slop) |
300 | clen = MAX_MAP_CHUNK-slop; | 303 | clen = MAX_MAP_CHUNK-slop; |
301 | mapaddr = ramdisk_image & PAGE_MASK; | 304 | mapaddr = ramdisk_image & PAGE_MASK; |
302 | p = early_ioremap(mapaddr, clen+slop); | 305 | p = early_memremap(mapaddr, clen+slop); |
303 | memcpy(q, p+slop, clen); | 306 | memcpy(q, p+slop, clen); |
304 | early_iounmap(p, clen+slop); | 307 | early_iounmap(p, clen+slop); |
305 | q += clen; | 308 | q += clen; |
@@ -376,7 +379,7 @@ static void __init parse_setup_data(void) | |||
376 | return; | 379 | return; |
377 | pa_data = boot_params.hdr.setup_data; | 380 | pa_data = boot_params.hdr.setup_data; |
378 | while (pa_data) { | 381 | while (pa_data) { |
379 | data = early_ioremap(pa_data, PAGE_SIZE); | 382 | data = early_memremap(pa_data, PAGE_SIZE); |
380 | switch (data->type) { | 383 | switch (data->type) { |
381 | case SETUP_E820_EXT: | 384 | case SETUP_E820_EXT: |
382 | parse_e820_ext(data, pa_data); | 385 | parse_e820_ext(data, pa_data); |
@@ -399,7 +402,7 @@ static void __init e820_reserve_setup_data(void) | |||
399 | return; | 402 | return; |
400 | pa_data = boot_params.hdr.setup_data; | 403 | pa_data = boot_params.hdr.setup_data; |
401 | while (pa_data) { | 404 | while (pa_data) { |
402 | data = early_ioremap(pa_data, sizeof(*data)); | 405 | data = early_memremap(pa_data, sizeof(*data)); |
403 | e820_update_range(pa_data, sizeof(*data)+data->len, | 406 | e820_update_range(pa_data, sizeof(*data)+data->len, |
404 | E820_RAM, E820_RESERVED_KERN); | 407 | E820_RAM, E820_RESERVED_KERN); |
405 | found = 1; | 408 | found = 1; |
@@ -425,7 +428,7 @@ static void __init reserve_early_setup_data(void) | |||
425 | return; | 428 | return; |
426 | pa_data = boot_params.hdr.setup_data; | 429 | pa_data = boot_params.hdr.setup_data; |
427 | while (pa_data) { | 430 | while (pa_data) { |
428 | data = early_ioremap(pa_data, sizeof(*data)); | 431 | data = early_memremap(pa_data, sizeof(*data)); |
429 | sprintf(buf, "setup data %x", data->type); | 432 | sprintf(buf, "setup data %x", data->type); |
430 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); | 433 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); |
431 | pa_data = data->next; | 434 | pa_data = data->next; |
@@ -579,6 +582,190 @@ static struct x86_quirks default_x86_quirks __initdata; | |||
579 | struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; | 582 | struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; |
580 | 583 | ||
581 | /* | 584 | /* |
585 | * Some BIOSes seem to corrupt the low 64k of memory during events | ||
586 | * like suspend/resume and unplugging an HDMI cable. Reserve all | ||
587 | * remaining free memory in that area and fill it with a distinct | ||
588 | * pattern. | ||
589 | */ | ||
590 | #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION | ||
591 | #define MAX_SCAN_AREAS 8 | ||
592 | |||
593 | static int __read_mostly memory_corruption_check = -1; | ||
594 | |||
595 | static unsigned __read_mostly corruption_check_size = 64*1024; | ||
596 | static unsigned __read_mostly corruption_check_period = 60; /* seconds */ | ||
597 | |||
598 | static struct e820entry scan_areas[MAX_SCAN_AREAS]; | ||
599 | static int num_scan_areas; | ||
600 | |||
601 | |||
602 | static int set_corruption_check(char *arg) | ||
603 | { | ||
604 | char *end; | ||
605 | |||
606 | memory_corruption_check = simple_strtol(arg, &end, 10); | ||
607 | |||
608 | return (*end == 0) ? 0 : -EINVAL; | ||
609 | } | ||
610 | early_param("memory_corruption_check", set_corruption_check); | ||
611 | |||
612 | static int set_corruption_check_period(char *arg) | ||
613 | { | ||
614 | char *end; | ||
615 | |||
616 | corruption_check_period = simple_strtoul(arg, &end, 10); | ||
617 | |||
618 | return (*end == 0) ? 0 : -EINVAL; | ||
619 | } | ||
620 | early_param("memory_corruption_check_period", set_corruption_check_period); | ||
621 | |||
622 | static int set_corruption_check_size(char *arg) | ||
623 | { | ||
624 | char *end; | ||
625 | unsigned size; | ||
626 | |||
627 | size = memparse(arg, &end); | ||
628 | |||
629 | if (*end == '\0') | ||
630 | corruption_check_size = size; | ||
631 | |||
632 | return (size == corruption_check_size) ? 0 : -EINVAL; | ||
633 | } | ||
634 | early_param("memory_corruption_check_size", set_corruption_check_size); | ||
635 | |||
636 | |||
637 | static void __init setup_bios_corruption_check(void) | ||
638 | { | ||
639 | u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ | ||
640 | |||
641 | if (memory_corruption_check == -1) { | ||
642 | memory_corruption_check = | ||
643 | #ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK | ||
644 | 1 | ||
645 | #else | ||
646 | 0 | ||
647 | #endif | ||
648 | ; | ||
649 | } | ||
650 | |||
651 | if (corruption_check_size == 0) | ||
652 | memory_corruption_check = 0; | ||
653 | |||
654 | if (!memory_corruption_check) | ||
655 | return; | ||
656 | |||
657 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); | ||
658 | |||
659 | while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { | ||
660 | u64 size; | ||
661 | addr = find_e820_area_size(addr, &size, PAGE_SIZE); | ||
662 | |||
663 | if (addr == 0) | ||
664 | break; | ||
665 | |||
666 | if ((addr + size) > corruption_check_size) | ||
667 | size = corruption_check_size - addr; | ||
668 | |||
669 | if (size == 0) | ||
670 | break; | ||
671 | |||
672 | e820_update_range(addr, size, E820_RAM, E820_RESERVED); | ||
673 | scan_areas[num_scan_areas].addr = addr; | ||
674 | scan_areas[num_scan_areas].size = size; | ||
675 | num_scan_areas++; | ||
676 | |||
677 | /* Assume we've already mapped this early memory */ | ||
678 | memset(__va(addr), 0, size); | ||
679 | |||
680 | addr += size; | ||
681 | } | ||
682 | |||
683 | printk(KERN_INFO "Scanning %d areas for low memory corruption\n", | ||
684 | num_scan_areas); | ||
685 | update_e820(); | ||
686 | } | ||
687 | |||
688 | static struct timer_list periodic_check_timer; | ||
689 | |||
690 | void check_for_bios_corruption(void) | ||
691 | { | ||
692 | int i; | ||
693 | int corruption = 0; | ||
694 | |||
695 | if (!memory_corruption_check) | ||
696 | return; | ||
697 | |||
698 | for(i = 0; i < num_scan_areas; i++) { | ||
699 | unsigned long *addr = __va(scan_areas[i].addr); | ||
700 | unsigned long size = scan_areas[i].size; | ||
701 | |||
702 | for(; size; addr++, size -= sizeof(unsigned long)) { | ||
703 | if (!*addr) | ||
704 | continue; | ||
705 | printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", | ||
706 | addr, __pa(addr), *addr); | ||
707 | corruption = 1; | ||
708 | *addr = 0; | ||
709 | } | ||
710 | } | ||
711 | |||
712 | WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); | ||
713 | } | ||
714 | |||
715 | static void periodic_check_for_corruption(unsigned long data) | ||
716 | { | ||
717 | check_for_bios_corruption(); | ||
718 | mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); | ||
719 | } | ||
720 | |||
721 | void start_periodic_check_for_corruption(void) | ||
722 | { | ||
723 | if (!memory_corruption_check || corruption_check_period == 0) | ||
724 | return; | ||
725 | |||
726 | printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", | ||
727 | corruption_check_period); | ||
728 | |||
729 | init_timer(&periodic_check_timer); | ||
730 | periodic_check_timer.function = &periodic_check_for_corruption; | ||
731 | periodic_check_for_corruption(0); | ||
732 | } | ||
733 | #endif | ||
734 | |||
735 | static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) | ||
736 | { | ||
737 | printk(KERN_NOTICE | ||
738 | "%s detected: BIOS may corrupt low RAM, working it around.\n", | ||
739 | d->ident); | ||
740 | |||
741 | e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); | ||
742 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | ||
743 | |||
744 | return 0; | ||
745 | } | ||
746 | |||
747 | /* List of systems that have known low memory corruption BIOS problems */ | ||
748 | static struct dmi_system_id __initdata bad_bios_dmi_table[] = { | ||
749 | #ifdef CONFIG_X86_RESERVE_LOW_64K | ||
750 | { | ||
751 | .callback = dmi_low_memory_corruption, | ||
752 | .ident = "AMI BIOS", | ||
753 | .matches = { | ||
754 | DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), | ||
755 | }, | ||
756 | }, | ||
757 | { | ||
758 | .callback = dmi_low_memory_corruption, | ||
759 | .ident = "Phoenix BIOS", | ||
760 | .matches = { | ||
761 | DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), | ||
762 | }, | ||
763 | }, | ||
764 | #endif | ||
765 | {} | ||
766 | }; | ||
767 | |||
768 | /* | ||
582 | * Determine if we were loaded by an EFI loader. If so, then we have also been | 769 | * Determine if we were loaded by an EFI loader. If so, then we have also been |
583 | * passed the efi memmap, systab, etc., so we should use these data structures | 770 | * passed the efi memmap, systab, etc., so we should use these data structures |
584 | * for initialization. Note, the efi init code path is determined by the | 771 | * for initialization. Note, the efi init code path is determined by the |
@@ -665,11 +852,28 @@ void __init setup_arch(char **cmdline_p) | |||
665 | bss_resource.start = virt_to_phys(&__bss_start); | 852 | bss_resource.start = virt_to_phys(&__bss_start); |
666 | bss_resource.end = virt_to_phys(&__bss_stop)-1; | 853 | bss_resource.end = virt_to_phys(&__bss_stop)-1; |
667 | 854 | ||
855 | #ifdef CONFIG_CMDLINE_BOOL | ||
856 | #ifdef CONFIG_CMDLINE_OVERRIDE | ||
857 | strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); | ||
858 | #else | ||
859 | if (builtin_cmdline[0]) { | ||
860 | /* append boot loader cmdline to builtin */ | ||
861 | strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); | ||
862 | strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); | ||
863 | strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); | ||
864 | } | ||
865 | #endif | ||
866 | #endif | ||
867 | |||
668 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); | 868 | strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); |
669 | *cmdline_p = command_line; | 869 | *cmdline_p = command_line; |
670 | 870 | ||
671 | parse_early_param(); | 871 | parse_early_param(); |
672 | 872 | ||
873 | #ifdef CONFIG_X86_64 | ||
874 | check_efer(); | ||
875 | #endif | ||
876 | |||
673 | #if defined(CONFIG_VMI) && defined(CONFIG_X86_32) | 877 | #if defined(CONFIG_VMI) && defined(CONFIG_X86_32) |
674 | /* | 878 | /* |
675 | * Must be before kernel pagetables are setup | 879 | * Must be before kernel pagetables are setup |
@@ -695,6 +899,10 @@ void __init setup_arch(char **cmdline_p) | |||
695 | 899 | ||
696 | finish_e820_parsing(); | 900 | finish_e820_parsing(); |
697 | 901 | ||
902 | dmi_scan_machine(); | ||
903 | |||
904 | dmi_check_system(bad_bios_dmi_table); | ||
905 | |||
698 | #ifdef CONFIG_X86_32 | 906 | #ifdef CONFIG_X86_32 |
699 | probe_roms(); | 907 | probe_roms(); |
700 | #endif | 908 | #endif |
@@ -738,7 +946,8 @@ void __init setup_arch(char **cmdline_p) | |||
738 | #else | 946 | #else |
739 | num_physpages = max_pfn; | 947 | num_physpages = max_pfn; |
740 | 948 | ||
741 | check_efer(); | 949 | if (cpu_has_x2apic) |
950 | check_x2apic(); | ||
742 | 951 | ||
743 | /* How many end-of-memory variables you have, grandma! */ | 952 | /* How many end-of-memory variables you have, grandma! */ |
744 | /* need this before calling reserve_initrd */ | 953 | /* need this before calling reserve_initrd */ |
@@ -750,6 +959,10 @@ void __init setup_arch(char **cmdline_p) | |||
750 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; | 959 | high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; |
751 | #endif | 960 | #endif |
752 | 961 | ||
962 | #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION | ||
963 | setup_bios_corruption_check(); | ||
964 | #endif | ||
965 | |||
753 | /* max_pfn_mapped is updated here */ | 966 | /* max_pfn_mapped is updated here */ |
754 | max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); | 967 | max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); |
755 | max_pfn_mapped = max_low_pfn_mapped; | 968 | max_pfn_mapped = max_low_pfn_mapped; |
@@ -778,8 +991,6 @@ void __init setup_arch(char **cmdline_p) | |||
778 | vsmp_init(); | 991 | vsmp_init(); |
779 | #endif | 992 | #endif |
780 | 993 | ||
781 | dmi_scan_machine(); | ||
782 | |||
783 | io_delay_init(); | 994 | io_delay_init(); |
784 | 995 | ||
785 | /* | 996 | /* |
@@ -787,6 +998,8 @@ void __init setup_arch(char **cmdline_p) | |||
787 | */ | 998 | */ |
788 | acpi_boot_table_init(); | 999 | acpi_boot_table_init(); |
789 | 1000 | ||
1001 | early_acpi_boot_init(); | ||
1002 | |||
790 | #ifdef CONFIG_ACPI_NUMA | 1003 | #ifdef CONFIG_ACPI_NUMA |
791 | /* | 1004 | /* |
792 | * Parse SRAT to discover nodes. | 1005 | * Parse SRAT to discover nodes. |
@@ -882,3 +1095,5 @@ void __init setup_arch(char **cmdline_p) | |||
882 | #endif | 1095 | #endif |
883 | #endif | 1096 | #endif |
884 | } | 1097 | } |
1098 | |||
1099 | |||
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 76e305e064f9..0e67f72d9316 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -162,9 +162,16 @@ void __init setup_per_cpu_areas(void) | |||
162 | printk(KERN_INFO | 162 | printk(KERN_INFO |
163 | "cpu %d has no node %d or node-local memory\n", | 163 | "cpu %d has no node %d or node-local memory\n", |
164 | cpu, node); | 164 | cpu, node); |
165 | if (ptr) | ||
166 | printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", | ||
167 | cpu, __pa(ptr)); | ||
165 | } | 168 | } |
166 | else | 169 | else { |
167 | ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); | 170 | ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); |
171 | if (ptr) | ||
172 | printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", | ||
173 | cpu, node, __pa(ptr)); | ||
174 | } | ||
168 | #endif | 175 | #endif |
169 | per_cpu_offset(cpu) = ptr - __per_cpu_start; | 176 | per_cpu_offset(cpu) = ptr - __per_cpu_start; |
170 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | 177 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); |
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 72bbb519d2dc..cc673aa55ce4 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h | |||
@@ -3,9 +3,18 @@ struct sigframe { | |||
3 | char __user *pretcode; | 3 | char __user *pretcode; |
4 | int sig; | 4 | int sig; |
5 | struct sigcontext sc; | 5 | struct sigcontext sc; |
6 | struct _fpstate fpstate; | 6 | /* |
7 | * fpstate is unused. fpstate is moved/allocated after | ||
8 | * retcode[] below. This movement allows to have the FP state and the | ||
9 | * future state extensions (xsave) stay together. | ||
10 | * And at the same time retaining the unused fpstate, prevents changing | ||
11 | * the offset of extramask[] in the sigframe and thus prevent any | ||
12 | * legacy application accessing/modifying it. | ||
13 | */ | ||
14 | struct _fpstate fpstate_unused; | ||
7 | unsigned long extramask[_NSIG_WORDS-1]; | 15 | unsigned long extramask[_NSIG_WORDS-1]; |
8 | char retcode[8]; | 16 | char retcode[8]; |
17 | /* fp state follows here */ | ||
9 | }; | 18 | }; |
10 | 19 | ||
11 | struct rt_sigframe { | 20 | struct rt_sigframe { |
@@ -15,13 +24,19 @@ struct rt_sigframe { | |||
15 | void __user *puc; | 24 | void __user *puc; |
16 | struct siginfo info; | 25 | struct siginfo info; |
17 | struct ucontext uc; | 26 | struct ucontext uc; |
18 | struct _fpstate fpstate; | ||
19 | char retcode[8]; | 27 | char retcode[8]; |
28 | /* fp state follows here */ | ||
20 | }; | 29 | }; |
21 | #else | 30 | #else |
22 | struct rt_sigframe { | 31 | struct rt_sigframe { |
23 | char __user *pretcode; | 32 | char __user *pretcode; |
24 | struct ucontext uc; | 33 | struct ucontext uc; |
25 | struct siginfo info; | 34 | struct siginfo info; |
35 | /* fp state follows here */ | ||
26 | }; | 36 | }; |
37 | |||
38 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
39 | sigset_t *set, struct pt_regs *regs); | ||
40 | int ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
41 | sigset_t *set, struct pt_regs *regs); | ||
27 | #endif | 42 | #endif |
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6fb5bcdd8933..d6dd057d0f22 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/errno.h> | 17 | #include <linux/errno.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/wait.h> | 19 | #include <linux/wait.h> |
20 | #include <linux/tracehook.h> | ||
20 | #include <linux/elf.h> | 21 | #include <linux/elf.h> |
21 | #include <linux/smp.h> | 22 | #include <linux/smp.h> |
22 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
@@ -26,6 +27,8 @@ | |||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/i387.h> | 28 | #include <asm/i387.h> |
28 | #include <asm/vdso.h> | 29 | #include <asm/vdso.h> |
30 | #include <asm/syscall.h> | ||
31 | #include <asm/syscalls.h> | ||
29 | 32 | ||
30 | #include "sigframe.h" | 33 | #include "sigframe.h" |
31 | 34 | ||
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx) | |||
110 | return do_sigaltstack(uss, uoss, regs->sp); | 113 | return do_sigaltstack(uss, uoss, regs->sp); |
111 | } | 114 | } |
112 | 115 | ||
116 | #define COPY(x) { \ | ||
117 | err |= __get_user(regs->x, &sc->x); \ | ||
118 | } | ||
119 | |||
120 | #define COPY_SEG(seg) { \ | ||
121 | unsigned short tmp; \ | ||
122 | err |= __get_user(tmp, &sc->seg); \ | ||
123 | regs->seg = tmp; \ | ||
124 | } | ||
125 | |||
126 | #define COPY_SEG_STRICT(seg) { \ | ||
127 | unsigned short tmp; \ | ||
128 | err |= __get_user(tmp, &sc->seg); \ | ||
129 | regs->seg = tmp | 3; \ | ||
130 | } | ||
131 | |||
132 | #define GET_SEG(seg) { \ | ||
133 | unsigned short tmp; \ | ||
134 | err |= __get_user(tmp, &sc->seg); \ | ||
135 | loadsegment(seg, tmp); \ | ||
136 | } | ||
113 | 137 | ||
114 | /* | 138 | /* |
115 | * Do a signal return; undo the signal stack. | 139 | * Do a signal return; undo the signal stack. |
@@ -118,28 +142,13 @@ static int | |||
118 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | 142 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, |
119 | unsigned long *pax) | 143 | unsigned long *pax) |
120 | { | 144 | { |
145 | void __user *buf; | ||
146 | unsigned int tmpflags; | ||
121 | unsigned int err = 0; | 147 | unsigned int err = 0; |
122 | 148 | ||
123 | /* Always make any pending restarted system calls return -EINTR */ | 149 | /* Always make any pending restarted system calls return -EINTR */ |
124 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | 150 | current_thread_info()->restart_block.fn = do_no_restart_syscall; |
125 | 151 | ||
126 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
127 | |||
128 | #define COPY_SEG(seg) \ | ||
129 | { unsigned short tmp; \ | ||
130 | err |= __get_user(tmp, &sc->seg); \ | ||
131 | regs->seg = tmp; } | ||
132 | |||
133 | #define COPY_SEG_STRICT(seg) \ | ||
134 | { unsigned short tmp; \ | ||
135 | err |= __get_user(tmp, &sc->seg); \ | ||
136 | regs->seg = tmp|3; } | ||
137 | |||
138 | #define GET_SEG(seg) \ | ||
139 | { unsigned short tmp; \ | ||
140 | err |= __get_user(tmp, &sc->seg); \ | ||
141 | loadsegment(seg, tmp); } | ||
142 | |||
143 | GET_SEG(gs); | 152 | GET_SEG(gs); |
144 | COPY_SEG(fs); | 153 | COPY_SEG(fs); |
145 | COPY_SEG(es); | 154 | COPY_SEG(es); |
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
149 | COPY_SEG_STRICT(cs); | 158 | COPY_SEG_STRICT(cs); |
150 | COPY_SEG_STRICT(ss); | 159 | COPY_SEG_STRICT(ss); |
151 | 160 | ||
152 | { | 161 | err |= __get_user(tmpflags, &sc->flags); |
153 | unsigned int tmpflags; | 162 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); |
154 | 163 | regs->orig_ax = -1; /* disable syscall checks */ | |
155 | err |= __get_user(tmpflags, &sc->flags); | ||
156 | regs->flags = (regs->flags & ~FIX_EFLAGS) | | ||
157 | (tmpflags & FIX_EFLAGS); | ||
158 | regs->orig_ax = -1; /* disable syscall checks */ | ||
159 | } | ||
160 | 164 | ||
161 | { | 165 | err |= __get_user(buf, &sc->fpstate); |
162 | struct _fpstate __user *buf; | 166 | err |= restore_i387_xstate(buf); |
163 | |||
164 | err |= __get_user(buf, &sc->fpstate); | ||
165 | if (buf) { | ||
166 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
167 | goto badframe; | ||
168 | err |= restore_i387(buf); | ||
169 | } else { | ||
170 | struct task_struct *me = current; | ||
171 | |||
172 | if (used_math()) { | ||
173 | clear_fpu(me); | ||
174 | clear_used_math(); | ||
175 | } | ||
176 | } | ||
177 | } | ||
178 | 167 | ||
179 | err |= __get_user(*pax, &sc->ax); | 168 | err |= __get_user(*pax, &sc->ax); |
180 | return err; | 169 | return err; |
181 | |||
182 | badframe: | ||
183 | return 1; | ||
184 | } | 170 | } |
185 | 171 | ||
186 | asmlinkage unsigned long sys_sigreturn(unsigned long __unused) | 172 | asmlinkage unsigned long sys_sigreturn(unsigned long __unused) |
@@ -226,9 +212,8 @@ badframe: | |||
226 | return 0; | 212 | return 0; |
227 | } | 213 | } |
228 | 214 | ||
229 | asmlinkage int sys_rt_sigreturn(unsigned long __unused) | 215 | static long do_rt_sigreturn(struct pt_regs *regs) |
230 | { | 216 | { |
231 | struct pt_regs *regs = (struct pt_regs *)&__unused; | ||
232 | struct rt_sigframe __user *frame; | 217 | struct rt_sigframe __user *frame; |
233 | unsigned long ax; | 218 | unsigned long ax; |
234 | sigset_t set; | 219 | sigset_t set; |
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) | |||
254 | return ax; | 239 | return ax; |
255 | 240 | ||
256 | badframe: | 241 | badframe: |
257 | force_sig(SIGSEGV, current); | 242 | signal_fault(regs, frame, "rt_sigreturn"); |
258 | return 0; | 243 | return 0; |
259 | } | 244 | } |
260 | 245 | ||
246 | asmlinkage int sys_rt_sigreturn(unsigned long __unused) | ||
247 | { | ||
248 | struct pt_regs *regs = (struct pt_regs *)&__unused; | ||
249 | |||
250 | return do_rt_sigreturn(regs); | ||
251 | } | ||
252 | |||
261 | /* | 253 | /* |
262 | * Set up a signal frame. | 254 | * Set up a signal frame. |
263 | */ | 255 | */ |
264 | static int | 256 | static int |
265 | setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, | 257 | setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, |
266 | struct pt_regs *regs, unsigned long mask) | 258 | struct pt_regs *regs, unsigned long mask) |
267 | { | 259 | { |
268 | int tmp, err = 0; | 260 | int tmp, err = 0; |
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, | |||
289 | err |= __put_user(regs->sp, &sc->sp_at_signal); | 281 | err |= __put_user(regs->sp, &sc->sp_at_signal); |
290 | err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); | 282 | err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); |
291 | 283 | ||
292 | tmp = save_i387(fpstate); | 284 | tmp = save_i387_xstate(fpstate); |
293 | if (tmp < 0) | 285 | if (tmp < 0) |
294 | err = 1; | 286 | err = 1; |
295 | else | 287 | else |
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, | |||
306 | * Determine which stack to use.. | 298 | * Determine which stack to use.. |
307 | */ | 299 | */ |
308 | static inline void __user * | 300 | static inline void __user * |
309 | get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) | 301 | get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, |
302 | void **fpstate) | ||
310 | { | 303 | { |
311 | unsigned long sp; | 304 | unsigned long sp; |
312 | 305 | ||
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) | |||
332 | sp = (unsigned long) ka->sa.sa_restorer; | 325 | sp = (unsigned long) ka->sa.sa_restorer; |
333 | } | 326 | } |
334 | 327 | ||
328 | if (used_math()) { | ||
329 | sp = sp - sig_xstate_size; | ||
330 | *fpstate = (struct _fpstate *) sp; | ||
331 | } | ||
332 | |||
335 | sp -= frame_size; | 333 | sp -= frame_size; |
336 | /* | 334 | /* |
337 | * Align the stack pointer according to the i386 ABI, | 335 | * Align the stack pointer according to the i386 ABI, |
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) | |||
343 | } | 341 | } |
344 | 342 | ||
345 | static int | 343 | static int |
346 | setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, | 344 | __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, |
347 | struct pt_regs *regs) | 345 | struct pt_regs *regs) |
348 | { | 346 | { |
349 | struct sigframe __user *frame; | 347 | struct sigframe __user *frame; |
350 | void __user *restorer; | 348 | void __user *restorer; |
351 | int err = 0; | 349 | int err = 0; |
352 | int usig; | 350 | void __user *fpstate = NULL; |
353 | 351 | ||
354 | frame = get_sigframe(ka, regs, sizeof(*frame)); | 352 | frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); |
355 | 353 | ||
356 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | 354 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) |
357 | goto give_sigsegv; | 355 | return -EFAULT; |
358 | 356 | ||
359 | usig = current_thread_info()->exec_domain | 357 | if (__put_user(sig, &frame->sig)) |
360 | && current_thread_info()->exec_domain->signal_invmap | 358 | return -EFAULT; |
361 | && sig < 32 | ||
362 | ? current_thread_info()->exec_domain->signal_invmap[sig] | ||
363 | : sig; | ||
364 | 359 | ||
365 | err = __put_user(usig, &frame->sig); | 360 | if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) |
366 | if (err) | 361 | return -EFAULT; |
367 | goto give_sigsegv; | ||
368 | |||
369 | err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); | ||
370 | if (err) | ||
371 | goto give_sigsegv; | ||
372 | 362 | ||
373 | if (_NSIG_WORDS > 1) { | 363 | if (_NSIG_WORDS > 1) { |
374 | err = __copy_to_user(&frame->extramask, &set->sig[1], | 364 | if (__copy_to_user(&frame->extramask, &set->sig[1], |
375 | sizeof(frame->extramask)); | 365 | sizeof(frame->extramask))) |
376 | if (err) | 366 | return -EFAULT; |
377 | goto give_sigsegv; | ||
378 | } | 367 | } |
379 | 368 | ||
380 | if (current->mm->context.vdso) | 369 | if (current->mm->context.vdso) |
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, | |||
399 | err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); | 388 | err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); |
400 | 389 | ||
401 | if (err) | 390 | if (err) |
402 | goto give_sigsegv; | 391 | return -EFAULT; |
403 | 392 | ||
404 | /* Set up registers for signal handler */ | 393 | /* Set up registers for signal handler */ |
405 | regs->sp = (unsigned long)frame; | 394 | regs->sp = (unsigned long)frame; |
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, | |||
414 | regs->cs = __USER_CS; | 403 | regs->cs = __USER_CS; |
415 | 404 | ||
416 | return 0; | 405 | return 0; |
417 | |||
418 | give_sigsegv: | ||
419 | force_sigsegv(sig, current); | ||
420 | return -EFAULT; | ||
421 | } | 406 | } |
422 | 407 | ||
423 | static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | 408 | static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, |
424 | sigset_t *set, struct pt_regs *regs) | 409 | sigset_t *set, struct pt_regs *regs) |
425 | { | 410 | { |
426 | struct rt_sigframe __user *frame; | 411 | struct rt_sigframe __user *frame; |
427 | void __user *restorer; | 412 | void __user *restorer; |
428 | int err = 0; | 413 | int err = 0; |
429 | int usig; | 414 | void __user *fpstate = NULL; |
430 | 415 | ||
431 | frame = get_sigframe(ka, regs, sizeof(*frame)); | 416 | frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); |
432 | 417 | ||
433 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | 418 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) |
434 | goto give_sigsegv; | 419 | return -EFAULT; |
435 | 420 | ||
436 | usig = current_thread_info()->exec_domain | 421 | err |= __put_user(sig, &frame->sig); |
437 | && current_thread_info()->exec_domain->signal_invmap | ||
438 | && sig < 32 | ||
439 | ? current_thread_info()->exec_domain->signal_invmap[sig] | ||
440 | : sig; | ||
441 | |||
442 | err |= __put_user(usig, &frame->sig); | ||
443 | err |= __put_user(&frame->info, &frame->pinfo); | 422 | err |= __put_user(&frame->info, &frame->pinfo); |
444 | err |= __put_user(&frame->uc, &frame->puc); | 423 | err |= __put_user(&frame->uc, &frame->puc); |
445 | err |= copy_siginfo_to_user(&frame->info, info); | 424 | err |= copy_siginfo_to_user(&frame->info, info); |
446 | if (err) | 425 | if (err) |
447 | goto give_sigsegv; | 426 | return -EFAULT; |
448 | 427 | ||
449 | /* Create the ucontext. */ | 428 | /* Create the ucontext. */ |
450 | err |= __put_user(0, &frame->uc.uc_flags); | 429 | if (cpu_has_xsave) |
430 | err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); | ||
431 | else | ||
432 | err |= __put_user(0, &frame->uc.uc_flags); | ||
451 | err |= __put_user(0, &frame->uc.uc_link); | 433 | err |= __put_user(0, &frame->uc.uc_link); |
452 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | 434 | err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); |
453 | err |= __put_user(sas_ss_flags(regs->sp), | 435 | err |= __put_user(sas_ss_flags(regs->sp), |
454 | &frame->uc.uc_stack.ss_flags); | 436 | &frame->uc.uc_stack.ss_flags); |
455 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); | 437 | err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); |
456 | err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, | 438 | err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, |
457 | regs, set->sig[0]); | 439 | regs, set->sig[0]); |
458 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | 440 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); |
459 | if (err) | 441 | if (err) |
460 | goto give_sigsegv; | 442 | return -EFAULT; |
461 | 443 | ||
462 | /* Set up to return from userspace. */ | 444 | /* Set up to return from userspace. */ |
463 | restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); | 445 | restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); |
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
477 | err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); | 459 | err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); |
478 | 460 | ||
479 | if (err) | 461 | if (err) |
480 | goto give_sigsegv; | 462 | return -EFAULT; |
481 | 463 | ||
482 | /* Set up registers for signal handler */ | 464 | /* Set up registers for signal handler */ |
483 | regs->sp = (unsigned long)frame; | 465 | regs->sp = (unsigned long)frame; |
484 | regs->ip = (unsigned long)ka->sa.sa_handler; | 466 | regs->ip = (unsigned long)ka->sa.sa_handler; |
485 | regs->ax = (unsigned long)usig; | 467 | regs->ax = (unsigned long)sig; |
486 | regs->dx = (unsigned long)&frame->info; | 468 | regs->dx = (unsigned long)&frame->info; |
487 | regs->cx = (unsigned long)&frame->uc; | 469 | regs->cx = (unsigned long)&frame->uc; |
488 | 470 | ||
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
492 | regs->cs = __USER_CS; | 474 | regs->cs = __USER_CS; |
493 | 475 | ||
494 | return 0; | 476 | return 0; |
495 | |||
496 | give_sigsegv: | ||
497 | force_sigsegv(sig, current); | ||
498 | return -EFAULT; | ||
499 | } | 477 | } |
500 | 478 | ||
501 | /* | 479 | /* |
502 | * OK, we're invoking a handler: | 480 | * OK, we're invoking a handler: |
503 | */ | 481 | */ |
482 | static int signr_convert(int sig) | ||
483 | { | ||
484 | struct thread_info *info = current_thread_info(); | ||
485 | |||
486 | if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) | ||
487 | return info->exec_domain->signal_invmap[sig]; | ||
488 | return sig; | ||
489 | } | ||
490 | |||
491 | #define is_ia32 1 | ||
492 | #define ia32_setup_frame __setup_frame | ||
493 | #define ia32_setup_rt_frame __setup_rt_frame | ||
494 | |||
495 | static int | ||
496 | setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
497 | sigset_t *set, struct pt_regs *regs) | ||
498 | { | ||
499 | int usig = signr_convert(sig); | ||
500 | int ret; | ||
501 | |||
502 | /* Set up the stack frame */ | ||
503 | if (is_ia32) { | ||
504 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
505 | ret = ia32_setup_rt_frame(usig, ka, info, set, regs); | ||
506 | else | ||
507 | ret = ia32_setup_frame(usig, ka, set, regs); | ||
508 | } else | ||
509 | ret = __setup_rt_frame(sig, ka, info, set, regs); | ||
510 | |||
511 | if (ret) { | ||
512 | force_sigsegv(sig, current); | ||
513 | return -EFAULT; | ||
514 | } | ||
515 | |||
516 | return ret; | ||
517 | } | ||
518 | |||
504 | static int | 519 | static int |
505 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | 520 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, |
506 | sigset_t *oldset, struct pt_regs *regs) | 521 | sigset_t *oldset, struct pt_regs *regs) |
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | |||
508 | int ret; | 523 | int ret; |
509 | 524 | ||
510 | /* Are we from a system call? */ | 525 | /* Are we from a system call? */ |
511 | if ((long)regs->orig_ax >= 0) { | 526 | if (syscall_get_nr(current, regs) >= 0) { |
512 | /* If so, check system call restarting.. */ | 527 | /* If so, check system call restarting.. */ |
513 | switch (regs->ax) { | 528 | switch (syscall_get_error(current, regs)) { |
514 | case -ERESTART_RESTARTBLOCK: | 529 | case -ERESTART_RESTARTBLOCK: |
515 | case -ERESTARTNOHAND: | 530 | case -ERESTARTNOHAND: |
516 | regs->ax = -EINTR; | 531 | regs->ax = -EINTR; |
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | |||
537 | likely(test_and_clear_thread_flag(TIF_FORCED_TF))) | 552 | likely(test_and_clear_thread_flag(TIF_FORCED_TF))) |
538 | regs->flags &= ~X86_EFLAGS_TF; | 553 | regs->flags &= ~X86_EFLAGS_TF; |
539 | 554 | ||
540 | /* Set up the stack frame */ | 555 | ret = setup_rt_frame(sig, ka, info, oldset, regs); |
541 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
542 | ret = setup_rt_frame(sig, ka, info, oldset, regs); | ||
543 | else | ||
544 | ret = setup_frame(sig, ka, oldset, regs); | ||
545 | 556 | ||
546 | if (ret) | 557 | if (ret) |
547 | return ret; | 558 | return ret; |
548 | 559 | ||
560 | #ifdef CONFIG_X86_64 | ||
561 | /* | ||
562 | * This has nothing to do with segment registers, | ||
563 | * despite the name. This magic affects uaccess.h | ||
564 | * macros' behavior. Reset it to the normal setting. | ||
565 | */ | ||
566 | set_fs(USER_DS); | ||
567 | #endif | ||
568 | |||
549 | /* | 569 | /* |
550 | * Clear the direction flag as per the ABI for function entry. | 570 | * Clear the direction flag as per the ABI for function entry. |
551 | */ | 571 | */ |
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | |||
558 | * handler too. | 578 | * handler too. |
559 | */ | 579 | */ |
560 | regs->flags &= ~X86_EFLAGS_TF; | 580 | regs->flags &= ~X86_EFLAGS_TF; |
561 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
562 | ptrace_notify(SIGTRAP); | ||
563 | 581 | ||
564 | spin_lock_irq(&current->sighand->siglock); | 582 | spin_lock_irq(&current->sighand->siglock); |
565 | sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); | 583 | sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); |
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | |||
568 | recalc_sigpending(); | 586 | recalc_sigpending(); |
569 | spin_unlock_irq(&current->sighand->siglock); | 587 | spin_unlock_irq(&current->sighand->siglock); |
570 | 588 | ||
589 | tracehook_signal_handler(sig, info, ka, regs, | ||
590 | test_thread_flag(TIF_SINGLESTEP)); | ||
591 | |||
571 | return 0; | 592 | return 0; |
572 | } | 593 | } |
573 | 594 | ||
595 | #define NR_restart_syscall __NR_restart_syscall | ||
574 | /* | 596 | /* |
575 | * Note that 'init' is a special process: it doesn't get signals it doesn't | 597 | * Note that 'init' is a special process: it doesn't get signals it doesn't |
576 | * want to handle. Thus you cannot kill init even with a SIGKILL even by | 598 | * want to handle. Thus you cannot kill init even with a SIGKILL even by |
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs) | |||
623 | } | 645 | } |
624 | 646 | ||
625 | /* Did we come from a system call? */ | 647 | /* Did we come from a system call? */ |
626 | if ((long)regs->orig_ax >= 0) { | 648 | if (syscall_get_nr(current, regs) >= 0) { |
627 | /* Restart the system call - no handlers present */ | 649 | /* Restart the system call - no handlers present */ |
628 | switch (regs->ax) { | 650 | switch (syscall_get_error(current, regs)) { |
629 | case -ERESTARTNOHAND: | 651 | case -ERESTARTNOHAND: |
630 | case -ERESTARTSYS: | 652 | case -ERESTARTSYS: |
631 | case -ERESTARTNOINTR: | 653 | case -ERESTARTNOINTR: |
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs) | |||
634 | break; | 656 | break; |
635 | 657 | ||
636 | case -ERESTART_RESTARTBLOCK: | 658 | case -ERESTART_RESTARTBLOCK: |
637 | regs->ax = __NR_restart_syscall; | 659 | regs->ax = NR_restart_syscall; |
638 | regs->ip -= 2; | 660 | regs->ip -= 2; |
639 | break; | 661 | break; |
640 | } | 662 | } |
@@ -657,9 +679,38 @@ static void do_signal(struct pt_regs *regs) | |||
657 | void | 679 | void |
658 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | 680 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) |
659 | { | 681 | { |
682 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) | ||
683 | /* notify userspace of pending MCEs */ | ||
684 | if (thread_info_flags & _TIF_MCE_NOTIFY) | ||
685 | mce_notify_user(); | ||
686 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ | ||
687 | |||
660 | /* deal with pending signal delivery */ | 688 | /* deal with pending signal delivery */ |
661 | if (thread_info_flags & _TIF_SIGPENDING) | 689 | if (thread_info_flags & _TIF_SIGPENDING) |
662 | do_signal(regs); | 690 | do_signal(regs); |
663 | 691 | ||
692 | if (thread_info_flags & _TIF_NOTIFY_RESUME) { | ||
693 | clear_thread_flag(TIF_NOTIFY_RESUME); | ||
694 | tracehook_notify_resume(regs); | ||
695 | } | ||
696 | |||
697 | #ifdef CONFIG_X86_32 | ||
664 | clear_thread_flag(TIF_IRET); | 698 | clear_thread_flag(TIF_IRET); |
699 | #endif /* CONFIG_X86_32 */ | ||
700 | } | ||
701 | |||
702 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | ||
703 | { | ||
704 | struct task_struct *me = current; | ||
705 | |||
706 | if (show_unhandled_signals && printk_ratelimit()) { | ||
707 | printk(KERN_INFO | ||
708 | "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", | ||
709 | me->comm, me->pid, where, frame, | ||
710 | regs->ip, regs->sp, regs->orig_ax); | ||
711 | print_vma_addr(" in ", regs->ip); | ||
712 | printk(KERN_CONT "\n"); | ||
713 | } | ||
714 | |||
715 | force_sig(SIGSEGV, me); | ||
665 | } | 716 | } |
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ca316b5b742c..a5c9627f4db9 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c | |||
@@ -15,17 +15,21 @@ | |||
15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
16 | #include <linux/wait.h> | 16 | #include <linux/wait.h> |
17 | #include <linux/ptrace.h> | 17 | #include <linux/ptrace.h> |
18 | #include <linux/tracehook.h> | ||
18 | #include <linux/unistd.h> | 19 | #include <linux/unistd.h> |
19 | #include <linux/stddef.h> | 20 | #include <linux/stddef.h> |
20 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
21 | #include <linux/compiler.h> | 22 | #include <linux/compiler.h> |
23 | #include <linux/uaccess.h> | ||
24 | |||
22 | #include <asm/processor.h> | 25 | #include <asm/processor.h> |
23 | #include <asm/ucontext.h> | 26 | #include <asm/ucontext.h> |
24 | #include <asm/uaccess.h> | ||
25 | #include <asm/i387.h> | 27 | #include <asm/i387.h> |
26 | #include <asm/proto.h> | 28 | #include <asm/proto.h> |
27 | #include <asm/ia32_unistd.h> | 29 | #include <asm/ia32_unistd.h> |
28 | #include <asm/mce.h> | 30 | #include <asm/mce.h> |
31 | #include <asm/syscall.h> | ||
32 | #include <asm/syscalls.h> | ||
29 | #include "sigframe.h" | 33 | #include "sigframe.h" |
30 | 34 | ||
31 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) | 35 | #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) |
@@ -41,11 +45,6 @@ | |||
41 | # define FIX_EFLAGS __FIX_EFLAGS | 45 | # define FIX_EFLAGS __FIX_EFLAGS |
42 | #endif | 46 | #endif |
43 | 47 | ||
44 | int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | ||
45 | sigset_t *set, struct pt_regs * regs); | ||
46 | int ia32_setup_frame(int sig, struct k_sigaction *ka, | ||
47 | sigset_t *set, struct pt_regs * regs); | ||
48 | |||
49 | asmlinkage long | 48 | asmlinkage long |
50 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | 49 | sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, |
51 | struct pt_regs *regs) | 50 | struct pt_regs *regs) |
@@ -53,67 +52,14 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, | |||
53 | return do_sigaltstack(uss, uoss, regs->sp); | 52 | return do_sigaltstack(uss, uoss, regs->sp); |
54 | } | 53 | } |
55 | 54 | ||
56 | /* | 55 | #define COPY(x) { \ |
57 | * Signal frame handlers. | 56 | err |= __get_user(regs->x, &sc->x); \ |
58 | */ | ||
59 | |||
60 | static inline int save_i387(struct _fpstate __user *buf) | ||
61 | { | ||
62 | struct task_struct *tsk = current; | ||
63 | int err = 0; | ||
64 | |||
65 | BUILD_BUG_ON(sizeof(struct user_i387_struct) != | ||
66 | sizeof(tsk->thread.xstate->fxsave)); | ||
67 | |||
68 | if ((unsigned long)buf % 16) | ||
69 | printk("save_i387: bad fpstate %p\n", buf); | ||
70 | |||
71 | if (!used_math()) | ||
72 | return 0; | ||
73 | clear_used_math(); /* trigger finit */ | ||
74 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | ||
75 | err = save_i387_checking((struct i387_fxsave_struct __user *) | ||
76 | buf); | ||
77 | if (err) | ||
78 | return err; | ||
79 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | ||
80 | stts(); | ||
81 | } else { | ||
82 | if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, | ||
83 | sizeof(struct i387_fxsave_struct))) | ||
84 | return -1; | ||
85 | } | ||
86 | return 1; | ||
87 | } | 57 | } |
88 | 58 | ||
89 | /* | 59 | #define COPY_SEG_STRICT(seg) { \ |
90 | * This restores directly out of user space. Exceptions are handled. | 60 | unsigned short tmp; \ |
91 | */ | 61 | err |= __get_user(tmp, &sc->seg); \ |
92 | static inline int restore_i387(struct _fpstate __user *buf) | 62 | regs->seg = tmp | 3; \ |
93 | { | ||
94 | struct task_struct *tsk = current; | ||
95 | int err; | ||
96 | |||
97 | if (!used_math()) { | ||
98 | err = init_fpu(tsk); | ||
99 | if (err) | ||
100 | return err; | ||
101 | } | ||
102 | |||
103 | if (!(task_thread_info(current)->status & TS_USEDFPU)) { | ||
104 | clts(); | ||
105 | task_thread_info(current)->status |= TS_USEDFPU; | ||
106 | } | ||
107 | err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf); | ||
108 | if (unlikely(err)) { | ||
109 | /* | ||
110 | * Encountered an error while doing the restore from the | ||
111 | * user buffer, clear the fpu state. | ||
112 | */ | ||
113 | clear_fpu(tsk); | ||
114 | clear_used_math(); | ||
115 | } | ||
116 | return err; | ||
117 | } | 63 | } |
118 | 64 | ||
119 | /* | 65 | /* |
@@ -123,13 +69,13 @@ static int | |||
123 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | 69 | restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, |
124 | unsigned long *pax) | 70 | unsigned long *pax) |
125 | { | 71 | { |
72 | void __user *buf; | ||
73 | unsigned int tmpflags; | ||
126 | unsigned int err = 0; | 74 | unsigned int err = 0; |
127 | 75 | ||
128 | /* Always make any pending restarted system calls return -EINTR */ | 76 | /* Always make any pending restarted system calls return -EINTR */ |
129 | current_thread_info()->restart_block.fn = do_no_restart_syscall; | 77 | current_thread_info()->restart_block.fn = do_no_restart_syscall; |
130 | 78 | ||
131 | #define COPY(x) err |= __get_user(regs->x, &sc->x) | ||
132 | |||
133 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); | 79 | COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); |
134 | COPY(dx); COPY(cx); COPY(ip); | 80 | COPY(dx); COPY(cx); COPY(ip); |
135 | COPY(r8); | 81 | COPY(r8); |
@@ -144,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, | |||
144 | /* Kernel saves and restores only the CS segment register on signals, | 90 | /* Kernel saves and restores only the CS segment register on signals, |
145 | * which is the bare minimum needed to allow mixed 32/64-bit code. | 91 | * which is the bare minimum needed to allow mixed 32/64-bit code. |
146 | * App's signal handler can save/restore other segments if needed. */ | 92 | * App's signal handler can save/restore other segments if needed. */ |
147 | { | 93 | COPY_SEG_STRICT(cs); |
148 | unsigned cs; | ||
149 | err |= __get_user(cs, &sc->cs); | ||
150 | regs->cs = cs | 3; /* Force into user mode */ | ||
151 | } | ||
152 | 94 | ||
153 | { | 95 | err |= __get_user(tmpflags, &sc->flags); |
154 | unsigned int tmpflags; | 96 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); |
155 | err |= __get_user(tmpflags, &sc->flags); | 97 | regs->orig_ax = -1; /* disable syscall checks */ |
156 | regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); | ||
157 | regs->orig_ax = -1; /* disable syscall checks */ | ||
158 | } | ||
159 | 98 | ||
160 | { | 99 | err |= __get_user(buf, &sc->fpstate); |
161 | struct _fpstate __user * buf; | 100 | err |= restore_i387_xstate(buf); |
162 | err |= __get_user(buf, &sc->fpstate); | ||
163 | |||
164 | if (buf) { | ||
165 | if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) | ||
166 | goto badframe; | ||
167 | err |= restore_i387(buf); | ||
168 | } else { | ||
169 | struct task_struct *me = current; | ||
170 | if (used_math()) { | ||
171 | clear_fpu(me); | ||
172 | clear_used_math(); | ||
173 | } | ||
174 | } | ||
175 | } | ||
176 | 101 | ||
177 | err |= __get_user(*pax, &sc->ax); | 102 | err |= __get_user(*pax, &sc->ax); |
178 | return err; | 103 | return err; |
179 | |||
180 | badframe: | ||
181 | return 1; | ||
182 | } | 104 | } |
183 | 105 | ||
184 | asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | 106 | static long do_rt_sigreturn(struct pt_regs *regs) |
185 | { | 107 | { |
186 | struct rt_sigframe __user *frame; | 108 | struct rt_sigframe __user *frame; |
187 | sigset_t set; | ||
188 | unsigned long ax; | 109 | unsigned long ax; |
110 | sigset_t set; | ||
189 | 111 | ||
190 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); | 112 | frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); |
191 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) | 113 | if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) |
@@ -198,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | |||
198 | current->blocked = set; | 120 | current->blocked = set; |
199 | recalc_sigpending(); | 121 | recalc_sigpending(); |
200 | spin_unlock_irq(&current->sighand->siglock); | 122 | spin_unlock_irq(&current->sighand->siglock); |
201 | 123 | ||
202 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) | 124 | if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) |
203 | goto badframe; | 125 | goto badframe; |
204 | 126 | ||
@@ -208,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | |||
208 | return ax; | 130 | return ax; |
209 | 131 | ||
210 | badframe: | 132 | badframe: |
211 | signal_fault(regs,frame,"sigreturn"); | 133 | signal_fault(regs, frame, "rt_sigreturn"); |
212 | return 0; | 134 | return 0; |
213 | } | 135 | } |
136 | |||
137 | asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) | ||
138 | { | ||
139 | return do_rt_sigreturn(regs); | ||
140 | } | ||
214 | 141 | ||
215 | /* | 142 | /* |
216 | * Set up a signal frame. | 143 | * Set up a signal frame. |
217 | */ | 144 | */ |
218 | 145 | ||
219 | static inline int | 146 | static inline int |
220 | setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) | 147 | setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, |
148 | unsigned long mask, struct task_struct *me) | ||
221 | { | 149 | { |
222 | int err = 0; | 150 | int err = 0; |
223 | 151 | ||
@@ -269,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) | |||
269 | sp = current->sas_ss_sp + current->sas_ss_size; | 197 | sp = current->sas_ss_sp + current->sas_ss_size; |
270 | } | 198 | } |
271 | 199 | ||
272 | return (void __user *)round_down(sp - size, 16); | 200 | return (void __user *)round_down(sp - size, 64); |
273 | } | 201 | } |
274 | 202 | ||
275 | static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | 203 | static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, |
276 | sigset_t *set, struct pt_regs * regs) | 204 | sigset_t *set, struct pt_regs *regs) |
277 | { | 205 | { |
278 | struct rt_sigframe __user *frame; | 206 | struct rt_sigframe __user *frame; |
279 | struct _fpstate __user *fp = NULL; | 207 | void __user *fp = NULL; |
280 | int err = 0; | 208 | int err = 0; |
281 | struct task_struct *me = current; | 209 | struct task_struct *me = current; |
282 | 210 | ||
283 | if (used_math()) { | 211 | if (used_math()) { |
284 | fp = get_stack(ka, regs, sizeof(struct _fpstate)); | 212 | fp = get_stack(ka, regs, sig_xstate_size); |
285 | frame = (void __user *)round_down( | 213 | frame = (void __user *)round_down( |
286 | (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; | 214 | (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; |
287 | 215 | ||
288 | if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) | 216 | if (save_i387_xstate(fp) < 0) |
289 | goto give_sigsegv; | 217 | return -EFAULT; |
290 | |||
291 | if (save_i387(fp) < 0) | ||
292 | err |= -1; | ||
293 | } else | 218 | } else |
294 | frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; | 219 | frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; |
295 | 220 | ||
296 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) | 221 | if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) |
297 | goto give_sigsegv; | 222 | return -EFAULT; |
298 | 223 | ||
299 | if (ka->sa.sa_flags & SA_SIGINFO) { | 224 | if (ka->sa.sa_flags & SA_SIGINFO) { |
300 | err |= copy_siginfo_to_user(&frame->info, info); | 225 | if (copy_siginfo_to_user(&frame->info, info)) |
301 | if (err) | 226 | return -EFAULT; |
302 | goto give_sigsegv; | ||
303 | } | 227 | } |
304 | 228 | ||
305 | /* Create the ucontext. */ | 229 | /* Create the ucontext. */ |
306 | err |= __put_user(0, &frame->uc.uc_flags); | 230 | if (cpu_has_xsave) |
231 | err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); | ||
232 | else | ||
233 | err |= __put_user(0, &frame->uc.uc_flags); | ||
307 | err |= __put_user(0, &frame->uc.uc_link); | 234 | err |= __put_user(0, &frame->uc.uc_link); |
308 | err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); | 235 | err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); |
309 | err |= __put_user(sas_ss_flags(regs->sp), | 236 | err |= __put_user(sas_ss_flags(regs->sp), |
@@ -311,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
311 | err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); | 238 | err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); |
312 | err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); | 239 | err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); |
313 | err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); | 240 | err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); |
314 | if (sizeof(*set) == 16) { | 241 | if (sizeof(*set) == 16) { |
315 | __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); | 242 | __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); |
316 | __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); | 243 | __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); |
317 | } else | 244 | } else |
318 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); | 245 | err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); |
319 | 246 | ||
@@ -324,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
324 | err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); | 251 | err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); |
325 | } else { | 252 | } else { |
326 | /* could use a vstub here */ | 253 | /* could use a vstub here */ |
327 | goto give_sigsegv; | 254 | return -EFAULT; |
328 | } | 255 | } |
329 | 256 | ||
330 | if (err) | 257 | if (err) |
331 | goto give_sigsegv; | 258 | return -EFAULT; |
332 | 259 | ||
333 | /* Set up registers for signal handler */ | 260 | /* Set up registers for signal handler */ |
334 | regs->di = sig; | 261 | regs->di = sig; |
335 | /* In case the signal handler was declared without prototypes */ | 262 | /* In case the signal handler was declared without prototypes */ |
336 | regs->ax = 0; | 263 | regs->ax = 0; |
337 | 264 | ||
338 | /* This also works for non SA_SIGINFO handlers because they expect the | 265 | /* This also works for non SA_SIGINFO handlers because they expect the |
@@ -348,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, | |||
348 | regs->cs = __USER_CS; | 275 | regs->cs = __USER_CS; |
349 | 276 | ||
350 | return 0; | 277 | return 0; |
351 | |||
352 | give_sigsegv: | ||
353 | force_sigsegv(sig, current); | ||
354 | return -EFAULT; | ||
355 | } | 278 | } |
356 | 279 | ||
357 | /* | 280 | /* |
358 | * Return -1L or the syscall number that @regs is executing. | 281 | * OK, we're invoking a handler |
359 | */ | 282 | */ |
360 | static long current_syscall(struct pt_regs *regs) | 283 | static int signr_convert(int sig) |
361 | { | 284 | { |
362 | /* | 285 | return sig; |
363 | * We always sign-extend a -1 value being set here, | ||
364 | * so this is always either -1L or a syscall number. | ||
365 | */ | ||
366 | return regs->orig_ax; | ||
367 | } | 286 | } |
368 | 287 | ||
369 | /* | ||
370 | * Return a value that is -EFOO if the system call in @regs->orig_ax | ||
371 | * returned an error. This only works for @regs from @current. | ||
372 | */ | ||
373 | static long current_syscall_ret(struct pt_regs *regs) | ||
374 | { | ||
375 | #ifdef CONFIG_IA32_EMULATION | 288 | #ifdef CONFIG_IA32_EMULATION |
376 | if (test_thread_flag(TIF_IA32)) | 289 | #define is_ia32 test_thread_flag(TIF_IA32) |
377 | /* | 290 | #else |
378 | * Sign-extend the value so (int)-EFOO becomes (long)-EFOO | 291 | #define is_ia32 0 |
379 | * and will match correctly in comparisons. | ||
380 | */ | ||
381 | return (int) regs->ax; | ||
382 | #endif | 292 | #endif |
383 | return regs->ax; | ||
384 | } | ||
385 | 293 | ||
386 | /* | 294 | static int |
387 | * OK, we're invoking a handler | 295 | setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, |
388 | */ | 296 | sigset_t *set, struct pt_regs *regs) |
297 | { | ||
298 | int usig = signr_convert(sig); | ||
299 | int ret; | ||
300 | |||
301 | /* Set up the stack frame */ | ||
302 | if (is_ia32) { | ||
303 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
304 | ret = ia32_setup_rt_frame(usig, ka, info, set, regs); | ||
305 | else | ||
306 | ret = ia32_setup_frame(usig, ka, set, regs); | ||
307 | } else | ||
308 | ret = __setup_rt_frame(sig, ka, info, set, regs); | ||
309 | |||
310 | if (ret) { | ||
311 | force_sigsegv(sig, current); | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | |||
315 | return ret; | ||
316 | } | ||
389 | 317 | ||
390 | static int | 318 | static int |
391 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | 319 | handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, |
@@ -394,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | |||
394 | int ret; | 322 | int ret; |
395 | 323 | ||
396 | /* Are we from a system call? */ | 324 | /* Are we from a system call? */ |
397 | if (current_syscall(regs) >= 0) { | 325 | if (syscall_get_nr(current, regs) >= 0) { |
398 | /* If so, check system call restarting.. */ | 326 | /* If so, check system call restarting.. */ |
399 | switch (current_syscall_ret(regs)) { | 327 | switch (syscall_get_error(current, regs)) { |
400 | case -ERESTART_RESTARTBLOCK: | 328 | case -ERESTART_RESTARTBLOCK: |
401 | case -ERESTARTNOHAND: | 329 | case -ERESTARTNOHAND: |
402 | regs->ax = -EINTR; | 330 | regs->ax = -EINTR; |
@@ -423,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, | |||
423 | likely(test_and_clear_thread_flag(TIF_FORCED_TF))) | 351 | likely(test_and_clear_thread_flag(TIF_FORCED_TF))) |
424 | regs->flags &= ~X86_EFLAGS_TF; | 352 | regs->flags &= ~X86_EFLAGS_TF; |
425 | 353 | ||
426 | #ifdef CONFIG_IA32_EMULATION | ||
427 | if (test_thread_flag(TIF_IA32)) { | ||
428 | if (ka->sa.sa_flags & SA_SIGINFO) | ||
429 | ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); | ||
430 | else | ||
431 | ret = ia32_setup_frame(sig, ka, oldset, regs); | ||
432 | } else | ||
433 | #endif | ||
434 | ret = setup_rt_frame(sig, ka, info, oldset, regs); | 354 | ret = setup_rt_frame(sig, ka, info, oldset, regs); |
435 | 355 | ||
436 | if (ret == 0) { | 356 | if (ret) |
437 | /* | 357 | return ret; |
438 | * This has nothing to do with segment registers, | ||
439 | * despite the name. This magic affects uaccess.h | ||
440 | * macros' behavior. Reset it to the normal setting. | ||
441 | */ | ||
442 | set_fs(USER_DS); | ||
443 | 358 | ||
444 | /* | 359 | #ifdef CONFIG_X86_64 |
445 | * Clear the direction flag as per the ABI for function entry. | 360 | /* |
446 | */ | 361 | * This has nothing to do with segment registers, |
447 | regs->flags &= ~X86_EFLAGS_DF; | 362 | * despite the name. This magic affects uaccess.h |
363 | * macros' behavior. Reset it to the normal setting. | ||
364 | */ | ||
365 | set_fs(USER_DS); | ||
366 | #endif | ||
448 | 367 | ||
449 | /* | 368 | /* |
450 | * Clear TF when entering the signal handler, but | 369 | * Clear the direction flag as per the ABI for function entry. |
451 | * notify any tracer that was single-stepping it. | 370 | */ |
452 | * The tracer may want to single-step inside the | 371 | regs->flags &= ~X86_EFLAGS_DF; |
453 | * handler too. | ||
454 | */ | ||
455 | regs->flags &= ~X86_EFLAGS_TF; | ||
456 | if (test_thread_flag(TIF_SINGLESTEP)) | ||
457 | ptrace_notify(SIGTRAP); | ||
458 | |||
459 | spin_lock_irq(&current->sighand->siglock); | ||
460 | sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); | ||
461 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
462 | sigaddset(&current->blocked,sig); | ||
463 | recalc_sigpending(); | ||
464 | spin_unlock_irq(&current->sighand->siglock); | ||
465 | } | ||
466 | 372 | ||
467 | return ret; | 373 | /* |
374 | * Clear TF when entering the signal handler, but | ||
375 | * notify any tracer that was single-stepping it. | ||
376 | * The tracer may want to single-step inside the | ||
377 | * handler too. | ||
378 | */ | ||
379 | regs->flags &= ~X86_EFLAGS_TF; | ||
380 | |||
381 | spin_lock_irq(&current->sighand->siglock); | ||
382 | sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); | ||
383 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
384 | sigaddset(&current->blocked, sig); | ||
385 | recalc_sigpending(); | ||
386 | spin_unlock_irq(&current->sighand->siglock); | ||
387 | |||
388 | tracehook_signal_handler(sig, info, ka, regs, | ||
389 | test_thread_flag(TIF_SINGLESTEP)); | ||
390 | |||
391 | return 0; | ||
468 | } | 392 | } |
469 | 393 | ||
394 | #define NR_restart_syscall \ | ||
395 | test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall | ||
470 | /* | 396 | /* |
471 | * Note that 'init' is a special process: it doesn't get signals it doesn't | 397 | * Note that 'init' is a special process: it doesn't get signals it doesn't |
472 | * want to handle. Thus you cannot kill init even with a SIGKILL even by | 398 | * want to handle. Thus you cannot kill init even with a SIGKILL even by |
@@ -496,7 +422,8 @@ static void do_signal(struct pt_regs *regs) | |||
496 | 422 | ||
497 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); | 423 | signr = get_signal_to_deliver(&info, &ka, regs, NULL); |
498 | if (signr > 0) { | 424 | if (signr > 0) { |
499 | /* Re-enable any watchpoints before delivering the | 425 | /* |
426 | * Re-enable any watchpoints before delivering the | ||
500 | * signal to user space. The processor register will | 427 | * signal to user space. The processor register will |
501 | * have been cleared if the watchpoint triggered | 428 | * have been cleared if the watchpoint triggered |
502 | * inside the kernel. | 429 | * inside the kernel. |
@@ -504,7 +431,7 @@ static void do_signal(struct pt_regs *regs) | |||
504 | if (current->thread.debugreg7) | 431 | if (current->thread.debugreg7) |
505 | set_debugreg(current->thread.debugreg7, 7); | 432 | set_debugreg(current->thread.debugreg7, 7); |
506 | 433 | ||
507 | /* Whee! Actually deliver the signal. */ | 434 | /* Whee! Actually deliver the signal. */ |
508 | if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { | 435 | if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { |
509 | /* | 436 | /* |
510 | * A signal was successfully delivered; the saved | 437 | * A signal was successfully delivered; the saved |
@@ -518,19 +445,18 @@ static void do_signal(struct pt_regs *regs) | |||
518 | } | 445 | } |
519 | 446 | ||
520 | /* Did we come from a system call? */ | 447 | /* Did we come from a system call? */ |
521 | if (current_syscall(regs) >= 0) { | 448 | if (syscall_get_nr(current, regs) >= 0) { |
522 | /* Restart the system call - no handlers present */ | 449 | /* Restart the system call - no handlers present */ |
523 | switch (current_syscall_ret(regs)) { | 450 | switch (syscall_get_error(current, regs)) { |
524 | case -ERESTARTNOHAND: | 451 | case -ERESTARTNOHAND: |
525 | case -ERESTARTSYS: | 452 | case -ERESTARTSYS: |
526 | case -ERESTARTNOINTR: | 453 | case -ERESTARTNOINTR: |
527 | regs->ax = regs->orig_ax; | 454 | regs->ax = regs->orig_ax; |
528 | regs->ip -= 2; | 455 | regs->ip -= 2; |
529 | break; | 456 | break; |
457 | |||
530 | case -ERESTART_RESTARTBLOCK: | 458 | case -ERESTART_RESTARTBLOCK: |
531 | regs->ax = test_thread_flag(TIF_IA32) ? | 459 | regs->ax = NR_restart_syscall; |
532 | __NR_ia32_restart_syscall : | ||
533 | __NR_restart_syscall; | ||
534 | regs->ip -= 2; | 460 | regs->ip -= 2; |
535 | break; | 461 | break; |
536 | } | 462 | } |
@@ -546,29 +472,45 @@ static void do_signal(struct pt_regs *regs) | |||
546 | } | 472 | } |
547 | } | 473 | } |
548 | 474 | ||
549 | void do_notify_resume(struct pt_regs *regs, void *unused, | 475 | /* |
550 | __u32 thread_info_flags) | 476 | * notification of userspace execution resumption |
477 | * - triggered by the TIF_WORK_MASK flags | ||
478 | */ | ||
479 | void | ||
480 | do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | ||
551 | { | 481 | { |
552 | #ifdef CONFIG_X86_MCE | 482 | #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) |
553 | /* notify userspace of pending MCEs */ | 483 | /* notify userspace of pending MCEs */ |
554 | if (thread_info_flags & _TIF_MCE_NOTIFY) | 484 | if (thread_info_flags & _TIF_MCE_NOTIFY) |
555 | mce_notify_user(); | 485 | mce_notify_user(); |
556 | #endif /* CONFIG_X86_MCE */ | 486 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ |
557 | 487 | ||
558 | /* deal with pending signal delivery */ | 488 | /* deal with pending signal delivery */ |
559 | if (thread_info_flags & _TIF_SIGPENDING) | 489 | if (thread_info_flags & _TIF_SIGPENDING) |
560 | do_signal(regs); | 490 | do_signal(regs); |
491 | |||
492 | if (thread_info_flags & _TIF_NOTIFY_RESUME) { | ||
493 | clear_thread_flag(TIF_NOTIFY_RESUME); | ||
494 | tracehook_notify_resume(regs); | ||
495 | } | ||
496 | |||
497 | #ifdef CONFIG_X86_32 | ||
498 | clear_thread_flag(TIF_IRET); | ||
499 | #endif /* CONFIG_X86_32 */ | ||
561 | } | 500 | } |
562 | 501 | ||
563 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) | 502 | void signal_fault(struct pt_regs *regs, void __user *frame, char *where) |
564 | { | 503 | { |
565 | struct task_struct *me = current; | 504 | struct task_struct *me = current; |
505 | |||
566 | if (show_unhandled_signals && printk_ratelimit()) { | 506 | if (show_unhandled_signals && printk_ratelimit()) { |
567 | printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", | 507 | printk(KERN_INFO |
568 | me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); | 508 | "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", |
509 | me->comm, me->pid, where, frame, | ||
510 | regs->ip, regs->sp, regs->orig_ax); | ||
569 | print_vma_addr(" in ", regs->ip); | 511 | print_vma_addr(" in ", regs->ip); |
570 | printk("\n"); | 512 | printk(KERN_CONT "\n"); |
571 | } | 513 | } |
572 | 514 | ||
573 | force_sig(SIGSEGV, me); | 515 | force_sig(SIGSEGV, me); |
574 | } | 516 | } |
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4c640c..18f9b19f5f8f 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c | |||
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) | |||
214 | struct smp_ops smp_ops = { | 214 | struct smp_ops smp_ops = { |
215 | .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, | 215 | .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, |
216 | .smp_prepare_cpus = native_smp_prepare_cpus, | 216 | .smp_prepare_cpus = native_smp_prepare_cpus, |
217 | .cpu_up = native_cpu_up, | ||
218 | .smp_cpus_done = native_smp_cpus_done, | 217 | .smp_cpus_done = native_smp_cpus_done, |
219 | 218 | ||
220 | .smp_send_stop = native_smp_send_stop, | 219 | .smp_send_stop = native_smp_send_stop, |
221 | .smp_send_reschedule = native_smp_send_reschedule, | 220 | .smp_send_reschedule = native_smp_send_reschedule, |
222 | 221 | ||
222 | .cpu_up = native_cpu_up, | ||
223 | .cpu_die = native_cpu_die, | ||
224 | .cpu_disable = native_cpu_disable, | ||
225 | .play_dead = native_play_dead, | ||
226 | |||
223 | .send_call_func_ipi = native_send_call_func_ipi, | 227 | .send_call_func_ipi = native_send_call_func_ipi, |
224 | .send_call_func_single_ipi = native_send_call_func_single_ipi, | 228 | .send_call_func_single_ipi = native_send_call_func_single_ipi, |
225 | }; | 229 | }; |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b3f916..7ed9e070a6e9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -52,6 +52,7 @@ | |||
52 | #include <asm/desc.h> | 52 | #include <asm/desc.h> |
53 | #include <asm/nmi.h> | 53 | #include <asm/nmi.h> |
54 | #include <asm/irq.h> | 54 | #include <asm/irq.h> |
55 | #include <asm/idle.h> | ||
55 | #include <asm/smp.h> | 56 | #include <asm/smp.h> |
56 | #include <asm/trampoline.h> | 57 | #include <asm/trampoline.h> |
57 | #include <asm/cpu.h> | 58 | #include <asm/cpu.h> |
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); | |||
88 | #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) | 89 | #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) |
89 | #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) | 90 | #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) |
90 | #else | 91 | #else |
91 | struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; | 92 | static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; |
92 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | 93 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) |
93 | #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) | 94 | #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) |
94 | #endif | 95 | #endif |
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); | |||
123 | 124 | ||
124 | static atomic_t init_deasserted; | 125 | static atomic_t init_deasserted; |
125 | 126 | ||
126 | static int boot_cpu_logical_apicid; | ||
127 | 127 | ||
128 | /* representing cpus for which sibling maps can be computed */ | 128 | /* representing cpus for which sibling maps can be computed */ |
129 | static cpumask_t cpu_sibling_setup_map; | 129 | static cpumask_t cpu_sibling_setup_map; |
130 | 130 | ||
131 | /* Set if we find a B stepping CPU */ | 131 | /* Set if we find a B stepping CPU */ |
132 | int __cpuinitdata smp_b_stepping; | 132 | static int __cpuinitdata smp_b_stepping; |
133 | 133 | ||
134 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) | 134 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) |
135 | 135 | ||
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu) | |||
165 | #endif | 165 | #endif |
166 | 166 | ||
167 | #ifdef CONFIG_X86_32 | 167 | #ifdef CONFIG_X86_32 |
168 | static int boot_cpu_logical_apicid; | ||
169 | |||
168 | u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = | 170 | u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = |
169 | { [0 ... NR_CPUS-1] = BAD_APICID }; | 171 | { [0 ... NR_CPUS-1] = BAD_APICID }; |
170 | 172 | ||
@@ -210,7 +212,7 @@ static void __cpuinit smp_callin(void) | |||
210 | /* | 212 | /* |
211 | * (This works even if the APIC is not enabled.) | 213 | * (This works even if the APIC is not enabled.) |
212 | */ | 214 | */ |
213 | phys_id = GET_APIC_ID(read_apic_id()); | 215 | phys_id = read_apic_id(); |
214 | cpuid = smp_processor_id(); | 216 | cpuid = smp_processor_id(); |
215 | if (cpu_isset(cpuid, cpu_callin_map)) { | 217 | if (cpu_isset(cpuid, cpu_callin_map)) { |
216 | panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, | 218 | panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, |
@@ -257,6 +259,7 @@ static void __cpuinit smp_callin(void) | |||
257 | end_local_APIC_setup(); | 259 | end_local_APIC_setup(); |
258 | map_cpu_to_logical_apicid(); | 260 | map_cpu_to_logical_apicid(); |
259 | 261 | ||
262 | notify_cpu_starting(cpuid); | ||
260 | /* | 263 | /* |
261 | * Get our bogomips. | 264 | * Get our bogomips. |
262 | * | 265 | * |
@@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void) | |||
279 | cpu_set(cpuid, cpu_callin_map); | 282 | cpu_set(cpuid, cpu_callin_map); |
280 | } | 283 | } |
281 | 284 | ||
285 | static int __cpuinitdata unsafe_smp; | ||
286 | |||
282 | /* | 287 | /* |
283 | * Activate a secondary processor. | 288 | * Activate a secondary processor. |
284 | */ | 289 | */ |
@@ -331,14 +336,17 @@ static void __cpuinit start_secondary(void *unused) | |||
331 | * does not change while we are assigning vectors to cpus. Holding | 336 | * does not change while we are assigning vectors to cpus. Holding |
332 | * this lock ensures we don't half assign or remove an irq from a cpu. | 337 | * this lock ensures we don't half assign or remove an irq from a cpu. |
333 | */ | 338 | */ |
334 | ipi_call_lock_irq(); | 339 | ipi_call_lock(); |
335 | lock_vector_lock(); | 340 | lock_vector_lock(); |
336 | __setup_vector_irq(smp_processor_id()); | 341 | __setup_vector_irq(smp_processor_id()); |
337 | cpu_set(smp_processor_id(), cpu_online_map); | 342 | cpu_set(smp_processor_id(), cpu_online_map); |
338 | unlock_vector_lock(); | 343 | unlock_vector_lock(); |
339 | ipi_call_unlock_irq(); | 344 | ipi_call_unlock(); |
340 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 345 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; |
341 | 346 | ||
347 | /* enable local interrupts */ | ||
348 | local_irq_enable(); | ||
349 | |||
342 | setup_secondary_clock(); | 350 | setup_secondary_clock(); |
343 | 351 | ||
344 | wmb(); | 352 | wmb(); |
@@ -391,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) | |||
391 | goto valid_k7; | 399 | goto valid_k7; |
392 | 400 | ||
393 | /* If we get here, not a certified SMP capable AMD system. */ | 401 | /* If we get here, not a certified SMP capable AMD system. */ |
394 | add_taint(TAINT_UNSAFE_SMP); | 402 | unsafe_smp = 1; |
395 | } | 403 | } |
396 | 404 | ||
397 | valid_k7: | 405 | valid_k7: |
@@ -408,12 +416,10 @@ static void __cpuinit smp_checks(void) | |||
408 | * Don't taint if we are running SMP kernel on a single non-MP | 416 | * Don't taint if we are running SMP kernel on a single non-MP |
409 | * approved Athlon | 417 | * approved Athlon |
410 | */ | 418 | */ |
411 | if (tainted & TAINT_UNSAFE_SMP) { | 419 | if (unsafe_smp && num_online_cpus() > 1) { |
412 | if (num_online_cpus()) | 420 | printk(KERN_INFO "WARNING: This combination of AMD" |
413 | printk(KERN_INFO "WARNING: This combination of AMD" | 421 | "processors is not suitable for SMP.\n"); |
414 | "processors is not suitable for SMP.\n"); | 422 | add_taint(TAINT_UNSAFE_SMP); |
415 | else | ||
416 | tainted &= ~TAINT_UNSAFE_SMP; | ||
417 | } | 423 | } |
418 | } | 424 | } |
419 | 425 | ||
@@ -550,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid) | |||
550 | printk(KERN_CONT | 556 | printk(KERN_CONT |
551 | "a previous APIC delivery may have failed\n"); | 557 | "a previous APIC delivery may have failed\n"); |
552 | 558 | ||
553 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); | 559 | apic_icr_write(APIC_DM_REMRD | regs[i], apicid); |
554 | apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); | ||
555 | 560 | ||
556 | timeout = 0; | 561 | timeout = 0; |
557 | do { | 562 | do { |
@@ -583,11 +588,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) | |||
583 | int maxlvt; | 588 | int maxlvt; |
584 | 589 | ||
585 | /* Target chip */ | 590 | /* Target chip */ |
586 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); | ||
587 | |||
588 | /* Boot on the stack */ | 591 | /* Boot on the stack */ |
589 | /* Kick the second */ | 592 | /* Kick the second */ |
590 | apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); | 593 | apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); |
591 | 594 | ||
592 | pr_debug("Waiting for send to finish...\n"); | 595 | pr_debug("Waiting for send to finish...\n"); |
593 | send_status = safe_apic_wait_icr_idle(); | 596 | send_status = safe_apic_wait_icr_idle(); |
@@ -596,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) | |||
596 | * Give the other CPU some time to accept the IPI. | 599 | * Give the other CPU some time to accept the IPI. |
597 | */ | 600 | */ |
598 | udelay(200); | 601 | udelay(200); |
599 | maxlvt = lapic_get_maxlvt(); | 602 | if (APIC_INTEGRATED(apic_version[phys_apicid])) { |
600 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | 603 | maxlvt = lapic_get_maxlvt(); |
601 | apic_write(APIC_ESR, 0); | 604 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
602 | accept_status = (apic_read(APIC_ESR) & 0xEF); | 605 | apic_write(APIC_ESR, 0); |
606 | accept_status = (apic_read(APIC_ESR) & 0xEF); | ||
607 | } | ||
603 | pr_debug("NMI sent.\n"); | 608 | pr_debug("NMI sent.\n"); |
604 | 609 | ||
605 | if (send_status) | 610 | if (send_status) |
@@ -640,13 +645,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | |||
640 | /* | 645 | /* |
641 | * Turn INIT on target chip | 646 | * Turn INIT on target chip |
642 | */ | 647 | */ |
643 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
644 | |||
645 | /* | 648 | /* |
646 | * Send IPI | 649 | * Send IPI |
647 | */ | 650 | */ |
648 | apic_write(APIC_ICR, | 651 | apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, |
649 | APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); | 652 | phys_apicid); |
650 | 653 | ||
651 | pr_debug("Waiting for send to finish...\n"); | 654 | pr_debug("Waiting for send to finish...\n"); |
652 | send_status = safe_apic_wait_icr_idle(); | 655 | send_status = safe_apic_wait_icr_idle(); |
@@ -656,10 +659,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | |||
656 | pr_debug("Deasserting INIT.\n"); | 659 | pr_debug("Deasserting INIT.\n"); |
657 | 660 | ||
658 | /* Target chip */ | 661 | /* Target chip */ |
659 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
660 | |||
661 | /* Send IPI */ | 662 | /* Send IPI */ |
662 | apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); | 663 | apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); |
663 | 664 | ||
664 | pr_debug("Waiting for send to finish...\n"); | 665 | pr_debug("Waiting for send to finish...\n"); |
665 | send_status = safe_apic_wait_icr_idle(); | 666 | send_status = safe_apic_wait_icr_idle(); |
@@ -702,11 +703,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) | |||
702 | */ | 703 | */ |
703 | 704 | ||
704 | /* Target chip */ | 705 | /* Target chip */ |
705 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); | ||
706 | |||
707 | /* Boot on the stack */ | 706 | /* Boot on the stack */ |
708 | /* Kick the second */ | 707 | /* Kick the second */ |
709 | apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); | 708 | apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), |
709 | phys_apicid); | ||
710 | 710 | ||
711 | /* | 711 | /* |
712 | * Give the other CPU some time to accept the IPI. | 712 | * Give the other CPU some time to accept the IPI. |
@@ -1175,10 +1175,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1175 | * Setup boot CPU information | 1175 | * Setup boot CPU information |
1176 | */ | 1176 | */ |
1177 | smp_store_cpu_info(0); /* Final full version of the data */ | 1177 | smp_store_cpu_info(0); /* Final full version of the data */ |
1178 | #ifdef CONFIG_X86_32 | ||
1178 | boot_cpu_logical_apicid = logical_smp_processor_id(); | 1179 | boot_cpu_logical_apicid = logical_smp_processor_id(); |
1180 | #endif | ||
1179 | current_thread_info()->cpu = 0; /* needed? */ | 1181 | current_thread_info()->cpu = 0; /* needed? */ |
1180 | set_cpu_sibling_map(0); | 1182 | set_cpu_sibling_map(0); |
1181 | 1183 | ||
1184 | #ifdef CONFIG_X86_64 | ||
1185 | enable_IR_x2apic(); | ||
1186 | setup_apic_routing(); | ||
1187 | #endif | ||
1188 | |||
1182 | if (smp_sanity_check(max_cpus) < 0) { | 1189 | if (smp_sanity_check(max_cpus) < 0) { |
1183 | printk(KERN_INFO "SMP disabled\n"); | 1190 | printk(KERN_INFO "SMP disabled\n"); |
1184 | disable_smp(); | 1191 | disable_smp(); |
@@ -1186,9 +1193,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1186 | } | 1193 | } |
1187 | 1194 | ||
1188 | preempt_disable(); | 1195 | preempt_disable(); |
1189 | if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { | 1196 | if (read_apic_id() != boot_cpu_physical_apicid) { |
1190 | panic("Boot APIC ID in local APIC unexpected (%d vs %d)", | 1197 | panic("Boot APIC ID in local APIC unexpected (%d vs %d)", |
1191 | GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); | 1198 | read_apic_id(), boot_cpu_physical_apicid); |
1192 | /* Or can we switch back to PIC here? */ | 1199 | /* Or can we switch back to PIC here? */ |
1193 | } | 1200 | } |
1194 | preempt_enable(); | 1201 | preempt_enable(); |
@@ -1254,39 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) | |||
1254 | check_nmi_watchdog(); | 1261 | check_nmi_watchdog(); |
1255 | } | 1262 | } |
1256 | 1263 | ||
1257 | #ifdef CONFIG_HOTPLUG_CPU | ||
1258 | |||
1259 | static void remove_siblinginfo(int cpu) | ||
1260 | { | ||
1261 | int sibling; | ||
1262 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
1263 | |||
1264 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { | ||
1265 | cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); | ||
1266 | /*/ | ||
1267 | * last thread sibling in this cpu core going down | ||
1268 | */ | ||
1269 | if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) | ||
1270 | cpu_data(sibling).booted_cores--; | ||
1271 | } | ||
1272 | |||
1273 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) | ||
1274 | cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); | ||
1275 | cpus_clear(per_cpu(cpu_sibling_map, cpu)); | ||
1276 | cpus_clear(per_cpu(cpu_core_map, cpu)); | ||
1277 | c->phys_proc_id = 0; | ||
1278 | c->cpu_core_id = 0; | ||
1279 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
1280 | } | ||
1281 | |||
1282 | static int additional_cpus __initdata = -1; | ||
1283 | |||
1284 | static __init int setup_additional_cpus(char *s) | ||
1285 | { | ||
1286 | return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; | ||
1287 | } | ||
1288 | early_param("additional_cpus", setup_additional_cpus); | ||
1289 | |||
1290 | /* | 1264 | /* |
1291 | * cpu_possible_map should be static, it cannot change as cpu's | 1265 | * cpu_possible_map should be static, it cannot change as cpu's |
1292 | * are onlined, or offlined. The reason is per-cpu data-structures | 1266 | * are onlined, or offlined. The reason is per-cpu data-structures |
@@ -1306,24 +1280,13 @@ early_param("additional_cpus", setup_additional_cpus); | |||
1306 | */ | 1280 | */ |
1307 | __init void prefill_possible_map(void) | 1281 | __init void prefill_possible_map(void) |
1308 | { | 1282 | { |
1309 | int i; | 1283 | int i, possible; |
1310 | int possible; | ||
1311 | 1284 | ||
1312 | /* no processor from mptable or madt */ | 1285 | /* no processor from mptable or madt */ |
1313 | if (!num_processors) | 1286 | if (!num_processors) |
1314 | num_processors = 1; | 1287 | num_processors = 1; |
1315 | 1288 | ||
1316 | #ifdef CONFIG_HOTPLUG_CPU | 1289 | possible = num_processors + disabled_cpus; |
1317 | if (additional_cpus == -1) { | ||
1318 | if (disabled_cpus > 0) | ||
1319 | additional_cpus = disabled_cpus; | ||
1320 | else | ||
1321 | additional_cpus = 0; | ||
1322 | } | ||
1323 | #else | ||
1324 | additional_cpus = 0; | ||
1325 | #endif | ||
1326 | possible = num_processors + additional_cpus; | ||
1327 | if (possible > NR_CPUS) | 1290 | if (possible > NR_CPUS) |
1328 | possible = NR_CPUS; | 1291 | possible = NR_CPUS; |
1329 | 1292 | ||
@@ -1336,6 +1299,31 @@ __init void prefill_possible_map(void) | |||
1336 | nr_cpu_ids = possible; | 1299 | nr_cpu_ids = possible; |
1337 | } | 1300 | } |
1338 | 1301 | ||
1302 | #ifdef CONFIG_HOTPLUG_CPU | ||
1303 | |||
1304 | static void remove_siblinginfo(int cpu) | ||
1305 | { | ||
1306 | int sibling; | ||
1307 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
1308 | |||
1309 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { | ||
1310 | cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); | ||
1311 | /*/ | ||
1312 | * last thread sibling in this cpu core going down | ||
1313 | */ | ||
1314 | if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) | ||
1315 | cpu_data(sibling).booted_cores--; | ||
1316 | } | ||
1317 | |||
1318 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) | ||
1319 | cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); | ||
1320 | cpus_clear(per_cpu(cpu_sibling_map, cpu)); | ||
1321 | cpus_clear(per_cpu(cpu_core_map, cpu)); | ||
1322 | c->phys_proc_id = 0; | ||
1323 | c->cpu_core_id = 0; | ||
1324 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
1325 | } | ||
1326 | |||
1339 | static void __ref remove_cpu_from_maps(int cpu) | 1327 | static void __ref remove_cpu_from_maps(int cpu) |
1340 | { | 1328 | { |
1341 | cpu_clear(cpu, cpu_online_map); | 1329 | cpu_clear(cpu, cpu_online_map); |
@@ -1346,25 +1334,9 @@ static void __ref remove_cpu_from_maps(int cpu) | |||
1346 | numa_remove_cpu(cpu); | 1334 | numa_remove_cpu(cpu); |
1347 | } | 1335 | } |
1348 | 1336 | ||
1349 | int __cpu_disable(void) | 1337 | void cpu_disable_common(void) |
1350 | { | 1338 | { |
1351 | int cpu = smp_processor_id(); | 1339 | int cpu = smp_processor_id(); |
1352 | |||
1353 | /* | ||
1354 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1355 | * into generic code. | ||
1356 | * | ||
1357 | * We won't take down the boot processor on i386 due to some | ||
1358 | * interrupts only being able to be serviced by the BSP. | ||
1359 | * Especially so if we're not using an IOAPIC -zwane | ||
1360 | */ | ||
1361 | if (cpu == 0) | ||
1362 | return -EBUSY; | ||
1363 | |||
1364 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1365 | stop_apic_nmi_watchdog(NULL); | ||
1366 | clear_local_APIC(); | ||
1367 | |||
1368 | /* | 1340 | /* |
1369 | * HACK: | 1341 | * HACK: |
1370 | * Allow any queued timer interrupts to get serviced | 1342 | * Allow any queued timer interrupts to get serviced |
@@ -1382,10 +1354,32 @@ int __cpu_disable(void) | |||
1382 | remove_cpu_from_maps(cpu); | 1354 | remove_cpu_from_maps(cpu); |
1383 | unlock_vector_lock(); | 1355 | unlock_vector_lock(); |
1384 | fixup_irqs(cpu_online_map); | 1356 | fixup_irqs(cpu_online_map); |
1357 | } | ||
1358 | |||
1359 | int native_cpu_disable(void) | ||
1360 | { | ||
1361 | int cpu = smp_processor_id(); | ||
1362 | |||
1363 | /* | ||
1364 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1365 | * into generic code. | ||
1366 | * | ||
1367 | * We won't take down the boot processor on i386 due to some | ||
1368 | * interrupts only being able to be serviced by the BSP. | ||
1369 | * Especially so if we're not using an IOAPIC -zwane | ||
1370 | */ | ||
1371 | if (cpu == 0) | ||
1372 | return -EBUSY; | ||
1373 | |||
1374 | if (nmi_watchdog == NMI_LOCAL_APIC) | ||
1375 | stop_apic_nmi_watchdog(NULL); | ||
1376 | clear_local_APIC(); | ||
1377 | |||
1378 | cpu_disable_common(); | ||
1385 | return 0; | 1379 | return 0; |
1386 | } | 1380 | } |
1387 | 1381 | ||
1388 | void __cpu_die(unsigned int cpu) | 1382 | void native_cpu_die(unsigned int cpu) |
1389 | { | 1383 | { |
1390 | /* We don't do anything here: idle task is faking death itself. */ | 1384 | /* We don't do anything here: idle task is faking death itself. */ |
1391 | unsigned int i; | 1385 | unsigned int i; |
@@ -1402,15 +1396,45 @@ void __cpu_die(unsigned int cpu) | |||
1402 | } | 1396 | } |
1403 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | 1397 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); |
1404 | } | 1398 | } |
1399 | |||
1400 | void play_dead_common(void) | ||
1401 | { | ||
1402 | idle_task_exit(); | ||
1403 | reset_lazy_tlbstate(); | ||
1404 | irq_ctx_exit(raw_smp_processor_id()); | ||
1405 | c1e_remove_cpu(raw_smp_processor_id()); | ||
1406 | |||
1407 | mb(); | ||
1408 | /* Ack it */ | ||
1409 | __get_cpu_var(cpu_state) = CPU_DEAD; | ||
1410 | |||
1411 | /* | ||
1412 | * With physical CPU hotplug, we should halt the cpu | ||
1413 | */ | ||
1414 | local_irq_disable(); | ||
1415 | } | ||
1416 | |||
1417 | void native_play_dead(void) | ||
1418 | { | ||
1419 | play_dead_common(); | ||
1420 | wbinvd_halt(); | ||
1421 | } | ||
1422 | |||
1405 | #else /* ... !CONFIG_HOTPLUG_CPU */ | 1423 | #else /* ... !CONFIG_HOTPLUG_CPU */ |
1406 | int __cpu_disable(void) | 1424 | int native_cpu_disable(void) |
1407 | { | 1425 | { |
1408 | return -ENOSYS; | 1426 | return -ENOSYS; |
1409 | } | 1427 | } |
1410 | 1428 | ||
1411 | void __cpu_die(unsigned int cpu) | 1429 | void native_cpu_die(unsigned int cpu) |
1412 | { | 1430 | { |
1413 | /* We said "no" in __cpu_disable */ | 1431 | /* We said "no" in __cpu_disable */ |
1414 | BUG(); | 1432 | BUG(); |
1415 | } | 1433 | } |
1434 | |||
1435 | void native_play_dead(void) | ||
1436 | { | ||
1437 | BUG(); | ||
1438 | } | ||
1439 | |||
1416 | #endif | 1440 | #endif |
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index d67ce5f044ba..7b987852e876 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c | |||
@@ -30,7 +30,7 @@ | |||
30 | #include <linux/init.h> | 30 | #include <linux/init.h> |
31 | #include <asm/io.h> | 31 | #include <asm/io.h> |
32 | #include <asm/bios_ebda.h> | 32 | #include <asm/bios_ebda.h> |
33 | #include <asm/mach-summit/mach_mpparse.h> | 33 | #include <asm/summit/mpparse.h> |
34 | 34 | ||
35 | static struct rio_table_hdr *rio_table_hdr __initdata; | 35 | static struct rio_table_hdr *rio_table_hdr __initdata; |
36 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; | 36 | static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; |
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7066cb855a60..1884a8d12bfa 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <linux/unistd.h> | 23 | #include <linux/unistd.h> |
24 | 24 | ||
25 | #include <asm/syscalls.h> | ||
26 | |||
25 | asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, | 27 | asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, |
26 | unsigned long prot, unsigned long flags, | 28 | unsigned long prot, unsigned long flags, |
27 | unsigned long fd, unsigned long pgoff) | 29 | unsigned long fd, unsigned long pgoff) |
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 3b360ef33817..6bc211accf08 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c | |||
@@ -13,15 +13,17 @@ | |||
13 | #include <linux/utsname.h> | 13 | #include <linux/utsname.h> |
14 | #include <linux/personality.h> | 14 | #include <linux/personality.h> |
15 | #include <linux/random.h> | 15 | #include <linux/random.h> |
16 | #include <linux/uaccess.h> | ||
16 | 17 | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/ia32.h> | 18 | #include <asm/ia32.h> |
19 | #include <asm/syscalls.h> | ||
19 | 20 | ||
20 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, | 21 | asmlinkage long sys_mmap(unsigned long addr, unsigned long len, |
21 | unsigned long fd, unsigned long off) | 22 | unsigned long prot, unsigned long flags, |
23 | unsigned long fd, unsigned long off) | ||
22 | { | 24 | { |
23 | long error; | 25 | long error; |
24 | struct file * file; | 26 | struct file *file; |
25 | 27 | ||
26 | error = -EINVAL; | 28 | error = -EINVAL; |
27 | if (off & ~PAGE_MASK) | 29 | if (off & ~PAGE_MASK) |
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, | |||
56 | unmapped base down for this case. This can give | 58 | unmapped base down for this case. This can give |
57 | conflicts with the heap, but we assume that glibc | 59 | conflicts with the heap, but we assume that glibc |
58 | malloc knows how to fall back to mmap. Give it 1GB | 60 | malloc knows how to fall back to mmap. Give it 1GB |
59 | of playground for now. -AK */ | 61 | of playground for now. -AK */ |
60 | *begin = 0x40000000; | 62 | *begin = 0x40000000; |
61 | *end = 0x80000000; | 63 | *end = 0x80000000; |
62 | if (current->flags & PF_RANDOMIZE) { | 64 | if (current->flags & PF_RANDOMIZE) { |
63 | new_begin = randomize_range(*begin, *begin + 0x02000000, 0); | 65 | new_begin = randomize_range(*begin, *begin + 0x02000000, 0); |
64 | if (new_begin) | 66 | if (new_begin) |
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, | |||
66 | } | 68 | } |
67 | } else { | 69 | } else { |
68 | *begin = TASK_UNMAPPED_BASE; | 70 | *begin = TASK_UNMAPPED_BASE; |
69 | *end = TASK_SIZE; | 71 | *end = TASK_SIZE; |
70 | } | 72 | } |
71 | } | 73 | } |
72 | 74 | ||
73 | unsigned long | 75 | unsigned long |
74 | arch_get_unmapped_area(struct file *filp, unsigned long addr, | 76 | arch_get_unmapped_area(struct file *filp, unsigned long addr, |
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
78 | struct vm_area_struct *vma; | 80 | struct vm_area_struct *vma; |
79 | unsigned long start_addr; | 81 | unsigned long start_addr; |
80 | unsigned long begin, end; | 82 | unsigned long begin, end; |
81 | 83 | ||
82 | if (flags & MAP_FIXED) | 84 | if (flags & MAP_FIXED) |
83 | return addr; | 85 | return addr; |
84 | 86 | ||
85 | find_start_end(flags, &begin, &end); | 87 | find_start_end(flags, &begin, &end); |
86 | 88 | ||
87 | if (len > end) | 89 | if (len > end) |
88 | return -ENOMEM; | 90 | return -ENOMEM; |
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
96 | } | 98 | } |
97 | if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) | 99 | if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) |
98 | && len <= mm->cached_hole_size) { | 100 | && len <= mm->cached_hole_size) { |
99 | mm->cached_hole_size = 0; | 101 | mm->cached_hole_size = 0; |
100 | mm->free_area_cache = begin; | 102 | mm->free_area_cache = begin; |
101 | } | 103 | } |
102 | addr = mm->free_area_cache; | 104 | addr = mm->free_area_cache; |
103 | if (addr < begin) | 105 | if (addr < begin) |
104 | addr = begin; | 106 | addr = begin; |
105 | start_addr = addr; | 107 | start_addr = addr; |
106 | 108 | ||
107 | full_search: | 109 | full_search: |
@@ -127,7 +129,7 @@ full_search: | |||
127 | return addr; | 129 | return addr; |
128 | } | 130 | } |
129 | if (addr + mm->cached_hole_size < vma->vm_start) | 131 | if (addr + mm->cached_hole_size < vma->vm_start) |
130 | mm->cached_hole_size = vma->vm_start - addr; | 132 | mm->cached_hole_size = vma->vm_start - addr; |
131 | 133 | ||
132 | addr = vma->vm_end; | 134 | addr = vma->vm_end; |
133 | } | 135 | } |
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
177 | vma = find_vma(mm, addr-len); | 179 | vma = find_vma(mm, addr-len); |
178 | if (!vma || addr <= vma->vm_start) | 180 | if (!vma || addr <= vma->vm_start) |
179 | /* remember the address as a hint for next time */ | 181 | /* remember the address as a hint for next time */ |
180 | return (mm->free_area_cache = addr-len); | 182 | return mm->free_area_cache = addr-len; |
181 | } | 183 | } |
182 | 184 | ||
183 | if (mm->mmap_base < len) | 185 | if (mm->mmap_base < len) |
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
194 | vma = find_vma(mm, addr); | 196 | vma = find_vma(mm, addr); |
195 | if (!vma || addr+len <= vma->vm_start) | 197 | if (!vma || addr+len <= vma->vm_start) |
196 | /* remember the address as a hint for next time */ | 198 | /* remember the address as a hint for next time */ |
197 | return (mm->free_area_cache = addr); | 199 | return mm->free_area_cache = addr; |
198 | 200 | ||
199 | /* remember the largest hole we saw so far */ | 201 | /* remember the largest hole we saw so far */ |
200 | if (addr + mm->cached_hole_size < vma->vm_start) | 202 | if (addr + mm->cached_hole_size < vma->vm_start) |
@@ -224,13 +226,13 @@ bottomup: | |||
224 | } | 226 | } |
225 | 227 | ||
226 | 228 | ||
227 | asmlinkage long sys_uname(struct new_utsname __user * name) | 229 | asmlinkage long sys_uname(struct new_utsname __user *name) |
228 | { | 230 | { |
229 | int err; | 231 | int err; |
230 | down_read(&uts_sem); | 232 | down_read(&uts_sem); |
231 | err = copy_to_user(name, utsname(), sizeof (*name)); | 233 | err = copy_to_user(name, utsname(), sizeof(*name)); |
232 | up_read(&uts_sem); | 234 | up_read(&uts_sem); |
233 | if (personality(current->personality) == PER_LINUX32) | 235 | if (personality(current->personality) == PER_LINUX32) |
234 | err |= copy_to_user(&name->machine, "i686", 5); | 236 | err |= copy_to_user(&name->machine, "i686", 5); |
235 | return err ? -EFAULT : 0; | 237 | return err ? -EFAULT : 0; |
236 | } | 238 | } |
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 170d43c17487..3d1be4f0fac5 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c | |||
@@ -8,12 +8,12 @@ | |||
8 | #define __NO_STUBS | 8 | #define __NO_STUBS |
9 | 9 | ||
10 | #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; | 10 | #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; |
11 | #undef _ASM_X86_64_UNISTD_H_ | 11 | #undef ASM_X86__UNISTD_64_H |
12 | #include <asm/unistd_64.h> | 12 | #include <asm/unistd_64.h> |
13 | 13 | ||
14 | #undef __SYSCALL | 14 | #undef __SYSCALL |
15 | #define __SYSCALL(nr, sym) [nr] = sym, | 15 | #define __SYSCALL(nr, sym) [nr] = sym, |
16 | #undef _ASM_X86_64_UNISTD_H_ | 16 | #undef ASM_X86__UNISTD_64_H |
17 | 17 | ||
18 | typedef void (*sys_call_ptr_t)(void); | 18 | typedef void (*sys_call_ptr_t)(void); |
19 | 19 | ||
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ffe3c664afc0..77b400f06ea2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <asm/arch_hooks.h> | 36 | #include <asm/arch_hooks.h> |
37 | #include <asm/hpet.h> | 37 | #include <asm/hpet.h> |
38 | #include <asm/time.h> | 38 | #include <asm/time.h> |
39 | #include <asm/timer.h> | ||
39 | 40 | ||
40 | #include "do_timer.h" | 41 | #include "do_timer.h" |
41 | 42 | ||
@@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
46 | unsigned long pc = instruction_pointer(regs); | 47 | unsigned long pc = instruction_pointer(regs); |
47 | 48 | ||
48 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
49 | if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && | 50 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { |
50 | in_lock_functions(pc)) { | ||
51 | #ifdef CONFIG_FRAME_POINTER | 51 | #ifdef CONFIG_FRAME_POINTER |
52 | return *(unsigned long *)(regs->bp + 4); | 52 | return *(unsigned long *)(regs->bp + sizeof(long)); |
53 | #else | 53 | #else |
54 | unsigned long *sp = (unsigned long *)&regs->sp; | 54 | unsigned long *sp = (unsigned long *)&regs->sp; |
55 | 55 | ||
@@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
94 | 94 | ||
95 | do_timer_interrupt_hook(); | 95 | do_timer_interrupt_hook(); |
96 | 96 | ||
97 | #ifdef CONFIG_MCA | ||
97 | if (MCA_bus) { | 98 | if (MCA_bus) { |
98 | /* The PS/2 uses level-triggered interrupts. You can't | 99 | /* The PS/2 uses level-triggered interrupts. You can't |
99 | turn them off, nor would you want to (any attempt to | 100 | turn them off, nor would you want to (any attempt to |
@@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
107 | u8 irq_v = inb_p( 0x61 ); /* read the current state */ | 108 | u8 irq_v = inb_p( 0x61 ); /* read the current state */ |
108 | outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ | 109 | outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ |
109 | } | 110 | } |
111 | #endif | ||
110 | 112 | ||
111 | return IRQ_HANDLED; | 113 | return IRQ_HANDLED; |
112 | } | 114 | } |
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e3d49c553af2..cb19d650c216 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/time.h> | 18 | #include <linux/time.h> |
19 | #include <linux/mca.h> | ||
19 | 20 | ||
20 | #include <asm/i8253.h> | 21 | #include <asm/i8253.h> |
21 | #include <asm/hpet.h> | 22 | #include <asm/hpet.h> |
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
33 | /* Assume the lock function has either no stack frame or a copy | 34 | /* Assume the lock function has either no stack frame or a copy |
34 | of flags from PUSHF | 35 | of flags from PUSHF |
35 | Eflags always has bits 22 and up cleared unlike kernel addresses. */ | 36 | Eflags always has bits 22 and up cleared unlike kernel addresses. */ |
36 | if (!user_mode(regs) && in_lock_functions(pc)) { | 37 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { |
38 | #ifdef CONFIG_FRAME_POINTER | ||
39 | return *(unsigned long *)(regs->bp + sizeof(long)); | ||
40 | #else | ||
37 | unsigned long *sp = (unsigned long *)regs->sp; | 41 | unsigned long *sp = (unsigned long *)regs->sp; |
38 | if (sp[0] >> 22) | 42 | if (sp[0] >> 22) |
39 | return sp[0]; | 43 | return sp[0]; |
40 | if (sp[1] >> 22) | 44 | if (sp[1] >> 22) |
41 | return sp[1]; | 45 | return sp[1]; |
46 | #endif | ||
42 | } | 47 | } |
43 | return pc; | 48 | return pc; |
44 | } | 49 | } |
45 | EXPORT_SYMBOL(profile_pc); | 50 | EXPORT_SYMBOL(profile_pc); |
46 | 51 | ||
47 | static irqreturn_t timer_event_interrupt(int irq, void *dev_id) | 52 | irqreturn_t timer_interrupt(int irq, void *dev_id) |
48 | { | 53 | { |
49 | add_pda(irq0_irqs, 1); | 54 | add_pda(irq0_irqs, 1); |
50 | 55 | ||
51 | global_clock_event->event_handler(global_clock_event); | 56 | global_clock_event->event_handler(global_clock_event); |
52 | 57 | ||
58 | #ifdef CONFIG_MCA | ||
59 | if (MCA_bus) { | ||
60 | u8 irq_v = inb_p(0x61); /* read the current state */ | ||
61 | outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ | ||
62 | } | ||
63 | #endif | ||
64 | |||
53 | return IRQ_HANDLED; | 65 | return IRQ_HANDLED; |
54 | } | 66 | } |
55 | 67 | ||
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void) | |||
100 | } | 112 | } |
101 | 113 | ||
102 | static struct irqaction irq0 = { | 114 | static struct irqaction irq0 = { |
103 | .handler = timer_event_interrupt, | 115 | .handler = timer_interrupt, |
104 | .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, | 116 | .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, |
105 | .mask = CPU_MASK_NONE, | 117 | .mask = CPU_MASK_NONE, |
106 | .name = "timer" | 118 | .name = "timer" |
@@ -111,16 +123,13 @@ void __init hpet_time_init(void) | |||
111 | if (!hpet_enable()) | 123 | if (!hpet_enable()) |
112 | setup_pit_timer(); | 124 | setup_pit_timer(); |
113 | 125 | ||
126 | irq0.mask = cpumask_of_cpu(0); | ||
114 | setup_irq(0, &irq0); | 127 | setup_irq(0, &irq0); |
115 | } | 128 | } |
116 | 129 | ||
117 | void __init time_init(void) | 130 | void __init time_init(void) |
118 | { | 131 | { |
119 | tsc_init(); | 132 | tsc_init(); |
120 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
121 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
122 | else | ||
123 | vgetcpu_mode = VGETCPU_LSL; | ||
124 | 133 | ||
125 | late_time_init = choose_time_init(); | 134 | late_time_init = choose_time_init(); |
126 | } | 135 | } |
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ecedc9b7..e00534b33534 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c | |||
@@ -241,3 +241,11 @@ void flush_tlb_all(void) | |||
241 | on_each_cpu(do_flush_tlb_all, NULL, 1); | 241 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
242 | } | 242 | } |
243 | 243 | ||
244 | void reset_lazy_tlbstate(void) | ||
245 | { | ||
246 | int cpu = raw_smp_processor_id(); | ||
247 | |||
248 | per_cpu(cpu_tlbstate, cpu).state = 0; | ||
249 | per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; | ||
250 | } | ||
251 | |||
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index ab6bf375a307..6bb7b8579e70 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <asm/ldt.h> | 10 | #include <asm/ldt.h> |
11 | #include <asm/processor.h> | 11 | #include <asm/processor.h> |
12 | #include <asm/proto.h> | 12 | #include <asm/proto.h> |
13 | #include <asm/syscalls.h> | ||
13 | 14 | ||
14 | #include "tls.h" | 15 | #include "tls.h" |
15 | 16 | ||
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c index 03df8e45e5a1..e062974cce34 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps.c | |||
@@ -7,13 +7,11 @@ | |||
7 | */ | 7 | */ |
8 | 8 | ||
9 | /* | 9 | /* |
10 | * 'Traps.c' handles hardware traps and faults after we have saved some | 10 | * Handle hardware traps and faults. |
11 | * state in 'asm.s'. | ||
12 | */ | 11 | */ |
13 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
14 | #include <linux/kallsyms.h> | 13 | #include <linux/kallsyms.h> |
15 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
16 | #include <linux/highmem.h> | ||
17 | #include <linux/kprobes.h> | 15 | #include <linux/kprobes.h> |
18 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
19 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
@@ -32,6 +30,8 @@ | |||
32 | #include <linux/bug.h> | 30 | #include <linux/bug.h> |
33 | #include <linux/nmi.h> | 31 | #include <linux/nmi.h> |
34 | #include <linux/mm.h> | 32 | #include <linux/mm.h> |
33 | #include <linux/smp.h> | ||
34 | #include <linux/io.h> | ||
35 | 35 | ||
36 | #ifdef CONFIG_EISA | 36 | #ifdef CONFIG_EISA |
37 | #include <linux/ioport.h> | 37 | #include <linux/ioport.h> |
@@ -46,21 +46,31 @@ | |||
46 | #include <linux/edac.h> | 46 | #include <linux/edac.h> |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | #include <asm/arch_hooks.h> | ||
50 | #include <asm/stacktrace.h> | 49 | #include <asm/stacktrace.h> |
51 | #include <asm/processor.h> | 50 | #include <asm/processor.h> |
52 | #include <asm/debugreg.h> | 51 | #include <asm/debugreg.h> |
53 | #include <asm/atomic.h> | 52 | #include <asm/atomic.h> |
54 | #include <asm/system.h> | 53 | #include <asm/system.h> |
55 | #include <asm/unwind.h> | 54 | #include <asm/unwind.h> |
55 | #include <asm/traps.h> | ||
56 | #include <asm/desc.h> | 56 | #include <asm/desc.h> |
57 | #include <asm/i387.h> | 57 | #include <asm/i387.h> |
58 | |||
59 | #include <mach_traps.h> | ||
60 | |||
61 | #ifdef CONFIG_X86_64 | ||
62 | #include <asm/pgalloc.h> | ||
63 | #include <asm/proto.h> | ||
64 | #include <asm/pda.h> | ||
65 | #else | ||
66 | #include <asm/processor-flags.h> | ||
67 | #include <asm/arch_hooks.h> | ||
58 | #include <asm/nmi.h> | 68 | #include <asm/nmi.h> |
59 | #include <asm/smp.h> | 69 | #include <asm/smp.h> |
60 | #include <asm/io.h> | 70 | #include <asm/io.h> |
61 | #include <asm/traps.h> | 71 | #include <asm/traps.h> |
62 | 72 | ||
63 | #include "mach_traps.h" | 73 | #include "cpu/mcheck/mce.h" |
64 | 74 | ||
65 | DECLARE_BITMAP(used_vectors, NR_VECTORS); | 75 | DECLARE_BITMAP(used_vectors, NR_VECTORS); |
66 | EXPORT_SYMBOL_GPL(used_vectors); | 76 | EXPORT_SYMBOL_GPL(used_vectors); |
@@ -77,418 +87,104 @@ char ignore_fpu_irq; | |||
77 | */ | 87 | */ |
78 | gate_desc idt_table[256] | 88 | gate_desc idt_table[256] |
79 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; | 89 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; |
80 | |||
81 | int panic_on_unrecovered_nmi; | ||
82 | int kstack_depth_to_print = 24; | ||
83 | static unsigned int code_bytes = 64; | ||
84 | static int ignore_nmis; | ||
85 | static int die_counter; | ||
86 | |||
87 | void printk_address(unsigned long address, int reliable) | ||
88 | { | ||
89 | #ifdef CONFIG_KALLSYMS | ||
90 | unsigned long offset = 0; | ||
91 | unsigned long symsize; | ||
92 | const char *symname; | ||
93 | char *modname; | ||
94 | char *delim = ":"; | ||
95 | char namebuf[KSYM_NAME_LEN]; | ||
96 | char reliab[4] = ""; | ||
97 | |||
98 | symname = kallsyms_lookup(address, &symsize, &offset, | ||
99 | &modname, namebuf); | ||
100 | if (!symname) { | ||
101 | printk(" [<%08lx>]\n", address); | ||
102 | return; | ||
103 | } | ||
104 | if (!reliable) | ||
105 | strcpy(reliab, "? "); | ||
106 | |||
107 | if (!modname) | ||
108 | modname = delim = ""; | ||
109 | printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", | ||
110 | address, reliab, delim, modname, delim, symname, offset, symsize); | ||
111 | #else | ||
112 | printk(" [<%08lx>]\n", address); | ||
113 | #endif | 90 | #endif |
114 | } | ||
115 | |||
116 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
117 | void *p, unsigned int size) | ||
118 | { | ||
119 | void *t = tinfo; | ||
120 | return p > t && p <= t + THREAD_SIZE - size; | ||
121 | } | ||
122 | |||
123 | /* The form of the top of the frame on the stack */ | ||
124 | struct stack_frame { | ||
125 | struct stack_frame *next_frame; | ||
126 | unsigned long return_address; | ||
127 | }; | ||
128 | |||
129 | static inline unsigned long | ||
130 | print_context_stack(struct thread_info *tinfo, | ||
131 | unsigned long *stack, unsigned long bp, | ||
132 | const struct stacktrace_ops *ops, void *data) | ||
133 | { | ||
134 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
135 | |||
136 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { | ||
137 | unsigned long addr; | ||
138 | |||
139 | addr = *stack; | ||
140 | if (__kernel_text_address(addr)) { | ||
141 | if ((unsigned long) stack == bp + 4) { | ||
142 | ops->address(data, addr, 1); | ||
143 | frame = frame->next_frame; | ||
144 | bp = (unsigned long) frame; | ||
145 | } else { | ||
146 | ops->address(data, addr, bp == 0); | ||
147 | } | ||
148 | } | ||
149 | stack++; | ||
150 | } | ||
151 | return bp; | ||
152 | } | ||
153 | |||
154 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
155 | unsigned long *stack, unsigned long bp, | ||
156 | const struct stacktrace_ops *ops, void *data) | ||
157 | { | ||
158 | if (!task) | ||
159 | task = current; | ||
160 | |||
161 | if (!stack) { | ||
162 | unsigned long dummy; | ||
163 | stack = &dummy; | ||
164 | if (task != current) | ||
165 | stack = (unsigned long *)task->thread.sp; | ||
166 | } | ||
167 | |||
168 | #ifdef CONFIG_FRAME_POINTER | ||
169 | if (!bp) { | ||
170 | if (task == current) { | ||
171 | /* Grab bp right from our regs */ | ||
172 | asm("movl %%ebp, %0" : "=r" (bp) :); | ||
173 | } else { | ||
174 | /* bp is the last reg pushed by switch_to */ | ||
175 | bp = *(unsigned long *) task->thread.sp; | ||
176 | } | ||
177 | } | ||
178 | #endif | ||
179 | |||
180 | for (;;) { | ||
181 | struct thread_info *context; | ||
182 | |||
183 | context = (struct thread_info *) | ||
184 | ((unsigned long)stack & (~(THREAD_SIZE - 1))); | ||
185 | bp = print_context_stack(context, stack, bp, ops, data); | ||
186 | /* | ||
187 | * Should be after the line below, but somewhere | ||
188 | * in early boot context comes out corrupted and we | ||
189 | * can't reference it: | ||
190 | */ | ||
191 | if (ops->stack(data, "IRQ") < 0) | ||
192 | break; | ||
193 | stack = (unsigned long *)context->previous_esp; | ||
194 | if (!stack) | ||
195 | break; | ||
196 | touch_nmi_watchdog(); | ||
197 | } | ||
198 | } | ||
199 | EXPORT_SYMBOL(dump_trace); | ||
200 | |||
201 | static void | ||
202 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
203 | { | ||
204 | printk(data); | ||
205 | print_symbol(msg, symbol); | ||
206 | printk("\n"); | ||
207 | } | ||
208 | |||
209 | static void print_trace_warning(void *data, char *msg) | ||
210 | { | ||
211 | printk("%s%s\n", (char *)data, msg); | ||
212 | } | ||
213 | 91 | ||
214 | static int print_trace_stack(void *data, char *name) | 92 | static int ignore_nmis; |
215 | { | ||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Print one address/symbol entries per line. | ||
221 | */ | ||
222 | static void print_trace_address(void *data, unsigned long addr, int reliable) | ||
223 | { | ||
224 | printk("%s [<%08lx>] ", (char *)data, addr); | ||
225 | if (!reliable) | ||
226 | printk("? "); | ||
227 | print_symbol("%s\n", addr); | ||
228 | touch_nmi_watchdog(); | ||
229 | } | ||
230 | |||
231 | static const struct stacktrace_ops print_trace_ops = { | ||
232 | .warning = print_trace_warning, | ||
233 | .warning_symbol = print_trace_warning_symbol, | ||
234 | .stack = print_trace_stack, | ||
235 | .address = print_trace_address, | ||
236 | }; | ||
237 | 93 | ||
238 | static void | 94 | static inline void conditional_sti(struct pt_regs *regs) |
239 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
240 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
241 | { | 95 | { |
242 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | 96 | if (regs->flags & X86_EFLAGS_IF) |
243 | printk("%s =======================\n", log_lvl); | 97 | local_irq_enable(); |
244 | } | 98 | } |
245 | 99 | ||
246 | void show_trace(struct task_struct *task, struct pt_regs *regs, | 100 | static inline void preempt_conditional_sti(struct pt_regs *regs) |
247 | unsigned long *stack, unsigned long bp) | ||
248 | { | 101 | { |
249 | show_trace_log_lvl(task, regs, stack, bp, ""); | 102 | inc_preempt_count(); |
103 | if (regs->flags & X86_EFLAGS_IF) | ||
104 | local_irq_enable(); | ||
250 | } | 105 | } |
251 | 106 | ||
252 | static void | 107 | static inline void preempt_conditional_cli(struct pt_regs *regs) |
253 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
254 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
255 | { | 108 | { |
256 | unsigned long *stack; | 109 | if (regs->flags & X86_EFLAGS_IF) |
257 | int i; | 110 | local_irq_disable(); |
258 | 111 | dec_preempt_count(); | |
259 | if (sp == NULL) { | ||
260 | if (task) | ||
261 | sp = (unsigned long *)task->thread.sp; | ||
262 | else | ||
263 | sp = (unsigned long *)&sp; | ||
264 | } | ||
265 | |||
266 | stack = sp; | ||
267 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
268 | if (kstack_end(stack)) | ||
269 | break; | ||
270 | if (i && ((i % 8) == 0)) | ||
271 | printk("\n%s ", log_lvl); | ||
272 | printk("%08lx ", *stack++); | ||
273 | } | ||
274 | printk("\n%sCall Trace:\n", log_lvl); | ||
275 | |||
276 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
277 | } | 112 | } |
278 | 113 | ||
279 | void show_stack(struct task_struct *task, unsigned long *sp) | 114 | #ifdef CONFIG_X86_32 |
115 | static inline void | ||
116 | die_if_kernel(const char *str, struct pt_regs *regs, long err) | ||
280 | { | 117 | { |
281 | printk(" "); | 118 | if (!user_mode_vm(regs)) |
282 | show_stack_log_lvl(task, NULL, sp, 0, ""); | 119 | die(str, regs, err); |
283 | } | 120 | } |
284 | 121 | ||
285 | /* | 122 | /* |
286 | * The architecture-independent dump_stack generator | 123 | * Perform the lazy TSS's I/O bitmap copy. If the TSS has an |
124 | * invalid offset set (the LAZY one) and the faulting thread has | ||
125 | * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, | ||
126 | * we set the offset field correctly and return 1. | ||
287 | */ | 127 | */ |
288 | void dump_stack(void) | 128 | static int lazy_iobitmap_copy(void) |
289 | { | 129 | { |
290 | unsigned long bp = 0; | 130 | struct thread_struct *thread; |
291 | unsigned long stack; | 131 | struct tss_struct *tss; |
292 | 132 | int cpu; | |
293 | #ifdef CONFIG_FRAME_POINTER | ||
294 | if (!bp) | ||
295 | asm("movl %%ebp, %0" : "=r" (bp):); | ||
296 | #endif | ||
297 | |||
298 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
299 | current->pid, current->comm, print_tainted(), | ||
300 | init_utsname()->release, | ||
301 | (int)strcspn(init_utsname()->version, " "), | ||
302 | init_utsname()->version); | ||
303 | |||
304 | show_trace(current, NULL, &stack, bp); | ||
305 | } | ||
306 | |||
307 | EXPORT_SYMBOL(dump_stack); | ||
308 | |||
309 | void show_registers(struct pt_regs *regs) | ||
310 | { | ||
311 | int i; | ||
312 | 133 | ||
313 | print_modules(); | 134 | cpu = get_cpu(); |
314 | __show_registers(regs, 0); | 135 | tss = &per_cpu(init_tss, cpu); |
136 | thread = &current->thread; | ||
315 | 137 | ||
316 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", | 138 | if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && |
317 | TASK_COMM_LEN, current->comm, task_pid_nr(current), | 139 | thread->io_bitmap_ptr) { |
318 | current_thread_info(), current, task_thread_info(current)); | 140 | memcpy(tss->io_bitmap, thread->io_bitmap_ptr, |
319 | /* | 141 | thread->io_bitmap_max); |
320 | * When in-kernel, we also print out the stack and code at the | 142 | /* |
321 | * time of the fault.. | 143 | * If the previously set map was extending to higher ports |
322 | */ | 144 | * than the current one, pad extra space with 0xff (no access). |
323 | if (!user_mode_vm(regs)) { | 145 | */ |
324 | unsigned int code_prologue = code_bytes * 43 / 64; | 146 | if (thread->io_bitmap_max < tss->io_bitmap_max) { |
325 | unsigned int code_len = code_bytes; | 147 | memset((char *) tss->io_bitmap + |
326 | unsigned char c; | 148 | thread->io_bitmap_max, 0xff, |
327 | u8 *ip; | 149 | tss->io_bitmap_max - thread->io_bitmap_max); |
328 | |||
329 | printk("\n" KERN_EMERG "Stack: "); | ||
330 | show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); | ||
331 | |||
332 | printk(KERN_EMERG "Code: "); | ||
333 | |||
334 | ip = (u8 *)regs->ip - code_prologue; | ||
335 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
336 | /* try starting at EIP */ | ||
337 | ip = (u8 *)regs->ip; | ||
338 | code_len = code_len - code_prologue + 1; | ||
339 | } | ||
340 | for (i = 0; i < code_len; i++, ip++) { | ||
341 | if (ip < (u8 *)PAGE_OFFSET || | ||
342 | probe_kernel_address(ip, c)) { | ||
343 | printk(" Bad EIP value."); | ||
344 | break; | ||
345 | } | ||
346 | if (ip == (u8 *)regs->ip) | ||
347 | printk("<%02x> ", c); | ||
348 | else | ||
349 | printk("%02x ", c); | ||
350 | } | 150 | } |
351 | } | 151 | tss->io_bitmap_max = thread->io_bitmap_max; |
352 | printk("\n"); | 152 | tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; |
353 | } | 153 | tss->io_bitmap_owner = thread; |
354 | 154 | put_cpu(); | |
355 | int is_valid_bugaddr(unsigned long ip) | ||
356 | { | ||
357 | unsigned short ud2; | ||
358 | |||
359 | if (ip < PAGE_OFFSET) | ||
360 | return 0; | ||
361 | if (probe_kernel_address((unsigned short *)ip, ud2)) | ||
362 | return 0; | ||
363 | |||
364 | return ud2 == 0x0b0f; | ||
365 | } | ||
366 | |||
367 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
368 | static int die_owner = -1; | ||
369 | static unsigned int die_nest_count; | ||
370 | |||
371 | unsigned __kprobes long oops_begin(void) | ||
372 | { | ||
373 | unsigned long flags; | ||
374 | |||
375 | oops_enter(); | ||
376 | |||
377 | if (die_owner != raw_smp_processor_id()) { | ||
378 | console_verbose(); | ||
379 | raw_local_irq_save(flags); | ||
380 | __raw_spin_lock(&die_lock); | ||
381 | die_owner = smp_processor_id(); | ||
382 | die_nest_count = 0; | ||
383 | bust_spinlocks(1); | ||
384 | } else { | ||
385 | raw_local_irq_save(flags); | ||
386 | } | ||
387 | die_nest_count++; | ||
388 | return flags; | ||
389 | } | ||
390 | |||
391 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
392 | { | ||
393 | bust_spinlocks(0); | ||
394 | die_owner = -1; | ||
395 | add_taint(TAINT_DIE); | ||
396 | __raw_spin_unlock(&die_lock); | ||
397 | raw_local_irq_restore(flags); | ||
398 | |||
399 | if (!regs) | ||
400 | return; | ||
401 | |||
402 | if (kexec_should_crash(current)) | ||
403 | crash_kexec(regs); | ||
404 | |||
405 | if (in_interrupt()) | ||
406 | panic("Fatal exception in interrupt"); | ||
407 | |||
408 | if (panic_on_oops) | ||
409 | panic("Fatal exception"); | ||
410 | |||
411 | oops_exit(); | ||
412 | do_exit(signr); | ||
413 | } | ||
414 | |||
415 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
416 | { | ||
417 | unsigned short ss; | ||
418 | unsigned long sp; | ||
419 | 155 | ||
420 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
421 | #ifdef CONFIG_PREEMPT | ||
422 | printk("PREEMPT "); | ||
423 | #endif | ||
424 | #ifdef CONFIG_SMP | ||
425 | printk("SMP "); | ||
426 | #endif | ||
427 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
428 | printk("DEBUG_PAGEALLOC"); | ||
429 | #endif | ||
430 | printk("\n"); | ||
431 | if (notify_die(DIE_OOPS, str, regs, err, | ||
432 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
433 | return 1; | 156 | return 1; |
434 | |||
435 | show_registers(regs); | ||
436 | /* Executive summary in case the oops scrolled away */ | ||
437 | sp = (unsigned long) (&regs->sp); | ||
438 | savesegment(ss, ss); | ||
439 | if (user_mode(regs)) { | ||
440 | sp = regs->sp; | ||
441 | ss = regs->ss & 0xffff; | ||
442 | } | 157 | } |
443 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | 158 | put_cpu(); |
444 | print_symbol("%s", regs->ip); | ||
445 | printk(" SS:ESP %04x:%08lx\n", ss, sp); | ||
446 | return 0; | ||
447 | } | ||
448 | |||
449 | /* | ||
450 | * This is gone through when something in the kernel has done something bad | ||
451 | * and is about to be terminated: | ||
452 | */ | ||
453 | void die(const char *str, struct pt_regs *regs, long err) | ||
454 | { | ||
455 | unsigned long flags = oops_begin(); | ||
456 | |||
457 | if (die_nest_count < 3) { | ||
458 | report_bug(regs->ip, regs); | ||
459 | |||
460 | if (__die(str, regs, err)) | ||
461 | regs = NULL; | ||
462 | } else { | ||
463 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | ||
464 | } | ||
465 | |||
466 | oops_end(flags, regs, SIGSEGV); | ||
467 | } | ||
468 | 159 | ||
469 | static inline void | 160 | return 0; |
470 | die_if_kernel(const char *str, struct pt_regs *regs, long err) | ||
471 | { | ||
472 | if (!user_mode_vm(regs)) | ||
473 | die(str, regs, err); | ||
474 | } | 161 | } |
162 | #endif | ||
475 | 163 | ||
476 | static void __kprobes | 164 | static void __kprobes |
477 | do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, | 165 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, |
478 | long error_code, siginfo_t *info) | 166 | long error_code, siginfo_t *info) |
479 | { | 167 | { |
480 | struct task_struct *tsk = current; | 168 | struct task_struct *tsk = current; |
481 | 169 | ||
170 | #ifdef CONFIG_X86_32 | ||
482 | if (regs->flags & X86_VM_MASK) { | 171 | if (regs->flags & X86_VM_MASK) { |
483 | if (vm86) | 172 | /* |
173 | * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. | ||
174 | * On nmi (interrupt 2), do_trap should not be called. | ||
175 | */ | ||
176 | if (trapnr < 6) | ||
484 | goto vm86_trap; | 177 | goto vm86_trap; |
485 | goto trap_signal; | 178 | goto trap_signal; |
486 | } | 179 | } |
180 | #endif | ||
487 | 181 | ||
488 | if (!user_mode(regs)) | 182 | if (!user_mode(regs)) |
489 | goto kernel_trap; | 183 | goto kernel_trap; |
490 | 184 | ||
185 | #ifdef CONFIG_X86_32 | ||
491 | trap_signal: | 186 | trap_signal: |
187 | #endif | ||
492 | /* | 188 | /* |
493 | * We want error_code and trap_no set for userspace faults and | 189 | * We want error_code and trap_no set for userspace faults and |
494 | * kernelspace faults which result in die(), but not | 190 | * kernelspace faults which result in die(), but not |
@@ -501,6 +197,18 @@ trap_signal: | |||
501 | tsk->thread.error_code = error_code; | 197 | tsk->thread.error_code = error_code; |
502 | tsk->thread.trap_no = trapnr; | 198 | tsk->thread.trap_no = trapnr; |
503 | 199 | ||
200 | #ifdef CONFIG_X86_64 | ||
201 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
202 | printk_ratelimit()) { | ||
203 | printk(KERN_INFO | ||
204 | "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | ||
205 | tsk->comm, tsk->pid, str, | ||
206 | regs->ip, regs->sp, error_code); | ||
207 | print_vma_addr(" in ", regs->ip); | ||
208 | printk("\n"); | ||
209 | } | ||
210 | #endif | ||
211 | |||
504 | if (info) | 212 | if (info) |
505 | force_sig_info(signr, info, tsk); | 213 | force_sig_info(signr, info, tsk); |
506 | else | 214 | else |
@@ -515,29 +223,29 @@ kernel_trap: | |||
515 | } | 223 | } |
516 | return; | 224 | return; |
517 | 225 | ||
226 | #ifdef CONFIG_X86_32 | ||
518 | vm86_trap: | 227 | vm86_trap: |
519 | if (handle_vm86_trap((struct kernel_vm86_regs *) regs, | 228 | if (handle_vm86_trap((struct kernel_vm86_regs *) regs, |
520 | error_code, trapnr)) | 229 | error_code, trapnr)) |
521 | goto trap_signal; | 230 | goto trap_signal; |
522 | return; | 231 | return; |
232 | #endif | ||
523 | } | 233 | } |
524 | 234 | ||
525 | #define DO_ERROR(trapnr, signr, str, name) \ | 235 | #define DO_ERROR(trapnr, signr, str, name) \ |
526 | void do_##name(struct pt_regs *regs, long error_code) \ | 236 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ |
527 | { \ | 237 | { \ |
528 | trace_hardirqs_fixup(); \ | ||
529 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 238 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
530 | == NOTIFY_STOP) \ | 239 | == NOTIFY_STOP) \ |
531 | return; \ | 240 | return; \ |
532 | do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ | 241 | conditional_sti(regs); \ |
242 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
533 | } | 243 | } |
534 | 244 | ||
535 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ | 245 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ |
536 | void do_##name(struct pt_regs *regs, long error_code) \ | 246 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ |
537 | { \ | 247 | { \ |
538 | siginfo_t info; \ | 248 | siginfo_t info; \ |
539 | if (irq) \ | ||
540 | local_irq_enable(); \ | ||
541 | info.si_signo = signr; \ | 249 | info.si_signo = signr; \ |
542 | info.si_errno = 0; \ | 250 | info.si_errno = 0; \ |
543 | info.si_code = sicode; \ | 251 | info.si_code = sicode; \ |
@@ -545,90 +253,68 @@ void do_##name(struct pt_regs *regs, long error_code) \ | |||
545 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 253 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
546 | == NOTIFY_STOP) \ | 254 | == NOTIFY_STOP) \ |
547 | return; \ | 255 | return; \ |
548 | do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ | 256 | conditional_sti(regs); \ |
257 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
549 | } | 258 | } |
550 | 259 | ||
551 | #define DO_VM86_ERROR(trapnr, signr, str, name) \ | 260 | DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) |
552 | void do_##name(struct pt_regs *regs, long error_code) \ | 261 | DO_ERROR(4, SIGSEGV, "overflow", overflow) |
553 | { \ | 262 | DO_ERROR(5, SIGSEGV, "bounds", bounds) |
554 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 263 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) |
555 | == NOTIFY_STOP) \ | ||
556 | return; \ | ||
557 | do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ | ||
558 | } | ||
559 | |||
560 | #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
561 | void do_##name(struct pt_regs *regs, long error_code) \ | ||
562 | { \ | ||
563 | siginfo_t info; \ | ||
564 | info.si_signo = signr; \ | ||
565 | info.si_errno = 0; \ | ||
566 | info.si_code = sicode; \ | ||
567 | info.si_addr = (void __user *)siaddr; \ | ||
568 | trace_hardirqs_fixup(); \ | ||
569 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
570 | == NOTIFY_STOP) \ | ||
571 | return; \ | ||
572 | do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ | ||
573 | } | ||
574 | |||
575 | DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | ||
576 | #ifndef CONFIG_KPROBES | ||
577 | DO_VM86_ERROR(3, SIGTRAP, "int3", int3) | ||
578 | #endif | ||
579 | DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) | ||
580 | DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) | ||
581 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) | ||
582 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | 264 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) |
583 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | 265 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) |
584 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | 266 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) |
267 | #ifdef CONFIG_X86_32 | ||
585 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) | 268 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) |
586 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) | 269 | #endif |
587 | DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) | 270 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) |
271 | |||
272 | #ifdef CONFIG_X86_64 | ||
273 | /* Runs on IST stack */ | ||
274 | dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) | ||
275 | { | ||
276 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | ||
277 | 12, SIGBUS) == NOTIFY_STOP) | ||
278 | return; | ||
279 | preempt_conditional_sti(regs); | ||
280 | do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | ||
281 | preempt_conditional_cli(regs); | ||
282 | } | ||
283 | |||
284 | dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | ||
285 | { | ||
286 | static const char str[] = "double fault"; | ||
287 | struct task_struct *tsk = current; | ||
288 | |||
289 | /* Return not checked because double check cannot be ignored */ | ||
290 | notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | ||
588 | 291 | ||
589 | void __kprobes | 292 | tsk->thread.error_code = error_code; |
293 | tsk->thread.trap_no = 8; | ||
294 | |||
295 | /* This is always a kernel trap and never fixable (and thus must | ||
296 | never return). */ | ||
297 | for (;;) | ||
298 | die(str, regs, error_code); | ||
299 | } | ||
300 | #endif | ||
301 | |||
302 | dotraplinkage void __kprobes | ||
590 | do_general_protection(struct pt_regs *regs, long error_code) | 303 | do_general_protection(struct pt_regs *regs, long error_code) |
591 | { | 304 | { |
592 | struct task_struct *tsk; | 305 | struct task_struct *tsk; |
593 | struct thread_struct *thread; | ||
594 | struct tss_struct *tss; | ||
595 | int cpu; | ||
596 | 306 | ||
597 | cpu = get_cpu(); | 307 | conditional_sti(regs); |
598 | tss = &per_cpu(init_tss, cpu); | ||
599 | thread = &current->thread; | ||
600 | |||
601 | /* | ||
602 | * Perform the lazy TSS's I/O bitmap copy. If the TSS has an | ||
603 | * invalid offset set (the LAZY one) and the faulting thread has | ||
604 | * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS | ||
605 | * and we set the offset field correctly. Then we let the CPU to | ||
606 | * restart the faulting instruction. | ||
607 | */ | ||
608 | if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && | ||
609 | thread->io_bitmap_ptr) { | ||
610 | memcpy(tss->io_bitmap, thread->io_bitmap_ptr, | ||
611 | thread->io_bitmap_max); | ||
612 | /* | ||
613 | * If the previously set map was extending to higher ports | ||
614 | * than the current one, pad extra space with 0xff (no access). | ||
615 | */ | ||
616 | if (thread->io_bitmap_max < tss->io_bitmap_max) { | ||
617 | memset((char *) tss->io_bitmap + | ||
618 | thread->io_bitmap_max, 0xff, | ||
619 | tss->io_bitmap_max - thread->io_bitmap_max); | ||
620 | } | ||
621 | tss->io_bitmap_max = thread->io_bitmap_max; | ||
622 | tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; | ||
623 | tss->io_bitmap_owner = thread; | ||
624 | put_cpu(); | ||
625 | 308 | ||
309 | #ifdef CONFIG_X86_32 | ||
310 | if (lazy_iobitmap_copy()) { | ||
311 | /* restart the faulting instruction */ | ||
626 | return; | 312 | return; |
627 | } | 313 | } |
628 | put_cpu(); | ||
629 | 314 | ||
630 | if (regs->flags & X86_VM_MASK) | 315 | if (regs->flags & X86_VM_MASK) |
631 | goto gp_in_vm86; | 316 | goto gp_in_vm86; |
317 | #endif | ||
632 | 318 | ||
633 | tsk = current; | 319 | tsk = current; |
634 | if (!user_mode(regs)) | 320 | if (!user_mode(regs)) |
@@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code) | |||
650 | force_sig(SIGSEGV, tsk); | 336 | force_sig(SIGSEGV, tsk); |
651 | return; | 337 | return; |
652 | 338 | ||
339 | #ifdef CONFIG_X86_32 | ||
653 | gp_in_vm86: | 340 | gp_in_vm86: |
654 | local_irq_enable(); | 341 | local_irq_enable(); |
655 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | 342 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); |
656 | return; | 343 | return; |
344 | #endif | ||
657 | 345 | ||
658 | gp_in_kernel: | 346 | gp_in_kernel: |
659 | if (fixup_exception(regs)) | 347 | if (fixup_exception(regs)) |
@@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) | |||
690 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | 378 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); |
691 | 379 | ||
692 | /* Clear and disable the memory parity error line. */ | 380 | /* Clear and disable the memory parity error line. */ |
693 | clear_mem_error(reason); | 381 | reason = (reason & 0xf) | 4; |
382 | outb(reason, 0x61); | ||
694 | } | 383 | } |
695 | 384 | ||
696 | static notrace __kprobes void | 385 | static notrace __kprobes void |
@@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) | |||
716 | static notrace __kprobes void | 405 | static notrace __kprobes void |
717 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | 406 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) |
718 | { | 407 | { |
719 | if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | 408 | if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == |
409 | NOTIFY_STOP) | ||
720 | return; | 410 | return; |
721 | #ifdef CONFIG_MCA | 411 | #ifdef CONFIG_MCA |
722 | /* | 412 | /* |
@@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | |||
739 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | 429 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); |
740 | } | 430 | } |
741 | 431 | ||
742 | static DEFINE_SPINLOCK(nmi_print_lock); | ||
743 | |||
744 | void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
745 | { | ||
746 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
747 | return; | ||
748 | |||
749 | spin_lock(&nmi_print_lock); | ||
750 | /* | ||
751 | * We are in trouble anyway, lets at least try | ||
752 | * to get a message out: | ||
753 | */ | ||
754 | bust_spinlocks(1); | ||
755 | printk(KERN_EMERG "%s", str); | ||
756 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
757 | smp_processor_id(), regs->ip); | ||
758 | show_registers(regs); | ||
759 | if (do_panic) | ||
760 | panic("Non maskable interrupt"); | ||
761 | console_silent(); | ||
762 | spin_unlock(&nmi_print_lock); | ||
763 | bust_spinlocks(0); | ||
764 | |||
765 | /* | ||
766 | * If we are in kernel we are probably nested up pretty bad | ||
767 | * and might aswell get out now while we still can: | ||
768 | */ | ||
769 | if (!user_mode_vm(regs)) { | ||
770 | current->thread.trap_no = 2; | ||
771 | crash_kexec(regs); | ||
772 | } | ||
773 | |||
774 | do_exit(SIGSEGV); | ||
775 | } | ||
776 | |||
777 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | 432 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) |
778 | { | 433 | { |
779 | unsigned char reason = 0; | 434 | unsigned char reason = 0; |
@@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
812 | mem_parity_error(reason, regs); | 467 | mem_parity_error(reason, regs); |
813 | if (reason & 0x40) | 468 | if (reason & 0x40) |
814 | io_check_error(reason, regs); | 469 | io_check_error(reason, regs); |
470 | #ifdef CONFIG_X86_32 | ||
815 | /* | 471 | /* |
816 | * Reassert NMI in case it became active meanwhile | 472 | * Reassert NMI in case it became active meanwhile |
817 | * as it's edge-triggered: | 473 | * as it's edge-triggered: |
818 | */ | 474 | */ |
819 | reassert_nmi(); | 475 | reassert_nmi(); |
476 | #endif | ||
820 | } | 477 | } |
821 | 478 | ||
822 | notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) | 479 | dotraplinkage notrace __kprobes void |
480 | do_nmi(struct pt_regs *regs, long error_code) | ||
823 | { | 481 | { |
824 | int cpu; | ||
825 | |||
826 | nmi_enter(); | 482 | nmi_enter(); |
827 | 483 | ||
828 | cpu = smp_processor_id(); | 484 | #ifdef CONFIG_X86_32 |
829 | 485 | { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } | |
830 | ++nmi_count(cpu); | 486 | #else |
487 | add_pda(__nmi_count, 1); | ||
488 | #endif | ||
831 | 489 | ||
832 | if (!ignore_nmis) | 490 | if (!ignore_nmis) |
833 | default_do_nmi(regs); | 491 | default_do_nmi(regs); |
@@ -847,21 +505,44 @@ void restart_nmi(void) | |||
847 | acpi_nmi_enable(); | 505 | acpi_nmi_enable(); |
848 | } | 506 | } |
849 | 507 | ||
850 | #ifdef CONFIG_KPROBES | 508 | /* May run on IST stack. */ |
851 | void __kprobes do_int3(struct pt_regs *regs, long error_code) | 509 | dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) |
852 | { | 510 | { |
853 | trace_hardirqs_fixup(); | 511 | #ifdef CONFIG_KPROBES |
854 | |||
855 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | 512 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) |
856 | == NOTIFY_STOP) | 513 | == NOTIFY_STOP) |
857 | return; | 514 | return; |
858 | /* | 515 | #else |
859 | * This is an interrupt gate, because kprobes wants interrupts | 516 | if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) |
860 | * disabled. Normal trap handlers don't. | 517 | == NOTIFY_STOP) |
861 | */ | 518 | return; |
862 | restore_interrupts(regs); | 519 | #endif |
520 | |||
521 | preempt_conditional_sti(regs); | ||
522 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
523 | preempt_conditional_cli(regs); | ||
524 | } | ||
863 | 525 | ||
864 | do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); | 526 | #ifdef CONFIG_X86_64 |
527 | /* Help handler running on IST stack to switch back to user stack | ||
528 | for scheduling or signal handling. The actual stack switch is done in | ||
529 | entry.S */ | ||
530 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | ||
531 | { | ||
532 | struct pt_regs *regs = eregs; | ||
533 | /* Did already sync */ | ||
534 | if (eregs == (struct pt_regs *)eregs->sp) | ||
535 | ; | ||
536 | /* Exception from user space */ | ||
537 | else if (user_mode(eregs)) | ||
538 | regs = task_pt_regs(current); | ||
539 | /* Exception from kernel and interrupts are enabled. Move to | ||
540 | kernel process stack. */ | ||
541 | else if (eregs->flags & X86_EFLAGS_IF) | ||
542 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | ||
543 | if (eregs != regs) | ||
544 | *regs = *eregs; | ||
545 | return regs; | ||
865 | } | 546 | } |
866 | #endif | 547 | #endif |
867 | 548 | ||
@@ -886,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) | |||
886 | * about restoring all the debug state, and ptrace doesn't have to | 567 | * about restoring all the debug state, and ptrace doesn't have to |
887 | * find every occurrence of the TF bit that could be saved away even | 568 | * find every occurrence of the TF bit that could be saved away even |
888 | * by user code) | 569 | * by user code) |
570 | * | ||
571 | * May run on IST stack. | ||
889 | */ | 572 | */ |
890 | void __kprobes do_debug(struct pt_regs *regs, long error_code) | 573 | dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) |
891 | { | 574 | { |
892 | struct task_struct *tsk = current; | 575 | struct task_struct *tsk = current; |
893 | unsigned int condition; | 576 | unsigned long condition; |
894 | 577 | int si_code; | |
895 | trace_hardirqs_fixup(); | ||
896 | 578 | ||
897 | get_debugreg(condition, 6); | 579 | get_debugreg(condition, 6); |
898 | 580 | ||
@@ -905,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
905 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | 587 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, |
906 | SIGTRAP) == NOTIFY_STOP) | 588 | SIGTRAP) == NOTIFY_STOP) |
907 | return; | 589 | return; |
590 | |||
908 | /* It's safe to allow irq's after DR6 has been saved */ | 591 | /* It's safe to allow irq's after DR6 has been saved */ |
909 | if (regs->flags & X86_EFLAGS_IF) | 592 | preempt_conditional_sti(regs); |
910 | local_irq_enable(); | ||
911 | 593 | ||
912 | /* Mask out spurious debug traps due to lazy DR7 setting */ | 594 | /* Mask out spurious debug traps due to lazy DR7 setting */ |
913 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | 595 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { |
@@ -915,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
915 | goto clear_dr7; | 597 | goto clear_dr7; |
916 | } | 598 | } |
917 | 599 | ||
600 | #ifdef CONFIG_X86_32 | ||
918 | if (regs->flags & X86_VM_MASK) | 601 | if (regs->flags & X86_VM_MASK) |
919 | goto debug_vm86; | 602 | goto debug_vm86; |
603 | #endif | ||
920 | 604 | ||
921 | /* Save debug status register where ptrace can see it */ | 605 | /* Save debug status register where ptrace can see it */ |
922 | tsk->thread.debugreg6 = condition; | 606 | tsk->thread.debugreg6 = condition; |
@@ -926,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
926 | * kernel space (but re-enable TF when returning to user mode). | 610 | * kernel space (but re-enable TF when returning to user mode). |
927 | */ | 611 | */ |
928 | if (condition & DR_STEP) { | 612 | if (condition & DR_STEP) { |
929 | /* | ||
930 | * We already checked v86 mode above, so we can | ||
931 | * check for kernel mode by just checking the CPL | ||
932 | * of CS. | ||
933 | */ | ||
934 | if (!user_mode(regs)) | 613 | if (!user_mode(regs)) |
935 | goto clear_TF_reenable; | 614 | goto clear_TF_reenable; |
936 | } | 615 | } |
937 | 616 | ||
617 | si_code = get_si_code(condition); | ||
938 | /* Ok, finally something we can handle */ | 618 | /* Ok, finally something we can handle */ |
939 | send_sigtrap(tsk, regs, error_code); | 619 | send_sigtrap(tsk, regs, error_code, si_code); |
940 | 620 | ||
941 | /* | 621 | /* |
942 | * Disable additional traps. They'll be re-enabled when | 622 | * Disable additional traps. They'll be re-enabled when |
@@ -944,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
944 | */ | 624 | */ |
945 | clear_dr7: | 625 | clear_dr7: |
946 | set_debugreg(0, 7); | 626 | set_debugreg(0, 7); |
627 | preempt_conditional_cli(regs); | ||
947 | return; | 628 | return; |
948 | 629 | ||
630 | #ifdef CONFIG_X86_32 | ||
949 | debug_vm86: | 631 | debug_vm86: |
950 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); | 632 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); |
633 | preempt_conditional_cli(regs); | ||
951 | return; | 634 | return; |
635 | #endif | ||
952 | 636 | ||
953 | clear_TF_reenable: | 637 | clear_TF_reenable: |
954 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | 638 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); |
955 | regs->flags &= ~X86_EFLAGS_TF; | 639 | regs->flags &= ~X86_EFLAGS_TF; |
640 | preempt_conditional_cli(regs); | ||
956 | return; | 641 | return; |
957 | } | 642 | } |
958 | 643 | ||
644 | #ifdef CONFIG_X86_64 | ||
645 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
646 | { | ||
647 | if (fixup_exception(regs)) | ||
648 | return 1; | ||
649 | |||
650 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
651 | /* Illegal floating point operation in the kernel */ | ||
652 | current->thread.trap_no = trapnr; | ||
653 | die(str, regs, 0); | ||
654 | return 0; | ||
655 | } | ||
656 | #endif | ||
657 | |||
959 | /* | 658 | /* |
960 | * Note that we play around with the 'TS' bit in an attempt to get | 659 | * Note that we play around with the 'TS' bit in an attempt to get |
961 | * the correct behaviour even in the presence of the asynchronous | 660 | * the correct behaviour even in the presence of the asynchronous |
@@ -992,7 +691,9 @@ void math_error(void __user *ip) | |||
992 | swd = get_fpu_swd(task); | 691 | swd = get_fpu_swd(task); |
993 | switch (swd & ~cwd & 0x3f) { | 692 | switch (swd & ~cwd & 0x3f) { |
994 | case 0x000: /* No unmasked exception */ | 693 | case 0x000: /* No unmasked exception */ |
694 | #ifdef CONFIG_X86_32 | ||
995 | return; | 695 | return; |
696 | #endif | ||
996 | default: /* Multiple exceptions */ | 697 | default: /* Multiple exceptions */ |
997 | break; | 698 | break; |
998 | case 0x001: /* Invalid Op */ | 699 | case 0x001: /* Invalid Op */ |
@@ -1020,9 +721,18 @@ void math_error(void __user *ip) | |||
1020 | force_sig_info(SIGFPE, &info, task); | 721 | force_sig_info(SIGFPE, &info, task); |
1021 | } | 722 | } |
1022 | 723 | ||
1023 | void do_coprocessor_error(struct pt_regs *regs, long error_code) | 724 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) |
1024 | { | 725 | { |
726 | conditional_sti(regs); | ||
727 | |||
728 | #ifdef CONFIG_X86_32 | ||
1025 | ignore_fpu_irq = 1; | 729 | ignore_fpu_irq = 1; |
730 | #else | ||
731 | if (!user_mode(regs) && | ||
732 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
733 | return; | ||
734 | #endif | ||
735 | |||
1026 | math_error((void __user *)regs->ip); | 736 | math_error((void __user *)regs->ip); |
1027 | } | 737 | } |
1028 | 738 | ||
@@ -1074,8 +784,12 @@ static void simd_math_error(void __user *ip) | |||
1074 | force_sig_info(SIGFPE, &info, task); | 784 | force_sig_info(SIGFPE, &info, task); |
1075 | } | 785 | } |
1076 | 786 | ||
1077 | void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | 787 | dotraplinkage void |
788 | do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | ||
1078 | { | 789 | { |
790 | conditional_sti(regs); | ||
791 | |||
792 | #ifdef CONFIG_X86_32 | ||
1079 | if (cpu_has_xmm) { | 793 | if (cpu_has_xmm) { |
1080 | /* Handle SIMD FPU exceptions on PIII+ processors. */ | 794 | /* Handle SIMD FPU exceptions on PIII+ processors. */ |
1081 | ignore_fpu_irq = 1; | 795 | ignore_fpu_irq = 1; |
@@ -1094,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | |||
1094 | current->thread.error_code = error_code; | 808 | current->thread.error_code = error_code; |
1095 | die_if_kernel("cache flush denied", regs, error_code); | 809 | die_if_kernel("cache flush denied", regs, error_code); |
1096 | force_sig(SIGSEGV, current); | 810 | force_sig(SIGSEGV, current); |
811 | #else | ||
812 | if (!user_mode(regs) && | ||
813 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
814 | return; | ||
815 | simd_math_error((void __user *)regs->ip); | ||
816 | #endif | ||
1097 | } | 817 | } |
1098 | 818 | ||
1099 | void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | 819 | dotraplinkage void |
820 | do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | ||
1100 | { | 821 | { |
822 | conditional_sti(regs); | ||
1101 | #if 0 | 823 | #if 0 |
1102 | /* No need to warn about this any longer. */ | 824 | /* No need to warn about this any longer. */ |
1103 | printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); | 825 | printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); |
1104 | #endif | 826 | #endif |
1105 | } | 827 | } |
1106 | 828 | ||
829 | #ifdef CONFIG_X86_32 | ||
1107 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | 830 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) |
1108 | { | 831 | { |
1109 | struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); | 832 | struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); |
@@ -1122,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | |||
1122 | 845 | ||
1123 | return new_kesp; | 846 | return new_kesp; |
1124 | } | 847 | } |
848 | #else | ||
849 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
850 | { | ||
851 | } | ||
852 | |||
853 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | ||
854 | { | ||
855 | } | ||
856 | #endif | ||
1125 | 857 | ||
1126 | /* | 858 | /* |
1127 | * 'math_state_restore()' saves the current math information in the | 859 | * 'math_state_restore()' saves the current math information in the |
@@ -1154,14 +886,24 @@ asmlinkage void math_state_restore(void) | |||
1154 | } | 886 | } |
1155 | 887 | ||
1156 | clts(); /* Allow maths ops (or we recurse) */ | 888 | clts(); /* Allow maths ops (or we recurse) */ |
889 | #ifdef CONFIG_X86_32 | ||
1157 | restore_fpu(tsk); | 890 | restore_fpu(tsk); |
891 | #else | ||
892 | /* | ||
893 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
894 | */ | ||
895 | if (unlikely(restore_fpu_checking(tsk))) { | ||
896 | stts(); | ||
897 | force_sig(SIGSEGV, tsk); | ||
898 | return; | ||
899 | } | ||
900 | #endif | ||
1158 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | 901 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ |
1159 | tsk->fpu_counter++; | 902 | tsk->fpu_counter++; |
1160 | } | 903 | } |
1161 | EXPORT_SYMBOL_GPL(math_state_restore); | 904 | EXPORT_SYMBOL_GPL(math_state_restore); |
1162 | 905 | ||
1163 | #ifndef CONFIG_MATH_EMULATION | 906 | #ifndef CONFIG_MATH_EMULATION |
1164 | |||
1165 | asmlinkage void math_emulate(long arg) | 907 | asmlinkage void math_emulate(long arg) |
1166 | { | 908 | { |
1167 | printk(KERN_EMERG | 909 | printk(KERN_EMERG |
@@ -1170,12 +912,54 @@ asmlinkage void math_emulate(long arg) | |||
1170 | force_sig(SIGFPE, current); | 912 | force_sig(SIGFPE, current); |
1171 | schedule(); | 913 | schedule(); |
1172 | } | 914 | } |
1173 | |||
1174 | #endif /* CONFIG_MATH_EMULATION */ | 915 | #endif /* CONFIG_MATH_EMULATION */ |
1175 | 916 | ||
917 | dotraplinkage void __kprobes | ||
918 | do_device_not_available(struct pt_regs *regs, long error) | ||
919 | { | ||
920 | #ifdef CONFIG_X86_32 | ||
921 | if (read_cr0() & X86_CR0_EM) { | ||
922 | conditional_sti(regs); | ||
923 | math_emulate(0); | ||
924 | } else { | ||
925 | math_state_restore(); /* interrupts still off */ | ||
926 | conditional_sti(regs); | ||
927 | } | ||
928 | #else | ||
929 | math_state_restore(); | ||
930 | #endif | ||
931 | } | ||
932 | |||
933 | #ifdef CONFIG_X86_32 | ||
934 | #ifdef CONFIG_X86_MCE | ||
935 | dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) | ||
936 | { | ||
937 | conditional_sti(regs); | ||
938 | machine_check_vector(regs, error); | ||
939 | } | ||
940 | #endif | ||
941 | |||
942 | dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | ||
943 | { | ||
944 | siginfo_t info; | ||
945 | local_irq_enable(); | ||
946 | |||
947 | info.si_signo = SIGILL; | ||
948 | info.si_errno = 0; | ||
949 | info.si_code = ILL_BADSTK; | ||
950 | info.si_addr = 0; | ||
951 | if (notify_die(DIE_TRAP, "iret exception", | ||
952 | regs, error_code, 32, SIGILL) == NOTIFY_STOP) | ||
953 | return; | ||
954 | do_trap(32, SIGILL, "iret exception", regs, error_code, &info); | ||
955 | } | ||
956 | #endif | ||
957 | |||
1176 | void __init trap_init(void) | 958 | void __init trap_init(void) |
1177 | { | 959 | { |
960 | #ifdef CONFIG_X86_32 | ||
1178 | int i; | 961 | int i; |
962 | #endif | ||
1179 | 963 | ||
1180 | #ifdef CONFIG_EISA | 964 | #ifdef CONFIG_EISA |
1181 | void __iomem *p = early_ioremap(0x0FFFD9, 4); | 965 | void __iomem *p = early_ioremap(0x0FFFD9, 4); |
@@ -1185,29 +969,40 @@ void __init trap_init(void) | |||
1185 | early_iounmap(p, 4); | 969 | early_iounmap(p, 4); |
1186 | #endif | 970 | #endif |
1187 | 971 | ||
1188 | set_trap_gate(0, &divide_error); | 972 | set_intr_gate(0, &divide_error); |
1189 | set_intr_gate(1, &debug); | 973 | set_intr_gate_ist(1, &debug, DEBUG_STACK); |
1190 | set_intr_gate(2, &nmi); | 974 | set_intr_gate_ist(2, &nmi, NMI_STACK); |
1191 | set_system_intr_gate(3, &int3); /* int3 can be called from all */ | 975 | /* int3 can be called from all */ |
1192 | set_system_gate(4, &overflow); /* int4 can be called from all */ | 976 | set_system_intr_gate_ist(3, &int3, DEBUG_STACK); |
1193 | set_trap_gate(5, &bounds); | 977 | /* int4 can be called from all */ |
1194 | set_trap_gate(6, &invalid_op); | 978 | set_system_intr_gate(4, &overflow); |
1195 | set_trap_gate(7, &device_not_available); | 979 | set_intr_gate(5, &bounds); |
980 | set_intr_gate(6, &invalid_op); | ||
981 | set_intr_gate(7, &device_not_available); | ||
982 | #ifdef CONFIG_X86_32 | ||
1196 | set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); | 983 | set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); |
1197 | set_trap_gate(9, &coprocessor_segment_overrun); | 984 | #else |
1198 | set_trap_gate(10, &invalid_TSS); | 985 | set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); |
1199 | set_trap_gate(11, &segment_not_present); | 986 | #endif |
1200 | set_trap_gate(12, &stack_segment); | 987 | set_intr_gate(9, &coprocessor_segment_overrun); |
1201 | set_trap_gate(13, &general_protection); | 988 | set_intr_gate(10, &invalid_TSS); |
989 | set_intr_gate(11, &segment_not_present); | ||
990 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); | ||
991 | set_intr_gate(13, &general_protection); | ||
1202 | set_intr_gate(14, &page_fault); | 992 | set_intr_gate(14, &page_fault); |
1203 | set_trap_gate(15, &spurious_interrupt_bug); | 993 | set_intr_gate(15, &spurious_interrupt_bug); |
1204 | set_trap_gate(16, &coprocessor_error); | 994 | set_intr_gate(16, &coprocessor_error); |
1205 | set_trap_gate(17, &alignment_check); | 995 | set_intr_gate(17, &alignment_check); |
1206 | #ifdef CONFIG_X86_MCE | 996 | #ifdef CONFIG_X86_MCE |
1207 | set_trap_gate(18, &machine_check); | 997 | set_intr_gate_ist(18, &machine_check, MCE_STACK); |
1208 | #endif | 998 | #endif |
1209 | set_trap_gate(19, &simd_coprocessor_error); | 999 | set_intr_gate(19, &simd_coprocessor_error); |
1210 | 1000 | ||
1001 | #ifdef CONFIG_IA32_EMULATION | ||
1002 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
1003 | #endif | ||
1004 | |||
1005 | #ifdef CONFIG_X86_32 | ||
1211 | if (cpu_has_fxsr) { | 1006 | if (cpu_has_fxsr) { |
1212 | printk(KERN_INFO "Enabling fast FPU save and restore... "); | 1007 | printk(KERN_INFO "Enabling fast FPU save and restore... "); |
1213 | set_in_cr4(X86_CR4_OSFXSR); | 1008 | set_in_cr4(X86_CR4_OSFXSR); |
@@ -1220,37 +1015,20 @@ void __init trap_init(void) | |||
1220 | printk("done.\n"); | 1015 | printk("done.\n"); |
1221 | } | 1016 | } |
1222 | 1017 | ||
1223 | set_system_gate(SYSCALL_VECTOR, &system_call); | 1018 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); |
1224 | 1019 | ||
1225 | /* Reserve all the builtin and the syscall vector: */ | 1020 | /* Reserve all the builtin and the syscall vector: */ |
1226 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | 1021 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) |
1227 | set_bit(i, used_vectors); | 1022 | set_bit(i, used_vectors); |
1228 | 1023 | ||
1229 | set_bit(SYSCALL_VECTOR, used_vectors); | 1024 | set_bit(SYSCALL_VECTOR, used_vectors); |
1230 | 1025 | #endif | |
1231 | init_thread_xstate(); | ||
1232 | /* | 1026 | /* |
1233 | * Should be a barrier for any external CPU state: | 1027 | * Should be a barrier for any external CPU state: |
1234 | */ | 1028 | */ |
1235 | cpu_init(); | 1029 | cpu_init(); |
1236 | 1030 | ||
1031 | #ifdef CONFIG_X86_32 | ||
1237 | trap_init_hook(); | 1032 | trap_init_hook(); |
1033 | #endif | ||
1238 | } | 1034 | } |
1239 | |||
1240 | static int __init kstack_setup(char *s) | ||
1241 | { | ||
1242 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
1243 | |||
1244 | return 1; | ||
1245 | } | ||
1246 | __setup("kstack=", kstack_setup); | ||
1247 | |||
1248 | static int __init code_bytes_setup(char *s) | ||
1249 | { | ||
1250 | code_bytes = simple_strtoul(s, NULL, 0); | ||
1251 | if (code_bytes > 8192) | ||
1252 | code_bytes = 8192; | ||
1253 | |||
1254 | return 1; | ||
1255 | } | ||
1256 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c deleted file mode 100644 index 513caaca7115..000000000000 --- a/arch/x86/kernel/traps_64.c +++ /dev/null | |||
@@ -1,1212 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | * | ||
5 | * Pentium III FXSR, SSE support | ||
6 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
11 | * state in 'entry.S'. | ||
12 | */ | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/kallsyms.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/kprobes.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/utsname.h> | ||
20 | #include <linux/kdebug.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/ptrace.h> | ||
24 | #include <linux/string.h> | ||
25 | #include <linux/unwind.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/kexec.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/timer.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/bug.h> | ||
33 | #include <linux/nmi.h> | ||
34 | #include <linux/mm.h> | ||
35 | |||
36 | #if defined(CONFIG_EDAC) | ||
37 | #include <linux/edac.h> | ||
38 | #endif | ||
39 | |||
40 | #include <asm/stacktrace.h> | ||
41 | #include <asm/processor.h> | ||
42 | #include <asm/debugreg.h> | ||
43 | #include <asm/atomic.h> | ||
44 | #include <asm/system.h> | ||
45 | #include <asm/unwind.h> | ||
46 | #include <asm/desc.h> | ||
47 | #include <asm/i387.h> | ||
48 | #include <asm/nmi.h> | ||
49 | #include <asm/smp.h> | ||
50 | #include <asm/io.h> | ||
51 | #include <asm/pgalloc.h> | ||
52 | #include <asm/proto.h> | ||
53 | #include <asm/pda.h> | ||
54 | #include <asm/traps.h> | ||
55 | |||
56 | #include <mach_traps.h> | ||
57 | |||
58 | int panic_on_unrecovered_nmi; | ||
59 | int kstack_depth_to_print = 12; | ||
60 | static unsigned int code_bytes = 64; | ||
61 | static int ignore_nmis; | ||
62 | static int die_counter; | ||
63 | |||
64 | static inline void conditional_sti(struct pt_regs *regs) | ||
65 | { | ||
66 | if (regs->flags & X86_EFLAGS_IF) | ||
67 | local_irq_enable(); | ||
68 | } | ||
69 | |||
70 | static inline void preempt_conditional_sti(struct pt_regs *regs) | ||
71 | { | ||
72 | inc_preempt_count(); | ||
73 | if (regs->flags & X86_EFLAGS_IF) | ||
74 | local_irq_enable(); | ||
75 | } | ||
76 | |||
77 | static inline void preempt_conditional_cli(struct pt_regs *regs) | ||
78 | { | ||
79 | if (regs->flags & X86_EFLAGS_IF) | ||
80 | local_irq_disable(); | ||
81 | /* Make sure to not schedule here because we could be running | ||
82 | on an exception stack. */ | ||
83 | dec_preempt_count(); | ||
84 | } | ||
85 | |||
86 | void printk_address(unsigned long address, int reliable) | ||
87 | { | ||
88 | printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); | ||
89 | } | ||
90 | |||
91 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
92 | unsigned *usedp, char **idp) | ||
93 | { | ||
94 | static char ids[][8] = { | ||
95 | [DEBUG_STACK - 1] = "#DB", | ||
96 | [NMI_STACK - 1] = "NMI", | ||
97 | [DOUBLEFAULT_STACK - 1] = "#DF", | ||
98 | [STACKFAULT_STACK - 1] = "#SS", | ||
99 | [MCE_STACK - 1] = "#MC", | ||
100 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
101 | [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | ||
102 | #endif | ||
103 | }; | ||
104 | unsigned k; | ||
105 | |||
106 | /* | ||
107 | * Iterate over all exception stacks, and figure out whether | ||
108 | * 'stack' is in one of them: | ||
109 | */ | ||
110 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
111 | unsigned long end = per_cpu(orig_ist, cpu).ist[k]; | ||
112 | /* | ||
113 | * Is 'stack' above this exception frame's end? | ||
114 | * If yes then skip to the next frame. | ||
115 | */ | ||
116 | if (stack >= end) | ||
117 | continue; | ||
118 | /* | ||
119 | * Is 'stack' above this exception frame's start address? | ||
120 | * If yes then we found the right frame. | ||
121 | */ | ||
122 | if (stack >= end - EXCEPTION_STKSZ) { | ||
123 | /* | ||
124 | * Make sure we only iterate through an exception | ||
125 | * stack once. If it comes up for the second time | ||
126 | * then there's something wrong going on - just | ||
127 | * break out and return NULL: | ||
128 | */ | ||
129 | if (*usedp & (1U << k)) | ||
130 | break; | ||
131 | *usedp |= 1U << k; | ||
132 | *idp = ids[k]; | ||
133 | return (unsigned long *)end; | ||
134 | } | ||
135 | /* | ||
136 | * If this is a debug stack, and if it has a larger size than | ||
137 | * the usual exception stacks, then 'stack' might still | ||
138 | * be within the lower portion of the debug stack: | ||
139 | */ | ||
140 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
141 | if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | ||
142 | unsigned j = N_EXCEPTION_STACKS - 1; | ||
143 | |||
144 | /* | ||
145 | * Black magic. A large debug stack is composed of | ||
146 | * multiple exception stack entries, which we | ||
147 | * iterate through now. Dont look: | ||
148 | */ | ||
149 | do { | ||
150 | ++j; | ||
151 | end -= EXCEPTION_STKSZ; | ||
152 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | ||
153 | } while (stack < end - EXCEPTION_STKSZ); | ||
154 | if (*usedp & (1U << j)) | ||
155 | break; | ||
156 | *usedp |= 1U << j; | ||
157 | *idp = ids[j]; | ||
158 | return (unsigned long *)end; | ||
159 | } | ||
160 | #endif | ||
161 | } | ||
162 | return NULL; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * x86-64 can have up to three kernel stacks: | ||
167 | * process stack | ||
168 | * interrupt stack | ||
169 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | ||
170 | */ | ||
171 | |||
172 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
173 | void *p, unsigned int size, void *end) | ||
174 | { | ||
175 | void *t = tinfo; | ||
176 | if (end) { | ||
177 | if (p < end && p >= (end-THREAD_SIZE)) | ||
178 | return 1; | ||
179 | else | ||
180 | return 0; | ||
181 | } | ||
182 | return p > t && p < t + THREAD_SIZE - size; | ||
183 | } | ||
184 | |||
185 | /* The form of the top of the frame on the stack */ | ||
186 | struct stack_frame { | ||
187 | struct stack_frame *next_frame; | ||
188 | unsigned long return_address; | ||
189 | }; | ||
190 | |||
191 | static inline unsigned long | ||
192 | print_context_stack(struct thread_info *tinfo, | ||
193 | unsigned long *stack, unsigned long bp, | ||
194 | const struct stacktrace_ops *ops, void *data, | ||
195 | unsigned long *end) | ||
196 | { | ||
197 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
198 | |||
199 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | ||
200 | unsigned long addr; | ||
201 | |||
202 | addr = *stack; | ||
203 | if (__kernel_text_address(addr)) { | ||
204 | if ((unsigned long) stack == bp + 8) { | ||
205 | ops->address(data, addr, 1); | ||
206 | frame = frame->next_frame; | ||
207 | bp = (unsigned long) frame; | ||
208 | } else { | ||
209 | ops->address(data, addr, bp == 0); | ||
210 | } | ||
211 | } | ||
212 | stack++; | ||
213 | } | ||
214 | return bp; | ||
215 | } | ||
216 | |||
217 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
218 | unsigned long *stack, unsigned long bp, | ||
219 | const struct stacktrace_ops *ops, void *data) | ||
220 | { | ||
221 | const unsigned cpu = get_cpu(); | ||
222 | unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; | ||
223 | unsigned used = 0; | ||
224 | struct thread_info *tinfo; | ||
225 | |||
226 | if (!task) | ||
227 | task = current; | ||
228 | |||
229 | if (!stack) { | ||
230 | unsigned long dummy; | ||
231 | stack = &dummy; | ||
232 | if (task && task != current) | ||
233 | stack = (unsigned long *)task->thread.sp; | ||
234 | } | ||
235 | |||
236 | #ifdef CONFIG_FRAME_POINTER | ||
237 | if (!bp) { | ||
238 | if (task == current) { | ||
239 | /* Grab bp right from our regs */ | ||
240 | asm("movq %%rbp, %0" : "=r" (bp) :); | ||
241 | } else { | ||
242 | /* bp is the last reg pushed by switch_to */ | ||
243 | bp = *(unsigned long *) task->thread.sp; | ||
244 | } | ||
245 | } | ||
246 | #endif | ||
247 | |||
248 | /* | ||
249 | * Print function call entries in all stacks, starting at the | ||
250 | * current stack address. If the stacks consist of nested | ||
251 | * exceptions | ||
252 | */ | ||
253 | tinfo = task_thread_info(task); | ||
254 | for (;;) { | ||
255 | char *id; | ||
256 | unsigned long *estack_end; | ||
257 | estack_end = in_exception_stack(cpu, (unsigned long)stack, | ||
258 | &used, &id); | ||
259 | |||
260 | if (estack_end) { | ||
261 | if (ops->stack(data, id) < 0) | ||
262 | break; | ||
263 | |||
264 | bp = print_context_stack(tinfo, stack, bp, ops, | ||
265 | data, estack_end); | ||
266 | ops->stack(data, "<EOE>"); | ||
267 | /* | ||
268 | * We link to the next stack via the | ||
269 | * second-to-last pointer (index -2 to end) in the | ||
270 | * exception stack: | ||
271 | */ | ||
272 | stack = (unsigned long *) estack_end[-2]; | ||
273 | continue; | ||
274 | } | ||
275 | if (irqstack_end) { | ||
276 | unsigned long *irqstack; | ||
277 | irqstack = irqstack_end - | ||
278 | (IRQSTACKSIZE - 64) / sizeof(*irqstack); | ||
279 | |||
280 | if (stack >= irqstack && stack < irqstack_end) { | ||
281 | if (ops->stack(data, "IRQ") < 0) | ||
282 | break; | ||
283 | bp = print_context_stack(tinfo, stack, bp, | ||
284 | ops, data, irqstack_end); | ||
285 | /* | ||
286 | * We link to the next stack (which would be | ||
287 | * the process stack normally) the last | ||
288 | * pointer (index -1 to end) in the IRQ stack: | ||
289 | */ | ||
290 | stack = (unsigned long *) (irqstack_end[-1]); | ||
291 | irqstack_end = NULL; | ||
292 | ops->stack(data, "EOI"); | ||
293 | continue; | ||
294 | } | ||
295 | } | ||
296 | break; | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * This handles the process stack: | ||
301 | */ | ||
302 | bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); | ||
303 | put_cpu(); | ||
304 | } | ||
305 | EXPORT_SYMBOL(dump_trace); | ||
306 | |||
307 | static void | ||
308 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
309 | { | ||
310 | print_symbol(msg, symbol); | ||
311 | printk("\n"); | ||
312 | } | ||
313 | |||
314 | static void print_trace_warning(void *data, char *msg) | ||
315 | { | ||
316 | printk("%s\n", msg); | ||
317 | } | ||
318 | |||
319 | static int print_trace_stack(void *data, char *name) | ||
320 | { | ||
321 | printk(" <%s> ", name); | ||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | static void print_trace_address(void *data, unsigned long addr, int reliable) | ||
326 | { | ||
327 | touch_nmi_watchdog(); | ||
328 | printk_address(addr, reliable); | ||
329 | } | ||
330 | |||
331 | static const struct stacktrace_ops print_trace_ops = { | ||
332 | .warning = print_trace_warning, | ||
333 | .warning_symbol = print_trace_warning_symbol, | ||
334 | .stack = print_trace_stack, | ||
335 | .address = print_trace_address, | ||
336 | }; | ||
337 | |||
338 | static void | ||
339 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
340 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
341 | { | ||
342 | printk("\nCall Trace:\n"); | ||
343 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | ||
344 | printk("\n"); | ||
345 | } | ||
346 | |||
347 | void show_trace(struct task_struct *task, struct pt_regs *regs, | ||
348 | unsigned long *stack, unsigned long bp) | ||
349 | { | ||
350 | show_trace_log_lvl(task, regs, stack, bp, ""); | ||
351 | } | ||
352 | |||
353 | static void | ||
354 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
355 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
356 | { | ||
357 | unsigned long *stack; | ||
358 | int i; | ||
359 | const int cpu = smp_processor_id(); | ||
360 | unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); | ||
361 | unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | ||
362 | |||
363 | // debugging aid: "show_stack(NULL, NULL);" prints the | ||
364 | // back trace for this cpu. | ||
365 | |||
366 | if (sp == NULL) { | ||
367 | if (task) | ||
368 | sp = (unsigned long *)task->thread.sp; | ||
369 | else | ||
370 | sp = (unsigned long *)&sp; | ||
371 | } | ||
372 | |||
373 | stack = sp; | ||
374 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
375 | if (stack >= irqstack && stack <= irqstack_end) { | ||
376 | if (stack == irqstack_end) { | ||
377 | stack = (unsigned long *) (irqstack_end[-1]); | ||
378 | printk(" <EOI> "); | ||
379 | } | ||
380 | } else { | ||
381 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
382 | break; | ||
383 | } | ||
384 | if (i && ((i % 4) == 0)) | ||
385 | printk("\n"); | ||
386 | printk(" %016lx", *stack++); | ||
387 | touch_nmi_watchdog(); | ||
388 | } | ||
389 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
390 | } | ||
391 | |||
392 | void show_stack(struct task_struct *task, unsigned long *sp) | ||
393 | { | ||
394 | show_stack_log_lvl(task, NULL, sp, 0, ""); | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * The architecture-independent dump_stack generator | ||
399 | */ | ||
400 | void dump_stack(void) | ||
401 | { | ||
402 | unsigned long bp = 0; | ||
403 | unsigned long stack; | ||
404 | |||
405 | #ifdef CONFIG_FRAME_POINTER | ||
406 | if (!bp) | ||
407 | asm("movq %%rbp, %0" : "=r" (bp):); | ||
408 | #endif | ||
409 | |||
410 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
411 | current->pid, current->comm, print_tainted(), | ||
412 | init_utsname()->release, | ||
413 | (int)strcspn(init_utsname()->version, " "), | ||
414 | init_utsname()->version); | ||
415 | show_trace(NULL, NULL, &stack, bp); | ||
416 | } | ||
417 | |||
418 | EXPORT_SYMBOL(dump_stack); | ||
419 | |||
420 | void show_registers(struct pt_regs *regs) | ||
421 | { | ||
422 | int i; | ||
423 | unsigned long sp; | ||
424 | const int cpu = smp_processor_id(); | ||
425 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | ||
426 | |||
427 | sp = regs->sp; | ||
428 | printk("CPU %d ", cpu); | ||
429 | __show_regs(regs); | ||
430 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
431 | cur->comm, cur->pid, task_thread_info(cur), cur); | ||
432 | |||
433 | /* | ||
434 | * When in-kernel, we also print out the stack and code at the | ||
435 | * time of the fault.. | ||
436 | */ | ||
437 | if (!user_mode(regs)) { | ||
438 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
439 | unsigned int code_len = code_bytes; | ||
440 | unsigned char c; | ||
441 | u8 *ip; | ||
442 | |||
443 | printk("Stack: "); | ||
444 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, | ||
445 | regs->bp, ""); | ||
446 | printk("\n"); | ||
447 | |||
448 | printk(KERN_EMERG "Code: "); | ||
449 | |||
450 | ip = (u8 *)regs->ip - code_prologue; | ||
451 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
452 | /* try starting at RIP */ | ||
453 | ip = (u8 *)regs->ip; | ||
454 | code_len = code_len - code_prologue + 1; | ||
455 | } | ||
456 | for (i = 0; i < code_len; i++, ip++) { | ||
457 | if (ip < (u8 *)PAGE_OFFSET || | ||
458 | probe_kernel_address(ip, c)) { | ||
459 | printk(" Bad RIP value."); | ||
460 | break; | ||
461 | } | ||
462 | if (ip == (u8 *)regs->ip) | ||
463 | printk("<%02x> ", c); | ||
464 | else | ||
465 | printk("%02x ", c); | ||
466 | } | ||
467 | } | ||
468 | printk("\n"); | ||
469 | } | ||
470 | |||
471 | int is_valid_bugaddr(unsigned long ip) | ||
472 | { | ||
473 | unsigned short ud2; | ||
474 | |||
475 | if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) | ||
476 | return 0; | ||
477 | |||
478 | return ud2 == 0x0b0f; | ||
479 | } | ||
480 | |||
481 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
482 | static int die_owner = -1; | ||
483 | static unsigned int die_nest_count; | ||
484 | |||
485 | unsigned __kprobes long oops_begin(void) | ||
486 | { | ||
487 | int cpu; | ||
488 | unsigned long flags; | ||
489 | |||
490 | oops_enter(); | ||
491 | |||
492 | /* racy, but better than risking deadlock. */ | ||
493 | raw_local_irq_save(flags); | ||
494 | cpu = smp_processor_id(); | ||
495 | if (!__raw_spin_trylock(&die_lock)) { | ||
496 | if (cpu == die_owner) | ||
497 | /* nested oops. should stop eventually */; | ||
498 | else | ||
499 | __raw_spin_lock(&die_lock); | ||
500 | } | ||
501 | die_nest_count++; | ||
502 | die_owner = cpu; | ||
503 | console_verbose(); | ||
504 | bust_spinlocks(1); | ||
505 | return flags; | ||
506 | } | ||
507 | |||
508 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
509 | { | ||
510 | die_owner = -1; | ||
511 | bust_spinlocks(0); | ||
512 | die_nest_count--; | ||
513 | if (!die_nest_count) | ||
514 | /* Nest count reaches zero, release the lock. */ | ||
515 | __raw_spin_unlock(&die_lock); | ||
516 | raw_local_irq_restore(flags); | ||
517 | if (!regs) { | ||
518 | oops_exit(); | ||
519 | return; | ||
520 | } | ||
521 | if (panic_on_oops) | ||
522 | panic("Fatal exception"); | ||
523 | oops_exit(); | ||
524 | do_exit(signr); | ||
525 | } | ||
526 | |||
527 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
528 | { | ||
529 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); | ||
530 | #ifdef CONFIG_PREEMPT | ||
531 | printk("PREEMPT "); | ||
532 | #endif | ||
533 | #ifdef CONFIG_SMP | ||
534 | printk("SMP "); | ||
535 | #endif | ||
536 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
537 | printk("DEBUG_PAGEALLOC"); | ||
538 | #endif | ||
539 | printk("\n"); | ||
540 | if (notify_die(DIE_OOPS, str, regs, err, | ||
541 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
542 | return 1; | ||
543 | |||
544 | show_registers(regs); | ||
545 | add_taint(TAINT_DIE); | ||
546 | /* Executive summary in case the oops scrolled away */ | ||
547 | printk(KERN_ALERT "RIP "); | ||
548 | printk_address(regs->ip, 1); | ||
549 | printk(" RSP <%016lx>\n", regs->sp); | ||
550 | if (kexec_should_crash(current)) | ||
551 | crash_kexec(regs); | ||
552 | return 0; | ||
553 | } | ||
554 | |||
555 | void die(const char *str, struct pt_regs *regs, long err) | ||
556 | { | ||
557 | unsigned long flags = oops_begin(); | ||
558 | |||
559 | if (!user_mode(regs)) | ||
560 | report_bug(regs->ip, regs); | ||
561 | |||
562 | if (__die(str, regs, err)) | ||
563 | regs = NULL; | ||
564 | oops_end(flags, regs, SIGSEGV); | ||
565 | } | ||
566 | |||
567 | notrace __kprobes void | ||
568 | die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
569 | { | ||
570 | unsigned long flags; | ||
571 | |||
572 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
573 | return; | ||
574 | |||
575 | flags = oops_begin(); | ||
576 | /* | ||
577 | * We are in trouble anyway, lets at least try | ||
578 | * to get a message out. | ||
579 | */ | ||
580 | printk(KERN_EMERG "%s", str); | ||
581 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
582 | smp_processor_id(), regs->ip); | ||
583 | show_registers(regs); | ||
584 | if (kexec_should_crash(current)) | ||
585 | crash_kexec(regs); | ||
586 | if (do_panic || panic_on_oops) | ||
587 | panic("Non maskable interrupt"); | ||
588 | oops_end(flags, NULL, SIGBUS); | ||
589 | nmi_exit(); | ||
590 | local_irq_enable(); | ||
591 | do_exit(SIGBUS); | ||
592 | } | ||
593 | |||
594 | static void __kprobes | ||
595 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, | ||
596 | long error_code, siginfo_t *info) | ||
597 | { | ||
598 | struct task_struct *tsk = current; | ||
599 | |||
600 | if (!user_mode(regs)) | ||
601 | goto kernel_trap; | ||
602 | |||
603 | /* | ||
604 | * We want error_code and trap_no set for userspace faults and | ||
605 | * kernelspace faults which result in die(), but not | ||
606 | * kernelspace faults which are fixed up. die() gives the | ||
607 | * process no chance to handle the signal and notice the | ||
608 | * kernel fault information, so that won't result in polluting | ||
609 | * the information about previously queued, but not yet | ||
610 | * delivered, faults. See also do_general_protection below. | ||
611 | */ | ||
612 | tsk->thread.error_code = error_code; | ||
613 | tsk->thread.trap_no = trapnr; | ||
614 | |||
615 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
616 | printk_ratelimit()) { | ||
617 | printk(KERN_INFO | ||
618 | "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | ||
619 | tsk->comm, tsk->pid, str, | ||
620 | regs->ip, regs->sp, error_code); | ||
621 | print_vma_addr(" in ", regs->ip); | ||
622 | printk("\n"); | ||
623 | } | ||
624 | |||
625 | if (info) | ||
626 | force_sig_info(signr, info, tsk); | ||
627 | else | ||
628 | force_sig(signr, tsk); | ||
629 | return; | ||
630 | |||
631 | kernel_trap: | ||
632 | if (!fixup_exception(regs)) { | ||
633 | tsk->thread.error_code = error_code; | ||
634 | tsk->thread.trap_no = trapnr; | ||
635 | die(str, regs, error_code); | ||
636 | } | ||
637 | return; | ||
638 | } | ||
639 | |||
640 | #define DO_ERROR(trapnr, signr, str, name) \ | ||
641 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
642 | { \ | ||
643 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
644 | == NOTIFY_STOP) \ | ||
645 | return; \ | ||
646 | conditional_sti(regs); \ | ||
647 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
648 | } | ||
649 | |||
650 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
651 | asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ | ||
652 | { \ | ||
653 | siginfo_t info; \ | ||
654 | info.si_signo = signr; \ | ||
655 | info.si_errno = 0; \ | ||
656 | info.si_code = sicode; \ | ||
657 | info.si_addr = (void __user *)siaddr; \ | ||
658 | trace_hardirqs_fixup(); \ | ||
659 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
660 | == NOTIFY_STOP) \ | ||
661 | return; \ | ||
662 | conditional_sti(regs); \ | ||
663 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
664 | } | ||
665 | |||
666 | DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | ||
667 | DO_ERROR(4, SIGSEGV, "overflow", overflow) | ||
668 | DO_ERROR(5, SIGSEGV, "bounds", bounds) | ||
669 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) | ||
670 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
671 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
672 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
673 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | ||
674 | |||
675 | /* Runs on IST stack */ | ||
676 | asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) | ||
677 | { | ||
678 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | ||
679 | 12, SIGBUS) == NOTIFY_STOP) | ||
680 | return; | ||
681 | preempt_conditional_sti(regs); | ||
682 | do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | ||
683 | preempt_conditional_cli(regs); | ||
684 | } | ||
685 | |||
686 | asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) | ||
687 | { | ||
688 | static const char str[] = "double fault"; | ||
689 | struct task_struct *tsk = current; | ||
690 | |||
691 | /* Return not checked because double check cannot be ignored */ | ||
692 | notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | ||
693 | |||
694 | tsk->thread.error_code = error_code; | ||
695 | tsk->thread.trap_no = 8; | ||
696 | |||
697 | /* This is always a kernel trap and never fixable (and thus must | ||
698 | never return). */ | ||
699 | for (;;) | ||
700 | die(str, regs, error_code); | ||
701 | } | ||
702 | |||
703 | asmlinkage void __kprobes | ||
704 | do_general_protection(struct pt_regs *regs, long error_code) | ||
705 | { | ||
706 | struct task_struct *tsk; | ||
707 | |||
708 | conditional_sti(regs); | ||
709 | |||
710 | tsk = current; | ||
711 | if (!user_mode(regs)) | ||
712 | goto gp_in_kernel; | ||
713 | |||
714 | tsk->thread.error_code = error_code; | ||
715 | tsk->thread.trap_no = 13; | ||
716 | |||
717 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
718 | printk_ratelimit()) { | ||
719 | printk(KERN_INFO | ||
720 | "%s[%d] general protection ip:%lx sp:%lx error:%lx", | ||
721 | tsk->comm, tsk->pid, | ||
722 | regs->ip, regs->sp, error_code); | ||
723 | print_vma_addr(" in ", regs->ip); | ||
724 | printk("\n"); | ||
725 | } | ||
726 | |||
727 | force_sig(SIGSEGV, tsk); | ||
728 | return; | ||
729 | |||
730 | gp_in_kernel: | ||
731 | if (fixup_exception(regs)) | ||
732 | return; | ||
733 | |||
734 | tsk->thread.error_code = error_code; | ||
735 | tsk->thread.trap_no = 13; | ||
736 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
737 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
738 | return; | ||
739 | die("general protection fault", regs, error_code); | ||
740 | } | ||
741 | |||
742 | static notrace __kprobes void | ||
743 | mem_parity_error(unsigned char reason, struct pt_regs *regs) | ||
744 | { | ||
745 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
746 | reason); | ||
747 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | ||
748 | |||
749 | #if defined(CONFIG_EDAC) | ||
750 | if (edac_handler_set()) { | ||
751 | edac_atomic_assert_error(); | ||
752 | return; | ||
753 | } | ||
754 | #endif | ||
755 | |||
756 | if (panic_on_unrecovered_nmi) | ||
757 | panic("NMI: Not continuing"); | ||
758 | |||
759 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
760 | |||
761 | /* Clear and disable the memory parity error line. */ | ||
762 | reason = (reason & 0xf) | 4; | ||
763 | outb(reason, 0x61); | ||
764 | } | ||
765 | |||
766 | static notrace __kprobes void | ||
767 | io_check_error(unsigned char reason, struct pt_regs *regs) | ||
768 | { | ||
769 | printk("NMI: IOCK error (debug interrupt?)\n"); | ||
770 | show_registers(regs); | ||
771 | |||
772 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
773 | reason = (reason & 0xf) | 8; | ||
774 | outb(reason, 0x61); | ||
775 | mdelay(2000); | ||
776 | reason &= ~8; | ||
777 | outb(reason, 0x61); | ||
778 | } | ||
779 | |||
780 | static notrace __kprobes void | ||
781 | unknown_nmi_error(unsigned char reason, struct pt_regs * regs) | ||
782 | { | ||
783 | if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | ||
784 | return; | ||
785 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
786 | reason); | ||
787 | printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); | ||
788 | |||
789 | if (panic_on_unrecovered_nmi) | ||
790 | panic("NMI: Not continuing"); | ||
791 | |||
792 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
793 | } | ||
794 | |||
795 | /* Runs on IST stack. This code must keep interrupts off all the time. | ||
796 | Nested NMIs are prevented by the CPU. */ | ||
797 | asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) | ||
798 | { | ||
799 | unsigned char reason = 0; | ||
800 | int cpu; | ||
801 | |||
802 | cpu = smp_processor_id(); | ||
803 | |||
804 | /* Only the BSP gets external NMIs from the system. */ | ||
805 | if (!cpu) | ||
806 | reason = get_nmi_reason(); | ||
807 | |||
808 | if (!(reason & 0xc0)) { | ||
809 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | ||
810 | == NOTIFY_STOP) | ||
811 | return; | ||
812 | /* | ||
813 | * Ok, so this is none of the documented NMI sources, | ||
814 | * so it must be the NMI watchdog. | ||
815 | */ | ||
816 | if (nmi_watchdog_tick(regs, reason)) | ||
817 | return; | ||
818 | if (!do_nmi_callback(regs, cpu)) | ||
819 | unknown_nmi_error(reason, regs); | ||
820 | |||
821 | return; | ||
822 | } | ||
823 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | ||
824 | return; | ||
825 | |||
826 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
827 | if (reason & 0x80) | ||
828 | mem_parity_error(reason, regs); | ||
829 | if (reason & 0x40) | ||
830 | io_check_error(reason, regs); | ||
831 | } | ||
832 | |||
833 | asmlinkage notrace __kprobes void | ||
834 | do_nmi(struct pt_regs *regs, long error_code) | ||
835 | { | ||
836 | nmi_enter(); | ||
837 | |||
838 | add_pda(__nmi_count, 1); | ||
839 | |||
840 | if (!ignore_nmis) | ||
841 | default_do_nmi(regs); | ||
842 | |||
843 | nmi_exit(); | ||
844 | } | ||
845 | |||
846 | void stop_nmi(void) | ||
847 | { | ||
848 | acpi_nmi_disable(); | ||
849 | ignore_nmis++; | ||
850 | } | ||
851 | |||
852 | void restart_nmi(void) | ||
853 | { | ||
854 | ignore_nmis--; | ||
855 | acpi_nmi_enable(); | ||
856 | } | ||
857 | |||
858 | /* runs on IST stack. */ | ||
859 | asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) | ||
860 | { | ||
861 | trace_hardirqs_fixup(); | ||
862 | |||
863 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | ||
864 | == NOTIFY_STOP) | ||
865 | return; | ||
866 | |||
867 | preempt_conditional_sti(regs); | ||
868 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
869 | preempt_conditional_cli(regs); | ||
870 | } | ||
871 | |||
872 | /* Help handler running on IST stack to switch back to user stack | ||
873 | for scheduling or signal handling. The actual stack switch is done in | ||
874 | entry.S */ | ||
875 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | ||
876 | { | ||
877 | struct pt_regs *regs = eregs; | ||
878 | /* Did already sync */ | ||
879 | if (eregs == (struct pt_regs *)eregs->sp) | ||
880 | ; | ||
881 | /* Exception from user space */ | ||
882 | else if (user_mode(eregs)) | ||
883 | regs = task_pt_regs(current); | ||
884 | /* Exception from kernel and interrupts are enabled. Move to | ||
885 | kernel process stack. */ | ||
886 | else if (eregs->flags & X86_EFLAGS_IF) | ||
887 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | ||
888 | if (eregs != regs) | ||
889 | *regs = *eregs; | ||
890 | return regs; | ||
891 | } | ||
892 | |||
893 | /* runs on IST stack. */ | ||
894 | asmlinkage void __kprobes do_debug(struct pt_regs * regs, | ||
895 | unsigned long error_code) | ||
896 | { | ||
897 | struct task_struct *tsk = current; | ||
898 | unsigned long condition; | ||
899 | siginfo_t info; | ||
900 | |||
901 | trace_hardirqs_fixup(); | ||
902 | |||
903 | get_debugreg(condition, 6); | ||
904 | |||
905 | /* | ||
906 | * The processor cleared BTF, so don't mark that we need it set. | ||
907 | */ | ||
908 | clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); | ||
909 | tsk->thread.debugctlmsr = 0; | ||
910 | |||
911 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
912 | SIGTRAP) == NOTIFY_STOP) | ||
913 | return; | ||
914 | |||
915 | preempt_conditional_sti(regs); | ||
916 | |||
917 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
918 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
919 | if (!tsk->thread.debugreg7) | ||
920 | goto clear_dr7; | ||
921 | } | ||
922 | |||
923 | tsk->thread.debugreg6 = condition; | ||
924 | |||
925 | /* | ||
926 | * Single-stepping through TF: make sure we ignore any events in | ||
927 | * kernel space (but re-enable TF when returning to user mode). | ||
928 | */ | ||
929 | if (condition & DR_STEP) { | ||
930 | if (!user_mode(regs)) | ||
931 | goto clear_TF_reenable; | ||
932 | } | ||
933 | |||
934 | /* Ok, finally something we can handle */ | ||
935 | tsk->thread.trap_no = 1; | ||
936 | tsk->thread.error_code = error_code; | ||
937 | info.si_signo = SIGTRAP; | ||
938 | info.si_errno = 0; | ||
939 | info.si_code = TRAP_BRKPT; | ||
940 | info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; | ||
941 | force_sig_info(SIGTRAP, &info, tsk); | ||
942 | |||
943 | clear_dr7: | ||
944 | set_debugreg(0, 7); | ||
945 | preempt_conditional_cli(regs); | ||
946 | return; | ||
947 | |||
948 | clear_TF_reenable: | ||
949 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | ||
950 | regs->flags &= ~X86_EFLAGS_TF; | ||
951 | preempt_conditional_cli(regs); | ||
952 | return; | ||
953 | } | ||
954 | |||
955 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
956 | { | ||
957 | if (fixup_exception(regs)) | ||
958 | return 1; | ||
959 | |||
960 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
961 | /* Illegal floating point operation in the kernel */ | ||
962 | current->thread.trap_no = trapnr; | ||
963 | die(str, regs, 0); | ||
964 | return 0; | ||
965 | } | ||
966 | |||
967 | /* | ||
968 | * Note that we play around with the 'TS' bit in an attempt to get | ||
969 | * the correct behaviour even in the presence of the asynchronous | ||
970 | * IRQ13 behaviour | ||
971 | */ | ||
972 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | ||
973 | { | ||
974 | void __user *ip = (void __user *)(regs->ip); | ||
975 | struct task_struct *task; | ||
976 | siginfo_t info; | ||
977 | unsigned short cwd, swd; | ||
978 | |||
979 | conditional_sti(regs); | ||
980 | if (!user_mode(regs) && | ||
981 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
982 | return; | ||
983 | |||
984 | /* | ||
985 | * Save the info for the exception handler and clear the error. | ||
986 | */ | ||
987 | task = current; | ||
988 | save_init_fpu(task); | ||
989 | task->thread.trap_no = 16; | ||
990 | task->thread.error_code = 0; | ||
991 | info.si_signo = SIGFPE; | ||
992 | info.si_errno = 0; | ||
993 | info.si_code = __SI_FAULT; | ||
994 | info.si_addr = ip; | ||
995 | /* | ||
996 | * (~cwd & swd) will mask out exceptions that are not set to unmasked | ||
997 | * status. 0x3f is the exception bits in these regs, 0x200 is the | ||
998 | * C1 reg you need in case of a stack fault, 0x040 is the stack | ||
999 | * fault bit. We should only be taking one exception at a time, | ||
1000 | * so if this combination doesn't produce any single exception, | ||
1001 | * then we have a bad program that isn't synchronizing its FPU usage | ||
1002 | * and it will suffer the consequences since we won't be able to | ||
1003 | * fully reproduce the context of the exception | ||
1004 | */ | ||
1005 | cwd = get_fpu_cwd(task); | ||
1006 | swd = get_fpu_swd(task); | ||
1007 | switch (swd & ~cwd & 0x3f) { | ||
1008 | case 0x000: /* No unmasked exception */ | ||
1009 | default: /* Multiple exceptions */ | ||
1010 | break; | ||
1011 | case 0x001: /* Invalid Op */ | ||
1012 | /* | ||
1013 | * swd & 0x240 == 0x040: Stack Underflow | ||
1014 | * swd & 0x240 == 0x240: Stack Overflow | ||
1015 | * User must clear the SF bit (0x40) if set | ||
1016 | */ | ||
1017 | info.si_code = FPE_FLTINV; | ||
1018 | break; | ||
1019 | case 0x002: /* Denormalize */ | ||
1020 | case 0x010: /* Underflow */ | ||
1021 | info.si_code = FPE_FLTUND; | ||
1022 | break; | ||
1023 | case 0x004: /* Zero Divide */ | ||
1024 | info.si_code = FPE_FLTDIV; | ||
1025 | break; | ||
1026 | case 0x008: /* Overflow */ | ||
1027 | info.si_code = FPE_FLTOVF; | ||
1028 | break; | ||
1029 | case 0x020: /* Precision */ | ||
1030 | info.si_code = FPE_FLTRES; | ||
1031 | break; | ||
1032 | } | ||
1033 | force_sig_info(SIGFPE, &info, task); | ||
1034 | } | ||
1035 | |||
1036 | asmlinkage void bad_intr(void) | ||
1037 | { | ||
1038 | printk("bad interrupt"); | ||
1039 | } | ||
1040 | |||
1041 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | ||
1042 | { | ||
1043 | void __user *ip = (void __user *)(regs->ip); | ||
1044 | struct task_struct *task; | ||
1045 | siginfo_t info; | ||
1046 | unsigned short mxcsr; | ||
1047 | |||
1048 | conditional_sti(regs); | ||
1049 | if (!user_mode(regs) && | ||
1050 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
1051 | return; | ||
1052 | |||
1053 | /* | ||
1054 | * Save the info for the exception handler and clear the error. | ||
1055 | */ | ||
1056 | task = current; | ||
1057 | save_init_fpu(task); | ||
1058 | task->thread.trap_no = 19; | ||
1059 | task->thread.error_code = 0; | ||
1060 | info.si_signo = SIGFPE; | ||
1061 | info.si_errno = 0; | ||
1062 | info.si_code = __SI_FAULT; | ||
1063 | info.si_addr = ip; | ||
1064 | /* | ||
1065 | * The SIMD FPU exceptions are handled a little differently, as there | ||
1066 | * is only a single status/control register. Thus, to determine which | ||
1067 | * unmasked exception was caught we must mask the exception mask bits | ||
1068 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
1069 | */ | ||
1070 | mxcsr = get_fpu_mxcsr(task); | ||
1071 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
1072 | case 0x000: | ||
1073 | default: | ||
1074 | break; | ||
1075 | case 0x001: /* Invalid Op */ | ||
1076 | info.si_code = FPE_FLTINV; | ||
1077 | break; | ||
1078 | case 0x002: /* Denormalize */ | ||
1079 | case 0x010: /* Underflow */ | ||
1080 | info.si_code = FPE_FLTUND; | ||
1081 | break; | ||
1082 | case 0x004: /* Zero Divide */ | ||
1083 | info.si_code = FPE_FLTDIV; | ||
1084 | break; | ||
1085 | case 0x008: /* Overflow */ | ||
1086 | info.si_code = FPE_FLTOVF; | ||
1087 | break; | ||
1088 | case 0x020: /* Precision */ | ||
1089 | info.si_code = FPE_FLTRES; | ||
1090 | break; | ||
1091 | } | ||
1092 | force_sig_info(SIGFPE, &info, task); | ||
1093 | } | ||
1094 | |||
1095 | asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) | ||
1096 | { | ||
1097 | } | ||
1098 | |||
1099 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
1100 | { | ||
1101 | } | ||
1102 | |||
1103 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | ||
1104 | { | ||
1105 | } | ||
1106 | |||
1107 | /* | ||
1108 | * 'math_state_restore()' saves the current math information in the | ||
1109 | * old math state array, and gets the new ones from the current task | ||
1110 | * | ||
1111 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
1112 | * Don't touch unless you *really* know how it works. | ||
1113 | */ | ||
1114 | asmlinkage void math_state_restore(void) | ||
1115 | { | ||
1116 | struct task_struct *me = current; | ||
1117 | |||
1118 | if (!used_math()) { | ||
1119 | local_irq_enable(); | ||
1120 | /* | ||
1121 | * does a slab alloc which can sleep | ||
1122 | */ | ||
1123 | if (init_fpu(me)) { | ||
1124 | /* | ||
1125 | * ran out of memory! | ||
1126 | */ | ||
1127 | do_group_exit(SIGKILL); | ||
1128 | return; | ||
1129 | } | ||
1130 | local_irq_disable(); | ||
1131 | } | ||
1132 | |||
1133 | clts(); /* Allow maths ops (or we recurse) */ | ||
1134 | /* | ||
1135 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
1136 | */ | ||
1137 | if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) { | ||
1138 | stts(); | ||
1139 | force_sig(SIGSEGV, me); | ||
1140 | return; | ||
1141 | } | ||
1142 | task_thread_info(me)->status |= TS_USEDFPU; | ||
1143 | me->fpu_counter++; | ||
1144 | } | ||
1145 | EXPORT_SYMBOL_GPL(math_state_restore); | ||
1146 | |||
1147 | void __init trap_init(void) | ||
1148 | { | ||
1149 | set_intr_gate(0, &divide_error); | ||
1150 | set_intr_gate_ist(1, &debug, DEBUG_STACK); | ||
1151 | set_intr_gate_ist(2, &nmi, NMI_STACK); | ||
1152 | set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ | ||
1153 | set_system_gate(4, &overflow); /* int4 can be called from all */ | ||
1154 | set_intr_gate(5, &bounds); | ||
1155 | set_intr_gate(6, &invalid_op); | ||
1156 | set_intr_gate(7, &device_not_available); | ||
1157 | set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); | ||
1158 | set_intr_gate(9, &coprocessor_segment_overrun); | ||
1159 | set_intr_gate(10, &invalid_TSS); | ||
1160 | set_intr_gate(11, &segment_not_present); | ||
1161 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); | ||
1162 | set_intr_gate(13, &general_protection); | ||
1163 | set_intr_gate(14, &page_fault); | ||
1164 | set_intr_gate(15, &spurious_interrupt_bug); | ||
1165 | set_intr_gate(16, &coprocessor_error); | ||
1166 | set_intr_gate(17, &alignment_check); | ||
1167 | #ifdef CONFIG_X86_MCE | ||
1168 | set_intr_gate_ist(18, &machine_check, MCE_STACK); | ||
1169 | #endif | ||
1170 | set_intr_gate(19, &simd_coprocessor_error); | ||
1171 | |||
1172 | #ifdef CONFIG_IA32_EMULATION | ||
1173 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
1174 | #endif | ||
1175 | /* | ||
1176 | * initialize the per thread extended state: | ||
1177 | */ | ||
1178 | init_thread_xstate(); | ||
1179 | /* | ||
1180 | * Should be a barrier for any external CPU state: | ||
1181 | */ | ||
1182 | cpu_init(); | ||
1183 | } | ||
1184 | |||
1185 | static int __init oops_setup(char *s) | ||
1186 | { | ||
1187 | if (!s) | ||
1188 | return -EINVAL; | ||
1189 | if (!strcmp(s, "panic")) | ||
1190 | panic_on_oops = 1; | ||
1191 | return 0; | ||
1192 | } | ||
1193 | early_param("oops", oops_setup); | ||
1194 | |||
1195 | static int __init kstack_setup(char *s) | ||
1196 | { | ||
1197 | if (!s) | ||
1198 | return -EINVAL; | ||
1199 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
1200 | return 0; | ||
1201 | } | ||
1202 | early_param("kstack", kstack_setup); | ||
1203 | |||
1204 | static int __init code_bytes_setup(char *s) | ||
1205 | { | ||
1206 | code_bytes = simple_strtoul(s, NULL, 0); | ||
1207 | if (code_bytes > 8192) | ||
1208 | code_bytes = 8192; | ||
1209 | |||
1210 | return 1; | ||
1211 | } | ||
1212 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 8f98e9de1b82..161bb850fc47 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); | |||
104 | /* | 104 | /* |
105 | * Read TSC and the reference counters. Take care of SMI disturbance | 105 | * Read TSC and the reference counters. Take care of SMI disturbance |
106 | */ | 106 | */ |
107 | static u64 tsc_read_refs(u64 *pm, u64 *hpet) | 107 | static u64 tsc_read_refs(u64 *p, int hpet) |
108 | { | 108 | { |
109 | u64 t1, t2; | 109 | u64 t1, t2; |
110 | int i; | 110 | int i; |
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) | |||
112 | for (i = 0; i < MAX_RETRIES; i++) { | 112 | for (i = 0; i < MAX_RETRIES; i++) { |
113 | t1 = get_cycles(); | 113 | t1 = get_cycles(); |
114 | if (hpet) | 114 | if (hpet) |
115 | *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; | 115 | *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; |
116 | else | 116 | else |
117 | *pm = acpi_pm_read_early(); | 117 | *p = acpi_pm_read_early(); |
118 | t2 = get_cycles(); | 118 | t2 = get_cycles(); |
119 | if ((t2 - t1) < SMI_TRESHOLD) | 119 | if ((t2 - t1) < SMI_TRESHOLD) |
120 | return t2; | 120 | return t2; |
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) | |||
123 | } | 123 | } |
124 | 124 | ||
125 | /* | 125 | /* |
126 | * Calculate the TSC frequency from HPET reference | ||
127 | */ | ||
128 | static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) | ||
129 | { | ||
130 | u64 tmp; | ||
131 | |||
132 | if (hpet2 < hpet1) | ||
133 | hpet2 += 0x100000000ULL; | ||
134 | hpet2 -= hpet1; | ||
135 | tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); | ||
136 | do_div(tmp, 1000000); | ||
137 | do_div(deltatsc, tmp); | ||
138 | |||
139 | return (unsigned long) deltatsc; | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Calculate the TSC frequency from PMTimer reference | ||
144 | */ | ||
145 | static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) | ||
146 | { | ||
147 | u64 tmp; | ||
148 | |||
149 | if (!pm1 && !pm2) | ||
150 | return ULONG_MAX; | ||
151 | |||
152 | if (pm2 < pm1) | ||
153 | pm2 += (u64)ACPI_PM_OVRRUN; | ||
154 | pm2 -= pm1; | ||
155 | tmp = pm2 * 1000000000LL; | ||
156 | do_div(tmp, PMTMR_TICKS_PER_SEC); | ||
157 | do_div(deltatsc, tmp); | ||
158 | |||
159 | return (unsigned long) deltatsc; | ||
160 | } | ||
161 | |||
162 | #define CAL_MS 10 | ||
163 | #define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) | ||
164 | #define CAL_PIT_LOOPS 1000 | ||
165 | |||
166 | #define CAL2_MS 50 | ||
167 | #define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) | ||
168 | #define CAL2_PIT_LOOPS 5000 | ||
169 | |||
170 | |||
171 | /* | ||
126 | * Try to calibrate the TSC against the Programmable | 172 | * Try to calibrate the TSC against the Programmable |
127 | * Interrupt Timer and return the frequency of the TSC | 173 | * Interrupt Timer and return the frequency of the TSC |
128 | * in kHz. | 174 | * in kHz. |
129 | * | 175 | * |
130 | * Return ULONG_MAX on failure to calibrate. | 176 | * Return ULONG_MAX on failure to calibrate. |
131 | */ | 177 | */ |
132 | static unsigned long pit_calibrate_tsc(void) | 178 | static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) |
133 | { | 179 | { |
134 | u64 tsc, t1, t2, delta; | 180 | u64 tsc, t1, t2, delta; |
135 | unsigned long tscmin, tscmax; | 181 | unsigned long tscmin, tscmax; |
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void) | |||
144 | * (LSB then MSB) to begin countdown. | 190 | * (LSB then MSB) to begin countdown. |
145 | */ | 191 | */ |
146 | outb(0xb0, 0x43); | 192 | outb(0xb0, 0x43); |
147 | outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); | 193 | outb(latch & 0xff, 0x42); |
148 | outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); | 194 | outb(latch >> 8, 0x42); |
149 | 195 | ||
150 | tsc = t1 = t2 = get_cycles(); | 196 | tsc = t1 = t2 = get_cycles(); |
151 | 197 | ||
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void) | |||
166 | /* | 212 | /* |
167 | * Sanity checks: | 213 | * Sanity checks: |
168 | * | 214 | * |
169 | * If we were not able to read the PIT more than 5000 | 215 | * If we were not able to read the PIT more than loopmin |
170 | * times, then we have been hit by a massive SMI | 216 | * times, then we have been hit by a massive SMI |
171 | * | 217 | * |
172 | * If the maximum is 10 times larger than the minimum, | 218 | * If the maximum is 10 times larger than the minimum, |
173 | * then we got hit by an SMI as well. | 219 | * then we got hit by an SMI as well. |
174 | */ | 220 | */ |
175 | if (pitcnt < 5000 || tscmax > 10 * tscmin) | 221 | if (pitcnt < loopmin || tscmax > 10 * tscmin) |
176 | return ULONG_MAX; | 222 | return ULONG_MAX; |
177 | 223 | ||
178 | /* Calculate the PIT value */ | 224 | /* Calculate the PIT value */ |
179 | delta = t2 - t1; | 225 | delta = t2 - t1; |
180 | do_div(delta, 50); | 226 | do_div(delta, ms); |
181 | return delta; | 227 | return delta; |
182 | } | 228 | } |
183 | 229 | ||
230 | /* | ||
231 | * This reads the current MSB of the PIT counter, and | ||
232 | * checks if we are running on sufficiently fast and | ||
233 | * non-virtualized hardware. | ||
234 | * | ||
235 | * Our expectations are: | ||
236 | * | ||
237 | * - the PIT is running at roughly 1.19MHz | ||
238 | * | ||
239 | * - each IO is going to take about 1us on real hardware, | ||
240 | * but we allow it to be much faster (by a factor of 10) or | ||
241 | * _slightly_ slower (ie we allow up to a 2us read+counter | ||
242 | * update - anything else implies an unacceptably slow CPU | ||
243 | * or PIT for the fast calibration to work. | ||
244 | * | ||
245 | * - with 256 PIT ticks to read the value, we have 214us to | ||
246 | * see the same MSB (and overhead like doing a single TSC | ||
247 | * read per MSB value etc). | ||
248 | * | ||
249 | * - We're doing 2 reads per loop (LSB, MSB), and we expect | ||
250 | * them each to take about a microsecond on real hardware. | ||
251 | * So we expect a count value of around 100. But we'll be | ||
252 | * generous, and accept anything over 50. | ||
253 | * | ||
254 | * - if the PIT is stuck, and we see *many* more reads, we | ||
255 | * return early (and the next caller of pit_expect_msb() | ||
256 | * will then consider it a failure when they don't see the | ||
257 | * next expected value). | ||
258 | * | ||
259 | * These expectations mean that we know that we have seen the | ||
260 | * transition from one expected value to another with a fairly | ||
261 | * high accuracy, and we didn't miss any events. We can thus | ||
262 | * use the TSC value at the transitions to calculate a pretty | ||
263 | * good value for the TSC frequency. | ||
264 | */ | ||
265 | static inline int pit_expect_msb(unsigned char val) | ||
266 | { | ||
267 | int count = 0; | ||
268 | |||
269 | for (count = 0; count < 50000; count++) { | ||
270 | /* Ignore LSB */ | ||
271 | inb(0x42); | ||
272 | if (inb(0x42) != val) | ||
273 | break; | ||
274 | } | ||
275 | return count > 50; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * How many MSB values do we want to see? We aim for a | ||
280 | * 15ms calibration, which assuming a 2us counter read | ||
281 | * error should give us roughly 150 ppm precision for | ||
282 | * the calibration. | ||
283 | */ | ||
284 | #define QUICK_PIT_MS 15 | ||
285 | #define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) | ||
286 | |||
287 | static unsigned long quick_pit_calibrate(void) | ||
288 | { | ||
289 | /* Set the Gate high, disable speaker */ | ||
290 | outb((inb(0x61) & ~0x02) | 0x01, 0x61); | ||
291 | |||
292 | /* | ||
293 | * Counter 2, mode 0 (one-shot), binary count | ||
294 | * | ||
295 | * NOTE! Mode 2 decrements by two (and then the | ||
296 | * output is flipped each time, giving the same | ||
297 | * final output frequency as a decrement-by-one), | ||
298 | * so mode 0 is much better when looking at the | ||
299 | * individual counts. | ||
300 | */ | ||
301 | outb(0xb0, 0x43); | ||
302 | |||
303 | /* Start at 0xffff */ | ||
304 | outb(0xff, 0x42); | ||
305 | outb(0xff, 0x42); | ||
306 | |||
307 | if (pit_expect_msb(0xff)) { | ||
308 | int i; | ||
309 | u64 t1, t2, delta; | ||
310 | unsigned char expect = 0xfe; | ||
311 | |||
312 | t1 = get_cycles(); | ||
313 | for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { | ||
314 | if (!pit_expect_msb(expect)) | ||
315 | goto failed; | ||
316 | } | ||
317 | t2 = get_cycles(); | ||
318 | |||
319 | /* | ||
320 | * Make sure we can rely on the second TSC timestamp: | ||
321 | */ | ||
322 | if (!pit_expect_msb(expect)) | ||
323 | goto failed; | ||
324 | |||
325 | /* | ||
326 | * Ok, if we get here, then we've seen the | ||
327 | * MSB of the PIT decrement QUICK_PIT_ITERATIONS | ||
328 | * times, and each MSB had many hits, so we never | ||
329 | * had any sudden jumps. | ||
330 | * | ||
331 | * As a result, we can depend on there not being | ||
332 | * any odd delays anywhere, and the TSC reads are | ||
333 | * reliable. | ||
334 | * | ||
335 | * kHz = ticks / time-in-seconds / 1000; | ||
336 | * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000 | ||
337 | * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000) | ||
338 | */ | ||
339 | delta = (t2 - t1)*PIT_TICK_RATE; | ||
340 | do_div(delta, QUICK_PIT_ITERATIONS*256*1000); | ||
341 | printk("Fast TSC calibration using PIT\n"); | ||
342 | return delta; | ||
343 | } | ||
344 | failed: | ||
345 | return 0; | ||
346 | } | ||
184 | 347 | ||
185 | /** | 348 | /** |
186 | * native_calibrate_tsc - calibrate the tsc on boot | 349 | * native_calibrate_tsc - calibrate the tsc on boot |
187 | */ | 350 | */ |
188 | unsigned long native_calibrate_tsc(void) | 351 | unsigned long native_calibrate_tsc(void) |
189 | { | 352 | { |
190 | u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; | 353 | u64 tsc1, tsc2, delta, ref1, ref2; |
191 | unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; | 354 | unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; |
192 | unsigned long flags; | 355 | unsigned long flags, latch, ms, fast_calibrate; |
193 | int hpet = is_hpet_enabled(), i; | 356 | int hpet = is_hpet_enabled(), i, loopmin; |
357 | |||
358 | local_irq_save(flags); | ||
359 | fast_calibrate = quick_pit_calibrate(); | ||
360 | local_irq_restore(flags); | ||
361 | if (fast_calibrate) | ||
362 | return fast_calibrate; | ||
194 | 363 | ||
195 | /* | 364 | /* |
196 | * Run 5 calibration loops to get the lowest frequency value | 365 | * Run 5 calibration loops to get the lowest frequency value |
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void) | |||
216 | * calibration delay loop as we have to wait for a certain | 385 | * calibration delay loop as we have to wait for a certain |
217 | * amount of time anyway. | 386 | * amount of time anyway. |
218 | */ | 387 | */ |
219 | for (i = 0; i < 5; i++) { | 388 | |
389 | /* Preset PIT loop values */ | ||
390 | latch = CAL_LATCH; | ||
391 | ms = CAL_MS; | ||
392 | loopmin = CAL_PIT_LOOPS; | ||
393 | |||
394 | for (i = 0; i < 3; i++) { | ||
220 | unsigned long tsc_pit_khz; | 395 | unsigned long tsc_pit_khz; |
221 | 396 | ||
222 | /* | 397 | /* |
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void) | |||
226 | * read the end value. | 401 | * read the end value. |
227 | */ | 402 | */ |
228 | local_irq_save(flags); | 403 | local_irq_save(flags); |
229 | tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); | 404 | tsc1 = tsc_read_refs(&ref1, hpet); |
230 | tsc_pit_khz = pit_calibrate_tsc(); | 405 | tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); |
231 | tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); | 406 | tsc2 = tsc_read_refs(&ref2, hpet); |
232 | local_irq_restore(flags); | 407 | local_irq_restore(flags); |
233 | 408 | ||
234 | /* Pick the lowest PIT TSC calibration so far */ | 409 | /* Pick the lowest PIT TSC calibration so far */ |
235 | tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); | 410 | tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); |
236 | 411 | ||
237 | /* hpet or pmtimer available ? */ | 412 | /* hpet or pmtimer available ? */ |
238 | if (!hpet && !pm1 && !pm2) | 413 | if (!hpet && !ref1 && !ref2) |
239 | continue; | 414 | continue; |
240 | 415 | ||
241 | /* Check, whether the sampling was disturbed by an SMI */ | 416 | /* Check, whether the sampling was disturbed by an SMI */ |
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void) | |||
243 | continue; | 418 | continue; |
244 | 419 | ||
245 | tsc2 = (tsc2 - tsc1) * 1000000LL; | 420 | tsc2 = (tsc2 - tsc1) * 1000000LL; |
421 | if (hpet) | ||
422 | tsc2 = calc_hpet_ref(tsc2, ref1, ref2); | ||
423 | else | ||
424 | tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); | ||
246 | 425 | ||
247 | if (hpet) { | 426 | tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); |
248 | if (hpet2 < hpet1) | 427 | |
249 | hpet2 += 0x100000000ULL; | 428 | /* Check the reference deviation */ |
250 | hpet2 -= hpet1; | 429 | delta = ((u64) tsc_pit_min) * 100; |
251 | tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); | 430 | do_div(delta, tsc_ref_min); |
252 | do_div(tsc1, 1000000); | 431 | |
253 | } else { | 432 | /* |
254 | if (pm2 < pm1) | 433 | * If both calibration results are inside a 10% window |
255 | pm2 += (u64)ACPI_PM_OVRRUN; | 434 | * then we can be sure, that the calibration |
256 | pm2 -= pm1; | 435 | * succeeded. We break out of the loop right away. We |
257 | tsc1 = pm2 * 1000000000LL; | 436 | * use the reference value, as it is more precise. |
258 | do_div(tsc1, PMTMR_TICKS_PER_SEC); | 437 | */ |
438 | if (delta >= 90 && delta <= 110) { | ||
439 | printk(KERN_INFO | ||
440 | "TSC: PIT calibration matches %s. %d loops\n", | ||
441 | hpet ? "HPET" : "PMTIMER", i + 1); | ||
442 | return tsc_ref_min; | ||
259 | } | 443 | } |
260 | 444 | ||
261 | do_div(tsc2, tsc1); | 445 | /* |
262 | tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); | 446 | * Check whether PIT failed more than once. This |
447 | * happens in virtualized environments. We need to | ||
448 | * give the virtual PC a slightly longer timeframe for | ||
449 | * the HPET/PMTIMER to make the result precise. | ||
450 | */ | ||
451 | if (i == 1 && tsc_pit_min == ULONG_MAX) { | ||
452 | latch = CAL2_LATCH; | ||
453 | ms = CAL2_MS; | ||
454 | loopmin = CAL2_PIT_LOOPS; | ||
455 | } | ||
263 | } | 456 | } |
264 | 457 | ||
265 | /* | 458 | /* |
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void) | |||
270 | printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); | 463 | printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); |
271 | 464 | ||
272 | /* We don't have an alternative source, disable TSC */ | 465 | /* We don't have an alternative source, disable TSC */ |
273 | if (!hpet && !pm1 && !pm2) { | 466 | if (!hpet && !ref1 && !ref2) { |
274 | printk("TSC: No reference (HPET/PMTIMER) available\n"); | 467 | printk("TSC: No reference (HPET/PMTIMER) available\n"); |
275 | return 0; | 468 | return 0; |
276 | } | 469 | } |
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void) | |||
278 | /* The alternative source failed as well, disable TSC */ | 471 | /* The alternative source failed as well, disable TSC */ |
279 | if (tsc_ref_min == ULONG_MAX) { | 472 | if (tsc_ref_min == ULONG_MAX) { |
280 | printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " | 473 | printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " |
281 | "failed due to SMI disturbance.\n"); | 474 | "failed.\n"); |
282 | return 0; | 475 | return 0; |
283 | } | 476 | } |
284 | 477 | ||
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void) | |||
290 | } | 483 | } |
291 | 484 | ||
292 | /* We don't have an alternative source, use the PIT calibration value */ | 485 | /* We don't have an alternative source, use the PIT calibration value */ |
293 | if (!hpet && !pm1 && !pm2) { | 486 | if (!hpet && !ref1 && !ref2) { |
294 | printk(KERN_INFO "TSC: Using PIT calibration value\n"); | 487 | printk(KERN_INFO "TSC: Using PIT calibration value\n"); |
295 | return tsc_pit_min; | 488 | return tsc_pit_min; |
296 | } | 489 | } |
297 | 490 | ||
298 | /* The alternative source failed, use the PIT calibration value */ | 491 | /* The alternative source failed, use the PIT calibration value */ |
299 | if (tsc_ref_min == ULONG_MAX) { | 492 | if (tsc_ref_min == ULONG_MAX) { |
300 | printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " | 493 | printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " |
301 | "to SMI disturbance. Using PIT calibration\n"); | 494 | "Using PIT calibration\n"); |
302 | return tsc_pit_min; | 495 | return tsc_pit_min; |
303 | } | 496 | } |
304 | 497 | ||
305 | /* Check the reference deviation */ | ||
306 | delta = ((u64) tsc_pit_min) * 100; | ||
307 | do_div(delta, tsc_ref_min); | ||
308 | |||
309 | /* | ||
310 | * If both calibration results are inside a 5% window, the we | ||
311 | * use the lower frequency of those as it is probably the | ||
312 | * closest estimate. | ||
313 | */ | ||
314 | if (delta >= 95 && delta <= 105) { | ||
315 | printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n", | ||
316 | hpet ? "HPET" : "PMTIMER"); | ||
317 | printk(KERN_INFO "TSC: using %s calibration value\n", | ||
318 | tsc_pit_min <= tsc_ref_min ? "PIT" : | ||
319 | hpet ? "HPET" : "PMTIMER"); | ||
320 | return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min; | ||
321 | } | ||
322 | |||
323 | printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", | ||
324 | hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); | ||
325 | |||
326 | /* | 498 | /* |
327 | * The calibration values differ too much. In doubt, we use | 499 | * The calibration values differ too much. In doubt, we use |
328 | * the PIT value as we know that there are PMTIMERs around | 500 | * the PIT value as we know that there are PMTIMERs around |
329 | * running at double speed. | 501 | * running at double speed. At least we let the user know: |
330 | */ | 502 | */ |
503 | printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", | ||
504 | hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); | ||
331 | printk(KERN_INFO "TSC: Using PIT calibration value\n"); | 505 | printk(KERN_INFO "TSC: Using PIT calibration value\n"); |
332 | return tsc_pit_min; | 506 | return tsc_pit_min; |
333 | } | 507 | } |
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 594ef47f0a63..61a97e616f70 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c | |||
@@ -25,45 +25,31 @@ | |||
25 | #include <asm/visws/cobalt.h> | 25 | #include <asm/visws/cobalt.h> |
26 | #include <asm/visws/piix4.h> | 26 | #include <asm/visws/piix4.h> |
27 | #include <asm/arch_hooks.h> | 27 | #include <asm/arch_hooks.h> |
28 | #include <asm/io_apic.h> | ||
28 | #include <asm/fixmap.h> | 29 | #include <asm/fixmap.h> |
29 | #include <asm/reboot.h> | 30 | #include <asm/reboot.h> |
30 | #include <asm/setup.h> | 31 | #include <asm/setup.h> |
31 | #include <asm/e820.h> | 32 | #include <asm/e820.h> |
32 | #include <asm/smp.h> | ||
33 | #include <asm/io.h> | 33 | #include <asm/io.h> |
34 | 34 | ||
35 | #include <mach_ipi.h> | 35 | #include <mach_ipi.h> |
36 | 36 | ||
37 | #include "mach_apic.h" | 37 | #include "mach_apic.h" |
38 | 38 | ||
39 | #include <linux/init.h> | ||
40 | #include <linux/smp.h> | ||
41 | |||
42 | #include <linux/kernel_stat.h> | 39 | #include <linux/kernel_stat.h> |
43 | #include <linux/interrupt.h> | ||
44 | #include <linux/init.h> | ||
45 | 40 | ||
46 | #include <asm/io.h> | ||
47 | #include <asm/apic.h> | ||
48 | #include <asm/i8259.h> | 41 | #include <asm/i8259.h> |
49 | #include <asm/irq_vectors.h> | 42 | #include <asm/irq_vectors.h> |
50 | #include <asm/visws/cobalt.h> | ||
51 | #include <asm/visws/lithium.h> | 43 | #include <asm/visws/lithium.h> |
52 | #include <asm/visws/piix4.h> | ||
53 | 44 | ||
54 | #include <linux/sched.h> | 45 | #include <linux/sched.h> |
55 | #include <linux/kernel.h> | 46 | #include <linux/kernel.h> |
56 | #include <linux/init.h> | ||
57 | #include <linux/pci.h> | 47 | #include <linux/pci.h> |
58 | #include <linux/pci_ids.h> | 48 | #include <linux/pci_ids.h> |
59 | 49 | ||
60 | extern int no_broadcast; | 50 | extern int no_broadcast; |
61 | 51 | ||
62 | #include <asm/io.h> | ||
63 | #include <asm/apic.h> | 52 | #include <asm/apic.h> |
64 | #include <asm/arch_hooks.h> | ||
65 | #include <asm/visws/cobalt.h> | ||
66 | #include <asm/visws/lithium.h> | ||
67 | 53 | ||
68 | char visws_board_type = -1; | 54 | char visws_board_type = -1; |
69 | char visws_board_rev = -1; | 55 | char visws_board_rev = -1; |
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 38f566fa27d2..4eeb5cf9720d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <asm/io.h> | 46 | #include <asm/io.h> |
47 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
48 | #include <asm/irq.h> | 48 | #include <asm/irq.h> |
49 | #include <asm/syscalls.h> | ||
49 | 50 | ||
50 | /* | 51 | /* |
51 | * Known problems: | 52 | * Known problems: |
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 6ca515d6db54..8b6c393ab9fd 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c | |||
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, | |||
235 | const void *desc) | 235 | const void *desc) |
236 | { | 236 | { |
237 | u32 *ldt_entry = (u32 *)desc; | 237 | u32 *ldt_entry = (u32 *)desc; |
238 | vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); | 238 | vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); |
239 | } | 239 | } |
240 | 240 | ||
241 | static void vmi_load_sp0(struct tss_struct *tss, | 241 | static void vmi_load_sp0(struct tss_struct *tss, |
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) | |||
393 | } | 393 | } |
394 | #endif | 394 | #endif |
395 | 395 | ||
396 | static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) | 396 | static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) |
397 | { | 397 | { |
398 | vmi_set_page_type(pfn, VMI_PAGE_L1); | 398 | vmi_set_page_type(pfn, VMI_PAGE_L1); |
399 | vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); | 399 | vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); |
400 | } | 400 | } |
401 | 401 | ||
402 | static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) | 402 | static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) |
403 | { | 403 | { |
404 | /* | 404 | /* |
405 | * This call comes in very early, before mem_map is setup. | 405 | * This call comes in very early, before mem_map is setup. |
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) | |||
410 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); | 410 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); |
411 | } | 411 | } |
412 | 412 | ||
413 | static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) | 413 | static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) |
414 | { | 414 | { |
415 | vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); | 415 | vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); |
416 | vmi_check_page_type(clonepfn, VMI_PAGE_L2); | 416 | vmi_check_page_type(clonepfn, VMI_PAGE_L2); |
417 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); | 417 | vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); |
418 | } | 418 | } |
419 | 419 | ||
420 | static void vmi_release_pte(u32 pfn) | 420 | static void vmi_release_pte(unsigned long pfn) |
421 | { | 421 | { |
422 | vmi_ops.release_page(pfn, VMI_PAGE_L1); | 422 | vmi_ops.release_page(pfn, VMI_PAGE_L1); |
423 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); | 423 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); |
424 | } | 424 | } |
425 | 425 | ||
426 | static void vmi_release_pmd(u32 pfn) | 426 | static void vmi_release_pmd(unsigned long pfn) |
427 | { | 427 | { |
428 | vmi_ops.release_page(pfn, VMI_PAGE_L2); | 428 | vmi_ops.release_page(pfn, VMI_PAGE_L2); |
429 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); | 429 | vmi_set_page_type(pfn, VMI_PAGE_NORMAL); |
@@ -905,8 +905,8 @@ static inline int __init activate_vmi(void) | |||
905 | #endif | 905 | #endif |
906 | 906 | ||
907 | #ifdef CONFIG_X86_LOCAL_APIC | 907 | #ifdef CONFIG_X86_LOCAL_APIC |
908 | para_fill(pv_apic_ops.apic_read, APICRead); | 908 | para_fill(apic_ops->read, APICRead); |
909 | para_fill(pv_apic_ops.apic_write, APICWrite); | 909 | para_fill(apic_ops->write, APICWrite); |
910 | #endif | 910 | #endif |
911 | 911 | ||
912 | /* | 912 | /* |
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index af5bdad84604..a9b8560adbc2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S | |||
@@ -140,10 +140,10 @@ SECTIONS | |||
140 | *(.con_initcall.init) | 140 | *(.con_initcall.init) |
141 | __con_initcall_end = .; | 141 | __con_initcall_end = .; |
142 | } | 142 | } |
143 | .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { | 143 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { |
144 | __x86cpuvendor_start = .; | 144 | __x86_cpu_dev_start = .; |
145 | *(.x86cpuvendor.init) | 145 | *(.x86_cpu_dev.init) |
146 | __x86cpuvendor_end = .; | 146 | __x86_cpu_dev_end = .; |
147 | } | 147 | } |
148 | SECURITY_INIT | 148 | SECURITY_INIT |
149 | . = ALIGN(4); | 149 | . = ALIGN(4); |
@@ -180,6 +180,7 @@ SECTIONS | |||
180 | . = ALIGN(PAGE_SIZE); | 180 | . = ALIGN(PAGE_SIZE); |
181 | .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { | 181 | .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { |
182 | __per_cpu_start = .; | 182 | __per_cpu_start = .; |
183 | *(.data.percpu.page_aligned) | ||
183 | *(.data.percpu) | 184 | *(.data.percpu) |
184 | *(.data.percpu.shared_aligned) | 185 | *(.data.percpu.shared_aligned) |
185 | __per_cpu_end = .; | 186 | __per_cpu_end = .; |
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 63e5c1a22e88..46e05447405b 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S | |||
@@ -168,12 +168,11 @@ SECTIONS | |||
168 | *(.con_initcall.init) | 168 | *(.con_initcall.init) |
169 | } | 169 | } |
170 | __con_initcall_end = .; | 170 | __con_initcall_end = .; |
171 | . = ALIGN(16); | 171 | __x86_cpu_dev_start = .; |
172 | __x86cpuvendor_start = .; | 172 | .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { |
173 | .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { | 173 | *(.x86_cpu_dev.init) |
174 | *(.x86cpuvendor.init) | ||
175 | } | 174 | } |
176 | __x86cpuvendor_end = .; | 175 | __x86_cpu_dev_end = .; |
177 | SECURITY_INIT | 176 | SECURITY_INIT |
178 | 177 | ||
179 | . = ALIGN(8); | 178 | . = ALIGN(8); |
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 0c029e8959c7..7766d36983fc 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c | |||
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void) | |||
61 | native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); | 61 | native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); |
62 | } | 62 | } |
63 | 63 | ||
64 | static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, | 64 | static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, |
65 | unsigned long addr, unsigned len) | 65 | unsigned long addr, unsigned len) |
66 | { | 66 | { |
67 | switch (type) { | 67 | switch (type) { |
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c new file mode 100644 index 000000000000..9abac8a9d823 --- /dev/null +++ b/arch/x86/kernel/xsave.c | |||
@@ -0,0 +1,345 @@ | |||
1 | /* | ||
2 | * xsave/xrstor support. | ||
3 | * | ||
4 | * Author: Suresh Siddha <suresh.b.siddha@intel.com> | ||
5 | */ | ||
6 | #include <linux/bootmem.h> | ||
7 | #include <linux/compat.h> | ||
8 | #include <asm/i387.h> | ||
9 | #ifdef CONFIG_IA32_EMULATION | ||
10 | #include <asm/sigcontext32.h> | ||
11 | #endif | ||
12 | #include <asm/xcr.h> | ||
13 | |||
14 | /* | ||
15 | * Feature mask supported by both the CPU and the kernel. | ||
16 | */ | ||
17 | u64 pcntxt_mask; | ||
18 | |||
19 | struct _fpx_sw_bytes fx_sw_reserved; | ||
20 | #ifdef CONFIG_IA32_EMULATION | ||
21 | struct _fpx_sw_bytes fx_sw_reserved_ia32; | ||
22 | #endif | ||
23 | |||
24 | /* | ||
25 | * Check for the presence of extended state information in the | ||
26 | * user fpstate pointer in the sigcontext. | ||
27 | */ | ||
28 | int check_for_xstate(struct i387_fxsave_struct __user *buf, | ||
29 | void __user *fpstate, | ||
30 | struct _fpx_sw_bytes *fx_sw_user) | ||
31 | { | ||
32 | int min_xstate_size = sizeof(struct i387_fxsave_struct) + | ||
33 | sizeof(struct xsave_hdr_struct); | ||
34 | unsigned int magic2; | ||
35 | int err; | ||
36 | |||
37 | err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], | ||
38 | sizeof(struct _fpx_sw_bytes)); | ||
39 | |||
40 | if (err) | ||
41 | return err; | ||
42 | |||
43 | /* | ||
44 | * First Magic check failed. | ||
45 | */ | ||
46 | if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) | ||
47 | return -1; | ||
48 | |||
49 | /* | ||
50 | * Check for error scenarios. | ||
51 | */ | ||
52 | if (fx_sw_user->xstate_size < min_xstate_size || | ||
53 | fx_sw_user->xstate_size > xstate_size || | ||
54 | fx_sw_user->xstate_size > fx_sw_user->extended_size) | ||
55 | return -1; | ||
56 | |||
57 | err = __get_user(magic2, (__u32 *) (((void *)fpstate) + | ||
58 | fx_sw_user->extended_size - | ||
59 | FP_XSTATE_MAGIC2_SIZE)); | ||
60 | /* | ||
61 | * Check for the presence of second magic word at the end of memory | ||
62 | * layout. This detects the case where the user just copied the legacy | ||
63 | * fpstate layout without copying the extended state information | ||
64 | * in the memory layout. | ||
65 | */ | ||
66 | if (err || magic2 != FP_XSTATE_MAGIC2) | ||
67 | return -1; | ||
68 | |||
69 | return 0; | ||
70 | } | ||
71 | |||
72 | #ifdef CONFIG_X86_64 | ||
73 | /* | ||
74 | * Signal frame handlers. | ||
75 | */ | ||
76 | |||
77 | int save_i387_xstate(void __user *buf) | ||
78 | { | ||
79 | struct task_struct *tsk = current; | ||
80 | int err = 0; | ||
81 | |||
82 | if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) | ||
83 | return -EACCES; | ||
84 | |||
85 | BUG_ON(sig_xstate_size < xstate_size); | ||
86 | |||
87 | if ((unsigned long)buf % 64) | ||
88 | printk("save_i387_xstate: bad fpstate %p\n", buf); | ||
89 | |||
90 | if (!used_math()) | ||
91 | return 0; | ||
92 | clear_used_math(); /* trigger finit */ | ||
93 | if (task_thread_info(tsk)->status & TS_USEDFPU) { | ||
94 | /* | ||
95 | * Start with clearing the user buffer. This will present a | ||
96 | * clean context for the bytes not touched by the fxsave/xsave. | ||
97 | */ | ||
98 | err = __clear_user(buf, sig_xstate_size); | ||
99 | if (err) | ||
100 | return err; | ||
101 | |||
102 | if (task_thread_info(tsk)->status & TS_XSAVE) | ||
103 | err = xsave_user(buf); | ||
104 | else | ||
105 | err = fxsave_user(buf); | ||
106 | |||
107 | if (err) | ||
108 | return err; | ||
109 | task_thread_info(tsk)->status &= ~TS_USEDFPU; | ||
110 | stts(); | ||
111 | } else { | ||
112 | if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, | ||
113 | xstate_size)) | ||
114 | return -1; | ||
115 | } | ||
116 | |||
117 | if (task_thread_info(tsk)->status & TS_XSAVE) { | ||
118 | struct _fpstate __user *fx = buf; | ||
119 | struct _xstate __user *x = buf; | ||
120 | u64 xstate_bv; | ||
121 | |||
122 | err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, | ||
123 | sizeof(struct _fpx_sw_bytes)); | ||
124 | |||
125 | err |= __put_user(FP_XSTATE_MAGIC2, | ||
126 | (__u32 __user *) (buf + sig_xstate_size | ||
127 | - FP_XSTATE_MAGIC2_SIZE)); | ||
128 | |||
129 | /* | ||
130 | * Read the xstate_bv which we copied (directly from the cpu or | ||
131 | * from the state in task struct) to the user buffers and | ||
132 | * set the FP/SSE bits. | ||
133 | */ | ||
134 | err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); | ||
135 | |||
136 | /* | ||
137 | * For legacy compatibility, we always set FP/SSE bits in the bit | ||
138 | * vector while saving the state to the user context. This will | ||
139 | * enable us to capture any changes (during sigreturn) to | ||
140 | * the FP/SSE bits by the legacy applications which don't touch | ||
141 | * xstate_bv in the xsave header. | ||
142 | * | ||
143 | * xsave aware apps can change the xstate_bv in the xsave | ||
144 | * header as well as change any contents in the memory layout. | ||
145 | * xrestore as part of sigreturn will capture all the changes. | ||
146 | */ | ||
147 | xstate_bv |= XSTATE_FPSSE; | ||
148 | |||
149 | err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); | ||
150 | |||
151 | if (err) | ||
152 | return err; | ||
153 | } | ||
154 | |||
155 | return 1; | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * Restore the extended state if present. Otherwise, restore the FP/SSE | ||
160 | * state. | ||
161 | */ | ||
162 | int restore_user_xstate(void __user *buf) | ||
163 | { | ||
164 | struct _fpx_sw_bytes fx_sw_user; | ||
165 | u64 mask; | ||
166 | int err; | ||
167 | |||
168 | if (((unsigned long)buf % 64) || | ||
169 | check_for_xstate(buf, buf, &fx_sw_user)) | ||
170 | goto fx_only; | ||
171 | |||
172 | mask = fx_sw_user.xstate_bv; | ||
173 | |||
174 | /* | ||
175 | * restore the state passed by the user. | ||
176 | */ | ||
177 | err = xrestore_user(buf, mask); | ||
178 | if (err) | ||
179 | return err; | ||
180 | |||
181 | /* | ||
182 | * init the state skipped by the user. | ||
183 | */ | ||
184 | mask = pcntxt_mask & ~mask; | ||
185 | |||
186 | xrstor_state(init_xstate_buf, mask); | ||
187 | |||
188 | return 0; | ||
189 | |||
190 | fx_only: | ||
191 | /* | ||
192 | * couldn't find the extended state information in the | ||
193 | * memory layout. Restore just the FP/SSE and init all | ||
194 | * the other extended state. | ||
195 | */ | ||
196 | xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); | ||
197 | return fxrstor_checking((__force struct i387_fxsave_struct *)buf); | ||
198 | } | ||
199 | |||
200 | /* | ||
201 | * This restores directly out of user space. Exceptions are handled. | ||
202 | */ | ||
203 | int restore_i387_xstate(void __user *buf) | ||
204 | { | ||
205 | struct task_struct *tsk = current; | ||
206 | int err = 0; | ||
207 | |||
208 | if (!buf) { | ||
209 | if (used_math()) | ||
210 | goto clear; | ||
211 | return 0; | ||
212 | } else | ||
213 | if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) | ||
214 | return -EACCES; | ||
215 | |||
216 | if (!used_math()) { | ||
217 | err = init_fpu(tsk); | ||
218 | if (err) | ||
219 | return err; | ||
220 | } | ||
221 | |||
222 | if (!(task_thread_info(current)->status & TS_USEDFPU)) { | ||
223 | clts(); | ||
224 | task_thread_info(current)->status |= TS_USEDFPU; | ||
225 | } | ||
226 | if (task_thread_info(tsk)->status & TS_XSAVE) | ||
227 | err = restore_user_xstate(buf); | ||
228 | else | ||
229 | err = fxrstor_checking((__force struct i387_fxsave_struct *) | ||
230 | buf); | ||
231 | if (unlikely(err)) { | ||
232 | /* | ||
233 | * Encountered an error while doing the restore from the | ||
234 | * user buffer, clear the fpu state. | ||
235 | */ | ||
236 | clear: | ||
237 | clear_fpu(tsk); | ||
238 | clear_used_math(); | ||
239 | } | ||
240 | return err; | ||
241 | } | ||
242 | #endif | ||
243 | |||
244 | /* | ||
245 | * Prepare the SW reserved portion of the fxsave memory layout, indicating | ||
246 | * the presence of the extended state information in the memory layout | ||
247 | * pointed to by the fpstate pointer in the sigcontext. | ||
248 | * This will be saved whenever the FP and extended state context is | ||
249 | * saved on the user stack during the signal handler delivery to the user. | ||
250 | */ | ||
251 | void prepare_fx_sw_frame(void) | ||
252 | { | ||
253 | int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + | ||
254 | FP_XSTATE_MAGIC2_SIZE; | ||
255 | |||
256 | sig_xstate_size = sizeof(struct _fpstate) + size_extended; | ||
257 | |||
258 | #ifdef CONFIG_IA32_EMULATION | ||
259 | sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; | ||
260 | #endif | ||
261 | |||
262 | memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); | ||
263 | |||
264 | fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; | ||
265 | fx_sw_reserved.extended_size = sig_xstate_size; | ||
266 | fx_sw_reserved.xstate_bv = pcntxt_mask; | ||
267 | fx_sw_reserved.xstate_size = xstate_size; | ||
268 | #ifdef CONFIG_IA32_EMULATION | ||
269 | memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, | ||
270 | sizeof(struct _fpx_sw_bytes)); | ||
271 | fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; | ||
272 | #endif | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * Represents init state for the supported extended state. | ||
277 | */ | ||
278 | struct xsave_struct *init_xstate_buf; | ||
279 | |||
280 | #ifdef CONFIG_X86_64 | ||
281 | unsigned int sig_xstate_size = sizeof(struct _fpstate); | ||
282 | #endif | ||
283 | |||
284 | /* | ||
285 | * Enable the extended processor state save/restore feature | ||
286 | */ | ||
287 | void __cpuinit xsave_init(void) | ||
288 | { | ||
289 | if (!cpu_has_xsave) | ||
290 | return; | ||
291 | |||
292 | set_in_cr4(X86_CR4_OSXSAVE); | ||
293 | |||
294 | /* | ||
295 | * Enable all the features that the HW is capable of | ||
296 | * and the Linux kernel is aware of. | ||
297 | */ | ||
298 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * setup the xstate image representing the init state | ||
303 | */ | ||
304 | static void __init setup_xstate_init(void) | ||
305 | { | ||
306 | init_xstate_buf = alloc_bootmem(xstate_size); | ||
307 | init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * Enable and initialize the xsave feature. | ||
312 | */ | ||
313 | void __init xsave_cntxt_init(void) | ||
314 | { | ||
315 | unsigned int eax, ebx, ecx, edx; | ||
316 | |||
317 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | ||
318 | pcntxt_mask = eax + ((u64)edx << 32); | ||
319 | |||
320 | if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { | ||
321 | printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", | ||
322 | pcntxt_mask); | ||
323 | BUG(); | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * for now OS knows only about FP/SSE | ||
328 | */ | ||
329 | pcntxt_mask = pcntxt_mask & XCNTXT_MASK; | ||
330 | xsave_init(); | ||
331 | |||
332 | /* | ||
333 | * Recompute the context size for enabled features | ||
334 | */ | ||
335 | cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | ||
336 | xstate_size = ebx; | ||
337 | |||
338 | prepare_fx_sw_frame(); | ||
339 | |||
340 | setup_xstate_init(); | ||
341 | |||
342 | printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " | ||
343 | "cntxt size 0x%x\n", | ||
344 | pcntxt_mask, xstate_size); | ||
345 | } | ||
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d0e940bb6f40..c02343594b4d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -3,10 +3,13 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | 5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ |
6 | coalesced_mmio.o) | 6 | coalesced_mmio.o irq_comm.o) |
7 | ifeq ($(CONFIG_KVM_TRACE),y) | 7 | ifeq ($(CONFIG_KVM_TRACE),y) |
8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) | 8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) |
9 | endif | 9 | endif |
10 | ifeq ($(CONFIG_DMAR),y) | ||
11 | common-objs += $(addprefix ../../../virt/kvm/, vtd.o) | ||
12 | endif | ||
10 | 13 | ||
11 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
12 | 15 | ||
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1bf8f57a3041..11c6725fb798 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) | |||
200 | 200 | ||
201 | if (!atomic_inc_and_test(&pt->pending)) | 201 | if (!atomic_inc_and_test(&pt->pending)) |
202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); | 202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); |
203 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) { | 203 | |
204 | vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 204 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) |
205 | wake_up_interruptible(&vcpu0->wq); | 205 | wake_up_interruptible(&vcpu0->wq); |
206 | } | ||
207 | 206 | ||
208 | hrtimer_add_expires_ns(&pt->timer, pt->period); | 207 | hrtimer_add_expires_ns(&pt->timer, pt->period); |
209 | pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer)); | 208 | pt->scheduled = hrtimer_get_expires_ns(&pt->timer); |
209 | if (pt->period) | ||
210 | ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer); | ||
210 | 211 | ||
211 | return (pt->period == 0 ? 0 : 1); | 212 | return (pt->period == 0 ? 0 : 1); |
212 | } | 213 | } |
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) | |||
215 | { | 216 | { |
216 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 217 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
217 | 218 | ||
218 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) | 219 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) |
219 | return atomic_read(&pit->pit_state.pit_timer.pending); | 220 | return atomic_read(&pit->pit_state.pit_timer.pending); |
220 | |||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
224 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | ||
225 | { | ||
226 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, | ||
227 | irq_ack_notifier); | ||
228 | spin_lock(&ps->inject_lock); | ||
229 | if (atomic_dec_return(&ps->pit_timer.pending) < 0) | ||
230 | atomic_inc(&ps->pit_timer.pending); | ||
231 | ps->irq_ack = 1; | ||
232 | spin_unlock(&ps->inject_lock); | ||
233 | } | ||
234 | |||
224 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | 235 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) |
225 | { | 236 | { |
226 | struct kvm_kpit_state *ps; | 237 | struct kvm_kpit_state *ps; |
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) | |||
255 | hrtimer_cancel(&pt->timer); | 266 | hrtimer_cancel(&pt->timer); |
256 | } | 267 | } |
257 | 268 | ||
258 | static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | 269 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) |
259 | { | 270 | { |
271 | struct kvm_kpit_timer *pt = &ps->pit_timer; | ||
260 | s64 interval; | 272 | s64 interval; |
261 | 273 | ||
262 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); | 274 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); |
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | |||
268 | pt->period = (is_period == 0) ? 0 : interval; | 280 | pt->period = (is_period == 0) ? 0 : interval; |
269 | pt->timer.function = pit_timer_fn; | 281 | pt->timer.function = pit_timer_fn; |
270 | atomic_set(&pt->pending, 0); | 282 | atomic_set(&pt->pending, 0); |
283 | ps->irq_ack = 1; | ||
271 | 284 | ||
272 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), | 285 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), |
273 | HRTIMER_MODE_ABS); | 286 | HRTIMER_MODE_ABS); |
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
302 | case 1: | 315 | case 1: |
303 | /* FIXME: enhance mode 4 precision */ | 316 | /* FIXME: enhance mode 4 precision */ |
304 | case 4: | 317 | case 4: |
305 | create_pit_timer(&ps->pit_timer, val, 0); | 318 | create_pit_timer(ps, val, 0); |
306 | break; | 319 | break; |
307 | case 2: | 320 | case 2: |
308 | case 3: | 321 | case 3: |
309 | create_pit_timer(&ps->pit_timer, val, 1); | 322 | create_pit_timer(ps, val, 1); |
310 | break; | 323 | break; |
311 | default: | 324 | default: |
312 | destroy_pit_timer(&ps->pit_timer); | 325 | destroy_pit_timer(&ps->pit_timer); |
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit) | |||
520 | mutex_unlock(&pit->pit_state.lock); | 533 | mutex_unlock(&pit->pit_state.lock); |
521 | 534 | ||
522 | atomic_set(&pit->pit_state.pit_timer.pending, 0); | 535 | atomic_set(&pit->pit_state.pit_timer.pending, 0); |
523 | pit->pit_state.inject_pending = 1; | 536 | pit->pit_state.irq_ack = 1; |
524 | } | 537 | } |
525 | 538 | ||
526 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) | 539 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) |
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
534 | 547 | ||
535 | mutex_init(&pit->pit_state.lock); | 548 | mutex_init(&pit->pit_state.lock); |
536 | mutex_lock(&pit->pit_state.lock); | 549 | mutex_lock(&pit->pit_state.lock); |
550 | spin_lock_init(&pit->pit_state.inject_lock); | ||
537 | 551 | ||
538 | /* Initialize PIO device */ | 552 | /* Initialize PIO device */ |
539 | pit->dev.read = pit_ioport_read; | 553 | pit->dev.read = pit_ioport_read; |
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
555 | pit_state->pit = pit; | 569 | pit_state->pit = pit; |
556 | hrtimer_init(&pit_state->pit_timer.timer, | 570 | hrtimer_init(&pit_state->pit_timer.timer, |
557 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 571 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
572 | pit_state->irq_ack_notifier.gsi = 0; | ||
573 | pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; | ||
574 | kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); | ||
558 | mutex_unlock(&pit->pit_state.lock); | 575 | mutex_unlock(&pit->pit_state.lock); |
559 | 576 | ||
560 | kvm_pit_reset(pit); | 577 | kvm_pit_reset(pit); |
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm) | |||
578 | static void __inject_pit_timer_intr(struct kvm *kvm) | 595 | static void __inject_pit_timer_intr(struct kvm *kvm) |
579 | { | 596 | { |
580 | mutex_lock(&kvm->lock); | 597 | mutex_lock(&kvm->lock); |
581 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); | 598 | kvm_set_irq(kvm, 0, 1); |
582 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); | 599 | kvm_set_irq(kvm, 0, 0); |
583 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); | ||
584 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); | ||
585 | mutex_unlock(&kvm->lock); | 600 | mutex_unlock(&kvm->lock); |
586 | } | 601 | } |
587 | 602 | ||
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | |||
592 | struct kvm_kpit_state *ps; | 607 | struct kvm_kpit_state *ps; |
593 | 608 | ||
594 | if (vcpu && pit) { | 609 | if (vcpu && pit) { |
610 | int inject = 0; | ||
595 | ps = &pit->pit_state; | 611 | ps = &pit->pit_state; |
596 | 612 | ||
597 | /* Try to inject pending interrupts when: | 613 | /* Try to inject pending interrupts when |
598 | * 1. Pending exists | 614 | * last one has been acked. |
599 | * 2. Last interrupt was accepted or waited for too long time*/ | 615 | */ |
600 | if (atomic_read(&ps->pit_timer.pending) && | 616 | spin_lock(&ps->inject_lock); |
601 | (ps->inject_pending || | 617 | if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { |
602 | (jiffies - ps->last_injected_time | 618 | ps->irq_ack = 0; |
603 | >= KVM_MAX_PIT_INTR_INTERVAL))) { | 619 | inject = 1; |
604 | ps->inject_pending = 0; | ||
605 | __inject_pit_timer_intr(kvm); | ||
606 | ps->last_injected_time = jiffies; | ||
607 | } | ||
608 | } | ||
609 | } | ||
610 | |||
611 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
612 | { | ||
613 | struct kvm_arch *arch = &vcpu->kvm->arch; | ||
614 | struct kvm_kpit_state *ps; | ||
615 | |||
616 | if (vcpu && arch->vpit) { | ||
617 | ps = &arch->vpit->pit_state; | ||
618 | if (atomic_read(&ps->pit_timer.pending) && | ||
619 | (((arch->vpic->pics[0].imr & 1) == 0 && | ||
620 | arch->vpic->pics[0].irq_base == vec) || | ||
621 | (arch->vioapic->redirtbl[0].fields.vector == vec && | ||
622 | arch->vioapic->redirtbl[0].fields.mask != 1))) { | ||
623 | ps->inject_pending = 1; | ||
624 | atomic_dec(&ps->pit_timer.pending); | ||
625 | ps->channels[0].count_load_time = ktime_get(); | ||
626 | } | 620 | } |
621 | spin_unlock(&ps->inject_lock); | ||
622 | if (inject) | ||
623 | __inject_pit_timer_intr(kvm); | ||
627 | } | 624 | } |
628 | } | 625 | } |
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a6c8c4..e436d4983aa1 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -8,7 +8,6 @@ struct kvm_kpit_timer { | |||
8 | int irq; | 8 | int irq; |
9 | s64 period; /* unit: ns */ | 9 | s64 period; /* unit: ns */ |
10 | s64 scheduled; | 10 | s64 scheduled; |
11 | ktime_t last_update; | ||
12 | atomic_t pending; | 11 | atomic_t pending; |
13 | }; | 12 | }; |
14 | 13 | ||
@@ -34,8 +33,9 @@ struct kvm_kpit_state { | |||
34 | u32 speaker_data_on; | 33 | u32 speaker_data_on; |
35 | struct mutex lock; | 34 | struct mutex lock; |
36 | struct kvm_pit *pit; | 35 | struct kvm_pit *pit; |
37 | bool inject_pending; /* if inject pending interrupts */ | 36 | spinlock_t inject_lock; |
38 | unsigned long last_injected_time; | 37 | unsigned long irq_ack; |
38 | struct kvm_irq_ack_notifier irq_ack_notifier; | ||
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct kvm_pit { | 41 | struct kvm_pit { |
@@ -54,7 +54,6 @@ struct kvm_pit { | |||
54 | #define KVM_PIT_CHANNEL_MASK 0x3 | 54 | #define KVM_PIT_CHANNEL_MASK 0x3 |
55 | 55 | ||
56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | 56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); |
57 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
58 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); | 57 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); |
59 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); | 58 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); |
60 | void kvm_free_pit(struct kvm *kvm); | 59 | void kvm_free_pit(struct kvm *kvm); |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index c31164e8aa46..17e41e165f1a 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -30,6 +30,19 @@ | |||
30 | 30 | ||
31 | #include <linux/kvm_host.h> | 31 | #include <linux/kvm_host.h> |
32 | 32 | ||
33 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | ||
34 | { | ||
35 | s->isr &= ~(1 << irq); | ||
36 | s->isr_ack |= (1 << irq); | ||
37 | } | ||
38 | |||
39 | void kvm_pic_clear_isr_ack(struct kvm *kvm) | ||
40 | { | ||
41 | struct kvm_pic *s = pic_irqchip(kvm); | ||
42 | s->pics[0].isr_ack = 0xff; | ||
43 | s->pics[1].isr_ack = 0xff; | ||
44 | } | ||
45 | |||
33 | /* | 46 | /* |
34 | * set irq level. If an edge is detected, then the IRR is set to 1 | 47 | * set irq level. If an edge is detected, then the IRR is set to 1 |
35 | */ | 48 | */ |
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) | |||
141 | */ | 154 | */ |
142 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) | 155 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) |
143 | { | 156 | { |
157 | s->isr |= 1 << irq; | ||
144 | if (s->auto_eoi) { | 158 | if (s->auto_eoi) { |
145 | if (s->rotate_on_auto_eoi) | 159 | if (s->rotate_on_auto_eoi) |
146 | s->priority_add = (irq + 1) & 7; | 160 | s->priority_add = (irq + 1) & 7; |
147 | } else | 161 | pic_clear_isr(s, irq); |
148 | s->isr |= (1 << irq); | 162 | } |
149 | /* | 163 | /* |
150 | * We don't clear a level sensitive interrupt here | 164 | * We don't clear a level sensitive interrupt here |
151 | */ | 165 | */ |
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) | |||
153 | s->irr &= ~(1 << irq); | 167 | s->irr &= ~(1 << irq); |
154 | } | 168 | } |
155 | 169 | ||
156 | int kvm_pic_read_irq(struct kvm_pic *s) | 170 | int kvm_pic_read_irq(struct kvm *kvm) |
157 | { | 171 | { |
158 | int irq, irq2, intno; | 172 | int irq, irq2, intno; |
173 | struct kvm_pic *s = pic_irqchip(kvm); | ||
159 | 174 | ||
160 | irq = pic_get_irq(&s->pics[0]); | 175 | irq = pic_get_irq(&s->pics[0]); |
161 | if (irq >= 0) { | 176 | if (irq >= 0) { |
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s) | |||
181 | intno = s->pics[0].irq_base + irq; | 196 | intno = s->pics[0].irq_base + irq; |
182 | } | 197 | } |
183 | pic_update_irq(s); | 198 | pic_update_irq(s); |
199 | kvm_notify_acked_irq(kvm, irq); | ||
184 | 200 | ||
185 | return intno; | 201 | return intno; |
186 | } | 202 | } |
187 | 203 | ||
188 | void kvm_pic_reset(struct kvm_kpic_state *s) | 204 | void kvm_pic_reset(struct kvm_kpic_state *s) |
189 | { | 205 | { |
206 | int irq, irqbase; | ||
207 | struct kvm *kvm = s->pics_state->irq_request_opaque; | ||
208 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; | ||
209 | |||
210 | if (s == &s->pics_state->pics[0]) | ||
211 | irqbase = 0; | ||
212 | else | ||
213 | irqbase = 8; | ||
214 | |||
215 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { | ||
216 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | ||
217 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) | ||
218 | kvm_notify_acked_irq(kvm, irq+irqbase); | ||
219 | } | ||
190 | s->last_irr = 0; | 220 | s->last_irr = 0; |
191 | s->irr = 0; | 221 | s->irr = 0; |
192 | s->imr = 0; | 222 | s->imr = 0; |
193 | s->isr = 0; | 223 | s->isr = 0; |
224 | s->isr_ack = 0xff; | ||
194 | s->priority_add = 0; | 225 | s->priority_add = 0; |
195 | s->irq_base = 0; | 226 | s->irq_base = 0; |
196 | s->read_reg_select = 0; | 227 | s->read_reg_select = 0; |
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
243 | priority = get_priority(s, s->isr); | 274 | priority = get_priority(s, s->isr); |
244 | if (priority != 8) { | 275 | if (priority != 8) { |
245 | irq = (priority + s->priority_add) & 7; | 276 | irq = (priority + s->priority_add) & 7; |
246 | s->isr &= ~(1 << irq); | 277 | pic_clear_isr(s, irq); |
247 | if (cmd == 5) | 278 | if (cmd == 5) |
248 | s->priority_add = (irq + 1) & 7; | 279 | s->priority_add = (irq + 1) & 7; |
249 | pic_update_irq(s->pics_state); | 280 | pic_update_irq(s->pics_state); |
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
251 | break; | 282 | break; |
252 | case 3: | 283 | case 3: |
253 | irq = val & 7; | 284 | irq = val & 7; |
254 | s->isr &= ~(1 << irq); | 285 | pic_clear_isr(s, irq); |
255 | pic_update_irq(s->pics_state); | 286 | pic_update_irq(s->pics_state); |
256 | break; | 287 | break; |
257 | case 6: | 288 | case 6: |
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
260 | break; | 291 | break; |
261 | case 7: | 292 | case 7: |
262 | irq = val & 7; | 293 | irq = val & 7; |
263 | s->isr &= ~(1 << irq); | ||
264 | s->priority_add = (irq + 1) & 7; | 294 | s->priority_add = (irq + 1) & 7; |
295 | pic_clear_isr(s, irq); | ||
265 | pic_update_irq(s->pics_state); | 296 | pic_update_irq(s->pics_state); |
266 | break; | 297 | break; |
267 | default: | 298 | default: |
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) | |||
303 | s->pics_state->pics[0].irr &= ~(1 << 2); | 334 | s->pics_state->pics[0].irr &= ~(1 << 2); |
304 | } | 335 | } |
305 | s->irr &= ~(1 << ret); | 336 | s->irr &= ~(1 << ret); |
306 | s->isr &= ~(1 << ret); | 337 | pic_clear_isr(s, ret); |
307 | if (addr1 >> 7 || ret != 2) | 338 | if (addr1 >> 7 || ret != 2) |
308 | pic_update_irq(s->pics_state); | 339 | pic_update_irq(s->pics_state); |
309 | } else { | 340 | } else { |
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) | |||
422 | { | 453 | { |
423 | struct kvm *kvm = opaque; | 454 | struct kvm *kvm = opaque; |
424 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | 455 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; |
456 | struct kvm_pic *s = pic_irqchip(kvm); | ||
457 | int irq = pic_get_irq(&s->pics[0]); | ||
425 | 458 | ||
426 | pic_irqchip(kvm)->output = level; | 459 | s->output = level; |
427 | if (vcpu) | 460 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { |
461 | s->pics[0].isr_ack &= ~(1 << irq); | ||
428 | kvm_vcpu_kick(vcpu); | 462 | kvm_vcpu_kick(vcpu); |
463 | } | ||
429 | } | 464 | } |
430 | 465 | ||
431 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | 466 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) |
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 76d736b5f664..c019b8edcdb7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | |||
72 | if (kvm_apic_accept_pic_intr(v)) { | 72 | if (kvm_apic_accept_pic_intr(v)) { |
73 | s = pic_irqchip(v->kvm); | 73 | s = pic_irqchip(v->kvm); |
74 | s->output = 0; /* PIC */ | 74 | s->output = 0; /* PIC */ |
75 | vector = kvm_pic_read_irq(s); | 75 | vector = kvm_pic_read_irq(v->kvm); |
76 | } | 76 | } |
77 | } | 77 | } |
78 | return vector; | 78 | return vector; |
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | |||
90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | 90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) |
91 | { | 91 | { |
92 | kvm_apic_timer_intr_post(vcpu, vec); | 92 | kvm_apic_timer_intr_post(vcpu, vec); |
93 | kvm_pit_timer_intr_post(vcpu, vec); | ||
94 | /* TODO: PIT, RTC etc. */ | 93 | /* TODO: PIT, RTC etc. */ |
95 | } | 94 | } |
96 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); | 95 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7ca47cbb48bb..f17c8f5bbf31 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -42,6 +42,7 @@ struct kvm_kpic_state { | |||
42 | u8 irr; /* interrupt request register */ | 42 | u8 irr; /* interrupt request register */ |
43 | u8 imr; /* interrupt mask register */ | 43 | u8 imr; /* interrupt mask register */ |
44 | u8 isr; /* interrupt service register */ | 44 | u8 isr; /* interrupt service register */ |
45 | u8 isr_ack; /* interrupt ack detection */ | ||
45 | u8 priority_add; /* highest irq priority */ | 46 | u8 priority_add; /* highest irq priority */ |
46 | u8 irq_base; | 47 | u8 irq_base; |
47 | u8 read_reg_select; | 48 | u8 read_reg_select; |
@@ -63,12 +64,13 @@ struct kvm_pic { | |||
63 | void *irq_request_opaque; | 64 | void *irq_request_opaque; |
64 | int output; /* intr from master PIC */ | 65 | int output; /* intr from master PIC */ |
65 | struct kvm_io_device dev; | 66 | struct kvm_io_device dev; |
67 | void (*ack_notifier)(void *opaque, int irq); | ||
66 | }; | 68 | }; |
67 | 69 | ||
68 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | 70 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); |
69 | void kvm_pic_set_irq(void *opaque, int irq, int level); | 71 | int kvm_pic_read_irq(struct kvm *kvm); |
70 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
71 | void kvm_pic_update_irq(struct kvm_pic *s); | 72 | void kvm_pic_update_irq(struct kvm_pic *s); |
73 | void kvm_pic_clear_isr_ack(struct kvm *kvm); | ||
72 | 74 | ||
73 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | 75 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) |
74 | { | 76 | { |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 000000000000..1ff819dce7d3 --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -0,0 +1,32 @@ | |||
1 | #ifndef ASM_KVM_CACHE_REGS_H | ||
2 | #define ASM_KVM_CACHE_REGS_H | ||
3 | |||
4 | static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, | ||
5 | enum kvm_reg reg) | ||
6 | { | ||
7 | if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) | ||
8 | kvm_x86_ops->cache_reg(vcpu, reg); | ||
9 | |||
10 | return vcpu->arch.regs[reg]; | ||
11 | } | ||
12 | |||
13 | static inline void kvm_register_write(struct kvm_vcpu *vcpu, | ||
14 | enum kvm_reg reg, | ||
15 | unsigned long val) | ||
16 | { | ||
17 | vcpu->arch.regs[reg] = val; | ||
18 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); | ||
19 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); | ||
20 | } | ||
21 | |||
22 | static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) | ||
23 | { | ||
24 | return kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
25 | } | ||
26 | |||
27 | static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) | ||
28 | { | ||
29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); | ||
30 | } | ||
31 | |||
32 | #endif | ||
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a5b61de6adf1..0fc3cab48943 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <asm/current.h> | 32 | #include <asm/current.h> |
33 | #include <asm/apicdef.h> | 33 | #include <asm/apicdef.h> |
34 | #include <asm/atomic.h> | 34 | #include <asm/atomic.h> |
35 | #include "kvm_cache_regs.h" | ||
35 | #include "irq.h" | 36 | #include "irq.h" |
36 | 37 | ||
37 | #define PRId64 "d" | 38 | #define PRId64 "d" |
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
338 | } else | 339 | } else |
339 | apic_clear_vector(vector, apic->regs + APIC_TMR); | 340 | apic_clear_vector(vector, apic->regs + APIC_TMR); |
340 | 341 | ||
341 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 342 | kvm_vcpu_kick(vcpu); |
342 | kvm_vcpu_kick(vcpu); | ||
343 | else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { | ||
344 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
345 | if (waitqueue_active(&vcpu->wq)) | ||
346 | wake_up_interruptible(&vcpu->wq); | ||
347 | } | ||
348 | 343 | ||
349 | result = (orig_irr == 0); | 344 | result = (orig_irr == 0); |
350 | break; | 345 | break; |
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
370 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; | 365 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
371 | kvm_vcpu_kick(vcpu); | 366 | kvm_vcpu_kick(vcpu); |
372 | } else { | 367 | } else { |
373 | printk(KERN_DEBUG | 368 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", |
374 | "Ignoring de-assert INIT to vcpu %d\n", | 369 | vcpu->vcpu_id); |
375 | vcpu->vcpu_id); | ||
376 | } | 370 | } |
377 | |||
378 | break; | 371 | break; |
379 | 372 | ||
380 | case APIC_DM_STARTUP: | 373 | case APIC_DM_STARTUP: |
381 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", | 374 | apic_debug("SIPI to vcpu %d vector 0x%02x\n", |
382 | vcpu->vcpu_id, vector); | 375 | vcpu->vcpu_id, vector); |
383 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { | 376 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
384 | vcpu->arch.sipi_vector = vector; | 377 | vcpu->arch.sipi_vector = vector; |
385 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; | 378 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
386 | if (waitqueue_active(&vcpu->wq)) | 379 | kvm_vcpu_kick(vcpu); |
387 | wake_up_interruptible(&vcpu->wq); | ||
388 | } | 380 | } |
389 | break; | 381 | break; |
390 | 382 | ||
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | |||
438 | static void apic_set_eoi(struct kvm_lapic *apic) | 430 | static void apic_set_eoi(struct kvm_lapic *apic) |
439 | { | 431 | { |
440 | int vector = apic_find_highest_isr(apic); | 432 | int vector = apic_find_highest_isr(apic); |
441 | 433 | int trigger_mode; | |
442 | /* | 434 | /* |
443 | * Not every write EOI will has corresponding ISR, | 435 | * Not every write EOI will has corresponding ISR, |
444 | * one example is when Kernel check timer on setup_IO_APIC | 436 | * one example is when Kernel check timer on setup_IO_APIC |
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
450 | apic_update_ppr(apic); | 442 | apic_update_ppr(apic); |
451 | 443 | ||
452 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) | 444 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) |
453 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); | 445 | trigger_mode = IOAPIC_LEVEL_TRIG; |
446 | else | ||
447 | trigger_mode = IOAPIC_EDGE_TRIG; | ||
448 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
454 | } | 449 | } |
455 | 450 | ||
456 | static void apic_send_ipi(struct kvm_lapic *apic) | 451 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) | |||
558 | struct kvm_run *run = vcpu->run; | 553 | struct kvm_run *run = vcpu->run; |
559 | 554 | ||
560 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); | 555 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); |
561 | kvm_x86_ops->cache_regs(vcpu); | 556 | run->tpr_access.rip = kvm_rip_read(vcpu); |
562 | run->tpr_access.rip = vcpu->arch.rip; | ||
563 | run->tpr_access.is_write = write; | 557 | run->tpr_access.is_write = write; |
564 | } | 558 | } |
565 | 559 | ||
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
683 | * Refer SDM 8.4.1 | 677 | * Refer SDM 8.4.1 |
684 | */ | 678 | */ |
685 | if (len != 4 || alignment) { | 679 | if (len != 4 || alignment) { |
686 | if (printk_ratelimit()) | 680 | /* Don't shout loud, $infamous_os would cause only noise. */ |
687 | printk(KERN_ERR "apic write: bad size=%d %lx\n", | 681 | apic_debug("apic write: bad size=%d %lx\n", |
688 | len, (long)address); | 682 | len, (long)address); |
689 | return; | 683 | return; |
690 | } | 684 | } |
691 | 685 | ||
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) | |||
947 | 941 | ||
948 | if(!atomic_inc_and_test(&apic->timer.pending)) | 942 | if(!atomic_inc_and_test(&apic->timer.pending)) |
949 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); | 943 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); |
950 | if (waitqueue_active(q)) { | 944 | if (waitqueue_active(q)) |
951 | apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
952 | wake_up_interruptible(q); | 945 | wake_up_interruptible(q); |
953 | } | 946 | |
954 | if (apic_lvtt_period(apic)) { | 947 | if (apic_lvtt_period(apic)) { |
955 | result = 1; | 948 | result = 1; |
956 | hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); | 949 | hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0bfe2bd305eb..99c239c5c0ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -70,6 +70,9 @@ static int dbg = 0; | |||
70 | module_param(dbg, bool, 0644); | 70 | module_param(dbg, bool, 0644); |
71 | #endif | 71 | #endif |
72 | 72 | ||
73 | static int oos_shadow = 1; | ||
74 | module_param(oos_shadow, bool, 0644); | ||
75 | |||
73 | #ifndef MMU_DEBUG | 76 | #ifndef MMU_DEBUG |
74 | #define ASSERT(x) do { } while (0) | 77 | #define ASSERT(x) do { } while (0) |
75 | #else | 78 | #else |
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644); | |||
135 | #define ACC_USER_MASK PT_USER_MASK | 138 | #define ACC_USER_MASK PT_USER_MASK |
136 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | 139 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
137 | 140 | ||
138 | struct kvm_pv_mmu_op_buffer { | 141 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
139 | void *ptr; | ||
140 | unsigned len; | ||
141 | unsigned processed; | ||
142 | char buf[512] __aligned(sizeof(long)); | ||
143 | }; | ||
144 | 142 | ||
145 | struct kvm_rmap_desc { | 143 | struct kvm_rmap_desc { |
146 | u64 *shadow_ptes[RMAP_EXT]; | 144 | u64 *shadow_ptes[RMAP_EXT]; |
147 | struct kvm_rmap_desc *more; | 145 | struct kvm_rmap_desc *more; |
148 | }; | 146 | }; |
149 | 147 | ||
148 | struct kvm_shadow_walk { | ||
149 | int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, | ||
150 | u64 addr, u64 *spte, int level); | ||
151 | }; | ||
152 | |||
153 | struct kvm_unsync_walk { | ||
154 | int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); | ||
155 | }; | ||
156 | |||
157 | typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); | ||
158 | |||
150 | static struct kmem_cache *pte_chain_cache; | 159 | static struct kmem_cache *pte_chain_cache; |
151 | static struct kmem_cache *rmap_desc_cache; | 160 | static struct kmem_cache *rmap_desc_cache; |
152 | static struct kmem_cache *mmu_page_header_cache; | 161 | static struct kmem_cache *mmu_page_header_cache; |
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) | |||
405 | { | 414 | { |
406 | struct vm_area_struct *vma; | 415 | struct vm_area_struct *vma; |
407 | unsigned long addr; | 416 | unsigned long addr; |
417 | int ret = 0; | ||
408 | 418 | ||
409 | addr = gfn_to_hva(kvm, gfn); | 419 | addr = gfn_to_hva(kvm, gfn); |
410 | if (kvm_is_error_hva(addr)) | 420 | if (kvm_is_error_hva(addr)) |
411 | return 0; | 421 | return ret; |
412 | 422 | ||
423 | down_read(&current->mm->mmap_sem); | ||
413 | vma = find_vma(current->mm, addr); | 424 | vma = find_vma(current->mm, addr); |
414 | if (vma && is_vm_hugetlb_page(vma)) | 425 | if (vma && is_vm_hugetlb_page(vma)) |
415 | return 1; | 426 | ret = 1; |
427 | up_read(&current->mm->mmap_sem); | ||
416 | 428 | ||
417 | return 0; | 429 | return ret; |
418 | } | 430 | } |
419 | 431 | ||
420 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 432 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) |
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
649 | 661 | ||
650 | if (write_protected) | 662 | if (write_protected) |
651 | kvm_flush_remote_tlbs(kvm); | 663 | kvm_flush_remote_tlbs(kvm); |
652 | |||
653 | account_shadowed(kvm, gfn); | ||
654 | } | 664 | } |
655 | 665 | ||
656 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | 666 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) |
@@ -711,6 +721,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) | |||
711 | u64 *spte; | 721 | u64 *spte; |
712 | int young = 0; | 722 | int young = 0; |
713 | 723 | ||
724 | /* always return old for EPT */ | ||
725 | if (!shadow_accessed_mask) | ||
726 | return 0; | ||
727 | |||
714 | spte = rmap_next(kvm, rmapp, NULL); | 728 | spte = rmap_next(kvm, rmapp, NULL); |
715 | while (spte) { | 729 | while (spte) { |
716 | int _young; | 730 | int _young; |
@@ -855,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | |||
855 | BUG(); | 869 | BUG(); |
856 | } | 870 | } |
857 | 871 | ||
872 | |||
873 | static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
874 | mmu_parent_walk_fn fn) | ||
875 | { | ||
876 | struct kvm_pte_chain *pte_chain; | ||
877 | struct hlist_node *node; | ||
878 | struct kvm_mmu_page *parent_sp; | ||
879 | int i; | ||
880 | |||
881 | if (!sp->multimapped && sp->parent_pte) { | ||
882 | parent_sp = page_header(__pa(sp->parent_pte)); | ||
883 | fn(vcpu, parent_sp); | ||
884 | mmu_parent_walk(vcpu, parent_sp, fn); | ||
885 | return; | ||
886 | } | ||
887 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
888 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
889 | if (!pte_chain->parent_ptes[i]) | ||
890 | break; | ||
891 | parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); | ||
892 | fn(vcpu, parent_sp); | ||
893 | mmu_parent_walk(vcpu, parent_sp, fn); | ||
894 | } | ||
895 | } | ||
896 | |||
897 | static void kvm_mmu_update_unsync_bitmap(u64 *spte) | ||
898 | { | ||
899 | unsigned int index; | ||
900 | struct kvm_mmu_page *sp = page_header(__pa(spte)); | ||
901 | |||
902 | index = spte - sp->spt; | ||
903 | __set_bit(index, sp->unsync_child_bitmap); | ||
904 | sp->unsync_children = 1; | ||
905 | } | ||
906 | |||
907 | static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) | ||
908 | { | ||
909 | struct kvm_pte_chain *pte_chain; | ||
910 | struct hlist_node *node; | ||
911 | int i; | ||
912 | |||
913 | if (!sp->parent_pte) | ||
914 | return; | ||
915 | |||
916 | if (!sp->multimapped) { | ||
917 | kvm_mmu_update_unsync_bitmap(sp->parent_pte); | ||
918 | return; | ||
919 | } | ||
920 | |||
921 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
922 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
923 | if (!pte_chain->parent_ptes[i]) | ||
924 | break; | ||
925 | kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); | ||
926 | } | ||
927 | } | ||
928 | |||
929 | static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
930 | { | ||
931 | sp->unsync_children = 1; | ||
932 | kvm_mmu_update_parents_unsync(sp); | ||
933 | return 1; | ||
934 | } | ||
935 | |||
936 | static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, | ||
937 | struct kvm_mmu_page *sp) | ||
938 | { | ||
939 | mmu_parent_walk(vcpu, sp, unsync_walk_fn); | ||
940 | kvm_mmu_update_parents_unsync(sp); | ||
941 | } | ||
942 | |||
858 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | 943 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, |
859 | struct kvm_mmu_page *sp) | 944 | struct kvm_mmu_page *sp) |
860 | { | 945 | { |
@@ -864,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
864 | sp->spt[i] = shadow_trap_nonpresent_pte; | 949 | sp->spt[i] = shadow_trap_nonpresent_pte; |
865 | } | 950 | } |
866 | 951 | ||
952 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | ||
953 | struct kvm_mmu_page *sp) | ||
954 | { | ||
955 | return 1; | ||
956 | } | ||
957 | |||
958 | static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | ||
959 | { | ||
960 | } | ||
961 | |||
962 | #define for_each_unsync_children(bitmap, idx) \ | ||
963 | for (idx = find_first_bit(bitmap, 512); \ | ||
964 | idx < 512; \ | ||
965 | idx = find_next_bit(bitmap, 512, idx+1)) | ||
966 | |||
967 | static int mmu_unsync_walk(struct kvm_mmu_page *sp, | ||
968 | struct kvm_unsync_walk *walker) | ||
969 | { | ||
970 | int i, ret; | ||
971 | |||
972 | if (!sp->unsync_children) | ||
973 | return 0; | ||
974 | |||
975 | for_each_unsync_children(sp->unsync_child_bitmap, i) { | ||
976 | u64 ent = sp->spt[i]; | ||
977 | |||
978 | if (is_shadow_present_pte(ent)) { | ||
979 | struct kvm_mmu_page *child; | ||
980 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
981 | |||
982 | if (child->unsync_children) { | ||
983 | ret = mmu_unsync_walk(child, walker); | ||
984 | if (ret) | ||
985 | return ret; | ||
986 | __clear_bit(i, sp->unsync_child_bitmap); | ||
987 | } | ||
988 | |||
989 | if (child->unsync) { | ||
990 | ret = walker->entry(child, walker); | ||
991 | __clear_bit(i, sp->unsync_child_bitmap); | ||
992 | if (ret) | ||
993 | return ret; | ||
994 | } | ||
995 | } | ||
996 | } | ||
997 | |||
998 | if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) | ||
999 | sp->unsync_children = 0; | ||
1000 | |||
1001 | return 0; | ||
1002 | } | ||
1003 | |||
867 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | 1004 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) |
868 | { | 1005 | { |
869 | unsigned index; | 1006 | unsigned index; |
@@ -884,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | |||
884 | return NULL; | 1021 | return NULL; |
885 | } | 1022 | } |
886 | 1023 | ||
1024 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1025 | { | ||
1026 | WARN_ON(!sp->unsync); | ||
1027 | sp->unsync = 0; | ||
1028 | --kvm->stat.mmu_unsync; | ||
1029 | } | ||
1030 | |||
1031 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); | ||
1032 | |||
1033 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
1034 | { | ||
1035 | if (sp->role.glevels != vcpu->arch.mmu.root_level) { | ||
1036 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1037 | return 1; | ||
1038 | } | ||
1039 | |||
1040 | rmap_write_protect(vcpu->kvm, sp->gfn); | ||
1041 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { | ||
1042 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1043 | return 1; | ||
1044 | } | ||
1045 | |||
1046 | kvm_mmu_flush_tlb(vcpu); | ||
1047 | kvm_unlink_unsync_page(vcpu->kvm, sp); | ||
1048 | return 0; | ||
1049 | } | ||
1050 | |||
1051 | struct sync_walker { | ||
1052 | struct kvm_vcpu *vcpu; | ||
1053 | struct kvm_unsync_walk walker; | ||
1054 | }; | ||
1055 | |||
1056 | static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1057 | { | ||
1058 | struct sync_walker *sync_walk = container_of(walk, struct sync_walker, | ||
1059 | walker); | ||
1060 | struct kvm_vcpu *vcpu = sync_walk->vcpu; | ||
1061 | |||
1062 | kvm_sync_page(vcpu, sp); | ||
1063 | return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); | ||
1064 | } | ||
1065 | |||
1066 | static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
1067 | { | ||
1068 | struct sync_walker walker = { | ||
1069 | .walker = { .entry = mmu_sync_fn, }, | ||
1070 | .vcpu = vcpu, | ||
1071 | }; | ||
1072 | |||
1073 | while (mmu_unsync_walk(sp, &walker.walker)) | ||
1074 | cond_resched_lock(&vcpu->kvm->mmu_lock); | ||
1075 | } | ||
1076 | |||
887 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1077 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
888 | gfn_t gfn, | 1078 | gfn_t gfn, |
889 | gva_t gaddr, | 1079 | gva_t gaddr, |
@@ -897,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
897 | unsigned quadrant; | 1087 | unsigned quadrant; |
898 | struct hlist_head *bucket; | 1088 | struct hlist_head *bucket; |
899 | struct kvm_mmu_page *sp; | 1089 | struct kvm_mmu_page *sp; |
900 | struct hlist_node *node; | 1090 | struct hlist_node *node, *tmp; |
901 | 1091 | ||
902 | role.word = 0; | 1092 | role.word = 0; |
903 | role.glevels = vcpu->arch.mmu.root_level; | 1093 | role.glevels = vcpu->arch.mmu.root_level; |
@@ -913,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
913 | gfn, role.word); | 1103 | gfn, role.word); |
914 | index = kvm_page_table_hashfn(gfn); | 1104 | index = kvm_page_table_hashfn(gfn); |
915 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1105 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
916 | hlist_for_each_entry(sp, node, bucket, hash_link) | 1106 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) |
917 | if (sp->gfn == gfn && sp->role.word == role.word) { | 1107 | if (sp->gfn == gfn) { |
1108 | if (sp->unsync) | ||
1109 | if (kvm_sync_page(vcpu, sp)) | ||
1110 | continue; | ||
1111 | |||
1112 | if (sp->role.word != role.word) | ||
1113 | continue; | ||
1114 | |||
918 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1115 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
1116 | if (sp->unsync_children) { | ||
1117 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); | ||
1118 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1119 | } | ||
919 | pgprintk("%s: found\n", __func__); | 1120 | pgprintk("%s: found\n", __func__); |
920 | return sp; | 1121 | return sp; |
921 | } | 1122 | } |
@@ -927,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
927 | sp->gfn = gfn; | 1128 | sp->gfn = gfn; |
928 | sp->role = role; | 1129 | sp->role = role; |
929 | hlist_add_head(&sp->hash_link, bucket); | 1130 | hlist_add_head(&sp->hash_link, bucket); |
930 | if (!metaphysical) | 1131 | if (!metaphysical) { |
931 | rmap_write_protect(vcpu->kvm, gfn); | 1132 | rmap_write_protect(vcpu->kvm, gfn); |
1133 | account_shadowed(vcpu->kvm, gfn); | ||
1134 | } | ||
932 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1135 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) |
933 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | 1136 | vcpu->arch.mmu.prefetch_page(vcpu, sp); |
934 | else | 1137 | else |
@@ -936,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
936 | return sp; | 1139 | return sp; |
937 | } | 1140 | } |
938 | 1141 | ||
1142 | static int walk_shadow(struct kvm_shadow_walk *walker, | ||
1143 | struct kvm_vcpu *vcpu, u64 addr) | ||
1144 | { | ||
1145 | hpa_t shadow_addr; | ||
1146 | int level; | ||
1147 | int r; | ||
1148 | u64 *sptep; | ||
1149 | unsigned index; | ||
1150 | |||
1151 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
1152 | level = vcpu->arch.mmu.shadow_root_level; | ||
1153 | if (level == PT32E_ROOT_LEVEL) { | ||
1154 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
1155 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
1156 | --level; | ||
1157 | } | ||
1158 | |||
1159 | while (level >= PT_PAGE_TABLE_LEVEL) { | ||
1160 | index = SHADOW_PT_INDEX(addr, level); | ||
1161 | sptep = ((u64 *)__va(shadow_addr)) + index; | ||
1162 | r = walker->entry(walker, vcpu, addr, sptep, level); | ||
1163 | if (r) | ||
1164 | return r; | ||
1165 | shadow_addr = *sptep & PT64_BASE_ADDR_MASK; | ||
1166 | --level; | ||
1167 | } | ||
1168 | return 0; | ||
1169 | } | ||
1170 | |||
939 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1171 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
940 | struct kvm_mmu_page *sp) | 1172 | struct kvm_mmu_page *sp) |
941 | { | 1173 | { |
@@ -951,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
951 | rmap_remove(kvm, &pt[i]); | 1183 | rmap_remove(kvm, &pt[i]); |
952 | pt[i] = shadow_trap_nonpresent_pte; | 1184 | pt[i] = shadow_trap_nonpresent_pte; |
953 | } | 1185 | } |
954 | kvm_flush_remote_tlbs(kvm); | ||
955 | return; | 1186 | return; |
956 | } | 1187 | } |
957 | 1188 | ||
@@ -970,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
970 | } | 1201 | } |
971 | pt[i] = shadow_trap_nonpresent_pte; | 1202 | pt[i] = shadow_trap_nonpresent_pte; |
972 | } | 1203 | } |
973 | kvm_flush_remote_tlbs(kvm); | ||
974 | } | 1204 | } |
975 | 1205 | ||
976 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | 1206 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) |
@@ -987,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | |||
987 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | 1217 | kvm->vcpus[i]->arch.last_pte_updated = NULL; |
988 | } | 1218 | } |
989 | 1219 | ||
990 | static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1220 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
991 | { | 1221 | { |
992 | u64 *parent_pte; | 1222 | u64 *parent_pte; |
993 | 1223 | ||
994 | ++kvm->stat.mmu_shadow_zapped; | ||
995 | while (sp->multimapped || sp->parent_pte) { | 1224 | while (sp->multimapped || sp->parent_pte) { |
996 | if (!sp->multimapped) | 1225 | if (!sp->multimapped) |
997 | parent_pte = sp->parent_pte; | 1226 | parent_pte = sp->parent_pte; |
@@ -1006,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1006 | kvm_mmu_put_page(sp, parent_pte); | 1235 | kvm_mmu_put_page(sp, parent_pte); |
1007 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | 1236 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); |
1008 | } | 1237 | } |
1238 | } | ||
1239 | |||
1240 | struct zap_walker { | ||
1241 | struct kvm_unsync_walk walker; | ||
1242 | struct kvm *kvm; | ||
1243 | int zapped; | ||
1244 | }; | ||
1245 | |||
1246 | static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1247 | { | ||
1248 | struct zap_walker *zap_walk = container_of(walk, struct zap_walker, | ||
1249 | walker); | ||
1250 | kvm_mmu_zap_page(zap_walk->kvm, sp); | ||
1251 | zap_walk->zapped = 1; | ||
1252 | return 0; | ||
1253 | } | ||
1254 | |||
1255 | static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1256 | { | ||
1257 | struct zap_walker walker = { | ||
1258 | .walker = { .entry = mmu_zap_fn, }, | ||
1259 | .kvm = kvm, | ||
1260 | .zapped = 0, | ||
1261 | }; | ||
1262 | |||
1263 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
1264 | return 0; | ||
1265 | mmu_unsync_walk(sp, &walker.walker); | ||
1266 | return walker.zapped; | ||
1267 | } | ||
1268 | |||
1269 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1270 | { | ||
1271 | int ret; | ||
1272 | ++kvm->stat.mmu_shadow_zapped; | ||
1273 | ret = mmu_zap_unsync_children(kvm, sp); | ||
1009 | kvm_mmu_page_unlink_children(kvm, sp); | 1274 | kvm_mmu_page_unlink_children(kvm, sp); |
1275 | kvm_mmu_unlink_parents(kvm, sp); | ||
1276 | kvm_flush_remote_tlbs(kvm); | ||
1277 | if (!sp->role.invalid && !sp->role.metaphysical) | ||
1278 | unaccount_shadowed(kvm, sp->gfn); | ||
1279 | if (sp->unsync) | ||
1280 | kvm_unlink_unsync_page(kvm, sp); | ||
1010 | if (!sp->root_count) { | 1281 | if (!sp->root_count) { |
1011 | if (!sp->role.metaphysical && !sp->role.invalid) | ||
1012 | unaccount_shadowed(kvm, sp->gfn); | ||
1013 | hlist_del(&sp->hash_link); | 1282 | hlist_del(&sp->hash_link); |
1014 | kvm_mmu_free_page(kvm, sp); | 1283 | kvm_mmu_free_page(kvm, sp); |
1015 | } else { | 1284 | } else { |
1016 | int invalid = sp->role.invalid; | ||
1017 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
1018 | sp->role.invalid = 1; | 1285 | sp->role.invalid = 1; |
1286 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
1019 | kvm_reload_remote_mmus(kvm); | 1287 | kvm_reload_remote_mmus(kvm); |
1020 | if (!sp->role.metaphysical && !invalid) | ||
1021 | unaccount_shadowed(kvm, sp->gfn); | ||
1022 | } | 1288 | } |
1023 | kvm_mmu_reset_last_pte_updated(kvm); | 1289 | kvm_mmu_reset_last_pte_updated(kvm); |
1290 | return ret; | ||
1024 | } | 1291 | } |
1025 | 1292 | ||
1026 | /* | 1293 | /* |
@@ -1073,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1073 | if (sp->gfn == gfn && !sp->role.metaphysical) { | 1340 | if (sp->gfn == gfn && !sp->role.metaphysical) { |
1074 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1341 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, |
1075 | sp->role.word); | 1342 | sp->role.word); |
1076 | kvm_mmu_zap_page(kvm, sp); | ||
1077 | r = 1; | 1343 | r = 1; |
1344 | if (kvm_mmu_zap_page(kvm, sp)) | ||
1345 | n = bucket->first; | ||
1078 | } | 1346 | } |
1079 | return r; | 1347 | return r; |
1080 | } | 1348 | } |
@@ -1097,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |||
1097 | __set_bit(slot, &sp->slot_bitmap); | 1365 | __set_bit(slot, &sp->slot_bitmap); |
1098 | } | 1366 | } |
1099 | 1367 | ||
1368 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) | ||
1369 | { | ||
1370 | int i; | ||
1371 | u64 *pt = sp->spt; | ||
1372 | |||
1373 | if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) | ||
1374 | return; | ||
1375 | |||
1376 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1377 | if (pt[i] == shadow_notrap_nonpresent_pte) | ||
1378 | set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); | ||
1379 | } | ||
1380 | } | ||
1381 | |||
1100 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | 1382 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) |
1101 | { | 1383 | { |
1102 | struct page *page; | 1384 | struct page *page; |
@@ -1106,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | |||
1106 | if (gpa == UNMAPPED_GVA) | 1388 | if (gpa == UNMAPPED_GVA) |
1107 | return NULL; | 1389 | return NULL; |
1108 | 1390 | ||
1109 | down_read(&current->mm->mmap_sem); | ||
1110 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 1391 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
1111 | up_read(&current->mm->mmap_sem); | ||
1112 | 1392 | ||
1113 | return page; | 1393 | return page; |
1114 | } | 1394 | } |
1115 | 1395 | ||
1116 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1396 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
1117 | unsigned pt_access, unsigned pte_access, | ||
1118 | int user_fault, int write_fault, int dirty, | ||
1119 | int *ptwrite, int largepage, gfn_t gfn, | ||
1120 | pfn_t pfn, bool speculative) | ||
1121 | { | 1397 | { |
1122 | u64 spte; | 1398 | unsigned index; |
1123 | int was_rmapped = 0; | 1399 | struct hlist_head *bucket; |
1124 | int was_writeble = is_writeble_pte(*shadow_pte); | 1400 | struct kvm_mmu_page *s; |
1401 | struct hlist_node *node, *n; | ||
1125 | 1402 | ||
1126 | pgprintk("%s: spte %llx access %x write_fault %d" | 1403 | index = kvm_page_table_hashfn(sp->gfn); |
1127 | " user_fault %d gfn %lx\n", | 1404 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1128 | __func__, *shadow_pte, pt_access, | 1405 | /* don't unsync if pagetable is shadowed with multiple roles */ |
1129 | write_fault, user_fault, gfn); | 1406 | hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { |
1407 | if (s->gfn != sp->gfn || s->role.metaphysical) | ||
1408 | continue; | ||
1409 | if (s->role.word != sp->role.word) | ||
1410 | return 1; | ||
1411 | } | ||
1412 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1413 | ++vcpu->kvm->stat.mmu_unsync; | ||
1414 | sp->unsync = 1; | ||
1415 | mmu_convert_notrap(sp); | ||
1416 | return 0; | ||
1417 | } | ||
1130 | 1418 | ||
1131 | if (is_rmap_pte(*shadow_pte)) { | 1419 | static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, |
1132 | /* | 1420 | bool can_unsync) |
1133 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | 1421 | { |
1134 | * the parent of the now unreachable PTE. | 1422 | struct kvm_mmu_page *shadow; |
1135 | */ | ||
1136 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1137 | struct kvm_mmu_page *child; | ||
1138 | u64 pte = *shadow_pte; | ||
1139 | 1423 | ||
1140 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1424 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); |
1141 | mmu_page_remove_parent_pte(child, shadow_pte); | 1425 | if (shadow) { |
1142 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | 1426 | if (shadow->role.level != PT_PAGE_TABLE_LEVEL) |
1143 | pgprintk("hfn old %lx new %lx\n", | 1427 | return 1; |
1144 | spte_to_pfn(*shadow_pte), pfn); | 1428 | if (shadow->unsync) |
1145 | rmap_remove(vcpu->kvm, shadow_pte); | 1429 | return 0; |
1146 | } else { | 1430 | if (can_unsync && oos_shadow) |
1147 | if (largepage) | 1431 | return kvm_unsync_page(vcpu, shadow); |
1148 | was_rmapped = is_large_pte(*shadow_pte); | 1432 | return 1; |
1149 | else | ||
1150 | was_rmapped = 1; | ||
1151 | } | ||
1152 | } | 1433 | } |
1434 | return 0; | ||
1435 | } | ||
1153 | 1436 | ||
1437 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
1438 | unsigned pte_access, int user_fault, | ||
1439 | int write_fault, int dirty, int largepage, | ||
1440 | gfn_t gfn, pfn_t pfn, bool speculative, | ||
1441 | bool can_unsync) | ||
1442 | { | ||
1443 | u64 spte; | ||
1444 | int ret = 0; | ||
1154 | /* | 1445 | /* |
1155 | * We don't set the accessed bit, since we sometimes want to see | 1446 | * We don't set the accessed bit, since we sometimes want to see |
1156 | * whether the guest actually used the pte (in order to detect | 1447 | * whether the guest actually used the pte (in order to detect |
@@ -1158,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1158 | */ | 1449 | */ |
1159 | spte = shadow_base_present_pte | shadow_dirty_mask; | 1450 | spte = shadow_base_present_pte | shadow_dirty_mask; |
1160 | if (!speculative) | 1451 | if (!speculative) |
1161 | pte_access |= PT_ACCESSED_MASK; | 1452 | spte |= shadow_accessed_mask; |
1162 | if (!dirty) | 1453 | if (!dirty) |
1163 | pte_access &= ~ACC_WRITE_MASK; | 1454 | pte_access &= ~ACC_WRITE_MASK; |
1164 | if (pte_access & ACC_EXEC_MASK) | 1455 | if (pte_access & ACC_EXEC_MASK) |
@@ -1174,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1174 | 1465 | ||
1175 | if ((pte_access & ACC_WRITE_MASK) | 1466 | if ((pte_access & ACC_WRITE_MASK) |
1176 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1467 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { |
1177 | struct kvm_mmu_page *shadow; | 1468 | |
1469 | if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { | ||
1470 | ret = 1; | ||
1471 | spte = shadow_trap_nonpresent_pte; | ||
1472 | goto set_pte; | ||
1473 | } | ||
1178 | 1474 | ||
1179 | spte |= PT_WRITABLE_MASK; | 1475 | spte |= PT_WRITABLE_MASK; |
1180 | 1476 | ||
1181 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | 1477 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1182 | if (shadow || | ||
1183 | (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { | ||
1184 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 1478 | pgprintk("%s: found shadow page for %lx, marking ro\n", |
1185 | __func__, gfn); | 1479 | __func__, gfn); |
1480 | ret = 1; | ||
1186 | pte_access &= ~ACC_WRITE_MASK; | 1481 | pte_access &= ~ACC_WRITE_MASK; |
1187 | if (is_writeble_pte(spte)) { | 1482 | if (is_writeble_pte(spte)) |
1188 | spte &= ~PT_WRITABLE_MASK; | 1483 | spte &= ~PT_WRITABLE_MASK; |
1189 | kvm_x86_ops->tlb_flush(vcpu); | ||
1190 | } | ||
1191 | if (write_fault) | ||
1192 | *ptwrite = 1; | ||
1193 | } | 1484 | } |
1194 | } | 1485 | } |
1195 | 1486 | ||
1196 | if (pte_access & ACC_WRITE_MASK) | 1487 | if (pte_access & ACC_WRITE_MASK) |
1197 | mark_page_dirty(vcpu->kvm, gfn); | 1488 | mark_page_dirty(vcpu->kvm, gfn); |
1198 | 1489 | ||
1199 | pgprintk("%s: setting spte %llx\n", __func__, spte); | 1490 | set_pte: |
1200 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | ||
1201 | (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", | ||
1202 | (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); | ||
1203 | set_shadow_pte(shadow_pte, spte); | 1491 | set_shadow_pte(shadow_pte, spte); |
1204 | if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) | 1492 | return ret; |
1205 | && (spte & PT_PRESENT_MASK)) | 1493 | } |
1494 | |||
1495 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
1496 | unsigned pt_access, unsigned pte_access, | ||
1497 | int user_fault, int write_fault, int dirty, | ||
1498 | int *ptwrite, int largepage, gfn_t gfn, | ||
1499 | pfn_t pfn, bool speculative) | ||
1500 | { | ||
1501 | int was_rmapped = 0; | ||
1502 | int was_writeble = is_writeble_pte(*shadow_pte); | ||
1503 | |||
1504 | pgprintk("%s: spte %llx access %x write_fault %d" | ||
1505 | " user_fault %d gfn %lx\n", | ||
1506 | __func__, *shadow_pte, pt_access, | ||
1507 | write_fault, user_fault, gfn); | ||
1508 | |||
1509 | if (is_rmap_pte(*shadow_pte)) { | ||
1510 | /* | ||
1511 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | ||
1512 | * the parent of the now unreachable PTE. | ||
1513 | */ | ||
1514 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1515 | struct kvm_mmu_page *child; | ||
1516 | u64 pte = *shadow_pte; | ||
1517 | |||
1518 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1519 | mmu_page_remove_parent_pte(child, shadow_pte); | ||
1520 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | ||
1521 | pgprintk("hfn old %lx new %lx\n", | ||
1522 | spte_to_pfn(*shadow_pte), pfn); | ||
1523 | rmap_remove(vcpu->kvm, shadow_pte); | ||
1524 | } else { | ||
1525 | if (largepage) | ||
1526 | was_rmapped = is_large_pte(*shadow_pte); | ||
1527 | else | ||
1528 | was_rmapped = 1; | ||
1529 | } | ||
1530 | } | ||
1531 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | ||
1532 | dirty, largepage, gfn, pfn, speculative, true)) { | ||
1533 | if (write_fault) | ||
1534 | *ptwrite = 1; | ||
1535 | kvm_x86_ops->tlb_flush(vcpu); | ||
1536 | } | ||
1537 | |||
1538 | pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); | ||
1539 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | ||
1540 | is_large_pte(*shadow_pte)? "2MB" : "4kB", | ||
1541 | is_present_pte(*shadow_pte)?"RW":"R", gfn, | ||
1542 | *shadow_pte, shadow_pte); | ||
1543 | if (!was_rmapped && is_large_pte(*shadow_pte)) | ||
1206 | ++vcpu->kvm->stat.lpages; | 1544 | ++vcpu->kvm->stat.lpages; |
1207 | 1545 | ||
1208 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1546 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); |
@@ -1226,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
1226 | { | 1564 | { |
1227 | } | 1565 | } |
1228 | 1566 | ||
1229 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 1567 | struct direct_shadow_walk { |
1230 | int largepage, gfn_t gfn, pfn_t pfn, | 1568 | struct kvm_shadow_walk walker; |
1231 | int level) | 1569 | pfn_t pfn; |
1232 | { | 1570 | int write; |
1233 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | 1571 | int largepage; |
1234 | int pt_write = 0; | 1572 | int pt_write; |
1235 | 1573 | }; | |
1236 | for (; ; level--) { | ||
1237 | u32 index = PT64_INDEX(v, level); | ||
1238 | u64 *table; | ||
1239 | |||
1240 | ASSERT(VALID_PAGE(table_addr)); | ||
1241 | table = __va(table_addr); | ||
1242 | 1574 | ||
1243 | if (level == 1) { | 1575 | static int direct_map_entry(struct kvm_shadow_walk *_walk, |
1244 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1576 | struct kvm_vcpu *vcpu, |
1245 | 0, write, 1, &pt_write, 0, gfn, pfn, false); | 1577 | u64 addr, u64 *sptep, int level) |
1246 | return pt_write; | 1578 | { |
1247 | } | 1579 | struct direct_shadow_walk *walk = |
1580 | container_of(_walk, struct direct_shadow_walk, walker); | ||
1581 | struct kvm_mmu_page *sp; | ||
1582 | gfn_t pseudo_gfn; | ||
1583 | gfn_t gfn = addr >> PAGE_SHIFT; | ||
1584 | |||
1585 | if (level == PT_PAGE_TABLE_LEVEL | ||
1586 | || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { | ||
1587 | mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, | ||
1588 | 0, walk->write, 1, &walk->pt_write, | ||
1589 | walk->largepage, gfn, walk->pfn, false); | ||
1590 | ++vcpu->stat.pf_fixed; | ||
1591 | return 1; | ||
1592 | } | ||
1248 | 1593 | ||
1249 | if (largepage && level == 2) { | 1594 | if (*sptep == shadow_trap_nonpresent_pte) { |
1250 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1595 | pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1251 | 0, write, 1, &pt_write, 1, gfn, pfn, false); | 1596 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, |
1252 | return pt_write; | 1597 | 1, ACC_ALL, sptep); |
1598 | if (!sp) { | ||
1599 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1600 | kvm_release_pfn_clean(walk->pfn); | ||
1601 | return -ENOMEM; | ||
1253 | } | 1602 | } |
1254 | 1603 | ||
1255 | if (table[index] == shadow_trap_nonpresent_pte) { | 1604 | set_shadow_pte(sptep, |
1256 | struct kvm_mmu_page *new_table; | 1605 | __pa(sp->spt) |
1257 | gfn_t pseudo_gfn; | 1606 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1258 | 1607 | | shadow_user_mask | shadow_x_mask); | |
1259 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
1260 | >> PAGE_SHIFT; | ||
1261 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
1262 | v, level - 1, | ||
1263 | 1, ACC_ALL, &table[index]); | ||
1264 | if (!new_table) { | ||
1265 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1266 | kvm_release_pfn_clean(pfn); | ||
1267 | return -ENOMEM; | ||
1268 | } | ||
1269 | |||
1270 | set_shadow_pte(&table[index], | ||
1271 | __pa(new_table->spt) | ||
1272 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | ||
1273 | | shadow_user_mask | shadow_x_mask); | ||
1274 | } | ||
1275 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
1276 | } | 1608 | } |
1609 | return 0; | ||
1610 | } | ||
1611 | |||
1612 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | ||
1613 | int largepage, gfn_t gfn, pfn_t pfn) | ||
1614 | { | ||
1615 | int r; | ||
1616 | struct direct_shadow_walk walker = { | ||
1617 | .walker = { .entry = direct_map_entry, }, | ||
1618 | .pfn = pfn, | ||
1619 | .largepage = largepage, | ||
1620 | .write = write, | ||
1621 | .pt_write = 0, | ||
1622 | }; | ||
1623 | |||
1624 | r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); | ||
1625 | if (r < 0) | ||
1626 | return r; | ||
1627 | return walker.pt_write; | ||
1277 | } | 1628 | } |
1278 | 1629 | ||
1279 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 1630 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
@@ -1283,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1283 | pfn_t pfn; | 1634 | pfn_t pfn; |
1284 | unsigned long mmu_seq; | 1635 | unsigned long mmu_seq; |
1285 | 1636 | ||
1286 | down_read(&current->mm->mmap_sem); | ||
1287 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1637 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1288 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1638 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1289 | largepage = 1; | 1639 | largepage = 1; |
1290 | } | 1640 | } |
1291 | 1641 | ||
1292 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1642 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1293 | /* implicit mb(), we'll read before PT lock is unlocked */ | 1643 | smp_rmb(); |
1294 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1644 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1295 | up_read(&current->mm->mmap_sem); | ||
1296 | 1645 | ||
1297 | /* mmio */ | 1646 | /* mmio */ |
1298 | if (is_error_pfn(pfn)) { | 1647 | if (is_error_pfn(pfn)) { |
@@ -1304,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1304 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 1653 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
1305 | goto out_unlock; | 1654 | goto out_unlock; |
1306 | kvm_mmu_free_some_pages(vcpu); | 1655 | kvm_mmu_free_some_pages(vcpu); |
1307 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, | 1656 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn); |
1308 | PT32E_ROOT_LEVEL); | ||
1309 | spin_unlock(&vcpu->kvm->mmu_lock); | 1657 | spin_unlock(&vcpu->kvm->mmu_lock); |
1310 | 1658 | ||
1311 | 1659 | ||
@@ -1401,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1401 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 1749 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
1402 | } | 1750 | } |
1403 | 1751 | ||
1752 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | ||
1753 | { | ||
1754 | int i; | ||
1755 | struct kvm_mmu_page *sp; | ||
1756 | |||
1757 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1758 | return; | ||
1759 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1760 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1761 | sp = page_header(root); | ||
1762 | mmu_sync_children(vcpu, sp); | ||
1763 | return; | ||
1764 | } | ||
1765 | for (i = 0; i < 4; ++i) { | ||
1766 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1767 | |||
1768 | if (root) { | ||
1769 | root &= PT64_BASE_ADDR_MASK; | ||
1770 | sp = page_header(root); | ||
1771 | mmu_sync_children(vcpu, sp); | ||
1772 | } | ||
1773 | } | ||
1774 | } | ||
1775 | |||
1776 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | ||
1777 | { | ||
1778 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1779 | mmu_sync_roots(vcpu); | ||
1780 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1781 | } | ||
1782 | |||
1404 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | 1783 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) |
1405 | { | 1784 | { |
1406 | return vaddr; | 1785 | return vaddr; |
@@ -1442,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1442 | if (r) | 1821 | if (r) |
1443 | return r; | 1822 | return r; |
1444 | 1823 | ||
1445 | down_read(&current->mm->mmap_sem); | ||
1446 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1824 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1447 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1825 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1448 | largepage = 1; | 1826 | largepage = 1; |
1449 | } | 1827 | } |
1450 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1828 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1451 | /* implicit mb(), we'll read before PT lock is unlocked */ | 1829 | smp_rmb(); |
1452 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1830 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1453 | up_read(&current->mm->mmap_sem); | ||
1454 | if (is_error_pfn(pfn)) { | 1831 | if (is_error_pfn(pfn)) { |
1455 | kvm_release_pfn_clean(pfn); | 1832 | kvm_release_pfn_clean(pfn); |
1456 | return 1; | 1833 | return 1; |
@@ -1460,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1460 | goto out_unlock; | 1837 | goto out_unlock; |
1461 | kvm_mmu_free_some_pages(vcpu); | 1838 | kvm_mmu_free_some_pages(vcpu); |
1462 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 1839 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
1463 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); | 1840 | largepage, gfn, pfn); |
1464 | spin_unlock(&vcpu->kvm->mmu_lock); | 1841 | spin_unlock(&vcpu->kvm->mmu_lock); |
1465 | 1842 | ||
1466 | return r; | 1843 | return r; |
@@ -1485,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
1485 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 1862 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
1486 | context->free = nonpaging_free; | 1863 | context->free = nonpaging_free; |
1487 | context->prefetch_page = nonpaging_prefetch_page; | 1864 | context->prefetch_page = nonpaging_prefetch_page; |
1865 | context->sync_page = nonpaging_sync_page; | ||
1866 | context->invlpg = nonpaging_invlpg; | ||
1488 | context->root_level = 0; | 1867 | context->root_level = 0; |
1489 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1868 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1490 | context->root_hpa = INVALID_PAGE; | 1869 | context->root_hpa = INVALID_PAGE; |
@@ -1532,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
1532 | context->page_fault = paging64_page_fault; | 1911 | context->page_fault = paging64_page_fault; |
1533 | context->gva_to_gpa = paging64_gva_to_gpa; | 1912 | context->gva_to_gpa = paging64_gva_to_gpa; |
1534 | context->prefetch_page = paging64_prefetch_page; | 1913 | context->prefetch_page = paging64_prefetch_page; |
1914 | context->sync_page = paging64_sync_page; | ||
1915 | context->invlpg = paging64_invlpg; | ||
1535 | context->free = paging_free; | 1916 | context->free = paging_free; |
1536 | context->root_level = level; | 1917 | context->root_level = level; |
1537 | context->shadow_root_level = level; | 1918 | context->shadow_root_level = level; |
@@ -1553,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
1553 | context->gva_to_gpa = paging32_gva_to_gpa; | 1934 | context->gva_to_gpa = paging32_gva_to_gpa; |
1554 | context->free = paging_free; | 1935 | context->free = paging_free; |
1555 | context->prefetch_page = paging32_prefetch_page; | 1936 | context->prefetch_page = paging32_prefetch_page; |
1937 | context->sync_page = paging32_sync_page; | ||
1938 | context->invlpg = paging32_invlpg; | ||
1556 | context->root_level = PT32_ROOT_LEVEL; | 1939 | context->root_level = PT32_ROOT_LEVEL; |
1557 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1940 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1558 | context->root_hpa = INVALID_PAGE; | 1941 | context->root_hpa = INVALID_PAGE; |
@@ -1572,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
1572 | context->page_fault = tdp_page_fault; | 1955 | context->page_fault = tdp_page_fault; |
1573 | context->free = nonpaging_free; | 1956 | context->free = nonpaging_free; |
1574 | context->prefetch_page = nonpaging_prefetch_page; | 1957 | context->prefetch_page = nonpaging_prefetch_page; |
1958 | context->sync_page = nonpaging_sync_page; | ||
1959 | context->invlpg = nonpaging_invlpg; | ||
1575 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 1960 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
1576 | context->root_hpa = INVALID_PAGE; | 1961 | context->root_hpa = INVALID_PAGE; |
1577 | 1962 | ||
@@ -1643,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
1643 | spin_lock(&vcpu->kvm->mmu_lock); | 2028 | spin_lock(&vcpu->kvm->mmu_lock); |
1644 | kvm_mmu_free_some_pages(vcpu); | 2029 | kvm_mmu_free_some_pages(vcpu); |
1645 | mmu_alloc_roots(vcpu); | 2030 | mmu_alloc_roots(vcpu); |
2031 | mmu_sync_roots(vcpu); | ||
1646 | spin_unlock(&vcpu->kvm->mmu_lock); | 2032 | spin_unlock(&vcpu->kvm->mmu_lock); |
1647 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2033 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
1648 | kvm_mmu_flush_tlb(vcpu); | 2034 | kvm_mmu_flush_tlb(vcpu); |
@@ -1763,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1763 | return; | 2149 | return; |
1764 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 2150 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1765 | 2151 | ||
1766 | down_read(&current->mm->mmap_sem); | ||
1767 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { | 2152 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { |
1768 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 2153 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1769 | vcpu->arch.update_pte.largepage = 1; | 2154 | vcpu->arch.update_pte.largepage = 1; |
1770 | } | 2155 | } |
1771 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2156 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1772 | /* implicit mb(), we'll read before PT lock is unlocked */ | 2157 | smp_rmb(); |
1773 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2158 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1774 | up_read(&current->mm->mmap_sem); | ||
1775 | 2159 | ||
1776 | if (is_error_pfn(pfn)) { | 2160 | if (is_error_pfn(pfn)) { |
1777 | kvm_release_pfn_clean(pfn); | 2161 | kvm_release_pfn_clean(pfn); |
@@ -1833,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1833 | index = kvm_page_table_hashfn(gfn); | 2217 | index = kvm_page_table_hashfn(gfn); |
1834 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 2218 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1835 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | 2219 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { |
1836 | if (sp->gfn != gfn || sp->role.metaphysical) | 2220 | if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) |
1837 | continue; | 2221 | continue; |
1838 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | 2222 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; |
1839 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | 2223 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); |
@@ -1851,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1851 | */ | 2235 | */ |
1852 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | 2236 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", |
1853 | gpa, bytes, sp->role.word); | 2237 | gpa, bytes, sp->role.word); |
1854 | kvm_mmu_zap_page(vcpu->kvm, sp); | 2238 | if (kvm_mmu_zap_page(vcpu->kvm, sp)) |
2239 | n = bucket->first; | ||
1855 | ++vcpu->kvm->stat.mmu_flooded; | 2240 | ++vcpu->kvm->stat.mmu_flooded; |
1856 | continue; | 2241 | continue; |
1857 | } | 2242 | } |
@@ -1965,6 +2350,16 @@ out: | |||
1965 | } | 2350 | } |
1966 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | 2351 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); |
1967 | 2352 | ||
2353 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | ||
2354 | { | ||
2355 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2356 | vcpu->arch.mmu.invlpg(vcpu, gva); | ||
2357 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2358 | kvm_mmu_flush_tlb(vcpu); | ||
2359 | ++vcpu->stat.invlpg; | ||
2360 | } | ||
2361 | EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); | ||
2362 | |||
1968 | void kvm_enable_tdp(void) | 2363 | void kvm_enable_tdp(void) |
1969 | { | 2364 | { |
1970 | tdp_enabled = true; | 2365 | tdp_enabled = true; |
@@ -2051,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2051 | { | 2446 | { |
2052 | struct kvm_mmu_page *sp; | 2447 | struct kvm_mmu_page *sp; |
2053 | 2448 | ||
2449 | spin_lock(&kvm->mmu_lock); | ||
2054 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 2450 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { |
2055 | int i; | 2451 | int i; |
2056 | u64 *pt; | 2452 | u64 *pt; |
@@ -2064,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2064 | if (pt[i] & PT_WRITABLE_MASK) | 2460 | if (pt[i] & PT_WRITABLE_MASK) |
2065 | pt[i] &= ~PT_WRITABLE_MASK; | 2461 | pt[i] &= ~PT_WRITABLE_MASK; |
2066 | } | 2462 | } |
2463 | kvm_flush_remote_tlbs(kvm); | ||
2464 | spin_unlock(&kvm->mmu_lock); | ||
2067 | } | 2465 | } |
2068 | 2466 | ||
2069 | void kvm_mmu_zap_all(struct kvm *kvm) | 2467 | void kvm_mmu_zap_all(struct kvm *kvm) |
@@ -2072,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) | |||
2072 | 2470 | ||
2073 | spin_lock(&kvm->mmu_lock); | 2471 | spin_lock(&kvm->mmu_lock); |
2074 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | 2472 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) |
2075 | kvm_mmu_zap_page(kvm, sp); | 2473 | if (kvm_mmu_zap_page(kvm, sp)) |
2474 | node = container_of(kvm->arch.active_mmu_pages.next, | ||
2475 | struct kvm_mmu_page, link); | ||
2076 | spin_unlock(&kvm->mmu_lock); | 2476 | spin_unlock(&kvm->mmu_lock); |
2077 | 2477 | ||
2078 | kvm_flush_remote_tlbs(kvm); | 2478 | kvm_flush_remote_tlbs(kvm); |
@@ -2287,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | |||
2287 | gpa_t addr, unsigned long *ret) | 2687 | gpa_t addr, unsigned long *ret) |
2288 | { | 2688 | { |
2289 | int r; | 2689 | int r; |
2290 | struct kvm_pv_mmu_op_buffer buffer; | 2690 | struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; |
2291 | 2691 | ||
2292 | buffer.ptr = buffer.buf; | 2692 | buffer->ptr = buffer->buf; |
2293 | buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); | 2693 | buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); |
2294 | buffer.processed = 0; | 2694 | buffer->processed = 0; |
2295 | 2695 | ||
2296 | r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); | 2696 | r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); |
2297 | if (r) | 2697 | if (r) |
2298 | goto out; | 2698 | goto out; |
2299 | 2699 | ||
2300 | while (buffer.len) { | 2700 | while (buffer->len) { |
2301 | r = kvm_pv_mmu_op_one(vcpu, &buffer); | 2701 | r = kvm_pv_mmu_op_one(vcpu, buffer); |
2302 | if (r < 0) | 2702 | if (r < 0) |
2303 | goto out; | 2703 | goto out; |
2304 | if (r == 0) | 2704 | if (r == 0) |
@@ -2307,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | |||
2307 | 2707 | ||
2308 | r = 1; | 2708 | r = 1; |
2309 | out: | 2709 | out: |
2310 | *ret = buffer.processed; | 2710 | *ret = buffer->processed; |
2311 | return r; | 2711 | return r; |
2312 | } | 2712 | } |
2313 | 2713 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4a814bff21f2..613ec9aa674a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -25,11 +25,11 @@ | |||
25 | #if PTTYPE == 64 | 25 | #if PTTYPE == 64 |
26 | #define pt_element_t u64 | 26 | #define pt_element_t u64 |
27 | #define guest_walker guest_walker64 | 27 | #define guest_walker guest_walker64 |
28 | #define shadow_walker shadow_walker64 | ||
28 | #define FNAME(name) paging##64_##name | 29 | #define FNAME(name) paging##64_##name |
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | 30 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK |
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | 31 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK |
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
35 | #ifdef CONFIG_X86_64 | 35 | #ifdef CONFIG_X86_64 |
@@ -42,11 +42,11 @@ | |||
42 | #elif PTTYPE == 32 | 42 | #elif PTTYPE == 32 |
43 | #define pt_element_t u32 | 43 | #define pt_element_t u32 |
44 | #define guest_walker guest_walker32 | 44 | #define guest_walker guest_walker32 |
45 | #define shadow_walker shadow_walker32 | ||
45 | #define FNAME(name) paging##32_##name | 46 | #define FNAME(name) paging##32_##name |
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | 47 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK |
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | 48 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK |
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
52 | #define PT_MAX_FULL_LEVELS 2 | 52 | #define PT_MAX_FULL_LEVELS 2 |
@@ -73,6 +73,17 @@ struct guest_walker { | |||
73 | u32 error_code; | 73 | u32 error_code; |
74 | }; | 74 | }; |
75 | 75 | ||
76 | struct shadow_walker { | ||
77 | struct kvm_shadow_walk walker; | ||
78 | struct guest_walker *guest_walker; | ||
79 | int user_fault; | ||
80 | int write_fault; | ||
81 | int largepage; | ||
82 | int *ptwrite; | ||
83 | pfn_t pfn; | ||
84 | u64 *sptep; | ||
85 | }; | ||
86 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | 87 | static gfn_t gpte_to_gfn(pt_element_t gpte) |
77 | { | 88 | { |
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 89 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; |
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | |||
91 | pt_element_t *table; | 102 | pt_element_t *table; |
92 | struct page *page; | 103 | struct page *page; |
93 | 104 | ||
94 | down_read(&current->mm->mmap_sem); | ||
95 | page = gfn_to_page(kvm, table_gfn); | 105 | page = gfn_to_page(kvm, table_gfn); |
96 | up_read(&current->mm->mmap_sem); | ||
97 | 106 | ||
98 | table = kmap_atomic(page, KM_USER0); | 107 | table = kmap_atomic(page, KM_USER0); |
99 | |||
100 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | 108 | ret = CMPXCHG(&table[index], orig_pte, new_pte); |
101 | |||
102 | kunmap_atomic(table, KM_USER0); | 109 | kunmap_atomic(table, KM_USER0); |
103 | 110 | ||
104 | kvm_release_page_dirty(page); | 111 | kvm_release_page_dirty(page); |
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
274 | /* | 281 | /* |
275 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 282 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
276 | */ | 283 | */ |
277 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 284 | static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, |
278 | struct guest_walker *walker, | 285 | struct kvm_vcpu *vcpu, u64 addr, |
279 | int user_fault, int write_fault, int largepage, | 286 | u64 *sptep, int level) |
280 | int *ptwrite, pfn_t pfn) | ||
281 | { | 287 | { |
282 | hpa_t shadow_addr; | 288 | struct shadow_walker *sw = |
283 | int level; | 289 | container_of(_sw, struct shadow_walker, walker); |
284 | u64 *shadow_ent; | 290 | struct guest_walker *gw = sw->guest_walker; |
285 | unsigned access = walker->pt_access; | 291 | unsigned access = gw->pt_access; |
286 | 292 | struct kvm_mmu_page *shadow_page; | |
287 | if (!is_present_pte(walker->ptes[walker->level - 1])) | 293 | u64 spte; |
288 | return NULL; | 294 | int metaphysical; |
289 | 295 | gfn_t table_gfn; | |
290 | shadow_addr = vcpu->arch.mmu.root_hpa; | 296 | int r; |
291 | level = vcpu->arch.mmu.shadow_root_level; | 297 | pt_element_t curr_pte; |
292 | if (level == PT32E_ROOT_LEVEL) { | 298 | |
293 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 299 | if (level == PT_PAGE_TABLE_LEVEL |
294 | shadow_addr &= PT64_BASE_ADDR_MASK; | 300 | || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { |
295 | --level; | 301 | mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, |
302 | sw->user_fault, sw->write_fault, | ||
303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | ||
304 | sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, | ||
305 | false); | ||
306 | sw->sptep = sptep; | ||
307 | return 1; | ||
296 | } | 308 | } |
297 | 309 | ||
298 | for (; ; level--) { | 310 | if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) |
299 | u32 index = SHADOW_PT_INDEX(addr, level); | 311 | return 0; |
300 | struct kvm_mmu_page *shadow_page; | ||
301 | u64 shadow_pte; | ||
302 | int metaphysical; | ||
303 | gfn_t table_gfn; | ||
304 | |||
305 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
306 | if (level == PT_PAGE_TABLE_LEVEL) | ||
307 | break; | ||
308 | |||
309 | if (largepage && level == PT_DIRECTORY_LEVEL) | ||
310 | break; | ||
311 | 312 | ||
312 | if (is_shadow_present_pte(*shadow_ent) | 313 | if (is_large_pte(*sptep)) { |
313 | && !is_large_pte(*shadow_ent)) { | 314 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); |
314 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | 315 | kvm_flush_remote_tlbs(vcpu->kvm); |
315 | continue; | 316 | rmap_remove(vcpu->kvm, sptep); |
316 | } | 317 | } |
317 | 318 | ||
318 | if (is_large_pte(*shadow_ent)) | 319 | if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { |
319 | rmap_remove(vcpu->kvm, shadow_ent); | 320 | metaphysical = 1; |
320 | 321 | if (!is_dirty_pte(gw->ptes[level - 1])) | |
321 | if (level - 1 == PT_PAGE_TABLE_LEVEL | 322 | access &= ~ACC_WRITE_MASK; |
322 | && walker->level == PT_DIRECTORY_LEVEL) { | 323 | table_gfn = gpte_to_gfn(gw->ptes[level - 1]); |
323 | metaphysical = 1; | 324 | } else { |
324 | if (!is_dirty_pte(walker->ptes[level - 1])) | 325 | metaphysical = 0; |
325 | access &= ~ACC_WRITE_MASK; | 326 | table_gfn = gw->table_gfn[level - 2]; |
326 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | 327 | } |
327 | } else { | 328 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, |
328 | metaphysical = 0; | 329 | metaphysical, access, sptep); |
329 | table_gfn = walker->table_gfn[level - 2]; | 330 | if (!metaphysical) { |
330 | } | 331 | r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], |
331 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | 332 | &curr_pte, sizeof(curr_pte)); |
332 | metaphysical, access, | 333 | if (r || curr_pte != gw->ptes[level - 2]) { |
333 | shadow_ent); | 334 | kvm_release_pfn_clean(sw->pfn); |
334 | if (!metaphysical) { | 335 | sw->sptep = NULL; |
335 | int r; | 336 | return 1; |
336 | pt_element_t curr_pte; | ||
337 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
338 | walker->pte_gpa[level - 2], | ||
339 | &curr_pte, sizeof(curr_pte)); | ||
340 | if (r || curr_pte != walker->ptes[level - 2]) { | ||
341 | kvm_release_pfn_clean(pfn); | ||
342 | return NULL; | ||
343 | } | ||
344 | } | 337 | } |
345 | shadow_addr = __pa(shadow_page->spt); | ||
346 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
347 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
348 | set_shadow_pte(shadow_ent, shadow_pte); | ||
349 | } | 338 | } |
350 | 339 | ||
351 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | 340 | spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK |
352 | user_fault, write_fault, | 341 | | PT_WRITABLE_MASK | PT_USER_MASK; |
353 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | 342 | *sptep = spte; |
354 | ptwrite, largepage, walker->gfn, pfn, false); | 343 | return 0; |
344 | } | ||
345 | |||
346 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
347 | struct guest_walker *guest_walker, | ||
348 | int user_fault, int write_fault, int largepage, | ||
349 | int *ptwrite, pfn_t pfn) | ||
350 | { | ||
351 | struct shadow_walker walker = { | ||
352 | .walker = { .entry = FNAME(shadow_walk_entry), }, | ||
353 | .guest_walker = guest_walker, | ||
354 | .user_fault = user_fault, | ||
355 | .write_fault = write_fault, | ||
356 | .largepage = largepage, | ||
357 | .ptwrite = ptwrite, | ||
358 | .pfn = pfn, | ||
359 | }; | ||
360 | |||
361 | if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) | ||
362 | return NULL; | ||
363 | |||
364 | walk_shadow(&walker.walker, vcpu, addr); | ||
355 | 365 | ||
356 | return shadow_ent; | 366 | return walker.sptep; |
357 | } | 367 | } |
358 | 368 | ||
359 | /* | 369 | /* |
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
407 | return 0; | 417 | return 0; |
408 | } | 418 | } |
409 | 419 | ||
410 | down_read(&current->mm->mmap_sem); | ||
411 | if (walker.level == PT_DIRECTORY_LEVEL) { | 420 | if (walker.level == PT_DIRECTORY_LEVEL) { |
412 | gfn_t large_gfn; | 421 | gfn_t large_gfn; |
413 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); | 422 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); |
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
417 | } | 426 | } |
418 | } | 427 | } |
419 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 428 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
420 | /* implicit mb(), we'll read before PT lock is unlocked */ | 429 | smp_rmb(); |
421 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 430 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
422 | up_read(&current->mm->mmap_sem); | ||
423 | 431 | ||
424 | /* mmio */ | 432 | /* mmio */ |
425 | if (is_error_pfn(pfn)) { | 433 | if (is_error_pfn(pfn)) { |
@@ -453,6 +461,31 @@ out_unlock: | |||
453 | return 0; | 461 | return 0; |
454 | } | 462 | } |
455 | 463 | ||
464 | static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, | ||
465 | struct kvm_vcpu *vcpu, u64 addr, | ||
466 | u64 *sptep, int level) | ||
467 | { | ||
468 | |||
469 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
470 | if (is_shadow_present_pte(*sptep)) | ||
471 | rmap_remove(vcpu->kvm, sptep); | ||
472 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | ||
473 | return 1; | ||
474 | } | ||
475 | if (!is_shadow_present_pte(*sptep)) | ||
476 | return 1; | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | ||
481 | { | ||
482 | struct shadow_walker walker = { | ||
483 | .walker = { .entry = FNAME(shadow_invlpg_entry), }, | ||
484 | }; | ||
485 | |||
486 | walk_shadow(&walker.walker, vcpu, gva); | ||
487 | } | ||
488 | |||
456 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 489 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
457 | { | 490 | { |
458 | struct guest_walker walker; | 491 | struct guest_walker walker; |
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
499 | } | 532 | } |
500 | } | 533 | } |
501 | 534 | ||
535 | /* | ||
536 | * Using the cached information from sp->gfns is safe because: | ||
537 | * - The spte has a reference to the struct page, so the pfn for a given gfn | ||
538 | * can't change unless all sptes pointing to it are nuked first. | ||
539 | * - Alias changes zap the entire shadow cache. | ||
540 | */ | ||
541 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
542 | { | ||
543 | int i, offset, nr_present; | ||
544 | |||
545 | offset = nr_present = 0; | ||
546 | |||
547 | if (PTTYPE == 32) | ||
548 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
549 | |||
550 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) { | ||
551 | unsigned pte_access; | ||
552 | pt_element_t gpte; | ||
553 | gpa_t pte_gpa; | ||
554 | gfn_t gfn = sp->gfns[i]; | ||
555 | |||
556 | if (!is_shadow_present_pte(sp->spt[i])) | ||
557 | continue; | ||
558 | |||
559 | pte_gpa = gfn_to_gpa(sp->gfn); | ||
560 | pte_gpa += (i+offset) * sizeof(pt_element_t); | ||
561 | |||
562 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, | ||
563 | sizeof(pt_element_t))) | ||
564 | return -EINVAL; | ||
565 | |||
566 | if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || | ||
567 | !(gpte & PT_ACCESSED_MASK)) { | ||
568 | u64 nonpresent; | ||
569 | |||
570 | rmap_remove(vcpu->kvm, &sp->spt[i]); | ||
571 | if (is_present_pte(gpte)) | ||
572 | nonpresent = shadow_trap_nonpresent_pte; | ||
573 | else | ||
574 | nonpresent = shadow_notrap_nonpresent_pte; | ||
575 | set_shadow_pte(&sp->spt[i], nonpresent); | ||
576 | continue; | ||
577 | } | ||
578 | |||
579 | nr_present++; | ||
580 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
581 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | ||
582 | is_dirty_pte(gpte), 0, gfn, | ||
583 | spte_to_pfn(sp->spt[i]), true, false); | ||
584 | } | ||
585 | |||
586 | return !nr_present; | ||
587 | } | ||
588 | |||
502 | #undef pt_element_t | 589 | #undef pt_element_t |
503 | #undef guest_walker | 590 | #undef guest_walker |
591 | #undef shadow_walker | ||
504 | #undef FNAME | 592 | #undef FNAME |
505 | #undef PT_BASE_ADDR_MASK | 593 | #undef PT_BASE_ADDR_MASK |
506 | #undef PT_INDEX | 594 | #undef PT_INDEX |
507 | #undef SHADOW_PT_INDEX | ||
508 | #undef PT_LEVEL_MASK | 595 | #undef PT_LEVEL_MASK |
509 | #undef PT_DIR_BASE_ADDR_MASK | 596 | #undef PT_DIR_BASE_ADDR_MASK |
510 | #undef PT_LEVEL_BITS | 597 | #undef PT_LEVEL_BITS |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e2ee264740c7..9c4ce657d963 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include "kvm_svm.h" | 18 | #include "kvm_svm.h" |
19 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | ||
21 | 22 | ||
22 | #include <linux/module.h> | 23 | #include <linux/module.h> |
23 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL"); | |||
35 | #define IOPM_ALLOC_ORDER 2 | 36 | #define IOPM_ALLOC_ORDER 2 |
36 | #define MSRPM_ALLOC_ORDER 1 | 37 | #define MSRPM_ALLOC_ORDER 1 |
37 | 38 | ||
38 | #define DB_VECTOR 1 | ||
39 | #define UD_VECTOR 6 | ||
40 | #define GP_VECTOR 13 | ||
41 | |||
42 | #define DR7_GD_MASK (1 << 13) | 39 | #define DR7_GD_MASK (1 << 13) |
43 | #define DR6_BD_MASK (1 << 13) | 40 | #define DR6_BD_MASK (1 << 13) |
44 | 41 | ||
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL"); | |||
47 | 44 | ||
48 | #define SVM_FEATURE_NPT (1 << 0) | 45 | #define SVM_FEATURE_NPT (1 << 0) |
49 | #define SVM_FEATURE_LBRV (1 << 1) | 46 | #define SVM_FEATURE_LBRV (1 << 1) |
50 | #define SVM_DEATURE_SVML (1 << 2) | 47 | #define SVM_FEATURE_SVML (1 << 2) |
51 | 48 | ||
52 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | 49 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) |
53 | 50 | ||
@@ -62,6 +59,7 @@ static int npt = 1; | |||
62 | module_param(npt, int, S_IRUGO); | 59 | module_param(npt, int, S_IRUGO); |
63 | 60 | ||
64 | static void kvm_reput_irq(struct vcpu_svm *svm); | 61 | static void kvm_reput_irq(struct vcpu_svm *svm); |
62 | static void svm_flush_tlb(struct kvm_vcpu *vcpu); | ||
65 | 63 | ||
66 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | 64 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) |
67 | { | 65 | { |
@@ -235,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
235 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 233 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
236 | return; | 234 | return; |
237 | } | 235 | } |
238 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) | 236 | if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) |
239 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | 237 | printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", |
240 | __func__, | 238 | __func__, kvm_rip_read(vcpu), svm->next_rip); |
241 | svm->vmcb->save.rip, | ||
242 | svm->next_rip); | ||
243 | 239 | ||
244 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; | 240 | kvm_rip_write(vcpu, svm->next_rip); |
245 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 241 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; |
246 | 242 | ||
247 | vcpu->arch.interrupt_window_open = 1; | 243 | vcpu->arch.interrupt_window_open = 1; |
@@ -529,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
529 | (1ULL << INTERCEPT_CPUID) | | 525 | (1ULL << INTERCEPT_CPUID) | |
530 | (1ULL << INTERCEPT_INVD) | | 526 | (1ULL << INTERCEPT_INVD) | |
531 | (1ULL << INTERCEPT_HLT) | | 527 | (1ULL << INTERCEPT_HLT) | |
528 | (1ULL << INTERCEPT_INVLPG) | | ||
532 | (1ULL << INTERCEPT_INVLPGA) | | 529 | (1ULL << INTERCEPT_INVLPGA) | |
533 | (1ULL << INTERCEPT_IOIO_PROT) | | 530 | (1ULL << INTERCEPT_IOIO_PROT) | |
534 | (1ULL << INTERCEPT_MSR_PROT) | | 531 | (1ULL << INTERCEPT_MSR_PROT) | |
@@ -580,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
580 | save->dr7 = 0x400; | 577 | save->dr7 = 0x400; |
581 | save->rflags = 2; | 578 | save->rflags = 2; |
582 | save->rip = 0x0000fff0; | 579 | save->rip = 0x0000fff0; |
580 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; | ||
583 | 581 | ||
584 | /* | 582 | /* |
585 | * cr0 val on cpu init should be 0x60000010, we enable cpu | 583 | * cr0 val on cpu init should be 0x60000010, we enable cpu |
@@ -592,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
592 | if (npt_enabled) { | 590 | if (npt_enabled) { |
593 | /* Setup VMCB for Nested Paging */ | 591 | /* Setup VMCB for Nested Paging */ |
594 | control->nested_ctl = 1; | 592 | control->nested_ctl = 1; |
595 | control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); | 593 | control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | |
594 | (1ULL << INTERCEPT_INVLPG)); | ||
596 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | 595 | control->intercept_exceptions &= ~(1 << PF_VECTOR); |
597 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| | 596 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| |
598 | INTERCEPT_CR3_MASK); | 597 | INTERCEPT_CR3_MASK); |
@@ -614,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
614 | init_vmcb(svm); | 613 | init_vmcb(svm); |
615 | 614 | ||
616 | if (vcpu->vcpu_id != 0) { | 615 | if (vcpu->vcpu_id != 0) { |
617 | svm->vmcb->save.rip = 0; | 616 | kvm_rip_write(vcpu, 0); |
618 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | 617 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
619 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | 618 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
620 | } | 619 | } |
620 | vcpu->arch.regs_avail = ~0; | ||
621 | vcpu->arch.regs_dirty = ~0; | ||
621 | 622 | ||
622 | return 0; | 623 | return 0; |
623 | } | 624 | } |
@@ -720,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
720 | rdtscll(vcpu->arch.host_tsc); | 721 | rdtscll(vcpu->arch.host_tsc); |
721 | } | 722 | } |
722 | 723 | ||
723 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
724 | { | ||
725 | struct vcpu_svm *svm = to_svm(vcpu); | ||
726 | |||
727 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
728 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
729 | vcpu->arch.rip = svm->vmcb->save.rip; | ||
730 | } | ||
731 | |||
732 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
733 | { | ||
734 | struct vcpu_svm *svm = to_svm(vcpu); | ||
735 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
736 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
737 | svm->vmcb->save.rip = vcpu->arch.rip; | ||
738 | } | ||
739 | |||
740 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 724 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
741 | { | 725 | { |
742 | return to_svm(vcpu)->vmcb->save.rflags; | 726 | return to_svm(vcpu)->vmcb->save.rflags; |
@@ -878,6 +862,10 @@ set: | |||
878 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 862 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
879 | { | 863 | { |
880 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; | 864 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; |
865 | unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; | ||
866 | |||
867 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) | ||
868 | force_new_asid(vcpu); | ||
881 | 869 | ||
882 | vcpu->arch.cr4 = cr4; | 870 | vcpu->arch.cr4 = cr4; |
883 | if (!npt_enabled) | 871 | if (!npt_enabled) |
@@ -1027,8 +1015,15 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1027 | KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, | 1015 | KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, |
1028 | (u32)fault_address, (u32)(fault_address >> 32), | 1016 | (u32)fault_address, (u32)(fault_address >> 32), |
1029 | handler); | 1017 | handler); |
1018 | /* | ||
1019 | * FIXME: This shouldn't be necessary here, but there is a flush | ||
1020 | * missing in the MMU code. Until we find this bug, flush the | ||
1021 | * complete TLB here on an NPF | ||
1022 | */ | ||
1023 | if (npt_enabled) | ||
1024 | svm_flush_tlb(&svm->vcpu); | ||
1030 | 1025 | ||
1031 | if (event_injection) | 1026 | if (!npt_enabled && event_injection) |
1032 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1027 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1033 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1028 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
1034 | } | 1029 | } |
@@ -1127,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1127 | 1122 | ||
1128 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1123 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1129 | { | 1124 | { |
1130 | svm->next_rip = svm->vmcb->save.rip + 1; | 1125 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; |
1131 | skip_emulated_instruction(&svm->vcpu); | 1126 | skip_emulated_instruction(&svm->vcpu); |
1132 | return kvm_emulate_halt(&svm->vcpu); | 1127 | return kvm_emulate_halt(&svm->vcpu); |
1133 | } | 1128 | } |
1134 | 1129 | ||
1135 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1130 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1136 | { | 1131 | { |
1137 | svm->next_rip = svm->vmcb->save.rip + 3; | 1132 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1138 | skip_emulated_instruction(&svm->vcpu); | 1133 | skip_emulated_instruction(&svm->vcpu); |
1139 | kvm_emulate_hypercall(&svm->vcpu); | 1134 | kvm_emulate_hypercall(&svm->vcpu); |
1140 | return 1; | 1135 | return 1; |
@@ -1166,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm, | |||
1166 | 1161 | ||
1167 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1162 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1168 | { | 1163 | { |
1169 | svm->next_rip = svm->vmcb->save.rip + 2; | 1164 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1170 | kvm_emulate_cpuid(&svm->vcpu); | 1165 | kvm_emulate_cpuid(&svm->vcpu); |
1171 | return 1; | 1166 | return 1; |
1172 | } | 1167 | } |
1173 | 1168 | ||
1169 | static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1170 | { | ||
1171 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) | ||
1172 | pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); | ||
1173 | return 1; | ||
1174 | } | ||
1175 | |||
1174 | static int emulate_on_interception(struct vcpu_svm *svm, | 1176 | static int emulate_on_interception(struct vcpu_svm *svm, |
1175 | struct kvm_run *kvm_run) | 1177 | struct kvm_run *kvm_run) |
1176 | { | 1178 | { |
@@ -1261,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1261 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, | 1263 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, |
1262 | (u32)(data >> 32), handler); | 1264 | (u32)(data >> 32), handler); |
1263 | 1265 | ||
1264 | svm->vmcb->save.rax = data & 0xffffffff; | 1266 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; |
1265 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | 1267 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
1266 | svm->next_rip = svm->vmcb->save.rip + 2; | 1268 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1267 | skip_emulated_instruction(&svm->vcpu); | 1269 | skip_emulated_instruction(&svm->vcpu); |
1268 | } | 1270 | } |
1269 | return 1; | 1271 | return 1; |
@@ -1347,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
1347 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1349 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1348 | { | 1350 | { |
1349 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | 1351 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
1350 | u64 data = (svm->vmcb->save.rax & -1u) | 1352 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) |
1351 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 1353 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
1352 | 1354 | ||
1353 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), | 1355 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), |
1354 | handler); | 1356 | handler); |
1355 | 1357 | ||
1356 | svm->next_rip = svm->vmcb->save.rip + 2; | 1358 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1357 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 1359 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
1358 | kvm_inject_gp(&svm->vcpu, 0); | 1360 | kvm_inject_gp(&svm->vcpu, 0); |
1359 | else | 1361 | else |
@@ -1424,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1424 | [SVM_EXIT_CPUID] = cpuid_interception, | 1426 | [SVM_EXIT_CPUID] = cpuid_interception, |
1425 | [SVM_EXIT_INVD] = emulate_on_interception, | 1427 | [SVM_EXIT_INVD] = emulate_on_interception, |
1426 | [SVM_EXIT_HLT] = halt_interception, | 1428 | [SVM_EXIT_HLT] = halt_interception, |
1427 | [SVM_EXIT_INVLPG] = emulate_on_interception, | 1429 | [SVM_EXIT_INVLPG] = invlpg_interception, |
1428 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | 1430 | [SVM_EXIT_INVLPGA] = invalid_op_interception, |
1429 | [SVM_EXIT_IOIO] = io_interception, | 1431 | [SVM_EXIT_IOIO] = io_interception, |
1430 | [SVM_EXIT_MSR] = msr_interception, | 1432 | [SVM_EXIT_MSR] = msr_interception, |
@@ -1526,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
1526 | 1528 | ||
1527 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); | 1529 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); |
1528 | 1530 | ||
1531 | ++svm->vcpu.stat.irq_injections; | ||
1529 | control = &svm->vmcb->control; | 1532 | control = &svm->vmcb->control; |
1530 | control->int_vector = irq; | 1533 | control->int_vector = irq; |
1531 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 1534 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
@@ -1704,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
1704 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; | 1707 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; |
1705 | } | 1708 | } |
1706 | 1709 | ||
1710 | #ifdef CONFIG_X86_64 | ||
1711 | #define R "r" | ||
1712 | #else | ||
1713 | #define R "e" | ||
1714 | #endif | ||
1715 | |||
1707 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1716 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1708 | { | 1717 | { |
1709 | struct vcpu_svm *svm = to_svm(vcpu); | 1718 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1711,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1711 | u16 gs_selector; | 1720 | u16 gs_selector; |
1712 | u16 ldt_selector; | 1721 | u16 ldt_selector; |
1713 | 1722 | ||
1723 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
1724 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
1725 | svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; | ||
1726 | |||
1714 | pre_svm_run(svm); | 1727 | pre_svm_run(svm); |
1715 | 1728 | ||
1716 | sync_lapic_to_cr8(vcpu); | 1729 | sync_lapic_to_cr8(vcpu); |
@@ -1738,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1738 | local_irq_enable(); | 1751 | local_irq_enable(); |
1739 | 1752 | ||
1740 | asm volatile ( | 1753 | asm volatile ( |
1754 | "push %%"R"bp; \n\t" | ||
1755 | "mov %c[rbx](%[svm]), %%"R"bx \n\t" | ||
1756 | "mov %c[rcx](%[svm]), %%"R"cx \n\t" | ||
1757 | "mov %c[rdx](%[svm]), %%"R"dx \n\t" | ||
1758 | "mov %c[rsi](%[svm]), %%"R"si \n\t" | ||
1759 | "mov %c[rdi](%[svm]), %%"R"di \n\t" | ||
1760 | "mov %c[rbp](%[svm]), %%"R"bp \n\t" | ||
1741 | #ifdef CONFIG_X86_64 | 1761 | #ifdef CONFIG_X86_64 |
1742 | "push %%rbp; \n\t" | ||
1743 | #else | ||
1744 | "push %%ebp; \n\t" | ||
1745 | #endif | ||
1746 | |||
1747 | #ifdef CONFIG_X86_64 | ||
1748 | "mov %c[rbx](%[svm]), %%rbx \n\t" | ||
1749 | "mov %c[rcx](%[svm]), %%rcx \n\t" | ||
1750 | "mov %c[rdx](%[svm]), %%rdx \n\t" | ||
1751 | "mov %c[rsi](%[svm]), %%rsi \n\t" | ||
1752 | "mov %c[rdi](%[svm]), %%rdi \n\t" | ||
1753 | "mov %c[rbp](%[svm]), %%rbp \n\t" | ||
1754 | "mov %c[r8](%[svm]), %%r8 \n\t" | 1762 | "mov %c[r8](%[svm]), %%r8 \n\t" |
1755 | "mov %c[r9](%[svm]), %%r9 \n\t" | 1763 | "mov %c[r9](%[svm]), %%r9 \n\t" |
1756 | "mov %c[r10](%[svm]), %%r10 \n\t" | 1764 | "mov %c[r10](%[svm]), %%r10 \n\t" |
@@ -1759,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1759 | "mov %c[r13](%[svm]), %%r13 \n\t" | 1767 | "mov %c[r13](%[svm]), %%r13 \n\t" |
1760 | "mov %c[r14](%[svm]), %%r14 \n\t" | 1768 | "mov %c[r14](%[svm]), %%r14 \n\t" |
1761 | "mov %c[r15](%[svm]), %%r15 \n\t" | 1769 | "mov %c[r15](%[svm]), %%r15 \n\t" |
1762 | #else | ||
1763 | "mov %c[rbx](%[svm]), %%ebx \n\t" | ||
1764 | "mov %c[rcx](%[svm]), %%ecx \n\t" | ||
1765 | "mov %c[rdx](%[svm]), %%edx \n\t" | ||
1766 | "mov %c[rsi](%[svm]), %%esi \n\t" | ||
1767 | "mov %c[rdi](%[svm]), %%edi \n\t" | ||
1768 | "mov %c[rbp](%[svm]), %%ebp \n\t" | ||
1769 | #endif | 1770 | #endif |
1770 | 1771 | ||
1771 | #ifdef CONFIG_X86_64 | ||
1772 | /* Enter guest mode */ | 1772 | /* Enter guest mode */ |
1773 | "push %%rax \n\t" | 1773 | "push %%"R"ax \n\t" |
1774 | "mov %c[vmcb](%[svm]), %%rax \n\t" | 1774 | "mov %c[vmcb](%[svm]), %%"R"ax \n\t" |
1775 | __ex(SVM_VMLOAD) "\n\t" | 1775 | __ex(SVM_VMLOAD) "\n\t" |
1776 | __ex(SVM_VMRUN) "\n\t" | 1776 | __ex(SVM_VMRUN) "\n\t" |
1777 | __ex(SVM_VMSAVE) "\n\t" | 1777 | __ex(SVM_VMSAVE) "\n\t" |
1778 | "pop %%rax \n\t" | 1778 | "pop %%"R"ax \n\t" |
1779 | #else | ||
1780 | /* Enter guest mode */ | ||
1781 | "push %%eax \n\t" | ||
1782 | "mov %c[vmcb](%[svm]), %%eax \n\t" | ||
1783 | __ex(SVM_VMLOAD) "\n\t" | ||
1784 | __ex(SVM_VMRUN) "\n\t" | ||
1785 | __ex(SVM_VMSAVE) "\n\t" | ||
1786 | "pop %%eax \n\t" | ||
1787 | #endif | ||
1788 | 1779 | ||
1789 | /* Save guest registers, load host registers */ | 1780 | /* Save guest registers, load host registers */ |
1781 | "mov %%"R"bx, %c[rbx](%[svm]) \n\t" | ||
1782 | "mov %%"R"cx, %c[rcx](%[svm]) \n\t" | ||
1783 | "mov %%"R"dx, %c[rdx](%[svm]) \n\t" | ||
1784 | "mov %%"R"si, %c[rsi](%[svm]) \n\t" | ||
1785 | "mov %%"R"di, %c[rdi](%[svm]) \n\t" | ||
1786 | "mov %%"R"bp, %c[rbp](%[svm]) \n\t" | ||
1790 | #ifdef CONFIG_X86_64 | 1787 | #ifdef CONFIG_X86_64 |
1791 | "mov %%rbx, %c[rbx](%[svm]) \n\t" | ||
1792 | "mov %%rcx, %c[rcx](%[svm]) \n\t" | ||
1793 | "mov %%rdx, %c[rdx](%[svm]) \n\t" | ||
1794 | "mov %%rsi, %c[rsi](%[svm]) \n\t" | ||
1795 | "mov %%rdi, %c[rdi](%[svm]) \n\t" | ||
1796 | "mov %%rbp, %c[rbp](%[svm]) \n\t" | ||
1797 | "mov %%r8, %c[r8](%[svm]) \n\t" | 1788 | "mov %%r8, %c[r8](%[svm]) \n\t" |
1798 | "mov %%r9, %c[r9](%[svm]) \n\t" | 1789 | "mov %%r9, %c[r9](%[svm]) \n\t" |
1799 | "mov %%r10, %c[r10](%[svm]) \n\t" | 1790 | "mov %%r10, %c[r10](%[svm]) \n\t" |
@@ -1802,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1802 | "mov %%r13, %c[r13](%[svm]) \n\t" | 1793 | "mov %%r13, %c[r13](%[svm]) \n\t" |
1803 | "mov %%r14, %c[r14](%[svm]) \n\t" | 1794 | "mov %%r14, %c[r14](%[svm]) \n\t" |
1804 | "mov %%r15, %c[r15](%[svm]) \n\t" | 1795 | "mov %%r15, %c[r15](%[svm]) \n\t" |
1805 | |||
1806 | "pop %%rbp; \n\t" | ||
1807 | #else | ||
1808 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | ||
1809 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | ||
1810 | "mov %%edx, %c[rdx](%[svm]) \n\t" | ||
1811 | "mov %%esi, %c[rsi](%[svm]) \n\t" | ||
1812 | "mov %%edi, %c[rdi](%[svm]) \n\t" | ||
1813 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | ||
1814 | |||
1815 | "pop %%ebp; \n\t" | ||
1816 | #endif | 1796 | #endif |
1797 | "pop %%"R"bp" | ||
1817 | : | 1798 | : |
1818 | : [svm]"a"(svm), | 1799 | : [svm]"a"(svm), |
1819 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | 1800 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), |
@@ -1834,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1834 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) | 1815 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) |
1835 | #endif | 1816 | #endif |
1836 | : "cc", "memory" | 1817 | : "cc", "memory" |
1818 | , R"bx", R"cx", R"dx", R"si", R"di" | ||
1837 | #ifdef CONFIG_X86_64 | 1819 | #ifdef CONFIG_X86_64 |
1838 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
1839 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" | 1820 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" |
1840 | #else | ||
1841 | , "ebx", "ecx", "edx" , "esi", "edi" | ||
1842 | #endif | 1821 | #endif |
1843 | ); | 1822 | ); |
1844 | 1823 | ||
@@ -1846,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1846 | load_db_regs(svm->host_db_regs); | 1825 | load_db_regs(svm->host_db_regs); |
1847 | 1826 | ||
1848 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | 1827 | vcpu->arch.cr2 = svm->vmcb->save.cr2; |
1828 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
1829 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
1830 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
1849 | 1831 | ||
1850 | write_dr6(svm->host_dr6); | 1832 | write_dr6(svm->host_dr6); |
1851 | write_dr7(svm->host_dr7); | 1833 | write_dr7(svm->host_dr7); |
@@ -1867,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1867 | svm->next_rip = 0; | 1849 | svm->next_rip = 0; |
1868 | } | 1850 | } |
1869 | 1851 | ||
1852 | #undef R | ||
1853 | |||
1870 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | 1854 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) |
1871 | { | 1855 | { |
1872 | struct vcpu_svm *svm = to_svm(vcpu); | 1856 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1965,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1965 | .set_gdt = svm_set_gdt, | 1949 | .set_gdt = svm_set_gdt, |
1966 | .get_dr = svm_get_dr, | 1950 | .get_dr = svm_get_dr, |
1967 | .set_dr = svm_set_dr, | 1951 | .set_dr = svm_set_dr, |
1968 | .cache_regs = svm_cache_regs, | ||
1969 | .decache_regs = svm_decache_regs, | ||
1970 | .get_rflags = svm_get_rflags, | 1952 | .get_rflags = svm_get_rflags, |
1971 | .set_rflags = svm_set_rflags, | 1953 | .set_rflags = svm_set_rflags, |
1972 | 1954 | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a69773e3b26..2643b430d83a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -26,6 +26,8 @@ | |||
26 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/moduleparam.h> | 28 | #include <linux/moduleparam.h> |
29 | #include "kvm_cache_regs.h" | ||
30 | #include "x86.h" | ||
29 | 31 | ||
30 | #include <asm/io.h> | 32 | #include <asm/io.h> |
31 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); | |||
47 | static int enable_ept = 1; | 49 | static int enable_ept = 1; |
48 | module_param(enable_ept, bool, 0); | 50 | module_param(enable_ept, bool, 0); |
49 | 51 | ||
52 | static int emulate_invalid_guest_state = 0; | ||
53 | module_param(emulate_invalid_guest_state, bool, 0); | ||
54 | |||
50 | struct vmcs { | 55 | struct vmcs { |
51 | u32 revision_id; | 56 | u32 revision_id; |
52 | u32 abort; | 57 | u32 abort; |
@@ -56,6 +61,7 @@ struct vmcs { | |||
56 | struct vcpu_vmx { | 61 | struct vcpu_vmx { |
57 | struct kvm_vcpu vcpu; | 62 | struct kvm_vcpu vcpu; |
58 | struct list_head local_vcpus_link; | 63 | struct list_head local_vcpus_link; |
64 | unsigned long host_rsp; | ||
59 | int launched; | 65 | int launched; |
60 | u8 fail; | 66 | u8 fail; |
61 | u32 idt_vectoring_info; | 67 | u32 idt_vectoring_info; |
@@ -83,6 +89,7 @@ struct vcpu_vmx { | |||
83 | } irq; | 89 | } irq; |
84 | } rmode; | 90 | } rmode; |
85 | int vpid; | 91 | int vpid; |
92 | bool emulation_required; | ||
86 | }; | 93 | }; |
87 | 94 | ||
88 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 95 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
468 | if (!vcpu->fpu_active) | 475 | if (!vcpu->fpu_active) |
469 | eb |= 1u << NM_VECTOR; | 476 | eb |= 1u << NM_VECTOR; |
470 | if (vcpu->guest_debug.enabled) | 477 | if (vcpu->guest_debug.enabled) |
471 | eb |= 1u << 1; | 478 | eb |= 1u << DB_VECTOR; |
472 | if (vcpu->arch.rmode.active) | 479 | if (vcpu->arch.rmode.active) |
473 | eb = ~0; | 480 | eb = ~0; |
474 | if (vm_need_ept()) | 481 | if (vm_need_ept()) |
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
715 | unsigned long rip; | 722 | unsigned long rip; |
716 | u32 interruptibility; | 723 | u32 interruptibility; |
717 | 724 | ||
718 | rip = vmcs_readl(GUEST_RIP); | 725 | rip = kvm_rip_read(vcpu); |
719 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 726 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
720 | vmcs_writel(GUEST_RIP, rip); | 727 | kvm_rip_write(vcpu, rip); |
721 | 728 | ||
722 | /* | 729 | /* |
723 | * We emulated an instruction, so temporary interrupt blocking | 730 | * We emulated an instruction, so temporary interrupt blocking |
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
733 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 740 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
734 | bool has_error_code, u32 error_code) | 741 | bool has_error_code, u32 error_code) |
735 | { | 742 | { |
743 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
744 | |||
745 | if (has_error_code) | ||
746 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
747 | |||
748 | if (vcpu->arch.rmode.active) { | ||
749 | vmx->rmode.irq.pending = true; | ||
750 | vmx->rmode.irq.vector = nr; | ||
751 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
752 | if (nr == BP_VECTOR) | ||
753 | vmx->rmode.irq.rip++; | ||
754 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
755 | nr | INTR_TYPE_SOFT_INTR | ||
756 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | ||
757 | | INTR_INFO_VALID_MASK); | ||
758 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
759 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
760 | return; | ||
761 | } | ||
762 | |||
736 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 763 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
737 | nr | INTR_TYPE_EXCEPTION | 764 | nr | INTR_TYPE_EXCEPTION |
738 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | 765 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) |
739 | | INTR_INFO_VALID_MASK); | 766 | | INTR_INFO_VALID_MASK); |
740 | if (has_error_code) | ||
741 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
742 | } | 767 | } |
743 | 768 | ||
744 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | 769 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) |
745 | { | 770 | { |
746 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 771 | return false; |
747 | |||
748 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
749 | } | 772 | } |
750 | 773 | ||
751 | /* | 774 | /* |
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
947 | return ret; | 970 | return ret; |
948 | } | 971 | } |
949 | 972 | ||
950 | /* | 973 | static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) |
951 | * Sync the rsp and rip registers into the vcpu structure. This allows | ||
952 | * registers to be accessed by indexing vcpu->arch.regs. | ||
953 | */ | ||
954 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | ||
955 | { | ||
956 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
957 | vcpu->arch.rip = vmcs_readl(GUEST_RIP); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Syncs rsp and rip back into the vmcs. Should be called after possible | ||
962 | * modification. | ||
963 | */ | ||
964 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | ||
965 | { | 974 | { |
966 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | 975 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); |
967 | vmcs_writel(GUEST_RIP, vcpu->arch.rip); | 976 | switch (reg) { |
977 | case VCPU_REGS_RSP: | ||
978 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
979 | break; | ||
980 | case VCPU_REGS_RIP: | ||
981 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | ||
982 | break; | ||
983 | default: | ||
984 | break; | ||
985 | } | ||
968 | } | 986 | } |
969 | 987 | ||
970 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | 988 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) |
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | |||
1007 | 1025 | ||
1008 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | 1026 | static int vmx_get_irq(struct kvm_vcpu *vcpu) |
1009 | { | 1027 | { |
1010 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1028 | if (!vcpu->arch.interrupt.pending) |
1011 | u32 idtv_info_field; | 1029 | return -1; |
1012 | 1030 | return vcpu->arch.interrupt.nr; | |
1013 | idtv_info_field = vmx->idt_vectoring_info; | ||
1014 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
1015 | if (is_external_interrupt(idtv_info_field)) | ||
1016 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
1017 | else | ||
1018 | printk(KERN_DEBUG "pending exception: not handled yet\n"); | ||
1019 | } | ||
1020 | return -1; | ||
1021 | } | 1031 | } |
1022 | 1032 | ||
1023 | static __init int cpu_has_kvm_support(void) | 1033 | static __init int cpu_has_kvm_support(void) |
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) | |||
1031 | u64 msr; | 1041 | u64 msr; |
1032 | 1042 | ||
1033 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | 1043 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); |
1034 | return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1044 | return (msr & (FEATURE_CONTROL_LOCKED | |
1035 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1045 | FEATURE_CONTROL_VMXON_ENABLED)) |
1036 | == MSR_IA32_FEATURE_CONTROL_LOCKED; | 1046 | == FEATURE_CONTROL_LOCKED; |
1037 | /* locked but not enabled */ | 1047 | /* locked but not enabled */ |
1038 | } | 1048 | } |
1039 | 1049 | ||
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage) | |||
1045 | 1055 | ||
1046 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); | 1056 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); |
1047 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 1057 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
1048 | if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1058 | if ((old & (FEATURE_CONTROL_LOCKED | |
1049 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1059 | FEATURE_CONTROL_VMXON_ENABLED)) |
1050 | != (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1060 | != (FEATURE_CONTROL_LOCKED | |
1051 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1061 | FEATURE_CONTROL_VMXON_ENABLED)) |
1052 | /* enable and lock */ | 1062 | /* enable and lock */ |
1053 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | | 1063 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | |
1054 | MSR_IA32_FEATURE_CONTROL_LOCKED | | 1064 | FEATURE_CONTROL_LOCKED | |
1055 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); | 1065 | FEATURE_CONTROL_VMXON_ENABLED); |
1056 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ | 1066 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ |
1057 | asm volatile (ASM_VMX_VMXON_RAX | 1067 | asm volatile (ASM_VMX_VMXON_RAX |
1058 | : : "a"(&phys_addr), "m"(phys_addr) | 1068 | : : "a"(&phys_addr), "m"(phys_addr) |
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1120 | CPU_BASED_CR3_STORE_EXITING | | 1130 | CPU_BASED_CR3_STORE_EXITING | |
1121 | CPU_BASED_USE_IO_BITMAPS | | 1131 | CPU_BASED_USE_IO_BITMAPS | |
1122 | CPU_BASED_MOV_DR_EXITING | | 1132 | CPU_BASED_MOV_DR_EXITING | |
1123 | CPU_BASED_USE_TSC_OFFSETING; | 1133 | CPU_BASED_USE_TSC_OFFSETING | |
1134 | CPU_BASED_INVLPG_EXITING; | ||
1124 | opt = CPU_BASED_TPR_SHADOW | | 1135 | opt = CPU_BASED_TPR_SHADOW | |
1125 | CPU_BASED_USE_MSR_BITMAPS | | 1136 | CPU_BASED_USE_MSR_BITMAPS | |
1126 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1137 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1149 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | 1160 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; |
1150 | #endif | 1161 | #endif |
1151 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | 1162 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { |
1152 | /* CR3 accesses don't need to cause VM Exits when EPT enabled */ | 1163 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT |
1164 | enabled */ | ||
1153 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | | 1165 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | |
1154 | CPU_BASED_CR3_STORE_EXITING); | 1166 | CPU_BASED_CR3_STORE_EXITING | |
1167 | CPU_BASED_INVLPG_EXITING); | ||
1155 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | 1168 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, |
1156 | &_cpu_based_exec_control) < 0) | 1169 | &_cpu_based_exec_control) < 0) |
1157 | return -EIO; | 1170 | return -EIO; |
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | |||
1288 | static void enter_pmode(struct kvm_vcpu *vcpu) | 1301 | static void enter_pmode(struct kvm_vcpu *vcpu) |
1289 | { | 1302 | { |
1290 | unsigned long flags; | 1303 | unsigned long flags; |
1304 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1291 | 1305 | ||
1306 | vmx->emulation_required = 1; | ||
1292 | vcpu->arch.rmode.active = 0; | 1307 | vcpu->arch.rmode.active = 0; |
1293 | 1308 | ||
1294 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | 1309 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); |
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1305 | 1320 | ||
1306 | update_exception_bitmap(vcpu); | 1321 | update_exception_bitmap(vcpu); |
1307 | 1322 | ||
1323 | if (emulate_invalid_guest_state) | ||
1324 | return; | ||
1325 | |||
1308 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1326 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
1309 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1327 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
1310 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1328 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1345 | static void enter_rmode(struct kvm_vcpu *vcpu) | 1363 | static void enter_rmode(struct kvm_vcpu *vcpu) |
1346 | { | 1364 | { |
1347 | unsigned long flags; | 1365 | unsigned long flags; |
1366 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1348 | 1367 | ||
1368 | vmx->emulation_required = 1; | ||
1349 | vcpu->arch.rmode.active = 1; | 1369 | vcpu->arch.rmode.active = 1; |
1350 | 1370 | ||
1351 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1371 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1367 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | 1387 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); |
1368 | update_exception_bitmap(vcpu); | 1388 | update_exception_bitmap(vcpu); |
1369 | 1389 | ||
1390 | if (emulate_invalid_guest_state) | ||
1391 | goto continue_rmode; | ||
1392 | |||
1370 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | 1393 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); |
1371 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | 1394 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); |
1372 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | 1395 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); |
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1382 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1405 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
1383 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1406 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
1384 | 1407 | ||
1408 | continue_rmode: | ||
1385 | kvm_mmu_reset_context(vcpu); | 1409 | kvm_mmu_reset_context(vcpu); |
1386 | init_rmode(vcpu->kvm); | 1410 | init_rmode(vcpu->kvm); |
1387 | } | 1411 | } |
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | |||
1715 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | 1739 | vmcs_writel(GUEST_GDTR_BASE, dt->base); |
1716 | } | 1740 | } |
1717 | 1741 | ||
1742 | static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
1743 | { | ||
1744 | struct kvm_segment var; | ||
1745 | u32 ar; | ||
1746 | |||
1747 | vmx_get_segment(vcpu, &var, seg); | ||
1748 | ar = vmx_segment_access_rights(&var); | ||
1749 | |||
1750 | if (var.base != (var.selector << 4)) | ||
1751 | return false; | ||
1752 | if (var.limit != 0xffff) | ||
1753 | return false; | ||
1754 | if (ar != 0xf3) | ||
1755 | return false; | ||
1756 | |||
1757 | return true; | ||
1758 | } | ||
1759 | |||
1760 | static bool code_segment_valid(struct kvm_vcpu *vcpu) | ||
1761 | { | ||
1762 | struct kvm_segment cs; | ||
1763 | unsigned int cs_rpl; | ||
1764 | |||
1765 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
1766 | cs_rpl = cs.selector & SELECTOR_RPL_MASK; | ||
1767 | |||
1768 | if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) | ||
1769 | return false; | ||
1770 | if (!cs.s) | ||
1771 | return false; | ||
1772 | if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { | ||
1773 | if (cs.dpl > cs_rpl) | ||
1774 | return false; | ||
1775 | } else if (cs.type & AR_TYPE_CODE_MASK) { | ||
1776 | if (cs.dpl != cs_rpl) | ||
1777 | return false; | ||
1778 | } | ||
1779 | if (!cs.present) | ||
1780 | return false; | ||
1781 | |||
1782 | /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ | ||
1783 | return true; | ||
1784 | } | ||
1785 | |||
1786 | static bool stack_segment_valid(struct kvm_vcpu *vcpu) | ||
1787 | { | ||
1788 | struct kvm_segment ss; | ||
1789 | unsigned int ss_rpl; | ||
1790 | |||
1791 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
1792 | ss_rpl = ss.selector & SELECTOR_RPL_MASK; | ||
1793 | |||
1794 | if ((ss.type != 3) || (ss.type != 7)) | ||
1795 | return false; | ||
1796 | if (!ss.s) | ||
1797 | return false; | ||
1798 | if (ss.dpl != ss_rpl) /* DPL != RPL */ | ||
1799 | return false; | ||
1800 | if (!ss.present) | ||
1801 | return false; | ||
1802 | |||
1803 | return true; | ||
1804 | } | ||
1805 | |||
1806 | static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
1807 | { | ||
1808 | struct kvm_segment var; | ||
1809 | unsigned int rpl; | ||
1810 | |||
1811 | vmx_get_segment(vcpu, &var, seg); | ||
1812 | rpl = var.selector & SELECTOR_RPL_MASK; | ||
1813 | |||
1814 | if (!var.s) | ||
1815 | return false; | ||
1816 | if (!var.present) | ||
1817 | return false; | ||
1818 | if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { | ||
1819 | if (var.dpl < rpl) /* DPL < RPL */ | ||
1820 | return false; | ||
1821 | } | ||
1822 | |||
1823 | /* TODO: Add other members to kvm_segment_field to allow checking for other access | ||
1824 | * rights flags | ||
1825 | */ | ||
1826 | return true; | ||
1827 | } | ||
1828 | |||
1829 | static bool tr_valid(struct kvm_vcpu *vcpu) | ||
1830 | { | ||
1831 | struct kvm_segment tr; | ||
1832 | |||
1833 | vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); | ||
1834 | |||
1835 | if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | ||
1836 | return false; | ||
1837 | if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ | ||
1838 | return false; | ||
1839 | if (!tr.present) | ||
1840 | return false; | ||
1841 | |||
1842 | return true; | ||
1843 | } | ||
1844 | |||
1845 | static bool ldtr_valid(struct kvm_vcpu *vcpu) | ||
1846 | { | ||
1847 | struct kvm_segment ldtr; | ||
1848 | |||
1849 | vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); | ||
1850 | |||
1851 | if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | ||
1852 | return false; | ||
1853 | if (ldtr.type != 2) | ||
1854 | return false; | ||
1855 | if (!ldtr.present) | ||
1856 | return false; | ||
1857 | |||
1858 | return true; | ||
1859 | } | ||
1860 | |||
1861 | static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | ||
1862 | { | ||
1863 | struct kvm_segment cs, ss; | ||
1864 | |||
1865 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
1866 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
1867 | |||
1868 | return ((cs.selector & SELECTOR_RPL_MASK) == | ||
1869 | (ss.selector & SELECTOR_RPL_MASK)); | ||
1870 | } | ||
1871 | |||
1872 | /* | ||
1873 | * Check if guest state is valid. Returns true if valid, false if | ||
1874 | * not. | ||
1875 | * We assume that registers are always usable | ||
1876 | */ | ||
1877 | static bool guest_state_valid(struct kvm_vcpu *vcpu) | ||
1878 | { | ||
1879 | /* real mode guest state checks */ | ||
1880 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) { | ||
1881 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) | ||
1882 | return false; | ||
1883 | if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) | ||
1884 | return false; | ||
1885 | if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) | ||
1886 | return false; | ||
1887 | if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) | ||
1888 | return false; | ||
1889 | if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) | ||
1890 | return false; | ||
1891 | if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) | ||
1892 | return false; | ||
1893 | } else { | ||
1894 | /* protected mode guest state checks */ | ||
1895 | if (!cs_ss_rpl_check(vcpu)) | ||
1896 | return false; | ||
1897 | if (!code_segment_valid(vcpu)) | ||
1898 | return false; | ||
1899 | if (!stack_segment_valid(vcpu)) | ||
1900 | return false; | ||
1901 | if (!data_segment_valid(vcpu, VCPU_SREG_DS)) | ||
1902 | return false; | ||
1903 | if (!data_segment_valid(vcpu, VCPU_SREG_ES)) | ||
1904 | return false; | ||
1905 | if (!data_segment_valid(vcpu, VCPU_SREG_FS)) | ||
1906 | return false; | ||
1907 | if (!data_segment_valid(vcpu, VCPU_SREG_GS)) | ||
1908 | return false; | ||
1909 | if (!tr_valid(vcpu)) | ||
1910 | return false; | ||
1911 | if (!ldtr_valid(vcpu)) | ||
1912 | return false; | ||
1913 | } | ||
1914 | /* TODO: | ||
1915 | * - Add checks on RIP | ||
1916 | * - Add checks on RFLAGS | ||
1917 | */ | ||
1918 | |||
1919 | return true; | ||
1920 | } | ||
1921 | |||
1718 | static int init_rmode_tss(struct kvm *kvm) | 1922 | static int init_rmode_tss(struct kvm *kvm) |
1719 | { | 1923 | { |
1720 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | 1924 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; |
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm) | |||
1726 | if (r < 0) | 1930 | if (r < 0) |
1727 | goto out; | 1931 | goto out; |
1728 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | 1932 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; |
1729 | r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); | 1933 | r = kvm_write_guest_page(kvm, fn++, &data, |
1934 | TSS_IOPB_BASE_OFFSET, sizeof(u16)); | ||
1730 | if (r < 0) | 1935 | if (r < 0) |
1731 | goto out; | 1936 | goto out; |
1732 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | 1937 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); |
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg) | |||
1789 | vmcs_write16(sf->selector, 0); | 1994 | vmcs_write16(sf->selector, 0); |
1790 | vmcs_writel(sf->base, 0); | 1995 | vmcs_writel(sf->base, 0); |
1791 | vmcs_write32(sf->limit, 0xffff); | 1996 | vmcs_write32(sf->limit, 0xffff); |
1792 | vmcs_write32(sf->ar_bytes, 0x93); | 1997 | vmcs_write32(sf->ar_bytes, 0xf3); |
1793 | } | 1998 | } |
1794 | 1999 | ||
1795 | static int alloc_apic_access_page(struct kvm *kvm) | 2000 | static int alloc_apic_access_page(struct kvm *kvm) |
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm) | |||
1808 | if (r) | 2013 | if (r) |
1809 | goto out; | 2014 | goto out; |
1810 | 2015 | ||
1811 | down_read(&current->mm->mmap_sem); | ||
1812 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); | 2016 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); |
1813 | up_read(&current->mm->mmap_sem); | ||
1814 | out: | 2017 | out: |
1815 | up_write(&kvm->slots_lock); | 2018 | up_write(&kvm->slots_lock); |
1816 | return r; | 2019 | return r; |
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) | |||
1832 | if (r) | 2035 | if (r) |
1833 | goto out; | 2036 | goto out; |
1834 | 2037 | ||
1835 | down_read(&current->mm->mmap_sem); | ||
1836 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, | 2038 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, |
1837 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); | 2039 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); |
1838 | up_read(&current->mm->mmap_sem); | ||
1839 | out: | 2040 | out: |
1840 | up_write(&kvm->slots_lock); | 2041 | up_write(&kvm->slots_lock); |
1841 | return r; | 2042 | return r; |
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1917 | } | 2118 | } |
1918 | if (!vm_need_ept()) | 2119 | if (!vm_need_ept()) |
1919 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | 2120 | exec_control |= CPU_BASED_CR3_STORE_EXITING | |
1920 | CPU_BASED_CR3_LOAD_EXITING; | 2121 | CPU_BASED_CR3_LOAD_EXITING | |
2122 | CPU_BASED_INVLPG_EXITING; | ||
1921 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | 2123 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); |
1922 | 2124 | ||
1923 | if (cpu_has_secondary_exec_ctrls()) { | 2125 | if (cpu_has_secondary_exec_ctrls()) { |
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2019 | u64 msr; | 2221 | u64 msr; |
2020 | int ret; | 2222 | int ret; |
2021 | 2223 | ||
2224 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
2022 | down_read(&vcpu->kvm->slots_lock); | 2225 | down_read(&vcpu->kvm->slots_lock); |
2023 | if (!init_rmode(vmx->vcpu.kvm)) { | 2226 | if (!init_rmode(vmx->vcpu.kvm)) { |
2024 | ret = -ENOMEM; | 2227 | ret = -ENOMEM; |
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2036 | 2239 | ||
2037 | fx_init(&vmx->vcpu); | 2240 | fx_init(&vmx->vcpu); |
2038 | 2241 | ||
2242 | seg_setup(VCPU_SREG_CS); | ||
2039 | /* | 2243 | /* |
2040 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | 2244 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
2041 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | 2245 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. |
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2047 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | 2251 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); |
2048 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | 2252 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); |
2049 | } | 2253 | } |
2050 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
2051 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
2052 | 2254 | ||
2053 | seg_setup(VCPU_SREG_DS); | 2255 | seg_setup(VCPU_SREG_DS); |
2054 | seg_setup(VCPU_SREG_ES); | 2256 | seg_setup(VCPU_SREG_ES); |
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2072 | 2274 | ||
2073 | vmcs_writel(GUEST_RFLAGS, 0x02); | 2275 | vmcs_writel(GUEST_RFLAGS, 0x02); |
2074 | if (vmx->vcpu.vcpu_id == 0) | 2276 | if (vmx->vcpu.vcpu_id == 0) |
2075 | vmcs_writel(GUEST_RIP, 0xfff0); | 2277 | kvm_rip_write(vcpu, 0xfff0); |
2076 | else | 2278 | else |
2077 | vmcs_writel(GUEST_RIP, 0); | 2279 | kvm_rip_write(vcpu, 0); |
2078 | vmcs_writel(GUEST_RSP, 0); | 2280 | kvm_register_write(vcpu, VCPU_REGS_RSP, 0); |
2079 | 2281 | ||
2080 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | 2282 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ |
2081 | vmcs_writel(GUEST_DR7, 0x400); | 2283 | vmcs_writel(GUEST_DR7, 0x400); |
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2125 | 2327 | ||
2126 | ret = 0; | 2328 | ret = 0; |
2127 | 2329 | ||
2330 | /* HACK: Don't enable emulation on guest boot/reset */ | ||
2331 | vmx->emulation_required = 0; | ||
2332 | |||
2128 | out: | 2333 | out: |
2129 | up_read(&vcpu->kvm->slots_lock); | 2334 | up_read(&vcpu->kvm->slots_lock); |
2130 | return ret; | 2335 | return ret; |
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
2136 | 2341 | ||
2137 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | 2342 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); |
2138 | 2343 | ||
2344 | ++vcpu->stat.irq_injections; | ||
2139 | if (vcpu->arch.rmode.active) { | 2345 | if (vcpu->arch.rmode.active) { |
2140 | vmx->rmode.irq.pending = true; | 2346 | vmx->rmode.irq.pending = true; |
2141 | vmx->rmode.irq.vector = irq; | 2347 | vmx->rmode.irq.vector = irq; |
2142 | vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); | 2348 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
2143 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2349 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2144 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | 2350 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); |
2145 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 2351 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
2146 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); | 2352 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); |
2147 | return; | 2353 | return; |
2148 | } | 2354 | } |
2149 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2355 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2154 | { | 2360 | { |
2155 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2361 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2156 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2362 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2157 | vcpu->arch.nmi_pending = 0; | ||
2158 | } | 2363 | } |
2159 | 2364 | ||
2160 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | 2365 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | |||
2166 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | 2371 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
2167 | if (!vcpu->arch.irq_pending[word_index]) | 2372 | if (!vcpu->arch.irq_pending[word_index]) |
2168 | clear_bit(word_index, &vcpu->arch.irq_summary); | 2373 | clear_bit(word_index, &vcpu->arch.irq_summary); |
2169 | vmx_inject_irq(vcpu, irq); | 2374 | kvm_queue_interrupt(vcpu, irq); |
2170 | } | 2375 | } |
2171 | 2376 | ||
2172 | 2377 | ||
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
2180 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | 2385 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); |
2181 | 2386 | ||
2182 | if (vcpu->arch.interrupt_window_open && | 2387 | if (vcpu->arch.interrupt_window_open && |
2183 | vcpu->arch.irq_summary && | 2388 | vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) |
2184 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
2185 | /* | ||
2186 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | ||
2187 | */ | ||
2188 | kvm_do_inject_irq(vcpu); | 2389 | kvm_do_inject_irq(vcpu); |
2189 | 2390 | ||
2391 | if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) | ||
2392 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | ||
2393 | |||
2190 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2394 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2191 | if (!vcpu->arch.interrupt_window_open && | 2395 | if (!vcpu->arch.interrupt_window_open && |
2192 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) | 2396 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) |
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | |||
2237 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | 2441 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, |
2238 | int vec, u32 err_code) | 2442 | int vec, u32 err_code) |
2239 | { | 2443 | { |
2240 | if (!vcpu->arch.rmode.active) | ||
2241 | return 0; | ||
2242 | |||
2243 | /* | 2444 | /* |
2244 | * Instruction with address size override prefix opcode 0x67 | 2445 | * Instruction with address size override prefix opcode 0x67 |
2245 | * Cause the #SS fault with 0 error code in VM86 mode. | 2446 | * Cause the #SS fault with 0 error code in VM86 mode. |
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2247 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 2448 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
2248 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) | 2449 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) |
2249 | return 1; | 2450 | return 1; |
2451 | /* | ||
2452 | * Forward all other exceptions that are valid in real mode. | ||
2453 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
2454 | * the required debugging infrastructure rework. | ||
2455 | */ | ||
2456 | switch (vec) { | ||
2457 | case DE_VECTOR: | ||
2458 | case DB_VECTOR: | ||
2459 | case BP_VECTOR: | ||
2460 | case OF_VECTOR: | ||
2461 | case BR_VECTOR: | ||
2462 | case UD_VECTOR: | ||
2463 | case DF_VECTOR: | ||
2464 | case SS_VECTOR: | ||
2465 | case GP_VECTOR: | ||
2466 | case MF_VECTOR: | ||
2467 | kvm_queue_exception(vcpu, vec); | ||
2468 | return 1; | ||
2469 | } | ||
2250 | return 0; | 2470 | return 0; |
2251 | } | 2471 | } |
2252 | 2472 | ||
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2288 | } | 2508 | } |
2289 | 2509 | ||
2290 | error_code = 0; | 2510 | error_code = 0; |
2291 | rip = vmcs_readl(GUEST_RIP); | 2511 | rip = kvm_rip_read(vcpu); |
2292 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | 2512 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) |
2293 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 2513 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
2294 | if (is_page_fault(intr_info)) { | 2514 | if (is_page_fault(intr_info)) { |
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2298 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 2518 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
2299 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | 2519 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, |
2300 | (u32)((u64)cr2 >> 32), handler); | 2520 | (u32)((u64)cr2 >> 32), handler); |
2301 | if (vect_info & VECTORING_INFO_VALID_MASK) | 2521 | if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) |
2302 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 2522 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
2303 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2523 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
2304 | } | 2524 | } |
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2386 | reg = (exit_qualification >> 8) & 15; | 2606 | reg = (exit_qualification >> 8) & 15; |
2387 | switch ((exit_qualification >> 4) & 3) { | 2607 | switch ((exit_qualification >> 4) & 3) { |
2388 | case 0: /* mov to cr */ | 2608 | case 0: /* mov to cr */ |
2389 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], | 2609 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, |
2390 | (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); | 2610 | (u32)kvm_register_read(vcpu, reg), |
2611 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
2612 | handler); | ||
2391 | switch (cr) { | 2613 | switch (cr) { |
2392 | case 0: | 2614 | case 0: |
2393 | vcpu_load_rsp_rip(vcpu); | 2615 | kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); |
2394 | kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); | ||
2395 | skip_emulated_instruction(vcpu); | 2616 | skip_emulated_instruction(vcpu); |
2396 | return 1; | 2617 | return 1; |
2397 | case 3: | 2618 | case 3: |
2398 | vcpu_load_rsp_rip(vcpu); | 2619 | kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); |
2399 | kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); | ||
2400 | skip_emulated_instruction(vcpu); | 2620 | skip_emulated_instruction(vcpu); |
2401 | return 1; | 2621 | return 1; |
2402 | case 4: | 2622 | case 4: |
2403 | vcpu_load_rsp_rip(vcpu); | 2623 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); |
2404 | kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); | ||
2405 | skip_emulated_instruction(vcpu); | 2624 | skip_emulated_instruction(vcpu); |
2406 | return 1; | 2625 | return 1; |
2407 | case 8: | 2626 | case 8: |
2408 | vcpu_load_rsp_rip(vcpu); | 2627 | kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); |
2409 | kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); | ||
2410 | skip_emulated_instruction(vcpu); | 2628 | skip_emulated_instruction(vcpu); |
2411 | if (irqchip_in_kernel(vcpu->kvm)) | 2629 | if (irqchip_in_kernel(vcpu->kvm)) |
2412 | return 1; | 2630 | return 1; |
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2415 | }; | 2633 | }; |
2416 | break; | 2634 | break; |
2417 | case 2: /* clts */ | 2635 | case 2: /* clts */ |
2418 | vcpu_load_rsp_rip(vcpu); | ||
2419 | vmx_fpu_deactivate(vcpu); | 2636 | vmx_fpu_deactivate(vcpu); |
2420 | vcpu->arch.cr0 &= ~X86_CR0_TS; | 2637 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
2421 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 2638 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2426 | case 1: /*mov from cr*/ | 2643 | case 1: /*mov from cr*/ |
2427 | switch (cr) { | 2644 | switch (cr) { |
2428 | case 3: | 2645 | case 3: |
2429 | vcpu_load_rsp_rip(vcpu); | 2646 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); |
2430 | vcpu->arch.regs[reg] = vcpu->arch.cr3; | ||
2431 | vcpu_put_rsp_rip(vcpu); | ||
2432 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, | 2647 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, |
2433 | (u32)vcpu->arch.regs[reg], | 2648 | (u32)kvm_register_read(vcpu, reg), |
2434 | (u32)((u64)vcpu->arch.regs[reg] >> 32), | 2649 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), |
2435 | handler); | 2650 | handler); |
2436 | skip_emulated_instruction(vcpu); | 2651 | skip_emulated_instruction(vcpu); |
2437 | return 1; | 2652 | return 1; |
2438 | case 8: | 2653 | case 8: |
2439 | vcpu_load_rsp_rip(vcpu); | 2654 | kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); |
2440 | vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); | ||
2441 | vcpu_put_rsp_rip(vcpu); | ||
2442 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, | 2655 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, |
2443 | (u32)vcpu->arch.regs[reg], handler); | 2656 | (u32)kvm_register_read(vcpu, reg), handler); |
2444 | skip_emulated_instruction(vcpu); | 2657 | skip_emulated_instruction(vcpu); |
2445 | return 1; | 2658 | return 1; |
2446 | } | 2659 | } |
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2472 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 2685 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
2473 | dr = exit_qualification & 7; | 2686 | dr = exit_qualification & 7; |
2474 | reg = (exit_qualification >> 8) & 15; | 2687 | reg = (exit_qualification >> 8) & 15; |
2475 | vcpu_load_rsp_rip(vcpu); | ||
2476 | if (exit_qualification & 16) { | 2688 | if (exit_qualification & 16) { |
2477 | /* mov from dr */ | 2689 | /* mov from dr */ |
2478 | switch (dr) { | 2690 | switch (dr) { |
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2485 | default: | 2697 | default: |
2486 | val = 0; | 2698 | val = 0; |
2487 | } | 2699 | } |
2488 | vcpu->arch.regs[reg] = val; | 2700 | kvm_register_write(vcpu, reg, val); |
2489 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | 2701 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); |
2490 | } else { | 2702 | } else { |
2491 | /* mov to dr */ | 2703 | /* mov to dr */ |
2492 | } | 2704 | } |
2493 | vcpu_put_rsp_rip(vcpu); | ||
2494 | skip_emulated_instruction(vcpu); | 2705 | skip_emulated_instruction(vcpu); |
2495 | return 1; | 2706 | return 1; |
2496 | } | 2707 | } |
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2583 | return 1; | 2794 | return 1; |
2584 | } | 2795 | } |
2585 | 2796 | ||
2797 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2798 | { | ||
2799 | u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
2800 | |||
2801 | kvm_mmu_invlpg(vcpu, exit_qualification); | ||
2802 | skip_emulated_instruction(vcpu); | ||
2803 | return 1; | ||
2804 | } | ||
2805 | |||
2586 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2806 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2587 | { | 2807 | { |
2588 | skip_emulated_instruction(vcpu); | 2808 | skip_emulated_instruction(vcpu); |
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2695 | return 1; | 2915 | return 1; |
2696 | } | 2916 | } |
2697 | 2917 | ||
2918 | static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | ||
2919 | struct kvm_run *kvm_run) | ||
2920 | { | ||
2921 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2922 | int err; | ||
2923 | |||
2924 | preempt_enable(); | ||
2925 | local_irq_enable(); | ||
2926 | |||
2927 | while (!guest_state_valid(vcpu)) { | ||
2928 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
2929 | |||
2930 | switch (err) { | ||
2931 | case EMULATE_DONE: | ||
2932 | break; | ||
2933 | case EMULATE_DO_MMIO: | ||
2934 | kvm_report_emulation_failure(vcpu, "mmio"); | ||
2935 | /* TODO: Handle MMIO */ | ||
2936 | return; | ||
2937 | default: | ||
2938 | kvm_report_emulation_failure(vcpu, "emulation failure"); | ||
2939 | return; | ||
2940 | } | ||
2941 | |||
2942 | if (signal_pending(current)) | ||
2943 | break; | ||
2944 | if (need_resched()) | ||
2945 | schedule(); | ||
2946 | } | ||
2947 | |||
2948 | local_irq_disable(); | ||
2949 | preempt_disable(); | ||
2950 | |||
2951 | /* Guest state should be valid now, no more emulation should be needed */ | ||
2952 | vmx->emulation_required = 0; | ||
2953 | } | ||
2954 | |||
2698 | /* | 2955 | /* |
2699 | * The exit handlers return 1 if the exit was handled fully and guest execution | 2956 | * The exit handlers return 1 if the exit was handled fully and guest execution |
2700 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 2957 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
2714 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | 2971 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
2715 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 2972 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
2716 | [EXIT_REASON_HLT] = handle_halt, | 2973 | [EXIT_REASON_HLT] = handle_halt, |
2974 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
2717 | [EXIT_REASON_VMCALL] = handle_vmcall, | 2975 | [EXIT_REASON_VMCALL] = handle_vmcall, |
2718 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 2976 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
2719 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 2977 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2735 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2993 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2736 | u32 vectoring_info = vmx->idt_vectoring_info; | 2994 | u32 vectoring_info = vmx->idt_vectoring_info; |
2737 | 2995 | ||
2738 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), | 2996 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), |
2739 | (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); | 2997 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); |
2740 | 2998 | ||
2741 | /* Access CR3 don't cause VMExit in paging mode, so we need | 2999 | /* Access CR3 don't cause VMExit in paging mode, so we need |
2742 | * to sync with guest real CR3. */ | 3000 | * to sync with guest real CR3. */ |
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) | |||
2829 | enable_irq_window(vcpu); | 3087 | enable_irq_window(vcpu); |
2830 | } | 3088 | } |
2831 | 3089 | ||
2832 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | 3090 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
2833 | { | 3091 | { |
2834 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3092 | u32 exit_intr_info; |
2835 | u32 idtv_info_field, intr_info_field, exit_intr_info_field; | 3093 | u32 idt_vectoring_info; |
2836 | int vector; | 3094 | bool unblock_nmi; |
3095 | u8 vector; | ||
3096 | int type; | ||
3097 | bool idtv_info_valid; | ||
3098 | u32 error; | ||
2837 | 3099 | ||
2838 | update_tpr_threshold(vcpu); | 3100 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
2839 | 3101 | if (cpu_has_virtual_nmis()) { | |
2840 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | 3102 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; |
2841 | exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); | 3103 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; |
2842 | idtv_info_field = vmx->idt_vectoring_info; | 3104 | /* |
2843 | if (intr_info_field & INTR_INFO_VALID_MASK) { | 3105 | * SDM 3: 25.7.1.2 |
2844 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 3106 | * Re-set bit "block by NMI" before VM entry if vmexit caused by |
2845 | /* TODO: fault when IDT_Vectoring */ | 3107 | * a guest IRET fault. |
2846 | if (printk_ratelimit()) | 3108 | */ |
2847 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | 3109 | if (unblock_nmi && vector != DF_VECTOR) |
2848 | } | 3110 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
2849 | enable_intr_window(vcpu); | 3111 | GUEST_INTR_STATE_NMI); |
2850 | return; | ||
2851 | } | 3112 | } |
2852 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | ||
2853 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | ||
2854 | == INTR_TYPE_EXT_INTR | ||
2855 | && vcpu->arch.rmode.active) { | ||
2856 | u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
2857 | |||
2858 | vmx_inject_irq(vcpu, vect); | ||
2859 | enable_intr_window(vcpu); | ||
2860 | return; | ||
2861 | } | ||
2862 | |||
2863 | KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); | ||
2864 | 3113 | ||
3114 | idt_vectoring_info = vmx->idt_vectoring_info; | ||
3115 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3116 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | ||
3117 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | ||
3118 | if (vmx->vcpu.arch.nmi_injected) { | ||
2865 | /* | 3119 | /* |
2866 | * SDM 3: 25.7.1.2 | 3120 | * SDM 3: 25.7.1.2 |
2867 | * Clear bit "block by NMI" before VM entry if a NMI delivery | 3121 | * Clear bit "block by NMI" before VM entry if a NMI delivery |
2868 | * faulted. | 3122 | * faulted. |
2869 | */ | 3123 | */ |
2870 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | 3124 | if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) |
2871 | == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) | 3125 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, |
2872 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 3126 | GUEST_INTR_STATE_NMI); |
2873 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3127 | else |
2874 | ~GUEST_INTR_STATE_NMI); | 3128 | vmx->vcpu.arch.nmi_injected = false; |
2875 | 3129 | } | |
2876 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field | 3130 | kvm_clear_exception_queue(&vmx->vcpu); |
2877 | & ~INTR_INFO_RESVD_BITS_MASK); | 3131 | if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { |
2878 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | 3132 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
2879 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 3133 | error = vmcs_read32(IDT_VECTORING_ERROR_CODE); |
2880 | 3134 | kvm_queue_exception_e(&vmx->vcpu, vector, error); | |
2881 | if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) | 3135 | } else |
2882 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | 3136 | kvm_queue_exception(&vmx->vcpu, vector); |
2883 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | 3137 | vmx->idt_vectoring_info = 0; |
2884 | enable_intr_window(vcpu); | ||
2885 | return; | ||
2886 | } | 3138 | } |
3139 | kvm_clear_interrupt_queue(&vmx->vcpu); | ||
3140 | if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { | ||
3141 | kvm_queue_interrupt(&vmx->vcpu, vector); | ||
3142 | vmx->idt_vectoring_info = 0; | ||
3143 | } | ||
3144 | } | ||
3145 | |||
3146 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | ||
3147 | { | ||
3148 | update_tpr_threshold(vcpu); | ||
3149 | |||
2887 | if (cpu_has_virtual_nmis()) { | 3150 | if (cpu_has_virtual_nmis()) { |
2888 | /* | 3151 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { |
2889 | * SDM 3: 25.7.1.2 | 3152 | if (vmx_nmi_enabled(vcpu)) { |
2890 | * Re-set bit "block by NMI" before VM entry if vmexit caused by | 3153 | vcpu->arch.nmi_pending = false; |
2891 | * a guest IRET fault. | 3154 | vcpu->arch.nmi_injected = true; |
2892 | */ | 3155 | } else { |
2893 | if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && | 3156 | enable_intr_window(vcpu); |
2894 | (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) | 3157 | return; |
2895 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 3158 | } |
2896 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | | 3159 | } |
2897 | GUEST_INTR_STATE_NMI); | 3160 | if (vcpu->arch.nmi_injected) { |
2898 | else if (vcpu->arch.nmi_pending) { | 3161 | vmx_inject_nmi(vcpu); |
2899 | if (vmx_nmi_enabled(vcpu)) | ||
2900 | vmx_inject_nmi(vcpu); | ||
2901 | enable_intr_window(vcpu); | 3162 | enable_intr_window(vcpu); |
2902 | return; | 3163 | return; |
2903 | } | 3164 | } |
2904 | |||
2905 | } | 3165 | } |
2906 | if (!kvm_cpu_has_interrupt(vcpu)) | 3166 | if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { |
2907 | return; | 3167 | if (vmx_irq_enabled(vcpu)) |
2908 | if (vmx_irq_enabled(vcpu)) { | 3168 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); |
2909 | vector = kvm_cpu_get_interrupt(vcpu); | 3169 | else |
2910 | vmx_inject_irq(vcpu, vector); | 3170 | enable_irq_window(vcpu); |
2911 | kvm_timer_intr_post(vcpu, vector); | 3171 | } |
2912 | } else | 3172 | if (vcpu->arch.interrupt.pending) { |
2913 | enable_irq_window(vcpu); | 3173 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); |
3174 | kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); | ||
3175 | } | ||
2914 | } | 3176 | } |
2915 | 3177 | ||
2916 | /* | 3178 | /* |
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
2922 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | 3184 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) |
2923 | { | 3185 | { |
2924 | vmx->rmode.irq.pending = 0; | 3186 | vmx->rmode.irq.pending = 0; |
2925 | if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) | 3187 | if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) |
2926 | return; | 3188 | return; |
2927 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); | 3189 | kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); |
2928 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | 3190 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { |
2929 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | 3191 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; |
2930 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | 3192 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; |
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | |||
2936 | | vmx->rmode.irq.vector; | 3198 | | vmx->rmode.irq.vector; |
2937 | } | 3199 | } |
2938 | 3200 | ||
3201 | #ifdef CONFIG_X86_64 | ||
3202 | #define R "r" | ||
3203 | #define Q "q" | ||
3204 | #else | ||
3205 | #define R "e" | ||
3206 | #define Q "l" | ||
3207 | #endif | ||
3208 | |||
2939 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3209 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2940 | { | 3210 | { |
2941 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3211 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2942 | u32 intr_info; | 3212 | u32 intr_info; |
2943 | 3213 | ||
3214 | /* Handle invalid guest state instead of entering VMX */ | ||
3215 | if (vmx->emulation_required && emulate_invalid_guest_state) { | ||
3216 | handle_invalid_guest_state(vcpu, kvm_run); | ||
3217 | return; | ||
3218 | } | ||
3219 | |||
3220 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
3221 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | ||
3222 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
3223 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | ||
3224 | |||
2944 | /* | 3225 | /* |
2945 | * Loading guest fpu may have cleared host cr0.ts | 3226 | * Loading guest fpu may have cleared host cr0.ts |
2946 | */ | 3227 | */ |
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2948 | 3229 | ||
2949 | asm( | 3230 | asm( |
2950 | /* Store host registers */ | 3231 | /* Store host registers */ |
2951 | #ifdef CONFIG_X86_64 | 3232 | "push %%"R"dx; push %%"R"bp;" |
2952 | "push %%rdx; push %%rbp;" | 3233 | "push %%"R"cx \n\t" |
2953 | "push %%rcx \n\t" | 3234 | "cmp %%"R"sp, %c[host_rsp](%0) \n\t" |
2954 | #else | 3235 | "je 1f \n\t" |
2955 | "push %%edx; push %%ebp;" | 3236 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" |
2956 | "push %%ecx \n\t" | ||
2957 | #endif | ||
2958 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" | 3237 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" |
3238 | "1: \n\t" | ||
2959 | /* Check if vmlaunch of vmresume is needed */ | 3239 | /* Check if vmlaunch of vmresume is needed */ |
2960 | "cmpl $0, %c[launched](%0) \n\t" | 3240 | "cmpl $0, %c[launched](%0) \n\t" |
2961 | /* Load guest registers. Don't clobber flags. */ | 3241 | /* Load guest registers. Don't clobber flags. */ |
3242 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
3243 | "mov %%"R"ax, %%cr2 \n\t" | ||
3244 | "mov %c[rax](%0), %%"R"ax \n\t" | ||
3245 | "mov %c[rbx](%0), %%"R"bx \n\t" | ||
3246 | "mov %c[rdx](%0), %%"R"dx \n\t" | ||
3247 | "mov %c[rsi](%0), %%"R"si \n\t" | ||
3248 | "mov %c[rdi](%0), %%"R"di \n\t" | ||
3249 | "mov %c[rbp](%0), %%"R"bp \n\t" | ||
2962 | #ifdef CONFIG_X86_64 | 3250 | #ifdef CONFIG_X86_64 |
2963 | "mov %c[cr2](%0), %%rax \n\t" | ||
2964 | "mov %%rax, %%cr2 \n\t" | ||
2965 | "mov %c[rax](%0), %%rax \n\t" | ||
2966 | "mov %c[rbx](%0), %%rbx \n\t" | ||
2967 | "mov %c[rdx](%0), %%rdx \n\t" | ||
2968 | "mov %c[rsi](%0), %%rsi \n\t" | ||
2969 | "mov %c[rdi](%0), %%rdi \n\t" | ||
2970 | "mov %c[rbp](%0), %%rbp \n\t" | ||
2971 | "mov %c[r8](%0), %%r8 \n\t" | 3251 | "mov %c[r8](%0), %%r8 \n\t" |
2972 | "mov %c[r9](%0), %%r9 \n\t" | 3252 | "mov %c[r9](%0), %%r9 \n\t" |
2973 | "mov %c[r10](%0), %%r10 \n\t" | 3253 | "mov %c[r10](%0), %%r10 \n\t" |
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2976 | "mov %c[r13](%0), %%r13 \n\t" | 3256 | "mov %c[r13](%0), %%r13 \n\t" |
2977 | "mov %c[r14](%0), %%r14 \n\t" | 3257 | "mov %c[r14](%0), %%r14 \n\t" |
2978 | "mov %c[r15](%0), %%r15 \n\t" | 3258 | "mov %c[r15](%0), %%r15 \n\t" |
2979 | "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ | ||
2980 | #else | ||
2981 | "mov %c[cr2](%0), %%eax \n\t" | ||
2982 | "mov %%eax, %%cr2 \n\t" | ||
2983 | "mov %c[rax](%0), %%eax \n\t" | ||
2984 | "mov %c[rbx](%0), %%ebx \n\t" | ||
2985 | "mov %c[rdx](%0), %%edx \n\t" | ||
2986 | "mov %c[rsi](%0), %%esi \n\t" | ||
2987 | "mov %c[rdi](%0), %%edi \n\t" | ||
2988 | "mov %c[rbp](%0), %%ebp \n\t" | ||
2989 | "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ | ||
2990 | #endif | 3259 | #endif |
3260 | "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ | ||
3261 | |||
2991 | /* Enter guest mode */ | 3262 | /* Enter guest mode */ |
2992 | "jne .Llaunched \n\t" | 3263 | "jne .Llaunched \n\t" |
2993 | __ex(ASM_VMX_VMLAUNCH) "\n\t" | 3264 | __ex(ASM_VMX_VMLAUNCH) "\n\t" |
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2995 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" | 3266 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" |
2996 | ".Lkvm_vmx_return: " | 3267 | ".Lkvm_vmx_return: " |
2997 | /* Save guest registers, load host registers, keep flags */ | 3268 | /* Save guest registers, load host registers, keep flags */ |
3269 | "xchg %0, (%%"R"sp) \n\t" | ||
3270 | "mov %%"R"ax, %c[rax](%0) \n\t" | ||
3271 | "mov %%"R"bx, %c[rbx](%0) \n\t" | ||
3272 | "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" | ||
3273 | "mov %%"R"dx, %c[rdx](%0) \n\t" | ||
3274 | "mov %%"R"si, %c[rsi](%0) \n\t" | ||
3275 | "mov %%"R"di, %c[rdi](%0) \n\t" | ||
3276 | "mov %%"R"bp, %c[rbp](%0) \n\t" | ||
2998 | #ifdef CONFIG_X86_64 | 3277 | #ifdef CONFIG_X86_64 |
2999 | "xchg %0, (%%rsp) \n\t" | ||
3000 | "mov %%rax, %c[rax](%0) \n\t" | ||
3001 | "mov %%rbx, %c[rbx](%0) \n\t" | ||
3002 | "pushq (%%rsp); popq %c[rcx](%0) \n\t" | ||
3003 | "mov %%rdx, %c[rdx](%0) \n\t" | ||
3004 | "mov %%rsi, %c[rsi](%0) \n\t" | ||
3005 | "mov %%rdi, %c[rdi](%0) \n\t" | ||
3006 | "mov %%rbp, %c[rbp](%0) \n\t" | ||
3007 | "mov %%r8, %c[r8](%0) \n\t" | 3278 | "mov %%r8, %c[r8](%0) \n\t" |
3008 | "mov %%r9, %c[r9](%0) \n\t" | 3279 | "mov %%r9, %c[r9](%0) \n\t" |
3009 | "mov %%r10, %c[r10](%0) \n\t" | 3280 | "mov %%r10, %c[r10](%0) \n\t" |
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3012 | "mov %%r13, %c[r13](%0) \n\t" | 3283 | "mov %%r13, %c[r13](%0) \n\t" |
3013 | "mov %%r14, %c[r14](%0) \n\t" | 3284 | "mov %%r14, %c[r14](%0) \n\t" |
3014 | "mov %%r15, %c[r15](%0) \n\t" | 3285 | "mov %%r15, %c[r15](%0) \n\t" |
3015 | "mov %%cr2, %%rax \n\t" | ||
3016 | "mov %%rax, %c[cr2](%0) \n\t" | ||
3017 | |||
3018 | "pop %%rbp; pop %%rbp; pop %%rdx \n\t" | ||
3019 | #else | ||
3020 | "xchg %0, (%%esp) \n\t" | ||
3021 | "mov %%eax, %c[rax](%0) \n\t" | ||
3022 | "mov %%ebx, %c[rbx](%0) \n\t" | ||
3023 | "pushl (%%esp); popl %c[rcx](%0) \n\t" | ||
3024 | "mov %%edx, %c[rdx](%0) \n\t" | ||
3025 | "mov %%esi, %c[rsi](%0) \n\t" | ||
3026 | "mov %%edi, %c[rdi](%0) \n\t" | ||
3027 | "mov %%ebp, %c[rbp](%0) \n\t" | ||
3028 | "mov %%cr2, %%eax \n\t" | ||
3029 | "mov %%eax, %c[cr2](%0) \n\t" | ||
3030 | |||
3031 | "pop %%ebp; pop %%ebp; pop %%edx \n\t" | ||
3032 | #endif | 3286 | #endif |
3287 | "mov %%cr2, %%"R"ax \n\t" | ||
3288 | "mov %%"R"ax, %c[cr2](%0) \n\t" | ||
3289 | |||
3290 | "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" | ||
3033 | "setbe %c[fail](%0) \n\t" | 3291 | "setbe %c[fail](%0) \n\t" |
3034 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 3292 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
3035 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 3293 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), |
3036 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 3294 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
3295 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | ||
3037 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | 3296 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), |
3038 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | 3297 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), |
3039 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | 3298 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), |
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3053 | #endif | 3312 | #endif |
3054 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) | 3313 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) |
3055 | : "cc", "memory" | 3314 | : "cc", "memory" |
3315 | , R"bx", R"di", R"si" | ||
3056 | #ifdef CONFIG_X86_64 | 3316 | #ifdef CONFIG_X86_64 |
3057 | , "rbx", "rdi", "rsi" | ||
3058 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 3317 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
3059 | #else | ||
3060 | , "ebx", "edi", "rsi" | ||
3061 | #endif | 3318 | #endif |
3062 | ); | 3319 | ); |
3063 | 3320 | ||
3321 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
3322 | vcpu->arch.regs_dirty = 0; | ||
3323 | |||
3064 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 3324 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
3065 | if (vmx->rmode.irq.pending) | 3325 | if (vmx->rmode.irq.pending) |
3066 | fixup_rmode_irq(vmx); | 3326 | fixup_rmode_irq(vmx); |
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3080 | KVMTRACE_0D(NMI, vcpu, handler); | 3340 | KVMTRACE_0D(NMI, vcpu, handler); |
3081 | asm("int $2"); | 3341 | asm("int $2"); |
3082 | } | 3342 | } |
3343 | |||
3344 | vmx_complete_interrupts(vmx); | ||
3083 | } | 3345 | } |
3084 | 3346 | ||
3347 | #undef R | ||
3348 | #undef Q | ||
3349 | |||
3085 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | 3350 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) |
3086 | { | 3351 | { |
3087 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3352 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3224 | .set_idt = vmx_set_idt, | 3489 | .set_idt = vmx_set_idt, |
3225 | .get_gdt = vmx_get_gdt, | 3490 | .get_gdt = vmx_get_gdt, |
3226 | .set_gdt = vmx_set_gdt, | 3491 | .set_gdt = vmx_set_gdt, |
3227 | .cache_regs = vcpu_load_rsp_rip, | 3492 | .cache_reg = vmx_cache_reg, |
3228 | .decache_regs = vcpu_put_rsp_rip, | ||
3229 | .get_rflags = vmx_get_rflags, | 3493 | .get_rflags = vmx_get_rflags, |
3230 | .set_rflags = vmx_set_rflags, | 3494 | .set_rflags = vmx_set_rflags, |
3231 | 3495 | ||
@@ -3301,8 +3565,7 @@ static int __init vmx_init(void) | |||
3301 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | | 3565 | kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | |
3302 | VMX_EPT_WRITABLE_MASK | | 3566 | VMX_EPT_WRITABLE_MASK | |
3303 | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); | 3567 | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); |
3304 | kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, | 3568 | kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, |
3305 | VMX_EPT_FAKE_DIRTY_MASK, 0ull, | ||
3306 | VMX_EPT_EXECUTABLE_MASK); | 3569 | VMX_EPT_EXECUTABLE_MASK); |
3307 | kvm_enable_tdp(); | 3570 | kvm_enable_tdp(); |
3308 | } else | 3571 | } else |
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 425a13436b3f..3e010d21fdd7 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h | |||
@@ -331,24 +331,6 @@ enum vmcs_field { | |||
331 | 331 | ||
332 | #define AR_RESERVD_MASK 0xfffe0f00 | 332 | #define AR_RESERVD_MASK 0xfffe0f00 |
333 | 333 | ||
334 | #define MSR_IA32_VMX_BASIC 0x480 | ||
335 | #define MSR_IA32_VMX_PINBASED_CTLS 0x481 | ||
336 | #define MSR_IA32_VMX_PROCBASED_CTLS 0x482 | ||
337 | #define MSR_IA32_VMX_EXIT_CTLS 0x483 | ||
338 | #define MSR_IA32_VMX_ENTRY_CTLS 0x484 | ||
339 | #define MSR_IA32_VMX_MISC 0x485 | ||
340 | #define MSR_IA32_VMX_CR0_FIXED0 0x486 | ||
341 | #define MSR_IA32_VMX_CR0_FIXED1 0x487 | ||
342 | #define MSR_IA32_VMX_CR4_FIXED0 0x488 | ||
343 | #define MSR_IA32_VMX_CR4_FIXED1 0x489 | ||
344 | #define MSR_IA32_VMX_VMCS_ENUM 0x48a | ||
345 | #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b | ||
346 | #define MSR_IA32_VMX_EPT_VPID_CAP 0x48c | ||
347 | |||
348 | #define MSR_IA32_FEATURE_CONTROL 0x3a | ||
349 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | ||
350 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | ||
351 | |||
352 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | 334 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 |
353 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 | 335 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 |
354 | 336 | ||
@@ -370,8 +352,6 @@ enum vmcs_field { | |||
370 | #define VMX_EPT_READABLE_MASK 0x1ull | 352 | #define VMX_EPT_READABLE_MASK 0x1ull |
371 | #define VMX_EPT_WRITABLE_MASK 0x2ull | 353 | #define VMX_EPT_WRITABLE_MASK 0x2ull |
372 | #define VMX_EPT_EXECUTABLE_MASK 0x4ull | 354 | #define VMX_EPT_EXECUTABLE_MASK 0x4ull |
373 | #define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62) | ||
374 | #define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63) | ||
375 | 355 | ||
376 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul | 356 | #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul |
377 | 357 | ||
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb3..4f0677d1eae8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -4,10 +4,14 @@ | |||
4 | * derived from drivers/kvm/kvm_main.c | 4 | * derived from drivers/kvm/kvm_main.c |
5 | * | 5 | * |
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright (C) 2008 Qumranet, Inc. | ||
8 | * Copyright IBM Corporation, 2008 | ||
7 | * | 9 | * |
8 | * Authors: | 10 | * Authors: |
9 | * Avi Kivity <avi@qumranet.com> | 11 | * Avi Kivity <avi@qumranet.com> |
10 | * Yaniv Kamay <yaniv@qumranet.com> | 12 | * Yaniv Kamay <yaniv@qumranet.com> |
13 | * Amit Shah <amit.shah@qumranet.com> | ||
14 | * Ben-Ami Yassour <benami@il.ibm.com> | ||
11 | * | 15 | * |
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | 16 | * This work is licensed under the terms of the GNU GPL, version 2. See |
13 | * the COPYING file in the top-level directory. | 17 | * the COPYING file in the top-level directory. |
@@ -19,14 +23,18 @@ | |||
19 | #include "mmu.h" | 23 | #include "mmu.h" |
20 | #include "i8254.h" | 24 | #include "i8254.h" |
21 | #include "tss.h" | 25 | #include "tss.h" |
26 | #include "kvm_cache_regs.h" | ||
27 | #include "x86.h" | ||
22 | 28 | ||
23 | #include <linux/clocksource.h> | 29 | #include <linux/clocksource.h> |
30 | #include <linux/interrupt.h> | ||
24 | #include <linux/kvm.h> | 31 | #include <linux/kvm.h> |
25 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
26 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
27 | #include <linux/module.h> | 34 | #include <linux/module.h> |
28 | #include <linux/mman.h> | 35 | #include <linux/mman.h> |
29 | #include <linux/highmem.h> | 36 | #include <linux/highmem.h> |
37 | #include <linux/intel-iommu.h> | ||
30 | 38 | ||
31 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
32 | #include <asm/msr.h> | 40 | #include <asm/msr.h> |
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
61 | struct kvm_cpuid_entry2 __user *entries); | 69 | struct kvm_cpuid_entry2 __user *entries); |
62 | 70 | ||
63 | struct kvm_x86_ops *kvm_x86_ops; | 71 | struct kvm_x86_ops *kvm_x86_ops; |
72 | EXPORT_SYMBOL_GPL(kvm_x86_ops); | ||
64 | 73 | ||
65 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 74 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
66 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | 75 | { "pf_fixed", VCPU_STAT(pf_fixed) }, |
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
83 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | 92 | { "fpu_reload", VCPU_STAT(fpu_reload) }, |
84 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | 93 | { "insn_emulation", VCPU_STAT(insn_emulation) }, |
85 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | 94 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, |
95 | { "irq_injections", VCPU_STAT(irq_injections) }, | ||
86 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | 96 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, |
87 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | 97 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, |
88 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | 98 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, |
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
90 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | 100 | { "mmu_flooded", VM_STAT(mmu_flooded) }, |
91 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | 101 | { "mmu_recycled", VM_STAT(mmu_recycled) }, |
92 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | 102 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, |
103 | { "mmu_unsync", VM_STAT(mmu_unsync) }, | ||
93 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | 104 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, |
94 | { "largepages", VM_STAT(lpages) }, | 105 | { "largepages", VM_STAT(lpages) }, |
95 | { NULL } | 106 | { NULL } |
96 | }; | 107 | }; |
97 | 108 | ||
98 | |||
99 | unsigned long segment_base(u16 selector) | 109 | unsigned long segment_base(u16 selector) |
100 | { | 110 | { |
101 | struct descriptor_table gdt; | 111 | struct descriptor_table gdt; |
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); | |||
352 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 362 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
353 | { | 363 | { |
354 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 364 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { |
365 | kvm_mmu_sync_roots(vcpu); | ||
355 | kvm_mmu_flush_tlb(vcpu); | 366 | kvm_mmu_flush_tlb(vcpu); |
356 | return; | 367 | return; |
357 | } | 368 | } |
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * | |||
564 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); | 575 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); |
565 | 576 | ||
566 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", | 577 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", |
567 | __FUNCTION__, tsc_khz, hv_clock->tsc_shift, | 578 | __func__, tsc_khz, hv_clock->tsc_shift, |
568 | hv_clock->tsc_to_system_mul); | 579 | hv_clock->tsc_to_system_mul); |
569 | } | 580 | } |
570 | 581 | ||
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
662 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", | 673 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", |
663 | __func__, data); | 674 | __func__, data); |
664 | break; | 675 | break; |
676 | case MSR_IA32_DEBUGCTLMSR: | ||
677 | if (!data) { | ||
678 | /* We support the non-activated case already */ | ||
679 | break; | ||
680 | } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { | ||
681 | /* Values other than LBR and BTF are vendor-specific, | ||
682 | thus reserved and should throw a #GP */ | ||
683 | return 1; | ||
684 | } | ||
685 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", | ||
686 | __func__, data); | ||
687 | break; | ||
665 | case MSR_IA32_UCODE_REV: | 688 | case MSR_IA32_UCODE_REV: |
666 | case MSR_IA32_UCODE_WRITE: | 689 | case MSR_IA32_UCODE_WRITE: |
667 | break; | 690 | break; |
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
692 | /* ...but clean it before doing the actual write */ | 715 | /* ...but clean it before doing the actual write */ |
693 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); | 716 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); |
694 | 717 | ||
695 | down_read(&current->mm->mmap_sem); | ||
696 | vcpu->arch.time_page = | 718 | vcpu->arch.time_page = |
697 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); | 719 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); |
698 | up_read(¤t->mm->mmap_sem); | ||
699 | 720 | ||
700 | if (is_error_page(vcpu->arch.time_page)) { | 721 | if (is_error_page(vcpu->arch.time_page)) { |
701 | kvm_release_page_clean(vcpu->arch.time_page); | 722 | kvm_release_page_clean(vcpu->arch.time_page); |
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
752 | case MSR_IA32_MC0_MISC+8: | 773 | case MSR_IA32_MC0_MISC+8: |
753 | case MSR_IA32_MC0_MISC+12: | 774 | case MSR_IA32_MC0_MISC+12: |
754 | case MSR_IA32_MC0_MISC+16: | 775 | case MSR_IA32_MC0_MISC+16: |
776 | case MSR_IA32_MC0_MISC+20: | ||
755 | case MSR_IA32_UCODE_REV: | 777 | case MSR_IA32_UCODE_REV: |
756 | case MSR_IA32_EBL_CR_POWERON: | 778 | case MSR_IA32_EBL_CR_POWERON: |
779 | case MSR_IA32_DEBUGCTLMSR: | ||
780 | case MSR_IA32_LASTBRANCHFROMIP: | ||
781 | case MSR_IA32_LASTBRANCHTOIP: | ||
782 | case MSR_IA32_LASTINTFROMIP: | ||
783 | case MSR_IA32_LASTINTTOIP: | ||
757 | data = 0; | 784 | data = 0; |
758 | break; | 785 | break; |
759 | case MSR_MTRRcap: | 786 | case MSR_MTRRcap: |
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
901 | case KVM_CAP_PV_MMU: | 928 | case KVM_CAP_PV_MMU: |
902 | r = !tdp_enabled; | 929 | r = !tdp_enabled; |
903 | break; | 930 | break; |
931 | case KVM_CAP_IOMMU: | ||
932 | r = intel_iommu_found(); | ||
933 | break; | ||
904 | default: | 934 | default: |
905 | r = 0; | 935 | r = 0; |
906 | break; | 936 | break; |
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1303 | struct kvm_vcpu *vcpu = filp->private_data; | 1333 | struct kvm_vcpu *vcpu = filp->private_data; |
1304 | void __user *argp = (void __user *)arg; | 1334 | void __user *argp = (void __user *)arg; |
1305 | int r; | 1335 | int r; |
1336 | struct kvm_lapic_state *lapic = NULL; | ||
1306 | 1337 | ||
1307 | switch (ioctl) { | 1338 | switch (ioctl) { |
1308 | case KVM_GET_LAPIC: { | 1339 | case KVM_GET_LAPIC: { |
1309 | struct kvm_lapic_state lapic; | 1340 | lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
1310 | 1341 | ||
1311 | memset(&lapic, 0, sizeof lapic); | 1342 | r = -ENOMEM; |
1312 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | 1343 | if (!lapic) |
1344 | goto out; | ||
1345 | r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); | ||
1313 | if (r) | 1346 | if (r) |
1314 | goto out; | 1347 | goto out; |
1315 | r = -EFAULT; | 1348 | r = -EFAULT; |
1316 | if (copy_to_user(argp, &lapic, sizeof lapic)) | 1349 | if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) |
1317 | goto out; | 1350 | goto out; |
1318 | r = 0; | 1351 | r = 0; |
1319 | break; | 1352 | break; |
1320 | } | 1353 | } |
1321 | case KVM_SET_LAPIC: { | 1354 | case KVM_SET_LAPIC: { |
1322 | struct kvm_lapic_state lapic; | 1355 | lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
1323 | 1356 | r = -ENOMEM; | |
1357 | if (!lapic) | ||
1358 | goto out; | ||
1324 | r = -EFAULT; | 1359 | r = -EFAULT; |
1325 | if (copy_from_user(&lapic, argp, sizeof lapic)) | 1360 | if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) |
1326 | goto out; | 1361 | goto out; |
1327 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; | 1362 | r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); |
1328 | if (r) | 1363 | if (r) |
1329 | goto out; | 1364 | goto out; |
1330 | r = 0; | 1365 | r = 0; |
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1422 | r = -EINVAL; | 1457 | r = -EINVAL; |
1423 | } | 1458 | } |
1424 | out: | 1459 | out: |
1460 | if (lapic) | ||
1461 | kfree(lapic); | ||
1425 | return r; | 1462 | return r; |
1426 | } | 1463 | } |
1427 | 1464 | ||
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1630 | struct kvm *kvm = filp->private_data; | 1667 | struct kvm *kvm = filp->private_data; |
1631 | void __user *argp = (void __user *)arg; | 1668 | void __user *argp = (void __user *)arg; |
1632 | int r = -EINVAL; | 1669 | int r = -EINVAL; |
1670 | /* | ||
1671 | * This union makes it completely explicit to gcc-3.x | ||
1672 | * that these two variables' stack usage should be | ||
1673 | * combined, not added together. | ||
1674 | */ | ||
1675 | union { | ||
1676 | struct kvm_pit_state ps; | ||
1677 | struct kvm_memory_alias alias; | ||
1678 | } u; | ||
1633 | 1679 | ||
1634 | switch (ioctl) { | 1680 | switch (ioctl) { |
1635 | case KVM_SET_TSS_ADDR: | 1681 | case KVM_SET_TSS_ADDR: |
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1661 | case KVM_GET_NR_MMU_PAGES: | 1707 | case KVM_GET_NR_MMU_PAGES: |
1662 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | 1708 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); |
1663 | break; | 1709 | break; |
1664 | case KVM_SET_MEMORY_ALIAS: { | 1710 | case KVM_SET_MEMORY_ALIAS: |
1665 | struct kvm_memory_alias alias; | ||
1666 | |||
1667 | r = -EFAULT; | 1711 | r = -EFAULT; |
1668 | if (copy_from_user(&alias, argp, sizeof alias)) | 1712 | if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) |
1669 | goto out; | 1713 | goto out; |
1670 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | 1714 | r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); |
1671 | if (r) | 1715 | if (r) |
1672 | goto out; | 1716 | goto out; |
1673 | break; | 1717 | break; |
1674 | } | ||
1675 | case KVM_CREATE_IRQCHIP: | 1718 | case KVM_CREATE_IRQCHIP: |
1676 | r = -ENOMEM; | 1719 | r = -ENOMEM; |
1677 | kvm->arch.vpic = kvm_create_pic(kvm); | 1720 | kvm->arch.vpic = kvm_create_pic(kvm); |
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1699 | goto out; | 1742 | goto out; |
1700 | if (irqchip_in_kernel(kvm)) { | 1743 | if (irqchip_in_kernel(kvm)) { |
1701 | mutex_lock(&kvm->lock); | 1744 | mutex_lock(&kvm->lock); |
1702 | if (irq_event.irq < 16) | 1745 | kvm_set_irq(kvm, irq_event.irq, irq_event.level); |
1703 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
1704 | irq_event.irq, | ||
1705 | irq_event.level); | ||
1706 | kvm_ioapic_set_irq(kvm->arch.vioapic, | ||
1707 | irq_event.irq, | ||
1708 | irq_event.level); | ||
1709 | mutex_unlock(&kvm->lock); | 1746 | mutex_unlock(&kvm->lock); |
1710 | r = 0; | 1747 | r = 0; |
1711 | } | 1748 | } |
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1713 | } | 1750 | } |
1714 | case KVM_GET_IRQCHIP: { | 1751 | case KVM_GET_IRQCHIP: { |
1715 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 1752 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
1716 | struct kvm_irqchip chip; | 1753 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); |
1717 | 1754 | ||
1718 | r = -EFAULT; | 1755 | r = -ENOMEM; |
1719 | if (copy_from_user(&chip, argp, sizeof chip)) | 1756 | if (!chip) |
1720 | goto out; | 1757 | goto out; |
1758 | r = -EFAULT; | ||
1759 | if (copy_from_user(chip, argp, sizeof *chip)) | ||
1760 | goto get_irqchip_out; | ||
1721 | r = -ENXIO; | 1761 | r = -ENXIO; |
1722 | if (!irqchip_in_kernel(kvm)) | 1762 | if (!irqchip_in_kernel(kvm)) |
1723 | goto out; | 1763 | goto get_irqchip_out; |
1724 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | 1764 | r = kvm_vm_ioctl_get_irqchip(kvm, chip); |
1725 | if (r) | 1765 | if (r) |
1726 | goto out; | 1766 | goto get_irqchip_out; |
1727 | r = -EFAULT; | 1767 | r = -EFAULT; |
1728 | if (copy_to_user(argp, &chip, sizeof chip)) | 1768 | if (copy_to_user(argp, chip, sizeof *chip)) |
1729 | goto out; | 1769 | goto get_irqchip_out; |
1730 | r = 0; | 1770 | r = 0; |
1771 | get_irqchip_out: | ||
1772 | kfree(chip); | ||
1773 | if (r) | ||
1774 | goto out; | ||
1731 | break; | 1775 | break; |
1732 | } | 1776 | } |
1733 | case KVM_SET_IRQCHIP: { | 1777 | case KVM_SET_IRQCHIP: { |
1734 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 1778 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
1735 | struct kvm_irqchip chip; | 1779 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); |
1736 | 1780 | ||
1737 | r = -EFAULT; | 1781 | r = -ENOMEM; |
1738 | if (copy_from_user(&chip, argp, sizeof chip)) | 1782 | if (!chip) |
1739 | goto out; | 1783 | goto out; |
1784 | r = -EFAULT; | ||
1785 | if (copy_from_user(chip, argp, sizeof *chip)) | ||
1786 | goto set_irqchip_out; | ||
1740 | r = -ENXIO; | 1787 | r = -ENXIO; |
1741 | if (!irqchip_in_kernel(kvm)) | 1788 | if (!irqchip_in_kernel(kvm)) |
1742 | goto out; | 1789 | goto set_irqchip_out; |
1743 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | 1790 | r = kvm_vm_ioctl_set_irqchip(kvm, chip); |
1744 | if (r) | 1791 | if (r) |
1745 | goto out; | 1792 | goto set_irqchip_out; |
1746 | r = 0; | 1793 | r = 0; |
1794 | set_irqchip_out: | ||
1795 | kfree(chip); | ||
1796 | if (r) | ||
1797 | goto out; | ||
1747 | break; | 1798 | break; |
1748 | } | 1799 | } |
1749 | case KVM_GET_PIT: { | 1800 | case KVM_GET_PIT: { |
1750 | struct kvm_pit_state ps; | ||
1751 | r = -EFAULT; | 1801 | r = -EFAULT; |
1752 | if (copy_from_user(&ps, argp, sizeof ps)) | 1802 | if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) |
1753 | goto out; | 1803 | goto out; |
1754 | r = -ENXIO; | 1804 | r = -ENXIO; |
1755 | if (!kvm->arch.vpit) | 1805 | if (!kvm->arch.vpit) |
1756 | goto out; | 1806 | goto out; |
1757 | r = kvm_vm_ioctl_get_pit(kvm, &ps); | 1807 | r = kvm_vm_ioctl_get_pit(kvm, &u.ps); |
1758 | if (r) | 1808 | if (r) |
1759 | goto out; | 1809 | goto out; |
1760 | r = -EFAULT; | 1810 | r = -EFAULT; |
1761 | if (copy_to_user(argp, &ps, sizeof ps)) | 1811 | if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) |
1762 | goto out; | 1812 | goto out; |
1763 | r = 0; | 1813 | r = 0; |
1764 | break; | 1814 | break; |
1765 | } | 1815 | } |
1766 | case KVM_SET_PIT: { | 1816 | case KVM_SET_PIT: { |
1767 | struct kvm_pit_state ps; | ||
1768 | r = -EFAULT; | 1817 | r = -EFAULT; |
1769 | if (copy_from_user(&ps, argp, sizeof ps)) | 1818 | if (copy_from_user(&u.ps, argp, sizeof u.ps)) |
1770 | goto out; | 1819 | goto out; |
1771 | r = -ENXIO; | 1820 | r = -ENXIO; |
1772 | if (!kvm->arch.vpit) | 1821 | if (!kvm->arch.vpit) |
1773 | goto out; | 1822 | goto out; |
1774 | r = kvm_vm_ioctl_set_pit(kvm, &ps); | 1823 | r = kvm_vm_ioctl_set_pit(kvm, &u.ps); |
1775 | if (r) | 1824 | if (r) |
1776 | goto out; | 1825 | goto out; |
1777 | r = 0; | 1826 | r = 0; |
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
2018 | 2067 | ||
2019 | val = *(u64 *)new; | 2068 | val = *(u64 *)new; |
2020 | 2069 | ||
2021 | down_read(&current->mm->mmap_sem); | ||
2022 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 2070 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
2023 | up_read(&current->mm->mmap_sem); | ||
2024 | 2071 | ||
2025 | kaddr = kmap_atomic(page, KM_USER0); | 2072 | kaddr = kmap_atomic(page, KM_USER0); |
2026 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); | 2073 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); |
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | |||
2040 | 2087 | ||
2041 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | 2088 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) |
2042 | { | 2089 | { |
2090 | kvm_mmu_invlpg(vcpu, address); | ||
2043 | return X86EMUL_CONTINUE; | 2091 | return X86EMUL_CONTINUE; |
2044 | } | 2092 | } |
2045 | 2093 | ||
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | |||
2080 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | 2128 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) |
2081 | { | 2129 | { |
2082 | u8 opcodes[4]; | 2130 | u8 opcodes[4]; |
2083 | unsigned long rip = vcpu->arch.rip; | 2131 | unsigned long rip = kvm_rip_read(vcpu); |
2084 | unsigned long rip_linear; | 2132 | unsigned long rip_linear; |
2085 | 2133 | ||
2086 | if (!printk_ratelimit()) | 2134 | if (!printk_ratelimit()) |
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = { | |||
2102 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | 2150 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
2103 | }; | 2151 | }; |
2104 | 2152 | ||
2153 | static void cache_all_regs(struct kvm_vcpu *vcpu) | ||
2154 | { | ||
2155 | kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
2156 | kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
2157 | kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
2158 | vcpu->arch.regs_dirty = ~0; | ||
2159 | } | ||
2160 | |||
2105 | int emulate_instruction(struct kvm_vcpu *vcpu, | 2161 | int emulate_instruction(struct kvm_vcpu *vcpu, |
2106 | struct kvm_run *run, | 2162 | struct kvm_run *run, |
2107 | unsigned long cr2, | 2163 | unsigned long cr2, |
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2111 | int r; | 2167 | int r; |
2112 | struct decode_cache *c; | 2168 | struct decode_cache *c; |
2113 | 2169 | ||
2170 | kvm_clear_exception_queue(vcpu); | ||
2114 | vcpu->arch.mmio_fault_cr2 = cr2; | 2171 | vcpu->arch.mmio_fault_cr2 = cr2; |
2115 | kvm_x86_ops->cache_regs(vcpu); | 2172 | /* |
2173 | * TODO: fix x86_emulate.c to use guest_read/write_register | ||
2174 | * instead of direct ->regs accesses, can save hundred cycles | ||
2175 | * on Intel for instructions that don't read/change RSP, for | ||
2176 | * for example. | ||
2177 | */ | ||
2178 | cache_all_regs(vcpu); | ||
2116 | 2179 | ||
2117 | vcpu->mmio_is_write = 0; | 2180 | vcpu->mmio_is_write = 0; |
2118 | vcpu->arch.pio.string = 0; | 2181 | vcpu->arch.pio.string = 0; |
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2172 | return EMULATE_DO_MMIO; | 2235 | return EMULATE_DO_MMIO; |
2173 | } | 2236 | } |
2174 | 2237 | ||
2175 | kvm_x86_ops->decache_regs(vcpu); | ||
2176 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 2238 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
2177 | 2239 | ||
2178 | if (vcpu->mmio_is_write) { | 2240 | if (vcpu->mmio_is_write) { |
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2225 | struct kvm_pio_request *io = &vcpu->arch.pio; | 2287 | struct kvm_pio_request *io = &vcpu->arch.pio; |
2226 | long delta; | 2288 | long delta; |
2227 | int r; | 2289 | int r; |
2228 | 2290 | unsigned long val; | |
2229 | kvm_x86_ops->cache_regs(vcpu); | ||
2230 | 2291 | ||
2231 | if (!io->string) { | 2292 | if (!io->string) { |
2232 | if (io->in) | 2293 | if (io->in) { |
2233 | memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, | 2294 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2234 | io->size); | 2295 | memcpy(&val, vcpu->arch.pio_data, io->size); |
2296 | kvm_register_write(vcpu, VCPU_REGS_RAX, val); | ||
2297 | } | ||
2235 | } else { | 2298 | } else { |
2236 | if (io->in) { | 2299 | if (io->in) { |
2237 | r = pio_copy_data(vcpu); | 2300 | r = pio_copy_data(vcpu); |
2238 | if (r) { | 2301 | if (r) |
2239 | kvm_x86_ops->cache_regs(vcpu); | ||
2240 | return r; | 2302 | return r; |
2241 | } | ||
2242 | } | 2303 | } |
2243 | 2304 | ||
2244 | delta = 1; | 2305 | delta = 1; |
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2248 | * The size of the register should really depend on | 2309 | * The size of the register should really depend on |
2249 | * current address size. | 2310 | * current address size. |
2250 | */ | 2311 | */ |
2251 | vcpu->arch.regs[VCPU_REGS_RCX] -= delta; | 2312 | val = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2313 | val -= delta; | ||
2314 | kvm_register_write(vcpu, VCPU_REGS_RCX, val); | ||
2252 | } | 2315 | } |
2253 | if (io->down) | 2316 | if (io->down) |
2254 | delta = -delta; | 2317 | delta = -delta; |
2255 | delta *= io->size; | 2318 | delta *= io->size; |
2256 | if (io->in) | 2319 | if (io->in) { |
2257 | vcpu->arch.regs[VCPU_REGS_RDI] += delta; | 2320 | val = kvm_register_read(vcpu, VCPU_REGS_RDI); |
2258 | else | 2321 | val += delta; |
2259 | vcpu->arch.regs[VCPU_REGS_RSI] += delta; | 2322 | kvm_register_write(vcpu, VCPU_REGS_RDI, val); |
2323 | } else { | ||
2324 | val = kvm_register_read(vcpu, VCPU_REGS_RSI); | ||
2325 | val += delta; | ||
2326 | kvm_register_write(vcpu, VCPU_REGS_RSI, val); | ||
2327 | } | ||
2260 | } | 2328 | } |
2261 | 2329 | ||
2262 | kvm_x86_ops->decache_regs(vcpu); | ||
2263 | |||
2264 | io->count -= io->cur_count; | 2330 | io->count -= io->cur_count; |
2265 | io->cur_count = 0; | 2331 | io->cur_count = 0; |
2266 | 2332 | ||
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2313 | int size, unsigned port) | 2379 | int size, unsigned port) |
2314 | { | 2380 | { |
2315 | struct kvm_io_device *pio_dev; | 2381 | struct kvm_io_device *pio_dev; |
2382 | unsigned long val; | ||
2316 | 2383 | ||
2317 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2384 | vcpu->run->exit_reason = KVM_EXIT_IO; |
2318 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2385 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2333 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | 2400 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, |
2334 | handler); | 2401 | handler); |
2335 | 2402 | ||
2336 | kvm_x86_ops->cache_regs(vcpu); | 2403 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2337 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); | 2404 | memcpy(vcpu->arch.pio_data, &val, 4); |
2338 | 2405 | ||
2339 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2406 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2340 | 2407 | ||
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) | |||
2492 | KVMTRACE_0D(HLT, vcpu, handler); | 2559 | KVMTRACE_0D(HLT, vcpu, handler); |
2493 | if (irqchip_in_kernel(vcpu->kvm)) { | 2560 | if (irqchip_in_kernel(vcpu->kvm)) { |
2494 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; | 2561 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; |
2495 | up_read(&vcpu->kvm->slots_lock); | ||
2496 | kvm_vcpu_block(vcpu); | ||
2497 | down_read(&vcpu->kvm->slots_lock); | ||
2498 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2499 | return -EINTR; | ||
2500 | return 1; | 2562 | return 1; |
2501 | } else { | 2563 | } else { |
2502 | vcpu->run->exit_reason = KVM_EXIT_HLT; | 2564 | vcpu->run->exit_reason = KVM_EXIT_HLT; |
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2519 | unsigned long nr, a0, a1, a2, a3, ret; | 2581 | unsigned long nr, a0, a1, a2, a3, ret; |
2520 | int r = 1; | 2582 | int r = 1; |
2521 | 2583 | ||
2522 | kvm_x86_ops->cache_regs(vcpu); | 2584 | nr = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2523 | 2585 | a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); | |
2524 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | 2586 | a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2525 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | 2587 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); |
2526 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | 2588 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); |
2527 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2528 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2529 | 2589 | ||
2530 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); | 2590 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); |
2531 | 2591 | ||
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2548 | ret = -KVM_ENOSYS; | 2608 | ret = -KVM_ENOSYS; |
2549 | break; | 2609 | break; |
2550 | } | 2610 | } |
2551 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | 2611 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); |
2552 | kvm_x86_ops->decache_regs(vcpu); | ||
2553 | ++vcpu->stat.hypercalls; | 2612 | ++vcpu->stat.hypercalls; |
2554 | return r; | 2613 | return r; |
2555 | } | 2614 | } |
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
2559 | { | 2618 | { |
2560 | char instruction[3]; | 2619 | char instruction[3]; |
2561 | int ret = 0; | 2620 | int ret = 0; |
2621 | unsigned long rip = kvm_rip_read(vcpu); | ||
2562 | 2622 | ||
2563 | 2623 | ||
2564 | /* | 2624 | /* |
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
2568 | */ | 2628 | */ |
2569 | kvm_mmu_zap_all(vcpu->kvm); | 2629 | kvm_mmu_zap_all(vcpu->kvm); |
2570 | 2630 | ||
2571 | kvm_x86_ops->cache_regs(vcpu); | ||
2572 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 2631 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
2573 | if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) | 2632 | if (emulator_write_emulated(rip, instruction, 3, vcpu) |
2574 | != X86EMUL_CONTINUE) | 2633 | != X86EMUL_CONTINUE) |
2575 | ret = -EFAULT; | 2634 | ret = -EFAULT; |
2576 | 2635 | ||
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2700 | u32 function, index; | 2759 | u32 function, index; |
2701 | struct kvm_cpuid_entry2 *e, *best; | 2760 | struct kvm_cpuid_entry2 *e, *best; |
2702 | 2761 | ||
2703 | kvm_x86_ops->cache_regs(vcpu); | 2762 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2704 | function = vcpu->arch.regs[VCPU_REGS_RAX]; | 2763 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2705 | index = vcpu->arch.regs[VCPU_REGS_RCX]; | 2764 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); |
2706 | vcpu->arch.regs[VCPU_REGS_RAX] = 0; | 2765 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); |
2707 | vcpu->arch.regs[VCPU_REGS_RBX] = 0; | 2766 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); |
2708 | vcpu->arch.regs[VCPU_REGS_RCX] = 0; | 2767 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); |
2709 | vcpu->arch.regs[VCPU_REGS_RDX] = 0; | ||
2710 | best = NULL; | 2768 | best = NULL; |
2711 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | 2769 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { |
2712 | e = &vcpu->arch.cpuid_entries[i]; | 2770 | e = &vcpu->arch.cpuid_entries[i]; |
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2724 | best = e; | 2782 | best = e; |
2725 | } | 2783 | } |
2726 | if (best) { | 2784 | if (best) { |
2727 | vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; | 2785 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); |
2728 | vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; | 2786 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); |
2729 | vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; | 2787 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); |
2730 | vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; | 2788 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); |
2731 | } | 2789 | } |
2732 | kvm_x86_ops->decache_regs(vcpu); | ||
2733 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2790 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2734 | KVMTRACE_5D(CPUID, vcpu, function, | 2791 | KVMTRACE_5D(CPUID, vcpu, function, |
2735 | (u32)vcpu->arch.regs[VCPU_REGS_RAX], | 2792 | (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), |
2736 | (u32)vcpu->arch.regs[VCPU_REGS_RBX], | 2793 | (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), |
2737 | (u32)vcpu->arch.regs[VCPU_REGS_RCX], | 2794 | (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), |
2738 | (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); | 2795 | (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); |
2739 | } | 2796 | } |
2740 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 2797 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
2741 | 2798 | ||
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) | |||
2776 | if (!apic || !apic->vapic_addr) | 2833 | if (!apic || !apic->vapic_addr) |
2777 | return; | 2834 | return; |
2778 | 2835 | ||
2779 | down_read(&current->mm->mmap_sem); | ||
2780 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | 2836 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); |
2781 | up_read(&current->mm->mmap_sem); | ||
2782 | 2837 | ||
2783 | vcpu->arch.apic->vapic_page = page; | 2838 | vcpu->arch.apic->vapic_page = page; |
2784 | } | 2839 | } |
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) | |||
2796 | up_read(&vcpu->kvm->slots_lock); | 2851 | up_read(&vcpu->kvm->slots_lock); |
2797 | } | 2852 | } |
2798 | 2853 | ||
2799 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2854 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2800 | { | 2855 | { |
2801 | int r; | 2856 | int r; |
2802 | 2857 | ||
2803 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { | ||
2804 | pr_debug("vcpu %d received sipi with vector # %x\n", | ||
2805 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | ||
2806 | kvm_lapic_reset(vcpu); | ||
2807 | r = kvm_x86_ops->vcpu_reset(vcpu); | ||
2808 | if (r) | ||
2809 | return r; | ||
2810 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
2811 | } | ||
2812 | |||
2813 | down_read(&vcpu->kvm->slots_lock); | ||
2814 | vapic_enter(vcpu); | ||
2815 | |||
2816 | preempted: | ||
2817 | if (vcpu->guest_debug.enabled) | ||
2818 | kvm_x86_ops->guest_debug_pre(vcpu); | ||
2819 | |||
2820 | again: | ||
2821 | if (vcpu->requests) | 2858 | if (vcpu->requests) |
2822 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | 2859 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) |
2823 | kvm_mmu_unload(vcpu); | 2860 | kvm_mmu_unload(vcpu); |
@@ -2829,6 +2866,8 @@ again: | |||
2829 | if (vcpu->requests) { | 2866 | if (vcpu->requests) { |
2830 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | 2867 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) |
2831 | __kvm_migrate_timers(vcpu); | 2868 | __kvm_migrate_timers(vcpu); |
2869 | if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) | ||
2870 | kvm_mmu_sync_roots(vcpu); | ||
2832 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | 2871 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) |
2833 | kvm_x86_ops->tlb_flush(vcpu); | 2872 | kvm_x86_ops->tlb_flush(vcpu); |
2834 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | 2873 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, |
@@ -2854,21 +2893,15 @@ again: | |||
2854 | 2893 | ||
2855 | local_irq_disable(); | 2894 | local_irq_disable(); |
2856 | 2895 | ||
2857 | if (vcpu->requests || need_resched()) { | 2896 | if (vcpu->requests || need_resched() || signal_pending(current)) { |
2858 | local_irq_enable(); | 2897 | local_irq_enable(); |
2859 | preempt_enable(); | 2898 | preempt_enable(); |
2860 | r = 1; | 2899 | r = 1; |
2861 | goto out; | 2900 | goto out; |
2862 | } | 2901 | } |
2863 | 2902 | ||
2864 | if (signal_pending(current)) { | 2903 | if (vcpu->guest_debug.enabled) |
2865 | local_irq_enable(); | 2904 | kvm_x86_ops->guest_debug_pre(vcpu); |
2866 | preempt_enable(); | ||
2867 | r = -EINTR; | ||
2868 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2869 | ++vcpu->stat.signal_exits; | ||
2870 | goto out; | ||
2871 | } | ||
2872 | 2905 | ||
2873 | vcpu->guest_mode = 1; | 2906 | vcpu->guest_mode = 1; |
2874 | /* | 2907 | /* |
@@ -2917,8 +2950,8 @@ again: | |||
2917 | * Profile KVM exit RIPs: | 2950 | * Profile KVM exit RIPs: |
2918 | */ | 2951 | */ |
2919 | if (unlikely(prof_on == KVM_PROFILING)) { | 2952 | if (unlikely(prof_on == KVM_PROFILING)) { |
2920 | kvm_x86_ops->cache_regs(vcpu); | 2953 | unsigned long rip = kvm_rip_read(vcpu); |
2921 | profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); | 2954 | profile_hit(KVM_PROFILING, (void *)rip); |
2922 | } | 2955 | } |
2923 | 2956 | ||
2924 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | 2957 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) |
@@ -2927,26 +2960,63 @@ again: | |||
2927 | kvm_lapic_sync_from_vapic(vcpu); | 2960 | kvm_lapic_sync_from_vapic(vcpu); |
2928 | 2961 | ||
2929 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | 2962 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); |
2963 | out: | ||
2964 | return r; | ||
2965 | } | ||
2930 | 2966 | ||
2931 | if (r > 0) { | 2967 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2932 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | 2968 | { |
2933 | r = -EINTR; | 2969 | int r; |
2934 | kvm_run->exit_reason = KVM_EXIT_INTR; | 2970 | |
2935 | ++vcpu->stat.request_irq_exits; | 2971 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { |
2936 | goto out; | 2972 | pr_debug("vcpu %d received sipi with vector # %x\n", |
2937 | } | 2973 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
2938 | if (!need_resched()) | 2974 | kvm_lapic_reset(vcpu); |
2939 | goto again; | 2975 | r = kvm_x86_ops->vcpu_reset(vcpu); |
2976 | if (r) | ||
2977 | return r; | ||
2978 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
2940 | } | 2979 | } |
2941 | 2980 | ||
2942 | out: | 2981 | down_read(&vcpu->kvm->slots_lock); |
2943 | up_read(&vcpu->kvm->slots_lock); | 2982 | vapic_enter(vcpu); |
2944 | if (r > 0) { | 2983 | |
2945 | kvm_resched(vcpu); | 2984 | r = 1; |
2946 | down_read(&vcpu->kvm->slots_lock); | 2985 | while (r > 0) { |
2947 | goto preempted; | 2986 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) |
2987 | r = vcpu_enter_guest(vcpu, kvm_run); | ||
2988 | else { | ||
2989 | up_read(&vcpu->kvm->slots_lock); | ||
2990 | kvm_vcpu_block(vcpu); | ||
2991 | down_read(&vcpu->kvm->slots_lock); | ||
2992 | if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) | ||
2993 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | ||
2994 | vcpu->arch.mp_state = | ||
2995 | KVM_MP_STATE_RUNNABLE; | ||
2996 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2997 | r = -EINTR; | ||
2998 | } | ||
2999 | |||
3000 | if (r > 0) { | ||
3001 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
3002 | r = -EINTR; | ||
3003 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
3004 | ++vcpu->stat.request_irq_exits; | ||
3005 | } | ||
3006 | if (signal_pending(current)) { | ||
3007 | r = -EINTR; | ||
3008 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
3009 | ++vcpu->stat.signal_exits; | ||
3010 | } | ||
3011 | if (need_resched()) { | ||
3012 | up_read(&vcpu->kvm->slots_lock); | ||
3013 | kvm_resched(vcpu); | ||
3014 | down_read(&vcpu->kvm->slots_lock); | ||
3015 | } | ||
3016 | } | ||
2948 | } | 3017 | } |
2949 | 3018 | ||
3019 | up_read(&vcpu->kvm->slots_lock); | ||
2950 | post_kvm_run_save(vcpu, kvm_run); | 3020 | post_kvm_run_save(vcpu, kvm_run); |
2951 | 3021 | ||
2952 | vapic_exit(vcpu); | 3022 | vapic_exit(vcpu); |
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2966 | 3036 | ||
2967 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { | 3037 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { |
2968 | kvm_vcpu_block(vcpu); | 3038 | kvm_vcpu_block(vcpu); |
3039 | clear_bit(KVM_REQ_UNHALT, &vcpu->requests); | ||
2969 | r = -EAGAIN; | 3040 | r = -EAGAIN; |
2970 | goto out; | 3041 | goto out; |
2971 | } | 3042 | } |
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2999 | } | 3070 | } |
3000 | } | 3071 | } |
3001 | #endif | 3072 | #endif |
3002 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | 3073 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) |
3003 | kvm_x86_ops->cache_regs(vcpu); | 3074 | kvm_register_write(vcpu, VCPU_REGS_RAX, |
3004 | vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | 3075 | kvm_run->hypercall.ret); |
3005 | kvm_x86_ops->decache_regs(vcpu); | ||
3006 | } | ||
3007 | 3076 | ||
3008 | r = __vcpu_run(vcpu, kvm_run); | 3077 | r = __vcpu_run(vcpu, kvm_run); |
3009 | 3078 | ||
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
3019 | { | 3088 | { |
3020 | vcpu_load(vcpu); | 3089 | vcpu_load(vcpu); |
3021 | 3090 | ||
3022 | kvm_x86_ops->cache_regs(vcpu); | 3091 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3023 | 3092 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); | |
3024 | regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3093 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3025 | regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3094 | regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3026 | regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3095 | regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3027 | regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3096 | regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3028 | regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; | 3097 | regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3029 | regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; | 3098 | regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3030 | regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
3031 | regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; | ||
3032 | #ifdef CONFIG_X86_64 | 3099 | #ifdef CONFIG_X86_64 |
3033 | regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; | 3100 | regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); |
3034 | regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; | 3101 | regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); |
3035 | regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; | 3102 | regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); |
3036 | regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; | 3103 | regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); |
3037 | regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; | 3104 | regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); |
3038 | regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; | 3105 | regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); |
3039 | regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; | 3106 | regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); |
3040 | regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; | 3107 | regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); |
3041 | #endif | 3108 | #endif |
3042 | 3109 | ||
3043 | regs->rip = vcpu->arch.rip; | 3110 | regs->rip = kvm_rip_read(vcpu); |
3044 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | 3111 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); |
3045 | 3112 | ||
3046 | /* | 3113 | /* |
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
3058 | { | 3125 | { |
3059 | vcpu_load(vcpu); | 3126 | vcpu_load(vcpu); |
3060 | 3127 | ||
3061 | vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; | 3128 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); |
3062 | vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; | 3129 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); |
3063 | vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; | 3130 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); |
3064 | vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; | 3131 | kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); |
3065 | vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; | 3132 | kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); |
3066 | vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; | 3133 | kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); |
3067 | vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; | 3134 | kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); |
3068 | vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; | 3135 | kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); |
3069 | #ifdef CONFIG_X86_64 | 3136 | #ifdef CONFIG_X86_64 |
3070 | vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; | 3137 | kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); |
3071 | vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; | 3138 | kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); |
3072 | vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; | 3139 | kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); |
3073 | vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; | 3140 | kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); |
3074 | vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; | 3141 | kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); |
3075 | vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; | 3142 | kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); |
3076 | vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; | 3143 | kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); |
3077 | vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; | 3144 | kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); |
3145 | |||
3078 | #endif | 3146 | #endif |
3079 | 3147 | ||
3080 | vcpu->arch.rip = regs->rip; | 3148 | kvm_rip_write(vcpu, regs->rip); |
3081 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | 3149 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); |
3082 | 3150 | ||
3083 | kvm_x86_ops->decache_regs(vcpu); | ||
3084 | 3151 | ||
3085 | vcpu->arch.exception.pending = false; | 3152 | vcpu->arch.exception.pending = false; |
3086 | 3153 | ||
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, | |||
3294 | return 0; | 3361 | return 0; |
3295 | } | 3362 | } |
3296 | 3363 | ||
3364 | static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) | ||
3365 | { | ||
3366 | struct kvm_segment segvar = { | ||
3367 | .base = selector << 4, | ||
3368 | .limit = 0xffff, | ||
3369 | .selector = selector, | ||
3370 | .type = 3, | ||
3371 | .present = 1, | ||
3372 | .dpl = 3, | ||
3373 | .db = 0, | ||
3374 | .s = 1, | ||
3375 | .l = 0, | ||
3376 | .g = 0, | ||
3377 | .avl = 0, | ||
3378 | .unusable = 0, | ||
3379 | }; | ||
3380 | kvm_x86_ops->set_segment(vcpu, &segvar, seg); | ||
3381 | return 0; | ||
3382 | } | ||
3383 | |||
3297 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 3384 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
3298 | int type_bits, int seg) | 3385 | int type_bits, int seg) |
3299 | { | 3386 | { |
3300 | struct kvm_segment kvm_seg; | 3387 | struct kvm_segment kvm_seg; |
3301 | 3388 | ||
3389 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) | ||
3390 | return kvm_load_realmode_segment(vcpu, selector, seg); | ||
3302 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) | 3391 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) |
3303 | return 1; | 3392 | return 1; |
3304 | kvm_seg.type |= type_bits; | 3393 | kvm_seg.type |= type_bits; |
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, | |||
3316 | struct tss_segment_32 *tss) | 3405 | struct tss_segment_32 *tss) |
3317 | { | 3406 | { |
3318 | tss->cr3 = vcpu->arch.cr3; | 3407 | tss->cr3 = vcpu->arch.cr3; |
3319 | tss->eip = vcpu->arch.rip; | 3408 | tss->eip = kvm_rip_read(vcpu); |
3320 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); | 3409 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); |
3321 | tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3410 | tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3322 | tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3411 | tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3323 | tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3412 | tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3324 | tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3413 | tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
3325 | tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3414 | tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3326 | tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; | 3415 | tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3327 | tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; | 3416 | tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3328 | tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; | 3417 | tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3329 | |||
3330 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | 3418 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); |
3331 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | 3419 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); |
3332 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); | 3420 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); |
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, | |||
3342 | { | 3430 | { |
3343 | kvm_set_cr3(vcpu, tss->cr3); | 3431 | kvm_set_cr3(vcpu, tss->cr3); |
3344 | 3432 | ||
3345 | vcpu->arch.rip = tss->eip; | 3433 | kvm_rip_write(vcpu, tss->eip); |
3346 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); | 3434 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); |
3347 | 3435 | ||
3348 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; | 3436 | kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); |
3349 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; | 3437 | kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); |
3350 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; | 3438 | kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); |
3351 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; | 3439 | kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); |
3352 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; | 3440 | kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); |
3353 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; | 3441 | kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); |
3354 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; | 3442 | kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); |
3355 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; | 3443 | kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); |
3356 | 3444 | ||
3357 | if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) | 3445 | if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) |
3358 | return 1; | 3446 | return 1; |
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, | |||
3380 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, | 3468 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, |
3381 | struct tss_segment_16 *tss) | 3469 | struct tss_segment_16 *tss) |
3382 | { | 3470 | { |
3383 | tss->ip = vcpu->arch.rip; | 3471 | tss->ip = kvm_rip_read(vcpu); |
3384 | tss->flag = kvm_x86_ops->get_rflags(vcpu); | 3472 | tss->flag = kvm_x86_ops->get_rflags(vcpu); |
3385 | tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3473 | tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3386 | tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3474 | tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3387 | tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3475 | tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3388 | tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3476 | tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
3389 | tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3477 | tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3390 | tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; | 3478 | tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3391 | tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; | 3479 | tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3392 | tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; | 3480 | tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3393 | 3481 | ||
3394 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | 3482 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); |
3395 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | 3483 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); |
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, | |||
3402 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, | 3490 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, |
3403 | struct tss_segment_16 *tss) | 3491 | struct tss_segment_16 *tss) |
3404 | { | 3492 | { |
3405 | vcpu->arch.rip = tss->ip; | 3493 | kvm_rip_write(vcpu, tss->ip); |
3406 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); | 3494 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); |
3407 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; | 3495 | kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); |
3408 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; | 3496 | kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); |
3409 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; | 3497 | kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); |
3410 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; | 3498 | kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); |
3411 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; | 3499 | kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); |
3412 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; | 3500 | kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); |
3413 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; | 3501 | kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); |
3414 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; | 3502 | kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); |
3415 | 3503 | ||
3416 | if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) | 3504 | if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) |
3417 | return 1; | 3505 | return 1; |
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3534 | } | 3622 | } |
3535 | 3623 | ||
3536 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3624 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
3537 | kvm_x86_ops->cache_regs(vcpu); | ||
3538 | 3625 | ||
3539 | if (nseg_desc.type & 8) | 3626 | if (nseg_desc.type & 8) |
3540 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, | 3627 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, |
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3559 | tr_seg.type = 11; | 3646 | tr_seg.type = 11; |
3560 | kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); | 3647 | kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); |
3561 | out: | 3648 | out: |
3562 | kvm_x86_ops->decache_regs(vcpu); | ||
3563 | return ret; | 3649 | return ret; |
3564 | } | 3650 | } |
3565 | EXPORT_SYMBOL_GPL(kvm_task_switch); | 3651 | EXPORT_SYMBOL_GPL(kvm_task_switch); |
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3622 | pr_debug("Set back pending irq %d\n", | 3708 | pr_debug("Set back pending irq %d\n", |
3623 | pending_vec); | 3709 | pending_vec); |
3624 | } | 3710 | } |
3711 | kvm_pic_clear_isr_ack(vcpu->kvm); | ||
3625 | } | 3712 | } |
3626 | 3713 | ||
3627 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | 3714 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); |
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3634 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | 3721 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); |
3635 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | 3722 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); |
3636 | 3723 | ||
3724 | /* Older userspace won't unhalt the vcpu on reset. */ | ||
3725 | if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && | ||
3726 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && | ||
3727 | !(vcpu->arch.cr0 & X86_CR0_PE)) | ||
3728 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
3729 | |||
3637 | vcpu_put(vcpu); | 3730 | vcpu_put(vcpu); |
3638 | 3731 | ||
3639 | return 0; | 3732 | return 0; |
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
3918 | return ERR_PTR(-ENOMEM); | 4011 | return ERR_PTR(-ENOMEM); |
3919 | 4012 | ||
3920 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 4013 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
4014 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | ||
3921 | 4015 | ||
3922 | return kvm; | 4016 | return kvm; |
3923 | } | 4017 | } |
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
3950 | 4044 | ||
3951 | void kvm_arch_destroy_vm(struct kvm *kvm) | 4045 | void kvm_arch_destroy_vm(struct kvm *kvm) |
3952 | { | 4046 | { |
4047 | kvm_iommu_unmap_guest(kvm); | ||
4048 | kvm_free_all_assigned_devices(kvm); | ||
3953 | kvm_free_pit(kvm); | 4049 | kvm_free_pit(kvm); |
3954 | kfree(kvm->arch.vpic); | 4050 | kfree(kvm->arch.vpic); |
3955 | kfree(kvm->arch.vioapic); | 4051 | kfree(kvm->arch.vioapic); |
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
3981 | userspace_addr = do_mmap(NULL, 0, | 4077 | userspace_addr = do_mmap(NULL, 0, |
3982 | npages * PAGE_SIZE, | 4078 | npages * PAGE_SIZE, |
3983 | PROT_READ | PROT_WRITE, | 4079 | PROT_READ | PROT_WRITE, |
3984 | MAP_SHARED | MAP_ANONYMOUS, | 4080 | MAP_PRIVATE | MAP_ANONYMOUS, |
3985 | 0); | 4081 | 0); |
3987 | up_write(&current->mm->mmap_sem); | 4083 | up_write(&current->mm->mmap_sem); |
3987 | 4083 | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 000000000000..6a4be78a7384 --- /dev/null +++ b/arch/x86/kvm/x86.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef ARCH_X86_KVM_X86_H | ||
2 | #define ARCH_X86_KVM_X86_H | ||
3 | |||
4 | #include <linux/kvm_host.h> | ||
5 | |||
6 | static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) | ||
7 | { | ||
8 | vcpu->arch.exception.pending = false; | ||
9 | } | ||
10 | |||
11 | static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) | ||
12 | { | ||
13 | vcpu->arch.interrupt.pending = true; | ||
14 | vcpu->arch.interrupt.nr = vector; | ||
15 | } | ||
16 | |||
17 | static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) | ||
18 | { | ||
19 | vcpu->arch.interrupt.pending = false; | ||
20 | } | ||
21 | |||
22 | #endif | ||
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2f90468f8b1..ea051173b0da 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | 26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) |
27 | #else | 27 | #else |
28 | #include <linux/kvm_host.h> | 28 | #include <linux/kvm_host.h> |
29 | #include "kvm_cache_regs.h" | ||
29 | #define DPRINTF(x...) do {} while (0) | 30 | #define DPRINTF(x...) do {} while (0) |
30 | #endif | 31 | #endif |
31 | #include <linux/module.h> | 32 | #include <linux/module.h> |
@@ -46,25 +47,26 @@ | |||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | 47 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ |
47 | #define DstReg (2<<1) /* Register operand. */ | 48 | #define DstReg (2<<1) /* Register operand. */ |
48 | #define DstMem (3<<1) /* Memory operand. */ | 49 | #define DstMem (3<<1) /* Memory operand. */ |
49 | #define DstMask (3<<1) | 50 | #define DstAcc (4<<1) /* Destination Accumulator */ |
51 | #define DstMask (7<<1) | ||
50 | /* Source operand type. */ | 52 | /* Source operand type. */ |
51 | #define SrcNone (0<<3) /* No source operand. */ | 53 | #define SrcNone (0<<4) /* No source operand. */ |
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | 54 | #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ |
53 | #define SrcReg (1<<3) /* Register operand. */ | 55 | #define SrcReg (1<<4) /* Register operand. */ |
54 | #define SrcMem (2<<3) /* Memory operand. */ | 56 | #define SrcMem (2<<4) /* Memory operand. */ |
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | 57 | #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ |
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | 58 | #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ |
57 | #define SrcImm (5<<3) /* Immediate operand. */ | 59 | #define SrcImm (5<<4) /* Immediate operand. */ |
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | 60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
59 | #define SrcMask (7<<3) | 61 | #define SrcMask (7<<4) |
60 | /* Generic ModRM decode. */ | 62 | /* Generic ModRM decode. */ |
61 | #define ModRM (1<<6) | 63 | #define ModRM (1<<7) |
62 | /* Destination is only written; never read. */ | 64 | /* Destination is only written; never read. */ |
63 | #define Mov (1<<7) | 65 | #define Mov (1<<8) |
64 | #define BitOp (1<<8) | 66 | #define BitOp (1<<9) |
65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | 67 | #define MemAbs (1<<10) /* Memory operand is absolute displacement */ |
66 | #define String (1<<10) /* String instruction (rep capable) */ | 68 | #define String (1<<12) /* String instruction (rep capable) */ |
67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | 69 | #define Stack (1<<13) /* Stack instruction (push/pop) */ |
68 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | 70 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ |
69 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ | 71 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ |
70 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ | 72 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ |
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = { | |||
94 | /* 0x20 - 0x27 */ | 96 | /* 0x20 - 0x27 */ |
95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 97 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 98 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
97 | SrcImmByte, SrcImm, 0, 0, | 99 | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, |
98 | /* 0x28 - 0x2F */ | 100 | /* 0x28 - 0x2F */ |
99 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 101 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
100 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 102 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = { | |||
106 | /* 0x38 - 0x3F */ | 108 | /* 0x38 - 0x3F */ |
107 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 109 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
108 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 110 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
109 | 0, 0, 0, 0, | 111 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, |
112 | 0, 0, | ||
110 | /* 0x40 - 0x47 */ | 113 | /* 0x40 - 0x47 */ |
111 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | 114 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, |
112 | /* 0x48 - 0x4F */ | 115 | /* 0x48 - 0x4F */ |
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = { | |||
153 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | 156 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, |
154 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | 157 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, |
155 | ByteOp | ImplicitOps | String, ImplicitOps | String, | 158 | ByteOp | ImplicitOps | String, ImplicitOps | String, |
156 | /* 0xB0 - 0xBF */ | 159 | /* 0xB0 - 0xB7 */ |
157 | 0, 0, 0, 0, 0, 0, 0, 0, | 160 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, |
158 | DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, | 161 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, |
162 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
163 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
164 | /* 0xB8 - 0xBF */ | ||
165 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
166 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
167 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
168 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
159 | /* 0xC0 - 0xC7 */ | 169 | /* 0xC0 - 0xC7 */ |
160 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | 170 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, |
161 | 0, ImplicitOps | Stack, 0, 0, | 171 | 0, ImplicitOps | Stack, 0, 0, |
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = { | |||
169 | /* 0xD8 - 0xDF */ | 179 | /* 0xD8 - 0xDF */ |
170 | 0, 0, 0, 0, 0, 0, 0, 0, | 180 | 0, 0, 0, 0, 0, 0, 0, 0, |
171 | /* 0xE0 - 0xE7 */ | 181 | /* 0xE0 - 0xE7 */ |
172 | 0, 0, 0, 0, 0, 0, 0, 0, | 182 | 0, 0, 0, 0, |
183 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
184 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
173 | /* 0xE8 - 0xEF */ | 185 | /* 0xE8 - 0xEF */ |
174 | ImplicitOps | Stack, SrcImm | ImplicitOps, | 186 | ImplicitOps | Stack, SrcImm | ImplicitOps, |
175 | ImplicitOps, SrcImmByte | ImplicitOps, | 187 | ImplicitOps, SrcImmByte | ImplicitOps, |
176 | 0, 0, 0, 0, | 188 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
189 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
177 | /* 0xF0 - 0xF7 */ | 190 | /* 0xF0 - 0xF7 */ |
178 | 0, 0, 0, 0, | 191 | 0, 0, 0, 0, |
179 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, | 192 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, |
180 | /* 0xF8 - 0xFF */ | 193 | /* 0xF8 - 0xFF */ |
181 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | 194 | ImplicitOps, 0, ImplicitOps, ImplicitOps, |
182 | 0, 0, Group | Group4, Group | Group5, | 195 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, |
183 | }; | 196 | }; |
184 | 197 | ||
185 | static u16 twobyte_table[256] = { | 198 | static u16 twobyte_table[256] = { |
@@ -268,15 +281,16 @@ static u16 group_table[] = { | |||
268 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 281 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
269 | 0, 0, 0, 0, | 282 | 0, 0, 0, 0, |
270 | [Group3*8] = | 283 | [Group3*8] = |
271 | DstMem | SrcImm | ModRM | SrcImm, 0, | 284 | DstMem | SrcImm | ModRM, 0, |
272 | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 285 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
273 | 0, 0, 0, 0, | 286 | 0, 0, 0, 0, |
274 | [Group4*8] = | 287 | [Group4*8] = |
275 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 288 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
276 | 0, 0, 0, 0, 0, 0, | 289 | 0, 0, 0, 0, 0, 0, |
277 | [Group5*8] = | 290 | [Group5*8] = |
278 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, | 291 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
279 | SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, | 292 | SrcMem | ModRM | Stack, 0, |
293 | SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, | ||
280 | [Group7*8] = | 294 | [Group7*8] = |
281 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, | 295 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, |
282 | SrcNone | ModRM | DstMem | Mov, 0, | 296 | SrcNone | ModRM | DstMem | Mov, 0, |
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
839 | /* Shadow copy of register state. Committed on successful emulation. */ | 853 | /* Shadow copy of register state. Committed on successful emulation. */ |
840 | 854 | ||
841 | memset(c, 0, sizeof(struct decode_cache)); | 855 | memset(c, 0, sizeof(struct decode_cache)); |
842 | c->eip = ctxt->vcpu->arch.rip; | 856 | c->eip = kvm_rip_read(ctxt->vcpu); |
843 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); | 857 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); |
844 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | 858 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); |
845 | 859 | ||
@@ -1048,6 +1062,23 @@ done_prefixes: | |||
1048 | } | 1062 | } |
1049 | c->dst.type = OP_MEM; | 1063 | c->dst.type = OP_MEM; |
1050 | break; | 1064 | break; |
1065 | case DstAcc: | ||
1066 | c->dst.type = OP_REG; | ||
1067 | c->dst.bytes = c->op_bytes; | ||
1068 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1069 | switch (c->op_bytes) { | ||
1070 | case 1: | ||
1071 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1072 | break; | ||
1073 | case 2: | ||
1074 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1075 | break; | ||
1076 | case 4: | ||
1077 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1078 | break; | ||
1079 | } | ||
1080 | c->dst.orig_val = c->dst.val; | ||
1081 | break; | ||
1051 | } | 1082 | } |
1052 | 1083 | ||
1053 | if (c->rip_relative) | 1084 | if (c->rip_relative) |
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1151 | case 1: /* dec */ | 1182 | case 1: /* dec */ |
1152 | emulate_1op("dec", c->dst, ctxt->eflags); | 1183 | emulate_1op("dec", c->dst, ctxt->eflags); |
1153 | break; | 1184 | break; |
1185 | case 2: /* call near abs */ { | ||
1186 | long int old_eip; | ||
1187 | old_eip = c->eip; | ||
1188 | c->eip = c->src.val; | ||
1189 | c->src.val = old_eip; | ||
1190 | emulate_push(ctxt); | ||
1191 | break; | ||
1192 | } | ||
1154 | case 4: /* jmp abs */ | 1193 | case 4: /* jmp abs */ |
1155 | c->eip = c->src.val; | 1194 | c->eip = c->src.val; |
1156 | break; | 1195 | break; |
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1251 | u64 msr_data; | 1290 | u64 msr_data; |
1252 | unsigned long saved_eip = 0; | 1291 | unsigned long saved_eip = 0; |
1253 | struct decode_cache *c = &ctxt->decode; | 1292 | struct decode_cache *c = &ctxt->decode; |
1293 | unsigned int port; | ||
1294 | int io_dir_in; | ||
1254 | int rc = 0; | 1295 | int rc = 0; |
1255 | 1296 | ||
1256 | /* Shadow copy of register state. Committed on successful emulation. | 1297 | /* Shadow copy of register state. Committed on successful emulation. |
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1267 | if (c->rep_prefix && (c->d & String)) { | 1308 | if (c->rep_prefix && (c->d & String)) { |
1268 | /* All REP prefixes have the same first termination condition */ | 1309 | /* All REP prefixes have the same first termination condition */ |
1269 | if (c->regs[VCPU_REGS_RCX] == 0) { | 1310 | if (c->regs[VCPU_REGS_RCX] == 0) { |
1270 | ctxt->vcpu->arch.rip = c->eip; | 1311 | kvm_rip_write(ctxt->vcpu, c->eip); |
1271 | goto done; | 1312 | goto done; |
1272 | } | 1313 | } |
1273 | /* The second termination condition only applies for REPE | 1314 | /* The second termination condition only applies for REPE |
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1281 | (c->b == 0xae) || (c->b == 0xaf)) { | 1322 | (c->b == 0xae) || (c->b == 0xaf)) { |
1282 | if ((c->rep_prefix == REPE_PREFIX) && | 1323 | if ((c->rep_prefix == REPE_PREFIX) && |
1283 | ((ctxt->eflags & EFLG_ZF) == 0)) { | 1324 | ((ctxt->eflags & EFLG_ZF) == 0)) { |
1284 | ctxt->vcpu->arch.rip = c->eip; | 1325 | kvm_rip_write(ctxt->vcpu, c->eip); |
1285 | goto done; | 1326 | goto done; |
1286 | } | 1327 | } |
1287 | if ((c->rep_prefix == REPNE_PREFIX) && | 1328 | if ((c->rep_prefix == REPNE_PREFIX) && |
1288 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | 1329 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { |
1289 | ctxt->vcpu->arch.rip = c->eip; | 1330 | kvm_rip_write(ctxt->vcpu, c->eip); |
1290 | goto done; | 1331 | goto done; |
1291 | } | 1332 | } |
1292 | } | 1333 | } |
1293 | c->regs[VCPU_REGS_RCX]--; | 1334 | c->regs[VCPU_REGS_RCX]--; |
1294 | c->eip = ctxt->vcpu->arch.rip; | 1335 | c->eip = kvm_rip_read(ctxt->vcpu); |
1295 | } | 1336 | } |
1296 | 1337 | ||
1297 | if (c->src.type == OP_MEM) { | 1338 | if (c->src.type == OP_MEM) { |
@@ -1351,27 +1392,10 @@ special_insn: | |||
1351 | sbb: /* sbb */ | 1392 | sbb: /* sbb */ |
1352 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | 1393 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); |
1353 | break; | 1394 | break; |
1354 | case 0x20 ... 0x23: | 1395 | case 0x20 ... 0x25: |
1355 | and: /* and */ | 1396 | and: /* and */ |
1356 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | 1397 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); |
1357 | break; | 1398 | break; |
1358 | case 0x24: /* and al imm8 */ | ||
1359 | c->dst.type = OP_REG; | ||
1360 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1361 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1362 | c->dst.bytes = 1; | ||
1363 | c->dst.orig_val = c->dst.val; | ||
1364 | goto and; | ||
1365 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
1366 | c->dst.type = OP_REG; | ||
1367 | c->dst.bytes = c->op_bytes; | ||
1368 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1369 | if (c->op_bytes == 2) | ||
1370 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1371 | else | ||
1372 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1373 | c->dst.orig_val = c->dst.val; | ||
1374 | goto and; | ||
1375 | case 0x28 ... 0x2d: | 1399 | case 0x28 ... 0x2d: |
1376 | sub: /* sub */ | 1400 | sub: /* sub */ |
1377 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | 1401 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); |
@@ -1659,7 +1683,7 @@ special_insn: | |||
1659 | case 0xae ... 0xaf: /* scas */ | 1683 | case 0xae ... 0xaf: /* scas */ |
1660 | DPRINTF("Urk! I don't handle SCAS.\n"); | 1684 | DPRINTF("Urk! I don't handle SCAS.\n"); |
1661 | goto cannot_emulate; | 1685 | goto cannot_emulate; |
1662 | case 0xb8: /* mov r, imm */ | 1686 | case 0xb0 ... 0xbf: /* mov r, imm */ |
1663 | goto mov; | 1687 | goto mov; |
1664 | case 0xc0 ... 0xc1: | 1688 | case 0xc0 ... 0xc1: |
1665 | emulate_grp2(ctxt); | 1689 | emulate_grp2(ctxt); |
@@ -1679,6 +1703,16 @@ special_insn: | |||
1679 | c->src.val = c->regs[VCPU_REGS_RCX]; | 1703 | c->src.val = c->regs[VCPU_REGS_RCX]; |
1680 | emulate_grp2(ctxt); | 1704 | emulate_grp2(ctxt); |
1681 | break; | 1705 | break; |
1706 | case 0xe4: /* inb */ | ||
1707 | case 0xe5: /* in */ | ||
1708 | port = insn_fetch(u8, 1, c->eip); | ||
1709 | io_dir_in = 1; | ||
1710 | goto do_io; | ||
1711 | case 0xe6: /* outb */ | ||
1712 | case 0xe7: /* out */ | ||
1713 | port = insn_fetch(u8, 1, c->eip); | ||
1714 | io_dir_in = 0; | ||
1715 | goto do_io; | ||
1682 | case 0xe8: /* call (near) */ { | 1716 | case 0xe8: /* call (near) */ { |
1683 | long int rel; | 1717 | long int rel; |
1684 | switch (c->op_bytes) { | 1718 | switch (c->op_bytes) { |
@@ -1729,6 +1763,22 @@ special_insn: | |||
1729 | jmp_rel(c, c->src.val); | 1763 | jmp_rel(c, c->src.val); |
1730 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1764 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1731 | break; | 1765 | break; |
1766 | case 0xec: /* in al,dx */ | ||
1767 | case 0xed: /* in (e/r)ax,dx */ | ||
1768 | port = c->regs[VCPU_REGS_RDX]; | ||
1769 | io_dir_in = 1; | ||
1770 | goto do_io; | ||
1771 | case 0xee: /* out al,dx */ | ||
1772 | case 0xef: /* out (e/r)ax,dx */ | ||
1773 | port = c->regs[VCPU_REGS_RDX]; | ||
1774 | io_dir_in = 0; | ||
1775 | do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, | ||
1776 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1777 | port) != 0) { | ||
1778 | c->eip = saved_eip; | ||
1779 | goto cannot_emulate; | ||
1780 | } | ||
1781 | return 0; | ||
1732 | case 0xf4: /* hlt */ | 1782 | case 0xf4: /* hlt */ |
1733 | ctxt->vcpu->arch.halt_request = 1; | 1783 | ctxt->vcpu->arch.halt_request = 1; |
1734 | break; | 1784 | break; |
@@ -1754,6 +1804,14 @@ special_insn: | |||
1754 | ctxt->eflags |= X86_EFLAGS_IF; | 1804 | ctxt->eflags |= X86_EFLAGS_IF; |
1755 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1805 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1756 | break; | 1806 | break; |
1807 | case 0xfc: /* cld */ | ||
1808 | ctxt->eflags &= ~EFLG_DF; | ||
1809 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1810 | break; | ||
1811 | case 0xfd: /* std */ | ||
1812 | ctxt->eflags |= EFLG_DF; | ||
1813 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1814 | break; | ||
1757 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | 1815 | case 0xfe ... 0xff: /* Grp4/Grp5 */ |
1758 | rc = emulate_grp45(ctxt, ops); | 1816 | rc = emulate_grp45(ctxt, ops); |
1759 | if (rc != 0) | 1817 | if (rc != 0) |
@@ -1768,7 +1826,7 @@ writeback: | |||
1768 | 1826 | ||
1769 | /* Commit shadow register state. */ | 1827 | /* Commit shadow register state. */ |
1770 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | 1828 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); |
1771 | ctxt->vcpu->arch.rip = c->eip; | 1829 | kvm_rip_write(ctxt->vcpu, c->eip); |
1772 | 1830 | ||
1773 | done: | 1831 | done: |
1774 | if (rc == X86EMUL_UNHANDLEABLE) { | 1832 | if (rc == X86EMUL_UNHANDLEABLE) { |
@@ -1793,7 +1851,7 @@ twobyte_insn: | |||
1793 | goto done; | 1851 | goto done; |
1794 | 1852 | ||
1795 | /* Let the processor re-execute the fixed hypercall */ | 1853 | /* Let the processor re-execute the fixed hypercall */ |
1796 | c->eip = ctxt->vcpu->arch.rip; | 1854 | c->eip = kvm_rip_read(ctxt->vcpu); |
1797 | /* Disable writeback. */ | 1855 | /* Disable writeback. */ |
1798 | c->dst.type = OP_NONE; | 1856 | c->dst.type = OP_NONE; |
1799 | break; | 1857 | break; |
@@ -1889,7 +1947,7 @@ twobyte_insn: | |||
1889 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | 1947 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); |
1890 | if (rc) { | 1948 | if (rc) { |
1891 | kvm_inject_gp(ctxt->vcpu, 0); | 1949 | kvm_inject_gp(ctxt->vcpu, 0); |
1892 | c->eip = ctxt->vcpu->arch.rip; | 1950 | c->eip = kvm_rip_read(ctxt->vcpu); |
1893 | } | 1951 | } |
1894 | rc = X86EMUL_CONTINUE; | 1952 | rc = X86EMUL_CONTINUE; |
1895 | c->dst.type = OP_NONE; | 1953 | c->dst.type = OP_NONE; |
@@ -1899,7 +1957,7 @@ twobyte_insn: | |||
1899 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | 1957 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); |
1900 | if (rc) { | 1958 | if (rc) { |
1901 | kvm_inject_gp(ctxt->vcpu, 0); | 1959 | kvm_inject_gp(ctxt->vcpu, 0); |
1902 | c->eip = ctxt->vcpu->arch.rip; | 1960 | c->eip = kvm_rip_read(ctxt->vcpu); |
1903 | } else { | 1961 | } else { |
1904 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 1962 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
1905 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 1963 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d9249a882aa5..65f0b8a47bed 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -55,6 +55,7 @@ | |||
55 | #include <linux/lguest_launcher.h> | 55 | #include <linux/lguest_launcher.h> |
56 | #include <linux/virtio_console.h> | 56 | #include <linux/virtio_console.h> |
57 | #include <linux/pm.h> | 57 | #include <linux/pm.h> |
58 | #include <asm/apic.h> | ||
58 | #include <asm/lguest.h> | 59 | #include <asm/lguest.h> |
59 | #include <asm/paravirt.h> | 60 | #include <asm/paravirt.h> |
60 | #include <asm/param.h> | 61 | #include <asm/param.h> |
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void) | |||
783 | * code qualifies for Advanced. It will also never interrupt anything. It | 784 | * code qualifies for Advanced. It will also never interrupt anything. It |
784 | * does, however, allow us to get through the Linux boot code. */ | 785 | * does, however, allow us to get through the Linux boot code. */ |
785 | #ifdef CONFIG_X86_LOCAL_APIC | 786 | #ifdef CONFIG_X86_LOCAL_APIC |
786 | static void lguest_apic_write(unsigned long reg, u32 v) | 787 | static void lguest_apic_write(u32 reg, u32 v) |
787 | { | 788 | { |
788 | } | 789 | } |
789 | 790 | ||
790 | static u32 lguest_apic_read(unsigned long reg) | 791 | static u32 lguest_apic_read(u32 reg) |
791 | { | 792 | { |
792 | return 0; | 793 | return 0; |
793 | } | 794 | } |
795 | |||
796 | static u64 lguest_apic_icr_read(void) | ||
797 | { | ||
798 | return 0; | ||
799 | } | ||
800 | |||
801 | static void lguest_apic_icr_write(u32 low, u32 id) | ||
802 | { | ||
803 | /* Warn to see if there's any stray references */ | ||
804 | WARN_ON(1); | ||
805 | } | ||
806 | |||
807 | static void lguest_apic_wait_icr_idle(void) | ||
808 | { | ||
809 | return; | ||
810 | } | ||
811 | |||
812 | static u32 lguest_apic_safe_wait_icr_idle(void) | ||
813 | { | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | static struct apic_ops lguest_basic_apic_ops = { | ||
818 | .read = lguest_apic_read, | ||
819 | .write = lguest_apic_write, | ||
820 | .icr_read = lguest_apic_icr_read, | ||
821 | .icr_write = lguest_apic_icr_write, | ||
822 | .wait_icr_idle = lguest_apic_wait_icr_idle, | ||
823 | .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle, | ||
824 | }; | ||
794 | #endif | 825 | #endif |
795 | 826 | ||
796 | /* STOP! Until an interrupt comes in. */ | 827 | /* STOP! Until an interrupt comes in. */ |
@@ -990,8 +1021,7 @@ __init void lguest_init(void) | |||
990 | 1021 | ||
991 | #ifdef CONFIG_X86_LOCAL_APIC | 1022 | #ifdef CONFIG_X86_LOCAL_APIC |
992 | /* apic read/write intercepts */ | 1023 | /* apic read/write intercepts */ |
993 | pv_apic_ops.apic_write = lguest_apic_write; | 1024 | apic_ops = &lguest_basic_apic_ops; |
994 | pv_apic_ops.apic_read = lguest_apic_read; | ||
995 | #endif | 1025 | #endif |
996 | 1026 | ||
997 | /* time operations */ | 1027 | /* time operations */ |
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index aa3fa4119424..55e11aa6d66c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile | |||
@@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y) | |||
17 | lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o | 17 | lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o |
18 | else | 18 | else |
19 | obj-y += io_64.o iomap_copy_64.o | 19 | obj-y += io_64.o iomap_copy_64.o |
20 | |||
21 | CFLAGS_csum-partial_64.o := -funroll-loops | ||
22 | |||
23 | lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o | 20 | lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o |
24 | lib-y += thunk_64.o clear_page_64.o copy_page_64.o | 21 | lib-y += thunk_64.o clear_page_64.o copy_page_64.o |
25 | lib-y += memmove_64.o memset_64.o | 22 | lib-y += memmove_64.o memset_64.o |
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index 01b868ba82f8..321cf720dbb6 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c | |||
@@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info) | |||
16 | rdmsr(rv->msr_no, rv->l, rv->h); | 16 | rdmsr(rv->msr_no, rv->l, rv->h); |
17 | } | 17 | } |
18 | 18 | ||
19 | static void __rdmsr_safe_on_cpu(void *info) | 19 | static void __wrmsr_on_cpu(void *info) |
20 | { | 20 | { |
21 | struct msr_info *rv = info; | 21 | struct msr_info *rv = info; |
22 | 22 | ||
23 | rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); | 23 | wrmsr(rv->msr_no, rv->l, rv->h); |
24 | } | 24 | } |
25 | 25 | ||
26 | static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) | 26 | int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) |
27 | { | 27 | { |
28 | int err = 0; | 28 | int err; |
29 | struct msr_info rv; | 29 | struct msr_info rv; |
30 | 30 | ||
31 | rv.msr_no = msr_no; | 31 | rv.msr_no = msr_no; |
32 | if (safe) { | 32 | err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); |
33 | err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, | ||
34 | &rv, 1); | ||
35 | err = err ? err : rv.err; | ||
36 | } else { | ||
37 | err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); | ||
38 | } | ||
39 | *l = rv.l; | 33 | *l = rv.l; |
40 | *h = rv.h; | 34 | *h = rv.h; |
41 | 35 | ||
42 | return err; | 36 | return err; |
43 | } | 37 | } |
44 | 38 | ||
45 | static void __wrmsr_on_cpu(void *info) | 39 | int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) |
40 | { | ||
41 | int err; | ||
42 | struct msr_info rv; | ||
43 | |||
44 | rv.msr_no = msr_no; | ||
45 | rv.l = l; | ||
46 | rv.h = h; | ||
47 | err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); | ||
48 | |||
49 | return err; | ||
50 | } | ||
51 | |||
52 | /* These "safe" variants are slower and should be used when the target MSR | ||
53 | may not actually exist. */ | ||
54 | static void __rdmsr_safe_on_cpu(void *info) | ||
46 | { | 55 | { |
47 | struct msr_info *rv = info; | 56 | struct msr_info *rv = info; |
48 | 57 | ||
49 | wrmsr(rv->msr_no, rv->l, rv->h); | 58 | rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); |
50 | } | 59 | } |
51 | 60 | ||
52 | static void __wrmsr_safe_on_cpu(void *info) | 61 | static void __wrmsr_safe_on_cpu(void *info) |
@@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info) | |||
56 | rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); | 65 | rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); |
57 | } | 66 | } |
58 | 67 | ||
59 | static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) | 68 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) |
60 | { | 69 | { |
61 | int err = 0; | 70 | int err; |
62 | struct msr_info rv; | 71 | struct msr_info rv; |
63 | 72 | ||
64 | rv.msr_no = msr_no; | 73 | rv.msr_no = msr_no; |
65 | rv.l = l; | 74 | err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); |
66 | rv.h = h; | 75 | *l = rv.l; |
67 | if (safe) { | 76 | *h = rv.h; |
68 | err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, | ||
69 | &rv, 1); | ||
70 | err = err ? err : rv.err; | ||
71 | } else { | ||
72 | err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); | ||
73 | } | ||
74 | |||
75 | return err; | ||
76 | } | ||
77 | 77 | ||
78 | int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | 78 | return err ? err : rv.err; |
79 | { | ||
80 | return _wrmsr_on_cpu(cpu, msr_no, l, h, 0); | ||
81 | } | 79 | } |
82 | 80 | ||
83 | int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | ||
84 | { | ||
85 | return _rdmsr_on_cpu(cpu, msr_no, l, h, 0); | ||
86 | } | ||
87 | |||
88 | /* These "safe" variants are slower and should be used when the target MSR | ||
89 | may not actually exist. */ | ||
90 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) | 81 | int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) |
91 | { | 82 | { |
92 | return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); | 83 | int err; |
93 | } | 84 | struct msr_info rv; |
94 | 85 | ||
95 | int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) | 86 | rv.msr_no = msr_no; |
96 | { | 87 | rv.l = l; |
97 | return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); | 88 | rv.h = h; |
89 | err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); | ||
90 | |||
91 | return err ? err : rv.err; | ||
98 | } | 92 | } |
99 | 93 | ||
100 | EXPORT_SYMBOL(rdmsr_on_cpu); | 94 | EXPORT_SYMBOL(rdmsr_on_cpu); |
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 94972e7c094d..82004d2bf05e 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c | |||
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src) | |||
22 | "testb %%al,%%al\n\t" | 22 | "testb %%al,%%al\n\t" |
23 | "jne 1b" | 23 | "jne 1b" |
24 | : "=&S" (d0), "=&D" (d1), "=&a" (d2) | 24 | : "=&S" (d0), "=&D" (d1), "=&a" (d2) |
25 | :"0" (src), "1" (dest) : "memory"); | 25 | : "0" (src), "1" (dest) : "memory"); |
26 | return dest; | 26 | return dest; |
27 | } | 27 | } |
28 | EXPORT_SYMBOL(strcpy); | 28 | EXPORT_SYMBOL(strcpy); |
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count) | |||
42 | "stosb\n" | 42 | "stosb\n" |
43 | "2:" | 43 | "2:" |
44 | : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) | 44 | : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) |
45 | :"0" (src), "1" (dest), "2" (count) : "memory"); | 45 | : "0" (src), "1" (dest), "2" (count) : "memory"); |
46 | return dest; | 46 | return dest; |
47 | } | 47 | } |
48 | EXPORT_SYMBOL(strncpy); | 48 | EXPORT_SYMBOL(strncpy); |
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src) | |||
60 | "testb %%al,%%al\n\t" | 60 | "testb %%al,%%al\n\t" |
61 | "jne 1b" | 61 | "jne 1b" |
62 | : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) | 62 | : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) |
63 | : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); | 63 | : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory"); |
64 | return dest; | 64 | return dest; |
65 | } | 65 | } |
66 | EXPORT_SYMBOL(strcat); | 66 | EXPORT_SYMBOL(strcat); |
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct) | |||
105 | "2:\tsbbl %%eax,%%eax\n\t" | 105 | "2:\tsbbl %%eax,%%eax\n\t" |
106 | "orb $1,%%al\n" | 106 | "orb $1,%%al\n" |
107 | "3:" | 107 | "3:" |
108 | :"=a" (res), "=&S" (d0), "=&D" (d1) | 108 | : "=a" (res), "=&S" (d0), "=&D" (d1) |
109 | :"1" (cs), "2" (ct) | 109 | : "1" (cs), "2" (ct) |
110 | :"memory"); | 110 | : "memory"); |
111 | return res; | 111 | return res; |
112 | } | 112 | } |
113 | EXPORT_SYMBOL(strcmp); | 113 | EXPORT_SYMBOL(strcmp); |
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count) | |||
130 | "3:\tsbbl %%eax,%%eax\n\t" | 130 | "3:\tsbbl %%eax,%%eax\n\t" |
131 | "orb $1,%%al\n" | 131 | "orb $1,%%al\n" |
132 | "4:" | 132 | "4:" |
133 | :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) | 133 | : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) |
134 | :"1" (cs), "2" (ct), "3" (count) | 134 | : "1" (cs), "2" (ct), "3" (count) |
135 | :"memory"); | 135 | : "memory"); |
136 | return res; | 136 | return res; |
137 | } | 137 | } |
138 | EXPORT_SYMBOL(strncmp); | 138 | EXPORT_SYMBOL(strncmp); |
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c) | |||
152 | "movl $1,%1\n" | 152 | "movl $1,%1\n" |
153 | "2:\tmovl %1,%0\n\t" | 153 | "2:\tmovl %1,%0\n\t" |
154 | "decl %0" | 154 | "decl %0" |
155 | :"=a" (res), "=&S" (d0) | 155 | : "=a" (res), "=&S" (d0) |
156 | :"1" (s), "0" (c) | 156 | : "1" (s), "0" (c) |
157 | :"memory"); | 157 | : "memory"); |
158 | return res; | 158 | return res; |
159 | } | 159 | } |
160 | EXPORT_SYMBOL(strchr); | 160 | EXPORT_SYMBOL(strchr); |
@@ -169,9 +169,9 @@ size_t strlen(const char *s) | |||
169 | "scasb\n\t" | 169 | "scasb\n\t" |
170 | "notl %0\n\t" | 170 | "notl %0\n\t" |
171 | "decl %0" | 171 | "decl %0" |
172 | :"=c" (res), "=&D" (d0) | 172 | : "=c" (res), "=&D" (d0) |
173 | :"1" (s), "a" (0), "0" (0xffffffffu) | 173 | : "1" (s), "a" (0), "0" (0xffffffffu) |
174 | :"memory"); | 174 | : "memory"); |
175 | return res; | 175 | return res; |
176 | } | 176 | } |
177 | EXPORT_SYMBOL(strlen); | 177 | EXPORT_SYMBOL(strlen); |
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count) | |||
189 | "je 1f\n\t" | 189 | "je 1f\n\t" |
190 | "movl $1,%0\n" | 190 | "movl $1,%0\n" |
191 | "1:\tdecl %0" | 191 | "1:\tdecl %0" |
192 | :"=D" (res), "=&c" (d0) | 192 | : "=D" (res), "=&c" (d0) |
193 | :"a" (c), "0" (cs), "1" (count) | 193 | : "a" (c), "0" (cs), "1" (count) |
194 | :"memory"); | 194 | : "memory"); |
195 | return res; | 195 | return res; |
196 | } | 196 | } |
197 | EXPORT_SYMBOL(memchr); | 197 | EXPORT_SYMBOL(memchr); |
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count) | |||
228 | "cmpl $-1,%1\n\t" | 228 | "cmpl $-1,%1\n\t" |
229 | "jne 1b\n" | 229 | "jne 1b\n" |
230 | "3:\tsubl %2,%0" | 230 | "3:\tsubl %2,%0" |
231 | :"=a" (res), "=&d" (d0) | 231 | : "=a" (res), "=&d" (d0) |
232 | :"c" (s), "1" (count) | 232 | : "c" (s), "1" (count) |
233 | :"memory"); | 233 | : "memory"); |
234 | return res; | 234 | return res; |
235 | } | 235 | } |
236 | EXPORT_SYMBOL(strnlen); | 236 | EXPORT_SYMBOL(strnlen); |
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index 42e8a50303f3..8e2d55f754bf 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c | |||
@@ -23,9 +23,9 @@ __asm__ __volatile__( | |||
23 | "jne 1b\n\t" | 23 | "jne 1b\n\t" |
24 | "xorl %%eax,%%eax\n\t" | 24 | "xorl %%eax,%%eax\n\t" |
25 | "2:" | 25 | "2:" |
26 | :"=a" (__res), "=&c" (d0), "=&S" (d1) | 26 | : "=a" (__res), "=&c" (d0), "=&S" (d1) |
27 | :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) | 27 | : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) |
28 | :"dx", "di"); | 28 | : "dx", "di"); |
29 | return __res; | 29 | return __res; |
30 | } | 30 | } |
31 | 31 | ||
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 24e60944971a..9e68075544f6 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c | |||
@@ -14,6 +14,13 @@ | |||
14 | #include <asm/uaccess.h> | 14 | #include <asm/uaccess.h> |
15 | #include <asm/mmx.h> | 15 | #include <asm/mmx.h> |
16 | 16 | ||
17 | #ifdef CONFIG_X86_INTEL_USERCOPY | ||
18 | /* | ||
19 | * Alignment at which movsl is preferred for bulk memory copies. | ||
20 | */ | ||
21 | struct movsl_mask movsl_mask __read_mostly; | ||
22 | #endif | ||
23 | |||
17 | static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) | 24 | static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) |
18 | { | 25 | { |
19 | #ifdef CONFIG_X86_INTEL_USERCOPY | 26 | #ifdef CONFIG_X86_INTEL_USERCOPY |
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 3d317836be9e..37b9ae4d44c5 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c | |||
@@ -10,13 +10,15 @@ | |||
10 | #include <asm/e820.h> | 10 | #include <asm/e820.h> |
11 | #include <asm/setup.h> | 11 | #include <asm/setup.h> |
12 | 12 | ||
13 | #include <mach_ipi.h> | ||
14 | |||
13 | #ifdef CONFIG_HOTPLUG_CPU | 15 | #ifdef CONFIG_HOTPLUG_CPU |
14 | #define DEFAULT_SEND_IPI (1) | 16 | #define DEFAULT_SEND_IPI (1) |
15 | #else | 17 | #else |
16 | #define DEFAULT_SEND_IPI (0) | 18 | #define DEFAULT_SEND_IPI (0) |
17 | #endif | 19 | #endif |
18 | 20 | ||
19 | int no_broadcast=DEFAULT_SEND_IPI; | 21 | int no_broadcast = DEFAULT_SEND_IPI; |
20 | 22 | ||
21 | /** | 23 | /** |
22 | * pre_intr_init_hook - initialisation prior to setting up interrupt vectors | 24 | * pre_intr_init_hook - initialisation prior to setting up interrupt vectors |
@@ -36,15 +38,6 @@ void __init pre_intr_init_hook(void) | |||
36 | init_ISA_irqs(); | 38 | init_ISA_irqs(); |
37 | } | 39 | } |
38 | 40 | ||
39 | /* | ||
40 | * IRQ2 is cascade interrupt to second interrupt controller | ||
41 | */ | ||
42 | static struct irqaction irq2 = { | ||
43 | .handler = no_action, | ||
44 | .mask = CPU_MASK_NONE, | ||
45 | .name = "cascade", | ||
46 | }; | ||
47 | |||
48 | /** | 41 | /** |
49 | * intr_init_hook - post gate setup interrupt initialisation | 42 | * intr_init_hook - post gate setup interrupt initialisation |
50 | * | 43 | * |
@@ -60,12 +53,6 @@ void __init intr_init_hook(void) | |||
60 | if (x86_quirks->arch_intr_init()) | 53 | if (x86_quirks->arch_intr_init()) |
61 | return; | 54 | return; |
62 | } | 55 | } |
63 | #ifdef CONFIG_X86_LOCAL_APIC | ||
64 | apic_intr_init(); | ||
65 | #endif | ||
66 | |||
67 | if (!acpi_ioapic) | ||
68 | setup_irq(2, &irq2); | ||
69 | } | 56 | } |
70 | 57 | ||
71 | /** | 58 | /** |
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile deleted file mode 100644 index 3ef8b43b62fc..000000000000 --- a/arch/x86/mach-es7000/Makefile +++ /dev/null | |||
@@ -1,5 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_X86_ES7000) := es7000plat.o | ||
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h deleted file mode 100644 index c8d5aa132fa0..000000000000 --- a/arch/x86/mach-es7000/es7000.h +++ /dev/null | |||
@@ -1,114 +0,0 @@ | |||
1 | /* | ||
2 | * Written by: Garry Forsgren, Unisys Corporation | ||
3 | * Natalie Protasevich, Unisys Corporation | ||
4 | * This file contains the code to configure and interface | ||
5 | * with Unisys ES7000 series hardware system manager. | ||
6 | * | ||
7 | * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify it | ||
10 | * under the terms of version 2 of the GNU General Public License as | ||
11 | * published by the Free Software Foundation. | ||
12 | * | ||
13 | * This program is distributed in the hope that it would be useful, but | ||
14 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License along | ||
18 | * with this program; if not, write the Free Software Foundation, Inc., 59 | ||
19 | * Temple Place - Suite 330, Boston MA 02111-1307, USA. | ||
20 | * | ||
21 | * Contact information: Unisys Corporation, Township Line & Union Meeting | ||
22 | * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: | ||
23 | * | ||
24 | * http://www.unisys.com | ||
25 | */ | ||
26 | |||
27 | /* | ||
28 | * ES7000 chipsets | ||
29 | */ | ||
30 | |||
31 | #define NON_UNISYS 0 | ||
32 | #define ES7000_CLASSIC 1 | ||
33 | #define ES7000_ZORRO 2 | ||
34 | |||
35 | |||
36 | #define MIP_REG 1 | ||
37 | #define MIP_PSAI_REG 4 | ||
38 | |||
39 | #define MIP_BUSY 1 | ||
40 | #define MIP_SPIN 0xf0000 | ||
41 | #define MIP_VALID 0x0100000000000000ULL | ||
42 | #define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) | ||
43 | |||
44 | #define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) | ||
45 | |||
46 | struct mip_reg_info { | ||
47 | unsigned long long mip_info; | ||
48 | unsigned long long delivery_info; | ||
49 | unsigned long long host_reg; | ||
50 | unsigned long long mip_reg; | ||
51 | }; | ||
52 | |||
53 | struct part_info { | ||
54 | unsigned char type; | ||
55 | unsigned char length; | ||
56 | unsigned char part_id; | ||
57 | unsigned char apic_mode; | ||
58 | unsigned long snum; | ||
59 | char ptype[16]; | ||
60 | char sname[64]; | ||
61 | char pname[64]; | ||
62 | }; | ||
63 | |||
64 | struct psai { | ||
65 | unsigned long long entry_type; | ||
66 | unsigned long long addr; | ||
67 | unsigned long long bep_addr; | ||
68 | }; | ||
69 | |||
70 | struct es7000_mem_info { | ||
71 | unsigned char type; | ||
72 | unsigned char length; | ||
73 | unsigned char resv[6]; | ||
74 | unsigned long long start; | ||
75 | unsigned long long size; | ||
76 | }; | ||
77 | |||
78 | struct es7000_oem_table { | ||
79 | unsigned long long hdr; | ||
80 | struct mip_reg_info mip; | ||
81 | struct part_info pif; | ||
82 | struct es7000_mem_info shm; | ||
83 | struct psai psai; | ||
84 | }; | ||
85 | |||
86 | #ifdef CONFIG_ACPI | ||
87 | |||
88 | struct oem_table { | ||
89 | struct acpi_table_header Header; | ||
90 | u32 OEMTableAddr; | ||
91 | u32 OEMTableSize; | ||
92 | }; | ||
93 | |||
94 | extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); | ||
95 | #endif | ||
96 | |||
97 | struct mip_reg { | ||
98 | unsigned long long off_0; | ||
99 | unsigned long long off_8; | ||
100 | unsigned long long off_10; | ||
101 | unsigned long long off_18; | ||
102 | unsigned long long off_20; | ||
103 | unsigned long long off_28; | ||
104 | unsigned long long off_30; | ||
105 | unsigned long long off_38; | ||
106 | }; | ||
107 | |||
108 | #define MIP_SW_APIC 0x1020b | ||
109 | #define MIP_FUNC(VALUE) (VALUE & 0xff) | ||
110 | |||
111 | extern int parse_unisys_oem (char *oemptr); | ||
112 | extern void setup_unisys(void); | ||
113 | extern int es7000_start_cpu(int cpu, unsigned long eip); | ||
114 | extern void es7000_sw_apic(void); | ||
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile index 0dbd7803a1d5..6730f4e7c744 100644 --- a/arch/x86/mach-generic/Makefile +++ b/arch/x86/mach-generic/Makefile | |||
@@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o | |||
9 | obj-$(CONFIG_X86_SUMMIT) += summit.o | 9 | obj-$(CONFIG_X86_SUMMIT) += summit.o |
10 | obj-$(CONFIG_X86_BIGSMP) += bigsmp.o | 10 | obj-$(CONFIG_X86_BIGSMP) += bigsmp.o |
11 | obj-$(CONFIG_X86_ES7000) += es7000.o | 11 | obj-$(CONFIG_X86_ES7000) += es7000.o |
12 | obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/ | ||
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 59d771714559..df37fc9d6a26 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c | |||
@@ -5,18 +5,17 @@ | |||
5 | #define APIC_DEFINITION 1 | 5 | #define APIC_DEFINITION 1 |
6 | #include <linux/threads.h> | 6 | #include <linux/threads.h> |
7 | #include <linux/cpumask.h> | 7 | #include <linux/cpumask.h> |
8 | #include <asm/smp.h> | ||
9 | #include <asm/mpspec.h> | 8 | #include <asm/mpspec.h> |
10 | #include <asm/genapic.h> | 9 | #include <asm/genapic.h> |
11 | #include <asm/fixmap.h> | 10 | #include <asm/fixmap.h> |
12 | #include <asm/apicdef.h> | 11 | #include <asm/apicdef.h> |
13 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
14 | #include <linux/smp.h> | ||
15 | #include <linux/init.h> | 13 | #include <linux/init.h> |
16 | #include <linux/dmi.h> | 14 | #include <linux/dmi.h> |
17 | #include <asm/mach-bigsmp/mach_apic.h> | 15 | #include <asm/bigsmp/apicdef.h> |
18 | #include <asm/mach-bigsmp/mach_apicdef.h> | 16 | #include <linux/smp.h> |
19 | #include <asm/mach-bigsmp/mach_ipi.h> | 17 | #include <asm/bigsmp/apic.h> |
18 | #include <asm/bigsmp/ipi.h> | ||
20 | #include <asm/mach-default/mach_mpparse.h> | 19 | #include <asm/mach-default/mach_mpparse.h> |
21 | 20 | ||
22 | static int dmi_bigsmp; /* can be set by dmi scanners */ | 21 | static int dmi_bigsmp; /* can be set by dmi scanners */ |
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 4742626f08c4..6513d41ea21e 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c | |||
@@ -4,20 +4,19 @@ | |||
4 | #define APIC_DEFINITION 1 | 4 | #define APIC_DEFINITION 1 |
5 | #include <linux/threads.h> | 5 | #include <linux/threads.h> |
6 | #include <linux/cpumask.h> | 6 | #include <linux/cpumask.h> |
7 | #include <asm/smp.h> | ||
8 | #include <asm/mpspec.h> | 7 | #include <asm/mpspec.h> |
9 | #include <asm/genapic.h> | 8 | #include <asm/genapic.h> |
10 | #include <asm/fixmap.h> | 9 | #include <asm/fixmap.h> |
11 | #include <asm/apicdef.h> | 10 | #include <asm/apicdef.h> |
12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
13 | #include <linux/string.h> | 12 | #include <linux/string.h> |
14 | #include <linux/smp.h> | ||
15 | #include <linux/init.h> | 13 | #include <linux/init.h> |
16 | #include <asm/mach-es7000/mach_apicdef.h> | 14 | #include <asm/es7000/apicdef.h> |
17 | #include <asm/mach-es7000/mach_apic.h> | 15 | #include <linux/smp.h> |
18 | #include <asm/mach-es7000/mach_ipi.h> | 16 | #include <asm/es7000/apic.h> |
19 | #include <asm/mach-es7000/mach_mpparse.h> | 17 | #include <asm/es7000/ipi.h> |
20 | #include <asm/mach-es7000/mach_wakecpu.h> | 18 | #include <asm/es7000/mpparse.h> |
19 | #include <asm/es7000/wakecpu.h> | ||
21 | 20 | ||
22 | static int probe_es7000(void) | 21 | static int probe_es7000(void) |
23 | { | 22 | { |
@@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, | |||
48 | /* Hook from generic ACPI tables.c */ | 47 | /* Hook from generic ACPI tables.c */ |
49 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 48 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
50 | { | 49 | { |
51 | unsigned long oem_addr; | 50 | unsigned long oem_addr = 0; |
51 | int check_dsdt; | ||
52 | int ret = 0; | ||
53 | |||
54 | /* check dsdt at first to avoid clear fix_map for oem_addr */ | ||
55 | check_dsdt = es7000_check_dsdt(); | ||
56 | |||
52 | if (!find_unisys_acpi_oem_table(&oem_addr)) { | 57 | if (!find_unisys_acpi_oem_table(&oem_addr)) { |
53 | if (es7000_check_dsdt()) | 58 | if (check_dsdt) |
54 | return parse_unisys_oem((char *)oem_addr); | 59 | ret = parse_unisys_oem((char *)oem_addr); |
55 | else { | 60 | else { |
56 | setup_unisys(); | 61 | setup_unisys(); |
57 | return 1; | 62 | ret = 1; |
58 | } | 63 | } |
64 | /* | ||
65 | * we need to unmap it | ||
66 | */ | ||
67 | unmap_unisys_acpi_oem_table(oem_addr); | ||
59 | } | 68 | } |
60 | return 0; | 69 | return ret; |
61 | } | 70 | } |
62 | #else | 71 | #else |
63 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 72 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8091e68764c4..8cf58394975e 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c | |||
@@ -4,7 +4,6 @@ | |||
4 | #define APIC_DEFINITION 1 | 4 | #define APIC_DEFINITION 1 |
5 | #include <linux/threads.h> | 5 | #include <linux/threads.h> |
6 | #include <linux/cpumask.h> | 6 | #include <linux/cpumask.h> |
7 | #include <linux/smp.h> | ||
8 | #include <asm/mpspec.h> | 7 | #include <asm/mpspec.h> |
9 | #include <asm/genapic.h> | 8 | #include <asm/genapic.h> |
10 | #include <asm/fixmap.h> | 9 | #include <asm/fixmap.h> |
@@ -12,11 +11,12 @@ | |||
12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
13 | #include <linux/string.h> | 12 | #include <linux/string.h> |
14 | #include <linux/init.h> | 13 | #include <linux/init.h> |
15 | #include <asm/mach-numaq/mach_apic.h> | 14 | #include <asm/numaq/apicdef.h> |
16 | #include <asm/mach-numaq/mach_apicdef.h> | 15 | #include <linux/smp.h> |
17 | #include <asm/mach-numaq/mach_ipi.h> | 16 | #include <asm/numaq/apic.h> |
18 | #include <asm/mach-numaq/mach_mpparse.h> | 17 | #include <asm/numaq/ipi.h> |
19 | #include <asm/mach-numaq/mach_wakecpu.h> | 18 | #include <asm/numaq/mpparse.h> |
19 | #include <asm/numaq/wakecpu.h> | ||
20 | #include <asm/numaq.h> | 20 | #include <asm/numaq.h> |
21 | 21 | ||
22 | static int mps_oem_check(struct mp_config_table *mpc, char *oem, | 22 | static int mps_oem_check(struct mp_config_table *mpc, char *oem, |
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index a97ea0f35b1e..6ad6b67a723d 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c | |||
@@ -4,19 +4,18 @@ | |||
4 | #define APIC_DEFINITION 1 | 4 | #define APIC_DEFINITION 1 |
5 | #include <linux/threads.h> | 5 | #include <linux/threads.h> |
6 | #include <linux/cpumask.h> | 6 | #include <linux/cpumask.h> |
7 | #include <asm/smp.h> | ||
8 | #include <asm/mpspec.h> | 7 | #include <asm/mpspec.h> |
9 | #include <asm/genapic.h> | 8 | #include <asm/genapic.h> |
10 | #include <asm/fixmap.h> | 9 | #include <asm/fixmap.h> |
11 | #include <asm/apicdef.h> | 10 | #include <asm/apicdef.h> |
12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
13 | #include <linux/string.h> | 12 | #include <linux/string.h> |
14 | #include <linux/smp.h> | ||
15 | #include <linux/init.h> | 13 | #include <linux/init.h> |
16 | #include <asm/mach-summit/mach_apic.h> | 14 | #include <asm/summit/apicdef.h> |
17 | #include <asm/mach-summit/mach_apicdef.h> | 15 | #include <linux/smp.h> |
18 | #include <asm/mach-summit/mach_ipi.h> | 16 | #include <asm/summit/apic.h> |
19 | #include <asm/mach-summit/mach_mpparse.h> | 17 | #include <asm/summit/ipi.h> |
18 | #include <asm/summit/mpparse.h> | ||
20 | 19 | ||
21 | static int probe_summit(void) | 20 | static int probe_summit(void) |
22 | { | 21 | { |
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index ee0fba092157..199a5f4a873c 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c | |||
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused) | |||
448 | 448 | ||
449 | VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); | 449 | VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); |
450 | 450 | ||
451 | notify_cpu_starting(cpuid); | ||
452 | |||
451 | /* enable interrupts */ | 453 | /* enable interrupts */ |
452 | local_irq_enable(); | 454 | local_irq_enable(); |
453 | 455 | ||
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index dfb932dcf136..59f89b434b45 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o | |||
13 | mmiotrace-y := pf_in.o mmio-mod.o | 13 | mmiotrace-y := pf_in.o mmio-mod.o |
14 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | 14 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o |
15 | 15 | ||
16 | ifeq ($(CONFIG_X86_32),y) | 16 | obj-$(CONFIG_NUMA) += numa_$(BITS).o |
17 | obj-$(CONFIG_NUMA) += discontig_32.o | ||
18 | else | ||
19 | obj-$(CONFIG_NUMA) += numa_64.o | ||
20 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o | 17 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o |
21 | endif | ||
22 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 18 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o |
23 | 19 | ||
24 | obj-$(CONFIG_MEMTEST) += memtest.o | 20 | obj-$(CONFIG_MEMTEST) += memtest.o |
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a20d1fa64b4e..e7277cbcfb40 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c | |||
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, | |||
148 | * we have now. "break" is either changing perms, levels or | 148 | * we have now. "break" is either changing perms, levels or |
149 | * address space marker. | 149 | * address space marker. |
150 | */ | 150 | */ |
151 | prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK); | 151 | prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; |
152 | cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK); | 152 | cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; |
153 | 153 | ||
154 | if (!st->level) { | 154 | if (!st->level) { |
155 | /* First entry */ | 155 | /* First entry */ |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe67b42..31e8730fa246 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
36 | #include <asm/proto.h> | 36 | #include <asm/proto.h> |
37 | #include <asm-generic/sections.h> | 37 | #include <asm-generic/sections.h> |
38 | #include <asm/traps.h> | ||
38 | 39 | ||
39 | /* | 40 | /* |
40 | * Page fault error code bits | 41 | * Page fault error code bits |
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) | |||
357 | return 0; | 358 | return 0; |
358 | } | 359 | } |
359 | 360 | ||
360 | void do_invalid_op(struct pt_regs *, unsigned long); | ||
361 | |||
362 | static int is_f00f_bug(struct pt_regs *regs, unsigned long address) | 361 | static int is_f00f_bug(struct pt_regs *regs, unsigned long address) |
363 | { | 362 | { |
364 | #ifdef CONFIG_X86_F00F_BUG | 363 | #ifdef CONFIG_X86_F00F_BUG |
@@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
593 | unsigned long flags; | 592 | unsigned long flags; |
594 | #endif | 593 | #endif |
595 | 594 | ||
596 | /* | ||
597 | * We can fault from pretty much anywhere, with unknown IRQ state. | ||
598 | */ | ||
599 | trace_hardirqs_fixup(); | ||
600 | |||
601 | tsk = current; | 595 | tsk = current; |
602 | mm = tsk->mm; | 596 | mm = tsk->mm; |
603 | prefetchw(&mm->mmap_sem); | 597 | prefetchw(&mm->mmap_sem); |
@@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
646 | } | 640 | } |
647 | 641 | ||
648 | 642 | ||
649 | #ifdef CONFIG_X86_32 | ||
650 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
651 | fault has been handled. */ | ||
652 | if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) | ||
653 | local_irq_enable(); | ||
654 | |||
655 | /* | 643 | /* |
656 | * If we're in an interrupt, have no user context or are running in an | 644 | * It's safe to allow irq's after cr2 has been saved and the |
657 | * atomic region then we must not take the fault. | 645 | * vmalloc fault has been handled. |
646 | * | ||
647 | * User-mode registers count as a user access even for any | ||
648 | * potential system fault or CPU buglet. | ||
658 | */ | 649 | */ |
659 | if (in_atomic() || !mm) | 650 | if (user_mode_vm(regs)) { |
660 | goto bad_area_nosemaphore; | 651 | local_irq_enable(); |
661 | #else /* CONFIG_X86_64 */ | 652 | error_code |= PF_USER; |
662 | if (likely(regs->flags & X86_EFLAGS_IF)) | 653 | } else if (regs->flags & X86_EFLAGS_IF) |
663 | local_irq_enable(); | 654 | local_irq_enable(); |
664 | 655 | ||
656 | #ifdef CONFIG_X86_64 | ||
665 | if (unlikely(error_code & PF_RSVD)) | 657 | if (unlikely(error_code & PF_RSVD)) |
666 | pgtable_bad(address, regs, error_code); | 658 | pgtable_bad(address, regs, error_code); |
659 | #endif | ||
667 | 660 | ||
668 | /* | 661 | /* |
669 | * If we're in an interrupt, have no user context or are running in an | 662 | * If we're in an interrupt, have no user context or are running in an |
@@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
672 | if (unlikely(in_atomic() || !mm)) | 665 | if (unlikely(in_atomic() || !mm)) |
673 | goto bad_area_nosemaphore; | 666 | goto bad_area_nosemaphore; |
674 | 667 | ||
675 | /* | ||
676 | * User-mode registers count as a user access even for any | ||
677 | * potential system fault or CPU buglet. | ||
678 | */ | ||
679 | if (user_mode_vm(regs)) | ||
680 | error_code |= PF_USER; | ||
681 | again: | 668 | again: |
682 | #endif | 669 | /* |
683 | /* When running in the kernel we expect faults to occur only to | 670 | * When running in the kernel we expect faults to occur only to |
684 | * addresses in user space. All other faults represent errors in the | 671 | * addresses in user space. All other faults represent errors in the |
685 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 672 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
686 | * erroneous fault occurring in a code path which already holds mmap_sem | 673 | * erroneous fault occurring in a code path which already holds mmap_sem |
@@ -743,9 +730,6 @@ good_area: | |||
743 | goto bad_area; | 730 | goto bad_area; |
744 | } | 731 | } |
745 | 732 | ||
746 | #ifdef CONFIG_X86_32 | ||
747 | survive: | ||
748 | #endif | ||
749 | /* | 733 | /* |
750 | * If for any reason at all we couldn't handle the fault, | 734 | * If for any reason at all we couldn't handle the fault, |
751 | * make sure we exit gracefully rather than endlessly redo | 735 | * make sure we exit gracefully rather than endlessly redo |
@@ -880,12 +864,11 @@ out_of_memory: | |||
880 | up_read(&mm->mmap_sem); | 864 | up_read(&mm->mmap_sem); |
881 | if (is_global_init(tsk)) { | 865 | if (is_global_init(tsk)) { |
882 | yield(); | 866 | yield(); |
883 | #ifdef CONFIG_X86_32 | 867 | /* |
884 | down_read(&mm->mmap_sem); | 868 | * Re-lookup the vma - in theory the vma tree might |
885 | goto survive; | 869 | * have changed: |
886 | #else | 870 | */ |
887 | goto again; | 871 | goto again; |
888 | #endif | ||
889 | } | 872 | } |
890 | 873 | ||
891 | printk("VM: killing process %s\n", tsk->comm); | 874 | printk("VM: killing process %s\n", tsk->comm); |
@@ -915,15 +898,15 @@ LIST_HEAD(pgd_list); | |||
915 | 898 | ||
916 | void vmalloc_sync_all(void) | 899 | void vmalloc_sync_all(void) |
917 | { | 900 | { |
918 | #ifdef CONFIG_X86_32 | ||
919 | unsigned long start = VMALLOC_START & PGDIR_MASK; | ||
920 | unsigned long address; | 901 | unsigned long address; |
921 | 902 | ||
903 | #ifdef CONFIG_X86_32 | ||
922 | if (SHARED_KERNEL_PMD) | 904 | if (SHARED_KERNEL_PMD) |
923 | return; | 905 | return; |
924 | 906 | ||
925 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); | 907 | for (address = VMALLOC_START & PMD_MASK; |
926 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { | 908 | address >= TASK_SIZE && address < FIXADDR_TOP; |
909 | address += PMD_SIZE) { | ||
927 | unsigned long flags; | 910 | unsigned long flags; |
928 | struct page *page; | 911 | struct page *page; |
929 | 912 | ||
@@ -936,10 +919,8 @@ void vmalloc_sync_all(void) | |||
936 | spin_unlock_irqrestore(&pgd_lock, flags); | 919 | spin_unlock_irqrestore(&pgd_lock, flags); |
937 | } | 920 | } |
938 | #else /* CONFIG_X86_64 */ | 921 | #else /* CONFIG_X86_64 */ |
939 | unsigned long start = VMALLOC_START & PGDIR_MASK; | 922 | for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; |
940 | unsigned long address; | 923 | address += PGDIR_SIZE) { |
941 | |||
942 | for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { | ||
943 | const pgd_t *pgd_ref = pgd_offset_k(address); | 924 | const pgd_t *pgd_ref = pgd_offset_k(address); |
944 | unsigned long flags; | 925 | unsigned long flags; |
945 | struct page *page; | 926 | struct page *page; |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 007bb06c7504..4ba373c5b8c8 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
82 | pte_t pte = gup_get_pte(ptep); | 82 | pte_t pte = gup_get_pte(ptep); |
83 | struct page *page; | 83 | struct page *page; |
84 | 84 | ||
85 | if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { | 85 | if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { |
86 | pte_unmap(ptep); | 86 | pte_unmap(ptep); |
87 | return 0; | 87 | return 0; |
88 | } | 88 | } |
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | |||
116 | mask = _PAGE_PRESENT|_PAGE_USER; | 116 | mask = _PAGE_PRESENT|_PAGE_USER; |
117 | if (write) | 117 | if (write) |
118 | mask |= _PAGE_RW; | 118 | mask |= _PAGE_RW; |
119 | if ((pte_val(pte) & mask) != mask) | 119 | if ((pte_flags(pte) & mask) != mask) |
120 | return 0; | 120 | return 0; |
121 | /* hugepages are never "special" */ | 121 | /* hugepages are never "special" */ |
122 | VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); | 122 | VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); |
123 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 123 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
124 | 124 | ||
125 | refs = 0; | 125 | refs = 0; |
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, | |||
173 | mask = _PAGE_PRESENT|_PAGE_USER; | 173 | mask = _PAGE_PRESENT|_PAGE_USER; |
174 | if (write) | 174 | if (write) |
175 | mask |= _PAGE_RW; | 175 | mask |= _PAGE_RW; |
176 | if ((pte_val(pte) & mask) != mask) | 176 | if ((pte_flags(pte) & mask) != mask) |
177 | return 0; | 177 | return 0; |
178 | /* hugepages are never "special" */ | 178 | /* hugepages are never "special" */ |
179 | VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); | 179 | VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); |
180 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 180 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
181 | 181 | ||
182 | refs = 0; | 182 | refs = 0; |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..8396868e82c5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/cpumask.h> | 31 | #include <linux/cpumask.h> |
32 | 32 | ||
33 | #include <asm/asm.h> | 33 | #include <asm/asm.h> |
34 | #include <asm/bios_ebda.h> | ||
34 | #include <asm/processor.h> | 35 | #include <asm/processor.h> |
35 | #include <asm/system.h> | 36 | #include <asm/system.h> |
36 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <asm/paravirt.h> | 48 | #include <asm/paravirt.h> |
48 | #include <asm/setup.h> | 49 | #include <asm/setup.h> |
49 | #include <asm/cacheflush.h> | 50 | #include <asm/cacheflush.h> |
51 | #include <asm/smp.h> | ||
50 | 52 | ||
51 | unsigned int __VMALLOC_RESERVE = 128 << 20; | 53 | unsigned int __VMALLOC_RESERVE = 128 << 20; |
52 | 54 | ||
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | |||
194 | pgd_t *pgd; | 196 | pgd_t *pgd; |
195 | pmd_t *pmd; | 197 | pmd_t *pmd; |
196 | pte_t *pte; | 198 | pte_t *pte; |
197 | unsigned pages_2m = 0, pages_4k = 0; | 199 | unsigned pages_2m, pages_4k; |
200 | int mapping_iter; | ||
201 | |||
202 | /* | ||
203 | * First iteration will setup identity mapping using large/small pages | ||
204 | * based on use_pse, with other attributes same as set by | ||
205 | * the early code in head_32.S | ||
206 | * | ||
207 | * Second iteration will setup the appropriate attributes (NX, GLOBAL..) | ||
208 | * as desired for the kernel identity mapping. | ||
209 | * | ||
210 | * This two pass mechanism conforms to the TLB app note which says: | ||
211 | * | ||
212 | * "Software should not write to a paging-structure entry in a way | ||
213 | * that would change, for any linear address, both the page size | ||
214 | * and either the page frame or attributes." | ||
215 | */ | ||
216 | mapping_iter = 1; | ||
198 | 217 | ||
199 | if (!cpu_has_pse) | 218 | if (!cpu_has_pse) |
200 | use_pse = 0; | 219 | use_pse = 0; |
201 | 220 | ||
221 | repeat: | ||
222 | pages_2m = pages_4k = 0; | ||
202 | pfn = start_pfn; | 223 | pfn = start_pfn; |
203 | pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); | 224 | pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); |
204 | pgd = pgd_base + pgd_idx; | 225 | pgd = pgd_base + pgd_idx; |
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | |||
224 | if (use_pse) { | 245 | if (use_pse) { |
225 | unsigned int addr2; | 246 | unsigned int addr2; |
226 | pgprot_t prot = PAGE_KERNEL_LARGE; | 247 | pgprot_t prot = PAGE_KERNEL_LARGE; |
248 | /* | ||
249 | * first pass will use the same initial | ||
250 | * identity mapping attribute + _PAGE_PSE. | ||
251 | */ | ||
252 | pgprot_t init_prot = | ||
253 | __pgprot(PTE_IDENT_ATTR | | ||
254 | _PAGE_PSE); | ||
227 | 255 | ||
228 | addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + | 256 | addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + |
229 | PAGE_OFFSET + PAGE_SIZE-1; | 257 | PAGE_OFFSET + PAGE_SIZE-1; |
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | |||
233 | prot = PAGE_KERNEL_LARGE_EXEC; | 261 | prot = PAGE_KERNEL_LARGE_EXEC; |
234 | 262 | ||
235 | pages_2m++; | 263 | pages_2m++; |
236 | set_pmd(pmd, pfn_pmd(pfn, prot)); | 264 | if (mapping_iter == 1) |
265 | set_pmd(pmd, pfn_pmd(pfn, init_prot)); | ||
266 | else | ||
267 | set_pmd(pmd, pfn_pmd(pfn, prot)); | ||
237 | 268 | ||
238 | pfn += PTRS_PER_PTE; | 269 | pfn += PTRS_PER_PTE; |
239 | continue; | 270 | continue; |
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, | |||
245 | for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; | 276 | for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; |
246 | pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { | 277 | pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { |
247 | pgprot_t prot = PAGE_KERNEL; | 278 | pgprot_t prot = PAGE_KERNEL; |
279 | /* | ||
280 | * first pass will use the same initial | ||
281 | * identity mapping attribute. | ||
282 | */ | ||
283 | pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); | ||
248 | 284 | ||
249 | if (is_kernel_text(addr)) | 285 | if (is_kernel_text(addr)) |
250 | prot = PAGE_KERNEL_EXEC; | 286 | prot = PAGE_KERNEL_EXEC; |
251 | 287 | ||
252 | pages_4k++; | 288 | pages_4k++; |
253 | set_pte(pte, pfn_pte(pfn, prot)); | 289 | if (mapping_iter == 1) |
290 | set_pte(pte, pfn_pte(pfn, init_prot)); | ||
291 | else | ||
292 | set_pte(pte, pfn_pte(pfn, prot)); | ||
254 | } | 293 | } |
255 | } | 294 | } |
256 | } | 295 | } |
257 | update_page_count(PG_LEVEL_2M, pages_2m); | 296 | if (mapping_iter == 1) { |
258 | update_page_count(PG_LEVEL_4K, pages_4k); | 297 | /* |
298 | * update direct mapping page count only in the first | ||
299 | * iteration. | ||
300 | */ | ||
301 | update_page_count(PG_LEVEL_2M, pages_2m); | ||
302 | update_page_count(PG_LEVEL_4K, pages_4k); | ||
303 | |||
304 | /* | ||
305 | * local global flush tlb, which will flush the previous | ||
306 | * mappings present in both small and large page TLB's. | ||
307 | */ | ||
308 | __flush_tlb_all(); | ||
309 | |||
310 | /* | ||
311 | * Second iteration will set the actual desired PTE attributes. | ||
312 | */ | ||
313 | mapping_iter = 2; | ||
314 | goto repeat; | ||
315 | } | ||
259 | } | 316 | } |
260 | 317 | ||
261 | /* | 318 | /* |
@@ -458,11 +515,7 @@ static void __init pagetable_init(void) | |||
458 | { | 515 | { |
459 | pgd_t *pgd_base = swapper_pg_dir; | 516 | pgd_t *pgd_base = swapper_pg_dir; |
460 | 517 | ||
461 | paravirt_pagetable_setup_start(pgd_base); | ||
462 | |||
463 | permanent_kmaps_init(pgd_base); | 518 | permanent_kmaps_init(pgd_base); |
464 | |||
465 | paravirt_pagetable_setup_done(pgd_base); | ||
466 | } | 519 | } |
467 | 520 | ||
468 | #ifdef CONFIG_ACPI_SLEEP | 521 | #ifdef CONFIG_ACPI_SLEEP |
@@ -505,7 +558,7 @@ void zap_low_mappings(void) | |||
505 | 558 | ||
506 | int nx_enabled; | 559 | int nx_enabled; |
507 | 560 | ||
508 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); | 561 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); |
509 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 562 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
510 | 563 | ||
511 | #ifdef CONFIG_X86_PAE | 564 | #ifdef CONFIG_X86_PAE |
@@ -722,7 +775,7 @@ void __init setup_bootmem_allocator(void) | |||
722 | after_init_bootmem = 1; | 775 | after_init_bootmem = 1; |
723 | } | 776 | } |
724 | 777 | ||
725 | static void __init find_early_table_space(unsigned long end) | 778 | static void __init find_early_table_space(unsigned long end, int use_pse) |
726 | { | 779 | { |
727 | unsigned long puds, pmds, ptes, tables, start; | 780 | unsigned long puds, pmds, ptes, tables, start; |
728 | 781 | ||
@@ -732,7 +785,7 @@ static void __init find_early_table_space(unsigned long end) | |||
732 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | 785 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; |
733 | tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); | 786 | tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); |
734 | 787 | ||
735 | if (cpu_has_pse) { | 788 | if (use_pse) { |
736 | unsigned long extra; | 789 | unsigned long extra; |
737 | 790 | ||
738 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | 791 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); |
@@ -772,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
772 | pgd_t *pgd_base = swapper_pg_dir; | 825 | pgd_t *pgd_base = swapper_pg_dir; |
773 | unsigned long start_pfn, end_pfn; | 826 | unsigned long start_pfn, end_pfn; |
774 | unsigned long big_page_start; | 827 | unsigned long big_page_start; |
828 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
829 | /* | ||
830 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
831 | * This will simplify cpa(), which otherwise needs to support splitting | ||
832 | * large pages into small in interrupt context, etc. | ||
833 | */ | ||
834 | int use_pse = 0; | ||
835 | #else | ||
836 | int use_pse = cpu_has_pse; | ||
837 | #endif | ||
775 | 838 | ||
776 | /* | 839 | /* |
777 | * Find space for the kernel direct mapping tables. | 840 | * Find space for the kernel direct mapping tables. |
778 | */ | 841 | */ |
779 | if (!after_init_bootmem) | 842 | if (!after_init_bootmem) |
780 | find_early_table_space(end); | 843 | find_early_table_space(end, use_pse); |
781 | 844 | ||
782 | #ifdef CONFIG_X86_PAE | 845 | #ifdef CONFIG_X86_PAE |
783 | set_nx(); | 846 | set_nx(); |
@@ -823,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
823 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | 886 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); |
824 | if (start_pfn < end_pfn) | 887 | if (start_pfn < end_pfn) |
825 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, | 888 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, |
826 | cpu_has_pse); | 889 | use_pse); |
827 | 890 | ||
828 | /* tail is not big page alignment ? */ | 891 | /* tail is not big page alignment ? */ |
829 | start_pfn = end_pfn; | 892 | start_pfn = end_pfn; |
@@ -907,6 +970,8 @@ void __init mem_init(void) | |||
907 | int codesize, reservedpages, datasize, initsize; | 970 | int codesize, reservedpages, datasize, initsize; |
908 | int tmp; | 971 | int tmp; |
909 | 972 | ||
973 | start_periodic_check_for_corruption(); | ||
974 | |||
910 | #ifdef CONFIG_FLATMEM | 975 | #ifdef CONFIG_FLATMEM |
911 | BUG_ON(!mem_map); | 976 | BUG_ON(!mem_map); |
912 | #endif | 977 | #endif |
@@ -986,7 +1051,6 @@ void __init mem_init(void) | |||
986 | if (boot_cpu_data.wp_works_ok < 0) | 1051 | if (boot_cpu_data.wp_works_ok < 0) |
987 | test_wp_bit(); | 1052 | test_wp_bit(); |
988 | 1053 | ||
989 | cpa_init(); | ||
990 | save_pg_dir(); | 1054 | save_pg_dir(); |
991 | zap_low_mappings(); | 1055 | zap_low_mappings(); |
992 | } | 1056 | } |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d3746efb060d..b8e461d49412 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/nmi.h> | 31 | #include <linux/nmi.h> |
32 | 32 | ||
33 | #include <asm/processor.h> | 33 | #include <asm/processor.h> |
34 | #include <asm/bios_ebda.h> | ||
34 | #include <asm/system.h> | 35 | #include <asm/system.h> |
35 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
36 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on); | |||
88 | 89 | ||
89 | int after_bootmem; | 90 | int after_bootmem; |
90 | 91 | ||
92 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; | ||
93 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | ||
94 | |||
95 | static int do_not_nx __cpuinitdata; | ||
96 | |||
97 | /* | ||
98 | * noexec=on|off | ||
99 | * Control non-executable mappings for 64-bit processes. | ||
100 | * | ||
101 | * on Enable (default) | ||
102 | * off Disable | ||
103 | */ | ||
104 | static int __init nonx_setup(char *str) | ||
105 | { | ||
106 | if (!str) | ||
107 | return -EINVAL; | ||
108 | if (!strncmp(str, "on", 2)) { | ||
109 | __supported_pte_mask |= _PAGE_NX; | ||
110 | do_not_nx = 0; | ||
111 | } else if (!strncmp(str, "off", 3)) { | ||
112 | do_not_nx = 1; | ||
113 | __supported_pte_mask &= ~_PAGE_NX; | ||
114 | } | ||
115 | return 0; | ||
116 | } | ||
117 | early_param("noexec", nonx_setup); | ||
118 | |||
119 | void __cpuinit check_efer(void) | ||
120 | { | ||
121 | unsigned long efer; | ||
122 | |||
123 | rdmsrl(MSR_EFER, efer); | ||
124 | if (!(efer & EFER_NX) || do_not_nx) | ||
125 | __supported_pte_mask &= ~_PAGE_NX; | ||
126 | } | ||
127 | |||
128 | int force_personality32; | ||
129 | |||
130 | /* | ||
131 | * noexec32=on|off | ||
132 | * Control non executable heap for 32bit processes. | ||
133 | * To control the stack too use noexec=off | ||
134 | * | ||
135 | * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) | ||
136 | * off PROT_READ implies PROT_EXEC | ||
137 | */ | ||
138 | static int __init nonx32_setup(char *str) | ||
139 | { | ||
140 | if (!strcmp(str, "on")) | ||
141 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
142 | else if (!strcmp(str, "off")) | ||
143 | force_personality32 |= READ_IMPLIES_EXEC; | ||
144 | return 1; | ||
145 | } | ||
146 | __setup("noexec32=", nonx32_setup); | ||
147 | |||
91 | /* | 148 | /* |
92 | * NOTE: This function is marked __ref because it calls __init function | 149 | * NOTE: This function is marked __ref because it calls __init function |
93 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. | 150 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. |
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | |||
139 | } | 196 | } |
140 | 197 | ||
141 | pte = pte_offset_kernel(pmd, vaddr); | 198 | pte = pte_offset_kernel(pmd, vaddr); |
142 | if (!pte_none(*pte) && pte_val(new_pte) && | ||
143 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | ||
144 | pte_ERROR(*pte); | ||
145 | set_pte(pte, new_pte); | 199 | set_pte(pte, new_pte); |
146 | 200 | ||
147 | /* | 201 | /* |
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) | |||
225 | void __init cleanup_highmap(void) | 279 | void __init cleanup_highmap(void) |
226 | { | 280 | { |
227 | unsigned long vaddr = __START_KERNEL_map; | 281 | unsigned long vaddr = __START_KERNEL_map; |
228 | unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; | 282 | unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; |
229 | pmd_t *pmd = level2_kernel_pgt; | 283 | pmd_t *pmd = level2_kernel_pgt; |
230 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; | 284 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; |
231 | 285 | ||
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
256 | if (pfn >= table_top) | 310 | if (pfn >= table_top) |
257 | panic("alloc_low_page: ran out of memory"); | 311 | panic("alloc_low_page: ran out of memory"); |
258 | 312 | ||
259 | adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); | 313 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
260 | memset(adr, 0, PAGE_SIZE); | 314 | memset(adr, 0, PAGE_SIZE); |
261 | *phys = pfn * PAGE_SIZE; | 315 | *phys = pfn * PAGE_SIZE; |
262 | return adr; | 316 | return adr; |
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr) | |||
271 | } | 325 | } |
272 | 326 | ||
273 | static unsigned long __meminit | 327 | static unsigned long __meminit |
274 | phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) | 328 | phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, |
329 | pgprot_t prot) | ||
275 | { | 330 | { |
276 | unsigned pages = 0; | 331 | unsigned pages = 0; |
277 | unsigned long last_map_addr = end; | 332 | unsigned long last_map_addr = end; |
@@ -289,36 +344,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) | |||
289 | break; | 344 | break; |
290 | } | 345 | } |
291 | 346 | ||
347 | /* | ||
348 | * We will re-use the existing mapping. | ||
349 | * Xen for example has some special requirements, like mapping | ||
350 | * pagetable pages as RO. So assume someone who pre-setup | ||
351 | * these mappings are more intelligent. | ||
352 | */ | ||
292 | if (pte_val(*pte)) | 353 | if (pte_val(*pte)) |
293 | continue; | 354 | continue; |
294 | 355 | ||
295 | if (0) | 356 | if (0) |
296 | printk(" pte=%p addr=%lx pte=%016lx\n", | 357 | printk(" pte=%p addr=%lx pte=%016lx\n", |
297 | pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); | 358 | pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); |
298 | set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); | ||
299 | last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; | ||
300 | pages++; | 359 | pages++; |
360 | set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); | ||
361 | last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; | ||
301 | } | 362 | } |
363 | |||
302 | update_page_count(PG_LEVEL_4K, pages); | 364 | update_page_count(PG_LEVEL_4K, pages); |
303 | 365 | ||
304 | return last_map_addr; | 366 | return last_map_addr; |
305 | } | 367 | } |
306 | 368 | ||
307 | static unsigned long __meminit | 369 | static unsigned long __meminit |
308 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) | 370 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, |
371 | pgprot_t prot) | ||
309 | { | 372 | { |
310 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); | 373 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); |
311 | 374 | ||
312 | return phys_pte_init(pte, address, end); | 375 | return phys_pte_init(pte, address, end, prot); |
313 | } | 376 | } |
314 | 377 | ||
315 | static unsigned long __meminit | 378 | static unsigned long __meminit |
316 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | 379 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, |
317 | unsigned long page_size_mask) | 380 | unsigned long page_size_mask, pgprot_t prot) |
318 | { | 381 | { |
319 | unsigned long pages = 0; | 382 | unsigned long pages = 0; |
320 | unsigned long last_map_addr = end; | 383 | unsigned long last_map_addr = end; |
321 | unsigned long start = address; | ||
322 | 384 | ||
323 | int i = pmd_index(address); | 385 | int i = pmd_index(address); |
324 | 386 | ||
@@ -326,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
326 | unsigned long pte_phys; | 388 | unsigned long pte_phys; |
327 | pmd_t *pmd = pmd_page + pmd_index(address); | 389 | pmd_t *pmd = pmd_page + pmd_index(address); |
328 | pte_t *pte; | 390 | pte_t *pte; |
391 | pgprot_t new_prot = prot; | ||
329 | 392 | ||
330 | if (address >= end) { | 393 | if (address >= end) { |
331 | if (!after_bootmem) { | 394 | if (!after_bootmem) { |
@@ -339,27 +402,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
339 | if (!pmd_large(*pmd)) { | 402 | if (!pmd_large(*pmd)) { |
340 | spin_lock(&init_mm.page_table_lock); | 403 | spin_lock(&init_mm.page_table_lock); |
341 | last_map_addr = phys_pte_update(pmd, address, | 404 | last_map_addr = phys_pte_update(pmd, address, |
342 | end); | 405 | end, prot); |
343 | spin_unlock(&init_mm.page_table_lock); | 406 | spin_unlock(&init_mm.page_table_lock); |
407 | continue; | ||
344 | } | 408 | } |
345 | /* Count entries we're using from level2_ident_pgt */ | 409 | /* |
346 | if (start == 0) | 410 | * If we are ok with PG_LEVEL_2M mapping, then we will |
347 | pages++; | 411 | * use the existing mapping, |
348 | continue; | 412 | * |
413 | * Otherwise, we will split the large page mapping but | ||
414 | * use the same existing protection bits except for | ||
415 | * large page, so that we don't violate Intel's TLB | ||
416 | * Application note (317080) which says, while changing | ||
417 | * the page sizes, new and old translations should | ||
418 | * not differ with respect to page frame and | ||
419 | * attributes. | ||
420 | */ | ||
421 | if (page_size_mask & (1 << PG_LEVEL_2M)) | ||
422 | continue; | ||
423 | new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); | ||
349 | } | 424 | } |
350 | 425 | ||
351 | if (page_size_mask & (1<<PG_LEVEL_2M)) { | 426 | if (page_size_mask & (1<<PG_LEVEL_2M)) { |
352 | pages++; | 427 | pages++; |
353 | spin_lock(&init_mm.page_table_lock); | 428 | spin_lock(&init_mm.page_table_lock); |
354 | set_pte((pte_t *)pmd, | 429 | set_pte((pte_t *)pmd, |
355 | pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); | 430 | pfn_pte(address >> PAGE_SHIFT, |
431 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
356 | spin_unlock(&init_mm.page_table_lock); | 432 | spin_unlock(&init_mm.page_table_lock); |
357 | last_map_addr = (address & PMD_MASK) + PMD_SIZE; | 433 | last_map_addr = (address & PMD_MASK) + PMD_SIZE; |
358 | continue; | 434 | continue; |
359 | } | 435 | } |
360 | 436 | ||
361 | pte = alloc_low_page(&pte_phys); | 437 | pte = alloc_low_page(&pte_phys); |
362 | last_map_addr = phys_pte_init(pte, address, end); | 438 | last_map_addr = phys_pte_init(pte, address, end, new_prot); |
363 | unmap_low_page(pte); | 439 | unmap_low_page(pte); |
364 | 440 | ||
365 | spin_lock(&init_mm.page_table_lock); | 441 | spin_lock(&init_mm.page_table_lock); |
@@ -372,12 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
372 | 448 | ||
373 | static unsigned long __meminit | 449 | static unsigned long __meminit |
374 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, | 450 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, |
375 | unsigned long page_size_mask) | 451 | unsigned long page_size_mask, pgprot_t prot) |
376 | { | 452 | { |
377 | pmd_t *pmd = pmd_offset(pud, 0); | 453 | pmd_t *pmd = pmd_offset(pud, 0); |
378 | unsigned long last_map_addr; | 454 | unsigned long last_map_addr; |
379 | 455 | ||
380 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); | 456 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); |
381 | __flush_tlb_all(); | 457 | __flush_tlb_all(); |
382 | return last_map_addr; | 458 | return last_map_addr; |
383 | } | 459 | } |
@@ -394,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
394 | unsigned long pmd_phys; | 470 | unsigned long pmd_phys; |
395 | pud_t *pud = pud_page + pud_index(addr); | 471 | pud_t *pud = pud_page + pud_index(addr); |
396 | pmd_t *pmd; | 472 | pmd_t *pmd; |
473 | pgprot_t prot = PAGE_KERNEL; | ||
397 | 474 | ||
398 | if (addr >= end) | 475 | if (addr >= end) |
399 | break; | 476 | break; |
@@ -405,10 +482,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
405 | } | 482 | } |
406 | 483 | ||
407 | if (pud_val(*pud)) { | 484 | if (pud_val(*pud)) { |
408 | if (!pud_large(*pud)) | 485 | if (!pud_large(*pud)) { |
409 | last_map_addr = phys_pmd_update(pud, addr, end, | 486 | last_map_addr = phys_pmd_update(pud, addr, end, |
410 | page_size_mask); | 487 | page_size_mask, prot); |
411 | continue; | 488 | continue; |
489 | } | ||
490 | /* | ||
491 | * If we are ok with PG_LEVEL_1G mapping, then we will | ||
492 | * use the existing mapping. | ||
493 | * | ||
494 | * Otherwise, we will split the gbpage mapping but use | ||
495 | * the same existing protection bits except for large | ||
496 | * page, so that we don't violate Intel's TLB | ||
497 | * Application note (317080) which says, while changing | ||
498 | * the page sizes, new and old translations should | ||
499 | * not differ with respect to page frame and | ||
500 | * attributes. | ||
501 | */ | ||
502 | if (page_size_mask & (1 << PG_LEVEL_1G)) | ||
503 | continue; | ||
504 | prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); | ||
412 | } | 505 | } |
413 | 506 | ||
414 | if (page_size_mask & (1<<PG_LEVEL_1G)) { | 507 | if (page_size_mask & (1<<PG_LEVEL_1G)) { |
@@ -422,7 +515,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
422 | } | 515 | } |
423 | 516 | ||
424 | pmd = alloc_low_page(&pmd_phys); | 517 | pmd = alloc_low_page(&pmd_phys); |
425 | last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); | 518 | last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, |
519 | prot); | ||
426 | unmap_low_page(pmd); | 520 | unmap_low_page(pmd); |
427 | 521 | ||
428 | spin_lock(&init_mm.page_table_lock); | 522 | spin_lock(&init_mm.page_table_lock); |
@@ -430,6 +524,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
430 | spin_unlock(&init_mm.page_table_lock); | 524 | spin_unlock(&init_mm.page_table_lock); |
431 | } | 525 | } |
432 | __flush_tlb_all(); | 526 | __flush_tlb_all(); |
527 | |||
433 | update_page_count(PG_LEVEL_1G, pages); | 528 | update_page_count(PG_LEVEL_1G, pages); |
434 | 529 | ||
435 | return last_map_addr; | 530 | return last_map_addr; |
@@ -446,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
446 | return phys_pud_init(pud, addr, end, page_size_mask); | 541 | return phys_pud_init(pud, addr, end, page_size_mask); |
447 | } | 542 | } |
448 | 543 | ||
449 | static void __init find_early_table_space(unsigned long end) | 544 | static void __init find_early_table_space(unsigned long end, int use_pse, |
545 | int use_gbpages) | ||
450 | { | 546 | { |
451 | unsigned long puds, pmds, ptes, tables, start; | 547 | unsigned long puds, pmds, ptes, tables, start; |
452 | 548 | ||
453 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 549 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
454 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); | 550 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); |
455 | if (direct_gbpages) { | 551 | if (use_gbpages) { |
456 | unsigned long extra; | 552 | unsigned long extra; |
457 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | 553 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); |
458 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | 554 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; |
459 | } else | 555 | } else |
460 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | 556 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; |
461 | tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | 557 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); |
462 | 558 | ||
463 | if (cpu_has_pse) { | 559 | if (use_pse) { |
464 | unsigned long extra; | 560 | unsigned long extra; |
465 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | 561 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); |
466 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | 562 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; |
467 | } else | 563 | } else |
468 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | 564 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; |
469 | tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); | 565 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); |
470 | 566 | ||
471 | /* | 567 | /* |
472 | * RED-PEN putting page tables only on node 0 could | 568 | * RED-PEN putting page tables only on node 0 could |
@@ -528,6 +624,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, | |||
528 | pgd_populate(&init_mm, pgd, __va(pud_phys)); | 624 | pgd_populate(&init_mm, pgd, __va(pud_phys)); |
529 | spin_unlock(&init_mm.page_table_lock); | 625 | spin_unlock(&init_mm.page_table_lock); |
530 | } | 626 | } |
627 | __flush_tlb_all(); | ||
531 | 628 | ||
532 | return last_map_addr; | 629 | return last_map_addr; |
533 | } | 630 | } |
@@ -571,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
571 | 668 | ||
572 | struct map_range mr[NR_RANGE_MR]; | 669 | struct map_range mr[NR_RANGE_MR]; |
573 | int nr_range, i; | 670 | int nr_range, i; |
671 | int use_pse, use_gbpages; | ||
574 | 672 | ||
575 | printk(KERN_INFO "init_memory_mapping\n"); | 673 | printk(KERN_INFO "init_memory_mapping\n"); |
576 | 674 | ||
@@ -584,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
584 | if (!after_bootmem) | 682 | if (!after_bootmem) |
585 | init_gbpages(); | 683 | init_gbpages(); |
586 | 684 | ||
587 | if (direct_gbpages) | 685 | #ifdef CONFIG_DEBUG_PAGEALLOC |
686 | /* | ||
687 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
688 | * This will simplify cpa(), which otherwise needs to support splitting | ||
689 | * large pages into small in interrupt context, etc. | ||
690 | */ | ||
691 | use_pse = use_gbpages = 0; | ||
692 | #else | ||
693 | use_pse = cpu_has_pse; | ||
694 | use_gbpages = direct_gbpages; | ||
695 | #endif | ||
696 | |||
697 | if (use_gbpages) | ||
588 | page_size_mask |= 1 << PG_LEVEL_1G; | 698 | page_size_mask |= 1 << PG_LEVEL_1G; |
589 | if (cpu_has_pse) | 699 | if (use_pse) |
590 | page_size_mask |= 1 << PG_LEVEL_2M; | 700 | page_size_mask |= 1 << PG_LEVEL_2M; |
591 | 701 | ||
592 | memset(mr, 0, sizeof(mr)); | 702 | memset(mr, 0, sizeof(mr)); |
@@ -636,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
636 | old_start = mr[i].start; | 746 | old_start = mr[i].start; |
637 | memmove(&mr[i], &mr[i+1], | 747 | memmove(&mr[i], &mr[i+1], |
638 | (nr_range - 1 - i) * sizeof (struct map_range)); | 748 | (nr_range - 1 - i) * sizeof (struct map_range)); |
639 | mr[i].start = old_start; | 749 | mr[i--].start = old_start; |
640 | nr_range--; | 750 | nr_range--; |
641 | } | 751 | } |
642 | 752 | ||
@@ -647,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
647 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | 757 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); |
648 | 758 | ||
649 | if (!after_bootmem) | 759 | if (!after_bootmem) |
650 | find_early_table_space(end); | 760 | find_early_table_space(end, use_pse, use_gbpages); |
651 | 761 | ||
652 | for (i = 0; i < nr_range; i++) | 762 | for (i = 0; i < nr_range; i++) |
653 | last_map_addr = kernel_physical_mapping_init( | 763 | last_map_addr = kernel_physical_mapping_init( |
@@ -769,6 +879,8 @@ void __init mem_init(void) | |||
769 | { | 879 | { |
770 | long codesize, reservedpages, datasize, initsize; | 880 | long codesize, reservedpages, datasize, initsize; |
771 | 881 | ||
882 | start_periodic_check_for_corruption(); | ||
883 | |||
772 | pci_iommu_alloc(); | 884 | pci_iommu_alloc(); |
773 | 885 | ||
774 | /* clear_bss() already clear the empty_zero_page */ | 886 | /* clear_bss() already clear the empty_zero_page */ |
@@ -806,8 +918,6 @@ void __init mem_init(void) | |||
806 | reservedpages << (PAGE_SHIFT-10), | 918 | reservedpages << (PAGE_SHIFT-10), |
807 | datasize >> 10, | 919 | datasize >> 10, |
808 | initsize >> 10); | 920 | initsize >> 10); |
809 | |||
810 | cpa_init(); | ||
811 | } | 921 | } |
812 | 922 | ||
813 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | 923 | void free_init_pages(char *what, unsigned long begin, unsigned long end) |
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae3..ae71e11eb3e5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -24,18 +24,47 @@ | |||
24 | 24 | ||
25 | #ifdef CONFIG_X86_64 | 25 | #ifdef CONFIG_X86_64 |
26 | 26 | ||
27 | static inline int phys_addr_valid(unsigned long addr) | ||
28 | { | ||
29 | return addr < (1UL << boot_cpu_data.x86_phys_bits); | ||
30 | } | ||
31 | |||
27 | unsigned long __phys_addr(unsigned long x) | 32 | unsigned long __phys_addr(unsigned long x) |
28 | { | 33 | { |
29 | if (x >= __START_KERNEL_map) | 34 | if (x >= __START_KERNEL_map) { |
30 | return x - __START_KERNEL_map + phys_base; | 35 | x -= __START_KERNEL_map; |
31 | return x - PAGE_OFFSET; | 36 | VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); |
37 | x += phys_base; | ||
38 | } else { | ||
39 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | ||
40 | x -= PAGE_OFFSET; | ||
41 | VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : | ||
42 | !phys_addr_valid(x)); | ||
43 | } | ||
44 | return x; | ||
32 | } | 45 | } |
33 | EXPORT_SYMBOL(__phys_addr); | 46 | EXPORT_SYMBOL(__phys_addr); |
34 | 47 | ||
35 | static inline int phys_addr_valid(unsigned long addr) | 48 | bool __virt_addr_valid(unsigned long x) |
36 | { | 49 | { |
37 | return addr < (1UL << boot_cpu_data.x86_phys_bits); | 50 | if (x >= __START_KERNEL_map) { |
51 | x -= __START_KERNEL_map; | ||
52 | if (x >= KERNEL_IMAGE_SIZE) | ||
53 | return false; | ||
54 | x += phys_base; | ||
55 | } else { | ||
56 | if (x < PAGE_OFFSET) | ||
57 | return false; | ||
58 | x -= PAGE_OFFSET; | ||
59 | if (system_state == SYSTEM_BOOTING ? | ||
60 | x > MAXMEM : !phys_addr_valid(x)) { | ||
61 | return false; | ||
62 | } | ||
63 | } | ||
64 | |||
65 | return pfn_valid(x >> PAGE_SHIFT); | ||
38 | } | 66 | } |
67 | EXPORT_SYMBOL(__virt_addr_valid); | ||
39 | 68 | ||
40 | #else | 69 | #else |
41 | 70 | ||
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr) | |||
44 | return 1; | 73 | return 1; |
45 | } | 74 | } |
46 | 75 | ||
76 | #ifdef CONFIG_DEBUG_VIRTUAL | ||
77 | unsigned long __phys_addr(unsigned long x) | ||
78 | { | ||
79 | /* VMALLOC_* aren't constants; not available at the boot time */ | ||
80 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); | ||
81 | VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && | ||
82 | is_vmalloc_addr((void *) x)); | ||
83 | return x - PAGE_OFFSET; | ||
84 | } | ||
85 | EXPORT_SYMBOL(__phys_addr); | ||
86 | #endif | ||
87 | |||
88 | bool __virt_addr_valid(unsigned long x) | ||
89 | { | ||
90 | if (x < PAGE_OFFSET) | ||
91 | return false; | ||
92 | if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) | ||
93 | return false; | ||
94 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); | ||
95 | } | ||
96 | EXPORT_SYMBOL(__virt_addr_valid); | ||
97 | |||
47 | #endif | 98 | #endif |
48 | 99 | ||
49 | int page_is_ram(unsigned long pagenr) | 100 | int page_is_ram(unsigned long pagenr) |
@@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr) | |||
83 | return 0; | 134 | return 0; |
84 | } | 135 | } |
85 | 136 | ||
137 | int pagerange_is_ram(unsigned long start, unsigned long end) | ||
138 | { | ||
139 | int ram_page = 0, not_rampage = 0; | ||
140 | unsigned long page_nr; | ||
141 | |||
142 | for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); | ||
143 | ++page_nr) { | ||
144 | if (page_is_ram(page_nr)) | ||
145 | ram_page = 1; | ||
146 | else | ||
147 | not_rampage = 1; | ||
148 | |||
149 | if (ram_page == not_rampage) | ||
150 | return -1; | ||
151 | } | ||
152 | |||
153 | return ram_page; | ||
154 | } | ||
155 | |||
86 | /* | 156 | /* |
87 | * Fix up the linear direct mapping of the kernel to avoid cache attribute | 157 | * Fix up the linear direct mapping of the kernel to avoid cache attribute |
88 | * conflicts. | 158 | * conflicts. |
@@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
150 | return (__force void __iomem *)phys_to_virt(phys_addr); | 220 | return (__force void __iomem *)phys_to_virt(phys_addr); |
151 | 221 | ||
152 | /* | 222 | /* |
223 | * Check if the request spans more than any BAR in the iomem resource | ||
224 | * tree. | ||
225 | */ | ||
226 | WARN_ON(iomem_map_sanity_check(phys_addr, size)); | ||
227 | |||
228 | /* | ||
153 | * Don't allow anybody to remap normal RAM that we're using.. | 229 | * Don't allow anybody to remap normal RAM that we're using.. |
154 | */ | 230 | */ |
155 | for (pfn = phys_addr >> PAGE_SHIFT; | 231 | for (pfn = phys_addr >> PAGE_SHIFT; |
@@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
204 | switch (prot_val) { | 280 | switch (prot_val) { |
205 | case _PAGE_CACHE_UC: | 281 | case _PAGE_CACHE_UC: |
206 | default: | 282 | default: |
207 | prot = PAGE_KERNEL_NOCACHE; | 283 | prot = PAGE_KERNEL_IO_NOCACHE; |
208 | break; | 284 | break; |
209 | case _PAGE_CACHE_UC_MINUS: | 285 | case _PAGE_CACHE_UC_MINUS: |
210 | prot = PAGE_KERNEL_UC_MINUS; | 286 | prot = PAGE_KERNEL_IO_UC_MINUS; |
211 | break; | 287 | break; |
212 | case _PAGE_CACHE_WC: | 288 | case _PAGE_CACHE_WC: |
213 | prot = PAGE_KERNEL_WC; | 289 | prot = PAGE_KERNEL_IO_WC; |
214 | break; | 290 | break; |
215 | case _PAGE_CACHE_WB: | 291 | case _PAGE_CACHE_WB: |
216 | prot = PAGE_KERNEL; | 292 | prot = PAGE_KERNEL_IO; |
217 | break; | 293 | break; |
218 | } | 294 | } |
219 | 295 | ||
@@ -421,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) | |||
421 | return; | 497 | return; |
422 | } | 498 | } |
423 | 499 | ||
424 | int __initdata early_ioremap_debug; | 500 | static int __initdata early_ioremap_debug; |
425 | 501 | ||
426 | static int __init early_ioremap_debug_setup(char *str) | 502 | static int __init early_ioremap_debug_setup(char *str) |
427 | { | 503 | { |
@@ -530,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, | |||
530 | } | 606 | } |
531 | 607 | ||
532 | static inline void __init early_set_fixmap(enum fixed_addresses idx, | 608 | static inline void __init early_set_fixmap(enum fixed_addresses idx, |
533 | unsigned long phys) | 609 | unsigned long phys, pgprot_t prot) |
534 | { | 610 | { |
535 | if (after_paging_init) | 611 | if (after_paging_init) |
536 | set_fixmap(idx, phys); | 612 | __set_fixmap(idx, phys, prot); |
537 | else | 613 | else |
538 | __early_set_fixmap(idx, phys, PAGE_KERNEL); | 614 | __early_set_fixmap(idx, phys, prot); |
539 | } | 615 | } |
540 | 616 | ||
541 | static inline void __init early_clear_fixmap(enum fixed_addresses idx) | 617 | static inline void __init early_clear_fixmap(enum fixed_addresses idx) |
@@ -546,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) | |||
546 | __early_set_fixmap(idx, 0, __pgprot(0)); | 622 | __early_set_fixmap(idx, 0, __pgprot(0)); |
547 | } | 623 | } |
548 | 624 | ||
549 | 625 | static void *prev_map[FIX_BTMAPS_SLOTS] __initdata; | |
550 | int __initdata early_ioremap_nested; | 626 | static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; |
551 | |||
552 | static int __init check_early_ioremap_leak(void) | 627 | static int __init check_early_ioremap_leak(void) |
553 | { | 628 | { |
554 | if (!early_ioremap_nested) | 629 | int count = 0; |
630 | int i; | ||
631 | |||
632 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) | ||
633 | if (prev_map[i]) | ||
634 | count++; | ||
635 | |||
636 | if (!count) | ||
555 | return 0; | 637 | return 0; |
556 | WARN(1, KERN_WARNING | 638 | WARN(1, KERN_WARNING |
557 | "Debug warning: early ioremap leak of %d areas detected.\n", | 639 | "Debug warning: early ioremap leak of %d areas detected.\n", |
558 | early_ioremap_nested); | 640 | count); |
559 | printk(KERN_WARNING | 641 | printk(KERN_WARNING |
560 | "please boot with early_ioremap_debug and report the dmesg.\n"); | 642 | "please boot with early_ioremap_debug and report the dmesg.\n"); |
561 | 643 | ||
@@ -563,18 +645,33 @@ static int __init check_early_ioremap_leak(void) | |||
563 | } | 645 | } |
564 | late_initcall(check_early_ioremap_leak); | 646 | late_initcall(check_early_ioremap_leak); |
565 | 647 | ||
566 | void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | 648 | static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) |
567 | { | 649 | { |
568 | unsigned long offset, last_addr; | 650 | unsigned long offset, last_addr; |
569 | unsigned int nrpages, nesting; | 651 | unsigned int nrpages; |
570 | enum fixed_addresses idx0, idx; | 652 | enum fixed_addresses idx0, idx; |
653 | int i, slot; | ||
571 | 654 | ||
572 | WARN_ON(system_state != SYSTEM_BOOTING); | 655 | WARN_ON(system_state != SYSTEM_BOOTING); |
573 | 656 | ||
574 | nesting = early_ioremap_nested; | 657 | slot = -1; |
658 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { | ||
659 | if (!prev_map[i]) { | ||
660 | slot = i; | ||
661 | break; | ||
662 | } | ||
663 | } | ||
664 | |||
665 | if (slot < 0) { | ||
666 | printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", | ||
667 | phys_addr, size); | ||
668 | WARN_ON(1); | ||
669 | return NULL; | ||
670 | } | ||
671 | |||
575 | if (early_ioremap_debug) { | 672 | if (early_ioremap_debug) { |
576 | printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", | 673 | printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", |
577 | phys_addr, size, nesting); | 674 | phys_addr, size, slot); |
578 | dump_stack(); | 675 | dump_stack(); |
579 | } | 676 | } |
580 | 677 | ||
@@ -585,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
585 | return NULL; | 682 | return NULL; |
586 | } | 683 | } |
587 | 684 | ||
588 | if (nesting >= FIX_BTMAPS_NESTING) { | 685 | prev_size[slot] = size; |
589 | WARN_ON(1); | ||
590 | return NULL; | ||
591 | } | ||
592 | early_ioremap_nested++; | ||
593 | /* | 686 | /* |
594 | * Mappings have to be page-aligned | 687 | * Mappings have to be page-aligned |
595 | */ | 688 | */ |
596 | offset = phys_addr & ~PAGE_MASK; | 689 | offset = phys_addr & ~PAGE_MASK; |
597 | phys_addr &= PAGE_MASK; | 690 | phys_addr &= PAGE_MASK; |
598 | size = PAGE_ALIGN(last_addr) - phys_addr; | 691 | size = PAGE_ALIGN(last_addr + 1) - phys_addr; |
599 | 692 | ||
600 | /* | 693 | /* |
601 | * Mappings have to fit in the FIX_BTMAP area. | 694 | * Mappings have to fit in the FIX_BTMAP area. |
@@ -609,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
609 | /* | 702 | /* |
610 | * Ok, go for it.. | 703 | * Ok, go for it.. |
611 | */ | 704 | */ |
612 | idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; | 705 | idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; |
613 | idx = idx0; | 706 | idx = idx0; |
614 | while (nrpages > 0) { | 707 | while (nrpages > 0) { |
615 | early_set_fixmap(idx, phys_addr); | 708 | early_set_fixmap(idx, phys_addr, prot); |
616 | phys_addr += PAGE_SIZE; | 709 | phys_addr += PAGE_SIZE; |
617 | --idx; | 710 | --idx; |
618 | --nrpages; | 711 | --nrpages; |
@@ -620,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
620 | if (early_ioremap_debug) | 713 | if (early_ioremap_debug) |
621 | printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); | 714 | printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); |
622 | 715 | ||
623 | return (void *) (offset + fix_to_virt(idx0)); | 716 | prev_map[slot] = (void *) (offset + fix_to_virt(idx0)); |
717 | return prev_map[slot]; | ||
718 | } | ||
719 | |||
720 | /* Remap an IO device */ | ||
721 | void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | ||
722 | { | ||
723 | return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); | ||
724 | } | ||
725 | |||
726 | /* Remap memory */ | ||
727 | void __init *early_memremap(unsigned long phys_addr, unsigned long size) | ||
728 | { | ||
729 | return __early_ioremap(phys_addr, size, PAGE_KERNEL); | ||
624 | } | 730 | } |
625 | 731 | ||
626 | void __init early_iounmap(void *addr, unsigned long size) | 732 | void __init early_iounmap(void *addr, unsigned long size) |
@@ -629,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size) | |||
629 | unsigned long offset; | 735 | unsigned long offset; |
630 | unsigned int nrpages; | 736 | unsigned int nrpages; |
631 | enum fixed_addresses idx; | 737 | enum fixed_addresses idx; |
632 | int nesting; | 738 | int i, slot; |
739 | |||
740 | slot = -1; | ||
741 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { | ||
742 | if (prev_map[i] == addr) { | ||
743 | slot = i; | ||
744 | break; | ||
745 | } | ||
746 | } | ||
747 | |||
748 | if (slot < 0) { | ||
749 | printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", | ||
750 | addr, size); | ||
751 | WARN_ON(1); | ||
752 | return; | ||
753 | } | ||
633 | 754 | ||
634 | nesting = --early_ioremap_nested; | 755 | if (prev_size[slot] != size) { |
635 | if (WARN_ON(nesting < 0)) | 756 | printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", |
757 | addr, size, slot, prev_size[slot]); | ||
758 | WARN_ON(1); | ||
636 | return; | 759 | return; |
760 | } | ||
637 | 761 | ||
638 | if (early_ioremap_debug) { | 762 | if (early_ioremap_debug) { |
639 | printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, | 763 | printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, |
640 | size, nesting); | 764 | size, slot); |
641 | dump_stack(); | 765 | dump_stack(); |
642 | } | 766 | } |
643 | 767 | ||
@@ -649,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size) | |||
649 | offset = virt_addr & ~PAGE_MASK; | 773 | offset = virt_addr & ~PAGE_MASK; |
650 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | 774 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; |
651 | 775 | ||
652 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; | 776 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; |
653 | while (nrpages > 0) { | 777 | while (nrpages > 0) { |
654 | early_clear_fixmap(idx); | 778 | early_clear_fixmap(idx); |
655 | --idx; | 779 | --idx; |
656 | --nrpages; | 780 | --nrpages; |
657 | } | 781 | } |
782 | prev_map[slot] = 0; | ||
658 | } | 783 | } |
659 | 784 | ||
660 | void __this_fixmap_does_not_exist(void) | 785 | void __this_fixmap_does_not_exist(void) |
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c index 62fa440678d8..847c164725f4 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/numa_32.c | |||
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn, | |||
328 | 328 | ||
329 | get_memcfg_numa(); | 329 | get_memcfg_numa(); |
330 | 330 | ||
331 | kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); | 331 | kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); |
332 | 332 | ||
333 | kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); | 333 | kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); |
334 | do { | 334 | do { |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a4dd793d6003..cebcbf152d46 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
79 | return 0; | 79 | return 0; |
80 | 80 | ||
81 | addr = 0x8000; | 81 | addr = 0x8000; |
82 | nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); | 82 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
83 | nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, | 83 | nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, |
84 | nodemap_size, L1_CACHE_BYTES); | 84 | nodemap_size, L1_CACHE_BYTES); |
85 | if (nodemap_addr == -1UL) { | 85 | if (nodemap_addr == -1UL) { |
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
176 | unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; | 176 | unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; |
177 | unsigned long bootmap_start, nodedata_phys; | 177 | unsigned long bootmap_start, nodedata_phys; |
178 | void *bootmap; | 178 | void *bootmap; |
179 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | 179 | const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); |
180 | int nid; | 180 | int nid; |
181 | 181 | ||
182 | start = round_up(start, ZONE_ALIGN); | 182 | start = roundup(start, ZONE_ALIGN); |
183 | 183 | ||
184 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, | 184 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, |
185 | start, end); | 185 | start, end); |
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
210 | bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); | 210 | bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); |
211 | nid = phys_to_nid(nodedata_phys); | 211 | nid = phys_to_nid(nodedata_phys); |
212 | if (nid == nodeid) | 212 | if (nid == nodeid) |
213 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | 213 | bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); |
214 | else | 214 | else |
215 | bootmap_start = round_up(start, PAGE_SIZE); | 215 | bootmap_start = roundup(start, PAGE_SIZE); |
216 | /* | 216 | /* |
217 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node like | 217 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node like |
218 | * to use that to align to PAGE_SIZE | 218 | * to use that to align to PAGE_SIZE |
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index d4aa503caaa2..e1d106909218 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c | |||
@@ -32,7 +32,7 @@ enum { | |||
32 | GPS = (1<<30) | 32 | GPS = (1<<30) |
33 | }; | 33 | }; |
34 | 34 | ||
35 | #define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) | 35 | #define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) |
36 | 36 | ||
37 | static int pte_testbit(pte_t pte) | 37 | static int pte_testbit(pte_t pte) |
38 | { | 38 | { |
@@ -118,6 +118,7 @@ static int pageattr_test(void) | |||
118 | unsigned int level; | 118 | unsigned int level; |
119 | int i, k; | 119 | int i, k; |
120 | int err; | 120 | int err; |
121 | unsigned long test_addr; | ||
121 | 122 | ||
122 | if (print) | 123 | if (print) |
123 | printk(KERN_INFO "CPA self-test:\n"); | 124 | printk(KERN_INFO "CPA self-test:\n"); |
@@ -172,7 +173,8 @@ static int pageattr_test(void) | |||
172 | continue; | 173 | continue; |
173 | } | 174 | } |
174 | 175 | ||
175 | err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); | 176 | test_addr = addr[i]; |
177 | err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); | ||
176 | if (err < 0) { | 178 | if (err < 0) { |
177 | printk(KERN_ERR "CPA %d failed %d\n", i, err); | 179 | printk(KERN_ERR "CPA %d failed %d\n", i, err); |
178 | failed++; | 180 | failed++; |
@@ -204,7 +206,8 @@ static int pageattr_test(void) | |||
204 | failed++; | 206 | failed++; |
205 | continue; | 207 | continue; |
206 | } | 208 | } |
207 | err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); | 209 | test_addr = addr[i]; |
210 | err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); | ||
208 | if (err < 0) { | 211 | if (err < 0) { |
209 | printk(KERN_ERR "CPA reverting failed: %d\n", err); | 212 | printk(KERN_ERR "CPA reverting failed: %d\n", err); |
210 | failed++; | 213 | failed++; |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 43e2f8483e4f..a9ec89c3fbca 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -25,15 +25,27 @@ | |||
25 | * The current flushing context - we pass it instead of 5 arguments: | 25 | * The current flushing context - we pass it instead of 5 arguments: |
26 | */ | 26 | */ |
27 | struct cpa_data { | 27 | struct cpa_data { |
28 | unsigned long vaddr; | 28 | unsigned long *vaddr; |
29 | pgprot_t mask_set; | 29 | pgprot_t mask_set; |
30 | pgprot_t mask_clr; | 30 | pgprot_t mask_clr; |
31 | int numpages; | 31 | int numpages; |
32 | int flushtlb; | 32 | int flags; |
33 | unsigned long pfn; | 33 | unsigned long pfn; |
34 | unsigned force_split : 1; | 34 | unsigned force_split : 1; |
35 | int curpage; | ||
35 | }; | 36 | }; |
36 | 37 | ||
38 | /* | ||
39 | * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) | ||
40 | * using cpa_lock. So that we don't allow any other cpu, with stale large tlb | ||
41 | * entries change the page attribute in parallel to some other cpu | ||
42 | * splitting a large page entry along with changing the attribute. | ||
43 | */ | ||
44 | static DEFINE_SPINLOCK(cpa_lock); | ||
45 | |||
46 | #define CPA_FLUSHTLB 1 | ||
47 | #define CPA_ARRAY 2 | ||
48 | |||
37 | #ifdef CONFIG_PROC_FS | 49 | #ifdef CONFIG_PROC_FS |
38 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; | 50 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
39 | 51 | ||
@@ -84,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void) | |||
84 | 96 | ||
85 | static inline unsigned long highmap_end_pfn(void) | 97 | static inline unsigned long highmap_end_pfn(void) |
86 | { | 98 | { |
87 | return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; | 99 | return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; |
88 | } | 100 | } |
89 | 101 | ||
90 | #endif | 102 | #endif |
@@ -190,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) | |||
190 | } | 202 | } |
191 | } | 203 | } |
192 | 204 | ||
205 | static void cpa_flush_array(unsigned long *start, int numpages, int cache) | ||
206 | { | ||
207 | unsigned int i, level; | ||
208 | unsigned long *addr; | ||
209 | |||
210 | BUG_ON(irqs_disabled()); | ||
211 | |||
212 | on_each_cpu(__cpa_flush_range, NULL, 1); | ||
213 | |||
214 | if (!cache) | ||
215 | return; | ||
216 | |||
217 | /* 4M threshold */ | ||
218 | if (numpages >= 1024) { | ||
219 | if (boot_cpu_data.x86_model >= 4) | ||
220 | wbinvd(); | ||
221 | return; | ||
222 | } | ||
223 | /* | ||
224 | * We only need to flush on one CPU, | ||
225 | * clflush is a MESI-coherent instruction that | ||
226 | * will cause all other CPUs to flush the same | ||
227 | * cachelines: | ||
228 | */ | ||
229 | for (i = 0, addr = start; i < numpages; i++, addr++) { | ||
230 | pte_t *pte = lookup_address(*addr, &level); | ||
231 | |||
232 | /* | ||
233 | * Only flush present addresses: | ||
234 | */ | ||
235 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) | ||
236 | clflush_cache_range((void *) *addr, PAGE_SIZE); | ||
237 | } | ||
238 | } | ||
239 | |||
193 | /* | 240 | /* |
194 | * Certain areas of memory on x86 require very specific protection flags, | 241 | * Certain areas of memory on x86 require very specific protection flags, |
195 | * for example the BIOS area or kernel text. Callers don't always get this | 242 | * for example the BIOS area or kernel text. Callers don't always get this |
@@ -398,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
398 | */ | 445 | */ |
399 | new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); | 446 | new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); |
400 | __set_pmd_pte(kpte, address, new_pte); | 447 | __set_pmd_pte(kpte, address, new_pte); |
401 | cpa->flushtlb = 1; | 448 | cpa->flags |= CPA_FLUSHTLB; |
402 | do_split = 0; | 449 | do_split = 0; |
403 | } | 450 | } |
404 | 451 | ||
@@ -408,84 +455,6 @@ out_unlock: | |||
408 | return do_split; | 455 | return do_split; |
409 | } | 456 | } |
410 | 457 | ||
411 | static LIST_HEAD(page_pool); | ||
412 | static unsigned long pool_size, pool_pages, pool_low; | ||
413 | static unsigned long pool_used, pool_failed; | ||
414 | |||
415 | static void cpa_fill_pool(struct page **ret) | ||
416 | { | ||
417 | gfp_t gfp = GFP_KERNEL; | ||
418 | unsigned long flags; | ||
419 | struct page *p; | ||
420 | |||
421 | /* | ||
422 | * Avoid recursion (on debug-pagealloc) and also signal | ||
423 | * our priority to get to these pagetables: | ||
424 | */ | ||
425 | if (current->flags & PF_MEMALLOC) | ||
426 | return; | ||
427 | current->flags |= PF_MEMALLOC; | ||
428 | |||
429 | /* | ||
430 | * Allocate atomically from atomic contexts: | ||
431 | */ | ||
432 | if (in_atomic() || irqs_disabled() || debug_pagealloc) | ||
433 | gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; | ||
434 | |||
435 | while (pool_pages < pool_size || (ret && !*ret)) { | ||
436 | p = alloc_pages(gfp, 0); | ||
437 | if (!p) { | ||
438 | pool_failed++; | ||
439 | break; | ||
440 | } | ||
441 | /* | ||
442 | * If the call site needs a page right now, provide it: | ||
443 | */ | ||
444 | if (ret && !*ret) { | ||
445 | *ret = p; | ||
446 | continue; | ||
447 | } | ||
448 | spin_lock_irqsave(&pgd_lock, flags); | ||
449 | list_add(&p->lru, &page_pool); | ||
450 | pool_pages++; | ||
451 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
452 | } | ||
453 | |||
454 | current->flags &= ~PF_MEMALLOC; | ||
455 | } | ||
456 | |||
457 | #define SHIFT_MB (20 - PAGE_SHIFT) | ||
458 | #define ROUND_MB_GB ((1 << 10) - 1) | ||
459 | #define SHIFT_MB_GB 10 | ||
460 | #define POOL_PAGES_PER_GB 16 | ||
461 | |||
462 | void __init cpa_init(void) | ||
463 | { | ||
464 | struct sysinfo si; | ||
465 | unsigned long gb; | ||
466 | |||
467 | si_meminfo(&si); | ||
468 | /* | ||
469 | * Calculate the number of pool pages: | ||
470 | * | ||
471 | * Convert totalram (nr of pages) to MiB and round to the next | ||
472 | * GiB. Shift MiB to Gib and multiply the result by | ||
473 | * POOL_PAGES_PER_GB: | ||
474 | */ | ||
475 | if (debug_pagealloc) { | ||
476 | gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; | ||
477 | pool_size = POOL_PAGES_PER_GB * gb; | ||
478 | } else { | ||
479 | pool_size = 1; | ||
480 | } | ||
481 | pool_low = pool_size; | ||
482 | |||
483 | cpa_fill_pool(NULL); | ||
484 | printk(KERN_DEBUG | ||
485 | "CPA: page pool initialized %lu of %lu pages preallocated\n", | ||
486 | pool_pages, pool_size); | ||
487 | } | ||
488 | |||
489 | static int split_large_page(pte_t *kpte, unsigned long address) | 458 | static int split_large_page(pte_t *kpte, unsigned long address) |
490 | { | 459 | { |
491 | unsigned long flags, pfn, pfninc = 1; | 460 | unsigned long flags, pfn, pfninc = 1; |
@@ -494,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
494 | pgprot_t ref_prot; | 463 | pgprot_t ref_prot; |
495 | struct page *base; | 464 | struct page *base; |
496 | 465 | ||
497 | /* | 466 | if (!debug_pagealloc) |
498 | * Get a page from the pool. The pool list is protected by the | 467 | spin_unlock(&cpa_lock); |
499 | * pgd_lock, which we have to take anyway for the split | 468 | base = alloc_pages(GFP_KERNEL, 0); |
500 | * operation: | 469 | if (!debug_pagealloc) |
501 | */ | 470 | spin_lock(&cpa_lock); |
502 | spin_lock_irqsave(&pgd_lock, flags); | 471 | if (!base) |
503 | if (list_empty(&page_pool)) { | 472 | return -ENOMEM; |
504 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
505 | base = NULL; | ||
506 | cpa_fill_pool(&base); | ||
507 | if (!base) | ||
508 | return -ENOMEM; | ||
509 | spin_lock_irqsave(&pgd_lock, flags); | ||
510 | } else { | ||
511 | base = list_first_entry(&page_pool, struct page, lru); | ||
512 | list_del(&base->lru); | ||
513 | pool_pages--; | ||
514 | |||
515 | if (pool_pages < pool_low) | ||
516 | pool_low = pool_pages; | ||
517 | } | ||
518 | 473 | ||
474 | spin_lock_irqsave(&pgd_lock, flags); | ||
519 | /* | 475 | /* |
520 | * Check for races, another CPU might have split this page | 476 | * Check for races, another CPU might have split this page |
521 | * up for us already: | 477 | * up for us already: |
@@ -572,11 +528,8 @@ out_unlock: | |||
572 | * If we dropped out via the lookup_address check under | 528 | * If we dropped out via the lookup_address check under |
573 | * pgd_lock then stick the page back into the pool: | 529 | * pgd_lock then stick the page back into the pool: |
574 | */ | 530 | */ |
575 | if (base) { | 531 | if (base) |
576 | list_add(&base->lru, &page_pool); | 532 | __free_page(base); |
577 | pool_pages++; | ||
578 | } else | ||
579 | pool_used++; | ||
580 | spin_unlock_irqrestore(&pgd_lock, flags); | 533 | spin_unlock_irqrestore(&pgd_lock, flags); |
581 | 534 | ||
582 | return 0; | 535 | return 0; |
@@ -584,11 +537,16 @@ out_unlock: | |||
584 | 537 | ||
585 | static int __change_page_attr(struct cpa_data *cpa, int primary) | 538 | static int __change_page_attr(struct cpa_data *cpa, int primary) |
586 | { | 539 | { |
587 | unsigned long address = cpa->vaddr; | 540 | unsigned long address; |
588 | int do_split, err; | 541 | int do_split, err; |
589 | unsigned int level; | 542 | unsigned int level; |
590 | pte_t *kpte, old_pte; | 543 | pte_t *kpte, old_pte; |
591 | 544 | ||
545 | if (cpa->flags & CPA_ARRAY) | ||
546 | address = cpa->vaddr[cpa->curpage]; | ||
547 | else | ||
548 | address = *cpa->vaddr; | ||
549 | |||
592 | repeat: | 550 | repeat: |
593 | kpte = lookup_address(address, &level); | 551 | kpte = lookup_address(address, &level); |
594 | if (!kpte) | 552 | if (!kpte) |
@@ -600,7 +558,7 @@ repeat: | |||
600 | return 0; | 558 | return 0; |
601 | WARN(1, KERN_WARNING "CPA: called for zero pte. " | 559 | WARN(1, KERN_WARNING "CPA: called for zero pte. " |
602 | "vaddr = %lx cpa->vaddr = %lx\n", address, | 560 | "vaddr = %lx cpa->vaddr = %lx\n", address, |
603 | cpa->vaddr); | 561 | *cpa->vaddr); |
604 | return -EINVAL; | 562 | return -EINVAL; |
605 | } | 563 | } |
606 | 564 | ||
@@ -626,7 +584,7 @@ repeat: | |||
626 | */ | 584 | */ |
627 | if (pte_val(old_pte) != pte_val(new_pte)) { | 585 | if (pte_val(old_pte) != pte_val(new_pte)) { |
628 | set_pte_atomic(kpte, new_pte); | 586 | set_pte_atomic(kpte, new_pte); |
629 | cpa->flushtlb = 1; | 587 | cpa->flags |= CPA_FLUSHTLB; |
630 | } | 588 | } |
631 | cpa->numpages = 1; | 589 | cpa->numpages = 1; |
632 | return 0; | 590 | return 0; |
@@ -650,7 +608,25 @@ repeat: | |||
650 | */ | 608 | */ |
651 | err = split_large_page(kpte, address); | 609 | err = split_large_page(kpte, address); |
652 | if (!err) { | 610 | if (!err) { |
653 | cpa->flushtlb = 1; | 611 | /* |
612 | * Do a global flush tlb after splitting the large page | ||
613 | * and before we do the actual change page attribute in the PTE. | ||
614 | * | ||
615 | * With out this, we violate the TLB application note, that says | ||
616 | * "The TLBs may contain both ordinary and large-page | ||
617 | * translations for a 4-KByte range of linear addresses. This | ||
618 | * may occur if software modifies the paging structures so that | ||
619 | * the page size used for the address range changes. If the two | ||
620 | * translations differ with respect to page frame or attributes | ||
621 | * (e.g., permissions), processor behavior is undefined and may | ||
622 | * be implementation-specific." | ||
623 | * | ||
624 | * We do this global tlb flush inside the cpa_lock, so that we | ||
625 | * don't allow any other cpu, with stale tlb entries change the | ||
626 | * page attribute in parallel, that also falls into the | ||
627 | * just split large page entry. | ||
628 | */ | ||
629 | flush_tlb_all(); | ||
654 | goto repeat; | 630 | goto repeat; |
655 | } | 631 | } |
656 | 632 | ||
@@ -663,6 +639,7 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
663 | { | 639 | { |
664 | struct cpa_data alias_cpa; | 640 | struct cpa_data alias_cpa; |
665 | int ret = 0; | 641 | int ret = 0; |
642 | unsigned long temp_cpa_vaddr, vaddr; | ||
666 | 643 | ||
667 | if (cpa->pfn >= max_pfn_mapped) | 644 | if (cpa->pfn >= max_pfn_mapped) |
668 | return 0; | 645 | return 0; |
@@ -675,16 +652,24 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
675 | * No need to redo, when the primary call touched the direct | 652 | * No need to redo, when the primary call touched the direct |
676 | * mapping already: | 653 | * mapping already: |
677 | */ | 654 | */ |
678 | if (!(within(cpa->vaddr, PAGE_OFFSET, | 655 | if (cpa->flags & CPA_ARRAY) |
656 | vaddr = cpa->vaddr[cpa->curpage]; | ||
657 | else | ||
658 | vaddr = *cpa->vaddr; | ||
659 | |||
660 | if (!(within(vaddr, PAGE_OFFSET, | ||
679 | PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) | 661 | PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) |
680 | #ifdef CONFIG_X86_64 | 662 | #ifdef CONFIG_X86_64 |
681 | || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), | 663 | || within(vaddr, PAGE_OFFSET + (1UL<<32), |
682 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) | 664 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) |
683 | #endif | 665 | #endif |
684 | )) { | 666 | )) { |
685 | 667 | ||
686 | alias_cpa = *cpa; | 668 | alias_cpa = *cpa; |
687 | alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); | 669 | temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); |
670 | alias_cpa.vaddr = &temp_cpa_vaddr; | ||
671 | alias_cpa.flags &= ~CPA_ARRAY; | ||
672 | |||
688 | 673 | ||
689 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | 674 | ret = __change_page_attr_set_clr(&alias_cpa, 0); |
690 | } | 675 | } |
@@ -696,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
696 | * No need to redo, when the primary call touched the high | 681 | * No need to redo, when the primary call touched the high |
697 | * mapping already: | 682 | * mapping already: |
698 | */ | 683 | */ |
699 | if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) | 684 | if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) |
700 | return 0; | 685 | return 0; |
701 | 686 | ||
702 | /* | 687 | /* |
@@ -707,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
707 | return 0; | 692 | return 0; |
708 | 693 | ||
709 | alias_cpa = *cpa; | 694 | alias_cpa = *cpa; |
710 | alias_cpa.vaddr = | 695 | temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; |
711 | (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; | 696 | alias_cpa.vaddr = &temp_cpa_vaddr; |
697 | alias_cpa.flags &= ~CPA_ARRAY; | ||
712 | 698 | ||
713 | /* | 699 | /* |
714 | * The high mapping range is imprecise, so ignore the return value. | 700 | * The high mapping range is imprecise, so ignore the return value. |
@@ -728,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | |||
728 | * preservation check. | 714 | * preservation check. |
729 | */ | 715 | */ |
730 | cpa->numpages = numpages; | 716 | cpa->numpages = numpages; |
717 | /* for array changes, we can't use large page */ | ||
718 | if (cpa->flags & CPA_ARRAY) | ||
719 | cpa->numpages = 1; | ||
731 | 720 | ||
721 | if (!debug_pagealloc) | ||
722 | spin_lock(&cpa_lock); | ||
732 | ret = __change_page_attr(cpa, checkalias); | 723 | ret = __change_page_attr(cpa, checkalias); |
724 | if (!debug_pagealloc) | ||
725 | spin_unlock(&cpa_lock); | ||
733 | if (ret) | 726 | if (ret) |
734 | return ret; | 727 | return ret; |
735 | 728 | ||
@@ -746,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | |||
746 | */ | 739 | */ |
747 | BUG_ON(cpa->numpages > numpages); | 740 | BUG_ON(cpa->numpages > numpages); |
748 | numpages -= cpa->numpages; | 741 | numpages -= cpa->numpages; |
749 | cpa->vaddr += cpa->numpages * PAGE_SIZE; | 742 | if (cpa->flags & CPA_ARRAY) |
743 | cpa->curpage++; | ||
744 | else | ||
745 | *cpa->vaddr += cpa->numpages * PAGE_SIZE; | ||
746 | |||
750 | } | 747 | } |
751 | return 0; | 748 | return 0; |
752 | } | 749 | } |
@@ -757,9 +754,9 @@ static inline int cache_attr(pgprot_t attr) | |||
757 | (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); | 754 | (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); |
758 | } | 755 | } |
759 | 756 | ||
760 | static int change_page_attr_set_clr(unsigned long addr, int numpages, | 757 | static int change_page_attr_set_clr(unsigned long *addr, int numpages, |
761 | pgprot_t mask_set, pgprot_t mask_clr, | 758 | pgprot_t mask_set, pgprot_t mask_clr, |
762 | int force_split) | 759 | int force_split, int array) |
763 | { | 760 | { |
764 | struct cpa_data cpa; | 761 | struct cpa_data cpa; |
765 | int ret, cache, checkalias; | 762 | int ret, cache, checkalias; |
@@ -774,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | |||
774 | return 0; | 771 | return 0; |
775 | 772 | ||
776 | /* Ensure we are PAGE_SIZE aligned */ | 773 | /* Ensure we are PAGE_SIZE aligned */ |
777 | if (addr & ~PAGE_MASK) { | 774 | if (!array) { |
778 | addr &= PAGE_MASK; | 775 | if (*addr & ~PAGE_MASK) { |
779 | /* | 776 | *addr &= PAGE_MASK; |
780 | * People should not be passing in unaligned addresses: | 777 | /* |
781 | */ | 778 | * People should not be passing in unaligned addresses: |
782 | WARN_ON_ONCE(1); | 779 | */ |
780 | WARN_ON_ONCE(1); | ||
781 | } | ||
782 | } else { | ||
783 | int i; | ||
784 | for (i = 0; i < numpages; i++) { | ||
785 | if (addr[i] & ~PAGE_MASK) { | ||
786 | addr[i] &= PAGE_MASK; | ||
787 | WARN_ON_ONCE(1); | ||
788 | } | ||
789 | } | ||
783 | } | 790 | } |
784 | 791 | ||
792 | /* Must avoid aliasing mappings in the highmem code */ | ||
793 | kmap_flush_unused(); | ||
794 | |||
785 | cpa.vaddr = addr; | 795 | cpa.vaddr = addr; |
786 | cpa.numpages = numpages; | 796 | cpa.numpages = numpages; |
787 | cpa.mask_set = mask_set; | 797 | cpa.mask_set = mask_set; |
788 | cpa.mask_clr = mask_clr; | 798 | cpa.mask_clr = mask_clr; |
789 | cpa.flushtlb = 0; | 799 | cpa.flags = 0; |
800 | cpa.curpage = 0; | ||
790 | cpa.force_split = force_split; | 801 | cpa.force_split = force_split; |
791 | 802 | ||
803 | if (array) | ||
804 | cpa.flags |= CPA_ARRAY; | ||
805 | |||
792 | /* No alias checking for _NX bit modifications */ | 806 | /* No alias checking for _NX bit modifications */ |
793 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; | 807 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; |
794 | 808 | ||
@@ -797,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | |||
797 | /* | 811 | /* |
798 | * Check whether we really changed something: | 812 | * Check whether we really changed something: |
799 | */ | 813 | */ |
800 | if (!cpa.flushtlb) | 814 | if (!(cpa.flags & CPA_FLUSHTLB)) |
801 | goto out; | 815 | goto out; |
802 | 816 | ||
803 | /* | 817 | /* |
@@ -812,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | |||
812 | * error case we fall back to cpa_flush_all (which uses | 826 | * error case we fall back to cpa_flush_all (which uses |
813 | * wbindv): | 827 | * wbindv): |
814 | */ | 828 | */ |
815 | if (!ret && cpu_has_clflush) | 829 | if (!ret && cpu_has_clflush) { |
816 | cpa_flush_range(addr, numpages, cache); | 830 | if (cpa.flags & CPA_ARRAY) |
817 | else | 831 | cpa_flush_array(addr, numpages, cache); |
832 | else | ||
833 | cpa_flush_range(*addr, numpages, cache); | ||
834 | } else | ||
818 | cpa_flush_all(cache); | 835 | cpa_flush_all(cache); |
819 | 836 | ||
820 | out: | 837 | out: |
821 | cpa_fill_pool(NULL); | ||
822 | |||
823 | return ret; | 838 | return ret; |
824 | } | 839 | } |
825 | 840 | ||
826 | static inline int change_page_attr_set(unsigned long addr, int numpages, | 841 | static inline int change_page_attr_set(unsigned long *addr, int numpages, |
827 | pgprot_t mask) | 842 | pgprot_t mask, int array) |
828 | { | 843 | { |
829 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); | 844 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, |
845 | array); | ||
830 | } | 846 | } |
831 | 847 | ||
832 | static inline int change_page_attr_clear(unsigned long addr, int numpages, | 848 | static inline int change_page_attr_clear(unsigned long *addr, int numpages, |
833 | pgprot_t mask) | 849 | pgprot_t mask, int array) |
834 | { | 850 | { |
835 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); | 851 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, |
852 | array); | ||
836 | } | 853 | } |
837 | 854 | ||
838 | int _set_memory_uc(unsigned long addr, int numpages) | 855 | int _set_memory_uc(unsigned long addr, int numpages) |
@@ -840,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages) | |||
840 | /* | 857 | /* |
841 | * for now UC MINUS. see comments in ioremap_nocache() | 858 | * for now UC MINUS. see comments in ioremap_nocache() |
842 | */ | 859 | */ |
843 | return change_page_attr_set(addr, numpages, | 860 | return change_page_attr_set(&addr, numpages, |
844 | __pgprot(_PAGE_CACHE_UC_MINUS)); | 861 | __pgprot(_PAGE_CACHE_UC_MINUS), 0); |
845 | } | 862 | } |
846 | 863 | ||
847 | int set_memory_uc(unsigned long addr, int numpages) | 864 | int set_memory_uc(unsigned long addr, int numpages) |
@@ -857,10 +874,48 @@ int set_memory_uc(unsigned long addr, int numpages) | |||
857 | } | 874 | } |
858 | EXPORT_SYMBOL(set_memory_uc); | 875 | EXPORT_SYMBOL(set_memory_uc); |
859 | 876 | ||
877 | int set_memory_array_uc(unsigned long *addr, int addrinarray) | ||
878 | { | ||
879 | unsigned long start; | ||
880 | unsigned long end; | ||
881 | int i; | ||
882 | /* | ||
883 | * for now UC MINUS. see comments in ioremap_nocache() | ||
884 | */ | ||
885 | for (i = 0; i < addrinarray; i++) { | ||
886 | start = __pa(addr[i]); | ||
887 | for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { | ||
888 | if (end != __pa(addr[i + 1])) | ||
889 | break; | ||
890 | i++; | ||
891 | } | ||
892 | if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) | ||
893 | goto out; | ||
894 | } | ||
895 | |||
896 | return change_page_attr_set(addr, addrinarray, | ||
897 | __pgprot(_PAGE_CACHE_UC_MINUS), 1); | ||
898 | out: | ||
899 | for (i = 0; i < addrinarray; i++) { | ||
900 | unsigned long tmp = __pa(addr[i]); | ||
901 | |||
902 | if (tmp == start) | ||
903 | break; | ||
904 | for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { | ||
905 | if (end != __pa(addr[i + 1])) | ||
906 | break; | ||
907 | i++; | ||
908 | } | ||
909 | free_memtype(tmp, end); | ||
910 | } | ||
911 | return -EINVAL; | ||
912 | } | ||
913 | EXPORT_SYMBOL(set_memory_array_uc); | ||
914 | |||
860 | int _set_memory_wc(unsigned long addr, int numpages) | 915 | int _set_memory_wc(unsigned long addr, int numpages) |
861 | { | 916 | { |
862 | return change_page_attr_set(addr, numpages, | 917 | return change_page_attr_set(&addr, numpages, |
863 | __pgprot(_PAGE_CACHE_WC)); | 918 | __pgprot(_PAGE_CACHE_WC), 0); |
864 | } | 919 | } |
865 | 920 | ||
866 | int set_memory_wc(unsigned long addr, int numpages) | 921 | int set_memory_wc(unsigned long addr, int numpages) |
@@ -878,8 +933,8 @@ EXPORT_SYMBOL(set_memory_wc); | |||
878 | 933 | ||
879 | int _set_memory_wb(unsigned long addr, int numpages) | 934 | int _set_memory_wb(unsigned long addr, int numpages) |
880 | { | 935 | { |
881 | return change_page_attr_clear(addr, numpages, | 936 | return change_page_attr_clear(&addr, numpages, |
882 | __pgprot(_PAGE_CACHE_MASK)); | 937 | __pgprot(_PAGE_CACHE_MASK), 0); |
883 | } | 938 | } |
884 | 939 | ||
885 | int set_memory_wb(unsigned long addr, int numpages) | 940 | int set_memory_wb(unsigned long addr, int numpages) |
@@ -890,37 +945,59 @@ int set_memory_wb(unsigned long addr, int numpages) | |||
890 | } | 945 | } |
891 | EXPORT_SYMBOL(set_memory_wb); | 946 | EXPORT_SYMBOL(set_memory_wb); |
892 | 947 | ||
948 | int set_memory_array_wb(unsigned long *addr, int addrinarray) | ||
949 | { | ||
950 | int i; | ||
951 | |||
952 | for (i = 0; i < addrinarray; i++) { | ||
953 | unsigned long start = __pa(addr[i]); | ||
954 | unsigned long end; | ||
955 | |||
956 | for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { | ||
957 | if (end != __pa(addr[i + 1])) | ||
958 | break; | ||
959 | i++; | ||
960 | } | ||
961 | free_memtype(start, end); | ||
962 | } | ||
963 | return change_page_attr_clear(addr, addrinarray, | ||
964 | __pgprot(_PAGE_CACHE_MASK), 1); | ||
965 | } | ||
966 | EXPORT_SYMBOL(set_memory_array_wb); | ||
967 | |||
893 | int set_memory_x(unsigned long addr, int numpages) | 968 | int set_memory_x(unsigned long addr, int numpages) |
894 | { | 969 | { |
895 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); | 970 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); |
896 | } | 971 | } |
897 | EXPORT_SYMBOL(set_memory_x); | 972 | EXPORT_SYMBOL(set_memory_x); |
898 | 973 | ||
899 | int set_memory_nx(unsigned long addr, int numpages) | 974 | int set_memory_nx(unsigned long addr, int numpages) |
900 | { | 975 | { |
901 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); | 976 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); |
902 | } | 977 | } |
903 | EXPORT_SYMBOL(set_memory_nx); | 978 | EXPORT_SYMBOL(set_memory_nx); |
904 | 979 | ||
905 | int set_memory_ro(unsigned long addr, int numpages) | 980 | int set_memory_ro(unsigned long addr, int numpages) |
906 | { | 981 | { |
907 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); | 982 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); |
908 | } | 983 | } |
984 | EXPORT_SYMBOL_GPL(set_memory_ro); | ||
909 | 985 | ||
910 | int set_memory_rw(unsigned long addr, int numpages) | 986 | int set_memory_rw(unsigned long addr, int numpages) |
911 | { | 987 | { |
912 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); | 988 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); |
913 | } | 989 | } |
990 | EXPORT_SYMBOL_GPL(set_memory_rw); | ||
914 | 991 | ||
915 | int set_memory_np(unsigned long addr, int numpages) | 992 | int set_memory_np(unsigned long addr, int numpages) |
916 | { | 993 | { |
917 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); | 994 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); |
918 | } | 995 | } |
919 | 996 | ||
920 | int set_memory_4k(unsigned long addr, int numpages) | 997 | int set_memory_4k(unsigned long addr, int numpages) |
921 | { | 998 | { |
922 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), | 999 | return change_page_attr_set_clr(&addr, numpages, __pgprot(0), |
923 | __pgprot(0), 1); | 1000 | __pgprot(0), 1, 0); |
924 | } | 1001 | } |
925 | 1002 | ||
926 | int set_pages_uc(struct page *page, int numpages) | 1003 | int set_pages_uc(struct page *page, int numpages) |
@@ -973,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages) | |||
973 | 1050 | ||
974 | static int __set_pages_p(struct page *page, int numpages) | 1051 | static int __set_pages_p(struct page *page, int numpages) |
975 | { | 1052 | { |
976 | struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), | 1053 | unsigned long tempaddr = (unsigned long) page_address(page); |
1054 | struct cpa_data cpa = { .vaddr = &tempaddr, | ||
977 | .numpages = numpages, | 1055 | .numpages = numpages, |
978 | .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), | 1056 | .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), |
979 | .mask_clr = __pgprot(0)}; | 1057 | .mask_clr = __pgprot(0), |
1058 | .flags = 0}; | ||
980 | 1059 | ||
981 | return __change_page_attr_set_clr(&cpa, 1); | 1060 | /* |
1061 | * No alias checking needed for setting present flag. otherwise, | ||
1062 | * we may need to break large pages for 64-bit kernel text | ||
1063 | * mappings (this adds to complexity if we want to do this from | ||
1064 | * atomic context especially). Let's keep it simple! | ||
1065 | */ | ||
1066 | return __change_page_attr_set_clr(&cpa, 0); | ||
982 | } | 1067 | } |
983 | 1068 | ||
984 | static int __set_pages_np(struct page *page, int numpages) | 1069 | static int __set_pages_np(struct page *page, int numpages) |
985 | { | 1070 | { |
986 | struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), | 1071 | unsigned long tempaddr = (unsigned long) page_address(page); |
1072 | struct cpa_data cpa = { .vaddr = &tempaddr, | ||
987 | .numpages = numpages, | 1073 | .numpages = numpages, |
988 | .mask_set = __pgprot(0), | 1074 | .mask_set = __pgprot(0), |
989 | .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; | 1075 | .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), |
1076 | .flags = 0}; | ||
990 | 1077 | ||
991 | return __change_page_attr_set_clr(&cpa, 1); | 1078 | /* |
1079 | * No alias checking needed for setting not present flag. otherwise, | ||
1080 | * we may need to break large pages for 64-bit kernel text | ||
1081 | * mappings (this adds to complexity if we want to do this from | ||
1082 | * atomic context especially). Let's keep it simple! | ||
1083 | */ | ||
1084 | return __change_page_attr_set_clr(&cpa, 0); | ||
992 | } | 1085 | } |
993 | 1086 | ||
994 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1087 | void kernel_map_pages(struct page *page, int numpages, int enable) |
@@ -1008,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) | |||
1008 | 1101 | ||
1009 | /* | 1102 | /* |
1010 | * The return value is ignored as the calls cannot fail. | 1103 | * The return value is ignored as the calls cannot fail. |
1011 | * Large pages are kept enabled at boot time, and are | 1104 | * Large pages for identity mappings are not used at boot time |
1012 | * split up quickly with DEBUG_PAGEALLOC. If a splitup | 1105 | * and hence no memory allocations during large page split. |
1013 | * fails here (due to temporary memory shortage) no damage | ||
1014 | * is done because we just keep the largepage intact up | ||
1015 | * to the next attempt when it will likely be split up: | ||
1016 | */ | 1106 | */ |
1017 | if (enable) | 1107 | if (enable) |
1018 | __set_pages_p(page, numpages); | 1108 | __set_pages_p(page, numpages); |
@@ -1024,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) | |||
1024 | * but that can deadlock->flush only current cpu: | 1114 | * but that can deadlock->flush only current cpu: |
1025 | */ | 1115 | */ |
1026 | __flush_tlb_all(); | 1116 | __flush_tlb_all(); |
1027 | |||
1028 | /* | ||
1029 | * Try to refill the page pool here. We can do this only after | ||
1030 | * the tlb flush. | ||
1031 | */ | ||
1032 | cpa_fill_pool(NULL); | ||
1033 | } | 1117 | } |
1034 | 1118 | ||
1035 | #ifdef CONFIG_DEBUG_FS | ||
1036 | static int dpa_show(struct seq_file *m, void *v) | ||
1037 | { | ||
1038 | seq_puts(m, "DEBUG_PAGEALLOC\n"); | ||
1039 | seq_printf(m, "pool_size : %lu\n", pool_size); | ||
1040 | seq_printf(m, "pool_pages : %lu\n", pool_pages); | ||
1041 | seq_printf(m, "pool_low : %lu\n", pool_low); | ||
1042 | seq_printf(m, "pool_used : %lu\n", pool_used); | ||
1043 | seq_printf(m, "pool_failed : %lu\n", pool_failed); | ||
1044 | |||
1045 | return 0; | ||
1046 | } | ||
1047 | |||
1048 | static int dpa_open(struct inode *inode, struct file *filp) | ||
1049 | { | ||
1050 | return single_open(filp, dpa_show, NULL); | ||
1051 | } | ||
1052 | |||
1053 | static const struct file_operations dpa_fops = { | ||
1054 | .open = dpa_open, | ||
1055 | .read = seq_read, | ||
1056 | .llseek = seq_lseek, | ||
1057 | .release = single_release, | ||
1058 | }; | ||
1059 | |||
1060 | static int __init debug_pagealloc_proc_init(void) | ||
1061 | { | ||
1062 | struct dentry *de; | ||
1063 | |||
1064 | de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, | ||
1065 | &dpa_fops); | ||
1066 | if (!de) | ||
1067 | return -ENOMEM; | ||
1068 | |||
1069 | return 0; | ||
1070 | } | ||
1071 | __initcall(debug_pagealloc_proc_init); | ||
1072 | #endif | ||
1073 | |||
1074 | #ifdef CONFIG_HIBERNATION | 1119 | #ifdef CONFIG_HIBERNATION |
1075 | 1120 | ||
1076 | bool kernel_page_present(struct page *page) | 1121 | bool kernel_page_present(struct page *page) |
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2a50e0fa64a5..738fd0f24958 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -7,24 +7,24 @@ | |||
7 | * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. | 7 | * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/mm.h> | 10 | #include <linux/seq_file.h> |
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/debugfs.h> | ||
11 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
12 | #include <linux/gfp.h> | 14 | #include <linux/gfp.h> |
15 | #include <linux/mm.h> | ||
13 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/debugfs.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | 17 | ||
18 | #include <asm/msr.h> | 18 | #include <asm/cacheflush.h> |
19 | #include <asm/tlbflush.h> | ||
20 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
21 | #include <asm/page.h> | 20 | #include <asm/tlbflush.h> |
22 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
23 | #include <asm/pat.h> | ||
24 | #include <asm/e820.h> | ||
25 | #include <asm/cacheflush.h> | ||
26 | #include <asm/fcntl.h> | 22 | #include <asm/fcntl.h> |
23 | #include <asm/e820.h> | ||
27 | #include <asm/mtrr.h> | 24 | #include <asm/mtrr.h> |
25 | #include <asm/page.h> | ||
26 | #include <asm/msr.h> | ||
27 | #include <asm/pat.h> | ||
28 | #include <asm/io.h> | 28 | #include <asm/io.h> |
29 | 29 | ||
30 | #ifdef CONFIG_X86_PAT | 30 | #ifdef CONFIG_X86_PAT |
@@ -46,6 +46,7 @@ early_param("nopat", nopat); | |||
46 | 46 | ||
47 | 47 | ||
48 | static int debug_enable; | 48 | static int debug_enable; |
49 | |||
49 | static int __init pat_debug_setup(char *str) | 50 | static int __init pat_debug_setup(char *str) |
50 | { | 51 | { |
51 | debug_enable = 1; | 52 | debug_enable = 1; |
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags) | |||
145 | */ | 146 | */ |
146 | 147 | ||
147 | struct memtype { | 148 | struct memtype { |
148 | u64 start; | 149 | u64 start; |
149 | u64 end; | 150 | u64 end; |
150 | unsigned long type; | 151 | unsigned long type; |
151 | struct list_head nd; | 152 | struct list_head nd; |
152 | }; | 153 | }; |
153 | 154 | ||
154 | static LIST_HEAD(memtype_list); | 155 | static LIST_HEAD(memtype_list); |
155 | static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ | 156 | static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ |
156 | 157 | ||
157 | /* | 158 | /* |
158 | * Does intersection of PAT memory type and MTRR memory type and returns | 159 | * Does intersection of PAT memory type and MTRR memory type and returns |
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) | |||
180 | return req_type; | 181 | return req_type; |
181 | } | 182 | } |
182 | 183 | ||
183 | static int chk_conflict(struct memtype *new, struct memtype *entry, | 184 | static int |
184 | unsigned long *type) | 185 | chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) |
185 | { | 186 | { |
186 | if (new->type != entry->type) { | 187 | if (new->type != entry->type) { |
187 | if (type) { | 188 | if (type) { |
@@ -211,6 +212,66 @@ static struct memtype *cached_entry; | |||
211 | static u64 cached_start; | 212 | static u64 cached_start; |
212 | 213 | ||
213 | /* | 214 | /* |
215 | * For RAM pages, mark the pages as non WB memory type using | ||
216 | * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or | ||
217 | * set_memory_wc() on a RAM page at a time before marking it as WB again. | ||
218 | * This is ok, because only one driver will be owning the page and | ||
219 | * doing set_memory_*() calls. | ||
220 | * | ||
221 | * For now, we use PageNonWB to track that the RAM page is being mapped | ||
222 | * as non WB. In future, we will have to use one more flag | ||
223 | * (or some other mechanism in page_struct) to distinguish between | ||
224 | * UC and WC mapping. | ||
225 | */ | ||
226 | static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, | ||
227 | unsigned long *new_type) | ||
228 | { | ||
229 | struct page *page; | ||
230 | u64 pfn, end_pfn; | ||
231 | |||
232 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | ||
233 | page = pfn_to_page(pfn); | ||
234 | if (page_mapped(page) || PageNonWB(page)) | ||
235 | goto out; | ||
236 | |||
237 | SetPageNonWB(page); | ||
238 | } | ||
239 | return 0; | ||
240 | |||
241 | out: | ||
242 | end_pfn = pfn; | ||
243 | for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { | ||
244 | page = pfn_to_page(pfn); | ||
245 | ClearPageNonWB(page); | ||
246 | } | ||
247 | |||
248 | return -EINVAL; | ||
249 | } | ||
250 | |||
251 | static int free_ram_pages_type(u64 start, u64 end) | ||
252 | { | ||
253 | struct page *page; | ||
254 | u64 pfn, end_pfn; | ||
255 | |||
256 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | ||
257 | page = pfn_to_page(pfn); | ||
258 | if (page_mapped(page) || !PageNonWB(page)) | ||
259 | goto out; | ||
260 | |||
261 | ClearPageNonWB(page); | ||
262 | } | ||
263 | return 0; | ||
264 | |||
265 | out: | ||
266 | end_pfn = pfn; | ||
267 | for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { | ||
268 | page = pfn_to_page(pfn); | ||
269 | SetPageNonWB(page); | ||
270 | } | ||
271 | return -EINVAL; | ||
272 | } | ||
273 | |||
274 | /* | ||
214 | * req_type typically has one of the: | 275 | * req_type typically has one of the: |
215 | * - _PAGE_CACHE_WB | 276 | * - _PAGE_CACHE_WB |
216 | * - _PAGE_CACHE_WC | 277 | * - _PAGE_CACHE_WC |
@@ -226,14 +287,15 @@ static u64 cached_start; | |||
226 | * it will return a negative return value. | 287 | * it will return a negative return value. |
227 | */ | 288 | */ |
228 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, | 289 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, |
229 | unsigned long *new_type) | 290 | unsigned long *new_type) |
230 | { | 291 | { |
231 | struct memtype *new, *entry; | 292 | struct memtype *new, *entry; |
232 | unsigned long actual_type; | 293 | unsigned long actual_type; |
233 | struct list_head *where; | 294 | struct list_head *where; |
295 | int is_range_ram; | ||
234 | int err = 0; | 296 | int err = 0; |
235 | 297 | ||
236 | BUG_ON(start >= end); /* end is exclusive */ | 298 | BUG_ON(start >= end); /* end is exclusive */ |
237 | 299 | ||
238 | if (!pat_enabled) { | 300 | if (!pat_enabled) { |
239 | /* This is identical to page table setting without PAT */ | 301 | /* This is identical to page table setting without PAT */ |
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
266 | actual_type = _PAGE_CACHE_WB; | 328 | actual_type = _PAGE_CACHE_WB; |
267 | else | 329 | else |
268 | actual_type = _PAGE_CACHE_UC_MINUS; | 330 | actual_type = _PAGE_CACHE_UC_MINUS; |
269 | } else | 331 | } else { |
270 | actual_type = pat_x_mtrr_type(start, end, | 332 | actual_type = pat_x_mtrr_type(start, end, |
271 | req_type & _PAGE_CACHE_MASK); | 333 | req_type & _PAGE_CACHE_MASK); |
334 | } | ||
335 | |||
336 | is_range_ram = pagerange_is_ram(start, end); | ||
337 | if (is_range_ram == 1) | ||
338 | return reserve_ram_pages_type(start, end, req_type, new_type); | ||
339 | else if (is_range_ram < 0) | ||
340 | return -EINVAL; | ||
272 | 341 | ||
273 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); | 342 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); |
274 | if (!new) | 343 | if (!new) |
275 | return -ENOMEM; | 344 | return -ENOMEM; |
276 | 345 | ||
277 | new->start = start; | 346 | new->start = start; |
278 | new->end = end; | 347 | new->end = end; |
279 | new->type = actual_type; | 348 | new->type = actual_type; |
280 | 349 | ||
281 | if (new_type) | 350 | if (new_type) |
282 | *new_type = actual_type; | 351 | *new_type = actual_type; |
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
335 | start, end, cattr_name(new->type), cattr_name(req_type)); | 404 | start, end, cattr_name(new->type), cattr_name(req_type)); |
336 | kfree(new); | 405 | kfree(new); |
337 | spin_unlock(&memtype_lock); | 406 | spin_unlock(&memtype_lock); |
407 | |||
338 | return err; | 408 | return err; |
339 | } | 409 | } |
340 | 410 | ||
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end) | |||
358 | { | 428 | { |
359 | struct memtype *entry; | 429 | struct memtype *entry; |
360 | int err = -EINVAL; | 430 | int err = -EINVAL; |
431 | int is_range_ram; | ||
361 | 432 | ||
362 | if (!pat_enabled) | 433 | if (!pat_enabled) |
363 | return 0; | 434 | return 0; |
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end) | |||
366 | if (is_ISA_range(start, end - 1)) | 437 | if (is_ISA_range(start, end - 1)) |
367 | return 0; | 438 | return 0; |
368 | 439 | ||
440 | is_range_ram = pagerange_is_ram(start, end); | ||
441 | if (is_range_ram == 1) | ||
442 | return free_ram_pages_type(start, end); | ||
443 | else if (is_range_ram < 0) | ||
444 | return -EINVAL; | ||
445 | |||
369 | spin_lock(&memtype_lock); | 446 | spin_lock(&memtype_lock); |
370 | list_for_each_entry(entry, &memtype_list, nd) { | 447 | list_for_each_entry(entry, &memtype_list, nd) { |
371 | if (entry->start == start && entry->end == end) { | 448 | if (entry->start == start && entry->end == end) { |
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end) | |||
386 | } | 463 | } |
387 | 464 | ||
388 | dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); | 465 | dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); |
466 | |||
389 | return err; | 467 | return err; |
390 | } | 468 | } |
391 | 469 | ||
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
492 | 570 | ||
493 | void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) | 571 | void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) |
494 | { | 572 | { |
573 | unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); | ||
495 | u64 addr = (u64)pfn << PAGE_SHIFT; | 574 | u64 addr = (u64)pfn << PAGE_SHIFT; |
496 | unsigned long flags; | 575 | unsigned long flags; |
497 | unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); | ||
498 | 576 | ||
499 | reserve_memtype(addr, addr + size, want_flags, &flags); | 577 | reserve_memtype(addr, addr + size, want_flags, &flags); |
500 | if (flags != want_flags) { | 578 | if (flags != want_flags) { |
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) | |||
514 | free_memtype(addr, addr + size); | 592 | free_memtype(addr, addr + size); |
515 | } | 593 | } |
516 | 594 | ||
517 | #if defined(CONFIG_DEBUG_FS) | 595 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) |
518 | 596 | ||
519 | /* get Nth element of the linked list */ | 597 | /* get Nth element of the linked list */ |
520 | static struct memtype *memtype_get_idx(loff_t pos) | 598 | static struct memtype *memtype_get_idx(loff_t pos) |
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos) | |||
537 | } | 615 | } |
538 | spin_unlock(&memtype_lock); | 616 | spin_unlock(&memtype_lock); |
539 | kfree(print_entry); | 617 | kfree(print_entry); |
618 | |||
540 | return NULL; | 619 | return NULL; |
541 | } | 620 | } |
542 | 621 | ||
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v) | |||
567 | seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), | 646 | seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), |
568 | print_entry->start, print_entry->end); | 647 | print_entry->start, print_entry->end); |
569 | kfree(print_entry); | 648 | kfree(print_entry); |
649 | |||
570 | return 0; | 650 | return 0; |
571 | } | 651 | } |
572 | 652 | ||
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void) | |||
598 | 678 | ||
599 | late_initcall(pat_memtype_list_init); | 679 | late_initcall(pat_memtype_list_init); |
600 | 680 | ||
601 | #endif /* CONFIG_DEBUG_FS */ | 681 | #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d50302774fe2..86f2ffc43c3d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd) | |||
63 | #define UNSHARED_PTRS_PER_PGD \ | 63 | #define UNSHARED_PTRS_PER_PGD \ |
64 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) | 64 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) |
65 | 65 | ||
66 | static void pgd_ctor(void *p) | 66 | static void pgd_ctor(pgd_t *pgd) |
67 | { | 67 | { |
68 | pgd_t *pgd = p; | ||
69 | |||
70 | /* If the pgd points to a shared pagetable level (either the | 68 | /* If the pgd points to a shared pagetable level (either the |
71 | ptes in non-PAE, or shared PMD in PAE), then just copy the | 69 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
72 | references from swapper_pg_dir. */ | 70 | references from swapper_pg_dir. */ |
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p) | |||
87 | pgd_list_add(pgd); | 85 | pgd_list_add(pgd); |
88 | } | 86 | } |
89 | 87 | ||
90 | static void pgd_dtor(void *pgd) | 88 | static void pgd_dtor(pgd_t *pgd) |
91 | { | 89 | { |
92 | unsigned long flags; /* can be called from interrupt context */ | 90 | unsigned long flags; /* can be called from interrupt context */ |
93 | 91 | ||
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cab0abbd1ebe..0951db9ee519 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c | |||
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg) | |||
123 | if (!arg) | 123 | if (!arg) |
124 | return -EINVAL; | 124 | return -EINVAL; |
125 | 125 | ||
126 | __VMALLOC_RESERVE = memparse(arg, &arg); | 126 | /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ |
127 | __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; | ||
127 | return 0; | 128 | return 0; |
128 | } | 129 | } |
129 | early_param("vmalloc", parse_vmalloc); | 130 | early_param("vmalloc", parse_vmalloc); |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1b4763e26ea9..51c0a2fc14fe 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | |||
138 | return; | 138 | return; |
139 | } | 139 | } |
140 | 140 | ||
141 | if (is_uv_system()) | 141 | if (get_uv_system_type() >= UV_X2APIC) |
142 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; | 142 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; |
143 | else | 143 | else |
144 | apic_id = pa->apic_id; | 144 | apic_id = pa->apic_id; |
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 30f3eb366667..446902b2a6b6 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile | |||
@@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ | |||
7 | timer_int.o ) | 7 | timer_int.o ) |
8 | 8 | ||
9 | oprofile-y := $(DRIVER_OBJS) init.o backtrace.o | 9 | oprofile-y := $(DRIVER_OBJS) init.o backtrace.o |
10 | oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ | 10 | oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ |
11 | op_model_ppro.o op_model_p4.o | 11 | op_model_ppro.o op_model_p4.o |
12 | oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o | 12 | oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o |
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0227694f7dab..57f6c9088081 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c | |||
@@ -1,10 +1,11 @@ | |||
1 | /** | 1 | /** |
2 | * @file nmi_int.c | 2 | * @file nmi_int.c |
3 | * | 3 | * |
4 | * @remark Copyright 2002 OProfile authors | 4 | * @remark Copyright 2002-2008 OProfile authors |
5 | * @remark Read the file COPYING | 5 | * @remark Read the file COPYING |
6 | * | 6 | * |
7 | * @author John Levon <levon@movementarian.org> | 7 | * @author John Levon <levon@movementarian.org> |
8 | * @author Robert Richter <robert.richter@amd.com> | ||
8 | */ | 9 | */ |
9 | 10 | ||
10 | #include <linux/init.h> | 11 | #include <linux/init.h> |
@@ -295,10 +296,12 @@ static void nmi_cpu_shutdown(void *dummy) | |||
295 | 296 | ||
296 | static void nmi_shutdown(void) | 297 | static void nmi_shutdown(void) |
297 | { | 298 | { |
298 | struct op_msrs *msrs = &get_cpu_var(cpu_msrs); | 299 | struct op_msrs *msrs; |
300 | |||
299 | nmi_enabled = 0; | 301 | nmi_enabled = 0; |
300 | on_each_cpu(nmi_cpu_shutdown, NULL, 1); | 302 | on_each_cpu(nmi_cpu_shutdown, NULL, 1); |
301 | unregister_die_notifier(&profile_exceptions_nb); | 303 | unregister_die_notifier(&profile_exceptions_nb); |
304 | msrs = &get_cpu_var(cpu_msrs); | ||
302 | model->shutdown(msrs); | 305 | model->shutdown(msrs); |
303 | free_msrs(); | 306 | free_msrs(); |
304 | put_cpu_var(cpu_msrs); | 307 | put_cpu_var(cpu_msrs); |
@@ -437,6 +440,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
437 | __u8 vendor = boot_cpu_data.x86_vendor; | 440 | __u8 vendor = boot_cpu_data.x86_vendor; |
438 | __u8 family = boot_cpu_data.x86; | 441 | __u8 family = boot_cpu_data.x86; |
439 | char *cpu_type; | 442 | char *cpu_type; |
443 | int ret = 0; | ||
440 | 444 | ||
441 | if (!cpu_has_apic) | 445 | if (!cpu_has_apic) |
442 | return -ENODEV; | 446 | return -ENODEV; |
@@ -449,19 +453,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
449 | default: | 453 | default: |
450 | return -ENODEV; | 454 | return -ENODEV; |
451 | case 6: | 455 | case 6: |
452 | model = &op_athlon_spec; | 456 | model = &op_amd_spec; |
453 | cpu_type = "i386/athlon"; | 457 | cpu_type = "i386/athlon"; |
454 | break; | 458 | break; |
455 | case 0xf: | 459 | case 0xf: |
456 | model = &op_athlon_spec; | 460 | model = &op_amd_spec; |
457 | /* Actually it could be i386/hammer too, but give | 461 | /* Actually it could be i386/hammer too, but give |
458 | user space an consistent name. */ | 462 | user space an consistent name. */ |
459 | cpu_type = "x86-64/hammer"; | 463 | cpu_type = "x86-64/hammer"; |
460 | break; | 464 | break; |
461 | case 0x10: | 465 | case 0x10: |
462 | model = &op_athlon_spec; | 466 | model = &op_amd_spec; |
463 | cpu_type = "x86-64/family10"; | 467 | cpu_type = "x86-64/family10"; |
464 | break; | 468 | break; |
469 | case 0x11: | ||
470 | model = &op_amd_spec; | ||
471 | cpu_type = "x86-64/family11h"; | ||
472 | break; | ||
465 | } | 473 | } |
466 | break; | 474 | break; |
467 | 475 | ||
@@ -488,17 +496,24 @@ int __init op_nmi_init(struct oprofile_operations *ops) | |||
488 | return -ENODEV; | 496 | return -ENODEV; |
489 | } | 497 | } |
490 | 498 | ||
491 | init_sysfs(); | ||
492 | #ifdef CONFIG_SMP | 499 | #ifdef CONFIG_SMP |
493 | register_cpu_notifier(&oprofile_cpu_nb); | 500 | register_cpu_notifier(&oprofile_cpu_nb); |
494 | #endif | 501 | #endif |
495 | using_nmi = 1; | 502 | /* default values, can be overwritten by model */ |
496 | ops->create_files = nmi_create_files; | 503 | ops->create_files = nmi_create_files; |
497 | ops->setup = nmi_setup; | 504 | ops->setup = nmi_setup; |
498 | ops->shutdown = nmi_shutdown; | 505 | ops->shutdown = nmi_shutdown; |
499 | ops->start = nmi_start; | 506 | ops->start = nmi_start; |
500 | ops->stop = nmi_stop; | 507 | ops->stop = nmi_stop; |
501 | ops->cpu_type = cpu_type; | 508 | ops->cpu_type = cpu_type; |
509 | |||
510 | if (model->init) | ||
511 | ret = model->init(ops); | ||
512 | if (ret) | ||
513 | return ret; | ||
514 | |||
515 | init_sysfs(); | ||
516 | using_nmi = 1; | ||
502 | printk(KERN_INFO "oprofile: using NMI interrupt.\n"); | 517 | printk(KERN_INFO "oprofile: using NMI interrupt.\n"); |
503 | return 0; | 518 | return 0; |
504 | } | 519 | } |
@@ -511,4 +526,6 @@ void op_nmi_exit(void) | |||
511 | unregister_cpu_notifier(&oprofile_cpu_nb); | 526 | unregister_cpu_notifier(&oprofile_cpu_nb); |
512 | #endif | 527 | #endif |
513 | } | 528 | } |
529 | if (model->exit) | ||
530 | model->exit(); | ||
514 | } | 531 | } |
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 000000000000..d9faf607b3a6 --- /dev/null +++ b/arch/x86/oprofile/op_model_amd.c | |||
@@ -0,0 +1,543 @@ | |||
1 | /* | ||
2 | * @file op_model_amd.c | ||
3 | * athlon / K7 / K8 / Family 10h model-specific MSR operations | ||
4 | * | ||
5 | * @remark Copyright 2002-2008 OProfile authors | ||
6 | * @remark Read the file COPYING | ||
7 | * | ||
8 | * @author John Levon | ||
9 | * @author Philippe Elie | ||
10 | * @author Graydon Hoare | ||
11 | * @author Robert Richter <robert.richter@amd.com> | ||
12 | * @author Barry Kasindorf | ||
13 | */ | ||
14 | |||
15 | #include <linux/oprofile.h> | ||
16 | #include <linux/device.h> | ||
17 | #include <linux/pci.h> | ||
18 | |||
19 | #include <asm/ptrace.h> | ||
20 | #include <asm/msr.h> | ||
21 | #include <asm/nmi.h> | ||
22 | |||
23 | #include "op_x86_model.h" | ||
24 | #include "op_counter.h" | ||
25 | |||
#define NUM_COUNTERS 4
#define NUM_CONTROLS 4

/*
 * Counter MSR helpers.  A slot of msrs->counters[] whose addr is 0 is
 * treated as not reserved.  All macro arguments are parenthesized so that
 * compound expressions (e.g. "a | b") can be passed safely.
 */
#define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0)
#define CTR_READ(l, h, msrs, c) do {rdmsr((msrs)->counters[(c)].addr, (l), (h)); } while (0)
/* Writes the negated value of l to the low word and all ones to the high word. */
#define CTR_WRITE(l, msrs, c) do {wrmsr((msrs)->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
/* Non-zero when bit 31 of n is clear. */
#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))

/*
 * Control (event select) MSR helpers.  The CTRL_SET_* and CTRL_CLEAR_*
 * macros modify their first argument in place.
 */
#define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0)
#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs)->controls[(c)].addr, (l), (h)); } while (0)
#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs)->controls[(c)].addr, (l), (h)); } while (0)
#define CTRL_SET_ACTIVE(n) ((n) |= (1<<22))
#define CTRL_SET_INACTIVE(n) ((n) &= ~(1<<22))
/* Keeps only bit 21 of the low word. */
#define CTRL_CLEAR_LO(x) ((x) &= (1<<21))
/* Clears bits 0-3, 8 and 9 of the high word. */
#define CTRL_CLEAR_HI(x) ((x) &= 0xfffffcf0)
#define CTRL_SET_ENABLE(val) ((val) |= 1<<20)
#define CTRL_SET_USR(val, u) ((val) |= (((u) & 1) << 16))
#define CTRL_SET_KERN(val, k) ((val) |= (((k) & 1) << 17))
#define CTRL_SET_UM(val, m) ((val) |= ((m) << 8))
#define CTRL_SET_EVENT_LOW(val, e) ((val) |= ((e) & 0xff))
#define CTRL_SET_EVENT_HIGH(val, e) ((val) |= (((e) >> 8) & 0xf))
#define CTRL_SET_HOST_ONLY(val, h) ((val) |= (((h) & 1) << 9))
#define CTRL_SET_GUEST_ONLY(val, h) ((val) |= (((h) & 1) << 8))
49 | |||
50 | static unsigned long reset_value[NUM_COUNTERS]; | ||
51 | |||
52 | #ifdef CONFIG_OPROFILE_IBS | ||
53 | |||
54 | /* IbsFetchCtl bits/masks */ | ||
55 | #define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ | ||
56 | #define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ | ||
57 | #define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ | ||
58 | |||
59 | /*IbsOpCtl bits */ | ||
60 | #define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ | ||
61 | #define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ | ||
62 | |||
63 | /* Codes used in cpu_buffer.c */ | ||
64 | /* This produces duplicate code, need to be fixed */ | ||
65 | #define IBS_FETCH_BEGIN 3 | ||
66 | #define IBS_OP_BEGIN 4 | ||
67 | |||
68 | /* The function interface needs to be fixed, something like add | ||
69 | data. Should then be added to linux/oprofile.h. */ | ||
70 | extern void oprofile_add_ibs_sample(struct pt_regs *const regs, | ||
71 | unsigned int * const ibs_sample, u8 code); | ||
72 | |||
/*
 * Raw IBS fetch sample: three 64-bit MSR values, each stored as a
 * low/high pair of 32-bit words.  The struct is handed to
 * oprofile_add_ibs_sample() as a plain unsigned int array, so the field
 * order and types must not change.
 */
struct ibs_fetch_sample {
	unsigned int ibs_fetch_lin_addr_low;	/* MSRC001_1031 IBS Fetch Linear Address, low */
	unsigned int ibs_fetch_lin_addr_high;	/* MSRC001_1031 IBS Fetch Linear Address, high */
	unsigned int ibs_fetch_ctl_low;		/* MSRC001_1030 IBS Fetch Control, low */
	unsigned int ibs_fetch_ctl_high;	/* MSRC001_1030 IBS Fetch Control, high */
	unsigned int ibs_fetch_phys_addr_low;	/* MSRC001_1032 IBS Fetch Physical Address, low */
	unsigned int ibs_fetch_phys_addr_high;	/* MSRC001_1032 IBS Fetch Physical Address, high */
};
84 | |||
/*
 * Raw IBS op sample: six 64-bit MSR values, each stored as a low/high
 * pair of 32-bit words.  The struct is handed to
 * oprofile_add_ibs_sample() as a plain unsigned int array, so the field
 * order and types must not change.
 */
struct ibs_op_sample {
	unsigned int ibs_op_rip_low;	/* MSRC001_1034 IBS Op Logical Address (IbsRIP), low */
	unsigned int ibs_op_rip_high;	/* MSRC001_1034 IBS Op Logical Address (IbsRIP), high */
	unsigned int ibs_op_data1_low;	/* MSRC001_1035 IBS Op Data, low */
	unsigned int ibs_op_data1_high;	/* MSRC001_1035 IBS Op Data, high */
	unsigned int ibs_op_data2_low;	/* MSRC001_1036 IBS Op Data 2, low */
	unsigned int ibs_op_data2_high;	/* MSRC001_1036 IBS Op Data 2, high */
	unsigned int ibs_op_data3_low;	/* MSRC001_1037 IBS Op Data 3, low */
	unsigned int ibs_op_data3_high;	/* MSRC001_1037 IBS Op Data 3, high */
	unsigned int ibs_dc_linear_low;	/* MSRC001_1038 IBS DC Linear Address (IbsDcLinAd), low */
	unsigned int ibs_dc_linear_high; /* MSRC001_1038 IBS DC Linear Address (IbsDcLinAd), high */
	unsigned int ibs_dc_phys_low;	/* MSRC001_1039 IBS DC Physical Address (IbsDcPhysAd), low */
	unsigned int ibs_dc_phys_high;	/* MSRC001_1039 IBS DC Physical Address (IbsDcPhysAd), high */
};
105 | |||
106 | /* | ||
107 | * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h+ | ||
108 | */ | ||
109 | static void clear_ibs_nmi(void); | ||
110 | |||
111 | static int ibs_allowed; /* AMD Family10h and later */ | ||
112 | |||
/*
 * User-tunable IBS settings.  The fields are exported through oprofilefs
 * (see setup_ibs_files()) and read by the start/stop/NMI paths.
 */
struct op_ibs_config {
	unsigned long op_enabled;	/* non-zero: IBS op sampling requested */
	unsigned long fetch_enabled;	/* non-zero: IBS fetch sampling requested */
	unsigned long max_cnt_fetch;	/* fetch period; shifted right by 4 before use */
	unsigned long max_cnt_op;	/* op period; shifted right by 4 before use */
	unsigned long rand_en;		/* exported as ibs_fetch/rand_enable */
	unsigned long dispatched_ops;	/* exported as ibs_uops/dispatched_ops */
};
121 | |||
122 | static struct op_ibs_config ibs_config; | ||
123 | |||
124 | #endif | ||
125 | |||
126 | /* functions for op_amd_spec */ | ||
127 | |||
128 | static void op_amd_fill_in_addresses(struct op_msrs * const msrs) | ||
129 | { | ||
130 | int i; | ||
131 | |||
132 | for (i = 0; i < NUM_COUNTERS; i++) { | ||
133 | if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) | ||
134 | msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; | ||
135 | else | ||
136 | msrs->counters[i].addr = 0; | ||
137 | } | ||
138 | |||
139 | for (i = 0; i < NUM_CONTROLS; i++) { | ||
140 | if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) | ||
141 | msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; | ||
142 | else | ||
143 | msrs->controls[i].addr = 0; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | |||
148 | static void op_amd_setup_ctrs(struct op_msrs const * const msrs) | ||
149 | { | ||
150 | unsigned int low, high; | ||
151 | int i; | ||
152 | |||
153 | /* clear all counters */ | ||
154 | for (i = 0 ; i < NUM_CONTROLS; ++i) { | ||
155 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) | ||
156 | continue; | ||
157 | CTRL_READ(low, high, msrs, i); | ||
158 | CTRL_CLEAR_LO(low); | ||
159 | CTRL_CLEAR_HI(high); | ||
160 | CTRL_WRITE(low, high, msrs, i); | ||
161 | } | ||
162 | |||
163 | /* avoid a false detection of ctr overflows in NMI handler */ | ||
164 | for (i = 0; i < NUM_COUNTERS; ++i) { | ||
165 | if (unlikely(!CTR_IS_RESERVED(msrs, i))) | ||
166 | continue; | ||
167 | CTR_WRITE(1, msrs, i); | ||
168 | } | ||
169 | |||
170 | /* enable active counters */ | ||
171 | for (i = 0; i < NUM_COUNTERS; ++i) { | ||
172 | if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { | ||
173 | reset_value[i] = counter_config[i].count; | ||
174 | |||
175 | CTR_WRITE(counter_config[i].count, msrs, i); | ||
176 | |||
177 | CTRL_READ(low, high, msrs, i); | ||
178 | CTRL_CLEAR_LO(low); | ||
179 | CTRL_CLEAR_HI(high); | ||
180 | CTRL_SET_ENABLE(low); | ||
181 | CTRL_SET_USR(low, counter_config[i].user); | ||
182 | CTRL_SET_KERN(low, counter_config[i].kernel); | ||
183 | CTRL_SET_UM(low, counter_config[i].unit_mask); | ||
184 | CTRL_SET_EVENT_LOW(low, counter_config[i].event); | ||
185 | CTRL_SET_EVENT_HIGH(high, counter_config[i].event); | ||
186 | CTRL_SET_HOST_ONLY(high, 0); | ||
187 | CTRL_SET_GUEST_ONLY(high, 0); | ||
188 | |||
189 | CTRL_WRITE(low, high, msrs, i); | ||
190 | } else { | ||
191 | reset_value[i] = 0; | ||
192 | } | ||
193 | } | ||
194 | } | ||
195 | |||
196 | #ifdef CONFIG_OPROFILE_IBS | ||
197 | |||
198 | static inline int | ||
199 | op_amd_handle_ibs(struct pt_regs * const regs, | ||
200 | struct op_msrs const * const msrs) | ||
201 | { | ||
202 | unsigned int low, high; | ||
203 | struct ibs_fetch_sample ibs_fetch; | ||
204 | struct ibs_op_sample ibs_op; | ||
205 | |||
206 | if (!ibs_allowed) | ||
207 | return 1; | ||
208 | |||
209 | if (ibs_config.fetch_enabled) { | ||
210 | rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
211 | if (high & IBS_FETCH_HIGH_VALID_BIT) { | ||
212 | ibs_fetch.ibs_fetch_ctl_high = high; | ||
213 | ibs_fetch.ibs_fetch_ctl_low = low; | ||
214 | rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); | ||
215 | ibs_fetch.ibs_fetch_lin_addr_high = high; | ||
216 | ibs_fetch.ibs_fetch_lin_addr_low = low; | ||
217 | rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); | ||
218 | ibs_fetch.ibs_fetch_phys_addr_high = high; | ||
219 | ibs_fetch.ibs_fetch_phys_addr_low = low; | ||
220 | |||
221 | oprofile_add_ibs_sample(regs, | ||
222 | (unsigned int *)&ibs_fetch, | ||
223 | IBS_FETCH_BEGIN); | ||
224 | |||
225 | /*reenable the IRQ */ | ||
226 | rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
227 | high &= ~IBS_FETCH_HIGH_VALID_BIT; | ||
228 | high |= IBS_FETCH_HIGH_ENABLE; | ||
229 | low &= IBS_FETCH_LOW_MAX_CNT_MASK; | ||
230 | wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | if (ibs_config.op_enabled) { | ||
235 | rdmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
236 | if (low & IBS_OP_LOW_VALID_BIT) { | ||
237 | rdmsr(MSR_AMD64_IBSOPRIP, low, high); | ||
238 | ibs_op.ibs_op_rip_low = low; | ||
239 | ibs_op.ibs_op_rip_high = high; | ||
240 | rdmsr(MSR_AMD64_IBSOPDATA, low, high); | ||
241 | ibs_op.ibs_op_data1_low = low; | ||
242 | ibs_op.ibs_op_data1_high = high; | ||
243 | rdmsr(MSR_AMD64_IBSOPDATA2, low, high); | ||
244 | ibs_op.ibs_op_data2_low = low; | ||
245 | ibs_op.ibs_op_data2_high = high; | ||
246 | rdmsr(MSR_AMD64_IBSOPDATA3, low, high); | ||
247 | ibs_op.ibs_op_data3_low = low; | ||
248 | ibs_op.ibs_op_data3_high = high; | ||
249 | rdmsr(MSR_AMD64_IBSDCLINAD, low, high); | ||
250 | ibs_op.ibs_dc_linear_low = low; | ||
251 | ibs_op.ibs_dc_linear_high = high; | ||
252 | rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); | ||
253 | ibs_op.ibs_dc_phys_low = low; | ||
254 | ibs_op.ibs_dc_phys_high = high; | ||
255 | |||
256 | /* reenable the IRQ */ | ||
257 | oprofile_add_ibs_sample(regs, | ||
258 | (unsigned int *)&ibs_op, | ||
259 | IBS_OP_BEGIN); | ||
260 | rdmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
261 | high = 0; | ||
262 | low &= ~IBS_OP_LOW_VALID_BIT; | ||
263 | low |= IBS_OP_LOW_ENABLE; | ||
264 | wrmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
265 | } | ||
266 | } | ||
267 | |||
268 | return 1; | ||
269 | } | ||
270 | |||
271 | #endif | ||
272 | |||
273 | static int op_amd_check_ctrs(struct pt_regs * const regs, | ||
274 | struct op_msrs const * const msrs) | ||
275 | { | ||
276 | unsigned int low, high; | ||
277 | int i; | ||
278 | |||
279 | for (i = 0 ; i < NUM_COUNTERS; ++i) { | ||
280 | if (!reset_value[i]) | ||
281 | continue; | ||
282 | CTR_READ(low, high, msrs, i); | ||
283 | if (CTR_OVERFLOWED(low)) { | ||
284 | oprofile_add_sample(regs, i); | ||
285 | CTR_WRITE(reset_value[i], msrs, i); | ||
286 | } | ||
287 | } | ||
288 | |||
289 | #ifdef CONFIG_OPROFILE_IBS | ||
290 | op_amd_handle_ibs(regs, msrs); | ||
291 | #endif | ||
292 | |||
293 | /* See op_model_ppro.c */ | ||
294 | return 1; | ||
295 | } | ||
296 | |||
297 | static void op_amd_start(struct op_msrs const * const msrs) | ||
298 | { | ||
299 | unsigned int low, high; | ||
300 | int i; | ||
301 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
302 | if (reset_value[i]) { | ||
303 | CTRL_READ(low, high, msrs, i); | ||
304 | CTRL_SET_ACTIVE(low); | ||
305 | CTRL_WRITE(low, high, msrs, i); | ||
306 | } | ||
307 | } | ||
308 | |||
309 | #ifdef CONFIG_OPROFILE_IBS | ||
310 | if (ibs_allowed && ibs_config.fetch_enabled) { | ||
311 | low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; | ||
312 | high = IBS_FETCH_HIGH_ENABLE; | ||
313 | wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
314 | } | ||
315 | |||
316 | if (ibs_allowed && ibs_config.op_enabled) { | ||
317 | low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; | ||
318 | high = 0; | ||
319 | wrmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
320 | } | ||
321 | #endif | ||
322 | } | ||
323 | |||
324 | |||
325 | static void op_amd_stop(struct op_msrs const * const msrs) | ||
326 | { | ||
327 | unsigned int low, high; | ||
328 | int i; | ||
329 | |||
330 | /* Subtle: stop on all counters to avoid race with | ||
331 | * setting our pm callback */ | ||
332 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
333 | if (!reset_value[i]) | ||
334 | continue; | ||
335 | CTRL_READ(low, high, msrs, i); | ||
336 | CTRL_SET_INACTIVE(low); | ||
337 | CTRL_WRITE(low, high, msrs, i); | ||
338 | } | ||
339 | |||
340 | #ifdef CONFIG_OPROFILE_IBS | ||
341 | if (ibs_allowed && ibs_config.fetch_enabled) { | ||
342 | low = 0; /* clear max count and enable */ | ||
343 | high = 0; | ||
344 | wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); | ||
345 | } | ||
346 | |||
347 | if (ibs_allowed && ibs_config.op_enabled) { | ||
348 | low = 0; /* clear max count and enable */ | ||
349 | high = 0; | ||
350 | wrmsr(MSR_AMD64_IBSOPCTL, low, high); | ||
351 | } | ||
352 | #endif | ||
353 | } | ||
354 | |||
355 | static void op_amd_shutdown(struct op_msrs const * const msrs) | ||
356 | { | ||
357 | int i; | ||
358 | |||
359 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
360 | if (CTR_IS_RESERVED(msrs, i)) | ||
361 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
362 | } | ||
363 | for (i = 0 ; i < NUM_CONTROLS ; ++i) { | ||
364 | if (CTRL_IS_RESERVED(msrs, i)) | ||
365 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
366 | } | ||
367 | } | ||
368 | |||
369 | #ifndef CONFIG_OPROFILE_IBS | ||
370 | |||
371 | /* no IBS support */ | ||
372 | |||
/*
 * Model init hook for builds without CONFIG_OPROFILE_IBS: there is
 * nothing to set up, so @ops is left untouched and 0 is returned.
 */
static int op_amd_init(struct oprofile_operations *ops)
{
	return 0;
}
377 | |||
/* Model exit hook for builds without CONFIG_OPROFILE_IBS: nothing to undo. */
static void op_amd_exit(void)
{
}
379 | |||
380 | #else | ||
381 | |||
382 | static u8 ibs_eilvt_off; | ||
383 | |||
384 | static inline void apic_init_ibs_nmi_per_cpu(void *arg) | ||
385 | { | ||
386 | ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); | ||
387 | } | ||
388 | |||
389 | static inline void apic_clear_ibs_nmi_per_cpu(void *arg) | ||
390 | { | ||
391 | setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); | ||
392 | } | ||
393 | |||
394 | static int pfm_amd64_setup_eilvt(void) | ||
395 | { | ||
396 | #define IBSCTL_LVTOFFSETVAL (1 << 8) | ||
397 | #define IBSCTL 0x1cc | ||
398 | struct pci_dev *cpu_cfg; | ||
399 | int nodes; | ||
400 | u32 value = 0; | ||
401 | |||
402 | /* per CPU setup */ | ||
403 | on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); | ||
404 | |||
405 | nodes = 0; | ||
406 | cpu_cfg = NULL; | ||
407 | do { | ||
408 | cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, | ||
409 | PCI_DEVICE_ID_AMD_10H_NB_MISC, | ||
410 | cpu_cfg); | ||
411 | if (!cpu_cfg) | ||
412 | break; | ||
413 | ++nodes; | ||
414 | pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off | ||
415 | | IBSCTL_LVTOFFSETVAL); | ||
416 | pci_read_config_dword(cpu_cfg, IBSCTL, &value); | ||
417 | if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { | ||
418 | printk(KERN_DEBUG "Failed to setup IBS LVT offset, " | ||
419 | "IBSCTL = 0x%08x", value); | ||
420 | return 1; | ||
421 | } | ||
422 | } while (1); | ||
423 | |||
424 | if (!nodes) { | ||
425 | printk(KERN_DEBUG "No CPU node configured for IBS"); | ||
426 | return 1; | ||
427 | } | ||
428 | |||
429 | #ifdef CONFIG_NUMA | ||
430 | /* Sanity check */ | ||
431 | /* Works only for 64bit with proper numa implementation. */ | ||
432 | if (nodes != num_possible_nodes()) { | ||
433 | printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " | ||
434 | "found: %d, expected %d", | ||
435 | nodes, num_possible_nodes()); | ||
436 | return 1; | ||
437 | } | ||
438 | #endif | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * initialize the APIC for the IBS interrupts | ||
444 | * if available (AMD Family10h rev B0 and later) | ||
445 | */ | ||
446 | static void setup_ibs(void) | ||
447 | { | ||
448 | ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); | ||
449 | |||
450 | if (!ibs_allowed) | ||
451 | return; | ||
452 | |||
453 | if (pfm_amd64_setup_eilvt()) { | ||
454 | ibs_allowed = 0; | ||
455 | return; | ||
456 | } | ||
457 | |||
458 | printk(KERN_INFO "oprofile: AMD IBS detected\n"); | ||
459 | } | ||
460 | |||
461 | |||
462 | /* | ||
463 | * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h | ||
464 | * rev B0 and later */ | ||
465 | static void clear_ibs_nmi(void) | ||
466 | { | ||
467 | if (ibs_allowed) | ||
468 | on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); | ||
469 | } | ||
470 | |||
471 | static int (*create_arch_files)(struct super_block * sb, struct dentry * root); | ||
472 | |||
473 | static int setup_ibs_files(struct super_block * sb, struct dentry * root) | ||
474 | { | ||
475 | char buf[12]; | ||
476 | struct dentry *dir; | ||
477 | int ret = 0; | ||
478 | |||
479 | /* architecture specific files */ | ||
480 | if (create_arch_files) | ||
481 | ret = create_arch_files(sb, root); | ||
482 | |||
483 | if (ret) | ||
484 | return ret; | ||
485 | |||
486 | if (!ibs_allowed) | ||
487 | return ret; | ||
488 | |||
489 | /* model specific files */ | ||
490 | |||
491 | /* setup some reasonable defaults */ | ||
492 | ibs_config.max_cnt_fetch = 250000; | ||
493 | ibs_config.fetch_enabled = 0; | ||
494 | ibs_config.max_cnt_op = 250000; | ||
495 | ibs_config.op_enabled = 0; | ||
496 | ibs_config.dispatched_ops = 1; | ||
497 | snprintf(buf, sizeof(buf), "ibs_fetch"); | ||
498 | dir = oprofilefs_mkdir(sb, root, buf); | ||
499 | oprofilefs_create_ulong(sb, dir, "rand_enable", | ||
500 | &ibs_config.rand_en); | ||
501 | oprofilefs_create_ulong(sb, dir, "enable", | ||
502 | &ibs_config.fetch_enabled); | ||
503 | oprofilefs_create_ulong(sb, dir, "max_count", | ||
504 | &ibs_config.max_cnt_fetch); | ||
505 | snprintf(buf, sizeof(buf), "ibs_uops"); | ||
506 | dir = oprofilefs_mkdir(sb, root, buf); | ||
507 | oprofilefs_create_ulong(sb, dir, "enable", | ||
508 | &ibs_config.op_enabled); | ||
509 | oprofilefs_create_ulong(sb, dir, "max_count", | ||
510 | &ibs_config.max_cnt_op); | ||
511 | oprofilefs_create_ulong(sb, dir, "dispatched_ops", | ||
512 | &ibs_config.dispatched_ops); | ||
513 | |||
514 | return 0; | ||
515 | } | ||
516 | |||
517 | static int op_amd_init(struct oprofile_operations *ops) | ||
518 | { | ||
519 | setup_ibs(); | ||
520 | create_arch_files = ops->create_files; | ||
521 | ops->create_files = setup_ibs_files; | ||
522 | return 0; | ||
523 | } | ||
524 | |||
/* Model exit hook (CONFIG_OPROFILE_IBS): undo the IBS APIC setup. */
static void op_amd_exit(void)
{
	clear_ibs_nmi();
}
529 | |||
530 | #endif | ||
531 | |||
532 | struct op_x86_model_spec const op_amd_spec = { | ||
533 | .init = op_amd_init, | ||
534 | .exit = op_amd_exit, | ||
535 | .num_counters = NUM_COUNTERS, | ||
536 | .num_controls = NUM_CONTROLS, | ||
537 | .fill_in_addresses = &op_amd_fill_in_addresses, | ||
538 | .setup_ctrs = &op_amd_setup_ctrs, | ||
539 | .check_ctrs = &op_amd_check_ctrs, | ||
540 | .start = &op_amd_start, | ||
541 | .stop = &op_amd_stop, | ||
542 | .shutdown = &op_amd_shutdown | ||
543 | }; | ||
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c deleted file mode 100644 index 3d534879a9dc..000000000000 --- a/arch/x86/oprofile/op_model_athlon.c +++ /dev/null | |||
@@ -1,190 +0,0 @@ | |||
1 | /* | ||
2 | * @file op_model_athlon.h | ||
3 | * athlon / K7 / K8 / Family 10h model-specific MSR operations | ||
4 | * | ||
5 | * @remark Copyright 2002 OProfile authors | ||
6 | * @remark Read the file COPYING | ||
7 | * | ||
8 | * @author John Levon | ||
9 | * @author Philippe Elie | ||
10 | * @author Graydon Hoare | ||
11 | */ | ||
12 | |||
13 | #include <linux/oprofile.h> | ||
14 | #include <asm/ptrace.h> | ||
15 | #include <asm/msr.h> | ||
16 | #include <asm/nmi.h> | ||
17 | |||
18 | #include "op_x86_model.h" | ||
19 | #include "op_counter.h" | ||
20 | |||
21 | #define NUM_COUNTERS 4 | ||
22 | #define NUM_CONTROLS 4 | ||
23 | |||
24 | #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) | ||
25 | #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) | ||
26 | #define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) | ||
27 | #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) | ||
28 | |||
29 | #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) | ||
30 | #define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) | ||
31 | #define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) | ||
32 | #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) | ||
33 | #define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) | ||
34 | #define CTRL_CLEAR_LO(x) (x &= (1<<21)) | ||
35 | #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) | ||
36 | #define CTRL_SET_ENABLE(val) (val |= 1<<20) | ||
37 | #define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) | ||
38 | #define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) | ||
39 | #define CTRL_SET_UM(val, m) (val |= (m << 8)) | ||
40 | #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) | ||
41 | #define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) | ||
42 | #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) | ||
43 | #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) | ||
44 | |||
45 | static unsigned long reset_value[NUM_COUNTERS]; | ||
46 | |||
47 | static void athlon_fill_in_addresses(struct op_msrs * const msrs) | ||
48 | { | ||
49 | int i; | ||
50 | |||
51 | for (i = 0; i < NUM_COUNTERS; i++) { | ||
52 | if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) | ||
53 | msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; | ||
54 | else | ||
55 | msrs->counters[i].addr = 0; | ||
56 | } | ||
57 | |||
58 | for (i = 0; i < NUM_CONTROLS; i++) { | ||
59 | if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) | ||
60 | msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; | ||
61 | else | ||
62 | msrs->controls[i].addr = 0; | ||
63 | } | ||
64 | } | ||
65 | |||
66 | |||
67 | static void athlon_setup_ctrs(struct op_msrs const * const msrs) | ||
68 | { | ||
69 | unsigned int low, high; | ||
70 | int i; | ||
71 | |||
72 | /* clear all counters */ | ||
73 | for (i = 0 ; i < NUM_CONTROLS; ++i) { | ||
74 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) | ||
75 | continue; | ||
76 | CTRL_READ(low, high, msrs, i); | ||
77 | CTRL_CLEAR_LO(low); | ||
78 | CTRL_CLEAR_HI(high); | ||
79 | CTRL_WRITE(low, high, msrs, i); | ||
80 | } | ||
81 | |||
82 | /* avoid a false detection of ctr overflows in NMI handler */ | ||
83 | for (i = 0; i < NUM_COUNTERS; ++i) { | ||
84 | if (unlikely(!CTR_IS_RESERVED(msrs, i))) | ||
85 | continue; | ||
86 | CTR_WRITE(1, msrs, i); | ||
87 | } | ||
88 | |||
89 | /* enable active counters */ | ||
90 | for (i = 0; i < NUM_COUNTERS; ++i) { | ||
91 | if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { | ||
92 | reset_value[i] = counter_config[i].count; | ||
93 | |||
94 | CTR_WRITE(counter_config[i].count, msrs, i); | ||
95 | |||
96 | CTRL_READ(low, high, msrs, i); | ||
97 | CTRL_CLEAR_LO(low); | ||
98 | CTRL_CLEAR_HI(high); | ||
99 | CTRL_SET_ENABLE(low); | ||
100 | CTRL_SET_USR(low, counter_config[i].user); | ||
101 | CTRL_SET_KERN(low, counter_config[i].kernel); | ||
102 | CTRL_SET_UM(low, counter_config[i].unit_mask); | ||
103 | CTRL_SET_EVENT_LOW(low, counter_config[i].event); | ||
104 | CTRL_SET_EVENT_HIGH(high, counter_config[i].event); | ||
105 | CTRL_SET_HOST_ONLY(high, 0); | ||
106 | CTRL_SET_GUEST_ONLY(high, 0); | ||
107 | |||
108 | CTRL_WRITE(low, high, msrs, i); | ||
109 | } else { | ||
110 | reset_value[i] = 0; | ||
111 | } | ||
112 | } | ||
113 | } | ||
114 | |||
115 | |||
116 | static int athlon_check_ctrs(struct pt_regs * const regs, | ||
117 | struct op_msrs const * const msrs) | ||
118 | { | ||
119 | unsigned int low, high; | ||
120 | int i; | ||
121 | |||
122 | for (i = 0 ; i < NUM_COUNTERS; ++i) { | ||
123 | if (!reset_value[i]) | ||
124 | continue; | ||
125 | CTR_READ(low, high, msrs, i); | ||
126 | if (CTR_OVERFLOWED(low)) { | ||
127 | oprofile_add_sample(regs, i); | ||
128 | CTR_WRITE(reset_value[i], msrs, i); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | /* See op_model_ppro.c */ | ||
133 | return 1; | ||
134 | } | ||
135 | |||
136 | |||
137 | static void athlon_start(struct op_msrs const * const msrs) | ||
138 | { | ||
139 | unsigned int low, high; | ||
140 | int i; | ||
141 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
142 | if (reset_value[i]) { | ||
143 | CTRL_READ(low, high, msrs, i); | ||
144 | CTRL_SET_ACTIVE(low); | ||
145 | CTRL_WRITE(low, high, msrs, i); | ||
146 | } | ||
147 | } | ||
148 | } | ||
149 | |||
150 | |||
151 | static void athlon_stop(struct op_msrs const * const msrs) | ||
152 | { | ||
153 | unsigned int low, high; | ||
154 | int i; | ||
155 | |||
156 | /* Subtle: stop on all counters to avoid race with | ||
157 | * setting our pm callback */ | ||
158 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
159 | if (!reset_value[i]) | ||
160 | continue; | ||
161 | CTRL_READ(low, high, msrs, i); | ||
162 | CTRL_SET_INACTIVE(low); | ||
163 | CTRL_WRITE(low, high, msrs, i); | ||
164 | } | ||
165 | } | ||
166 | |||
167 | static void athlon_shutdown(struct op_msrs const * const msrs) | ||
168 | { | ||
169 | int i; | ||
170 | |||
171 | for (i = 0 ; i < NUM_COUNTERS ; ++i) { | ||
172 | if (CTR_IS_RESERVED(msrs, i)) | ||
173 | release_perfctr_nmi(MSR_K7_PERFCTR0 + i); | ||
174 | } | ||
175 | for (i = 0 ; i < NUM_CONTROLS ; ++i) { | ||
176 | if (CTRL_IS_RESERVED(msrs, i)) | ||
177 | release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); | ||
178 | } | ||
179 | } | ||
180 | |||
181 | struct op_x86_model_spec const op_athlon_spec = { | ||
182 | .num_counters = NUM_COUNTERS, | ||
183 | .num_controls = NUM_CONTROLS, | ||
184 | .fill_in_addresses = &athlon_fill_in_addresses, | ||
185 | .setup_ctrs = &athlon_setup_ctrs, | ||
186 | .check_ctrs = &athlon_check_ctrs, | ||
187 | .start = &athlon_start, | ||
188 | .stop = &athlon_stop, | ||
189 | .shutdown = &athlon_shutdown | ||
190 | }; | ||
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 56b4757a1f47..43ac5af338d8 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c | |||
@@ -10,11 +10,12 @@ | |||
10 | 10 | ||
11 | #include <linux/oprofile.h> | 11 | #include <linux/oprofile.h> |
12 | #include <linux/smp.h> | 12 | #include <linux/smp.h> |
13 | #include <linux/ptrace.h> | ||
14 | #include <linux/nmi.h> | ||
13 | #include <asm/msr.h> | 15 | #include <asm/msr.h> |
14 | #include <asm/ptrace.h> | ||
15 | #include <asm/fixmap.h> | 16 | #include <asm/fixmap.h> |
16 | #include <asm/apic.h> | 17 | #include <asm/apic.h> |
17 | #include <asm/nmi.h> | 18 | |
18 | 19 | ||
19 | #include "op_x86_model.h" | 20 | #include "op_x86_model.h" |
20 | #include "op_counter.h" | 21 | #include "op_counter.h" |
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT; | |||
40 | static inline void setup_num_counters(void) | 41 | static inline void setup_num_counters(void) |
41 | { | 42 | { |
42 | #ifdef CONFIG_SMP | 43 | #ifdef CONFIG_SMP |
43 | if (smp_num_siblings == 2){ | 44 | if (smp_num_siblings == 2) { |
44 | num_counters = NUM_COUNTERS_HT2; | 45 | num_counters = NUM_COUNTERS_HT2; |
45 | num_controls = NUM_CONTROLS_HT2; | 46 | num_controls = NUM_CONTROLS_HT2; |
46 | } | 47 | } |
@@ -86,7 +87,7 @@ struct p4_event_binding { | |||
86 | #define CTR_FLAME_2 (1 << 6) | 87 | #define CTR_FLAME_2 (1 << 6) |
87 | #define CTR_IQ_5 (1 << 7) | 88 | #define CTR_IQ_5 (1 << 7) |
88 | 89 | ||
89 | static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { | 90 | static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { |
90 | { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, | 91 | { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, |
91 | { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, | 92 | { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, |
92 | { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, | 93 | { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, |
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { | |||
97 | { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } | 98 | { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } |
98 | }; | 99 | }; |
99 | 100 | ||
100 | #define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT | 101 | #define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) |
101 | 102 | ||
102 | /* p4 event codes in libop/op_event.h are indices into this table. */ | 103 | /* p4 event codes in libop/op_event.h are indices into this table. */ |
103 | 104 | ||
104 | static struct p4_event_binding p4_events[NUM_EVENTS] = { | 105 | static struct p4_event_binding p4_events[NUM_EVENTS] = { |
105 | 106 | ||
106 | { /* BRANCH_RETIRED */ | 107 | { /* BRANCH_RETIRED */ |
107 | 0x05, 0x06, | 108 | 0x05, 0x06, |
108 | { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, | 109 | { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, |
109 | {CTR_IQ_5, MSR_P4_CRU_ESCR3} } | 110 | {CTR_IQ_5, MSR_P4_CRU_ESCR3} } |
110 | }, | 111 | }, |
111 | 112 | ||
112 | { /* MISPRED_BRANCH_RETIRED */ | 113 | { /* MISPRED_BRANCH_RETIRED */ |
113 | 0x04, 0x03, | 114 | 0x04, 0x03, |
114 | { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, | 115 | { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, |
115 | { CTR_IQ_5, MSR_P4_CRU_ESCR1} } | 116 | { CTR_IQ_5, MSR_P4_CRU_ESCR1} } |
116 | }, | 117 | }, |
117 | 118 | ||
118 | { /* TC_DELIVER_MODE */ | 119 | { /* TC_DELIVER_MODE */ |
119 | 0x01, 0x01, | 120 | 0x01, 0x01, |
120 | { { CTR_MS_0, MSR_P4_TC_ESCR0}, | 121 | { { CTR_MS_0, MSR_P4_TC_ESCR0}, |
121 | { CTR_MS_2, MSR_P4_TC_ESCR1} } | 122 | { CTR_MS_2, MSR_P4_TC_ESCR1} } |
122 | }, | 123 | }, |
123 | 124 | ||
124 | { /* BPU_FETCH_REQUEST */ | 125 | { /* BPU_FETCH_REQUEST */ |
125 | 0x00, 0x03, | 126 | 0x00, 0x03, |
126 | { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, | 127 | { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, |
127 | { CTR_BPU_2, MSR_P4_BPU_ESCR1} } | 128 | { CTR_BPU_2, MSR_P4_BPU_ESCR1} } |
128 | }, | 129 | }, |
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
146 | }, | 147 | }, |
147 | 148 | ||
148 | { /* LOAD_PORT_REPLAY */ | 149 | { /* LOAD_PORT_REPLAY */ |
149 | 0x02, 0x04, | 150 | 0x02, 0x04, |
150 | { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, | 151 | { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, |
151 | { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } | 152 | { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } |
152 | }, | 153 | }, |
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
170 | }, | 171 | }, |
171 | 172 | ||
172 | { /* BSQ_CACHE_REFERENCE */ | 173 | { /* BSQ_CACHE_REFERENCE */ |
173 | 0x07, 0x0c, | 174 | 0x07, 0x0c, |
174 | { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, | 175 | { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, |
175 | { CTR_BPU_2, MSR_P4_BSU_ESCR1} } | 176 | { CTR_BPU_2, MSR_P4_BSU_ESCR1} } |
176 | }, | 177 | }, |
177 | 178 | ||
178 | { /* IOQ_ALLOCATION */ | 179 | { /* IOQ_ALLOCATION */ |
179 | 0x06, 0x03, | 180 | 0x06, 0x03, |
180 | { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, | 181 | { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, |
181 | { 0, 0 } } | 182 | { 0, 0 } } |
182 | }, | 183 | }, |
183 | 184 | ||
184 | { /* IOQ_ACTIVE_ENTRIES */ | 185 | { /* IOQ_ACTIVE_ENTRIES */ |
185 | 0x06, 0x1a, | 186 | 0x06, 0x1a, |
186 | { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, | 187 | { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, |
187 | { 0, 0 } } | 188 | { 0, 0 } } |
188 | }, | 189 | }, |
189 | 190 | ||
190 | { /* FSB_DATA_ACTIVITY */ | 191 | { /* FSB_DATA_ACTIVITY */ |
191 | 0x06, 0x17, | 192 | 0x06, 0x17, |
192 | { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, | 193 | { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, |
193 | { CTR_BPU_2, MSR_P4_FSB_ESCR1} } | 194 | { CTR_BPU_2, MSR_P4_FSB_ESCR1} } |
194 | }, | 195 | }, |
195 | 196 | ||
196 | { /* BSQ_ALLOCATION */ | 197 | { /* BSQ_ALLOCATION */ |
197 | 0x07, 0x05, | 198 | 0x07, 0x05, |
198 | { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, | 199 | { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, |
199 | { 0, 0 } } | 200 | { 0, 0 } } |
200 | }, | 201 | }, |
201 | 202 | ||
202 | { /* BSQ_ACTIVE_ENTRIES */ | 203 | { /* BSQ_ACTIVE_ENTRIES */ |
203 | 0x07, 0x06, | 204 | 0x07, 0x06, |
204 | { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, | 205 | { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, |
205 | { 0, 0 } } | 206 | { 0, 0 } } |
206 | }, | 207 | }, |
207 | 208 | ||
208 | { /* X87_ASSIST */ | 209 | { /* X87_ASSIST */ |
209 | 0x05, 0x03, | 210 | 0x05, 0x03, |
210 | { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, | 211 | { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, |
211 | { CTR_IQ_5, MSR_P4_CRU_ESCR3} } | 212 | { CTR_IQ_5, MSR_P4_CRU_ESCR3} } |
212 | }, | 213 | }, |
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
216 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 217 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
217 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 218 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
218 | }, | 219 | }, |
219 | 220 | ||
220 | { /* PACKED_SP_UOP */ | 221 | { /* PACKED_SP_UOP */ |
221 | 0x01, 0x08, | 222 | 0x01, 0x08, |
222 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 223 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
223 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 224 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
224 | }, | 225 | }, |
225 | 226 | ||
226 | { /* PACKED_DP_UOP */ | 227 | { /* PACKED_DP_UOP */ |
227 | 0x01, 0x0c, | 228 | 0x01, 0x0c, |
228 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 229 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
229 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 230 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
230 | }, | 231 | }, |
231 | 232 | ||
232 | { /* SCALAR_SP_UOP */ | 233 | { /* SCALAR_SP_UOP */ |
233 | 0x01, 0x0a, | 234 | 0x01, 0x0a, |
234 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 235 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
235 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 236 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
236 | }, | 237 | }, |
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
242 | }, | 243 | }, |
243 | 244 | ||
244 | { /* 64BIT_MMX_UOP */ | 245 | { /* 64BIT_MMX_UOP */ |
245 | 0x01, 0x02, | 246 | 0x01, 0x02, |
246 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 247 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
247 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 248 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
248 | }, | 249 | }, |
249 | 250 | ||
250 | { /* 128BIT_MMX_UOP */ | 251 | { /* 128BIT_MMX_UOP */ |
251 | 0x01, 0x1a, | 252 | 0x01, 0x1a, |
252 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 253 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
253 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 254 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
254 | }, | 255 | }, |
255 | 256 | ||
256 | { /* X87_FP_UOP */ | 257 | { /* X87_FP_UOP */ |
257 | 0x01, 0x04, | 258 | 0x01, 0x04, |
258 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 259 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
259 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 260 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
260 | }, | 261 | }, |
261 | 262 | ||
262 | { /* X87_SIMD_MOVES_UOP */ | 263 | { /* X87_SIMD_MOVES_UOP */ |
263 | 0x01, 0x2e, | 264 | 0x01, 0x2e, |
264 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, | 265 | { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, |
265 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } | 266 | { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } |
266 | }, | 267 | }, |
267 | 268 | ||
268 | { /* MACHINE_CLEAR */ | 269 | { /* MACHINE_CLEAR */ |
269 | 0x05, 0x02, | 270 | 0x05, 0x02, |
270 | { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, | 271 | { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, |
271 | { CTR_IQ_5, MSR_P4_CRU_ESCR3} } | 272 | { CTR_IQ_5, MSR_P4_CRU_ESCR3} } |
272 | }, | 273 | }, |
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
276 | { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, | 277 | { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, |
277 | { CTR_BPU_2, MSR_P4_FSB_ESCR1} } | 278 | { CTR_BPU_2, MSR_P4_FSB_ESCR1} } |
278 | }, | 279 | }, |
279 | 280 | ||
280 | { /* TC_MS_XFER */ | 281 | { /* TC_MS_XFER */ |
281 | 0x00, 0x05, | 282 | 0x00, 0x05, |
282 | { { CTR_MS_0, MSR_P4_MS_ESCR0}, | 283 | { { CTR_MS_0, MSR_P4_MS_ESCR0}, |
283 | { CTR_MS_2, MSR_P4_MS_ESCR1} } | 284 | { CTR_MS_2, MSR_P4_MS_ESCR1} } |
284 | }, | 285 | }, |
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
308 | }, | 309 | }, |
309 | 310 | ||
310 | { /* INSTR_RETIRED */ | 311 | { /* INSTR_RETIRED */ |
311 | 0x04, 0x02, | 312 | 0x04, 0x02, |
312 | { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, | 313 | { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, |
313 | { CTR_IQ_5, MSR_P4_CRU_ESCR1} } | 314 | { CTR_IQ_5, MSR_P4_CRU_ESCR1} } |
314 | }, | 315 | }, |
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
319 | { CTR_IQ_5, MSR_P4_CRU_ESCR1} } | 320 | { CTR_IQ_5, MSR_P4_CRU_ESCR1} } |
320 | }, | 321 | }, |
321 | 322 | ||
322 | { /* UOP_TYPE */ | 323 | { /* UOP_TYPE */ |
323 | 0x02, 0x02, | 324 | 0x02, 0x02, |
324 | { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, | 325 | { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, |
325 | { CTR_IQ_5, MSR_P4_RAT_ESCR1} } | 326 | { CTR_IQ_5, MSR_P4_RAT_ESCR1} } |
326 | }, | 327 | }, |
327 | 328 | ||
328 | { /* RETIRED_MISPRED_BRANCH_TYPE */ | 329 | { /* RETIRED_MISPRED_BRANCH_TYPE */ |
329 | 0x02, 0x05, | 330 | 0x02, 0x05, |
330 | { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, | 331 | { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, |
331 | { CTR_MS_2, MSR_P4_TBPU_ESCR1} } | 332 | { CTR_MS_2, MSR_P4_TBPU_ESCR1} } |
332 | }, | 333 | }, |
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
349 | #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) | 350 | #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) |
350 | #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) | 351 | #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) |
351 | #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) | 352 | #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) |
352 | #define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) | 353 | #define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) |
353 | #define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) | 354 | #define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) |
354 | 355 | ||
355 | #define CCCR_RESERVED_BITS 0x38030FFF | 356 | #define CCCR_RESERVED_BITS 0x38030FFF |
356 | #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) | 357 | #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) |
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { | |||
360 | #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) | 361 | #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) |
361 | #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) | 362 | #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) |
362 | #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) | 363 | #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) |
363 | #define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) | 364 | #define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) |
364 | #define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) | 365 | #define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) |
365 | #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) | 366 | #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) |
366 | #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) | 367 | #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) |
367 | 368 | ||
368 | #define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) | 369 | #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) |
369 | #define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) | 370 | #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) |
370 | #define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) | 371 | #define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) |
371 | #define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) | 372 | #define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) |
372 | #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) | 373 | #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) |
373 | 374 | ||
374 | 375 | ||
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void) | |||
380 | #ifdef CONFIG_SMP | 381 | #ifdef CONFIG_SMP |
381 | int cpu = smp_processor_id(); | 382 | int cpu = smp_processor_id(); |
382 | return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); | 383 | return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); |
383 | #endif | 384 | #endif |
384 | return 0; | 385 | return 0; |
385 | } | 386 | } |
386 | 387 | ||
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT]; | |||
395 | 396 | ||
396 | static void p4_fill_in_addresses(struct op_msrs * const msrs) | 397 | static void p4_fill_in_addresses(struct op_msrs * const msrs) |
397 | { | 398 | { |
398 | unsigned int i; | 399 | unsigned int i; |
399 | unsigned int addr, cccraddr, stag; | 400 | unsigned int addr, cccraddr, stag; |
400 | 401 | ||
401 | setup_num_counters(); | 402 | setup_num_counters(); |
402 | stag = get_stagger(); | 403 | stag = get_stagger(); |
403 | 404 | ||
404 | /* initialize some registers */ | 405 | /* initialize some registers */ |
405 | for (i = 0; i < num_counters; ++i) { | 406 | for (i = 0; i < num_counters; ++i) |
406 | msrs->counters[i].addr = 0; | 407 | msrs->counters[i].addr = 0; |
407 | } | 408 | for (i = 0; i < num_controls; ++i) |
408 | for (i = 0; i < num_controls; ++i) { | ||
409 | msrs->controls[i].addr = 0; | 409 | msrs->controls[i].addr = 0; |
410 | } | 410 | |
411 | |||
412 | /* the counter & cccr registers we pay attention to */ | 411 | /* the counter & cccr registers we pay attention to */ |
413 | for (i = 0; i < num_counters; ++i) { | 412 | for (i = 0; i < num_counters; ++i) { |
414 | addr = p4_counters[VIRT_CTR(stag, i)].counter_address; | 413 | addr = p4_counters[VIRT_CTR(stag, i)].counter_address; |
415 | cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; | 414 | cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; |
416 | if (reserve_perfctr_nmi(addr)){ | 415 | if (reserve_perfctr_nmi(addr)) { |
417 | msrs->counters[i].addr = addr; | 416 | msrs->counters[i].addr = addr; |
418 | msrs->controls[i].addr = cccraddr; | 417 | msrs->controls[i].addr = cccraddr; |
419 | } | 418 | } |
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) | |||
447 | if (reserve_evntsel_nmi(addr)) | 446 | if (reserve_evntsel_nmi(addr)) |
448 | msrs->controls[i].addr = addr; | 447 | msrs->controls[i].addr = addr; |
449 | } | 448 | } |
450 | 449 | ||
451 | for (addr = MSR_P4_MS_ESCR0 + stag; | 450 | for (addr = MSR_P4_MS_ESCR0 + stag; |
452 | addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { | 451 | addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { |
453 | if (reserve_evntsel_nmi(addr)) | 452 | if (reserve_evntsel_nmi(addr)) |
454 | msrs->controls[i].addr = addr; | 453 | msrs->controls[i].addr = addr; |
455 | } | 454 | } |
456 | 455 | ||
457 | for (addr = MSR_P4_IX_ESCR0 + stag; | 456 | for (addr = MSR_P4_IX_ESCR0 + stag; |
458 | addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { | 457 | addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { |
459 | if (reserve_evntsel_nmi(addr)) | 458 | if (reserve_evntsel_nmi(addr)) |
460 | msrs->controls[i].addr = addr; | 459 | msrs->controls[i].addr = addr; |
461 | } | 460 | } |
462 | 461 | ||
463 | /* there are 2 remaining non-contiguously located ESCRs */ | 462 | /* there are 2 remaining non-contiguously located ESCRs */ |
464 | 463 | ||
465 | if (num_counters == NUM_COUNTERS_NON_HT) { | 464 | if (num_counters == NUM_COUNTERS_NON_HT) { |
466 | /* standard non-HT CPUs handle both remaining ESCRs*/ | 465 | /* standard non-HT CPUs handle both remaining ESCRs*/ |
467 | if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) | 466 | if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) |
468 | msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; | 467 | msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; |
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) | |||
498 | unsigned int stag; | 497 | unsigned int stag; |
499 | 498 | ||
500 | stag = get_stagger(); | 499 | stag = get_stagger(); |
501 | 500 | ||
502 | /* convert from counter *number* to counter *bit* */ | 501 | /* convert from counter *number* to counter *bit* */ |
503 | counter_bit = 1 << VIRT_CTR(stag, ctr); | 502 | counter_bit = 1 << VIRT_CTR(stag, ctr); |
504 | 503 | ||
505 | /* find our event binding structure. */ | 504 | /* find our event binding structure. */ |
506 | if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { | 505 | if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { |
507 | printk(KERN_ERR | 506 | printk(KERN_ERR |
508 | "oprofile: P4 event code 0x%lx out of range\n", | 507 | "oprofile: P4 event code 0x%lx out of range\n", |
509 | counter_config[ctr].event); | 508 | counter_config[ctr].event); |
510 | return; | 509 | return; |
511 | } | 510 | } |
512 | 511 | ||
513 | ev = &(p4_events[counter_config[ctr].event - 1]); | 512 | ev = &(p4_events[counter_config[ctr].event - 1]); |
514 | 513 | ||
515 | for (i = 0; i < maxbind; i++) { | 514 | for (i = 0; i < maxbind; i++) { |
516 | if (ev->bindings[i].virt_counter & counter_bit) { | 515 | if (ev->bindings[i].virt_counter & counter_bit) { |
517 | 516 | ||
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) | |||
526 | ESCR_SET_OS_1(escr, counter_config[ctr].kernel); | 525 | ESCR_SET_OS_1(escr, counter_config[ctr].kernel); |
527 | } | 526 | } |
528 | ESCR_SET_EVENT_SELECT(escr, ev->event_select); | 527 | ESCR_SET_EVENT_SELECT(escr, ev->event_select); |
529 | ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); | 528 | ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); |
530 | ESCR_WRITE(escr, high, ev, i); | 529 | ESCR_WRITE(escr, high, ev, i); |
531 | 530 | ||
532 | /* modify CCCR */ | 531 | /* modify CCCR */ |
533 | CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); | 532 | CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); |
534 | CCCR_CLEAR(cccr); | 533 | CCCR_CLEAR(cccr); |
535 | CCCR_SET_REQUIRED_BITS(cccr); | 534 | CCCR_SET_REQUIRED_BITS(cccr); |
536 | CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); | 535 | CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); |
537 | if (stag == 0) { | 536 | if (stag == 0) |
538 | CCCR_SET_PMI_OVF_0(cccr); | 537 | CCCR_SET_PMI_OVF_0(cccr); |
539 | } else { | 538 | else |
540 | CCCR_SET_PMI_OVF_1(cccr); | 539 | CCCR_SET_PMI_OVF_1(cccr); |
541 | } | ||
542 | CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); | 540 | CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); |
543 | return; | 541 | return; |
544 | } | 542 | } |
545 | } | 543 | } |
546 | 544 | ||
547 | printk(KERN_ERR | 545 | printk(KERN_ERR |
548 | "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", | 546 | "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", |
549 | counter_config[ctr].event, stag, ctr); | 547 | counter_config[ctr].event, stag, ctr); |
550 | } | 548 | } |
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) | |||
559 | stag = get_stagger(); | 557 | stag = get_stagger(); |
560 | 558 | ||
561 | rdmsr(MSR_IA32_MISC_ENABLE, low, high); | 559 | rdmsr(MSR_IA32_MISC_ENABLE, low, high); |
562 | if (! MISC_PMC_ENABLED_P(low)) { | 560 | if (!MISC_PMC_ENABLED_P(low)) { |
563 | printk(KERN_ERR "oprofile: P4 PMC not available\n"); | 561 | printk(KERN_ERR "oprofile: P4 PMC not available\n"); |
564 | return; | 562 | return; |
565 | } | 563 | } |
566 | 564 | ||
567 | /* clear the cccrs we will use */ | 565 | /* clear the cccrs we will use */ |
568 | for (i = 0 ; i < num_counters ; i++) { | 566 | for (i = 0 ; i < num_counters ; i++) { |
569 | if (unlikely(!CTRL_IS_RESERVED(msrs,i))) | 567 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) |
570 | continue; | 568 | continue; |
571 | rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); | 569 | rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); |
572 | CCCR_CLEAR(low); | 570 | CCCR_CLEAR(low); |
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) | |||
576 | 574 | ||
577 | /* clear all escrs (including those outside our concern) */ | 575 | /* clear all escrs (including those outside our concern) */ |
578 | for (i = num_counters; i < num_controls; i++) { | 576 | for (i = num_counters; i < num_controls; i++) { |
579 | if (unlikely(!CTRL_IS_RESERVED(msrs,i))) | 577 | if (unlikely(!CTRL_IS_RESERVED(msrs, i))) |
580 | continue; | 578 | continue; |
581 | wrmsr(msrs->controls[i].addr, 0, 0); | 579 | wrmsr(msrs->controls[i].addr, 0, 0); |
582 | } | 580 | } |
583 | 581 | ||
584 | /* setup all counters */ | 582 | /* setup all counters */ |
585 | for (i = 0 ; i < num_counters ; ++i) { | 583 | for (i = 0 ; i < num_counters ; ++i) { |
586 | if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { | 584 | if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { |
587 | reset_value[i] = counter_config[i].count; | 585 | reset_value[i] = counter_config[i].count; |
588 | pmc_setup_one_p4_counter(i); | 586 | pmc_setup_one_p4_counter(i); |
589 | CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); | 587 | CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); |
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs, | |||
603 | stag = get_stagger(); | 601 | stag = get_stagger(); |
604 | 602 | ||
605 | for (i = 0; i < num_counters; ++i) { | 603 | for (i = 0; i < num_counters; ++i) { |
606 | 604 | ||
607 | if (!reset_value[i]) | 605 | if (!reset_value[i]) |
608 | continue; | 606 | continue; |
609 | 607 | ||
610 | /* | 608 | /* |
611 | * there is some eccentricity in the hardware which | 609 | * there is some eccentricity in the hardware which |
612 | * requires that we perform 2 extra corrections: | 610 | * requires that we perform 2 extra corrections: |
613 | * | 611 | * |
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs, | |||
616 | * | 614 | * |
617 | * - write the counter back twice to ensure it gets | 615 | * - write the counter back twice to ensure it gets |
618 | * updated properly. | 616 | * updated properly. |
619 | * | 617 | * |
620 | * the former seems to be related to extra NMIs happening | 618 | * the former seems to be related to extra NMIs happening |
621 | * during the current NMI; the latter is reported as errata | 619 | * during the current NMI; the latter is reported as errata |
622 | * N15 in intel doc 249199-029, pentium 4 specification | 620 | * N15 in intel doc 249199-029, pentium 4 specification |
623 | * update, though their suggested work-around does not | 621 | * update, though their suggested work-around does not |
624 | * appear to solve the problem. | 622 | * appear to solve the problem. |
625 | */ | 623 | */ |
626 | 624 | ||
627 | real = VIRT_CTR(stag, i); | 625 | real = VIRT_CTR(stag, i); |
628 | 626 | ||
629 | CCCR_READ(low, high, real); | 627 | CCCR_READ(low, high, real); |
630 | CTR_READ(ctr, high, real); | 628 | CTR_READ(ctr, high, real); |
631 | if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { | 629 | if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { |
632 | oprofile_add_sample(regs, i); | 630 | oprofile_add_sample(regs, i); |
633 | CTR_WRITE(reset_value[i], real); | 631 | CTR_WRITE(reset_value[i], real); |
634 | CCCR_CLEAR_OVF(low); | 632 | CCCR_CLEAR_OVF(low); |
635 | CCCR_WRITE(low, high, real); | 633 | CCCR_WRITE(low, high, real); |
636 | CTR_WRITE(reset_value[i], real); | 634 | CTR_WRITE(reset_value[i], real); |
637 | } | 635 | } |
638 | } | 636 | } |
639 | 637 | ||
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs) | |||
683 | int i; | 681 | int i; |
684 | 682 | ||
685 | for (i = 0 ; i < num_counters ; ++i) { | 683 | for (i = 0 ; i < num_counters ; ++i) { |
686 | if (CTR_IS_RESERVED(msrs,i)) | 684 | if (CTR_IS_RESERVED(msrs, i)) |
687 | release_perfctr_nmi(msrs->counters[i].addr); | 685 | release_perfctr_nmi(msrs->counters[i].addr); |
688 | } | 686 | } |
689 | /* some of the control registers are specially reserved in | 687 | /* |
688 | * some of the control registers are specially reserved in | ||
690 | * conjunction with the counter registers (hence the starting offset). | 689 | * conjunction with the counter registers (hence the starting offset). |
691 | * This saves a few bits. | 690 | * This saves a few bits. |
692 | */ | 691 | */ |
693 | for (i = num_counters ; i < num_controls ; ++i) { | 692 | for (i = num_counters ; i < num_controls ; ++i) { |
694 | if (CTRL_IS_RESERVED(msrs,i)) | 693 | if (CTRL_IS_RESERVED(msrs, i)) |
695 | release_evntsel_nmi(msrs->controls[i].addr); | 694 | release_evntsel_nmi(msrs->controls[i].addr); |
696 | } | 695 | } |
697 | } | 696 | } |
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 45b605fa71d0..05a0261ba0c3 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h | |||
@@ -32,6 +32,8 @@ struct pt_regs; | |||
32 | * various x86 CPU models' perfctr support. | 32 | * various x86 CPU models' perfctr support. |
33 | */ | 33 | */ |
34 | struct op_x86_model_spec { | 34 | struct op_x86_model_spec { |
35 | int (*init)(struct oprofile_operations *ops); | ||
36 | void (*exit)(void); | ||
35 | unsigned int const num_counters; | 37 | unsigned int const num_counters; |
36 | unsigned int const num_controls; | 38 | unsigned int const num_controls; |
37 | void (*fill_in_addresses)(struct op_msrs * const msrs); | 39 | void (*fill_in_addresses)(struct op_msrs * const msrs); |
@@ -46,6 +48,6 @@ struct op_x86_model_spec { | |||
46 | extern struct op_x86_model_spec const op_ppro_spec; | 48 | extern struct op_x86_model_spec const op_ppro_spec; |
47 | extern struct op_x86_model_spec const op_p4_spec; | 49 | extern struct op_x86_model_spec const op_p4_spec; |
48 | extern struct op_x86_model_spec const op_p4_ht2_spec; | 50 | extern struct op_x86_model_spec const op_p4_ht2_spec; |
49 | extern struct op_x86_model_spec const op_athlon_spec; | 51 | extern struct op_x86_model_spec const op_amd_spec; |
50 | 52 | ||
51 | #endif /* OP_X86_MODEL_H */ | 53 | #endif /* OP_X86_MODEL_H */ |
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 19af06927fbc..1d88d2b39771 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c | |||
@@ -250,10 +250,5 @@ int __init pci_acpi_init(void) | |||
250 | acpi_pci_irq_enable(dev); | 250 | acpi_pci_irq_enable(dev); |
251 | } | 251 | } |
252 | 252 | ||
253 | #ifdef CONFIG_X86_IO_APIC | ||
254 | if (acpi_ioapic) | ||
255 | print_IO_APIC(); | ||
256 | #endif | ||
257 | |||
258 | return 0; | 253 | return 0; |
259 | } | 254 | } |
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 6a0fca78c362..22e057665e55 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c | |||
@@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self, | |||
580 | unsigned long action, void *hcpu) | 580 | unsigned long action, void *hcpu) |
581 | { | 581 | { |
582 | int cpu = (long)hcpu; | 582 | int cpu = (long)hcpu; |
583 | switch(action) { | 583 | switch (action) { |
584 | case CPU_ONLINE: | 584 | case CPU_ONLINE: |
585 | case CPU_ONLINE_FROZEN: | 585 | case CPU_ONLINE_FROZEN: |
586 | smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); | 586 | smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); |
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 4bdaa590375d..3c27a809393b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c | |||
@@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size); | |||
511 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); | 511 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); |
512 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); | 512 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); |
513 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); | 513 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); |
514 | |||
515 | /* | ||
516 | * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from | ||
517 | * confusing the PCI engine: | ||
518 | */ | ||
519 | static void sb600_disable_hpet_bar(struct pci_dev *dev) | ||
520 | { | ||
521 | u8 val; | ||
522 | |||
523 | /* | ||
524 | * The SB600 and SB700 both share the same device | ||
525 | * ID, but the PM register 0x55 does something different | ||
526 | * for the SB700, so make sure we are dealing with the | ||
527 | * SB600 before touching the bit: | ||
528 | */ | ||
529 | |||
530 | pci_read_config_byte(dev, 0x08, &val); | ||
531 | |||
532 | if (val < 0x2F) { | ||
533 | outb(0x55, 0xCD6); | ||
534 | val = inb(0xCD7); | ||
535 | |||
536 | /* Set bit 7 in PM register 0x55 */ | ||
537 | outb(0x55, 0xCD6); | ||
538 | outb(val | 0x80, 0xCD7); | ||
539 | } | ||
540 | } | ||
541 | DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); | ||
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 8791fc55e715..844df0cbbd3e 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | 34 | ||
35 | #include <asm/pat.h> | 35 | #include <asm/pat.h> |
36 | #include <asm/e820.h> | ||
36 | 37 | ||
37 | #include "pci.h" | 38 | #include "pci.h" |
38 | 39 | ||
@@ -227,6 +228,8 @@ void __init pcibios_resource_survey(void) | |||
227 | pcibios_allocate_bus_resources(&pci_root_buses); | 228 | pcibios_allocate_bus_resources(&pci_root_buses); |
228 | pcibios_allocate_resources(0); | 229 | pcibios_allocate_resources(0); |
229 | pcibios_allocate_resources(1); | 230 | pcibios_allocate_resources(1); |
231 | |||
232 | e820_reserve_resources_late(); | ||
230 | } | 233 | } |
231 | 234 | ||
232 | /** | 235 | /** |
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 8e077185e185..006599db0dc7 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c | |||
@@ -1043,35 +1043,44 @@ static void __init pcibios_fixup_irqs(void) | |||
1043 | if (io_apic_assign_pci_irqs) { | 1043 | if (io_apic_assign_pci_irqs) { |
1044 | int irq; | 1044 | int irq; |
1045 | 1045 | ||
1046 | if (pin) { | 1046 | if (!pin) |
1047 | /* | 1047 | continue; |
1048 | * interrupt pins are numbered starting | 1048 | |
1049 | * from 1 | 1049 | /* |
1050 | */ | 1050 | * interrupt pins are numbered starting from 1 |
1051 | pin--; | 1051 | */ |
1052 | irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, | 1052 | pin--; |
1053 | PCI_SLOT(dev->devfn), pin); | 1053 | irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, |
1054 | /* | 1054 | PCI_SLOT(dev->devfn), pin); |
1055 | * Busses behind bridges are typically not listed in the MP-table. | 1055 | /* |
1056 | * In this case we have to look up the IRQ based on the parent bus, | 1056 | * Busses behind bridges are typically not listed in the |
1057 | * parent slot, and pin number. The SMP code detects such bridged | 1057 | * MP-table. In this case we have to look up the IRQ |
1058 | * busses itself so we should get into this branch reliably. | 1058 | * based on the parent bus, parent slot, and pin number. |
1059 | */ | 1059 | * The SMP code detects such bridged busses itself so we |
1060 | if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ | 1060 | * should get into this branch reliably. |
1061 | struct pci_dev *bridge = dev->bus->self; | 1061 | */ |
1062 | 1062 | if (irq < 0 && dev->bus->parent) { | |
1063 | pin = (pin + PCI_SLOT(dev->devfn)) % 4; | 1063 | /* go back to the bridge */ |
1064 | irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, | 1064 | struct pci_dev *bridge = dev->bus->self; |
1065 | PCI_SLOT(bridge->devfn), pin); | 1065 | int bus; |
1066 | if (irq >= 0) | 1066 | |
1067 | dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n", | 1067 | pin = (pin + PCI_SLOT(dev->devfn)) % 4; |
1068 | pci_name(bridge), | 1068 | bus = bridge->bus->number; |
1069 | 'A' + pin, irq); | 1069 | irq = IO_APIC_get_PCI_irq_vector(bus, |
1070 | } | 1070 | PCI_SLOT(bridge->devfn), pin); |
1071 | if (irq >= 0) { | 1071 | if (irq >= 0) |
1072 | dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq); | 1072 | dev_warn(&dev->dev, |
1073 | dev->irq = irq; | 1073 | "using bridge %s INT %c to " |
1074 | } | 1074 | "get IRQ %d\n", |
1075 | pci_name(bridge), | ||
1076 | 'A' + pin, irq); | ||
1077 | } | ||
1078 | if (irq >= 0) { | ||
1079 | dev_info(&dev->dev, | ||
1080 | "PCI->APIC IRQ transform: INT %c " | ||
1081 | "-> IRQ %d\n", | ||
1082 | 'A' + pin, irq); | ||
1083 | dev->irq = irq; | ||
1075 | } | 1084 | } |
1076 | } | 1085 | } |
1077 | #endif | 1086 | #endif |
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index d9635764ce3d..654a2234f8f3 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c | |||
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void) | |||
209 | return name != NULL; | 209 | return name != NULL; |
210 | } | 210 | } |
211 | 211 | ||
212 | static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) | 212 | static void __init pci_mmcfg_insert_resources(void) |
213 | { | 213 | { |
214 | #define PCI_MMCFG_RESOURCE_NAME_LEN 19 | 214 | #define PCI_MMCFG_RESOURCE_NAME_LEN 19 |
215 | int i; | 215 | int i; |
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) | |||
233 | cfg->pci_segment); | 233 | cfg->pci_segment); |
234 | res->start = cfg->address; | 234 | res->start = cfg->address; |
235 | res->end = res->start + (num_buses << 20) - 1; | 235 | res->end = res->start + (num_buses << 20) - 1; |
236 | res->flags = IORESOURCE_MEM | resource_flags; | 236 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
237 | insert_resource(&iomem_resource, res); | 237 | insert_resource(&iomem_resource, res); |
238 | names += PCI_MMCFG_RESOURCE_NAME_LEN; | 238 | names += PCI_MMCFG_RESOURCE_NAME_LEN; |
239 | } | 239 | } |
@@ -434,11 +434,9 @@ static void __init __pci_mmcfg_init(int early) | |||
434 | (pci_mmcfg_config[0].address == 0)) | 434 | (pci_mmcfg_config[0].address == 0)) |
435 | return; | 435 | return; |
436 | 436 | ||
437 | if (pci_mmcfg_arch_init()) { | 437 | if (pci_mmcfg_arch_init()) |
438 | if (known_bridge) | ||
439 | pci_mmcfg_insert_resources(IORESOURCE_BUSY); | ||
440 | pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; | 438 | pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; |
441 | } else { | 439 | else { |
442 | /* | 440 | /* |
443 | * Signal not to attempt to insert mmcfg resources because | 441 | * Signal not to attempt to insert mmcfg resources because |
444 | * the architecture mmcfg setup could not initialize. | 442 | * the architecture mmcfg setup could not initialize. |
@@ -475,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void) | |||
475 | * marked so it won't cause request errors when __request_region is | 473 | * marked so it won't cause request errors when __request_region is |
476 | * called. | 474 | * called. |
477 | */ | 475 | */ |
478 | pci_mmcfg_insert_resources(0); | 476 | pci_mmcfg_insert_resources(); |
479 | 477 | ||
480 | return 0; | 478 | return 0; |
481 | } | 479 | } |
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index d3e083dea720..274d06082f48 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/suspend.h> | 11 | #include <linux/suspend.h> |
12 | #include <asm/mtrr.h> | 12 | #include <asm/mtrr.h> |
13 | #include <asm/mce.h> | 13 | #include <asm/mce.h> |
14 | #include <asm/xcr.h> | ||
14 | 15 | ||
15 | static struct saved_context saved_context; | 16 | static struct saved_context saved_context; |
16 | 17 | ||
@@ -126,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
126 | if (boot_cpu_has(X86_FEATURE_SEP)) | 127 | if (boot_cpu_has(X86_FEATURE_SEP)) |
127 | enable_sep_cpu(); | 128 | enable_sep_cpu(); |
128 | 129 | ||
130 | /* | ||
131 | * restore XCR0 for xsave capable cpu's. | ||
132 | */ | ||
133 | if (cpu_has_xsave) | ||
134 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | ||
135 | |||
129 | fix_processor_context(); | 136 | fix_processor_context(); |
130 | do_fpu_end(); | 137 | do_fpu_end(); |
131 | mtrr_ap_init(); | 138 | mtrr_ap_init(); |
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 66bdfb591fd8..e3b6cf70d62c 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <asm/page.h> | 14 | #include <asm/page.h> |
15 | #include <asm/pgtable.h> | 15 | #include <asm/pgtable.h> |
16 | #include <asm/mtrr.h> | 16 | #include <asm/mtrr.h> |
17 | #include <asm/xcr.h> | ||
17 | 18 | ||
18 | static void fix_processor_context(void); | 19 | static void fix_processor_context(void); |
19 | 20 | ||
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt) | |||
122 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); | 123 | wrmsrl(MSR_GS_BASE, ctxt->gs_base); |
123 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); | 124 | wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); |
124 | 125 | ||
126 | /* | ||
127 | * restore XCR0 for xsave capable cpu's. | ||
128 | */ | ||
129 | if (cpu_has_xsave) | ||
130 | xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); | ||
131 | |||
125 | fix_processor_context(); | 132 | fix_processor_context(); |
126 | 133 | ||
127 | do_fpu_end(); | 134 | do_fpu_end(); |
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 4fc7e872c85e..d1e9b53f9d33 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S | |||
@@ -1,5 +1,3 @@ | |||
1 | .text | ||
2 | |||
3 | /* | 1 | /* |
4 | * This may not use any stack, nor any variable that is not "NoSave": | 2 | * This may not use any stack, nor any variable that is not "NoSave": |
5 | * | 3 | * |
@@ -12,17 +10,18 @@ | |||
12 | #include <asm/segment.h> | 10 | #include <asm/segment.h> |
13 | #include <asm/page.h> | 11 | #include <asm/page.h> |
14 | #include <asm/asm-offsets.h> | 12 | #include <asm/asm-offsets.h> |
13 | #include <asm/processor-flags.h> | ||
15 | 14 | ||
16 | .text | 15 | .text |
17 | 16 | ||
18 | ENTRY(swsusp_arch_suspend) | 17 | ENTRY(swsusp_arch_suspend) |
19 | |||
20 | movl %esp, saved_context_esp | 18 | movl %esp, saved_context_esp |
21 | movl %ebx, saved_context_ebx | 19 | movl %ebx, saved_context_ebx |
22 | movl %ebp, saved_context_ebp | 20 | movl %ebp, saved_context_ebp |
23 | movl %esi, saved_context_esi | 21 | movl %esi, saved_context_esi |
24 | movl %edi, saved_context_edi | 22 | movl %edi, saved_context_edi |
25 | pushfl ; popl saved_context_eflags | 23 | pushfl |
24 | popl saved_context_eflags | ||
26 | 25 | ||
27 | call swsusp_save | 26 | call swsusp_save |
28 | ret | 27 | ret |
@@ -59,7 +58,7 @@ done: | |||
59 | movl mmu_cr4_features, %ecx | 58 | movl mmu_cr4_features, %ecx |
60 | jecxz 1f # cr4 Pentium and higher, skip if zero | 59 | jecxz 1f # cr4 Pentium and higher, skip if zero |
61 | movl %ecx, %edx | 60 | movl %ecx, %edx |
62 | andl $~(1<<7), %edx; # PGE | 61 | andl $~(X86_CR4_PGE), %edx |
63 | movl %edx, %cr4; # turn off PGE | 62 | movl %edx, %cr4; # turn off PGE |
64 | 1: | 63 | 1: |
65 | movl %cr3, %eax; # flush TLB | 64 | movl %cr3, %eax; # flush TLB |
@@ -74,7 +73,8 @@ done: | |||
74 | movl saved_context_esi, %esi | 73 | movl saved_context_esi, %esi |
75 | movl saved_context_edi, %edi | 74 | movl saved_context_edi, %edi |
76 | 75 | ||
77 | pushl saved_context_eflags ; popfl | 76 | pushl saved_context_eflags |
77 | popfl | ||
78 | 78 | ||
79 | xorl %eax, %eax | 79 | xorl %eax, %eax |
80 | 80 | ||
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 3815e425f470..87b9ab166423 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig | |||
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY | |||
26 | 26 | ||
27 | config XEN_SAVE_RESTORE | 27 | config XEN_SAVE_RESTORE |
28 | bool | 28 | bool |
29 | depends on PM | 29 | depends on XEN && PM |
30 | default y \ No newline at end of file | 30 | default y |
31 | |||
32 | config XEN_DEBUG_FS | ||
33 | bool "Enable Xen debug and tuning parameters in debugfs" | ||
34 | depends on XEN && DEBUG_FS | ||
35 | default n | ||
36 | help | ||
37 | Enable statistics output and various tuning options in debugfs. | ||
38 | Enabling this option may incur a significant performance overhead. | ||
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 59c1e539aed2..313947940a1a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile | |||
@@ -1,4 +1,12 @@ | |||
1 | obj-y := enlighten.o setup.o multicalls.o mmu.o \ | 1 | ifdef CONFIG_FTRACE |
2 | # Do not profile debug and lowlevel utilities | ||
3 | CFLAGS_REMOVE_spinlock.o = -pg | ||
4 | CFLAGS_REMOVE_time.o = -pg | ||
5 | CFLAGS_REMOVE_irq.o = -pg | ||
6 | endif | ||
7 | |||
8 | obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ | ||
2 | time.o xen-asm_$(BITS).o grant-table.o suspend.o | 9 | time.o xen-asm_$(BITS).o grant-table.o suspend.o |
3 | 10 | ||
4 | obj-$(CONFIG_SMP) += smp.o | 11 | obj-$(CONFIG_SMP) += smp.o spinlock.o |
12 | obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file | ||
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 000000000000..b53225d2cac3 --- /dev/null +++ b/arch/x86/xen/debugfs.c | |||
@@ -0,0 +1,123 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/debugfs.h> | ||
3 | #include <linux/module.h> | ||
4 | |||
5 | #include "debugfs.h" | ||
6 | |||
7 | static struct dentry *d_xen_debug; | ||
8 | |||
9 | struct dentry * __init xen_init_debugfs(void) | ||
10 | { | ||
11 | if (!d_xen_debug) { | ||
12 | d_xen_debug = debugfs_create_dir("xen", NULL); | ||
13 | |||
14 | if (!d_xen_debug) | ||
15 | pr_warning("Could not create 'xen' debugfs directory\n"); | ||
16 | } | ||
17 | |||
18 | return d_xen_debug; | ||
19 | } | ||
20 | |||
21 | struct array_data | ||
22 | { | ||
23 | void *array; | ||
24 | unsigned elements; | ||
25 | }; | ||
26 | |||
27 | static int u32_array_open(struct inode *inode, struct file *file) | ||
28 | { | ||
29 | file->private_data = NULL; | ||
30 | return nonseekable_open(inode, file); | ||
31 | } | ||
32 | |||
33 | static size_t format_array(char *buf, size_t bufsize, const char *fmt, | ||
34 | u32 *array, unsigned array_size) | ||
35 | { | ||
36 | size_t ret = 0; | ||
37 | unsigned i; | ||
38 | |||
39 | for(i = 0; i < array_size; i++) { | ||
40 | size_t len; | ||
41 | |||
42 | len = snprintf(buf, bufsize, fmt, array[i]); | ||
43 | len++; /* ' ' or '\n' */ | ||
44 | ret += len; | ||
45 | |||
46 | if (buf) { | ||
47 | buf += len; | ||
48 | bufsize -= len; | ||
49 | buf[-1] = (i == array_size-1) ? '\n' : ' '; | ||
50 | } | ||
51 | } | ||
52 | |||
53 | ret++; /* \0 */ | ||
54 | if (buf) | ||
55 | *buf = '\0'; | ||
56 | |||
57 | return ret; | ||
58 | } | ||
59 | |||
60 | static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) | ||
61 | { | ||
62 | size_t len = format_array(NULL, 0, fmt, array, array_size); | ||
63 | char *ret; | ||
64 | |||
65 | ret = kmalloc(len, GFP_KERNEL); | ||
66 | if (ret == NULL) | ||
67 | return NULL; | ||
68 | |||
69 | format_array(ret, len, fmt, array, array_size); | ||
70 | return ret; | ||
71 | } | ||
72 | |||
73 | static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, | ||
74 | loff_t *ppos) | ||
75 | { | ||
76 | struct inode *inode = file->f_path.dentry->d_inode; | ||
77 | struct array_data *data = inode->i_private; | ||
78 | size_t size; | ||
79 | |||
80 | if (*ppos == 0) { | ||
81 | if (file->private_data) { | ||
82 | kfree(file->private_data); | ||
83 | file->private_data = NULL; | ||
84 | } | ||
85 | |||
86 | file->private_data = format_array_alloc("%u", data->array, data->elements); | ||
87 | } | ||
88 | |||
89 | size = 0; | ||
90 | if (file->private_data) | ||
91 | size = strlen(file->private_data); | ||
92 | |||
93 | return simple_read_from_buffer(buf, len, ppos, file->private_data, size); | ||
94 | } | ||
95 | |||
96 | static int xen_array_release(struct inode *inode, struct file *file) | ||
97 | { | ||
98 | kfree(file->private_data); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | static struct file_operations u32_array_fops = { | ||
104 | .owner = THIS_MODULE, | ||
105 | .open = u32_array_open, | ||
106 | .release= xen_array_release, | ||
107 | .read = u32_array_read, | ||
108 | }; | ||
109 | |||
110 | struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, | ||
111 | struct dentry *parent, | ||
112 | u32 *array, unsigned elements) | ||
113 | { | ||
114 | struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); | ||
115 | |||
116 | if (data == NULL) | ||
117 | return NULL; | ||
118 | |||
119 | data->array = array; | ||
120 | data->elements = elements; | ||
121 | |||
122 | return debugfs_create_file(name, mode, parent, data, &u32_array_fops); | ||
123 | } | ||
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 000000000000..e28132084832 --- /dev/null +++ b/arch/x86/xen/debugfs.h | |||
@@ -0,0 +1,10 @@ | |||
1 | #ifndef _XEN_DEBUGFS_H | ||
2 | #define _XEN_DEBUGFS_H | ||
3 | |||
4 | struct dentry * __init xen_init_debugfs(void); | ||
5 | |||
6 | struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, | ||
7 | struct dentry *parent, | ||
8 | u32 *array, unsigned elements); | ||
9 | |||
10 | #endif /* _XEN_DEBUGFS_H */ | ||
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..0013a729b41d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -30,12 +30,12 @@ | |||
30 | #include <xen/interface/xen.h> | 30 | #include <xen/interface/xen.h> |
31 | #include <xen/interface/physdev.h> | 31 | #include <xen/interface/physdev.h> |
32 | #include <xen/interface/vcpu.h> | 32 | #include <xen/interface/vcpu.h> |
33 | #include <xen/interface/sched.h> | ||
34 | #include <xen/features.h> | 33 | #include <xen/features.h> |
35 | #include <xen/page.h> | 34 | #include <xen/page.h> |
36 | #include <xen/hvc-console.h> | 35 | #include <xen/hvc-console.h> |
37 | 36 | ||
38 | #include <asm/paravirt.h> | 37 | #include <asm/paravirt.h> |
38 | #include <asm/apic.h> | ||
39 | #include <asm/page.h> | 39 | #include <asm/page.h> |
40 | #include <asm/xen/hypercall.h> | 40 | #include <asm/xen/hypercall.h> |
41 | #include <asm/xen/hypervisor.h> | 41 | #include <asm/xen/hypervisor.h> |
@@ -57,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); | |||
57 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); | 57 | DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); |
58 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); | 58 | DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); |
59 | 59 | ||
60 | enum xen_domain_type xen_domain_type = XEN_NATIVE; | ||
61 | EXPORT_SYMBOL_GPL(xen_domain_type); | ||
62 | |||
60 | /* | 63 | /* |
61 | * Identity map, in addition to plain kernel map. This needs to be | 64 | * Identity map, in addition to plain kernel map. This needs to be |
62 | * large enough to allocate page table pages to allocate the rest. | 65 | * large enough to allocate page table pages to allocate the rest. |
@@ -110,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; | |||
110 | * | 113 | * |
111 | * 0: not available, 1: available | 114 | * 0: not available, 1: available |
112 | */ | 115 | */ |
113 | static int have_vcpu_info_placement = 1; | 116 | static int have_vcpu_info_placement = |
117 | #ifdef CONFIG_X86_32 | ||
118 | 1 | ||
119 | #else | ||
120 | 0 | ||
121 | #endif | ||
122 | ; | ||
123 | |||
114 | 124 | ||
115 | static void xen_vcpu_setup(int cpu) | 125 | static void xen_vcpu_setup(int cpu) |
116 | { | 126 | { |
@@ -226,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg) | |||
226 | return HYPERVISOR_get_debugreg(reg); | 236 | return HYPERVISOR_get_debugreg(reg); |
227 | } | 237 | } |
228 | 238 | ||
229 | static unsigned long xen_save_fl(void) | 239 | static void xen_leave_lazy(void) |
230 | { | 240 | { |
231 | struct vcpu_info *vcpu; | 241 | paravirt_leave_lazy(paravirt_get_lazy_mode()); |
232 | unsigned long flags; | 242 | xen_mc_flush(); |
233 | |||
234 | vcpu = x86_read_percpu(xen_vcpu); | ||
235 | |||
236 | /* flag has opposite sense of mask */ | ||
237 | flags = !vcpu->evtchn_upcall_mask; | ||
238 | |||
239 | /* convert to IF type flag | ||
240 | -0 -> 0x00000000 | ||
241 | -1 -> 0xffffffff | ||
242 | */ | ||
243 | return (-flags) & X86_EFLAGS_IF; | ||
244 | } | 243 | } |
245 | 244 | ||
246 | static void xen_restore_fl(unsigned long flags) | 245 | static unsigned long xen_store_tr(void) |
247 | { | 246 | { |
248 | struct vcpu_info *vcpu; | 247 | return 0; |
249 | |||
250 | /* convert from IF type flag */ | ||
251 | flags = !(flags & X86_EFLAGS_IF); | ||
252 | |||
253 | /* There's a one instruction preempt window here. We need to | ||
254 | make sure we're don't switch CPUs between getting the vcpu | ||
255 | pointer and updating the mask. */ | ||
256 | preempt_disable(); | ||
257 | vcpu = x86_read_percpu(xen_vcpu); | ||
258 | vcpu->evtchn_upcall_mask = flags; | ||
259 | preempt_enable_no_resched(); | ||
260 | |||
261 | /* Doesn't matter if we get preempted here, because any | ||
262 | pending event will get dealt with anyway. */ | ||
263 | |||
264 | if (flags == 0) { | ||
265 | preempt_check_resched(); | ||
266 | barrier(); /* unmask then check (avoid races) */ | ||
267 | if (unlikely(vcpu->evtchn_upcall_pending)) | ||
268 | force_evtchn_callback(); | ||
269 | } | ||
270 | } | 248 | } |
271 | 249 | ||
272 | static void xen_irq_disable(void) | 250 | /* |
251 | * Set the page permissions for a particular virtual address. If the | ||
252 | * address is a vmalloc mapping (or other non-linear mapping), then | ||
253 | * find the linear mapping of the page and also set its protections to | ||
254 | * match. | ||
255 | */ | ||
256 | static void set_aliased_prot(void *v, pgprot_t prot) | ||
273 | { | 257 | { |
274 | /* There's a one instruction preempt window here. We need to | 258 | int level; |
275 | make sure we're don't switch CPUs between getting the vcpu | 259 | pte_t *ptep; |
276 | pointer and updating the mask. */ | 260 | pte_t pte; |
277 | preempt_disable(); | 261 | unsigned long pfn; |
278 | x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; | 262 | struct page *page; |
279 | preempt_enable_no_resched(); | ||
280 | } | ||
281 | 263 | ||
282 | static void xen_irq_enable(void) | 264 | ptep = lookup_address((unsigned long)v, &level); |
283 | { | 265 | BUG_ON(ptep == NULL); |
284 | struct vcpu_info *vcpu; | ||
285 | 266 | ||
286 | /* We don't need to worry about being preempted here, since | 267 | pfn = pte_pfn(*ptep); |
287 | either a) interrupts are disabled, so no preemption, or b) | 268 | page = pfn_to_page(pfn); |
288 | the caller is confused and is trying to re-enable interrupts | ||
289 | on an indeterminate processor. */ | ||
290 | 269 | ||
291 | vcpu = x86_read_percpu(xen_vcpu); | 270 | pte = pfn_pte(pfn, prot); |
292 | vcpu->evtchn_upcall_mask = 0; | ||
293 | 271 | ||
294 | /* Doesn't matter if we get preempted here, because any | 272 | if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) |
295 | pending event will get dealt with anyway. */ | 273 | BUG(); |
296 | 274 | ||
297 | barrier(); /* unmask then check (avoid races) */ | 275 | if (!PageHighMem(page)) { |
298 | if (unlikely(vcpu->evtchn_upcall_pending)) | 276 | void *av = __va(PFN_PHYS(pfn)); |
299 | force_evtchn_callback(); | ||
300 | } | ||
301 | 277 | ||
302 | static void xen_safe_halt(void) | 278 | if (av != v) |
303 | { | 279 | if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) |
304 | /* Blocking includes an implicit local_irq_enable(). */ | 280 | BUG(); |
305 | if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) | 281 | } else |
306 | BUG(); | 282 | kmap_flush_unused(); |
307 | } | 283 | } |
308 | 284 | ||
309 | static void xen_halt(void) | 285 | static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) |
310 | { | 286 | { |
311 | if (irqs_disabled()) | 287 | const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; |
312 | HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); | 288 | int i; |
313 | else | ||
314 | xen_safe_halt(); | ||
315 | } | ||
316 | 289 | ||
317 | static void xen_leave_lazy(void) | 290 | for(i = 0; i < entries; i += entries_per_page) |
318 | { | 291 | set_aliased_prot(ldt + i, PAGE_KERNEL_RO); |
319 | paravirt_leave_lazy(paravirt_get_lazy_mode()); | ||
320 | xen_mc_flush(); | ||
321 | } | 292 | } |
322 | 293 | ||
323 | static unsigned long xen_store_tr(void) | 294 | static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) |
324 | { | 295 | { |
325 | return 0; | 296 | const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; |
297 | int i; | ||
298 | |||
299 | for(i = 0; i < entries; i += entries_per_page) | ||
300 | set_aliased_prot(ldt + i, PAGE_KERNEL); | ||
326 | } | 301 | } |
327 | 302 | ||
328 | static void xen_set_ldt(const void *addr, unsigned entries) | 303 | static void xen_set_ldt(const void *addr, unsigned entries) |
@@ -425,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx) | |||
425 | static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, | 400 | static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, |
426 | const void *ptr) | 401 | const void *ptr) |
427 | { | 402 | { |
428 | unsigned long lp = (unsigned long)&dt[entrynum]; | 403 | xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); |
429 | xmaddr_t mach_lp = virt_to_machine(lp); | ||
430 | u64 entry = *(u64 *)ptr; | 404 | u64 entry = *(u64 *)ptr; |
431 | 405 | ||
432 | preempt_disable(); | 406 | preempt_disable(); |
@@ -559,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, | |||
559 | } | 533 | } |
560 | 534 | ||
561 | static void xen_load_sp0(struct tss_struct *tss, | 535 | static void xen_load_sp0(struct tss_struct *tss, |
562 | struct thread_struct *thread) | 536 | struct thread_struct *thread) |
563 | { | 537 | { |
564 | struct multicall_space mcs = xen_mc_entry(0); | 538 | struct multicall_space mcs = xen_mc_entry(0); |
565 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); | 539 | MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); |
@@ -580,16 +554,47 @@ static void xen_io_delay(void) | |||
580 | } | 554 | } |
581 | 555 | ||
582 | #ifdef CONFIG_X86_LOCAL_APIC | 556 | #ifdef CONFIG_X86_LOCAL_APIC |
583 | static u32 xen_apic_read(unsigned long reg) | 557 | static u32 xen_apic_read(u32 reg) |
584 | { | 558 | { |
585 | return 0; | 559 | return 0; |
586 | } | 560 | } |
587 | 561 | ||
588 | static void xen_apic_write(unsigned long reg, u32 val) | 562 | static void xen_apic_write(u32 reg, u32 val) |
589 | { | 563 | { |
590 | /* Warn to see if there's any stray references */ | 564 | /* Warn to see if there's any stray references */ |
591 | WARN_ON(1); | 565 | WARN_ON(1); |
592 | } | 566 | } |
567 | |||
568 | static u64 xen_apic_icr_read(void) | ||
569 | { | ||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | static void xen_apic_icr_write(u32 low, u32 id) | ||
574 | { | ||
575 | /* Warn to see if there's any stray references */ | ||
576 | WARN_ON(1); | ||
577 | } | ||
578 | |||
579 | static void xen_apic_wait_icr_idle(void) | ||
580 | { | ||
581 | return; | ||
582 | } | ||
583 | |||
584 | static u32 xen_safe_apic_wait_icr_idle(void) | ||
585 | { | ||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static struct apic_ops xen_basic_apic_ops = { | ||
590 | .read = xen_apic_read, | ||
591 | .write = xen_apic_write, | ||
592 | .icr_read = xen_apic_icr_read, | ||
593 | .icr_write = xen_apic_icr_write, | ||
594 | .wait_icr_idle = xen_apic_wait_icr_idle, | ||
595 | .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, | ||
596 | }; | ||
597 | |||
593 | #endif | 598 | #endif |
594 | 599 | ||
595 | static void xen_flush_tlb(void) | 600 | static void xen_flush_tlb(void) |
@@ -803,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) | |||
803 | ret = -EFAULT; | 808 | ret = -EFAULT; |
804 | break; | 809 | break; |
805 | #endif | 810 | #endif |
811 | |||
812 | case MSR_STAR: | ||
813 | case MSR_CSTAR: | ||
814 | case MSR_LSTAR: | ||
815 | case MSR_SYSCALL_MASK: | ||
816 | case MSR_IA32_SYSENTER_CS: | ||
817 | case MSR_IA32_SYSENTER_ESP: | ||
818 | case MSR_IA32_SYSENTER_EIP: | ||
819 | /* Fast syscall setup is all done in hypercalls, so | ||
820 | these are all ignored. Stub them out here to stop | ||
821 | Xen console noise. */ | ||
822 | break; | ||
823 | |||
806 | default: | 824 | default: |
807 | ret = native_write_msr_safe(msr, low, high); | 825 | ret = native_write_msr_safe(msr, low, high); |
808 | } | 826 | } |
@@ -812,7 +830,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) | |||
812 | 830 | ||
813 | /* Early in boot, while setting up the initial pagetable, assume | 831 | /* Early in boot, while setting up the initial pagetable, assume |
814 | everything is pinned. */ | 832 | everything is pinned. */ |
815 | static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) | 833 | static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) |
816 | { | 834 | { |
817 | #ifdef CONFIG_FLATMEM | 835 | #ifdef CONFIG_FLATMEM |
818 | BUG_ON(mem_map); /* should only be used early */ | 836 | BUG_ON(mem_map); /* should only be used early */ |
@@ -822,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) | |||
822 | 840 | ||
823 | /* Early release_pte assumes that all pts are pinned, since there's | 841 | /* Early release_pte assumes that all pts are pinned, since there's |
824 | only init_mm and anything attached to that is pinned. */ | 842 | only init_mm and anything attached to that is pinned. */ |
825 | static void xen_release_pte_init(u32 pfn) | 843 | static void xen_release_pte_init(unsigned long pfn) |
826 | { | 844 | { |
827 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 845 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
828 | } | 846 | } |
@@ -838,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) | |||
838 | 856 | ||
839 | /* This needs to make sure the new pte page is pinned iff its being | 857 | /* This needs to make sure the new pte page is pinned iff its being |
840 | attached to a pinned pagetable. */ | 858 | attached to a pinned pagetable. */ |
841 | static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) | 859 | static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) |
842 | { | 860 | { |
843 | struct page *page = pfn_to_page(pfn); | 861 | struct page *page = pfn_to_page(pfn); |
844 | 862 | ||
@@ -846,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) | |||
846 | SetPagePinned(page); | 864 | SetPagePinned(page); |
847 | 865 | ||
848 | if (!PageHighMem(page)) { | 866 | if (!PageHighMem(page)) { |
849 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 867 | make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); |
850 | if (level == PT_PTE) | 868 | if (level == PT_PTE && USE_SPLIT_PTLOCKS) |
851 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); | 869 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
852 | } else | 870 | } else |
853 | /* make sure there are no stray mappings of | 871 | /* make sure there are no stray mappings of |
@@ -856,12 +874,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) | |||
856 | } | 874 | } |
857 | } | 875 | } |
858 | 876 | ||
859 | static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) | 877 | static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) |
860 | { | 878 | { |
861 | xen_alloc_ptpage(mm, pfn, PT_PTE); | 879 | xen_alloc_ptpage(mm, pfn, PT_PTE); |
862 | } | 880 | } |
863 | 881 | ||
864 | static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) | 882 | static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) |
865 | { | 883 | { |
866 | xen_alloc_ptpage(mm, pfn, PT_PMD); | 884 | xen_alloc_ptpage(mm, pfn, PT_PMD); |
867 | } | 885 | } |
@@ -909,13 +927,13 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
909 | } | 927 | } |
910 | 928 | ||
911 | /* This should never happen until we're OK to use struct page */ | 929 | /* This should never happen until we're OK to use struct page */ |
912 | static void xen_release_ptpage(u32 pfn, unsigned level) | 930 | static void xen_release_ptpage(unsigned long pfn, unsigned level) |
913 | { | 931 | { |
914 | struct page *page = pfn_to_page(pfn); | 932 | struct page *page = pfn_to_page(pfn); |
915 | 933 | ||
916 | if (PagePinned(page)) { | 934 | if (PagePinned(page)) { |
917 | if (!PageHighMem(page)) { | 935 | if (!PageHighMem(page)) { |
918 | if (level == PT_PTE) | 936 | if (level == PT_PTE && USE_SPLIT_PTLOCKS) |
919 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); | 937 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); |
920 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 938 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
921 | } | 939 | } |
@@ -923,23 +941,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level) | |||
923 | } | 941 | } |
924 | } | 942 | } |
925 | 943 | ||
926 | static void xen_release_pte(u32 pfn) | 944 | static void xen_release_pte(unsigned long pfn) |
927 | { | 945 | { |
928 | xen_release_ptpage(pfn, PT_PTE); | 946 | xen_release_ptpage(pfn, PT_PTE); |
929 | } | 947 | } |
930 | 948 | ||
931 | static void xen_release_pmd(u32 pfn) | 949 | static void xen_release_pmd(unsigned long pfn) |
932 | { | 950 | { |
933 | xen_release_ptpage(pfn, PT_PMD); | 951 | xen_release_ptpage(pfn, PT_PMD); |
934 | } | 952 | } |
935 | 953 | ||
936 | #if PAGETABLE_LEVELS == 4 | 954 | #if PAGETABLE_LEVELS == 4 |
937 | static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) | 955 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) |
938 | { | 956 | { |
939 | xen_alloc_ptpage(mm, pfn, PT_PUD); | 957 | xen_alloc_ptpage(mm, pfn, PT_PUD); |
940 | } | 958 | } |
941 | 959 | ||
942 | static void xen_release_pud(u32 pfn) | 960 | static void xen_release_pud(unsigned long pfn) |
943 | { | 961 | { |
944 | xen_release_ptpage(pfn, PT_PUD); | 962 | xen_release_ptpage(pfn, PT_PUD); |
945 | } | 963 | } |
@@ -962,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) | |||
962 | } | 980 | } |
963 | #endif | 981 | #endif |
964 | 982 | ||
983 | #ifdef CONFIG_X86_32 | ||
965 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | 984 | static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) |
966 | { | 985 | { |
967 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ | 986 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ |
@@ -980,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) | |||
980 | 999 | ||
981 | xen_set_pte(ptep, pte); | 1000 | xen_set_pte(ptep, pte); |
982 | } | 1001 | } |
1002 | #endif | ||
983 | 1003 | ||
984 | static __init void xen_pagetable_setup_start(pgd_t *base) | 1004 | static __init void xen_pagetable_setup_start(pgd_t *base) |
985 | { | 1005 | { |
@@ -1046,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void) | |||
1046 | 1066 | ||
1047 | /* xen_vcpu_setup managed to place the vcpu_info within the | 1067 | /* xen_vcpu_setup managed to place the vcpu_info within the |
1048 | percpu area for all cpus, so make use of it */ | 1068 | percpu area for all cpus, so make use of it */ |
1049 | #ifdef CONFIG_X86_32 | ||
1050 | if (have_vcpu_info_placement) { | 1069 | if (have_vcpu_info_placement) { |
1051 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); | 1070 | printk(KERN_INFO "Xen: using vcpu_info placement\n"); |
1052 | 1071 | ||
@@ -1056,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void) | |||
1056 | pv_irq_ops.irq_enable = xen_irq_enable_direct; | 1075 | pv_irq_ops.irq_enable = xen_irq_enable_direct; |
1057 | pv_mmu_ops.read_cr2 = xen_read_cr2_direct; | 1076 | pv_mmu_ops.read_cr2 = xen_read_cr2_direct; |
1058 | } | 1077 | } |
1059 | #endif | ||
1060 | } | 1078 | } |
1061 | 1079 | ||
1062 | static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | 1080 | static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, |
@@ -1077,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, | |||
1077 | goto patch_site | 1095 | goto patch_site |
1078 | 1096 | ||
1079 | switch (type) { | 1097 | switch (type) { |
1080 | #ifdef CONFIG_X86_32 | ||
1081 | SITE(pv_irq_ops, irq_enable); | 1098 | SITE(pv_irq_ops, irq_enable); |
1082 | SITE(pv_irq_ops, irq_disable); | 1099 | SITE(pv_irq_ops, irq_disable); |
1083 | SITE(pv_irq_ops, save_fl); | 1100 | SITE(pv_irq_ops, save_fl); |
1084 | SITE(pv_irq_ops, restore_fl); | 1101 | SITE(pv_irq_ops, restore_fl); |
1085 | #endif /* CONFIG_X86_32 */ | ||
1086 | #undef SITE | 1102 | #undef SITE |
1087 | 1103 | ||
1088 | patch_site: | 1104 | patch_site: |
@@ -1220,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { | |||
1220 | .load_gs_index = xen_load_gs_index, | 1236 | .load_gs_index = xen_load_gs_index, |
1221 | #endif | 1237 | #endif |
1222 | 1238 | ||
1239 | .alloc_ldt = xen_alloc_ldt, | ||
1240 | .free_ldt = xen_free_ldt, | ||
1241 | |||
1223 | .store_gdt = native_store_gdt, | 1242 | .store_gdt = native_store_gdt, |
1224 | .store_idt = native_store_idt, | 1243 | .store_idt = native_store_idt, |
1225 | .store_tr = xen_store_tr, | 1244 | .store_tr = xen_store_tr, |
@@ -1241,40 +1260,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { | |||
1241 | }, | 1260 | }, |
1242 | }; | 1261 | }; |
1243 | 1262 | ||
1244 | static void __init __xen_init_IRQ(void) | ||
1245 | { | ||
1246 | #ifdef CONFIG_X86_64 | ||
1247 | int i; | ||
1248 | |||
1249 | /* Create identity vector->irq map */ | ||
1250 | for(i = 0; i < NR_VECTORS; i++) { | ||
1251 | int cpu; | ||
1252 | |||
1253 | for_each_possible_cpu(cpu) | ||
1254 | per_cpu(vector_irq, cpu)[i] = i; | ||
1255 | } | ||
1256 | #endif /* CONFIG_X86_64 */ | ||
1257 | |||
1258 | xen_init_IRQ(); | ||
1259 | } | ||
1260 | |||
1261 | static const struct pv_irq_ops xen_irq_ops __initdata = { | ||
1262 | .init_IRQ = __xen_init_IRQ, | ||
1263 | .save_fl = xen_save_fl, | ||
1264 | .restore_fl = xen_restore_fl, | ||
1265 | .irq_disable = xen_irq_disable, | ||
1266 | .irq_enable = xen_irq_enable, | ||
1267 | .safe_halt = xen_safe_halt, | ||
1268 | .halt = xen_halt, | ||
1269 | #ifdef CONFIG_X86_64 | ||
1270 | .adjust_exception_frame = xen_adjust_exception_frame, | ||
1271 | #endif | ||
1272 | }; | ||
1273 | |||
1274 | static const struct pv_apic_ops xen_apic_ops __initdata = { | 1263 | static const struct pv_apic_ops xen_apic_ops __initdata = { |
1275 | #ifdef CONFIG_X86_LOCAL_APIC | 1264 | #ifdef CONFIG_X86_LOCAL_APIC |
1276 | .apic_write = xen_apic_write, | ||
1277 | .apic_read = xen_apic_read, | ||
1278 | .setup_boot_clock = paravirt_nop, | 1265 | .setup_boot_clock = paravirt_nop, |
1279 | .setup_secondary_clock = paravirt_nop, | 1266 | .setup_secondary_clock = paravirt_nop, |
1280 | .startup_ipi_hook = paravirt_nop, | 1267 | .startup_ipi_hook = paravirt_nop, |
@@ -1324,7 +1311,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { | |||
1324 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, | 1311 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, |
1325 | 1312 | ||
1326 | .pte_val = xen_pte_val, | 1313 | .pte_val = xen_pte_val, |
1327 | .pte_flags = native_pte_val, | 1314 | .pte_flags = native_pte_flags, |
1328 | .pgd_val = xen_pgd_val, | 1315 | .pgd_val = xen_pgd_val, |
1329 | 1316 | ||
1330 | .make_pte = xen_make_pte, | 1317 | .make_pte = xen_make_pte, |
@@ -1413,7 +1400,7 @@ static void __init xen_reserve_top(void) | |||
1413 | if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) | 1400 | if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) |
1414 | top = pp.virt_start; | 1401 | top = pp.virt_start; |
1415 | 1402 | ||
1416 | reserve_top_address(-top + 2 * PAGE_SIZE); | 1403 | reserve_top_address(-top); |
1417 | #endif /* CONFIG_X86_32 */ | 1404 | #endif /* CONFIG_X86_32 */ |
1418 | } | 1405 | } |
1419 | 1406 | ||
@@ -1447,48 +1434,11 @@ static void *m2v(phys_addr_t maddr) | |||
1447 | return __ka(m2p(maddr)); | 1434 | return __ka(m2p(maddr)); |
1448 | } | 1435 | } |
1449 | 1436 | ||
1450 | #ifdef CONFIG_X86_64 | ||
1451 | static void walk(pgd_t *pgd, unsigned long addr) | ||
1452 | { | ||
1453 | unsigned l4idx = pgd_index(addr); | ||
1454 | unsigned l3idx = pud_index(addr); | ||
1455 | unsigned l2idx = pmd_index(addr); | ||
1456 | unsigned l1idx = pte_index(addr); | ||
1457 | pgd_t l4; | ||
1458 | pud_t l3; | ||
1459 | pmd_t l2; | ||
1460 | pte_t l1; | ||
1461 | |||
1462 | xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", | ||
1463 | pgd, addr, l4idx, l3idx, l2idx, l1idx); | ||
1464 | |||
1465 | l4 = pgd[l4idx]; | ||
1466 | xen_raw_printk(" l4: %016lx\n", l4.pgd); | ||
1467 | xen_raw_printk(" %016lx\n", pgd_val(l4)); | ||
1468 | |||
1469 | l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; | ||
1470 | xen_raw_printk(" l3: %016lx\n", l3.pud); | ||
1471 | xen_raw_printk(" %016lx\n", pud_val(l3)); | ||
1472 | |||
1473 | l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; | ||
1474 | xen_raw_printk(" l2: %016lx\n", l2.pmd); | ||
1475 | xen_raw_printk(" %016lx\n", pmd_val(l2)); | ||
1476 | |||
1477 | l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; | ||
1478 | xen_raw_printk(" l1: %016lx\n", l1.pte); | ||
1479 | xen_raw_printk(" %016lx\n", pte_val(l1)); | ||
1480 | } | ||
1481 | #endif | ||
1482 | |||
1483 | static void set_page_prot(void *addr, pgprot_t prot) | 1437 | static void set_page_prot(void *addr, pgprot_t prot) |
1484 | { | 1438 | { |
1485 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; | 1439 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; |
1486 | pte_t pte = pfn_pte(pfn, prot); | 1440 | pte_t pte = pfn_pte(pfn, prot); |
1487 | 1441 | ||
1488 | xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", | ||
1489 | addr, pfn, get_phys_to_machine(pfn), | ||
1490 | pgprot_val(prot), pte.pte); | ||
1491 | |||
1492 | if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) | 1442 | if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) |
1493 | BUG(); | 1443 | BUG(); |
1494 | } | 1444 | } |
@@ -1664,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void) | |||
1664 | if (!xen_start_info) | 1614 | if (!xen_start_info) |
1665 | return; | 1615 | return; |
1666 | 1616 | ||
1617 | xen_domain_type = XEN_PV_DOMAIN; | ||
1618 | |||
1667 | BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); | 1619 | BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); |
1668 | 1620 | ||
1669 | xen_setup_features(); | 1621 | xen_setup_features(); |
@@ -1673,10 +1625,18 @@ asmlinkage void __init xen_start_kernel(void) | |||
1673 | pv_init_ops = xen_init_ops; | 1625 | pv_init_ops = xen_init_ops; |
1674 | pv_time_ops = xen_time_ops; | 1626 | pv_time_ops = xen_time_ops; |
1675 | pv_cpu_ops = xen_cpu_ops; | 1627 | pv_cpu_ops = xen_cpu_ops; |
1676 | pv_irq_ops = xen_irq_ops; | ||
1677 | pv_apic_ops = xen_apic_ops; | 1628 | pv_apic_ops = xen_apic_ops; |
1678 | pv_mmu_ops = xen_mmu_ops; | 1629 | pv_mmu_ops = xen_mmu_ops; |
1679 | 1630 | ||
1631 | xen_init_irq_ops(); | ||
1632 | |||
1633 | #ifdef CONFIG_X86_LOCAL_APIC | ||
1634 | /* | ||
1635 | * set up the basic apic ops. | ||
1636 | */ | ||
1637 | apic_ops = &xen_basic_apic_ops; | ||
1638 | #endif | ||
1639 | |||
1680 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { | 1640 | if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { |
1681 | pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; | 1641 | pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; |
1682 | pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; | 1642 | pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; |
@@ -1700,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1700 | 1660 | ||
1701 | /* Prevent unwanted bits from being set in PTEs. */ | 1661 | /* Prevent unwanted bits from being set in PTEs. */ |
1702 | __supported_pte_mask &= ~_PAGE_GLOBAL; | 1662 | __supported_pte_mask &= ~_PAGE_GLOBAL; |
1703 | if (!is_initial_xendomain()) | 1663 | if (!xen_initial_domain()) |
1704 | __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); | 1664 | __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); |
1705 | 1665 | ||
1706 | /* Don't do the full vcpu_info placement stuff until we have a | 1666 | /* Don't do the full vcpu_info placement stuff until we have a |
@@ -1735,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void) | |||
1735 | boot_params.hdr.ramdisk_size = xen_start_info->mod_len; | 1695 | boot_params.hdr.ramdisk_size = xen_start_info->mod_len; |
1736 | boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); | 1696 | boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); |
1737 | 1697 | ||
1738 | if (!is_initial_xendomain()) { | 1698 | if (!xen_initial_domain()) { |
1739 | add_preferred_console("xenboot", 0, NULL); | 1699 | add_preferred_console("xenboot", 0, NULL); |
1740 | add_preferred_console("tty", 0, NULL); | 1700 | add_preferred_console("tty", 0, NULL); |
1741 | add_preferred_console("hvc", 0, NULL); | 1701 | add_preferred_console("hvc", 0, NULL); |
@@ -1743,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void) | |||
1743 | 1703 | ||
1744 | xen_raw_console_write("about to get started...\n"); | 1704 | xen_raw_console_write("about to get started...\n"); |
1745 | 1705 | ||
1746 | #if 0 | ||
1747 | xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", | ||
1748 | &boot_params, __pa_symbol(&boot_params), | ||
1749 | __va(__pa_symbol(&boot_params))); | ||
1750 | |||
1751 | walk(pgd, &boot_params); | ||
1752 | walk(pgd, __va(__pa(&boot_params))); | ||
1753 | #endif | ||
1754 | |||
1755 | /* Start the world */ | 1706 | /* Start the world */ |
1756 | #ifdef CONFIG_X86_32 | 1707 | #ifdef CONFIG_X86_32 |
1757 | i386_start_kernel(); | 1708 | i386_start_kernel(); |
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 000000000000..28b85ab8422e --- /dev/null +++ b/arch/x86/xen/irq.c | |||
@@ -0,0 +1,143 @@ | |||
1 | #include <linux/hardirq.h> | ||
2 | |||
3 | #include <xen/interface/xen.h> | ||
4 | #include <xen/interface/sched.h> | ||
5 | #include <xen/interface/vcpu.h> | ||
6 | |||
7 | #include <asm/xen/hypercall.h> | ||
8 | #include <asm/xen/hypervisor.h> | ||
9 | |||
10 | #include "xen-ops.h" | ||
11 | |||
12 | /* | ||
13 | * Force a proper event-channel callback from Xen after clearing the | ||
14 | * callback mask. We do this in a very simple manner, by making a call | ||
15 | * down into Xen. The pending flag will be checked by Xen on return. | ||
16 | */ | ||
17 | void xen_force_evtchn_callback(void) | ||
18 | { | ||
19 | (void)HYPERVISOR_xen_version(0, NULL); | ||
20 | } | ||
21 | |||
22 | static void __init __xen_init_IRQ(void) | ||
23 | { | ||
24 | #ifdef CONFIG_X86_64 | ||
25 | int i; | ||
26 | |||
27 | /* Create identity vector->irq map */ | ||
28 | for(i = 0; i < NR_VECTORS; i++) { | ||
29 | int cpu; | ||
30 | |||
31 | for_each_possible_cpu(cpu) | ||
32 | per_cpu(vector_irq, cpu)[i] = i; | ||
33 | } | ||
34 | #endif /* CONFIG_X86_64 */ | ||
35 | |||
36 | xen_init_IRQ(); | ||
37 | } | ||
38 | |||
39 | static unsigned long xen_save_fl(void) | ||
40 | { | ||
41 | struct vcpu_info *vcpu; | ||
42 | unsigned long flags; | ||
43 | |||
44 | vcpu = x86_read_percpu(xen_vcpu); | ||
45 | |||
46 | /* flag has opposite sense of mask */ | ||
47 | flags = !vcpu->evtchn_upcall_mask; | ||
48 | |||
49 | /* convert to IF type flag | ||
50 | -0 -> 0x00000000 | ||
51 | -1 -> 0xffffffff | ||
52 | */ | ||
53 | return (-flags) & X86_EFLAGS_IF; | ||
54 | } | ||
55 | |||
56 | static void xen_restore_fl(unsigned long flags) | ||
57 | { | ||
58 | struct vcpu_info *vcpu; | ||
59 | |||
60 | /* convert from IF type flag */ | ||
61 | flags = !(flags & X86_EFLAGS_IF); | ||
62 | |||
63 | /* There's a one instruction preempt window here. We need to | ||
64 | make sure we're don't switch CPUs between getting the vcpu | ||
65 | pointer and updating the mask. */ | ||
66 | preempt_disable(); | ||
67 | vcpu = x86_read_percpu(xen_vcpu); | ||
68 | vcpu->evtchn_upcall_mask = flags; | ||
69 | preempt_enable_no_resched(); | ||
70 | |||
71 | /* Doesn't matter if we get preempted here, because any | ||
72 | pending event will get dealt with anyway. */ | ||
73 | |||
74 | if (flags == 0) { | ||
75 | preempt_check_resched(); | ||
76 | barrier(); /* unmask then check (avoid races) */ | ||
77 | if (unlikely(vcpu->evtchn_upcall_pending)) | ||
78 | xen_force_evtchn_callback(); | ||
79 | } | ||
80 | } | ||
81 | |||
82 | static void xen_irq_disable(void) | ||
83 | { | ||
84 | /* There's a one instruction preempt window here. We need to | ||
85 | make sure we're don't switch CPUs between getting the vcpu | ||
86 | pointer and updating the mask. */ | ||
87 | preempt_disable(); | ||
88 | x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; | ||
89 | preempt_enable_no_resched(); | ||
90 | } | ||
91 | |||
92 | static void xen_irq_enable(void) | ||
93 | { | ||
94 | struct vcpu_info *vcpu; | ||
95 | |||
96 | /* We don't need to worry about being preempted here, since | ||
97 | either a) interrupts are disabled, so no preemption, or b) | ||
98 | the caller is confused and is trying to re-enable interrupts | ||
99 | on an indeterminate processor. */ | ||
100 | |||
101 | vcpu = x86_read_percpu(xen_vcpu); | ||
102 | vcpu->evtchn_upcall_mask = 0; | ||
103 | |||
104 | /* Doesn't matter if we get preempted here, because any | ||
105 | pending event will get dealt with anyway. */ | ||
106 | |||
107 | barrier(); /* unmask then check (avoid races) */ | ||
108 | if (unlikely(vcpu->evtchn_upcall_pending)) | ||
109 | xen_force_evtchn_callback(); | ||
110 | } | ||
111 | |||
112 | static void xen_safe_halt(void) | ||
113 | { | ||
114 | /* Blocking includes an implicit local_irq_enable(). */ | ||
115 | if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) | ||
116 | BUG(); | ||
117 | } | ||
118 | |||
119 | static void xen_halt(void) | ||
120 | { | ||
121 | if (irqs_disabled()) | ||
122 | HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); | ||
123 | else | ||
124 | xen_safe_halt(); | ||
125 | } | ||
126 | |||
127 | static const struct pv_irq_ops xen_irq_ops __initdata = { | ||
128 | .init_IRQ = __xen_init_IRQ, | ||
129 | .save_fl = xen_save_fl, | ||
130 | .restore_fl = xen_restore_fl, | ||
131 | .irq_disable = xen_irq_disable, | ||
132 | .irq_enable = xen_irq_enable, | ||
133 | .safe_halt = xen_safe_halt, | ||
134 | .halt = xen_halt, | ||
135 | #ifdef CONFIG_X86_64 | ||
136 | .adjust_exception_frame = xen_adjust_exception_frame, | ||
137 | #endif | ||
138 | }; | ||
139 | |||
140 | void __init xen_init_irq_ops() | ||
141 | { | ||
142 | pv_irq_ops = xen_irq_ops; | ||
143 | } | ||
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da696..ae173f6edd8b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -40,6 +40,7 @@ | |||
40 | */ | 40 | */ |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/debugfs.h> | ||
43 | #include <linux/bug.h> | 44 | #include <linux/bug.h> |
44 | 45 | ||
45 | #include <asm/pgtable.h> | 46 | #include <asm/pgtable.h> |
@@ -57,6 +58,61 @@ | |||
57 | 58 | ||
58 | #include "multicalls.h" | 59 | #include "multicalls.h" |
59 | #include "mmu.h" | 60 | #include "mmu.h" |
61 | #include "debugfs.h" | ||
62 | |||
63 | #define MMU_UPDATE_HISTO 30 | ||
64 | |||
65 | #ifdef CONFIG_XEN_DEBUG_FS | ||
66 | |||
67 | static struct { | ||
68 | u32 pgd_update; | ||
69 | u32 pgd_update_pinned; | ||
70 | u32 pgd_update_batched; | ||
71 | |||
72 | u32 pud_update; | ||
73 | u32 pud_update_pinned; | ||
74 | u32 pud_update_batched; | ||
75 | |||
76 | u32 pmd_update; | ||
77 | u32 pmd_update_pinned; | ||
78 | u32 pmd_update_batched; | ||
79 | |||
80 | u32 pte_update; | ||
81 | u32 pte_update_pinned; | ||
82 | u32 pte_update_batched; | ||
83 | |||
84 | u32 mmu_update; | ||
85 | u32 mmu_update_extended; | ||
86 | u32 mmu_update_histo[MMU_UPDATE_HISTO]; | ||
87 | |||
88 | u32 prot_commit; | ||
89 | u32 prot_commit_batched; | ||
90 | |||
91 | u32 set_pte_at; | ||
92 | u32 set_pte_at_batched; | ||
93 | u32 set_pte_at_pinned; | ||
94 | u32 set_pte_at_current; | ||
95 | u32 set_pte_at_kernel; | ||
96 | } mmu_stats; | ||
97 | |||
98 | static u8 zero_stats; | ||
99 | |||
100 | static inline void check_zero(void) | ||
101 | { | ||
102 | if (unlikely(zero_stats)) { | ||
103 | memset(&mmu_stats, 0, sizeof(mmu_stats)); | ||
104 | zero_stats = 0; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | #define ADD_STATS(elem, val) \ | ||
109 | do { check_zero(); mmu_stats.elem += (val); } while(0) | ||
110 | |||
111 | #else /* !CONFIG_XEN_DEBUG_FS */ | ||
112 | |||
113 | #define ADD_STATS(elem, val) do { (void)(val); } while(0) | ||
114 | |||
115 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
60 | 116 | ||
61 | /* | 117 | /* |
62 | * Just beyond the highest usermode address. STACK_TOP_MAX has a | 118 | * Just beyond the highest usermode address. STACK_TOP_MAX has a |
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr) | |||
229 | } | 285 | } |
230 | 286 | ||
231 | 287 | ||
232 | static bool page_pinned(void *ptr) | 288 | static bool xen_page_pinned(void *ptr) |
233 | { | 289 | { |
234 | struct page *page = virt_to_page(ptr); | 290 | struct page *page = virt_to_page(ptr); |
235 | 291 | ||
236 | return PagePinned(page); | 292 | return PagePinned(page); |
237 | } | 293 | } |
238 | 294 | ||
239 | static void extend_mmu_update(const struct mmu_update *update) | 295 | static void xen_extend_mmu_update(const struct mmu_update *update) |
240 | { | 296 | { |
241 | struct multicall_space mcs; | 297 | struct multicall_space mcs; |
242 | struct mmu_update *u; | 298 | struct mmu_update *u; |
243 | 299 | ||
244 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); | 300 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); |
245 | 301 | ||
246 | if (mcs.mc != NULL) | 302 | if (mcs.mc != NULL) { |
303 | ADD_STATS(mmu_update_extended, 1); | ||
304 | ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); | ||
305 | |||
247 | mcs.mc->args[1]++; | 306 | mcs.mc->args[1]++; |
248 | else { | 307 | |
308 | if (mcs.mc->args[1] < MMU_UPDATE_HISTO) | ||
309 | ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); | ||
310 | else | ||
311 | ADD_STATS(mmu_update_histo[0], 1); | ||
312 | } else { | ||
313 | ADD_STATS(mmu_update, 1); | ||
249 | mcs = __xen_mc_entry(sizeof(*u)); | 314 | mcs = __xen_mc_entry(sizeof(*u)); |
250 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); | 315 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
316 | ADD_STATS(mmu_update_histo[1], 1); | ||
251 | } | 317 | } |
252 | 318 | ||
253 | u = mcs.args; | 319 | u = mcs.args; |
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | |||
265 | /* ptr may be ioremapped for 64-bit pagetable setup */ | 331 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
266 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; | 332 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
267 | u.val = pmd_val_ma(val); | 333 | u.val = pmd_val_ma(val); |
268 | extend_mmu_update(&u); | 334 | xen_extend_mmu_update(&u); |
335 | |||
336 | ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
269 | 337 | ||
270 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 338 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
271 | 339 | ||
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | |||
274 | 342 | ||
275 | void xen_set_pmd(pmd_t *ptr, pmd_t val) | 343 | void xen_set_pmd(pmd_t *ptr, pmd_t val) |
276 | { | 344 | { |
345 | ADD_STATS(pmd_update, 1); | ||
346 | |||
277 | /* If page is not pinned, we can just update the entry | 347 | /* If page is not pinned, we can just update the entry |
278 | directly */ | 348 | directly */ |
279 | if (!page_pinned(ptr)) { | 349 | if (!xen_page_pinned(ptr)) { |
280 | *ptr = val; | 350 | *ptr = val; |
281 | return; | 351 | return; |
282 | } | 352 | } |
283 | 353 | ||
354 | ADD_STATS(pmd_update_pinned, 1); | ||
355 | |||
284 | xen_set_pmd_hyper(ptr, val); | 356 | xen_set_pmd_hyper(ptr, val); |
285 | } | 357 | } |
286 | 358 | ||
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
300 | if (mm == &init_mm) | 372 | if (mm == &init_mm) |
301 | preempt_disable(); | 373 | preempt_disable(); |
302 | 374 | ||
375 | ADD_STATS(set_pte_at, 1); | ||
376 | // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); | ||
377 | ADD_STATS(set_pte_at_current, mm == current->mm); | ||
378 | ADD_STATS(set_pte_at_kernel, mm == &init_mm); | ||
379 | |||
303 | if (mm == current->mm || mm == &init_mm) { | 380 | if (mm == current->mm || mm == &init_mm) { |
304 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 381 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { |
305 | struct multicall_space mcs; | 382 | struct multicall_space mcs; |
306 | mcs = xen_mc_entry(0); | 383 | mcs = xen_mc_entry(0); |
307 | 384 | ||
308 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); | 385 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); |
386 | ADD_STATS(set_pte_at_batched, 1); | ||
309 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 387 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
310 | goto out; | 388 | goto out; |
311 | } else | 389 | } else |
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | |||
334 | 412 | ||
335 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; | 413 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; |
336 | u.val = pte_val_ma(pte); | 414 | u.val = pte_val_ma(pte); |
337 | extend_mmu_update(&u); | 415 | xen_extend_mmu_update(&u); |
416 | |||
417 | ADD_STATS(prot_commit, 1); | ||
418 | ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
338 | 419 | ||
339 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 420 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
340 | } | 421 | } |
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) | |||
400 | /* ptr may be ioremapped for 64-bit pagetable setup */ | 481 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
401 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; | 482 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
402 | u.val = pud_val_ma(val); | 483 | u.val = pud_val_ma(val); |
403 | extend_mmu_update(&u); | 484 | xen_extend_mmu_update(&u); |
485 | |||
486 | ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
404 | 487 | ||
405 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 488 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
406 | 489 | ||
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) | |||
409 | 492 | ||
410 | void xen_set_pud(pud_t *ptr, pud_t val) | 493 | void xen_set_pud(pud_t *ptr, pud_t val) |
411 | { | 494 | { |
495 | ADD_STATS(pud_update, 1); | ||
496 | |||
412 | /* If page is not pinned, we can just update the entry | 497 | /* If page is not pinned, we can just update the entry |
413 | directly */ | 498 | directly */ |
414 | if (!page_pinned(ptr)) { | 499 | if (!xen_page_pinned(ptr)) { |
415 | *ptr = val; | 500 | *ptr = val; |
416 | return; | 501 | return; |
417 | } | 502 | } |
418 | 503 | ||
504 | ADD_STATS(pud_update_pinned, 1); | ||
505 | |||
419 | xen_set_pud_hyper(ptr, val); | 506 | xen_set_pud_hyper(ptr, val); |
420 | } | 507 | } |
421 | 508 | ||
422 | void xen_set_pte(pte_t *ptep, pte_t pte) | 509 | void xen_set_pte(pte_t *ptep, pte_t pte) |
423 | { | 510 | { |
511 | ADD_STATS(pte_update, 1); | ||
512 | // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); | ||
513 | ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
514 | |||
424 | #ifdef CONFIG_X86_PAE | 515 | #ifdef CONFIG_X86_PAE |
425 | ptep->pte_high = pte.pte_high; | 516 | ptep->pte_high = pte.pte_high; |
426 | smp_wmb(); | 517 | smp_wmb(); |
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | |||
490 | 581 | ||
491 | u.ptr = virt_to_machine(ptr).maddr; | 582 | u.ptr = virt_to_machine(ptr).maddr; |
492 | u.val = pgd_val_ma(val); | 583 | u.val = pgd_val_ma(val); |
493 | extend_mmu_update(&u); | 584 | xen_extend_mmu_update(&u); |
494 | } | 585 | } |
495 | 586 | ||
496 | /* | 587 | /* |
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
517 | { | 608 | { |
518 | pgd_t *user_ptr = xen_get_user_pgd(ptr); | 609 | pgd_t *user_ptr = xen_get_user_pgd(ptr); |
519 | 610 | ||
611 | ADD_STATS(pgd_update, 1); | ||
612 | |||
520 | /* If page is not pinned, we can just update the entry | 613 | /* If page is not pinned, we can just update the entry |
521 | directly */ | 614 | directly */ |
522 | if (!page_pinned(ptr)) { | 615 | if (!xen_page_pinned(ptr)) { |
523 | *ptr = val; | 616 | *ptr = val; |
524 | if (user_ptr) { | 617 | if (user_ptr) { |
525 | WARN_ON(page_pinned(user_ptr)); | 618 | WARN_ON(xen_page_pinned(user_ptr)); |
526 | *user_ptr = val; | 619 | *user_ptr = val; |
527 | } | 620 | } |
528 | return; | 621 | return; |
529 | } | 622 | } |
530 | 623 | ||
624 | ADD_STATS(pgd_update_pinned, 1); | ||
625 | ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
626 | |||
531 | /* If it's pinned, then we can at least batch the kernel and | 627 | /* If it's pinned, then we can at least batch the kernel and |
532 | user updates together. */ | 628 | user updates together. */ |
533 | xen_mc_batch(); | 629 | xen_mc_batch(); |
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
555 | * For 64-bit, we must skip the Xen hole in the middle of the address | 651 | * For 64-bit, we must skip the Xen hole in the middle of the address |
556 | * space, just after the big x86-64 virtual hole. | 652 | * space, just after the big x86-64 virtual hole. |
557 | */ | 653 | */ |
558 | static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | 654 | static int xen_pgd_walk(struct mm_struct *mm, |
559 | unsigned long limit) | 655 | int (*func)(struct mm_struct *mm, struct page *, |
656 | enum pt_level), | ||
657 | unsigned long limit) | ||
560 | { | 658 | { |
659 | pgd_t *pgd = mm->pgd; | ||
561 | int flush = 0; | 660 | int flush = 0; |
562 | unsigned hole_low, hole_high; | 661 | unsigned hole_low, hole_high; |
563 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; | 662 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; |
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
590 | pmdidx_limit = 0; | 689 | pmdidx_limit = 0; |
591 | #endif | 690 | #endif |
592 | 691 | ||
593 | flush |= (*func)(virt_to_page(pgd), PT_PGD); | ||
594 | |||
595 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { | 692 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { |
596 | pud_t *pud; | 693 | pud_t *pud; |
597 | 694 | ||
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
604 | pud = pud_offset(&pgd[pgdidx], 0); | 701 | pud = pud_offset(&pgd[pgdidx], 0); |
605 | 702 | ||
606 | if (PTRS_PER_PUD > 1) /* not folded */ | 703 | if (PTRS_PER_PUD > 1) /* not folded */ |
607 | flush |= (*func)(virt_to_page(pud), PT_PUD); | 704 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); |
608 | 705 | ||
609 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { | 706 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { |
610 | pmd_t *pmd; | 707 | pmd_t *pmd; |
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
619 | pmd = pmd_offset(&pud[pudidx], 0); | 716 | pmd = pmd_offset(&pud[pudidx], 0); |
620 | 717 | ||
621 | if (PTRS_PER_PMD > 1) /* not folded */ | 718 | if (PTRS_PER_PMD > 1) /* not folded */ |
622 | flush |= (*func)(virt_to_page(pmd), PT_PMD); | 719 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); |
623 | 720 | ||
624 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { | 721 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { |
625 | struct page *pte; | 722 | struct page *pte; |
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
633 | continue; | 730 | continue; |
634 | 731 | ||
635 | pte = pmd_page(pmd[pmdidx]); | 732 | pte = pmd_page(pmd[pmdidx]); |
636 | flush |= (*func)(pte, PT_PTE); | 733 | flush |= (*func)(mm, pte, PT_PTE); |
637 | } | 734 | } |
638 | } | 735 | } |
639 | } | 736 | } |
737 | |||
640 | out: | 738 | out: |
739 | /* Do the top level last, so that the callbacks can use it as | ||
740 | a cue to do final things like tlb flushes. */ | ||
741 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); | ||
641 | 742 | ||
642 | return flush; | 743 | return flush; |
643 | } | 744 | } |
644 | 745 | ||
645 | static spinlock_t *lock_pte(struct page *page) | 746 | /* If we're using split pte locks, then take the page's lock and |
747 | return a pointer to it. Otherwise return NULL. */ | ||
748 | static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) | ||
646 | { | 749 | { |
647 | spinlock_t *ptl = NULL; | 750 | spinlock_t *ptl = NULL; |
648 | 751 | ||
649 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | 752 | #if USE_SPLIT_PTLOCKS |
650 | ptl = __pte_lockptr(page); | 753 | ptl = __pte_lockptr(page); |
651 | spin_lock(ptl); | 754 | spin_lock_nest_lock(ptl, &mm->page_table_lock); |
652 | #endif | 755 | #endif |
653 | 756 | ||
654 | return ptl; | 757 | return ptl; |
655 | } | 758 | } |
656 | 759 | ||
657 | static void do_unlock(void *v) | 760 | static void xen_pte_unlock(void *v) |
658 | { | 761 | { |
659 | spinlock_t *ptl = v; | 762 | spinlock_t *ptl = v; |
660 | spin_unlock(ptl); | 763 | spin_unlock(ptl); |
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) | |||
672 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 775 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
673 | } | 776 | } |
674 | 777 | ||
675 | static int pin_page(struct page *page, enum pt_level level) | 778 | static int xen_pin_page(struct mm_struct *mm, struct page *page, |
779 | enum pt_level level) | ||
676 | { | 780 | { |
677 | unsigned pgfl = TestSetPagePinned(page); | 781 | unsigned pgfl = TestSetPagePinned(page); |
678 | int flush; | 782 | int flush; |
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level) | |||
691 | 795 | ||
692 | flush = 0; | 796 | flush = 0; |
693 | 797 | ||
798 | /* | ||
799 | * We need to hold the pagetable lock between the time | ||
800 | * we make the pagetable RO and when we actually pin | ||
801 | * it. If we don't, then other users may come in and | ||
802 | * attempt to update the pagetable by writing it, | ||
803 | * which will fail because the memory is RO but not | ||
804 | * pinned, so Xen won't do the trap'n'emulate. | ||
805 | * | ||
806 | * If we're using split pte locks, we can't hold the | ||
807 | * entire pagetable's worth of locks during the | ||
808 | * traverse, because we may wrap the preempt count (8 | ||
809 | * bits). The solution is to mark RO and pin each PTE | ||
810 | * page while holding the lock. This means the number | ||
811 | * of locks we end up holding is never more than a | ||
812 | * batch size (~32 entries, at present). | ||
813 | * | ||
814 | * If we're not using split pte locks, we needn't pin | ||
815 | * the PTE pages independently, because we're | ||
816 | * protected by the overall pagetable lock. | ||
817 | */ | ||
694 | ptl = NULL; | 818 | ptl = NULL; |
695 | if (level == PT_PTE) | 819 | if (level == PT_PTE) |
696 | ptl = lock_pte(page); | 820 | ptl = xen_pte_lock(page, mm); |
697 | 821 | ||
698 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 822 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
699 | pfn_pte(pfn, PAGE_KERNEL_RO), | 823 | pfn_pte(pfn, PAGE_KERNEL_RO), |
700 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); | 824 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
701 | 825 | ||
702 | if (level == PT_PTE) | 826 | if (ptl) { |
703 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); | 827 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); |
704 | 828 | ||
705 | if (ptl) { | ||
706 | /* Queue a deferred unlock for when this batch | 829 | /* Queue a deferred unlock for when this batch |
707 | is completed. */ | 830 | is completed. */ |
708 | xen_mc_callback(do_unlock, ptl); | 831 | xen_mc_callback(xen_pte_unlock, ptl); |
709 | } | 832 | } |
710 | } | 833 | } |
711 | 834 | ||
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level) | |||
715 | /* This is called just after a mm has been created, but it has not | 838 | /* This is called just after a mm has been created, but it has not |
716 | been used yet. We need to make sure that its pagetable is all | 839 | been used yet. We need to make sure that its pagetable is all |
717 | read-only, and can be pinned. */ | 840 | read-only, and can be pinned. */ |
718 | void xen_pgd_pin(pgd_t *pgd) | 841 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) |
719 | { | 842 | { |
720 | xen_mc_batch(); | 843 | xen_mc_batch(); |
721 | 844 | ||
722 | if (pgd_walk(pgd, pin_page, USER_LIMIT)) { | 845 | if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { |
723 | /* re-enable interrupts for kmap_flush_unused */ | 846 | /* re-enable interrupts for kmap_flush_unused */ |
724 | xen_mc_issue(0); | 847 | xen_mc_issue(0); |
725 | kmap_flush_unused(); | 848 | kmap_flush_unused(); |
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) | |||
733 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); | 856 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); |
734 | 857 | ||
735 | if (user_pgd) { | 858 | if (user_pgd) { |
736 | pin_page(virt_to_page(user_pgd), PT_PGD); | 859 | xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); |
737 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); | 860 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); |
738 | } | 861 | } |
739 | } | 862 | } |
740 | #else /* CONFIG_X86_32 */ | 863 | #else /* CONFIG_X86_32 */ |
741 | #ifdef CONFIG_X86_PAE | 864 | #ifdef CONFIG_X86_PAE |
742 | /* Need to make sure unshared kernel PMD is pinnable */ | 865 | /* Need to make sure unshared kernel PMD is pinnable */ |
743 | pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); | 866 | xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), |
867 | PT_PMD); | ||
744 | #endif | 868 | #endif |
745 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); | 869 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); |
746 | #endif /* CONFIG_X86_64 */ | 870 | #endif /* CONFIG_X86_64 */ |
747 | xen_mc_issue(0); | 871 | xen_mc_issue(0); |
748 | } | 872 | } |
749 | 873 | ||
874 | static void xen_pgd_pin(struct mm_struct *mm) | ||
875 | { | ||
876 | __xen_pgd_pin(mm, mm->pgd); | ||
877 | } | ||
878 | |||
750 | /* | 879 | /* |
751 | * On save, we need to pin all pagetables to make sure they get their | 880 | * On save, we need to pin all pagetables to make sure they get their |
752 | * mfns turned into pfns. Search the list for any unpinned pgds and pin | 881 | * mfns turned into pfns. Search the list for any unpinned pgds and pin |
753 | * them (unpinned pgds are not currently in use, probably because the | 882 | * them (unpinned pgds are not currently in use, probably because the |
754 | * process is under construction or destruction). | 883 | * process is under construction or destruction). |
884 | * | ||
885 | * Expected to be called in stop_machine() ("equivalent to taking | ||
886 | * every spinlock in the system"), so the locking doesn't really | ||
887 | * matter all that much. | ||
755 | */ | 888 | */ |
756 | void xen_mm_pin_all(void) | 889 | void xen_mm_pin_all(void) |
757 | { | 890 | { |
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void) | |||
762 | 895 | ||
763 | list_for_each_entry(page, &pgd_list, lru) { | 896 | list_for_each_entry(page, &pgd_list, lru) { |
764 | if (!PagePinned(page)) { | 897 | if (!PagePinned(page)) { |
765 | xen_pgd_pin((pgd_t *)page_address(page)); | 898 | __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); |
766 | SetPageSavePinned(page); | 899 | SetPageSavePinned(page); |
767 | } | 900 | } |
768 | } | 901 | } |
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void) | |||
775 | * that's before we have page structures to store the bits. So do all | 908 | * that's before we have page structures to store the bits. So do all |
776 | * the book-keeping now. | 909 | * the book-keeping now. |
777 | */ | 910 | */ |
778 | static __init int mark_pinned(struct page *page, enum pt_level level) | 911 | static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, |
912 | enum pt_level level) | ||
779 | { | 913 | { |
780 | SetPagePinned(page); | 914 | SetPagePinned(page); |
781 | return 0; | 915 | return 0; |
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level) | |||
783 | 917 | ||
784 | void __init xen_mark_init_mm_pinned(void) | 918 | void __init xen_mark_init_mm_pinned(void) |
785 | { | 919 | { |
786 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | 920 | xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); |
787 | } | 921 | } |
788 | 922 | ||
789 | static int unpin_page(struct page *page, enum pt_level level) | 923 | static int xen_unpin_page(struct mm_struct *mm, struct page *page, |
924 | enum pt_level level) | ||
790 | { | 925 | { |
791 | unsigned pgfl = TestClearPagePinned(page); | 926 | unsigned pgfl = TestClearPagePinned(page); |
792 | 927 | ||
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level) | |||
796 | spinlock_t *ptl = NULL; | 931 | spinlock_t *ptl = NULL; |
797 | struct multicall_space mcs; | 932 | struct multicall_space mcs; |
798 | 933 | ||
934 | /* | ||
935 | * Do the converse to pin_page. If we're using split | ||
936 | * pte locks, we must be holding the lock for while | ||
937 | * the pte page is unpinned but still RO to prevent | ||
938 | * concurrent updates from seeing it in this | ||
939 | * partially-pinned state. | ||
940 | */ | ||
799 | if (level == PT_PTE) { | 941 | if (level == PT_PTE) { |
800 | ptl = lock_pte(page); | 942 | ptl = xen_pte_lock(page, mm); |
801 | 943 | ||
802 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | 944 | if (ptl) |
945 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | ||
803 | } | 946 | } |
804 | 947 | ||
805 | mcs = __xen_mc_entry(0); | 948 | mcs = __xen_mc_entry(0); |
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level) | |||
810 | 953 | ||
811 | if (ptl) { | 954 | if (ptl) { |
812 | /* unlock when batch completed */ | 955 | /* unlock when batch completed */ |
813 | xen_mc_callback(do_unlock, ptl); | 956 | xen_mc_callback(xen_pte_unlock, ptl); |
814 | } | 957 | } |
815 | } | 958 | } |
816 | 959 | ||
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level) | |||
818 | } | 961 | } |
819 | 962 | ||
820 | /* Release a pagetables pages back as normal RW */ | 963 | /* Release a pagetables pages back as normal RW */ |
821 | static void xen_pgd_unpin(pgd_t *pgd) | 964 | static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) |
822 | { | 965 | { |
823 | xen_mc_batch(); | 966 | xen_mc_batch(); |
824 | 967 | ||
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) | |||
830 | 973 | ||
831 | if (user_pgd) { | 974 | if (user_pgd) { |
832 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); | 975 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); |
833 | unpin_page(virt_to_page(user_pgd), PT_PGD); | 976 | xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); |
834 | } | 977 | } |
835 | } | 978 | } |
836 | #endif | 979 | #endif |
837 | 980 | ||
838 | #ifdef CONFIG_X86_PAE | 981 | #ifdef CONFIG_X86_PAE |
839 | /* Need to make sure unshared kernel PMD is unpinned */ | 982 | /* Need to make sure unshared kernel PMD is unpinned */ |
840 | pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); | 983 | xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), |
984 | PT_PMD); | ||
841 | #endif | 985 | #endif |
842 | 986 | ||
843 | pgd_walk(pgd, unpin_page, USER_LIMIT); | 987 | xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); |
844 | 988 | ||
845 | xen_mc_issue(0); | 989 | xen_mc_issue(0); |
846 | } | 990 | } |
847 | 991 | ||
992 | static void xen_pgd_unpin(struct mm_struct *mm) | ||
993 | { | ||
994 | __xen_pgd_unpin(mm, mm->pgd); | ||
995 | } | ||
996 | |||
848 | /* | 997 | /* |
849 | * On resume, undo any pinning done at save, so that the rest of the | 998 | * On resume, undo any pinning done at save, so that the rest of the |
850 | * kernel doesn't see any unexpected pinned pagetables. | 999 | * kernel doesn't see any unexpected pinned pagetables. |
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void) | |||
859 | list_for_each_entry(page, &pgd_list, lru) { | 1008 | list_for_each_entry(page, &pgd_list, lru) { |
860 | if (PageSavePinned(page)) { | 1009 | if (PageSavePinned(page)) { |
861 | BUG_ON(!PagePinned(page)); | 1010 | BUG_ON(!PagePinned(page)); |
862 | xen_pgd_unpin((pgd_t *)page_address(page)); | 1011 | __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); |
863 | ClearPageSavePinned(page); | 1012 | ClearPageSavePinned(page); |
864 | } | 1013 | } |
865 | } | 1014 | } |
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void) | |||
870 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | 1019 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) |
871 | { | 1020 | { |
872 | spin_lock(&next->page_table_lock); | 1021 | spin_lock(&next->page_table_lock); |
873 | xen_pgd_pin(next->pgd); | 1022 | xen_pgd_pin(next); |
874 | spin_unlock(&next->page_table_lock); | 1023 | spin_unlock(&next->page_table_lock); |
875 | } | 1024 | } |
876 | 1025 | ||
877 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | 1026 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) |
878 | { | 1027 | { |
879 | spin_lock(&mm->page_table_lock); | 1028 | spin_lock(&mm->page_table_lock); |
880 | xen_pgd_pin(mm->pgd); | 1029 | xen_pgd_pin(mm); |
881 | spin_unlock(&mm->page_table_lock); | 1030 | spin_unlock(&mm->page_table_lock); |
882 | } | 1031 | } |
883 | 1032 | ||
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info) | |||
907 | } | 1056 | } |
908 | } | 1057 | } |
909 | 1058 | ||
910 | static void drop_mm_ref(struct mm_struct *mm) | 1059 | static void xen_drop_mm_ref(struct mm_struct *mm) |
911 | { | 1060 | { |
912 | cpumask_t mask; | 1061 | cpumask_t mask; |
913 | unsigned cpu; | 1062 | unsigned cpu; |
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm) | |||
937 | smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); | 1086 | smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); |
938 | } | 1087 | } |
939 | #else | 1088 | #else |
940 | static void drop_mm_ref(struct mm_struct *mm) | 1089 | static void xen_drop_mm_ref(struct mm_struct *mm) |
941 | { | 1090 | { |
942 | if (current->active_mm == mm) | 1091 | if (current->active_mm == mm) |
943 | load_cr3(swapper_pg_dir); | 1092 | load_cr3(swapper_pg_dir); |
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm) | |||
961 | void xen_exit_mmap(struct mm_struct *mm) | 1110 | void xen_exit_mmap(struct mm_struct *mm) |
962 | { | 1111 | { |
963 | get_cpu(); /* make sure we don't move around */ | 1112 | get_cpu(); /* make sure we don't move around */ |
964 | drop_mm_ref(mm); | 1113 | xen_drop_mm_ref(mm); |
965 | put_cpu(); | 1114 | put_cpu(); |
966 | 1115 | ||
967 | spin_lock(&mm->page_table_lock); | 1116 | spin_lock(&mm->page_table_lock); |
968 | 1117 | ||
969 | /* pgd may not be pinned in the error exit path of execve */ | 1118 | /* pgd may not be pinned in the error exit path of execve */ |
970 | if (page_pinned(mm->pgd)) | 1119 | if (xen_page_pinned(mm->pgd)) |
971 | xen_pgd_unpin(mm->pgd); | 1120 | xen_pgd_unpin(mm); |
972 | 1121 | ||
973 | spin_unlock(&mm->page_table_lock); | 1122 | spin_unlock(&mm->page_table_lock); |
974 | } | 1123 | } |
1124 | |||
1125 | #ifdef CONFIG_XEN_DEBUG_FS | ||
1126 | |||
1127 | static struct dentry *d_mmu_debug; | ||
1128 | |||
1129 | static int __init xen_mmu_debugfs(void) | ||
1130 | { | ||
1131 | struct dentry *d_xen = xen_init_debugfs(); | ||
1132 | |||
1133 | if (d_xen == NULL) | ||
1134 | return -ENOMEM; | ||
1135 | |||
1136 | d_mmu_debug = debugfs_create_dir("mmu", d_xen); | ||
1137 | |||
1138 | debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); | ||
1139 | |||
1140 | debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); | ||
1141 | debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, | ||
1142 | &mmu_stats.pgd_update_pinned); | ||
1143 | debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, | ||
1144 | &mmu_stats.pgd_update_pinned); | ||
1145 | |||
1146 | debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); | ||
1147 | debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, | ||
1148 | &mmu_stats.pud_update_pinned); | ||
1149 | debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, | ||
1150 | &mmu_stats.pud_update_pinned); | ||
1151 | |||
1152 | debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); | ||
1153 | debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, | ||
1154 | &mmu_stats.pmd_update_pinned); | ||
1155 | debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, | ||
1156 | &mmu_stats.pmd_update_pinned); | ||
1157 | |||
1158 | debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); | ||
1159 | // debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, | ||
1160 | // &mmu_stats.pte_update_pinned); | ||
1161 | debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, | ||
1162 | &mmu_stats.pte_update_pinned); | ||
1163 | |||
1164 | debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); | ||
1165 | debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, | ||
1166 | &mmu_stats.mmu_update_extended); | ||
1167 | xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, | ||
1168 | mmu_stats.mmu_update_histo, 20); | ||
1169 | |||
1170 | debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); | ||
1171 | debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, | ||
1172 | &mmu_stats.set_pte_at_batched); | ||
1173 | debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, | ||
1174 | &mmu_stats.set_pte_at_current); | ||
1175 | debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, | ||
1176 | &mmu_stats.set_pte_at_kernel); | ||
1177 | |||
1178 | debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); | ||
1179 | debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, | ||
1180 | &mmu_stats.prot_commit_batched); | ||
1181 | |||
1182 | return 0; | ||
1183 | } | ||
1184 | fs_initcall(xen_mmu_debugfs); | ||
1185 | |||
1186 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 0f59bd03f9e3..98d71659da5a 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); | |||
18 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); | 18 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); |
19 | void xen_exit_mmap(struct mm_struct *mm); | 19 | void xen_exit_mmap(struct mm_struct *mm); |
20 | 20 | ||
21 | void xen_pgd_pin(pgd_t *pgd); | ||
22 | //void xen_pgd_unpin(pgd_t *pgd); | ||
23 | |||
24 | pteval_t xen_pte_val(pte_t); | 21 | pteval_t xen_pte_val(pte_t); |
25 | pmdval_t xen_pmd_val(pmd_t); | 22 | pmdval_t xen_pmd_val(pmd_t); |
26 | pgdval_t xen_pgd_val(pgd_t); | 23 | pgdval_t xen_pgd_val(pgd_t); |
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 9efd1c6c9776..8ea8a0d0b0de 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c | |||
@@ -21,16 +21,20 @@ | |||
21 | */ | 21 | */ |
22 | #include <linux/percpu.h> | 22 | #include <linux/percpu.h> |
23 | #include <linux/hardirq.h> | 23 | #include <linux/hardirq.h> |
24 | #include <linux/debugfs.h> | ||
24 | 25 | ||
25 | #include <asm/xen/hypercall.h> | 26 | #include <asm/xen/hypercall.h> |
26 | 27 | ||
27 | #include "multicalls.h" | 28 | #include "multicalls.h" |
29 | #include "debugfs.h" | ||
30 | |||
31 | #define MC_BATCH 32 | ||
28 | 32 | ||
29 | #define MC_DEBUG 1 | 33 | #define MC_DEBUG 1 |
30 | 34 | ||
31 | #define MC_BATCH 32 | ||
32 | #define MC_ARGS (MC_BATCH * 16) | 35 | #define MC_ARGS (MC_BATCH * 16) |
33 | 36 | ||
37 | |||
34 | struct mc_buffer { | 38 | struct mc_buffer { |
35 | struct multicall_entry entries[MC_BATCH]; | 39 | struct multicall_entry entries[MC_BATCH]; |
36 | #if MC_DEBUG | 40 | #if MC_DEBUG |
@@ -47,6 +51,76 @@ struct mc_buffer { | |||
47 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); | 51 | static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); |
48 | DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); | 52 | DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); |
49 | 53 | ||
54 | /* flush reasons 0- slots, 1- args, 2- callbacks */ | ||
55 | enum flush_reasons | ||
56 | { | ||
57 | FL_SLOTS, | ||
58 | FL_ARGS, | ||
59 | FL_CALLBACKS, | ||
60 | |||
61 | FL_N_REASONS | ||
62 | }; | ||
63 | |||
64 | #ifdef CONFIG_XEN_DEBUG_FS | ||
65 | #define NHYPERCALLS 40 /* not really */ | ||
66 | |||
67 | static struct { | ||
68 | unsigned histo[MC_BATCH+1]; | ||
69 | |||
70 | unsigned issued; | ||
71 | unsigned arg_total; | ||
72 | unsigned hypercalls; | ||
73 | unsigned histo_hypercalls[NHYPERCALLS]; | ||
74 | |||
75 | unsigned flush[FL_N_REASONS]; | ||
76 | } mc_stats; | ||
77 | |||
78 | static u8 zero_stats; | ||
79 | |||
80 | static inline void check_zero(void) | ||
81 | { | ||
82 | if (unlikely(zero_stats)) { | ||
83 | memset(&mc_stats, 0, sizeof(mc_stats)); | ||
84 | zero_stats = 0; | ||
85 | } | ||
86 | } | ||
87 | |||
88 | static void mc_add_stats(const struct mc_buffer *mc) | ||
89 | { | ||
90 | int i; | ||
91 | |||
92 | check_zero(); | ||
93 | |||
94 | mc_stats.issued++; | ||
95 | mc_stats.hypercalls += mc->mcidx; | ||
96 | mc_stats.arg_total += mc->argidx; | ||
97 | |||
98 | mc_stats.histo[mc->mcidx]++; | ||
99 | for(i = 0; i < mc->mcidx; i++) { | ||
100 | unsigned op = mc->entries[i].op; | ||
101 | if (op < NHYPERCALLS) | ||
102 | mc_stats.histo_hypercalls[op]++; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | static void mc_stats_flush(enum flush_reasons idx) | ||
107 | { | ||
108 | check_zero(); | ||
109 | |||
110 | mc_stats.flush[idx]++; | ||
111 | } | ||
112 | |||
113 | #else /* !CONFIG_XEN_DEBUG_FS */ | ||
114 | |||
115 | static inline void mc_add_stats(const struct mc_buffer *mc) | ||
116 | { | ||
117 | } | ||
118 | |||
119 | static inline void mc_stats_flush(enum flush_reasons idx) | ||
120 | { | ||
121 | } | ||
122 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
123 | |||
50 | void xen_mc_flush(void) | 124 | void xen_mc_flush(void) |
51 | { | 125 | { |
52 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | 126 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); |
@@ -60,6 +134,8 @@ void xen_mc_flush(void) | |||
60 | something in the middle */ | 134 | something in the middle */ |
61 | local_irq_save(flags); | 135 | local_irq_save(flags); |
62 | 136 | ||
137 | mc_add_stats(b); | ||
138 | |||
63 | if (b->mcidx) { | 139 | if (b->mcidx) { |
64 | #if MC_DEBUG | 140 | #if MC_DEBUG |
65 | memcpy(b->debug, b->entries, | 141 | memcpy(b->debug, b->entries, |
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args) | |||
115 | 191 | ||
116 | if (b->mcidx == MC_BATCH || | 192 | if (b->mcidx == MC_BATCH || |
117 | (argidx + args) > MC_ARGS) { | 193 | (argidx + args) > MC_ARGS) { |
194 | mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); | ||
118 | xen_mc_flush(); | 195 | xen_mc_flush(); |
119 | argidx = roundup(b->argidx, sizeof(u64)); | 196 | argidx = roundup(b->argidx, sizeof(u64)); |
120 | } | 197 | } |
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data) | |||
158 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); | 235 | struct mc_buffer *b = &__get_cpu_var(mc_buffer); |
159 | struct callback *cb; | 236 | struct callback *cb; |
160 | 237 | ||
161 | if (b->cbidx == MC_BATCH) | 238 | if (b->cbidx == MC_BATCH) { |
239 | mc_stats_flush(FL_CALLBACKS); | ||
162 | xen_mc_flush(); | 240 | xen_mc_flush(); |
241 | } | ||
163 | 242 | ||
164 | cb = &b->callbacks[b->cbidx++]; | 243 | cb = &b->callbacks[b->cbidx++]; |
165 | cb->fn = fn; | 244 | cb->fn = fn; |
166 | cb->data = data; | 245 | cb->data = data; |
167 | } | 246 | } |
247 | |||
248 | #ifdef CONFIG_XEN_DEBUG_FS | ||
249 | |||
250 | static struct dentry *d_mc_debug; | ||
251 | |||
252 | static int __init xen_mc_debugfs(void) | ||
253 | { | ||
254 | struct dentry *d_xen = xen_init_debugfs(); | ||
255 | |||
256 | if (d_xen == NULL) | ||
257 | return -ENOMEM; | ||
258 | |||
259 | d_mc_debug = debugfs_create_dir("multicalls", d_xen); | ||
260 | |||
261 | debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); | ||
262 | |||
263 | debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); | ||
264 | debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); | ||
265 | debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); | ||
266 | |||
267 | xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, | ||
268 | mc_stats.histo, MC_BATCH); | ||
269 | xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, | ||
270 | mc_stats.histo_hypercalls, NHYPERCALLS); | ||
271 | xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, | ||
272 | mc_stats.flush, FL_N_REASONS); | ||
273 | |||
274 | return 0; | ||
275 | } | ||
276 | fs_initcall(xen_mc_debugfs); | ||
277 | |||
278 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b6acc3a0af46..d67901083888 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c | |||
@@ -42,7 +42,7 @@ char * __init xen_memory_setup(void) | |||
42 | 42 | ||
43 | e820.nr_map = 0; | 43 | e820.nr_map = 0; |
44 | 44 | ||
45 | e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); | 45 | e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); |
46 | 46 | ||
47 | /* | 47 | /* |
48 | * Even though this is normal, usable memory under Xen, reserve | 48 | * Even though this is normal, usable memory under Xen, reserve |
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d8faf79a0a1d..d77da613b1d2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c | |||
@@ -11,11 +11,8 @@ | |||
11 | * useful topology information for the kernel to make use of. As a | 11 | * useful topology information for the kernel to make use of. As a |
12 | * result, all CPUs are treated as if they're single-core and | 12 | * result, all CPUs are treated as if they're single-core and |
13 | * single-threaded. | 13 | * single-threaded. |
14 | * | ||
15 | * This does not handle HOTPLUG_CPU yet. | ||
16 | */ | 14 | */ |
17 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
18 | #include <linux/kernel_stat.h> | ||
19 | #include <linux/err.h> | 16 | #include <linux/err.h> |
20 | #include <linux/smp.h> | 17 | #include <linux/smp.h> |
21 | 18 | ||
@@ -36,8 +33,6 @@ | |||
36 | #include "xen-ops.h" | 33 | #include "xen-ops.h" |
37 | #include "mmu.h" | 34 | #include "mmu.h" |
38 | 35 | ||
39 | static void __cpuinit xen_init_lock_cpu(int cpu); | ||
40 | |||
41 | cpumask_t xen_cpu_initialized_map; | 36 | cpumask_t xen_cpu_initialized_map; |
42 | 37 | ||
43 | static DEFINE_PER_CPU(int, resched_irq); | 38 | static DEFINE_PER_CPU(int, resched_irq); |
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) | |||
64 | return IRQ_HANDLED; | 59 | return IRQ_HANDLED; |
65 | } | 60 | } |
66 | 61 | ||
67 | static __cpuinit void cpu_bringup_and_idle(void) | 62 | static __cpuinit void cpu_bringup(void) |
68 | { | 63 | { |
69 | int cpu = smp_processor_id(); | 64 | int cpu = smp_processor_id(); |
70 | 65 | ||
71 | cpu_init(); | 66 | cpu_init(); |
67 | touch_softlockup_watchdog(); | ||
72 | preempt_disable(); | 68 | preempt_disable(); |
73 | 69 | ||
74 | xen_enable_sysenter(); | 70 | xen_enable_sysenter(); |
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void) | |||
89 | local_irq_enable(); | 85 | local_irq_enable(); |
90 | 86 | ||
91 | wmb(); /* make sure everything is out */ | 87 | wmb(); /* make sure everything is out */ |
88 | } | ||
89 | |||
90 | static __cpuinit void cpu_bringup_and_idle(void) | ||
91 | { | ||
92 | cpu_bringup(); | ||
92 | cpu_idle(); | 93 | cpu_idle(); |
93 | } | 94 | } |
94 | 95 | ||
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) | |||
212 | 213 | ||
213 | cpu_set(cpu, cpu_present_map); | 214 | cpu_set(cpu, cpu_present_map); |
214 | } | 215 | } |
215 | |||
216 | //init_xenbus_allowed_cpumask(); | ||
217 | } | 216 | } |
218 | 217 | ||
219 | static __cpuinit int | 218 | static __cpuinit int |
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) | |||
281 | struct task_struct *idle = idle_task(cpu); | 280 | struct task_struct *idle = idle_task(cpu); |
282 | int rc; | 281 | int rc; |
283 | 282 | ||
284 | #if 0 | ||
285 | rc = cpu_up_check(cpu); | ||
286 | if (rc) | ||
287 | return rc; | ||
288 | #endif | ||
289 | |||
290 | #ifdef CONFIG_X86_64 | 283 | #ifdef CONFIG_X86_64 |
291 | /* Allocate node local memory for AP pdas */ | 284 | /* Allocate node local memory for AP pdas */ |
292 | WARN_ON(cpu == 0); | 285 | WARN_ON(cpu == 0); |
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus) | |||
339 | { | 332 | { |
340 | } | 333 | } |
341 | 334 | ||
335 | #ifdef CONFIG_HOTPLUG_CPU | ||
336 | static int xen_cpu_disable(void) | ||
337 | { | ||
338 | unsigned int cpu = smp_processor_id(); | ||
339 | if (cpu == 0) | ||
340 | return -EBUSY; | ||
341 | |||
342 | cpu_disable_common(); | ||
343 | |||
344 | load_cr3(swapper_pg_dir); | ||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | static void xen_cpu_die(unsigned int cpu) | ||
349 | { | ||
350 | while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { | ||
351 | current->state = TASK_UNINTERRUPTIBLE; | ||
352 | schedule_timeout(HZ/10); | ||
353 | } | ||
354 | unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); | ||
355 | unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); | ||
356 | unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); | ||
357 | unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); | ||
358 | xen_uninit_lock_cpu(cpu); | ||
359 | xen_teardown_timer(cpu); | ||
360 | |||
361 | if (num_online_cpus() == 1) | ||
362 | alternatives_smp_switch(0); | ||
363 | } | ||
364 | |||
365 | static void xen_play_dead(void) | ||
366 | { | ||
367 | play_dead_common(); | ||
368 | HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); | ||
369 | cpu_bringup(); | ||
370 | } | ||
371 | |||
372 | #else /* !CONFIG_HOTPLUG_CPU */ | ||
373 | static int xen_cpu_disable(void) | ||
374 | { | ||
375 | return -ENOSYS; | ||
376 | } | ||
377 | |||
378 | static void xen_cpu_die(unsigned int cpu) | ||
379 | { | ||
380 | BUG(); | ||
381 | } | ||
382 | |||
383 | static void xen_play_dead(void) | ||
384 | { | ||
385 | BUG(); | ||
386 | } | ||
387 | |||
388 | #endif | ||
342 | static void stop_self(void *v) | 389 | static void stop_self(void *v) |
343 | { | 390 | { |
344 | int cpu = smp_processor_id(); | 391 | int cpu = smp_processor_id(); |
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) | |||
419 | return IRQ_HANDLED; | 466 | return IRQ_HANDLED; |
420 | } | 467 | } |
421 | 468 | ||
422 | struct xen_spinlock { | ||
423 | unsigned char lock; /* 0 -> free; 1 -> locked */ | ||
424 | unsigned short spinners; /* count of waiting cpus */ | ||
425 | }; | ||
426 | |||
427 | static int xen_spin_is_locked(struct raw_spinlock *lock) | ||
428 | { | ||
429 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
430 | |||
431 | return xl->lock != 0; | ||
432 | } | ||
433 | |||
434 | static int xen_spin_is_contended(struct raw_spinlock *lock) | ||
435 | { | ||
436 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
437 | |||
438 | /* Not strictly true; this is only the count of contended | ||
439 | lock-takers entering the slow path. */ | ||
440 | return xl->spinners != 0; | ||
441 | } | ||
442 | |||
443 | static int xen_spin_trylock(struct raw_spinlock *lock) | ||
444 | { | ||
445 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
446 | u8 old = 1; | ||
447 | |||
448 | asm("xchgb %b0,%1" | ||
449 | : "+q" (old), "+m" (xl->lock) : : "memory"); | ||
450 | |||
451 | return old == 0; | ||
452 | } | ||
453 | |||
454 | static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; | ||
455 | static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); | ||
456 | |||
457 | static inline void spinning_lock(struct xen_spinlock *xl) | ||
458 | { | ||
459 | __get_cpu_var(lock_spinners) = xl; | ||
460 | wmb(); /* set lock of interest before count */ | ||
461 | asm(LOCK_PREFIX " incw %0" | ||
462 | : "+m" (xl->spinners) : : "memory"); | ||
463 | } | ||
464 | |||
465 | static inline void unspinning_lock(struct xen_spinlock *xl) | ||
466 | { | ||
467 | asm(LOCK_PREFIX " decw %0" | ||
468 | : "+m" (xl->spinners) : : "memory"); | ||
469 | wmb(); /* decrement count before clearing lock */ | ||
470 | __get_cpu_var(lock_spinners) = NULL; | ||
471 | } | ||
472 | |||
473 | static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) | ||
474 | { | ||
475 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
476 | int irq = __get_cpu_var(lock_kicker_irq); | ||
477 | int ret; | ||
478 | |||
479 | /* If kicker interrupts not initialized yet, just spin */ | ||
480 | if (irq == -1) | ||
481 | return 0; | ||
482 | |||
483 | /* announce we're spinning */ | ||
484 | spinning_lock(xl); | ||
485 | |||
486 | /* clear pending */ | ||
487 | xen_clear_irq_pending(irq); | ||
488 | |||
489 | /* check again make sure it didn't become free while | ||
490 | we weren't looking */ | ||
491 | ret = xen_spin_trylock(lock); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | |||
495 | /* block until irq becomes pending */ | ||
496 | xen_poll_irq(irq); | ||
497 | kstat_this_cpu.irqs[irq]++; | ||
498 | |||
499 | out: | ||
500 | unspinning_lock(xl); | ||
501 | return ret; | ||
502 | } | ||
503 | |||
504 | static void xen_spin_lock(struct raw_spinlock *lock) | ||
505 | { | ||
506 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
507 | int timeout; | ||
508 | u8 oldval; | ||
509 | |||
510 | do { | ||
511 | timeout = 1 << 10; | ||
512 | |||
513 | asm("1: xchgb %1,%0\n" | ||
514 | " testb %1,%1\n" | ||
515 | " jz 3f\n" | ||
516 | "2: rep;nop\n" | ||
517 | " cmpb $0,%0\n" | ||
518 | " je 1b\n" | ||
519 | " dec %2\n" | ||
520 | " jnz 2b\n" | ||
521 | "3:\n" | ||
522 | : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) | ||
523 | : "1" (1) | ||
524 | : "memory"); | ||
525 | |||
526 | } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); | ||
527 | } | ||
528 | |||
529 | static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) | ||
530 | { | ||
531 | int cpu; | ||
532 | |||
533 | for_each_online_cpu(cpu) { | ||
534 | /* XXX should mix up next cpu selection */ | ||
535 | if (per_cpu(lock_spinners, cpu) == xl) { | ||
536 | xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); | ||
537 | break; | ||
538 | } | ||
539 | } | ||
540 | } | ||
541 | |||
542 | static void xen_spin_unlock(struct raw_spinlock *lock) | ||
543 | { | ||
544 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
545 | |||
546 | smp_wmb(); /* make sure no writes get moved after unlock */ | ||
547 | xl->lock = 0; /* release lock */ | ||
548 | |||
549 | /* make sure unlock happens before kick */ | ||
550 | barrier(); | ||
551 | |||
552 | if (unlikely(xl->spinners)) | ||
553 | xen_spin_unlock_slow(xl); | ||
554 | } | ||
555 | |||
556 | static __cpuinit void xen_init_lock_cpu(int cpu) | ||
557 | { | ||
558 | int irq; | ||
559 | const char *name; | ||
560 | |||
561 | name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); | ||
562 | irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, | ||
563 | cpu, | ||
564 | xen_reschedule_interrupt, | ||
565 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
566 | name, | ||
567 | NULL); | ||
568 | |||
569 | if (irq >= 0) { | ||
570 | disable_irq(irq); /* make sure it's never delivered */ | ||
571 | per_cpu(lock_kicker_irq, cpu) = irq; | ||
572 | } | ||
573 | |||
574 | printk("cpu %d spinlock event irq %d\n", cpu, irq); | ||
575 | } | ||
576 | |||
577 | static void __init xen_init_spinlocks(void) | ||
578 | { | ||
579 | pv_lock_ops.spin_is_locked = xen_spin_is_locked; | ||
580 | pv_lock_ops.spin_is_contended = xen_spin_is_contended; | ||
581 | pv_lock_ops.spin_lock = xen_spin_lock; | ||
582 | pv_lock_ops.spin_trylock = xen_spin_trylock; | ||
583 | pv_lock_ops.spin_unlock = xen_spin_unlock; | ||
584 | } | ||
585 | |||
586 | static const struct smp_ops xen_smp_ops __initdata = { | 469 | static const struct smp_ops xen_smp_ops __initdata = { |
587 | .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, | 470 | .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, |
588 | .smp_prepare_cpus = xen_smp_prepare_cpus, | 471 | .smp_prepare_cpus = xen_smp_prepare_cpus, |
589 | .cpu_up = xen_cpu_up, | ||
590 | .smp_cpus_done = xen_smp_cpus_done, | 472 | .smp_cpus_done = xen_smp_cpus_done, |
591 | 473 | ||
474 | .cpu_up = xen_cpu_up, | ||
475 | .cpu_die = xen_cpu_die, | ||
476 | .cpu_disable = xen_cpu_disable, | ||
477 | .play_dead = xen_play_dead, | ||
478 | |||
592 | .smp_send_stop = xen_smp_send_stop, | 479 | .smp_send_stop = xen_smp_send_stop, |
593 | .smp_send_reschedule = xen_smp_send_reschedule, | 480 | .smp_send_reschedule = xen_smp_send_reschedule, |
594 | 481 | ||
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 000000000000..dd71e3a021cd --- /dev/null +++ b/arch/x86/xen/spinlock.c | |||
@@ -0,0 +1,428 @@ | |||
1 | /* | ||
2 | * Split spinlock implementation out into its own file, so it can be | ||
3 | * compiled in a FTRACE-compatible way. | ||
4 | */ | ||
#include <linux/kernel_stat.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <linux/slab.h>
9 | |||
10 | #include <asm/paravirt.h> | ||
11 | |||
12 | #include <xen/interface/xen.h> | ||
13 | #include <xen/events.h> | ||
14 | |||
15 | #include "xen-ops.h" | ||
16 | #include "debugfs.h" | ||
17 | |||
18 | #ifdef CONFIG_XEN_DEBUG_FS | ||
19 | static struct xen_spinlock_stats | ||
20 | { | ||
21 | u64 taken; | ||
22 | u32 taken_slow; | ||
23 | u32 taken_slow_nested; | ||
24 | u32 taken_slow_pickup; | ||
25 | u32 taken_slow_spurious; | ||
26 | u32 taken_slow_irqenable; | ||
27 | |||
28 | u64 released; | ||
29 | u32 released_slow; | ||
30 | u32 released_slow_kicked; | ||
31 | |||
32 | #define HISTO_BUCKETS 30 | ||
33 | u32 histo_spin_total[HISTO_BUCKETS+1]; | ||
34 | u32 histo_spin_spinning[HISTO_BUCKETS+1]; | ||
35 | u32 histo_spin_blocked[HISTO_BUCKETS+1]; | ||
36 | |||
37 | u64 time_total; | ||
38 | u64 time_spinning; | ||
39 | u64 time_blocked; | ||
40 | } spinlock_stats; | ||
41 | |||
42 | static u8 zero_stats; | ||
43 | |||
44 | static unsigned lock_timeout = 1 << 10; | ||
45 | #define TIMEOUT lock_timeout | ||
46 | |||
47 | static inline void check_zero(void) | ||
48 | { | ||
49 | if (unlikely(zero_stats)) { | ||
50 | memset(&spinlock_stats, 0, sizeof(spinlock_stats)); | ||
51 | zero_stats = 0; | ||
52 | } | ||
53 | } | ||
54 | |||
55 | #define ADD_STATS(elem, val) \ | ||
56 | do { check_zero(); spinlock_stats.elem += (val); } while(0) | ||
57 | |||
58 | static inline u64 spin_time_start(void) | ||
59 | { | ||
60 | return xen_clocksource_read(); | ||
61 | } | ||
62 | |||
63 | static void __spin_time_accum(u64 delta, u32 *array) | ||
64 | { | ||
65 | unsigned index = ilog2(delta); | ||
66 | |||
67 | check_zero(); | ||
68 | |||
69 | if (index < HISTO_BUCKETS) | ||
70 | array[index]++; | ||
71 | else | ||
72 | array[HISTO_BUCKETS]++; | ||
73 | } | ||
74 | |||
75 | static inline void spin_time_accum_spinning(u64 start) | ||
76 | { | ||
77 | u32 delta = xen_clocksource_read() - start; | ||
78 | |||
79 | __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); | ||
80 | spinlock_stats.time_spinning += delta; | ||
81 | } | ||
82 | |||
83 | static inline void spin_time_accum_total(u64 start) | ||
84 | { | ||
85 | u32 delta = xen_clocksource_read() - start; | ||
86 | |||
87 | __spin_time_accum(delta, spinlock_stats.histo_spin_total); | ||
88 | spinlock_stats.time_total += delta; | ||
89 | } | ||
90 | |||
91 | static inline void spin_time_accum_blocked(u64 start) | ||
92 | { | ||
93 | u32 delta = xen_clocksource_read() - start; | ||
94 | |||
95 | __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); | ||
96 | spinlock_stats.time_blocked += delta; | ||
97 | } | ||
98 | #else /* !CONFIG_XEN_DEBUG_FS */ | ||
99 | #define TIMEOUT (1 << 10) | ||
100 | #define ADD_STATS(elem, val) do { (void)(val); } while(0) | ||
101 | |||
102 | static inline u64 spin_time_start(void) | ||
103 | { | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | static inline void spin_time_accum_total(u64 start) | ||
108 | { | ||
109 | } | ||
110 | static inline void spin_time_accum_spinning(u64 start) | ||
111 | { | ||
112 | } | ||
113 | static inline void spin_time_accum_blocked(u64 start) | ||
114 | { | ||
115 | } | ||
116 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
117 | |||
/*
 * Xen view of a spinlock; the lock functions in this file cast a
 * struct raw_spinlock pointer to this type.
 *
 * @lock:     0 when free, 1 when held
 * @spinners: number of cpus currently waiting in the slow path
 */
struct xen_spinlock {
	unsigned char lock;
	unsigned short spinners;
};
122 | |||
123 | static int xen_spin_is_locked(struct raw_spinlock *lock) | ||
124 | { | ||
125 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
126 | |||
127 | return xl->lock != 0; | ||
128 | } | ||
129 | |||
130 | static int xen_spin_is_contended(struct raw_spinlock *lock) | ||
131 | { | ||
132 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
133 | |||
134 | /* Not strictly true; this is only the count of contended | ||
135 | lock-takers entering the slow path. */ | ||
136 | return xl->spinners != 0; | ||
137 | } | ||
138 | |||
139 | static int xen_spin_trylock(struct raw_spinlock *lock) | ||
140 | { | ||
141 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
142 | u8 old = 1; | ||
143 | |||
144 | asm("xchgb %b0,%1" | ||
145 | : "+q" (old), "+m" (xl->lock) : : "memory"); | ||
146 | |||
147 | return old == 0; | ||
148 | } | ||
149 | |||
150 | static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; | ||
151 | static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); | ||
152 | |||
153 | /* | ||
154 | * Mark a cpu as interested in a lock. Returns the CPU's previous | ||
155 | * lock of interest, in case we got preempted by an interrupt. | ||
156 | */ | ||
157 | static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) | ||
158 | { | ||
159 | struct xen_spinlock *prev; | ||
160 | |||
161 | prev = __get_cpu_var(lock_spinners); | ||
162 | __get_cpu_var(lock_spinners) = xl; | ||
163 | |||
164 | wmb(); /* set lock of interest before count */ | ||
165 | |||
166 | asm(LOCK_PREFIX " incw %0" | ||
167 | : "+m" (xl->spinners) : : "memory"); | ||
168 | |||
169 | return prev; | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Mark a cpu as no longer interested in a lock. Restores previous | ||
174 | * lock of interest (NULL for none). | ||
175 | */ | ||
176 | static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) | ||
177 | { | ||
178 | asm(LOCK_PREFIX " decw %0" | ||
179 | : "+m" (xl->spinners) : : "memory"); | ||
180 | wmb(); /* decrement count before restoring lock */ | ||
181 | __get_cpu_var(lock_spinners) = prev; | ||
182 | } | ||
183 | |||
184 | static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) | ||
185 | { | ||
186 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
187 | struct xen_spinlock *prev; | ||
188 | int irq = __get_cpu_var(lock_kicker_irq); | ||
189 | int ret; | ||
190 | unsigned long flags; | ||
191 | u64 start; | ||
192 | |||
193 | /* If kicker interrupts not initialized yet, just spin */ | ||
194 | if (irq == -1) | ||
195 | return 0; | ||
196 | |||
197 | start = spin_time_start(); | ||
198 | |||
199 | /* announce we're spinning */ | ||
200 | prev = spinning_lock(xl); | ||
201 | |||
202 | flags = __raw_local_save_flags(); | ||
203 | if (irq_enable) { | ||
204 | ADD_STATS(taken_slow_irqenable, 1); | ||
205 | raw_local_irq_enable(); | ||
206 | } | ||
207 | |||
208 | ADD_STATS(taken_slow, 1); | ||
209 | ADD_STATS(taken_slow_nested, prev != NULL); | ||
210 | |||
211 | do { | ||
212 | /* clear pending */ | ||
213 | xen_clear_irq_pending(irq); | ||
214 | |||
215 | /* check again make sure it didn't become free while | ||
216 | we weren't looking */ | ||
217 | ret = xen_spin_trylock(lock); | ||
218 | if (ret) { | ||
219 | ADD_STATS(taken_slow_pickup, 1); | ||
220 | |||
221 | /* | ||
222 | * If we interrupted another spinlock while it | ||
223 | * was blocking, make sure it doesn't block | ||
224 | * without rechecking the lock. | ||
225 | */ | ||
226 | if (prev != NULL) | ||
227 | xen_set_irq_pending(irq); | ||
228 | goto out; | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * Block until irq becomes pending. If we're | ||
233 | * interrupted at this point (after the trylock but | ||
234 | * before entering the block), then the nested lock | ||
235 | * handler guarantees that the irq will be left | ||
236 | * pending if there's any chance the lock became free; | ||
237 | * xen_poll_irq() returns immediately if the irq is | ||
238 | * pending. | ||
239 | */ | ||
240 | xen_poll_irq(irq); | ||
241 | ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); | ||
242 | } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ | ||
243 | |||
244 | kstat_this_cpu.irqs[irq]++; | ||
245 | |||
246 | out: | ||
247 | raw_local_irq_restore(flags); | ||
248 | unspinning_lock(xl, prev); | ||
249 | spin_time_accum_blocked(start); | ||
250 | |||
251 | return ret; | ||
252 | } | ||
253 | |||
254 | static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) | ||
255 | { | ||
256 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
257 | unsigned timeout; | ||
258 | u8 oldval; | ||
259 | u64 start_spin; | ||
260 | |||
261 | ADD_STATS(taken, 1); | ||
262 | |||
263 | start_spin = spin_time_start(); | ||
264 | |||
265 | do { | ||
266 | u64 start_spin_fast = spin_time_start(); | ||
267 | |||
268 | timeout = TIMEOUT; | ||
269 | |||
270 | asm("1: xchgb %1,%0\n" | ||
271 | " testb %1,%1\n" | ||
272 | " jz 3f\n" | ||
273 | "2: rep;nop\n" | ||
274 | " cmpb $0,%0\n" | ||
275 | " je 1b\n" | ||
276 | " dec %2\n" | ||
277 | " jnz 2b\n" | ||
278 | "3:\n" | ||
279 | : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) | ||
280 | : "1" (1) | ||
281 | : "memory"); | ||
282 | |||
283 | spin_time_accum_spinning(start_spin_fast); | ||
284 | |||
285 | } while (unlikely(oldval != 0 && | ||
286 | (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); | ||
287 | |||
288 | spin_time_accum_total(start_spin); | ||
289 | } | ||
290 | |||
291 | static void xen_spin_lock(struct raw_spinlock *lock) | ||
292 | { | ||
293 | __xen_spin_lock(lock, false); | ||
294 | } | ||
295 | |||
296 | static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) | ||
297 | { | ||
298 | __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); | ||
299 | } | ||
300 | |||
301 | static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) | ||
302 | { | ||
303 | int cpu; | ||
304 | |||
305 | ADD_STATS(released_slow, 1); | ||
306 | |||
307 | for_each_online_cpu(cpu) { | ||
308 | /* XXX should mix up next cpu selection */ | ||
309 | if (per_cpu(lock_spinners, cpu) == xl) { | ||
310 | ADD_STATS(released_slow_kicked, 1); | ||
311 | xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); | ||
312 | break; | ||
313 | } | ||
314 | } | ||
315 | } | ||
316 | |||
317 | static void xen_spin_unlock(struct raw_spinlock *lock) | ||
318 | { | ||
319 | struct xen_spinlock *xl = (struct xen_spinlock *)lock; | ||
320 | |||
321 | ADD_STATS(released, 1); | ||
322 | |||
323 | smp_wmb(); /* make sure no writes get moved after unlock */ | ||
324 | xl->lock = 0; /* release lock */ | ||
325 | |||
326 | /* make sure unlock happens before kick */ | ||
327 | barrier(); | ||
328 | |||
329 | if (unlikely(xl->spinners)) | ||
330 | xen_spin_unlock_slow(xl); | ||
331 | } | ||
332 | |||
333 | static irqreturn_t dummy_handler(int irq, void *dev_id) | ||
334 | { | ||
335 | BUG(); | ||
336 | return IRQ_HANDLED; | ||
337 | } | ||
338 | |||
339 | void __cpuinit xen_init_lock_cpu(int cpu) | ||
340 | { | ||
341 | int irq; | ||
342 | const char *name; | ||
343 | |||
344 | name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); | ||
345 | irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, | ||
346 | cpu, | ||
347 | dummy_handler, | ||
348 | IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, | ||
349 | name, | ||
350 | NULL); | ||
351 | |||
352 | if (irq >= 0) { | ||
353 | disable_irq(irq); /* make sure it's never delivered */ | ||
354 | per_cpu(lock_kicker_irq, cpu) = irq; | ||
355 | } | ||
356 | |||
357 | printk("cpu %d spinlock event irq %d\n", cpu, irq); | ||
358 | } | ||
359 | |||
360 | void xen_uninit_lock_cpu(int cpu) | ||
361 | { | ||
362 | unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); | ||
363 | } | ||
364 | |||
365 | void __init xen_init_spinlocks(void) | ||
366 | { | ||
367 | pv_lock_ops.spin_is_locked = xen_spin_is_locked; | ||
368 | pv_lock_ops.spin_is_contended = xen_spin_is_contended; | ||
369 | pv_lock_ops.spin_lock = xen_spin_lock; | ||
370 | pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; | ||
371 | pv_lock_ops.spin_trylock = xen_spin_trylock; | ||
372 | pv_lock_ops.spin_unlock = xen_spin_unlock; | ||
373 | } | ||
374 | |||
375 | #ifdef CONFIG_XEN_DEBUG_FS | ||
376 | |||
377 | static struct dentry *d_spin_debug; | ||
378 | |||
379 | static int __init xen_spinlock_debugfs(void) | ||
380 | { | ||
381 | struct dentry *d_xen = xen_init_debugfs(); | ||
382 | |||
383 | if (d_xen == NULL) | ||
384 | return -ENOMEM; | ||
385 | |||
386 | d_spin_debug = debugfs_create_dir("spinlocks", d_xen); | ||
387 | |||
388 | debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); | ||
389 | |||
390 | debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); | ||
391 | |||
392 | debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); | ||
393 | debugfs_create_u32("taken_slow", 0444, d_spin_debug, | ||
394 | &spinlock_stats.taken_slow); | ||
395 | debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, | ||
396 | &spinlock_stats.taken_slow_nested); | ||
397 | debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, | ||
398 | &spinlock_stats.taken_slow_pickup); | ||
399 | debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, | ||
400 | &spinlock_stats.taken_slow_spurious); | ||
401 | debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, | ||
402 | &spinlock_stats.taken_slow_irqenable); | ||
403 | |||
404 | debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); | ||
405 | debugfs_create_u32("released_slow", 0444, d_spin_debug, | ||
406 | &spinlock_stats.released_slow); | ||
407 | debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, | ||
408 | &spinlock_stats.released_slow_kicked); | ||
409 | |||
410 | debugfs_create_u64("time_spinning", 0444, d_spin_debug, | ||
411 | &spinlock_stats.time_spinning); | ||
412 | debugfs_create_u64("time_blocked", 0444, d_spin_debug, | ||
413 | &spinlock_stats.time_blocked); | ||
414 | debugfs_create_u64("time_total", 0444, d_spin_debug, | ||
415 | &spinlock_stats.time_total); | ||
416 | |||
417 | xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, | ||
418 | spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); | ||
419 | xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, | ||
420 | spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); | ||
421 | xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, | ||
422 | spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); | ||
423 | |||
424 | return 0; | ||
425 | } | ||
426 | fs_initcall(xen_spinlock_debugfs); | ||
427 | |||
428 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 685b77470fc3..c9f7cda48ed7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -30,8 +30,6 @@ | |||
30 | #define TIMER_SLOP 100000 | 30 | #define TIMER_SLOP 100000 |
31 | #define NS_PER_TICK (1000000000LL / HZ) | 31 | #define NS_PER_TICK (1000000000LL / HZ) |
32 | 32 | ||
33 | static cycle_t xen_clocksource_read(void); | ||
34 | |||
35 | /* runstate info updated by Xen */ | 33 | /* runstate info updated by Xen */ |
36 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); | 34 | static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); |
37 | 35 | ||
@@ -200,20 +198,13 @@ unsigned long long xen_sched_clock(void) | |||
200 | /* Get the TSC speed from Xen */ | 198 | /* Get the TSC speed from Xen */ |
201 | unsigned long xen_tsc_khz(void) | 199 | unsigned long xen_tsc_khz(void) |
202 | { | 200 | { |
203 | u64 xen_khz = 1000000ULL << 32; | 201 | struct pvclock_vcpu_time_info *info = |
204 | const struct pvclock_vcpu_time_info *info = | ||
205 | &HYPERVISOR_shared_info->vcpu_info[0].time; | 202 | &HYPERVISOR_shared_info->vcpu_info[0].time; |
206 | 203 | ||
207 | do_div(xen_khz, info->tsc_to_system_mul); | 204 | return pvclock_tsc_khz(info); |
208 | if (info->tsc_shift < 0) | ||
209 | xen_khz <<= -info->tsc_shift; | ||
210 | else | ||
211 | xen_khz >>= info->tsc_shift; | ||
212 | |||
213 | return xen_khz; | ||
214 | } | 205 | } |
215 | 206 | ||
216 | static cycle_t xen_clocksource_read(void) | 207 | cycle_t xen_clocksource_read(void) |
217 | { | 208 | { |
218 | struct pvclock_vcpu_time_info *src; | 209 | struct pvclock_vcpu_time_info *src; |
219 | cycle_t ret; | 210 | cycle_t ret; |
@@ -452,6 +443,14 @@ void xen_setup_timer(int cpu) | |||
452 | setup_runstate_info(cpu); | 443 | setup_runstate_info(cpu); |
453 | } | 444 | } |
454 | 445 | ||
446 | void xen_teardown_timer(int cpu) | ||
447 | { | ||
448 | struct clock_event_device *evt; | ||
449 | BUG_ON(cpu == 0); | ||
450 | evt = &per_cpu(xen_clock_events, cpu); | ||
451 | unbind_from_irqhandler(evt->irq, NULL); | ||
452 | } | ||
453 | |||
455 | void xen_setup_cpu_clockevents(void) | 454 | void xen_setup_cpu_clockevents(void) |
456 | { | 455 | { |
457 | BUG_ON(preemptible()); | 456 | BUG_ON(preemptible()); |
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2497a30f41de..42786f59d9c0 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S | |||
@@ -298,7 +298,7 @@ check_events: | |||
298 | push %eax | 298 | push %eax |
299 | push %ecx | 299 | push %ecx |
300 | push %edx | 300 | push %edx |
301 | call force_evtchn_callback | 301 | call xen_force_evtchn_callback |
302 | pop %edx | 302 | pop %edx |
303 | pop %ecx | 303 | pop %ecx |
304 | pop %eax | 304 | pop %eax |
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 7f58304fafb3..05794c566e87 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S | |||
@@ -26,8 +26,15 @@ | |||
26 | /* Pseudo-flag used for virtual NMI, which we don't implement yet */ | 26 | /* Pseudo-flag used for virtual NMI, which we don't implement yet */ |
27 | #define XEN_EFLAGS_NMI 0x80000000 | 27 | #define XEN_EFLAGS_NMI 0x80000000 |
28 | 28 | ||
29 | #if 0 | 29 | #if 1 |
30 | #include <asm/percpu.h> | 30 | /* |
31 | x86-64 does not yet support direct access to percpu variables | ||
32 | via a segment override, so we just need to make sure this code | ||
33 | never gets used | ||
34 | */ | ||
35 | #define BUG ud2a | ||
36 | #define PER_CPU_VAR(var, off) 0xdeadbeef | ||
37 | #endif | ||
31 | 38 | ||
32 | /* | 39 | /* |
33 | Enable events. This clears the event mask and tests the pending | 40 | Enable events. This clears the event mask and tests the pending |
@@ -35,6 +42,8 @@ | |||
35 | events, then enter the hypervisor to get them handled. | 42 | events, then enter the hypervisor to get them handled. |
36 | */ | 43 | */ |
37 | ENTRY(xen_irq_enable_direct) | 44 | ENTRY(xen_irq_enable_direct) |
45 | BUG | ||
46 | |||
38 | /* Unmask events */ | 47 | /* Unmask events */ |
39 | movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) | 48 | movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) |
40 | 49 | ||
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct) | |||
58 | non-zero. | 67 | non-zero. |
59 | */ | 68 | */ |
60 | ENTRY(xen_irq_disable_direct) | 69 | ENTRY(xen_irq_disable_direct) |
70 | BUG | ||
71 | |||
61 | movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) | 72 | movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) |
62 | ENDPATCH(xen_irq_disable_direct) | 73 | ENDPATCH(xen_irq_disable_direct) |
63 | ret | 74 | ret |
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct) | |||
74 | Xen and x86 use opposite senses (mask vs enable). | 85 | Xen and x86 use opposite senses (mask vs enable). |
75 | */ | 86 | */ |
76 | ENTRY(xen_save_fl_direct) | 87 | ENTRY(xen_save_fl_direct) |
88 | BUG | ||
89 | |||
77 | testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) | 90 | testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) |
78 | setz %ah | 91 | setz %ah |
79 | addb %ah,%ah | 92 | addb %ah,%ah |
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct) | |||
91 | if so. | 104 | if so. |
92 | */ | 105 | */ |
93 | ENTRY(xen_restore_fl_direct) | 106 | ENTRY(xen_restore_fl_direct) |
107 | BUG | ||
108 | |||
94 | testb $X86_EFLAGS_IF>>8, %ah | 109 | testb $X86_EFLAGS_IF>>8, %ah |
95 | setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) | 110 | setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) |
96 | /* Preempt here doesn't matter because that will deal with | 111 | /* Preempt here doesn't matter because that will deal with |
@@ -122,7 +137,7 @@ check_events: | |||
122 | push %r9 | 137 | push %r9 |
123 | push %r10 | 138 | push %r10 |
124 | push %r11 | 139 | push %r11 |
125 | call force_evtchn_callback | 140 | call xen_force_evtchn_callback |
126 | pop %r11 | 141 | pop %r11 |
127 | pop %r10 | 142 | pop %r10 |
128 | pop %r9 | 143 | pop %r9 |
@@ -133,7 +148,6 @@ check_events: | |||
133 | pop %rcx | 148 | pop %rcx |
134 | pop %rax | 149 | pop %rax |
135 | ret | 150 | ret |
136 | #endif | ||
137 | 151 | ||
138 | ENTRY(xen_adjust_exception_frame) | 152 | ENTRY(xen_adjust_exception_frame) |
139 | mov 8+0(%rsp),%rcx | 153 | mov 8+0(%rsp),%rcx |
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index dd3c23152a2e..d7422dc2a55c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define XEN_OPS_H | 2 | #define XEN_OPS_H |
3 | 3 | ||
4 | #include <linux/init.h> | 4 | #include <linux/init.h> |
5 | #include <linux/clocksource.h> | ||
5 | #include <linux/irqreturn.h> | 6 | #include <linux/irqreturn.h> |
6 | #include <xen/xen-ops.h> | 7 | #include <xen/xen-ops.h> |
7 | 8 | ||
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void); | |||
31 | 32 | ||
32 | void __init xen_build_dynamic_phys_to_machine(void); | 33 | void __init xen_build_dynamic_phys_to_machine(void); |
33 | 34 | ||
35 | void xen_init_irq_ops(void); | ||
34 | void xen_setup_timer(int cpu); | 36 | void xen_setup_timer(int cpu); |
37 | void xen_teardown_timer(int cpu); | ||
38 | cycle_t xen_clocksource_read(void); | ||
35 | void xen_setup_cpu_clockevents(void); | 39 | void xen_setup_cpu_clockevents(void); |
36 | unsigned long xen_tsc_khz(void); | 40 | unsigned long xen_tsc_khz(void); |
37 | void __init xen_time_init(void); | 41 | void __init xen_time_init(void); |
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void); | |||
50 | #ifdef CONFIG_SMP | 54 | #ifdef CONFIG_SMP |
51 | void xen_smp_init(void); | 55 | void xen_smp_init(void); |
52 | 56 | ||
57 | void __init xen_init_spinlocks(void); | ||
58 | __cpuinit void xen_init_lock_cpu(int cpu); | ||
59 | void xen_uninit_lock_cpu(int cpu); | ||
60 | |||
53 | extern cpumask_t xen_cpu_initialized_map; | 61 | extern cpumask_t xen_cpu_initialized_map; |
54 | #else | 62 | #else |
55 | static inline void xen_smp_init(void) {} | 63 | static inline void xen_smp_init(void) {} |