Diffstat (limited to 'arch/x86')
116 files changed, 6863 insertions, 9012 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc8351f374fd..5b9b12321ad1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64 | |||
18 | ### Arch settings | 18 | ### Arch settings |
19 | config X86 | 19 | config X86 |
20 | def_bool y | 20 | def_bool y |
21 | select HAVE_AOUT if X86_32 | ||
21 | select HAVE_UNSTABLE_SCHED_CLOCK | 22 | select HAVE_UNSTABLE_SCHED_CLOCK |
22 | select HAVE_IDE | 23 | select HAVE_IDE |
23 | select HAVE_OPROFILE | 24 | select HAVE_OPROFILE |
@@ -25,6 +26,7 @@ config X86 | |||
25 | select HAVE_KPROBES | 26 | select HAVE_KPROBES |
26 | select ARCH_WANT_OPTIONAL_GPIOLIB | 27 | select ARCH_WANT_OPTIONAL_GPIOLIB |
27 | select HAVE_KRETPROBES | 28 | select HAVE_KRETPROBES |
29 | select HAVE_FTRACE_MCOUNT_RECORD | ||
28 | select HAVE_DYNAMIC_FTRACE | 30 | select HAVE_DYNAMIC_FTRACE |
29 | select HAVE_FTRACE | 31 | select HAVE_FTRACE |
30 | select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) | 32 | select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) |
@@ -38,10 +40,6 @@ config ARCH_DEFCONFIG | |||
38 | default "arch/x86/configs/i386_defconfig" if X86_32 | 40 | default "arch/x86/configs/i386_defconfig" if X86_32 |
39 | default "arch/x86/configs/x86_64_defconfig" if X86_64 | 41 | default "arch/x86/configs/x86_64_defconfig" if X86_64 |
40 | 42 | ||
41 | |||
42 | config GENERIC_LOCKBREAK | ||
43 | def_bool n | ||
44 | |||
45 | config GENERIC_TIME | 43 | config GENERIC_TIME |
46 | def_bool y | 44 | def_bool y |
47 | 45 | ||
@@ -94,7 +92,7 @@ config GENERIC_HWEIGHT | |||
94 | def_bool y | 92 | def_bool y |
95 | 93 | ||
96 | config GENERIC_GPIO | 94 | config GENERIC_GPIO |
97 | def_bool n | 95 | bool |
98 | 96 | ||
99 | config ARCH_MAY_HAVE_PC_FDC | 97 | config ARCH_MAY_HAVE_PC_FDC |
100 | def_bool y | 98 | def_bool y |
@@ -105,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK | |||
105 | config RWSEM_XCHGADD_ALGORITHM | 103 | config RWSEM_XCHGADD_ALGORITHM |
106 | def_bool X86_XADD | 104 | def_bool X86_XADD |
107 | 105 | ||
108 | config ARCH_HAS_ILOG2_U32 | ||
109 | def_bool n | ||
110 | |||
111 | config ARCH_HAS_ILOG2_U64 | ||
112 | def_bool n | ||
113 | |||
114 | config ARCH_HAS_CPU_IDLE_WAIT | 106 | config ARCH_HAS_CPU_IDLE_WAIT |
115 | def_bool y | 107 | def_bool y |
116 | 108 | ||
@@ -152,9 +144,6 @@ config AUDIT_ARCH | |||
152 | bool | 144 | bool |
153 | default X86_64 | 145 | default X86_64 |
154 | 146 | ||
155 | config ARCH_SUPPORTS_AOUT | ||
156 | def_bool y | ||
157 | |||
158 | config ARCH_SUPPORTS_OPTIMIZED_INLINING | 147 | config ARCH_SUPPORTS_OPTIMIZED_INLINING |
159 | def_bool y | 148 | def_bool y |
160 | 149 | ||
@@ -205,6 +194,7 @@ config X86_TRAMPOLINE | |||
205 | config KTIME_SCALAR | 194 | config KTIME_SCALAR |
206 | def_bool X86_32 | 195 | def_bool X86_32 |
207 | source "init/Kconfig" | 196 | source "init/Kconfig" |
197 | source "kernel/Kconfig.freezer" | ||
208 | 198 | ||
209 | menu "Processor type and features" | 199 | menu "Processor type and features" |
210 | 200 | ||
@@ -760,9 +750,8 @@ config I8K | |||
760 | Say N otherwise. | 750 | Say N otherwise. |
761 | 751 | ||
762 | config X86_REBOOTFIXUPS | 752 | config X86_REBOOTFIXUPS |
763 | def_bool n | 753 | bool "Enable X86 board specific fixups for reboot" |
764 | prompt "Enable X86 board specific fixups for reboot" | 754 | depends on X86_32 |
765 | depends on X86_32 && X86 | ||
766 | ---help--- | 755 | ---help--- |
767 | This enables chipset and/or board specific fixups to be done | 756 | This enables chipset and/or board specific fixups to be done |
768 | in order to get reboot to work correctly. This is only needed on | 757 | in order to get reboot to work correctly. This is only needed on |
@@ -946,16 +935,17 @@ config HIGHMEM | |||
946 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) | 935 | depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) |
947 | 936 | ||
948 | config X86_PAE | 937 | config X86_PAE |
949 | def_bool n | 938 | bool "PAE (Physical Address Extension) Support" |
950 | prompt "PAE (Physical Address Extension) Support" | ||
951 | depends on X86_32 && !HIGHMEM4G | 939 | depends on X86_32 && !HIGHMEM4G |
952 | select RESOURCES_64BIT | ||
953 | help | 940 | help |
954 | PAE is required for NX support, and furthermore enables | 941 | PAE is required for NX support, and furthermore enables |
955 | larger swapspace support for non-overcommit purposes. It | 942 | larger swapspace support for non-overcommit purposes. It |
956 | has the cost of more pagetable lookup overhead, and also | 943 | has the cost of more pagetable lookup overhead, and also |
957 | consumes more pagetable space per process. | 944 | consumes more pagetable space per process. |
958 | 945 | ||
946 | config ARCH_PHYS_ADDR_T_64BIT | ||
947 | def_bool X86_64 || X86_PAE | ||
948 | |||
959 | # Common NUMA Features | 949 | # Common NUMA Features |
960 | config NUMA | 950 | config NUMA |
961 | bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" | 951 | bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" |
@@ -1240,8 +1230,7 @@ config X86_PAT | |||
1240 | If unsure, say Y. | 1230 | If unsure, say Y. |
1241 | 1231 | ||
1242 | config EFI | 1232 | config EFI |
1243 | def_bool n | 1233 | bool "EFI runtime service support" |
1244 | prompt "EFI runtime service support" | ||
1245 | depends on ACPI | 1234 | depends on ACPI |
1246 | ---help--- | 1235 | ---help--- |
1247 | This enables the kernel to use EFI runtime services that are | 1236 | This enables the kernel to use EFI runtime services that are |
@@ -1254,14 +1243,6 @@ config EFI | |||
1254 | resultant kernel should continue to boot on existing non-EFI | 1243 | resultant kernel should continue to boot on existing non-EFI |
1255 | platforms. | 1244 | platforms. |
1256 | 1245 | ||
1257 | config IRQBALANCE | ||
1258 | def_bool y | ||
1259 | prompt "Enable kernel irq balancing" | ||
1260 | depends on X86_32 && SMP && X86_IO_APIC | ||
1261 | help | ||
1262 | The default yes will allow the kernel to do irq load balancing. | ||
1263 | Saying no will keep the kernel from doing irq load balancing. | ||
1264 | |||
1265 | config SECCOMP | 1246 | config SECCOMP |
1266 | def_bool y | 1247 | def_bool y |
1267 | prompt "Enable seccomp to safely compute untrusted bytecode" | 1248 | prompt "Enable seccomp to safely compute untrusted bytecode" |
@@ -1885,7 +1866,7 @@ config IA32_EMULATION | |||
1885 | 1866 | ||
1886 | config IA32_AOUT | 1867 | config IA32_AOUT |
1887 | tristate "IA32 a.out support" | 1868 | tristate "IA32 a.out support" |
1888 | depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT | 1869 | depends on IA32_EMULATION |
1889 | help | 1870 | help |
1890 | Support old a.out binaries in the 32bit emulation. | 1871 | Support old a.out binaries in the 32bit emulation. |
1891 | 1872 | ||
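A note on the ARCH_PHYS_ADDR_T_64BIT option introduced in the X86_PAE hunk above: once PAE is enabled, a 32-bit kernel can address physical memory above 4 GiB, so phys_addr_t has to be a 64-bit type even on X86_32. A throwaway userspace sketch of the truncation this avoids (illustrative only, not code from this patch):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for what selecting ARCH_PHYS_ADDR_T_64BIT gives the kernel. */
typedef uint64_t phys_addr_t;

int main(void)
{
	phys_addr_t paddr = (phys_addr_t)0x120000 << 12;  /* page frame above 4 GiB: 0x120000000 */
	uint32_t truncated = (uint32_t)paddr;             /* what a 32-bit physical address type would keep */

	printf("64-bit: %#llx  truncated: %#x\n", (unsigned long long)paddr, truncated);
	return 0;
}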
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c5f101360520..0b7c4a3f0651 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386 | |||
38 | - "Crusoe" for the Transmeta Crusoe series. | 38 | - "Crusoe" for the Transmeta Crusoe series. |
39 | - "Efficeon" for the Transmeta Efficeon series. | 39 | - "Efficeon" for the Transmeta Efficeon series. |
40 | - "Winchip-C6" for original IDT Winchip. | 40 | - "Winchip-C6" for original IDT Winchip. |
41 | - "Winchip-2" for IDT Winchip 2. | 41 | - "Winchip-2" for IDT Winchips with 3dNow! capabilities. |
42 | - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. | ||
43 | - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). | 42 | - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). |
44 | - "Geode GX/LX" For AMD Geode GX and LX processors. | 43 | - "Geode GX/LX" For AMD Geode GX and LX processors. |
45 | - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. | 44 | - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. |
@@ -194,19 +193,11 @@ config MWINCHIPC6 | |||
194 | treat this chip as a 586TSC with some extended instructions | 193 | treat this chip as a 586TSC with some extended instructions |
195 | and alignment requirements. | 194 | and alignment requirements. |
196 | 195 | ||
197 | config MWINCHIP2 | ||
198 | bool "Winchip-2" | ||
199 | depends on X86_32 | ||
200 | help | ||
201 | Select this for an IDT Winchip-2. Linux and GCC | ||
202 | treat this chip as a 586TSC with some extended instructions | ||
203 | and alignment requirements. | ||
204 | |||
205 | config MWINCHIP3D | 196 | config MWINCHIP3D |
206 | bool "Winchip-2A/Winchip-3" | 197 | bool "Winchip-2/Winchip-2A/Winchip-3" |
207 | depends on X86_32 | 198 | depends on X86_32 |
208 | help | 199 | help |
209 | Select this for an IDT Winchip-2A or 3. Linux and GCC | 200 | Select this for an IDT Winchip-2, 2A or 3. Linux and GCC |
210 | treat this chip as a 586TSC with some extended instructions | 201 | treat this chip as a 586TSC with some extended instructions |
211 | and alignment requirements. Also enable out of order memory | 202 | and alignment requirements. Also enable out of order memory |
212 | stores for this CPU, which can increase performance of some | 203 | stores for this CPU, which can increase performance of some |
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT | |||
318 | int | 309 | int |
319 | default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC | 310 | default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC |
320 | default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 | 311 | default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 |
321 | default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX | 312 | default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX |
322 | default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 | 313 | default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 |
323 | 314 | ||
324 | config X86_XADD | 315 | config X86_XADD |
@@ -360,7 +351,7 @@ config X86_POPAD_OK | |||
360 | 351 | ||
361 | config X86_ALIGNMENT_16 | 352 | config X86_ALIGNMENT_16 |
362 | def_bool y | 353 | def_bool y |
363 | depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 | 354 | depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 |
364 | 355 | ||
365 | config X86_INTEL_USERCOPY | 356 | config X86_INTEL_USERCOPY |
366 | def_bool y | 357 | def_bool y |
@@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY | |||
368 | 359 | ||
369 | config X86_USE_PPRO_CHECKSUM | 360 | config X86_USE_PPRO_CHECKSUM |
370 | def_bool y | 361 | def_bool y |
371 | depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 | 362 | depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 |
372 | 363 | ||
373 | config X86_USE_3DNOW | 364 | config X86_USE_3DNOW |
374 | def_bool y | 365 | def_bool y |
@@ -376,7 +367,7 @@ config X86_USE_3DNOW | |||
376 | 367 | ||
377 | config X86_OOSTORE | 368 | config X86_OOSTORE |
378 | def_bool y | 369 | def_bool y |
379 | depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR | 370 | depends on (MWINCHIP3D || MWINCHIPC6) && MTRR |
380 | 371 | ||
381 | # | 372 | # |
382 | # P6_NOPs are a relatively minor optimization that require a family >= | 373 | # P6_NOPs are a relatively minor optimization that require a family >= |
@@ -396,7 +387,7 @@ config X86_P6_NOP | |||
396 | 387 | ||
397 | config X86_TSC | 388 | config X86_TSC |
398 | def_bool y | 389 | def_bool y |
399 | depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 | 390 | depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 |
400 | 391 | ||
401 | config X86_CMPXCHG64 | 392 | config X86_CMPXCHG64 |
402 | def_bool y | 393 | def_bool y |
@@ -406,7 +397,7 @@ config X86_CMPXCHG64 | |||
406 | # generates cmov. | 397 | # generates cmov. |
407 | config X86_CMOV | 398 | config X86_CMOV |
408 | def_bool y | 399 | def_bool y |
409 | depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) | 400 | depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) |
410 | 401 | ||
411 | config X86_MINIMUM_CPU_FAMILY | 402 | config X86_MINIMUM_CPU_FAMILY |
412 | int | 403 | int |
@@ -417,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY | |||
417 | 408 | ||
418 | config X86_DEBUGCTLMSR | 409 | config X86_DEBUGCTLMSR |
419 | def_bool y | 410 | def_bool y |
420 | depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) | 411 | depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) |
421 | 412 | ||
422 | menuconfig PROCESSOR_SELECT | 413 | menuconfig PROCESSOR_SELECT |
423 | bool "Supported processor vendors" if EMBEDDED | 414 | bool "Supported processor vendors" if EMBEDDED |
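For reference, the X86_L1_CACHE_SHIFT defaults touched above are the log2 of the cache-line size the kernel aligns hot data to; the kernel derives L1_CACHE_BYTES as 1 << CONFIG_X86_L1_CACHE_SHIFT. A quick illustration of what the four default values mean:

#include <stdio.h>

int main(void)
{
	/* the defaults used in the X86_L1_CACHE_SHIFT hunk above */
	int shifts[] = { 4, 5, 6, 7 };

	for (int i = 0; i < 4; i++)
		printf("X86_L1_CACHE_SHIFT=%d -> %d-byte cache lines\n",
		       shifts[i], 1 << shifts[i]);
	return 0;
}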
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index b72b4f753113..80177ec052f0 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) | |||
28 | cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 | 28 | cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 |
29 | cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 | 29 | cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 |
30 | cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) | 30 | cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) |
31 | cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) | ||
32 | cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) | 31 | cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) |
33 | cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 | 32 | cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 |
34 | cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) | 33 | cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) |
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 1e6fe0214c85..99b3079dc6ab 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -88,14 +88,11 @@ static int vesa_probe(void) | |||
88 | (vminfo.memory_layout == 4 || | 88 | (vminfo.memory_layout == 4 || |
89 | vminfo.memory_layout == 6) && | 89 | vminfo.memory_layout == 6) && |
90 | vminfo.memory_planes == 1) { | 90 | vminfo.memory_planes == 1) { |
91 | #ifdef CONFIG_FB | 91 | #ifdef CONFIG_FB_BOOT_VESA_SUPPORT |
92 | /* Graphics mode, color, linear frame buffer | 92 | /* Graphics mode, color, linear frame buffer |
93 | supported. Only register the mode if | 93 | supported. Only register the mode if |
94 | if framebuffer is configured, however, | 94 | if framebuffer is configured, however, |
95 | otherwise the user will be left without a screen. | 95 | otherwise the user will be left without a screen. */ |
96 | We don't require CONFIG_FB_VESA, however, since | ||
97 | some of the other framebuffer drivers can use | ||
98 | this mode-setting, too. */ | ||
99 | mi = GET_HEAP(struct mode_info, 1); | 96 | mi = GET_HEAP(struct mode_info, 1); |
100 | mi->mode = mode + VIDEO_FIRST_VESA; | 97 | mi->mode = mode + VIDEO_FIRST_VESA; |
101 | mi->depth = vminfo.bpp; | 98 | mi->depth = vminfo.bpp; |
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode) | |||
133 | if ((vminfo.mode_attr & 0x15) == 0x05) { | 130 | if ((vminfo.mode_attr & 0x15) == 0x05) { |
134 | /* It's a supported text mode */ | 131 | /* It's a supported text mode */ |
135 | is_graphic = 0; | 132 | is_graphic = 0; |
133 | #ifdef CONFIG_FB_BOOT_VESA_SUPPORT | ||
136 | } else if ((vminfo.mode_attr & 0x99) == 0x99) { | 134 | } else if ((vminfo.mode_attr & 0x99) == 0x99) { |
137 | /* It's a graphics mode with linear frame buffer */ | 135 | /* It's a graphics mode with linear frame buffer */ |
138 | is_graphic = 1; | 136 | is_graphic = 1; |
139 | vesa_mode |= 0x4000; /* Request linear frame buffer */ | 137 | vesa_mode |= 0x4000; /* Request linear frame buffer */ |
138 | #endif | ||
140 | } else { | 139 | } else { |
141 | return -1; /* Invalid mode */ | 140 | return -1; /* Invalid mode */ |
142 | } | 141 | } |
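The two mode_attr tests kept by the hunks above, which the new CONFIG_FB_BOOT_VESA_SUPPORT guards now bracket, decode roughly as follows; this is a plain-C restatement for illustration, with the bit meanings taken from the VBE mode-attribute definition rather than from this patch:

#include <stdbool.h>
#include <stdint.h>

/* VBE mode_attr bits: bit 0 = mode supported, bit 2 = BIOS TTY output works
 * in this mode, bit 3 = color, bit 4 = graphics mode, bit 7 = linear
 * framebuffer available. */

static bool is_supported_text_mode(uint16_t mode_attr)
{
	/* bits 0 and 2 set, bit 4 clear: a supported text mode the
	 * BIOS can still print to */
	return (mode_attr & 0x15) == 0x05;
}

static bool is_lfb_graphics_mode(uint16_t mode_attr)
{
	/* bits 0, 3, 4 and 7 set: supported color graphics mode with
	 * a linear framebuffer */
	return (mode_attr & 0x99) == 0x99;
}

int main(void)
{
	return is_supported_text_mode(0x0007) && !is_lfb_graphics_mode(0x0007) ? 0 : 1;
}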
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ca226ca31288..13b8c86ae985 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -213,7 +213,6 @@ CONFIG_M686=y | |||
213 | # CONFIG_MCRUSOE is not set | 213 | # CONFIG_MCRUSOE is not set |
214 | # CONFIG_MEFFICEON is not set | 214 | # CONFIG_MEFFICEON is not set |
215 | # CONFIG_MWINCHIPC6 is not set | 215 | # CONFIG_MWINCHIPC6 is not set |
216 | # CONFIG_MWINCHIP2 is not set | ||
217 | # CONFIG_MWINCHIP3D is not set | 216 | # CONFIG_MWINCHIP3D is not set |
218 | # CONFIG_MGEODEGX1 is not set | 217 | # CONFIG_MGEODEGX1 is not set |
219 | # CONFIG_MGEODE_LX is not set | 218 | # CONFIG_MGEODE_LX is not set |
@@ -288,7 +287,6 @@ CONFIG_MTRR=y | |||
288 | # CONFIG_MTRR_SANITIZER is not set | 287 | # CONFIG_MTRR_SANITIZER is not set |
289 | CONFIG_X86_PAT=y | 288 | CONFIG_X86_PAT=y |
290 | CONFIG_EFI=y | 289 | CONFIG_EFI=y |
291 | # CONFIG_IRQBALANCE is not set | ||
292 | CONFIG_SECCOMP=y | 290 | CONFIG_SECCOMP=y |
293 | # CONFIG_HZ_100 is not set | 291 | # CONFIG_HZ_100 is not set |
294 | # CONFIG_HZ_250 is not set | 292 | # CONFIG_HZ_250 is not set |
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 2c4b1c771e28..f0a03d7a7d63 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -210,7 +210,6 @@ CONFIG_X86_PC=y | |||
210 | # CONFIG_MCRUSOE is not set | 210 | # CONFIG_MCRUSOE is not set |
211 | # CONFIG_MEFFICEON is not set | 211 | # CONFIG_MEFFICEON is not set |
212 | # CONFIG_MWINCHIPC6 is not set | 212 | # CONFIG_MWINCHIPC6 is not set |
213 | # CONFIG_MWINCHIP2 is not set | ||
214 | # CONFIG_MWINCHIP3D is not set | 213 | # CONFIG_MWINCHIP3D is not set |
215 | # CONFIG_MGEODEGX1 is not set | 214 | # CONFIG_MGEODEGX1 is not set |
216 | # CONFIG_MGEODE_LX is not set | 215 | # CONFIG_MGEODE_LX is not set |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffc1bb4fed7d..256b00b61892 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -39,11 +39,11 @@ | |||
39 | .endm | 39 | .endm |
40 | 40 | ||
41 | /* clobbers %eax */ | 41 | /* clobbers %eax */ |
42 | .macro CLEAR_RREGS | 42 | .macro CLEAR_RREGS _r9=rax |
43 | xorl %eax,%eax | 43 | xorl %eax,%eax |
44 | movq %rax,R11(%rsp) | 44 | movq %rax,R11(%rsp) |
45 | movq %rax,R10(%rsp) | 45 | movq %rax,R10(%rsp) |
46 | movq %rax,R9(%rsp) | 46 | movq %\_r9,R9(%rsp) |
47 | movq %rax,R8(%rsp) | 47 | movq %rax,R8(%rsp) |
48 | .endm | 48 | .endm |
49 | 49 | ||
@@ -52,11 +52,10 @@ | |||
52 | * We don't reload %eax because syscall_trace_enter() returned | 52 | * We don't reload %eax because syscall_trace_enter() returned |
53 | * the value it wants us to use in the table lookup. | 53 | * the value it wants us to use in the table lookup. |
54 | */ | 54 | */ |
55 | .macro LOAD_ARGS32 offset | 55 | .macro LOAD_ARGS32 offset, _r9=0 |
56 | movl \offset(%rsp),%r11d | 56 | .if \_r9 |
57 | movl \offset+8(%rsp),%r10d | ||
58 | movl \offset+16(%rsp),%r9d | 57 | movl \offset+16(%rsp),%r9d |
59 | movl \offset+24(%rsp),%r8d | 58 | .endif |
60 | movl \offset+40(%rsp),%ecx | 59 | movl \offset+40(%rsp),%ecx |
61 | movl \offset+48(%rsp),%edx | 60 | movl \offset+48(%rsp),%edx |
62 | movl \offset+56(%rsp),%esi | 61 | movl \offset+56(%rsp),%esi |
@@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target) | |||
145 | SAVE_ARGS 0,0,1 | 144 | SAVE_ARGS 0,0,1 |
146 | /* no need to do an access_ok check here because rbp has been | 145 | /* no need to do an access_ok check here because rbp has been |
147 | 32bit zero extended */ | 146 | 32bit zero extended */ |
148 | 1: movl (%rbp),%r9d | 147 | 1: movl (%rbp),%ebp |
149 | .section __ex_table,"a" | 148 | .section __ex_table,"a" |
150 | .quad 1b,ia32_badarg | 149 | .quad 1b,ia32_badarg |
151 | .previous | 150 | .previous |
@@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target) | |||
157 | cmpl $(IA32_NR_syscalls-1),%eax | 156 | cmpl $(IA32_NR_syscalls-1),%eax |
158 | ja ia32_badsys | 157 | ja ia32_badsys |
159 | sysenter_do_call: | 158 | sysenter_do_call: |
160 | IA32_ARG_FIXUP 1 | 159 | IA32_ARG_FIXUP |
161 | sysenter_dispatch: | 160 | sysenter_dispatch: |
162 | call *ia32_sys_call_table(,%rax,8) | 161 | call *ia32_sys_call_table(,%rax,8) |
163 | movq %rax,RAX-ARGOFFSET(%rsp) | 162 | movq %rax,RAX-ARGOFFSET(%rsp) |
@@ -234,20 +233,17 @@ sysexit_audit: | |||
234 | #endif | 233 | #endif |
235 | 234 | ||
236 | sysenter_tracesys: | 235 | sysenter_tracesys: |
237 | xchgl %r9d,%ebp | ||
238 | #ifdef CONFIG_AUDITSYSCALL | 236 | #ifdef CONFIG_AUDITSYSCALL |
239 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) | 237 | testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) |
240 | jz sysenter_auditsys | 238 | jz sysenter_auditsys |
241 | #endif | 239 | #endif |
242 | SAVE_REST | 240 | SAVE_REST |
243 | CLEAR_RREGS | 241 | CLEAR_RREGS |
244 | movq %r9,R9(%rsp) | ||
245 | movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ | 242 | movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ |
246 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 243 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
247 | call syscall_trace_enter | 244 | call syscall_trace_enter |
248 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | 245 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ |
249 | RESTORE_REST | 246 | RESTORE_REST |
250 | xchgl %ebp,%r9d | ||
251 | cmpl $(IA32_NR_syscalls-1),%eax | 247 | cmpl $(IA32_NR_syscalls-1),%eax |
252 | ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ | 248 | ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ |
253 | jmp sysenter_do_call | 249 | jmp sysenter_do_call |
@@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target) | |||
314 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) | 310 | testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) |
315 | CFI_REMEMBER_STATE | 311 | CFI_REMEMBER_STATE |
316 | jnz cstar_tracesys | 312 | jnz cstar_tracesys |
317 | cstar_do_call: | ||
318 | cmpl $IA32_NR_syscalls-1,%eax | 313 | cmpl $IA32_NR_syscalls-1,%eax |
319 | ja ia32_badsys | 314 | ja ia32_badsys |
315 | cstar_do_call: | ||
320 | IA32_ARG_FIXUP 1 | 316 | IA32_ARG_FIXUP 1 |
321 | cstar_dispatch: | 317 | cstar_dispatch: |
322 | call *ia32_sys_call_table(,%rax,8) | 318 | call *ia32_sys_call_table(,%rax,8) |
@@ -357,15 +353,13 @@ cstar_tracesys: | |||
357 | #endif | 353 | #endif |
358 | xchgl %r9d,%ebp | 354 | xchgl %r9d,%ebp |
359 | SAVE_REST | 355 | SAVE_REST |
360 | CLEAR_RREGS | 356 | CLEAR_RREGS r9 |
361 | movq %r9,R9(%rsp) | ||
362 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ | 357 | movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ |
363 | movq %rsp,%rdi /* &pt_regs -> arg1 */ | 358 | movq %rsp,%rdi /* &pt_regs -> arg1 */ |
364 | call syscall_trace_enter | 359 | call syscall_trace_enter |
365 | LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ | 360 | LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ |
366 | RESTORE_REST | 361 | RESTORE_REST |
367 | xchgl %ebp,%r9d | 362 | xchgl %ebp,%r9d |
368 | movl RSP-ARGOFFSET(%rsp), %r8d | ||
369 | cmpl $(IA32_NR_syscalls-1),%eax | 363 | cmpl $(IA32_NR_syscalls-1),%eax |
370 | ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ | 364 | ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ |
371 | jmp cstar_do_call | 365 | jmp cstar_do_call |
@@ -577,8 +571,8 @@ ia32_sys_call_table: | |||
577 | .quad compat_sys_setrlimit /* 75 */ | 571 | .quad compat_sys_setrlimit /* 75 */ |
578 | .quad compat_sys_old_getrlimit /* old_getrlimit */ | 572 | .quad compat_sys_old_getrlimit /* old_getrlimit */ |
579 | .quad compat_sys_getrusage | 573 | .quad compat_sys_getrusage |
580 | .quad sys32_gettimeofday | 574 | .quad compat_sys_gettimeofday |
581 | .quad sys32_settimeofday | 575 | .quad compat_sys_settimeofday |
582 | .quad sys_getgroups16 /* 80 */ | 576 | .quad sys_getgroups16 /* 80 */ |
583 | .quad sys_setgroups16 | 577 | .quad sys_setgroups16 |
584 | .quad sys32_old_select | 578 | .quad sys32_old_select |
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index beda4232ce69..2e09dcd3c0a6 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -49,41 +49,6 @@ | |||
49 | 49 | ||
50 | #define AA(__x) ((unsigned long)(__x)) | 50 | #define AA(__x) ((unsigned long)(__x)) |
51 | 51 | ||
52 | int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) | ||
53 | { | ||
54 | compat_ino_t ino; | ||
55 | |||
56 | typeof(ubuf->st_uid) uid = 0; | ||
57 | typeof(ubuf->st_gid) gid = 0; | ||
58 | SET_UID(uid, kbuf->uid); | ||
59 | SET_GID(gid, kbuf->gid); | ||
60 | if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) | ||
61 | return -EOVERFLOW; | ||
62 | if (kbuf->size >= 0x7fffffff) | ||
63 | return -EOVERFLOW; | ||
64 | ino = kbuf->ino; | ||
65 | if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) | ||
66 | return -EOVERFLOW; | ||
67 | if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || | ||
68 | __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || | ||
69 | __put_user(ino, &ubuf->st_ino) || | ||
70 | __put_user(kbuf->mode, &ubuf->st_mode) || | ||
71 | __put_user(kbuf->nlink, &ubuf->st_nlink) || | ||
72 | __put_user(uid, &ubuf->st_uid) || | ||
73 | __put_user(gid, &ubuf->st_gid) || | ||
74 | __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || | ||
75 | __put_user(kbuf->size, &ubuf->st_size) || | ||
76 | __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || | ||
77 | __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || | ||
78 | __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || | ||
79 | __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || | ||
80 | __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || | ||
81 | __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || | ||
82 | __put_user(kbuf->blksize, &ubuf->st_blksize) || | ||
83 | __put_user(kbuf->blocks, &ubuf->st_blocks)) | ||
84 | return -EFAULT; | ||
85 | return 0; | ||
86 | } | ||
87 | 52 | ||
88 | asmlinkage long sys32_truncate64(char __user *filename, | 53 | asmlinkage long sys32_truncate64(char __user *filename, |
89 | unsigned long offset_low, | 54 | unsigned long offset_low, |
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, | |||
402 | return 0; | 367 | return 0; |
403 | } | 368 | } |
404 | 369 | ||
405 | static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) | ||
406 | { | ||
407 | int err = -EFAULT; | ||
408 | |||
409 | if (access_ok(VERIFY_READ, i, sizeof(*i))) { | ||
410 | err = __get_user(o->tv_sec, &i->tv_sec); | ||
411 | err |= __get_user(o->tv_usec, &i->tv_usec); | ||
412 | } | ||
413 | return err; | ||
414 | } | ||
415 | |||
416 | static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) | ||
417 | { | ||
418 | int err = -EFAULT; | ||
419 | |||
420 | if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { | ||
421 | err = __put_user(i->tv_sec, &o->tv_sec); | ||
422 | err |= __put_user(i->tv_usec, &o->tv_usec); | ||
423 | } | ||
424 | return err; | ||
425 | } | ||
426 | |||
427 | asmlinkage long sys32_alarm(unsigned int seconds) | 370 | asmlinkage long sys32_alarm(unsigned int seconds) |
428 | { | 371 | { |
429 | return alarm_setitimer(seconds); | 372 | return alarm_setitimer(seconds); |
430 | } | 373 | } |
431 | 374 | ||
432 | /* | ||
433 | * Translations due to time_t size differences. Which affects all | ||
434 | * sorts of things, like timeval and itimerval. | ||
435 | */ | ||
436 | asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, | ||
437 | struct timezone __user *tz) | ||
438 | { | ||
439 | if (tv) { | ||
440 | struct timeval ktv; | ||
441 | |||
442 | do_gettimeofday(&ktv); | ||
443 | if (put_tv32(tv, &ktv)) | ||
444 | return -EFAULT; | ||
445 | } | ||
446 | if (tz) { | ||
447 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
448 | return -EFAULT; | ||
449 | } | ||
450 | return 0; | ||
451 | } | ||
452 | |||
453 | asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, | ||
454 | struct timezone __user *tz) | ||
455 | { | ||
456 | struct timeval ktv; | ||
457 | struct timespec kts; | ||
458 | struct timezone ktz; | ||
459 | |||
460 | if (tv) { | ||
461 | if (get_tv32(&ktv, tv)) | ||
462 | return -EFAULT; | ||
463 | kts.tv_sec = ktv.tv_sec; | ||
464 | kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; | ||
465 | } | ||
466 | if (tz) { | ||
467 | if (copy_from_user(&ktz, tz, sizeof(ktz))) | ||
468 | return -EFAULT; | ||
469 | } | ||
470 | |||
471 | return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); | ||
472 | } | ||
473 | |||
474 | struct sel_arg_struct { | 375 | struct sel_arg_struct { |
475 | unsigned int n; | 376 | unsigned int n; |
476 | unsigned int inp; | 377 | unsigned int inp; |
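The hunks above drop sys32_gettimeofday()/sys32_settimeofday() and their get_tv32()/put_tv32() helpers; the ia32 syscall table now points at the generic compat_sys_gettimeofday()/compat_sys_settimeofday(), which do the same 32-bit/64-bit timeval translation in common code. As a reminder of what that translation involves, a simplified userspace model of the deleted helpers (structure names are mine, and the user-copy and access_ok checks are omitted for illustration):

#include <stdint.h>

/* The ia32 ABI passes 32-bit seconds/microseconds; the 64-bit kernel works
 * with native-width fields, so every gettimeofday/settimeofday crossing the
 * compat boundary is a widen-or-narrow copy like this. */
struct compat_timeval { int32_t tv_sec; int32_t tv_usec; };
struct timeval64      { int64_t tv_sec; int64_t tv_usec; };

static void get_tv32(struct timeval64 *out, const struct compat_timeval *in)
{
	out->tv_sec  = in->tv_sec;            /* sign-extended to 64 bit */
	out->tv_usec = in->tv_usec;
}

static void put_tv32(struct compat_timeval *out, const struct timeval64 *in)
{
	out->tv_sec  = (int32_t)in->tv_sec;   /* narrowed, as the old helper did */
	out->tv_usec = (int32_t)in->tv_usec;
}

int main(void)
{
	struct timeval64 now = { 1224892800, 123456 };  /* example values */
	struct compat_timeval ctv;

	put_tv32(&ctv, &now);
	return ctv.tv_sec == now.tv_sec ? 0 : 1;
}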
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5098585f87ce..d7e5a58ee22f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) | |||
23 | CFLAGS_tsc.o := $(nostackp) | 23 | CFLAGS_tsc.o := $(nostackp) |
24 | 24 | ||
25 | obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o | 25 | obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o |
26 | obj-y += traps_$(BITS).o irq_$(BITS).o | 26 | obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o |
27 | obj-y += time_$(BITS).o ioport.o ldt.o | 27 | obj-y += time_$(BITS).o ioport.o ldt.o |
28 | obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o | 28 | obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o |
29 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o | 29 | obj-$(CONFIG_X86_VISWS) += visws_quirks.o |
@@ -60,8 +60,8 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o | |||
60 | obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o | 60 | obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o |
61 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o | 61 | obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o |
62 | obj-$(CONFIG_X86_MPPARSE) += mpparse.o | 62 | obj-$(CONFIG_X86_MPPARSE) += mpparse.o |
63 | obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o | 63 | obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o |
64 | obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o | 64 | obj-$(CONFIG_X86_IO_APIC) += io_apic.o |
65 | obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o | 65 | obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o |
66 | obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o | 66 | obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o |
67 | obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o | 67 | obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o |
@@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o | |||
108 | # 64 bit specific files | 108 | # 64 bit specific files |
109 | ifeq ($(CONFIG_X86_64),y) | 109 | ifeq ($(CONFIG_X86_64),y) |
110 | obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o | 110 | obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o |
111 | obj-y += bios_uv.o | 111 | obj-y += bios_uv.o uv_irq.o uv_sysfs.o |
112 | obj-y += genx2apic_cluster.o | 112 | obj-y += genx2apic_cluster.o |
113 | obj-y += genx2apic_phys.o | 113 | obj-y += genx2apic_phys.o |
114 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o | 114 | obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index eb875cdc7367..0d1c26a583c5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1256,7 +1256,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) | |||
1256 | 1256 | ||
1257 | count = | 1257 | count = |
1258 | acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, | 1258 | acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, |
1259 | NR_IRQ_VECTORS); | 1259 | nr_irqs); |
1260 | if (count < 0) { | 1260 | if (count < 0) { |
1261 | printk(KERN_ERR PREFIX | 1261 | printk(KERN_ERR PREFIX |
1262 | "Error parsing interrupt source overrides entry\n"); | 1262 | "Error parsing interrupt source overrides entry\n"); |
@@ -1276,7 +1276,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) | |||
1276 | 1276 | ||
1277 | count = | 1277 | count = |
1278 | acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, | 1278 | acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, |
1279 | NR_IRQ_VECTORS); | 1279 | nr_irqs); |
1280 | if (count < 0) { | 1280 | if (count < 0) { |
1281 | printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); | 1281 | printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); |
1282 | /* TBD: Cleanup to allow fallback to MPS */ | 1282 | /* TBD: Cleanup to allow fallback to MPS */ |
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 426e5d91b63a..c44cd6dbfa14 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/dmi.h> | 10 | #include <linux/dmi.h> |
11 | #include <linux/cpumask.h> | 11 | #include <linux/cpumask.h> |
12 | #include <asm/segment.h> | 12 | #include <asm/segment.h> |
13 | #include <asm/desc.h> | ||
13 | 14 | ||
14 | #include "realmode/wakeup.h" | 15 | #include "realmode/wakeup.h" |
15 | #include "sleep.h" | 16 | #include "sleep.h" |
@@ -98,6 +99,8 @@ int acpi_save_state_mem(void) | |||
98 | header->trampoline_segment = setup_trampoline() >> 4; | 99 | header->trampoline_segment = setup_trampoline() >> 4; |
99 | #ifdef CONFIG_SMP | 100 | #ifdef CONFIG_SMP |
100 | stack_start.sp = temp_stack + 4096; | 101 | stack_start.sp = temp_stack + 4096; |
102 | early_gdt_descr.address = | ||
103 | (unsigned long)get_cpu_gdt_table(smp_processor_id()); | ||
101 | #endif | 104 | #endif |
102 | initial_code = (unsigned long)wakeup_long64; | 105 | initial_code = (unsigned long)wakeup_long64; |
103 | saved_magic = 0x123456789abcdef0; | 106 | saved_magic = 0x123456789abcdef0; |
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index fb04e49776ba..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -444,7 +444,7 @@ void __init alternative_instructions(void) | |||
444 | _text, _etext); | 444 | _text, _etext); |
445 | 445 | ||
446 | /* Only switch to UP mode if we don't immediately boot others */ | 446 | /* Only switch to UP mode if we don't immediately boot others */ |
447 | if (num_possible_cpus() == 1 || setup_max_cpus <= 1) | 447 | if (num_present_cpus() == 1 || setup_max_cpus <= 1) |
448 | alternatives_smp_switch(0); | 448 | alternatives_smp_switch(0); |
449 | } | 449 | } |
450 | #endif | 450 | #endif |
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 34e4d112b1ef..a8fd9ebdc8e2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, | |||
295 | u64 address, size_t size) | 295 | u64 address, size_t size) |
296 | { | 296 | { |
297 | int s = 0; | 297 | int s = 0; |
298 | unsigned pages = iommu_num_pages(address, size); | 298 | unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); |
299 | 299 | ||
300 | address &= PAGE_MASK; | 300 | address &= PAGE_MASK; |
301 | 301 | ||
@@ -680,7 +680,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, | |||
680 | iommu->exclusion_start < dma_dom->aperture_size) { | 680 | iommu->exclusion_start < dma_dom->aperture_size) { |
681 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; | 681 | unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; |
682 | int pages = iommu_num_pages(iommu->exclusion_start, | 682 | int pages = iommu_num_pages(iommu->exclusion_start, |
683 | iommu->exclusion_length); | 683 | iommu->exclusion_length, |
684 | PAGE_SIZE); | ||
684 | dma_ops_reserve_addresses(dma_dom, startpage, pages); | 685 | dma_ops_reserve_addresses(dma_dom, startpage, pages); |
685 | } | 686 | } |
686 | 687 | ||
@@ -935,7 +936,7 @@ static dma_addr_t __map_single(struct device *dev, | |||
935 | unsigned long align_mask = 0; | 936 | unsigned long align_mask = 0; |
936 | int i; | 937 | int i; |
937 | 938 | ||
938 | pages = iommu_num_pages(paddr, size); | 939 | pages = iommu_num_pages(paddr, size, PAGE_SIZE); |
939 | paddr &= PAGE_MASK; | 940 | paddr &= PAGE_MASK; |
940 | 941 | ||
941 | if (align) | 942 | if (align) |
@@ -980,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, | |||
980 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) | 981 | if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) |
981 | return; | 982 | return; |
982 | 983 | ||
983 | pages = iommu_num_pages(dma_addr, size); | 984 | pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
984 | dma_addr &= PAGE_MASK; | 985 | dma_addr &= PAGE_MASK; |
985 | start = dma_addr; | 986 | start = dma_addr; |
986 | 987 | ||
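Every iommu_num_pages() call above gains a third argument: the helper now takes the IO page size explicitly instead of assuming the CPU's PAGE_SIZE. Its job is to count how many IO pages a byte range touches, including partial first and last pages. A restatement of that computation for illustration (the real helper lives in the generic iommu-helper library and may differ in detail):

#include <stdint.h>
#include <stdio.h>

/* How many IO pages of 'io_page_size' bytes does [addr, addr + len) touch? */
static unsigned long iommu_num_pages(uint64_t addr, uint64_t len,
				     uint64_t io_page_size)
{
	uint64_t span = (addr & (io_page_size - 1)) + len;  /* offset into first page + length */

	return (unsigned long)((span + io_page_size - 1) / io_page_size);  /* round up */
}

int main(void)
{
	/* 0x2000 bytes starting at offset 0x234 into a 4 KiB page straddles
	 * two page boundaries, so three pages are needed. */
	printf("%lu\n", iommu_num_pages(0x1234, 0x2000, 4096));
	return 0;
}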
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 4cd8083c58be..0cdcda35a05f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -212,7 +212,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) | |||
212 | /* Programs the physical address of the device table into the IOMMU hardware */ | 212 | /* Programs the physical address of the device table into the IOMMU hardware */ |
213 | static void __init iommu_set_device_table(struct amd_iommu *iommu) | 213 | static void __init iommu_set_device_table(struct amd_iommu *iommu) |
214 | { | 214 | { |
215 | u32 entry; | 215 | u64 entry; |
216 | 216 | ||
217 | BUG_ON(iommu->mmio_base == NULL); | 217 | BUG_ON(iommu->mmio_base == NULL); |
218 | 218 | ||
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index 21c831d96af3..04a7f960bbc0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@ | |||
23 | #include <linux/mc146818rtc.h> | 23 | #include <linux/mc146818rtc.h> |
24 | #include <linux/kernel_stat.h> | 24 | #include <linux/kernel_stat.h> |
25 | #include <linux/sysdev.h> | 25 | #include <linux/sysdev.h> |
26 | #include <linux/ioport.h> | ||
26 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
27 | #include <linux/clockchips.h> | 28 | #include <linux/clockchips.h> |
28 | #include <linux/acpi_pmtmr.h> | 29 | #include <linux/acpi_pmtmr.h> |
29 | #include <linux/module.h> | 30 | #include <linux/module.h> |
30 | #include <linux/dmi.h> | 31 | #include <linux/dmi.h> |
32 | #include <linux/dmar.h> | ||
31 | 33 | ||
32 | #include <asm/atomic.h> | 34 | #include <asm/atomic.h> |
33 | #include <asm/smp.h> | 35 | #include <asm/smp.h> |
@@ -36,8 +38,14 @@ | |||
36 | #include <asm/desc.h> | 38 | #include <asm/desc.h> |
37 | #include <asm/arch_hooks.h> | 39 | #include <asm/arch_hooks.h> |
38 | #include <asm/hpet.h> | 40 | #include <asm/hpet.h> |
41 | #include <asm/pgalloc.h> | ||
39 | #include <asm/i8253.h> | 42 | #include <asm/i8253.h> |
40 | #include <asm/nmi.h> | 43 | #include <asm/nmi.h> |
44 | #include <asm/idle.h> | ||
45 | #include <asm/proto.h> | ||
46 | #include <asm/timex.h> | ||
47 | #include <asm/apic.h> | ||
48 | #include <asm/i8259.h> | ||
41 | 49 | ||
42 | #include <mach_apic.h> | 50 | #include <mach_apic.h> |
43 | #include <mach_apicdef.h> | 51 | #include <mach_apicdef.h> |
@@ -50,16 +58,58 @@ | |||
50 | # error SPURIOUS_APIC_VECTOR definition error | 58 | # error SPURIOUS_APIC_VECTOR definition error |
51 | #endif | 59 | #endif |
52 | 60 | ||
53 | unsigned long mp_lapic_addr; | 61 | #ifdef CONFIG_X86_32 |
54 | |||
55 | /* | 62 | /* |
56 | * Knob to control our willingness to enable the local APIC. | 63 | * Knob to control our willingness to enable the local APIC. |
57 | * | 64 | * |
58 | * +1=force-enable | 65 | * +1=force-enable |
59 | */ | 66 | */ |
60 | static int force_enable_local_apic; | 67 | static int force_enable_local_apic; |
61 | int disable_apic; | 68 | /* |
69 | * APIC command line parameters | ||
70 | */ | ||
71 | static int __init parse_lapic(char *arg) | ||
72 | { | ||
73 | force_enable_local_apic = 1; | ||
74 | return 0; | ||
75 | } | ||
76 | early_param("lapic", parse_lapic); | ||
77 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ | ||
78 | static int enabled_via_apicbase; | ||
79 | |||
80 | #endif | ||
81 | |||
82 | #ifdef CONFIG_X86_64 | ||
83 | static int apic_calibrate_pmtmr __initdata; | ||
84 | static __init int setup_apicpmtimer(char *s) | ||
85 | { | ||
86 | apic_calibrate_pmtmr = 1; | ||
87 | notsc_setup(NULL); | ||
88 | return 0; | ||
89 | } | ||
90 | __setup("apicpmtimer", setup_apicpmtimer); | ||
91 | #endif | ||
92 | |||
93 | #ifdef CONFIG_X86_64 | ||
94 | #define HAVE_X2APIC | ||
95 | #endif | ||
96 | |||
97 | #ifdef HAVE_X2APIC | ||
98 | int x2apic; | ||
99 | /* x2apic enabled before OS handover */ | ||
100 | int x2apic_preenabled; | ||
101 | int disable_x2apic; | ||
102 | static __init int setup_nox2apic(char *str) | ||
103 | { | ||
104 | disable_x2apic = 1; | ||
105 | setup_clear_cpu_cap(X86_FEATURE_X2APIC); | ||
106 | return 0; | ||
107 | } | ||
108 | early_param("nox2apic", setup_nox2apic); | ||
109 | #endif | ||
62 | 110 | ||
111 | unsigned long mp_lapic_addr; | ||
112 | int disable_apic; | ||
63 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ | 113 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ |
64 | static int disable_apic_timer __cpuinitdata; | 114 | static int disable_apic_timer __cpuinitdata; |
65 | /* Local APIC timer works in C2 */ | 115 | /* Local APIC timer works in C2 */ |
@@ -110,9 +160,6 @@ static struct clock_event_device lapic_clockevent = { | |||
110 | }; | 160 | }; |
111 | static DEFINE_PER_CPU(struct clock_event_device, lapic_events); | 161 | static DEFINE_PER_CPU(struct clock_event_device, lapic_events); |
112 | 162 | ||
113 | /* Local APIC was disabled by the BIOS and enabled by the kernel */ | ||
114 | static int enabled_via_apicbase; | ||
115 | |||
116 | static unsigned long apic_phys; | 163 | static unsigned long apic_phys; |
117 | 164 | ||
118 | /* | 165 | /* |
@@ -202,6 +249,42 @@ static struct apic_ops xapic_ops = { | |||
202 | struct apic_ops __read_mostly *apic_ops = &xapic_ops; | 249 | struct apic_ops __read_mostly *apic_ops = &xapic_ops; |
203 | EXPORT_SYMBOL_GPL(apic_ops); | 250 | EXPORT_SYMBOL_GPL(apic_ops); |
204 | 251 | ||
252 | #ifdef HAVE_X2APIC | ||
253 | static void x2apic_wait_icr_idle(void) | ||
254 | { | ||
255 | /* no need to wait for icr idle in x2apic */ | ||
256 | return; | ||
257 | } | ||
258 | |||
259 | static u32 safe_x2apic_wait_icr_idle(void) | ||
260 | { | ||
261 | /* no need to wait for icr idle in x2apic */ | ||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | void x2apic_icr_write(u32 low, u32 id) | ||
266 | { | ||
267 | wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); | ||
268 | } | ||
269 | |||
270 | u64 x2apic_icr_read(void) | ||
271 | { | ||
272 | unsigned long val; | ||
273 | |||
274 | rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); | ||
275 | return val; | ||
276 | } | ||
277 | |||
278 | static struct apic_ops x2apic_ops = { | ||
279 | .read = native_apic_msr_read, | ||
280 | .write = native_apic_msr_write, | ||
281 | .icr_read = x2apic_icr_read, | ||
282 | .icr_write = x2apic_icr_write, | ||
283 | .wait_icr_idle = x2apic_wait_icr_idle, | ||
284 | .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, | ||
285 | }; | ||
286 | #endif | ||
287 | |||
205 | /** | 288 | /** |
206 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 | 289 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 |
207 | */ | 290 | */ |
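The x2apic_ops added in the hunk above replaces MMIO APIC accesses with MSR accesses; x2apic_icr_write() shows the mapping: each 16-byte xAPIC register offset corresponds to one MSR at APIC_BASE_MSR + (offset >> 4), and the formerly split 32-bit ICR halves become one 64-bit MSR with the destination ID in the upper word. A small worked example of that arithmetic (the ICR value itself is only an example, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t APIC_BASE_MSR = 0x800;  /* architectural base of the x2APIC MSR range */
	const uint32_t APIC_ICR      = 0x300;  /* xAPIC MMIO offset of the ICR */
	uint32_t low = 0x000C4500;             /* example: INIT IPI, level-triggered assert */
	uint32_t id  = 3;                      /* example destination APIC ID */
	uint64_t val = ((uint64_t)id << 32) | low;

	printf("x2APIC ICR is MSR %#x, wrmsr value %#llx\n",
	       APIC_BASE_MSR + (APIC_ICR >> 4), (unsigned long long)val);
	return 0;
}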
@@ -219,6 +302,7 @@ void __cpuinit enable_NMI_through_LVT0(void) | |||
219 | apic_write(APIC_LVT0, v); | 302 | apic_write(APIC_LVT0, v); |
220 | } | 303 | } |
221 | 304 | ||
305 | #ifdef CONFIG_X86_32 | ||
222 | /** | 306 | /** |
223 | * get_physical_broadcast - Get number of physical broadcast IDs | 307 | * get_physical_broadcast - Get number of physical broadcast IDs |
224 | */ | 308 | */ |
@@ -226,6 +310,7 @@ int get_physical_broadcast(void) | |||
226 | { | 310 | { |
227 | return modern_apic() ? 0xff : 0xf; | 311 | return modern_apic() ? 0xff : 0xf; |
228 | } | 312 | } |
313 | #endif | ||
229 | 314 | ||
230 | /** | 315 | /** |
231 | * lapic_get_maxlvt - get the maximum number of local vector table entries | 316 | * lapic_get_maxlvt - get the maximum number of local vector table entries |
@@ -247,11 +332,7 @@ int lapic_get_maxlvt(void) | |||
247 | */ | 332 | */ |
248 | 333 | ||
249 | /* Clock divisor */ | 334 | /* Clock divisor */ |
250 | #ifdef CONFG_X86_64 | ||
251 | #define APIC_DIVISOR 1 | ||
252 | #else | ||
253 | #define APIC_DIVISOR 16 | 335 | #define APIC_DIVISOR 16 |
254 | #endif | ||
255 | 336 | ||
256 | /* | 337 | /* |
257 | * This function sets up the local APIC timer, with a timeout of | 338 | * This function sets up the local APIC timer, with a timeout of |
@@ -383,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask) | |||
383 | * Setup the local APIC timer for this CPU. Copy the initilized values | 464 | * Setup the local APIC timer for this CPU. Copy the initilized values |
384 | * of the boot CPU and register the clock event in the framework. | 465 | * of the boot CPU and register the clock event in the framework. |
385 | */ | 466 | */ |
386 | static void __devinit setup_APIC_timer(void) | 467 | static void __cpuinit setup_APIC_timer(void) |
387 | { | 468 | { |
388 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | 469 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); |
389 | 470 | ||
@@ -453,14 +534,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) | |||
453 | } | 534 | } |
454 | } | 535 | } |
455 | 536 | ||
537 | static int __init calibrate_by_pmtimer(long deltapm, long *delta) | ||
538 | { | ||
539 | const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; | ||
540 | const long pm_thresh = pm_100ms / 100; | ||
541 | unsigned long mult; | ||
542 | u64 res; | ||
543 | |||
544 | #ifndef CONFIG_X86_PM_TIMER | ||
545 | return -1; | ||
546 | #endif | ||
547 | |||
548 | apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); | ||
549 | |||
550 | /* Check, if the PM timer is available */ | ||
551 | if (!deltapm) | ||
552 | return -1; | ||
553 | |||
554 | mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); | ||
555 | |||
556 | if (deltapm > (pm_100ms - pm_thresh) && | ||
557 | deltapm < (pm_100ms + pm_thresh)) { | ||
558 | apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); | ||
559 | } else { | ||
560 | res = (((u64)deltapm) * mult) >> 22; | ||
561 | do_div(res, 1000000); | ||
562 | printk(KERN_WARNING "APIC calibration not consistent " | ||
563 | "with PM Timer: %ldms instead of 100ms\n", | ||
564 | (long)res); | ||
565 | /* Correct the lapic counter value */ | ||
566 | res = (((u64)(*delta)) * pm_100ms); | ||
567 | do_div(res, deltapm); | ||
568 | printk(KERN_INFO "APIC delta adjusted to PM-Timer: " | ||
569 | "%lu (%ld)\n", (unsigned long)res, *delta); | ||
570 | *delta = (long)res; | ||
571 | } | ||
572 | |||
573 | return 0; | ||
574 | } | ||
575 | |||
456 | static int __init calibrate_APIC_clock(void) | 576 | static int __init calibrate_APIC_clock(void) |
457 | { | 577 | { |
458 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | 578 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); |
459 | const long pm_100ms = PMTMR_TICKS_PER_SEC/10; | ||
460 | const long pm_thresh = pm_100ms/100; | ||
461 | void (*real_handler)(struct clock_event_device *dev); | 579 | void (*real_handler)(struct clock_event_device *dev); |
462 | unsigned long deltaj; | 580 | unsigned long deltaj; |
463 | long delta, deltapm; | 581 | long delta; |
464 | int pm_referenced = 0; | 582 | int pm_referenced = 0; |
465 | 583 | ||
466 | local_irq_disable(); | 584 | local_irq_disable(); |
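A worked example of the rescaling done by calibrate_by_pmtimer(), added in the hunk above. The ACPI PM timer runs at PMTMR_TICKS_PER_SEC = 3579545 Hz, so 100 ms should read as roughly 357954 ticks; if the measured count is outside the 1% window, the LAPIC delta is scaled by pm_100ms/deltapm. The kernel does the millisecond conversion with clocksource_hz2mult() fixed-point math; here it is reduced to plain integer arithmetic, and the measured numbers are invented:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int64_t pm_hz    = 3579545;          /* PMTMR_TICKS_PER_SEC */
	const int64_t pm_100ms = pm_hz / 10;       /* ~357954 ticks expected in 100 ms */
	int64_t deltapm = 286364;                  /* PM ticks actually counted (~80 ms window) */
	int64_t delta   = 80000000;                /* LAPIC ticks counted in the same window */

	int64_t ms        = deltapm * 1000 / pm_hz;      /* what the warning would print */
	int64_t corrected = delta * pm_100ms / deltapm;  /* LAPIC ticks per true 100 ms */

	printf("window was ~%lld ms, LAPIC delta %lld -> corrected %lld\n",
	       (long long)ms, (long long)delta, (long long)corrected);
	return 0;
}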
@@ -470,10 +588,10 @@ static int __init calibrate_APIC_clock(void) | |||
470 | global_clock_event->event_handler = lapic_cal_handler; | 588 | global_clock_event->event_handler = lapic_cal_handler; |
471 | 589 | ||
472 | /* | 590 | /* |
473 | * Setup the APIC counter to 1e9. There is no way the lapic | 591 | * Setup the APIC counter to maximum. There is no way the lapic |
474 | * can underflow in the 100ms detection time frame | 592 | * can underflow in the 100ms detection time frame |
475 | */ | 593 | */ |
476 | __setup_APIC_LVTT(1000000000, 0, 0); | 594 | __setup_APIC_LVTT(0xffffffff, 0, 0); |
477 | 595 | ||
478 | /* Let the interrupts run */ | 596 | /* Let the interrupts run */ |
479 | local_irq_enable(); | 597 | local_irq_enable(); |
@@ -490,34 +608,9 @@ static int __init calibrate_APIC_clock(void) | |||
490 | delta = lapic_cal_t1 - lapic_cal_t2; | 608 | delta = lapic_cal_t1 - lapic_cal_t2; |
491 | apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); | 609 | apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); |
492 | 610 | ||
493 | /* Check, if the PM timer is available */ | 611 | /* we trust the PM based calibration if possible */ |
494 | deltapm = lapic_cal_pm2 - lapic_cal_pm1; | 612 | pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, |
495 | apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); | 613 | &delta); |
496 | |||
497 | if (deltapm) { | ||
498 | unsigned long mult; | ||
499 | u64 res; | ||
500 | |||
501 | mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); | ||
502 | |||
503 | if (deltapm > (pm_100ms - pm_thresh) && | ||
504 | deltapm < (pm_100ms + pm_thresh)) { | ||
505 | apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); | ||
506 | } else { | ||
507 | res = (((u64) deltapm) * mult) >> 22; | ||
508 | do_div(res, 1000000); | ||
509 | printk(KERN_WARNING "APIC calibration not consistent " | ||
510 | "with PM Timer: %ldms instead of 100ms\n", | ||
511 | (long)res); | ||
512 | /* Correct the lapic counter value */ | ||
513 | res = (((u64) delta) * pm_100ms); | ||
514 | do_div(res, deltapm); | ||
515 | printk(KERN_INFO "APIC delta adjusted to PM-Timer: " | ||
516 | "%lu (%ld)\n", (unsigned long) res, delta); | ||
517 | delta = (long) res; | ||
518 | } | ||
519 | pm_referenced = 1; | ||
520 | } | ||
521 | 614 | ||
522 | /* Calculate the scaled math multiplication factor */ | 615 | /* Calculate the scaled math multiplication factor */ |
523 | lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, | 616 | lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, |
@@ -559,7 +652,10 @@ static int __init calibrate_APIC_clock(void) | |||
559 | 652 | ||
560 | levt->features &= ~CLOCK_EVT_FEAT_DUMMY; | 653 | levt->features &= ~CLOCK_EVT_FEAT_DUMMY; |
561 | 654 | ||
562 | /* We trust the pm timer based calibration */ | 655 | /* |
656 | * PM timer calibration failed or not turned on | ||
657 | * so lets try APIC timer based calibration | ||
658 | */ | ||
563 | if (!pm_referenced) { | 659 | if (!pm_referenced) { |
564 | apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); | 660 | apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); |
565 | 661 | ||
@@ -652,7 +748,7 @@ void __init setup_boot_APIC_clock(void) | |||
652 | setup_APIC_timer(); | 748 | setup_APIC_timer(); |
653 | } | 749 | } |
654 | 750 | ||
655 | void __devinit setup_secondary_APIC_clock(void) | 751 | void __cpuinit setup_secondary_APIC_clock(void) |
656 | { | 752 | { |
657 | setup_APIC_timer(); | 753 | setup_APIC_timer(); |
658 | } | 754 | } |
@@ -718,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) | |||
718 | * Besides, if we don't timer interrupts ignore the global | 814 | * Besides, if we don't timer interrupts ignore the global |
719 | * interrupt lock, which is the WrongThing (tm) to do. | 815 | * interrupt lock, which is the WrongThing (tm) to do. |
720 | */ | 816 | */ |
817 | #ifdef CONFIG_X86_64 | ||
818 | exit_idle(); | ||
819 | #endif | ||
721 | irq_enter(); | 820 | irq_enter(); |
722 | local_apic_timer_interrupt(); | 821 | local_apic_timer_interrupt(); |
723 | irq_exit(); | 822 | irq_exit(); |
@@ -991,40 +1090,43 @@ void __init init_bsp_APIC(void) | |||
991 | 1090 | ||
992 | static void __cpuinit lapic_setup_esr(void) | 1091 | static void __cpuinit lapic_setup_esr(void) |
993 | { | 1092 | { |
994 | unsigned long oldvalue, value, maxlvt; | 1093 | unsigned int oldvalue, value, maxlvt; |
995 | if (lapic_is_integrated() && !esr_disable) { | 1094 | |
996 | if (esr_disable) { | 1095 | if (!lapic_is_integrated()) { |
997 | /* | 1096 | printk(KERN_INFO "No ESR for 82489DX.\n"); |
998 | * Something untraceable is creating bad interrupts on | 1097 | return; |
999 | * secondary quads ... for the moment, just leave the | 1098 | } |
1000 | * ESR disabled - we can't do anything useful with the | ||
1001 | * errors anyway - mbligh | ||
1002 | */ | ||
1003 | printk(KERN_INFO "Leaving ESR disabled.\n"); | ||
1004 | return; | ||
1005 | } | ||
1006 | /* !82489DX */ | ||
1007 | maxlvt = lapic_get_maxlvt(); | ||
1008 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
1009 | apic_write(APIC_ESR, 0); | ||
1010 | oldvalue = apic_read(APIC_ESR); | ||
1011 | 1099 | ||
1012 | /* enables sending errors */ | 1100 | if (esr_disable) { |
1013 | value = ERROR_APIC_VECTOR; | ||
1014 | apic_write(APIC_LVTERR, value); | ||
1015 | /* | 1101 | /* |
1016 | * spec says clear errors after enabling vector. | 1102 | * Something untraceable is creating bad interrupts on |
1103 | * secondary quads ... for the moment, just leave the | ||
1104 | * ESR disabled - we can't do anything useful with the | ||
1105 | * errors anyway - mbligh | ||
1017 | */ | 1106 | */ |
1018 | if (maxlvt > 3) | 1107 | printk(KERN_INFO "Leaving ESR disabled.\n"); |
1019 | apic_write(APIC_ESR, 0); | 1108 | return; |
1020 | value = apic_read(APIC_ESR); | ||
1021 | if (value != oldvalue) | ||
1022 | apic_printk(APIC_VERBOSE, "ESR value before enabling " | ||
1023 | "vector: 0x%08lx after: 0x%08lx\n", | ||
1024 | oldvalue, value); | ||
1025 | } else { | ||
1026 | printk(KERN_INFO "No ESR for 82489DX.\n"); | ||
1027 | } | 1109 | } |
1110 | |||
1111 | maxlvt = lapic_get_maxlvt(); | ||
1112 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
1113 | apic_write(APIC_ESR, 0); | ||
1114 | oldvalue = apic_read(APIC_ESR); | ||
1115 | |||
1116 | /* enables sending errors */ | ||
1117 | value = ERROR_APIC_VECTOR; | ||
1118 | apic_write(APIC_LVTERR, value); | ||
1119 | |||
1120 | /* | ||
1121 | * spec says clear errors after enabling vector. | ||
1122 | */ | ||
1123 | if (maxlvt > 3) | ||
1124 | apic_write(APIC_ESR, 0); | ||
1125 | value = apic_read(APIC_ESR); | ||
1126 | if (value != oldvalue) | ||
1127 | apic_printk(APIC_VERBOSE, "ESR value before enabling " | ||
1128 | "vector: 0x%08x after: 0x%08x\n", | ||
1129 | oldvalue, value); | ||
1028 | } | 1130 | } |
1029 | 1131 | ||
1030 | 1132 | ||
@@ -1033,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void) | |||
1033 | */ | 1135 | */ |
1034 | void __cpuinit setup_local_APIC(void) | 1136 | void __cpuinit setup_local_APIC(void) |
1035 | { | 1137 | { |
1036 | unsigned long value, integrated; | 1138 | unsigned int value; |
1037 | int i, j; | 1139 | int i, j; |
1038 | 1140 | ||
1141 | #ifdef CONFIG_X86_32 | ||
1039 | /* Pound the ESR really hard over the head with a big hammer - mbligh */ | 1142 | /* Pound the ESR really hard over the head with a big hammer - mbligh */ |
1040 | if (esr_disable) { | 1143 | if (lapic_is_integrated() && esr_disable) { |
1041 | apic_write(APIC_ESR, 0); | 1144 | apic_write(APIC_ESR, 0); |
1042 | apic_write(APIC_ESR, 0); | 1145 | apic_write(APIC_ESR, 0); |
1043 | apic_write(APIC_ESR, 0); | 1146 | apic_write(APIC_ESR, 0); |
1044 | apic_write(APIC_ESR, 0); | 1147 | apic_write(APIC_ESR, 0); |
1045 | } | 1148 | } |
1149 | #endif | ||
1046 | 1150 | ||
1047 | integrated = lapic_is_integrated(); | 1151 | preempt_disable(); |
1048 | 1152 | ||
1049 | /* | 1153 | /* |
1050 | * Double-check whether this APIC is really registered. | 1154 | * Double-check whether this APIC is really registered. |
1155 | * This is meaningless in clustered apic mode, so we skip it. | ||
1051 | */ | 1156 | */ |
1052 | if (!apic_id_registered()) | 1157 | if (!apic_id_registered()) |
1053 | WARN_ON_ONCE(1); | 1158 | BUG(); |
1054 | 1159 | ||
1055 | /* | 1160 | /* |
1056 | * Intel recommends to set DFR, LDR and TPR before enabling | 1161 | * Intel recommends to set DFR, LDR and TPR before enabling |
@@ -1096,6 +1201,7 @@ void __cpuinit setup_local_APIC(void) | |||
1096 | */ | 1201 | */ |
1097 | value |= APIC_SPIV_APIC_ENABLED; | 1202 | value |= APIC_SPIV_APIC_ENABLED; |
1098 | 1203 | ||
1204 | #ifdef CONFIG_X86_32 | ||
1099 | /* | 1205 | /* |
1100 | * Some unknown Intel IO/APIC (or APIC) errata is biting us with | 1206 | * Some unknown Intel IO/APIC (or APIC) errata is biting us with |
1101 | * certain networking cards. If high frequency interrupts are | 1207 | * certain networking cards. If high frequency interrupts are |
@@ -1116,8 +1222,13 @@ void __cpuinit setup_local_APIC(void) | |||
1116 | * See also the comment in end_level_ioapic_irq(). --macro | 1222 | * See also the comment in end_level_ioapic_irq(). --macro |
1117 | */ | 1223 | */ |
1118 | 1224 | ||
1119 | /* Enable focus processor (bit==0) */ | 1225 | /* |
1226 | * - enable focus processor (bit==0) | ||
1227 | * - 64bit mode always use processor focus | ||
1228 | * so no need to set it | ||
1229 | */ | ||
1120 | value &= ~APIC_SPIV_FOCUS_DISABLED; | 1230 | value &= ~APIC_SPIV_FOCUS_DISABLED; |
1231 | #endif | ||
1121 | 1232 | ||
1122 | /* | 1233 | /* |
1123 | * Set spurious IRQ vector | 1234 | * Set spurious IRQ vector |
@@ -1154,9 +1265,11 @@ void __cpuinit setup_local_APIC(void) | |||
1154 | value = APIC_DM_NMI; | 1265 | value = APIC_DM_NMI; |
1155 | else | 1266 | else |
1156 | value = APIC_DM_NMI | APIC_LVT_MASKED; | 1267 | value = APIC_DM_NMI | APIC_LVT_MASKED; |
1157 | if (!integrated) /* 82489DX */ | 1268 | if (!lapic_is_integrated()) /* 82489DX */ |
1158 | value |= APIC_LVT_LEVEL_TRIGGER; | 1269 | value |= APIC_LVT_LEVEL_TRIGGER; |
1159 | apic_write(APIC_LVT1, value); | 1270 | apic_write(APIC_LVT1, value); |
1271 | |||
1272 | preempt_enable(); | ||
1160 | } | 1273 | } |
1161 | 1274 | ||
1162 | void __cpuinit end_local_APIC_setup(void) | 1275 | void __cpuinit end_local_APIC_setup(void) |
@@ -1177,6 +1290,153 @@ void __cpuinit end_local_APIC_setup(void) | |||
1177 | apic_pm_activate(); | 1290 | apic_pm_activate(); |
1178 | } | 1291 | } |
1179 | 1292 | ||
1293 | #ifdef HAVE_X2APIC | ||
1294 | void check_x2apic(void) | ||
1295 | { | ||
1296 | int msr, msr2; | ||
1297 | |||
1298 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | ||
1299 | |||
1300 | if (msr & X2APIC_ENABLE) { | ||
1301 | printk("x2apic enabled by BIOS, switching to x2apic ops\n"); | ||
1302 | x2apic_preenabled = x2apic = 1; | ||
1303 | apic_ops = &x2apic_ops; | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | void enable_x2apic(void) | ||
1308 | { | ||
1309 | int msr, msr2; | ||
1310 | |||
1311 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | ||
1312 | if (!(msr & X2APIC_ENABLE)) { | ||
1313 | printk("Enabling x2apic\n"); | ||
1314 | wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); | ||
1315 | } | ||
1316 | } | ||
1317 | |||
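For reference, a minimal user-space sketch of the IA32_APICBASE bit manipulation that check_x2apic() and enable_x2apic() above perform via rdmsr/wrmsr; the constants follow the architectural bit layout (bit 10 = x2APIC enable, bit 11 = global APIC enable) and the MSR value is made up:

/*
 * Illustration only, not part of the patch: bit 10 of IA32_APICBASE is
 * the x2APIC enable (EXTD) bit, bit 11 the global APIC enable bit.
 */
#include <stdio.h>

#define X2APIC_ENABLE		(1UL << 10)
#define APICBASE_ENABLE		(1UL << 11)

int main(void)
{
	unsigned long msr = APICBASE_ENABLE;	/* pretend the BIOS left x2APIC off */

	if (!(msr & X2APIC_ENABLE))
		msr |= X2APIC_ENABLE;		/* what enable_x2apic() does via wrmsr() */

	printf("IA32_APICBASE (low 32 bits): 0x%lx\n", msr);
	return 0;
}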
1318 | void enable_IR_x2apic(void) | ||
1319 | { | ||
1320 | #ifdef CONFIG_INTR_REMAP | ||
1321 | int ret; | ||
1322 | unsigned long flags; | ||
1323 | |||
1324 | if (!cpu_has_x2apic) | ||
1325 | return; | ||
1326 | |||
1327 | if (!x2apic_preenabled && disable_x2apic) { | ||
1328 | printk(KERN_INFO | ||
1329 | "Skipped enabling x2apic and Interrupt-remapping " | ||
1330 | "because of nox2apic\n"); | ||
1331 | return; | ||
1332 | } | ||
1333 | |||
1334 | if (x2apic_preenabled && disable_x2apic) | ||
1335 | panic("Bios already enabled x2apic, can't enforce nox2apic"); | ||
1336 | |||
1337 | if (!x2apic_preenabled && skip_ioapic_setup) { | ||
1338 | printk(KERN_INFO | ||
1339 | "Skipped enabling x2apic and Interrupt-remapping " | ||
1340 | "because of skipping io-apic setup\n"); | ||
1341 | return; | ||
1342 | } | ||
1343 | |||
1344 | ret = dmar_table_init(); | ||
1345 | if (ret) { | ||
1346 | printk(KERN_INFO | ||
1347 | "dmar_table_init() failed with %d:\n", ret); | ||
1348 | |||
1349 | if (x2apic_preenabled) | ||
1350 | panic("x2apic enabled by BIOS, but IR enabling failed"); | ||
1351 | else | ||
1352 | printk(KERN_INFO | ||
1353 | "Not enabling x2apic,Intr-remapping\n"); | ||
1354 | return; | ||
1355 | } | ||
1356 | |||
1357 | local_irq_save(flags); | ||
1358 | mask_8259A(); | ||
1359 | |||
1360 | ret = save_mask_IO_APIC_setup(); | ||
1361 | if (ret) { | ||
1362 | printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); | ||
1363 | goto end; | ||
1364 | } | ||
1365 | |||
1366 | ret = enable_intr_remapping(1); | ||
1367 | |||
1368 | if (ret && x2apic_preenabled) { | ||
1369 | local_irq_restore(flags); | ||
1370 | panic("x2apic enabled by BIOS, but IR enabling failed"); | ||
1371 | } | ||
1372 | |||
1373 | if (ret) | ||
1374 | goto end_restore; | ||
1375 | |||
1376 | if (!x2apic) { | ||
1377 | x2apic = 1; | ||
1378 | apic_ops = &x2apic_ops; | ||
1379 | enable_x2apic(); | ||
1380 | } | ||
1381 | |||
1382 | end_restore: | ||
1383 | if (ret) | ||
1384 | /* | ||
1385 | * IR enabling failed | ||
1386 | */ | ||
1387 | restore_IO_APIC_setup(); | ||
1388 | else | ||
1389 | reinit_intr_remapped_IO_APIC(x2apic_preenabled); | ||
1390 | |||
1391 | end: | ||
1392 | unmask_8259A(); | ||
1393 | local_irq_restore(flags); | ||
1394 | |||
1395 | if (!ret) { | ||
1396 | if (!x2apic_preenabled) | ||
1397 | printk(KERN_INFO | ||
1398 | "Enabled x2apic and interrupt-remapping\n"); | ||
1399 | else | ||
1400 | printk(KERN_INFO | ||
1401 | "Enabled Interrupt-remapping\n"); | ||
1402 | } else | ||
1403 | printk(KERN_ERR | ||
1404 | "Failed to enable Interrupt-remapping and x2apic\n"); | ||
1405 | #else | ||
1406 | if (!cpu_has_x2apic) | ||
1407 | return; | ||
1408 | |||
1409 | if (x2apic_preenabled) | ||
1410 | panic("x2apic enabled prior to OS handover," | ||
1411 | " enable CONFIG_INTR_REMAP"); | ||
1412 | |||
1413 | printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " | ||
1414 | " and x2apic\n"); | ||
1415 | #endif | ||
1416 | |||
1417 | return; | ||
1418 | } | ||
1419 | #endif /* HAVE_X2APIC */ | ||
1420 | |||
1421 | #ifdef CONFIG_X86_64 | ||
1422 | /* | ||
1423 | * Detect and enable local APICs on non-SMP boards. | ||
1424 | * Original code written by Keir Fraser. | ||
1425 | * On AMD64 we trust the BIOS - if it says no APIC it is likely | ||
1426 | * not correctly set up (usually the APIC timer won't work etc.) | ||
1427 | */ | ||
1428 | static int __init detect_init_APIC(void) | ||
1429 | { | ||
1430 | if (!cpu_has_apic) { | ||
1431 | printk(KERN_INFO "No local APIC present\n"); | ||
1432 | return -1; | ||
1433 | } | ||
1434 | |||
1435 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
1436 | boot_cpu_physical_apicid = 0; | ||
1437 | return 0; | ||
1438 | } | ||
1439 | #else | ||
1180 | /* | 1440 | /* |
1181 | * Detect and initialize APIC | 1441 | * Detect and initialize APIC |
1182 | */ | 1442 | */ |
@@ -1255,12 +1515,46 @@ no_apic: | |||
1255 | printk(KERN_INFO "No local APIC present or hardware disabled\n"); | 1515 | printk(KERN_INFO "No local APIC present or hardware disabled\n"); |
1256 | return -1; | 1516 | return -1; |
1257 | } | 1517 | } |
1518 | #endif | ||
1519 | |||
1520 | #ifdef CONFIG_X86_64 | ||
1521 | void __init early_init_lapic_mapping(void) | ||
1522 | { | ||
1523 | unsigned long phys_addr; | ||
1524 | |||
1525 | /* | ||
1526 | * If no local APIC can be found then bail out: | ||
1527 | * it means there is no MP table and no MADT | ||
1528 | */ | ||
1529 | if (!smp_found_config) | ||
1530 | return; | ||
1531 | |||
1532 | phys_addr = mp_lapic_addr; | ||
1533 | |||
1534 | set_fixmap_nocache(FIX_APIC_BASE, phys_addr); | ||
1535 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | ||
1536 | APIC_BASE, phys_addr); | ||
1537 | |||
1538 | /* | ||
1539 | * Fetch the APIC ID of the BSP in case we have a | ||
1540 | * default configuration (or the MP table is broken). | ||
1541 | */ | ||
1542 | boot_cpu_physical_apicid = read_apic_id(); | ||
1543 | } | ||
1544 | #endif | ||
1258 | 1545 | ||
1259 | /** | 1546 | /** |
1260 | * init_apic_mappings - initialize APIC mappings | 1547 | * init_apic_mappings - initialize APIC mappings |
1261 | */ | 1548 | */ |
1262 | void __init init_apic_mappings(void) | 1549 | void __init init_apic_mappings(void) |
1263 | { | 1550 | { |
1551 | #ifdef HAVE_X2APIC | ||
1552 | if (x2apic) { | ||
1553 | boot_cpu_physical_apicid = read_apic_id(); | ||
1554 | return; | ||
1555 | } | ||
1556 | #endif | ||
1557 | |||
1264 | /* | 1558 | /* |
1265 | * If no local APIC can be found then set up a fake all | 1559 | * If no local APIC can be found then set up a fake all |
1266 | * zeroes page to simulate the local APIC and another | 1560 | * zeroes page to simulate the local APIC and another |
@@ -1273,8 +1567,8 @@ void __init init_apic_mappings(void) | |||
1273 | apic_phys = mp_lapic_addr; | 1567 | apic_phys = mp_lapic_addr; |
1274 | 1568 | ||
1275 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | 1569 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); |
1276 | printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, | 1570 | apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", |
1277 | apic_phys); | 1571 | APIC_BASE, apic_phys); |
1278 | 1572 | ||
1279 | /* | 1573 | /* |
1280 | * Fetch the APIC ID of the BSP in case we have a | 1574 | * Fetch the APIC ID of the BSP in case we have a |
@@ -1282,18 +1576,27 @@ void __init init_apic_mappings(void) | |||
1282 | */ | 1576 | */ |
1283 | if (boot_cpu_physical_apicid == -1U) | 1577 | if (boot_cpu_physical_apicid == -1U) |
1284 | boot_cpu_physical_apicid = read_apic_id(); | 1578 | boot_cpu_physical_apicid = read_apic_id(); |
1285 | |||
1286 | } | 1579 | } |
1287 | 1580 | ||
1288 | /* | 1581 | /* |
1289 | * This initializes the IO-APIC and APIC hardware if this is | 1582 | * This initializes the IO-APIC and APIC hardware if this is |
1290 | * a UP kernel. | 1583 | * a UP kernel. |
1291 | */ | 1584 | */ |
1292 | |||
1293 | int apic_version[MAX_APICS]; | 1585 | int apic_version[MAX_APICS]; |
1294 | 1586 | ||
1295 | int __init APIC_init_uniprocessor(void) | 1587 | int __init APIC_init_uniprocessor(void) |
1296 | { | 1588 | { |
1589 | #ifdef CONFIG_X86_64 | ||
1590 | if (disable_apic) { | ||
1591 | printk(KERN_INFO "Apic disabled\n"); | ||
1592 | return -1; | ||
1593 | } | ||
1594 | if (!cpu_has_apic) { | ||
1595 | disable_apic = 1; | ||
1596 | printk(KERN_INFO "Apic disabled by BIOS\n"); | ||
1597 | return -1; | ||
1598 | } | ||
1599 | #else | ||
1297 | if (!smp_found_config && !cpu_has_apic) | 1600 | if (!smp_found_config && !cpu_has_apic) |
1298 | return -1; | 1601 | return -1; |
1299 | 1602 | ||
@@ -1302,39 +1605,68 @@ int __init APIC_init_uniprocessor(void) | |||
1302 | */ | 1605 | */ |
1303 | if (!cpu_has_apic && | 1606 | if (!cpu_has_apic && |
1304 | APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { | 1607 | APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { |
1305 | printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", | 1608 | printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", |
1306 | boot_cpu_physical_apicid); | 1609 | boot_cpu_physical_apicid); |
1307 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); | 1610 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); |
1308 | return -1; | 1611 | return -1; |
1309 | } | 1612 | } |
1613 | #endif | ||
1310 | 1614 | ||
1311 | verify_local_APIC(); | 1615 | #ifdef HAVE_X2APIC |
1616 | enable_IR_x2apic(); | ||
1617 | #endif | ||
1618 | #ifdef CONFIG_X86_64 | ||
1619 | setup_apic_routing(); | ||
1620 | #endif | ||
1312 | 1621 | ||
1622 | verify_local_APIC(); | ||
1313 | connect_bsp_APIC(); | 1623 | connect_bsp_APIC(); |
1314 | 1624 | ||
1625 | #ifdef CONFIG_X86_64 | ||
1626 | apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); | ||
1627 | #else | ||
1315 | /* | 1628 | /* |
1316 | * Hack: In case of kdump, after a crash, kernel might be booting | 1629 | * Hack: In case of kdump, after a crash, kernel might be booting |
1317 | * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid | 1630 | * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid |
1318 | * might be zero if read from MP tables. Get it from LAPIC. | 1631 | * might be zero if read from MP tables. Get it from LAPIC. |
1319 | */ | 1632 | */ |
1320 | #ifdef CONFIG_CRASH_DUMP | 1633 | # ifdef CONFIG_CRASH_DUMP |
1321 | boot_cpu_physical_apicid = read_apic_id(); | 1634 | boot_cpu_physical_apicid = read_apic_id(); |
1635 | # endif | ||
1322 | #endif | 1636 | #endif |
1323 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); | 1637 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); |
1324 | |||
1325 | setup_local_APIC(); | 1638 | setup_local_APIC(); |
1326 | 1639 | ||
1640 | #ifdef CONFIG_X86_64 | ||
1641 | /* | ||
1642 | * Now enable IO-APICs, actually call clear_IO_APIC | ||
1643 | * We need clear_IO_APIC before enabling vector on BP | ||
1644 | */ | ||
1645 | if (!skip_ioapic_setup && nr_ioapics) | ||
1646 | enable_IO_APIC(); | ||
1647 | #endif | ||
1648 | |||
1327 | #ifdef CONFIG_X86_IO_APIC | 1649 | #ifdef CONFIG_X86_IO_APIC |
1328 | if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) | 1650 | if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) |
1329 | #endif | 1651 | #endif |
1330 | localise_nmi_watchdog(); | 1652 | localise_nmi_watchdog(); |
1331 | end_local_APIC_setup(); | 1653 | end_local_APIC_setup(); |
1654 | |||
1332 | #ifdef CONFIG_X86_IO_APIC | 1655 | #ifdef CONFIG_X86_IO_APIC |
1333 | if (smp_found_config) | 1656 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) |
1334 | if (!skip_ioapic_setup && nr_ioapics) | 1657 | setup_IO_APIC(); |
1335 | setup_IO_APIC(); | 1658 | # ifdef CONFIG_X86_64 |
1659 | else | ||
1660 | nr_ioapics = 0; | ||
1661 | # endif | ||
1336 | #endif | 1662 | #endif |
1663 | |||
1664 | #ifdef CONFIG_X86_64 | ||
1665 | setup_boot_APIC_clock(); | ||
1666 | check_nmi_watchdog(); | ||
1667 | #else | ||
1337 | setup_boot_clock(); | 1668 | setup_boot_clock(); |
1669 | #endif | ||
1338 | 1670 | ||
1339 | return 0; | 1671 | return 0; |
1340 | } | 1672 | } |
@@ -1348,8 +1680,11 @@ int __init APIC_init_uniprocessor(void) | |||
1348 | */ | 1680 | */ |
1349 | void smp_spurious_interrupt(struct pt_regs *regs) | 1681 | void smp_spurious_interrupt(struct pt_regs *regs) |
1350 | { | 1682 | { |
1351 | unsigned long v; | 1683 | u32 v; |
1352 | 1684 | ||
1685 | #ifdef CONFIG_X86_64 | ||
1686 | exit_idle(); | ||
1687 | #endif | ||
1353 | irq_enter(); | 1688 | irq_enter(); |
1354 | /* | 1689 | /* |
1355 | * Check if this really is a spurious interrupt and ACK it | 1690 | * Check if this really is a spurious interrupt and ACK it |
@@ -1360,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs) | |||
1360 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | 1695 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) |
1361 | ack_APIC_irq(); | 1696 | ack_APIC_irq(); |
1362 | 1697 | ||
1698 | #ifdef CONFIG_X86_64 | ||
1699 | add_pda(irq_spurious_count, 1); | ||
1700 | #else | ||
1363 | /* see sw-dev-man vol 3, chapter 7.4.13.5 */ | 1701 | /* see sw-dev-man vol 3, chapter 7.4.13.5 */ |
1364 | printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " | 1702 | printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " |
1365 | "should never happen.\n", smp_processor_id()); | 1703 | "should never happen.\n", smp_processor_id()); |
1366 | __get_cpu_var(irq_stat).irq_spurious_count++; | 1704 | __get_cpu_var(irq_stat).irq_spurious_count++; |
1705 | #endif | ||
1367 | irq_exit(); | 1706 | irq_exit(); |
1368 | } | 1707 | } |
1369 | 1708 | ||
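As an aside, the spurious-interrupt test above relies on the conventional mapping of a vector to an in-service (ISR) register and bit; a standalone sketch under that assumption (eight 32-bit ISR registers spaced 0x10 apart starting at offset 0x100, with SPURIOUS_APIC_VECTOR taken as 0xff for the example):

/*
 * Illustration only, not part of the patch: which ISR register and bit
 * the "v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))" check inspects.
 */
#include <stdio.h>

#define APIC_ISR	0x100	/* first of eight 32-bit in-service registers */

int main(void)
{
	unsigned int vector = 0xff;	/* assumed SPURIOUS_APIC_VECTOR */
	unsigned int reg = APIC_ISR + ((vector >> 5) * 0x10);
	unsigned int bit = vector & 0x1f;

	printf("vector 0x%02x -> ISR register 0x%03x, bit %u\n", vector, reg, bit);
	return 0;
}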
@@ -1372,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs) | |||
1372 | */ | 1711 | */ |
1373 | void smp_error_interrupt(struct pt_regs *regs) | 1712 | void smp_error_interrupt(struct pt_regs *regs) |
1374 | { | 1713 | { |
1375 | unsigned long v, v1; | 1714 | u32 v, v1; |
1376 | 1715 | ||
1716 | #ifdef CONFIG_X86_64 | ||
1717 | exit_idle(); | ||
1718 | #endif | ||
1377 | irq_enter(); | 1719 | irq_enter(); |
1378 | /* First tickle the hardware, only then report what went on. -- REW */ | 1720 | /* First tickle the hardware, only then report what went on. -- REW */ |
1379 | v = apic_read(APIC_ESR); | 1721 | v = apic_read(APIC_ESR); |
@@ -1392,7 +1734,7 @@ void smp_error_interrupt(struct pt_regs *regs) | |||
1392 | 6: Received illegal vector | 1734 | 6: Received illegal vector |
1393 | 7: Illegal register address | 1735 | 7: Illegal register address |
1394 | */ | 1736 | */ |
1395 | printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", | 1737 | printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", |
1396 | smp_processor_id(), v, v1); | 1738 | smp_processor_id(), v, v1);
1397 | irq_exit(); | 1739 | irq_exit(); |
1398 | } | 1740 | } |
@@ -1565,6 +1907,13 @@ void __cpuinit generic_processor_info(int apicid, int version) | |||
1565 | cpu_set(cpu, cpu_present_map); | 1907 | cpu_set(cpu, cpu_present_map); |
1566 | } | 1908 | } |
1567 | 1909 | ||
1910 | #ifdef CONFIG_X86_64 | ||
1911 | int hard_smp_processor_id(void) | ||
1912 | { | ||
1913 | return read_apic_id(); | ||
1914 | } | ||
1915 | #endif | ||
1916 | |||
1568 | /* | 1917 | /* |
1569 | * Power management | 1918 | * Power management |
1570 | */ | 1919 | */ |
@@ -1640,7 +1989,7 @@ static int lapic_resume(struct sys_device *dev) | |||
1640 | 1989 | ||
1641 | local_irq_save(flags); | 1990 | local_irq_save(flags); |
1642 | 1991 | ||
1643 | #ifdef CONFIG_X86_64 | 1992 | #ifdef HAVE_X2APIC |
1644 | if (x2apic) | 1993 | if (x2apic) |
1645 | enable_x2apic(); | 1994 | enable_x2apic(); |
1646 | else | 1995 | else |
@@ -1702,7 +2051,7 @@ static struct sys_device device_lapic = { | |||
1702 | .cls = &lapic_sysclass, | 2051 | .cls = &lapic_sysclass, |
1703 | }; | 2052 | }; |
1704 | 2053 | ||
1705 | static void __devinit apic_pm_activate(void) | 2054 | static void __cpuinit apic_pm_activate(void) |
1706 | { | 2055 | { |
1707 | apic_pm_state.active = 1; | 2056 | apic_pm_state.active = 1; |
1708 | } | 2057 | } |
@@ -1728,16 +2077,87 @@ static void apic_pm_activate(void) { } | |||
1728 | 2077 | ||
1729 | #endif /* CONFIG_PM */ | 2078 | #endif /* CONFIG_PM */ |
1730 | 2079 | ||
2080 | #ifdef CONFIG_X86_64 | ||
1731 | /* | 2081 | /* |
1732 | * APIC command line parameters | 2082 | * apic_is_clustered_box() -- Check if we can expect good TSC |
2083 | * | ||
2084 | * Thus far, the major user of this is IBM's Summit2 series: | ||
2085 | * | ||
2086 | * Clustered boxes may have unsynced TSC problems if they are | ||
2087 | * multi-chassis. Use available data to take a good guess. | ||
2088 | * If in doubt, go HPET. | ||
1733 | */ | 2089 | */ |
1734 | static int __init parse_lapic(char *arg) | 2090 | __cpuinit int apic_is_clustered_box(void) |
1735 | { | 2091 | { |
1736 | force_enable_local_apic = 1; | 2092 | int i, clusters, zeros; |
1737 | return 0; | 2093 | unsigned id; |
2094 | u16 *bios_cpu_apicid; | ||
2095 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | ||
2096 | |||
2097 | /* | ||
2098 | * There are no boxes of this kind with AMD CPUs yet. | ||
2099 | * Some AMD boxes with quad-core CPUs and 8 sockets have | ||
2100 | * APIC IDs in [4, 0x23] or [8, 0x27] and could be mistaken | ||
2101 | * for a vSMP box; this still needs checking... | ||
2102 | */ | ||
2103 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) | ||
2104 | return 0; | ||
2105 | |||
2106 | bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); | ||
2107 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); | ||
2108 | |||
2109 | for (i = 0; i < NR_CPUS; i++) { | ||
2110 | /* are we being called early in kernel startup? */ | ||
2111 | if (bios_cpu_apicid) { | ||
2112 | id = bios_cpu_apicid[i]; | ||
2113 | } | ||
2114 | else if (i < nr_cpu_ids) { | ||
2115 | if (cpu_present(i)) | ||
2116 | id = per_cpu(x86_bios_cpu_apicid, i); | ||
2117 | else | ||
2118 | continue; | ||
2119 | } | ||
2120 | else | ||
2121 | break; | ||
2122 | |||
2123 | if (id != BAD_APICID) | ||
2124 | __set_bit(APIC_CLUSTERID(id), clustermap); | ||
2125 | } | ||
2126 | |||
2127 | /* Problem: Partially populated chassis may not have CPUs in some of | ||
2128 | * the APIC clusters they have been allocated. Only present CPUs have | ||
2129 | * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. | ||
2130 | * Since clusters are allocated sequentially, count zeros only if | ||
2131 | * they are bounded by ones. | ||
2132 | */ | ||
2133 | clusters = 0; | ||
2134 | zeros = 0; | ||
2135 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
2136 | if (test_bit(i, clustermap)) { | ||
2137 | clusters += 1 + zeros; | ||
2138 | zeros = 0; | ||
2139 | } else | ||
2140 | ++zeros; | ||
2141 | } | ||
2142 | |||
2143 | /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are | ||
2144 | * not guaranteed to be synced between boards | ||
2145 | */ | ||
2146 | if (is_vsmp_box() && clusters > 1) | ||
2147 | return 1; | ||
2148 | |||
2149 | /* | ||
2150 | * If clusters > 2, then should be multi-chassis. | ||
2151 | * May have to revisit this when multi-core + hyperthreaded CPUs come | ||
2152 | * out, but AFAIK this will work even for them. | ||
2153 | */ | ||
2154 | return (clusters > 2); | ||
1738 | } | 2155 | } |
1739 | early_param("lapic", parse_lapic); | 2156 | #endif |
1740 | 2157 | ||
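For illustration, the cluster-counting rule in apic_is_clustered_box() above (count gaps in the cluster bitmap only when they are bounded by populated clusters) can be exercised in isolation; the helper name, bitmap size and contents below are hypothetical:

/*
 * Standalone sketch of the "count zeros only if bounded by ones" rule;
 * NUM_CLUSTERS and the example bitmap are made up.
 */
#include <stdio.h>

#define NUM_CLUSTERS 8

static int count_clusters(const int clustermap[NUM_CLUSTERS])
{
	int i, clusters = 0, zeros = 0;

	for (i = 0; i < NUM_CLUSTERS; i++) {
		if (clustermap[i]) {
			/* a populated cluster also claims the gap before it */
			clusters += 1 + zeros;
			zeros = 0;
		} else {
			++zeros;
		}
	}
	return clusters;	/* trailing zeros are never counted */
}

int main(void)
{
	int map[NUM_CLUSTERS] = { 1, 0, 0, 1, 0, 0, 0, 0 };

	/* clusters 0 and 3 are populated; the gap between them counts: prints 4 */
	printf("clusters = %d\n", count_clusters(map));
	return 0;
}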
2158 | /* | ||
2159 | * APIC command line parameters | ||
2160 | */ | ||
1741 | static int __init setup_disableapic(char *arg) | 2161 | static int __init setup_disableapic(char *arg) |
1742 | { | 2162 | { |
1743 | disable_apic = 1; | 2163 | disable_apic = 1; |
@@ -1779,7 +2199,6 @@ static int __init apic_set_verbosity(char *arg) | |||
1779 | if (!arg) { | 2199 | if (!arg) { |
1780 | #ifdef CONFIG_X86_64 | 2200 | #ifdef CONFIG_X86_64 |
1781 | skip_ioapic_setup = 0; | 2201 | skip_ioapic_setup = 0; |
1782 | ioapic_force = 1; | ||
1783 | return 0; | 2202 | return 0; |
1784 | #endif | 2203 | #endif |
1785 | return -EINVAL; | 2204 | return -EINVAL; |
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c deleted file mode 100644 index 94ddb69ae15e..000000000000 --- a/arch/x86/kernel/apic_64.c +++ /dev/null | |||
@@ -1,1848 +0,0 @@ | |||
1 | /* | ||
2 | * Local APIC handling, local APIC timers | ||
3 | * | ||
4 | * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> | ||
5 | * | ||
6 | * Fixes | ||
7 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
8 | * thanks to Eric Gilmore | ||
9 | * and Rolf G. Tews | ||
10 | * for testing these extensively. | ||
11 | * Maciej W. Rozycki : Various updates and fixes. | ||
12 | * Mikael Pettersson : Power Management for UP-APIC. | ||
13 | * Pavel Machek and | ||
14 | * Mikael Pettersson : PM converted to driver model. | ||
15 | */ | ||
16 | |||
17 | #include <linux/init.h> | ||
18 | |||
19 | #include <linux/mm.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/bootmem.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | #include <linux/mc146818rtc.h> | ||
24 | #include <linux/kernel_stat.h> | ||
25 | #include <linux/sysdev.h> | ||
26 | #include <linux/ioport.h> | ||
27 | #include <linux/clockchips.h> | ||
28 | #include <linux/acpi_pmtmr.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/dmar.h> | ||
31 | |||
32 | #include <asm/atomic.h> | ||
33 | #include <asm/smp.h> | ||
34 | #include <asm/mtrr.h> | ||
35 | #include <asm/mpspec.h> | ||
36 | #include <asm/hpet.h> | ||
37 | #include <asm/pgalloc.h> | ||
38 | #include <asm/nmi.h> | ||
39 | #include <asm/idle.h> | ||
40 | #include <asm/proto.h> | ||
41 | #include <asm/timex.h> | ||
42 | #include <asm/apic.h> | ||
43 | #include <asm/i8259.h> | ||
44 | |||
45 | #include <mach_ipi.h> | ||
46 | #include <mach_apic.h> | ||
47 | |||
48 | /* Disable local APIC timer from the kernel commandline or via dmi quirk */ | ||
49 | static int disable_apic_timer __cpuinitdata; | ||
50 | static int apic_calibrate_pmtmr __initdata; | ||
51 | int disable_apic; | ||
52 | int disable_x2apic; | ||
53 | int x2apic; | ||
54 | |||
55 | /* x2apic enabled before OS handover */ | ||
56 | int x2apic_preenabled; | ||
57 | |||
58 | /* Local APIC timer works in C2 */ | ||
59 | int local_apic_timer_c2_ok; | ||
60 | EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); | ||
61 | |||
62 | /* | ||
63 | * Debug level, exported for io_apic.c | ||
64 | */ | ||
65 | unsigned int apic_verbosity; | ||
66 | |||
67 | /* Have we found an MP table */ | ||
68 | int smp_found_config; | ||
69 | |||
70 | static struct resource lapic_resource = { | ||
71 | .name = "Local APIC", | ||
72 | .flags = IORESOURCE_MEM | IORESOURCE_BUSY, | ||
73 | }; | ||
74 | |||
75 | static unsigned int calibration_result; | ||
76 | |||
77 | static int lapic_next_event(unsigned long delta, | ||
78 | struct clock_event_device *evt); | ||
79 | static void lapic_timer_setup(enum clock_event_mode mode, | ||
80 | struct clock_event_device *evt); | ||
81 | static void lapic_timer_broadcast(cpumask_t mask); | ||
82 | static void apic_pm_activate(void); | ||
83 | |||
84 | /* | ||
85 | * The local apic timer can be used for any function which is CPU local. | ||
86 | */ | ||
87 | static struct clock_event_device lapic_clockevent = { | ||
88 | .name = "lapic", | ||
89 | .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | ||
90 | | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, | ||
91 | .shift = 32, | ||
92 | .set_mode = lapic_timer_setup, | ||
93 | .set_next_event = lapic_next_event, | ||
94 | .broadcast = lapic_timer_broadcast, | ||
95 | .rating = 100, | ||
96 | .irq = -1, | ||
97 | }; | ||
98 | static DEFINE_PER_CPU(struct clock_event_device, lapic_events); | ||
99 | |||
100 | static unsigned long apic_phys; | ||
101 | |||
102 | unsigned long mp_lapic_addr; | ||
103 | |||
104 | /* | ||
105 | * Get the LAPIC version | ||
106 | */ | ||
107 | static inline int lapic_get_version(void) | ||
108 | { | ||
109 | return GET_APIC_VERSION(apic_read(APIC_LVR)); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Check, if the APIC is integrated or a separate chip | ||
114 | */ | ||
115 | static inline int lapic_is_integrated(void) | ||
116 | { | ||
117 | #ifdef CONFIG_X86_64 | ||
118 | return 1; | ||
119 | #else | ||
120 | return APIC_INTEGRATED(lapic_get_version()); | ||
121 | #endif | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Check, whether this is a modern or a first generation APIC | ||
126 | */ | ||
127 | static int modern_apic(void) | ||
128 | { | ||
129 | /* AMD systems use old APIC versions, so check the CPU */ | ||
130 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
131 | boot_cpu_data.x86 >= 0xf) | ||
132 | return 1; | ||
133 | return lapic_get_version() >= 0x14; | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Paravirt kernels might also be using the ops below. So we still | ||
138 | * use generic apic_read()/apic_write(), which might be pointing to different | ||
139 | * ops in the PARAVIRT case. | ||
140 | */ | ||
141 | void xapic_wait_icr_idle(void) | ||
142 | { | ||
143 | while (apic_read(APIC_ICR) & APIC_ICR_BUSY) | ||
144 | cpu_relax(); | ||
145 | } | ||
146 | |||
147 | u32 safe_xapic_wait_icr_idle(void) | ||
148 | { | ||
149 | u32 send_status; | ||
150 | int timeout; | ||
151 | |||
152 | timeout = 0; | ||
153 | do { | ||
154 | send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; | ||
155 | if (!send_status) | ||
156 | break; | ||
157 | udelay(100); | ||
158 | } while (timeout++ < 1000); | ||
159 | |||
160 | return send_status; | ||
161 | } | ||
162 | |||
163 | void xapic_icr_write(u32 low, u32 id) | ||
164 | { | ||
165 | apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); | ||
166 | apic_write(APIC_ICR, low); | ||
167 | } | ||
168 | |||
169 | u64 xapic_icr_read(void) | ||
170 | { | ||
171 | u32 icr1, icr2; | ||
172 | |||
173 | icr2 = apic_read(APIC_ICR2); | ||
174 | icr1 = apic_read(APIC_ICR); | ||
175 | |||
176 | return icr1 | ((u64)icr2 << 32); | ||
177 | } | ||
178 | |||
179 | static struct apic_ops xapic_ops = { | ||
180 | .read = native_apic_mem_read, | ||
181 | .write = native_apic_mem_write, | ||
182 | .icr_read = xapic_icr_read, | ||
183 | .icr_write = xapic_icr_write, | ||
184 | .wait_icr_idle = xapic_wait_icr_idle, | ||
185 | .safe_wait_icr_idle = safe_xapic_wait_icr_idle, | ||
186 | }; | ||
187 | |||
188 | struct apic_ops __read_mostly *apic_ops = &xapic_ops; | ||
189 | EXPORT_SYMBOL_GPL(apic_ops); | ||
190 | |||
191 | static void x2apic_wait_icr_idle(void) | ||
192 | { | ||
193 | /* no need to wait for icr idle in x2apic */ | ||
194 | return; | ||
195 | } | ||
196 | |||
197 | static u32 safe_x2apic_wait_icr_idle(void) | ||
198 | { | ||
199 | /* no need to wait for icr idle in x2apic */ | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | void x2apic_icr_write(u32 low, u32 id) | ||
204 | { | ||
205 | wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); | ||
206 | } | ||
207 | |||
208 | u64 x2apic_icr_read(void) | ||
209 | { | ||
210 | unsigned long val; | ||
211 | |||
212 | rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); | ||
213 | return val; | ||
214 | } | ||
215 | |||
216 | static struct apic_ops x2apic_ops = { | ||
217 | .read = native_apic_msr_read, | ||
218 | .write = native_apic_msr_write, | ||
219 | .icr_read = x2apic_icr_read, | ||
220 | .icr_write = x2apic_icr_write, | ||
221 | .wait_icr_idle = x2apic_wait_icr_idle, | ||
222 | .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, | ||
223 | }; | ||
224 | |||
225 | /** | ||
226 | * enable_NMI_through_LVT0 - enable NMI through local vector table 0 | ||
227 | */ | ||
228 | void __cpuinit enable_NMI_through_LVT0(void) | ||
229 | { | ||
230 | unsigned int v; | ||
231 | |||
232 | /* unmask and set to NMI */ | ||
233 | v = APIC_DM_NMI; | ||
234 | |||
235 | /* Level triggered for 82489DX (32bit mode) */ | ||
236 | if (!lapic_is_integrated()) | ||
237 | v |= APIC_LVT_LEVEL_TRIGGER; | ||
238 | |||
239 | apic_write(APIC_LVT0, v); | ||
240 | } | ||
241 | |||
242 | /** | ||
243 | * lapic_get_maxlvt - get the maximum number of local vector table entries | ||
244 | */ | ||
245 | int lapic_get_maxlvt(void) | ||
246 | { | ||
247 | unsigned int v; | ||
248 | |||
249 | v = apic_read(APIC_LVR); | ||
250 | /* | ||
251 | * - we always have APIC integrated on 64bit mode | ||
252 | * - 82489DXs do not report # of LVT entries | ||
253 | */ | ||
254 | return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Local APIC timer | ||
259 | */ | ||
260 | |||
261 | /* Clock divisor */ | ||
262 | #ifdef CONFIG_X86_64 | ||
263 | #define APIC_DIVISOR 1 | ||
264 | #else | ||
265 | #define APIC_DIVISOR 16 | ||
266 | #endif | ||
267 | |||
268 | /* | ||
269 | * This function sets up the local APIC timer, with a timeout of | ||
270 | * 'clocks' APIC bus clocks. During calibration we actually call | ||
271 | * this function twice on the boot CPU, once with a bogus timeout | ||
272 | * value, second time for real. The other (noncalibrating) CPUs | ||
273 | * call this function only once, with the real, calibrated value. | ||
274 | * | ||
275 | * We do reads before writes even if unnecessary, to get around the | ||
276 | * P5 APIC double write bug. | ||
277 | */ | ||
278 | static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | ||
279 | { | ||
280 | unsigned int lvtt_value, tmp_value; | ||
281 | |||
282 | lvtt_value = LOCAL_TIMER_VECTOR; | ||
283 | if (!oneshot) | ||
284 | lvtt_value |= APIC_LVT_TIMER_PERIODIC; | ||
285 | if (!lapic_is_integrated()) | ||
286 | lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); | ||
287 | |||
288 | if (!irqen) | ||
289 | lvtt_value |= APIC_LVT_MASKED; | ||
290 | |||
291 | apic_write(APIC_LVTT, lvtt_value); | ||
292 | |||
293 | /* | ||
294 | * Divide PICLK by 16 | ||
295 | */ | ||
296 | tmp_value = apic_read(APIC_TDCR); | ||
297 | apic_write(APIC_TDCR, | ||
298 | (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | | ||
299 | APIC_TDR_DIV_16); | ||
300 | |||
301 | if (!oneshot) | ||
302 | apic_write(APIC_TMICT, clocks / APIC_DIVISOR); | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * Setup extended LVT, AMD specific (K8, family 10h) | ||
307 | * | ||
308 | * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and | ||
309 | * MCE interrupts are supported. Thus MCE offset must be set to 0. | ||
310 | * | ||
311 | * If mask=1, the LVT entry does not generate interrupts while mask=0 | ||
312 | * enables the vector. See also the BKDGs. | ||
313 | */ | ||
314 | |||
315 | #define APIC_EILVT_LVTOFF_MCE 0 | ||
316 | #define APIC_EILVT_LVTOFF_IBS 1 | ||
317 | |||
318 | static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) | ||
319 | { | ||
320 | unsigned long reg = (lvt_off << 4) + APIC_EILVT0; | ||
321 | unsigned int v = (mask << 16) | (msg_type << 8) | vector; | ||
322 | |||
323 | apic_write(reg, v); | ||
324 | } | ||
325 | |||
326 | u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) | ||
327 | { | ||
328 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); | ||
329 | return APIC_EILVT_LVTOFF_MCE; | ||
330 | } | ||
331 | |||
332 | u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) | ||
333 | { | ||
334 | setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); | ||
335 | return APIC_EILVT_LVTOFF_IBS; | ||
336 | } | ||
337 | EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); | ||
338 | |||
339 | /* | ||
340 | * Program the next event, relative to now | ||
341 | */ | ||
342 | static int lapic_next_event(unsigned long delta, | ||
343 | struct clock_event_device *evt) | ||
344 | { | ||
345 | apic_write(APIC_TMICT, delta); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * Setup the lapic timer in periodic or oneshot mode | ||
351 | */ | ||
352 | static void lapic_timer_setup(enum clock_event_mode mode, | ||
353 | struct clock_event_device *evt) | ||
354 | { | ||
355 | unsigned long flags; | ||
356 | unsigned int v; | ||
357 | |||
358 | /* Lapic used as dummy for broadcast ? */ | ||
359 | if (evt->features & CLOCK_EVT_FEAT_DUMMY) | ||
360 | return; | ||
361 | |||
362 | local_irq_save(flags); | ||
363 | |||
364 | switch (mode) { | ||
365 | case CLOCK_EVT_MODE_PERIODIC: | ||
366 | case CLOCK_EVT_MODE_ONESHOT: | ||
367 | __setup_APIC_LVTT(calibration_result, | ||
368 | mode != CLOCK_EVT_MODE_PERIODIC, 1); | ||
369 | break; | ||
370 | case CLOCK_EVT_MODE_UNUSED: | ||
371 | case CLOCK_EVT_MODE_SHUTDOWN: | ||
372 | v = apic_read(APIC_LVTT); | ||
373 | v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
374 | apic_write(APIC_LVTT, v); | ||
375 | break; | ||
376 | case CLOCK_EVT_MODE_RESUME: | ||
377 | /* Nothing to do here */ | ||
378 | break; | ||
379 | } | ||
380 | |||
381 | local_irq_restore(flags); | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Local APIC timer broadcast function | ||
386 | */ | ||
387 | static void lapic_timer_broadcast(cpumask_t mask) | ||
388 | { | ||
389 | #ifdef CONFIG_SMP | ||
390 | send_IPI_mask(mask, LOCAL_TIMER_VECTOR); | ||
391 | #endif | ||
392 | } | ||
393 | |||
394 | /* | ||
395 | * Set up the local APIC timer for this CPU. Copy the initialized values | ||
396 | * of the boot CPU and register the clock event in the framework. | ||
397 | */ | ||
398 | static void setup_APIC_timer(void) | ||
399 | { | ||
400 | struct clock_event_device *levt = &__get_cpu_var(lapic_events); | ||
401 | |||
402 | memcpy(levt, &lapic_clockevent, sizeof(*levt)); | ||
403 | levt->cpumask = cpumask_of_cpu(smp_processor_id()); | ||
404 | |||
405 | clockevents_register_device(levt); | ||
406 | } | ||
407 | |||
408 | /* | ||
409 | * In this function we calibrate APIC bus clocks to the external | ||
410 | * timer. Unfortunately we cannot use jiffies and the timer irq | ||
411 | * to calibrate, since some later bootup code depends on getting | ||
412 | * the first irq? Ugh. | ||
413 | * | ||
414 | * We want to do the calibration only once since we | ||
415 | * want to have local timer irqs synchronized. CPUs connected | ||
416 | * by the same APIC bus have the very same bus frequency. | ||
417 | * And we want to have irqs off anyways, no accidental | ||
418 | * APIC irq that way. | ||
419 | */ | ||
420 | |||
421 | #define TICK_COUNT 100000000 | ||
422 | |||
423 | static int __init calibrate_APIC_clock(void) | ||
424 | { | ||
425 | unsigned apic, apic_start; | ||
426 | unsigned long tsc, tsc_start; | ||
427 | int result; | ||
428 | |||
429 | local_irq_disable(); | ||
430 | |||
431 | /* | ||
432 | * Put whatever arbitrary (but long enough) timeout | ||
433 | * value into the APIC clock, we just want to get the | ||
434 | * counter running for calibration. | ||
435 | * | ||
436 | * No interrupt enable ! | ||
437 | */ | ||
438 | __setup_APIC_LVTT(250000000, 0, 0); | ||
439 | |||
440 | apic_start = apic_read(APIC_TMCCT); | ||
441 | #ifdef CONFIG_X86_PM_TIMER | ||
442 | if (apic_calibrate_pmtmr && pmtmr_ioport) { | ||
443 | pmtimer_wait(5000); /* 5ms wait */ | ||
444 | apic = apic_read(APIC_TMCCT); | ||
445 | result = (apic_start - apic) * 1000L / 5; | ||
446 | } else | ||
447 | #endif | ||
448 | { | ||
449 | rdtscll(tsc_start); | ||
450 | |||
451 | do { | ||
452 | apic = apic_read(APIC_TMCCT); | ||
453 | rdtscll(tsc); | ||
454 | } while ((tsc - tsc_start) < TICK_COUNT && | ||
455 | (apic_start - apic) < TICK_COUNT); | ||
456 | |||
457 | result = (apic_start - apic) * 1000L * tsc_khz / | ||
458 | (tsc - tsc_start); | ||
459 | } | ||
460 | |||
461 | local_irq_enable(); | ||
462 | |||
463 | printk(KERN_DEBUG "APIC timer calibration result %d\n", result); | ||
464 | |||
465 | printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", | ||
466 | result / 1000 / 1000, result / 1000 % 1000); | ||
467 | |||
468 | /* Calculate the scaled math multiplication factor */ | ||
469 | lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, | ||
470 | lapic_clockevent.shift); | ||
471 | lapic_clockevent.max_delta_ns = | ||
472 | clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); | ||
473 | lapic_clockevent.min_delta_ns = | ||
474 | clockevent_delta2ns(0xF, &lapic_clockevent); | ||
475 | |||
476 | calibration_result = (result * APIC_DIVISOR) / HZ; | ||
477 | |||
478 | /* | ||
479 | * Do a sanity check on the APIC calibration result | ||
480 | */ | ||
481 | if (calibration_result < (1000000 / HZ)) { | ||
482 | printk(KERN_WARNING | ||
483 | "APIC frequency too slow, disabling apic timer\n"); | ||
484 | return -1; | ||
485 | } | ||
486 | |||
487 | return 0; | ||
488 | } | ||
489 | |||
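As a rough illustration of the calibration arithmetic in calibrate_APIC_clock() above, with made-up numbers and assuming a 64-bit unsigned long so the intermediate product does not overflow:

/*
 * Hypothetical example: 250,000 APIC ticks elapse while the TSC advances
 * 2,000,000 cycles at tsc_khz = 2,000,000 (a 2 GHz TSC).
 */
#include <stdio.h>

int main(void)
{
	unsigned long apic_ticks = 250000;	/* apic_start - apic */
	unsigned long tsc_ticks  = 2000000;	/* tsc - tsc_start */
	unsigned long tsc_khz    = 2000000;	/* 2 GHz TSC */
	unsigned long hz         = 1000;	/* assumed HZ */
	unsigned long divisor    = 1;		/* assumed APIC_DIVISOR */

	/* APIC timer frequency in Hz, same formula as calibrate_APIC_clock() */
	unsigned long result = apic_ticks * 1000UL * tsc_khz / tsc_ticks;

	/* value later programmed per tick: (result * APIC_DIVISOR) / HZ */
	unsigned long calibration_result = (result * divisor) / hz;

	printf("APIC timer: %lu Hz, %lu ticks per jiffy\n", result, calibration_result);
	return 0;
}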
490 | /* | ||
491 | * Setup the boot APIC | ||
492 | * | ||
493 | * Calibrate and verify the result. | ||
494 | */ | ||
495 | void __init setup_boot_APIC_clock(void) | ||
496 | { | ||
497 | /* | ||
498 | * The local apic timer can be disabled via the kernel | ||
499 | * commandline or from the CPU detection code. Register the lapic | ||
500 | * timer as a dummy clock event source on SMP systems, so the | ||
501 | * broadcast mechanism is used. On UP systems simply ignore it. | ||
502 | */ | ||
503 | if (disable_apic_timer) { | ||
504 | printk(KERN_INFO "Disabling APIC timer\n"); | ||
505 | /* No broadcast on UP ! */ | ||
506 | if (num_possible_cpus() > 1) { | ||
507 | lapic_clockevent.mult = 1; | ||
508 | setup_APIC_timer(); | ||
509 | } | ||
510 | return; | ||
511 | } | ||
512 | |||
513 | apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" | ||
514 | "calibrating APIC timer ...\n"); | ||
515 | |||
516 | if (calibrate_APIC_clock()) { | ||
517 | /* No broadcast on UP ! */ | ||
518 | if (num_possible_cpus() > 1) | ||
519 | setup_APIC_timer(); | ||
520 | return; | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * If nmi_watchdog is set to IO_APIC, we need the | ||
525 | * PIT/HPET going. Otherwise register lapic as a dummy | ||
526 | * device. | ||
527 | */ | ||
528 | if (nmi_watchdog != NMI_IO_APIC) | ||
529 | lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; | ||
530 | else | ||
531 | printk(KERN_WARNING "APIC timer registered as dummy," | ||
532 | " due to nmi_watchdog=%d!\n", nmi_watchdog); | ||
533 | |||
534 | /* Setup the lapic or request the broadcast */ | ||
535 | setup_APIC_timer(); | ||
536 | } | ||
537 | |||
538 | void __cpuinit setup_secondary_APIC_clock(void) | ||
539 | { | ||
540 | setup_APIC_timer(); | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * The guts of the apic timer interrupt | ||
545 | */ | ||
546 | static void local_apic_timer_interrupt(void) | ||
547 | { | ||
548 | int cpu = smp_processor_id(); | ||
549 | struct clock_event_device *evt = &per_cpu(lapic_events, cpu); | ||
550 | |||
551 | /* | ||
552 | * Normally we should not be here till LAPIC has been initialized but | ||
553 | * in some cases like kdump, it's possible that there is a pending LAPIC | ||
554 | * timer interrupt from the previous kernel's context which is delivered in | ||
555 | * the new kernel the moment interrupts are enabled. | ||
556 | * | ||
557 | * Interrupts are enabled early and the LAPIC is set up much later, hence | ||
558 | * it's possible that when we get here evt->event_handler is NULL. | ||
559 | * Check for event_handler being NULL and discard the interrupt as | ||
560 | * spurious. | ||
561 | */ | ||
562 | if (!evt->event_handler) { | ||
563 | printk(KERN_WARNING | ||
564 | "Spurious LAPIC timer interrupt on cpu %d\n", cpu); | ||
565 | /* Switch it off */ | ||
566 | lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); | ||
567 | return; | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * the NMI deadlock-detector uses this. | ||
572 | */ | ||
573 | #ifdef CONFIG_X86_64 | ||
574 | add_pda(apic_timer_irqs, 1); | ||
575 | #else | ||
576 | per_cpu(irq_stat, cpu).apic_timer_irqs++; | ||
577 | #endif | ||
578 | |||
579 | evt->event_handler(evt); | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * Local APIC timer interrupt. This is the most natural way of doing | ||
584 | * local interrupts, but local timer interrupts can be emulated by | ||
585 | * broadcast interrupts too. [in case the hw doesn't support APIC timers] | ||
586 | * | ||
587 | * [ if a single-CPU system runs an SMP kernel then we call the local | ||
588 | * interrupt as well. Thus we cannot inline the local irq ... ] | ||
589 | */ | ||
590 | void smp_apic_timer_interrupt(struct pt_regs *regs) | ||
591 | { | ||
592 | struct pt_regs *old_regs = set_irq_regs(regs); | ||
593 | |||
594 | /* | ||
595 | * NOTE! We'd better ACK the irq immediately, | ||
596 | * because timer handling can be slow. | ||
597 | */ | ||
598 | ack_APIC_irq(); | ||
599 | /* | ||
600 | * update_process_times() expects us to have done irq_enter(). | ||
601 | * Besides, if we don't, timer interrupts ignore the global | ||
602 | * interrupt lock, which is the WrongThing (tm) to do. | ||
603 | */ | ||
604 | exit_idle(); | ||
605 | irq_enter(); | ||
606 | local_apic_timer_interrupt(); | ||
607 | irq_exit(); | ||
608 | |||
609 | set_irq_regs(old_regs); | ||
610 | } | ||
611 | |||
612 | int setup_profiling_timer(unsigned int multiplier) | ||
613 | { | ||
614 | return -EINVAL; | ||
615 | } | ||
616 | |||
617 | |||
618 | /* | ||
619 | * Local APIC start and shutdown | ||
620 | */ | ||
621 | |||
622 | /** | ||
623 | * clear_local_APIC - shutdown the local APIC | ||
624 | * | ||
625 | * This is called when a CPU is disabled and before rebooting, so the state of | ||
626 | * the local APIC has no dangling leftovers. Also used to clean out any BIOS | ||
627 | * leftovers during boot. | ||
628 | */ | ||
629 | void clear_local_APIC(void) | ||
630 | { | ||
631 | int maxlvt; | ||
632 | u32 v; | ||
633 | |||
634 | /* APIC hasn't been mapped yet */ | ||
635 | if (!apic_phys) | ||
636 | return; | ||
637 | |||
638 | maxlvt = lapic_get_maxlvt(); | ||
639 | /* | ||
640 | * Masking an LVT entry can trigger a local APIC error | ||
641 | * if the vector is zero. Mask LVTERR first to prevent this. | ||
642 | */ | ||
643 | if (maxlvt >= 3) { | ||
644 | v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ | ||
645 | apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); | ||
646 | } | ||
647 | /* | ||
648 | * Careful: we have to set masks only first to deassert | ||
649 | * any level-triggered sources. | ||
650 | */ | ||
651 | v = apic_read(APIC_LVTT); | ||
652 | apic_write(APIC_LVTT, v | APIC_LVT_MASKED); | ||
653 | v = apic_read(APIC_LVT0); | ||
654 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | ||
655 | v = apic_read(APIC_LVT1); | ||
656 | apic_write(APIC_LVT1, v | APIC_LVT_MASKED); | ||
657 | if (maxlvt >= 4) { | ||
658 | v = apic_read(APIC_LVTPC); | ||
659 | apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); | ||
660 | } | ||
661 | |||
662 | /* let's not touch this if we didn't frob it */ | ||
663 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | ||
664 | if (maxlvt >= 5) { | ||
665 | v = apic_read(APIC_LVTTHMR); | ||
666 | apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); | ||
667 | } | ||
668 | #endif | ||
669 | /* | ||
670 | * Clean APIC state for other OSs: | ||
671 | */ | ||
672 | apic_write(APIC_LVTT, APIC_LVT_MASKED); | ||
673 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
674 | apic_write(APIC_LVT1, APIC_LVT_MASKED); | ||
675 | if (maxlvt >= 3) | ||
676 | apic_write(APIC_LVTERR, APIC_LVT_MASKED); | ||
677 | if (maxlvt >= 4) | ||
678 | apic_write(APIC_LVTPC, APIC_LVT_MASKED); | ||
679 | |||
680 | /* Integrated APIC (!82489DX) ? */ | ||
681 | if (lapic_is_integrated()) { | ||
682 | if (maxlvt > 3) | ||
683 | /* Clear ESR due to Pentium errata 3AP and 11AP */ | ||
684 | apic_write(APIC_ESR, 0); | ||
685 | apic_read(APIC_ESR); | ||
686 | } | ||
687 | } | ||
688 | |||
689 | /** | ||
690 | * disable_local_APIC - clear and disable the local APIC | ||
691 | */ | ||
692 | void disable_local_APIC(void) | ||
693 | { | ||
694 | unsigned int value; | ||
695 | |||
696 | clear_local_APIC(); | ||
697 | |||
698 | /* | ||
699 | * Disable APIC (implies clearing of registers | ||
700 | * for 82489DX!). | ||
701 | */ | ||
702 | value = apic_read(APIC_SPIV); | ||
703 | value &= ~APIC_SPIV_APIC_ENABLED; | ||
704 | apic_write(APIC_SPIV, value); | ||
705 | |||
706 | #ifdef CONFIG_X86_32 | ||
707 | /* | ||
708 | * When LAPIC was disabled by the BIOS and enabled by the kernel, | ||
709 | * restore the disabled state. | ||
710 | */ | ||
711 | if (enabled_via_apicbase) { | ||
712 | unsigned int l, h; | ||
713 | |||
714 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
715 | l &= ~MSR_IA32_APICBASE_ENABLE; | ||
716 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
717 | } | ||
718 | #endif | ||
719 | } | ||
720 | |||
721 | /* | ||
722 | * If Linux enabled the LAPIC against the BIOS default, disable it again before | ||
723 | * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and | ||
724 | * not power off. Additionally clear all LVT entries before disable_local_APIC | ||
725 | * for the case where Linux didn't enable the LAPIC. | ||
726 | */ | ||
727 | void lapic_shutdown(void) | ||
728 | { | ||
729 | unsigned long flags; | ||
730 | |||
731 | if (!cpu_has_apic) | ||
732 | return; | ||
733 | |||
734 | local_irq_save(flags); | ||
735 | |||
736 | #ifdef CONFIG_X86_32 | ||
737 | if (!enabled_via_apicbase) | ||
738 | clear_local_APIC(); | ||
739 | else | ||
740 | #endif | ||
741 | disable_local_APIC(); | ||
742 | |||
743 | |||
744 | local_irq_restore(flags); | ||
745 | } | ||
746 | |||
747 | /* | ||
748 | * This is to verify that we're looking at a real local APIC. | ||
749 | * Check these against your board if the CPUs aren't getting | ||
750 | * started for no apparent reason. | ||
751 | */ | ||
752 | int __init verify_local_APIC(void) | ||
753 | { | ||
754 | unsigned int reg0, reg1; | ||
755 | |||
756 | /* | ||
757 | * The version register is read-only in a real APIC. | ||
758 | */ | ||
759 | reg0 = apic_read(APIC_LVR); | ||
760 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); | ||
761 | apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); | ||
762 | reg1 = apic_read(APIC_LVR); | ||
763 | apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); | ||
764 | |||
765 | /* | ||
766 | * The two version reads above should print the same | ||
767 | * numbers. If the second one is different, then we | ||
768 | * poke at a non-APIC. | ||
769 | */ | ||
770 | if (reg1 != reg0) | ||
771 | return 0; | ||
772 | |||
773 | /* | ||
774 | * Check if the version looks reasonable. | ||
775 | */ | ||
776 | reg1 = GET_APIC_VERSION(reg0); | ||
777 | if (reg1 == 0x00 || reg1 == 0xff) | ||
778 | return 0; | ||
779 | reg1 = lapic_get_maxlvt(); | ||
780 | if (reg1 < 0x02 || reg1 == 0xff) | ||
781 | return 0; | ||
782 | |||
783 | /* | ||
784 | * The ID register is read/write in a real APIC. | ||
785 | */ | ||
786 | reg0 = apic_read(APIC_ID); | ||
787 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); | ||
788 | apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); | ||
789 | reg1 = apic_read(APIC_ID); | ||
790 | apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); | ||
791 | apic_write(APIC_ID, reg0); | ||
792 | if (reg1 != (reg0 ^ APIC_ID_MASK)) | ||
793 | return 0; | ||
794 | |||
795 | /* | ||
796 | * The next two are just to see if we have sane values. | ||
797 | * They're only really relevant if we're in Virtual Wire | ||
798 | * compatibility mode, but most boxes aren't anymore. | ||
799 | */ | ||
800 | reg0 = apic_read(APIC_LVT0); | ||
801 | apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); | ||
802 | reg1 = apic_read(APIC_LVT1); | ||
803 | apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); | ||
804 | |||
805 | return 1; | ||
806 | } | ||
807 | |||
808 | /** | ||
809 | * sync_Arb_IDs - synchronize APIC bus arbitration IDs | ||
810 | */ | ||
811 | void __init sync_Arb_IDs(void) | ||
812 | { | ||
813 | /* | ||
814 | * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1. And not | ||
815 | * needed on AMD. | ||
816 | */ | ||
817 | if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
818 | return; | ||
819 | |||
820 | /* | ||
821 | * Wait for idle. | ||
822 | */ | ||
823 | apic_wait_icr_idle(); | ||
824 | |||
825 | apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); | ||
826 | apic_write(APIC_ICR, APIC_DEST_ALLINC | | ||
827 | APIC_INT_LEVELTRIG | APIC_DM_INIT); | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * An initial setup of the virtual wire mode. | ||
832 | */ | ||
833 | void __init init_bsp_APIC(void) | ||
834 | { | ||
835 | unsigned int value; | ||
836 | |||
837 | /* | ||
838 | * Don't do the setup now if we have an SMP BIOS as the | ||
839 | * through-I/O-APIC virtual wire mode might be active. | ||
840 | */ | ||
841 | if (smp_found_config || !cpu_has_apic) | ||
842 | return; | ||
843 | |||
844 | /* | ||
845 | * Do not trust the local APIC being empty at bootup. | ||
846 | */ | ||
847 | clear_local_APIC(); | ||
848 | |||
849 | /* | ||
850 | * Enable APIC. | ||
851 | */ | ||
852 | value = apic_read(APIC_SPIV); | ||
853 | value &= ~APIC_VECTOR_MASK; | ||
854 | value |= APIC_SPIV_APIC_ENABLED; | ||
855 | |||
856 | #ifdef CONFIG_X86_32 | ||
857 | /* This bit is reserved on P4/Xeon and should be cleared */ | ||
858 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && | ||
859 | (boot_cpu_data.x86 == 15)) | ||
860 | value &= ~APIC_SPIV_FOCUS_DISABLED; | ||
861 | else | ||
862 | #endif | ||
863 | value |= APIC_SPIV_FOCUS_DISABLED; | ||
864 | value |= SPURIOUS_APIC_VECTOR; | ||
865 | apic_write(APIC_SPIV, value); | ||
866 | |||
867 | /* | ||
868 | * Set up the virtual wire mode. | ||
869 | */ | ||
870 | apic_write(APIC_LVT0, APIC_DM_EXTINT); | ||
871 | value = APIC_DM_NMI; | ||
872 | if (!lapic_is_integrated()) /* 82489DX */ | ||
873 | value |= APIC_LVT_LEVEL_TRIGGER; | ||
874 | apic_write(APIC_LVT1, value); | ||
875 | } | ||
876 | |||
877 | static void __cpuinit lapic_setup_esr(void) | ||
878 | { | ||
879 | unsigned long oldvalue, value, maxlvt; | ||
880 | if (lapic_is_integrated() && !esr_disable) { | ||
881 | if (esr_disable) { | ||
882 | /* | ||
883 | * Something untraceable is creating bad interrupts on | ||
884 | * secondary quads ... for the moment, just leave the | ||
885 | * ESR disabled - we can't do anything useful with the | ||
886 | * errors anyway - mbligh | ||
887 | */ | ||
888 | printk(KERN_INFO "Leaving ESR disabled.\n"); | ||
889 | return; | ||
890 | } | ||
891 | /* !82489DX */ | ||
892 | maxlvt = lapic_get_maxlvt(); | ||
893 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
894 | apic_write(APIC_ESR, 0); | ||
895 | oldvalue = apic_read(APIC_ESR); | ||
896 | |||
897 | /* enables sending errors */ | ||
898 | value = ERROR_APIC_VECTOR; | ||
899 | apic_write(APIC_LVTERR, value); | ||
900 | /* | ||
901 | * spec says clear errors after enabling vector. | ||
902 | */ | ||
903 | if (maxlvt > 3) | ||
904 | apic_write(APIC_ESR, 0); | ||
905 | value = apic_read(APIC_ESR); | ||
906 | if (value != oldvalue) | ||
907 | apic_printk(APIC_VERBOSE, "ESR value before enabling " | ||
908 | "vector: 0x%08lx after: 0x%08lx\n", | ||
909 | oldvalue, value); | ||
910 | } else { | ||
911 | printk(KERN_INFO "No ESR for 82489DX.\n"); | ||
912 | } | ||
913 | } | ||
914 | |||
915 | |||
916 | /** | ||
917 | * setup_local_APIC - setup the local APIC | ||
918 | */ | ||
919 | void __cpuinit setup_local_APIC(void) | ||
920 | { | ||
921 | unsigned int value; | ||
922 | int i, j; | ||
923 | |||
924 | preempt_disable(); | ||
925 | value = apic_read(APIC_LVR); | ||
926 | |||
927 | BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); | ||
928 | |||
929 | /* | ||
930 | * Double-check whether this APIC is really registered. | ||
931 | * This is meaningless in clustered apic mode, so we skip it. | ||
932 | */ | ||
933 | if (!apic_id_registered()) | ||
934 | BUG(); | ||
935 | |||
936 | /* | ||
937 | * Intel recommends to set DFR, LDR and TPR before enabling | ||
938 | * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel | ||
939 | * document number 292116). So here it goes... | ||
940 | */ | ||
941 | init_apic_ldr(); | ||
942 | |||
943 | /* | ||
944 | * Set Task Priority to 'accept all'. We never change this | ||
945 | * later on. | ||
946 | */ | ||
947 | value = apic_read(APIC_TASKPRI); | ||
948 | value &= ~APIC_TPRI_MASK; | ||
949 | apic_write(APIC_TASKPRI, value); | ||
950 | |||
951 | /* | ||
952 | * After a crash, we no longer service the interrupts and a pending | ||
953 | * interrupt from previous kernel might still have ISR bit set. | ||
954 | * | ||
955 | * Most probably by now the CPU has serviced that pending interrupt and | ||
956 | * it might not have done the ack_APIC_irq() because it thought the | ||
957 | * interrupt came from i8259 as ExtInt. The LAPIC did not get an EOI so it | ||
958 | * does not clear the ISR bit and the CPU thinks it has already serviced | ||
959 | * the interrupt. Hence a vector might get locked. It was noticed | ||
960 | * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. | ||
961 | */ | ||
962 | for (i = APIC_ISR_NR - 1; i >= 0; i--) { | ||
963 | value = apic_read(APIC_ISR + i*0x10); | ||
964 | for (j = 31; j >= 0; j--) { | ||
965 | if (value & (1<<j)) | ||
966 | ack_APIC_irq(); | ||
967 | } | ||
968 | } | ||
969 | |||
970 | /* | ||
971 | * Now that we are all set up, enable the APIC | ||
972 | */ | ||
973 | value = apic_read(APIC_SPIV); | ||
974 | value &= ~APIC_VECTOR_MASK; | ||
975 | /* | ||
976 | * Enable APIC | ||
977 | */ | ||
978 | value |= APIC_SPIV_APIC_ENABLED; | ||
979 | |||
980 | /* We always use processor focus */ | ||
981 | |||
982 | /* | ||
983 | * Set spurious IRQ vector | ||
984 | */ | ||
985 | value |= SPURIOUS_APIC_VECTOR; | ||
986 | apic_write(APIC_SPIV, value); | ||
987 | |||
988 | /* | ||
989 | * Set up LVT0, LVT1: | ||
990 | * | ||
991 | * set up through-local-APIC on the BP's LINT0. This is not | ||
992 | * strictly necessary in pure symmetric-IO mode, but sometimes | ||
993 | * we delegate interrupts to the 8259A. | ||
994 | */ | ||
995 | /* | ||
996 | * TODO: set up through-local-APIC from through-I/O-APIC? --macro | ||
997 | */ | ||
998 | value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; | ||
999 | if (!smp_processor_id() && !value) { | ||
1000 | value = APIC_DM_EXTINT; | ||
1001 | apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", | ||
1002 | smp_processor_id()); | ||
1003 | } else { | ||
1004 | value = APIC_DM_EXTINT | APIC_LVT_MASKED; | ||
1005 | apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", | ||
1006 | smp_processor_id()); | ||
1007 | } | ||
1008 | apic_write(APIC_LVT0, value); | ||
1009 | |||
1010 | /* | ||
1011 | * only the BP should see the LINT1 NMI signal, obviously. | ||
1012 | */ | ||
1013 | if (!smp_processor_id()) | ||
1014 | value = APIC_DM_NMI; | ||
1015 | else | ||
1016 | value = APIC_DM_NMI | APIC_LVT_MASKED; | ||
1017 | apic_write(APIC_LVT1, value); | ||
1018 | preempt_enable(); | ||
1019 | } | ||
1020 | |||
1021 | void __cpuinit end_local_APIC_setup(void) | ||
1022 | { | ||
1023 | lapic_setup_esr(); | ||
1024 | |||
1025 | #ifdef CONFIG_X86_32 | ||
1026 | { | ||
1027 | unsigned int value; | ||
1028 | /* Disable the local apic timer */ | ||
1029 | value = apic_read(APIC_LVTT); | ||
1030 | value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); | ||
1031 | apic_write(APIC_LVTT, value); | ||
1032 | } | ||
1033 | #endif | ||
1034 | |||
1035 | setup_apic_nmi_watchdog(NULL); | ||
1036 | apic_pm_activate(); | ||
1037 | } | ||
1038 | |||
1039 | void check_x2apic(void) | ||
1040 | { | ||
1041 | int msr, msr2; | ||
1042 | |||
1043 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | ||
1044 | |||
1045 | if (msr & X2APIC_ENABLE) { | ||
1046 | printk("x2apic enabled by BIOS, switching to x2apic ops\n"); | ||
1047 | x2apic_preenabled = x2apic = 1; | ||
1048 | apic_ops = &x2apic_ops; | ||
1049 | } | ||
1050 | } | ||
1051 | |||
1052 | void enable_x2apic(void) | ||
1053 | { | ||
1054 | int msr, msr2; | ||
1055 | |||
1056 | rdmsr(MSR_IA32_APICBASE, msr, msr2); | ||
1057 | if (!(msr & X2APIC_ENABLE)) { | ||
1058 | printk("Enabling x2apic\n"); | ||
1059 | wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); | ||
1060 | } | ||
1061 | } | ||
1062 | |||
1063 | void enable_IR_x2apic(void) | ||
1064 | { | ||
1065 | #ifdef CONFIG_INTR_REMAP | ||
1066 | int ret; | ||
1067 | unsigned long flags; | ||
1068 | |||
1069 | if (!cpu_has_x2apic) | ||
1070 | return; | ||
1071 | |||
1072 | if (!x2apic_preenabled && disable_x2apic) { | ||
1073 | printk(KERN_INFO | ||
1074 | "Skipped enabling x2apic and Interrupt-remapping " | ||
1075 | "because of nox2apic\n"); | ||
1076 | return; | ||
1077 | } | ||
1078 | |||
1079 | if (x2apic_preenabled && disable_x2apic) | ||
1080 | panic("Bios already enabled x2apic, can't enforce nox2apic"); | ||
1081 | |||
1082 | if (!x2apic_preenabled && skip_ioapic_setup) { | ||
1083 | printk(KERN_INFO | ||
1084 | "Skipped enabling x2apic and Interrupt-remapping " | ||
1085 | "because of skipping io-apic setup\n"); | ||
1086 | return; | ||
1087 | } | ||
1088 | |||
1089 | ret = dmar_table_init(); | ||
1090 | if (ret) { | ||
1091 | printk(KERN_INFO | ||
1092 | "dmar_table_init() failed with %d:\n", ret); | ||
1093 | |||
1094 | if (x2apic_preenabled) | ||
1095 | panic("x2apic enabled by bios. But IR enabling failed"); | ||
1096 | else | ||
1097 | printk(KERN_INFO | ||
1098 | "Not enabling x2apic,Intr-remapping\n"); | ||
1099 | return; | ||
1100 | } | ||
1101 | |||
1102 | local_irq_save(flags); | ||
1103 | mask_8259A(); | ||
1104 | save_mask_IO_APIC_setup(); | ||
1105 | |||
1106 | ret = enable_intr_remapping(1); | ||
1107 | |||
1108 | if (ret && x2apic_preenabled) { | ||
1109 | local_irq_restore(flags); | ||
1110 | panic("x2apic enabled by bios. But IR enabling failed"); | ||
1111 | } | ||
1112 | |||
1113 | if (ret) | ||
1114 | goto end; | ||
1115 | |||
1116 | if (!x2apic) { | ||
1117 | x2apic = 1; | ||
1118 | apic_ops = &x2apic_ops; | ||
1119 | enable_x2apic(); | ||
1120 | } | ||
1121 | end: | ||
1122 | if (ret) | ||
1123 | /* | ||
1124 | * IR enabling failed | ||
1125 | */ | ||
1126 | restore_IO_APIC_setup(); | ||
1127 | else | ||
1128 | reinit_intr_remapped_IO_APIC(x2apic_preenabled); | ||
1129 | |||
1130 | unmask_8259A(); | ||
1131 | local_irq_restore(flags); | ||
1132 | |||
1133 | if (!ret) { | ||
1134 | if (!x2apic_preenabled) | ||
1135 | printk(KERN_INFO | ||
1136 | "Enabled x2apic and interrupt-remapping\n"); | ||
1137 | else | ||
1138 | printk(KERN_INFO | ||
1139 | "Enabled Interrupt-remapping\n"); | ||
1140 | } else | ||
1141 | printk(KERN_ERR | ||
1142 | "Failed to enable Interrupt-remapping and x2apic\n"); | ||
1143 | #else | ||
1144 | if (!cpu_has_x2apic) | ||
1145 | return; | ||
1146 | |||
1147 | if (x2apic_preenabled) | ||
1148 | panic("x2apic enabled prior OS handover," | ||
1149 | " enable CONFIG_INTR_REMAP"); | ||
1150 | |||
1151 | printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " | ||
1152 | " and x2apic\n"); | ||
1153 | #endif | ||
1154 | |||
1155 | return; | ||
1156 | } | ||
1157 | |||
1158 | /* | ||
1159 | * Detect and enable local APICs on non-SMP boards. | ||
1160 | * Original code written by Keir Fraser. | ||
1161 | * On AMD64 we trust the BIOS - if it says no APIC it is likely | ||
1162 | * not correctly set up (usually the APIC timer won't work etc.) | ||
1163 | */ | ||
1164 | static int __init detect_init_APIC(void) | ||
1165 | { | ||
1166 | if (!cpu_has_apic) { | ||
1167 | printk(KERN_INFO "No local APIC present\n"); | ||
1168 | return -1; | ||
1169 | } | ||
1170 | |||
1171 | mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; | ||
1172 | boot_cpu_physical_apicid = 0; | ||
1173 | return 0; | ||
1174 | } | ||
1175 | |||
1176 | void __init early_init_lapic_mapping(void) | ||
1177 | { | ||
1178 | unsigned long phys_addr; | ||
1179 | |||
1180 | /* | ||
1181 | * If no local APIC can be found then bail out: | ||
1182 | * it means there is no MP table and no MADT. | ||
1183 | */ | ||
1184 | if (!smp_found_config) | ||
1185 | return; | ||
1186 | |||
1187 | phys_addr = mp_lapic_addr; | ||
1188 | |||
1189 | set_fixmap_nocache(FIX_APIC_BASE, phys_addr); | ||
1190 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | ||
1191 | APIC_BASE, phys_addr); | ||
1192 | |||
1193 | /* | ||
1194 | * Fetch the APIC ID of the BSP in case we have a | ||
1195 | * default configuration (or the MP table is broken). | ||
1196 | */ | ||
1197 | boot_cpu_physical_apicid = read_apic_id(); | ||
1198 | } | ||
1199 | |||
1200 | /** | ||
1201 | * init_apic_mappings - initialize APIC mappings | ||
1202 | */ | ||
1203 | void __init init_apic_mappings(void) | ||
1204 | { | ||
1205 | if (x2apic) { | ||
1206 | boot_cpu_physical_apicid = read_apic_id(); | ||
1207 | return; | ||
1208 | } | ||
1209 | |||
1210 | /* | ||
1211 | * If no local APIC can be found then set up a fake all | ||
1212 | * zeroes page to simulate the local APIC and another | ||
1213 | * one for the IO-APIC. | ||
1214 | */ | ||
1215 | if (!smp_found_config && detect_init_APIC()) { | ||
1216 | apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); | ||
1217 | apic_phys = __pa(apic_phys); | ||
1218 | } else | ||
1219 | apic_phys = mp_lapic_addr; | ||
1220 | |||
1221 | set_fixmap_nocache(FIX_APIC_BASE, apic_phys); | ||
1222 | apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", | ||
1223 | APIC_BASE, apic_phys); | ||
1224 | |||
1225 | /* | ||
1226 | * Fetch the APIC ID of the BSP in case we have a | ||
1227 | * default configuration (or the MP table is broken). | ||
1228 | */ | ||
1229 | boot_cpu_physical_apicid = read_apic_id(); | ||
1230 | } | ||
1231 | |||
1232 | /* | ||
1233 | * This initializes the IO-APIC and APIC hardware if this is | ||
1234 | * a UP kernel. | ||
1235 | */ | ||
1236 | int apic_version[MAX_APICS]; | ||
1237 | |||
1238 | int __init APIC_init_uniprocessor(void) | ||
1239 | { | ||
1240 | if (disable_apic) { | ||
1241 | printk(KERN_INFO "Apic disabled\n"); | ||
1242 | return -1; | ||
1243 | } | ||
1244 | if (!cpu_has_apic) { | ||
1245 | disable_apic = 1; | ||
1246 | printk(KERN_INFO "Apic disabled by BIOS\n"); | ||
1247 | return -1; | ||
1248 | } | ||
1249 | |||
1250 | enable_IR_x2apic(); | ||
1251 | setup_apic_routing(); | ||
1252 | |||
1253 | verify_local_APIC(); | ||
1254 | |||
1255 | connect_bsp_APIC(); | ||
1256 | |||
1257 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); | ||
1258 | apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); | ||
1259 | |||
1260 | setup_local_APIC(); | ||
1261 | |||
1262 | /* | ||
1263 | * Now enable IO-APICs, actually call clear_IO_APIC. | ||
1264 | * We need clear_IO_APIC before enabling the vector on the BP. | ||
1265 | */ | ||
1266 | if (!skip_ioapic_setup && nr_ioapics) | ||
1267 | enable_IO_APIC(); | ||
1268 | |||
1269 | if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) | ||
1270 | localise_nmi_watchdog(); | ||
1271 | end_local_APIC_setup(); | ||
1272 | |||
1273 | if (smp_found_config && !skip_ioapic_setup && nr_ioapics) | ||
1274 | setup_IO_APIC(); | ||
1275 | else | ||
1276 | nr_ioapics = 0; | ||
1277 | setup_boot_APIC_clock(); | ||
1278 | check_nmi_watchdog(); | ||
1279 | return 0; | ||
1280 | } | ||
1281 | |||
1282 | /* | ||
1283 | * Local APIC interrupts | ||
1284 | */ | ||
1285 | |||
1286 | /* | ||
1287 | * This interrupt should _never_ happen with our APIC/SMP architecture | ||
1288 | */ | ||
1289 | asmlinkage void smp_spurious_interrupt(void) | ||
1290 | { | ||
1291 | unsigned int v; | ||
1292 | exit_idle(); | ||
1293 | irq_enter(); | ||
1294 | /* | ||
1295 | * Check if this really is a spurious interrupt and ACK it | ||
1296 | * if it is a vectored one. Just in case... | ||
1297 | * Spurious interrupts should not be ACKed. | ||
1298 | */ | ||
1299 | v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); | ||
1300 | if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) | ||
1301 | ack_APIC_irq(); | ||
1302 | |||
1303 | add_pda(irq_spurious_count, 1); | ||
1304 | irq_exit(); | ||
1305 | } | ||
1306 | |||
1307 | /* | ||
1308 | * This interrupt should never happen with our APIC/SMP architecture | ||
1309 | */ | ||
1310 | asmlinkage void smp_error_interrupt(void) | ||
1311 | { | ||
1312 | unsigned int v, v1; | ||
1313 | |||
1314 | exit_idle(); | ||
1315 | irq_enter(); | ||
1316 | /* First tickle the hardware, only then report what went on. -- REW */ | ||
1317 | v = apic_read(APIC_ESR); | ||
1318 | apic_write(APIC_ESR, 0); | ||
1319 | v1 = apic_read(APIC_ESR); | ||
1320 | ack_APIC_irq(); | ||
1321 | atomic_inc(&irq_err_count); | ||
1322 | |||
1323 | /* Here is what the APIC error bits mean: | ||
1324 | 0: Send CS error | ||
1325 | 1: Receive CS error | ||
1326 | 2: Send accept error | ||
1327 | 3: Receive accept error | ||
1328 | 4: Reserved | ||
1329 | 5: Send illegal vector | ||
1330 | 6: Received illegal vector | ||
1331 | 7: Illegal register address | ||
1332 | */ | ||
1333 | printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", | ||
1334 | smp_processor_id(), v, v1); | ||
1335 | irq_exit(); | ||
1336 | } | ||
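As an illustrative aside (not part of this patch), the error-bit table in the comment above maps one-to-one onto the ESR layout, so a hypothetical decoder for the value printed by smp_error_interrupt() could look like the sketch below; the helper name and its use of printk are assumptions made for the sketch only.

	/* Hypothetical helper, for illustration only: decode the ESR bits
	 * listed in the comment above into readable strings. */
	static const char * const apic_esr_strings[8] = {
		"Send CS error",		"Receive CS error",
		"Send accept error",		"Receive accept error",
		"Reserved",			"Send illegal vector",
		"Received illegal vector",	"Illegal register address",
	};

	static void decode_apic_esr(unsigned int esr)
	{
		int i;

		for (i = 0; i < 8; i++)
			if (esr & (1 << i))
				printk(KERN_DEBUG "  ESR bit %d: %s\n",
				       i, apic_esr_strings[i]);
	}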
1337 | |||
1338 | /** | ||
1339 | * connect_bsp_APIC - attach the APIC to the interrupt system | ||
1340 | */ | ||
1341 | void __init connect_bsp_APIC(void) | ||
1342 | { | ||
1343 | #ifdef CONFIG_X86_32 | ||
1344 | if (pic_mode) { | ||
1345 | /* | ||
1346 | * Do not trust the local APIC being empty at bootup. | ||
1347 | */ | ||
1348 | clear_local_APIC(); | ||
1349 | /* | ||
1350 | * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's | ||
1351 | * local APIC to INT and NMI lines. | ||
1352 | */ | ||
1353 | apic_printk(APIC_VERBOSE, "leaving PIC mode, " | ||
1354 | "enabling APIC mode.\n"); | ||
1355 | outb(0x70, 0x22); | ||
1356 | outb(0x01, 0x23); | ||
1357 | } | ||
1358 | #endif | ||
1359 | enable_apic_mode(); | ||
1360 | } | ||
1361 | |||
1362 | /** | ||
1363 | * disconnect_bsp_APIC - detach the APIC from the interrupt system | ||
1364 | * @virt_wire_setup: indicates whether virtual wire mode is selected | ||
1365 | * | ||
1366 | * Virtual wire mode is necessary to deliver legacy interrupts even when the | ||
1367 | * APIC is disabled. | ||
1368 | */ | ||
1369 | void disconnect_bsp_APIC(int virt_wire_setup) | ||
1370 | { | ||
1371 | unsigned int value; | ||
1372 | |||
1373 | #ifdef CONFIG_X86_32 | ||
1374 | if (pic_mode) { | ||
1375 | /* | ||
1376 | * Put the board back into PIC mode (has an effect only on | ||
1377 | * certain older boards). Note that APIC interrupts, including | ||
1378 | * IPIs, won't work beyond this point! The only exception are | ||
1379 | * INIT IPIs. | ||
1380 | */ | ||
1381 | apic_printk(APIC_VERBOSE, "disabling APIC mode, " | ||
1382 | "entering PIC mode.\n"); | ||
1383 | outb(0x70, 0x22); | ||
1384 | outb(0x00, 0x23); | ||
1385 | return; | ||
1386 | } | ||
1387 | #endif | ||
1388 | |||
1389 | /* Go back to Virtual Wire compatibility mode */ | ||
1390 | |||
1391 | /* For the spurious interrupt use vector F, and enable it */ | ||
1392 | value = apic_read(APIC_SPIV); | ||
1393 | value &= ~APIC_VECTOR_MASK; | ||
1394 | value |= APIC_SPIV_APIC_ENABLED; | ||
1395 | value |= 0xf; | ||
1396 | apic_write(APIC_SPIV, value); | ||
1397 | |||
1398 | if (!virt_wire_setup) { | ||
1399 | /* | ||
1400 | * For LVT0 make it edge triggered, active high, | ||
1401 | * external and enabled | ||
1402 | */ | ||
1403 | value = apic_read(APIC_LVT0); | ||
1404 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1405 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
1406 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
1407 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
1408 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); | ||
1409 | apic_write(APIC_LVT0, value); | ||
1410 | } else { | ||
1411 | /* Disable LVT0 */ | ||
1412 | apic_write(APIC_LVT0, APIC_LVT_MASKED); | ||
1413 | } | ||
1414 | |||
1415 | /* | ||
1416 | * For LVT1 make it edge triggered, active high, | ||
1417 | * NMI and enabled | ||
1418 | */ | ||
1419 | value = apic_read(APIC_LVT1); | ||
1420 | value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | | ||
1421 | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | | ||
1422 | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); | ||
1423 | value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; | ||
1424 | value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); | ||
1425 | apic_write(APIC_LVT1, value); | ||
1426 | } | ||
1427 | |||
1428 | void __cpuinit generic_processor_info(int apicid, int version) | ||
1429 | { | ||
1430 | int cpu; | ||
1431 | cpumask_t tmp_map; | ||
1432 | |||
1433 | /* | ||
1434 | * Validate version | ||
1435 | */ | ||
1436 | if (version == 0x0) { | ||
1437 | printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " | ||
1438 | "fixing up to 0x10. (tell your hw vendor)\n", | ||
1439 | apicid); | ||
1440 | version = 0x10; | ||
1441 | } | ||
1442 | apic_version[apicid] = version; | ||
1443 | |||
1444 | if (num_processors >= NR_CPUS) { | ||
1445 | printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." | ||
1446 | " Processor ignored.\n", NR_CPUS); | ||
1447 | return; | ||
1448 | } | ||
1449 | |||
1450 | num_processors++; | ||
1451 | cpus_complement(tmp_map, cpu_present_map); | ||
1452 | cpu = first_cpu(tmp_map); | ||
1453 | |||
1454 | physid_set(apicid, phys_cpu_present_map); | ||
1455 | if (apicid == boot_cpu_physical_apicid) { | ||
1456 | /* | ||
1457 | * x86_bios_cpu_apicid is required to have processors listed | ||
1458 | * in the same order as logical cpu numbers. Hence the first | ||
1459 | * entry is BSP, and so on. | ||
1460 | */ | ||
1461 | cpu = 0; | ||
1462 | } | ||
1463 | if (apicid > max_physical_apicid) | ||
1464 | max_physical_apicid = apicid; | ||
1465 | |||
1466 | #ifdef CONFIG_X86_32 | ||
1467 | /* | ||
1468 | * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y | ||
1469 | * but we need to work out other dependencies like SMP_SUSPEND etc | ||
1470 | * before this can be done without some confusion. | ||
1471 | * if (CPU_HOTPLUG_ENABLED || num_processors > 8) | ||
1472 | * - Ashok Raj <ashok.raj@intel.com> | ||
1473 | */ | ||
1474 | if (max_physical_apicid >= 8) { | ||
1475 | switch (boot_cpu_data.x86_vendor) { | ||
1476 | case X86_VENDOR_INTEL: | ||
1477 | if (!APIC_XAPIC(version)) { | ||
1478 | def_to_bigsmp = 0; | ||
1479 | break; | ||
1480 | } | ||
1481 | /* If P4 and above fall through */ | ||
1482 | case X86_VENDOR_AMD: | ||
1483 | def_to_bigsmp = 1; | ||
1484 | } | ||
1485 | } | ||
1486 | #endif | ||
1487 | |||
1488 | #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) | ||
1489 | /* are we being called early in kernel startup? */ | ||
1490 | if (early_per_cpu_ptr(x86_cpu_to_apicid)) { | ||
1491 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | ||
1492 | u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); | ||
1493 | |||
1494 | cpu_to_apicid[cpu] = apicid; | ||
1495 | bios_cpu_apicid[cpu] = apicid; | ||
1496 | } else { | ||
1497 | per_cpu(x86_cpu_to_apicid, cpu) = apicid; | ||
1498 | per_cpu(x86_bios_cpu_apicid, cpu) = apicid; | ||
1499 | } | ||
1500 | #endif | ||
1501 | |||
1502 | cpu_set(cpu, cpu_possible_map); | ||
1503 | cpu_set(cpu, cpu_present_map); | ||
1504 | } | ||
1505 | |||
1506 | int hard_smp_processor_id(void) | ||
1507 | { | ||
1508 | return read_apic_id(); | ||
1509 | } | ||
1510 | |||
1511 | /* | ||
1512 | * Power management | ||
1513 | */ | ||
1514 | #ifdef CONFIG_PM | ||
1515 | |||
1516 | static struct { | ||
1517 | /* | ||
1518 | * 'active' is true if the local APIC was enabled by us and | ||
1519 | * not the BIOS; this signifies that we are also responsible | ||
1520 | * for disabling it before entering apm/acpi suspend | ||
1521 | */ | ||
1522 | int active; | ||
1523 | /* r/w apic fields */ | ||
1524 | unsigned int apic_id; | ||
1525 | unsigned int apic_taskpri; | ||
1526 | unsigned int apic_ldr; | ||
1527 | unsigned int apic_dfr; | ||
1528 | unsigned int apic_spiv; | ||
1529 | unsigned int apic_lvtt; | ||
1530 | unsigned int apic_lvtpc; | ||
1531 | unsigned int apic_lvt0; | ||
1532 | unsigned int apic_lvt1; | ||
1533 | unsigned int apic_lvterr; | ||
1534 | unsigned int apic_tmict; | ||
1535 | unsigned int apic_tdcr; | ||
1536 | unsigned int apic_thmr; | ||
1537 | } apic_pm_state; | ||
1538 | |||
1539 | static int lapic_suspend(struct sys_device *dev, pm_message_t state) | ||
1540 | { | ||
1541 | unsigned long flags; | ||
1542 | int maxlvt; | ||
1543 | |||
1544 | if (!apic_pm_state.active) | ||
1545 | return 0; | ||
1546 | |||
1547 | maxlvt = lapic_get_maxlvt(); | ||
1548 | |||
1549 | apic_pm_state.apic_id = apic_read(APIC_ID); | ||
1550 | apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); | ||
1551 | apic_pm_state.apic_ldr = apic_read(APIC_LDR); | ||
1552 | apic_pm_state.apic_dfr = apic_read(APIC_DFR); | ||
1553 | apic_pm_state.apic_spiv = apic_read(APIC_SPIV); | ||
1554 | apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); | ||
1555 | if (maxlvt >= 4) | ||
1556 | apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); | ||
1557 | apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); | ||
1558 | apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); | ||
1559 | apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); | ||
1560 | apic_pm_state.apic_tmict = apic_read(APIC_TMICT); | ||
1561 | apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); | ||
1562 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | ||
1563 | if (maxlvt >= 5) | ||
1564 | apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); | ||
1565 | #endif | ||
1566 | |||
1567 | local_irq_save(flags); | ||
1568 | disable_local_APIC(); | ||
1569 | local_irq_restore(flags); | ||
1570 | return 0; | ||
1571 | } | ||
1572 | |||
1573 | static int lapic_resume(struct sys_device *dev) | ||
1574 | { | ||
1575 | unsigned int l, h; | ||
1576 | unsigned long flags; | ||
1577 | int maxlvt; | ||
1578 | |||
1579 | if (!apic_pm_state.active) | ||
1580 | return 0; | ||
1581 | |||
1582 | maxlvt = lapic_get_maxlvt(); | ||
1583 | |||
1584 | local_irq_save(flags); | ||
1585 | |||
1586 | #ifdef CONFIG_X86_64 | ||
1587 | if (x2apic) | ||
1588 | enable_x2apic(); | ||
1589 | else | ||
1590 | #endif | ||
1591 | { | ||
1592 | /* | ||
1593 | * Make sure the APICBASE points to the right address | ||
1594 | * | ||
1595 | * FIXME! This will be wrong if we ever support suspend on | ||
1596 | * SMP! We'll need to do this as part of the CPU restore! | ||
1597 | */ | ||
1598 | rdmsr(MSR_IA32_APICBASE, l, h); | ||
1599 | l &= ~MSR_IA32_APICBASE_BASE; | ||
1600 | l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; | ||
1601 | wrmsr(MSR_IA32_APICBASE, l, h); | ||
1602 | } | ||
1603 | |||
1604 | apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); | ||
1605 | apic_write(APIC_ID, apic_pm_state.apic_id); | ||
1606 | apic_write(APIC_DFR, apic_pm_state.apic_dfr); | ||
1607 | apic_write(APIC_LDR, apic_pm_state.apic_ldr); | ||
1608 | apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); | ||
1609 | apic_write(APIC_SPIV, apic_pm_state.apic_spiv); | ||
1610 | apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); | ||
1611 | apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); | ||
1612 | #if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) | ||
1613 | if (maxlvt >= 5) | ||
1614 | apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); | ||
1615 | #endif | ||
1616 | if (maxlvt >= 4) | ||
1617 | apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); | ||
1618 | apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); | ||
1619 | apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); | ||
1620 | apic_write(APIC_TMICT, apic_pm_state.apic_tmict); | ||
1621 | apic_write(APIC_ESR, 0); | ||
1622 | apic_read(APIC_ESR); | ||
1623 | apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); | ||
1624 | apic_write(APIC_ESR, 0); | ||
1625 | apic_read(APIC_ESR); | ||
1626 | |||
1627 | local_irq_restore(flags); | ||
1628 | |||
1629 | return 0; | ||
1630 | } | ||
1631 | |||
1632 | /* | ||
1633 | * This device has no shutdown method - fully functioning local APICs | ||
1634 | * are needed on every CPU up until machine_halt/restart/poweroff. | ||
1635 | */ | ||
1636 | |||
1637 | static struct sysdev_class lapic_sysclass = { | ||
1638 | .name = "lapic", | ||
1639 | .resume = lapic_resume, | ||
1640 | .suspend = lapic_suspend, | ||
1641 | }; | ||
1642 | |||
1643 | static struct sys_device device_lapic = { | ||
1644 | .id = 0, | ||
1645 | .cls = &lapic_sysclass, | ||
1646 | }; | ||
1647 | |||
1648 | static void __cpuinit apic_pm_activate(void) | ||
1649 | { | ||
1650 | apic_pm_state.active = 1; | ||
1651 | } | ||
1652 | |||
1653 | static int __init init_lapic_sysfs(void) | ||
1654 | { | ||
1655 | int error; | ||
1656 | |||
1657 | if (!cpu_has_apic) | ||
1658 | return 0; | ||
1659 | /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ | ||
1660 | |||
1661 | error = sysdev_class_register(&lapic_sysclass); | ||
1662 | if (!error) | ||
1663 | error = sysdev_register(&device_lapic); | ||
1664 | return error; | ||
1665 | } | ||
1666 | device_initcall(init_lapic_sysfs); | ||
1667 | |||
1668 | #else /* CONFIG_PM */ | ||
1669 | |||
1670 | static void apic_pm_activate(void) { } | ||
1671 | |||
1672 | #endif /* CONFIG_PM */ | ||
1673 | |||
1674 | /* | ||
1675 | * apic_is_clustered_box() -- Check if we can expect good TSC | ||
1676 | * | ||
1677 | * Thus far, the major user of this is IBM's Summit2 series: | ||
1678 | * | ||
1679 | * Clustered boxes may have unsynced TSC problems if they are | ||
1680 | * multi-chassis. Use available data to take a good guess. | ||
1681 | * If in doubt, go HPET. | ||
1682 | */ | ||
1683 | __cpuinit int apic_is_clustered_box(void) | ||
1684 | { | ||
1685 | int i, clusters, zeros; | ||
1686 | unsigned id; | ||
1687 | u16 *bios_cpu_apicid; | ||
1688 | DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); | ||
1689 | |||
1690 | /* | ||
1691 | * There is no such box with an AMD CPU yet. | ||
1692 | * Some AMD boxes with quad-core CPUs and 8 sockets have apicids | ||
1693 | * in [4, 0x23] or [8, 0x27] and could be mistaken for a | ||
1694 | * vsmp box; this still needs checking... | ||
1695 | */ | ||
1696 | if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) | ||
1697 | return 0; | ||
1698 | |||
1699 | bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); | ||
1700 | bitmap_zero(clustermap, NUM_APIC_CLUSTERS); | ||
1701 | |||
1702 | for (i = 0; i < NR_CPUS; i++) { | ||
1703 | /* are we being called early in kernel startup? */ | ||
1704 | if (bios_cpu_apicid) { | ||
1705 | id = bios_cpu_apicid[i]; | ||
1706 | } | ||
1707 | else if (i < nr_cpu_ids) { | ||
1708 | if (cpu_present(i)) | ||
1709 | id = per_cpu(x86_bios_cpu_apicid, i); | ||
1710 | else | ||
1711 | continue; | ||
1712 | } | ||
1713 | else | ||
1714 | break; | ||
1715 | |||
1716 | if (id != BAD_APICID) | ||
1717 | __set_bit(APIC_CLUSTERID(id), clustermap); | ||
1718 | } | ||
1719 | |||
1720 | /* Problem: Partially populated chassis may not have CPUs in some of | ||
1721 | * the APIC clusters they have been allocated. Only present CPUs have | ||
1722 | * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. | ||
1723 | * Since clusters are allocated sequentially, count zeros only if | ||
1724 | * they are bounded by ones. | ||
1725 | */ | ||
1726 | clusters = 0; | ||
1727 | zeros = 0; | ||
1728 | for (i = 0; i < NUM_APIC_CLUSTERS; i++) { | ||
1729 | if (test_bit(i, clustermap)) { | ||
1730 | clusters += 1 + zeros; | ||
1731 | zeros = 0; | ||
1732 | } else | ||
1733 | ++zeros; | ||
1734 | } | ||
1735 | |||
1736 | /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are | ||
1737 | * not guaranteed to be synced between boards | ||
1738 | */ | ||
1739 | if (is_vsmp_box() && clusters > 1) | ||
1740 | return 1; | ||
1741 | |||
1742 | /* | ||
1743 | * If clusters > 2, then it should be multi-chassis. | ||
1744 | * May have to revisit this when multi-core + hyperthreaded CPUs come | ||
1745 | * out, but AFAIK this will work even for them. | ||
1746 | */ | ||
1747 | return (clusters > 2); | ||
1748 | } | ||
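The "count zeros only if they are bounded by ones" rule above is easiest to see on a concrete bitmap. The following stand-alone sketch (hypothetical, user-space C, for illustration only) runs the same loop over a sample cluster map with clusters 0, 1 and 3 populated: the interior gap at cluster 2 is bounded by ones and therefore counted, giving clusters = 4, while the trailing zeros are ignored.

	#include <stdio.h>

	int main(void)
	{
		/* Sample cluster map: clusters 0, 1 and 3 are populated. */
		int map[8] = { 1, 1, 0, 1, 0, 0, 0, 0 };
		int i, clusters = 0, zeros = 0;

		for (i = 0; i < 8; i++) {
			if (map[i]) {
				clusters += 1 + zeros;
				zeros = 0;
			} else {
				++zeros;
			}
		}
		printf("clusters = %d\n", clusters);	/* prints 4 */
		return 0;
	}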
1749 | |||
1750 | static __init int setup_nox2apic(char *str) | ||
1751 | { | ||
1752 | disable_x2apic = 1; | ||
1753 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); | ||
1754 | return 0; | ||
1755 | } | ||
1756 | early_param("nox2apic", setup_nox2apic); | ||
1757 | |||
1758 | |||
1759 | /* | ||
1760 | * APIC command line parameters | ||
1761 | */ | ||
1762 | static int __init setup_disableapic(char *arg) | ||
1763 | { | ||
1764 | disable_apic = 1; | ||
1765 | setup_clear_cpu_cap(X86_FEATURE_APIC); | ||
1766 | return 0; | ||
1767 | } | ||
1768 | early_param("disableapic", setup_disableapic); | ||
1769 | |||
1770 | /* same as disableapic, for compatibility */ | ||
1771 | static int __init setup_nolapic(char *arg) | ||
1772 | { | ||
1773 | return setup_disableapic(arg); | ||
1774 | } | ||
1775 | early_param("nolapic", setup_nolapic); | ||
1776 | |||
1777 | static int __init parse_lapic_timer_c2_ok(char *arg) | ||
1778 | { | ||
1779 | local_apic_timer_c2_ok = 1; | ||
1780 | return 0; | ||
1781 | } | ||
1782 | early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); | ||
1783 | |||
1784 | static int __init parse_disable_apic_timer(char *arg) | ||
1785 | { | ||
1786 | disable_apic_timer = 1; | ||
1787 | return 0; | ||
1788 | } | ||
1789 | early_param("noapictimer", parse_disable_apic_timer); | ||
1790 | |||
1791 | static int __init parse_nolapic_timer(char *arg) | ||
1792 | { | ||
1793 | disable_apic_timer = 1; | ||
1794 | return 0; | ||
1795 | } | ||
1796 | early_param("nolapic_timer", parse_nolapic_timer); | ||
1797 | |||
1798 | static __init int setup_apicpmtimer(char *s) | ||
1799 | { | ||
1800 | apic_calibrate_pmtmr = 1; | ||
1801 | notsc_setup(NULL); | ||
1802 | return 0; | ||
1803 | } | ||
1804 | __setup("apicpmtimer", setup_apicpmtimer); | ||
1805 | |||
1806 | static int __init apic_set_verbosity(char *arg) | ||
1807 | { | ||
1808 | if (!arg) { | ||
1809 | #ifdef CONFIG_X86_64 | ||
1810 | skip_ioapic_setup = 0; | ||
1811 | ioapic_force = 1; | ||
1812 | return 0; | ||
1813 | #endif | ||
1814 | return -EINVAL; | ||
1815 | } | ||
1816 | |||
1817 | if (strcmp("debug", arg) == 0) | ||
1818 | apic_verbosity = APIC_DEBUG; | ||
1819 | else if (strcmp("verbose", arg) == 0) | ||
1820 | apic_verbosity = APIC_VERBOSE; | ||
1821 | else { | ||
1822 | printk(KERN_WARNING "APIC Verbosity level %s not recognised" | ||
1823 | " use apic=verbose or apic=debug\n", arg); | ||
1824 | return -EINVAL; | ||
1825 | } | ||
1826 | |||
1827 | return 0; | ||
1828 | } | ||
1829 | early_param("apic", apic_set_verbosity); | ||
1830 | |||
1831 | static int __init lapic_insert_resource(void) | ||
1832 | { | ||
1833 | if (!apic_phys) | ||
1834 | return -1; | ||
1835 | |||
1836 | /* Put local APIC into the resource map. */ | ||
1837 | lapic_resource.start = apic_phys; | ||
1838 | lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; | ||
1839 | insert_resource(&iomem_resource, &lapic_resource); | ||
1840 | |||
1841 | return 0; | ||
1842 | } | ||
1843 | |||
1844 | /* | ||
1845 | * This insert needs to be called after e820_reserve_resources(), | ||
1846 | * which uses request_resource(). | ||
1847 | */ | ||
1848 | late_initcall(lapic_insert_resource); | ||
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index fdd585f9c53d..f0dfe6f17e7e 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c | |||
@@ -1,8 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * BIOS run time interface routines. | 2 | * BIOS run time interface routines. |
3 | * | 3 | * |
4 | * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation; either version 2 of the License, or | 6 | * the Free Software Foundation; either version 2 of the License, or |
@@ -16,33 +14,128 @@ | |||
16 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
17 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 | * | ||
18 | * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. | ||
19 | * Copyright (c) Russ Anderson | ||
19 | */ | 20 | */ |
20 | 21 | ||
22 | #include <linux/efi.h> | ||
23 | #include <asm/efi.h> | ||
24 | #include <linux/io.h> | ||
21 | #include <asm/uv/bios.h> | 25 | #include <asm/uv/bios.h> |
26 | #include <asm/uv/uv_hub.h> | ||
27 | |||
28 | struct uv_systab uv_systab; | ||
22 | 29 | ||
23 | const char * | 30 | s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) |
24 | x86_bios_strerror(long status) | ||
25 | { | 31 | { |
26 | const char *str; | 32 | struct uv_systab *tab = &uv_systab; |
27 | switch (status) { | 33 | |
28 | case 0: str = "Call completed without error"; break; | 34 | if (!tab->function) |
29 | case -1: str = "Not implemented"; break; | 35 | /* |
30 | case -2: str = "Invalid argument"; break; | 36 | * BIOS does not support UV systab |
31 | case -3: str = "Call completed with error"; break; | 37 | */ |
32 | default: str = "Unknown BIOS status code"; break; | 38 | return BIOS_STATUS_UNIMPLEMENTED; |
33 | } | 39 | |
34 | return str; | 40 | return efi_call6((void *)__va(tab->function), |
41 | (u64)which, a1, a2, a3, a4, a5); | ||
35 | } | 42 | } |
36 | 43 | ||
37 | long | 44 | s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, |
38 | x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, | 45 | u64 a4, u64 a5) |
39 | unsigned long *drift_info) | ||
40 | { | 46 | { |
41 | struct uv_bios_retval isrv; | 47 | unsigned long bios_flags; |
48 | s64 ret; | ||
42 | 49 | ||
43 | BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); | 50 | local_irq_save(bios_flags); |
44 | *ticks_per_second = isrv.v0; | 51 | ret = uv_bios_call(which, a1, a2, a3, a4, a5); |
45 | *drift_info = isrv.v1; | 52 | local_irq_restore(bios_flags); |
46 | return isrv.status; | 53 | |
54 | return ret; | ||
47 | } | 55 | } |
48 | EXPORT_SYMBOL_GPL(x86_bios_freq_base); | 56 | |
57 | s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, | ||
58 | u64 a4, u64 a5) | ||
59 | { | ||
60 | s64 ret; | ||
61 | |||
62 | preempt_disable(); | ||
63 | ret = uv_bios_call(which, a1, a2, a3, a4, a5); | ||
64 | preempt_enable(); | ||
65 | |||
66 | return ret; | ||
67 | } | ||
68 | |||
69 | |||
70 | long sn_partition_id; | ||
71 | EXPORT_SYMBOL_GPL(sn_partition_id); | ||
72 | long uv_coherency_id; | ||
73 | EXPORT_SYMBOL_GPL(uv_coherency_id); | ||
74 | long uv_region_size; | ||
75 | EXPORT_SYMBOL_GPL(uv_region_size); | ||
76 | int uv_type; | ||
77 | |||
78 | |||
79 | s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, | ||
80 | long *region) | ||
81 | { | ||
82 | s64 ret; | ||
83 | u64 v0, v1; | ||
84 | union partition_info_u part; | ||
85 | |||
86 | ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, | ||
87 | (u64)(&v0), (u64)(&v1), 0, 0); | ||
88 | if (ret != BIOS_STATUS_SUCCESS) | ||
89 | return ret; | ||
90 | |||
91 | part.val = v0; | ||
92 | if (uvtype) | ||
93 | *uvtype = part.hub_version; | ||
94 | if (partid) | ||
95 | *partid = part.partition_id; | ||
96 | if (coher) | ||
97 | *coher = part.coherence_id; | ||
98 | if (region) | ||
99 | *region = part.region_size; | ||
100 | return ret; | ||
101 | } | ||
102 | |||
103 | |||
104 | s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) | ||
105 | { | ||
106 | return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, | ||
107 | (u64)ticks_per_second, 0, 0, 0); | ||
108 | } | ||
109 | EXPORT_SYMBOL_GPL(uv_bios_freq_base); | ||
110 | |||
111 | |||
112 | #ifdef CONFIG_EFI | ||
113 | void uv_bios_init(void) | ||
114 | { | ||
115 | struct uv_systab *tab; | ||
116 | |||
117 | if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || | ||
118 | (efi.uv_systab == (unsigned long)NULL)) { | ||
119 | printk(KERN_CRIT "No EFI UV System Table.\n"); | ||
120 | uv_systab.function = (unsigned long)NULL; | ||
121 | return; | ||
122 | } | ||
123 | |||
124 | tab = (struct uv_systab *)ioremap(efi.uv_systab, | ||
125 | sizeof(struct uv_systab)); | ||
126 | if (strncmp(tab->signature, "UVST", 4) != 0) | ||
127 | printk(KERN_ERR "bad signature in UV system table!"); | ||
128 | |||
129 | /* | ||
130 | * Copy table to permanent spot for later use. | ||
131 | */ | ||
132 | memcpy(&uv_systab, tab, sizeof(struct uv_systab)); | ||
133 | iounmap(tab); | ||
134 | |||
135 | printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision); | ||
136 | } | ||
137 | #else /* !CONFIG_EFI */ | ||
138 | |||
139 | void uv_bios_init(void) { } | ||
140 | #endif | ||
141 | |||
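The rewritten bios_uv.c above funnels every UV BIOS request through uv_bios_call(), which hands the command and arguments to the firmware via efi_call6(); ordinary code is expected to use the typed wrappers. A minimal caller sketch follows (hypothetical, for illustration only; it assumes the declarations from <asm/uv/bios.h>, including the BIOS_FREQ_BASE_REALTIME_CLOCK clock type).

	/* Hypothetical caller, for illustration only. */
	#include <linux/kernel.h>
	#include <asm/uv/bios.h>

	static void uv_print_rtc_freq(void)
	{
		u64 ticks_per_sec;
		s64 status;

		status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
					   &ticks_per_sec);
		if (status == BIOS_STATUS_SUCCESS)
			printk(KERN_INFO "UV RTC runs at %llu ticks/s\n",
			       (unsigned long long)ticks_per_sec);
		else
			printk(KERN_ERR "uv_bios_freq_base() failed: %lld\n",
			       (long long)status);
	}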
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 000000000000..667df55a4399 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore | |||
@@ -0,0 +1 @@ | |||
capflags.c | |||
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 32e73520adf7..8f1e31db2ad5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -249,7 +249,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | |||
249 | } | 249 | } |
250 | numa_set_node(cpu, node); | 250 | numa_set_node(cpu, node); |
251 | 251 | ||
252 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | 252 | printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); |
253 | #endif | 253 | #endif |
254 | } | 254 | } |
255 | 255 | ||
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fb789dd9e691..25581dcb280e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -124,18 +124,25 @@ static inline int flag_is_changeable_p(u32 flag) | |||
124 | { | 124 | { |
125 | u32 f1, f2; | 125 | u32 f1, f2; |
126 | 126 | ||
127 | asm("pushfl\n\t" | 127 | /* |
128 | "pushfl\n\t" | 128 | * Cyrix and IDT cpus allow disabling of CPUID |
129 | "popl %0\n\t" | 129 | * so the code below may return different results |
130 | "movl %0,%1\n\t" | 130 | * when it is executed before and after enabling |
131 | "xorl %2,%0\n\t" | 131 | * the CPUID. Add "volatile" to not allow gcc to |
132 | "pushl %0\n\t" | 132 | * optimize the subsequent calls to this function. |
133 | "popfl\n\t" | 133 | */ |
134 | "pushfl\n\t" | 134 | asm volatile ("pushfl\n\t" |
135 | "popl %0\n\t" | 135 | "pushfl\n\t" |
136 | "popfl\n\t" | 136 | "popl %0\n\t" |
137 | : "=&r" (f1), "=&r" (f2) | 137 | "movl %0,%1\n\t" |
138 | : "ir" (flag)); | 138 | "xorl %2,%0\n\t" |
139 | "pushl %0\n\t" | ||
140 | "popfl\n\t" | ||
141 | "pushfl\n\t" | ||
142 | "popl %0\n\t" | ||
143 | "popfl\n\t" | ||
144 | : "=&r" (f1), "=&r" (f2) | ||
145 | : "ir" (flag)); | ||
139 | 146 | ||
140 | return ((f1^f2) & flag) != 0; | 147 | return ((f1^f2) & flag) != 0; |
141 | } | 148 | } |
@@ -719,12 +726,24 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
719 | #endif | 726 | #endif |
720 | } | 727 | } |
721 | 728 | ||
729 | #ifdef CONFIG_X86_64 | ||
730 | static void vgetcpu_set_mode(void) | ||
731 | { | ||
732 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
733 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
734 | else | ||
735 | vgetcpu_mode = VGETCPU_LSL; | ||
736 | } | ||
737 | #endif | ||
738 | |||
722 | void __init identify_boot_cpu(void) | 739 | void __init identify_boot_cpu(void) |
723 | { | 740 | { |
724 | identify_cpu(&boot_cpu_data); | 741 | identify_cpu(&boot_cpu_data); |
725 | #ifdef CONFIG_X86_32 | 742 | #ifdef CONFIG_X86_32 |
726 | sysenter_setup(); | 743 | sysenter_setup(); |
727 | enable_sep_cpu(); | 744 | enable_sep_cpu(); |
745 | #else | ||
746 | vgetcpu_set_mode(); | ||
728 | #endif | 747 | #endif |
729 | } | 748 | } |
730 | 749 | ||
@@ -797,7 +816,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) | |||
797 | else if (c->cpuid_level >= 0) | 816 | else if (c->cpuid_level >= 0) |
798 | vendor = c->x86_vendor_id; | 817 | vendor = c->x86_vendor_id; |
799 | 818 | ||
800 | if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) | 819 | if (vendor && !strstr(c->x86_model_id, vendor)) |
801 | printk(KERN_CONT "%s ", vendor); | 820 | printk(KERN_CONT "%s ", vendor); |
802 | 821 | ||
803 | if (c->x86_model_id[0]) | 822 | if (c->x86_model_id[0]) |
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 06fcce516d51..b0461856acfb 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> | 2 | * (C) 2001-2004 Dave Jones. <davej@redhat.com> |
3 | * (C) 2002 Padraig Brady. <padraig@antefacto.com> | 3 | * (C) 2002 Padraig Brady. <padraig@antefacto.com> |
4 | * | 4 | * |
5 | * Licensed under the terms of the GNU GPL License version 2. | 5 | * Licensed under the terms of the GNU GPL License version 2. |
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); | |||
1019 | module_param(revid_errata, int, 0644); | 1019 | module_param(revid_errata, int, 0644); |
1020 | MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); | 1020 | MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); |
1021 | 1021 | ||
1022 | MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); | 1022 | MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); |
1023 | MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); | 1023 | MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); |
1024 | MODULE_LICENSE ("GPL"); | 1024 | MODULE_LICENSE ("GPL"); |
1025 | 1025 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index b5ced806a316..c1ac5790c63e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c | |||
@@ -246,7 +246,7 @@ static void __exit powernow_k6_exit(void) | |||
246 | } | 246 | } |
247 | 247 | ||
248 | 248 | ||
249 | MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); | 249 | MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); |
250 | MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); | 250 | MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); |
251 | MODULE_LICENSE("GPL"); | 251 | MODULE_LICENSE("GPL"); |
252 | 252 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 0a61159d7b71..7c7d56b43136 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * AMD K7 Powernow driver. | 2 | * AMD K7 Powernow driver. |
3 | * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. | 3 | * (C) 2003 Dave Jones on behalf of SuSE Labs. |
4 | * (C) 2003-2004 Dave Jones <davej@redhat.com> | 4 | * (C) 2003-2004 Dave Jones <davej@redhat.com> |
5 | * | 5 | * |
6 | * Licensed under the terms of the GNU GPL License version 2. | 6 | * Licensed under the terms of the GNU GPL License version 2. |
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void) | |||
692 | module_param(acpi_force, int, 0444); | 692 | module_param(acpi_force, int, 0444); |
693 | MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); | 693 | MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); |
694 | 694 | ||
695 | MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); | 695 | MODULE_AUTHOR ("Dave Jones <davej@redhat.com>"); |
696 | MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); | 696 | MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); |
697 | MODULE_LICENSE ("GPL"); | 697 | MODULE_LICENSE ("GPL"); |
698 | 698 | ||
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395038d8..008d23ba491b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Support : mark.langsdorf@amd.com | 7 | * Support : mark.langsdorf@amd.com |
8 | * | 8 | * |
9 | * Based on the powernow-k7.c module written by Dave Jones. | 9 | * Based on the powernow-k7.c module written by Dave Jones. |
10 | * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs | 10 | * (C) 2003 Dave Jones on behalf of SuSE Labs |
11 | * (C) 2004 Dominik Brodowski <linux@brodo.de> | 11 | * (C) 2004 Dominik Brodowski <linux@brodo.de> |
12 | * (C) 2004 Pavel Machek <pavel@suse.cz> | 12 | * (C) 2004 Pavel Machek <pavel@suse.cz> |
13 | * Licensed under the terms of the GNU GPL License version 2. | 13 | * Licensed under the terms of the GNU GPL License version 2. |
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 191f7263c61d..04d0376b64b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | |||
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void) | |||
431 | } | 431 | } |
432 | 432 | ||
433 | 433 | ||
434 | MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); | 434 | MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>"); |
435 | MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); | 435 | MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); |
436 | MODULE_LICENSE ("GPL"); | 436 | MODULE_LICENSE ("GPL"); |
437 | 437 | ||
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 99468dbd08da..cce0b6118d55 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -174,7 +174,7 @@ static void __cpuinit srat_detect_node(void) | |||
174 | node = first_node(node_online_map); | 174 | node = first_node(node_online_map); |
175 | numa_set_node(cpu, node); | 175 | numa_set_node(cpu, node); |
176 | 176 | ||
177 | printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); | 177 | printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); |
178 | #endif | 178 | #endif |
179 | } | 179 | } |
180 | 180 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index f390c9f66351..dd3af6e7b39a 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Athlon/Hammer specific Machine Check Exception Reporting | 2 | * Athlon specific Machine Check Exception Reporting |
3 | * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> | 3 | * (C) Copyright 2002 Dave Jones <davej@redhat.com> |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 774d87cfd8cd..0ebf3fc6a610 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * mce.c - x86 Machine Check Exception Reporting | 2 | * mce.c - x86 Machine Check Exception Reporting |
3 | * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> | 3 | * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com> |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index cc1fccdd31e0..a74af128efc9 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Non Fatal Machine Check Exception Reporting | 2 | * Non Fatal Machine Check Exception Reporting |
3 | * | 3 | * |
4 | * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> | 4 | * (C) Copyright 2002 Dave Jones. <davej@redhat.com> |
5 | * | 5 | * |
6 | * This file contains routines to check for non-fatal MCEs every 15s | 6 | * This file contains routines to check for non-fatal MCEs every 15s |
7 | * | 7 | * |
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6bff382094f5..9abd48b22674 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include <linux/smp.h> | 18 | #include <linux/smp.h> |
19 | #include <linux/nmi.h> | 19 | #include <linux/nmi.h> |
20 | #include <linux/kprobes.h> | ||
21 | |||
20 | #include <asm/apic.h> | 22 | #include <asm/apic.h> |
21 | #include <asm/intel_arch_perfmon.h> | 23 | #include <asm/intel_arch_perfmon.h> |
22 | 24 | ||
@@ -336,7 +338,8 @@ static void single_msr_unreserve(void) | |||
336 | release_perfctr_nmi(wd_ops->perfctr); | 338 | release_perfctr_nmi(wd_ops->perfctr); |
337 | } | 339 | } |
338 | 340 | ||
339 | static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | 341 | static void __kprobes |
342 | single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | ||
340 | { | 343 | { |
341 | /* start the cycle over again */ | 344 | /* start the cycle over again */ |
342 | write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); | 345 | write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); |
@@ -401,7 +404,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) | |||
401 | return 1; | 404 | return 1; |
402 | } | 405 | } |
403 | 406 | ||
404 | static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | 407 | static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) |
405 | { | 408 | { |
406 | /* | 409 | /* |
407 | * P6 based Pentium M need to re-unmask | 410 | * P6 based Pentium M need to re-unmask |
@@ -605,7 +608,7 @@ static void p4_unreserve(void) | |||
605 | release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); | 608 | release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); |
606 | } | 609 | } |
607 | 610 | ||
608 | static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) | 611 | static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) |
609 | { | 612 | { |
610 | unsigned dummy; | 613 | unsigned dummy; |
611 | /* | 614 | /* |
@@ -784,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz) | |||
784 | return hz; | 787 | return hz; |
785 | } | 788 | } |
786 | 789 | ||
787 | int lapic_wd_event(unsigned nmi_hz) | 790 | int __kprobes lapic_wd_event(unsigned nmi_hz) |
788 | { | 791 | { |
789 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); | 792 | struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); |
790 | u64 ctr; | 793 | u64 ctr; |
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6a44d6465991..72cefd1e649b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c | |||
@@ -147,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu) | |||
147 | { | 147 | { |
148 | struct device *dev; | 148 | struct device *dev; |
149 | 149 | ||
150 | dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), | 150 | dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, |
151 | NULL, "cpu%d", cpu); | 151 | "cpu%d", cpu); |
152 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 152 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
153 | } | 153 | } |
154 | 154 | ||
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 72d0c56c1b48..f7cdb3b457aa 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c | |||
@@ -13,6 +13,9 @@ | |||
13 | 13 | ||
14 | static void *kdump_buf_page; | 14 | static void *kdump_buf_page; |
15 | 15 | ||
16 | /* Stores the physical address of the ELF header of the crash image. */ | ||
17 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
18 | |||
16 | /** | 19 | /** |
17 | * copy_oldmem_page - copy one page from "oldmem" | 20 | * copy_oldmem_page - copy one page from "oldmem" |
18 | * @pfn: page frame number to be copied | 21 | * @pfn: page frame number to be copied |
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index e90a60ef10c2..045b36cada65 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c | |||
@@ -10,6 +10,9 @@ | |||
10 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
11 | #include <linux/io.h> | 11 | #include <linux/io.h> |
12 | 12 | ||
13 | /* Stores the physical address of the ELF header of the crash image. */ | ||
14 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
15 | |||
13 | /** | 16 | /** |
14 | * copy_oldmem_page - copy one page from "oldmem" | 17 | * copy_oldmem_page - copy one page from "oldmem" |
15 | * @pfn: page frame number to be copied | 18 | * @pfn: page frame number to be copied |
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 395acb12b0d1..b4f14c6c09d9 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c | |||
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { | |||
66 | .ds = __USER_DS, | 66 | .ds = __USER_DS, |
67 | .fs = __KERNEL_PERCPU, | 67 | .fs = __KERNEL_PERCPU, |
68 | 68 | ||
69 | .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir) | 69 | .__cr3 = __pa_nodebug(swapper_pg_dir), |
70 | } | 70 | } |
71 | }; | 71 | }; |
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 000000000000..1a78180f08d3 --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c | |||
@@ -0,0 +1,449 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/kprobes.h> | ||
7 | #include <linux/uaccess.h> | ||
8 | #include <linux/utsname.h> | ||
9 | #include <linux/hardirq.h> | ||
10 | #include <linux/kdebug.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/ptrace.h> | ||
13 | #include <linux/kexec.h> | ||
14 | #include <linux/bug.h> | ||
15 | #include <linux/nmi.h> | ||
16 | #include <linux/sysfs.h> | ||
17 | |||
18 | #include <asm/stacktrace.h> | ||
19 | |||
20 | #define STACKSLOTS_PER_LINE 8 | ||
21 | #define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) | ||
22 | |||
23 | int panic_on_unrecovered_nmi; | ||
24 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; | ||
25 | static unsigned int code_bytes = 64; | ||
26 | static int die_counter; | ||
27 | |||
28 | void printk_address(unsigned long address, int reliable) | ||
29 | { | ||
30 | printk(" [<%p>] %s%pS\n", (void *) address, | ||
31 | reliable ? "" : "? ", (void *) address); | ||
32 | } | ||
33 | |||
34 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
35 | void *p, unsigned int size, void *end) | ||
36 | { | ||
37 | void *t = tinfo; | ||
38 | if (end) { | ||
39 | if (p < end && p >= (end-THREAD_SIZE)) | ||
40 | return 1; | ||
41 | else | ||
42 | return 0; | ||
43 | } | ||
44 | return p > t && p < t + THREAD_SIZE - size; | ||
45 | } | ||
46 | |||
47 | /* The form of the top of the frame on the stack */ | ||
48 | struct stack_frame { | ||
49 | struct stack_frame *next_frame; | ||
50 | unsigned long return_address; | ||
51 | }; | ||
52 | |||
53 | static inline unsigned long | ||
54 | print_context_stack(struct thread_info *tinfo, | ||
55 | unsigned long *stack, unsigned long bp, | ||
56 | const struct stacktrace_ops *ops, void *data, | ||
57 | unsigned long *end) | ||
58 | { | ||
59 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
60 | |||
61 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | ||
62 | unsigned long addr; | ||
63 | |||
64 | addr = *stack; | ||
65 | if (__kernel_text_address(addr)) { | ||
66 | if ((unsigned long) stack == bp + sizeof(long)) { | ||
67 | ops->address(data, addr, 1); | ||
68 | frame = frame->next_frame; | ||
69 | bp = (unsigned long) frame; | ||
70 | } else { | ||
71 | ops->address(data, addr, bp == 0); | ||
72 | } | ||
73 | } | ||
74 | stack++; | ||
75 | } | ||
76 | return bp; | ||
77 | } | ||
78 | |||
79 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
80 | unsigned long *stack, unsigned long bp, | ||
81 | const struct stacktrace_ops *ops, void *data) | ||
82 | { | ||
83 | if (!task) | ||
84 | task = current; | ||
85 | |||
86 | if (!stack) { | ||
87 | unsigned long dummy; | ||
88 | stack = &dummy; | ||
89 | if (task && task != current) | ||
90 | stack = (unsigned long *)task->thread.sp; | ||
91 | } | ||
92 | |||
93 | #ifdef CONFIG_FRAME_POINTER | ||
94 | if (!bp) { | ||
95 | if (task == current) { | ||
96 | /* Grab bp right from our regs */ | ||
97 | get_bp(bp); | ||
98 | } else { | ||
99 | /* bp is the last reg pushed by switch_to */ | ||
100 | bp = *(unsigned long *) task->thread.sp; | ||
101 | } | ||
102 | } | ||
103 | #endif | ||
104 | |||
105 | for (;;) { | ||
106 | struct thread_info *context; | ||
107 | |||
108 | context = (struct thread_info *) | ||
109 | ((unsigned long)stack & (~(THREAD_SIZE - 1))); | ||
110 | bp = print_context_stack(context, stack, bp, ops, data, NULL); | ||
111 | |||
112 | stack = (unsigned long *)context->previous_esp; | ||
113 | if (!stack) | ||
114 | break; | ||
115 | if (ops->stack(data, "IRQ") < 0) | ||
116 | break; | ||
117 | touch_nmi_watchdog(); | ||
118 | } | ||
119 | } | ||
120 | EXPORT_SYMBOL(dump_trace); | ||
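dump_trace() is the generic stack walker; the rest of this file just feeds it one particular set of stacktrace_ops. As a hypothetical illustration of that interface (not part of this file), a consumer that merely counts the reliable frames on the current stack could look like this:

	/* Hypothetical dump_trace() consumer, for illustration only. */
	static void count_warning(void *data, char *msg) { }
	static void count_warning_symbol(void *data, char *msg,
					 unsigned long symbol) { }
	static int count_stack(void *data, char *name) { return 0; }

	static void count_address(void *data, unsigned long addr, int reliable)
	{
		if (reliable)
			(*(unsigned int *)data)++;
	}

	static const struct stacktrace_ops count_ops = {
		.warning	= count_warning,
		.warning_symbol	= count_warning_symbol,
		.stack		= count_stack,
		.address	= count_address,
	};

	static unsigned int count_reliable_frames(void)
	{
		unsigned int n = 0;

		dump_trace(current, NULL, NULL, 0, &count_ops, &n);
		return n;
	}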
121 | |||
122 | static void | ||
123 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
124 | { | ||
125 | printk(data); | ||
126 | print_symbol(msg, symbol); | ||
127 | printk("\n"); | ||
128 | } | ||
129 | |||
130 | static void print_trace_warning(void *data, char *msg) | ||
131 | { | ||
132 | printk("%s%s\n", (char *)data, msg); | ||
133 | } | ||
134 | |||
135 | static int print_trace_stack(void *data, char *name) | ||
136 | { | ||
137 | printk("%s <%s> ", (char *)data, name); | ||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * Print one address/symbol entry per line. | ||
143 | */ | ||
144 | static void print_trace_address(void *data, unsigned long addr, int reliable) | ||
145 | { | ||
146 | touch_nmi_watchdog(); | ||
147 | printk(data); | ||
148 | printk_address(addr, reliable); | ||
149 | } | ||
150 | |||
151 | static const struct stacktrace_ops print_trace_ops = { | ||
152 | .warning = print_trace_warning, | ||
153 | .warning_symbol = print_trace_warning_symbol, | ||
154 | .stack = print_trace_stack, | ||
155 | .address = print_trace_address, | ||
156 | }; | ||
157 | |||
158 | static void | ||
159 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
160 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
161 | { | ||
162 | printk("%sCall Trace:\n", log_lvl); | ||
163 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | ||
164 | } | ||
165 | |||
166 | void show_trace(struct task_struct *task, struct pt_regs *regs, | ||
167 | unsigned long *stack, unsigned long bp) | ||
168 | { | ||
169 | show_trace_log_lvl(task, regs, stack, bp, ""); | ||
170 | } | ||
171 | |||
172 | static void | ||
173 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
174 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
175 | { | ||
176 | unsigned long *stack; | ||
177 | int i; | ||
178 | |||
179 | if (sp == NULL) { | ||
180 | if (task) | ||
181 | sp = (unsigned long *)task->thread.sp; | ||
182 | else | ||
183 | sp = (unsigned long *)&sp; | ||
184 | } | ||
185 | |||
186 | stack = sp; | ||
187 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
188 | if (kstack_end(stack)) | ||
189 | break; | ||
190 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | ||
191 | printk("\n%s", log_lvl); | ||
192 | printk(" %08lx", *stack++); | ||
193 | touch_nmi_watchdog(); | ||
194 | } | ||
195 | printk("\n"); | ||
196 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
197 | } | ||
198 | |||
199 | void show_stack(struct task_struct *task, unsigned long *sp) | ||
200 | { | ||
201 | show_stack_log_lvl(task, NULL, sp, 0, ""); | ||
202 | } | ||
203 | |||
204 | /* | ||
205 | * The architecture-independent dump_stack generator | ||
206 | */ | ||
207 | void dump_stack(void) | ||
208 | { | ||
209 | unsigned long bp = 0; | ||
210 | unsigned long stack; | ||
211 | |||
212 | #ifdef CONFIG_FRAME_POINTER | ||
213 | if (!bp) | ||
214 | get_bp(bp); | ||
215 | #endif | ||
216 | |||
217 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
218 | current->pid, current->comm, print_tainted(), | ||
219 | init_utsname()->release, | ||
220 | (int)strcspn(init_utsname()->version, " "), | ||
221 | init_utsname()->version); | ||
222 | show_trace(NULL, NULL, &stack, bp); | ||
223 | } | ||
224 | |||
225 | EXPORT_SYMBOL(dump_stack); | ||
226 | |||
227 | void show_registers(struct pt_regs *regs) | ||
228 | { | ||
229 | int i; | ||
230 | |||
231 | print_modules(); | ||
232 | __show_regs(regs, 0); | ||
233 | |||
234 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", | ||
235 | TASK_COMM_LEN, current->comm, task_pid_nr(current), | ||
236 | current_thread_info(), current, task_thread_info(current)); | ||
237 | /* | ||
238 | * When in-kernel, we also print out the stack and code at the | ||
239 | * time of the fault. | ||
240 | */ | ||
241 | if (!user_mode_vm(regs)) { | ||
242 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
243 | unsigned int code_len = code_bytes; | ||
244 | unsigned char c; | ||
245 | u8 *ip; | ||
246 | |||
247 | printk(KERN_EMERG "Stack:\n"); | ||
248 | show_stack_log_lvl(NULL, regs, ®s->sp, | ||
249 | 0, KERN_EMERG); | ||
250 | |||
251 | printk(KERN_EMERG "Code: "); | ||
252 | |||
253 | ip = (u8 *)regs->ip - code_prologue; | ||
254 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
255 | /* try starting at IP */ | ||
256 | ip = (u8 *)regs->ip; | ||
257 | code_len = code_len - code_prologue + 1; | ||
258 | } | ||
259 | for (i = 0; i < code_len; i++, ip++) { | ||
260 | if (ip < (u8 *)PAGE_OFFSET || | ||
261 | probe_kernel_address(ip, c)) { | ||
262 | printk(" Bad EIP value."); | ||
263 | break; | ||
264 | } | ||
265 | if (ip == (u8 *)regs->ip) | ||
266 | printk("<%02x> ", c); | ||
267 | else | ||
268 | printk("%02x ", c); | ||
269 | } | ||
270 | } | ||
271 | printk("\n"); | ||
272 | } | ||
273 | |||
274 | int is_valid_bugaddr(unsigned long ip) | ||
275 | { | ||
276 | unsigned short ud2; | ||
277 | |||
278 | if (ip < PAGE_OFFSET) | ||
279 | return 0; | ||
280 | if (probe_kernel_address((unsigned short *)ip, ud2)) | ||
281 | return 0; | ||
282 | |||
283 | return ud2 == 0x0b0f; | ||
284 | } | ||
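
The magic number compared above is just the UD2 opcode seen through a 16-bit load: UD2 is encoded as the bytes 0x0f 0x0b, which a little-endian x86 reads back as 0x0b0f. A standalone check of that encoding (assumes a little-endian host, as on x86):

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned char insn[2] = { 0x0f, 0x0b };  /* UD2 opcode bytes          */
        unsigned short v;

        memcpy(&v, insn, sizeof(v));             /* 16-bit little-endian read */
        printf("read back as 0x%04x -> %s\n", v,
               v == 0x0b0f ? "UD2" : "not UD2");
        return 0;
}
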
285 | |||
286 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
287 | static int die_owner = -1; | ||
288 | static unsigned int die_nest_count; | ||
289 | |||
290 | unsigned __kprobes long oops_begin(void) | ||
291 | { | ||
292 | unsigned long flags; | ||
293 | |||
294 | oops_enter(); | ||
295 | |||
296 | if (die_owner != raw_smp_processor_id()) { | ||
297 | console_verbose(); | ||
298 | raw_local_irq_save(flags); | ||
299 | __raw_spin_lock(&die_lock); | ||
300 | die_owner = smp_processor_id(); | ||
301 | die_nest_count = 0; | ||
302 | bust_spinlocks(1); | ||
303 | } else { | ||
304 | raw_local_irq_save(flags); | ||
305 | } | ||
306 | die_nest_count++; | ||
307 | return flags; | ||
308 | } | ||
309 | |||
310 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
311 | { | ||
312 | bust_spinlocks(0); | ||
313 | die_owner = -1; | ||
314 | add_taint(TAINT_DIE); | ||
315 | __raw_spin_unlock(&die_lock); | ||
316 | raw_local_irq_restore(flags); | ||
317 | |||
318 | if (!regs) | ||
319 | return; | ||
320 | |||
321 | if (kexec_should_crash(current)) | ||
322 | crash_kexec(regs); | ||
323 | if (in_interrupt()) | ||
324 | panic("Fatal exception in interrupt"); | ||
325 | if (panic_on_oops) | ||
326 | panic("Fatal exception"); | ||
327 | oops_exit(); | ||
328 | do_exit(signr); | ||
329 | } | ||
330 | |||
331 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
332 | { | ||
333 | unsigned short ss; | ||
334 | unsigned long sp; | ||
335 | |||
336 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
337 | #ifdef CONFIG_PREEMPT | ||
338 | printk("PREEMPT "); | ||
339 | #endif | ||
340 | #ifdef CONFIG_SMP | ||
341 | printk("SMP "); | ||
342 | #endif | ||
343 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
344 | printk("DEBUG_PAGEALLOC"); | ||
345 | #endif | ||
346 | printk("\n"); | ||
347 | sysfs_printk_last_file(); | ||
348 | if (notify_die(DIE_OOPS, str, regs, err, | ||
349 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
350 | return 1; | ||
351 | |||
352 | show_registers(regs); | ||
353 | /* Executive summary in case the oops scrolled away */ | ||
354 | sp = (unsigned long) (®s->sp); | ||
355 | savesegment(ss, ss); | ||
356 | if (user_mode(regs)) { | ||
357 | sp = regs->sp; | ||
358 | ss = regs->ss & 0xffff; | ||
359 | } | ||
360 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | ||
361 | print_symbol("%s", regs->ip); | ||
362 | printk(" SS:ESP %04x:%08lx\n", ss, sp); | ||
363 | return 0; | ||
364 | } | ||
365 | |||
366 | /* | ||
367 | * This is gone through when something in the kernel has done something bad | ||
368 | * and is about to be terminated: | ||
369 | */ | ||
370 | void die(const char *str, struct pt_regs *regs, long err) | ||
371 | { | ||
372 | unsigned long flags = oops_begin(); | ||
373 | |||
374 | if (die_nest_count < 3) { | ||
375 | report_bug(regs->ip, regs); | ||
376 | |||
377 | if (__die(str, regs, err)) | ||
378 | regs = NULL; | ||
379 | } else { | ||
380 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | ||
381 | } | ||
382 | |||
383 | oops_end(flags, regs, SIGSEGV); | ||
384 | } | ||
385 | |||
386 | static DEFINE_SPINLOCK(nmi_print_lock); | ||
387 | |||
388 | void notrace __kprobes | ||
389 | die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
390 | { | ||
391 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
392 | return; | ||
393 | |||
394 | spin_lock(&nmi_print_lock); | ||
395 | /* | ||
396 | * We are in trouble anyway, let's at least try | ||
397 | * to get a message out: | ||
398 | */ | ||
399 | bust_spinlocks(1); | ||
400 | printk(KERN_EMERG "%s", str); | ||
401 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
402 | smp_processor_id(), regs->ip); | ||
403 | show_registers(regs); | ||
404 | if (do_panic) | ||
405 | panic("Non maskable interrupt"); | ||
406 | console_silent(); | ||
407 | spin_unlock(&nmi_print_lock); | ||
408 | bust_spinlocks(0); | ||
409 | |||
410 | /* | ||
411 | * If we are in the kernel we are probably nested up pretty badly | ||
412 | * and might as well get out now while we still can: | ||
413 | */ | ||
414 | if (!user_mode_vm(regs)) { | ||
415 | current->thread.trap_no = 2; | ||
416 | crash_kexec(regs); | ||
417 | } | ||
418 | |||
419 | do_exit(SIGSEGV); | ||
420 | } | ||
421 | |||
422 | static int __init oops_setup(char *s) | ||
423 | { | ||
424 | if (!s) | ||
425 | return -EINVAL; | ||
426 | if (!strcmp(s, "panic")) | ||
427 | panic_on_oops = 1; | ||
428 | return 0; | ||
429 | } | ||
430 | early_param("oops", oops_setup); | ||
431 | |||
432 | static int __init kstack_setup(char *s) | ||
433 | { | ||
434 | if (!s) | ||
435 | return -EINVAL; | ||
436 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
437 | return 0; | ||
438 | } | ||
439 | early_param("kstack", kstack_setup); | ||
440 | |||
441 | static int __init code_bytes_setup(char *s) | ||
442 | { | ||
443 | code_bytes = simple_strtoul(s, NULL, 0); | ||
444 | if (code_bytes > 8192) | ||
445 | code_bytes = 8192; | ||
446 | |||
447 | return 1; | ||
448 | } | ||
449 | __setup("code_bytes=", code_bytes_setup); | ||
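
For reference, the "Code:" dump produced by show_registers() places its window slightly behind the faulting instruction: code_prologue = code_bytes * 43 / 64 bytes come before regs->ip and the remainder from regs->ip onward. With the default code_bytes of 64 that is a 43/21 split, as this trivial arithmetic sketch shows:

#include <stdio.h>

int main(void)
{
        unsigned int code_bytes    = 64;    /* default; tunable via "code_bytes=" */
        unsigned int code_prologue = code_bytes * 43 / 64;

        printf("%u-byte window: %u before ip, %u from ip onward\n",
               code_bytes, code_prologue, code_bytes - code_prologue);
        /* prints: 64-byte window: 43 before ip, 21 from ip onward */
        return 0;
}
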
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 000000000000..96a5db7da8a7 --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | */ | ||
5 | #include <linux/kallsyms.h> | ||
6 | #include <linux/kprobes.h> | ||
7 | #include <linux/uaccess.h> | ||
8 | #include <linux/utsname.h> | ||
9 | #include <linux/hardirq.h> | ||
10 | #include <linux/kdebug.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/ptrace.h> | ||
13 | #include <linux/kexec.h> | ||
14 | #include <linux/bug.h> | ||
15 | #include <linux/nmi.h> | ||
16 | #include <linux/sysfs.h> | ||
17 | |||
18 | #include <asm/stacktrace.h> | ||
19 | |||
20 | #define STACKSLOTS_PER_LINE 4 | ||
21 | #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) | ||
22 | |||
23 | int panic_on_unrecovered_nmi; | ||
24 | int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; | ||
25 | static unsigned int code_bytes = 64; | ||
26 | static int die_counter; | ||
27 | |||
28 | void printk_address(unsigned long address, int reliable) | ||
29 | { | ||
30 | printk(" [<%p>] %s%pS\n", (void *) address, | ||
31 | reliable ? "" : "? ", (void *) address); | ||
32 | } | ||
33 | |||
34 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
35 | unsigned *usedp, char **idp) | ||
36 | { | ||
37 | static char ids[][8] = { | ||
38 | [DEBUG_STACK - 1] = "#DB", | ||
39 | [NMI_STACK - 1] = "NMI", | ||
40 | [DOUBLEFAULT_STACK - 1] = "#DF", | ||
41 | [STACKFAULT_STACK - 1] = "#SS", | ||
42 | [MCE_STACK - 1] = "#MC", | ||
43 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
44 | [N_EXCEPTION_STACKS ... | ||
45 | N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | ||
46 | #endif | ||
47 | }; | ||
48 | unsigned k; | ||
49 | |||
50 | /* | ||
51 | * Iterate over all exception stacks, and figure out whether | ||
52 | * 'stack' is in one of them: | ||
53 | */ | ||
54 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
55 | unsigned long end = per_cpu(orig_ist, cpu).ist[k]; | ||
56 | /* | ||
57 | * Is 'stack' above this exception frame's end? | ||
58 | * If yes then skip to the next frame. | ||
59 | */ | ||
60 | if (stack >= end) | ||
61 | continue; | ||
62 | /* | ||
63 | * Is 'stack' above this exception frame's start address? | ||
64 | * If yes then we found the right frame. | ||
65 | */ | ||
66 | if (stack >= end - EXCEPTION_STKSZ) { | ||
67 | /* | ||
68 | * Make sure we only iterate through an exception | ||
69 | * stack once. If it comes up for the second time | ||
70 | * then there's something wrong going on - just | ||
71 | * break out and return NULL: | ||
72 | */ | ||
73 | if (*usedp & (1U << k)) | ||
74 | break; | ||
75 | *usedp |= 1U << k; | ||
76 | *idp = ids[k]; | ||
77 | return (unsigned long *)end; | ||
78 | } | ||
79 | /* | ||
80 | * If this is a debug stack, and if it has a larger size than | ||
81 | * the usual exception stacks, then 'stack' might still | ||
82 | * be within the lower portion of the debug stack: | ||
83 | */ | ||
84 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
85 | if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | ||
86 | unsigned j = N_EXCEPTION_STACKS - 1; | ||
87 | |||
88 | /* | ||
89 | * Black magic. A large debug stack is composed of | ||
90 | * multiple exception stack entries, which we | ||
92 | * iterate through now. Don't look: | ||
92 | */ | ||
93 | do { | ||
94 | ++j; | ||
95 | end -= EXCEPTION_STKSZ; | ||
96 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | ||
97 | } while (stack < end - EXCEPTION_STKSZ); | ||
98 | if (*usedp & (1U << j)) | ||
99 | break; | ||
100 | *usedp |= 1U << j; | ||
101 | *idp = ids[j]; | ||
102 | return (unsigned long *)end; | ||
103 | } | ||
104 | #endif | ||
105 | } | ||
106 | return NULL; | ||
107 | } | ||
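
in_exception_stack() boils down to a range check against each exception stack's [end - EXCEPTION_STKSZ, end) window, plus a bitmap so the walker never re-enters a stack it has already dumped. A simplified userspace version of that classification (stack names, addresses and sizes are invented):

#include <stdio.h>

#define NSTACKS 3
#define STKSZ   4096UL

static const char *ids[NSTACKS]    = { "#DB", "NMI", "#DF" };
static unsigned long ends[NSTACKS] = { 0x11000, 0x22000, 0x33000 };

static const char *classify(unsigned long sp, unsigned *usedp)
{
        for (unsigned k = 0; k < NSTACKS; k++) {
                if (sp >= ends[k])              /* above this stack's end    */
                        continue;
                if (sp < ends[k] - STKSZ)       /* below its start           */
                        continue;
                if (*usedp & (1U << k))         /* seen before: corrupt/loop */
                        return NULL;
                *usedp |= 1U << k;
                return ids[k];
        }
        return NULL;
}

int main(void)
{
        unsigned used = 0;
        const char *id;

        id = classify(0x21f80, &used);
        printf("first lookup:  %s\n", id ? id : "none");   /* NMI  */
        id = classify(0x21f00, &used);
        printf("second lookup: %s\n", id ? id : "none");   /* none */
        return 0;
}
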
108 | |||
109 | /* | ||
110 | * x86-64 can have up to three kernel stacks: | ||
111 | * process stack | ||
112 | * interrupt stack | ||
113 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | ||
114 | */ | ||
115 | |||
116 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
117 | void *p, unsigned int size, void *end) | ||
118 | { | ||
119 | void *t = tinfo; | ||
120 | if (end) { | ||
121 | if (p < end && p >= (end-THREAD_SIZE)) | ||
122 | return 1; | ||
123 | else | ||
124 | return 0; | ||
125 | } | ||
126 | return p > t && p < t + THREAD_SIZE - size; | ||
127 | } | ||
128 | |||
129 | /* The form of the top of the frame on the stack */ | ||
130 | struct stack_frame { | ||
131 | struct stack_frame *next_frame; | ||
132 | unsigned long return_address; | ||
133 | }; | ||
134 | |||
135 | static inline unsigned long | ||
136 | print_context_stack(struct thread_info *tinfo, | ||
137 | unsigned long *stack, unsigned long bp, | ||
138 | const struct stacktrace_ops *ops, void *data, | ||
139 | unsigned long *end) | ||
140 | { | ||
141 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
142 | |||
143 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | ||
144 | unsigned long addr; | ||
145 | |||
146 | addr = *stack; | ||
147 | if (__kernel_text_address(addr)) { | ||
148 | if ((unsigned long) stack == bp + sizeof(long)) { | ||
149 | ops->address(data, addr, 1); | ||
150 | frame = frame->next_frame; | ||
151 | bp = (unsigned long) frame; | ||
152 | } else { | ||
153 | ops->address(data, addr, bp == 0); | ||
154 | } | ||
155 | } | ||
156 | stack++; | ||
157 | } | ||
158 | return bp; | ||
159 | } | ||
160 | |||
161 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
162 | unsigned long *stack, unsigned long bp, | ||
163 | const struct stacktrace_ops *ops, void *data) | ||
164 | { | ||
165 | const unsigned cpu = get_cpu(); | ||
166 | unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; | ||
167 | unsigned used = 0; | ||
168 | struct thread_info *tinfo; | ||
169 | |||
170 | if (!task) | ||
171 | task = current; | ||
172 | |||
173 | if (!stack) { | ||
174 | unsigned long dummy; | ||
175 | stack = &dummy; | ||
176 | if (task && task != current) | ||
177 | stack = (unsigned long *)task->thread.sp; | ||
178 | } | ||
179 | |||
180 | #ifdef CONFIG_FRAME_POINTER | ||
181 | if (!bp) { | ||
182 | if (task == current) { | ||
183 | /* Grab bp right from our regs */ | ||
184 | get_bp(bp); | ||
185 | } else { | ||
186 | /* bp is the last reg pushed by switch_to */ | ||
187 | bp = *(unsigned long *) task->thread.sp; | ||
188 | } | ||
189 | } | ||
190 | #endif | ||
191 | |||
192 | /* | ||
193 | * Print function call entries in all stacks, starting at the | ||
194 | * current stack address. If the stacks consist of nested | ||
195 | * exceptions, walk each one before moving on to the stack it interrupted: | ||
196 | */ | ||
197 | tinfo = task_thread_info(task); | ||
198 | for (;;) { | ||
199 | char *id; | ||
200 | unsigned long *estack_end; | ||
201 | estack_end = in_exception_stack(cpu, (unsigned long)stack, | ||
202 | &used, &id); | ||
203 | |||
204 | if (estack_end) { | ||
205 | if (ops->stack(data, id) < 0) | ||
206 | break; | ||
207 | |||
208 | bp = print_context_stack(tinfo, stack, bp, ops, | ||
209 | data, estack_end); | ||
210 | ops->stack(data, "<EOE>"); | ||
211 | /* | ||
212 | * We link to the next stack via the | ||
213 | * second-to-last pointer (index -2 to end) in the | ||
214 | * exception stack: | ||
215 | */ | ||
216 | stack = (unsigned long *) estack_end[-2]; | ||
217 | continue; | ||
218 | } | ||
219 | if (irqstack_end) { | ||
220 | unsigned long *irqstack; | ||
221 | irqstack = irqstack_end - | ||
222 | (IRQSTACKSIZE - 64) / sizeof(*irqstack); | ||
223 | |||
224 | if (stack >= irqstack && stack < irqstack_end) { | ||
225 | if (ops->stack(data, "IRQ") < 0) | ||
226 | break; | ||
227 | bp = print_context_stack(tinfo, stack, bp, | ||
228 | ops, data, irqstack_end); | ||
229 | /* | ||
230 | * We link to the next stack (which would normally | ||
231 | * be the process stack) via the last | ||
232 | * pointer (index -1 to end) in the IRQ stack: | ||
233 | */ | ||
234 | stack = (unsigned long *) (irqstack_end[-1]); | ||
235 | irqstack_end = NULL; | ||
236 | ops->stack(data, "EOI"); | ||
237 | continue; | ||
238 | } | ||
239 | } | ||
240 | break; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * This handles the process stack: | ||
245 | */ | ||
246 | bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); | ||
247 | put_cpu(); | ||
248 | } | ||
249 | EXPORT_SYMBOL(dump_trace); | ||
250 | |||
251 | static void | ||
252 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
253 | { | ||
254 | printk(data); | ||
255 | print_symbol(msg, symbol); | ||
256 | printk("\n"); | ||
257 | } | ||
258 | |||
259 | static void print_trace_warning(void *data, char *msg) | ||
260 | { | ||
261 | printk("%s%s\n", (char *)data, msg); | ||
262 | } | ||
263 | |||
264 | static int print_trace_stack(void *data, char *name) | ||
265 | { | ||
266 | printk("%s <%s> ", (char *)data, name); | ||
267 | return 0; | ||
268 | } | ||
269 | |||
270 | /* | ||
272 | * Print one address/symbol entry per line. | ||
272 | */ | ||
273 | static void print_trace_address(void *data, unsigned long addr, int reliable) | ||
274 | { | ||
275 | touch_nmi_watchdog(); | ||
276 | printk(data); | ||
277 | printk_address(addr, reliable); | ||
278 | } | ||
279 | |||
280 | static const struct stacktrace_ops print_trace_ops = { | ||
281 | .warning = print_trace_warning, | ||
282 | .warning_symbol = print_trace_warning_symbol, | ||
283 | .stack = print_trace_stack, | ||
284 | .address = print_trace_address, | ||
285 | }; | ||
286 | |||
287 | static void | ||
288 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
289 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
290 | { | ||
291 | printk("%sCall Trace:\n", log_lvl); | ||
292 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | ||
293 | } | ||
294 | |||
295 | void show_trace(struct task_struct *task, struct pt_regs *regs, | ||
296 | unsigned long *stack, unsigned long bp) | ||
297 | { | ||
298 | show_trace_log_lvl(task, regs, stack, bp, ""); | ||
299 | } | ||
300 | |||
301 | static void | ||
302 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
303 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
304 | { | ||
305 | unsigned long *stack; | ||
306 | int i; | ||
307 | const int cpu = smp_processor_id(); | ||
308 | unsigned long *irqstack_end = | ||
309 | (unsigned long *) (cpu_pda(cpu)->irqstackptr); | ||
310 | unsigned long *irqstack = | ||
311 | (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | ||
312 | |||
313 | /* | ||
314 | * debugging aid: "show_stack(NULL, NULL);" prints the | ||
315 | * back trace for this cpu. | ||
316 | */ | ||
317 | |||
318 | if (sp == NULL) { | ||
319 | if (task) | ||
320 | sp = (unsigned long *)task->thread.sp; | ||
321 | else | ||
322 | sp = (unsigned long *)&sp; | ||
323 | } | ||
324 | |||
325 | stack = sp; | ||
326 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
327 | if (stack >= irqstack && stack <= irqstack_end) { | ||
328 | if (stack == irqstack_end) { | ||
329 | stack = (unsigned long *) (irqstack_end[-1]); | ||
330 | printk(" <EOI> "); | ||
331 | } | ||
332 | } else { | ||
333 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
334 | break; | ||
335 | } | ||
336 | if (i && ((i % STACKSLOTS_PER_LINE) == 0)) | ||
337 | printk("\n%s", log_lvl); | ||
338 | printk(" %016lx", *stack++); | ||
339 | touch_nmi_watchdog(); | ||
340 | } | ||
341 | printk("\n"); | ||
342 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
343 | } | ||
344 | |||
345 | void show_stack(struct task_struct *task, unsigned long *sp) | ||
346 | { | ||
347 | show_stack_log_lvl(task, NULL, sp, 0, ""); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * The architecture-independent dump_stack generator | ||
352 | */ | ||
353 | void dump_stack(void) | ||
354 | { | ||
355 | unsigned long bp = 0; | ||
356 | unsigned long stack; | ||
357 | |||
358 | #ifdef CONFIG_FRAME_POINTER | ||
359 | if (!bp) | ||
360 | get_bp(bp); | ||
361 | #endif | ||
362 | |||
363 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
364 | current->pid, current->comm, print_tainted(), | ||
365 | init_utsname()->release, | ||
366 | (int)strcspn(init_utsname()->version, " "), | ||
367 | init_utsname()->version); | ||
368 | show_trace(NULL, NULL, &stack, bp); | ||
369 | } | ||
370 | EXPORT_SYMBOL(dump_stack); | ||
371 | |||
372 | void show_registers(struct pt_regs *regs) | ||
373 | { | ||
374 | int i; | ||
375 | unsigned long sp; | ||
376 | const int cpu = smp_processor_id(); | ||
377 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | ||
378 | |||
379 | sp = regs->sp; | ||
380 | printk("CPU %d ", cpu); | ||
381 | __show_regs(regs, 1); | ||
382 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
383 | cur->comm, cur->pid, task_thread_info(cur), cur); | ||
384 | |||
385 | /* | ||
386 | * When in-kernel, we also print out the stack and code at the | ||
387 | * time of the fault. | ||
388 | */ | ||
389 | if (!user_mode(regs)) { | ||
390 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
391 | unsigned int code_len = code_bytes; | ||
392 | unsigned char c; | ||
393 | u8 *ip; | ||
394 | |||
395 | printk(KERN_EMERG "Stack:\n"); | ||
396 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, | ||
397 | regs->bp, KERN_EMERG); | ||
398 | |||
399 | printk(KERN_EMERG "Code: "); | ||
400 | |||
401 | ip = (u8 *)regs->ip - code_prologue; | ||
402 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
403 | /* try starting at IP */ | ||
404 | ip = (u8 *)regs->ip; | ||
405 | code_len = code_len - code_prologue + 1; | ||
406 | } | ||
407 | for (i = 0; i < code_len; i++, ip++) { | ||
408 | if (ip < (u8 *)PAGE_OFFSET || | ||
409 | probe_kernel_address(ip, c)) { | ||
410 | printk(" Bad RIP value."); | ||
411 | break; | ||
412 | } | ||
413 | if (ip == (u8 *)regs->ip) | ||
414 | printk("<%02x> ", c); | ||
415 | else | ||
416 | printk("%02x ", c); | ||
417 | } | ||
418 | } | ||
419 | printk("\n"); | ||
420 | } | ||
421 | |||
422 | int is_valid_bugaddr(unsigned long ip) | ||
423 | { | ||
424 | unsigned short ud2; | ||
425 | |||
426 | if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) | ||
427 | return 0; | ||
428 | |||
429 | return ud2 == 0x0b0f; | ||
430 | } | ||
431 | |||
432 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
433 | static int die_owner = -1; | ||
434 | static unsigned int die_nest_count; | ||
435 | |||
436 | unsigned __kprobes long oops_begin(void) | ||
437 | { | ||
438 | int cpu; | ||
439 | unsigned long flags; | ||
440 | |||
441 | oops_enter(); | ||
442 | |||
443 | /* racy, but better than risking deadlock. */ | ||
444 | raw_local_irq_save(flags); | ||
445 | cpu = smp_processor_id(); | ||
446 | if (!__raw_spin_trylock(&die_lock)) { | ||
447 | if (cpu == die_owner) | ||
448 | /* nested oops. should stop eventually */; | ||
449 | else | ||
450 | __raw_spin_lock(&die_lock); | ||
451 | } | ||
452 | die_nest_count++; | ||
453 | die_owner = cpu; | ||
454 | console_verbose(); | ||
455 | bust_spinlocks(1); | ||
456 | return flags; | ||
457 | } | ||
458 | |||
459 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
460 | { | ||
461 | die_owner = -1; | ||
462 | bust_spinlocks(0); | ||
463 | die_nest_count--; | ||
464 | if (!die_nest_count) | ||
465 | /* Nest count reaches zero, release the lock. */ | ||
466 | __raw_spin_unlock(&die_lock); | ||
467 | raw_local_irq_restore(flags); | ||
468 | if (!regs) { | ||
469 | oops_exit(); | ||
470 | return; | ||
471 | } | ||
472 | if (in_interrupt()) | ||
473 | panic("Fatal exception in interrupt"); | ||
474 | if (panic_on_oops) | ||
475 | panic("Fatal exception"); | ||
476 | oops_exit(); | ||
477 | do_exit(signr); | ||
478 | } | ||
479 | |||
480 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
481 | { | ||
482 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
483 | #ifdef CONFIG_PREEMPT | ||
484 | printk("PREEMPT "); | ||
485 | #endif | ||
486 | #ifdef CONFIG_SMP | ||
487 | printk("SMP "); | ||
488 | #endif | ||
489 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
490 | printk("DEBUG_PAGEALLOC"); | ||
491 | #endif | ||
492 | printk("\n"); | ||
493 | sysfs_printk_last_file(); | ||
494 | if (notify_die(DIE_OOPS, str, regs, err, | ||
495 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
496 | return 1; | ||
497 | |||
498 | show_registers(regs); | ||
499 | add_taint(TAINT_DIE); | ||
500 | /* Executive summary in case the oops scrolled away */ | ||
501 | printk(KERN_ALERT "RIP "); | ||
502 | printk_address(regs->ip, 1); | ||
503 | printk(" RSP <%016lx>\n", regs->sp); | ||
504 | if (kexec_should_crash(current)) | ||
505 | crash_kexec(regs); | ||
506 | return 0; | ||
507 | } | ||
508 | |||
509 | void die(const char *str, struct pt_regs *regs, long err) | ||
510 | { | ||
511 | unsigned long flags = oops_begin(); | ||
512 | |||
513 | if (!user_mode(regs)) | ||
514 | report_bug(regs->ip, regs); | ||
515 | |||
516 | if (__die(str, regs, err)) | ||
517 | regs = NULL; | ||
518 | oops_end(flags, regs, SIGSEGV); | ||
519 | } | ||
520 | |||
521 | notrace __kprobes void | ||
522 | die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
523 | { | ||
524 | unsigned long flags; | ||
525 | |||
526 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
527 | return; | ||
528 | |||
529 | flags = oops_begin(); | ||
530 | /* | ||
531 | * We are in trouble anyway, let's at least try | ||
532 | * to get a message out. | ||
533 | */ | ||
534 | printk(KERN_EMERG "%s", str); | ||
535 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
536 | smp_processor_id(), regs->ip); | ||
537 | show_registers(regs); | ||
538 | if (kexec_should_crash(current)) | ||
539 | crash_kexec(regs); | ||
540 | if (do_panic || panic_on_oops) | ||
541 | panic("Non maskable interrupt"); | ||
542 | oops_end(flags, NULL, SIGBUS); | ||
543 | nmi_exit(); | ||
544 | local_irq_enable(); | ||
545 | do_exit(SIGBUS); | ||
546 | } | ||
547 | |||
548 | static int __init oops_setup(char *s) | ||
549 | { | ||
550 | if (!s) | ||
551 | return -EINVAL; | ||
552 | if (!strcmp(s, "panic")) | ||
553 | panic_on_oops = 1; | ||
554 | return 0; | ||
555 | } | ||
556 | early_param("oops", oops_setup); | ||
557 | |||
558 | static int __init kstack_setup(char *s) | ||
559 | { | ||
560 | if (!s) | ||
561 | return -EINVAL; | ||
562 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
563 | return 0; | ||
564 | } | ||
565 | early_param("kstack", kstack_setup); | ||
566 | |||
567 | static int __init code_bytes_setup(char *s) | ||
568 | { | ||
569 | code_bytes = simple_strtoul(s, NULL, 0); | ||
570 | if (code_bytes > 8192) | ||
571 | code_bytes = 8192; | ||
572 | |||
573 | return 1; | ||
574 | } | ||
575 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 78e642feac30..ce97bf3bed12 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -1282,12 +1282,10 @@ void __init e820_reserve_resources(void) | |||
1282 | e820_res = res; | 1282 | e820_res = res; |
1283 | for (i = 0; i < e820.nr_map; i++) { | 1283 | for (i = 0; i < e820.nr_map; i++) { |
1284 | end = e820.map[i].addr + e820.map[i].size - 1; | 1284 | end = e820.map[i].addr + e820.map[i].size - 1; |
1285 | #ifndef CONFIG_RESOURCES_64BIT | 1285 | if (end != (resource_size_t)end) { |
1286 | if (end > 0x100000000ULL) { | ||
1287 | res++; | 1286 | res++; |
1288 | continue; | 1287 | continue; |
1289 | } | 1288 | } |
1290 | #endif | ||
1291 | res->name = e820_type_to_string(e820.map[i].type); | 1289 | res->name = e820_type_to_string(e820.map[i].type); |
1292 | res->start = e820.map[i].addr; | 1290 | res->start = e820.map[i].addr; |
1293 | res->end = end; | 1291 | res->end = end; |
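
The new "end != (resource_size_t)end" test replaces the old CONFIG_RESOURCES_64BIT #ifdef: rather than hard-coding a 4 GiB cut-off, it asks whether the 64-bit end address survives a round trip through resource_size_t. A sketch with a 32-bit stand-in type (the typedef below is an assumption purely for illustration):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t fake_resource_size_t;   /* stands in for a 32-bit resource_size_t */

static int fits(uint64_t end)
{
        return end == (fake_resource_size_t)end;   /* lossless round trip? */
}

int main(void)
{
        printf("0x%llx fits: %d\n", 0xfffff000ULL,  fits(0xfffff000ULL));   /* 1 */
        printf("0x%llx fits: %d\n", 0x100000000ULL, fits(0x100000000ULL));  /* 0 */
        return 0;
}
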
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 945a31cdd81f..1119d247fe11 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c | |||
@@ -367,6 +367,10 @@ void __init efi_init(void) | |||
367 | efi.smbios = config_tables[i].table; | 367 | efi.smbios = config_tables[i].table; |
368 | printk(" SMBIOS=0x%lx ", config_tables[i].table); | 368 | printk(" SMBIOS=0x%lx ", config_tables[i].table); |
369 | } else if (!efi_guidcmp(config_tables[i].guid, | 369 | } else if (!efi_guidcmp(config_tables[i].guid, |
370 | UV_SYSTEM_TABLE_GUID)) { | ||
371 | efi.uv_systab = config_tables[i].table; | ||
372 | printk(" UVsystab=0x%lx ", config_tables[i].table); | ||
373 | } else if (!efi_guidcmp(config_tables[i].guid, | ||
370 | HCDP_TABLE_GUID)) { | 374 | HCDP_TABLE_GUID)) { |
371 | efi.hcdp = config_tables[i].table; | 375 | efi.hcdp = config_tables[i].table; |
372 | printk(" HCDP=0x%lx ", config_tables[i].table); | 376 | printk(" HCDP=0x%lx ", config_tables[i].table); |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792bc7cfa..c356423a6026 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -629,7 +629,7 @@ ENTRY(interrupt) | |||
629 | ENTRY(irq_entries_start) | 629 | ENTRY(irq_entries_start) |
630 | RING0_INT_FRAME | 630 | RING0_INT_FRAME |
631 | vector=0 | 631 | vector=0 |
632 | .rept NR_IRQS | 632 | .rept NR_VECTORS |
633 | ALIGN | 633 | ALIGN |
634 | .if vector | 634 | .if vector |
635 | CFI_ADJUST_CFA_OFFSET -4 | 635 | CFI_ADJUST_CFA_OFFSET -4 |
@@ -730,6 +730,7 @@ error_code: | |||
730 | movl $(__USER_DS), %ecx | 730 | movl $(__USER_DS), %ecx |
731 | movl %ecx, %ds | 731 | movl %ecx, %ds |
732 | movl %ecx, %es | 732 | movl %ecx, %es |
733 | TRACE_IRQS_OFF | ||
733 | movl %esp,%eax # pt_regs pointer | 734 | movl %esp,%eax # pt_regs pointer |
734 | call *%edi | 735 | call *%edi |
735 | jmp ret_from_exception | 736 | jmp ret_from_exception |
@@ -760,20 +761,9 @@ ENTRY(device_not_available) | |||
760 | RING0_INT_FRAME | 761 | RING0_INT_FRAME |
761 | pushl $-1 # mark this as an int | 762 | pushl $-1 # mark this as an int |
762 | CFI_ADJUST_CFA_OFFSET 4 | 763 | CFI_ADJUST_CFA_OFFSET 4 |
763 | SAVE_ALL | 764 | pushl $do_device_not_available |
764 | GET_CR0_INTO_EAX | ||
765 | testl $0x4, %eax # EM (math emulation bit) | ||
766 | jne device_not_available_emulate | ||
767 | preempt_stop(CLBR_ANY) | ||
768 | call math_state_restore | ||
769 | jmp ret_from_exception | ||
770 | device_not_available_emulate: | ||
771 | pushl $0 # temporary storage for ORIG_EIP | ||
772 | CFI_ADJUST_CFA_OFFSET 4 | 765 | CFI_ADJUST_CFA_OFFSET 4 |
773 | call math_emulate | 766 | jmp error_code |
774 | addl $4, %esp | ||
775 | CFI_ADJUST_CFA_OFFSET -4 | ||
776 | jmp ret_from_exception | ||
777 | CFI_ENDPROC | 767 | CFI_ENDPROC |
778 | END(device_not_available) | 768 | END(device_not_available) |
779 | 769 | ||
@@ -814,6 +804,7 @@ debug_stack_correct: | |||
814 | pushl $-1 # mark this as an int | 804 | pushl $-1 # mark this as an int |
815 | CFI_ADJUST_CFA_OFFSET 4 | 805 | CFI_ADJUST_CFA_OFFSET 4 |
816 | SAVE_ALL | 806 | SAVE_ALL |
807 | TRACE_IRQS_OFF | ||
817 | xorl %edx,%edx # error code 0 | 808 | xorl %edx,%edx # error code 0 |
818 | movl %esp,%eax # pt_regs pointer | 809 | movl %esp,%eax # pt_regs pointer |
819 | call do_debug | 810 | call do_debug |
@@ -858,6 +849,7 @@ nmi_stack_correct: | |||
858 | pushl %eax | 849 | pushl %eax |
859 | CFI_ADJUST_CFA_OFFSET 4 | 850 | CFI_ADJUST_CFA_OFFSET 4 |
860 | SAVE_ALL | 851 | SAVE_ALL |
852 | TRACE_IRQS_OFF | ||
861 | xorl %edx,%edx # zero error code | 853 | xorl %edx,%edx # zero error code |
862 | movl %esp,%eax # pt_regs pointer | 854 | movl %esp,%eax # pt_regs pointer |
863 | call do_nmi | 855 | call do_nmi |
@@ -898,6 +890,7 @@ nmi_espfix_stack: | |||
898 | pushl %eax | 890 | pushl %eax |
899 | CFI_ADJUST_CFA_OFFSET 4 | 891 | CFI_ADJUST_CFA_OFFSET 4 |
900 | SAVE_ALL | 892 | SAVE_ALL |
893 | TRACE_IRQS_OFF | ||
901 | FIXUP_ESPFIX_STACK # %eax == %esp | 894 | FIXUP_ESPFIX_STACK # %eax == %esp |
902 | xorl %edx,%edx # zero error code | 895 | xorl %edx,%edx # zero error code |
903 | call do_nmi | 896 | call do_nmi |
@@ -928,6 +921,7 @@ KPROBE_ENTRY(int3) | |||
928 | pushl $-1 # mark this as an int | 921 | pushl $-1 # mark this as an int |
929 | CFI_ADJUST_CFA_OFFSET 4 | 922 | CFI_ADJUST_CFA_OFFSET 4 |
930 | SAVE_ALL | 923 | SAVE_ALL |
924 | TRACE_IRQS_OFF | ||
931 | xorl %edx,%edx # zero error code | 925 | xorl %edx,%edx # zero error code |
932 | movl %esp,%eax # pt_regs pointer | 926 | movl %esp,%eax # pt_regs pointer |
933 | call do_int3 | 927 | call do_int3 |
@@ -1030,7 +1024,7 @@ ENTRY(machine_check) | |||
1030 | RING0_INT_FRAME | 1024 | RING0_INT_FRAME |
1031 | pushl $0 | 1025 | pushl $0 |
1032 | CFI_ADJUST_CFA_OFFSET 4 | 1026 | CFI_ADJUST_CFA_OFFSET 4 |
1033 | pushl machine_check_vector | 1027 | pushl $do_machine_check |
1034 | CFI_ADJUST_CFA_OFFSET 4 | 1028 | CFI_ADJUST_CFA_OFFSET 4 |
1035 | jmp error_code | 1029 | jmp error_code |
1036 | CFI_ENDPROC | 1030 | CFI_ENDPROC |
@@ -1159,20 +1153,6 @@ ENDPROC(xen_failsafe_callback) | |||
1159 | #ifdef CONFIG_DYNAMIC_FTRACE | 1153 | #ifdef CONFIG_DYNAMIC_FTRACE |
1160 | 1154 | ||
1161 | ENTRY(mcount) | 1155 | ENTRY(mcount) |
1162 | pushl %eax | ||
1163 | pushl %ecx | ||
1164 | pushl %edx | ||
1165 | movl 0xc(%esp), %eax | ||
1166 | subl $MCOUNT_INSN_SIZE, %eax | ||
1167 | |||
1168 | .globl mcount_call | ||
1169 | mcount_call: | ||
1170 | call ftrace_stub | ||
1171 | |||
1172 | popl %edx | ||
1173 | popl %ecx | ||
1174 | popl %eax | ||
1175 | |||
1176 | ret | 1156 | ret |
1177 | END(mcount) | 1157 | END(mcount) |
1178 | 1158 | ||
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cf3a0b2d0059..09e7145484c5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -64,32 +64,6 @@ | |||
64 | #ifdef CONFIG_FTRACE | 64 | #ifdef CONFIG_FTRACE |
65 | #ifdef CONFIG_DYNAMIC_FTRACE | 65 | #ifdef CONFIG_DYNAMIC_FTRACE |
66 | ENTRY(mcount) | 66 | ENTRY(mcount) |
67 | |||
68 | subq $0x38, %rsp | ||
69 | movq %rax, (%rsp) | ||
70 | movq %rcx, 8(%rsp) | ||
71 | movq %rdx, 16(%rsp) | ||
72 | movq %rsi, 24(%rsp) | ||
73 | movq %rdi, 32(%rsp) | ||
74 | movq %r8, 40(%rsp) | ||
75 | movq %r9, 48(%rsp) | ||
76 | |||
77 | movq 0x38(%rsp), %rdi | ||
78 | subq $MCOUNT_INSN_SIZE, %rdi | ||
79 | |||
80 | .globl mcount_call | ||
81 | mcount_call: | ||
82 | call ftrace_stub | ||
83 | |||
84 | movq 48(%rsp), %r9 | ||
85 | movq 40(%rsp), %r8 | ||
86 | movq 32(%rsp), %rdi | ||
87 | movq 24(%rsp), %rsi | ||
88 | movq 16(%rsp), %rdx | ||
89 | movq 8(%rsp), %rcx | ||
90 | movq (%rsp), %rax | ||
91 | addq $0x38, %rsp | ||
92 | |||
93 | retq | 67 | retq |
94 | END(mcount) | 68 | END(mcount) |
95 | 69 | ||
@@ -667,6 +641,13 @@ END(stub_rt_sigreturn) | |||
667 | SAVE_ARGS | 641 | SAVE_ARGS |
668 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler | 642 | leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler |
669 | pushq %rbp | 643 | pushq %rbp |
644 | /* | ||
645 | * Save rbp twice: One is for marking the stack frame, as usual, and the | ||
646 | * other, to fill pt_regs properly. This is because bx comes right | ||
647 | * before the last saved register in that structure, and not bp. If the | ||
648 | * base pointer were in the place bx is today, this would not be needed. | ||
649 | */ | ||
650 | movq %rbp, -8(%rsp) | ||
670 | CFI_ADJUST_CFA_OFFSET 8 | 651 | CFI_ADJUST_CFA_OFFSET 8 |
671 | CFI_REL_OFFSET rbp, 0 | 652 | CFI_REL_OFFSET rbp, 0 |
672 | movq %rsp,%rbp | 653 | movq %rsp,%rbp |
@@ -932,6 +913,9 @@ END(spurious_interrupt) | |||
932 | .if \ist | 913 | .if \ist |
933 | movq %gs:pda_data_offset, %rbp | 914 | movq %gs:pda_data_offset, %rbp |
934 | .endif | 915 | .endif |
916 | .if \irqtrace | ||
917 | TRACE_IRQS_OFF | ||
918 | .endif | ||
935 | movq %rsp,%rdi | 919 | movq %rsp,%rdi |
936 | movq ORIG_RAX(%rsp),%rsi | 920 | movq ORIG_RAX(%rsp),%rsi |
937 | movq $-1,ORIG_RAX(%rsp) | 921 | movq $-1,ORIG_RAX(%rsp) |
@@ -1058,7 +1042,8 @@ KPROBE_ENTRY(error_entry) | |||
1058 | je error_kernelspace | 1042 | je error_kernelspace |
1059 | error_swapgs: | 1043 | error_swapgs: |
1060 | SWAPGS | 1044 | SWAPGS |
1061 | error_sti: | 1045 | error_sti: |
1046 | TRACE_IRQS_OFF | ||
1062 | movq %rdi,RDI(%rsp) | 1047 | movq %rdi,RDI(%rsp) |
1063 | CFI_REL_OFFSET rdi,RDI | 1048 | CFI_REL_OFFSET rdi,RDI |
1064 | movq %rsp,%rdi | 1049 | movq %rsp,%rdi |
@@ -1232,7 +1217,7 @@ ENTRY(simd_coprocessor_error) | |||
1232 | END(simd_coprocessor_error) | 1217 | END(simd_coprocessor_error) |
1233 | 1218 | ||
1234 | ENTRY(device_not_available) | 1219 | ENTRY(device_not_available) |
1235 | zeroentry math_state_restore | 1220 | zeroentry do_device_not_available |
1236 | END(device_not_available) | 1221 | END(device_not_available) |
1237 | 1222 | ||
1238 | /* runs on exception stack */ | 1223 | /* runs on exception stack */ |
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 849e5cd485b8..f454c78fcef6 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c | |||
@@ -109,6 +109,7 @@ struct oem_table { | |||
109 | }; | 109 | }; |
110 | 110 | ||
111 | extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); | 111 | extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); |
112 | extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); | ||
112 | #endif | 113 | #endif |
113 | 114 | ||
114 | struct mip_reg { | 115 | struct mip_reg { |
@@ -243,21 +244,38 @@ parse_unisys_oem (char *oemptr) | |||
243 | } | 244 | } |
244 | 245 | ||
245 | #ifdef CONFIG_ACPI | 246 | #ifdef CONFIG_ACPI |
246 | int __init | 247 | static unsigned long oem_addrX; |
247 | find_unisys_acpi_oem_table(unsigned long *oem_addr) | 248 | static unsigned long oem_size; |
249 | int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) | ||
248 | { | 250 | { |
249 | struct acpi_table_header *header = NULL; | 251 | struct acpi_table_header *header = NULL; |
250 | int i = 0; | 252 | int i = 0; |
251 | while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { | 253 | acpi_size tbl_size; |
254 | |||
255 | while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) { | ||
252 | if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { | 256 | if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { |
253 | struct oem_table *t = (struct oem_table *)header; | 257 | struct oem_table *t = (struct oem_table *)header; |
254 | *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, | 258 | |
255 | t->OEMTableSize); | 259 | oem_addrX = t->OEMTableAddr; |
260 | oem_size = t->OEMTableSize; | ||
261 | early_acpi_os_unmap_memory(header, tbl_size); | ||
262 | |||
263 | *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, | ||
264 | oem_size); | ||
256 | return 0; | 265 | return 0; |
257 | } | 266 | } |
267 | early_acpi_os_unmap_memory(header, tbl_size); | ||
258 | } | 268 | } |
259 | return -1; | 269 | return -1; |
260 | } | 270 | } |
271 | |||
272 | void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) | ||
273 | { | ||
274 | if (!oem_addr) | ||
275 | return; | ||
276 | |||
277 | __acpi_unmap_table((char *)oem_addr, oem_size); | ||
278 | } | ||
261 | #endif | 279 | #endif |
262 | 280 | ||
263 | static void | 281 | static void |
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ab115cd15fdf..d073d981a730 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c | |||
@@ -11,17 +11,18 @@ | |||
11 | 11 | ||
12 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
13 | #include <linux/hardirq.h> | 13 | #include <linux/hardirq.h> |
14 | #include <linux/uaccess.h> | ||
14 | #include <linux/ftrace.h> | 15 | #include <linux/ftrace.h> |
15 | #include <linux/percpu.h> | 16 | #include <linux/percpu.h> |
16 | #include <linux/init.h> | 17 | #include <linux/init.h> |
17 | #include <linux/list.h> | 18 | #include <linux/list.h> |
18 | 19 | ||
19 | #include <asm/alternative.h> | ||
20 | #include <asm/ftrace.h> | 20 | #include <asm/ftrace.h> |
21 | #include <asm/nops.h> | ||
21 | 22 | ||
22 | 23 | ||
23 | /* Long is fine, even if it is only 4 bytes ;-) */ | 24 | /* Long is fine, even if it is only 4 bytes ;-) */ |
24 | static long *ftrace_nop; | 25 | static unsigned long *ftrace_nop; |
25 | 26 | ||
26 | union ftrace_code_union { | 27 | union ftrace_code_union { |
27 | char code[MCOUNT_INSN_SIZE]; | 28 | char code[MCOUNT_INSN_SIZE]; |
@@ -60,11 +61,7 @@ notrace int | |||
60 | ftrace_modify_code(unsigned long ip, unsigned char *old_code, | 61 | ftrace_modify_code(unsigned long ip, unsigned char *old_code, |
61 | unsigned char *new_code) | 62 | unsigned char *new_code) |
62 | { | 63 | { |
63 | unsigned replaced; | 64 | unsigned char replaced[MCOUNT_INSN_SIZE]; |
64 | unsigned old = *(unsigned *)old_code; /* 4 bytes */ | ||
65 | unsigned new = *(unsigned *)new_code; /* 4 bytes */ | ||
66 | unsigned char newch = new_code[4]; | ||
67 | int faulted = 0; | ||
68 | 65 | ||
69 | /* | 66 | /* |
70 | * Note: Due to modules and __init, code can | 67 | * Note: Due to modules and __init, code can |
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, | |||
72 | * as well as code changing. | 69 | * as well as code changing. |
73 | * | 70 | * |
74 | * No real locking needed, this code is run through | 71 | * No real locking needed, this code is run through |
75 | * kstop_machine. | 72 | * kstop_machine, or before SMP starts. |
76 | */ | 73 | */ |
77 | asm volatile ( | 74 | if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) |
78 | "1: lock\n" | 75 | return 1; |
79 | " cmpxchg %3, (%2)\n" | 76 | |
80 | " jnz 2f\n" | 77 | if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) |
81 | " movb %b4, 4(%2)\n" | 78 | return 2; |
82 | "2:\n" | ||
83 | ".section .fixup, \"ax\"\n" | ||
84 | "3: movl $1, %0\n" | ||
85 | " jmp 2b\n" | ||
86 | ".previous\n" | ||
87 | _ASM_EXTABLE(1b, 3b) | ||
88 | : "=r"(faulted), "=a"(replaced) | ||
89 | : "r"(ip), "r"(new), "c"(newch), | ||
90 | "0"(faulted), "a"(old) | ||
91 | : "memory"); | ||
92 | sync_core(); | ||
93 | 79 | ||
94 | if (replaced != old && replaced != new) | 80 | WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code, |
95 | faulted = 2; | 81 | MCOUNT_INSN_SIZE)); |
96 | 82 | ||
97 | return faulted; | 83 | sync_core(); |
84 | |||
85 | return 0; | ||
98 | } | 86 | } |
99 | 87 | ||
100 | notrace int ftrace_update_ftrace_func(ftrace_func_t func) | 88 | notrace int ftrace_update_ftrace_func(ftrace_func_t func) |
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func) | |||
112 | 100 | ||
113 | notrace int ftrace_mcount_set(unsigned long *data) | 101 | notrace int ftrace_mcount_set(unsigned long *data) |
114 | { | 102 | { |
115 | unsigned long ip = (long)(&mcount_call); | 103 | /* mcount is initialized as a nop */ |
116 | unsigned long *addr = data; | 104 | *data = 0; |
117 | unsigned char old[MCOUNT_INSN_SIZE], *new; | ||
118 | |||
119 | /* | ||
120 | * Replace the mcount stub with a pointer to the | ||
121 | * ip recorder function. | ||
122 | */ | ||
123 | memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); | ||
124 | new = ftrace_call_replace(ip, *addr); | ||
125 | *addr = ftrace_modify_code(ip, old, new); | ||
126 | |||
127 | return 0; | 105 | return 0; |
128 | } | 106 | } |
129 | 107 | ||
130 | int __init ftrace_dyn_arch_init(void *data) | 108 | int __init ftrace_dyn_arch_init(void *data) |
131 | { | 109 | { |
132 | const unsigned char *const *noptable = find_nop_table(); | 110 | extern const unsigned char ftrace_test_p6nop[]; |
133 | 111 | extern const unsigned char ftrace_test_nop5[]; | |
134 | /* This is running in kstop_machine */ | 112 | extern const unsigned char ftrace_test_jmp[]; |
135 | 113 | int faulted = 0; | |
136 | ftrace_mcount_set(data); | ||
137 | 114 | ||
138 | ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; | 115 | /* |
116 | * There is no good nop for all x86 archs. | ||
117 | * We will default to using the P6_NOP5, but first we | ||
118 | * will test to make sure that the nop will actually | ||
119 | * work on this CPU. If it faults, we will then | ||
120 | * fall back to a less efficient 5 byte nop. If that fails | ||
121 | * we then just use a jmp as our nop. This isn't the most | ||
122 | * efficient nop, but we cannot use a multi-part nop | ||
123 | * since we would then risk being preempted in the middle | ||
124 | * of that nop, and if we enabled tracing then, it might | ||
125 | * cause a system crash. | ||
126 | * | ||
127 | * TODO: check the cpuid to determine the best nop. | ||
128 | */ | ||
129 | asm volatile ( | ||
130 | "jmp ftrace_test_jmp\n" | ||
131 | /* This code needs to stay around */ | ||
132 | ".section .text, \"ax\"\n" | ||
133 | "ftrace_test_jmp:" | ||
134 | "jmp ftrace_test_p6nop\n" | ||
135 | "nop\n" | ||
136 | "nop\n" | ||
137 | "nop\n" /* 2 byte jmp + 3 bytes */ | ||
138 | "ftrace_test_p6nop:" | ||
139 | P6_NOP5 | ||
140 | "jmp 1f\n" | ||
141 | "ftrace_test_nop5:" | ||
142 | ".byte 0x66,0x66,0x66,0x66,0x90\n" | ||
143 | "jmp 1f\n" | ||
144 | ".previous\n" | ||
145 | "1:" | ||
146 | ".section .fixup, \"ax\"\n" | ||
147 | "2: movl $1, %0\n" | ||
148 | " jmp ftrace_test_nop5\n" | ||
149 | "3: movl $2, %0\n" | ||
150 | " jmp 1b\n" | ||
151 | ".previous\n" | ||
152 | _ASM_EXTABLE(ftrace_test_p6nop, 2b) | ||
153 | _ASM_EXTABLE(ftrace_test_nop5, 3b) | ||
154 | : "=r"(faulted) : "0" (faulted)); | ||
155 | |||
156 | switch (faulted) { | ||
157 | case 0: | ||
158 | pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); | ||
159 | ftrace_nop = (unsigned long *)ftrace_test_p6nop; | ||
160 | break; | ||
161 | case 1: | ||
162 | pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); | ||
163 | ftrace_nop = (unsigned long *)ftrace_test_nop5; | ||
164 | break; | ||
165 | case 2: | ||
166 | pr_info("ftrace: converting mcount calls to jmp . + 5\n"); | ||
167 | ftrace_nop = (unsigned long *)ftrace_test_jmp; | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | /* The return code is returned via data */ | ||
172 | *(unsigned long *)data = 0; | ||
139 | 173 | ||
140 | return 0; | 174 | return 0; |
141 | } | 175 | } |
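
The rewritten ftrace_modify_code() above drops the cmpxchg/fixup assembly: it reads the MCOUNT_INSN_SIZE bytes at the call site, bails out if they are not what was expected, and only then writes the replacement. A userspace sketch of that compare-then-patch flow (the faulting-access path, which the kernel reports as return code 1, is left out here):

#include <stdio.h>
#include <string.h>

#define INSN_SIZE 5     /* MCOUNT_INSN_SIZE on x86 */

static int modify_code(unsigned char *ip, const unsigned char *old_code,
                       const unsigned char *new_code)
{
        unsigned char replaced[INSN_SIZE];

        memcpy(replaced, ip, INSN_SIZE);        /* ~ __copy_from_user_inatomic    */
        if (memcmp(replaced, old_code, INSN_SIZE) != 0)
                return 2;                       /* site does not look as expected */
        memcpy(ip, new_code, INSN_SIZE);        /* ~ __copy_to_user_inatomic      */
        return 0;
}

int main(void)
{
        unsigned char site[INSN_SIZE]       = { 0xe8, 0x01, 0x02, 0x03, 0x04 }; /* fake call     */
        const unsigned char call[INSN_SIZE] = { 0xe8, 0x01, 0x02, 0x03, 0x04 };
        const unsigned char nop5[INSN_SIZE] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* P6 5-byte nop */

        printf("patch: %d\n", modify_code(site, call, nop5));   /* 0: patched               */
        printf("again: %d\n", modify_code(site, call, nop5));   /* 2: no longer a call site */
        return 0;
}
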
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 9eca5ba7a6b1..2ec2de8d8c46 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c | |||
@@ -179,8 +179,10 @@ static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
179 | * is an example). | 179 | * is an example). |
180 | */ | 180 | */ |
181 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && | 181 | if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && |
182 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) | 182 | (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { |
183 | printk(KERN_DEBUG "system APIC only can use physical flat"); | ||
183 | return 1; | 184 | return 1; |
185 | } | ||
184 | #endif | 186 | #endif |
185 | 187 | ||
186 | return 0; | 188 | return 0; |
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index ae2ffc8a400c..bfd532843df6 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c | |||
@@ -114,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector) | |||
114 | unsigned long val, apicid, lapicid; | 114 | unsigned long val, apicid, lapicid; |
115 | int pnode; | 115 | int pnode; |
116 | 116 | ||
117 | apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ | 117 | apicid = per_cpu(x86_cpu_to_apicid, cpu); |
118 | lapicid = apicid & 0x3f; /* ZZZ macro needed */ | 118 | lapicid = apicid & 0x3f; /* ZZZ macro needed */ |
119 | pnode = uv_apicid_to_pnode(apicid); | 119 | pnode = uv_apicid_to_pnode(apicid); |
120 | val = | 120 | val = |
@@ -202,12 +202,10 @@ static unsigned int phys_pkg_id(int index_msb) | |||
202 | return uv_read_apic_id() >> index_msb; | 202 | return uv_read_apic_id() >> index_msb; |
203 | } | 203 | } |
204 | 204 | ||
205 | #ifdef ZZZ /* Needs x2apic patch */ | ||
206 | static void uv_send_IPI_self(int vector) | 205 | static void uv_send_IPI_self(int vector) |
207 | { | 206 | { |
208 | apic_write(APIC_SELF_IPI, vector); | 207 | apic_write(APIC_SELF_IPI, vector); |
209 | } | 208 | } |
210 | #endif | ||
211 | 209 | ||
212 | struct genapic apic_x2apic_uv_x = { | 210 | struct genapic apic_x2apic_uv_x = { |
213 | .name = "UV large system", | 211 | .name = "UV large system", |
@@ -215,15 +213,15 @@ struct genapic apic_x2apic_uv_x = { | |||
215 | .int_delivery_mode = dest_Fixed, | 213 | .int_delivery_mode = dest_Fixed, |
216 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), | 214 | .int_dest_mode = (APIC_DEST_PHYSICAL != 0), |
217 | .target_cpus = uv_target_cpus, | 215 | .target_cpus = uv_target_cpus, |
218 | .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ | 216 | .vector_allocation_domain = uv_vector_allocation_domain, |
219 | .apic_id_registered = uv_apic_id_registered, | 217 | .apic_id_registered = uv_apic_id_registered, |
220 | .init_apic_ldr = uv_init_apic_ldr, | 218 | .init_apic_ldr = uv_init_apic_ldr, |
221 | .send_IPI_all = uv_send_IPI_all, | 219 | .send_IPI_all = uv_send_IPI_all, |
222 | .send_IPI_allbutself = uv_send_IPI_allbutself, | 220 | .send_IPI_allbutself = uv_send_IPI_allbutself, |
223 | .send_IPI_mask = uv_send_IPI_mask, | 221 | .send_IPI_mask = uv_send_IPI_mask, |
224 | /* ZZZ.send_IPI_self = uv_send_IPI_self, */ | 222 | .send_IPI_self = uv_send_IPI_self, |
225 | .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, | 223 | .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, |
226 | .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ | 224 | .phys_pkg_id = phys_pkg_id, |
227 | .get_apic_id = get_apic_id, | 225 | .get_apic_id = get_apic_id, |
228 | .set_apic_id = set_apic_id, | 226 | .set_apic_id = set_apic_id, |
229 | .apic_id_mask = (0xFFFFFFFFu), | 227 | .apic_id_mask = (0xFFFFFFFFu), |
@@ -286,12 +284,13 @@ static __init void map_low_mmrs(void) | |||
286 | 284 | ||
287 | enum map_type {map_wb, map_uc}; | 285 | enum map_type {map_wb, map_uc}; |
288 | 286 | ||
289 | static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) | 287 | static __init void map_high(char *id, unsigned long base, int shift, |
288 | int max_pnode, enum map_type map_type) | ||
290 | { | 289 | { |
291 | unsigned long bytes, paddr; | 290 | unsigned long bytes, paddr; |
292 | 291 | ||
293 | paddr = base << shift; | 292 | paddr = base << shift; |
294 | bytes = (1UL << shift); | 293 | bytes = (1UL << shift) * (max_pnode + 1); |
295 | printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, | 294 | printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, |
296 | paddr + bytes); | 295 | paddr + bytes); |
297 | if (map_type == map_uc) | 296 | if (map_type == map_uc) |
@@ -307,7 +306,7 @@ static __init void map_gru_high(int max_pnode) | |||
307 | 306 | ||
308 | gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); | 307 | gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); |
309 | if (gru.s.enable) | 308 | if (gru.s.enable) |
310 | map_high("GRU", gru.s.base, shift, map_wb); | 309 | map_high("GRU", gru.s.base, shift, max_pnode, map_wb); |
311 | } | 310 | } |
312 | 311 | ||
313 | static __init void map_config_high(int max_pnode) | 312 | static __init void map_config_high(int max_pnode) |
@@ -317,7 +316,7 @@ static __init void map_config_high(int max_pnode) | |||
317 | 316 | ||
318 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); | 317 | cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); |
319 | if (cfg.s.enable) | 318 | if (cfg.s.enable) |
320 | map_high("CONFIG", cfg.s.base, shift, map_uc); | 319 | map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); |
321 | } | 320 | } |
322 | 321 | ||
323 | static __init void map_mmr_high(int max_pnode) | 322 | static __init void map_mmr_high(int max_pnode) |
@@ -327,7 +326,7 @@ static __init void map_mmr_high(int max_pnode) | |||
327 | 326 | ||
328 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); | 327 | mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); |
329 | if (mmr.s.enable) | 328 | if (mmr.s.enable) |
330 | map_high("MMR", mmr.s.base, shift, map_uc); | 329 | map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); |
331 | } | 330 | } |
332 | 331 | ||
333 | static __init void map_mmioh_high(int max_pnode) | 332 | static __init void map_mmioh_high(int max_pnode) |
@@ -337,17 +336,17 @@ static __init void map_mmioh_high(int max_pnode) | |||
337 | 336 | ||
338 | mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); | 337 | mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); |
339 | if (mmioh.s.enable) | 338 | if (mmioh.s.enable) |
340 | map_high("MMIOH", mmioh.s.base, shift, map_uc); | 339 | map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); |
341 | } | 340 | } |
342 | 341 | ||
343 | static __init void uv_rtc_init(void) | 342 | static __init void uv_rtc_init(void) |
344 | { | 343 | { |
345 | long status, ticks_per_sec, drift; | 344 | long status; |
345 | u64 ticks_per_sec; | ||
346 | 346 | ||
347 | status = | 347 | status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, |
348 | x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, | 348 | &ticks_per_sec); |
349 | &drift); | 349 | if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { |
350 | if (status != 0 || ticks_per_sec < 100000) { | ||
351 | printk(KERN_WARNING | 350 | printk(KERN_WARNING |
352 | "unable to determine platform RTC clock frequency, " | 351 | "unable to determine platform RTC clock frequency, " |
353 | "guessing.\n"); | 352 | "guessing.\n"); |
@@ -357,7 +356,22 @@ static __init void uv_rtc_init(void) | |||
357 | sn_rtc_cycles_per_second = ticks_per_sec; | 356 | sn_rtc_cycles_per_second = ticks_per_sec; |
358 | } | 357 | } |
359 | 358 | ||
360 | static bool uv_system_inited; | 359 | /* |
360 | * Called on each cpu to initialize the per_cpu UV data area. | ||
361 | * ZZZ hotplug not supported yet | ||
362 | */ | ||
363 | void __cpuinit uv_cpu_init(void) | ||
364 | { | ||
365 | /* CPU 0 initialization will be done via uv_system_init. */ | ||
366 | if (!uv_blade_info) | ||
367 | return; | ||
368 | |||
369 | uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; | ||
370 | |||
371 | if (get_uv_system_type() == UV_NON_UNIQUE_APIC) | ||
372 | set_x2apic_extra_bits(uv_hub_info->pnode); | ||
373 | } | ||
374 | |||
361 | 375 | ||
362 | void __init uv_system_init(void) | 376 | void __init uv_system_init(void) |
363 | { | 377 | { |
@@ -413,6 +427,9 @@ void __init uv_system_init(void) | |||
413 | gnode_upper = (((unsigned long)node_id.s.node_id) & | 427 | gnode_upper = (((unsigned long)node_id.s.node_id) & |
414 | ~((1 << n_val) - 1)) << m_val; | 428 | ~((1 << n_val) - 1)) << m_val; |
415 | 429 | ||
430 | uv_bios_init(); | ||
431 | uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, | ||
432 | &uv_coherency_id, &uv_region_size); | ||
416 | uv_rtc_init(); | 433 | uv_rtc_init(); |
417 | 434 | ||
418 | for_each_present_cpu(cpu) { | 435 | for_each_present_cpu(cpu) { |
@@ -434,7 +451,7 @@ void __init uv_system_init(void) | |||
434 | uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; | 451 | uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; |
435 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; | 452 | uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; |
436 | uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; | 453 | uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; |
437 | uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ | 454 | uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; |
438 | uv_node_to_blade[nid] = blade; | 455 | uv_node_to_blade[nid] = blade; |
439 | uv_cpu_to_blade[cpu] = blade; | 456 | uv_cpu_to_blade[cpu] = blade; |
440 | max_pnode = max(pnode, max_pnode); | 457 | max_pnode = max(pnode, max_pnode); |
@@ -449,21 +466,6 @@ void __init uv_system_init(void) | |||
449 | map_mmr_high(max_pnode); | 466 | map_mmr_high(max_pnode); |
450 | map_config_high(max_pnode); | 467 | map_config_high(max_pnode); |
451 | map_mmioh_high(max_pnode); | 468 | map_mmioh_high(max_pnode); |
452 | uv_system_inited = true; | ||
453 | } | ||
454 | 469 | ||
455 | /* | 470 | uv_cpu_init(); |
456 | * Called on each cpu to initialize the per_cpu UV data area. | ||
457 | * ZZZ hotplug not supported yet | ||
458 | */ | ||
459 | void __cpuinit uv_cpu_init(void) | ||
460 | { | ||
461 | BUG_ON(!uv_system_inited); | ||
462 | |||
463 | uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; | ||
464 | |||
465 | if (get_uv_system_type() == UV_NON_UNIQUE_APIC) | ||
466 | set_x2apic_extra_bits(uv_hub_info->pnode); | ||
467 | } | 471 | } |
468 | |||
469 | |||
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9d..1dcb0f13897e 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c | |||
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void) | |||
35 | 35 | ||
36 | /* start of EBDA area */ | 36 | /* start of EBDA area */ |
37 | ebda_addr = get_bios_ebda(); | 37 | ebda_addr = get_bios_ebda(); |
38 | printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); | ||
38 | 39 | ||
39 | /* Fixup: bios puts an EBDA in the top 64K segment */ | 40 | /* Fixup: bios puts an EBDA in the top 64K segment */ |
40 | /* of conventional memory, but does not adjust lowmem. */ | 41 | /* of conventional memory, but does not adjust lowmem. */ |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 73deaffadd03..77017e834cf7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c | |||
@@ -1,29 +1,49 @@ | |||
1 | #include <linux/clocksource.h> | 1 | #include <linux/clocksource.h> |
2 | #include <linux/clockchips.h> | 2 | #include <linux/clockchips.h> |
3 | #include <linux/interrupt.h> | ||
4 | #include <linux/sysdev.h> | ||
3 | #include <linux/delay.h> | 5 | #include <linux/delay.h> |
4 | #include <linux/errno.h> | 6 | #include <linux/errno.h> |
5 | #include <linux/hpet.h> | 7 | #include <linux/hpet.h> |
6 | #include <linux/init.h> | 8 | #include <linux/init.h> |
7 | #include <linux/sysdev.h> | 9 | #include <linux/cpu.h> |
8 | #include <linux/pm.h> | 10 | #include <linux/pm.h> |
11 | #include <linux/io.h> | ||
9 | 12 | ||
10 | #include <asm/fixmap.h> | 13 | #include <asm/fixmap.h> |
11 | #include <asm/hpet.h> | ||
12 | #include <asm/i8253.h> | 14 | #include <asm/i8253.h> |
13 | #include <asm/io.h> | 15 | #include <asm/hpet.h> |
14 | 16 | ||
15 | #define HPET_MASK CLOCKSOURCE_MASK(32) | 17 | #define HPET_MASK CLOCKSOURCE_MASK(32) |
16 | #define HPET_SHIFT 22 | 18 | #define HPET_SHIFT 22 |
17 | 19 | ||
18 | /* FSEC = 10^-15 | 20 | /* FSEC = 10^-15 |
19 | NSEC = 10^-9 */ | 21 | NSEC = 10^-9 */ |
20 | #define FSEC_PER_NSEC 1000000L | 22 | #define FSEC_PER_NSEC 1000000L |
23 | |||
24 | #define HPET_DEV_USED_BIT 2 | ||
25 | #define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) | ||
26 | #define HPET_DEV_VALID 0x8 | ||
27 | #define HPET_DEV_FSB_CAP 0x1000 | ||
28 | #define HPET_DEV_PERI_CAP 0x2000 | ||
29 | |||
30 | #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) | ||
21 | 31 | ||
22 | /* | 32 | /* |
23 | * HPET address is set in acpi/boot.c, when an ACPI entry exists | 33 | * HPET address is set in acpi/boot.c, when an ACPI entry exists |
24 | */ | 34 | */ |
25 | unsigned long hpet_address; | 35 | unsigned long hpet_address; |
26 | static void __iomem *hpet_virt_address; | 36 | unsigned long hpet_num_timers; |
37 | static void __iomem *hpet_virt_address; | ||
38 | |||
39 | struct hpet_dev { | ||
40 | struct clock_event_device evt; | ||
41 | unsigned int num; | ||
42 | int cpu; | ||
43 | unsigned int irq; | ||
44 | unsigned int flags; | ||
45 | char name[10]; | ||
46 | }; | ||
27 | 47 | ||
28 | unsigned long hpet_readl(unsigned long a) | 48 | unsigned long hpet_readl(unsigned long a) |
29 | { | 49 | { |
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void) | |||
59 | static int boot_hpet_disable; | 79 | static int boot_hpet_disable; |
60 | int hpet_force_user; | 80 | int hpet_force_user; |
61 | 81 | ||
62 | static int __init hpet_setup(char* str) | 82 | static int __init hpet_setup(char *str) |
63 | { | 83 | { |
64 | if (str) { | 84 | if (str) { |
65 | if (!strncmp("disable", str, 7)) | 85 | if (!strncmp("disable", str, 7)) |
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet); | |||
80 | 100 | ||
81 | static inline int is_hpet_capable(void) | 101 | static inline int is_hpet_capable(void) |
82 | { | 102 | { |
83 | return (!boot_hpet_disable && hpet_address); | 103 | return !boot_hpet_disable && hpet_address; |
84 | } | 104 | } |
85 | 105 | ||
86 | /* | 106 | /* |
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); | |||
102 | * timer 0 and timer 1 in case of RTC emulation. | 122 | * timer 0 and timer 1 in case of RTC emulation. |
103 | */ | 123 | */ |
104 | #ifdef CONFIG_HPET | 124 | #ifdef CONFIG_HPET |
125 | |||
126 | static void hpet_reserve_msi_timers(struct hpet_data *hd); | ||
127 | |||
105 | static void hpet_reserve_platform_timers(unsigned long id) | 128 | static void hpet_reserve_platform_timers(unsigned long id) |
106 | { | 129 | { |
107 | struct hpet __iomem *hpet = hpet_virt_address; | 130 | struct hpet __iomem *hpet = hpet_virt_address; |
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id) | |||
111 | 134 | ||
112 | nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; | 135 | nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; |
113 | 136 | ||
114 | memset(&hd, 0, sizeof (hd)); | 137 | memset(&hd, 0, sizeof(hd)); |
115 | hd.hd_phys_address = hpet_address; | 138 | hd.hd_phys_address = hpet_address; |
116 | hd.hd_address = hpet; | 139 | hd.hd_address = hpet; |
117 | hd.hd_nirqs = nrtimers; | 140 | hd.hd_nirqs = nrtimers; |
118 | hd.hd_flags = HPET_DATA_PLATFORM; | ||
119 | hpet_reserve_timer(&hd, 0); | 141 | hpet_reserve_timer(&hd, 0); |
120 | 142 | ||
121 | #ifdef CONFIG_HPET_EMULATE_RTC | 143 | #ifdef CONFIG_HPET_EMULATE_RTC |
122 | hpet_reserve_timer(&hd, 1); | 144 | hpet_reserve_timer(&hd, 1); |
123 | #endif | 145 | #endif |
124 | 146 | ||
147 | /* | ||
148 | * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 | ||
149 | * is wrong for i8259!) not the output IRQ. Many BIOS writers | ||
150 | * don't bother configuring *any* comparator interrupts. | ||
151 | */ | ||
125 | hd.hd_irq[0] = HPET_LEGACY_8254; | 152 | hd.hd_irq[0] = HPET_LEGACY_8254; |
126 | hd.hd_irq[1] = HPET_LEGACY_RTC; | 153 | hd.hd_irq[1] = HPET_LEGACY_RTC; |
127 | 154 | ||
128 | for (i = 2; i < nrtimers; timer++, i++) { | 155 | for (i = 2; i < nrtimers; timer++, i++) { |
129 | hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> | 156 | hd.hd_irq[i] = (readl(&timer->hpet_config) & |
130 | Tn_INT_ROUTE_CNF_SHIFT; | 157 | Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; |
131 | } | 158 | } |
132 | 159 | ||
160 | hpet_reserve_msi_timers(&hd); | ||
161 | |||
133 | hpet_alloc(&hd); | 162 | hpet_alloc(&hd); |
134 | 163 | ||
135 | } | 164 | } |
@@ -223,60 +252,70 @@ static void hpet_legacy_clockevent_register(void) | |||
223 | printk(KERN_DEBUG "hpet clockevent registered\n"); | 252 | printk(KERN_DEBUG "hpet clockevent registered\n"); |
224 | } | 253 | } |
225 | 254 | ||
226 | static void hpet_legacy_set_mode(enum clock_event_mode mode, | 255 | static int hpet_setup_msi_irq(unsigned int irq); |
227 | struct clock_event_device *evt) | 256 | |
257 | static void hpet_set_mode(enum clock_event_mode mode, | ||
258 | struct clock_event_device *evt, int timer) | ||
228 | { | 259 | { |
229 | unsigned long cfg, cmp, now; | 260 | unsigned long cfg, cmp, now; |
230 | uint64_t delta; | 261 | uint64_t delta; |
231 | 262 | ||
232 | switch(mode) { | 263 | switch (mode) { |
233 | case CLOCK_EVT_MODE_PERIODIC: | 264 | case CLOCK_EVT_MODE_PERIODIC: |
234 | delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; | 265 | delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; |
235 | delta >>= hpet_clockevent.shift; | 266 | delta >>= evt->shift; |
236 | now = hpet_readl(HPET_COUNTER); | 267 | now = hpet_readl(HPET_COUNTER); |
237 | cmp = now + (unsigned long) delta; | 268 | cmp = now + (unsigned long) delta; |
238 | cfg = hpet_readl(HPET_T0_CFG); | 269 | cfg = hpet_readl(HPET_Tn_CFG(timer)); |
239 | cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | | 270 | cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | |
240 | HPET_TN_SETVAL | HPET_TN_32BIT; | 271 | HPET_TN_SETVAL | HPET_TN_32BIT; |
241 | hpet_writel(cfg, HPET_T0_CFG); | 272 | hpet_writel(cfg, HPET_Tn_CFG(timer)); |
242 | /* | 273 | /* |
243 | * The first write after writing TN_SETVAL to the | 274 | * The first write after writing TN_SETVAL to the |
244 | * config register sets the counter value, the second | 275 | * config register sets the counter value, the second |
245 | * write sets the period. | 276 | * write sets the period. |
246 | */ | 277 | */ |
247 | hpet_writel(cmp, HPET_T0_CMP); | 278 | hpet_writel(cmp, HPET_Tn_CMP(timer)); |
248 | udelay(1); | 279 | udelay(1); |
249 | hpet_writel((unsigned long) delta, HPET_T0_CMP); | 280 | hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); |
250 | break; | 281 | break; |
251 | 282 | ||
252 | case CLOCK_EVT_MODE_ONESHOT: | 283 | case CLOCK_EVT_MODE_ONESHOT: |
253 | cfg = hpet_readl(HPET_T0_CFG); | 284 | cfg = hpet_readl(HPET_Tn_CFG(timer)); |
254 | cfg &= ~HPET_TN_PERIODIC; | 285 | cfg &= ~HPET_TN_PERIODIC; |
255 | cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; | 286 | cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; |
256 | hpet_writel(cfg, HPET_T0_CFG); | 287 | hpet_writel(cfg, HPET_Tn_CFG(timer)); |
257 | break; | 288 | break; |
258 | 289 | ||
259 | case CLOCK_EVT_MODE_UNUSED: | 290 | case CLOCK_EVT_MODE_UNUSED: |
260 | case CLOCK_EVT_MODE_SHUTDOWN: | 291 | case CLOCK_EVT_MODE_SHUTDOWN: |
261 | cfg = hpet_readl(HPET_T0_CFG); | 292 | cfg = hpet_readl(HPET_Tn_CFG(timer)); |
262 | cfg &= ~HPET_TN_ENABLE; | 293 | cfg &= ~HPET_TN_ENABLE; |
263 | hpet_writel(cfg, HPET_T0_CFG); | 294 | hpet_writel(cfg, HPET_Tn_CFG(timer)); |
264 | break; | 295 | break; |
265 | 296 | ||
266 | case CLOCK_EVT_MODE_RESUME: | 297 | case CLOCK_EVT_MODE_RESUME: |
267 | hpet_enable_legacy_int(); | 298 | if (timer == 0) { |
299 | hpet_enable_legacy_int(); | ||
300 | } else { | ||
301 | struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); | ||
302 | hpet_setup_msi_irq(hdev->irq); | ||
303 | disable_irq(hdev->irq); | ||
304 | irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); | ||
305 | enable_irq(hdev->irq); | ||
306 | } | ||
268 | break; | 307 | break; |
269 | } | 308 | } |
270 | } | 309 | } |
271 | 310 | ||
272 | static int hpet_legacy_next_event(unsigned long delta, | 311 | static int hpet_next_event(unsigned long delta, |
273 | struct clock_event_device *evt) | 312 | struct clock_event_device *evt, int timer) |
274 | { | 313 | { |
275 | u32 cnt; | 314 | u32 cnt; |
276 | 315 | ||
277 | cnt = hpet_readl(HPET_COUNTER); | 316 | cnt = hpet_readl(HPET_COUNTER); |
278 | cnt += (u32) delta; | 317 | cnt += (u32) delta; |
279 | hpet_writel(cnt, HPET_T0_CMP); | 318 | hpet_writel(cnt, HPET_Tn_CMP(timer)); |
280 | 319 | ||
281 | /* | 320 | /* |
282 | * We need to read back the CMP register to make sure that | 321 | * We need to read back the CMP register to make sure that |
@@ -288,6 +327,347 @@ static int hpet_legacy_next_event(unsigned long delta, | |||
288 | return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; | 327 | return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; |
289 | } | 328 | } |
290 | 329 | ||
330 | static void hpet_legacy_set_mode(enum clock_event_mode mode, | ||
331 | struct clock_event_device *evt) | ||
332 | { | ||
333 | hpet_set_mode(mode, evt, 0); | ||
334 | } | ||
335 | |||
336 | static int hpet_legacy_next_event(unsigned long delta, | ||
337 | struct clock_event_device *evt) | ||
338 | { | ||
339 | return hpet_next_event(delta, evt, 0); | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * HPET MSI Support | ||
344 | */ | ||
345 | #ifdef CONFIG_PCI_MSI | ||
346 | |||
347 | static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); | ||
348 | static struct hpet_dev *hpet_devs; | ||
349 | |||
350 | void hpet_msi_unmask(unsigned int irq) | ||
351 | { | ||
352 | struct hpet_dev *hdev = get_irq_data(irq); | ||
353 | unsigned long cfg; | ||
354 | |||
355 | /* unmask it */ | ||
356 | cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); | ||
357 | cfg |= HPET_TN_FSB; | ||
358 | hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); | ||
359 | } | ||
360 | |||
361 | void hpet_msi_mask(unsigned int irq) | ||
362 | { | ||
363 | unsigned long cfg; | ||
364 | struct hpet_dev *hdev = get_irq_data(irq); | ||
365 | |||
366 | /* mask it */ | ||
367 | cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); | ||
368 | cfg &= ~HPET_TN_FSB; | ||
369 | hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); | ||
370 | } | ||
371 | |||
372 | void hpet_msi_write(unsigned int irq, struct msi_msg *msg) | ||
373 | { | ||
374 | struct hpet_dev *hdev = get_irq_data(irq); | ||
375 | |||
376 | hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); | ||
377 | hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); | ||
378 | } | ||
379 | |||
380 | void hpet_msi_read(unsigned int irq, struct msi_msg *msg) | ||
381 | { | ||
382 | struct hpet_dev *hdev = get_irq_data(irq); | ||
383 | |||
384 | msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); | ||
385 | msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); | ||
386 | msg->address_hi = 0; | ||
387 | } | ||
388 | |||
389 | static void hpet_msi_set_mode(enum clock_event_mode mode, | ||
390 | struct clock_event_device *evt) | ||
391 | { | ||
392 | struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); | ||
393 | hpet_set_mode(mode, evt, hdev->num); | ||
394 | } | ||
395 | |||
396 | static int hpet_msi_next_event(unsigned long delta, | ||
397 | struct clock_event_device *evt) | ||
398 | { | ||
399 | struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); | ||
400 | return hpet_next_event(delta, evt, hdev->num); | ||
401 | } | ||
402 | |||
403 | static int hpet_setup_msi_irq(unsigned int irq) | ||
404 | { | ||
405 | if (arch_setup_hpet_msi(irq)) { | ||
406 | destroy_irq(irq); | ||
407 | return -EINVAL; | ||
408 | } | ||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static int hpet_assign_irq(struct hpet_dev *dev) | ||
413 | { | ||
414 | unsigned int irq; | ||
415 | |||
416 | irq = create_irq(); | ||
417 | if (!irq) | ||
418 | return -EINVAL; | ||
419 | |||
420 | set_irq_data(irq, dev); | ||
421 | |||
422 | if (hpet_setup_msi_irq(irq)) | ||
423 | return -EINVAL; | ||
424 | |||
425 | dev->irq = irq; | ||
426 | return 0; | ||
427 | } | ||
428 | |||
429 | static irqreturn_t hpet_interrupt_handler(int irq, void *data) | ||
430 | { | ||
431 | struct hpet_dev *dev = (struct hpet_dev *)data; | ||
432 | struct clock_event_device *hevt = &dev->evt; | ||
433 | |||
434 | if (!hevt->event_handler) { | ||
435 | printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", | ||
436 | dev->num); | ||
437 | return IRQ_HANDLED; | ||
438 | } | ||
439 | |||
440 | hevt->event_handler(hevt); | ||
441 | return IRQ_HANDLED; | ||
442 | } | ||
443 | |||
444 | static int hpet_setup_irq(struct hpet_dev *dev) | ||
445 | { | ||
446 | |||
447 | if (request_irq(dev->irq, hpet_interrupt_handler, | ||
448 | IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev)) | ||
449 | return -1; | ||
450 | |||
451 | disable_irq(dev->irq); | ||
452 | irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); | ||
453 | enable_irq(dev->irq); | ||
454 | |||
455 | printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", | ||
456 | dev->name, dev->irq); | ||
457 | |||
458 | return 0; | ||
459 | } | ||
460 | |||
461 | /* This should be called on the specified @cpu */ | ||
462 | static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) | ||
463 | { | ||
464 | struct clock_event_device *evt = &hdev->evt; | ||
465 | uint64_t hpet_freq; | ||
466 | |||
467 | WARN_ON(cpu != smp_processor_id()); | ||
468 | if (!(hdev->flags & HPET_DEV_VALID)) | ||
469 | return; | ||
470 | |||
471 | if (hpet_setup_msi_irq(hdev->irq)) | ||
472 | return; | ||
473 | |||
474 | hdev->cpu = cpu; | ||
475 | per_cpu(cpu_hpet_dev, cpu) = hdev; | ||
476 | evt->name = hdev->name; | ||
477 | hpet_setup_irq(hdev); | ||
478 | evt->irq = hdev->irq; | ||
479 | |||
480 | evt->rating = 110; | ||
481 | evt->features = CLOCK_EVT_FEAT_ONESHOT; | ||
482 | if (hdev->flags & HPET_DEV_PERI_CAP) | ||
483 | evt->features |= CLOCK_EVT_FEAT_PERIODIC; | ||
484 | |||
485 | evt->set_mode = hpet_msi_set_mode; | ||
486 | evt->set_next_event = hpet_msi_next_event; | ||
487 | evt->shift = 32; | ||
488 | |||
489 | /* | ||
490 | * The period is in femtoseconds. We need to calculate the | ||
491 | * scaled math multiplication factor for nanosecond to hpet tick | ||
492 | * conversion. | ||
493 | */ | ||
494 | hpet_freq = 1000000000000000ULL; | ||
495 | do_div(hpet_freq, hpet_period); | ||
496 | evt->mult = div_sc((unsigned long) hpet_freq, | ||
497 | NSEC_PER_SEC, evt->shift); | ||
498 | /* Calculate the max delta */ | ||
499 | evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); | ||
500 | /* 5 usec minimum reprogramming delta. */ | ||
501 | evt->min_delta_ns = 5000; | ||
502 | |||
503 | evt->cpumask = cpumask_of_cpu(hdev->cpu); | ||
504 | clockevents_register_device(evt); | ||
505 | } | ||
506 | |||
507 | #ifdef CONFIG_HPET | ||
508 | /* Reserve at least one timer for userspace (/dev/hpet) */ | ||
509 | #define RESERVE_TIMERS 1 | ||
510 | #else | ||
511 | #define RESERVE_TIMERS 0 | ||
512 | #endif | ||
513 | |||
514 | static void hpet_msi_capability_lookup(unsigned int start_timer) | ||
515 | { | ||
516 | unsigned int id; | ||
517 | unsigned int num_timers; | ||
518 | unsigned int num_timers_used = 0; | ||
519 | int i; | ||
520 | |||
521 | id = hpet_readl(HPET_ID); | ||
522 | |||
523 | num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); | ||
524 | num_timers++; /* Value read out starts from 0 */ | ||
525 | |||
526 | hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); | ||
527 | if (!hpet_devs) | ||
528 | return; | ||
529 | |||
530 | hpet_num_timers = num_timers; | ||
531 | |||
532 | for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { | ||
533 | struct hpet_dev *hdev = &hpet_devs[num_timers_used]; | ||
534 | unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); | ||
535 | |||
536 | /* Only consider HPET timers with MSI support */ | ||
537 | if (!(cfg & HPET_TN_FSB_CAP)) | ||
538 | continue; | ||
539 | |||
540 | hdev->flags = 0; | ||
541 | if (cfg & HPET_TN_PERIODIC_CAP) | ||
542 | hdev->flags |= HPET_DEV_PERI_CAP; | ||
543 | hdev->num = i; | ||
544 | |||
545 | sprintf(hdev->name, "hpet%d", i); | ||
546 | if (hpet_assign_irq(hdev)) | ||
547 | continue; | ||
548 | |||
549 | hdev->flags |= HPET_DEV_FSB_CAP; | ||
550 | hdev->flags |= HPET_DEV_VALID; | ||
551 | num_timers_used++; | ||
552 | if (num_timers_used == num_possible_cpus()) | ||
553 | break; | ||
554 | } | ||
555 | |||
556 | printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timers\n", | ||
557 | num_timers, num_timers_used); | ||
558 | } | ||
559 | |||
560 | #ifdef CONFIG_HPET | ||
561 | static void hpet_reserve_msi_timers(struct hpet_data *hd) | ||
562 | { | ||
563 | int i; | ||
564 | |||
565 | if (!hpet_devs) | ||
566 | return; | ||
567 | |||
568 | for (i = 0; i < hpet_num_timers; i++) { | ||
569 | struct hpet_dev *hdev = &hpet_devs[i]; | ||
570 | |||
571 | if (!(hdev->flags & HPET_DEV_VALID)) | ||
572 | continue; | ||
573 | |||
574 | hd->hd_irq[hdev->num] = hdev->irq; | ||
575 | hpet_reserve_timer(hd, hdev->num); | ||
576 | } | ||
577 | } | ||
578 | #endif | ||
579 | |||
580 | static struct hpet_dev *hpet_get_unused_timer(void) | ||
581 | { | ||
582 | int i; | ||
583 | |||
584 | if (!hpet_devs) | ||
585 | return NULL; | ||
586 | |||
587 | for (i = 0; i < hpet_num_timers; i++) { | ||
588 | struct hpet_dev *hdev = &hpet_devs[i]; | ||
589 | |||
590 | if (!(hdev->flags & HPET_DEV_VALID)) | ||
591 | continue; | ||
592 | if (test_and_set_bit(HPET_DEV_USED_BIT, | ||
593 | (unsigned long *)&hdev->flags)) | ||
594 | continue; | ||
595 | return hdev; | ||
596 | } | ||
597 | return NULL; | ||
598 | } | ||
599 | |||
600 | struct hpet_work_struct { | ||
601 | struct delayed_work work; | ||
602 | struct completion complete; | ||
603 | }; | ||
604 | |||
605 | static void hpet_work(struct work_struct *w) | ||
606 | { | ||
607 | struct hpet_dev *hdev; | ||
608 | int cpu = smp_processor_id(); | ||
609 | struct hpet_work_struct *hpet_work; | ||
610 | |||
611 | hpet_work = container_of(w, struct hpet_work_struct, work.work); | ||
612 | |||
613 | hdev = hpet_get_unused_timer(); | ||
614 | if (hdev) | ||
615 | init_one_hpet_msi_clockevent(hdev, cpu); | ||
616 | |||
617 | complete(&hpet_work->complete); | ||
618 | } | ||
619 | |||
620 | static int hpet_cpuhp_notify(struct notifier_block *n, | ||
621 | unsigned long action, void *hcpu) | ||
622 | { | ||
623 | unsigned long cpu = (unsigned long)hcpu; | ||
624 | struct hpet_work_struct work; | ||
625 | struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); | ||
626 | |||
627 | switch (action & 0xf) { | ||
628 | case CPU_ONLINE: | ||
629 | INIT_DELAYED_WORK(&work.work, hpet_work); | ||
630 | init_completion(&work.complete); | ||
631 | /* FIXME: add schedule_work_on() */ | ||
632 | schedule_delayed_work_on(cpu, &work.work, 0); | ||
633 | wait_for_completion(&work.complete); | ||
634 | break; | ||
635 | case CPU_DEAD: | ||
636 | if (hdev) { | ||
637 | free_irq(hdev->irq, hdev); | ||
638 | hdev->flags &= ~HPET_DEV_USED; | ||
639 | per_cpu(cpu_hpet_dev, cpu) = NULL; | ||
640 | } | ||
641 | break; | ||
642 | } | ||
643 | return NOTIFY_OK; | ||
644 | } | ||
645 | #else | ||
646 | |||
647 | static int hpet_setup_msi_irq(unsigned int irq) | ||
648 | { | ||
649 | return 0; | ||
650 | } | ||
651 | static void hpet_msi_capability_lookup(unsigned int start_timer) | ||
652 | { | ||
653 | return; | ||
654 | } | ||
655 | |||
656 | #ifdef CONFIG_HPET | ||
657 | static void hpet_reserve_msi_timers(struct hpet_data *hd) | ||
658 | { | ||
659 | return; | ||
660 | } | ||
661 | #endif | ||
662 | |||
663 | static int hpet_cpuhp_notify(struct notifier_block *n, | ||
664 | unsigned long action, void *hcpu) | ||
665 | { | ||
666 | return NOTIFY_OK; | ||
667 | } | ||
668 | |||
669 | #endif | ||
670 | |||
291 | /* | 671 | /* |
292 | * Clock source related code | 672 | * Clock source related code |
293 | */ | 673 | */ |
@@ -423,8 +803,10 @@ int __init hpet_enable(void) | |||
423 | 803 | ||
424 | if (id & HPET_ID_LEGSUP) { | 804 | if (id & HPET_ID_LEGSUP) { |
425 | hpet_legacy_clockevent_register(); | 805 | hpet_legacy_clockevent_register(); |
806 | hpet_msi_capability_lookup(2); | ||
426 | return 1; | 807 | return 1; |
427 | } | 808 | } |
809 | hpet_msi_capability_lookup(0); | ||
428 | return 0; | 810 | return 0; |
429 | 811 | ||
430 | out_nohpet: | 812 | out_nohpet: |
@@ -441,6 +823,8 @@ out_nohpet: | |||
441 | */ | 823 | */ |
442 | static __init int hpet_late_init(void) | 824 | static __init int hpet_late_init(void) |
443 | { | 825 | { |
826 | int cpu; | ||
827 | |||
444 | if (boot_hpet_disable) | 828 | if (boot_hpet_disable) |
445 | return -ENODEV; | 829 | return -ENODEV; |
446 | 830 | ||
@@ -456,6 +840,13 @@ static __init int hpet_late_init(void) | |||
456 | 840 | ||
457 | hpet_reserve_platform_timers(hpet_readl(HPET_ID)); | 841 | hpet_reserve_platform_timers(hpet_readl(HPET_ID)); |
458 | 842 | ||
843 | for_each_online_cpu(cpu) { | ||
844 | hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); | ||
845 | } | ||
846 | |||
847 | /* This notifier should be called after workqueue is ready */ | ||
848 | hotcpu_notifier(hpet_cpuhp_notify, -20); | ||
849 | |||
459 | return 0; | 850 | return 0; |
460 | } | 851 | } |
461 | fs_initcall(hpet_late_init); | 852 | fs_initcall(hpet_late_init); |
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c index 02063ae042f7..b764d7429c61 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic.c | |||
@@ -27,17 +27,21 @@ | |||
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/pci.h> | 28 | #include <linux/pci.h> |
29 | #include <linux/mc146818rtc.h> | 29 | #include <linux/mc146818rtc.h> |
30 | #include <linux/compiler.h> | ||
30 | #include <linux/acpi.h> | 31 | #include <linux/acpi.h> |
32 | #include <linux/module.h> | ||
31 | #include <linux/sysdev.h> | 33 | #include <linux/sysdev.h> |
32 | #include <linux/msi.h> | 34 | #include <linux/msi.h> |
33 | #include <linux/htirq.h> | 35 | #include <linux/htirq.h> |
34 | #include <linux/dmar.h> | 36 | #include <linux/freezer.h> |
35 | #include <linux/jiffies.h> | 37 | #include <linux/kthread.h> |
38 | #include <linux/jiffies.h> /* time_after() */ | ||
36 | #ifdef CONFIG_ACPI | 39 | #ifdef CONFIG_ACPI |
37 | #include <acpi/acpi_bus.h> | 40 | #include <acpi/acpi_bus.h> |
38 | #endif | 41 | #endif |
39 | #include <linux/bootmem.h> | 42 | #include <linux/bootmem.h> |
40 | #include <linux/dmar.h> | 43 | #include <linux/dmar.h> |
44 | #include <linux/hpet.h> | ||
41 | 45 | ||
42 | #include <asm/idle.h> | 46 | #include <asm/idle.h> |
43 | #include <asm/io.h> | 47 | #include <asm/io.h> |
@@ -46,61 +50,28 @@ | |||
46 | #include <asm/proto.h> | 50 | #include <asm/proto.h> |
47 | #include <asm/acpi.h> | 51 | #include <asm/acpi.h> |
48 | #include <asm/dma.h> | 52 | #include <asm/dma.h> |
53 | #include <asm/timer.h> | ||
49 | #include <asm/i8259.h> | 54 | #include <asm/i8259.h> |
50 | #include <asm/nmi.h> | 55 | #include <asm/nmi.h> |
51 | #include <asm/msidef.h> | 56 | #include <asm/msidef.h> |
52 | #include <asm/hypertransport.h> | 57 | #include <asm/hypertransport.h> |
58 | #include <asm/setup.h> | ||
53 | #include <asm/irq_remapping.h> | 59 | #include <asm/irq_remapping.h> |
60 | #include <asm/hpet.h> | ||
61 | #include <asm/uv/uv_hub.h> | ||
62 | #include <asm/uv/uv_irq.h> | ||
54 | 63 | ||
55 | #include <mach_ipi.h> | 64 | #include <mach_ipi.h> |
56 | #include <mach_apic.h> | 65 | #include <mach_apic.h> |
66 | #include <mach_apicdef.h> | ||
57 | 67 | ||
58 | #define __apicdebuginit(type) static type __init | 68 | #define __apicdebuginit(type) static type __init |
59 | 69 | ||
60 | struct irq_cfg { | 70 | /* |
61 | cpumask_t domain; | 71 | * Is the SiS APIC rmw bug present ? |
62 | cpumask_t old_domain; | 72 | * -1 = don't know, 0 = no, 1 = yes |
63 | unsigned move_cleanup_count; | 73 | */ |
64 | u8 vector; | 74 | int sis_apic_bug = -1; |
65 | u8 move_in_progress : 1; | ||
66 | }; | ||
67 | |||
68 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | ||
69 | static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { | ||
70 | [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, | ||
71 | [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, | ||
72 | [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, | ||
73 | [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, | ||
74 | [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, | ||
75 | [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, | ||
76 | [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, | ||
77 | [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, | ||
78 | [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, | ||
79 | [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, | ||
80 | [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, | ||
81 | [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, | ||
82 | [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, | ||
83 | [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, | ||
84 | [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, | ||
85 | [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, | ||
86 | }; | ||
87 | |||
88 | static int assign_irq_vector(int irq, cpumask_t mask); | ||
89 | |||
90 | int first_system_vector = 0xfe; | ||
91 | |||
92 | char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; | ||
93 | |||
94 | int sis_apic_bug; /* not actually supported, dummy for compile */ | ||
95 | |||
96 | static int no_timer_check; | ||
97 | |||
98 | static int disable_timer_pin_1 __initdata; | ||
99 | |||
100 | int timer_through_8259 __initdata; | ||
101 | |||
102 | /* Where if anywhere is the i8259 connect in external int mode */ | ||
103 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | ||
104 | 75 | ||
105 | static DEFINE_SPINLOCK(ioapic_lock); | 76 | static DEFINE_SPINLOCK(ioapic_lock); |
106 | static DEFINE_SPINLOCK(vector_lock); | 77 | static DEFINE_SPINLOCK(vector_lock); |
@@ -110,9 +81,6 @@ static DEFINE_SPINLOCK(vector_lock); | |||
110 | */ | 81 | */ |
111 | int nr_ioapic_registers[MAX_IO_APICS]; | 82 | int nr_ioapic_registers[MAX_IO_APICS]; |
112 | 83 | ||
113 | /* I/O APIC RTE contents at the OS boot up */ | ||
114 | struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; | ||
115 | |||
116 | /* I/O APIC entries */ | 84 | /* I/O APIC entries */ |
117 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; | 85 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; |
118 | int nr_ioapics; | 86 | int nr_ioapics; |
@@ -123,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | |||
123 | /* # of MP IRQ source entries */ | 91 | /* # of MP IRQ source entries */ |
124 | int mp_irq_entries; | 92 | int mp_irq_entries; |
125 | 93 | ||
94 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) | ||
95 | int mp_bus_id_to_type[MAX_MP_BUSSES]; | ||
96 | #endif | ||
97 | |||
126 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | 98 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); |
127 | 99 | ||
100 | int skip_ioapic_setup; | ||
101 | |||
102 | static int __init parse_noapic(char *str) | ||
103 | { | ||
104 | /* disable IO-APIC */ | ||
105 | disable_ioapic_setup(); | ||
106 | return 0; | ||
107 | } | ||
108 | early_param("noapic", parse_noapic); | ||
109 | |||
110 | struct irq_pin_list; | ||
111 | struct irq_cfg { | ||
112 | unsigned int irq; | ||
113 | struct irq_pin_list *irq_2_pin; | ||
114 | cpumask_t domain; | ||
115 | cpumask_t old_domain; | ||
116 | unsigned move_cleanup_count; | ||
117 | u8 vector; | ||
118 | u8 move_in_progress : 1; | ||
119 | }; | ||
120 | |||
121 | /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ | ||
122 | static struct irq_cfg irq_cfgx[NR_IRQS] = { | ||
123 | [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, | ||
124 | [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, | ||
125 | [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, | ||
126 | [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, | ||
127 | [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, | ||
128 | [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, | ||
129 | [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, | ||
130 | [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, | ||
131 | [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, | ||
132 | [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, | ||
133 | [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, | ||
134 | [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, | ||
135 | [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, | ||
136 | [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, | ||
137 | [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, | ||
138 | [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, | ||
139 | }; | ||
140 | |||
141 | #define for_each_irq_cfg(irq, cfg) \ | ||
142 | for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) | ||
143 | |||
144 | static struct irq_cfg *irq_cfg(unsigned int irq) | ||
145 | { | ||
146 | return irq < nr_irqs ? irq_cfgx + irq : NULL; | ||
147 | } | ||
148 | |||
149 | static struct irq_cfg *irq_cfg_alloc(unsigned int irq) | ||
150 | { | ||
151 | return irq_cfg(irq); | ||
152 | } | ||
153 | |||
128 | /* | 154 | /* |
129 | * Rough estimation of how many shared IRQs there are, can | 155 | * Rough estimation of how many shared IRQs there are, can be changed |
130 | * be changed anytime. | 156 | * anytime. |
131 | */ | 157 | */ |
132 | #define MAX_PLUS_SHARED_IRQS NR_IRQS | 158 | #define MAX_PLUS_SHARED_IRQS NR_IRQS |
133 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | 159 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) |
@@ -139,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | |||
139 | * between pins and IRQs. | 165 | * between pins and IRQs. |
140 | */ | 166 | */ |
141 | 167 | ||
142 | static struct irq_pin_list { | 168 | struct irq_pin_list { |
143 | short apic, pin, next; | 169 | int apic, pin; |
144 | } irq_2_pin[PIN_MAP_SIZE]; | 170 | struct irq_pin_list *next; |
171 | }; | ||
172 | |||
173 | static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; | ||
174 | static struct irq_pin_list *irq_2_pin_ptr; | ||
175 | |||
176 | static void __init irq_2_pin_init(void) | ||
177 | { | ||
178 | struct irq_pin_list *pin = irq_2_pin_head; | ||
179 | int i; | ||
180 | |||
181 | for (i = 1; i < PIN_MAP_SIZE; i++) | ||
182 | pin[i-1].next = &pin[i]; | ||
183 | |||
184 | irq_2_pin_ptr = &pin[0]; | ||
185 | } | ||
186 | |||
187 | static struct irq_pin_list *get_one_free_irq_2_pin(void) | ||
188 | { | ||
189 | struct irq_pin_list *pin = irq_2_pin_ptr; | ||
190 | |||
191 | if (!pin) | ||
192 | panic("can not get more irq_2_pin\n"); | ||
193 | |||
194 | irq_2_pin_ptr = pin->next; | ||
195 | pin->next = NULL; | ||
196 | return pin; | ||
197 | } | ||
145 | 198 | ||
146 | struct io_apic { | 199 | struct io_apic { |
147 | unsigned int index; | 200 | unsigned int index; |
@@ -172,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i | |||
172 | /* | 225 | /* |
173 | * Re-write a value: to be used for read-modify-write | 226 | * Re-write a value: to be used for read-modify-write |
174 | * cycles where the read already set up the index register. | 227 | * cycles where the read already set up the index register. |
228 | * | ||
229 | * Older SiS APICs require that we rewrite the index register | ||
175 | */ | 230 | */ |
176 | static inline void io_apic_modify(unsigned int apic, unsigned int value) | 231 | static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) |
177 | { | 232 | { |
178 | struct io_apic __iomem *io_apic = io_apic_base(apic); | 233 | struct io_apic __iomem *io_apic = io_apic_base(apic); |
234 | |||
235 | if (sis_apic_bug) | ||
236 | writel(reg, &io_apic->index); | ||
179 | writel(value, &io_apic->data); | 237 | writel(value, &io_apic->data); |
180 | } | 238 | } |
181 | 239 | ||
@@ -183,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq) | |||
183 | { | 241 | { |
184 | struct irq_pin_list *entry; | 242 | struct irq_pin_list *entry; |
185 | unsigned long flags; | 243 | unsigned long flags; |
244 | struct irq_cfg *cfg = irq_cfg(irq); | ||
186 | 245 | ||
187 | spin_lock_irqsave(&ioapic_lock, flags); | 246 | spin_lock_irqsave(&ioapic_lock, flags); |
188 | entry = irq_2_pin + irq; | 247 | entry = cfg->irq_2_pin; |
189 | for (;;) { | 248 | for (;;) { |
190 | unsigned int reg; | 249 | unsigned int reg; |
191 | int pin; | 250 | int pin; |
192 | 251 | ||
193 | pin = entry->pin; | 252 | if (!entry) |
194 | if (pin == -1) | ||
195 | break; | 253 | break; |
254 | pin = entry->pin; | ||
196 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | 255 | reg = io_apic_read(entry->apic, 0x10 + pin*2); |
197 | /* Is the remote IRR bit set? */ | 256 | /* Is the remote IRR bit set? */ |
198 | if (reg & IO_APIC_REDIR_REMOTE_IRR) { | 257 | if (reg & IO_APIC_REDIR_REMOTE_IRR) { |
@@ -201,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq) | |||
201 | } | 260 | } |
202 | if (!entry->next) | 261 | if (!entry->next) |
203 | break; | 262 | break; |
204 | entry = irq_2_pin + entry->next; | 263 | entry = entry->next; |
205 | } | 264 | } |
206 | spin_unlock_irqrestore(&ioapic_lock, flags); | 265 | spin_unlock_irqrestore(&ioapic_lock, flags); |
207 | 266 | ||
208 | return false; | 267 | return false; |
209 | } | 268 | } |
210 | 269 | ||
211 | /* | ||
212 | * Synchronize the IO-APIC and the CPU by doing | ||
213 | * a dummy read from the IO-APIC | ||
214 | */ | ||
215 | static inline void io_apic_sync(unsigned int apic) | ||
216 | { | ||
217 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
218 | readl(&io_apic->data); | ||
219 | } | ||
220 | |||
221 | #define __DO_ACTION(R, ACTION, FINAL) \ | ||
222 | \ | ||
223 | { \ | ||
224 | int pin; \ | ||
225 | struct irq_pin_list *entry = irq_2_pin + irq; \ | ||
226 | \ | ||
227 | BUG_ON(irq >= NR_IRQS); \ | ||
228 | for (;;) { \ | ||
229 | unsigned int reg; \ | ||
230 | pin = entry->pin; \ | ||
231 | if (pin == -1) \ | ||
232 | break; \ | ||
233 | reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ | ||
234 | reg ACTION; \ | ||
235 | io_apic_modify(entry->apic, reg); \ | ||
236 | FINAL; \ | ||
237 | if (!entry->next) \ | ||
238 | break; \ | ||
239 | entry = irq_2_pin + entry->next; \ | ||
240 | } \ | ||
241 | } | ||
242 | |||
243 | union entry_union { | 270 | union entry_union { |
244 | struct { u32 w1, w2; }; | 271 | struct { u32 w1, w2; }; |
245 | struct IO_APIC_route_entry entry; | 272 | struct IO_APIC_route_entry entry; |
@@ -299,59 +326,71 @@ static void ioapic_mask_entry(int apic, int pin) | |||
299 | static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) | 326 | static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) |
300 | { | 327 | { |
301 | int apic, pin; | 328 | int apic, pin; |
302 | struct irq_pin_list *entry = irq_2_pin + irq; | 329 | struct irq_cfg *cfg; |
330 | struct irq_pin_list *entry; | ||
303 | 331 | ||
304 | BUG_ON(irq >= NR_IRQS); | 332 | cfg = irq_cfg(irq); |
333 | entry = cfg->irq_2_pin; | ||
305 | for (;;) { | 334 | for (;;) { |
306 | unsigned int reg; | 335 | unsigned int reg; |
336 | |||
337 | if (!entry) | ||
338 | break; | ||
339 | |||
307 | apic = entry->apic; | 340 | apic = entry->apic; |
308 | pin = entry->pin; | 341 | pin = entry->pin; |
309 | if (pin == -1) | 342 | #ifdef CONFIG_INTR_REMAP |
310 | break; | ||
311 | /* | 343 | /* |
312 | * With interrupt-remapping, destination information comes | 344 | * With interrupt-remapping, destination information comes |
313 | * from interrupt-remapping table entry. | 345 | * from interrupt-remapping table entry. |
314 | */ | 346 | */ |
315 | if (!irq_remapped(irq)) | 347 | if (!irq_remapped(irq)) |
316 | io_apic_write(apic, 0x11 + pin*2, dest); | 348 | io_apic_write(apic, 0x11 + pin*2, dest); |
349 | #else | ||
350 | io_apic_write(apic, 0x11 + pin*2, dest); | ||
351 | #endif | ||
317 | reg = io_apic_read(apic, 0x10 + pin*2); | 352 | reg = io_apic_read(apic, 0x10 + pin*2); |
318 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; | 353 | reg &= ~IO_APIC_REDIR_VECTOR_MASK; |
319 | reg |= vector; | 354 | reg |= vector; |
320 | io_apic_modify(apic, reg); | 355 | io_apic_modify(apic, 0x10 + pin*2, reg); |
321 | if (!entry->next) | 356 | if (!entry->next) |
322 | break; | 357 | break; |
323 | entry = irq_2_pin + entry->next; | 358 | entry = entry->next; |
324 | } | 359 | } |
325 | } | 360 | } |
326 | 361 | ||
362 | static int assign_irq_vector(int irq, cpumask_t mask); | ||
363 | |||
327 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | 364 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) |
328 | { | 365 | { |
329 | struct irq_cfg *cfg = irq_cfg + irq; | 366 | struct irq_cfg *cfg; |
330 | unsigned long flags; | 367 | unsigned long flags; |
331 | unsigned int dest; | 368 | unsigned int dest; |
332 | cpumask_t tmp; | 369 | cpumask_t tmp; |
370 | struct irq_desc *desc; | ||
333 | 371 | ||
334 | cpus_and(tmp, mask, cpu_online_map); | 372 | cpus_and(tmp, mask, cpu_online_map); |
335 | if (cpus_empty(tmp)) | 373 | if (cpus_empty(tmp)) |
336 | return; | 374 | return; |
337 | 375 | ||
376 | cfg = irq_cfg(irq); | ||
338 | if (assign_irq_vector(irq, mask)) | 377 | if (assign_irq_vector(irq, mask)) |
339 | return; | 378 | return; |
340 | 379 | ||
341 | cpus_and(tmp, cfg->domain, mask); | 380 | cpus_and(tmp, cfg->domain, mask); |
342 | dest = cpu_mask_to_apicid(tmp); | 381 | dest = cpu_mask_to_apicid(tmp); |
343 | |||
344 | /* | 382 | /* |
345 | * Only the high 8 bits are valid. | 383 | * Only the high 8 bits are valid. |
346 | */ | 384 | */ |
347 | dest = SET_APIC_LOGICAL_ID(dest); | 385 | dest = SET_APIC_LOGICAL_ID(dest); |
348 | 386 | ||
387 | desc = irq_to_desc(irq); | ||
349 | spin_lock_irqsave(&ioapic_lock, flags); | 388 | spin_lock_irqsave(&ioapic_lock, flags); |
350 | __target_IO_APIC_irq(irq, dest, cfg->vector); | 389 | __target_IO_APIC_irq(irq, dest, cfg->vector); |
351 | irq_desc[irq].affinity = mask; | 390 | desc->affinity = mask; |
352 | spin_unlock_irqrestore(&ioapic_lock, flags); | 391 | spin_unlock_irqrestore(&ioapic_lock, flags); |
353 | } | 392 | } |
354 | #endif | 393 | #endif /* CONFIG_SMP */ |
355 | 394 | ||
356 | /* | 395 | /* |
357 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | 396 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are |
@@ -360,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | |||
360 | */ | 399 | */ |
361 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | 400 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) |
362 | { | 401 | { |
363 | static int first_free_entry = NR_IRQS; | 402 | struct irq_cfg *cfg; |
364 | struct irq_pin_list *entry = irq_2_pin + irq; | 403 | struct irq_pin_list *entry; |
404 | |||
405 | /* first time to refer to this irq_cfg, so allocate it */ | ||
406 | cfg = irq_cfg_alloc(irq); | ||
407 | entry = cfg->irq_2_pin; | ||
408 | if (!entry) { | ||
409 | entry = get_one_free_irq_2_pin(); | ||
410 | cfg->irq_2_pin = entry; | ||
411 | entry->apic = apic; | ||
412 | entry->pin = pin; | ||
413 | return; | ||
414 | } | ||
365 | 415 | ||
366 | BUG_ON(irq >= NR_IRQS); | 416 | while (entry->next) { |
367 | while (entry->next) | 417 | /* not again, please */ |
368 | entry = irq_2_pin + entry->next; | 418 | if (entry->apic == apic && entry->pin == pin) |
419 | return; | ||
369 | 420 | ||
370 | if (entry->pin != -1) { | 421 | entry = entry->next; |
371 | entry->next = first_free_entry; | ||
372 | entry = irq_2_pin + entry->next; | ||
373 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
374 | panic("io_apic.c: ran out of irq_2_pin entries!"); | ||
375 | } | 422 | } |
423 | |||
424 | entry->next = get_one_free_irq_2_pin(); | ||
425 | entry = entry->next; | ||
376 | entry->apic = apic; | 426 | entry->apic = apic; |
377 | entry->pin = pin; | 427 | entry->pin = pin; |
378 | } | 428 | } |
@@ -384,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq, | |||
384 | int oldapic, int oldpin, | 434 | int oldapic, int oldpin, |
385 | int newapic, int newpin) | 435 | int newapic, int newpin) |
386 | { | 436 | { |
387 | struct irq_pin_list *entry = irq_2_pin + irq; | 437 | struct irq_cfg *cfg = irq_cfg(irq); |
438 | struct irq_pin_list *entry = cfg->irq_2_pin; | ||
439 | int replaced = 0; | ||
388 | 440 | ||
389 | while (1) { | 441 | while (entry) { |
390 | if (entry->apic == oldapic && entry->pin == oldpin) { | 442 | if (entry->apic == oldapic && entry->pin == oldpin) { |
391 | entry->apic = newapic; | 443 | entry->apic = newapic; |
392 | entry->pin = newpin; | 444 | entry->pin = newpin; |
393 | } | 445 | replaced = 1; |
394 | if (!entry->next) | 446 | /* every one is different, right? */ |
395 | break; | 447 | break; |
396 | entry = irq_2_pin + entry->next; | 448 | } |
449 | entry = entry->next; | ||
450 | } | ||
451 | |||
452 | /* why? call replace before add? */ | ||
453 | if (!replaced) | ||
454 | add_pin_to_irq(irq, newapic, newpin); | ||
455 | } | ||
456 | |||
457 | static inline void io_apic_modify_irq(unsigned int irq, | ||
458 | int mask_and, int mask_or, | ||
459 | void (*final)(struct irq_pin_list *entry)) | ||
460 | { | ||
461 | int pin; | ||
462 | struct irq_cfg *cfg; | ||
463 | struct irq_pin_list *entry; | ||
464 | |||
465 | cfg = irq_cfg(irq); | ||
466 | for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { | ||
467 | unsigned int reg; | ||
468 | pin = entry->pin; | ||
469 | reg = io_apic_read(entry->apic, 0x10 + pin * 2); | ||
470 | reg &= mask_and; | ||
471 | reg |= mask_or; | ||
472 | io_apic_modify(entry->apic, 0x10 + pin * 2, reg); | ||
473 | if (final) | ||
474 | final(entry); | ||
397 | } | 475 | } |
398 | } | 476 | } |
399 | 477 | ||
478 | static void __unmask_IO_APIC_irq(unsigned int irq) | ||
479 | { | ||
480 | io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); | ||
481 | } | ||
400 | 482 | ||
401 | #define DO_ACTION(name,R,ACTION, FINAL) \ | 483 | #ifdef CONFIG_X86_64 |
402 | \ | 484 | void io_apic_sync(struct irq_pin_list *entry) |
403 | static void name##_IO_APIC_irq (unsigned int irq) \ | 485 | { |
404 | __DO_ACTION(R, ACTION, FINAL) | 486 | /* |
487 | * Synchronize the IO-APIC and the CPU by doing | ||
488 | * a dummy read from the IO-APIC | ||
489 | */ | ||
490 | struct io_apic __iomem *io_apic; | ||
491 | io_apic = io_apic_base(entry->apic); | ||
492 | readl(&io_apic->data); | ||
493 | } | ||
405 | 494 | ||
406 | /* mask = 1 */ | 495 | static void __mask_IO_APIC_irq(unsigned int irq) |
407 | DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) | 496 | { |
497 | io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); | ||
498 | } | ||
499 | #else /* CONFIG_X86_32 */ | ||
500 | static void __mask_IO_APIC_irq(unsigned int irq) | ||
501 | { | ||
502 | io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); | ||
503 | } | ||
408 | 504 | ||
409 | /* mask = 0 */ | 505 | static void __mask_and_edge_IO_APIC_irq(unsigned int irq) |
410 | DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) | 506 | { |
507 | io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, | ||
508 | IO_APIC_REDIR_MASKED, NULL); | ||
509 | } | ||
510 | |||
511 | static void __unmask_and_level_IO_APIC_irq(unsigned int irq) | ||
512 | { | ||
513 | io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, | ||
514 | IO_APIC_REDIR_LEVEL_TRIGGER, NULL); | ||
515 | } | ||
516 | #endif /* CONFIG_X86_32 */ | ||
411 | 517 | ||
412 | static void mask_IO_APIC_irq (unsigned int irq) | 518 | static void mask_IO_APIC_irq (unsigned int irq) |
413 | { | 519 | { |
@@ -450,6 +556,68 @@ static void clear_IO_APIC (void) | |||
450 | clear_IO_APIC_pin(apic, pin); | 556 | clear_IO_APIC_pin(apic, pin); |
451 | } | 557 | } |
452 | 558 | ||
559 | #if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) | ||
560 | void send_IPI_self(int vector) | ||
561 | { | ||
562 | unsigned int cfg; | ||
563 | |||
564 | /* | ||
565 | * Wait for idle. | ||
566 | */ | ||
567 | apic_wait_icr_idle(); | ||
568 | cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; | ||
569 | /* | ||
570 | * Send the IPI. The write to APIC_ICR fires this off. | ||
571 | */ | ||
572 | apic_write(APIC_ICR, cfg); | ||
573 | } | ||
574 | #endif /* !CONFIG_SMP && CONFIG_X86_32*/ | ||
575 | |||
576 | #ifdef CONFIG_X86_32 | ||
577 | /* | ||
578 | * support for broken MP BIOSes, enables hand-redirection of PIRQ0-7 to | ||
579 | * specific CPU-side IRQs. | ||
580 | */ | ||
581 | |||
582 | #define MAX_PIRQS 8 | ||
583 | static int pirq_entries [MAX_PIRQS]; | ||
584 | static int pirqs_enabled; | ||
585 | |||
586 | static int __init ioapic_pirq_setup(char *str) | ||
587 | { | ||
588 | int i, max; | ||
589 | int ints[MAX_PIRQS+1]; | ||
590 | |||
591 | get_options(str, ARRAY_SIZE(ints), ints); | ||
592 | |||
593 | for (i = 0; i < MAX_PIRQS; i++) | ||
594 | pirq_entries[i] = -1; | ||
595 | |||
596 | pirqs_enabled = 1; | ||
597 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
598 | "PIRQ redirection, working around broken MP-BIOS.\n"); | ||
599 | max = MAX_PIRQS; | ||
600 | if (ints[0] < MAX_PIRQS) | ||
601 | max = ints[0]; | ||
602 | |||
603 | for (i = 0; i < max; i++) { | ||
604 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
605 | "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | ||
606 | /* | ||
607 | * PIRQs are mapped upside down, usually. | ||
608 | */ | ||
609 | pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | ||
610 | } | ||
611 | return 1; | ||
612 | } | ||
613 | |||
614 | __setup("pirq=", ioapic_pirq_setup); | ||
615 | #endif /* CONFIG_X86_32 */ | ||
616 | |||
617 | #ifdef CONFIG_INTR_REMAP | ||
618 | /* I/O APIC RTE contents at the OS boot up */ | ||
619 | static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; | ||
620 | |||
453 | /* | 621 | /* |
454 | * Saves and masks all the unmasked IO-APIC RTE's | 622 | * Saves and masks all the unmasked IO-APIC RTE's |
455 | */ | 623 | */ |
@@ -474,7 +642,7 @@ int save_mask_IO_APIC_setup(void) | |||
474 | kzalloc(sizeof(struct IO_APIC_route_entry) * | 642 | kzalloc(sizeof(struct IO_APIC_route_entry) * |
475 | nr_ioapic_registers[apic], GFP_KERNEL); | 643 | nr_ioapic_registers[apic], GFP_KERNEL); |
476 | if (!early_ioapic_entries[apic]) | 644 | if (!early_ioapic_entries[apic]) |
477 | return -ENOMEM; | 645 | goto nomem; |
478 | } | 646 | } |
479 | 647 | ||
480 | for (apic = 0; apic < nr_ioapics; apic++) | 648 | for (apic = 0; apic < nr_ioapics; apic++) |
@@ -488,17 +656,31 @@ int save_mask_IO_APIC_setup(void) | |||
488 | ioapic_write_entry(apic, pin, entry); | 656 | ioapic_write_entry(apic, pin, entry); |
489 | } | 657 | } |
490 | } | 658 | } |
659 | |||
491 | return 0; | 660 | return 0; |
661 | |||
662 | nomem: | ||
663 | while (apic >= 0) | ||
664 | kfree(early_ioapic_entries[apic--]); | ||
665 | memset(early_ioapic_entries, 0, | ||
666 | ARRAY_SIZE(early_ioapic_entries)); | ||
667 | |||
668 | return -ENOMEM; | ||
492 | } | 669 | } |
493 | 670 | ||
494 | void restore_IO_APIC_setup(void) | 671 | void restore_IO_APIC_setup(void) |
495 | { | 672 | { |
496 | int apic, pin; | 673 | int apic, pin; |
497 | 674 | ||
498 | for (apic = 0; apic < nr_ioapics; apic++) | 675 | for (apic = 0; apic < nr_ioapics; apic++) { |
676 | if (!early_ioapic_entries[apic]) | ||
677 | break; | ||
499 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | 678 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) |
500 | ioapic_write_entry(apic, pin, | 679 | ioapic_write_entry(apic, pin, |
501 | early_ioapic_entries[apic][pin]); | 680 | early_ioapic_entries[apic][pin]); |
681 | kfree(early_ioapic_entries[apic]); | ||
682 | early_ioapic_entries[apic] = NULL; | ||
683 | } | ||
502 | } | 684 | } |
503 | 685 | ||
504 | void reinit_intr_remapped_IO_APIC(int intr_remapping) | 686 | void reinit_intr_remapped_IO_APIC(int intr_remapping) |
@@ -512,25 +694,7 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping) | |||
512 | */ | 694 | */ |
513 | restore_IO_APIC_setup(); | 695 | restore_IO_APIC_setup(); |
514 | } | 696 | } |
515 | 697 | #endif | |
516 | int skip_ioapic_setup; | ||
517 | int ioapic_force; | ||
518 | |||
519 | static int __init parse_noapic(char *str) | ||
520 | { | ||
521 | disable_ioapic_setup(); | ||
522 | return 0; | ||
523 | } | ||
524 | early_param("noapic", parse_noapic); | ||
525 | |||
526 | /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ | ||
527 | static int __init disable_timer_pin_setup(char *arg) | ||
528 | { | ||
529 | disable_timer_pin_1 = 1; | ||
530 | return 1; | ||
531 | } | ||
532 | __setup("disable_timer_pin_1", disable_timer_pin_setup); | ||
533 | |||
534 | 698 | ||
535 | /* | 699 | /* |
536 | * Find the IRQ entry number of a certain pin. | 700 | * Find the IRQ entry number of a certain pin. |
@@ -634,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | |||
634 | best_guess = irq; | 798 | best_guess = irq; |
635 | } | 799 | } |
636 | } | 800 | } |
637 | BUG_ON(best_guess >= NR_IRQS); | ||
638 | return best_guess; | 801 | return best_guess; |
639 | } | 802 | } |
640 | 803 | ||
804 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | ||
805 | |||
806 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | ||
807 | /* | ||
808 | * EISA Edge/Level control register, ELCR | ||
809 | */ | ||
810 | static int EISA_ELCR(unsigned int irq) | ||
811 | { | ||
812 | if (irq < 16) { | ||
813 | unsigned int port = 0x4d0 + (irq >> 3); | ||
814 | return (inb(port) >> (irq & 7)) & 1; | ||
815 | } | ||
816 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
817 | "Broken MPtable reports ISA irq %d\n", irq); | ||
818 | return 0; | ||
819 | } | ||
820 | |||
821 | #endif | ||
822 | |||
641 | /* ISA interrupts are always polarity zero edge triggered, | 823 | /* ISA interrupts are always polarity zero edge triggered, |
642 | * when listed as conforming in the MP table. */ | 824 | * when listed as conforming in the MP table. */ |
643 | 825 | ||
644 | #define default_ISA_trigger(idx) (0) | 826 | #define default_ISA_trigger(idx) (0) |
645 | #define default_ISA_polarity(idx) (0) | 827 | #define default_ISA_polarity(idx) (0) |
646 | 828 | ||
829 | /* EISA interrupts are always polarity zero and can be edge or level | ||
830 | * trigger depending on the ELCR value. If an interrupt is listed as | ||
831 | * EISA conforming in the MP table, that means its trigger type must | ||
832 | * be read in from the ELCR */ | ||
833 | |||
834 | #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) | ||
835 | #define default_EISA_polarity(idx) default_ISA_polarity(idx) | ||
836 | |||
647 | /* PCI interrupts are always polarity one level triggered, | 837 | /* PCI interrupts are always polarity one level triggered, |
648 | * when listed as conforming in the MP table. */ | 838 | * when listed as conforming in the MP table. */ |
649 | 839 | ||
650 | #define default_PCI_trigger(idx) (1) | 840 | #define default_PCI_trigger(idx) (1) |
651 | #define default_PCI_polarity(idx) (1) | 841 | #define default_PCI_polarity(idx) (1) |
652 | 842 | ||
843 | /* MCA interrupts are always polarity zero level triggered, | ||
844 | * when listed as conforming in the MP table. */ | ||
845 | |||
846 | #define default_MCA_trigger(idx) (1) | ||
847 | #define default_MCA_polarity(idx) default_ISA_polarity(idx) | ||
848 | |||
653 | static int MPBIOS_polarity(int idx) | 849 | static int MPBIOS_polarity(int idx) |
654 | { | 850 | { |
655 | int bus = mp_irqs[idx].mp_srcbus; | 851 | int bus = mp_irqs[idx].mp_srcbus; |
@@ -707,6 +903,36 @@ static int MPBIOS_trigger(int idx) | |||
707 | trigger = default_ISA_trigger(idx); | 903 | trigger = default_ISA_trigger(idx); |
708 | else | 904 | else |
709 | trigger = default_PCI_trigger(idx); | 905 | trigger = default_PCI_trigger(idx); |
906 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | ||
907 | switch (mp_bus_id_to_type[bus]) { | ||
908 | case MP_BUS_ISA: /* ISA pin */ | ||
909 | { | ||
910 | /* set before the switch */ | ||
911 | break; | ||
912 | } | ||
913 | case MP_BUS_EISA: /* EISA pin */ | ||
914 | { | ||
915 | trigger = default_EISA_trigger(idx); | ||
916 | break; | ||
917 | } | ||
918 | case MP_BUS_PCI: /* PCI pin */ | ||
919 | { | ||
920 | /* set before the switch */ | ||
921 | break; | ||
922 | } | ||
923 | case MP_BUS_MCA: /* MCA pin */ | ||
924 | { | ||
925 | trigger = default_MCA_trigger(idx); | ||
926 | break; | ||
927 | } | ||
928 | default: | ||
929 | { | ||
930 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
931 | trigger = 1; | ||
932 | break; | ||
933 | } | ||
934 | } | ||
935 | #endif | ||
710 | break; | 936 | break; |
711 | case 1: /* edge */ | 937 | case 1: /* edge */ |
712 | { | 938 | { |
@@ -744,6 +970,7 @@ static inline int irq_trigger(int idx) | |||
744 | return MPBIOS_trigger(idx); | 970 | return MPBIOS_trigger(idx); |
745 | } | 971 | } |
746 | 972 | ||
973 | int (*ioapic_renumber_irq)(int ioapic, int irq); | ||
747 | static int pin_2_irq(int idx, int apic, int pin) | 974 | static int pin_2_irq(int idx, int apic, int pin) |
748 | { | 975 | { |
749 | int irq, i; | 976 | int irq, i; |
@@ -765,8 +992,32 @@ static int pin_2_irq(int idx, int apic, int pin) | |||
765 | while (i < apic) | 992 | while (i < apic) |
766 | irq += nr_ioapic_registers[i++]; | 993 | irq += nr_ioapic_registers[i++]; |
767 | irq += pin; | 994 | irq += pin; |
995 | /* | ||
996 | * For MPS mode, so far only needed by ES7000 platform | ||
997 | */ | ||
998 | if (ioapic_renumber_irq) | ||
999 | irq = ioapic_renumber_irq(apic, irq); | ||
768 | } | 1000 | } |
769 | BUG_ON(irq >= NR_IRQS); | 1001 | |
1002 | #ifdef CONFIG_X86_32 | ||
1003 | /* | ||
1004 | * PCI IRQ command line redirection. Yes, limits are hardcoded. | ||
1005 | */ | ||
1006 | if ((pin >= 16) && (pin <= 23)) { | ||
1007 | if (pirq_entries[pin-16] != -1) { | ||
1008 | if (!pirq_entries[pin-16]) { | ||
1009 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1010 | "disabling PIRQ%d\n", pin-16); | ||
1011 | } else { | ||
1012 | irq = pirq_entries[pin-16]; | ||
1013 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1014 | "using PIRQ%d -> IRQ %d\n", | ||
1015 | pin-16, irq); | ||
1016 | } | ||
1017 | } | ||
1018 | } | ||
1019 | #endif | ||
1020 | |||
770 | return irq; | 1021 | return irq; |
771 | } | 1022 | } |
772 | 1023 | ||
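The new ioapic_renumber_irq hook lets a sub-architecture rewrite the pin-to-IRQ number computed above; in MPS mode only ES7000 needs it so far. A hypothetical hook, with a policy invented purely for illustration, would be installed roughly like this:

        /* Hypothetical: push pins of secondary IO-APICs out of the legacy
         * 0-15 range. The real ES7000 code applies its own GSI mapping. */
        static int sample_renumber(int ioapic, int irq)
        {
                if (ioapic > 0 && irq < 16)
                        irq += 16;
                return irq;
        }

        static void __init sample_platform_setup(void)
        {
                ioapic_renumber_irq = sample_renumber;
        }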
@@ -801,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) | |||
801 | int cpu; | 1052 | int cpu; |
802 | struct irq_cfg *cfg; | 1053 | struct irq_cfg *cfg; |
803 | 1054 | ||
804 | BUG_ON((unsigned)irq >= NR_IRQS); | 1055 | cfg = irq_cfg(irq); |
805 | cfg = &irq_cfg[irq]; | ||
806 | 1056 | ||
807 | /* Only try and allocate irqs on cpus that are present */ | 1057 | /* Only try and allocate irqs on cpus that are present */ |
808 | cpus_and(mask, mask, cpu_online_map); | 1058 | cpus_and(mask, mask, cpu_online_map); |
@@ -837,8 +1087,13 @@ next: | |||
837 | } | 1087 | } |
838 | if (unlikely(current_vector == vector)) | 1088 | if (unlikely(current_vector == vector)) |
839 | continue; | 1089 | continue; |
1090 | #ifdef CONFIG_X86_64 | ||
840 | if (vector == IA32_SYSCALL_VECTOR) | 1091 | if (vector == IA32_SYSCALL_VECTOR) |
841 | goto next; | 1092 | goto next; |
1093 | #else | ||
1094 | if (vector == SYSCALL_VECTOR) | ||
1095 | goto next; | ||
1096 | #endif | ||
842 | for_each_cpu_mask_nr(new_cpu, new_mask) | 1097 | for_each_cpu_mask_nr(new_cpu, new_mask) |
843 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) | 1098 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) |
844 | goto next; | 1099 | goto next; |
@@ -875,8 +1130,7 @@ static void __clear_irq_vector(int irq) | |||
875 | cpumask_t mask; | 1130 | cpumask_t mask; |
876 | int cpu, vector; | 1131 | int cpu, vector; |
877 | 1132 | ||
878 | BUG_ON((unsigned)irq >= NR_IRQS); | 1133 | cfg = irq_cfg(irq); |
879 | cfg = &irq_cfg[irq]; | ||
880 | BUG_ON(!cfg->vector); | 1134 | BUG_ON(!cfg->vector); |
881 | 1135 | ||
882 | vector = cfg->vector; | 1136 | vector = cfg->vector; |
@@ -893,12 +1147,13 @@ void __setup_vector_irq(int cpu) | |||
893 | /* Initialize vector_irq on a new cpu */ | 1147 | /* Initialize vector_irq on a new cpu */ |
894 | /* This function must be called with vector_lock held */ | 1148 | /* This function must be called with vector_lock held */ |
895 | int irq, vector; | 1149 | int irq, vector; |
1150 | struct irq_cfg *cfg; | ||
896 | 1151 | ||
897 | /* Mark the inuse vectors */ | 1152 | /* Mark the inuse vectors */ |
898 | for (irq = 0; irq < NR_IRQS; ++irq) { | 1153 | for_each_irq_cfg(irq, cfg) { |
899 | if (!cpu_isset(cpu, irq_cfg[irq].domain)) | 1154 | if (!cpu_isset(cpu, cfg->domain)) |
900 | continue; | 1155 | continue; |
901 | vector = irq_cfg[irq].vector; | 1156 | vector = cfg->vector; |
902 | per_cpu(vector_irq, cpu)[vector] = irq; | 1157 | per_cpu(vector_irq, cpu)[vector] = irq; |
903 | } | 1158 | } |
904 | /* Mark the free vectors */ | 1159 | /* Mark the free vectors */ |
@@ -906,7 +1161,9 @@ void __setup_vector_irq(int cpu) | |||
906 | irq = per_cpu(vector_irq, cpu)[vector]; | 1161 | irq = per_cpu(vector_irq, cpu)[vector]; |
907 | if (irq < 0) | 1162 | if (irq < 0) |
908 | continue; | 1163 | continue; |
909 | if (!cpu_isset(cpu, irq_cfg[irq].domain)) | 1164 | |
1165 | cfg = irq_cfg(irq); | ||
1166 | if (!cpu_isset(cpu, cfg->domain)) | ||
910 | per_cpu(vector_irq, cpu)[vector] = -1; | 1167 | per_cpu(vector_irq, cpu)[vector] = -1; |
911 | } | 1168 | } |
912 | } | 1169 | } |
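The change running through this and the surrounding hunks is mechanical: dense NR_IRQS-indexed arrays (irq_cfg[irq], irq_desc[irq]) give way to the irq_cfg()/irq_to_desc() lookup helpers and the for_each_irq_cfg() iterator, which is what later allows the backing storage to become sparse. In outline, a caller now reads like this (sketch; the old form kept as a comment):

        static void example_mark_level(unsigned int irq)
        {
                /* old: irq_desc[irq].status |= IRQ_LEVEL;  -- direct array index */
                struct irq_desc *desc = irq_to_desc(irq);

                desc->status |= IRQ_LEVEL;
        }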
@@ -916,16 +1173,49 @@ static struct irq_chip ioapic_chip; | |||
916 | static struct irq_chip ir_ioapic_chip; | 1173 | static struct irq_chip ir_ioapic_chip; |
917 | #endif | 1174 | #endif |
918 | 1175 | ||
1176 | #define IOAPIC_AUTO -1 | ||
1177 | #define IOAPIC_EDGE 0 | ||
1178 | #define IOAPIC_LEVEL 1 | ||
1179 | |||
1180 | #ifdef CONFIG_X86_32 | ||
1181 | static inline int IO_APIC_irq_trigger(int irq) | ||
1182 | { | ||
1183 | int apic, idx, pin; | ||
1184 | |||
1185 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1186 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1187 | idx = find_irq_entry(apic, pin, mp_INT); | ||
1188 | if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) | ||
1189 | return irq_trigger(idx); | ||
1190 | } | ||
1191 | } | ||
1192 | /* | ||
1193 | * nonexistent IRQs are edge default | ||
1194 | */ | ||
1195 | return 0; | ||
1196 | } | ||
1197 | #else | ||
1198 | static inline int IO_APIC_irq_trigger(int irq) | ||
1199 | { | ||
1200 | return 1; | ||
1201 | } | ||
1202 | #endif | ||
1203 | |||
919 | static void ioapic_register_intr(int irq, unsigned long trigger) | 1204 | static void ioapic_register_intr(int irq, unsigned long trigger) |
920 | { | 1205 | { |
921 | if (trigger) | 1206 | struct irq_desc *desc; |
922 | irq_desc[irq].status |= IRQ_LEVEL; | 1207 | |
1208 | desc = irq_to_desc(irq); | ||
1209 | |||
1210 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
1211 | trigger == IOAPIC_LEVEL) | ||
1212 | desc->status |= IRQ_LEVEL; | ||
923 | else | 1213 | else |
924 | irq_desc[irq].status &= ~IRQ_LEVEL; | 1214 | desc->status &= ~IRQ_LEVEL; |
925 | 1215 | ||
926 | #ifdef CONFIG_INTR_REMAP | 1216 | #ifdef CONFIG_INTR_REMAP |
927 | if (irq_remapped(irq)) { | 1217 | if (irq_remapped(irq)) { |
928 | irq_desc[irq].status |= IRQ_MOVE_PCNTXT; | 1218 | desc->status |= IRQ_MOVE_PCNTXT; |
929 | if (trigger) | 1219 | if (trigger) |
930 | set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, | 1220 | set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, |
931 | handle_fasteoi_irq, | 1221 | handle_fasteoi_irq, |
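With the IOAPIC_AUTO/IOAPIC_EDGE/IOAPIC_LEVEL constants, callers can defer the edge-versus-level decision: AUTO falls back to the MP-table scan on 32-bit, while the 64-bit stub keeps the old answer that every IO-APIC interrupt is level triggered. The condition used twice in ioapic_register_intr() reduces to this (sketch only):

        static int wants_level(int irq, unsigned long trigger)
        {
                if (trigger == IOAPIC_LEVEL)
                        return 1;
                if (trigger == IOAPIC_AUTO)
                        return IO_APIC_irq_trigger(irq); /* MP-table scan on 32-bit */
                return 0;                                /* IOAPIC_EDGE */
        }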
@@ -936,7 +1226,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger) | |||
936 | return; | 1226 | return; |
937 | } | 1227 | } |
938 | #endif | 1228 | #endif |
939 | if (trigger) | 1229 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || |
1230 | trigger == IOAPIC_LEVEL) | ||
940 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | 1231 | set_irq_chip_and_handler_name(irq, &ioapic_chip, |
941 | handle_fasteoi_irq, | 1232 | handle_fasteoi_irq, |
942 | "fasteoi"); | 1233 | "fasteoi"); |
@@ -1009,13 +1300,15 @@ static int setup_ioapic_entry(int apic, int irq, | |||
1009 | static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | 1300 | static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, |
1010 | int trigger, int polarity) | 1301 | int trigger, int polarity) |
1011 | { | 1302 | { |
1012 | struct irq_cfg *cfg = irq_cfg + irq; | 1303 | struct irq_cfg *cfg; |
1013 | struct IO_APIC_route_entry entry; | 1304 | struct IO_APIC_route_entry entry; |
1014 | cpumask_t mask; | 1305 | cpumask_t mask; |
1015 | 1306 | ||
1016 | if (!IO_APIC_IRQ(irq)) | 1307 | if (!IO_APIC_IRQ(irq)) |
1017 | return; | 1308 | return; |
1018 | 1309 | ||
1310 | cfg = irq_cfg(irq); | ||
1311 | |||
1019 | mask = TARGET_CPUS; | 1312 | mask = TARGET_CPUS; |
1020 | if (assign_irq_vector(irq, mask)) | 1313 | if (assign_irq_vector(irq, mask)) |
1021 | return; | 1314 | return; |
@@ -1047,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, | |||
1047 | 1340 | ||
1048 | static void __init setup_IO_APIC_irqs(void) | 1341 | static void __init setup_IO_APIC_irqs(void) |
1049 | { | 1342 | { |
1050 | int apic, pin, idx, irq, first_notcon = 1; | 1343 | int apic, pin, idx, irq; |
1344 | int notcon = 0; | ||
1051 | 1345 | ||
1052 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | 1346 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); |
1053 | 1347 | ||
1054 | for (apic = 0; apic < nr_ioapics; apic++) { | 1348 | for (apic = 0; apic < nr_ioapics; apic++) { |
1055 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | 1349 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { |
1056 | |||
1057 | idx = find_irq_entry(apic,pin,mp_INT); | ||
1058 | if (idx == -1) { | ||
1059 | if (first_notcon) { | ||
1060 | apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); | ||
1061 | first_notcon = 0; | ||
1062 | } else | ||
1063 | apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); | ||
1064 | continue; | ||
1065 | } | ||
1066 | if (!first_notcon) { | ||
1067 | apic_printk(APIC_VERBOSE, " not connected.\n"); | ||
1068 | first_notcon = 1; | ||
1069 | } | ||
1070 | 1350 | ||
1071 | irq = pin_2_irq(idx, apic, pin); | 1351 | idx = find_irq_entry(apic, pin, mp_INT); |
1072 | add_pin_to_irq(irq, apic, pin); | 1352 | if (idx == -1) { |
1353 | if (!notcon) { | ||
1354 | notcon = 1; | ||
1355 | apic_printk(APIC_VERBOSE, | ||
1356 | KERN_DEBUG " %d-%d", | ||
1357 | mp_ioapics[apic].mp_apicid, | ||
1358 | pin); | ||
1359 | } else | ||
1360 | apic_printk(APIC_VERBOSE, " %d-%d", | ||
1361 | mp_ioapics[apic].mp_apicid, | ||
1362 | pin); | ||
1363 | continue; | ||
1364 | } | ||
1365 | if (notcon) { | ||
1366 | apic_printk(APIC_VERBOSE, | ||
1367 | " (apicid-pin) not connected\n"); | ||
1368 | notcon = 0; | ||
1369 | } | ||
1073 | 1370 | ||
1074 | setup_IO_APIC_irq(apic, pin, irq, | 1371 | irq = pin_2_irq(idx, apic, pin); |
1075 | irq_trigger(idx), irq_polarity(idx)); | 1372 | #ifdef CONFIG_X86_32 |
1076 | } | 1373 | if (multi_timer_check(apic, irq)) |
1374 | continue; | ||
1375 | #endif | ||
1376 | add_pin_to_irq(irq, apic, pin); | ||
1377 | |||
1378 | setup_IO_APIC_irq(apic, pin, irq, | ||
1379 | irq_trigger(idx), irq_polarity(idx)); | ||
1380 | } | ||
1077 | } | 1381 | } |
1078 | 1382 | ||
1079 | if (!first_notcon) | 1383 | if (notcon) |
1080 | apic_printk(APIC_VERBOSE, " not connected.\n"); | 1384 | apic_printk(APIC_VERBOSE, |
1385 | " (apicid-pin) not connected\n"); | ||
1081 | } | 1386 | } |
1082 | 1387 | ||
1083 | /* | 1388 | /* |
@@ -1088,8 +1393,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, | |||
1088 | { | 1393 | { |
1089 | struct IO_APIC_route_entry entry; | 1394 | struct IO_APIC_route_entry entry; |
1090 | 1395 | ||
1396 | #ifdef CONFIG_INTR_REMAP | ||
1091 | if (intr_remapping_enabled) | 1397 | if (intr_remapping_enabled) |
1092 | return; | 1398 | return; |
1399 | #endif | ||
1093 | 1400 | ||
1094 | memset(&entry, 0, sizeof(entry)); | 1401 | memset(&entry, 0, sizeof(entry)); |
1095 | 1402 | ||
@@ -1124,7 +1431,10 @@ __apicdebuginit(void) print_IO_APIC(void) | |||
1124 | union IO_APIC_reg_00 reg_00; | 1431 | union IO_APIC_reg_00 reg_00; |
1125 | union IO_APIC_reg_01 reg_01; | 1432 | union IO_APIC_reg_01 reg_01; |
1126 | union IO_APIC_reg_02 reg_02; | 1433 | union IO_APIC_reg_02 reg_02; |
1434 | union IO_APIC_reg_03 reg_03; | ||
1127 | unsigned long flags; | 1435 | unsigned long flags; |
1436 | struct irq_cfg *cfg; | ||
1437 | unsigned int irq; | ||
1128 | 1438 | ||
1129 | if (apic_verbosity == APIC_QUIET) | 1439 | if (apic_verbosity == APIC_QUIET) |
1130 | return; | 1440 | return; |
@@ -1147,12 +1457,16 @@ __apicdebuginit(void) print_IO_APIC(void) | |||
1147 | reg_01.raw = io_apic_read(apic, 1); | 1457 | reg_01.raw = io_apic_read(apic, 1); |
1148 | if (reg_01.bits.version >= 0x10) | 1458 | if (reg_01.bits.version >= 0x10) |
1149 | reg_02.raw = io_apic_read(apic, 2); | 1459 | reg_02.raw = io_apic_read(apic, 2); |
1460 | if (reg_01.bits.version >= 0x20) | ||
1461 | reg_03.raw = io_apic_read(apic, 3); | ||
1150 | spin_unlock_irqrestore(&ioapic_lock, flags); | 1462 | spin_unlock_irqrestore(&ioapic_lock, flags); |
1151 | 1463 | ||
1152 | printk("\n"); | 1464 | printk("\n"); |
1153 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); | 1465 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); |
1154 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | 1466 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); |
1155 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | 1467 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); |
1468 | printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); | ||
1469 | printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); | ||
1156 | 1470 | ||
1157 | printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); | 1471 | printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); |
1158 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | 1472 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); |
@@ -1160,11 +1474,27 @@ __apicdebuginit(void) print_IO_APIC(void) | |||
1160 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | 1474 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); |
1161 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | 1475 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); |
1162 | 1476 | ||
1163 | if (reg_01.bits.version >= 0x10) { | 1477 | /* |
1478 | * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, | ||
1479 | * but the value of reg_02 is read as the previous read register | ||
1480 | * value, so ignore it if reg_02 == reg_01. | ||
1481 | */ | ||
1482 | if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { | ||
1164 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | 1483 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); |
1165 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | 1484 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); |
1166 | } | 1485 | } |
1167 | 1486 | ||
1487 | /* | ||
1488 | * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 | ||
1489 | * or reg_03, but the value of reg_0[23] is read as the previous read | ||
1490 | * register value, so ignore it if reg_03 == reg_0[12]. | ||
1491 | */ | ||
1492 | if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && | ||
1493 | reg_03.raw != reg_01.raw) { | ||
1494 | printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); | ||
1495 | printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); | ||
1496 | } | ||
1497 | |||
1168 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); | 1498 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); |
1169 | 1499 | ||
1170 | printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" | 1500 | printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" |
@@ -1193,16 +1523,16 @@ __apicdebuginit(void) print_IO_APIC(void) | |||
1193 | } | 1523 | } |
1194 | } | 1524 | } |
1195 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); | 1525 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); |
1196 | for (i = 0; i < NR_IRQS; i++) { | 1526 | for_each_irq_cfg(irq, cfg) { |
1197 | struct irq_pin_list *entry = irq_2_pin + i; | 1527 | struct irq_pin_list *entry = cfg->irq_2_pin; |
1198 | if (entry->pin < 0) | 1528 | if (!entry) |
1199 | continue; | 1529 | continue; |
1200 | printk(KERN_DEBUG "IRQ%d ", i); | 1530 | printk(KERN_DEBUG "IRQ%d ", irq); |
1201 | for (;;) { | 1531 | for (;;) { |
1202 | printk("-> %d:%d", entry->apic, entry->pin); | 1532 | printk("-> %d:%d", entry->apic, entry->pin); |
1203 | if (!entry->next) | 1533 | if (!entry->next) |
1204 | break; | 1534 | break; |
1205 | entry = irq_2_pin + entry->next; | 1535 | entry = entry->next; |
1206 | } | 1536 | } |
1207 | printk("\n"); | 1537 | printk("\n"); |
1208 | } | 1538 | } |
@@ -1236,7 +1566,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) | |||
1236 | __apicdebuginit(void) print_local_APIC(void *dummy) | 1566 | __apicdebuginit(void) print_local_APIC(void *dummy) |
1237 | { | 1567 | { |
1238 | unsigned int v, ver, maxlvt; | 1568 | unsigned int v, ver, maxlvt; |
1239 | unsigned long icr; | 1569 | u64 icr; |
1240 | 1570 | ||
1241 | if (apic_verbosity == APIC_QUIET) | 1571 | if (apic_verbosity == APIC_QUIET) |
1242 | return; | 1572 | return; |
@@ -1253,20 +1583,31 @@ __apicdebuginit(void) print_local_APIC(void *dummy) | |||
1253 | v = apic_read(APIC_TASKPRI); | 1583 | v = apic_read(APIC_TASKPRI); |
1254 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | 1584 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); |
1255 | 1585 | ||
1256 | v = apic_read(APIC_ARBPRI); | 1586 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ |
1257 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | 1587 | if (!APIC_XAPIC(ver)) { |
1258 | v & APIC_ARBPRI_MASK); | 1588 | v = apic_read(APIC_ARBPRI); |
1259 | v = apic_read(APIC_PROCPRI); | 1589 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, |
1260 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | 1590 | v & APIC_ARBPRI_MASK); |
1591 | } | ||
1592 | v = apic_read(APIC_PROCPRI); | ||
1593 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | ||
1594 | } | ||
1595 | |||
1596 | /* | ||
1597 | * Remote read supported only in the 82489DX and local APIC for | ||
1598 | * Pentium processors. | ||
1599 | */ | ||
1600 | if (!APIC_INTEGRATED(ver) || maxlvt == 3) { | ||
1601 | v = apic_read(APIC_RRR); | ||
1602 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1603 | } | ||
1261 | 1604 | ||
1262 | v = apic_read(APIC_EOI); | ||
1263 | printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | ||
1264 | v = apic_read(APIC_RRR); | ||
1265 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1266 | v = apic_read(APIC_LDR); | 1605 | v = apic_read(APIC_LDR); |
1267 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | 1606 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); |
1268 | v = apic_read(APIC_DFR); | 1607 | if (!x2apic_enabled()) { |
1269 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | 1608 | v = apic_read(APIC_DFR); |
1609 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | ||
1610 | } | ||
1270 | v = apic_read(APIC_SPIV); | 1611 | v = apic_read(APIC_SPIV); |
1271 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | 1612 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); |
1272 | 1613 | ||
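The version checks added to print_local_APIC() encode which registers actually exist: the external 82489DX has no PROCPRI, xAPICs dropped ARBPRI, and the remote-read register is only meaningful on the 82489DX and Pentium-class local APICs where maxlvt == 3. Collected in one place, the gating looks like this (a sketch reusing the existing APIC_INTEGRATED/APIC_XAPIC macros, not a kernel interface):

        static void dump_optional_regs(unsigned int ver, unsigned int maxlvt)
        {
                if (APIC_INTEGRATED(ver)) {             /* not an 82489DX */
                        if (!APIC_XAPIC(ver))           /* xAPIC has no ARBPRI */
                                printk(KERN_DEBUG "ARBPRI: %08x\n",
                                       apic_read(APIC_ARBPRI));
                        printk(KERN_DEBUG "PROCPRI: %08x\n",
                               apic_read(APIC_PROCPRI));
                }
                if (!APIC_INTEGRATED(ver) || maxlvt == 3)       /* remote read */
                        printk(KERN_DEBUG "RRR: %08x\n", apic_read(APIC_RRR));
        }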
@@ -1277,8 +1618,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy) | |||
1277 | printk(KERN_DEBUG "... APIC IRR field:\n"); | 1618 | printk(KERN_DEBUG "... APIC IRR field:\n"); |
1278 | print_APIC_bitfield(APIC_IRR); | 1619 | print_APIC_bitfield(APIC_IRR); |
1279 | 1620 | ||
1280 | v = apic_read(APIC_ESR); | 1621 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ |
1281 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | 1622 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
1623 | apic_write(APIC_ESR, 0); | ||
1624 | |||
1625 | v = apic_read(APIC_ESR); | ||
1626 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | ||
1627 | } | ||
1282 | 1628 | ||
1283 | icr = apic_icr_read(); | 1629 | icr = apic_icr_read(); |
1284 | printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); | 1630 | printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); |
@@ -1312,7 +1658,12 @@ __apicdebuginit(void) print_local_APIC(void *dummy) | |||
1312 | 1658 | ||
1313 | __apicdebuginit(void) print_all_local_APICs(void) | 1659 | __apicdebuginit(void) print_all_local_APICs(void) |
1314 | { | 1660 | { |
1315 | on_each_cpu(print_local_APIC, NULL, 1); | 1661 | int cpu; |
1662 | |||
1663 | preempt_disable(); | ||
1664 | for_each_online_cpu(cpu) | ||
1665 | smp_call_function_single(cpu, print_local_APIC, NULL, 1); | ||
1666 | preempt_enable(); | ||
1316 | } | 1667 | } |
1317 | 1668 | ||
1318 | __apicdebuginit(void) print_PIC(void) | 1669 | __apicdebuginit(void) print_PIC(void) |
@@ -1359,17 +1710,22 @@ __apicdebuginit(int) print_all_ICs(void) | |||
1359 | fs_initcall(print_all_ICs); | 1710 | fs_initcall(print_all_ICs); |
1360 | 1711 | ||
1361 | 1712 | ||
1713 | /* Where, if anywhere, the i8259 is connected in external int mode */ | ||
1714 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | ||
1715 | |||
1362 | void __init enable_IO_APIC(void) | 1716 | void __init enable_IO_APIC(void) |
1363 | { | 1717 | { |
1364 | union IO_APIC_reg_01 reg_01; | 1718 | union IO_APIC_reg_01 reg_01; |
1365 | int i8259_apic, i8259_pin; | 1719 | int i8259_apic, i8259_pin; |
1366 | int i, apic; | 1720 | int apic; |
1367 | unsigned long flags; | 1721 | unsigned long flags; |
1368 | 1722 | ||
1369 | for (i = 0; i < PIN_MAP_SIZE; i++) { | 1723 | #ifdef CONFIG_X86_32 |
1370 | irq_2_pin[i].pin = -1; | 1724 | int i; |
1371 | irq_2_pin[i].next = 0; | 1725 | if (!pirqs_enabled) |
1372 | } | 1726 | for (i = 0; i < MAX_PIRQS; i++) |
1727 | pirq_entries[i] = -1; | ||
1728 | #endif | ||
1373 | 1729 | ||
1374 | /* | 1730 | /* |
1375 | * The number of IO-APIC IRQ registers (== #pins): | 1731 | * The number of IO-APIC IRQ registers (== #pins): |
@@ -1399,6 +1755,10 @@ void __init enable_IO_APIC(void) | |||
1399 | } | 1755 | } |
1400 | found_i8259: | 1756 | found_i8259: |
1401 | /* Look to see what if the MP table has reported the ExtINT */ | 1757 | /* Look to see what if the MP table has reported the ExtINT */ |
1758 | /* If we could not find the appropriate pin by looking at the ioapic | ||
1759 | * the i8259 probably is not connected the ioapic but give the | ||
1760 | * mptable a chance anyway. | ||
1761 | */ | ||
1402 | i8259_pin = find_isa_irq_pin(0, mp_ExtINT); | 1762 | i8259_pin = find_isa_irq_pin(0, mp_ExtINT); |
1403 | i8259_apic = find_isa_irq_apic(0, mp_ExtINT); | 1763 | i8259_apic = find_isa_irq_apic(0, mp_ExtINT); |
1404 | /* Trust the MP table if nothing is setup in the hardware */ | 1764 | /* Trust the MP table if nothing is setup in the hardware */ |
@@ -1458,6 +1818,133 @@ void disable_IO_APIC(void) | |||
1458 | disconnect_bsp_APIC(ioapic_i8259.pin != -1); | 1818 | disconnect_bsp_APIC(ioapic_i8259.pin != -1); |
1459 | } | 1819 | } |
1460 | 1820 | ||
1821 | #ifdef CONFIG_X86_32 | ||
1822 | /* | ||
1823 | * function to set the IO-APIC physical IDs based on the | ||
1824 | * values stored in the MPC table. | ||
1825 | * | ||
1826 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | ||
1827 | */ | ||
1828 | |||
1829 | static void __init setup_ioapic_ids_from_mpc(void) | ||
1830 | { | ||
1831 | union IO_APIC_reg_00 reg_00; | ||
1832 | physid_mask_t phys_id_present_map; | ||
1833 | int apic; | ||
1834 | int i; | ||
1835 | unsigned char old_id; | ||
1836 | unsigned long flags; | ||
1837 | |||
1838 | if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) | ||
1839 | return; | ||
1840 | |||
1841 | /* | ||
1842 | * Don't check I/O APIC IDs for xAPIC systems. They have | ||
1843 | * no meaning without the serial APIC bus. | ||
1844 | */ | ||
1845 | if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
1846 | || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
1847 | return; | ||
1848 | /* | ||
1849 | * This is broken; anything with a real cpu count has to | ||
1850 | * circumvent this idiocy regardless. | ||
1851 | */ | ||
1852 | phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); | ||
1853 | |||
1854 | /* | ||
1855 | * Set the IOAPIC ID to the value stored in the MPC table. | ||
1856 | */ | ||
1857 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1858 | |||
1859 | /* Read the register 0 value */ | ||
1860 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1861 | reg_00.raw = io_apic_read(apic, 0); | ||
1862 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1863 | |||
1864 | old_id = mp_ioapics[apic].mp_apicid; | ||
1865 | |||
1866 | if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { | ||
1867 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", | ||
1868 | apic, mp_ioapics[apic].mp_apicid); | ||
1869 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | ||
1870 | reg_00.bits.ID); | ||
1871 | mp_ioapics[apic].mp_apicid = reg_00.bits.ID; | ||
1872 | } | ||
1873 | |||
1874 | /* | ||
1875 | * Sanity check, is the ID really free? Every APIC in a | ||
1876 | * system must have a unique ID or we get lots of nice | ||
1877 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
1878 | */ | ||
1879 | if (check_apicid_used(phys_id_present_map, | ||
1880 | mp_ioapics[apic].mp_apicid)) { | ||
1881 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", | ||
1882 | apic, mp_ioapics[apic].mp_apicid); | ||
1883 | for (i = 0; i < get_physical_broadcast(); i++) | ||
1884 | if (!physid_isset(i, phys_id_present_map)) | ||
1885 | break; | ||
1886 | if (i >= get_physical_broadcast()) | ||
1887 | panic("Max APIC ID exceeded!\n"); | ||
1888 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | ||
1889 | i); | ||
1890 | physid_set(i, phys_id_present_map); | ||
1891 | mp_ioapics[apic].mp_apicid = i; | ||
1892 | } else { | ||
1893 | physid_mask_t tmp; | ||
1894 | tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); | ||
1895 | apic_printk(APIC_VERBOSE, "Setting %d in the " | ||
1896 | "phys_id_present_map\n", | ||
1897 | mp_ioapics[apic].mp_apicid); | ||
1898 | physids_or(phys_id_present_map, phys_id_present_map, tmp); | ||
1899 | } | ||
1900 | |||
1901 | |||
1902 | /* | ||
1903 | * We need to adjust the IRQ routing table | ||
1904 | * if the ID changed. | ||
1905 | */ | ||
1906 | if (old_id != mp_ioapics[apic].mp_apicid) | ||
1907 | for (i = 0; i < mp_irq_entries; i++) | ||
1908 | if (mp_irqs[i].mp_dstapic == old_id) | ||
1909 | mp_irqs[i].mp_dstapic | ||
1910 | = mp_ioapics[apic].mp_apicid; | ||
1911 | |||
1912 | /* | ||
1913 | * Read the right value from the MPC table and | ||
1914 | * write it into the ID register. | ||
1915 | */ | ||
1916 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
1917 | "...changing IO-APIC physical APIC ID to %d ...", | ||
1918 | mp_ioapics[apic].mp_apicid); | ||
1919 | |||
1920 | reg_00.bits.ID = mp_ioapics[apic].mp_apicid; | ||
1921 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1922 | io_apic_write(apic, 0, reg_00.raw); | ||
1923 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1924 | |||
1925 | /* | ||
1926 | * Sanity check | ||
1927 | */ | ||
1928 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1929 | reg_00.raw = io_apic_read(apic, 0); | ||
1930 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1931 | if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) | ||
1932 | printk("could not set ID!\n"); | ||
1933 | else | ||
1934 | apic_printk(APIC_VERBOSE, " ok.\n"); | ||
1935 | } | ||
1936 | } | ||
1937 | #endif | ||
1938 | |||
1939 | int no_timer_check __initdata; | ||
1940 | |||
1941 | static int __init notimercheck(char *s) | ||
1942 | { | ||
1943 | no_timer_check = 1; | ||
1944 | return 1; | ||
1945 | } | ||
1946 | __setup("no_timer_check", notimercheck); | ||
1947 | |||
1461 | /* | 1948 | /* |
1462 | * There is a nasty bug in some older SMP boards, their mptable lies | 1949 | * There is a nasty bug in some older SMP boards, their mptable lies |
1463 | * about the timer IRQ. We do the following to work around the situation: | 1950 | * about the timer IRQ. We do the following to work around the situation: |
@@ -1471,6 +1958,9 @@ static int __init timer_irq_works(void) | |||
1471 | unsigned long t1 = jiffies; | 1958 | unsigned long t1 = jiffies; |
1472 | unsigned long flags; | 1959 | unsigned long flags; |
1473 | 1960 | ||
1961 | if (no_timer_check) | ||
1962 | return 1; | ||
1963 | |||
1474 | local_save_flags(flags); | 1964 | local_save_flags(flags); |
1475 | local_irq_enable(); | 1965 | local_irq_enable(); |
1476 | /* Let ten ticks pass... */ | 1966 | /* Let ten ticks pass... */ |
@@ -1531,9 +2021,11 @@ static unsigned int startup_ioapic_irq(unsigned int irq) | |||
1531 | return was_pending; | 2021 | return was_pending; |
1532 | } | 2022 | } |
1533 | 2023 | ||
2024 | #ifdef CONFIG_X86_64 | ||
1534 | static int ioapic_retrigger_irq(unsigned int irq) | 2025 | static int ioapic_retrigger_irq(unsigned int irq) |
1535 | { | 2026 | { |
1536 | struct irq_cfg *cfg = &irq_cfg[irq]; | 2027 | |
2028 | struct irq_cfg *cfg = irq_cfg(irq); | ||
1537 | unsigned long flags; | 2029 | unsigned long flags; |
1538 | 2030 | ||
1539 | spin_lock_irqsave(&vector_lock, flags); | 2031 | spin_lock_irqsave(&vector_lock, flags); |
@@ -1542,6 +2034,14 @@ static int ioapic_retrigger_irq(unsigned int irq) | |||
1542 | 2034 | ||
1543 | return 1; | 2035 | return 1; |
1544 | } | 2036 | } |
2037 | #else | ||
2038 | static int ioapic_retrigger_irq(unsigned int irq) | ||
2039 | { | ||
2040 | send_IPI_self(irq_cfg(irq)->vector); | ||
2041 | |||
2042 | return 1; | ||
2043 | } | ||
2044 | #endif | ||
1545 | 2045 | ||
1546 | /* | 2046 | /* |
1547 | * Level and edge triggered IO-APIC interrupts need different handling, | 2047 | * Level and edge triggered IO-APIC interrupts need different handling, |
@@ -1580,11 +2080,11 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); | |||
1580 | */ | 2080 | */ |
1581 | static void migrate_ioapic_irq(int irq, cpumask_t mask) | 2081 | static void migrate_ioapic_irq(int irq, cpumask_t mask) |
1582 | { | 2082 | { |
1583 | struct irq_cfg *cfg = irq_cfg + irq; | 2083 | struct irq_cfg *cfg; |
1584 | struct irq_desc *desc = irq_desc + irq; | 2084 | struct irq_desc *desc; |
1585 | cpumask_t tmp, cleanup_mask; | 2085 | cpumask_t tmp, cleanup_mask; |
1586 | struct irte irte; | 2086 | struct irte irte; |
1587 | int modify_ioapic_rte = desc->status & IRQ_LEVEL; | 2087 | int modify_ioapic_rte; |
1588 | unsigned int dest; | 2088 | unsigned int dest; |
1589 | unsigned long flags; | 2089 | unsigned long flags; |
1590 | 2090 | ||
@@ -1598,9 +2098,12 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) | |||
1598 | if (assign_irq_vector(irq, mask)) | 2098 | if (assign_irq_vector(irq, mask)) |
1599 | return; | 2099 | return; |
1600 | 2100 | ||
2101 | cfg = irq_cfg(irq); | ||
1601 | cpus_and(tmp, cfg->domain, mask); | 2102 | cpus_and(tmp, cfg->domain, mask); |
1602 | dest = cpu_mask_to_apicid(tmp); | 2103 | dest = cpu_mask_to_apicid(tmp); |
1603 | 2104 | ||
2105 | desc = irq_to_desc(irq); | ||
2106 | modify_ioapic_rte = desc->status & IRQ_LEVEL; | ||
1604 | if (modify_ioapic_rte) { | 2107 | if (modify_ioapic_rte) { |
1605 | spin_lock_irqsave(&ioapic_lock, flags); | 2108 | spin_lock_irqsave(&ioapic_lock, flags); |
1606 | __target_IO_APIC_irq(irq, dest, cfg->vector); | 2109 | __target_IO_APIC_irq(irq, dest, cfg->vector); |
@@ -1622,18 +2125,19 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) | |||
1622 | cfg->move_in_progress = 0; | 2125 | cfg->move_in_progress = 0; |
1623 | } | 2126 | } |
1624 | 2127 | ||
1625 | irq_desc[irq].affinity = mask; | 2128 | desc->affinity = mask; |
1626 | } | 2129 | } |
1627 | 2130 | ||
1628 | static int migrate_irq_remapped_level(int irq) | 2131 | static int migrate_irq_remapped_level(int irq) |
1629 | { | 2132 | { |
1630 | int ret = -1; | 2133 | int ret = -1; |
2134 | struct irq_desc *desc = irq_to_desc(irq); | ||
1631 | 2135 | ||
1632 | mask_IO_APIC_irq(irq); | 2136 | mask_IO_APIC_irq(irq); |
1633 | 2137 | ||
1634 | if (io_apic_level_ack_pending(irq)) { | 2138 | if (io_apic_level_ack_pending(irq)) { |
1635 | /* | 2139 | /* |
1636 | * Interrupt in progress. Migrating irq now will change the | 2140 | * Interrupt in progress. Migrating irq now will change the |
1637 | * vector information in the IO-APIC RTE and that will confuse | 2141 | * vector information in the IO-APIC RTE and that will confuse |
1638 | * the EOI broadcast performed by cpu. | 2142 | * the EOI broadcast performed by cpu. |
1639 | * So, delay the irq migration to the next instance. | 2143 | * So, delay the irq migration to the next instance. |
@@ -1643,11 +2147,11 @@ static int migrate_irq_remapped_level(int irq) | |||
1643 | } | 2147 | } |
1644 | 2148 | ||
1645 | /* everything is clear. we have right of way */ | 2149 | /* everything is clear. we have right of way */ |
1646 | migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); | 2150 | migrate_ioapic_irq(irq, desc->pending_mask); |
1647 | 2151 | ||
1648 | ret = 0; | 2152 | ret = 0; |
1649 | irq_desc[irq].status &= ~IRQ_MOVE_PENDING; | 2153 | desc->status &= ~IRQ_MOVE_PENDING; |
1650 | cpus_clear(irq_desc[irq].pending_mask); | 2154 | cpus_clear(desc->pending_mask); |
1651 | 2155 | ||
1652 | unmask: | 2156 | unmask: |
1653 | unmask_IO_APIC_irq(irq); | 2157 | unmask_IO_APIC_irq(irq); |
@@ -1656,10 +2160,10 @@ unmask: | |||
1656 | 2160 | ||
1657 | static void ir_irq_migration(struct work_struct *work) | 2161 | static void ir_irq_migration(struct work_struct *work) |
1658 | { | 2162 | { |
1659 | int irq; | 2163 | unsigned int irq; |
2164 | struct irq_desc *desc; | ||
1660 | 2165 | ||
1661 | for (irq = 0; irq < NR_IRQS; irq++) { | 2166 | for_each_irq_desc(irq, desc) { |
1662 | struct irq_desc *desc = irq_desc + irq; | ||
1663 | if (desc->status & IRQ_MOVE_PENDING) { | 2167 | if (desc->status & IRQ_MOVE_PENDING) { |
1664 | unsigned long flags; | 2168 | unsigned long flags; |
1665 | 2169 | ||
@@ -1671,8 +2175,7 @@ static void ir_irq_migration(struct work_struct *work) | |||
1671 | continue; | 2175 | continue; |
1672 | } | 2176 | } |
1673 | 2177 | ||
1674 | desc->chip->set_affinity(irq, | 2178 | desc->chip->set_affinity(irq, desc->pending_mask); |
1675 | irq_desc[irq].pending_mask); | ||
1676 | spin_unlock_irqrestore(&desc->lock, flags); | 2179 | spin_unlock_irqrestore(&desc->lock, flags); |
1677 | } | 2180 | } |
1678 | } | 2181 | } |
@@ -1683,9 +2186,11 @@ static void ir_irq_migration(struct work_struct *work) | |||
1683 | */ | 2186 | */ |
1684 | static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) | 2187 | static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) |
1685 | { | 2188 | { |
1686 | if (irq_desc[irq].status & IRQ_LEVEL) { | 2189 | struct irq_desc *desc = irq_to_desc(irq); |
1687 | irq_desc[irq].status |= IRQ_MOVE_PENDING; | 2190 | |
1688 | irq_desc[irq].pending_mask = mask; | 2191 | if (desc->status & IRQ_LEVEL) { |
2192 | desc->status |= IRQ_MOVE_PENDING; | ||
2193 | desc->pending_mask = mask; | ||
1689 | migrate_irq_remapped_level(irq); | 2194 | migrate_irq_remapped_level(irq); |
1690 | return; | 2195 | return; |
1691 | } | 2196 | } |
@@ -1698,7 +2203,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) | |||
1698 | { | 2203 | { |
1699 | unsigned vector, me; | 2204 | unsigned vector, me; |
1700 | ack_APIC_irq(); | 2205 | ack_APIC_irq(); |
2206 | #ifdef CONFIG_X86_64 | ||
1701 | exit_idle(); | 2207 | exit_idle(); |
2208 | #endif | ||
1702 | irq_enter(); | 2209 | irq_enter(); |
1703 | 2210 | ||
1704 | me = smp_processor_id(); | 2211 | me = smp_processor_id(); |
@@ -1707,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) | |||
1707 | struct irq_desc *desc; | 2214 | struct irq_desc *desc; |
1708 | struct irq_cfg *cfg; | 2215 | struct irq_cfg *cfg; |
1709 | irq = __get_cpu_var(vector_irq)[vector]; | 2216 | irq = __get_cpu_var(vector_irq)[vector]; |
1710 | if (irq >= NR_IRQS) | 2217 | |
2218 | desc = irq_to_desc(irq); | ||
2219 | if (!desc) | ||
1711 | continue; | 2220 | continue; |
1712 | 2221 | ||
1713 | desc = irq_desc + irq; | 2222 | cfg = irq_cfg(irq); |
1714 | cfg = irq_cfg + irq; | ||
1715 | spin_lock(&desc->lock); | 2223 | spin_lock(&desc->lock); |
1716 | if (!cfg->move_cleanup_count) | 2224 | if (!cfg->move_cleanup_count) |
1717 | goto unlock; | 2225 | goto unlock; |
@@ -1730,7 +2238,7 @@ unlock: | |||
1730 | 2238 | ||
1731 | static void irq_complete_move(unsigned int irq) | 2239 | static void irq_complete_move(unsigned int irq) |
1732 | { | 2240 | { |
1733 | struct irq_cfg *cfg = irq_cfg + irq; | 2241 | struct irq_cfg *cfg = irq_cfg(irq); |
1734 | unsigned vector, me; | 2242 | unsigned vector, me; |
1735 | 2243 | ||
1736 | if (likely(!cfg->move_in_progress)) | 2244 | if (likely(!cfg->move_in_progress)) |
@@ -1769,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq) | |||
1769 | ack_APIC_irq(); | 2277 | ack_APIC_irq(); |
1770 | } | 2278 | } |
1771 | 2279 | ||
2280 | atomic_t irq_mis_count; | ||
2281 | |||
1772 | static void ack_apic_level(unsigned int irq) | 2282 | static void ack_apic_level(unsigned int irq) |
1773 | { | 2283 | { |
2284 | #ifdef CONFIG_X86_32 | ||
2285 | unsigned long v; | ||
2286 | int i; | ||
2287 | #endif | ||
1774 | int do_unmask_irq = 0; | 2288 | int do_unmask_irq = 0; |
1775 | 2289 | ||
1776 | irq_complete_move(irq); | 2290 | irq_complete_move(irq); |
1777 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 2291 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
1778 | /* If we are moving the irq we need to mask it */ | 2292 | /* If we are moving the irq we need to mask it */ |
1779 | if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { | 2293 | if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { |
1780 | do_unmask_irq = 1; | 2294 | do_unmask_irq = 1; |
1781 | mask_IO_APIC_irq(irq); | 2295 | mask_IO_APIC_irq(irq); |
1782 | } | 2296 | } |
1783 | #endif | 2297 | #endif |
1784 | 2298 | ||
2299 | #ifdef CONFIG_X86_32 | ||
2300 | /* | ||
2301 | * It appears there is an erratum which affects at least version 0x11 | ||
2302 | * of I/O APIC (that's the 82093AA and cores integrated into various | ||
2303 | * chipsets). Under certain conditions a level-triggered interrupt is | ||
2304 | * erroneously delivered as edge-triggered one but the respective IRR | ||
2305 | * bit gets set nevertheless. As a result the I/O unit expects an EOI | ||
2306 | * message but it will never arrive and further interrupts are blocked | ||
2307 | * from the source. The exact reason is so far unknown, but the | ||
2308 | * phenomenon was observed when two consecutive interrupt requests | ||
2309 | * from a given source get delivered to the same CPU and the source is | ||
2310 | * temporarily disabled in between. | ||
2311 | * | ||
2312 | * A workaround is to simulate an EOI message manually. We achieve it | ||
2313 | * by setting the trigger mode to edge and then to level when the edge | ||
2314 | * trigger mode gets detected in the TMR of a local APIC for a | ||
2315 | * level-triggered interrupt. We mask the source for the time of the | ||
2316 | * operation to prevent an edge-triggered interrupt escaping meanwhile. | ||
2317 | * The idea is from Manfred Spraul. --macro | ||
2318 | */ | ||
2319 | i = irq_cfg(irq)->vector; | ||
2320 | |||
2321 | v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); | ||
2322 | #endif | ||
2323 | |||
1785 | /* | 2324 | /* |
1786 | * We must acknowledge the irq before we move it or the acknowledge will | 2325 | * We must acknowledge the irq before we move it or the acknowledge will |
1787 | * not propagate properly. | 2326 | * not propagate properly. |
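The 32-bit workaround reads the Trigger Mode Register before the EOI: the local APIC exposes TMR as eight 32-bit words spaced 0x10 apart, each covering 32 vectors, so the word offset and bit for a vector fall out of simple index arithmetic. As a standalone sketch of that lookup (same arithmetic as in the hunk above):

        /* Return nonzero if the local APIC latched 'vector' as level triggered. */
        static int tmr_bit_set(unsigned int vector)
        {
                /* (vector / 32) selects one of 8 TMR words; words are 0x10
                 * apart, so ((vector & ~0x1f) >> 1) == (vector / 32) * 0x10. */
                unsigned long word = apic_read(APIC_TMR + ((vector & ~0x1f) >> 1));

                return !!(word & (1UL << (vector & 0x1f)));
        }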
@@ -1820,31 +2359,41 @@ static void ack_apic_level(unsigned int irq) | |||
1820 | move_masked_irq(irq); | 2359 | move_masked_irq(irq); |
1821 | unmask_IO_APIC_irq(irq); | 2360 | unmask_IO_APIC_irq(irq); |
1822 | } | 2361 | } |
2362 | |||
2363 | #ifdef CONFIG_X86_32 | ||
2364 | if (!(v & (1 << (i & 0x1f)))) { | ||
2365 | atomic_inc(&irq_mis_count); | ||
2366 | spin_lock(&ioapic_lock); | ||
2367 | __mask_and_edge_IO_APIC_irq(irq); | ||
2368 | __unmask_and_level_IO_APIC_irq(irq); | ||
2369 | spin_unlock(&ioapic_lock); | ||
2370 | } | ||
2371 | #endif | ||
1823 | } | 2372 | } |
1824 | 2373 | ||
1825 | static struct irq_chip ioapic_chip __read_mostly = { | 2374 | static struct irq_chip ioapic_chip __read_mostly = { |
1826 | .name = "IO-APIC", | 2375 | .name = "IO-APIC", |
1827 | .startup = startup_ioapic_irq, | 2376 | .startup = startup_ioapic_irq, |
1828 | .mask = mask_IO_APIC_irq, | 2377 | .mask = mask_IO_APIC_irq, |
1829 | .unmask = unmask_IO_APIC_irq, | 2378 | .unmask = unmask_IO_APIC_irq, |
1830 | .ack = ack_apic_edge, | 2379 | .ack = ack_apic_edge, |
1831 | .eoi = ack_apic_level, | 2380 | .eoi = ack_apic_level, |
1832 | #ifdef CONFIG_SMP | 2381 | #ifdef CONFIG_SMP |
1833 | .set_affinity = set_ioapic_affinity_irq, | 2382 | .set_affinity = set_ioapic_affinity_irq, |
1834 | #endif | 2383 | #endif |
1835 | .retrigger = ioapic_retrigger_irq, | 2384 | .retrigger = ioapic_retrigger_irq, |
1836 | }; | 2385 | }; |
1837 | 2386 | ||
1838 | #ifdef CONFIG_INTR_REMAP | 2387 | #ifdef CONFIG_INTR_REMAP |
1839 | static struct irq_chip ir_ioapic_chip __read_mostly = { | 2388 | static struct irq_chip ir_ioapic_chip __read_mostly = { |
1840 | .name = "IR-IO-APIC", | 2389 | .name = "IR-IO-APIC", |
1841 | .startup = startup_ioapic_irq, | 2390 | .startup = startup_ioapic_irq, |
1842 | .mask = mask_IO_APIC_irq, | 2391 | .mask = mask_IO_APIC_irq, |
1843 | .unmask = unmask_IO_APIC_irq, | 2392 | .unmask = unmask_IO_APIC_irq, |
1844 | .ack = ack_x2apic_edge, | 2393 | .ack = ack_x2apic_edge, |
1845 | .eoi = ack_x2apic_level, | 2394 | .eoi = ack_x2apic_level, |
1846 | #ifdef CONFIG_SMP | 2395 | #ifdef CONFIG_SMP |
1847 | .set_affinity = set_ir_ioapic_affinity_irq, | 2396 | .set_affinity = set_ir_ioapic_affinity_irq, |
1848 | #endif | 2397 | #endif |
1849 | .retrigger = ioapic_retrigger_irq, | 2398 | .retrigger = ioapic_retrigger_irq, |
1850 | }; | 2399 | }; |
@@ -1853,6 +2402,8 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { | |||
1853 | static inline void init_IO_APIC_traps(void) | 2402 | static inline void init_IO_APIC_traps(void) |
1854 | { | 2403 | { |
1855 | int irq; | 2404 | int irq; |
2405 | struct irq_desc *desc; | ||
2406 | struct irq_cfg *cfg; | ||
1856 | 2407 | ||
1857 | /* | 2408 | /* |
1858 | * NOTE! The local APIC isn't very good at handling | 2409 | * NOTE! The local APIC isn't very good at handling |
@@ -1865,8 +2416,8 @@ static inline void init_IO_APIC_traps(void) | |||
1865 | * Also, we've got to be careful not to trash gate | 2416 | * Also, we've got to be careful not to trash gate |
1866 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | 2417 | * 0x80, because int 0x80 is hm, kind of importantish. ;) |
1867 | */ | 2418 | */ |
1868 | for (irq = 0; irq < NR_IRQS ; irq++) { | 2419 | for_each_irq_cfg(irq, cfg) { |
1869 | if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { | 2420 | if (IO_APIC_IRQ(irq) && !cfg->vector) { |
1870 | /* | 2421 | /* |
1871 | * Hmm.. We don't have an entry for this, | 2422 | * Hmm.. We don't have an entry for this, |
1872 | * so default to an old-fashioned 8259 | 2423 | * so default to an old-fashioned 8259 |
@@ -1874,27 +2425,33 @@ static inline void init_IO_APIC_traps(void) | |||
1874 | */ | 2425 | */ |
1875 | if (irq < 16) | 2426 | if (irq < 16) |
1876 | make_8259A_irq(irq); | 2427 | make_8259A_irq(irq); |
1877 | else | 2428 | else { |
2429 | desc = irq_to_desc(irq); | ||
1878 | /* Strange. Oh, well.. */ | 2430 | /* Strange. Oh, well.. */ |
1879 | irq_desc[irq].chip = &no_irq_chip; | 2431 | desc->chip = &no_irq_chip; |
2432 | } | ||
1880 | } | 2433 | } |
1881 | } | 2434 | } |
1882 | } | 2435 | } |
1883 | 2436 | ||
1884 | static void unmask_lapic_irq(unsigned int irq) | 2437 | /* |
2438 | * The local APIC irq-chip implementation: | ||
2439 | */ | ||
2440 | |||
2441 | static void mask_lapic_irq(unsigned int irq) | ||
1885 | { | 2442 | { |
1886 | unsigned long v; | 2443 | unsigned long v; |
1887 | 2444 | ||
1888 | v = apic_read(APIC_LVT0); | 2445 | v = apic_read(APIC_LVT0); |
1889 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); | 2446 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); |
1890 | } | 2447 | } |
1891 | 2448 | ||
1892 | static void mask_lapic_irq(unsigned int irq) | 2449 | static void unmask_lapic_irq(unsigned int irq) |
1893 | { | 2450 | { |
1894 | unsigned long v; | 2451 | unsigned long v; |
1895 | 2452 | ||
1896 | v = apic_read(APIC_LVT0); | 2453 | v = apic_read(APIC_LVT0); |
1897 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | 2454 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); |
1898 | } | 2455 | } |
1899 | 2456 | ||
1900 | static void ack_lapic_irq (unsigned int irq) | 2457 | static void ack_lapic_irq (unsigned int irq) |
@@ -1911,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = { | |||
1911 | 2468 | ||
1912 | static void lapic_register_intr(int irq) | 2469 | static void lapic_register_intr(int irq) |
1913 | { | 2470 | { |
1914 | irq_desc[irq].status &= ~IRQ_LEVEL; | 2471 | struct irq_desc *desc; |
2472 | |||
2473 | desc = irq_to_desc(irq); | ||
2474 | desc->status &= ~IRQ_LEVEL; | ||
1915 | set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, | 2475 | set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, |
1916 | "edge"); | 2476 | "edge"); |
1917 | } | 2477 | } |
@@ -1919,19 +2479,19 @@ static void lapic_register_intr(int irq) | |||
1919 | static void __init setup_nmi(void) | 2479 | static void __init setup_nmi(void) |
1920 | { | 2480 | { |
1921 | /* | 2481 | /* |
1922 | * Dirty trick to enable the NMI watchdog ... | 2482 | * Dirty trick to enable the NMI watchdog ... |
1923 | * We put the 8259A master into AEOI mode and | 2483 | * We put the 8259A master into AEOI mode and |
1924 | * unmask on all local APICs LVT0 as NMI. | 2484 | * unmask on all local APICs LVT0 as NMI. |
1925 | * | 2485 | * |
1926 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | 2486 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') |
1927 | * is from Maciej W. Rozycki - so we do not have to EOI from | 2487 | * is from Maciej W. Rozycki - so we do not have to EOI from |
1928 | * the NMI handler or the timer interrupt. | 2488 | * the NMI handler or the timer interrupt. |
1929 | */ | 2489 | */ |
1930 | printk(KERN_INFO "activating NMI Watchdog ..."); | 2490 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); |
1931 | 2491 | ||
1932 | enable_NMI_through_LVT0(); | 2492 | enable_NMI_through_LVT0(); |
1933 | 2493 | ||
1934 | printk(" done.\n"); | 2494 | apic_printk(APIC_VERBOSE, " done.\n"); |
1935 | } | 2495 | } |
1936 | 2496 | ||
1937 | /* | 2497 | /* |
@@ -1948,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void) | |||
1948 | unsigned char save_control, save_freq_select; | 2508 | unsigned char save_control, save_freq_select; |
1949 | 2509 | ||
1950 | pin = find_isa_irq_pin(8, mp_INT); | 2510 | pin = find_isa_irq_pin(8, mp_INT); |
2511 | if (pin == -1) { | ||
2512 | WARN_ON_ONCE(1); | ||
2513 | return; | ||
2514 | } | ||
1951 | apic = find_isa_irq_apic(8, mp_INT); | 2515 | apic = find_isa_irq_apic(8, mp_INT); |
1952 | if (pin == -1) | 2516 | if (apic == -1) { |
2517 | WARN_ON_ONCE(1); | ||
1953 | return; | 2518 | return; |
2519 | } | ||
1954 | 2520 | ||
1955 | entry0 = ioapic_read_entry(apic, pin); | 2521 | entry0 = ioapic_read_entry(apic, pin); |
1956 | |||
1957 | clear_IO_APIC_pin(apic, pin); | 2522 | clear_IO_APIC_pin(apic, pin); |
1958 | 2523 | ||
1959 | memset(&entry1, 0, sizeof(entry1)); | 2524 | memset(&entry1, 0, sizeof(entry1)); |
@@ -1988,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void) | |||
1988 | ioapic_write_entry(apic, pin, entry0); | 2553 | ioapic_write_entry(apic, pin, entry0); |
1989 | } | 2554 | } |
1990 | 2555 | ||
2556 | static int disable_timer_pin_1 __initdata; | ||
2557 | /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ | ||
2558 | static int __init disable_timer_pin_setup(char *arg) | ||
2559 | { | ||
2560 | disable_timer_pin_1 = 1; | ||
2561 | return 0; | ||
2562 | } | ||
2563 | early_param("disable_timer_pin_1", disable_timer_pin_setup); | ||
2564 | |||
2565 | int timer_through_8259 __initdata; | ||
2566 | |||
1991 | /* | 2567 | /* |
1992 | * This code may look a bit paranoid, but it's supposed to cooperate with | 2568 | * This code may look a bit paranoid, but it's supposed to cooperate with |
1993 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | 2569 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ |
1994 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | 2570 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast |
1995 | * fanatically on his truly buggy board. | 2571 | * fanatically on his truly buggy board. |
1996 | * | 2572 | * |
1997 | * FIXME: really need to revamp this for modern platforms only. | 2573 | * FIXME: really need to revamp this for all platforms. |
1998 | */ | 2574 | */ |
1999 | static inline void __init check_timer(void) | 2575 | static inline void __init check_timer(void) |
2000 | { | 2576 | { |
2001 | struct irq_cfg *cfg = irq_cfg + 0; | 2577 | struct irq_cfg *cfg = irq_cfg(0); |
2002 | int apic1, pin1, apic2, pin2; | 2578 | int apic1, pin1, apic2, pin2; |
2003 | unsigned long flags; | 2579 | unsigned long flags; |
2580 | unsigned int ver; | ||
2004 | int no_pin1 = 0; | 2581 | int no_pin1 = 0; |
2005 | 2582 | ||
2006 | local_irq_save(flags); | 2583 | local_irq_save(flags); |
2007 | 2584 | ||
2585 | ver = apic_read(APIC_LVR); | ||
2586 | ver = GET_APIC_VERSION(ver); | ||
2587 | |||
2008 | /* | 2588 | /* |
2009 | * get/set the timer IRQ vector: | 2589 | * get/set the timer IRQ vector: |
2010 | */ | 2590 | */ |
@@ -2013,10 +2593,18 @@ static inline void __init check_timer(void) | |||
2013 | 2593 | ||
2014 | /* | 2594 | /* |
2015 | * As IRQ0 is to be enabled in the 8259A, the virtual | 2595 | * As IRQ0 is to be enabled in the 8259A, the virtual |
2016 | * wire has to be disabled in the local APIC. | 2596 | * wire has to be disabled in the local APIC. Also |
2597 | * timer interrupts need to be acknowledged manually in | ||
2598 | * the 8259A for the i82489DX when using the NMI | ||
2599 | * watchdog as that APIC treats NMIs as level-triggered. | ||
2600 | * The AEOI mode will finish them in the 8259A | ||
2601 | * automatically. | ||
2017 | */ | 2602 | */ |
2018 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | 2603 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); |
2019 | init_8259A(1); | 2604 | init_8259A(1); |
2605 | #ifdef CONFIG_X86_32 | ||
2606 | timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); | ||
2607 | #endif | ||
2020 | 2608 | ||
2021 | pin1 = find_isa_irq_pin(0, mp_INT); | 2609 | pin1 = find_isa_irq_pin(0, mp_INT); |
2022 | apic1 = find_isa_irq_apic(0, mp_INT); | 2610 | apic1 = find_isa_irq_apic(0, mp_INT); |
@@ -2035,8 +2623,10 @@ static inline void __init check_timer(void) | |||
2035 | * 8259A. | 2623 | * 8259A. |
2036 | */ | 2624 | */ |
2037 | if (pin1 == -1) { | 2625 | if (pin1 == -1) { |
2626 | #ifdef CONFIG_INTR_REMAP | ||
2038 | if (intr_remapping_enabled) | 2627 | if (intr_remapping_enabled) |
2039 | panic("BIOS bug: timer not connected to IO-APIC"); | 2628 | panic("BIOS bug: timer not connected to IO-APIC"); |
2629 | #endif | ||
2040 | pin1 = pin2; | 2630 | pin1 = pin2; |
2041 | apic1 = apic2; | 2631 | apic1 = apic2; |
2042 | no_pin1 = 1; | 2632 | no_pin1 = 1; |
@@ -2054,7 +2644,7 @@ static inline void __init check_timer(void) | |||
2054 | setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); | 2644 | setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); |
2055 | } | 2645 | } |
2056 | unmask_IO_APIC_irq(0); | 2646 | unmask_IO_APIC_irq(0); |
2057 | if (!no_timer_check && timer_irq_works()) { | 2647 | if (timer_irq_works()) { |
2058 | if (nmi_watchdog == NMI_IO_APIC) { | 2648 | if (nmi_watchdog == NMI_IO_APIC) { |
2059 | setup_nmi(); | 2649 | setup_nmi(); |
2060 | enable_8259A_irq(0); | 2650 | enable_8259A_irq(0); |
@@ -2063,8 +2653,10 @@ static inline void __init check_timer(void) | |||
2063 | clear_IO_APIC_pin(0, pin1); | 2653 | clear_IO_APIC_pin(0, pin1); |
2064 | goto out; | 2654 | goto out; |
2065 | } | 2655 | } |
2656 | #ifdef CONFIG_INTR_REMAP | ||
2066 | if (intr_remapping_enabled) | 2657 | if (intr_remapping_enabled) |
2067 | panic("timer doesn't work through Interrupt-remapped IO-APIC"); | 2658 | panic("timer doesn't work through Interrupt-remapped IO-APIC"); |
2659 | #endif | ||
2068 | clear_IO_APIC_pin(apic1, pin1); | 2660 | clear_IO_APIC_pin(apic1, pin1); |
2069 | if (!no_pin1) | 2661 | if (!no_pin1) |
2070 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " | 2662 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " |
@@ -2104,6 +2696,9 @@ static inline void __init check_timer(void) | |||
2104 | "through the IO-APIC - disabling NMI Watchdog!\n"); | 2696 | "through the IO-APIC - disabling NMI Watchdog!\n"); |
2105 | nmi_watchdog = NMI_NONE; | 2697 | nmi_watchdog = NMI_NONE; |
2106 | } | 2698 | } |
2699 | #ifdef CONFIG_X86_32 | ||
2700 | timer_ack = 0; | ||
2701 | #endif | ||
2107 | 2702 | ||
2108 | apic_printk(APIC_QUIET, KERN_INFO | 2703 | apic_printk(APIC_QUIET, KERN_INFO |
2109 | "...trying to set up timer as Virtual Wire IRQ...\n"); | 2704 | "...trying to set up timer as Virtual Wire IRQ...\n"); |
@@ -2140,13 +2735,6 @@ out: | |||
2140 | local_irq_restore(flags); | 2735 | local_irq_restore(flags); |
2141 | } | 2736 | } |
2142 | 2737 | ||
2143 | static int __init notimercheck(char *s) | ||
2144 | { | ||
2145 | no_timer_check = 1; | ||
2146 | return 1; | ||
2147 | } | ||
2148 | __setup("no_timer_check", notimercheck); | ||
2149 | |||
2150 | /* | 2738 | /* |
2151 | * Traditionally ISA IRQ2 is the cascade IRQ, and is not available | 2739 | * Traditionally ISA IRQ2 is the cascade IRQ, and is not available |
2152 | * to devices. However there may be an I/O APIC pin available for | 2740 | * to devices. However there may be an I/O APIC pin available for |
@@ -2164,25 +2752,49 @@ __setup("no_timer_check", notimercheck); | |||
2164 | * the I/O APIC in all cases now. No actual device should request | 2752 | * the I/O APIC in all cases now. No actual device should request |
2165 | * it anyway. --macro | 2753 | * it anyway. --macro |
2166 | */ | 2754 | */ |
2167 | #define PIC_IRQS (1<<2) | 2755 | #define PIC_IRQS (1 << PIC_CASCADE_IR) |
2168 | 2756 | ||
2169 | void __init setup_IO_APIC(void) | 2757 | void __init setup_IO_APIC(void) |
2170 | { | 2758 | { |
2171 | 2759 | ||
2760 | #ifdef CONFIG_X86_32 | ||
2761 | enable_IO_APIC(); | ||
2762 | #else | ||
2172 | /* | 2763 | /* |
2173 | * calling enable_IO_APIC() is moved to setup_local_APIC for BP | 2764 | * calling enable_IO_APIC() is moved to setup_local_APIC for BP |
2174 | */ | 2765 | */ |
2766 | #endif | ||
2175 | 2767 | ||
2176 | io_apic_irqs = ~PIC_IRQS; | 2768 | io_apic_irqs = ~PIC_IRQS; |
2177 | 2769 | ||
2178 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); | 2770 | apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); |
2179 | 2771 | /* | |
2772 | * Set up IO-APIC IRQ routing. | ||
2773 | */ | ||
2774 | #ifdef CONFIG_X86_32 | ||
2775 | if (!acpi_ioapic) | ||
2776 | setup_ioapic_ids_from_mpc(); | ||
2777 | #endif | ||
2180 | sync_Arb_IDs(); | 2778 | sync_Arb_IDs(); |
2181 | setup_IO_APIC_irqs(); | 2779 | setup_IO_APIC_irqs(); |
2182 | init_IO_APIC_traps(); | 2780 | init_IO_APIC_traps(); |
2183 | check_timer(); | 2781 | check_timer(); |
2184 | } | 2782 | } |
2185 | 2783 | ||
2784 | /* | ||
2785 | * Called after all the initialization is done. If we didn't find any | ||
2786 | * APIC bugs, then we can allow the modify fast path. | ||
2787 | */ | ||
2788 | |||
2789 | static int __init io_apic_bug_finalize(void) | ||
2790 | { | ||
2791 | if (sis_apic_bug == -1) | ||
2792 | sis_apic_bug = 0; | ||
2793 | return 0; | ||
2794 | } | ||
2795 | |||
2796 | late_initcall(io_apic_bug_finalize); | ||
2797 | |||
2186 | struct sysfs_ioapic_data { | 2798 | struct sysfs_ioapic_data { |
2187 | struct sys_device dev; | 2799 | struct sys_device dev; |
2188 | struct IO_APIC_route_entry entry[0]; | 2800 | struct IO_APIC_route_entry entry[0]; |
@@ -2270,32 +2882,51 @@ device_initcall(ioapic_init_sysfs); | |||
2270 | /* | 2882 | /* |
2271 | * Dynamic irq allocate and deallocation | 2883 | * Dynamic irq allocate and deallocation |
2272 | */ | 2884 | */ |
2273 | int create_irq(void) | 2885 | unsigned int create_irq_nr(unsigned int irq_want) |
2274 | { | 2886 | { |
2275 | /* Allocate an unused irq */ | 2887 | /* Allocate an unused irq */ |
2276 | int irq; | 2888 | unsigned int irq; |
2277 | int new; | 2889 | unsigned int new; |
2278 | unsigned long flags; | 2890 | unsigned long flags; |
2891 | struct irq_cfg *cfg_new; | ||
2892 | |||
2893 | irq_want = nr_irqs - 1; | ||
2279 | 2894 | ||
2280 | irq = -ENOSPC; | 2895 | irq = 0; |
2281 | spin_lock_irqsave(&vector_lock, flags); | 2896 | spin_lock_irqsave(&vector_lock, flags); |
2282 | for (new = (NR_IRQS - 1); new >= 0; new--) { | 2897 | for (new = irq_want; new > 0; new--) { |
2283 | if (platform_legacy_irq(new)) | 2898 | if (platform_legacy_irq(new)) |
2284 | continue; | 2899 | continue; |
2285 | if (irq_cfg[new].vector != 0) | 2900 | cfg_new = irq_cfg(new); |
2901 | if (cfg_new && cfg_new->vector != 0) | ||
2286 | continue; | 2902 | continue; |
2903 | /* check if need to create one */ | ||
2904 | if (!cfg_new) | ||
2905 | cfg_new = irq_cfg_alloc(new); | ||
2287 | if (__assign_irq_vector(new, TARGET_CPUS) == 0) | 2906 | if (__assign_irq_vector(new, TARGET_CPUS) == 0) |
2288 | irq = new; | 2907 | irq = new; |
2289 | break; | 2908 | break; |
2290 | } | 2909 | } |
2291 | spin_unlock_irqrestore(&vector_lock, flags); | 2910 | spin_unlock_irqrestore(&vector_lock, flags); |
2292 | 2911 | ||
2293 | if (irq >= 0) { | 2912 | if (irq > 0) { |
2294 | dynamic_irq_init(irq); | 2913 | dynamic_irq_init(irq); |
2295 | } | 2914 | } |
2296 | return irq; | 2915 | return irq; |
2297 | } | 2916 | } |
2298 | 2917 | ||
2918 | int create_irq(void) | ||
2919 | { | ||
2920 | int irq; | ||
2921 | |||
2922 | irq = create_irq_nr(nr_irqs - 1); | ||
2923 | |||
2924 | if (irq == 0) | ||
2925 | irq = -1; | ||
2926 | |||
2927 | return irq; | ||
2928 | } | ||
2929 | |||
2299 | void destroy_irq(unsigned int irq) | 2930 | void destroy_irq(unsigned int irq) |
2300 | { | 2931 | { |
2301 | unsigned long flags; | 2932 | unsigned long flags; |
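create_irq_nr() scans downward from irq_want for a free, non-legacy IRQ, allocating an irq_cfg on demand, and create_irq() keeps the old external behaviour by mapping the new failure value 0 back to -1. Driver-side usage is unchanged; a minimal sketch (the chip argument stands in for whatever irq_chip the caller provides):

        static int example_alloc_dynamic_irq(struct irq_chip *chip)
        {
                int irq = create_irq();         /* -1 when no free vector is left */

                if (irq < 0)
                        return -ENOSPC;

                set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
                return irq;                     /* destroy_irq(irq) on teardown */
        }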
@@ -2316,7 +2947,7 @@ void destroy_irq(unsigned int irq) | |||
2316 | #ifdef CONFIG_PCI_MSI | 2947 | #ifdef CONFIG_PCI_MSI |
2317 | static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) | 2948 | static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) |
2318 | { | 2949 | { |
2319 | struct irq_cfg *cfg = irq_cfg + irq; | 2950 | struct irq_cfg *cfg; |
2320 | int err; | 2951 | int err; |
2321 | unsigned dest; | 2952 | unsigned dest; |
2322 | cpumask_t tmp; | 2953 | cpumask_t tmp; |
@@ -2326,6 +2957,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
2326 | if (err) | 2957 | if (err) |
2327 | return err; | 2958 | return err; |
2328 | 2959 | ||
2960 | cfg = irq_cfg(irq); | ||
2329 | cpus_and(tmp, cfg->domain, tmp); | 2961 | cpus_and(tmp, cfg->domain, tmp); |
2330 | dest = cpu_mask_to_apicid(tmp); | 2962 | dest = cpu_mask_to_apicid(tmp); |
2331 | 2963 | ||
@@ -2383,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms | |||
2383 | #ifdef CONFIG_SMP | 3015 | #ifdef CONFIG_SMP |
2384 | static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | 3016 | static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) |
2385 | { | 3017 | { |
2386 | struct irq_cfg *cfg = irq_cfg + irq; | 3018 | struct irq_cfg *cfg; |
2387 | struct msi_msg msg; | 3019 | struct msi_msg msg; |
2388 | unsigned int dest; | 3020 | unsigned int dest; |
2389 | cpumask_t tmp; | 3021 | cpumask_t tmp; |
3022 | struct irq_desc *desc; | ||
2390 | 3023 | ||
2391 | cpus_and(tmp, mask, cpu_online_map); | 3024 | cpus_and(tmp, mask, cpu_online_map); |
2392 | if (cpus_empty(tmp)) | 3025 | if (cpus_empty(tmp)) |
@@ -2395,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2395 | if (assign_irq_vector(irq, mask)) | 3028 | if (assign_irq_vector(irq, mask)) |
2396 | return; | 3029 | return; |
2397 | 3030 | ||
3031 | cfg = irq_cfg(irq); | ||
2398 | cpus_and(tmp, cfg->domain, mask); | 3032 | cpus_and(tmp, cfg->domain, mask); |
2399 | dest = cpu_mask_to_apicid(tmp); | 3033 | dest = cpu_mask_to_apicid(tmp); |
2400 | 3034 | ||
@@ -2406,7 +3040,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2406 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | 3040 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); |
2407 | 3041 | ||
2408 | write_msi_msg(irq, &msg); | 3042 | write_msi_msg(irq, &msg); |
2409 | irq_desc[irq].affinity = mask; | 3043 | desc = irq_to_desc(irq); |
3044 | desc->affinity = mask; | ||
2410 | } | 3045 | } |
2411 | 3046 | ||
2412 | #ifdef CONFIG_INTR_REMAP | 3047 | #ifdef CONFIG_INTR_REMAP |
@@ -2416,10 +3051,11 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2416 | */ | 3051 | */ |
2417 | static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | 3052 | static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) |
2418 | { | 3053 | { |
2419 | struct irq_cfg *cfg = irq_cfg + irq; | 3054 | struct irq_cfg *cfg; |
2420 | unsigned int dest; | 3055 | unsigned int dest; |
2421 | cpumask_t tmp, cleanup_mask; | 3056 | cpumask_t tmp, cleanup_mask; |
2422 | struct irte irte; | 3057 | struct irte irte; |
3058 | struct irq_desc *desc; | ||
2423 | 3059 | ||
2424 | cpus_and(tmp, mask, cpu_online_map); | 3060 | cpus_and(tmp, mask, cpu_online_map); |
2425 | if (cpus_empty(tmp)) | 3061 | if (cpus_empty(tmp)) |
@@ -2431,6 +3067,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2431 | if (assign_irq_vector(irq, mask)) | 3067 | if (assign_irq_vector(irq, mask)) |
2432 | return; | 3068 | return; |
2433 | 3069 | ||
3070 | cfg = irq_cfg(irq); | ||
2434 | cpus_and(tmp, cfg->domain, mask); | 3071 | cpus_and(tmp, cfg->domain, mask); |
2435 | dest = cpu_mask_to_apicid(tmp); | 3072 | dest = cpu_mask_to_apicid(tmp); |
2436 | 3073 | ||
@@ -2454,7 +3091,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2454 | cfg->move_in_progress = 0; | 3091 | cfg->move_in_progress = 0; |
2455 | } | 3092 | } |
2456 | 3093 | ||
2457 | irq_desc[irq].affinity = mask; | 3094 | desc = irq_to_desc(irq); |
3095 | desc->affinity = mask; | ||
2458 | } | 3096 | } |
2459 | #endif | 3097 | #endif |
2460 | #endif /* CONFIG_SMP */ | 3098 | #endif /* CONFIG_SMP */ |
@@ -2507,7 +3145,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) | |||
2507 | if (index < 0) { | 3145 | if (index < 0) { |
2508 | printk(KERN_ERR | 3146 | printk(KERN_ERR |
2509 | "Unable to allocate %d IRTE for PCI %s\n", nvec, | 3147 | "Unable to allocate %d IRTE for PCI %s\n", nvec, |
2510 | pci_name(dev)); | 3148 | pci_name(dev)); |
2511 | return -ENOSPC; | 3149 | return -ENOSPC; |
2512 | } | 3150 | } |
2513 | return index; | 3151 | return index; |
@@ -2528,7 +3166,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) | |||
2528 | 3166 | ||
2529 | #ifdef CONFIG_INTR_REMAP | 3167 | #ifdef CONFIG_INTR_REMAP |
2530 | if (irq_remapped(irq)) { | 3168 | if (irq_remapped(irq)) { |
2531 | struct irq_desc *desc = irq_desc + irq; | 3169 | struct irq_desc *desc = irq_to_desc(irq); |
2532 | /* | 3170 | /* |
2533 | * irq migration in process context | 3171 | * irq migration in process context |
2534 | */ | 3172 | */ |
@@ -2538,16 +3176,34 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) | |||
2538 | #endif | 3176 | #endif |
2539 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); | 3177 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); |
2540 | 3178 | ||
3179 | dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); | ||
3180 | |||
2541 | return 0; | 3181 | return 0; |
2542 | } | 3182 | } |
2543 | 3183 | ||
3184 | static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) | ||
3185 | { | ||
3186 | unsigned int irq; | ||
3187 | |||
3188 | irq = dev->bus->number; | ||
3189 | irq <<= 8; | ||
3190 | irq |= dev->devfn; | ||
3191 | irq <<= 12; | ||
3192 | |||
3193 | return irq; | ||
3194 | } | ||
3195 | |||
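For reference, a minimal worked example (an editorial sketch, not part of the patch) of the hint laid out by build_irq_for_pci_dev() above: the bus number lands in bits 20 and up, devfn in bits 12-19, and the low 12 bits stay clear so consecutive MSI vectors of one device get adjacent hints. The bus/devfn values below are hypothetical.

        #include <stdio.h>

        /* Same arithmetic as build_irq_for_pci_dev(), written as one expression. */
        static unsigned int hint_for(unsigned int bus, unsigned int devfn)
        {
                return ((bus << 8) | devfn) << 12;
        }

        int main(void)
        {
                /* e.g. bus 0x02, devfn 0x11 -> 0x211000; the MSI setup path adds 0x100 */
                printf("irq_want = %#x\n", hint_for(0x02, 0x11) + 0x100);
                return 0;
        }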
2544 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | 3196 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) |
2545 | { | 3197 | { |
2546 | int irq, ret; | 3198 | unsigned int irq; |
3199 | int ret; | ||
3200 | unsigned int irq_want; | ||
2547 | 3201 | ||
2548 | irq = create_irq(); | 3202 | irq_want = build_irq_for_pci_dev(dev) + 0x100; |
2549 | if (irq < 0) | 3203 | |
2550 | return irq; | 3204 | irq = create_irq_nr(irq_want); |
3205 | if (irq == 0) | ||
3206 | return -1; | ||
2551 | 3207 | ||
2552 | #ifdef CONFIG_INTR_REMAP | 3208 | #ifdef CONFIG_INTR_REMAP |
2553 | if (!intr_remapping_enabled) | 3209 | if (!intr_remapping_enabled) |
@@ -2574,18 +3230,22 @@ error: | |||
2574 | 3230 | ||
2575 | int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) | 3231 | int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) |
2576 | { | 3232 | { |
2577 | int irq, ret, sub_handle; | 3233 | unsigned int irq; |
3234 | int ret, sub_handle; | ||
2578 | struct msi_desc *desc; | 3235 | struct msi_desc *desc; |
3236 | unsigned int irq_want; | ||
3237 | |||
2579 | #ifdef CONFIG_INTR_REMAP | 3238 | #ifdef CONFIG_INTR_REMAP |
2580 | struct intel_iommu *iommu = 0; | 3239 | struct intel_iommu *iommu = 0; |
2581 | int index = 0; | 3240 | int index = 0; |
2582 | #endif | 3241 | #endif |
2583 | 3242 | ||
3243 | irq_want = build_irq_for_pci_dev(dev) + 0x100; | ||
2584 | sub_handle = 0; | 3244 | sub_handle = 0; |
2585 | list_for_each_entry(desc, &dev->msi_list, list) { | 3245 | list_for_each_entry(desc, &dev->msi_list, list) { |
2586 | irq = create_irq(); | 3246 | irq = create_irq_nr(irq_want--); |
2587 | if (irq < 0) | 3247 | if (irq == 0) |
2588 | return irq; | 3248 | return -1; |
2589 | #ifdef CONFIG_INTR_REMAP | 3249 | #ifdef CONFIG_INTR_REMAP |
2590 | if (!intr_remapping_enabled) | 3250 | if (!intr_remapping_enabled) |
2591 | goto no_ir; | 3251 | goto no_ir; |
@@ -2636,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq) | |||
2636 | #ifdef CONFIG_SMP | 3296 | #ifdef CONFIG_SMP |
2637 | static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) | 3297 | static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) |
2638 | { | 3298 | { |
2639 | struct irq_cfg *cfg = irq_cfg + irq; | 3299 | struct irq_cfg *cfg; |
2640 | struct msi_msg msg; | 3300 | struct msi_msg msg; |
2641 | unsigned int dest; | 3301 | unsigned int dest; |
2642 | cpumask_t tmp; | 3302 | cpumask_t tmp; |
3303 | struct irq_desc *desc; | ||
2643 | 3304 | ||
2644 | cpus_and(tmp, mask, cpu_online_map); | 3305 | cpus_and(tmp, mask, cpu_online_map); |
2645 | if (cpus_empty(tmp)) | 3306 | if (cpus_empty(tmp)) |
@@ -2648,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) | |||
2648 | if (assign_irq_vector(irq, mask)) | 3309 | if (assign_irq_vector(irq, mask)) |
2649 | return; | 3310 | return; |
2650 | 3311 | ||
3312 | cfg = irq_cfg(irq); | ||
2651 | cpus_and(tmp, cfg->domain, mask); | 3313 | cpus_and(tmp, cfg->domain, mask); |
2652 | dest = cpu_mask_to_apicid(tmp); | 3314 | dest = cpu_mask_to_apicid(tmp); |
2653 | 3315 | ||
@@ -2659,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) | |||
2659 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | 3321 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); |
2660 | 3322 | ||
2661 | dmar_msi_write(irq, &msg); | 3323 | dmar_msi_write(irq, &msg); |
2662 | irq_desc[irq].affinity = mask; | 3324 | desc = irq_to_desc(irq); |
3325 | desc->affinity = mask; | ||
2663 | } | 3326 | } |
2664 | #endif /* CONFIG_SMP */ | 3327 | #endif /* CONFIG_SMP */ |
2665 | 3328 | ||
@@ -2689,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq) | |||
2689 | } | 3352 | } |
2690 | #endif | 3353 | #endif |
2691 | 3354 | ||
3355 | #ifdef CONFIG_HPET_TIMER | ||
3356 | |||
3357 | #ifdef CONFIG_SMP | ||
3358 | static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) | ||
3359 | { | ||
3360 | struct irq_cfg *cfg; | ||
3361 | struct irq_desc *desc; | ||
3362 | struct msi_msg msg; | ||
3363 | unsigned int dest; | ||
3364 | cpumask_t tmp; | ||
3365 | |||
3366 | cpus_and(tmp, mask, cpu_online_map); | ||
3367 | if (cpus_empty(tmp)) | ||
3368 | return; | ||
3369 | |||
3370 | if (assign_irq_vector(irq, mask)) | ||
3371 | return; | ||
3372 | |||
3373 | cfg = irq_cfg(irq); | ||
3374 | cpus_and(tmp, cfg->domain, mask); | ||
3375 | dest = cpu_mask_to_apicid(tmp); | ||
3376 | |||
3377 | hpet_msi_read(irq, &msg); | ||
3378 | |||
3379 | msg.data &= ~MSI_DATA_VECTOR_MASK; | ||
3380 | msg.data |= MSI_DATA_VECTOR(cfg->vector); | ||
3381 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; | ||
3382 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | ||
3383 | |||
3384 | hpet_msi_write(irq, &msg); | ||
3385 | desc = irq_to_desc(irq); | ||
3386 | desc->affinity = mask; | ||
3387 | } | ||
3388 | #endif /* CONFIG_SMP */ | ||
3389 | |||
3390 | struct irq_chip hpet_msi_type = { | ||
3391 | .name = "HPET_MSI", | ||
3392 | .unmask = hpet_msi_unmask, | ||
3393 | .mask = hpet_msi_mask, | ||
3394 | .ack = ack_apic_edge, | ||
3395 | #ifdef CONFIG_SMP | ||
3396 | .set_affinity = hpet_msi_set_affinity, | ||
3397 | #endif | ||
3398 | .retrigger = ioapic_retrigger_irq, | ||
3399 | }; | ||
3400 | |||
3401 | int arch_setup_hpet_msi(unsigned int irq) | ||
3402 | { | ||
3403 | int ret; | ||
3404 | struct msi_msg msg; | ||
3405 | |||
3406 | ret = msi_compose_msg(NULL, irq, &msg); | ||
3407 | if (ret < 0) | ||
3408 | return ret; | ||
3409 | |||
3410 | hpet_msi_write(irq, &msg); | ||
3411 | set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, | ||
3412 | "edge"); | ||
3413 | |||
3414 | return 0; | ||
3415 | } | ||
3416 | #endif | ||
3417 | |||
2692 | #endif /* CONFIG_PCI_MSI */ | 3418 | #endif /* CONFIG_PCI_MSI */ |
2693 | /* | 3419 | /* |
2694 | * Hypertransport interrupt support | 3420 | * Hypertransport interrupt support |
@@ -2713,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) | |||
2713 | 3439 | ||
2714 | static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) | 3440 | static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) |
2715 | { | 3441 | { |
2716 | struct irq_cfg *cfg = irq_cfg + irq; | 3442 | struct irq_cfg *cfg; |
2717 | unsigned int dest; | 3443 | unsigned int dest; |
2718 | cpumask_t tmp; | 3444 | cpumask_t tmp; |
3445 | struct irq_desc *desc; | ||
2719 | 3446 | ||
2720 | cpus_and(tmp, mask, cpu_online_map); | 3447 | cpus_and(tmp, mask, cpu_online_map); |
2721 | if (cpus_empty(tmp)) | 3448 | if (cpus_empty(tmp)) |
@@ -2724,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) | |||
2724 | if (assign_irq_vector(irq, mask)) | 3451 | if (assign_irq_vector(irq, mask)) |
2725 | return; | 3452 | return; |
2726 | 3453 | ||
3454 | cfg = irq_cfg(irq); | ||
2727 | cpus_and(tmp, cfg->domain, mask); | 3455 | cpus_and(tmp, cfg->domain, mask); |
2728 | dest = cpu_mask_to_apicid(tmp); | 3456 | dest = cpu_mask_to_apicid(tmp); |
2729 | 3457 | ||
2730 | target_ht_irq(irq, dest, cfg->vector); | 3458 | target_ht_irq(irq, dest, cfg->vector); |
2731 | irq_desc[irq].affinity = mask; | 3459 | desc = irq_to_desc(irq); |
3460 | desc->affinity = mask; | ||
2732 | } | 3461 | } |
2733 | #endif | 3462 | #endif |
2734 | 3463 | ||
@@ -2745,7 +3474,7 @@ static struct irq_chip ht_irq_chip = { | |||
2745 | 3474 | ||
2746 | int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | 3475 | int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) |
2747 | { | 3476 | { |
2748 | struct irq_cfg *cfg = irq_cfg + irq; | 3477 | struct irq_cfg *cfg; |
2749 | int err; | 3478 | int err; |
2750 | cpumask_t tmp; | 3479 | cpumask_t tmp; |
2751 | 3480 | ||
@@ -2755,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | |||
2755 | struct ht_irq_msg msg; | 3484 | struct ht_irq_msg msg; |
2756 | unsigned dest; | 3485 | unsigned dest; |
2757 | 3486 | ||
3487 | cfg = irq_cfg(irq); | ||
2758 | cpus_and(tmp, cfg->domain, tmp); | 3488 | cpus_and(tmp, cfg->domain, tmp); |
2759 | dest = cpu_mask_to_apicid(tmp); | 3489 | dest = cpu_mask_to_apicid(tmp); |
2760 | 3490 | ||
@@ -2777,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | |||
2777 | 3507 | ||
2778 | set_irq_chip_and_handler_name(irq, &ht_irq_chip, | 3508 | set_irq_chip_and_handler_name(irq, &ht_irq_chip, |
2779 | handle_edge_irq, "edge"); | 3509 | handle_edge_irq, "edge"); |
3510 | |||
3511 | dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); | ||
2780 | } | 3512 | } |
2781 | return err; | 3513 | return err; |
2782 | } | 3514 | } |
2783 | #endif /* CONFIG_HT_IRQ */ | 3515 | #endif /* CONFIG_HT_IRQ */ |
2784 | 3516 | ||
3517 | #ifdef CONFIG_X86_64 | ||
3518 | /* | ||
3519 | * Re-target the irq to the specified CPU and enable the specified MMR located | ||
3520 | * on the specified blade to allow the sending of MSIs to the specified CPU. | ||
3521 | */ | ||
3522 | int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, | ||
3523 | unsigned long mmr_offset) | ||
3524 | { | ||
3525 | const cpumask_t *eligible_cpu = get_cpu_mask(cpu); | ||
3526 | struct irq_cfg *cfg; | ||
3527 | int mmr_pnode; | ||
3528 | unsigned long mmr_value; | ||
3529 | struct uv_IO_APIC_route_entry *entry; | ||
3530 | unsigned long flags; | ||
3531 | int err; | ||
3532 | |||
3533 | err = assign_irq_vector(irq, *eligible_cpu); | ||
3534 | if (err != 0) | ||
3535 | return err; | ||
3536 | |||
3537 | spin_lock_irqsave(&vector_lock, flags); | ||
3538 | set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, | ||
3539 | irq_name); | ||
3540 | spin_unlock_irqrestore(&vector_lock, flags); | ||
3541 | |||
3542 | cfg = irq_cfg(irq); | ||
3543 | |||
3544 | mmr_value = 0; | ||
3545 | entry = (struct uv_IO_APIC_route_entry *)&mmr_value; | ||
3546 | BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); | ||
3547 | |||
3548 | entry->vector = cfg->vector; | ||
3549 | entry->delivery_mode = INT_DELIVERY_MODE; | ||
3550 | entry->dest_mode = INT_DEST_MODE; | ||
3551 | entry->polarity = 0; | ||
3552 | entry->trigger = 0; | ||
3553 | entry->mask = 0; | ||
3554 | entry->dest = cpu_mask_to_apicid(*eligible_cpu); | ||
3555 | |||
3556 | mmr_pnode = uv_blade_to_pnode(mmr_blade); | ||
3557 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); | ||
3558 | |||
3559 | return irq; | ||
3560 | } | ||
3561 | |||
3562 | /* | ||
3563 | * Disable the specified MMR located on the specified blade so that MSIs are | ||
3564 | * no longer allowed to be sent. | ||
3565 | */ | ||
3566 | void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) | ||
3567 | { | ||
3568 | unsigned long mmr_value; | ||
3569 | struct uv_IO_APIC_route_entry *entry; | ||
3570 | int mmr_pnode; | ||
3571 | |||
3572 | mmr_value = 0; | ||
3573 | entry = (struct uv_IO_APIC_route_entry *)&mmr_value; | ||
3574 | BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); | ||
3575 | |||
3576 | entry->mask = 1; | ||
3577 | |||
3578 | mmr_pnode = uv_blade_to_pnode(mmr_blade); | ||
3579 | uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); | ||
3580 | } | ||
3581 | #endif /* CONFIG_X86_64 */ | ||
3582 | |||
3583 | int __init io_apic_get_redir_entries (int ioapic) | ||
3584 | { | ||
3585 | union IO_APIC_reg_01 reg_01; | ||
3586 | unsigned long flags; | ||
3587 | |||
3588 | spin_lock_irqsave(&ioapic_lock, flags); | ||
3589 | reg_01.raw = io_apic_read(ioapic, 1); | ||
3590 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
3591 | |||
3592 | return reg_01.bits.entries; | ||
3593 | } | ||
3594 | |||
3595 | int __init probe_nr_irqs(void) | ||
3596 | { | ||
3597 | int idx; | ||
3598 | int nr = 0; | ||
3599 | #ifndef CONFIG_XEN | ||
3600 | int nr_min = 32; | ||
3601 | #else | ||
3602 | int nr_min = NR_IRQS; | ||
3603 | #endif | ||
3604 | |||
3605 | for (idx = 0; idx < nr_ioapics; idx++) | ||
3606 | nr += io_apic_get_redir_entries(idx) + 1; | ||
3607 | |||
3608 | /* double it for hotplug and msi and nmi */ | ||
3609 | nr <<= 1; | ||
3610 | |||
3611 | /* something wrong ? */ | ||
3612 | if (nr < nr_min) | ||
3613 | nr = nr_min; | ||
3614 | |||
3615 | return nr; | ||
3616 | } | ||
3617 | |||
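To make the sizing rule above concrete, a small worked example (an editorial sketch, not part of the patch), assuming a hypothetical machine with two IO-APICs whose version registers report 23 and 15 redirection entries (24- and 16-pin parts):

        #include <stdio.h>

        int main(void)
        {
                int entries[] = { 23, 15 };     /* reg_01.bits.entries per IO-APIC */
                int nr = 0, nr_min = 32, i;

                for (i = 0; i < 2; i++)
                        nr += entries[i] + 1;   /* 24 + 16 = 40 redirection entries */
                nr <<= 1;                       /* doubled for hotplug/MSI/NMI -> 80 */
                if (nr < nr_min)                /* clamp to the minimum (32 here) */
                        nr = nr_min;
                printf("nr_irqs = %d\n", nr);   /* prints 80 */
                return 0;
        }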
2785 | /* -------------------------------------------------------------------------- | 3618 | /* -------------------------------------------------------------------------- |
2786 | ACPI-based IOAPIC Configuration | 3619 | ACPI-based IOAPIC Configuration |
2787 | -------------------------------------------------------------------------- */ | 3620 | -------------------------------------------------------------------------- */ |
2788 | 3621 | ||
2789 | #ifdef CONFIG_ACPI | 3622 | #ifdef CONFIG_ACPI |
2790 | 3623 | ||
2791 | #define IO_APIC_MAX_ID 0xFE | 3624 | #ifdef CONFIG_X86_32 |
3625 | int __init io_apic_get_unique_id(int ioapic, int apic_id) | ||
3626 | { | ||
3627 | union IO_APIC_reg_00 reg_00; | ||
3628 | static physid_mask_t apic_id_map = PHYSID_MASK_NONE; | ||
3629 | physid_mask_t tmp; | ||
3630 | unsigned long flags; | ||
3631 | int i = 0; | ||
2792 | 3632 | ||
2793 | int __init io_apic_get_redir_entries (int ioapic) | 3633 | /* |
3634 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | ||
3635 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | ||
3636 | * support up to 16 on one shared APIC bus. | ||
3637 | * | ||
3638 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | ||
3639 | * advantage of new APIC bus architecture. | ||
3640 | */ | ||
3641 | |||
3642 | if (physids_empty(apic_id_map)) | ||
3643 | apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); | ||
3644 | |||
3645 | spin_lock_irqsave(&ioapic_lock, flags); | ||
3646 | reg_00.raw = io_apic_read(ioapic, 0); | ||
3647 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
3648 | |||
3649 | if (apic_id >= get_physical_broadcast()) { | ||
3650 | printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | ||
3651 | "%d\n", ioapic, apic_id, reg_00.bits.ID); | ||
3652 | apic_id = reg_00.bits.ID; | ||
3653 | } | ||
3654 | |||
3655 | /* | ||
3656 | * Every APIC in a system must have a unique ID or we get lots of nice | ||
3657 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
3658 | */ | ||
3659 | if (check_apicid_used(apic_id_map, apic_id)) { | ||
3660 | |||
3661 | for (i = 0; i < get_physical_broadcast(); i++) { | ||
3662 | if (!check_apicid_used(apic_id_map, i)) | ||
3663 | break; | ||
3664 | } | ||
3665 | |||
3666 | if (i == get_physical_broadcast()) | ||
3667 | panic("Max apic_id exceeded!\n"); | ||
3668 | |||
3669 | printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | ||
3670 | "trying %d\n", ioapic, apic_id, i); | ||
3671 | |||
3672 | apic_id = i; | ||
3673 | } | ||
3674 | |||
3675 | tmp = apicid_to_cpu_present(apic_id); | ||
3676 | physids_or(apic_id_map, apic_id_map, tmp); | ||
3677 | |||
3678 | if (reg_00.bits.ID != apic_id) { | ||
3679 | reg_00.bits.ID = apic_id; | ||
3680 | |||
3681 | spin_lock_irqsave(&ioapic_lock, flags); | ||
3682 | io_apic_write(ioapic, 0, reg_00.raw); | ||
3683 | reg_00.raw = io_apic_read(ioapic, 0); | ||
3684 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
3685 | |||
3686 | /* Sanity check */ | ||
3687 | if (reg_00.bits.ID != apic_id) { | ||
3688 | printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); | ||
3689 | return -1; | ||
3690 | } | ||
3691 | } | ||
3692 | |||
3693 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
3694 | "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | ||
3695 | |||
3696 | return apic_id; | ||
3697 | } | ||
3698 | |||
3699 | int __init io_apic_get_version(int ioapic) | ||
2794 | { | 3700 | { |
2795 | union IO_APIC_reg_01 reg_01; | 3701 | union IO_APIC_reg_01 reg_01; |
2796 | unsigned long flags; | 3702 | unsigned long flags; |
@@ -2799,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic) | |||
2799 | reg_01.raw = io_apic_read(ioapic, 1); | 3705 | reg_01.raw = io_apic_read(ioapic, 1); |
2800 | spin_unlock_irqrestore(&ioapic_lock, flags); | 3706 | spin_unlock_irqrestore(&ioapic_lock, flags); |
2801 | 3707 | ||
2802 | return reg_01.bits.entries; | 3708 | return reg_01.bits.version; |
2803 | } | 3709 | } |
2804 | 3710 | #endif | |
2805 | 3711 | ||
2806 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) | 3712 | int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) |
2807 | { | 3713 | { |
@@ -2853,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | |||
2853 | void __init setup_ioapic_dest(void) | 3759 | void __init setup_ioapic_dest(void) |
2854 | { | 3760 | { |
2855 | int pin, ioapic, irq, irq_entry; | 3761 | int pin, ioapic, irq, irq_entry; |
3762 | struct irq_cfg *cfg; | ||
2856 | 3763 | ||
2857 | if (skip_ioapic_setup == 1) | 3764 | if (skip_ioapic_setup == 1) |
2858 | return; | 3765 | return; |
@@ -2868,7 +3775,8 @@ void __init setup_ioapic_dest(void) | |||
2868 | * when you have too many devices, because at that time only boot | 3775 | * when you have too many devices, because at that time only boot |
2869 | * cpu is online. | 3776 | * cpu is online. |
2870 | */ | 3777 | */ |
2871 | if (!irq_cfg[irq].vector) | 3778 | cfg = irq_cfg(irq); |
3779 | if (!cfg->vector) | ||
2872 | setup_IO_APIC_irq(ioapic, pin, irq, | 3780 | setup_IO_APIC_irq(ioapic, pin, irq, |
2873 | irq_trigger(irq_entry), | 3781 | irq_trigger(irq_entry), |
2874 | irq_polarity(irq_entry)); | 3782 | irq_polarity(irq_entry)); |
@@ -2926,18 +3834,33 @@ void __init ioapic_init_mappings(void) | |||
2926 | struct resource *ioapic_res; | 3834 | struct resource *ioapic_res; |
2927 | int i; | 3835 | int i; |
2928 | 3836 | ||
3837 | irq_2_pin_init(); | ||
2929 | ioapic_res = ioapic_setup_resources(); | 3838 | ioapic_res = ioapic_setup_resources(); |
2930 | for (i = 0; i < nr_ioapics; i++) { | 3839 | for (i = 0; i < nr_ioapics; i++) { |
2931 | if (smp_found_config) { | 3840 | if (smp_found_config) { |
2932 | ioapic_phys = mp_ioapics[i].mp_apicaddr; | 3841 | ioapic_phys = mp_ioapics[i].mp_apicaddr; |
3842 | #ifdef CONFIG_X86_32 | ||
3843 | if (!ioapic_phys) { | ||
3844 | printk(KERN_ERR | ||
3845 | "WARNING: bogus zero IO-APIC " | ||
3846 | "address found in MPTABLE, " | ||
3847 | "disabling IO/APIC support!\n"); | ||
3848 | smp_found_config = 0; | ||
3849 | skip_ioapic_setup = 1; | ||
3850 | goto fake_ioapic_page; | ||
3851 | } | ||
3852 | #endif | ||
2933 | } else { | 3853 | } else { |
3854 | #ifdef CONFIG_X86_32 | ||
3855 | fake_ioapic_page: | ||
3856 | #endif | ||
2934 | ioapic_phys = (unsigned long) | 3857 | ioapic_phys = (unsigned long) |
2935 | alloc_bootmem_pages(PAGE_SIZE); | 3858 | alloc_bootmem_pages(PAGE_SIZE); |
2936 | ioapic_phys = __pa(ioapic_phys); | 3859 | ioapic_phys = __pa(ioapic_phys); |
2937 | } | 3860 | } |
2938 | set_fixmap_nocache(idx, ioapic_phys); | 3861 | set_fixmap_nocache(idx, ioapic_phys); |
2939 | apic_printk(APIC_VERBOSE, | 3862 | apic_printk(APIC_VERBOSE, |
2940 | "mapped IOAPIC to %016lx (%016lx)\n", | 3863 | "mapped IOAPIC to %08lx (%08lx)\n", |
2941 | __fix_to_virt(idx), ioapic_phys); | 3864 | __fix_to_virt(idx), ioapic_phys); |
2942 | idx++; | 3865 | idx++; |
2943 | 3866 | ||
@@ -2971,4 +3894,3 @@ static int __init ioapic_insert_resources(void) | |||
2971 | /* Insert the IO APIC resources after PCI initialization has occurred to handle | 3894 | /* Insert the IO APIC resources after PCI initialization has occurred to handle |
2972 | * IO APICS that are mapped in on a BAR in PCI space. */ | 3895 | * IO APICS that are mapped in on a BAR in PCI space. */ |
2973 | late_initcall(ioapic_insert_resources); | 3896 | late_initcall(ioapic_insert_resources); |
2974 | |||
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c deleted file mode 100644 index e710289f673e..000000000000 --- a/arch/x86/kernel/io_apic_32.c +++ /dev/null | |||
@@ -1,2908 +0,0 @@ | |||
1 | /* | ||
2 | * Intel IO-APIC support for multi-Pentium hosts. | ||
3 | * | ||
4 | * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo | ||
5 | * | ||
6 | * Many thanks to Stig Venaas for trying out countless experimental | ||
7 | * patches and reporting/debugging problems patiently! | ||
8 | * | ||
9 | * (c) 1999, Multiple IO-APIC support, developed by | ||
10 | * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and | ||
11 | * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, | ||
12 | * further tested and cleaned up by Zach Brown <zab@redhat.com> | ||
13 | * and Ingo Molnar <mingo@redhat.com> | ||
14 | * | ||
15 | * Fixes | ||
16 | * Maciej W. Rozycki : Bits for genuine 82489DX APICs; | ||
17 | * thanks to Eric Gilmore | ||
18 | * and Rolf G. Tews | ||
19 | * for testing these extensively | ||
20 | * Paul Diefenbaugh : Added full ACPI support | ||
21 | */ | ||
22 | |||
23 | #include <linux/mm.h> | ||
24 | #include <linux/interrupt.h> | ||
25 | #include <linux/init.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/bootmem.h> | ||
29 | #include <linux/mc146818rtc.h> | ||
30 | #include <linux/compiler.h> | ||
31 | #include <linux/acpi.h> | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/sysdev.h> | ||
34 | #include <linux/pci.h> | ||
35 | #include <linux/msi.h> | ||
36 | #include <linux/htirq.h> | ||
37 | #include <linux/freezer.h> | ||
38 | #include <linux/kthread.h> | ||
39 | #include <linux/jiffies.h> /* time_after() */ | ||
40 | |||
41 | #include <asm/io.h> | ||
42 | #include <asm/smp.h> | ||
43 | #include <asm/desc.h> | ||
44 | #include <asm/timer.h> | ||
45 | #include <asm/i8259.h> | ||
46 | #include <asm/nmi.h> | ||
47 | #include <asm/msidef.h> | ||
48 | #include <asm/hypertransport.h> | ||
49 | #include <asm/setup.h> | ||
50 | |||
51 | #include <mach_apic.h> | ||
52 | #include <mach_apicdef.h> | ||
53 | |||
54 | #define __apicdebuginit(type) static type __init | ||
55 | |||
56 | int (*ioapic_renumber_irq)(int ioapic, int irq); | ||
57 | atomic_t irq_mis_count; | ||
58 | |||
59 | /* Where if anywhere is the i8259 connect in external int mode */ | ||
60 | static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; | ||
61 | |||
62 | static DEFINE_SPINLOCK(ioapic_lock); | ||
63 | DEFINE_SPINLOCK(vector_lock); | ||
64 | |||
65 | int timer_through_8259 __initdata; | ||
66 | |||
67 | /* | ||
68 | * Is the SiS APIC rmw bug present ? | ||
69 | * -1 = don't know, 0 = no, 1 = yes | ||
70 | */ | ||
71 | int sis_apic_bug = -1; | ||
72 | |||
73 | /* | ||
74 | * # of IRQ routing registers | ||
75 | */ | ||
76 | int nr_ioapic_registers[MAX_IO_APICS]; | ||
77 | |||
78 | /* I/O APIC entries */ | ||
79 | struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; | ||
80 | int nr_ioapics; | ||
81 | |||
82 | /* MP IRQ source entries */ | ||
83 | struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; | ||
84 | |||
85 | /* # of MP IRQ source entries */ | ||
86 | int mp_irq_entries; | ||
87 | |||
88 | #if defined (CONFIG_MCA) || defined (CONFIG_EISA) | ||
89 | int mp_bus_id_to_type[MAX_MP_BUSSES]; | ||
90 | #endif | ||
91 | |||
92 | DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); | ||
93 | |||
94 | static int disable_timer_pin_1 __initdata; | ||
95 | |||
96 | /* | ||
97 | * Rough estimation of how many shared IRQs there are, can | ||
98 | * be changed anytime. | ||
99 | */ | ||
100 | #define MAX_PLUS_SHARED_IRQS NR_IRQS | ||
101 | #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) | ||
102 | |||
103 | /* | ||
104 | * This is performance-critical, we want to do it O(1) | ||
105 | * | ||
106 | * the indexing order of this array favors 1:1 mappings | ||
107 | * between pins and IRQs. | ||
108 | */ | ||
109 | |||
110 | static struct irq_pin_list { | ||
111 | int apic, pin, next; | ||
112 | } irq_2_pin[PIN_MAP_SIZE]; | ||
113 | |||
114 | struct io_apic { | ||
115 | unsigned int index; | ||
116 | unsigned int unused[3]; | ||
117 | unsigned int data; | ||
118 | }; | ||
119 | |||
120 | static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) | ||
121 | { | ||
122 | return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) | ||
123 | + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); | ||
124 | } | ||
125 | |||
126 | static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) | ||
127 | { | ||
128 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
129 | writel(reg, &io_apic->index); | ||
130 | return readl(&io_apic->data); | ||
131 | } | ||
132 | |||
133 | static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) | ||
134 | { | ||
135 | struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
136 | writel(reg, &io_apic->index); | ||
137 | writel(value, &io_apic->data); | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Re-write a value: to be used for read-modify-write | ||
142 | * cycles where the read already set up the index register. | ||
143 | * | ||
144 | * Older SiS APIC requires we rewrite the index register | ||
145 | */ | ||
146 | static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) | ||
147 | { | ||
148 | volatile struct io_apic __iomem *io_apic = io_apic_base(apic); | ||
149 | if (sis_apic_bug) | ||
150 | writel(reg, &io_apic->index); | ||
151 | writel(value, &io_apic->data); | ||
152 | } | ||
153 | |||
154 | union entry_union { | ||
155 | struct { u32 w1, w2; }; | ||
156 | struct IO_APIC_route_entry entry; | ||
157 | }; | ||
158 | |||
159 | static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) | ||
160 | { | ||
161 | union entry_union eu; | ||
162 | unsigned long flags; | ||
163 | spin_lock_irqsave(&ioapic_lock, flags); | ||
164 | eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); | ||
165 | eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); | ||
166 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
167 | return eu.entry; | ||
168 | } | ||
169 | |||
170 | /* | ||
171 | * When we write a new IO APIC routing entry, we need to write the high | ||
172 | * word first! If the mask bit in the low word is clear, we will enable | ||
173 | * the interrupt, and we need to make sure the entry is fully populated | ||
174 | * before that happens. | ||
175 | */ | ||
176 | static void | ||
177 | __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
178 | { | ||
179 | union entry_union eu; | ||
180 | eu.entry = e; | ||
181 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
182 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
183 | } | ||
184 | |||
185 | static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) | ||
186 | { | ||
187 | unsigned long flags; | ||
188 | spin_lock_irqsave(&ioapic_lock, flags); | ||
189 | __ioapic_write_entry(apic, pin, e); | ||
190 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * When we mask an IO APIC routing entry, we need to write the low | ||
195 | * word first, in order to set the mask bit before we change the | ||
196 | * high bits! | ||
197 | */ | ||
198 | static void ioapic_mask_entry(int apic, int pin) | ||
199 | { | ||
200 | unsigned long flags; | ||
201 | union entry_union eu = { .entry.mask = 1 }; | ||
202 | |||
203 | spin_lock_irqsave(&ioapic_lock, flags); | ||
204 | io_apic_write(apic, 0x10 + 2*pin, eu.w1); | ||
205 | io_apic_write(apic, 0x11 + 2*pin, eu.w2); | ||
206 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * The common case is 1:1 IRQ<->pin mappings. Sometimes there are | ||
211 | * shared ISA-space IRQs, so we have to support them. We are super | ||
212 | * fast in the common case, and fast for shared ISA-space IRQs. | ||
213 | */ | ||
214 | static void add_pin_to_irq(unsigned int irq, int apic, int pin) | ||
215 | { | ||
216 | static int first_free_entry = NR_IRQS; | ||
217 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
218 | |||
219 | while (entry->next) | ||
220 | entry = irq_2_pin + entry->next; | ||
221 | |||
222 | if (entry->pin != -1) { | ||
223 | entry->next = first_free_entry; | ||
224 | entry = irq_2_pin + entry->next; | ||
225 | if (++first_free_entry >= PIN_MAP_SIZE) | ||
226 | panic("io_apic.c: whoops"); | ||
227 | } | ||
228 | entry->apic = apic; | ||
229 | entry->pin = pin; | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Reroute an IRQ to a different pin. | ||
234 | */ | ||
235 | static void __init replace_pin_at_irq(unsigned int irq, | ||
236 | int oldapic, int oldpin, | ||
237 | int newapic, int newpin) | ||
238 | { | ||
239 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
240 | |||
241 | while (1) { | ||
242 | if (entry->apic == oldapic && entry->pin == oldpin) { | ||
243 | entry->apic = newapic; | ||
244 | entry->pin = newpin; | ||
245 | } | ||
246 | if (!entry->next) | ||
247 | break; | ||
248 | entry = irq_2_pin + entry->next; | ||
249 | } | ||
250 | } | ||
251 | |||
252 | static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) | ||
253 | { | ||
254 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
255 | unsigned int pin, reg; | ||
256 | |||
257 | for (;;) { | ||
258 | pin = entry->pin; | ||
259 | if (pin == -1) | ||
260 | break; | ||
261 | reg = io_apic_read(entry->apic, 0x10 + pin*2); | ||
262 | reg &= ~disable; | ||
263 | reg |= enable; | ||
264 | io_apic_modify(entry->apic, 0x10 + pin*2, reg); | ||
265 | if (!entry->next) | ||
266 | break; | ||
267 | entry = irq_2_pin + entry->next; | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* mask = 1 */ | ||
272 | static void __mask_IO_APIC_irq(unsigned int irq) | ||
273 | { | ||
274 | __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); | ||
275 | } | ||
276 | |||
277 | /* mask = 0 */ | ||
278 | static void __unmask_IO_APIC_irq(unsigned int irq) | ||
279 | { | ||
280 | __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); | ||
281 | } | ||
282 | |||
283 | /* mask = 1, trigger = 0 */ | ||
284 | static void __mask_and_edge_IO_APIC_irq(unsigned int irq) | ||
285 | { | ||
286 | __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, | ||
287 | IO_APIC_REDIR_LEVEL_TRIGGER); | ||
288 | } | ||
289 | |||
290 | /* mask = 0, trigger = 1 */ | ||
291 | static void __unmask_and_level_IO_APIC_irq(unsigned int irq) | ||
292 | { | ||
293 | __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, | ||
294 | IO_APIC_REDIR_MASKED); | ||
295 | } | ||
296 | |||
297 | static void mask_IO_APIC_irq(unsigned int irq) | ||
298 | { | ||
299 | unsigned long flags; | ||
300 | |||
301 | spin_lock_irqsave(&ioapic_lock, flags); | ||
302 | __mask_IO_APIC_irq(irq); | ||
303 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
304 | } | ||
305 | |||
306 | static void unmask_IO_APIC_irq(unsigned int irq) | ||
307 | { | ||
308 | unsigned long flags; | ||
309 | |||
310 | spin_lock_irqsave(&ioapic_lock, flags); | ||
311 | __unmask_IO_APIC_irq(irq); | ||
312 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
313 | } | ||
314 | |||
315 | static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) | ||
316 | { | ||
317 | struct IO_APIC_route_entry entry; | ||
318 | |||
319 | /* Check delivery_mode to be sure we're not clearing an SMI pin */ | ||
320 | entry = ioapic_read_entry(apic, pin); | ||
321 | if (entry.delivery_mode == dest_SMI) | ||
322 | return; | ||
323 | |||
324 | /* | ||
325 | * Disable it in the IO-APIC irq-routing table: | ||
326 | */ | ||
327 | ioapic_mask_entry(apic, pin); | ||
328 | } | ||
329 | |||
330 | static void clear_IO_APIC(void) | ||
331 | { | ||
332 | int apic, pin; | ||
333 | |||
334 | for (apic = 0; apic < nr_ioapics; apic++) | ||
335 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) | ||
336 | clear_IO_APIC_pin(apic, pin); | ||
337 | } | ||
338 | |||
339 | #ifdef CONFIG_SMP | ||
340 | static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) | ||
341 | { | ||
342 | unsigned long flags; | ||
343 | int pin; | ||
344 | struct irq_pin_list *entry = irq_2_pin + irq; | ||
345 | unsigned int apicid_value; | ||
346 | cpumask_t tmp; | ||
347 | |||
348 | cpus_and(tmp, cpumask, cpu_online_map); | ||
349 | if (cpus_empty(tmp)) | ||
350 | tmp = TARGET_CPUS; | ||
351 | |||
352 | cpus_and(cpumask, tmp, CPU_MASK_ALL); | ||
353 | |||
354 | apicid_value = cpu_mask_to_apicid(cpumask); | ||
355 | /* Prepare to do the io_apic_write */ | ||
356 | apicid_value = apicid_value << 24; | ||
357 | spin_lock_irqsave(&ioapic_lock, flags); | ||
358 | for (;;) { | ||
359 | pin = entry->pin; | ||
360 | if (pin == -1) | ||
361 | break; | ||
362 | io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); | ||
363 | if (!entry->next) | ||
364 | break; | ||
365 | entry = irq_2_pin + entry->next; | ||
366 | } | ||
367 | irq_desc[irq].affinity = cpumask; | ||
368 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
369 | } | ||
370 | |||
371 | #if defined(CONFIG_IRQBALANCE) | ||
372 | # include <asm/processor.h> /* kernel_thread() */ | ||
373 | # include <linux/kernel_stat.h> /* kstat */ | ||
374 | # include <linux/slab.h> /* kmalloc() */ | ||
375 | # include <linux/timer.h> | ||
376 | |||
377 | #define IRQBALANCE_CHECK_ARCH -999 | ||
378 | #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) | ||
379 | #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) | ||
380 | #define BALANCED_IRQ_MORE_DELTA (HZ/10) | ||
381 | #define BALANCED_IRQ_LESS_DELTA (HZ) | ||
382 | |||
383 | static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; | ||
384 | static int physical_balance __read_mostly; | ||
385 | static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; | ||
386 | |||
387 | static struct irq_cpu_info { | ||
388 | unsigned long *last_irq; | ||
389 | unsigned long *irq_delta; | ||
390 | unsigned long irq; | ||
391 | } irq_cpu_data[NR_CPUS]; | ||
392 | |||
393 | #define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) | ||
394 | #define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) | ||
395 | #define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) | ||
396 | |||
397 | #define IDLE_ENOUGH(cpu,now) \ | ||
398 | (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) | ||
399 | |||
400 | #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) | ||
401 | |||
402 | #define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) | ||
403 | |||
404 | static cpumask_t balance_irq_affinity[NR_IRQS] = { | ||
405 | [0 ... NR_IRQS-1] = CPU_MASK_ALL | ||
406 | }; | ||
407 | |||
408 | void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) | ||
409 | { | ||
410 | balance_irq_affinity[irq] = mask; | ||
411 | } | ||
412 | |||
413 | static unsigned long move(int curr_cpu, cpumask_t allowed_mask, | ||
414 | unsigned long now, int direction) | ||
415 | { | ||
416 | int search_idle = 1; | ||
417 | int cpu = curr_cpu; | ||
418 | |||
419 | goto inside; | ||
420 | |||
421 | do { | ||
422 | if (unlikely(cpu == curr_cpu)) | ||
423 | search_idle = 0; | ||
424 | inside: | ||
425 | if (direction == 1) { | ||
426 | cpu++; | ||
427 | if (cpu >= NR_CPUS) | ||
428 | cpu = 0; | ||
429 | } else { | ||
430 | cpu--; | ||
431 | if (cpu == -1) | ||
432 | cpu = NR_CPUS-1; | ||
433 | } | ||
434 | } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || | ||
435 | (search_idle && !IDLE_ENOUGH(cpu, now))); | ||
436 | |||
437 | return cpu; | ||
438 | } | ||
439 | |||
440 | static inline void balance_irq(int cpu, int irq) | ||
441 | { | ||
442 | unsigned long now = jiffies; | ||
443 | cpumask_t allowed_mask; | ||
444 | unsigned int new_cpu; | ||
445 | |||
446 | if (irqbalance_disabled) | ||
447 | return; | ||
448 | |||
449 | cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); | ||
450 | new_cpu = move(cpu, allowed_mask, now, 1); | ||
451 | if (cpu != new_cpu) | ||
452 | set_pending_irq(irq, cpumask_of_cpu(new_cpu)); | ||
453 | } | ||
454 | |||
455 | static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) | ||
456 | { | ||
457 | int i, j; | ||
458 | |||
459 | for_each_online_cpu(i) { | ||
460 | for (j = 0; j < NR_IRQS; j++) { | ||
461 | if (!irq_desc[j].action) | ||
462 | continue; | ||
463 | /* Is it a significant load ? */ | ||
464 | if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < | ||
465 | useful_load_threshold) | ||
466 | continue; | ||
467 | balance_irq(i, j); | ||
468 | } | ||
469 | } | ||
470 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | ||
471 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | ||
472 | return; | ||
473 | } | ||
474 | |||
475 | static void do_irq_balance(void) | ||
476 | { | ||
477 | int i, j; | ||
478 | unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); | ||
479 | unsigned long move_this_load = 0; | ||
480 | int max_loaded = 0, min_loaded = 0; | ||
481 | int load; | ||
482 | unsigned long useful_load_threshold = balanced_irq_interval + 10; | ||
483 | int selected_irq; | ||
484 | int tmp_loaded, first_attempt = 1; | ||
485 | unsigned long tmp_cpu_irq; | ||
486 | unsigned long imbalance = 0; | ||
487 | cpumask_t allowed_mask, target_cpu_mask, tmp; | ||
488 | |||
489 | for_each_possible_cpu(i) { | ||
490 | int package_index; | ||
491 | CPU_IRQ(i) = 0; | ||
492 | if (!cpu_online(i)) | ||
493 | continue; | ||
494 | package_index = CPU_TO_PACKAGEINDEX(i); | ||
495 | for (j = 0; j < NR_IRQS; j++) { | ||
496 | unsigned long value_now, delta; | ||
497 | /* Is this an active IRQ or balancing disabled ? */ | ||
498 | if (!irq_desc[j].action || irq_balancing_disabled(j)) | ||
499 | continue; | ||
500 | if (package_index == i) | ||
501 | IRQ_DELTA(package_index, j) = 0; | ||
502 | /* Determine the total count per processor per IRQ */ | ||
503 | value_now = (unsigned long) kstat_cpu(i).irqs[j]; | ||
504 | |||
505 | /* Determine the activity per processor per IRQ */ | ||
506 | delta = value_now - LAST_CPU_IRQ(i, j); | ||
507 | |||
508 | /* Update last_cpu_irq[][] for the next time */ | ||
509 | LAST_CPU_IRQ(i, j) = value_now; | ||
510 | |||
511 | /* Ignore IRQs whose rate is less than the clock */ | ||
512 | if (delta < useful_load_threshold) | ||
513 | continue; | ||
514 | /* update the load for the processor or package total */ | ||
515 | IRQ_DELTA(package_index, j) += delta; | ||
516 | |||
517 | /* Keep track of the higher numbered sibling as well */ | ||
518 | if (i != package_index) | ||
519 | CPU_IRQ(i) += delta; | ||
520 | /* | ||
521 | * We have sibling A and sibling B in the package | ||
522 | * | ||
523 | * cpu_irq[A] = load for cpu A + load for cpu B | ||
524 | * cpu_irq[B] = load for cpu B | ||
525 | */ | ||
526 | CPU_IRQ(package_index) += delta; | ||
527 | } | ||
528 | } | ||
529 | /* Find the least loaded processor package */ | ||
530 | for_each_online_cpu(i) { | ||
531 | if (i != CPU_TO_PACKAGEINDEX(i)) | ||
532 | continue; | ||
533 | if (min_cpu_irq > CPU_IRQ(i)) { | ||
534 | min_cpu_irq = CPU_IRQ(i); | ||
535 | min_loaded = i; | ||
536 | } | ||
537 | } | ||
538 | max_cpu_irq = ULONG_MAX; | ||
539 | |||
540 | tryanothercpu: | ||
541 | /* | ||
542 | * Look for heaviest loaded processor. | ||
543 | * We may come back to get the next heaviest loaded processor. | ||
544 | * Skip processors with trivial loads. | ||
545 | */ | ||
546 | tmp_cpu_irq = 0; | ||
547 | tmp_loaded = -1; | ||
548 | for_each_online_cpu(i) { | ||
549 | if (i != CPU_TO_PACKAGEINDEX(i)) | ||
550 | continue; | ||
551 | if (max_cpu_irq <= CPU_IRQ(i)) | ||
552 | continue; | ||
553 | if (tmp_cpu_irq < CPU_IRQ(i)) { | ||
554 | tmp_cpu_irq = CPU_IRQ(i); | ||
555 | tmp_loaded = i; | ||
556 | } | ||
557 | } | ||
558 | |||
559 | if (tmp_loaded == -1) { | ||
560 | /* | ||
561 | * In the case of small number of heavy interrupt sources, | ||
562 | * loading some of the cpus too much. We use Ingo's original | ||
563 | * approach to rotate them around. | ||
564 | */ | ||
565 | if (!first_attempt && imbalance >= useful_load_threshold) { | ||
566 | rotate_irqs_among_cpus(useful_load_threshold); | ||
567 | return; | ||
568 | } | ||
569 | goto not_worth_the_effort; | ||
570 | } | ||
571 | |||
572 | first_attempt = 0; /* heaviest search */ | ||
573 | max_cpu_irq = tmp_cpu_irq; /* load */ | ||
574 | max_loaded = tmp_loaded; /* processor */ | ||
575 | imbalance = (max_cpu_irq - min_cpu_irq) / 2; | ||
576 | |||
577 | /* | ||
578 | * if imbalance is less than approx 10% of max load, then | ||
579 | * observe diminishing returns action. - quit | ||
580 | */ | ||
581 | if (imbalance < (max_cpu_irq >> 3)) | ||
582 | goto not_worth_the_effort; | ||
583 | |||
584 | tryanotherirq: | ||
585 | /* if we select an IRQ to move that can't go where we want, then | ||
586 | * see if there is another one to try. | ||
587 | */ | ||
588 | move_this_load = 0; | ||
589 | selected_irq = -1; | ||
590 | for (j = 0; j < NR_IRQS; j++) { | ||
591 | /* Is this an active IRQ? */ | ||
592 | if (!irq_desc[j].action) | ||
593 | continue; | ||
594 | if (imbalance <= IRQ_DELTA(max_loaded, j)) | ||
595 | continue; | ||
596 | /* Try to find the IRQ that is closest to the imbalance | ||
597 | * without going over. | ||
598 | */ | ||
599 | if (move_this_load < IRQ_DELTA(max_loaded, j)) { | ||
600 | move_this_load = IRQ_DELTA(max_loaded, j); | ||
601 | selected_irq = j; | ||
602 | } | ||
603 | } | ||
604 | if (selected_irq == -1) | ||
605 | goto tryanothercpu; | ||
606 | |||
607 | imbalance = move_this_load; | ||
608 | |||
609 | /* For physical_balance case, we accumulated both load | ||
610 | * values in the one of the siblings cpu_irq[], | ||
611 | * to use the same code for physical and logical processors | ||
612 | * as much as possible. | ||
613 | * | ||
614 | * NOTE: the cpu_irq[] array holds the sum of the load for | ||
615 | * sibling A and sibling B in the slot for the lowest numbered | ||
616 | * sibling (A), _AND_ the load for sibling B in the slot for | ||
617 | * the higher numbered sibling. | ||
618 | * | ||
619 | * We seek the least loaded sibling by making the comparison | ||
620 | * (A+B)/2 vs B | ||
621 | */ | ||
622 | load = CPU_IRQ(min_loaded) >> 1; | ||
623 | for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { | ||
624 | if (load > CPU_IRQ(j)) { | ||
625 | /* This won't change cpu_sibling_map[min_loaded] */ | ||
626 | load = CPU_IRQ(j); | ||
627 | min_loaded = j; | ||
628 | } | ||
629 | } | ||
630 | |||
631 | cpus_and(allowed_mask, | ||
632 | cpu_online_map, | ||
633 | balance_irq_affinity[selected_irq]); | ||
634 | target_cpu_mask = cpumask_of_cpu(min_loaded); | ||
635 | cpus_and(tmp, target_cpu_mask, allowed_mask); | ||
636 | |||
637 | if (!cpus_empty(tmp)) { | ||
638 | /* mark for change destination */ | ||
639 | set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); | ||
640 | |||
641 | /* Since we made a change, come back sooner to | ||
642 | * check for more variation. | ||
643 | */ | ||
644 | balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, | ||
645 | balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); | ||
646 | return; | ||
647 | } | ||
648 | goto tryanotherirq; | ||
649 | |||
650 | not_worth_the_effort: | ||
651 | /* | ||
652 | * if we did not find an IRQ to move, then adjust the time interval | ||
653 | * upward | ||
654 | */ | ||
655 | balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, | ||
656 | balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); | ||
657 | return; | ||
658 | } | ||
659 | |||
660 | static int balanced_irq(void *unused) | ||
661 | { | ||
662 | int i; | ||
663 | unsigned long prev_balance_time = jiffies; | ||
664 | long time_remaining = balanced_irq_interval; | ||
665 | |||
666 | /* push everything to CPU 0 to give us a starting point. */ | ||
667 | for (i = 0 ; i < NR_IRQS ; i++) { | ||
668 | irq_desc[i].pending_mask = cpumask_of_cpu(0); | ||
669 | set_pending_irq(i, cpumask_of_cpu(0)); | ||
670 | } | ||
671 | |||
672 | set_freezable(); | ||
673 | for ( ; ; ) { | ||
674 | time_remaining = schedule_timeout_interruptible(time_remaining); | ||
675 | try_to_freeze(); | ||
676 | if (time_after(jiffies, | ||
677 | prev_balance_time+balanced_irq_interval)) { | ||
678 | preempt_disable(); | ||
679 | do_irq_balance(); | ||
680 | prev_balance_time = jiffies; | ||
681 | time_remaining = balanced_irq_interval; | ||
682 | preempt_enable(); | ||
683 | } | ||
684 | } | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | static int __init balanced_irq_init(void) | ||
689 | { | ||
690 | int i; | ||
691 | struct cpuinfo_x86 *c; | ||
692 | cpumask_t tmp; | ||
693 | |||
694 | cpus_shift_right(tmp, cpu_online_map, 2); | ||
695 | c = &boot_cpu_data; | ||
696 | /* When not overwritten by the command line ask subarchitecture. */ | ||
697 | if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) | ||
698 | irqbalance_disabled = NO_BALANCE_IRQ; | ||
699 | if (irqbalance_disabled) | ||
700 | return 0; | ||
701 | |||
702 | /* disable irqbalance completely if there is only one processor online */ | ||
703 | if (num_online_cpus() < 2) { | ||
704 | irqbalance_disabled = 1; | ||
705 | return 0; | ||
706 | } | ||
707 | /* | ||
708 | * Enable physical balance only if more than 1 physical processor | ||
709 | * is present | ||
710 | */ | ||
711 | if (smp_num_siblings > 1 && !cpus_empty(tmp)) | ||
712 | physical_balance = 1; | ||
713 | |||
714 | for_each_online_cpu(i) { | ||
715 | irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | ||
716 | irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); | ||
717 | if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { | ||
718 | printk(KERN_ERR "balanced_irq_init: out of memory"); | ||
719 | goto failed; | ||
720 | } | ||
721 | } | ||
722 | |||
723 | printk(KERN_INFO "Starting balanced_irq\n"); | ||
724 | if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) | ||
725 | return 0; | ||
726 | printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); | ||
727 | failed: | ||
728 | for_each_possible_cpu(i) { | ||
729 | kfree(irq_cpu_data[i].irq_delta); | ||
730 | irq_cpu_data[i].irq_delta = NULL; | ||
731 | kfree(irq_cpu_data[i].last_irq); | ||
732 | irq_cpu_data[i].last_irq = NULL; | ||
733 | } | ||
734 | return 0; | ||
735 | } | ||
736 | |||
737 | int __devinit irqbalance_disable(char *str) | ||
738 | { | ||
739 | irqbalance_disabled = 1; | ||
740 | return 1; | ||
741 | } | ||
742 | |||
743 | __setup("noirqbalance", irqbalance_disable); | ||
744 | |||
745 | late_initcall(balanced_irq_init); | ||
746 | #endif /* CONFIG_IRQBALANCE */ | ||
747 | #endif /* CONFIG_SMP */ | ||
748 | |||
749 | #ifndef CONFIG_SMP | ||
750 | void send_IPI_self(int vector) | ||
751 | { | ||
752 | unsigned int cfg; | ||
753 | |||
754 | /* | ||
755 | * Wait for idle. | ||
756 | */ | ||
757 | apic_wait_icr_idle(); | ||
758 | cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; | ||
759 | /* | ||
760 | * Send the IPI. The write to APIC_ICR fires this off. | ||
761 | */ | ||
762 | apic_write(APIC_ICR, cfg); | ||
763 | } | ||
764 | #endif /* !CONFIG_SMP */ | ||
765 | |||
766 | |||
767 | /* | ||
768 | * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to | ||
769 | * specific CPU-side IRQs. | ||
770 | */ | ||
771 | |||
772 | #define MAX_PIRQS 8 | ||
773 | static int pirq_entries [MAX_PIRQS]; | ||
774 | static int pirqs_enabled; | ||
775 | int skip_ioapic_setup; | ||
776 | |||
777 | static int __init ioapic_pirq_setup(char *str) | ||
778 | { | ||
779 | int i, max; | ||
780 | int ints[MAX_PIRQS+1]; | ||
781 | |||
782 | get_options(str, ARRAY_SIZE(ints), ints); | ||
783 | |||
784 | for (i = 0; i < MAX_PIRQS; i++) | ||
785 | pirq_entries[i] = -1; | ||
786 | |||
787 | pirqs_enabled = 1; | ||
788 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
789 | "PIRQ redirection, working around broken MP-BIOS.\n"); | ||
790 | max = MAX_PIRQS; | ||
791 | if (ints[0] < MAX_PIRQS) | ||
792 | max = ints[0]; | ||
793 | |||
794 | for (i = 0; i < max; i++) { | ||
795 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
796 | "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); | ||
797 | /* | ||
798 | * PIRQs are mapped upside down, usually. | ||
799 | */ | ||
800 | pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; | ||
801 | } | ||
802 | return 1; | ||
803 | } | ||
804 | |||
805 | __setup("pirq=", ioapic_pirq_setup); | ||
806 | |||
807 | /* | ||
808 | * Find the IRQ entry number of a certain pin. | ||
809 | */ | ||
810 | static int find_irq_entry(int apic, int pin, int type) | ||
811 | { | ||
812 | int i; | ||
813 | |||
814 | for (i = 0; i < mp_irq_entries; i++) | ||
815 | if (mp_irqs[i].mp_irqtype == type && | ||
816 | (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || | ||
817 | mp_irqs[i].mp_dstapic == MP_APIC_ALL) && | ||
818 | mp_irqs[i].mp_dstirq == pin) | ||
819 | return i; | ||
820 | |||
821 | return -1; | ||
822 | } | ||
823 | |||
824 | /* | ||
825 | * Find the pin to which IRQ[irq] (ISA) is connected | ||
826 | */ | ||
827 | static int __init find_isa_irq_pin(int irq, int type) | ||
828 | { | ||
829 | int i; | ||
830 | |||
831 | for (i = 0; i < mp_irq_entries; i++) { | ||
832 | int lbus = mp_irqs[i].mp_srcbus; | ||
833 | |||
834 | if (test_bit(lbus, mp_bus_not_pci) && | ||
835 | (mp_irqs[i].mp_irqtype == type) && | ||
836 | (mp_irqs[i].mp_srcbusirq == irq)) | ||
837 | |||
838 | return mp_irqs[i].mp_dstirq; | ||
839 | } | ||
840 | return -1; | ||
841 | } | ||
842 | |||
843 | static int __init find_isa_irq_apic(int irq, int type) | ||
844 | { | ||
845 | int i; | ||
846 | |||
847 | for (i = 0; i < mp_irq_entries; i++) { | ||
848 | int lbus = mp_irqs[i].mp_srcbus; | ||
849 | |||
850 | if (test_bit(lbus, mp_bus_not_pci) && | ||
851 | (mp_irqs[i].mp_irqtype == type) && | ||
852 | (mp_irqs[i].mp_srcbusirq == irq)) | ||
853 | break; | ||
854 | } | ||
855 | if (i < mp_irq_entries) { | ||
856 | int apic; | ||
857 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
858 | if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) | ||
859 | return apic; | ||
860 | } | ||
861 | } | ||
862 | |||
863 | return -1; | ||
864 | } | ||
865 | |||
866 | /* | ||
867 | * Find a specific PCI IRQ entry. | ||
868 | * Not an __init, possibly needed by modules | ||
869 | */ | ||
870 | static int pin_2_irq(int idx, int apic, int pin); | ||
871 | |||
872 | int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) | ||
873 | { | ||
874 | int apic, i, best_guess = -1; | ||
875 | |||
876 | apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " | ||
877 | "slot:%d, pin:%d.\n", bus, slot, pin); | ||
878 | if (test_bit(bus, mp_bus_not_pci)) { | ||
879 | printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); | ||
880 | return -1; | ||
881 | } | ||
882 | for (i = 0; i < mp_irq_entries; i++) { | ||
883 | int lbus = mp_irqs[i].mp_srcbus; | ||
884 | |||
885 | for (apic = 0; apic < nr_ioapics; apic++) | ||
886 | if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || | ||
887 | mp_irqs[i].mp_dstapic == MP_APIC_ALL) | ||
888 | break; | ||
889 | |||
890 | if (!test_bit(lbus, mp_bus_not_pci) && | ||
891 | !mp_irqs[i].mp_irqtype && | ||
892 | (bus == lbus) && | ||
893 | (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { | ||
894 | int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); | ||
895 | |||
896 | if (!(apic || IO_APIC_IRQ(irq))) | ||
897 | continue; | ||
898 | |||
899 | if (pin == (mp_irqs[i].mp_srcbusirq & 3)) | ||
900 | return irq; | ||
901 | /* | ||
902 | * Use the first all-but-pin matching entry as a | ||
903 | * best-guess fuzzy result for broken mptables. | ||
904 | */ | ||
905 | if (best_guess < 0) | ||
906 | best_guess = irq; | ||
907 | } | ||
908 | } | ||
909 | return best_guess; | ||
910 | } | ||
911 | EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); | ||
912 | |||
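For reference, a small user-space sketch (not part of this file) of how the slot/pin comparison above decodes a PCI mp_srcbusirq value: bits 6..2 carry the device (slot) number and bits 1..0 the INTx pin. The example value is made up.

#include <stdio.h>

int main(void)
{
	unsigned int srcbusirq = (3 << 2) | 1;		/* example: device 3, INTB */
	unsigned int slot = (srcbusirq >> 2) & 0x1f;	/* same shift/mask as above */
	unsigned int pin  = srcbusirq & 3;

	printf("srcbusirq=0x%02x -> slot %u, INT%c\n", srcbusirq, slot, 'A' + pin);
	return 0;
}
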
913 | /* | ||
914 | * This function currently is only a helper for the i386 smp boot process where | ||
915 | * we need to reprogram the ioredtbls to cater for the cpus which have come online | ||
916 | * so mask in all cases should simply be TARGET_CPUS | ||
917 | */ | ||
918 | #ifdef CONFIG_SMP | ||
919 | void __init setup_ioapic_dest(void) | ||
920 | { | ||
921 | int pin, ioapic, irq, irq_entry; | ||
922 | |||
923 | if (skip_ioapic_setup == 1) | ||
924 | return; | ||
925 | |||
926 | for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { | ||
927 | for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { | ||
928 | irq_entry = find_irq_entry(ioapic, pin, mp_INT); | ||
929 | if (irq_entry == -1) | ||
930 | continue; | ||
931 | irq = pin_2_irq(irq_entry, ioapic, pin); | ||
932 | set_ioapic_affinity_irq(irq, TARGET_CPUS); | ||
933 | } | ||
934 | |||
935 | } | ||
936 | } | ||
937 | #endif | ||
938 | |||
939 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | ||
940 | /* | ||
941 | * EISA Edge/Level control register, ELCR | ||
942 | */ | ||
943 | static int EISA_ELCR(unsigned int irq) | ||
944 | { | ||
945 | if (irq < 16) { | ||
946 | unsigned int port = 0x4d0 + (irq >> 3); | ||
947 | return (inb(port) >> (irq & 7)) & 1; | ||
948 | } | ||
949 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
950 | "Broken MPtable reports ISA irq %d\n", irq); | ||
951 | return 0; | ||
952 | } | ||
953 | #endif | ||
954 | |||
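The ELCR is a pair of I/O ports (0x4d0 for IRQs 0-7, 0x4d1 for IRQs 8-15) with one bit per ISA IRQ; below is a minimal sketch of the port/bit arithmetic used by EISA_ELCR() above (user space only, no actual port access).

#include <stdio.h>

int main(void)
{
	unsigned int irq;

	for (irq = 0; irq < 16; irq++)
		printf("IRQ%-2u -> ELCR port 0x%x, bit %u\n",
		       irq, 0x4d0 + (irq >> 3), irq & 7);
	return 0;
}
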
955 | /* ISA interrupts are always polarity zero edge triggered, | ||
956 | * when listed as conforming in the MP table. */ | ||
957 | |||
958 | #define default_ISA_trigger(idx) (0) | ||
959 | #define default_ISA_polarity(idx) (0) | ||
960 | |||
961 | /* EISA interrupts are always polarity zero and can be edge or level | ||
962 | * trigger depending on the ELCR value. If an interrupt is listed as | ||
963 | * EISA conforming in the MP table, that means its trigger type must | ||
964 | * be read in from the ELCR */ | ||
965 | |||
966 | #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) | ||
967 | #define default_EISA_polarity(idx) default_ISA_polarity(idx) | ||
968 | |||
969 | /* PCI interrupts are always polarity one level triggered, | ||
970 | * when listed as conforming in the MP table. */ | ||
971 | |||
972 | #define default_PCI_trigger(idx) (1) | ||
973 | #define default_PCI_polarity(idx) (1) | ||
974 | |||
975 | /* MCA interrupts are always polarity zero level triggered, | ||
976 | * when listed as conforming in the MP table. */ | ||
977 | |||
978 | #define default_MCA_trigger(idx) (1) | ||
979 | #define default_MCA_polarity(idx) default_ISA_polarity(idx) | ||
980 | |||
981 | static int MPBIOS_polarity(int idx) | ||
982 | { | ||
983 | int bus = mp_irqs[idx].mp_srcbus; | ||
984 | int polarity; | ||
985 | |||
986 | /* | ||
987 | * Determine IRQ line polarity (high active or low active): | ||
988 | */ | ||
989 | switch (mp_irqs[idx].mp_irqflag & 3) { | ||
990 | case 0: /* conforms, i.e. bus-type dependent polarity */ | ||
991 | { | ||
992 | polarity = test_bit(bus, mp_bus_not_pci)? | ||
993 | default_ISA_polarity(idx): | ||
994 | default_PCI_polarity(idx); | ||
995 | break; | ||
996 | } | ||
997 | case 1: /* high active */ | ||
998 | { | ||
999 | polarity = 0; | ||
1000 | break; | ||
1001 | } | ||
1002 | case 2: /* reserved */ | ||
1003 | { | ||
1004 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1005 | polarity = 1; | ||
1006 | break; | ||
1007 | } | ||
1008 | case 3: /* low active */ | ||
1009 | { | ||
1010 | polarity = 1; | ||
1011 | break; | ||
1012 | } | ||
1013 | default: /* invalid */ | ||
1014 | { | ||
1015 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1016 | polarity = 1; | ||
1017 | break; | ||
1018 | } | ||
1019 | } | ||
1020 | return polarity; | ||
1021 | } | ||
1022 | |||
1023 | static int MPBIOS_trigger(int idx) | ||
1024 | { | ||
1025 | int bus = mp_irqs[idx].mp_srcbus; | ||
1026 | int trigger; | ||
1027 | |||
1028 | /* | ||
1029 | * Determine IRQ trigger mode (edge or level sensitive): | ||
1030 | */ | ||
1031 | switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { | ||
1032 | case 0: /* conforms, i.e. bus-type dependent */ | ||
1033 | { | ||
1034 | trigger = test_bit(bus, mp_bus_not_pci)? | ||
1035 | default_ISA_trigger(idx): | ||
1036 | default_PCI_trigger(idx); | ||
1037 | #if defined(CONFIG_EISA) || defined(CONFIG_MCA) | ||
1038 | switch (mp_bus_id_to_type[bus]) { | ||
1039 | case MP_BUS_ISA: /* ISA pin */ | ||
1040 | { | ||
1041 | /* set before the switch */ | ||
1042 | break; | ||
1043 | } | ||
1044 | case MP_BUS_EISA: /* EISA pin */ | ||
1045 | { | ||
1046 | trigger = default_EISA_trigger(idx); | ||
1047 | break; | ||
1048 | } | ||
1049 | case MP_BUS_PCI: /* PCI pin */ | ||
1050 | { | ||
1051 | /* set before the switch */ | ||
1052 | break; | ||
1053 | } | ||
1054 | case MP_BUS_MCA: /* MCA pin */ | ||
1055 | { | ||
1056 | trigger = default_MCA_trigger(idx); | ||
1057 | break; | ||
1058 | } | ||
1059 | default: | ||
1060 | { | ||
1061 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1062 | trigger = 1; | ||
1063 | break; | ||
1064 | } | ||
1065 | } | ||
1066 | #endif | ||
1067 | break; | ||
1068 | } | ||
1069 | case 1: /* edge */ | ||
1070 | { | ||
1071 | trigger = 0; | ||
1072 | break; | ||
1073 | } | ||
1074 | case 2: /* reserved */ | ||
1075 | { | ||
1076 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1077 | trigger = 1; | ||
1078 | break; | ||
1079 | } | ||
1080 | case 3: /* level */ | ||
1081 | { | ||
1082 | trigger = 1; | ||
1083 | break; | ||
1084 | } | ||
1085 | default: /* invalid */ | ||
1086 | { | ||
1087 | printk(KERN_WARNING "broken BIOS!!\n"); | ||
1088 | trigger = 0; | ||
1089 | break; | ||
1090 | } | ||
1091 | } | ||
1092 | return trigger; | ||
1093 | } | ||
1094 | |||
1095 | static inline int irq_polarity(int idx) | ||
1096 | { | ||
1097 | return MPBIOS_polarity(idx); | ||
1098 | } | ||
1099 | |||
1100 | static inline int irq_trigger(int idx) | ||
1101 | { | ||
1102 | return MPBIOS_trigger(idx); | ||
1103 | } | ||
1104 | |||
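As a side note, a minimal user-space sketch (not part of this file) of how the MP-table irqflag word splits into the two fields decoded by MPBIOS_polarity() and MPBIOS_trigger() above; the example flag value is made up.

#include <stdio.h>

int main(void)
{
	unsigned short irqflag = 0x000f;	/* example: low active (3), level (3) */
	unsigned int polarity = irqflag & 3;		/* 0=conforms 1=high 2=reserved 3=low */
	unsigned int trigger  = (irqflag >> 2) & 3;	/* 0=conforms 1=edge 2=reserved 3=level */

	printf("irqflag=0x%04x -> polarity field %u, trigger field %u\n",
	       irqflag, polarity, trigger);
	return 0;
}
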
1105 | static int pin_2_irq(int idx, int apic, int pin) | ||
1106 | { | ||
1107 | int irq, i; | ||
1108 | int bus = mp_irqs[idx].mp_srcbus; | ||
1109 | |||
1110 | /* | ||
1111 | * Debugging check: we are in big trouble if this message pops up! | ||
1112 | */ | ||
1113 | if (mp_irqs[idx].mp_dstirq != pin) | ||
1114 | printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); | ||
1115 | |||
1116 | if (test_bit(bus, mp_bus_not_pci)) | ||
1117 | irq = mp_irqs[idx].mp_srcbusirq; | ||
1118 | else { | ||
1119 | /* | ||
1120 | * PCI IRQs are mapped in order | ||
1121 | */ | ||
1122 | i = irq = 0; | ||
1123 | while (i < apic) | ||
1124 | irq += nr_ioapic_registers[i++]; | ||
1125 | irq += pin; | ||
1126 | |||
1127 | /* | ||
1128 | * For MPS mode, so far only needed by ES7000 platform | ||
1129 | */ | ||
1130 | if (ioapic_renumber_irq) | ||
1131 | irq = ioapic_renumber_irq(apic, irq); | ||
1132 | } | ||
1133 | |||
1134 | /* | ||
1135 | * PCI IRQ command line redirection. Yes, limits are hardcoded. | ||
1136 | */ | ||
1137 | if ((pin >= 16) && (pin <= 23)) { | ||
1138 | if (pirq_entries[pin-16] != -1) { | ||
1139 | if (!pirq_entries[pin-16]) { | ||
1140 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1141 | "disabling PIRQ%d\n", pin-16); | ||
1142 | } else { | ||
1143 | irq = pirq_entries[pin-16]; | ||
1144 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1145 | "using PIRQ%d -> IRQ %d\n", | ||
1146 | pin-16, irq); | ||
1147 | } | ||
1148 | } | ||
1149 | } | ||
1150 | return irq; | ||
1151 | } | ||
1152 | |||
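For context, a small user-space sketch (not part of this file) of the flat IRQ numbering pin_2_irq() above uses for PCI interrupts: the pins of all IO-APICs are concatenated, so the IRQ number is the sum of the register counts of all preceding IO-APICs plus the pin. The register counts here are example values.

#include <stdio.h>

int main(void)
{
	int nr_ioapic_registers[] = { 24, 24 };	/* pins per IO-APIC (example values) */
	int apic = 1, pin = 5;			/* pin 5 on the second IO-APIC */
	int irq = 0, i;

	for (i = 0; i < apic; i++)		/* skip over the earlier IO-APICs */
		irq += nr_ioapic_registers[i];
	irq += pin;

	printf("apic %d pin %d -> IRQ %d\n", apic, pin, irq);
	return 0;
}
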
1153 | static inline int IO_APIC_irq_trigger(int irq) | ||
1154 | { | ||
1155 | int apic, idx, pin; | ||
1156 | |||
1157 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1158 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1159 | idx = find_irq_entry(apic, pin, mp_INT); | ||
1160 | if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) | ||
1161 | return irq_trigger(idx); | ||
1162 | } | ||
1163 | } | ||
1164 | /* | ||
1165 | * nonexistent IRQs are edge default | ||
1166 | */ | ||
1167 | return 0; | ||
1168 | } | ||
1169 | |||
1170 | /* irq_vector is indexed by the sum of all RTEs in all I/O APICs. */ | ||
1171 | static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; | ||
1172 | |||
1173 | static int __assign_irq_vector(int irq) | ||
1174 | { | ||
1175 | static int current_vector = FIRST_DEVICE_VECTOR, current_offset; | ||
1176 | int vector, offset; | ||
1177 | |||
1178 | BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); | ||
1179 | |||
1180 | if (irq_vector[irq] > 0) | ||
1181 | return irq_vector[irq]; | ||
1182 | |||
1183 | vector = current_vector; | ||
1184 | offset = current_offset; | ||
1185 | next: | ||
1186 | vector += 8; | ||
1187 | if (vector >= first_system_vector) { | ||
1188 | offset = (offset + 1) % 8; | ||
1189 | vector = FIRST_DEVICE_VECTOR + offset; | ||
1190 | } | ||
1191 | if (vector == current_vector) | ||
1192 | return -ENOSPC; | ||
1193 | if (test_and_set_bit(vector, used_vectors)) | ||
1194 | goto next; | ||
1195 | |||
1196 | current_vector = vector; | ||
1197 | current_offset = offset; | ||
1198 | irq_vector[irq] = vector; | ||
1199 | |||
1200 | return vector; | ||
1201 | } | ||
1202 | |||
1203 | static int assign_irq_vector(int irq) | ||
1204 | { | ||
1205 | unsigned long flags; | ||
1206 | int vector; | ||
1207 | |||
1208 | spin_lock_irqsave(&vector_lock, flags); | ||
1209 | vector = __assign_irq_vector(irq); | ||
1210 | spin_unlock_irqrestore(&vector_lock, flags); | ||
1211 | |||
1212 | return vector; | ||
1213 | } | ||
1214 | |||
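As an illustration, a user-space sketch (not part of this file) of the stride-8 walk __assign_irq_vector() performs: advancing by 8 keeps consecutive allocations in different priority classes (vector >> 4). The FIRST_DEVICE_VECTOR/first_system_vector values below are assumptions, and no used_vectors bookkeeping is modelled.

#include <stdio.h>

#define FIRST_DEVICE_VECTOR	0x31	/* assumed */
#define FIRST_SYSTEM_VECTOR	0xef	/* assumed stand-in for first_system_vector */

int main(void)
{
	int vector = FIRST_DEVICE_VECTOR, offset = 0;
	int i;

	for (i = 0; i < 8; i++) {
		vector += 8;			/* same stride as above */
		if (vector >= FIRST_SYSTEM_VECTOR) {
			offset = (offset + 1) % 8;
			vector = FIRST_DEVICE_VECTOR + offset;
		}
		printf("allocation %d -> vector 0x%02x (priority class %d)\n",
		       i, vector, vector >> 4);
	}
	return 0;
}
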
1215 | static struct irq_chip ioapic_chip; | ||
1216 | |||
1217 | #define IOAPIC_AUTO -1 | ||
1218 | #define IOAPIC_EDGE 0 | ||
1219 | #define IOAPIC_LEVEL 1 | ||
1220 | |||
1221 | static void ioapic_register_intr(int irq, int vector, unsigned long trigger) | ||
1222 | { | ||
1223 | if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || | ||
1224 | trigger == IOAPIC_LEVEL) { | ||
1225 | irq_desc[irq].status |= IRQ_LEVEL; | ||
1226 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
1227 | handle_fasteoi_irq, "fasteoi"); | ||
1228 | } else { | ||
1229 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
1230 | set_irq_chip_and_handler_name(irq, &ioapic_chip, | ||
1231 | handle_edge_irq, "edge"); | ||
1232 | } | ||
1233 | set_intr_gate(vector, interrupt[irq]); | ||
1234 | } | ||
1235 | |||
1236 | static void __init setup_IO_APIC_irqs(void) | ||
1237 | { | ||
1238 | struct IO_APIC_route_entry entry; | ||
1239 | int apic, pin, idx, irq, first_notcon = 1, vector; | ||
1240 | |||
1241 | apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); | ||
1242 | |||
1243 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1244 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1245 | |||
1246 | /* | ||
1247 | * add it to the IO-APIC irq-routing table: | ||
1248 | */ | ||
1249 | memset(&entry, 0, sizeof(entry)); | ||
1250 | |||
1251 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
1252 | entry.dest_mode = INT_DEST_MODE; | ||
1253 | entry.mask = 0; /* enable IRQ */ | ||
1254 | entry.dest.logical.logical_dest = | ||
1255 | cpu_mask_to_apicid(TARGET_CPUS); | ||
1256 | |||
1257 | idx = find_irq_entry(apic, pin, mp_INT); | ||
1258 | if (idx == -1) { | ||
1259 | if (first_notcon) { | ||
1260 | apic_printk(APIC_VERBOSE, KERN_DEBUG | ||
1261 | " IO-APIC (apicid-pin) %d-%d", | ||
1262 | mp_ioapics[apic].mp_apicid, | ||
1263 | pin); | ||
1264 | first_notcon = 0; | ||
1265 | } else | ||
1266 | apic_printk(APIC_VERBOSE, ", %d-%d", | ||
1267 | mp_ioapics[apic].mp_apicid, pin); | ||
1268 | continue; | ||
1269 | } | ||
1270 | |||
1271 | if (!first_notcon) { | ||
1272 | apic_printk(APIC_VERBOSE, " not connected.\n"); | ||
1273 | first_notcon = 1; | ||
1274 | } | ||
1275 | |||
1276 | entry.trigger = irq_trigger(idx); | ||
1277 | entry.polarity = irq_polarity(idx); | ||
1278 | |||
1279 | if (irq_trigger(idx)) { | ||
1280 | entry.trigger = 1; | ||
1281 | entry.mask = 1; | ||
1282 | } | ||
1283 | |||
1284 | irq = pin_2_irq(idx, apic, pin); | ||
1285 | /* | ||
1286 | * skip adding the timer int on secondary nodes, which causes | ||
1287 | * a small but painful rift in the time-space continuum | ||
1288 | */ | ||
1289 | if (multi_timer_check(apic, irq)) | ||
1290 | continue; | ||
1291 | else | ||
1292 | add_pin_to_irq(irq, apic, pin); | ||
1293 | |||
1294 | if (!apic && !IO_APIC_IRQ(irq)) | ||
1295 | continue; | ||
1296 | |||
1297 | if (IO_APIC_IRQ(irq)) { | ||
1298 | vector = assign_irq_vector(irq); | ||
1299 | entry.vector = vector; | ||
1300 | ioapic_register_intr(irq, vector, IOAPIC_AUTO); | ||
1301 | |||
1302 | if (!apic && (irq < 16)) | ||
1303 | disable_8259A_irq(irq); | ||
1304 | } | ||
1305 | ioapic_write_entry(apic, pin, entry); | ||
1306 | } | ||
1307 | } | ||
1308 | |||
1309 | if (!first_notcon) | ||
1310 | apic_printk(APIC_VERBOSE, " not connected.\n"); | ||
1311 | } | ||
1312 | |||
1313 | /* | ||
1314 | * Set up the timer pin, possibly with the 8259A-master behind. | ||
1315 | */ | ||
1316 | static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, | ||
1317 | int vector) | ||
1318 | { | ||
1319 | struct IO_APIC_route_entry entry; | ||
1320 | |||
1321 | memset(&entry, 0, sizeof(entry)); | ||
1322 | |||
1323 | /* | ||
1324 | * We use logical delivery to get the timer IRQ | ||
1325 | * to the first CPU. | ||
1326 | */ | ||
1327 | entry.dest_mode = INT_DEST_MODE; | ||
1328 | entry.mask = 1; /* mask IRQ now */ | ||
1329 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
1330 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
1331 | entry.polarity = 0; | ||
1332 | entry.trigger = 0; | ||
1333 | entry.vector = vector; | ||
1334 | |||
1335 | /* | ||
1336 | * The timer IRQ doesn't have to know that behind the | ||
1337 | * scenes we may have an 8259A-master in AEOI mode ... | ||
1338 | */ | ||
1339 | ioapic_register_intr(0, vector, IOAPIC_EDGE); | ||
1340 | |||
1341 | /* | ||
1342 | * Add it to the IO-APIC irq-routing table: | ||
1343 | */ | ||
1344 | ioapic_write_entry(apic, pin, entry); | ||
1345 | } | ||
1346 | |||
1347 | |||
1348 | __apicdebuginit(void) print_IO_APIC(void) | ||
1349 | { | ||
1350 | int apic, i; | ||
1351 | union IO_APIC_reg_00 reg_00; | ||
1352 | union IO_APIC_reg_01 reg_01; | ||
1353 | union IO_APIC_reg_02 reg_02; | ||
1354 | union IO_APIC_reg_03 reg_03; | ||
1355 | unsigned long flags; | ||
1356 | |||
1357 | if (apic_verbosity == APIC_QUIET) | ||
1358 | return; | ||
1359 | |||
1360 | printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); | ||
1361 | for (i = 0; i < nr_ioapics; i++) | ||
1362 | printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", | ||
1363 | mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); | ||
1364 | |||
1365 | /* | ||
1366 | * We are a bit conservative about what we expect. We have to | ||
1367 | * know about every hardware change ASAP. | ||
1368 | */ | ||
1369 | printk(KERN_INFO "testing the IO APIC.......................\n"); | ||
1370 | |||
1371 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1372 | |||
1373 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1374 | reg_00.raw = io_apic_read(apic, 0); | ||
1375 | reg_01.raw = io_apic_read(apic, 1); | ||
1376 | if (reg_01.bits.version >= 0x10) | ||
1377 | reg_02.raw = io_apic_read(apic, 2); | ||
1378 | if (reg_01.bits.version >= 0x20) | ||
1379 | reg_03.raw = io_apic_read(apic, 3); | ||
1380 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1381 | |||
1382 | printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); | ||
1383 | printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); | ||
1384 | printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); | ||
1385 | printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); | ||
1386 | printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); | ||
1387 | |||
1388 | printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); | ||
1389 | printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); | ||
1390 | |||
1391 | printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); | ||
1392 | printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); | ||
1393 | |||
1394 | /* | ||
1395 | * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, | ||
1396 | * but the value of reg_02 is read as the previous read register | ||
1397 | * value, so ignore it if reg_02 == reg_01. | ||
1398 | */ | ||
1399 | if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { | ||
1400 | printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); | ||
1401 | printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); | ||
1402 | } | ||
1403 | |||
1404 | /* | ||
1405 | * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 | ||
1406 | * or reg_03, but the value of reg_0[23] is read as the previous read | ||
1407 | * register value, so ignore it if reg_03 == reg_0[12]. | ||
1408 | */ | ||
1409 | if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && | ||
1410 | reg_03.raw != reg_01.raw) { | ||
1411 | printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); | ||
1412 | printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); | ||
1413 | } | ||
1414 | |||
1415 | printk(KERN_DEBUG ".... IRQ redirection table:\n"); | ||
1416 | |||
1417 | printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" | ||
1418 | " Stat Dest Deli Vect: \n"); | ||
1419 | |||
1420 | for (i = 0; i <= reg_01.bits.entries; i++) { | ||
1421 | struct IO_APIC_route_entry entry; | ||
1422 | |||
1423 | entry = ioapic_read_entry(apic, i); | ||
1424 | |||
1425 | printk(KERN_DEBUG " %02x %03X %02X ", | ||
1426 | i, | ||
1427 | entry.dest.logical.logical_dest, | ||
1428 | entry.dest.physical.physical_dest | ||
1429 | ); | ||
1430 | |||
1431 | printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", | ||
1432 | entry.mask, | ||
1433 | entry.trigger, | ||
1434 | entry.irr, | ||
1435 | entry.polarity, | ||
1436 | entry.delivery_status, | ||
1437 | entry.dest_mode, | ||
1438 | entry.delivery_mode, | ||
1439 | entry.vector | ||
1440 | ); | ||
1441 | } | ||
1442 | } | ||
1443 | printk(KERN_DEBUG "IRQ to pin mappings:\n"); | ||
1444 | for (i = 0; i < NR_IRQS; i++) { | ||
1445 | struct irq_pin_list *entry = irq_2_pin + i; | ||
1446 | if (entry->pin < 0) | ||
1447 | continue; | ||
1448 | printk(KERN_DEBUG "IRQ%d ", i); | ||
1449 | for (;;) { | ||
1450 | printk("-> %d:%d", entry->apic, entry->pin); | ||
1451 | if (!entry->next) | ||
1452 | break; | ||
1453 | entry = irq_2_pin + entry->next; | ||
1454 | } | ||
1455 | printk("\n"); | ||
1456 | } | ||
1457 | |||
1458 | printk(KERN_INFO ".................................... done.\n"); | ||
1459 | |||
1460 | return; | ||
1461 | } | ||
1462 | |||
1463 | __apicdebuginit(void) print_APIC_bitfield(int base) | ||
1464 | { | ||
1465 | unsigned int v; | ||
1466 | int i, j; | ||
1467 | |||
1468 | if (apic_verbosity == APIC_QUIET) | ||
1469 | return; | ||
1470 | |||
1471 | printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); | ||
1472 | for (i = 0; i < 8; i++) { | ||
1473 | v = apic_read(base + i*0x10); | ||
1474 | for (j = 0; j < 32; j++) { | ||
1475 | if (v & (1<<j)) | ||
1476 | printk("1"); | ||
1477 | else | ||
1478 | printk("0"); | ||
1479 | } | ||
1480 | printk("\n"); | ||
1481 | } | ||
1482 | } | ||
1483 | |||
1484 | __apicdebuginit(void) print_local_APIC(void *dummy) | ||
1485 | { | ||
1486 | unsigned int v, ver, maxlvt; | ||
1487 | u64 icr; | ||
1488 | |||
1489 | if (apic_verbosity == APIC_QUIET) | ||
1490 | return; | ||
1491 | |||
1492 | printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", | ||
1493 | smp_processor_id(), hard_smp_processor_id()); | ||
1494 | v = apic_read(APIC_ID); | ||
1495 | printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, | ||
1496 | GET_APIC_ID(v)); | ||
1497 | v = apic_read(APIC_LVR); | ||
1498 | printk(KERN_INFO "... APIC VERSION: %08x\n", v); | ||
1499 | ver = GET_APIC_VERSION(v); | ||
1500 | maxlvt = lapic_get_maxlvt(); | ||
1501 | |||
1502 | v = apic_read(APIC_TASKPRI); | ||
1503 | printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); | ||
1504 | |||
1505 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
1506 | v = apic_read(APIC_ARBPRI); | ||
1507 | printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, | ||
1508 | v & APIC_ARBPRI_MASK); | ||
1509 | v = apic_read(APIC_PROCPRI); | ||
1510 | printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); | ||
1511 | } | ||
1512 | |||
1513 | v = apic_read(APIC_EOI); | ||
1514 | printk(KERN_DEBUG "... APIC EOI: %08x\n", v); | ||
1515 | v = apic_read(APIC_RRR); | ||
1516 | printk(KERN_DEBUG "... APIC RRR: %08x\n", v); | ||
1517 | v = apic_read(APIC_LDR); | ||
1518 | printk(KERN_DEBUG "... APIC LDR: %08x\n", v); | ||
1519 | v = apic_read(APIC_DFR); | ||
1520 | printk(KERN_DEBUG "... APIC DFR: %08x\n", v); | ||
1521 | v = apic_read(APIC_SPIV); | ||
1522 | printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); | ||
1523 | |||
1524 | printk(KERN_DEBUG "... APIC ISR field:\n"); | ||
1525 | print_APIC_bitfield(APIC_ISR); | ||
1526 | printk(KERN_DEBUG "... APIC TMR field:\n"); | ||
1527 | print_APIC_bitfield(APIC_TMR); | ||
1528 | printk(KERN_DEBUG "... APIC IRR field:\n"); | ||
1529 | print_APIC_bitfield(APIC_IRR); | ||
1530 | |||
1531 | if (APIC_INTEGRATED(ver)) { /* !82489DX */ | ||
1532 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | ||
1533 | apic_write(APIC_ESR, 0); | ||
1534 | v = apic_read(APIC_ESR); | ||
1535 | printk(KERN_DEBUG "... APIC ESR: %08x\n", v); | ||
1536 | } | ||
1537 | |||
1538 | icr = apic_icr_read(); | ||
1539 | printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); | ||
1540 | printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); | ||
1541 | |||
1542 | v = apic_read(APIC_LVTT); | ||
1543 | printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); | ||
1544 | |||
1545 | if (maxlvt > 3) { /* PC is LVT#4. */ | ||
1546 | v = apic_read(APIC_LVTPC); | ||
1547 | printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); | ||
1548 | } | ||
1549 | v = apic_read(APIC_LVT0); | ||
1550 | printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); | ||
1551 | v = apic_read(APIC_LVT1); | ||
1552 | printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); | ||
1553 | |||
1554 | if (maxlvt > 2) { /* ERR is LVT#3. */ | ||
1555 | v = apic_read(APIC_LVTERR); | ||
1556 | printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); | ||
1557 | } | ||
1558 | |||
1559 | v = apic_read(APIC_TMICT); | ||
1560 | printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); | ||
1561 | v = apic_read(APIC_TMCCT); | ||
1562 | printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); | ||
1563 | v = apic_read(APIC_TDCR); | ||
1564 | printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); | ||
1565 | printk("\n"); | ||
1566 | } | ||
1567 | |||
1568 | __apicdebuginit(void) print_all_local_APICs(void) | ||
1569 | { | ||
1570 | on_each_cpu(print_local_APIC, NULL, 1); | ||
1571 | } | ||
1572 | |||
1573 | __apicdebuginit(void) print_PIC(void) | ||
1574 | { | ||
1575 | unsigned int v; | ||
1576 | unsigned long flags; | ||
1577 | |||
1578 | if (apic_verbosity == APIC_QUIET) | ||
1579 | return; | ||
1580 | |||
1581 | printk(KERN_DEBUG "\nprinting PIC contents\n"); | ||
1582 | |||
1583 | spin_lock_irqsave(&i8259A_lock, flags); | ||
1584 | |||
1585 | v = inb(0xa1) << 8 | inb(0x21); | ||
1586 | printk(KERN_DEBUG "... PIC IMR: %04x\n", v); | ||
1587 | |||
1588 | v = inb(0xa0) << 8 | inb(0x20); | ||
1589 | printk(KERN_DEBUG "... PIC IRR: %04x\n", v); | ||
1590 | |||
1591 | outb(0x0b, 0xa0); | ||
1592 | outb(0x0b, 0x20); | ||
1593 | v = inb(0xa0) << 8 | inb(0x20); | ||
1594 | outb(0x0a, 0xa0); | ||
1595 | outb(0x0a, 0x20); | ||
1596 | |||
1597 | spin_unlock_irqrestore(&i8259A_lock, flags); | ||
1598 | |||
1599 | printk(KERN_DEBUG "... PIC ISR: %04x\n", v); | ||
1600 | |||
1601 | v = inb(0x4d1) << 8 | inb(0x4d0); | ||
1602 | printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); | ||
1603 | } | ||
1604 | |||
1605 | __apicdebuginit(int) print_all_ICs(void) | ||
1606 | { | ||
1607 | print_PIC(); | ||
1608 | print_all_local_APICs(); | ||
1609 | print_IO_APIC(); | ||
1610 | |||
1611 | return 0; | ||
1612 | } | ||
1613 | |||
1614 | fs_initcall(print_all_ICs); | ||
1615 | |||
1616 | |||
1617 | static void __init enable_IO_APIC(void) | ||
1618 | { | ||
1619 | union IO_APIC_reg_01 reg_01; | ||
1620 | int i8259_apic, i8259_pin; | ||
1621 | int i, apic; | ||
1622 | unsigned long flags; | ||
1623 | |||
1624 | for (i = 0; i < PIN_MAP_SIZE; i++) { | ||
1625 | irq_2_pin[i].pin = -1; | ||
1626 | irq_2_pin[i].next = 0; | ||
1627 | } | ||
1628 | if (!pirqs_enabled) | ||
1629 | for (i = 0; i < MAX_PIRQS; i++) | ||
1630 | pirq_entries[i] = -1; | ||
1631 | |||
1632 | /* | ||
1633 | * The number of IO-APIC IRQ registers (== #pins): | ||
1634 | */ | ||
1635 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1636 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1637 | reg_01.raw = io_apic_read(apic, 1); | ||
1638 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1639 | nr_ioapic_registers[apic] = reg_01.bits.entries+1; | ||
1640 | } | ||
1641 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1642 | int pin; | ||
1643 | /* See if any of the pins is in ExtINT mode */ | ||
1644 | for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { | ||
1645 | struct IO_APIC_route_entry entry; | ||
1646 | entry = ioapic_read_entry(apic, pin); | ||
1647 | |||
1648 | |||
1649 | /* If the interrupt line is enabled and in ExtInt mode | ||
1650 | * I have found the pin where the i8259 is connected. | ||
1651 | */ | ||
1652 | if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { | ||
1653 | ioapic_i8259.apic = apic; | ||
1654 | ioapic_i8259.pin = pin; | ||
1655 | goto found_i8259; | ||
1656 | } | ||
1657 | } | ||
1658 | } | ||
1659 | found_i8259: | ||
1660 | /* Look to see if the MP table has reported the ExtINT */ | ||
1661 | /* If we could not find the appropriate pin by looking at the ioapic | ||
1662 | * the i8259 probably is not connected to the ioapic, but give the | ||
1663 | * mptable a chance anyway. | ||
1664 | */ | ||
1665 | i8259_pin = find_isa_irq_pin(0, mp_ExtINT); | ||
1666 | i8259_apic = find_isa_irq_apic(0, mp_ExtINT); | ||
1667 | /* Trust the MP table if nothing is setup in the hardware */ | ||
1668 | if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { | ||
1669 | printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); | ||
1670 | ioapic_i8259.pin = i8259_pin; | ||
1671 | ioapic_i8259.apic = i8259_apic; | ||
1672 | } | ||
1673 | /* Complain if the MP table and the hardware disagree */ | ||
1674 | if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && | ||
1675 | (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) | ||
1676 | { | ||
1677 | printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); | ||
1678 | } | ||
1679 | |||
1680 | /* | ||
1681 | * Do not trust the IO-APIC being empty at bootup | ||
1682 | */ | ||
1683 | clear_IO_APIC(); | ||
1684 | } | ||
1685 | |||
1686 | /* | ||
1687 | * Not an __init, needed by the reboot code | ||
1688 | */ | ||
1689 | void disable_IO_APIC(void) | ||
1690 | { | ||
1691 | /* | ||
1692 | * Clear the IO-APIC before rebooting: | ||
1693 | */ | ||
1694 | clear_IO_APIC(); | ||
1695 | |||
1696 | /* | ||
1697 | * If the i8259 is routed through an IOAPIC, | ||
1698 | * put that IOAPIC in virtual wire mode | ||
1699 | * so legacy interrupts can be delivered. | ||
1700 | */ | ||
1701 | if (ioapic_i8259.pin != -1) { | ||
1702 | struct IO_APIC_route_entry entry; | ||
1703 | |||
1704 | memset(&entry, 0, sizeof(entry)); | ||
1705 | entry.mask = 0; /* Enabled */ | ||
1706 | entry.trigger = 0; /* Edge */ | ||
1707 | entry.irr = 0; | ||
1708 | entry.polarity = 0; /* High */ | ||
1709 | entry.delivery_status = 0; | ||
1710 | entry.dest_mode = 0; /* Physical */ | ||
1711 | entry.delivery_mode = dest_ExtINT; /* ExtInt */ | ||
1712 | entry.vector = 0; | ||
1713 | entry.dest.physical.physical_dest = read_apic_id(); | ||
1714 | |||
1715 | /* | ||
1716 | * Add it to the IO-APIC irq-routing table: | ||
1717 | */ | ||
1718 | ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); | ||
1719 | } | ||
1720 | disconnect_bsp_APIC(ioapic_i8259.pin != -1); | ||
1721 | } | ||
1722 | |||
1723 | /* | ||
1724 | * function to set the IO-APIC physical IDs based on the | ||
1725 | * values stored in the MPC table. | ||
1726 | * | ||
1727 | * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 | ||
1728 | */ | ||
1729 | |||
1730 | static void __init setup_ioapic_ids_from_mpc(void) | ||
1731 | { | ||
1732 | union IO_APIC_reg_00 reg_00; | ||
1733 | physid_mask_t phys_id_present_map; | ||
1734 | int apic; | ||
1735 | int i; | ||
1736 | unsigned char old_id; | ||
1737 | unsigned long flags; | ||
1738 | |||
1739 | if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) | ||
1740 | return; | ||
1741 | |||
1742 | /* | ||
1743 | * Don't check I/O APIC IDs for xAPIC systems. They have | ||
1744 | * no meaning without the serial APIC bus. | ||
1745 | */ | ||
1746 | if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) | ||
1747 | || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) | ||
1748 | return; | ||
1749 | /* | ||
1750 | * This is broken; anything with a real cpu count has to | ||
1751 | * circumvent this idiocy regardless. | ||
1752 | */ | ||
1753 | phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); | ||
1754 | |||
1755 | /* | ||
1756 | * Set the IOAPIC ID to the value stored in the MPC table. | ||
1757 | */ | ||
1758 | for (apic = 0; apic < nr_ioapics; apic++) { | ||
1759 | |||
1760 | /* Read the register 0 value */ | ||
1761 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1762 | reg_00.raw = io_apic_read(apic, 0); | ||
1763 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1764 | |||
1765 | old_id = mp_ioapics[apic].mp_apicid; | ||
1766 | |||
1767 | if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { | ||
1768 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", | ||
1769 | apic, mp_ioapics[apic].mp_apicid); | ||
1770 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | ||
1771 | reg_00.bits.ID); | ||
1772 | mp_ioapics[apic].mp_apicid = reg_00.bits.ID; | ||
1773 | } | ||
1774 | |||
1775 | /* | ||
1776 | * Sanity check, is the ID really free? Every APIC in a | ||
1777 | * system must have a unique ID or we get lots of nice | ||
1778 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
1779 | */ | ||
1780 | if (check_apicid_used(phys_id_present_map, | ||
1781 | mp_ioapics[apic].mp_apicid)) { | ||
1782 | printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", | ||
1783 | apic, mp_ioapics[apic].mp_apicid); | ||
1784 | for (i = 0; i < get_physical_broadcast(); i++) | ||
1785 | if (!physid_isset(i, phys_id_present_map)) | ||
1786 | break; | ||
1787 | if (i >= get_physical_broadcast()) | ||
1788 | panic("Max APIC ID exceeded!\n"); | ||
1789 | printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", | ||
1790 | i); | ||
1791 | physid_set(i, phys_id_present_map); | ||
1792 | mp_ioapics[apic].mp_apicid = i; | ||
1793 | } else { | ||
1794 | physid_mask_t tmp; | ||
1795 | tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); | ||
1796 | apic_printk(APIC_VERBOSE, "Setting %d in the " | ||
1797 | "phys_id_present_map\n", | ||
1798 | mp_ioapics[apic].mp_apicid); | ||
1799 | physids_or(phys_id_present_map, phys_id_present_map, tmp); | ||
1800 | } | ||
1801 | |||
1802 | |||
1803 | /* | ||
1804 | * We need to adjust the IRQ routing table | ||
1805 | * if the ID changed. | ||
1806 | */ | ||
1807 | if (old_id != mp_ioapics[apic].mp_apicid) | ||
1808 | for (i = 0; i < mp_irq_entries; i++) | ||
1809 | if (mp_irqs[i].mp_dstapic == old_id) | ||
1810 | mp_irqs[i].mp_dstapic | ||
1811 | = mp_ioapics[apic].mp_apicid; | ||
1812 | |||
1813 | /* | ||
1814 | * Read the right value from the MPC table and | ||
1815 | * write it into the ID register. | ||
1816 | */ | ||
1817 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
1818 | "...changing IO-APIC physical APIC ID to %d ...", | ||
1819 | mp_ioapics[apic].mp_apicid); | ||
1820 | |||
1821 | reg_00.bits.ID = mp_ioapics[apic].mp_apicid; | ||
1822 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1823 | io_apic_write(apic, 0, reg_00.raw); | ||
1824 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1825 | |||
1826 | /* | ||
1827 | * Sanity check | ||
1828 | */ | ||
1829 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1830 | reg_00.raw = io_apic_read(apic, 0); | ||
1831 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1832 | if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) | ||
1833 | printk("could not set ID!\n"); | ||
1834 | else | ||
1835 | apic_printk(APIC_VERBOSE, " ok.\n"); | ||
1836 | } | ||
1837 | } | ||
1838 | |||
1839 | int no_timer_check __initdata; | ||
1840 | |||
1841 | static int __init notimercheck(char *s) | ||
1842 | { | ||
1843 | no_timer_check = 1; | ||
1844 | return 1; | ||
1845 | } | ||
1846 | __setup("no_timer_check", notimercheck); | ||
1847 | |||
1848 | /* | ||
1849 | * There is a nasty bug in some older SMP boards, their mptable lies | ||
1850 | * about the timer IRQ. We do the following to work around the situation: | ||
1851 | * | ||
1852 | * - timer IRQ defaults to IO-APIC IRQ | ||
1853 | * - if this function detects that timer IRQs are defunct, then we fall | ||
1854 | * back to ISA timer IRQs | ||
1855 | */ | ||
1856 | static int __init timer_irq_works(void) | ||
1857 | { | ||
1858 | unsigned long t1 = jiffies; | ||
1859 | unsigned long flags; | ||
1860 | |||
1861 | if (no_timer_check) | ||
1862 | return 1; | ||
1863 | |||
1864 | local_save_flags(flags); | ||
1865 | local_irq_enable(); | ||
1866 | /* Let ten ticks pass... */ | ||
1867 | mdelay((10 * 1000) / HZ); | ||
1868 | local_irq_restore(flags); | ||
1869 | |||
1870 | /* | ||
1871 | * Expect a few ticks at least, to be sure some possible | ||
1872 | * glue logic does not lock up after the first one or | ||
1873 | * two ticks in a non-ExtINT mode. Also the local APIC | ||
1874 | * might have cached one ExtINT interrupt. Finally, at | ||
1875 | * least one tick may be lost due to delays. | ||
1876 | */ | ||
1877 | if (time_after(jiffies, t1 + 4)) | ||
1878 | return 1; | ||
1879 | |||
1880 | return 0; | ||
1881 | } | ||
1882 | |||
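As a quick sanity check (not part of this file): the mdelay((10 * 1000) / HZ) above is simply ten timer ticks expressed in milliseconds; HZ = 250 below is just an example configuration.

#include <stdio.h>

#define HZ 250	/* assumed example configuration */

int main(void)
{
	printf("(10 * 1000) / HZ = %d ms, i.e. ten ticks at HZ=%d\n",
	       (10 * 1000) / HZ, HZ);
	return 0;
}
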
1883 | /* | ||
1884 | * In the SMP+IOAPIC case it might happen that there are an unspecified | ||
1885 | * number of pending IRQ events unhandled. These cases are very rare, | ||
1886 | * so we 'resend' these IRQs via IPIs, to the same CPU. It's much | ||
1887 | * better to do it this way because then we do not have to be aware of | ||
1888 | * 'pending' interrupts in the IRQ path, except at this point. | ||
1889 | */ | ||
1890 | /* | ||
1891 | * An edge-triggered IRQ needs to resend any interrupt | ||
1892 | * that was delayed, but this is now handled in the | ||
1893 | * device-independent code. | ||
1894 | */ | ||
1895 | |||
1896 | /* | ||
1897 | * Startup quirk: | ||
1898 | * | ||
1899 | * Starting up an edge-triggered IO-APIC interrupt is | ||
1900 | * nasty - we need to make sure that we get the edge. | ||
1901 | * If it is already asserted for some reason, we need to | ||
1902 | * return 1 to indicate that it was pending. | ||
1903 | * | ||
1904 | * This is not complete - we should be able to fake | ||
1905 | * an edge even if it isn't on the 8259A... | ||
1906 | * | ||
1907 | * (We do this for level-triggered IRQs too - it cannot hurt.) | ||
1908 | */ | ||
1909 | static unsigned int startup_ioapic_irq(unsigned int irq) | ||
1910 | { | ||
1911 | int was_pending = 0; | ||
1912 | unsigned long flags; | ||
1913 | |||
1914 | spin_lock_irqsave(&ioapic_lock, flags); | ||
1915 | if (irq < 16) { | ||
1916 | disable_8259A_irq(irq); | ||
1917 | if (i8259A_irq_pending(irq)) | ||
1918 | was_pending = 1; | ||
1919 | } | ||
1920 | __unmask_IO_APIC_irq(irq); | ||
1921 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
1922 | |||
1923 | return was_pending; | ||
1924 | } | ||
1925 | |||
1926 | static void ack_ioapic_irq(unsigned int irq) | ||
1927 | { | ||
1928 | move_native_irq(irq); | ||
1929 | ack_APIC_irq(); | ||
1930 | } | ||
1931 | |||
1932 | static void ack_ioapic_quirk_irq(unsigned int irq) | ||
1933 | { | ||
1934 | unsigned long v; | ||
1935 | int i; | ||
1936 | |||
1937 | move_native_irq(irq); | ||
1938 | /* | ||
1939 | * It appears there is an erratum which affects at least version 0x11 | ||
1940 | * of I/O APIC (that's the 82093AA and cores integrated into various | ||
1941 | * chipsets). Under certain conditions a level-triggered interrupt is | ||
1942 | * erroneously delivered as edge-triggered one but the respective IRR | ||
1943 | * bit gets set nevertheless. As a result the I/O unit expects an EOI | ||
1944 | * message but it will never arrive and further interrupts are blocked | ||
1945 | * from the source. The exact reason is so far unknown, but the | ||
1946 | * phenomenon was observed when two consecutive interrupt requests | ||
1947 | * from a given source get delivered to the same CPU and the source is | ||
1948 | * temporarily disabled in between. | ||
1949 | * | ||
1950 | * A workaround is to simulate an EOI message manually. We achieve it | ||
1951 | * by setting the trigger mode to edge and then to level when the edge | ||
1952 | * trigger mode gets detected in the TMR of a local APIC for a | ||
1953 | * level-triggered interrupt. We mask the source for the time of the | ||
1954 | * operation to prevent an edge-triggered interrupt escaping meanwhile. | ||
1955 | * The idea is from Manfred Spraul. --macro | ||
1956 | */ | ||
1957 | i = irq_vector[irq]; | ||
1958 | |||
1959 | v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); | ||
1960 | |||
1961 | ack_APIC_irq(); | ||
1962 | |||
1963 | if (!(v & (1 << (i & 0x1f)))) { | ||
1964 | atomic_inc(&irq_mis_count); | ||
1965 | spin_lock(&ioapic_lock); | ||
1966 | __mask_and_edge_IO_APIC_irq(irq); | ||
1967 | __unmask_and_level_IO_APIC_irq(irq); | ||
1968 | spin_unlock(&ioapic_lock); | ||
1969 | } | ||
1970 | } | ||
1971 | |||
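For reference, a user-space sketch (not part of this file) of which TMR register and bit the workaround above inspects for a given vector: the TMR is eight 32-bit registers spaced 0x10 apart, so (vector & ~0x1f) >> 1 is the byte offset and vector & 0x1f the bit. The vector value is made up.

#include <stdio.h>

int main(void)
{
	unsigned int vector = 0x59;				/* example vector */
	unsigned int reg_offset = (vector & ~0x1f) >> 1;	/* (vector / 32) * 0x10 */
	unsigned int bit = vector & 0x1f;

	printf("vector 0x%02x -> TMR register at +0x%02x, bit %u\n",
	       vector, reg_offset, bit);
	return 0;
}
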
1972 | static int ioapic_retrigger_irq(unsigned int irq) | ||
1973 | { | ||
1974 | send_IPI_self(irq_vector[irq]); | ||
1975 | |||
1976 | return 1; | ||
1977 | } | ||
1978 | |||
1979 | static struct irq_chip ioapic_chip __read_mostly = { | ||
1980 | .name = "IO-APIC", | ||
1981 | .startup = startup_ioapic_irq, | ||
1982 | .mask = mask_IO_APIC_irq, | ||
1983 | .unmask = unmask_IO_APIC_irq, | ||
1984 | .ack = ack_ioapic_irq, | ||
1985 | .eoi = ack_ioapic_quirk_irq, | ||
1986 | #ifdef CONFIG_SMP | ||
1987 | .set_affinity = set_ioapic_affinity_irq, | ||
1988 | #endif | ||
1989 | .retrigger = ioapic_retrigger_irq, | ||
1990 | }; | ||
1991 | |||
1992 | |||
1993 | static inline void init_IO_APIC_traps(void) | ||
1994 | { | ||
1995 | int irq; | ||
1996 | |||
1997 | /* | ||
1998 | * NOTE! The local APIC isn't very good at handling | ||
1999 | * multiple interrupts at the same interrupt level. | ||
2000 | * As the interrupt level is determined by taking the | ||
2001 | * vector number and shifting that right by 4, we | ||
2002 | * want to spread these out a bit so that they don't | ||
2003 | * all fall in the same interrupt level. | ||
2004 | * | ||
2005 | * Also, we've got to be careful not to trash gate | ||
2006 | * 0x80, because int 0x80 is hm, kind of importantish. ;) | ||
2007 | */ | ||
2008 | for (irq = 0; irq < NR_IRQS ; irq++) { | ||
2009 | if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { | ||
2010 | /* | ||
2011 | * Hmm.. We don't have an entry for this, | ||
2012 | * so default to an old-fashioned 8259 | ||
2013 | * interrupt if we can.. | ||
2014 | */ | ||
2015 | if (irq < 16) | ||
2016 | make_8259A_irq(irq); | ||
2017 | else | ||
2018 | /* Strange. Oh, well.. */ | ||
2019 | irq_desc[irq].chip = &no_irq_chip; | ||
2020 | } | ||
2021 | } | ||
2022 | } | ||
2023 | |||
2024 | /* | ||
2025 | * The local APIC irq-chip implementation: | ||
2026 | */ | ||
2027 | |||
2028 | static void ack_lapic_irq(unsigned int irq) | ||
2029 | { | ||
2030 | ack_APIC_irq(); | ||
2031 | } | ||
2032 | |||
2033 | static void mask_lapic_irq(unsigned int irq) | ||
2034 | { | ||
2035 | unsigned long v; | ||
2036 | |||
2037 | v = apic_read(APIC_LVT0); | ||
2038 | apic_write(APIC_LVT0, v | APIC_LVT_MASKED); | ||
2039 | } | ||
2040 | |||
2041 | static void unmask_lapic_irq(unsigned int irq) | ||
2042 | { | ||
2043 | unsigned long v; | ||
2044 | |||
2045 | v = apic_read(APIC_LVT0); | ||
2046 | apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); | ||
2047 | } | ||
2048 | |||
2049 | static struct irq_chip lapic_chip __read_mostly = { | ||
2050 | .name = "local-APIC", | ||
2051 | .mask = mask_lapic_irq, | ||
2052 | .unmask = unmask_lapic_irq, | ||
2053 | .ack = ack_lapic_irq, | ||
2054 | }; | ||
2055 | |||
2056 | static void lapic_register_intr(int irq, int vector) | ||
2057 | { | ||
2058 | irq_desc[irq].status &= ~IRQ_LEVEL; | ||
2059 | set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, | ||
2060 | "edge"); | ||
2061 | set_intr_gate(vector, interrupt[irq]); | ||
2062 | } | ||
2063 | |||
2064 | static void __init setup_nmi(void) | ||
2065 | { | ||
2066 | /* | ||
2067 | * Dirty trick to enable the NMI watchdog ... | ||
2068 | * We put the 8259A master into AEOI mode and | ||
2069 | * unmask LVT0 on all local APICs as NMI. | ||
2070 | * | ||
2071 | * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') | ||
2072 | * is from Maciej W. Rozycki - so we do not have to EOI from | ||
2073 | * the NMI handler or the timer interrupt. | ||
2074 | */ | ||
2075 | apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); | ||
2076 | |||
2077 | enable_NMI_through_LVT0(); | ||
2078 | |||
2079 | apic_printk(APIC_VERBOSE, " done.\n"); | ||
2080 | } | ||
2081 | |||
2082 | /* | ||
2083 | * This looks a bit hackish but it's about the only way of sending | ||
2084 | * a few INTA cycles to 8259As and any associated glue logic. ICR does | ||
2085 | * not support the ExtINT mode, unfortunately. We need to send these | ||
2086 | * cycles as some i82489DX-based boards have glue logic that keeps the | ||
2087 | * 8259A interrupt line asserted until INTA. --macro | ||
2088 | */ | ||
2089 | static inline void __init unlock_ExtINT_logic(void) | ||
2090 | { | ||
2091 | int apic, pin, i; | ||
2092 | struct IO_APIC_route_entry entry0, entry1; | ||
2093 | unsigned char save_control, save_freq_select; | ||
2094 | |||
2095 | pin = find_isa_irq_pin(8, mp_INT); | ||
2096 | if (pin == -1) { | ||
2097 | WARN_ON_ONCE(1); | ||
2098 | return; | ||
2099 | } | ||
2100 | apic = find_isa_irq_apic(8, mp_INT); | ||
2101 | if (apic == -1) { | ||
2102 | WARN_ON_ONCE(1); | ||
2103 | return; | ||
2104 | } | ||
2105 | |||
2106 | entry0 = ioapic_read_entry(apic, pin); | ||
2107 | clear_IO_APIC_pin(apic, pin); | ||
2108 | |||
2109 | memset(&entry1, 0, sizeof(entry1)); | ||
2110 | |||
2111 | entry1.dest_mode = 0; /* physical delivery */ | ||
2112 | entry1.mask = 0; /* unmask IRQ now */ | ||
2113 | entry1.dest.physical.physical_dest = hard_smp_processor_id(); | ||
2114 | entry1.delivery_mode = dest_ExtINT; | ||
2115 | entry1.polarity = entry0.polarity; | ||
2116 | entry1.trigger = 0; | ||
2117 | entry1.vector = 0; | ||
2118 | |||
2119 | ioapic_write_entry(apic, pin, entry1); | ||
2120 | |||
2121 | save_control = CMOS_READ(RTC_CONTROL); | ||
2122 | save_freq_select = CMOS_READ(RTC_FREQ_SELECT); | ||
2123 | CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, | ||
2124 | RTC_FREQ_SELECT); | ||
2125 | CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); | ||
2126 | |||
2127 | i = 100; | ||
2128 | while (i-- > 0) { | ||
2129 | mdelay(10); | ||
2130 | if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) | ||
2131 | i -= 10; | ||
2132 | } | ||
2133 | |||
2134 | CMOS_WRITE(save_control, RTC_CONTROL); | ||
2135 | CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); | ||
2136 | clear_IO_APIC_pin(apic, pin); | ||
2137 | |||
2138 | ioapic_write_entry(apic, pin, entry0); | ||
2139 | } | ||
2140 | |||
2141 | /* | ||
2142 | * This code may look a bit paranoid, but it's supposed to cooperate with | ||
2143 | * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ | ||
2144 | * is so screwy. Thanks to Brian Perkins for testing/hacking this beast | ||
2145 | * fanatically on his truly buggy board. | ||
2146 | */ | ||
2147 | static inline void __init check_timer(void) | ||
2148 | { | ||
2149 | int apic1, pin1, apic2, pin2; | ||
2150 | int no_pin1 = 0; | ||
2151 | int vector; | ||
2152 | unsigned int ver; | ||
2153 | unsigned long flags; | ||
2154 | |||
2155 | local_irq_save(flags); | ||
2156 | |||
2157 | ver = apic_read(APIC_LVR); | ||
2158 | ver = GET_APIC_VERSION(ver); | ||
2159 | |||
2160 | /* | ||
2161 | * get/set the timer IRQ vector: | ||
2162 | */ | ||
2163 | disable_8259A_irq(0); | ||
2164 | vector = assign_irq_vector(0); | ||
2165 | set_intr_gate(vector, interrupt[0]); | ||
2166 | |||
2167 | /* | ||
2168 | * As IRQ0 is to be enabled in the 8259A, the virtual | ||
2169 | * wire has to be disabled in the local APIC. Also | ||
2170 | * timer interrupts need to be acknowledged manually in | ||
2171 | * the 8259A for the i82489DX when using the NMI | ||
2172 | * watchdog as that APIC treats NMIs as level-triggered. | ||
2173 | * The AEOI mode will finish them in the 8259A | ||
2174 | * automatically. | ||
2175 | */ | ||
2176 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); | ||
2177 | init_8259A(1); | ||
2178 | timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); | ||
2179 | |||
2180 | pin1 = find_isa_irq_pin(0, mp_INT); | ||
2181 | apic1 = find_isa_irq_apic(0, mp_INT); | ||
2182 | pin2 = ioapic_i8259.pin; | ||
2183 | apic2 = ioapic_i8259.apic; | ||
2184 | |||
2185 | apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " | ||
2186 | "apic1=%d pin1=%d apic2=%d pin2=%d\n", | ||
2187 | vector, apic1, pin1, apic2, pin2); | ||
2188 | |||
2189 | /* | ||
2190 | * Some BIOS writers are clueless and report the ExtINTA | ||
2191 | * I/O APIC input from the cascaded 8259A as the timer | ||
2192 | * interrupt input. So just in case, if only one pin | ||
2193 | * was found above, try it both directly and through the | ||
2194 | * 8259A. | ||
2195 | */ | ||
2196 | if (pin1 == -1) { | ||
2197 | pin1 = pin2; | ||
2198 | apic1 = apic2; | ||
2199 | no_pin1 = 1; | ||
2200 | } else if (pin2 == -1) { | ||
2201 | pin2 = pin1; | ||
2202 | apic2 = apic1; | ||
2203 | } | ||
2204 | |||
2205 | if (pin1 != -1) { | ||
2206 | /* | ||
2207 | * Ok, does IRQ0 through the IOAPIC work? | ||
2208 | */ | ||
2209 | if (no_pin1) { | ||
2210 | add_pin_to_irq(0, apic1, pin1); | ||
2211 | setup_timer_IRQ0_pin(apic1, pin1, vector); | ||
2212 | } | ||
2213 | unmask_IO_APIC_irq(0); | ||
2214 | if (timer_irq_works()) { | ||
2215 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2216 | setup_nmi(); | ||
2217 | enable_8259A_irq(0); | ||
2218 | } | ||
2219 | if (disable_timer_pin_1 > 0) | ||
2220 | clear_IO_APIC_pin(0, pin1); | ||
2221 | goto out; | ||
2222 | } | ||
2223 | clear_IO_APIC_pin(apic1, pin1); | ||
2224 | if (!no_pin1) | ||
2225 | apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " | ||
2226 | "8254 timer not connected to IO-APIC\n"); | ||
2227 | |||
2228 | apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " | ||
2229 | "(IRQ0) through the 8259A ...\n"); | ||
2230 | apic_printk(APIC_QUIET, KERN_INFO | ||
2231 | "..... (found apic %d pin %d) ...\n", apic2, pin2); | ||
2232 | /* | ||
2233 | * legacy devices should be connected to IO APIC #0 | ||
2234 | */ | ||
2235 | replace_pin_at_irq(0, apic1, pin1, apic2, pin2); | ||
2236 | setup_timer_IRQ0_pin(apic2, pin2, vector); | ||
2237 | unmask_IO_APIC_irq(0); | ||
2238 | enable_8259A_irq(0); | ||
2239 | if (timer_irq_works()) { | ||
2240 | apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); | ||
2241 | timer_through_8259 = 1; | ||
2242 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2243 | disable_8259A_irq(0); | ||
2244 | setup_nmi(); | ||
2245 | enable_8259A_irq(0); | ||
2246 | } | ||
2247 | goto out; | ||
2248 | } | ||
2249 | /* | ||
2250 | * Cleanup, just in case ... | ||
2251 | */ | ||
2252 | disable_8259A_irq(0); | ||
2253 | clear_IO_APIC_pin(apic2, pin2); | ||
2254 | apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); | ||
2255 | } | ||
2256 | |||
2257 | if (nmi_watchdog == NMI_IO_APIC) { | ||
2258 | apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " | ||
2259 | "through the IO-APIC - disabling NMI Watchdog!\n"); | ||
2260 | nmi_watchdog = NMI_NONE; | ||
2261 | } | ||
2262 | timer_ack = 0; | ||
2263 | |||
2264 | apic_printk(APIC_QUIET, KERN_INFO | ||
2265 | "...trying to set up timer as Virtual Wire IRQ...\n"); | ||
2266 | |||
2267 | lapic_register_intr(0, vector); | ||
2268 | apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ | ||
2269 | enable_8259A_irq(0); | ||
2270 | |||
2271 | if (timer_irq_works()) { | ||
2272 | apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); | ||
2273 | goto out; | ||
2274 | } | ||
2275 | disable_8259A_irq(0); | ||
2276 | apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); | ||
2277 | apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); | ||
2278 | |||
2279 | apic_printk(APIC_QUIET, KERN_INFO | ||
2280 | "...trying to set up timer as ExtINT IRQ...\n"); | ||
2281 | |||
2282 | init_8259A(0); | ||
2283 | make_8259A_irq(0); | ||
2284 | apic_write(APIC_LVT0, APIC_DM_EXTINT); | ||
2285 | |||
2286 | unlock_ExtINT_logic(); | ||
2287 | |||
2288 | if (timer_irq_works()) { | ||
2289 | apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); | ||
2290 | goto out; | ||
2291 | } | ||
2292 | apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); | ||
2293 | panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " | ||
2294 | "report. Then try booting with the 'noapic' option.\n"); | ||
2295 | out: | ||
2296 | local_irq_restore(flags); | ||
2297 | } | ||
2298 | |||
2299 | /* | ||
2300 | * Traditionally ISA IRQ2 is the cascade IRQ, and is not available | ||
2301 | * to devices. However there may be an I/O APIC pin available for | ||
2302 | * this interrupt regardless. The pin may be left unconnected, but | ||
2303 | * typically it will be reused as an ExtINT cascade interrupt for | ||
2304 | * the master 8259A. In the MPS case such a pin will normally be | ||
2305 | * reported as an ExtINT interrupt in the MP table. With ACPI | ||
2306 | * there is no provision for ExtINT interrupts, and in the absence | ||
2307 | * of an override it would be treated as an ordinary ISA I/O APIC | ||
2308 | * interrupt, that is edge-triggered and unmasked by default. We | ||
2309 | * used to do this, but it caused problems on some systems because | ||
2310 | * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using | ||
2311 | * the same ExtINT cascade interrupt to drive the local APIC of the | ||
2312 | * bootstrap processor. Therefore we refrain from routing IRQ2 to | ||
2313 | * the I/O APIC in all cases now. No actual device should request | ||
2314 | * it anyway. --macro | ||
2315 | */ | ||
2316 | #define PIC_IRQS (1 << PIC_CASCADE_IR) | ||
2317 | |||
2318 | void __init setup_IO_APIC(void) | ||
2319 | { | ||
2320 | int i; | ||
2321 | |||
2322 | /* Reserve all the system vectors. */ | ||
2323 | for (i = first_system_vector; i < NR_VECTORS; i++) | ||
2324 | set_bit(i, used_vectors); | ||
2325 | |||
2326 | enable_IO_APIC(); | ||
2327 | |||
2328 | io_apic_irqs = ~PIC_IRQS; | ||
2329 | |||
2330 | printk("ENABLING IO-APIC IRQs\n"); | ||
2331 | |||
2332 | /* | ||
2333 | * Set up IO-APIC IRQ routing. | ||
2334 | */ | ||
2335 | if (!acpi_ioapic) | ||
2336 | setup_ioapic_ids_from_mpc(); | ||
2337 | sync_Arb_IDs(); | ||
2338 | setup_IO_APIC_irqs(); | ||
2339 | init_IO_APIC_traps(); | ||
2340 | check_timer(); | ||
2341 | } | ||
2342 | |||
2343 | /* | ||
2344 | * Called after all the initialization is done. If we didn't find any | ||
2345 | * APIC bugs, then we can allow the modify fast path. | ||
2346 | */ | ||
2347 | |||
2348 | static int __init io_apic_bug_finalize(void) | ||
2349 | { | ||
2350 | if (sis_apic_bug == -1) | ||
2351 | sis_apic_bug = 0; | ||
2352 | return 0; | ||
2353 | } | ||
2354 | |||
2355 | late_initcall(io_apic_bug_finalize); | ||
2356 | |||
2357 | struct sysfs_ioapic_data { | ||
2358 | struct sys_device dev; | ||
2359 | struct IO_APIC_route_entry entry[0]; | ||
2360 | }; | ||
2361 | static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; | ||
2362 | |||
2363 | static int ioapic_suspend(struct sys_device *dev, pm_message_t state) | ||
2364 | { | ||
2365 | struct IO_APIC_route_entry *entry; | ||
2366 | struct sysfs_ioapic_data *data; | ||
2367 | int i; | ||
2368 | |||
2369 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
2370 | entry = data->entry; | ||
2371 | for (i = 0; i < nr_ioapic_registers[dev->id]; i++) | ||
2372 | entry[i] = ioapic_read_entry(dev->id, i); | ||
2373 | |||
2374 | return 0; | ||
2375 | } | ||
2376 | |||
2377 | static int ioapic_resume(struct sys_device *dev) | ||
2378 | { | ||
2379 | struct IO_APIC_route_entry *entry; | ||
2380 | struct sysfs_ioapic_data *data; | ||
2381 | unsigned long flags; | ||
2382 | union IO_APIC_reg_00 reg_00; | ||
2383 | int i; | ||
2384 | |||
2385 | data = container_of(dev, struct sysfs_ioapic_data, dev); | ||
2386 | entry = data->entry; | ||
2387 | |||
2388 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2389 | reg_00.raw = io_apic_read(dev->id, 0); | ||
2390 | if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { | ||
2391 | reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; | ||
2392 | io_apic_write(dev->id, 0, reg_00.raw); | ||
2393 | } | ||
2394 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2395 | for (i = 0; i < nr_ioapic_registers[dev->id]; i++) | ||
2396 | ioapic_write_entry(dev->id, i, entry[i]); | ||
2397 | |||
2398 | return 0; | ||
2399 | } | ||
2400 | |||
2401 | static struct sysdev_class ioapic_sysdev_class = { | ||
2402 | .name = "ioapic", | ||
2403 | .suspend = ioapic_suspend, | ||
2404 | .resume = ioapic_resume, | ||
2405 | }; | ||
2406 | |||
2407 | static int __init ioapic_init_sysfs(void) | ||
2408 | { | ||
2409 | struct sys_device *dev; | ||
2410 | int i, size, error = 0; | ||
2411 | |||
2412 | error = sysdev_class_register(&ioapic_sysdev_class); | ||
2413 | if (error) | ||
2414 | return error; | ||
2415 | |||
2416 | for (i = 0; i < nr_ioapics; i++) { | ||
2417 | size = sizeof(struct sys_device) + nr_ioapic_registers[i] | ||
2418 | * sizeof(struct IO_APIC_route_entry); | ||
2419 | mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); | ||
2420 | if (!mp_ioapic_data[i]) { | ||
2421 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
2422 | continue; | ||
2423 | } | ||
2424 | dev = &mp_ioapic_data[i]->dev; | ||
2425 | dev->id = i; | ||
2426 | dev->cls = &ioapic_sysdev_class; | ||
2427 | error = sysdev_register(dev); | ||
2428 | if (error) { | ||
2429 | kfree(mp_ioapic_data[i]); | ||
2430 | mp_ioapic_data[i] = NULL; | ||
2431 | printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); | ||
2432 | continue; | ||
2433 | } | ||
2434 | } | ||
2435 | |||
2436 | return 0; | ||
2437 | } | ||
2438 | |||
2439 | device_initcall(ioapic_init_sysfs); | ||
2440 | |||
2441 | /* | ||
2442 | * Dynamic IRQ allocation and deallocation | ||
2443 | */ | ||
2444 | int create_irq(void) | ||
2445 | { | ||
2446 | /* Allocate an unused irq */ | ||
2447 | int irq, new, vector = 0; | ||
2448 | unsigned long flags; | ||
2449 | |||
2450 | irq = -ENOSPC; | ||
2451 | spin_lock_irqsave(&vector_lock, flags); | ||
2452 | for (new = (NR_IRQS - 1); new >= 0; new--) { | ||
2453 | if (platform_legacy_irq(new)) | ||
2454 | continue; | ||
2455 | if (irq_vector[new] != 0) | ||
2456 | continue; | ||
2457 | vector = __assign_irq_vector(new); | ||
2458 | if (likely(vector > 0)) | ||
2459 | irq = new; | ||
2460 | break; | ||
2461 | } | ||
2462 | spin_unlock_irqrestore(&vector_lock, flags); | ||
2463 | |||
2464 | if (irq >= 0) { | ||
2465 | set_intr_gate(vector, interrupt[irq]); | ||
2466 | dynamic_irq_init(irq); | ||
2467 | } | ||
2468 | return irq; | ||
2469 | } | ||
2470 | |||
2471 | void destroy_irq(unsigned int irq) | ||
2472 | { | ||
2473 | unsigned long flags; | ||
2474 | |||
2475 | dynamic_irq_cleanup(irq); | ||
2476 | |||
2477 | spin_lock_irqsave(&vector_lock, flags); | ||
2478 | clear_bit(irq_vector[irq], used_vectors); | ||
2479 | irq_vector[irq] = 0; | ||
2480 | spin_unlock_irqrestore(&vector_lock, flags); | ||
2481 | } | ||
2482 | |||
2483 | /* | ||
2484 | * MSI message composition | ||
2485 | */ | ||
2486 | #ifdef CONFIG_PCI_MSI | ||
2487 | static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) | ||
2488 | { | ||
2489 | int vector; | ||
2490 | unsigned dest; | ||
2491 | |||
2492 | vector = assign_irq_vector(irq); | ||
2493 | if (vector >= 0) { | ||
2494 | dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
2495 | |||
2496 | msg->address_hi = MSI_ADDR_BASE_HI; | ||
2497 | msg->address_lo = | ||
2498 | MSI_ADDR_BASE_LO | | ||
2499 | ((INT_DEST_MODE == 0) ? | ||
2500 | MSI_ADDR_DEST_MODE_PHYSICAL: | ||
2501 | MSI_ADDR_DEST_MODE_LOGICAL) | | ||
2502 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2503 | MSI_ADDR_REDIRECTION_CPU: | ||
2504 | MSI_ADDR_REDIRECTION_LOWPRI) | | ||
2505 | MSI_ADDR_DEST_ID(dest); | ||
2506 | |||
2507 | msg->data = | ||
2508 | MSI_DATA_TRIGGER_EDGE | | ||
2509 | MSI_DATA_LEVEL_ASSERT | | ||
2510 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2511 | MSI_DATA_DELIVERY_FIXED: | ||
2512 | MSI_DATA_DELIVERY_LOWPRI) | | ||
2513 | MSI_DATA_VECTOR(vector); | ||
2514 | } | ||
2515 | return vector; | ||
2516 | } | ||
2517 | |||
2518 | #ifdef CONFIG_SMP | ||
2519 | static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2520 | { | ||
2521 | struct msi_msg msg; | ||
2522 | unsigned int dest; | ||
2523 | cpumask_t tmp; | ||
2524 | int vector; | ||
2525 | |||
2526 | cpus_and(tmp, mask, cpu_online_map); | ||
2527 | if (cpus_empty(tmp)) | ||
2528 | tmp = TARGET_CPUS; | ||
2529 | |||
2530 | vector = assign_irq_vector(irq); | ||
2531 | if (vector < 0) | ||
2532 | return; | ||
2533 | |||
2534 | dest = cpu_mask_to_apicid(mask); | ||
2535 | |||
2536 | read_msi_msg(irq, &msg); | ||
2537 | |||
2538 | msg.data &= ~MSI_DATA_VECTOR_MASK; | ||
2539 | msg.data |= MSI_DATA_VECTOR(vector); | ||
2540 | msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; | ||
2541 | msg.address_lo |= MSI_ADDR_DEST_ID(dest); | ||
2542 | |||
2543 | write_msi_msg(irq, &msg); | ||
2544 | irq_desc[irq].affinity = mask; | ||
2545 | } | ||
2546 | #endif /* CONFIG_SMP */ | ||
2547 | |||
2548 | /* | ||
2549 | * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, | ||
2550 | * which implement the MSI or MSI-X Capability Structure. | ||
2551 | */ | ||
2552 | static struct irq_chip msi_chip = { | ||
2553 | .name = "PCI-MSI", | ||
2554 | .unmask = unmask_msi_irq, | ||
2555 | .mask = mask_msi_irq, | ||
2556 | .ack = ack_ioapic_irq, | ||
2557 | #ifdef CONFIG_SMP | ||
2558 | .set_affinity = set_msi_irq_affinity, | ||
2559 | #endif | ||
2560 | .retrigger = ioapic_retrigger_irq, | ||
2561 | }; | ||
2562 | |||
2563 | int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) | ||
2564 | { | ||
2565 | struct msi_msg msg; | ||
2566 | int irq, ret; | ||
2567 | irq = create_irq(); | ||
2568 | if (irq < 0) | ||
2569 | return irq; | ||
2570 | |||
2571 | ret = msi_compose_msg(dev, irq, &msg); | ||
2572 | if (ret < 0) { | ||
2573 | destroy_irq(irq); | ||
2574 | return ret; | ||
2575 | } | ||
2576 | |||
2577 | set_irq_msi(irq, desc); | ||
2578 | write_msi_msg(irq, &msg); | ||
2579 | |||
2580 | set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, | ||
2581 | "edge"); | ||
2582 | |||
2583 | return 0; | ||
2584 | } | ||
2585 | |||
2586 | void arch_teardown_msi_irq(unsigned int irq) | ||
2587 | { | ||
2588 | destroy_irq(irq); | ||
2589 | } | ||
2590 | |||
2591 | #endif /* CONFIG_PCI_MSI */ | ||
2592 | |||
2593 | /* | ||
2594 | * Hypertransport interrupt support | ||
2595 | */ | ||
2596 | #ifdef CONFIG_HT_IRQ | ||
2597 | |||
2598 | #ifdef CONFIG_SMP | ||
2599 | |||
2600 | static void target_ht_irq(unsigned int irq, unsigned int dest) | ||
2601 | { | ||
2602 | struct ht_irq_msg msg; | ||
2603 | fetch_ht_irq_msg(irq, &msg); | ||
2604 | |||
2605 | msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); | ||
2606 | msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); | ||
2607 | |||
2608 | msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); | ||
2609 | msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); | ||
2610 | |||
2611 | write_ht_irq_msg(irq, &msg); | ||
2612 | } | ||
2613 | |||
2614 | static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) | ||
2615 | { | ||
2616 | unsigned int dest; | ||
2617 | cpumask_t tmp; | ||
2618 | |||
2619 | cpus_and(tmp, mask, cpu_online_map); | ||
2620 | if (cpus_empty(tmp)) | ||
2621 | tmp = TARGET_CPUS; | ||
2622 | |||
2623 | cpus_and(mask, tmp, CPU_MASK_ALL); | ||
2624 | |||
2625 | dest = cpu_mask_to_apicid(mask); | ||
2626 | |||
2627 | target_ht_irq(irq, dest); | ||
2628 | irq_desc[irq].affinity = mask; | ||
2629 | } | ||
2630 | #endif | ||
2631 | |||
2632 | static struct irq_chip ht_irq_chip = { | ||
2633 | .name = "PCI-HT", | ||
2634 | .mask = mask_ht_irq, | ||
2635 | .unmask = unmask_ht_irq, | ||
2636 | .ack = ack_ioapic_irq, | ||
2637 | #ifdef CONFIG_SMP | ||
2638 | .set_affinity = set_ht_irq_affinity, | ||
2639 | #endif | ||
2640 | .retrigger = ioapic_retrigger_irq, | ||
2641 | }; | ||
2642 | |||
2643 | int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) | ||
2644 | { | ||
2645 | int vector; | ||
2646 | |||
2647 | vector = assign_irq_vector(irq); | ||
2648 | if (vector >= 0) { | ||
2649 | struct ht_irq_msg msg; | ||
2650 | unsigned dest; | ||
2651 | cpumask_t tmp; | ||
2652 | |||
2653 | cpus_clear(tmp); | ||
2654 | cpu_set(vector >> 8, tmp); | ||
2655 | dest = cpu_mask_to_apicid(tmp); | ||
2656 | |||
2657 | msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); | ||
2658 | |||
2659 | msg.address_lo = | ||
2660 | HT_IRQ_LOW_BASE | | ||
2661 | HT_IRQ_LOW_DEST_ID(dest) | | ||
2662 | HT_IRQ_LOW_VECTOR(vector) | | ||
2663 | ((INT_DEST_MODE == 0) ? | ||
2664 | HT_IRQ_LOW_DM_PHYSICAL : | ||
2665 | HT_IRQ_LOW_DM_LOGICAL) | | ||
2666 | HT_IRQ_LOW_RQEOI_EDGE | | ||
2667 | ((INT_DELIVERY_MODE != dest_LowestPrio) ? | ||
2668 | HT_IRQ_LOW_MT_FIXED : | ||
2669 | HT_IRQ_LOW_MT_ARBITRATED) | | ||
2670 | HT_IRQ_LOW_IRQ_MASKED; | ||
2671 | |||
2672 | write_ht_irq_msg(irq, &msg); | ||
2673 | |||
2674 | set_irq_chip_and_handler_name(irq, &ht_irq_chip, | ||
2675 | handle_edge_irq, "edge"); | ||
2676 | } | ||
2677 | return vector; | ||
2678 | } | ||
2679 | #endif /* CONFIG_HT_IRQ */ | ||
2680 | |||
2681 | /* -------------------------------------------------------------------------- | ||
2682 | ACPI-based IOAPIC Configuration | ||
2683 | -------------------------------------------------------------------------- */ | ||
2684 | |||
2685 | #ifdef CONFIG_ACPI | ||
2686 | |||
2687 | int __init io_apic_get_unique_id(int ioapic, int apic_id) | ||
2688 | { | ||
2689 | union IO_APIC_reg_00 reg_00; | ||
2690 | static physid_mask_t apic_id_map = PHYSID_MASK_NONE; | ||
2691 | physid_mask_t tmp; | ||
2692 | unsigned long flags; | ||
2693 | int i = 0; | ||
2694 | |||
2695 | /* | ||
2696 | * The P4 platform supports up to 256 APIC IDs on two separate APIC | ||
2697 | * buses (one for LAPICs, one for IOAPICs), where predecessors only | ||
2698 | * support up to 16 on one shared APIC bus. | ||
2699 | * | ||
2700 | * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full | ||
2701 | * advantage of new APIC bus architecture. | ||
2702 | */ | ||
2703 | |||
2704 | if (physids_empty(apic_id_map)) | ||
2705 | apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); | ||
2706 | |||
2707 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2708 | reg_00.raw = io_apic_read(ioapic, 0); | ||
2709 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2710 | |||
2711 | if (apic_id >= get_physical_broadcast()) { | ||
2712 | printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " | ||
2713 | "%d\n", ioapic, apic_id, reg_00.bits.ID); | ||
2714 | apic_id = reg_00.bits.ID; | ||
2715 | } | ||
2716 | |||
2717 | /* | ||
2718 | * Every APIC in a system must have a unique ID or we get lots of nice | ||
2719 | * 'stuck on smp_invalidate_needed IPI wait' messages. | ||
2720 | */ | ||
2721 | if (check_apicid_used(apic_id_map, apic_id)) { | ||
2722 | |||
2723 | for (i = 0; i < get_physical_broadcast(); i++) { | ||
2724 | if (!check_apicid_used(apic_id_map, i)) | ||
2725 | break; | ||
2726 | } | ||
2727 | |||
2728 | if (i == get_physical_broadcast()) | ||
2729 | panic("Max apic_id exceeded!\n"); | ||
2730 | |||
2731 | printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " | ||
2732 | "trying %d\n", ioapic, apic_id, i); | ||
2733 | |||
2734 | apic_id = i; | ||
2735 | } | ||
2736 | |||
2737 | tmp = apicid_to_cpu_present(apic_id); | ||
2738 | physids_or(apic_id_map, apic_id_map, tmp); | ||
2739 | |||
2740 | if (reg_00.bits.ID != apic_id) { | ||
2741 | reg_00.bits.ID = apic_id; | ||
2742 | |||
2743 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2744 | io_apic_write(ioapic, 0, reg_00.raw); | ||
2745 | reg_00.raw = io_apic_read(ioapic, 0); | ||
2746 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2747 | |||
2748 | /* Sanity check */ | ||
2749 | if (reg_00.bits.ID != apic_id) { | ||
2750 | printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); | ||
2751 | return -1; | ||
2752 | } | ||
2753 | } | ||
2754 | |||
2755 | apic_printk(APIC_VERBOSE, KERN_INFO | ||
2756 | "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); | ||
2757 | |||
2758 | return apic_id; | ||
2759 | } | ||
2760 | |||
2761 | |||
2762 | int __init io_apic_get_version(int ioapic) | ||
2763 | { | ||
2764 | union IO_APIC_reg_01 reg_01; | ||
2765 | unsigned long flags; | ||
2766 | |||
2767 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2768 | reg_01.raw = io_apic_read(ioapic, 1); | ||
2769 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2770 | |||
2771 | return reg_01.bits.version; | ||
2772 | } | ||
2773 | |||
2774 | |||
2775 | int __init io_apic_get_redir_entries(int ioapic) | ||
2776 | { | ||
2777 | union IO_APIC_reg_01 reg_01; | ||
2778 | unsigned long flags; | ||
2779 | |||
2780 | spin_lock_irqsave(&ioapic_lock, flags); | ||
2781 | reg_01.raw = io_apic_read(ioapic, 1); | ||
2782 | spin_unlock_irqrestore(&ioapic_lock, flags); | ||
2783 | |||
2784 | return reg_01.bits.entries; | ||
2785 | } | ||
2786 | |||
2787 | |||
2788 | int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) | ||
2789 | { | ||
2790 | struct IO_APIC_route_entry entry; | ||
2791 | |||
2792 | if (!IO_APIC_IRQ(irq)) { | ||
2793 | printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", | ||
2794 | ioapic); | ||
2795 | return -EINVAL; | ||
2796 | } | ||
2797 | |||
2798 | /* | ||
2799 | * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. | ||
2800 | * Note that we mask (disable) IRQs now -- these get enabled when the | ||
2801 | * corresponding device driver registers for this IRQ. | ||
2802 | */ | ||
2803 | |||
2804 | memset(&entry, 0, sizeof(entry)); | ||
2805 | |||
2806 | entry.delivery_mode = INT_DELIVERY_MODE; | ||
2807 | entry.dest_mode = INT_DEST_MODE; | ||
2808 | entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); | ||
2809 | entry.trigger = edge_level; | ||
2810 | entry.polarity = active_high_low; | ||
2811 | entry.mask = 1; | ||
2812 | |||
2813 | /* | ||
2814 | * IRQs < 16 are already in the irq_2_pin[] map | ||
2815 | */ | ||
2816 | if (irq >= 16) | ||
2817 | add_pin_to_irq(irq, ioapic, pin); | ||
2818 | |||
2819 | entry.vector = assign_irq_vector(irq); | ||
2820 | |||
2821 | apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " | ||
2822 | "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, | ||
2823 | mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, | ||
2824 | edge_level, active_high_low); | ||
2825 | |||
2826 | ioapic_register_intr(irq, entry.vector, edge_level); | ||
2827 | |||
2828 | if (!ioapic && (irq < 16)) | ||
2829 | disable_8259A_irq(irq); | ||
2830 | |||
2831 | ioapic_write_entry(ioapic, pin, entry); | ||
2832 | |||
2833 | return 0; | ||
2834 | } | ||
2835 | |||
2836 | int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) | ||
2837 | { | ||
2838 | int i; | ||
2839 | |||
2840 | if (skip_ioapic_setup) | ||
2841 | return -1; | ||
2842 | |||
2843 | for (i = 0; i < mp_irq_entries; i++) | ||
2844 | if (mp_irqs[i].mp_irqtype == mp_INT && | ||
2845 | mp_irqs[i].mp_srcbusirq == bus_irq) | ||
2846 | break; | ||
2847 | if (i >= mp_irq_entries) | ||
2848 | return -1; | ||
2849 | |||
2850 | *trigger = irq_trigger(i); | ||
2851 | *polarity = irq_polarity(i); | ||
2852 | return 0; | ||
2853 | } | ||
2854 | |||
2855 | #endif /* CONFIG_ACPI */ | ||
2856 | |||
2857 | static int __init parse_disable_timer_pin_1(char *arg) | ||
2858 | { | ||
2859 | disable_timer_pin_1 = 1; | ||
2860 | return 0; | ||
2861 | } | ||
2862 | early_param("disable_timer_pin_1", parse_disable_timer_pin_1); | ||
2863 | |||
2864 | static int __init parse_enable_timer_pin_1(char *arg) | ||
2865 | { | ||
2866 | disable_timer_pin_1 = -1; | ||
2867 | return 0; | ||
2868 | } | ||
2869 | early_param("enable_timer_pin_1", parse_enable_timer_pin_1); | ||
2870 | |||
2871 | static int __init parse_noapic(char *arg) | ||
2872 | { | ||
2873 | /* disable IO-APIC */ | ||
2874 | disable_ioapic_setup(); | ||
2875 | return 0; | ||
2876 | } | ||
2877 | early_param("noapic", parse_noapic); | ||
2878 | |||
2879 | void __init ioapic_init_mappings(void) | ||
2880 | { | ||
2881 | unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; | ||
2882 | int i; | ||
2883 | |||
2884 | for (i = 0; i < nr_ioapics; i++) { | ||
2885 | if (smp_found_config) { | ||
2886 | ioapic_phys = mp_ioapics[i].mp_apicaddr; | ||
2887 | if (!ioapic_phys) { | ||
2888 | printk(KERN_ERR | ||
2889 | "WARNING: bogus zero IO-APIC " | ||
2890 | "address found in MPTABLE, " | ||
2891 | "disabling IO/APIC support!\n"); | ||
2892 | smp_found_config = 0; | ||
2893 | skip_ioapic_setup = 1; | ||
2894 | goto fake_ioapic_page; | ||
2895 | } | ||
2896 | } else { | ||
2897 | fake_ioapic_page: | ||
2898 | ioapic_phys = (unsigned long) | ||
2899 | alloc_bootmem_pages(PAGE_SIZE); | ||
2900 | ioapic_phys = __pa(ioapic_phys); | ||
2901 | } | ||
2902 | set_fixmap_nocache(idx, ioapic_phys); | ||
2903 | printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", | ||
2904 | __fix_to_virt(idx), ioapic_phys); | ||
2905 | idx++; | ||
2906 | } | ||
2907 | } | ||
2908 | |||
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 000000000000..d1d4dc52f649 --- /dev/null +++ b/arch/x86/kernel/irq.c | |||
@@ -0,0 +1,189 @@ | |||
1 | /* | ||
2 | * Common interrupt code for 32 and 64 bit | ||
3 | */ | ||
4 | #include <linux/cpu.h> | ||
5 | #include <linux/interrupt.h> | ||
6 | #include <linux/kernel_stat.h> | ||
7 | #include <linux/seq_file.h> | ||
8 | |||
9 | #include <asm/apic.h> | ||
10 | #include <asm/io_apic.h> | ||
11 | #include <asm/smp.h> | ||
12 | |||
13 | atomic_t irq_err_count; | ||
14 | |||
15 | /* | ||
16 | * 'What should we do if we get a hw irq event on an illegal vector?' | ||
17 | * Each architecture has to answer this itself. | ||
18 | */ | ||
19 | void ack_bad_irq(unsigned int irq) | ||
20 | { | ||
21 | printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); | ||
22 | |||
23 | #ifdef CONFIG_X86_LOCAL_APIC | ||
24 | /* | ||
25 | * Currently unexpected vectors happen only on SMP and APIC. | ||
26 | * We _must_ ack these because every local APIC has only N | ||
27 | * irq slots per priority level, and a 'hanging, unacked' IRQ | ||
28 | * holds up an irq slot - in excessive cases (when multiple | ||
29 | * unexpected vectors occur) that might lock up the APIC | ||
30 | * completely. | ||
31 | * But only ack when the APIC is enabled -AK | ||
32 | */ | ||
33 | if (cpu_has_apic) | ||
34 | ack_APIC_irq(); | ||
35 | #endif | ||
36 | } | ||
37 | |||
38 | #ifdef CONFIG_X86_32 | ||
39 | # define irq_stats(x) (&per_cpu(irq_stat, x)) | ||
40 | #else | ||
41 | # define irq_stats(x) cpu_pda(x) | ||
42 | #endif | ||
43 | /* | ||
44 | * /proc/interrupts printing: | ||
45 | */ | ||
46 | static int show_other_interrupts(struct seq_file *p) | ||
47 | { | ||
48 | int j; | ||
49 | |||
50 | seq_printf(p, "NMI: "); | ||
51 | for_each_online_cpu(j) | ||
52 | seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); | ||
53 | seq_printf(p, " Non-maskable interrupts\n"); | ||
54 | #ifdef CONFIG_X86_LOCAL_APIC | ||
55 | seq_printf(p, "LOC: "); | ||
56 | for_each_online_cpu(j) | ||
57 | seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); | ||
58 | seq_printf(p, " Local timer interrupts\n"); | ||
59 | #endif | ||
60 | #ifdef CONFIG_SMP | ||
61 | seq_printf(p, "RES: "); | ||
62 | for_each_online_cpu(j) | ||
63 | seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); | ||
64 | seq_printf(p, " Rescheduling interrupts\n"); | ||
65 | seq_printf(p, "CAL: "); | ||
66 | for_each_online_cpu(j) | ||
67 | seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); | ||
68 | seq_printf(p, " Function call interrupts\n"); | ||
69 | seq_printf(p, "TLB: "); | ||
70 | for_each_online_cpu(j) | ||
71 | seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); | ||
72 | seq_printf(p, " TLB shootdowns\n"); | ||
73 | #endif | ||
74 | #ifdef CONFIG_X86_MCE | ||
75 | seq_printf(p, "TRM: "); | ||
76 | for_each_online_cpu(j) | ||
77 | seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); | ||
78 | seq_printf(p, " Thermal event interrupts\n"); | ||
79 | # ifdef CONFIG_X86_64 | ||
80 | seq_printf(p, "THR: "); | ||
81 | for_each_online_cpu(j) | ||
82 | seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); | ||
83 | seq_printf(p, " Threshold APIC interrupts\n"); | ||
84 | # endif | ||
85 | #endif | ||
86 | #ifdef CONFIG_X86_LOCAL_APIC | ||
87 | seq_printf(p, "SPU: "); | ||
88 | for_each_online_cpu(j) | ||
89 | seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); | ||
90 | seq_printf(p, " Spurious interrupts\n"); | ||
91 | #endif | ||
92 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
93 | #if defined(CONFIG_X86_IO_APIC) | ||
94 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | ||
95 | #endif | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | int show_interrupts(struct seq_file *p, void *v) | ||
100 | { | ||
101 | unsigned long flags, any_count = 0; | ||
102 | int i = *(loff_t *) v, j; | ||
103 | struct irqaction *action; | ||
104 | struct irq_desc *desc; | ||
105 | |||
106 | if (i > nr_irqs) | ||
107 | return 0; | ||
108 | |||
109 | if (i == nr_irqs) | ||
110 | return show_other_interrupts(p); | ||
111 | |||
112 | /* print header */ | ||
113 | if (i == 0) { | ||
114 | seq_printf(p, " "); | ||
115 | for_each_online_cpu(j) | ||
116 | seq_printf(p, "CPU%-8d", j); | ||
117 | seq_putc(p, '\n'); | ||
118 | } | ||
119 | |||
120 | desc = irq_to_desc(i); | ||
121 | spin_lock_irqsave(&desc->lock, flags); | ||
122 | #ifndef CONFIG_SMP | ||
123 | any_count = kstat_irqs(i); | ||
124 | #else | ||
125 | for_each_online_cpu(j) | ||
126 | any_count |= kstat_irqs_cpu(i, j); | ||
127 | #endif | ||
128 | action = desc->action; | ||
129 | if (!action && !any_count) | ||
130 | goto out; | ||
131 | |||
132 | seq_printf(p, "%3d: ", i); | ||
133 | #ifndef CONFIG_SMP | ||
134 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
135 | #else | ||
136 | for_each_online_cpu(j) | ||
137 | seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); | ||
138 | #endif | ||
139 | seq_printf(p, " %8s", desc->chip->name); | ||
140 | seq_printf(p, "-%-8s", desc->name); | ||
141 | |||
142 | if (action) { | ||
143 | seq_printf(p, " %s", action->name); | ||
144 | while ((action = action->next) != NULL) | ||
145 | seq_printf(p, ", %s", action->name); | ||
146 | } | ||
147 | |||
148 | seq_putc(p, '\n'); | ||
149 | out: | ||
150 | spin_unlock_irqrestore(&desc->lock, flags); | ||
151 | return 0; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * /proc/stat helpers | ||
156 | */ | ||
157 | u64 arch_irq_stat_cpu(unsigned int cpu) | ||
158 | { | ||
159 | u64 sum = irq_stats(cpu)->__nmi_count; | ||
160 | |||
161 | #ifdef CONFIG_X86_LOCAL_APIC | ||
162 | sum += irq_stats(cpu)->apic_timer_irqs; | ||
163 | #endif | ||
164 | #ifdef CONFIG_SMP | ||
165 | sum += irq_stats(cpu)->irq_resched_count; | ||
166 | sum += irq_stats(cpu)->irq_call_count; | ||
167 | sum += irq_stats(cpu)->irq_tlb_count; | ||
168 | #endif | ||
169 | #ifdef CONFIG_X86_MCE | ||
170 | sum += irq_stats(cpu)->irq_thermal_count; | ||
171 | # ifdef CONFIG_X86_64 | ||
172 | sum += irq_stats(cpu)->irq_threshold_count; | ||
173 | #endif | ||
174 | #endif | ||
175 | #ifdef CONFIG_X86_LOCAL_APIC | ||
176 | sum += irq_stats(cpu)->irq_spurious_count; | ||
177 | #endif | ||
178 | return sum; | ||
179 | } | ||
180 | |||
181 | u64 arch_irq_stat(void) | ||
182 | { | ||
183 | u64 sum = atomic_read(&irq_err_count); | ||
184 | |||
185 | #ifdef CONFIG_X86_IO_APIC | ||
186 | sum += atomic_read(&irq_mis_count); | ||
187 | #endif | ||
188 | return sum; | ||
189 | } | ||
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b71e02d42f4f..a51382672de0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); | |||
25 | DEFINE_PER_CPU(struct pt_regs *, irq_regs); | 25 | DEFINE_PER_CPU(struct pt_regs *, irq_regs); |
26 | EXPORT_PER_CPU_SYMBOL(irq_regs); | 26 | EXPORT_PER_CPU_SYMBOL(irq_regs); |
27 | 27 | ||
28 | /* | ||
29 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
30 | * each architecture has to answer this themselves. | ||
31 | */ | ||
32 | void ack_bad_irq(unsigned int irq) | ||
33 | { | ||
34 | printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); | ||
35 | |||
36 | #ifdef CONFIG_X86_LOCAL_APIC | ||
37 | /* | ||
38 | * Currently unexpected vectors happen only on SMP and APIC. | ||
39 | * We _must_ ack these because every local APIC has only N | ||
40 | * irq slots per priority level, and a 'hanging, unacked' IRQ | ||
41 | * holds up an irq slot - in excessive cases (when multiple | ||
42 | * unexpected vectors occur) that might lock up the APIC | ||
43 | * completely. | ||
44 | * But only ack when the APIC is enabled -AK | ||
45 | */ | ||
46 | if (cpu_has_apic) | ||
47 | ack_APIC_irq(); | ||
48 | #endif | ||
49 | } | ||
50 | |||
51 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | 28 | #ifdef CONFIG_DEBUG_STACKOVERFLOW |
52 | /* Debugging check for stack overflow: is there less than 1KB free? */ | 29 | /* Debugging check for stack overflow: is there less than 1KB free? */ |
53 | static int check_stack_overflow(void) | 30 | static int check_stack_overflow(void) |
@@ -223,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs) | |||
223 | { | 200 | { |
224 | struct pt_regs *old_regs; | 201 | struct pt_regs *old_regs; |
225 | /* high bit used in ret_from_ code */ | 202 | /* high bit used in ret_from_ code */ |
226 | int overflow, irq = ~regs->orig_ax; | 203 | int overflow; |
227 | struct irq_desc *desc = irq_desc + irq; | 204 | unsigned vector = ~regs->orig_ax; |
205 | struct irq_desc *desc; | ||
206 | unsigned irq; | ||
228 | 207 | ||
229 | if (unlikely((unsigned)irq >= NR_IRQS)) { | ||
230 | printk(KERN_EMERG "%s: cannot handle IRQ %d\n", | ||
231 | __func__, irq); | ||
232 | BUG(); | ||
233 | } | ||
234 | 208 | ||
235 | old_regs = set_irq_regs(regs); | 209 | old_regs = set_irq_regs(regs); |
236 | irq_enter(); | 210 | irq_enter(); |
211 | irq = __get_cpu_var(vector_irq)[vector]; | ||
237 | 212 | ||
238 | overflow = check_stack_overflow(); | 213 | overflow = check_stack_overflow(); |
239 | 214 | ||
215 | desc = irq_to_desc(irq); | ||
216 | if (unlikely(!desc)) { | ||
217 | printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", | ||
218 | __func__, irq, vector, smp_processor_id()); | ||
219 | BUG(); | ||
220 | } | ||
221 | |||
240 | if (!execute_on_irq_stack(overflow, desc, irq)) { | 222 | if (!execute_on_irq_stack(overflow, desc, irq)) { |
241 | if (unlikely(overflow)) | 223 | if (unlikely(overflow)) |
242 | print_stack_overflow(); | 224 | print_stack_overflow(); |
@@ -248,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs) | |||
248 | return 1; | 230 | return 1; |
249 | } | 231 | } |
250 | 232 | ||
251 | /* | ||
252 | * Interrupt statistics: | ||
253 | */ | ||
254 | |||
255 | atomic_t irq_err_count; | ||
256 | |||
257 | /* | ||
258 | * /proc/interrupts printing: | ||
259 | */ | ||
260 | |||
261 | int show_interrupts(struct seq_file *p, void *v) | ||
262 | { | ||
263 | int i = *(loff_t *) v, j; | ||
264 | struct irqaction * action; | ||
265 | unsigned long flags; | ||
266 | |||
267 | if (i == 0) { | ||
268 | seq_printf(p, " "); | ||
269 | for_each_online_cpu(j) | ||
270 | seq_printf(p, "CPU%-8d",j); | ||
271 | seq_putc(p, '\n'); | ||
272 | } | ||
273 | |||
274 | if (i < NR_IRQS) { | ||
275 | unsigned any_count = 0; | ||
276 | |||
277 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
278 | #ifndef CONFIG_SMP | ||
279 | any_count = kstat_irqs(i); | ||
280 | #else | ||
281 | for_each_online_cpu(j) | ||
282 | any_count |= kstat_cpu(j).irqs[i]; | ||
283 | #endif | ||
284 | action = irq_desc[i].action; | ||
285 | if (!action && !any_count) | ||
286 | goto skip; | ||
287 | seq_printf(p, "%3d: ",i); | ||
288 | #ifndef CONFIG_SMP | ||
289 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
290 | #else | ||
291 | for_each_online_cpu(j) | ||
292 | seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | ||
293 | #endif | ||
294 | seq_printf(p, " %8s", irq_desc[i].chip->name); | ||
295 | seq_printf(p, "-%-8s", irq_desc[i].name); | ||
296 | |||
297 | if (action) { | ||
298 | seq_printf(p, " %s", action->name); | ||
299 | while ((action = action->next) != NULL) | ||
300 | seq_printf(p, ", %s", action->name); | ||
301 | } | ||
302 | |||
303 | seq_putc(p, '\n'); | ||
304 | skip: | ||
305 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
306 | } else if (i == NR_IRQS) { | ||
307 | seq_printf(p, "NMI: "); | ||
308 | for_each_online_cpu(j) | ||
309 | seq_printf(p, "%10u ", nmi_count(j)); | ||
310 | seq_printf(p, " Non-maskable interrupts\n"); | ||
311 | #ifdef CONFIG_X86_LOCAL_APIC | ||
312 | seq_printf(p, "LOC: "); | ||
313 | for_each_online_cpu(j) | ||
314 | seq_printf(p, "%10u ", | ||
315 | per_cpu(irq_stat,j).apic_timer_irqs); | ||
316 | seq_printf(p, " Local timer interrupts\n"); | ||
317 | #endif | ||
318 | #ifdef CONFIG_SMP | ||
319 | seq_printf(p, "RES: "); | ||
320 | for_each_online_cpu(j) | ||
321 | seq_printf(p, "%10u ", | ||
322 | per_cpu(irq_stat,j).irq_resched_count); | ||
323 | seq_printf(p, " Rescheduling interrupts\n"); | ||
324 | seq_printf(p, "CAL: "); | ||
325 | for_each_online_cpu(j) | ||
326 | seq_printf(p, "%10u ", | ||
327 | per_cpu(irq_stat,j).irq_call_count); | ||
328 | seq_printf(p, " Function call interrupts\n"); | ||
329 | seq_printf(p, "TLB: "); | ||
330 | for_each_online_cpu(j) | ||
331 | seq_printf(p, "%10u ", | ||
332 | per_cpu(irq_stat,j).irq_tlb_count); | ||
333 | seq_printf(p, " TLB shootdowns\n"); | ||
334 | #endif | ||
335 | #ifdef CONFIG_X86_MCE | ||
336 | seq_printf(p, "TRM: "); | ||
337 | for_each_online_cpu(j) | ||
338 | seq_printf(p, "%10u ", | ||
339 | per_cpu(irq_stat,j).irq_thermal_count); | ||
340 | seq_printf(p, " Thermal event interrupts\n"); | ||
341 | #endif | ||
342 | #ifdef CONFIG_X86_LOCAL_APIC | ||
343 | seq_printf(p, "SPU: "); | ||
344 | for_each_online_cpu(j) | ||
345 | seq_printf(p, "%10u ", | ||
346 | per_cpu(irq_stat,j).irq_spurious_count); | ||
347 | seq_printf(p, " Spurious interrupts\n"); | ||
348 | #endif | ||
349 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
350 | #if defined(CONFIG_X86_IO_APIC) | ||
351 | seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); | ||
352 | #endif | ||
353 | } | ||
354 | return 0; | ||
355 | } | ||
356 | |||
357 | /* | ||
358 | * /proc/stat helpers | ||
359 | */ | ||
360 | u64 arch_irq_stat_cpu(unsigned int cpu) | ||
361 | { | ||
362 | u64 sum = nmi_count(cpu); | ||
363 | |||
364 | #ifdef CONFIG_X86_LOCAL_APIC | ||
365 | sum += per_cpu(irq_stat, cpu).apic_timer_irqs; | ||
366 | #endif | ||
367 | #ifdef CONFIG_SMP | ||
368 | sum += per_cpu(irq_stat, cpu).irq_resched_count; | ||
369 | sum += per_cpu(irq_stat, cpu).irq_call_count; | ||
370 | sum += per_cpu(irq_stat, cpu).irq_tlb_count; | ||
371 | #endif | ||
372 | #ifdef CONFIG_X86_MCE | ||
373 | sum += per_cpu(irq_stat, cpu).irq_thermal_count; | ||
374 | #endif | ||
375 | #ifdef CONFIG_X86_LOCAL_APIC | ||
376 | sum += per_cpu(irq_stat, cpu).irq_spurious_count; | ||
377 | #endif | ||
378 | return sum; | ||
379 | } | ||
380 | |||
381 | u64 arch_irq_stat(void) | ||
382 | { | ||
383 | u64 sum = atomic_read(&irq_err_count); | ||
384 | |||
385 | #ifdef CONFIG_X86_IO_APIC | ||
386 | sum += atomic_read(&irq_mis_count); | ||
387 | #endif | ||
388 | return sum; | ||
389 | } | ||
390 | |||
391 | #ifdef CONFIG_HOTPLUG_CPU | 233 | #ifdef CONFIG_HOTPLUG_CPU |
392 | #include <mach_apic.h> | 234 | #include <mach_apic.h> |
393 | 235 | ||
@@ -395,20 +237,22 @@ void fixup_irqs(cpumask_t map) | |||
395 | { | 237 | { |
396 | unsigned int irq; | 238 | unsigned int irq; |
397 | static int warned; | 239 | static int warned; |
240 | struct irq_desc *desc; | ||
398 | 241 | ||
399 | for (irq = 0; irq < NR_IRQS; irq++) { | 242 | for_each_irq_desc(irq, desc) { |
400 | cpumask_t mask; | 243 | cpumask_t mask; |
244 | |||
401 | if (irq == 2) | 245 | if (irq == 2) |
402 | continue; | 246 | continue; |
403 | 247 | ||
404 | cpus_and(mask, irq_desc[irq].affinity, map); | 248 | cpus_and(mask, desc->affinity, map); |
405 | if (any_online_cpu(mask) == NR_CPUS) { | 249 | if (any_online_cpu(mask) == NR_CPUS) { |
406 | printk("Breaking affinity for irq %i\n", irq); | 250 | printk("Breaking affinity for irq %i\n", irq); |
407 | mask = map; | 251 | mask = map; |
408 | } | 252 | } |
409 | if (irq_desc[irq].chip->set_affinity) | 253 | if (desc->chip->set_affinity) |
410 | irq_desc[irq].chip->set_affinity(irq, mask); | 254 | desc->chip->set_affinity(irq, mask); |
411 | else if (irq_desc[irq].action && !(warned++)) | 255 | else if (desc->action && !(warned++)) |
412 | printk("Cannot set affinity for irq %i\n", irq); | 256 | printk("Cannot set affinity for irq %i\n", irq); |
413 | } | 257 | } |
414 | 258 | ||
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f065fe9071b9..60eb84eb77a0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c | |||
@@ -18,28 +18,6 @@ | |||
18 | #include <asm/idle.h> | 18 | #include <asm/idle.h> |
19 | #include <asm/smp.h> | 19 | #include <asm/smp.h> |
20 | 20 | ||
21 | atomic_t irq_err_count; | ||
22 | |||
23 | /* | ||
24 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
25 | * each architecture has to answer this themselves. | ||
26 | */ | ||
27 | void ack_bad_irq(unsigned int irq) | ||
28 | { | ||
29 | printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); | ||
30 | /* | ||
31 | * Currently unexpected vectors happen only on SMP and APIC. | ||
32 | * We _must_ ack these because every local APIC has only N | ||
33 | * irq slots per priority level, and a 'hanging, unacked' IRQ | ||
34 | * holds up an irq slot - in excessive cases (when multiple | ||
35 | * unexpected vectors occur) that might lock up the APIC | ||
36 | * completely. | ||
37 | * But don't ack when the APIC is disabled. -AK | ||
38 | */ | ||
39 | if (!disable_apic) | ||
40 | ack_APIC_irq(); | ||
41 | } | ||
42 | |||
43 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | 21 | #ifdef CONFIG_DEBUG_STACKOVERFLOW |
44 | /* | 22 | /* |
45 | * Probabilistic stack overflow check: | 23 | * Probabilistic stack overflow check: |
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs) | |||
65 | #endif | 43 | #endif |
66 | 44 | ||
67 | /* | 45 | /* |
68 | * Generic, controller-independent functions: | ||
69 | */ | ||
70 | |||
71 | int show_interrupts(struct seq_file *p, void *v) | ||
72 | { | ||
73 | int i = *(loff_t *) v, j; | ||
74 | struct irqaction * action; | ||
75 | unsigned long flags; | ||
76 | |||
77 | if (i == 0) { | ||
78 | seq_printf(p, " "); | ||
79 | for_each_online_cpu(j) | ||
80 | seq_printf(p, "CPU%-8d",j); | ||
81 | seq_putc(p, '\n'); | ||
82 | } | ||
83 | |||
84 | if (i < NR_IRQS) { | ||
85 | unsigned any_count = 0; | ||
86 | |||
87 | spin_lock_irqsave(&irq_desc[i].lock, flags); | ||
88 | #ifndef CONFIG_SMP | ||
89 | any_count = kstat_irqs(i); | ||
90 | #else | ||
91 | for_each_online_cpu(j) | ||
92 | any_count |= kstat_cpu(j).irqs[i]; | ||
93 | #endif | ||
94 | action = irq_desc[i].action; | ||
95 | if (!action && !any_count) | ||
96 | goto skip; | ||
97 | seq_printf(p, "%3d: ",i); | ||
98 | #ifndef CONFIG_SMP | ||
99 | seq_printf(p, "%10u ", kstat_irqs(i)); | ||
100 | #else | ||
101 | for_each_online_cpu(j) | ||
102 | seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); | ||
103 | #endif | ||
104 | seq_printf(p, " %8s", irq_desc[i].chip->name); | ||
105 | seq_printf(p, "-%-8s", irq_desc[i].name); | ||
106 | |||
107 | if (action) { | ||
108 | seq_printf(p, " %s", action->name); | ||
109 | while ((action = action->next) != NULL) | ||
110 | seq_printf(p, ", %s", action->name); | ||
111 | } | ||
112 | seq_putc(p, '\n'); | ||
113 | skip: | ||
114 | spin_unlock_irqrestore(&irq_desc[i].lock, flags); | ||
115 | } else if (i == NR_IRQS) { | ||
116 | seq_printf(p, "NMI: "); | ||
117 | for_each_online_cpu(j) | ||
118 | seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); | ||
119 | seq_printf(p, " Non-maskable interrupts\n"); | ||
120 | seq_printf(p, "LOC: "); | ||
121 | for_each_online_cpu(j) | ||
122 | seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); | ||
123 | seq_printf(p, " Local timer interrupts\n"); | ||
124 | #ifdef CONFIG_SMP | ||
125 | seq_printf(p, "RES: "); | ||
126 | for_each_online_cpu(j) | ||
127 | seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); | ||
128 | seq_printf(p, " Rescheduling interrupts\n"); | ||
129 | seq_printf(p, "CAL: "); | ||
130 | for_each_online_cpu(j) | ||
131 | seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); | ||
132 | seq_printf(p, " Function call interrupts\n"); | ||
133 | seq_printf(p, "TLB: "); | ||
134 | for_each_online_cpu(j) | ||
135 | seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); | ||
136 | seq_printf(p, " TLB shootdowns\n"); | ||
137 | #endif | ||
138 | #ifdef CONFIG_X86_MCE | ||
139 | seq_printf(p, "TRM: "); | ||
140 | for_each_online_cpu(j) | ||
141 | seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); | ||
142 | seq_printf(p, " Thermal event interrupts\n"); | ||
143 | seq_printf(p, "THR: "); | ||
144 | for_each_online_cpu(j) | ||
145 | seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); | ||
146 | seq_printf(p, " Threshold APIC interrupts\n"); | ||
147 | #endif | ||
148 | seq_printf(p, "SPU: "); | ||
149 | for_each_online_cpu(j) | ||
150 | seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); | ||
151 | seq_printf(p, " Spurious interrupts\n"); | ||
152 | seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); | ||
153 | } | ||
154 | return 0; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * /proc/stat helpers | ||
159 | */ | ||
160 | u64 arch_irq_stat_cpu(unsigned int cpu) | ||
161 | { | ||
162 | u64 sum = cpu_pda(cpu)->__nmi_count; | ||
163 | |||
164 | sum += cpu_pda(cpu)->apic_timer_irqs; | ||
165 | #ifdef CONFIG_SMP | ||
166 | sum += cpu_pda(cpu)->irq_resched_count; | ||
167 | sum += cpu_pda(cpu)->irq_call_count; | ||
168 | sum += cpu_pda(cpu)->irq_tlb_count; | ||
169 | #endif | ||
170 | #ifdef CONFIG_X86_MCE | ||
171 | sum += cpu_pda(cpu)->irq_thermal_count; | ||
172 | sum += cpu_pda(cpu)->irq_threshold_count; | ||
173 | #endif | ||
174 | sum += cpu_pda(cpu)->irq_spurious_count; | ||
175 | return sum; | ||
176 | } | ||
177 | |||
178 | u64 arch_irq_stat(void) | ||
179 | { | ||
180 | return atomic_read(&irq_err_count); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * do_IRQ handles all normal device IRQ's (the special | 46 | * do_IRQ handles all normal device IRQ's (the special |
185 | * SMP cross-CPU interrupts have their own specific | 47 | * SMP cross-CPU interrupts have their own specific |
186 | * handlers). | 48 | * handlers). |
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void) | |||
188 | asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | 50 | asmlinkage unsigned int do_IRQ(struct pt_regs *regs) |
189 | { | 51 | { |
190 | struct pt_regs *old_regs = set_irq_regs(regs); | 52 | struct pt_regs *old_regs = set_irq_regs(regs); |
53 | struct irq_desc *desc; | ||
191 | 54 | ||
192 | /* high bit used in ret_from_ code */ | 55 | /* high bit used in ret_from_ code */ |
193 | unsigned vector = ~regs->orig_ax; | 56 | unsigned vector = ~regs->orig_ax; |
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) | |||
201 | stack_overflow_check(regs); | 64 | stack_overflow_check(regs); |
202 | #endif | 65 | #endif |
203 | 66 | ||
204 | if (likely(irq < NR_IRQS)) | 67 | desc = irq_to_desc(irq); |
205 | generic_handle_irq(irq); | 68 | if (likely(desc)) |
69 | generic_handle_irq_desc(irq, desc); | ||
206 | else { | 70 | else { |
207 | if (!disable_apic) | 71 | if (!disable_apic) |
208 | ack_APIC_irq(); | 72 | ack_APIC_irq(); |
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map) | |||
223 | { | 87 | { |
224 | unsigned int irq; | 88 | unsigned int irq; |
225 | static int warned; | 89 | static int warned; |
90 | struct irq_desc *desc; | ||
226 | 91 | ||
227 | for (irq = 0; irq < NR_IRQS; irq++) { | 92 | for_each_irq_desc(irq, desc) { |
228 | cpumask_t mask; | 93 | cpumask_t mask; |
229 | int break_affinity = 0; | 94 | int break_affinity = 0; |
230 | int set_affinity = 1; | 95 | int set_affinity = 1; |
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map) | |||
233 | continue; | 98 | continue; |
234 | 99 | ||
235 | /* interrupts are disabled at this point */ | 100 | /* interrupts are disabled at this point */ |
236 | spin_lock(&irq_desc[irq].lock); | 101 | spin_lock(&desc->lock); |
237 | 102 | ||
238 | if (!irq_has_action(irq) || | 103 | if (!irq_has_action(irq) || |
239 | cpus_equal(irq_desc[irq].affinity, map)) { | 104 | cpus_equal(desc->affinity, map)) { |
240 | spin_unlock(&irq_desc[irq].lock); | 105 | spin_unlock(&desc->lock); |
241 | continue; | 106 | continue; |
242 | } | 107 | } |
243 | 108 | ||
244 | cpus_and(mask, irq_desc[irq].affinity, map); | 109 | cpus_and(mask, desc->affinity, map); |
245 | if (cpus_empty(mask)) { | 110 | if (cpus_empty(mask)) { |
246 | break_affinity = 1; | 111 | break_affinity = 1; |
247 | mask = map; | 112 | mask = map; |
248 | } | 113 | } |
249 | 114 | ||
250 | if (irq_desc[irq].chip->mask) | 115 | if (desc->chip->mask) |
251 | irq_desc[irq].chip->mask(irq); | 116 | desc->chip->mask(irq); |
252 | 117 | ||
253 | if (irq_desc[irq].chip->set_affinity) | 118 | if (desc->chip->set_affinity) |
254 | irq_desc[irq].chip->set_affinity(irq, mask); | 119 | desc->chip->set_affinity(irq, mask); |
255 | else if (!(warned++)) | 120 | else if (!(warned++)) |
256 | set_affinity = 0; | 121 | set_affinity = 0; |
257 | 122 | ||
258 | if (irq_desc[irq].chip->unmask) | 123 | if (desc->chip->unmask) |
259 | irq_desc[irq].chip->unmask(irq); | 124 | desc->chip->unmask(irq); |
260 | 125 | ||
261 | spin_unlock(&irq_desc[irq].lock); | 126 | spin_unlock(&desc->lock); |
262 | 127 | ||
263 | if (break_affinity && set_affinity) | 128 | if (break_affinity && set_affinity) |
264 | printk("Broke affinity for irq %i\n", irq); | 129 | printk("Broke affinity for irq %i\n", irq); |
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9200a1e2752d..845aa9803e80 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c | |||
@@ -69,6 +69,13 @@ void __init init_ISA_irqs (void) | |||
69 | * 16 old-style INTA-cycle interrupts: | 69 | * 16 old-style INTA-cycle interrupts: |
70 | */ | 70 | */ |
71 | for (i = 0; i < 16; i++) { | 71 | for (i = 0; i < 16; i++) { |
72 | /* first use of this irq_desc */ | ||
73 | struct irq_desc *desc = irq_to_desc(i); | ||
74 | |||
75 | desc->status = IRQ_DISABLED; | ||
76 | desc->action = NULL; | ||
77 | desc->depth = 1; | ||
78 | |||
72 | set_irq_chip_and_handler_name(i, &i8259A_chip, | 79 | set_irq_chip_and_handler_name(i, &i8259A_chip, |
73 | handle_level_irq, "XT"); | 80 | handle_level_irq, "XT"); |
74 | } | 81 | } |
@@ -83,6 +90,27 @@ static struct irqaction irq2 = { | |||
83 | .name = "cascade", | 90 | .name = "cascade", |
84 | }; | 91 | }; |
85 | 92 | ||
93 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
94 | [0 ... IRQ0_VECTOR - 1] = -1, | ||
95 | [IRQ0_VECTOR] = 0, | ||
96 | [IRQ1_VECTOR] = 1, | ||
97 | [IRQ2_VECTOR] = 2, | ||
98 | [IRQ3_VECTOR] = 3, | ||
99 | [IRQ4_VECTOR] = 4, | ||
100 | [IRQ5_VECTOR] = 5, | ||
101 | [IRQ6_VECTOR] = 6, | ||
102 | [IRQ7_VECTOR] = 7, | ||
103 | [IRQ8_VECTOR] = 8, | ||
104 | [IRQ9_VECTOR] = 9, | ||
105 | [IRQ10_VECTOR] = 10, | ||
106 | [IRQ11_VECTOR] = 11, | ||
107 | [IRQ12_VECTOR] = 12, | ||
108 | [IRQ13_VECTOR] = 13, | ||
109 | [IRQ14_VECTOR] = 14, | ||
110 | [IRQ15_VECTOR] = 15, | ||
111 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | ||
112 | }; | ||
113 | |||
86 | /* Overridden in paravirt.c */ | 114 | /* Overridden in paravirt.c */ |
87 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | 115 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); |
88 | 116 | ||
@@ -98,22 +126,14 @@ void __init native_init_IRQ(void) | |||
98 | * us. (some of these will be overridden and become | 126 | * us. (some of these will be overridden and become |
99 | * 'special' SMP interrupts) | 127 | * 'special' SMP interrupts) |
100 | */ | 128 | */ |
101 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | 129 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
102 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
103 | if (i >= NR_IRQS) | ||
104 | break; | ||
105 | /* SYSCALL_VECTOR was reserved in trap_init. */ | 130 | /* SYSCALL_VECTOR was reserved in trap_init. */ |
106 | if (!test_bit(vector, used_vectors)) | 131 | if (i != SYSCALL_VECTOR) |
107 | set_intr_gate(vector, interrupt[i]); | 132 | set_intr_gate(i, interrupt[i]); |
108 | } | 133 | } |
109 | 134 | ||
110 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) | ||
111 | /* | ||
112 | * IRQ0 must be given a fixed assignment and initialized, | ||
113 | * because it's used before the IO-APIC is set up. | ||
114 | */ | ||
115 | set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); | ||
116 | 135 | ||
136 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) | ||
117 | /* | 137 | /* |
118 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | 138 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper |
119 | * IPI, driven by wakeup. | 139 | * IPI, driven by wakeup. |
@@ -128,6 +148,9 @@ void __init native_init_IRQ(void) | |||
128 | 148 | ||
129 | /* IPI for single call function */ | 149 | /* IPI for single call function */ |
130 | set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); | 150 | set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); |
151 | |||
152 | /* Low priority IPI to clean up after moving an irq */ | ||
153 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | ||
131 | #endif | 154 | #endif |
132 | 155 | ||
133 | #ifdef CONFIG_X86_LOCAL_APIC | 156 | #ifdef CONFIG_X86_LOCAL_APIC |
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1f26fd9ec4f4..ff0235391285 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c | |||
@@ -135,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | |||
135 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 | 135 | [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 |
136 | }; | 136 | }; |
137 | 137 | ||
138 | static void __init init_ISA_irqs (void) | 138 | void __init init_ISA_irqs(void) |
139 | { | 139 | { |
140 | int i; | 140 | int i; |
141 | 141 | ||
142 | init_bsp_APIC(); | 142 | init_bsp_APIC(); |
143 | init_8259A(0); | 143 | init_8259A(0); |
144 | 144 | ||
145 | for (i = 0; i < NR_IRQS; i++) { | 145 | for (i = 0; i < 16; i++) { |
146 | irq_desc[i].status = IRQ_DISABLED; | 146 | /* first use of this irq_desc */ |
147 | irq_desc[i].action = NULL; | 147 | struct irq_desc *desc = irq_to_desc(i); |
148 | irq_desc[i].depth = 1; | ||
149 | 148 | ||
150 | if (i < 16) { | 149 | desc->status = IRQ_DISABLED; |
151 | /* | 150 | desc->action = NULL; |
152 | * 16 old-style INTA-cycle interrupts: | 151 | desc->depth = 1; |
153 | */ | 152 | |
154 | set_irq_chip_and_handler_name(i, &i8259A_chip, | 153 | /* |
154 | * 16 old-style INTA-cycle interrupts: | ||
155 | */ | ||
156 | set_irq_chip_and_handler_name(i, &i8259A_chip, | ||
155 | handle_level_irq, "XT"); | 157 | handle_level_irq, "XT"); |
156 | } else { | ||
157 | /* | ||
158 | * 'high' PCI IRQs filled in on demand | ||
159 | */ | ||
160 | irq_desc[i].chip = &no_irq_chip; | ||
161 | } | ||
162 | } | 158 | } |
163 | } | 159 | } |
164 | 160 | ||
165 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); | 161 | void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); |
166 | 162 | ||
167 | void __init native_init_IRQ(void) | 163 | static void __init smp_intr_init(void) |
168 | { | 164 | { |
169 | int i; | ||
170 | |||
171 | init_ISA_irqs(); | ||
172 | /* | ||
173 | * Cover the whole vector space, no vector can escape | ||
174 | * us. (some of these will be overridden and become | ||
175 | * 'special' SMP interrupts) | ||
176 | */ | ||
177 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
178 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
179 | if (vector != IA32_SYSCALL_VECTOR) | ||
180 | set_intr_gate(vector, interrupt[i]); | ||
181 | } | ||
182 | |||
183 | #ifdef CONFIG_SMP | 165 | #ifdef CONFIG_SMP |
184 | /* | 166 | /* |
185 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper | 167 | * The reschedule interrupt is a CPU-to-CPU reschedule-helper |
@@ -207,6 +189,12 @@ void __init native_init_IRQ(void) | |||
207 | /* Low priority IPI to clean up after moving an irq */ | 189 | /* Low priority IPI to clean up after moving an irq */ |
208 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); | 190 | set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); |
209 | #endif | 191 | #endif |
192 | } | ||
193 | |||
194 | static void __init apic_intr_init(void) | ||
195 | { | ||
196 | smp_intr_init(); | ||
197 | |||
210 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); | 198 | alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); |
211 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); | 199 | alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); |
212 | 200 | ||
@@ -216,6 +204,25 @@ void __init native_init_IRQ(void) | |||
216 | /* IPI vectors for APIC spurious and error interrupts */ | 204 | /* IPI vectors for APIC spurious and error interrupts */ |
217 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); | 205 | alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); |
218 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); | 206 | alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); |
207 | } | ||
208 | |||
209 | void __init native_init_IRQ(void) | ||
210 | { | ||
211 | int i; | ||
212 | |||
213 | init_ISA_irqs(); | ||
214 | /* | ||
215 | * Cover the whole vector space, no vector can escape | ||
216 | * us. (some of these will be overridden and become | ||
217 | * 'special' SMP interrupts) | ||
218 | */ | ||
219 | for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { | ||
220 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
221 | if (vector != IA32_SYSCALL_VECTOR) | ||
222 | set_intr_gate(vector, interrupt[i]); | ||
223 | } | ||
224 | |||
225 | apic_intr_init(); | ||
219 | 226 | ||
220 | if (!acpi_ioapic) | 227 | if (!acpi_ioapic) |
221 | setup_irq(2, &irq2); | 228 | setup_irq(2, &irq2); |
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def06ca91..774ac4991568 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) | |||
78 | return ret; | 78 | return ret; |
79 | } | 79 | } |
80 | 80 | ||
81 | /* | ||
82 | * If we don't do this, there is the possibility that the guest | ||
83 | * will calibrate under heavy load - thus getting a lower lpj - | ||
84 | * and execute the delays themselves without load. This is wrong, | ||
85 | * because no delay loop can finish beforehand. | ||
86 | * Any heuristic is bound to fail, because ultimately a large | ||
87 | * pool of guests can be running and disturb each other. So we preset | ||
88 | * lpj here. | ||
89 | */ | ||
90 | static unsigned long kvm_get_tsc_khz(void) | ||
91 | { | ||
92 | return preset_lpj; | ||
93 | } | ||
94 | |||
95 | static void kvm_get_preset_lpj(void) | ||
96 | { | ||
97 | struct pvclock_vcpu_time_info *src; | ||
98 | unsigned long khz; | ||
99 | u64 lpj; | ||
100 | |||
101 | src = &per_cpu(hv_clock, 0); | ||
102 | khz = pvclock_tsc_khz(src); | ||
103 | |||
104 | lpj = ((u64)khz * 1000); | ||
105 | do_div(lpj, HZ); | ||
106 | preset_lpj = lpj; | ||
107 | } | ||
108 | |||
81 | static struct clocksource kvm_clock = { | 109 | static struct clocksource kvm_clock = { |
82 | .name = "kvm-clock", | 110 | .name = "kvm-clock", |
83 | .read = kvm_clock_read, | 111 | .read = kvm_clock_read, |
@@ -153,6 +181,7 @@ void __init kvmclock_init(void) | |||
153 | pv_time_ops.get_wallclock = kvm_get_wallclock; | 181 | pv_time_ops.get_wallclock = kvm_get_wallclock; |
154 | pv_time_ops.set_wallclock = kvm_set_wallclock; | 182 | pv_time_ops.set_wallclock = kvm_set_wallclock; |
155 | pv_time_ops.sched_clock = kvm_clock_read; | 183 | pv_time_ops.sched_clock = kvm_clock_read; |
184 | pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; | ||
156 | #ifdef CONFIG_X86_LOCAL_APIC | 185 | #ifdef CONFIG_X86_LOCAL_APIC |
157 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | 186 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; |
158 | #endif | 187 | #endif |
@@ -163,6 +192,7 @@ void __init kvmclock_init(void) | |||
163 | #ifdef CONFIG_KEXEC | 192 | #ifdef CONFIG_KEXEC |
164 | machine_ops.crash_shutdown = kvm_crash_shutdown; | 193 | machine_ops.crash_shutdown = kvm_crash_shutdown; |
165 | #endif | 194 | #endif |
195 | kvm_get_preset_lpj(); | ||
166 | clocksource_register(&kvm_clock); | 196 | clocksource_register(&kvm_clock); |
167 | } | 197 | } |
168 | } | 198 | } |
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 2e2af5d18191..82a7c7ed6d45 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c | |||
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) | |||
163 | { | 163 | { |
164 | struct device *dev; | 164 | struct device *dev; |
165 | 165 | ||
166 | dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), | 166 | dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, |
167 | NULL, "msr%d", cpu); | 167 | "msr%d", cpu); |
168 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; | 168 | return IS_ERR(dev) ? PTR_ERR(dev) : 0; |
169 | } | 169 | } |
170 | 170 | ||
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 080d1d27f37a..e1e731d78f38 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c | |||
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, | |||
217 | 217 | ||
218 | #endif /* CONFIG_IOMMU_DEBUG */ | 218 | #endif /* CONFIG_IOMMU_DEBUG */ |
219 | 219 | ||
220 | static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) | ||
221 | { | ||
222 | unsigned int npages; | ||
223 | |||
224 | npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); | ||
225 | npages >>= PAGE_SHIFT; | ||
226 | |||
227 | return npages; | ||
228 | } | ||
229 | |||
230 | static inline int translation_enabled(struct iommu_table *tbl) | 220 | static inline int translation_enabled(struct iommu_table *tbl) |
231 | { | 221 | { |
232 | /* only PHBs with translation enabled have an IOMMU table */ | 222 | /* only PHBs with translation enabled have an IOMMU table */ |
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev, | |||
408 | if (dmalen == 0) | 398 | if (dmalen == 0) |
409 | break; | 399 | break; |
410 | 400 | ||
411 | npages = num_dma_pages(dma, dmalen); | 401 | npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); |
412 | iommu_free(tbl, dma, npages); | 402 | iommu_free(tbl, dma, npages); |
413 | } | 403 | } |
414 | } | 404 | } |
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, | |||
427 | BUG_ON(!sg_page(s)); | 417 | BUG_ON(!sg_page(s)); |
428 | 418 | ||
429 | vaddr = (unsigned long) sg_virt(s); | 419 | vaddr = (unsigned long) sg_virt(s); |
430 | npages = num_dma_pages(vaddr, s->length); | 420 | npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); |
431 | 421 | ||
432 | entry = iommu_range_alloc(dev, tbl, npages); | 422 | entry = iommu_range_alloc(dev, tbl, npages); |
433 | if (entry == bad_dma_address) { | 423 | if (entry == bad_dma_address) { |
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, | |||
464 | struct iommu_table *tbl = find_iommu_table(dev); | 454 | struct iommu_table *tbl = find_iommu_table(dev); |
465 | 455 | ||
466 | uaddr = (unsigned long)vaddr; | 456 | uaddr = (unsigned long)vaddr; |
467 | npages = num_dma_pages(uaddr, size); | 457 | npages = iommu_num_pages(uaddr, size, PAGE_SIZE); |
468 | 458 | ||
469 | return iommu_alloc(dev, tbl, vaddr, npages, direction); | 459 | return iommu_alloc(dev, tbl, vaddr, npages, direction); |
470 | } | 460 | } |
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, | |||
475 | struct iommu_table *tbl = find_iommu_table(dev); | 465 | struct iommu_table *tbl = find_iommu_table(dev); |
476 | unsigned int npages; | 466 | unsigned int npages; |
477 | 467 | ||
478 | npages = num_dma_pages(dma_handle, size); | 468 | npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); |
479 | iommu_free(tbl, dma_handle, npages); | 469 | iommu_free(tbl, dma_handle, npages); |
480 | } | 470 | } |
481 | 471 | ||
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0a3824e837b4..1972266e8ba5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c | |||
@@ -9,8 +9,6 @@ | |||
9 | #include <asm/calgary.h> | 9 | #include <asm/calgary.h> |
10 | #include <asm/amd_iommu.h> | 10 | #include <asm/amd_iommu.h> |
11 | 11 | ||
12 | static int forbid_dac __read_mostly; | ||
13 | |||
14 | struct dma_mapping_ops *dma_ops; | 12 | struct dma_mapping_ops *dma_ops; |
15 | EXPORT_SYMBOL(dma_ops); | 13 | EXPORT_SYMBOL(dma_ops); |
16 | 14 | ||
@@ -125,13 +123,13 @@ void __init pci_iommu_alloc(void) | |||
125 | pci_swiotlb_init(); | 123 | pci_swiotlb_init(); |
126 | } | 124 | } |
127 | 125 | ||
128 | unsigned long iommu_num_pages(unsigned long addr, unsigned long len) | 126 | unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) |
129 | { | 127 | { |
130 | unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); | 128 | unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); |
131 | 129 | ||
132 | return size >> PAGE_SHIFT; | 130 | return size >> PAGE_SHIFT; |
133 | } | 131 | } |
134 | EXPORT_SYMBOL(iommu_num_pages); | 132 | EXPORT_SYMBOL(iommu_nr_pages); |
135 | #endif | 133 | #endif |
136 | 134 | ||
137 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, | 135 | void *dma_generic_alloc_coherent(struct device *dev, size_t size, |
@@ -293,17 +291,3 @@ void pci_iommu_shutdown(void) | |||
293 | } | 291 | } |
294 | /* Must execute after PCI subsystem */ | 292 | /* Must execute after PCI subsystem */ |
295 | fs_initcall(pci_iommu_init); | 293 | fs_initcall(pci_iommu_init); |
296 | |||
297 | #ifdef CONFIG_PCI | ||
298 | /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ | ||
299 | |||
300 | static __devinit void via_no_dac(struct pci_dev *dev) | ||
301 | { | ||
302 | if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { | ||
303 | printk(KERN_INFO "PCI: VIA PCI bridge detected." | ||
304 | "Disabling DAC.\n"); | ||
305 | forbid_dac = 1; | ||
306 | } | ||
307 | } | ||
308 | DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); | ||
309 | #endif | ||
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 145f1c83369f..e3f75bbcedea 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c | |||
@@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) | |||
231 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, | 231 | static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, |
232 | size_t size, int dir, unsigned long align_mask) | 232 | size_t size, int dir, unsigned long align_mask) |
233 | { | 233 | { |
234 | unsigned long npages = iommu_num_pages(phys_mem, size); | 234 | unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); |
235 | unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); | 235 | unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); |
236 | int i; | 236 | int i; |
237 | 237 | ||
@@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, | |||
285 | return; | 285 | return; |
286 | 286 | ||
287 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; | 287 | iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; |
288 | npages = iommu_num_pages(dma_addr, size); | 288 | npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); |
289 | for (i = 0; i < npages; i++) { | 289 | for (i = 0; i < npages; i++) { |
290 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; | 290 | iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; |
291 | CLEAR_LEAK(iommu_page + i); | 291 | CLEAR_LEAK(iommu_page + i); |
@@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, | |||
368 | } | 368 | } |
369 | 369 | ||
370 | addr = phys_addr; | 370 | addr = phys_addr; |
371 | pages = iommu_num_pages(s->offset, s->length); | 371 | pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
372 | while (pages--) { | 372 | while (pages--) { |
373 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); | 373 | iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); |
374 | SET_LEAK(iommu_page); | 374 | SET_LEAK(iommu_page); |
@@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) | |||
451 | 451 | ||
452 | seg_size += s->length; | 452 | seg_size += s->length; |
453 | need = nextneed; | 453 | need = nextneed; |
454 | pages += iommu_num_pages(s->offset, s->length); | 454 | pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); |
455 | ps = s; | 455 | ps = s; |
456 | } | 456 | } |
457 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) | 457 | if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 922c14058f97..0a1302fe6d45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -123,7 +123,7 @@ void cpu_idle(void) | |||
123 | } | 123 | } |
124 | } | 124 | } |
125 | 125 | ||
126 | void __show_registers(struct pt_regs *regs, int all) | 126 | void __show_regs(struct pt_regs *regs, int all) |
127 | { | 127 | { |
128 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | 128 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; |
129 | unsigned long d0, d1, d2, d3, d6, d7; | 129 | unsigned long d0, d1, d2, d3, d6, d7; |
@@ -189,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all) | |||
189 | 189 | ||
190 | void show_regs(struct pt_regs *regs) | 190 | void show_regs(struct pt_regs *regs) |
191 | { | 191 | { |
192 | __show_registers(regs, 1); | 192 | __show_regs(regs, 1); |
193 | show_trace(NULL, regs, ®s->sp, regs->bp); | 193 | show_trace(NULL, regs, ®s->sp, regs->bp); |
194 | } | 194 | } |
195 | 195 | ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca80394ef5b8..cd8c0ed02b7e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -136,7 +136,7 @@ void cpu_idle(void) | |||
136 | } | 136 | } |
137 | 137 | ||
138 | /* Prints also some state that isn't saved in the pt_regs */ | 138 | /* Prints also some state that isn't saved in the pt_regs */ |
139 | void __show_regs(struct pt_regs *regs) | 139 | void __show_regs(struct pt_regs *regs, int all) |
140 | { | 140 | { |
141 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; | 141 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; |
142 | unsigned long d0, d1, d2, d3, d6, d7; | 142 | unsigned long d0, d1, d2, d3, d6, d7; |
@@ -175,6 +175,9 @@ void __show_regs(struct pt_regs *regs) | |||
175 | rdmsrl(MSR_GS_BASE, gs); | 175 | rdmsrl(MSR_GS_BASE, gs); |
176 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); | 176 | rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); |
177 | 177 | ||
178 | if (!all) | ||
179 | return; | ||
180 | |||
178 | cr0 = read_cr0(); | 181 | cr0 = read_cr0(); |
179 | cr2 = read_cr2(); | 182 | cr2 = read_cr2(); |
180 | cr3 = read_cr3(); | 183 | cr3 = read_cr3(); |
@@ -200,7 +203,7 @@ void __show_regs(struct pt_regs *regs) | |||
200 | void show_regs(struct pt_regs *regs) | 203 | void show_regs(struct pt_regs *regs) |
201 | { | 204 | { |
202 | printk(KERN_INFO "CPU %d:", smp_processor_id()); | 205 | printk(KERN_INFO "CPU %d:", smp_processor_id()); |
203 | __show_regs(regs); | 206 | __show_regs(regs, 1); |
204 | show_trace(NULL, regs, (void *)(regs + 1), regs->bp); | 207 | show_trace(NULL, regs, (void *)(regs + 1), regs->bp); |
205 | } | 208 | } |
206 | 209 | ||
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325a..4f9c55f3a7c0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c | |||
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, | |||
97 | return dst->version; | 97 | return dst->version; |
98 | } | 98 | } |
99 | 99 | ||
100 | unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) | ||
101 | { | ||
102 | u64 pv_tsc_khz = 1000000ULL << 32; | ||
103 | |||
104 | do_div(pv_tsc_khz, src->tsc_to_system_mul); | ||
105 | if (src->tsc_shift < 0) | ||
106 | pv_tsc_khz <<= -src->tsc_shift; | ||
107 | else | ||
108 | pv_tsc_khz >>= src->tsc_shift; | ||
109 | return pv_tsc_khz; | ||
110 | } | ||
111 | |||
100 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) | 112 | cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) |
101 | { | 113 | { |
102 | struct pvclock_shadow_time shadow; | 114 | struct pvclock_shadow_time shadow; |
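The new pvclock_tsc_khz() inverts the guest's clock scaling parameters: pvclock turns a TSC delta into nanoseconds by shifting it by tsc_shift and multiplying with tsc_to_system_mul as a 32.32 fixed-point factor, so the TSC frequency in kHz falls out of (10^6 << 32) / tsc_to_system_mul with the shift undone in the opposite direction. A plain-C rendering of that arithmetic for illustration (a userspace stand-in, not the kernel function):

    #include <stdio.h>
    #include <stdint.h>

    /* Recover the TSC frequency (kHz) from pvclock scaling parameters.
     * mul is the 32.32 fixed-point "nanoseconds per (shifted) TSC tick" factor. */
    static uint64_t tsc_khz_from_pvclock(uint32_t mul, int8_t shift)
    {
            uint64_t khz = 1000000ULL << 32;

            khz /= mul;                 /* invert the ns-per-tick factor */
            if (shift < 0)
                    khz <<= -shift;     /* TSC delta was shifted right, so scale up */
            else
                    khz >>= shift;      /* TSC delta was shifted left, so scale down */
            return khz;
    }

    int main(void)
    {
            /* A 2 GHz TSC: 0.5 ns per tick -> mul = 0.5 * 2^32 = 0x80000000, shift = 0 */
            printf("%llu kHz\n",
                   (unsigned long long)tsc_khz_from_pvclock(0x80000000u, 0));
            return 0;
    }
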
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d13858818100..67465ed89310 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) | |||
35 | if (!(word & (1 << 13))) { | 35 | if (!(word & (1 << 13))) { |
36 | dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " | 36 | dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " |
37 | "disabling irq balancing and affinity\n"); | 37 | "disabling irq balancing and affinity\n"); |
38 | #ifdef CONFIG_IRQBALANCE | ||
39 | irqbalance_disable(""); | ||
40 | #endif | ||
41 | noirqdebug_setup(""); | 38 | noirqdebug_setup(""); |
42 | #ifdef CONFIG_PROC_FS | 39 | #ifdef CONFIG_PROC_FS |
43 | no_irq_affinity = 1; | 40 | no_irq_affinity = 1; |
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void) | |||
354 | printk(KERN_DEBUG "Force enabled HPET at resume\n"); | 351 | printk(KERN_DEBUG "Force enabled HPET at resume\n"); |
355 | } | 352 | } |
356 | 353 | ||
354 | static u32 ati_ixp4x0_rev(struct pci_dev *dev) | ||
355 | { | ||
356 | u32 d; | ||
357 | u8 b; | ||
358 | |||
359 | pci_read_config_byte(dev, 0xac, &b); | ||
360 | b &= ~(1<<5); | ||
361 | pci_write_config_byte(dev, 0xac, b); | ||
362 | pci_read_config_dword(dev, 0x70, &d); | ||
363 | d |= 1<<8; | ||
364 | pci_write_config_dword(dev, 0x70, d); | ||
365 | pci_read_config_dword(dev, 0x8, &d); | ||
366 | d &= 0xff; | ||
367 | dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); | ||
368 | return d; | ||
369 | } | ||
370 | |||
357 | static void ati_force_enable_hpet(struct pci_dev *dev) | 371 | static void ati_force_enable_hpet(struct pci_dev *dev) |
358 | { | 372 | { |
359 | u32 uninitialized_var(val); | 373 | u32 d, val; |
374 | u8 b; | ||
360 | 375 | ||
361 | if (hpet_address || force_hpet_address) | 376 | if (hpet_address || force_hpet_address) |
362 | return; | 377 | return; |
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev) | |||
366 | return; | 381 | return; |
367 | } | 382 | } |
368 | 383 | ||
384 | d = ati_ixp4x0_rev(dev); | ||
385 | if (d < 0x82) | ||
386 | return; | ||
387 | |||
388 | /* base address */ | ||
369 | pci_write_config_dword(dev, 0x14, 0xfed00000); | 389 | pci_write_config_dword(dev, 0x14, 0xfed00000); |
370 | pci_read_config_dword(dev, 0x14, &val); | 390 | pci_read_config_dword(dev, 0x14, &val); |
391 | |||
392 | /* enable interrupt */ | ||
393 | outb(0x72, 0xcd6); b = inb(0xcd7); | ||
394 | b |= 0x1; | ||
395 | outb(0x72, 0xcd6); outb(b, 0xcd7); | ||
396 | outb(0x72, 0xcd6); b = inb(0xcd7); | ||
397 | if (!(b & 0x1)) | ||
398 | return; | ||
399 | pci_read_config_dword(dev, 0x64, &d); | ||
400 | d |= (1<<10); | ||
401 | pci_write_config_dword(dev, 0x64, d); | ||
402 | pci_read_config_dword(dev, 0x64, &d); | ||
403 | if (!(d & (1<<10))) | ||
404 | return; | ||
405 | |||
371 | force_hpet_address = val; | 406 | force_hpet_address = val; |
372 | force_hpet_resume_type = ATI_FORCE_HPET_RESUME; | 407 | force_hpet_resume_type = ATI_FORCE_HPET_RESUME; |
373 | dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", | 408 | dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", |
374 | force_hpet_address); | 409 | force_hpet_address); |
375 | cached_dev = dev; | 410 | cached_dev = dev; |
376 | return; | ||
377 | } | 411 | } |
378 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, | 412 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, |
379 | ati_force_enable_hpet); | 413 | ati_force_enable_hpet); |
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 05191bbc68b8..dd6f2b71561b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c | |||
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) | |||
52 | 52 | ||
53 | cmos_minutes = CMOS_READ(RTC_MINUTES); | 53 | cmos_minutes = CMOS_READ(RTC_MINUTES); |
54 | if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) | 54 | if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) |
55 | BCD_TO_BIN(cmos_minutes); | 55 | cmos_minutes = bcd2bin(cmos_minutes); |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * since we're only adjusting minutes and seconds, | 58 | * since we're only adjusting minutes and seconds, |
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime) | |||
69 | 69 | ||
70 | if (abs(real_minutes - cmos_minutes) < 30) { | 70 | if (abs(real_minutes - cmos_minutes) < 30) { |
71 | if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { | 71 | if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { |
72 | BIN_TO_BCD(real_seconds); | 72 | real_seconds = bin2bcd(real_seconds); |
73 | BIN_TO_BCD(real_minutes); | 73 | real_minutes = bin2bcd(real_minutes); |
74 | } | 74 | } |
75 | CMOS_WRITE(real_seconds,RTC_SECONDS); | 75 | CMOS_WRITE(real_seconds,RTC_SECONDS); |
76 | CMOS_WRITE(real_minutes,RTC_MINUTES); | 76 | CMOS_WRITE(real_minutes,RTC_MINUTES); |
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void) | |||
124 | WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); | 124 | WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); |
125 | 125 | ||
126 | if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { | 126 | if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { |
127 | BCD_TO_BIN(sec); | 127 | sec = bcd2bin(sec); |
128 | BCD_TO_BIN(min); | 128 | min = bcd2bin(min); |
129 | BCD_TO_BIN(hour); | 129 | hour = bcd2bin(hour); |
130 | BCD_TO_BIN(day); | 130 | day = bcd2bin(day); |
131 | BCD_TO_BIN(mon); | 131 | mon = bcd2bin(mon); |
132 | BCD_TO_BIN(year); | 132 | year = bcd2bin(year); |
133 | } | 133 | } |
134 | 134 | ||
135 | if (century) { | 135 | if (century) { |
136 | BCD_TO_BIN(century); | 136 | century = bcd2bin(century); |
137 | year += century * 100; | 137 | year += century * 100; |
138 | printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); | 138 | printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); |
139 | } else | 139 | } else |
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = { | |||
223 | static __init int add_rtc_cmos(void) | 223 | static __init int add_rtc_cmos(void) |
224 | { | 224 | { |
225 | #ifdef CONFIG_PNP | 225 | #ifdef CONFIG_PNP |
226 | if (!pnp_platform_devices) | 226 | static const char *ids[] __initconst = |
227 | platform_device_register(&rtc_device); | 227 | { "PNP0b00", "PNP0b01", "PNP0b02", }; |
228 | #else | 228 | struct pnp_dev *dev; |
229 | struct pnp_id *id; | ||
230 | int i; | ||
231 | |||
232 | pnp_for_each_dev(dev) { | ||
233 | for (id = dev->id; id; id = id->next) { | ||
234 | for (i = 0; i < ARRAY_SIZE(ids); i++) { | ||
235 | if (compare_pnp_id(id, ids[i]) != 0) | ||
236 | return 0; | ||
237 | } | ||
238 | } | ||
239 | } | ||
240 | #endif | ||
241 | |||
229 | platform_device_register(&rtc_device); | 242 | platform_device_register(&rtc_device); |
230 | #endif /* CONFIG_PNP */ | 243 | dev_info(&rtc_device.dev, |
244 | "registered platform RTC device (no PNP device found)\n"); | ||
231 | return 0; | 245 | return 0; |
232 | } | 246 | } |
233 | device_initcall(add_rtc_cmos); | 247 | device_initcall(add_rtc_cmos); |
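The rtc.c conversion above replaces the value-modifying BCD_TO_BIN()/BIN_TO_BCD() macros with bcd2bin()/bin2bcd() helpers that return the converted value. CMOS RTC registers hold two packed BCD digits, so the conversion is a simple nibble split; a stand-alone sketch with the same assumed semantics (not the kernel headers):

    #include <stdio.h>

    /* Two packed BCD digits <-> binary, as used for CMOS RTC registers. */
    static unsigned int bcd2bin(unsigned char val)
    {
            return (val & 0x0f) + (val >> 4) * 10;
    }

    static unsigned char bin2bcd(unsigned int val)
    {
            return (unsigned char)(((val / 10) << 4) | (val % 10));
    }

    int main(void)
    {
            unsigned char cmos_minutes = 0x59;      /* the RTC reports "59" in BCD */

            printf("bcd2bin(0x59) = %u\n", bcd2bin(cmos_minutes));  /* 59 */
            printf("bin2bcd(59)   = 0x%02x\n", bin2bcd(59));        /* 0x59 */
            return 0;
    }
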
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 21b8e0a59780..0fa6790c1dd3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -302,7 +302,7 @@ static void __init relocate_initrd(void) | |||
302 | if (clen > MAX_MAP_CHUNK-slop) | 302 | if (clen > MAX_MAP_CHUNK-slop) |
303 | clen = MAX_MAP_CHUNK-slop; | 303 | clen = MAX_MAP_CHUNK-slop; |
304 | mapaddr = ramdisk_image & PAGE_MASK; | 304 | mapaddr = ramdisk_image & PAGE_MASK; |
305 | p = early_ioremap(mapaddr, clen+slop); | 305 | p = early_memremap(mapaddr, clen+slop); |
306 | memcpy(q, p+slop, clen); | 306 | memcpy(q, p+slop, clen); |
307 | early_iounmap(p, clen+slop); | 307 | early_iounmap(p, clen+slop); |
308 | q += clen; | 308 | q += clen; |
@@ -379,7 +379,7 @@ static void __init parse_setup_data(void) | |||
379 | return; | 379 | return; |
380 | pa_data = boot_params.hdr.setup_data; | 380 | pa_data = boot_params.hdr.setup_data; |
381 | while (pa_data) { | 381 | while (pa_data) { |
382 | data = early_ioremap(pa_data, PAGE_SIZE); | 382 | data = early_memremap(pa_data, PAGE_SIZE); |
383 | switch (data->type) { | 383 | switch (data->type) { |
384 | case SETUP_E820_EXT: | 384 | case SETUP_E820_EXT: |
385 | parse_e820_ext(data, pa_data); | 385 | parse_e820_ext(data, pa_data); |
@@ -402,7 +402,7 @@ static void __init e820_reserve_setup_data(void) | |||
402 | return; | 402 | return; |
403 | pa_data = boot_params.hdr.setup_data; | 403 | pa_data = boot_params.hdr.setup_data; |
404 | while (pa_data) { | 404 | while (pa_data) { |
405 | data = early_ioremap(pa_data, sizeof(*data)); | 405 | data = early_memremap(pa_data, sizeof(*data)); |
406 | e820_update_range(pa_data, sizeof(*data)+data->len, | 406 | e820_update_range(pa_data, sizeof(*data)+data->len, |
407 | E820_RAM, E820_RESERVED_KERN); | 407 | E820_RAM, E820_RESERVED_KERN); |
408 | found = 1; | 408 | found = 1; |
@@ -428,7 +428,7 @@ static void __init reserve_early_setup_data(void) | |||
428 | return; | 428 | return; |
429 | pa_data = boot_params.hdr.setup_data; | 429 | pa_data = boot_params.hdr.setup_data; |
430 | while (pa_data) { | 430 | while (pa_data) { |
431 | data = early_ioremap(pa_data, sizeof(*data)); | 431 | data = early_memremap(pa_data, sizeof(*data)); |
432 | sprintf(buf, "setup data %x", data->type); | 432 | sprintf(buf, "setup data %x", data->type); |
433 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); | 433 | reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); |
434 | pa_data = data->next; | 434 | pa_data = data->next; |
@@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void) | |||
561 | 561 | ||
562 | } | 562 | } |
563 | 563 | ||
564 | #ifdef CONFIG_PROC_VMCORE | 564 | /* |
565 | * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by | ||
566 | * is_kdump_kernel() to determine if we are booting after a panic. Hence | ||
567 | * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. | ||
568 | */ | ||
569 | |||
570 | #ifdef CONFIG_CRASH_DUMP | ||
565 | /* elfcorehdr= specifies the location of elf core header | 571 | /* elfcorehdr= specifies the location of elf core header |
566 | * stored by the crashed kernel. This option will be passed | 572 | * stored by the crashed kernel. This option will be passed |
567 | * by kexec loader to the capture kernel. | 573 | * by kexec loader to the capture kernel. |
@@ -998,6 +1004,8 @@ void __init setup_arch(char **cmdline_p) | |||
998 | */ | 1004 | */ |
999 | acpi_boot_table_init(); | 1005 | acpi_boot_table_init(); |
1000 | 1006 | ||
1007 | early_acpi_boot_init(); | ||
1008 | |||
1001 | #ifdef CONFIG_ACPI_NUMA | 1009 | #ifdef CONFIG_ACPI_NUMA |
1002 | /* | 1010 | /* |
1003 | * Parse SRAT to discover nodes. | 1011 | * Parse SRAT to discover nodes. |
@@ -1065,6 +1073,7 @@ void __init setup_arch(char **cmdline_p) | |||
1065 | #endif | 1073 | #endif |
1066 | 1074 | ||
1067 | prefill_possible_map(); | 1075 | prefill_possible_map(); |
1076 | |||
1068 | #ifdef CONFIG_X86_64 | 1077 | #ifdef CONFIG_X86_64 |
1069 | init_cpu_to_node(); | 1078 | init_cpu_to_node(); |
1070 | #endif | 1079 | #endif |
@@ -1072,6 +1081,9 @@ void __init setup_arch(char **cmdline_p) | |||
1072 | init_apic_mappings(); | 1081 | init_apic_mappings(); |
1073 | ioapic_init_mappings(); | 1082 | ioapic_init_mappings(); |
1074 | 1083 | ||
1084 | /* need to wait until io_apic is mapped */ | ||
1085 | nr_irqs = probe_nr_irqs(); | ||
1086 | |||
1075 | kvm_guest_init(); | 1087 | kvm_guest_init(); |
1076 | 1088 | ||
1077 | e820_reserve_resources(); | 1089 | e820_reserve_resources(); |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0e67f72d9316..410c88f0bfeb 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -140,25 +140,30 @@ static void __init setup_cpu_pda_map(void) | |||
140 | */ | 140 | */ |
141 | void __init setup_per_cpu_areas(void) | 141 | void __init setup_per_cpu_areas(void) |
142 | { | 142 | { |
143 | ssize_t size = PERCPU_ENOUGH_ROOM; | 143 | ssize_t size, old_size; |
144 | char *ptr; | 144 | char *ptr; |
145 | int cpu; | 145 | int cpu; |
146 | unsigned long align = 1; | ||
146 | 147 | ||
147 | /* Setup cpu_pda map */ | 148 | /* Setup cpu_pda map */ |
148 | setup_cpu_pda_map(); | 149 | setup_cpu_pda_map(); |
149 | 150 | ||
150 | /* Copy section for each CPU (we discard the original) */ | 151 | /* Copy section for each CPU (we discard the original) */ |
151 | size = PERCPU_ENOUGH_ROOM; | 152 | old_size = PERCPU_ENOUGH_ROOM; |
153 | align = max_t(unsigned long, PAGE_SIZE, align); | ||
154 | size = roundup(old_size, align); | ||
152 | printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", | 155 | printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", |
153 | size); | 156 | size); |
154 | 157 | ||
155 | for_each_possible_cpu(cpu) { | 158 | for_each_possible_cpu(cpu) { |
156 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 159 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
157 | ptr = alloc_bootmem_pages(size); | 160 | ptr = __alloc_bootmem(size, align, |
161 | __pa(MAX_DMA_ADDRESS)); | ||
158 | #else | 162 | #else |
159 | int node = early_cpu_to_node(cpu); | 163 | int node = early_cpu_to_node(cpu); |
160 | if (!node_online(node) || !NODE_DATA(node)) { | 164 | if (!node_online(node) || !NODE_DATA(node)) { |
161 | ptr = alloc_bootmem_pages(size); | 165 | ptr = __alloc_bootmem(size, align, |
166 | __pa(MAX_DMA_ADDRESS)); | ||
162 | printk(KERN_INFO | 167 | printk(KERN_INFO |
163 | "cpu %d has no node %d or node-local memory\n", | 168 | "cpu %d has no node %d or node-local memory\n", |
164 | cpu, node); | 169 | cpu, node); |
@@ -167,7 +172,8 @@ void __init setup_per_cpu_areas(void) | |||
167 | cpu, __pa(ptr)); | 172 | cpu, __pa(ptr)); |
168 | } | 173 | } |
169 | else { | 174 | else { |
170 | ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); | 175 | ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, |
176 | __pa(MAX_DMA_ADDRESS)); | ||
171 | if (ptr) | 177 | if (ptr) |
172 | printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", | 178 | printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", |
173 | cpu, node, __pa(ptr)); | 179 | cpu, node, __pa(ptr)); |
@@ -175,7 +181,6 @@ void __init setup_per_cpu_areas(void) | |||
175 | #endif | 181 | #endif |
176 | per_cpu_offset(cpu) = ptr - __per_cpu_start; | 182 | per_cpu_offset(cpu) = ptr - __per_cpu_start; |
177 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | 183 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); |
178 | |||
179 | } | 184 | } |
180 | 185 | ||
181 | printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", | 186 | printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", |
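setup_per_cpu_areas() now rounds the per-cpu allocation up to an explicit alignment of at least PAGE_SIZE and hands that alignment to the bootmem allocator instead of relying on alloc_bootmem_pages(). The size computation is the usual round-up-to-a-multiple; a small illustration of roundup() as used in the hunk (the values are made up):

    #include <stdio.h>

    /* Round x up to the next multiple of y. */
    #define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))

    int main(void)
    {
            unsigned long old_size = 45000;   /* stand-in for PERCPU_ENOUGH_ROOM */
            unsigned long align = 4096;       /* at least PAGE_SIZE */

            printf("size = %lu\n", roundup(old_size, align));   /* 45056, i.e. 11 pages */
            return 0;
    }
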
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 76b6f50978f7..7ece815ea637 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -282,6 +282,8 @@ static void __cpuinit smp_callin(void) | |||
282 | cpu_set(cpuid, cpu_callin_map); | 282 | cpu_set(cpuid, cpu_callin_map); |
283 | } | 283 | } |
284 | 284 | ||
285 | static int __cpuinitdata unsafe_smp; | ||
286 | |||
285 | /* | 287 | /* |
286 | * Activate a secondary processor. | 288 | * Activate a secondary processor. |
287 | */ | 289 | */ |
@@ -334,14 +336,17 @@ static void __cpuinit start_secondary(void *unused) | |||
334 | * does not change while we are assigning vectors to cpus. Holding | 336 | * does not change while we are assigning vectors to cpus. Holding |
335 | * this lock ensures we don't half assign or remove an irq from a cpu. | 337 | * this lock ensures we don't half assign or remove an irq from a cpu. |
336 | */ | 338 | */ |
337 | ipi_call_lock_irq(); | 339 | ipi_call_lock(); |
338 | lock_vector_lock(); | 340 | lock_vector_lock(); |
339 | __setup_vector_irq(smp_processor_id()); | 341 | __setup_vector_irq(smp_processor_id()); |
340 | cpu_set(smp_processor_id(), cpu_online_map); | 342 | cpu_set(smp_processor_id(), cpu_online_map); |
341 | unlock_vector_lock(); | 343 | unlock_vector_lock(); |
342 | ipi_call_unlock_irq(); | 344 | ipi_call_unlock(); |
343 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 345 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; |
344 | 346 | ||
347 | /* enable local interrupts */ | ||
348 | local_irq_enable(); | ||
349 | |||
345 | setup_secondary_clock(); | 350 | setup_secondary_clock(); |
346 | 351 | ||
347 | wmb(); | 352 | wmb(); |
@@ -394,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) | |||
394 | goto valid_k7; | 399 | goto valid_k7; |
395 | 400 | ||
396 | /* If we get here, not a certified SMP capable AMD system. */ | 401 | /* If we get here, not a certified SMP capable AMD system. */ |
397 | add_taint(TAINT_UNSAFE_SMP); | 402 | unsafe_smp = 1; |
398 | } | 403 | } |
399 | 404 | ||
400 | valid_k7: | 405 | valid_k7: |
@@ -411,12 +416,10 @@ static void __cpuinit smp_checks(void) | |||
411 | * Don't taint if we are running SMP kernel on a single non-MP | 416 | * Don't taint if we are running SMP kernel on a single non-MP |
412 | * approved Athlon | 417 | * approved Athlon |
413 | */ | 418 | */ |
414 | if (tainted & TAINT_UNSAFE_SMP) { | 419 | if (unsafe_smp && num_online_cpus() > 1) { |
415 | if (num_online_cpus()) | 420 | printk(KERN_INFO "WARNING: This combination of AMD" |
416 | printk(KERN_INFO "WARNING: This combination of AMD" | 421 | "processors is not suitable for SMP.\n"); |
417 | "processors is not suitable for SMP.\n"); | 422 | add_taint(TAINT_UNSAFE_SMP); |
418 | else | ||
419 | tainted &= ~TAINT_UNSAFE_SMP; | ||
420 | } | 423 | } |
421 | } | 424 | } |
422 | 425 | ||
@@ -540,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid) | |||
540 | int timeout; | 543 | int timeout; |
541 | u32 status; | 544 | u32 status; |
542 | 545 | ||
543 | printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); | 546 | printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); |
544 | 547 | ||
545 | for (i = 0; i < ARRAY_SIZE(regs); i++) { | 548 | for (i = 0; i < ARRAY_SIZE(regs); i++) { |
546 | printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); | 549 | printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); |
547 | 550 | ||
548 | /* | 551 | /* |
549 | * Wait for idle. | 552 | * Wait for idle. |
@@ -596,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) | |||
596 | * Give the other CPU some time to accept the IPI. | 599 | * Give the other CPU some time to accept the IPI. |
597 | */ | 600 | */ |
598 | udelay(200); | 601 | udelay(200); |
599 | maxlvt = lapic_get_maxlvt(); | 602 | if (APIC_INTEGRATED(apic_version[phys_apicid])) { |
600 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ | 603 | maxlvt = lapic_get_maxlvt(); |
601 | apic_write(APIC_ESR, 0); | 604 | if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ |
602 | accept_status = (apic_read(APIC_ESR) & 0xEF); | 605 | apic_write(APIC_ESR, 0); |
606 | accept_status = (apic_read(APIC_ESR) & 0xEF); | ||
607 | } | ||
603 | pr_debug("NMI sent.\n"); | 608 | pr_debug("NMI sent.\n"); |
604 | 609 | ||
605 | if (send_status) | 610 | if (send_status) |
@@ -869,7 +874,7 @@ do_rest: | |||
869 | start_ip = setup_trampoline(); | 874 | start_ip = setup_trampoline(); |
870 | 875 | ||
871 | /* So we see what's up */ | 876 | /* So we see what's up */ |
872 | printk(KERN_INFO "Booting processor %d/%d ip %lx\n", | 877 | printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", |
873 | cpu, apicid, start_ip); | 878 | cpu, apicid, start_ip); |
874 | 879 | ||
875 | /* | 880 | /* |
@@ -1256,39 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) | |||
1256 | check_nmi_watchdog(); | 1261 | check_nmi_watchdog(); |
1257 | } | 1262 | } |
1258 | 1263 | ||
1259 | #ifdef CONFIG_HOTPLUG_CPU | ||
1260 | |||
1261 | static void remove_siblinginfo(int cpu) | ||
1262 | { | ||
1263 | int sibling; | ||
1264 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
1265 | |||
1266 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { | ||
1267 | cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); | ||
1268 | /*/ | ||
1269 | * last thread sibling in this cpu core going down | ||
1270 | */ | ||
1271 | if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) | ||
1272 | cpu_data(sibling).booted_cores--; | ||
1273 | } | ||
1274 | |||
1275 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) | ||
1276 | cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); | ||
1277 | cpus_clear(per_cpu(cpu_sibling_map, cpu)); | ||
1278 | cpus_clear(per_cpu(cpu_core_map, cpu)); | ||
1279 | c->phys_proc_id = 0; | ||
1280 | c->cpu_core_id = 0; | ||
1281 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
1282 | } | ||
1283 | |||
1284 | static int additional_cpus __initdata = -1; | ||
1285 | |||
1286 | static __init int setup_additional_cpus(char *s) | ||
1287 | { | ||
1288 | return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; | ||
1289 | } | ||
1290 | early_param("additional_cpus", setup_additional_cpus); | ||
1291 | |||
1292 | /* | 1264 | /* |
1293 | * cpu_possible_map should be static, it cannot change as cpu's | 1265 | * cpu_possible_map should be static, it cannot change as cpu's |
1294 | * are onlined, or offlined. The reason is per-cpu data-structures | 1266 | * are onlined, or offlined. The reason is per-cpu data-structures |
@@ -1308,21 +1280,13 @@ early_param("additional_cpus", setup_additional_cpus); | |||
1308 | */ | 1280 | */ |
1309 | __init void prefill_possible_map(void) | 1281 | __init void prefill_possible_map(void) |
1310 | { | 1282 | { |
1311 | int i; | 1283 | int i, possible; |
1312 | int possible; | ||
1313 | 1284 | ||
1314 | /* no processor from mptable or madt */ | 1285 | /* no processor from mptable or madt */ |
1315 | if (!num_processors) | 1286 | if (!num_processors) |
1316 | num_processors = 1; | 1287 | num_processors = 1; |
1317 | 1288 | ||
1318 | if (additional_cpus == -1) { | 1289 | possible = num_processors + disabled_cpus; |
1319 | if (disabled_cpus > 0) | ||
1320 | additional_cpus = disabled_cpus; | ||
1321 | else | ||
1322 | additional_cpus = 0; | ||
1323 | } | ||
1324 | |||
1325 | possible = num_processors + additional_cpus; | ||
1326 | if (possible > NR_CPUS) | 1290 | if (possible > NR_CPUS) |
1327 | possible = NR_CPUS; | 1291 | possible = NR_CPUS; |
1328 | 1292 | ||
@@ -1335,6 +1299,31 @@ __init void prefill_possible_map(void) | |||
1335 | nr_cpu_ids = possible; | 1299 | nr_cpu_ids = possible; |
1336 | } | 1300 | } |
1337 | 1301 | ||
1302 | #ifdef CONFIG_HOTPLUG_CPU | ||
1303 | |||
1304 | static void remove_siblinginfo(int cpu) | ||
1305 | { | ||
1306 | int sibling; | ||
1307 | struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
1308 | |||
1309 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { | ||
1310 | cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); | ||
1311 | /* | ||
1312 | * last thread sibling in this cpu core going down | ||
1313 | */ | ||
1314 | if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) | ||
1315 | cpu_data(sibling).booted_cores--; | ||
1316 | } | ||
1317 | |||
1318 | for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) | ||
1319 | cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); | ||
1320 | cpus_clear(per_cpu(cpu_sibling_map, cpu)); | ||
1321 | cpus_clear(per_cpu(cpu_core_map, cpu)); | ||
1322 | c->phys_proc_id = 0; | ||
1323 | c->cpu_core_id = 0; | ||
1324 | cpu_clear(cpu, cpu_sibling_setup_map); | ||
1325 | } | ||
1326 | |||
1338 | static void __ref remove_cpu_from_maps(int cpu) | 1327 | static void __ref remove_cpu_from_maps(int cpu) |
1339 | { | 1328 | { |
1340 | cpu_clear(cpu, cpu_online_map); | 1329 | cpu_clear(cpu, cpu_online_map); |
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index bbecf8b6bf96..77b400f06ea2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c | |||
@@ -47,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
47 | unsigned long pc = instruction_pointer(regs); | 47 | unsigned long pc = instruction_pointer(regs); |
48 | 48 | ||
49 | #ifdef CONFIG_SMP | 49 | #ifdef CONFIG_SMP |
50 | if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && | 50 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { |
51 | in_lock_functions(pc)) { | ||
52 | #ifdef CONFIG_FRAME_POINTER | 51 | #ifdef CONFIG_FRAME_POINTER |
53 | return *(unsigned long *)(regs->bp + 4); | 52 | return *(unsigned long *)(regs->bp + sizeof(long)); |
54 | #else | 53 | #else |
55 | unsigned long *sp = (unsigned long *)®s->sp; | 54 | unsigned long *sp = (unsigned long *)®s->sp; |
56 | 55 | ||
@@ -95,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
95 | 94 | ||
96 | do_timer_interrupt_hook(); | 95 | do_timer_interrupt_hook(); |
97 | 96 | ||
97 | #ifdef CONFIG_MCA | ||
98 | if (MCA_bus) { | 98 | if (MCA_bus) { |
99 | /* The PS/2 uses level-triggered interrupts. You can't | 99 | /* The PS/2 uses level-triggered interrupts. You can't |
100 | turn them off, nor would you want to (any attempt to | 100 | turn them off, nor would you want to (any attempt to |
@@ -108,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
108 | u8 irq_v = inb_p( 0x61 ); /* read the current state */ | 108 | u8 irq_v = inb_p( 0x61 ); /* read the current state */ |
109 | outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ | 109 | outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ |
110 | } | 110 | } |
111 | #endif | ||
111 | 112 | ||
112 | return IRQ_HANDLED; | 113 | return IRQ_HANDLED; |
113 | } | 114 | } |
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e3d49c553af2..cb19d650c216 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/time.h> | 18 | #include <linux/time.h> |
19 | #include <linux/mca.h> | ||
19 | 20 | ||
20 | #include <asm/i8253.h> | 21 | #include <asm/i8253.h> |
21 | #include <asm/hpet.h> | 22 | #include <asm/hpet.h> |
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs) | |||
33 | /* Assume the lock function has either no stack frame or a copy | 34 | /* Assume the lock function has either no stack frame or a copy |
34 | of flags from PUSHF | 35 | of flags from PUSHF |
35 | Eflags always has bits 22 and up cleared unlike kernel addresses. */ | 36 | Eflags always has bits 22 and up cleared unlike kernel addresses. */ |
36 | if (!user_mode(regs) && in_lock_functions(pc)) { | 37 | if (!user_mode_vm(regs) && in_lock_functions(pc)) { |
38 | #ifdef CONFIG_FRAME_POINTER | ||
39 | return *(unsigned long *)(regs->bp + sizeof(long)); | ||
40 | #else | ||
37 | unsigned long *sp = (unsigned long *)regs->sp; | 41 | unsigned long *sp = (unsigned long *)regs->sp; |
38 | if (sp[0] >> 22) | 42 | if (sp[0] >> 22) |
39 | return sp[0]; | 43 | return sp[0]; |
40 | if (sp[1] >> 22) | 44 | if (sp[1] >> 22) |
41 | return sp[1]; | 45 | return sp[1]; |
46 | #endif | ||
42 | } | 47 | } |
43 | return pc; | 48 | return pc; |
44 | } | 49 | } |
45 | EXPORT_SYMBOL(profile_pc); | 50 | EXPORT_SYMBOL(profile_pc); |
46 | 51 | ||
47 | static irqreturn_t timer_event_interrupt(int irq, void *dev_id) | 52 | irqreturn_t timer_interrupt(int irq, void *dev_id) |
48 | { | 53 | { |
49 | add_pda(irq0_irqs, 1); | 54 | add_pda(irq0_irqs, 1); |
50 | 55 | ||
51 | global_clock_event->event_handler(global_clock_event); | 56 | global_clock_event->event_handler(global_clock_event); |
52 | 57 | ||
58 | #ifdef CONFIG_MCA | ||
59 | if (MCA_bus) { | ||
60 | u8 irq_v = inb_p(0x61); /* read the current state */ | ||
61 | outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ | ||
62 | } | ||
63 | #endif | ||
64 | |||
53 | return IRQ_HANDLED; | 65 | return IRQ_HANDLED; |
54 | } | 66 | } |
55 | 67 | ||
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void) | |||
100 | } | 112 | } |
101 | 113 | ||
102 | static struct irqaction irq0 = { | 114 | static struct irqaction irq0 = { |
103 | .handler = timer_event_interrupt, | 115 | .handler = timer_interrupt, |
104 | .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, | 116 | .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, |
105 | .mask = CPU_MASK_NONE, | 117 | .mask = CPU_MASK_NONE, |
106 | .name = "timer" | 118 | .name = "timer" |
@@ -111,16 +123,13 @@ void __init hpet_time_init(void) | |||
111 | if (!hpet_enable()) | 123 | if (!hpet_enable()) |
112 | setup_pit_timer(); | 124 | setup_pit_timer(); |
113 | 125 | ||
126 | irq0.mask = cpumask_of_cpu(0); | ||
114 | setup_irq(0, &irq0); | 127 | setup_irq(0, &irq0); |
115 | } | 128 | } |
116 | 129 | ||
117 | void __init time_init(void) | 130 | void __init time_init(void) |
118 | { | 131 | { |
119 | tsc_init(); | 132 | tsc_init(); |
120 | if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) | ||
121 | vgetcpu_mode = VGETCPU_RDTSCP; | ||
122 | else | ||
123 | vgetcpu_mode = VGETCPU_LSL; | ||
124 | 133 | ||
125 | late_time_init = choose_time_init(); | 134 | late_time_init = choose_time_init(); |
126 | } | 135 | } |
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c index 0429c5de5ea9..e062974cce34 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps.c | |||
@@ -7,13 +7,11 @@ | |||
7 | */ | 7 | */ |
8 | 8 | ||
9 | /* | 9 | /* |
10 | * 'Traps.c' handles hardware traps and faults after we have saved some | 10 | * Handle hardware traps and faults. |
11 | * state in 'asm.s'. | ||
12 | */ | 11 | */ |
13 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
14 | #include <linux/kallsyms.h> | 13 | #include <linux/kallsyms.h> |
15 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
16 | #include <linux/highmem.h> | ||
17 | #include <linux/kprobes.h> | 15 | #include <linux/kprobes.h> |
18 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
19 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
@@ -32,6 +30,8 @@ | |||
32 | #include <linux/bug.h> | 30 | #include <linux/bug.h> |
33 | #include <linux/nmi.h> | 31 | #include <linux/nmi.h> |
34 | #include <linux/mm.h> | 32 | #include <linux/mm.h> |
33 | #include <linux/smp.h> | ||
34 | #include <linux/io.h> | ||
35 | 35 | ||
36 | #ifdef CONFIG_EISA | 36 | #ifdef CONFIG_EISA |
37 | #include <linux/ioport.h> | 37 | #include <linux/ioport.h> |
@@ -46,21 +46,31 @@ | |||
46 | #include <linux/edac.h> | 46 | #include <linux/edac.h> |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | #include <asm/arch_hooks.h> | ||
50 | #include <asm/stacktrace.h> | 49 | #include <asm/stacktrace.h> |
51 | #include <asm/processor.h> | 50 | #include <asm/processor.h> |
52 | #include <asm/debugreg.h> | 51 | #include <asm/debugreg.h> |
53 | #include <asm/atomic.h> | 52 | #include <asm/atomic.h> |
54 | #include <asm/system.h> | 53 | #include <asm/system.h> |
55 | #include <asm/unwind.h> | 54 | #include <asm/unwind.h> |
55 | #include <asm/traps.h> | ||
56 | #include <asm/desc.h> | 56 | #include <asm/desc.h> |
57 | #include <asm/i387.h> | 57 | #include <asm/i387.h> |
58 | |||
59 | #include <mach_traps.h> | ||
60 | |||
61 | #ifdef CONFIG_X86_64 | ||
62 | #include <asm/pgalloc.h> | ||
63 | #include <asm/proto.h> | ||
64 | #include <asm/pda.h> | ||
65 | #else | ||
66 | #include <asm/processor-flags.h> | ||
67 | #include <asm/arch_hooks.h> | ||
58 | #include <asm/nmi.h> | 68 | #include <asm/nmi.h> |
59 | #include <asm/smp.h> | 69 | #include <asm/smp.h> |
60 | #include <asm/io.h> | 70 | #include <asm/io.h> |
61 | #include <asm/traps.h> | 71 | #include <asm/traps.h> |
62 | 72 | ||
63 | #include "mach_traps.h" | 73 | #include "cpu/mcheck/mce.h" |
64 | 74 | ||
65 | DECLARE_BITMAP(used_vectors, NR_VECTORS); | 75 | DECLARE_BITMAP(used_vectors, NR_VECTORS); |
66 | EXPORT_SYMBOL_GPL(used_vectors); | 76 | EXPORT_SYMBOL_GPL(used_vectors); |
@@ -77,418 +87,104 @@ char ignore_fpu_irq; | |||
77 | */ | 87 | */ |
78 | gate_desc idt_table[256] | 88 | gate_desc idt_table[256] |
79 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; | 89 | __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; |
80 | |||
81 | int panic_on_unrecovered_nmi; | ||
82 | int kstack_depth_to_print = 24; | ||
83 | static unsigned int code_bytes = 64; | ||
84 | static int ignore_nmis; | ||
85 | static int die_counter; | ||
86 | |||
87 | void printk_address(unsigned long address, int reliable) | ||
88 | { | ||
89 | #ifdef CONFIG_KALLSYMS | ||
90 | unsigned long offset = 0; | ||
91 | unsigned long symsize; | ||
92 | const char *symname; | ||
93 | char *modname; | ||
94 | char *delim = ":"; | ||
95 | char namebuf[KSYM_NAME_LEN]; | ||
96 | char reliab[4] = ""; | ||
97 | |||
98 | symname = kallsyms_lookup(address, &symsize, &offset, | ||
99 | &modname, namebuf); | ||
100 | if (!symname) { | ||
101 | printk(" [<%08lx>]\n", address); | ||
102 | return; | ||
103 | } | ||
104 | if (!reliable) | ||
105 | strcpy(reliab, "? "); | ||
106 | |||
107 | if (!modname) | ||
108 | modname = delim = ""; | ||
109 | printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", | ||
110 | address, reliab, delim, modname, delim, symname, offset, symsize); | ||
111 | #else | ||
112 | printk(" [<%08lx>]\n", address); | ||
113 | #endif | 90 | #endif |
114 | } | ||
115 | |||
116 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
117 | void *p, unsigned int size) | ||
118 | { | ||
119 | void *t = tinfo; | ||
120 | return p > t && p <= t + THREAD_SIZE - size; | ||
121 | } | ||
122 | |||
123 | /* The form of the top of the frame on the stack */ | ||
124 | struct stack_frame { | ||
125 | struct stack_frame *next_frame; | ||
126 | unsigned long return_address; | ||
127 | }; | ||
128 | |||
129 | static inline unsigned long | ||
130 | print_context_stack(struct thread_info *tinfo, | ||
131 | unsigned long *stack, unsigned long bp, | ||
132 | const struct stacktrace_ops *ops, void *data) | ||
133 | { | ||
134 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
135 | |||
136 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { | ||
137 | unsigned long addr; | ||
138 | |||
139 | addr = *stack; | ||
140 | if (__kernel_text_address(addr)) { | ||
141 | if ((unsigned long) stack == bp + 4) { | ||
142 | ops->address(data, addr, 1); | ||
143 | frame = frame->next_frame; | ||
144 | bp = (unsigned long) frame; | ||
145 | } else { | ||
146 | ops->address(data, addr, bp == 0); | ||
147 | } | ||
148 | } | ||
149 | stack++; | ||
150 | } | ||
151 | return bp; | ||
152 | } | ||
153 | |||
154 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
155 | unsigned long *stack, unsigned long bp, | ||
156 | const struct stacktrace_ops *ops, void *data) | ||
157 | { | ||
158 | if (!task) | ||
159 | task = current; | ||
160 | |||
161 | if (!stack) { | ||
162 | unsigned long dummy; | ||
163 | stack = &dummy; | ||
164 | if (task != current) | ||
165 | stack = (unsigned long *)task->thread.sp; | ||
166 | } | ||
167 | |||
168 | #ifdef CONFIG_FRAME_POINTER | ||
169 | if (!bp) { | ||
170 | if (task == current) { | ||
171 | /* Grab bp right from our regs */ | ||
172 | asm("movl %%ebp, %0" : "=r" (bp) :); | ||
173 | } else { | ||
174 | /* bp is the last reg pushed by switch_to */ | ||
175 | bp = *(unsigned long *) task->thread.sp; | ||
176 | } | ||
177 | } | ||
178 | #endif | ||
179 | |||
180 | for (;;) { | ||
181 | struct thread_info *context; | ||
182 | |||
183 | context = (struct thread_info *) | ||
184 | ((unsigned long)stack & (~(THREAD_SIZE - 1))); | ||
185 | bp = print_context_stack(context, stack, bp, ops, data); | ||
186 | /* | ||
187 | * Should be after the line below, but somewhere | ||
188 | * in early boot context comes out corrupted and we | ||
189 | * can't reference it: | ||
190 | */ | ||
191 | if (ops->stack(data, "IRQ") < 0) | ||
192 | break; | ||
193 | stack = (unsigned long *)context->previous_esp; | ||
194 | if (!stack) | ||
195 | break; | ||
196 | touch_nmi_watchdog(); | ||
197 | } | ||
198 | } | ||
199 | EXPORT_SYMBOL(dump_trace); | ||
200 | |||
201 | static void | ||
202 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
203 | { | ||
204 | printk(data); | ||
205 | print_symbol(msg, symbol); | ||
206 | printk("\n"); | ||
207 | } | ||
208 | |||
209 | static void print_trace_warning(void *data, char *msg) | ||
210 | { | ||
211 | printk("%s%s\n", (char *)data, msg); | ||
212 | } | ||
213 | 91 | ||
214 | static int print_trace_stack(void *data, char *name) | 92 | static int ignore_nmis; |
215 | { | ||
216 | return 0; | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Print one address/symbol entries per line. | ||
221 | */ | ||
222 | static void print_trace_address(void *data, unsigned long addr, int reliable) | ||
223 | { | ||
224 | printk("%s [<%08lx>] ", (char *)data, addr); | ||
225 | if (!reliable) | ||
226 | printk("? "); | ||
227 | print_symbol("%s\n", addr); | ||
228 | touch_nmi_watchdog(); | ||
229 | } | ||
230 | |||
231 | static const struct stacktrace_ops print_trace_ops = { | ||
232 | .warning = print_trace_warning, | ||
233 | .warning_symbol = print_trace_warning_symbol, | ||
234 | .stack = print_trace_stack, | ||
235 | .address = print_trace_address, | ||
236 | }; | ||
237 | 93 | ||
238 | static void | 94 | static inline void conditional_sti(struct pt_regs *regs) |
239 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
240 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
241 | { | 95 | { |
242 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | 96 | if (regs->flags & X86_EFLAGS_IF) |
243 | printk("%s =======================\n", log_lvl); | 97 | local_irq_enable(); |
244 | } | 98 | } |
245 | 99 | ||
246 | void show_trace(struct task_struct *task, struct pt_regs *regs, | 100 | static inline void preempt_conditional_sti(struct pt_regs *regs) |
247 | unsigned long *stack, unsigned long bp) | ||
248 | { | 101 | { |
249 | show_trace_log_lvl(task, regs, stack, bp, ""); | 102 | inc_preempt_count(); |
103 | if (regs->flags & X86_EFLAGS_IF) | ||
104 | local_irq_enable(); | ||
250 | } | 105 | } |
251 | 106 | ||
252 | static void | 107 | static inline void preempt_conditional_cli(struct pt_regs *regs) |
253 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
254 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
255 | { | 108 | { |
256 | unsigned long *stack; | 109 | if (regs->flags & X86_EFLAGS_IF) |
257 | int i; | 110 | local_irq_disable(); |
258 | 111 | dec_preempt_count(); | |
259 | if (sp == NULL) { | ||
260 | if (task) | ||
261 | sp = (unsigned long *)task->thread.sp; | ||
262 | else | ||
263 | sp = (unsigned long *)&sp; | ||
264 | } | ||
265 | |||
266 | stack = sp; | ||
267 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
268 | if (kstack_end(stack)) | ||
269 | break; | ||
270 | if (i && ((i % 8) == 0)) | ||
271 | printk("\n%s ", log_lvl); | ||
272 | printk("%08lx ", *stack++); | ||
273 | } | ||
274 | printk("\n%sCall Trace:\n", log_lvl); | ||
275 | |||
276 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
277 | } | 112 | } |
278 | 113 | ||
279 | void show_stack(struct task_struct *task, unsigned long *sp) | 114 | #ifdef CONFIG_X86_32 |
115 | static inline void | ||
116 | die_if_kernel(const char *str, struct pt_regs *regs, long err) | ||
280 | { | 117 | { |
281 | printk(" "); | 118 | if (!user_mode_vm(regs)) |
282 | show_stack_log_lvl(task, NULL, sp, 0, ""); | 119 | die(str, regs, err); |
283 | } | 120 | } |
284 | 121 | ||
285 | /* | 122 | /* |
286 | * The architecture-independent dump_stack generator | 123 | * Perform the lazy TSS's I/O bitmap copy. If the TSS has an |
124 | * invalid offset set (the LAZY one) and the faulting thread has | ||
125 | * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, | ||
126 | * we set the offset field correctly and return 1. | ||
287 | */ | 127 | */ |
288 | void dump_stack(void) | 128 | static int lazy_iobitmap_copy(void) |
289 | { | 129 | { |
290 | unsigned long bp = 0; | 130 | struct thread_struct *thread; |
291 | unsigned long stack; | 131 | struct tss_struct *tss; |
292 | 132 | int cpu; | |
293 | #ifdef CONFIG_FRAME_POINTER | ||
294 | if (!bp) | ||
295 | asm("movl %%ebp, %0" : "=r" (bp):); | ||
296 | #endif | ||
297 | |||
298 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
299 | current->pid, current->comm, print_tainted(), | ||
300 | init_utsname()->release, | ||
301 | (int)strcspn(init_utsname()->version, " "), | ||
302 | init_utsname()->version); | ||
303 | |||
304 | show_trace(current, NULL, &stack, bp); | ||
305 | } | ||
306 | |||
307 | EXPORT_SYMBOL(dump_stack); | ||
308 | |||
309 | void show_registers(struct pt_regs *regs) | ||
310 | { | ||
311 | int i; | ||
312 | 133 | ||
313 | print_modules(); | 134 | cpu = get_cpu(); |
314 | __show_registers(regs, 0); | 135 | tss = &per_cpu(init_tss, cpu); |
136 | thread = ¤t->thread; | ||
315 | 137 | ||
316 | printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", | 138 | if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && |
317 | TASK_COMM_LEN, current->comm, task_pid_nr(current), | 139 | thread->io_bitmap_ptr) { |
318 | current_thread_info(), current, task_thread_info(current)); | 140 | memcpy(tss->io_bitmap, thread->io_bitmap_ptr, |
319 | /* | 141 | thread->io_bitmap_max); |
320 | * When in-kernel, we also print out the stack and code at the | 142 | /* |
321 | * time of the fault.. | 143 | * If the previously set map was extending to higher ports |
322 | */ | 144 | * than the current one, pad extra space with 0xff (no access). |
323 | if (!user_mode_vm(regs)) { | 145 | */ |
324 | unsigned int code_prologue = code_bytes * 43 / 64; | 146 | if (thread->io_bitmap_max < tss->io_bitmap_max) { |
325 | unsigned int code_len = code_bytes; | 147 | memset((char *) tss->io_bitmap + |
326 | unsigned char c; | 148 | thread->io_bitmap_max, 0xff, |
327 | u8 *ip; | 149 | tss->io_bitmap_max - thread->io_bitmap_max); |
328 | |||
329 | printk("\n" KERN_EMERG "Stack: "); | ||
330 | show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); | ||
331 | |||
332 | printk(KERN_EMERG "Code: "); | ||
333 | |||
334 | ip = (u8 *)regs->ip - code_prologue; | ||
335 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
336 | /* try starting at EIP */ | ||
337 | ip = (u8 *)regs->ip; | ||
338 | code_len = code_len - code_prologue + 1; | ||
339 | } | ||
340 | for (i = 0; i < code_len; i++, ip++) { | ||
341 | if (ip < (u8 *)PAGE_OFFSET || | ||
342 | probe_kernel_address(ip, c)) { | ||
343 | printk(" Bad EIP value."); | ||
344 | break; | ||
345 | } | ||
346 | if (ip == (u8 *)regs->ip) | ||
347 | printk("<%02x> ", c); | ||
348 | else | ||
349 | printk("%02x ", c); | ||
350 | } | 150 | } |
351 | } | 151 | tss->io_bitmap_max = thread->io_bitmap_max; |
352 | printk("\n"); | 152 | tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; |
353 | } | 153 | tss->io_bitmap_owner = thread; |
354 | 154 | put_cpu(); | |
355 | int is_valid_bugaddr(unsigned long ip) | ||
356 | { | ||
357 | unsigned short ud2; | ||
358 | |||
359 | if (ip < PAGE_OFFSET) | ||
360 | return 0; | ||
361 | if (probe_kernel_address((unsigned short *)ip, ud2)) | ||
362 | return 0; | ||
363 | |||
364 | return ud2 == 0x0b0f; | ||
365 | } | ||
366 | |||
367 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
368 | static int die_owner = -1; | ||
369 | static unsigned int die_nest_count; | ||
370 | |||
371 | unsigned __kprobes long oops_begin(void) | ||
372 | { | ||
373 | unsigned long flags; | ||
374 | |||
375 | oops_enter(); | ||
376 | |||
377 | if (die_owner != raw_smp_processor_id()) { | ||
378 | console_verbose(); | ||
379 | raw_local_irq_save(flags); | ||
380 | __raw_spin_lock(&die_lock); | ||
381 | die_owner = smp_processor_id(); | ||
382 | die_nest_count = 0; | ||
383 | bust_spinlocks(1); | ||
384 | } else { | ||
385 | raw_local_irq_save(flags); | ||
386 | } | ||
387 | die_nest_count++; | ||
388 | return flags; | ||
389 | } | ||
390 | |||
391 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
392 | { | ||
393 | bust_spinlocks(0); | ||
394 | die_owner = -1; | ||
395 | add_taint(TAINT_DIE); | ||
396 | __raw_spin_unlock(&die_lock); | ||
397 | raw_local_irq_restore(flags); | ||
398 | |||
399 | if (!regs) | ||
400 | return; | ||
401 | |||
402 | if (kexec_should_crash(current)) | ||
403 | crash_kexec(regs); | ||
404 | |||
405 | if (in_interrupt()) | ||
406 | panic("Fatal exception in interrupt"); | ||
407 | |||
408 | if (panic_on_oops) | ||
409 | panic("Fatal exception"); | ||
410 | |||
411 | oops_exit(); | ||
412 | do_exit(signr); | ||
413 | } | ||
414 | |||
415 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
416 | { | ||
417 | unsigned short ss; | ||
418 | unsigned long sp; | ||
419 | 155 | ||
420 | printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); | ||
421 | #ifdef CONFIG_PREEMPT | ||
422 | printk("PREEMPT "); | ||
423 | #endif | ||
424 | #ifdef CONFIG_SMP | ||
425 | printk("SMP "); | ||
426 | #endif | ||
427 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
428 | printk("DEBUG_PAGEALLOC"); | ||
429 | #endif | ||
430 | printk("\n"); | ||
431 | if (notify_die(DIE_OOPS, str, regs, err, | ||
432 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
433 | return 1; | 156 | return 1; |
434 | |||
435 | show_registers(regs); | ||
436 | /* Executive summary in case the oops scrolled away */ | ||
437 | sp = (unsigned long) (®s->sp); | ||
438 | savesegment(ss, ss); | ||
439 | if (user_mode(regs)) { | ||
440 | sp = regs->sp; | ||
441 | ss = regs->ss & 0xffff; | ||
442 | } | 157 | } |
443 | printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); | 158 | put_cpu(); |
444 | print_symbol("%s", regs->ip); | ||
445 | printk(" SS:ESP %04x:%08lx\n", ss, sp); | ||
446 | return 0; | ||
447 | } | ||
448 | |||
449 | /* | ||
450 | * This is gone through when something in the kernel has done something bad | ||
451 | * and is about to be terminated: | ||
452 | */ | ||
453 | void die(const char *str, struct pt_regs *regs, long err) | ||
454 | { | ||
455 | unsigned long flags = oops_begin(); | ||
456 | |||
457 | if (die_nest_count < 3) { | ||
458 | report_bug(regs->ip, regs); | ||
459 | |||
460 | if (__die(str, regs, err)) | ||
461 | regs = NULL; | ||
462 | } else { | ||
463 | printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); | ||
464 | } | ||
465 | |||
466 | oops_end(flags, regs, SIGSEGV); | ||
467 | } | ||
468 | 159 | ||
469 | static inline void | 160 | return 0; |
470 | die_if_kernel(const char *str, struct pt_regs *regs, long err) | ||
471 | { | ||
472 | if (!user_mode_vm(regs)) | ||
473 | die(str, regs, err); | ||
474 | } | 161 | } |
162 | #endif | ||
475 | 163 | ||
476 | static void __kprobes | 164 | static void __kprobes |
477 | do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, | 165 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, |
478 | long error_code, siginfo_t *info) | 166 | long error_code, siginfo_t *info) |
479 | { | 167 | { |
480 | struct task_struct *tsk = current; | 168 | struct task_struct *tsk = current; |
481 | 169 | ||
170 | #ifdef CONFIG_X86_32 | ||
482 | if (regs->flags & X86_VM_MASK) { | 171 | if (regs->flags & X86_VM_MASK) { |
483 | if (vm86) | 172 | /* |
173 | * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. | ||
174 | * On nmi (interrupt 2), do_trap should not be called. | ||
175 | */ | ||
176 | if (trapnr < 6) | ||
484 | goto vm86_trap; | 177 | goto vm86_trap; |
485 | goto trap_signal; | 178 | goto trap_signal; |
486 | } | 179 | } |
180 | #endif | ||
487 | 181 | ||
488 | if (!user_mode(regs)) | 182 | if (!user_mode(regs)) |
489 | goto kernel_trap; | 183 | goto kernel_trap; |
490 | 184 | ||
185 | #ifdef CONFIG_X86_32 | ||
491 | trap_signal: | 186 | trap_signal: |
187 | #endif | ||
492 | /* | 188 | /* |
493 | * We want error_code and trap_no set for userspace faults and | 189 | * We want error_code and trap_no set for userspace faults and |
494 | * kernelspace faults which result in die(), but not | 190 | * kernelspace faults which result in die(), but not |
@@ -501,6 +197,18 @@ trap_signal: | |||
501 | tsk->thread.error_code = error_code; | 197 | tsk->thread.error_code = error_code; |
502 | tsk->thread.trap_no = trapnr; | 198 | tsk->thread.trap_no = trapnr; |
503 | 199 | ||
200 | #ifdef CONFIG_X86_64 | ||
201 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
202 | printk_ratelimit()) { | ||
203 | printk(KERN_INFO | ||
204 | "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | ||
205 | tsk->comm, tsk->pid, str, | ||
206 | regs->ip, regs->sp, error_code); | ||
207 | print_vma_addr(" in ", regs->ip); | ||
208 | printk("\n"); | ||
209 | } | ||
210 | #endif | ||
211 | |||
504 | if (info) | 212 | if (info) |
505 | force_sig_info(signr, info, tsk); | 213 | force_sig_info(signr, info, tsk); |
506 | else | 214 | else |
@@ -515,29 +223,29 @@ kernel_trap: | |||
515 | } | 223 | } |
516 | return; | 224 | return; |
517 | 225 | ||
226 | #ifdef CONFIG_X86_32 | ||
518 | vm86_trap: | 227 | vm86_trap: |
519 | if (handle_vm86_trap((struct kernel_vm86_regs *) regs, | 228 | if (handle_vm86_trap((struct kernel_vm86_regs *) regs, |
520 | error_code, trapnr)) | 229 | error_code, trapnr)) |
521 | goto trap_signal; | 230 | goto trap_signal; |
522 | return; | 231 | return; |
232 | #endif | ||
523 | } | 233 | } |
524 | 234 | ||
525 | #define DO_ERROR(trapnr, signr, str, name) \ | 235 | #define DO_ERROR(trapnr, signr, str, name) \ |
526 | void do_##name(struct pt_regs *regs, long error_code) \ | 236 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ |
527 | { \ | 237 | { \ |
528 | trace_hardirqs_fixup(); \ | ||
529 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 238 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
530 | == NOTIFY_STOP) \ | 239 | == NOTIFY_STOP) \ |
531 | return; \ | 240 | return; \ |
532 | do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ | 241 | conditional_sti(regs); \ |
242 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
533 | } | 243 | } |
534 | 244 | ||
535 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ | 245 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ |
536 | void do_##name(struct pt_regs *regs, long error_code) \ | 246 | dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ |
537 | { \ | 247 | { \ |
538 | siginfo_t info; \ | 248 | siginfo_t info; \ |
539 | if (irq) \ | ||
540 | local_irq_enable(); \ | ||
541 | info.si_signo = signr; \ | 249 | info.si_signo = signr; \ |
542 | info.si_errno = 0; \ | 250 | info.si_errno = 0; \ |
543 | info.si_code = sicode; \ | 251 | info.si_code = sicode; \ |
@@ -545,90 +253,68 @@ void do_##name(struct pt_regs *regs, long error_code) \ | |||
545 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 253 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ |
546 | == NOTIFY_STOP) \ | 254 | == NOTIFY_STOP) \ |
547 | return; \ | 255 | return; \ |
548 | do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ | 256 | conditional_sti(regs); \ |
257 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
549 | } | 258 | } |
550 | 259 | ||
551 | #define DO_VM86_ERROR(trapnr, signr, str, name) \ | 260 | DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) |
552 | void do_##name(struct pt_regs *regs, long error_code) \ | 261 | DO_ERROR(4, SIGSEGV, "overflow", overflow) |
553 | { \ | 262 | DO_ERROR(5, SIGSEGV, "bounds", bounds) |
554 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | 263 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) |
555 | == NOTIFY_STOP) \ | ||
556 | return; \ | ||
557 | do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ | ||
558 | } | ||
559 | |||
560 | #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
561 | void do_##name(struct pt_regs *regs, long error_code) \ | ||
562 | { \ | ||
563 | siginfo_t info; \ | ||
564 | info.si_signo = signr; \ | ||
565 | info.si_errno = 0; \ | ||
566 | info.si_code = sicode; \ | ||
567 | info.si_addr = (void __user *)siaddr; \ | ||
568 | trace_hardirqs_fixup(); \ | ||
569 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
570 | == NOTIFY_STOP) \ | ||
571 | return; \ | ||
572 | do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ | ||
573 | } | ||
574 | |||
575 | DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | ||
576 | #ifndef CONFIG_KPROBES | ||
577 | DO_VM86_ERROR(3, SIGTRAP, "int3", int3) | ||
578 | #endif | ||
579 | DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) | ||
580 | DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) | ||
581 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) | ||
582 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | 264 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) |
583 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | 265 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) |
584 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | 266 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) |
267 | #ifdef CONFIG_X86_32 | ||
585 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) | 268 | DO_ERROR(12, SIGBUS, "stack segment", stack_segment) |
586 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) | 269 | #endif |
587 | DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) | 270 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) |
271 | |||
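The DO_ERROR()/DO_ERROR_INFO() macros above stamp out one thin entry point per trap vector, each of which funnels into the common do_trap(). A minimal userspace sketch of the same generator pattern follows; the handler body and trap numbers are purely illustrative, not the kernel's:

#include <signal.h>
#include <stdio.h>

/* Common handler, loosely analogous to do_trap(). */
static void handle_trap(int trapnr, int signr, const char *str)
{
	printf("trap %d (%s) -> signal %d\n", trapnr, str, signr);
}

/* Generator macro, loosely analogous to DO_ERROR(): each use defines
 * a dedicated entry point that forwards to the common code. */
#define DO_ERROR(trapnr, signr, str, name)	\
static void do_##name(void)			\
{						\
	handle_trap(trapnr, signr, str);	\
}

DO_ERROR( 4, SIGSEGV, "overflow",            overflow)
DO_ERROR(10, SIGSEGV, "invalid TSS",         invalid_TSS)
DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)

int main(void)
{
	do_overflow();
	do_invalid_TSS();
	do_segment_not_present();
	return 0;
}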
272 | #ifdef CONFIG_X86_64 | ||
273 | /* Runs on IST stack */ | ||
274 | dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) | ||
275 | { | ||
276 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | ||
277 | 12, SIGBUS) == NOTIFY_STOP) | ||
278 | return; | ||
279 | preempt_conditional_sti(regs); | ||
280 | do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | ||
281 | preempt_conditional_cli(regs); | ||
282 | } | ||
283 | |||
284 | dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | ||
285 | { | ||
286 | static const char str[] = "double fault"; | ||
287 | struct task_struct *tsk = current; | ||
288 | |||
	289 | /* Return not checked because double fault cannot be ignored */ | ||
290 | notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | ||
588 | 291 | ||
589 | void __kprobes | 292 | tsk->thread.error_code = error_code; |
293 | tsk->thread.trap_no = 8; | ||
294 | |||
295 | /* This is always a kernel trap and never fixable (and thus must | ||
296 | never return). */ | ||
297 | for (;;) | ||
298 | die(str, regs, error_code); | ||
299 | } | ||
300 | #endif | ||
301 | |||
302 | dotraplinkage void __kprobes | ||
590 | do_general_protection(struct pt_regs *regs, long error_code) | 303 | do_general_protection(struct pt_regs *regs, long error_code) |
591 | { | 304 | { |
592 | struct task_struct *tsk; | 305 | struct task_struct *tsk; |
593 | struct thread_struct *thread; | ||
594 | struct tss_struct *tss; | ||
595 | int cpu; | ||
596 | 306 | ||
597 | cpu = get_cpu(); | 307 | conditional_sti(regs); |
598 | tss = &per_cpu(init_tss, cpu); | ||
599 | thread = ¤t->thread; | ||
600 | |||
601 | /* | ||
602 | * Perform the lazy TSS's I/O bitmap copy. If the TSS has an | ||
603 | * invalid offset set (the LAZY one) and the faulting thread has | ||
604 | * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS | ||
	605 | * and we set the offset field correctly. Then we let the CPU | ||
606 | * restart the faulting instruction. | ||
607 | */ | ||
608 | if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && | ||
609 | thread->io_bitmap_ptr) { | ||
610 | memcpy(tss->io_bitmap, thread->io_bitmap_ptr, | ||
611 | thread->io_bitmap_max); | ||
612 | /* | ||
613 | * If the previously set map was extending to higher ports | ||
614 | * than the current one, pad extra space with 0xff (no access). | ||
615 | */ | ||
616 | if (thread->io_bitmap_max < tss->io_bitmap_max) { | ||
617 | memset((char *) tss->io_bitmap + | ||
618 | thread->io_bitmap_max, 0xff, | ||
619 | tss->io_bitmap_max - thread->io_bitmap_max); | ||
620 | } | ||
621 | tss->io_bitmap_max = thread->io_bitmap_max; | ||
622 | tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; | ||
623 | tss->io_bitmap_owner = thread; | ||
624 | put_cpu(); | ||
625 | 308 | ||
309 | #ifdef CONFIG_X86_32 | ||
310 | if (lazy_iobitmap_copy()) { | ||
311 | /* restart the faulting instruction */ | ||
626 | return; | 312 | return; |
627 | } | 313 | } |
628 | put_cpu(); | ||
629 | 314 | ||
630 | if (regs->flags & X86_VM_MASK) | 315 | if (regs->flags & X86_VM_MASK) |
631 | goto gp_in_vm86; | 316 | goto gp_in_vm86; |
317 | #endif | ||
632 | 318 | ||
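The lazy I/O-bitmap copy that used to live inline here (now factored into lazy_iobitmap_copy() on 32-bit) copies the faulting thread's bitmap into the TSS on the first #GP and pads any leftover tail of a previously larger bitmap with 0xff, i.e. "no access". A standalone sketch of just that copy-and-pad step, with sizes invented for the example:

#include <stdio.h>
#include <string.h>

#define TSS_BITMAP_BYTES 16	/* illustrative, not the real IO_BITMAP_BYTES */

int main(void)
{
	unsigned char tss_bitmap[TSS_BITMAP_BYTES];
	unsigned char thread_bitmap[4] = { 0x00, 0x0f, 0xff, 0x00 };
	size_t prev_max = 12;			/* previously installed, larger bitmap */
	size_t cur_max = sizeof(thread_bitmap);	/* this thread's bitmap is smaller */

	memset(tss_bitmap, 0, sizeof(tss_bitmap));

	/* Install the faulting thread's bitmap ... */
	memcpy(tss_bitmap, thread_bitmap, cur_max);

	/* ... and pad the ports the previous bitmap still covered with 0xff
	 * (no access), exactly like the removed code padded tss->io_bitmap. */
	if (cur_max < prev_max)
		memset(tss_bitmap + cur_max, 0xff, prev_max - cur_max);

	for (size_t i = 0; i < prev_max; i++)
		printf("%02x ", tss_bitmap[i]);
	printf("\n");
	return 0;
}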
633 | tsk = current; | 319 | tsk = current; |
634 | if (!user_mode(regs)) | 320 | if (!user_mode(regs)) |
@@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code) | |||
650 | force_sig(SIGSEGV, tsk); | 336 | force_sig(SIGSEGV, tsk); |
651 | return; | 337 | return; |
652 | 338 | ||
339 | #ifdef CONFIG_X86_32 | ||
653 | gp_in_vm86: | 340 | gp_in_vm86: |
654 | local_irq_enable(); | 341 | local_irq_enable(); |
655 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); | 342 | handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); |
656 | return; | 343 | return; |
344 | #endif | ||
657 | 345 | ||
658 | gp_in_kernel: | 346 | gp_in_kernel: |
659 | if (fixup_exception(regs)) | 347 | if (fixup_exception(regs)) |
@@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) | |||
690 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | 378 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); |
691 | 379 | ||
692 | /* Clear and disable the memory parity error line. */ | 380 | /* Clear and disable the memory parity error line. */ |
693 | clear_mem_error(reason); | 381 | reason = (reason & 0xf) | 4; |
382 | outb(reason, 0x61); | ||
694 | } | 383 | } |
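Open-coding the old clear_mem_error() helper makes the port 0x61 dance visible: on the classic PC NMI status/control register, bit 7 reports a memory parity/SERR error, bit 6 an IOCHK error, and setting bit 2 (or bit 3) clears and disables the corresponding source. A tiny sketch decoding a sample status byte with the same bit manipulation (no real port I/O here):

#include <stdio.h>

int main(void)
{
	unsigned char reason = 0x80;	/* pretend value read from port 0x61 */

	if (reason & 0x80)
		printf("memory parity / SERR asserted\n");
	if (reason & 0x40)
		printf("IOCHK asserted\n");

	/* Same transformation as the handler: keep the low control bits,
	 * set bit 2 to clear and disable the parity error line.  The real
	 * code then writes this back with outb(reason, 0x61). */
	reason = (reason & 0xf) | 4;
	printf("value written back: 0x%02x\n", reason);
	return 0;
}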
695 | 384 | ||
696 | static notrace __kprobes void | 385 | static notrace __kprobes void |
@@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) | |||
716 | static notrace __kprobes void | 405 | static notrace __kprobes void |
717 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | 406 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) |
718 | { | 407 | { |
719 | if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | 408 | if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == |
409 | NOTIFY_STOP) | ||
720 | return; | 410 | return; |
721 | #ifdef CONFIG_MCA | 411 | #ifdef CONFIG_MCA |
722 | /* | 412 | /* |
@@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | |||
739 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | 429 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); |
740 | } | 430 | } |
741 | 431 | ||
742 | static DEFINE_SPINLOCK(nmi_print_lock); | ||
743 | |||
744 | void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
745 | { | ||
746 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
747 | return; | ||
748 | |||
749 | spin_lock(&nmi_print_lock); | ||
750 | /* | ||
	751 | * We are in trouble anyway, let's at least try | ||
752 | * to get a message out: | ||
753 | */ | ||
754 | bust_spinlocks(1); | ||
755 | printk(KERN_EMERG "%s", str); | ||
756 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
757 | smp_processor_id(), regs->ip); | ||
758 | show_registers(regs); | ||
759 | if (do_panic) | ||
760 | panic("Non maskable interrupt"); | ||
761 | console_silent(); | ||
762 | spin_unlock(&nmi_print_lock); | ||
763 | bust_spinlocks(0); | ||
764 | |||
765 | /* | ||
766 | * If we are in kernel we are probably nested up pretty bad | ||
767 | * and might aswell get out now while we still can: | ||
	768 | * and might as well get out now while we still can: | ||
769 | if (!user_mode_vm(regs)) { | ||
770 | current->thread.trap_no = 2; | ||
771 | crash_kexec(regs); | ||
772 | } | ||
773 | |||
774 | do_exit(SIGSEGV); | ||
775 | } | ||
776 | |||
777 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | 432 | static notrace __kprobes void default_do_nmi(struct pt_regs *regs) |
778 | { | 433 | { |
779 | unsigned char reason = 0; | 434 | unsigned char reason = 0; |
@@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
812 | mem_parity_error(reason, regs); | 467 | mem_parity_error(reason, regs); |
813 | if (reason & 0x40) | 468 | if (reason & 0x40) |
814 | io_check_error(reason, regs); | 469 | io_check_error(reason, regs); |
470 | #ifdef CONFIG_X86_32 | ||
815 | /* | 471 | /* |
816 | * Reassert NMI in case it became active meanwhile | 472 | * Reassert NMI in case it became active meanwhile |
817 | * as it's edge-triggered: | 473 | * as it's edge-triggered: |
818 | */ | 474 | */ |
819 | reassert_nmi(); | 475 | reassert_nmi(); |
476 | #endif | ||
820 | } | 477 | } |
821 | 478 | ||
822 | notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) | 479 | dotraplinkage notrace __kprobes void |
480 | do_nmi(struct pt_regs *regs, long error_code) | ||
823 | { | 481 | { |
824 | int cpu; | ||
825 | |||
826 | nmi_enter(); | 482 | nmi_enter(); |
827 | 483 | ||
828 | cpu = smp_processor_id(); | 484 | #ifdef CONFIG_X86_32 |
829 | 485 | { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } | |
830 | ++nmi_count(cpu); | 486 | #else |
487 | add_pda(__nmi_count, 1); | ||
488 | #endif | ||
831 | 489 | ||
832 | if (!ignore_nmis) | 490 | if (!ignore_nmis) |
833 | default_do_nmi(regs); | 491 | default_do_nmi(regs); |
@@ -847,21 +505,44 @@ void restart_nmi(void) | |||
847 | acpi_nmi_enable(); | 505 | acpi_nmi_enable(); |
848 | } | 506 | } |
849 | 507 | ||
850 | #ifdef CONFIG_KPROBES | 508 | /* May run on IST stack. */ |
851 | void __kprobes do_int3(struct pt_regs *regs, long error_code) | 509 | dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) |
852 | { | 510 | { |
853 | trace_hardirqs_fixup(); | 511 | #ifdef CONFIG_KPROBES |
854 | |||
855 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | 512 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) |
856 | == NOTIFY_STOP) | 513 | == NOTIFY_STOP) |
857 | return; | 514 | return; |
858 | /* | 515 | #else |
859 | * This is an interrupt gate, because kprobes wants interrupts | 516 | if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) |
860 | * disabled. Normal trap handlers don't. | 517 | == NOTIFY_STOP) |
861 | */ | 518 | return; |
862 | restore_interrupts(regs); | 519 | #endif |
520 | |||
521 | preempt_conditional_sti(regs); | ||
522 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
523 | preempt_conditional_cli(regs); | ||
524 | } | ||
863 | 525 | ||
864 | do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); | 526 | #ifdef CONFIG_X86_64 |
527 | /* Help handler running on IST stack to switch back to user stack | ||
528 | for scheduling or signal handling. The actual stack switch is done in | ||
529 | entry.S */ | ||
530 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | ||
531 | { | ||
532 | struct pt_regs *regs = eregs; | ||
533 | /* Did already sync */ | ||
534 | if (eregs == (struct pt_regs *)eregs->sp) | ||
535 | ; | ||
536 | /* Exception from user space */ | ||
537 | else if (user_mode(eregs)) | ||
538 | regs = task_pt_regs(current); | ||
539 | /* Exception from kernel and interrupts are enabled. Move to | ||
540 | kernel process stack. */ | ||
541 | else if (eregs->flags & X86_EFLAGS_IF) | ||
542 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | ||
543 | if (eregs != regs) | ||
544 | *regs = *eregs; | ||
545 | return regs; | ||
865 | } | 546 | } |
866 | #endif | 547 | #endif |
867 | 548 | ||
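sync_regs() above picks where the saved register frame should live when an IST handler wants to behave like an ordinary kernel entry: leave it alone if it already sits at the top of the stack it describes, use the task's normal pt_regs slot for a user-mode fault, or carve space below the interrupted kernel stack when interrupts were enabled. A rough, self-contained sketch of that three-way decision on a mock frame; the struct and field names are invented for the example:

#include <stdio.h>

#define FLAG_IF 0x200	/* stand-in for X86_EFLAGS_IF */

struct mock_regs {
	unsigned long sp;	/* stack pointer at the time of the trap */
	unsigned long flags;
	int from_user;		/* stand-in for user_mode(regs) */
};

/* Mirrors the decision order in sync_regs(): already synced, user
 * fault, or kernel fault with interrupts enabled. */
static const char *pick_stack(const struct mock_regs *eregs,
			      unsigned long eregs_addr)
{
	if (eregs_addr == eregs->sp)
		return "already on the right stack";
	if (eregs->from_user)
		return "use task_pt_regs(current)";
	if (eregs->flags & FLAG_IF)
		return "move frame onto the kernel process stack";
	return "stay on the IST stack";
}

int main(void)
{
	struct mock_regs r = { .sp = 0x1000, .flags = FLAG_IF, .from_user = 0 };

	printf("%s\n", pick_stack(&r, 0x1000));	/* already synced */
	r.from_user = 1;
	printf("%s\n", pick_stack(&r, 0x2000));	/* user-mode fault */
	r.from_user = 0;
	printf("%s\n", pick_stack(&r, 0x2000));	/* kernel, IRQs were on */
	return 0;
}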
@@ -886,15 +567,15 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) | |||
886 | * about restoring all the debug state, and ptrace doesn't have to | 567 | * about restoring all the debug state, and ptrace doesn't have to |
887 | * find every occurrence of the TF bit that could be saved away even | 568 | * find every occurrence of the TF bit that could be saved away even |
888 | * by user code) | 569 | * by user code) |
570 | * | ||
571 | * May run on IST stack. | ||
889 | */ | 572 | */ |
890 | void __kprobes do_debug(struct pt_regs *regs, long error_code) | 573 | dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) |
891 | { | 574 | { |
892 | struct task_struct *tsk = current; | 575 | struct task_struct *tsk = current; |
893 | unsigned int condition; | 576 | unsigned long condition; |
894 | int si_code; | 577 | int si_code; |
895 | 578 | ||
896 | trace_hardirqs_fixup(); | ||
897 | |||
898 | get_debugreg(condition, 6); | 579 | get_debugreg(condition, 6); |
899 | 580 | ||
900 | /* | 581 | /* |
@@ -906,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
906 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | 587 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, |
907 | SIGTRAP) == NOTIFY_STOP) | 588 | SIGTRAP) == NOTIFY_STOP) |
908 | return; | 589 | return; |
590 | |||
909 | /* It's safe to allow irq's after DR6 has been saved */ | 591 | /* It's safe to allow irq's after DR6 has been saved */ |
910 | if (regs->flags & X86_EFLAGS_IF) | 592 | preempt_conditional_sti(regs); |
911 | local_irq_enable(); | ||
912 | 593 | ||
913 | /* Mask out spurious debug traps due to lazy DR7 setting */ | 594 | /* Mask out spurious debug traps due to lazy DR7 setting */ |
914 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | 595 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { |
@@ -916,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
916 | goto clear_dr7; | 597 | goto clear_dr7; |
917 | } | 598 | } |
918 | 599 | ||
600 | #ifdef CONFIG_X86_32 | ||
919 | if (regs->flags & X86_VM_MASK) | 601 | if (regs->flags & X86_VM_MASK) |
920 | goto debug_vm86; | 602 | goto debug_vm86; |
603 | #endif | ||
921 | 604 | ||
922 | /* Save debug status register where ptrace can see it */ | 605 | /* Save debug status register where ptrace can see it */ |
923 | tsk->thread.debugreg6 = condition; | 606 | tsk->thread.debugreg6 = condition; |
@@ -927,16 +610,11 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
927 | * kernel space (but re-enable TF when returning to user mode). | 610 | * kernel space (but re-enable TF when returning to user mode). |
928 | */ | 611 | */ |
929 | if (condition & DR_STEP) { | 612 | if (condition & DR_STEP) { |
930 | /* | ||
931 | * We already checked v86 mode above, so we can | ||
932 | * check for kernel mode by just checking the CPL | ||
933 | * of CS. | ||
934 | */ | ||
935 | if (!user_mode(regs)) | 613 | if (!user_mode(regs)) |
936 | goto clear_TF_reenable; | 614 | goto clear_TF_reenable; |
937 | } | 615 | } |
938 | 616 | ||
939 | si_code = get_si_code((unsigned long)condition); | 617 | si_code = get_si_code(condition); |
940 | /* Ok, finally something we can handle */ | 618 | /* Ok, finally something we can handle */ |
941 | send_sigtrap(tsk, regs, error_code, si_code); | 619 | send_sigtrap(tsk, regs, error_code, si_code); |
942 | 620 | ||
@@ -946,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
946 | */ | 624 | */ |
947 | clear_dr7: | 625 | clear_dr7: |
948 | set_debugreg(0, 7); | 626 | set_debugreg(0, 7); |
627 | preempt_conditional_cli(regs); | ||
949 | return; | 628 | return; |
950 | 629 | ||
630 | #ifdef CONFIG_X86_32 | ||
951 | debug_vm86: | 631 | debug_vm86: |
952 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); | 632 | handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); |
633 | preempt_conditional_cli(regs); | ||
953 | return; | 634 | return; |
635 | #endif | ||
954 | 636 | ||
955 | clear_TF_reenable: | 637 | clear_TF_reenable: |
956 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | 638 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); |
957 | regs->flags &= ~X86_EFLAGS_TF; | 639 | regs->flags &= ~X86_EFLAGS_TF; |
640 | preempt_conditional_cli(regs); | ||
958 | return; | 641 | return; |
959 | } | 642 | } |
960 | 643 | ||
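do_debug() above masks DR6 against the four DR_TRAPx bits and DR_STEP to tell hardware breakpoints apart from single-stepping: on x86 the low four bits of DR6 flag which breakpoint register fired and bit 14 flags a single-step trap. A small standalone decoder over a sample value:

#include <stdio.h>

#define DR_TRAP0 0x0001	/* breakpoints 0..3 occupy bits 0..3 */
#define DR_TRAP1 0x0002
#define DR_TRAP2 0x0004
#define DR_TRAP3 0x0008
#define DR_STEP  0x4000	/* single-step (TF) trap */

int main(void)
{
	unsigned long condition = DR_TRAP1 | DR_STEP;	/* pretend DR6 value */

	for (int i = 0; i < 4; i++)
		if (condition & (1UL << i))
			printf("hardware breakpoint %d hit\n", i);
	if (condition & DR_STEP)
		printf("single-step trap\n");
	return 0;
}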
644 | #ifdef CONFIG_X86_64 | ||
645 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
646 | { | ||
647 | if (fixup_exception(regs)) | ||
648 | return 1; | ||
649 | |||
650 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
651 | /* Illegal floating point operation in the kernel */ | ||
652 | current->thread.trap_no = trapnr; | ||
653 | die(str, regs, 0); | ||
654 | return 0; | ||
655 | } | ||
656 | #endif | ||
657 | |||
961 | /* | 658 | /* |
962 | * Note that we play around with the 'TS' bit in an attempt to get | 659 | * Note that we play around with the 'TS' bit in an attempt to get |
963 | * the correct behaviour even in the presence of the asynchronous | 660 | * the correct behaviour even in the presence of the asynchronous |
@@ -994,7 +691,9 @@ void math_error(void __user *ip) | |||
994 | swd = get_fpu_swd(task); | 691 | swd = get_fpu_swd(task); |
995 | switch (swd & ~cwd & 0x3f) { | 692 | switch (swd & ~cwd & 0x3f) { |
996 | case 0x000: /* No unmasked exception */ | 693 | case 0x000: /* No unmasked exception */ |
694 | #ifdef CONFIG_X86_32 | ||
997 | return; | 695 | return; |
696 | #endif | ||
998 | default: /* Multiple exceptions */ | 697 | default: /* Multiple exceptions */ |
999 | break; | 698 | break; |
1000 | case 0x001: /* Invalid Op */ | 699 | case 0x001: /* Invalid Op */ |
@@ -1022,9 +721,18 @@ void math_error(void __user *ip) | |||
1022 | force_sig_info(SIGFPE, &info, task); | 721 | force_sig_info(SIGFPE, &info, task); |
1023 | } | 722 | } |
1024 | 723 | ||
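math_error() above reads the FPU control and status words and keys off swd & ~cwd & 0x3f, i.e. the exception bits that are both pending and unmasked, to choose the si_code. The x87 exception bits are IE/DE/ZE/OE/UE/PE in bits 0..5; a standalone decoder over sample words (the sample values are made up):

#include <stdio.h>

int main(void)
{
	unsigned short cwd = 0x037b;	/* sample control word: zero-divide unmasked */
	unsigned short swd = 0x0004;	/* sample status word: zero-divide pending */

	switch (swd & ~cwd & 0x3f) {
	case 0x000: printf("no unmasked exception\n"); break;
	case 0x001: printf("invalid operation -> FPE_FLTINV\n"); break;
	case 0x002: printf("denormal operand\n"); break;
	case 0x004: printf("divide by zero    -> FPE_FLTDIV\n"); break;
	case 0x008: printf("overflow          -> FPE_FLTOVF\n"); break;
	case 0x010: printf("underflow         -> FPE_FLTUND\n"); break;
	case 0x020: printf("precision         -> FPE_FLTRES\n"); break;
	default:    printf("multiple exceptions\n"); break;
	}
	return 0;
}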
1025 | void do_coprocessor_error(struct pt_regs *regs, long error_code) | 724 | dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) |
1026 | { | 725 | { |
726 | conditional_sti(regs); | ||
727 | |||
728 | #ifdef CONFIG_X86_32 | ||
1027 | ignore_fpu_irq = 1; | 729 | ignore_fpu_irq = 1; |
730 | #else | ||
731 | if (!user_mode(regs) && | ||
732 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
733 | return; | ||
734 | #endif | ||
735 | |||
1028 | math_error((void __user *)regs->ip); | 736 | math_error((void __user *)regs->ip); |
1029 | } | 737 | } |
1030 | 738 | ||
@@ -1076,8 +784,12 @@ static void simd_math_error(void __user *ip) | |||
1076 | force_sig_info(SIGFPE, &info, task); | 784 | force_sig_info(SIGFPE, &info, task); |
1077 | } | 785 | } |
1078 | 786 | ||
1079 | void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | 787 | dotraplinkage void |
788 | do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | ||
1080 | { | 789 | { |
790 | conditional_sti(regs); | ||
791 | |||
792 | #ifdef CONFIG_X86_32 | ||
1081 | if (cpu_has_xmm) { | 793 | if (cpu_has_xmm) { |
1082 | /* Handle SIMD FPU exceptions on PIII+ processors. */ | 794 | /* Handle SIMD FPU exceptions on PIII+ processors. */ |
1083 | ignore_fpu_irq = 1; | 795 | ignore_fpu_irq = 1; |
@@ -1096,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) | |||
1096 | current->thread.error_code = error_code; | 808 | current->thread.error_code = error_code; |
1097 | die_if_kernel("cache flush denied", regs, error_code); | 809 | die_if_kernel("cache flush denied", regs, error_code); |
1098 | force_sig(SIGSEGV, current); | 810 | force_sig(SIGSEGV, current); |
811 | #else | ||
812 | if (!user_mode(regs) && | ||
813 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
814 | return; | ||
815 | simd_math_error((void __user *)regs->ip); | ||
816 | #endif | ||
1099 | } | 817 | } |
1100 | 818 | ||
1101 | void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | 819 | dotraplinkage void |
820 | do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) | ||
1102 | { | 821 | { |
822 | conditional_sti(regs); | ||
1103 | #if 0 | 823 | #if 0 |
1104 | /* No need to warn about this any longer. */ | 824 | /* No need to warn about this any longer. */ |
1105 | printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); | 825 | printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); |
1106 | #endif | 826 | #endif |
1107 | } | 827 | } |
1108 | 828 | ||
829 | #ifdef CONFIG_X86_32 | ||
1109 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | 830 | unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) |
1110 | { | 831 | { |
1111 | struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); | 832 | struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); |
@@ -1124,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) | |||
1124 | 845 | ||
1125 | return new_kesp; | 846 | return new_kesp; |
1126 | } | 847 | } |
848 | #else | ||
849 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
850 | { | ||
851 | } | ||
852 | |||
853 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | ||
854 | { | ||
855 | } | ||
856 | #endif | ||
1127 | 857 | ||
1128 | /* | 858 | /* |
1129 | * 'math_state_restore()' saves the current math information in the | 859 | * 'math_state_restore()' saves the current math information in the |
@@ -1156,14 +886,24 @@ asmlinkage void math_state_restore(void) | |||
1156 | } | 886 | } |
1157 | 887 | ||
1158 | clts(); /* Allow maths ops (or we recurse) */ | 888 | clts(); /* Allow maths ops (or we recurse) */ |
889 | #ifdef CONFIG_X86_32 | ||
1159 | restore_fpu(tsk); | 890 | restore_fpu(tsk); |
891 | #else | ||
892 | /* | ||
893 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
894 | */ | ||
895 | if (unlikely(restore_fpu_checking(tsk))) { | ||
896 | stts(); | ||
897 | force_sig(SIGSEGV, tsk); | ||
898 | return; | ||
899 | } | ||
900 | #endif | ||
1160 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ | 901 | thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ |
1161 | tsk->fpu_counter++; | 902 | tsk->fpu_counter++; |
1162 | } | 903 | } |
1163 | EXPORT_SYMBOL_GPL(math_state_restore); | 904 | EXPORT_SYMBOL_GPL(math_state_restore); |
1164 | 905 | ||
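math_state_restore() implements the lazy-FPU scheme: the device-not-available trap fires because CR0.TS was left set, the handler clears TS, loads the task's saved FPU image (with the 64-bit path now verifying the restore and sending SIGSEGV on failure), and marks the task as having used the FPU so the state is saved again at the next context switch. As a loose userspace analogy of that restore-on-first-use pattern only, not kernel code:

#include <stdio.h>
#include <string.h>

struct task {
	const char *name;
	double fpu_state[8];	/* stand-in for the saved x87/SSE image */
	int used_fpu;		/* stand-in for TS_USEDFPU */
};

/* Loose analogue of math_state_restore(): called on "first FPU use",
 * i.e. when the device-not-available trap would fire because TS is set. */
static void lazy_restore(struct task *tsk, double *live_regs)
{
	memcpy(live_regs, tsk->fpu_state, sizeof(tsk->fpu_state));
	tsk->used_fpu = 1;	/* so the state is saved again on switch-out */
	printf("%s: FPU state restored lazily\n", tsk->name);
}

int main(void)
{
	double live[8] = { 0 };
	struct task t = { .name = "demo", .fpu_state = { 3.14 } };

	lazy_restore(&t, live);
	printf("st0 = %g\n", live[0]);
	return 0;
}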
1165 | #ifndef CONFIG_MATH_EMULATION | 906 | #ifndef CONFIG_MATH_EMULATION |
1166 | |||
1167 | asmlinkage void math_emulate(long arg) | 907 | asmlinkage void math_emulate(long arg) |
1168 | { | 908 | { |
1169 | printk(KERN_EMERG | 909 | printk(KERN_EMERG |
@@ -1172,12 +912,54 @@ asmlinkage void math_emulate(long arg) | |||
1172 | force_sig(SIGFPE, current); | 912 | force_sig(SIGFPE, current); |
1173 | schedule(); | 913 | schedule(); |
1174 | } | 914 | } |
1175 | |||
1176 | #endif /* CONFIG_MATH_EMULATION */ | 915 | #endif /* CONFIG_MATH_EMULATION */ |
1177 | 916 | ||
917 | dotraplinkage void __kprobes | ||
918 | do_device_not_available(struct pt_regs *regs, long error) | ||
919 | { | ||
920 | #ifdef CONFIG_X86_32 | ||
921 | if (read_cr0() & X86_CR0_EM) { | ||
922 | conditional_sti(regs); | ||
923 | math_emulate(0); | ||
924 | } else { | ||
925 | math_state_restore(); /* interrupts still off */ | ||
926 | conditional_sti(regs); | ||
927 | } | ||
928 | #else | ||
929 | math_state_restore(); | ||
930 | #endif | ||
931 | } | ||
932 | |||
933 | #ifdef CONFIG_X86_32 | ||
934 | #ifdef CONFIG_X86_MCE | ||
935 | dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) | ||
936 | { | ||
937 | conditional_sti(regs); | ||
938 | machine_check_vector(regs, error); | ||
939 | } | ||
940 | #endif | ||
941 | |||
942 | dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) | ||
943 | { | ||
944 | siginfo_t info; | ||
945 | local_irq_enable(); | ||
946 | |||
947 | info.si_signo = SIGILL; | ||
948 | info.si_errno = 0; | ||
949 | info.si_code = ILL_BADSTK; | ||
950 | info.si_addr = 0; | ||
951 | if (notify_die(DIE_TRAP, "iret exception", | ||
952 | regs, error_code, 32, SIGILL) == NOTIFY_STOP) | ||
953 | return; | ||
954 | do_trap(32, SIGILL, "iret exception", regs, error_code, &info); | ||
955 | } | ||
956 | #endif | ||
957 | |||
1178 | void __init trap_init(void) | 958 | void __init trap_init(void) |
1179 | { | 959 | { |
960 | #ifdef CONFIG_X86_32 | ||
1180 | int i; | 961 | int i; |
962 | #endif | ||
1181 | 963 | ||
1182 | #ifdef CONFIG_EISA | 964 | #ifdef CONFIG_EISA |
1183 | void __iomem *p = early_ioremap(0x0FFFD9, 4); | 965 | void __iomem *p = early_ioremap(0x0FFFD9, 4); |
@@ -1187,29 +969,40 @@ void __init trap_init(void) | |||
1187 | early_iounmap(p, 4); | 969 | early_iounmap(p, 4); |
1188 | #endif | 970 | #endif |
1189 | 971 | ||
1190 | set_trap_gate(0, ÷_error); | 972 | set_intr_gate(0, ÷_error); |
1191 | set_intr_gate(1, &debug); | 973 | set_intr_gate_ist(1, &debug, DEBUG_STACK); |
1192 | set_intr_gate(2, &nmi); | 974 | set_intr_gate_ist(2, &nmi, NMI_STACK); |
1193 | set_system_intr_gate(3, &int3); /* int3 can be called from all */ | 975 | /* int3 can be called from all */ |
1194 | set_system_gate(4, &overflow); /* int4 can be called from all */ | 976 | set_system_intr_gate_ist(3, &int3, DEBUG_STACK); |
1195 | set_trap_gate(5, &bounds); | 977 | /* int4 can be called from all */ |
1196 | set_trap_gate(6, &invalid_op); | 978 | set_system_intr_gate(4, &overflow); |
1197 | set_trap_gate(7, &device_not_available); | 979 | set_intr_gate(5, &bounds); |
980 | set_intr_gate(6, &invalid_op); | ||
981 | set_intr_gate(7, &device_not_available); | ||
982 | #ifdef CONFIG_X86_32 | ||
1198 | set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); | 983 | set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); |
1199 | set_trap_gate(9, &coprocessor_segment_overrun); | 984 | #else |
1200 | set_trap_gate(10, &invalid_TSS); | 985 | set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); |
1201 | set_trap_gate(11, &segment_not_present); | 986 | #endif |
1202 | set_trap_gate(12, &stack_segment); | 987 | set_intr_gate(9, &coprocessor_segment_overrun); |
1203 | set_trap_gate(13, &general_protection); | 988 | set_intr_gate(10, &invalid_TSS); |
989 | set_intr_gate(11, &segment_not_present); | ||
990 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); | ||
991 | set_intr_gate(13, &general_protection); | ||
1204 | set_intr_gate(14, &page_fault); | 992 | set_intr_gate(14, &page_fault); |
1205 | set_trap_gate(15, &spurious_interrupt_bug); | 993 | set_intr_gate(15, &spurious_interrupt_bug); |
1206 | set_trap_gate(16, &coprocessor_error); | 994 | set_intr_gate(16, &coprocessor_error); |
1207 | set_trap_gate(17, &alignment_check); | 995 | set_intr_gate(17, &alignment_check); |
1208 | #ifdef CONFIG_X86_MCE | 996 | #ifdef CONFIG_X86_MCE |
1209 | set_trap_gate(18, &machine_check); | 997 | set_intr_gate_ist(18, &machine_check, MCE_STACK); |
1210 | #endif | 998 | #endif |
1211 | set_trap_gate(19, &simd_coprocessor_error); | 999 | set_intr_gate(19, &simd_coprocessor_error); |
1212 | 1000 | ||
1001 | #ifdef CONFIG_IA32_EMULATION | ||
1002 | set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
1003 | #endif | ||
1004 | |||
1005 | #ifdef CONFIG_X86_32 | ||
1213 | if (cpu_has_fxsr) { | 1006 | if (cpu_has_fxsr) { |
1214 | printk(KERN_INFO "Enabling fast FPU save and restore... "); | 1007 | printk(KERN_INFO "Enabling fast FPU save and restore... "); |
1215 | set_in_cr4(X86_CR4_OSFXSR); | 1008 | set_in_cr4(X86_CR4_OSFXSR); |
@@ -1222,36 +1015,20 @@ void __init trap_init(void) | |||
1222 | printk("done.\n"); | 1015 | printk("done.\n"); |
1223 | } | 1016 | } |
1224 | 1017 | ||
1225 | set_system_gate(SYSCALL_VECTOR, &system_call); | 1018 | set_system_trap_gate(SYSCALL_VECTOR, &system_call); |
1226 | 1019 | ||
1227 | /* Reserve all the builtin and the syscall vector: */ | 1020 | /* Reserve all the builtin and the syscall vector: */ |
1228 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) | 1021 | for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) |
1229 | set_bit(i, used_vectors); | 1022 | set_bit(i, used_vectors); |
1230 | 1023 | ||
1231 | set_bit(SYSCALL_VECTOR, used_vectors); | 1024 | set_bit(SYSCALL_VECTOR, used_vectors); |
1232 | 1025 | #endif | |
1233 | /* | 1026 | /* |
1234 | * Should be a barrier for any external CPU state: | 1027 | * Should be a barrier for any external CPU state: |
1235 | */ | 1028 | */ |
1236 | cpu_init(); | 1029 | cpu_init(); |
1237 | 1030 | ||
1031 | #ifdef CONFIG_X86_32 | ||
1238 | trap_init_hook(); | 1032 | trap_init_hook(); |
1033 | #endif | ||
1239 | } | 1034 | } |
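The rewritten trap_init() is where the 32-bit and 64-bit conventions visibly meet: every vector now gets an interrupt gate, and on 64-bit the critical ones (#DB, NMI, #DF, #SS, #MC) are additionally pointed at dedicated IST stacks. A compile-and-run summary of that vector-to-stack mapping as set up above; the stack names are the kernel's, the table itself is just illustrative:

#include <stdio.h>

struct gate {
	int vector;
	const char *name;
	const char *ist_stack;	/* NULL: runs on the normal kernel stack */
};

static const struct gate gates[] = {
	{  1, "debug",              "DEBUG_STACK"       },
	{  2, "nmi",                "NMI_STACK"         },
	{  3, "int3",               "DEBUG_STACK"       },
	{  8, "double_fault",       "DOUBLEFAULT_STACK" },
	{ 12, "stack_segment",      "STACKFAULT_STACK"  },
	{ 13, "general_protection", NULL                },
	{ 14, "page_fault",         NULL                },
	{ 18, "machine_check",      "MCE_STACK"         },
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(gates) / sizeof(gates[0]); i++)
		printf("vector %2d  %-20s %s\n", gates[i].vector, gates[i].name,
		       gates[i].ist_stack ? gates[i].ist_stack : "kernel stack");
	return 0;
}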
1240 | |||
1241 | static int __init kstack_setup(char *s) | ||
1242 | { | ||
1243 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
1244 | |||
1245 | return 1; | ||
1246 | } | ||
1247 | __setup("kstack=", kstack_setup); | ||
1248 | |||
1249 | static int __init code_bytes_setup(char *s) | ||
1250 | { | ||
1251 | code_bytes = simple_strtoul(s, NULL, 0); | ||
1252 | if (code_bytes > 8192) | ||
1253 | code_bytes = 8192; | ||
1254 | |||
1255 | return 1; | ||
1256 | } | ||
1257 | __setup("code_bytes=", code_bytes_setup); | ||
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c deleted file mode 100644 index 9c0ac0cab013..000000000000 --- a/arch/x86/kernel/traps_64.c +++ /dev/null | |||
@@ -1,1214 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
3 | * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs | ||
4 | * | ||
5 | * Pentium III FXSR, SSE support | ||
6 | * Gareth Hughes <gareth@valinux.com>, May 2000 | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * 'Traps.c' handles hardware traps and faults after we have saved some | ||
11 | * state in 'entry.S'. | ||
12 | */ | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/interrupt.h> | ||
15 | #include <linux/kallsyms.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/kprobes.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/utsname.h> | ||
20 | #include <linux/kdebug.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/ptrace.h> | ||
24 | #include <linux/string.h> | ||
25 | #include <linux/unwind.h> | ||
26 | #include <linux/delay.h> | ||
27 | #include <linux/errno.h> | ||
28 | #include <linux/kexec.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/timer.h> | ||
31 | #include <linux/init.h> | ||
32 | #include <linux/bug.h> | ||
33 | #include <linux/nmi.h> | ||
34 | #include <linux/mm.h> | ||
35 | #include <linux/smp.h> | ||
36 | #include <linux/io.h> | ||
37 | |||
38 | #if defined(CONFIG_EDAC) | ||
39 | #include <linux/edac.h> | ||
40 | #endif | ||
41 | |||
42 | #include <asm/stacktrace.h> | ||
43 | #include <asm/processor.h> | ||
44 | #include <asm/debugreg.h> | ||
45 | #include <asm/atomic.h> | ||
46 | #include <asm/system.h> | ||
47 | #include <asm/unwind.h> | ||
48 | #include <asm/desc.h> | ||
49 | #include <asm/i387.h> | ||
50 | #include <asm/pgalloc.h> | ||
51 | #include <asm/proto.h> | ||
52 | #include <asm/pda.h> | ||
53 | #include <asm/traps.h> | ||
54 | |||
55 | #include <mach_traps.h> | ||
56 | |||
57 | int panic_on_unrecovered_nmi; | ||
58 | int kstack_depth_to_print = 12; | ||
59 | static unsigned int code_bytes = 64; | ||
60 | static int ignore_nmis; | ||
61 | static int die_counter; | ||
62 | |||
63 | static inline void conditional_sti(struct pt_regs *regs) | ||
64 | { | ||
65 | if (regs->flags & X86_EFLAGS_IF) | ||
66 | local_irq_enable(); | ||
67 | } | ||
68 | |||
69 | static inline void preempt_conditional_sti(struct pt_regs *regs) | ||
70 | { | ||
71 | inc_preempt_count(); | ||
72 | if (regs->flags & X86_EFLAGS_IF) | ||
73 | local_irq_enable(); | ||
74 | } | ||
75 | |||
76 | static inline void preempt_conditional_cli(struct pt_regs *regs) | ||
77 | { | ||
78 | if (regs->flags & X86_EFLAGS_IF) | ||
79 | local_irq_disable(); | ||
80 | /* Make sure to not schedule here because we could be running | ||
81 | on an exception stack. */ | ||
82 | dec_preempt_count(); | ||
83 | } | ||
84 | |||
85 | void printk_address(unsigned long address, int reliable) | ||
86 | { | ||
87 | printk(" [<%016lx>] %s%pS\n", | ||
88 | address, reliable ? "" : "? ", (void *) address); | ||
89 | } | ||
90 | |||
91 | static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, | ||
92 | unsigned *usedp, char **idp) | ||
93 | { | ||
94 | static char ids[][8] = { | ||
95 | [DEBUG_STACK - 1] = "#DB", | ||
96 | [NMI_STACK - 1] = "NMI", | ||
97 | [DOUBLEFAULT_STACK - 1] = "#DF", | ||
98 | [STACKFAULT_STACK - 1] = "#SS", | ||
99 | [MCE_STACK - 1] = "#MC", | ||
100 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
101 | [N_EXCEPTION_STACKS ... | ||
102 | N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" | ||
103 | #endif | ||
104 | }; | ||
105 | unsigned k; | ||
106 | |||
107 | /* | ||
108 | * Iterate over all exception stacks, and figure out whether | ||
109 | * 'stack' is in one of them: | ||
110 | */ | ||
111 | for (k = 0; k < N_EXCEPTION_STACKS; k++) { | ||
112 | unsigned long end = per_cpu(orig_ist, cpu).ist[k]; | ||
113 | /* | ||
114 | * Is 'stack' above this exception frame's end? | ||
115 | * If yes then skip to the next frame. | ||
116 | */ | ||
117 | if (stack >= end) | ||
118 | continue; | ||
119 | /* | ||
120 | * Is 'stack' above this exception frame's start address? | ||
121 | * If yes then we found the right frame. | ||
122 | */ | ||
123 | if (stack >= end - EXCEPTION_STKSZ) { | ||
124 | /* | ||
125 | * Make sure we only iterate through an exception | ||
126 | * stack once. If it comes up for the second time | ||
127 | * then there's something wrong going on - just | ||
128 | * break out and return NULL: | ||
129 | */ | ||
130 | if (*usedp & (1U << k)) | ||
131 | break; | ||
132 | *usedp |= 1U << k; | ||
133 | *idp = ids[k]; | ||
134 | return (unsigned long *)end; | ||
135 | } | ||
136 | /* | ||
137 | * If this is a debug stack, and if it has a larger size than | ||
138 | * the usual exception stacks, then 'stack' might still | ||
139 | * be within the lower portion of the debug stack: | ||
140 | */ | ||
141 | #if DEBUG_STKSZ > EXCEPTION_STKSZ | ||
142 | if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { | ||
143 | unsigned j = N_EXCEPTION_STACKS - 1; | ||
144 | |||
145 | /* | ||
146 | * Black magic. A large debug stack is composed of | ||
147 | * multiple exception stack entries, which we | ||
	148 | * iterate through now. Don't look: | ||
149 | */ | ||
150 | do { | ||
151 | ++j; | ||
152 | end -= EXCEPTION_STKSZ; | ||
153 | ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); | ||
154 | } while (stack < end - EXCEPTION_STKSZ); | ||
155 | if (*usedp & (1U << j)) | ||
156 | break; | ||
157 | *usedp |= 1U << j; | ||
158 | *idp = ids[j]; | ||
159 | return (unsigned long *)end; | ||
160 | } | ||
161 | #endif | ||
162 | } | ||
163 | return NULL; | ||
164 | } | ||
165 | |||
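in_exception_stack() above is essentially a range check: each per-CPU IST stack is known by its end address, so a pointer belongs to stack k when it lies in [end - EXCEPTION_STKSZ, end). A self-contained version of that containment test with made-up addresses and an illustrative stack size:

#include <stdio.h>

#define EXCEPTION_STKSZ 4096UL	/* illustrative size */

/* Return the index of the exception stack containing addr, or -1. */
static int which_stack(unsigned long addr, const unsigned long *ends, int n)
{
	for (int k = 0; k < n; k++) {
		if (addr >= ends[k])
			continue;			/* above this stack */
		if (addr >= ends[k] - EXCEPTION_STKSZ)
			return k;			/* in [end - size, end) */
	}
	return -1;
}

int main(void)
{
	/* Pretend per-CPU IST stack end addresses (orig_ist analogue). */
	const unsigned long ends[] = { 0x10000, 0x20000, 0x30000 };

	printf("%d\n", which_stack(0x1f800, ends, 3));	/* -> 1 */
	printf("%d\n", which_stack(0x08000, ends, 3));	/* -> -1 */
	return 0;
}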
166 | /* | ||
167 | * x86-64 can have up to three kernel stacks: | ||
168 | * process stack | ||
169 | * interrupt stack | ||
170 | * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack | ||
171 | */ | ||
172 | |||
173 | static inline int valid_stack_ptr(struct thread_info *tinfo, | ||
174 | void *p, unsigned int size, void *end) | ||
175 | { | ||
176 | void *t = tinfo; | ||
177 | if (end) { | ||
178 | if (p < end && p >= (end-THREAD_SIZE)) | ||
179 | return 1; | ||
180 | else | ||
181 | return 0; | ||
182 | } | ||
183 | return p > t && p < t + THREAD_SIZE - size; | ||
184 | } | ||
185 | |||
186 | /* The form of the top of the frame on the stack */ | ||
187 | struct stack_frame { | ||
188 | struct stack_frame *next_frame; | ||
189 | unsigned long return_address; | ||
190 | }; | ||
191 | |||
192 | static inline unsigned long | ||
193 | print_context_stack(struct thread_info *tinfo, | ||
194 | unsigned long *stack, unsigned long bp, | ||
195 | const struct stacktrace_ops *ops, void *data, | ||
196 | unsigned long *end) | ||
197 | { | ||
198 | struct stack_frame *frame = (struct stack_frame *)bp; | ||
199 | |||
200 | while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { | ||
201 | unsigned long addr; | ||
202 | |||
203 | addr = *stack; | ||
204 | if (__kernel_text_address(addr)) { | ||
205 | if ((unsigned long) stack == bp + 8) { | ||
206 | ops->address(data, addr, 1); | ||
207 | frame = frame->next_frame; | ||
208 | bp = (unsigned long) frame; | ||
209 | } else { | ||
210 | ops->address(data, addr, bp == 0); | ||
211 | } | ||
212 | } | ||
213 | stack++; | ||
214 | } | ||
215 | return bp; | ||
216 | } | ||
217 | |||
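print_context_stack() above treats the saved %rbp chain as a linked list: each frame begins with the caller's frame pointer followed by the return address, and an address found exactly at bp + 8 is reported as a "reliable" entry. A standalone sketch of the same walk over a fabricated chain (no real stack involved; the addresses are made up):

#include <stdio.h>

/* Same shape as the kernel's struct stack_frame. */
struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

int main(void)
{
	/* Fabricated three-deep call chain, innermost frame first. */
	struct stack_frame f2 = { NULL, 0x3000 };
	struct stack_frame f1 = { &f2,  0x2000 };
	struct stack_frame f0 = { &f1,  0x1000 };

	for (struct stack_frame *frame = &f0; frame; frame = frame->next_frame)
		printf("return address %#lx (reliable)\n", frame->return_address);
	return 0;
}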
218 | void dump_trace(struct task_struct *task, struct pt_regs *regs, | ||
219 | unsigned long *stack, unsigned long bp, | ||
220 | const struct stacktrace_ops *ops, void *data) | ||
221 | { | ||
222 | const unsigned cpu = get_cpu(); | ||
223 | unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; | ||
224 | unsigned used = 0; | ||
225 | struct thread_info *tinfo; | ||
226 | |||
227 | if (!task) | ||
228 | task = current; | ||
229 | |||
230 | if (!stack) { | ||
231 | unsigned long dummy; | ||
232 | stack = &dummy; | ||
233 | if (task && task != current) | ||
234 | stack = (unsigned long *)task->thread.sp; | ||
235 | } | ||
236 | |||
237 | #ifdef CONFIG_FRAME_POINTER | ||
238 | if (!bp) { | ||
239 | if (task == current) { | ||
240 | /* Grab bp right from our regs */ | ||
241 | asm("movq %%rbp, %0" : "=r" (bp) : ); | ||
242 | } else { | ||
243 | /* bp is the last reg pushed by switch_to */ | ||
244 | bp = *(unsigned long *) task->thread.sp; | ||
245 | } | ||
246 | } | ||
247 | #endif | ||
248 | |||
249 | /* | ||
250 | * Print function call entries in all stacks, starting at the | ||
251 | * current stack address. If the stacks consist of nested | ||
252 | * exceptions | ||
253 | */ | ||
254 | tinfo = task_thread_info(task); | ||
255 | for (;;) { | ||
256 | char *id; | ||
257 | unsigned long *estack_end; | ||
258 | estack_end = in_exception_stack(cpu, (unsigned long)stack, | ||
259 | &used, &id); | ||
260 | |||
261 | if (estack_end) { | ||
262 | if (ops->stack(data, id) < 0) | ||
263 | break; | ||
264 | |||
265 | bp = print_context_stack(tinfo, stack, bp, ops, | ||
266 | data, estack_end); | ||
267 | ops->stack(data, "<EOE>"); | ||
268 | /* | ||
269 | * We link to the next stack via the | ||
270 | * second-to-last pointer (index -2 to end) in the | ||
271 | * exception stack: | ||
272 | */ | ||
273 | stack = (unsigned long *) estack_end[-2]; | ||
274 | continue; | ||
275 | } | ||
276 | if (irqstack_end) { | ||
277 | unsigned long *irqstack; | ||
278 | irqstack = irqstack_end - | ||
279 | (IRQSTACKSIZE - 64) / sizeof(*irqstack); | ||
280 | |||
281 | if (stack >= irqstack && stack < irqstack_end) { | ||
282 | if (ops->stack(data, "IRQ") < 0) | ||
283 | break; | ||
284 | bp = print_context_stack(tinfo, stack, bp, | ||
285 | ops, data, irqstack_end); | ||
286 | /* | ||
287 | * We link to the next stack (which would be | ||
	288 | * the process stack normally) via the last | ||
289 | * pointer (index -1 to end) in the IRQ stack: | ||
290 | */ | ||
291 | stack = (unsigned long *) (irqstack_end[-1]); | ||
292 | irqstack_end = NULL; | ||
293 | ops->stack(data, "EOI"); | ||
294 | continue; | ||
295 | } | ||
296 | } | ||
297 | break; | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * This handles the process stack: | ||
302 | */ | ||
303 | bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); | ||
304 | put_cpu(); | ||
305 | } | ||
306 | EXPORT_SYMBOL(dump_trace); | ||
307 | |||
308 | static void | ||
309 | print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
310 | { | ||
311 | print_symbol(msg, symbol); | ||
312 | printk("\n"); | ||
313 | } | ||
314 | |||
315 | static void print_trace_warning(void *data, char *msg) | ||
316 | { | ||
317 | printk("%s\n", msg); | ||
318 | } | ||
319 | |||
320 | static int print_trace_stack(void *data, char *name) | ||
321 | { | ||
322 | printk(" <%s> ", name); | ||
323 | return 0; | ||
324 | } | ||
325 | |||
326 | static void print_trace_address(void *data, unsigned long addr, int reliable) | ||
327 | { | ||
328 | touch_nmi_watchdog(); | ||
329 | printk_address(addr, reliable); | ||
330 | } | ||
331 | |||
332 | static const struct stacktrace_ops print_trace_ops = { | ||
333 | .warning = print_trace_warning, | ||
334 | .warning_symbol = print_trace_warning_symbol, | ||
335 | .stack = print_trace_stack, | ||
336 | .address = print_trace_address, | ||
337 | }; | ||
338 | |||
339 | static void | ||
340 | show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
341 | unsigned long *stack, unsigned long bp, char *log_lvl) | ||
342 | { | ||
343 | printk("Call Trace:\n"); | ||
344 | dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); | ||
345 | } | ||
346 | |||
347 | void show_trace(struct task_struct *task, struct pt_regs *regs, | ||
348 | unsigned long *stack, unsigned long bp) | ||
349 | { | ||
350 | show_trace_log_lvl(task, regs, stack, bp, ""); | ||
351 | } | ||
352 | |||
353 | static void | ||
354 | show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, | ||
355 | unsigned long *sp, unsigned long bp, char *log_lvl) | ||
356 | { | ||
357 | unsigned long *stack; | ||
358 | int i; | ||
359 | const int cpu = smp_processor_id(); | ||
360 | unsigned long *irqstack_end = | ||
361 | (unsigned long *) (cpu_pda(cpu)->irqstackptr); | ||
362 | unsigned long *irqstack = | ||
363 | (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); | ||
364 | |||
365 | /* | ||
366 | * debugging aid: "show_stack(NULL, NULL);" prints the | ||
367 | * back trace for this cpu. | ||
368 | */ | ||
369 | |||
370 | if (sp == NULL) { | ||
371 | if (task) | ||
372 | sp = (unsigned long *)task->thread.sp; | ||
373 | else | ||
374 | sp = (unsigned long *)&sp; | ||
375 | } | ||
376 | |||
377 | stack = sp; | ||
378 | for (i = 0; i < kstack_depth_to_print; i++) { | ||
379 | if (stack >= irqstack && stack <= irqstack_end) { | ||
380 | if (stack == irqstack_end) { | ||
381 | stack = (unsigned long *) (irqstack_end[-1]); | ||
382 | printk(" <EOI> "); | ||
383 | } | ||
384 | } else { | ||
385 | if (((long) stack & (THREAD_SIZE-1)) == 0) | ||
386 | break; | ||
387 | } | ||
388 | if (i && ((i % 4) == 0)) | ||
389 | printk("\n"); | ||
390 | printk(" %016lx", *stack++); | ||
391 | touch_nmi_watchdog(); | ||
392 | } | ||
393 | printk("\n"); | ||
394 | show_trace_log_lvl(task, regs, sp, bp, log_lvl); | ||
395 | } | ||
396 | |||
397 | void show_stack(struct task_struct *task, unsigned long *sp) | ||
398 | { | ||
399 | show_stack_log_lvl(task, NULL, sp, 0, ""); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * The architecture-independent dump_stack generator | ||
404 | */ | ||
405 | void dump_stack(void) | ||
406 | { | ||
407 | unsigned long bp = 0; | ||
408 | unsigned long stack; | ||
409 | |||
410 | #ifdef CONFIG_FRAME_POINTER | ||
411 | if (!bp) | ||
412 | asm("movq %%rbp, %0" : "=r" (bp) : ); | ||
413 | #endif | ||
414 | |||
415 | printk("Pid: %d, comm: %.20s %s %s %.*s\n", | ||
416 | current->pid, current->comm, print_tainted(), | ||
417 | init_utsname()->release, | ||
418 | (int)strcspn(init_utsname()->version, " "), | ||
419 | init_utsname()->version); | ||
420 | show_trace(NULL, NULL, &stack, bp); | ||
421 | } | ||
422 | EXPORT_SYMBOL(dump_stack); | ||
423 | |||
424 | void show_registers(struct pt_regs *regs) | ||
425 | { | ||
426 | int i; | ||
427 | unsigned long sp; | ||
428 | const int cpu = smp_processor_id(); | ||
429 | struct task_struct *cur = cpu_pda(cpu)->pcurrent; | ||
430 | |||
431 | sp = regs->sp; | ||
432 | printk("CPU %d ", cpu); | ||
433 | __show_regs(regs); | ||
434 | printk("Process %s (pid: %d, threadinfo %p, task %p)\n", | ||
435 | cur->comm, cur->pid, task_thread_info(cur), cur); | ||
436 | |||
437 | /* | ||
438 | * When in-kernel, we also print out the stack and code at the | ||
439 | * time of the fault.. | ||
440 | */ | ||
441 | if (!user_mode(regs)) { | ||
442 | unsigned int code_prologue = code_bytes * 43 / 64; | ||
443 | unsigned int code_len = code_bytes; | ||
444 | unsigned char c; | ||
445 | u8 *ip; | ||
446 | |||
447 | printk("Stack: "); | ||
448 | show_stack_log_lvl(NULL, regs, (unsigned long *)sp, | ||
449 | regs->bp, ""); | ||
450 | |||
451 | printk(KERN_EMERG "Code: "); | ||
452 | |||
453 | ip = (u8 *)regs->ip - code_prologue; | ||
454 | if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { | ||
455 | /* try starting at RIP */ | ||
456 | ip = (u8 *)regs->ip; | ||
457 | code_len = code_len - code_prologue + 1; | ||
458 | } | ||
459 | for (i = 0; i < code_len; i++, ip++) { | ||
460 | if (ip < (u8 *)PAGE_OFFSET || | ||
461 | probe_kernel_address(ip, c)) { | ||
462 | printk(" Bad RIP value."); | ||
463 | break; | ||
464 | } | ||
465 | if (ip == (u8 *)regs->ip) | ||
466 | printk("<%02x> ", c); | ||
467 | else | ||
468 | printk("%02x ", c); | ||
469 | } | ||
470 | } | ||
471 | printk("\n"); | ||
472 | } | ||
473 | |||
474 | int is_valid_bugaddr(unsigned long ip) | ||
475 | { | ||
476 | unsigned short ud2; | ||
477 | |||
478 | if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) | ||
479 | return 0; | ||
480 | |||
481 | return ud2 == 0x0b0f; | ||
482 | } | ||
483 | |||
484 | static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
485 | static int die_owner = -1; | ||
486 | static unsigned int die_nest_count; | ||
487 | |||
488 | unsigned __kprobes long oops_begin(void) | ||
489 | { | ||
490 | int cpu; | ||
491 | unsigned long flags; | ||
492 | |||
493 | oops_enter(); | ||
494 | |||
495 | /* racy, but better than risking deadlock. */ | ||
496 | raw_local_irq_save(flags); | ||
497 | cpu = smp_processor_id(); | ||
498 | if (!__raw_spin_trylock(&die_lock)) { | ||
499 | if (cpu == die_owner) | ||
500 | /* nested oops. should stop eventually */; | ||
501 | else | ||
502 | __raw_spin_lock(&die_lock); | ||
503 | } | ||
504 | die_nest_count++; | ||
505 | die_owner = cpu; | ||
506 | console_verbose(); | ||
507 | bust_spinlocks(1); | ||
508 | return flags; | ||
509 | } | ||
510 | |||
511 | void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) | ||
512 | { | ||
513 | die_owner = -1; | ||
514 | bust_spinlocks(0); | ||
515 | die_nest_count--; | ||
516 | if (!die_nest_count) | ||
517 | /* Nest count reaches zero, release the lock. */ | ||
518 | __raw_spin_unlock(&die_lock); | ||
519 | raw_local_irq_restore(flags); | ||
520 | if (!regs) { | ||
521 | oops_exit(); | ||
522 | return; | ||
523 | } | ||
524 | if (panic_on_oops) | ||
525 | panic("Fatal exception"); | ||
526 | oops_exit(); | ||
527 | do_exit(signr); | ||
528 | } | ||
529 | |||
530 | int __kprobes __die(const char *str, struct pt_regs *regs, long err) | ||
531 | { | ||
532 | printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); | ||
533 | #ifdef CONFIG_PREEMPT | ||
534 | printk("PREEMPT "); | ||
535 | #endif | ||
536 | #ifdef CONFIG_SMP | ||
537 | printk("SMP "); | ||
538 | #endif | ||
539 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
540 | printk("DEBUG_PAGEALLOC"); | ||
541 | #endif | ||
542 | printk("\n"); | ||
543 | if (notify_die(DIE_OOPS, str, regs, err, | ||
544 | current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) | ||
545 | return 1; | ||
546 | |||
547 | show_registers(regs); | ||
548 | add_taint(TAINT_DIE); | ||
549 | /* Executive summary in case the oops scrolled away */ | ||
550 | printk(KERN_ALERT "RIP "); | ||
551 | printk_address(regs->ip, 1); | ||
552 | printk(" RSP <%016lx>\n", regs->sp); | ||
553 | if (kexec_should_crash(current)) | ||
554 | crash_kexec(regs); | ||
555 | return 0; | ||
556 | } | ||
557 | |||
558 | void die(const char *str, struct pt_regs *regs, long err) | ||
559 | { | ||
560 | unsigned long flags = oops_begin(); | ||
561 | |||
562 | if (!user_mode(regs)) | ||
563 | report_bug(regs->ip, regs); | ||
564 | |||
565 | if (__die(str, regs, err)) | ||
566 | regs = NULL; | ||
567 | oops_end(flags, regs, SIGSEGV); | ||
568 | } | ||
569 | |||
570 | notrace __kprobes void | ||
571 | die_nmi(char *str, struct pt_regs *regs, int do_panic) | ||
572 | { | ||
573 | unsigned long flags; | ||
574 | |||
575 | if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) | ||
576 | return; | ||
577 | |||
578 | flags = oops_begin(); | ||
579 | /* | ||
	580 | * We are in trouble anyway, let's at least try | ||
581 | * to get a message out. | ||
582 | */ | ||
583 | printk(KERN_EMERG "%s", str); | ||
584 | printk(" on CPU%d, ip %08lx, registers:\n", | ||
585 | smp_processor_id(), regs->ip); | ||
586 | show_registers(regs); | ||
587 | if (kexec_should_crash(current)) | ||
588 | crash_kexec(regs); | ||
589 | if (do_panic || panic_on_oops) | ||
590 | panic("Non maskable interrupt"); | ||
591 | oops_end(flags, NULL, SIGBUS); | ||
592 | nmi_exit(); | ||
593 | local_irq_enable(); | ||
594 | do_exit(SIGBUS); | ||
595 | } | ||
596 | |||
597 | static void __kprobes | ||
598 | do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, | ||
599 | long error_code, siginfo_t *info) | ||
600 | { | ||
601 | struct task_struct *tsk = current; | ||
602 | |||
603 | if (!user_mode(regs)) | ||
604 | goto kernel_trap; | ||
605 | |||
606 | /* | ||
607 | * We want error_code and trap_no set for userspace faults and | ||
608 | * kernelspace faults which result in die(), but not | ||
609 | * kernelspace faults which are fixed up. die() gives the | ||
610 | * process no chance to handle the signal and notice the | ||
611 | * kernel fault information, so that won't result in polluting | ||
612 | * the information about previously queued, but not yet | ||
613 | * delivered, faults. See also do_general_protection below. | ||
614 | */ | ||
615 | tsk->thread.error_code = error_code; | ||
616 | tsk->thread.trap_no = trapnr; | ||
617 | |||
618 | if (show_unhandled_signals && unhandled_signal(tsk, signr) && | ||
619 | printk_ratelimit()) { | ||
620 | printk(KERN_INFO | ||
621 | "%s[%d] trap %s ip:%lx sp:%lx error:%lx", | ||
622 | tsk->comm, tsk->pid, str, | ||
623 | regs->ip, regs->sp, error_code); | ||
624 | print_vma_addr(" in ", regs->ip); | ||
625 | printk("\n"); | ||
626 | } | ||
627 | |||
628 | if (info) | ||
629 | force_sig_info(signr, info, tsk); | ||
630 | else | ||
631 | force_sig(signr, tsk); | ||
632 | return; | ||
633 | |||
634 | kernel_trap: | ||
635 | if (!fixup_exception(regs)) { | ||
636 | tsk->thread.error_code = error_code; | ||
637 | tsk->thread.trap_no = trapnr; | ||
638 | die(str, regs, error_code); | ||
639 | } | ||
640 | return; | ||
641 | } | ||
642 | |||
643 | #define DO_ERROR(trapnr, signr, str, name) \ | ||
644 | asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ | ||
645 | { \ | ||
646 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
647 | == NOTIFY_STOP) \ | ||
648 | return; \ | ||
649 | conditional_sti(regs); \ | ||
650 | do_trap(trapnr, signr, str, regs, error_code, NULL); \ | ||
651 | } | ||
652 | |||
653 | #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ | ||
654 | asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ | ||
655 | { \ | ||
656 | siginfo_t info; \ | ||
657 | info.si_signo = signr; \ | ||
658 | info.si_errno = 0; \ | ||
659 | info.si_code = sicode; \ | ||
660 | info.si_addr = (void __user *)siaddr; \ | ||
661 | trace_hardirqs_fixup(); \ | ||
662 | if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ | ||
663 | == NOTIFY_STOP) \ | ||
664 | return; \ | ||
665 | conditional_sti(regs); \ | ||
666 | do_trap(trapnr, signr, str, regs, error_code, &info); \ | ||
667 | } | ||
668 | |||
669 | DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) | ||
670 | DO_ERROR(4, SIGSEGV, "overflow", overflow) | ||
671 | DO_ERROR(5, SIGSEGV, "bounds", bounds) | ||
672 | DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) | ||
673 | DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) | ||
674 | DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) | ||
675 | DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) | ||
676 | DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) | ||
677 | |||
678 | /* Runs on IST stack */ | ||
679 | asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) | ||
680 | { | ||
681 | if (notify_die(DIE_TRAP, "stack segment", regs, error_code, | ||
682 | 12, SIGBUS) == NOTIFY_STOP) | ||
683 | return; | ||
684 | preempt_conditional_sti(regs); | ||
685 | do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); | ||
686 | preempt_conditional_cli(regs); | ||
687 | } | ||
688 | |||
689 | asmlinkage void do_double_fault(struct pt_regs *regs, long error_code) | ||
690 | { | ||
691 | static const char str[] = "double fault"; | ||
692 | struct task_struct *tsk = current; | ||
693 | |||
	694 | /* Return not checked because double fault cannot be ignored */ | ||
695 | notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); | ||
696 | |||
697 | tsk->thread.error_code = error_code; | ||
698 | tsk->thread.trap_no = 8; | ||
699 | |||
700 | /* This is always a kernel trap and never fixable (and thus must | ||
701 | never return). */ | ||
702 | for (;;) | ||
703 | die(str, regs, error_code); | ||
704 | } | ||
705 | |||
706 | asmlinkage void __kprobes | ||
707 | do_general_protection(struct pt_regs *regs, long error_code) | ||
708 | { | ||
709 | struct task_struct *tsk; | ||
710 | |||
711 | conditional_sti(regs); | ||
712 | |||
713 | tsk = current; | ||
714 | if (!user_mode(regs)) | ||
715 | goto gp_in_kernel; | ||
716 | |||
717 | tsk->thread.error_code = error_code; | ||
718 | tsk->thread.trap_no = 13; | ||
719 | |||
720 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
721 | printk_ratelimit()) { | ||
722 | printk(KERN_INFO | ||
723 | "%s[%d] general protection ip:%lx sp:%lx error:%lx", | ||
724 | tsk->comm, tsk->pid, | ||
725 | regs->ip, regs->sp, error_code); | ||
726 | print_vma_addr(" in ", regs->ip); | ||
727 | printk("\n"); | ||
728 | } | ||
729 | |||
730 | force_sig(SIGSEGV, tsk); | ||
731 | return; | ||
732 | |||
733 | gp_in_kernel: | ||
734 | if (fixup_exception(regs)) | ||
735 | return; | ||
736 | |||
737 | tsk->thread.error_code = error_code; | ||
738 | tsk->thread.trap_no = 13; | ||
739 | if (notify_die(DIE_GPF, "general protection fault", regs, | ||
740 | error_code, 13, SIGSEGV) == NOTIFY_STOP) | ||
741 | return; | ||
742 | die("general protection fault", regs, error_code); | ||
743 | } | ||
744 | |||
745 | static notrace __kprobes void | ||
746 | mem_parity_error(unsigned char reason, struct pt_regs *regs) | ||
747 | { | ||
748 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
749 | reason); | ||
750 | printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); | ||
751 | |||
752 | #if defined(CONFIG_EDAC) | ||
753 | if (edac_handler_set()) { | ||
754 | edac_atomic_assert_error(); | ||
755 | return; | ||
756 | } | ||
757 | #endif | ||
758 | |||
759 | if (panic_on_unrecovered_nmi) | ||
760 | panic("NMI: Not continuing"); | ||
761 | |||
762 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
763 | |||
764 | /* Clear and disable the memory parity error line. */ | ||
765 | reason = (reason & 0xf) | 4; | ||
766 | outb(reason, 0x61); | ||
767 | } | ||
768 | |||
769 | static notrace __kprobes void | ||
770 | io_check_error(unsigned char reason, struct pt_regs *regs) | ||
771 | { | ||
772 | printk("NMI: IOCK error (debug interrupt?)\n"); | ||
773 | show_registers(regs); | ||
774 | |||
775 | /* Re-enable the IOCK line, wait for a few seconds */ | ||
776 | reason = (reason & 0xf) | 8; | ||
777 | outb(reason, 0x61); | ||
778 | mdelay(2000); | ||
779 | reason &= ~8; | ||
780 | outb(reason, 0x61); | ||
781 | } | ||
782 | |||
783 | static notrace __kprobes void | ||
784 | unknown_nmi_error(unsigned char reason, struct pt_regs *regs) | ||
785 | { | ||
786 | if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == | ||
787 | NOTIFY_STOP) | ||
788 | return; | ||
789 | printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", | ||
790 | reason); | ||
791 | printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); | ||
792 | |||
793 | if (panic_on_unrecovered_nmi) | ||
794 | panic("NMI: Not continuing"); | ||
795 | |||
796 | printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); | ||
797 | } | ||
798 | |||
799 | /* Runs on IST stack. This code must keep interrupts off all the time. | ||
800 | Nested NMIs are prevented by the CPU. */ | ||
801 | asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) | ||
802 | { | ||
803 | unsigned char reason = 0; | ||
804 | int cpu; | ||
805 | |||
806 | cpu = smp_processor_id(); | ||
807 | |||
808 | /* Only the BSP gets external NMIs from the system. */ | ||
809 | if (!cpu) | ||
810 | reason = get_nmi_reason(); | ||
811 | |||
812 | if (!(reason & 0xc0)) { | ||
813 | if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) | ||
814 | == NOTIFY_STOP) | ||
815 | return; | ||
816 | /* | ||
817 | * Ok, so this is none of the documented NMI sources, | ||
818 | * so it must be the NMI watchdog. | ||
819 | */ | ||
820 | if (nmi_watchdog_tick(regs, reason)) | ||
821 | return; | ||
822 | if (!do_nmi_callback(regs, cpu)) | ||
823 | unknown_nmi_error(reason, regs); | ||
824 | |||
825 | return; | ||
826 | } | ||
827 | if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) | ||
828 | return; | ||
829 | |||
830 | /* AK: following checks seem to be broken on modern chipsets. FIXME */ | ||
831 | if (reason & 0x80) | ||
832 | mem_parity_error(reason, regs); | ||
833 | if (reason & 0x40) | ||
834 | io_check_error(reason, regs); | ||
835 | } | ||
836 | |||
837 | asmlinkage notrace __kprobes void | ||
838 | do_nmi(struct pt_regs *regs, long error_code) | ||
839 | { | ||
840 | nmi_enter(); | ||
841 | |||
842 | add_pda(__nmi_count, 1); | ||
843 | |||
844 | if (!ignore_nmis) | ||
845 | default_do_nmi(regs); | ||
846 | |||
847 | nmi_exit(); | ||
848 | } | ||
849 | |||
850 | void stop_nmi(void) | ||
851 | { | ||
852 | acpi_nmi_disable(); | ||
853 | ignore_nmis++; | ||
854 | } | ||
855 | |||
856 | void restart_nmi(void) | ||
857 | { | ||
858 | ignore_nmis--; | ||
859 | acpi_nmi_enable(); | ||
860 | } | ||
861 | |||
862 | /* runs on IST stack. */ | ||
863 | asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) | ||
864 | { | ||
865 | trace_hardirqs_fixup(); | ||
866 | |||
867 | if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) | ||
868 | == NOTIFY_STOP) | ||
869 | return; | ||
870 | |||
871 | preempt_conditional_sti(regs); | ||
872 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | ||
873 | preempt_conditional_cli(regs); | ||
874 | } | ||
875 | |||
876 | /* Help handler running on IST stack to switch back to user stack | ||
877 | for scheduling or signal handling. The actual stack switch is done in | ||
878 | entry.S */ | ||
879 | asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) | ||
880 | { | ||
881 | struct pt_regs *regs = eregs; | ||
882 | 	/* Already synced, nothing to do */ | ||
883 | if (eregs == (struct pt_regs *)eregs->sp) | ||
884 | ; | ||
885 | /* Exception from user space */ | ||
886 | else if (user_mode(eregs)) | ||
887 | regs = task_pt_regs(current); | ||
888 | /* Exception from kernel and interrupts are enabled. Move to | ||
889 | kernel process stack. */ | ||
890 | else if (eregs->flags & X86_EFLAGS_IF) | ||
891 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | ||
892 | if (eregs != regs) | ||
893 | *regs = *eregs; | ||
894 | return regs; | ||
895 | } | ||
896 | |||
897 | /* runs on IST stack. */ | ||
898 | asmlinkage void __kprobes do_debug(struct pt_regs *regs, | ||
899 | unsigned long error_code) | ||
900 | { | ||
901 | struct task_struct *tsk = current; | ||
902 | unsigned long condition; | ||
903 | siginfo_t info; | ||
904 | |||
905 | trace_hardirqs_fixup(); | ||
906 | |||
907 | get_debugreg(condition, 6); | ||
908 | |||
909 | /* | ||
910 | * The processor cleared BTF, so don't mark that we need it set. | ||
911 | */ | ||
912 | clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); | ||
913 | tsk->thread.debugctlmsr = 0; | ||
914 | |||
915 | if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, | ||
916 | SIGTRAP) == NOTIFY_STOP) | ||
917 | return; | ||
918 | |||
919 | preempt_conditional_sti(regs); | ||
920 | |||
921 | /* Mask out spurious debug traps due to lazy DR7 setting */ | ||
922 | if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { | ||
923 | if (!tsk->thread.debugreg7) | ||
924 | goto clear_dr7; | ||
925 | } | ||
926 | |||
927 | tsk->thread.debugreg6 = condition; | ||
928 | |||
929 | /* | ||
930 | * Single-stepping through TF: make sure we ignore any events in | ||
931 | * kernel space (but re-enable TF when returning to user mode). | ||
932 | */ | ||
933 | if (condition & DR_STEP) { | ||
934 | if (!user_mode(regs)) | ||
935 | goto clear_TF_reenable; | ||
936 | } | ||
937 | |||
938 | /* Ok, finally something we can handle */ | ||
939 | tsk->thread.trap_no = 1; | ||
940 | tsk->thread.error_code = error_code; | ||
941 | info.si_signo = SIGTRAP; | ||
942 | info.si_errno = 0; | ||
943 | info.si_code = get_si_code(condition); | ||
944 | info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; | ||
945 | force_sig_info(SIGTRAP, &info, tsk); | ||
946 | |||
947 | clear_dr7: | ||
948 | set_debugreg(0, 7); | ||
949 | preempt_conditional_cli(regs); | ||
950 | return; | ||
951 | |||
952 | clear_TF_reenable: | ||
953 | set_tsk_thread_flag(tsk, TIF_SINGLESTEP); | ||
954 | regs->flags &= ~X86_EFLAGS_TF; | ||
955 | preempt_conditional_cli(regs); | ||
956 | return; | ||
957 | } | ||
958 | |||
959 | static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) | ||
960 | { | ||
961 | if (fixup_exception(regs)) | ||
962 | return 1; | ||
963 | |||
964 | notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); | ||
965 | /* Illegal floating point operation in the kernel */ | ||
966 | current->thread.trap_no = trapnr; | ||
967 | die(str, regs, 0); | ||
968 | return 0; | ||
969 | } | ||
970 | |||
971 | /* | ||
972 | * Note that we play around with the 'TS' bit in an attempt to get | ||
973 | * the correct behaviour even in the presence of the asynchronous | ||
974 | * IRQ13 behaviour | ||
975 | */ | ||
976 | asmlinkage void do_coprocessor_error(struct pt_regs *regs) | ||
977 | { | ||
978 | void __user *ip = (void __user *)(regs->ip); | ||
979 | struct task_struct *task; | ||
980 | siginfo_t info; | ||
981 | unsigned short cwd, swd; | ||
982 | |||
983 | conditional_sti(regs); | ||
984 | if (!user_mode(regs) && | ||
985 | kernel_math_error(regs, "kernel x87 math error", 16)) | ||
986 | return; | ||
987 | |||
988 | /* | ||
989 | * Save the info for the exception handler and clear the error. | ||
990 | */ | ||
991 | task = current; | ||
992 | save_init_fpu(task); | ||
993 | task->thread.trap_no = 16; | ||
994 | task->thread.error_code = 0; | ||
995 | info.si_signo = SIGFPE; | ||
996 | info.si_errno = 0; | ||
997 | info.si_code = __SI_FAULT; | ||
998 | info.si_addr = ip; | ||
999 | 	/* | ||
1000 | 	 * (~cwd & swd) keeps only the exceptions that are unmasked in the | ||
1001 | 	 * control word. 0x3f covers the exception bits in these registers, | ||
1002 | 	 * 0x200 is the C1 bit you need in case of a stack fault, and 0x040 | ||
1003 | 	 * is the stack fault bit itself. We should only be taking one | ||
1004 | 	 * exception at a time, so if this combination doesn't produce any | ||
1005 | 	 * single exception, then the program isn't synchronizing its FPU | ||
1006 | 	 * usage and it will suffer the consequences, since we won't be | ||
1007 | 	 * able to fully reproduce the context of the exception. | ||
1008 | 	 */ | ||
1009 | cwd = get_fpu_cwd(task); | ||
1010 | swd = get_fpu_swd(task); | ||
1011 | switch (swd & ~cwd & 0x3f) { | ||
1012 | case 0x000: /* No unmasked exception */ | ||
1013 | default: /* Multiple exceptions */ | ||
1014 | break; | ||
1015 | case 0x001: /* Invalid Op */ | ||
1016 | /* | ||
1017 | * swd & 0x240 == 0x040: Stack Underflow | ||
1018 | * swd & 0x240 == 0x240: Stack Overflow | ||
1019 | * User must clear the SF bit (0x40) if set | ||
1020 | */ | ||
1021 | info.si_code = FPE_FLTINV; | ||
1022 | break; | ||
1023 | case 0x002: /* Denormalize */ | ||
1024 | case 0x010: /* Underflow */ | ||
1025 | info.si_code = FPE_FLTUND; | ||
1026 | break; | ||
1027 | case 0x004: /* Zero Divide */ | ||
1028 | info.si_code = FPE_FLTDIV; | ||
1029 | break; | ||
1030 | case 0x008: /* Overflow */ | ||
1031 | info.si_code = FPE_FLTOVF; | ||
1032 | break; | ||
1033 | case 0x020: /* Precision */ | ||
1034 | info.si_code = FPE_FLTRES; | ||
1035 | break; | ||
1036 | } | ||
1037 | force_sig_info(SIGFPE, &info, task); | ||
1038 | } | ||
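
The switch above maps the single unmasked x87 exception onto a POSIX si_code. As a standalone illustration only (not part of this patch), the same decoding can be exercised from user space; the sample cwd/swd values below are made up:

	#include <stdio.h>

	/* Sketch: decode the unmasked x87 exception the way do_coprocessor_error()
	 * does, from a control word (cwd) and status word (swd). */
	static const char *x87_exception(unsigned short cwd, unsigned short swd)
	{
		switch (swd & ~cwd & 0x3f) {
		case 0x001: return "FPE_FLTINV";	/* invalid op; swd & 0x240 tells stack under/overflow */
		case 0x002:				/* denormal */
		case 0x010: return "FPE_FLTUND";	/* underflow */
		case 0x004: return "FPE_FLTDIV";	/* zero divide */
		case 0x008: return "FPE_FLTOVF";	/* overflow */
		case 0x020: return "FPE_FLTRES";	/* precision */
		default:    return "none or multiple pending";
		}
	}

	int main(void)
	{
		/* cwd 0x033b leaves zero-divide unmasked; swd 0x0004 has it flagged */
		printf("%s\n", x87_exception(0x033b, 0x0004));
		return 0;
	}

With those values the program prints FPE_FLTDIV, mirroring the signal code the handler would deliver.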
1039 | |||
1040 | asmlinkage void bad_intr(void) | ||
1041 | { | ||
1042 | printk("bad interrupt"); | ||
1043 | } | ||
1044 | |||
1045 | asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) | ||
1046 | { | ||
1047 | void __user *ip = (void __user *)(regs->ip); | ||
1048 | struct task_struct *task; | ||
1049 | siginfo_t info; | ||
1050 | unsigned short mxcsr; | ||
1051 | |||
1052 | conditional_sti(regs); | ||
1053 | if (!user_mode(regs) && | ||
1054 | kernel_math_error(regs, "kernel simd math error", 19)) | ||
1055 | return; | ||
1056 | |||
1057 | /* | ||
1058 | * Save the info for the exception handler and clear the error. | ||
1059 | */ | ||
1060 | task = current; | ||
1061 | save_init_fpu(task); | ||
1062 | task->thread.trap_no = 19; | ||
1063 | task->thread.error_code = 0; | ||
1064 | info.si_signo = SIGFPE; | ||
1065 | info.si_errno = 0; | ||
1066 | info.si_code = __SI_FAULT; | ||
1067 | info.si_addr = ip; | ||
1068 | /* | ||
1069 | * The SIMD FPU exceptions are handled a little differently, as there | ||
1070 | * is only a single status/control register. Thus, to determine which | ||
1071 | * unmasked exception was caught we must mask the exception mask bits | ||
1072 | * at 0x1f80, and then use these to mask the exception bits at 0x3f. | ||
1073 | */ | ||
1074 | mxcsr = get_fpu_mxcsr(task); | ||
1075 | switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { | ||
1076 | case 0x000: | ||
1077 | default: | ||
1078 | break; | ||
1079 | case 0x001: /* Invalid Op */ | ||
1080 | info.si_code = FPE_FLTINV; | ||
1081 | break; | ||
1082 | case 0x002: /* Denormalize */ | ||
1083 | case 0x010: /* Underflow */ | ||
1084 | info.si_code = FPE_FLTUND; | ||
1085 | break; | ||
1086 | case 0x004: /* Zero Divide */ | ||
1087 | info.si_code = FPE_FLTDIV; | ||
1088 | break; | ||
1089 | case 0x008: /* Overflow */ | ||
1090 | info.si_code = FPE_FLTOVF; | ||
1091 | break; | ||
1092 | case 0x020: /* Precision */ | ||
1093 | info.si_code = FPE_FLTRES; | ||
1094 | break; | ||
1095 | } | ||
1096 | force_sig_info(SIGFPE, &info, task); | ||
1097 | } | ||
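
The SIMD path uses the same si_code mapping but pulls both the sticky flags (bits 0-5) and the mask bits (bits 7-12) out of a single MXCSR register. A user-space sketch, again not part of the patch, that reads the live MXCSR and applies the same expression:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mxcsr;

		/* stmxcsr stores the current MXCSR; the kernel instead reads the
		 * guest task's copy via get_fpu_mxcsr(). */
		__asm__ volatile("stmxcsr %0" : "=m" (mxcsr));

		printf("mxcsr=%#x, unmasked flags=%#x\n", mxcsr,
		       ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f));
		return 0;
	}

With the default MXCSR of 0x1f80 every exception is masked, so the second value is 0.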
1098 | |||
1099 | asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs) | ||
1100 | { | ||
1101 | } | ||
1102 | |||
1103 | asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) | ||
1104 | { | ||
1105 | } | ||
1106 | |||
1107 | asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) | ||
1108 | { | ||
1109 | } | ||
1110 | |||
1111 | /* | ||
1112 | * 'math_state_restore()' saves the current math information in the | ||
1113 | * old math state array, and gets the new ones from the current task | ||
1114 | * | ||
1115 | * Careful.. There are problems with IBM-designed IRQ13 behaviour. | ||
1116 | * Don't touch unless you *really* know how it works. | ||
1117 | */ | ||
1118 | asmlinkage void math_state_restore(void) | ||
1119 | { | ||
1120 | struct task_struct *me = current; | ||
1121 | |||
1122 | if (!used_math()) { | ||
1123 | local_irq_enable(); | ||
1124 | /* | ||
1125 | * does a slab alloc which can sleep | ||
1126 | */ | ||
1127 | if (init_fpu(me)) { | ||
1128 | /* | ||
1129 | * ran out of memory! | ||
1130 | */ | ||
1131 | do_group_exit(SIGKILL); | ||
1132 | return; | ||
1133 | } | ||
1134 | local_irq_disable(); | ||
1135 | } | ||
1136 | |||
1137 | clts(); /* Allow maths ops (or we recurse) */ | ||
1138 | /* | ||
1139 | * Paranoid restore. send a SIGSEGV if we fail to restore the state. | ||
1140 | */ | ||
1141 | if (unlikely(restore_fpu_checking(me))) { | ||
1142 | stts(); | ||
1143 | force_sig(SIGSEGV, me); | ||
1144 | return; | ||
1145 | } | ||
1146 | task_thread_info(me)->status |= TS_USEDFPU; | ||
1147 | me->fpu_counter++; | ||
1148 | } | ||
1149 | EXPORT_SYMBOL_GPL(math_state_restore); | ||
1150 | |||
1151 | void __init trap_init(void) | ||
1152 | { | ||
1153 | 	set_intr_gate(0, &divide_error); | ||
1154 | set_intr_gate_ist(1, &debug, DEBUG_STACK); | ||
1155 | set_intr_gate_ist(2, &nmi, NMI_STACK); | ||
1156 | /* int3 can be called from all */ | ||
1157 | set_system_gate_ist(3, &int3, DEBUG_STACK); | ||
1158 | /* int4 can be called from all */ | ||
1159 | set_system_gate(4, &overflow); | ||
1160 | set_intr_gate(5, &bounds); | ||
1161 | set_intr_gate(6, &invalid_op); | ||
1162 | set_intr_gate(7, &device_not_available); | ||
1163 | set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); | ||
1164 | set_intr_gate(9, &coprocessor_segment_overrun); | ||
1165 | set_intr_gate(10, &invalid_TSS); | ||
1166 | set_intr_gate(11, &segment_not_present); | ||
1167 | set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); | ||
1168 | set_intr_gate(13, &general_protection); | ||
1169 | set_intr_gate(14, &page_fault); | ||
1170 | set_intr_gate(15, &spurious_interrupt_bug); | ||
1171 | set_intr_gate(16, &coprocessor_error); | ||
1172 | set_intr_gate(17, &alignment_check); | ||
1173 | #ifdef CONFIG_X86_MCE | ||
1174 | set_intr_gate_ist(18, &machine_check, MCE_STACK); | ||
1175 | #endif | ||
1176 | set_intr_gate(19, &simd_coprocessor_error); | ||
1177 | |||
1178 | #ifdef CONFIG_IA32_EMULATION | ||
1179 | set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); | ||
1180 | #endif | ||
1181 | /* | ||
1182 | * Should be a barrier for any external CPU state: | ||
1183 | */ | ||
1184 | cpu_init(); | ||
1185 | } | ||
1186 | |||
1187 | static int __init oops_setup(char *s) | ||
1188 | { | ||
1189 | if (!s) | ||
1190 | return -EINVAL; | ||
1191 | if (!strcmp(s, "panic")) | ||
1192 | panic_on_oops = 1; | ||
1193 | return 0; | ||
1194 | } | ||
1195 | early_param("oops", oops_setup); | ||
1196 | |||
1197 | static int __init kstack_setup(char *s) | ||
1198 | { | ||
1199 | if (!s) | ||
1200 | return -EINVAL; | ||
1201 | kstack_depth_to_print = simple_strtoul(s, NULL, 0); | ||
1202 | return 0; | ||
1203 | } | ||
1204 | early_param("kstack", kstack_setup); | ||
1205 | |||
1206 | static int __init code_bytes_setup(char *s) | ||
1207 | { | ||
1208 | code_bytes = simple_strtoul(s, NULL, 0); | ||
1209 | if (code_bytes > 8192) | ||
1210 | code_bytes = 8192; | ||
1211 | |||
1212 | return 1; | ||
1213 | } | ||
1214 | __setup("code_bytes=", code_bytes_setup); | ||
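
These three hooks consume kernel command-line options rather than sysctls. For example (the numeric values are illustrative), booting with

	oops=panic kstack=64 code_bytes=64

turns every oops into a panic, limits the raw stack dump to 64 words, and shows up to 64 opcode bytes around the faulting instruction in the "Code:" line (code_bytes_setup() above clamps the value at 8192).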
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c new file mode 100644 index 000000000000..aeef529917e4 --- /dev/null +++ b/arch/x86/kernel/uv_irq.c | |||
@@ -0,0 +1,79 @@ | |||
1 | /* | ||
2 | * This file is subject to the terms and conditions of the GNU General Public | ||
3 | * License. See the file "COPYING" in the main directory of this archive | ||
4 | * for more details. | ||
5 | * | ||
6 | * SGI UV IRQ functions | ||
7 | * | ||
8 | * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/irq.h> | ||
13 | |||
14 | #include <asm/apic.h> | ||
15 | #include <asm/uv/uv_irq.h> | ||
16 | |||
17 | static void uv_noop(unsigned int irq) | ||
18 | { | ||
19 | } | ||
20 | |||
21 | static unsigned int uv_noop_ret(unsigned int irq) | ||
22 | { | ||
23 | return 0; | ||
24 | } | ||
25 | |||
26 | static void uv_ack_apic(unsigned int irq) | ||
27 | { | ||
28 | ack_APIC_irq(); | ||
29 | } | ||
30 | |||
31 | struct irq_chip uv_irq_chip = { | ||
32 | .name = "UV-CORE", | ||
33 | .startup = uv_noop_ret, | ||
34 | .shutdown = uv_noop, | ||
35 | .enable = uv_noop, | ||
36 | .disable = uv_noop, | ||
37 | .ack = uv_noop, | ||
38 | .mask = uv_noop, | ||
39 | .unmask = uv_noop, | ||
40 | .eoi = uv_ack_apic, | ||
41 | .end = uv_noop, | ||
42 | }; | ||
43 | |||
44 | /* | ||
45 | * Set up a mapping of an available irq and vector, and enable the specified | ||
46 | * MMR that defines the MSI that is to be sent to the specified CPU when an | ||
47 | * interrupt is raised. | ||
48 | */ | ||
49 | int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, | ||
50 | unsigned long mmr_offset) | ||
51 | { | ||
52 | int irq; | ||
53 | int ret; | ||
54 | |||
55 | irq = create_irq(); | ||
56 | if (irq <= 0) | ||
57 | return -EBUSY; | ||
58 | |||
59 | ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); | ||
60 | if (ret != irq) | ||
61 | destroy_irq(irq); | ||
62 | |||
63 | return ret; | ||
64 | } | ||
65 | EXPORT_SYMBOL_GPL(uv_setup_irq); | ||
66 | |||
67 | /* | ||
68 | * Tear down a mapping of an irq and vector, and disable the specified MMR that | ||
69 | * defined the MSI that was to be sent to the specified CPU when an interrupt | ||
70 | * was raised. | ||
71 | * | ||
72 | * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). | ||
73 | */ | ||
74 | void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) | ||
75 | { | ||
76 | arch_disable_uv_irq(mmr_blade, mmr_offset); | ||
77 | destroy_irq(irq); | ||
78 | } | ||
79 | EXPORT_SYMBOL_GPL(uv_teardown_irq); | ||
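
Nothing in this patch calls the two entry points yet, so the pairing below is purely a hypothetical sketch; cpu, blade and mmr_offset are placeholders a real driver would fill in:

	int irq;

	irq = uv_setup_irq("uv-sample", cpu, blade, mmr_offset);
	if (irq < 0)
		return irq;
	/* ... hook the vector up with request_irq() and use the device ... */
	uv_teardown_irq(irq, blade, mmr_offset);

As the comment above notes, the blade and MMR offset passed to uv_teardown_irq() must match what was handed to uv_setup_irq().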
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c new file mode 100644 index 000000000000..67f9b9dbf800 --- /dev/null +++ b/arch/x86/kernel/uv_sysfs.c | |||
@@ -0,0 +1,72 @@ | |||
1 | /* | ||
2 | * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | * | ||
18 | * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. | ||
19 | * Copyright (c) Russ Anderson | ||
20 | */ | ||
21 | |||
22 | #include <linux/sysdev.h> | ||
23 | #include <asm/uv/bios.h> | ||
24 | |||
25 | struct kobject *sgi_uv_kobj; | ||
26 | |||
27 | static ssize_t partition_id_show(struct kobject *kobj, | ||
28 | struct kobj_attribute *attr, char *buf) | ||
29 | { | ||
30 | return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); | ||
31 | } | ||
32 | |||
33 | static ssize_t coherence_id_show(struct kobject *kobj, | ||
34 | struct kobj_attribute *attr, char *buf) | ||
35 | { | ||
36 | return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); | ||
37 | } | ||
38 | |||
39 | static struct kobj_attribute partition_id_attr = | ||
40 | __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); | ||
41 | |||
42 | static struct kobj_attribute coherence_id_attr = | ||
43 | __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); | ||
44 | |||
45 | |||
46 | static int __init sgi_uv_sysfs_init(void) | ||
47 | { | ||
48 | 	int ret; | ||
49 | |||
50 | if (!sgi_uv_kobj) | ||
51 | sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); | ||
52 | if (!sgi_uv_kobj) { | ||
53 | 		printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); | ||
54 | return -EINVAL; | ||
55 | } | ||
56 | |||
57 | ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); | ||
58 | if (ret) { | ||
59 | 		printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); | ||
60 | return ret; | ||
61 | } | ||
62 | |||
63 | ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); | ||
64 | if (ret) { | ||
65 | 		printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); | ||
66 | return ret; | ||
67 | } | ||
68 | |||
69 | return 0; | ||
70 | } | ||
71 | |||
72 | device_initcall(sgi_uv_sysfs_init); | ||
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 61a97e616f70..0c9667f0752a 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c | |||
@@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq) | |||
484 | static unsigned int startup_cobalt_irq(unsigned int irq) | 484 | static unsigned int startup_cobalt_irq(unsigned int irq) |
485 | { | 485 | { |
486 | unsigned long flags; | 486 | unsigned long flags; |
487 | struct irq_desc *desc = irq_to_desc(irq); | ||
487 | 488 | ||
488 | spin_lock_irqsave(&cobalt_lock, flags); | 489 | spin_lock_irqsave(&cobalt_lock, flags); |
489 | if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) | 490 | if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) |
490 | irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); | 491 | desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); |
491 | enable_cobalt_irq(irq); | 492 | enable_cobalt_irq(irq); |
492 | spin_unlock_irqrestore(&cobalt_lock, flags); | 493 | spin_unlock_irqrestore(&cobalt_lock, flags); |
493 | return 0; | 494 | return 0; |
@@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq) | |||
506 | static void end_cobalt_irq(unsigned int irq) | 507 | static void end_cobalt_irq(unsigned int irq) |
507 | { | 508 | { |
508 | unsigned long flags; | 509 | unsigned long flags; |
510 | struct irq_desc *desc = irq_to_desc(irq); | ||
509 | 511 | ||
510 | spin_lock_irqsave(&cobalt_lock, flags); | 512 | spin_lock_irqsave(&cobalt_lock, flags); |
511 | if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) | 513 | if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) |
512 | enable_cobalt_irq(irq); | 514 | enable_cobalt_irq(irq); |
513 | spin_unlock_irqrestore(&cobalt_lock, flags); | 515 | spin_unlock_irqrestore(&cobalt_lock, flags); |
514 | } | 516 | } |
@@ -626,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) | |||
626 | 628 | ||
627 | spin_unlock_irqrestore(&i8259A_lock, flags); | 629 | spin_unlock_irqrestore(&i8259A_lock, flags); |
628 | 630 | ||
629 | desc = irq_desc + realirq; | 631 | desc = irq_to_desc(realirq); |
630 | 632 | ||
631 | /* | 633 | /* |
632 | * handle this 'virtual interrupt' as a Cobalt one now. | 634 | * handle this 'virtual interrupt' as a Cobalt one now. |
633 | */ | 635 | */ |
634 | kstat_cpu(smp_processor_id()).irqs[realirq]++; | 636 | kstat_incr_irqs_this_cpu(realirq, desc); |
635 | 637 | ||
636 | if (likely(desc->action != NULL)) | 638 | if (likely(desc->action != NULL)) |
637 | handle_IRQ_event(realirq, desc->action); | 639 | handle_IRQ_event(realirq, desc->action); |
@@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void) | |||
662 | int i; | 664 | int i; |
663 | 665 | ||
664 | for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { | 666 | for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { |
665 | irq_desc[i].status = IRQ_DISABLED; | 667 | struct irq_desc *desc = irq_to_desc(i); |
666 | irq_desc[i].action = 0; | 668 | |
667 | irq_desc[i].depth = 1; | 669 | desc->status = IRQ_DISABLED; |
670 | desc->action = 0; | ||
671 | desc->depth = 1; | ||
668 | 672 | ||
669 | if (i == 0) { | 673 | if (i == 0) { |
670 | irq_desc[i].chip = &cobalt_irq_type; | 674 | desc->chip = &cobalt_irq_type; |
671 | } | 675 | } |
672 | else if (i == CO_IRQ_IDE0) { | 676 | else if (i == CO_IRQ_IDE0) { |
673 | irq_desc[i].chip = &cobalt_irq_type; | 677 | desc->chip = &cobalt_irq_type; |
674 | } | 678 | } |
675 | else if (i == CO_IRQ_IDE1) { | 679 | else if (i == CO_IRQ_IDE1) { |
676 | irq_desc[i].chip = &cobalt_irq_type; | 680 | desc->chip = &cobalt_irq_type; |
677 | } | 681 | } |
678 | else if (i == CO_IRQ_8259) { | 682 | else if (i == CO_IRQ_8259) { |
679 | irq_desc[i].chip = &piix4_master_irq_type; | 683 | desc->chip = &piix4_master_irq_type; |
680 | } | 684 | } |
681 | else if (i < CO_IRQ_APIC0) { | 685 | else if (i < CO_IRQ_APIC0) { |
682 | irq_desc[i].chip = &piix4_virtual_irq_type; | 686 | desc->chip = &piix4_virtual_irq_type; |
683 | } | 687 | } |
684 | else if (IS_CO_APIC(i)) { | 688 | else if (IS_CO_APIC(i)) { |
685 | irq_desc[i].chip = &cobalt_irq_type; | 689 | desc->chip = &cobalt_irq_type; |
686 | } | 690 | } |
687 | } | 691 | } |
688 | 692 | ||
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 6953859fe289..254ee07f8635 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c | |||
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void) | |||
235 | 235 | ||
236 | void __init vmi_time_init(void) | 236 | void __init vmi_time_init(void) |
237 | { | 237 | { |
238 | unsigned int cpu; | ||
238 | 	/* Disable PIT: BIOSes start PIT CH0 with 18.2 Hz periodic interrupts. */ | 239 | 	/* Disable PIT: BIOSes start PIT CH0 with 18.2 Hz periodic interrupts. */ |
239 | outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ | 240 | outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ |
240 | 241 | ||
241 | vmi_time_init_clockevent(); | 242 | vmi_time_init_clockevent(); |
242 | setup_irq(0, &vmi_clock_action); | 243 | setup_irq(0, &vmi_clock_action); |
244 | for_each_possible_cpu(cpu) | ||
245 | per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; | ||
243 | } | 246 | } |
244 | 247 | ||
245 | #ifdef CONFIG_X86_LOCAL_APIC | 248 | #ifdef CONFIG_X86_LOCAL_APIC |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d0e940bb6f40..c02343594b4d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -3,10 +3,13 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ | 5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ |
6 | coalesced_mmio.o) | 6 | coalesced_mmio.o irq_comm.o) |
7 | ifeq ($(CONFIG_KVM_TRACE),y) | 7 | ifeq ($(CONFIG_KVM_TRACE),y) |
8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) | 8 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) |
9 | endif | 9 | endif |
10 | ifeq ($(CONFIG_DMAR),y) | ||
11 | common-objs += $(addprefix ../../../virt/kvm/, vtd.o) | ||
12 | endif | ||
10 | 13 | ||
11 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 14 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
12 | 15 | ||
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872a9124..634132a9a512 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c | |||
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) | |||
200 | 200 | ||
201 | if (!atomic_inc_and_test(&pt->pending)) | 201 | if (!atomic_inc_and_test(&pt->pending)) |
202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); | 202 | set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); |
203 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) { | 203 | |
204 | vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; | 204 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) |
205 | wake_up_interruptible(&vcpu0->wq); | 205 | wake_up_interruptible(&vcpu0->wq); |
206 | } | ||
207 | 206 | ||
208 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); | 207 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); |
209 | pt->scheduled = ktime_to_ns(pt->timer.expires); | 208 | pt->scheduled = ktime_to_ns(pt->timer.expires); |
209 | if (pt->period) | ||
210 | ps->channels[0].count_load_time = pt->timer.expires; | ||
210 | 211 | ||
211 | return (pt->period == 0 ? 0 : 1); | 212 | return (pt->period == 0 ? 0 : 1); |
212 | } | 213 | } |
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) | |||
215 | { | 216 | { |
216 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | 217 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; |
217 | 218 | ||
218 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) | 219 | if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) |
219 | return atomic_read(&pit->pit_state.pit_timer.pending); | 220 | return atomic_read(&pit->pit_state.pit_timer.pending); |
220 | |||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
224 | static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) | ||
225 | { | ||
226 | struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, | ||
227 | irq_ack_notifier); | ||
228 | spin_lock(&ps->inject_lock); | ||
229 | if (atomic_dec_return(&ps->pit_timer.pending) < 0) | ||
230 | atomic_inc(&ps->pit_timer.pending); | ||
231 | ps->irq_ack = 1; | ||
232 | spin_unlock(&ps->inject_lock); | ||
233 | } | ||
234 | |||
224 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | 235 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) |
225 | { | 236 | { |
226 | struct kvm_kpit_state *ps; | 237 | struct kvm_kpit_state *ps; |
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) | |||
255 | hrtimer_cancel(&pt->timer); | 266 | hrtimer_cancel(&pt->timer); |
256 | } | 267 | } |
257 | 268 | ||
258 | static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | 269 | static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) |
259 | { | 270 | { |
271 | struct kvm_kpit_timer *pt = &ps->pit_timer; | ||
260 | s64 interval; | 272 | s64 interval; |
261 | 273 | ||
262 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); | 274 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); |
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | |||
268 | pt->period = (is_period == 0) ? 0 : interval; | 280 | pt->period = (is_period == 0) ? 0 : interval; |
269 | pt->timer.function = pit_timer_fn; | 281 | pt->timer.function = pit_timer_fn; |
270 | atomic_set(&pt->pending, 0); | 282 | atomic_set(&pt->pending, 0); |
283 | ps->irq_ack = 1; | ||
271 | 284 | ||
272 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), | 285 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), |
273 | HRTIMER_MODE_ABS); | 286 | HRTIMER_MODE_ABS); |
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) | |||
302 | case 1: | 315 | case 1: |
303 | /* FIXME: enhance mode 4 precision */ | 316 | /* FIXME: enhance mode 4 precision */ |
304 | case 4: | 317 | case 4: |
305 | create_pit_timer(&ps->pit_timer, val, 0); | 318 | create_pit_timer(ps, val, 0); |
306 | break; | 319 | break; |
307 | case 2: | 320 | case 2: |
308 | case 3: | 321 | case 3: |
309 | create_pit_timer(&ps->pit_timer, val, 1); | 322 | create_pit_timer(ps, val, 1); |
310 | break; | 323 | break; |
311 | default: | 324 | default: |
312 | destroy_pit_timer(&ps->pit_timer); | 325 | destroy_pit_timer(&ps->pit_timer); |
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit) | |||
520 | mutex_unlock(&pit->pit_state.lock); | 533 | mutex_unlock(&pit->pit_state.lock); |
521 | 534 | ||
522 | atomic_set(&pit->pit_state.pit_timer.pending, 0); | 535 | atomic_set(&pit->pit_state.pit_timer.pending, 0); |
523 | pit->pit_state.inject_pending = 1; | 536 | pit->pit_state.irq_ack = 1; |
524 | } | 537 | } |
525 | 538 | ||
526 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) | 539 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) |
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
534 | 547 | ||
535 | mutex_init(&pit->pit_state.lock); | 548 | mutex_init(&pit->pit_state.lock); |
536 | mutex_lock(&pit->pit_state.lock); | 549 | mutex_lock(&pit->pit_state.lock); |
550 | spin_lock_init(&pit->pit_state.inject_lock); | ||
537 | 551 | ||
538 | /* Initialize PIO device */ | 552 | /* Initialize PIO device */ |
539 | pit->dev.read = pit_ioport_read; | 553 | pit->dev.read = pit_ioport_read; |
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) | |||
555 | pit_state->pit = pit; | 569 | pit_state->pit = pit; |
556 | hrtimer_init(&pit_state->pit_timer.timer, | 570 | hrtimer_init(&pit_state->pit_timer.timer, |
557 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | 571 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
572 | pit_state->irq_ack_notifier.gsi = 0; | ||
573 | pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; | ||
574 | kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); | ||
558 | mutex_unlock(&pit->pit_state.lock); | 575 | mutex_unlock(&pit->pit_state.lock); |
559 | 576 | ||
560 | kvm_pit_reset(pit); | 577 | kvm_pit_reset(pit); |
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm) | |||
578 | static void __inject_pit_timer_intr(struct kvm *kvm) | 595 | static void __inject_pit_timer_intr(struct kvm *kvm) |
579 | { | 596 | { |
580 | mutex_lock(&kvm->lock); | 597 | mutex_lock(&kvm->lock); |
581 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); | 598 | kvm_set_irq(kvm, 0, 1); |
582 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); | 599 | kvm_set_irq(kvm, 0, 0); |
583 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); | ||
584 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); | ||
585 | mutex_unlock(&kvm->lock); | 600 | mutex_unlock(&kvm->lock); |
586 | } | 601 | } |
587 | 602 | ||
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | |||
592 | struct kvm_kpit_state *ps; | 607 | struct kvm_kpit_state *ps; |
593 | 608 | ||
594 | if (vcpu && pit) { | 609 | if (vcpu && pit) { |
610 | int inject = 0; | ||
595 | ps = &pit->pit_state; | 611 | ps = &pit->pit_state; |
596 | 612 | ||
597 | /* Try to inject pending interrupts when: | 613 | /* Try to inject pending interrupts when |
598 | * 1. Pending exists | 614 | * last one has been acked. |
599 | * 2. Last interrupt was accepted or waited for too long time*/ | 615 | */ |
600 | if (atomic_read(&ps->pit_timer.pending) && | 616 | spin_lock(&ps->inject_lock); |
601 | (ps->inject_pending || | 617 | if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { |
602 | (jiffies - ps->last_injected_time | 618 | ps->irq_ack = 0; |
603 | >= KVM_MAX_PIT_INTR_INTERVAL))) { | 619 | inject = 1; |
604 | ps->inject_pending = 0; | ||
605 | __inject_pit_timer_intr(kvm); | ||
606 | ps->last_injected_time = jiffies; | ||
607 | } | ||
608 | } | ||
609 | } | ||
610 | |||
611 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
612 | { | ||
613 | struct kvm_arch *arch = &vcpu->kvm->arch; | ||
614 | struct kvm_kpit_state *ps; | ||
615 | |||
616 | if (vcpu && arch->vpit) { | ||
617 | ps = &arch->vpit->pit_state; | ||
618 | if (atomic_read(&ps->pit_timer.pending) && | ||
619 | (((arch->vpic->pics[0].imr & 1) == 0 && | ||
620 | arch->vpic->pics[0].irq_base == vec) || | ||
621 | (arch->vioapic->redirtbl[0].fields.vector == vec && | ||
622 | arch->vioapic->redirtbl[0].fields.mask != 1))) { | ||
623 | ps->inject_pending = 1; | ||
624 | atomic_dec(&ps->pit_timer.pending); | ||
625 | ps->channels[0].count_load_time = ktime_get(); | ||
626 | } | 620 | } |
621 | spin_unlock(&ps->inject_lock); | ||
622 | if (inject) | ||
623 | __inject_pit_timer_intr(kvm); | ||
627 | } | 624 | } |
628 | } | 625 | } |
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a6c8c4..e436d4983aa1 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h | |||
@@ -8,7 +8,6 @@ struct kvm_kpit_timer { | |||
8 | int irq; | 8 | int irq; |
9 | s64 period; /* unit: ns */ | 9 | s64 period; /* unit: ns */ |
10 | s64 scheduled; | 10 | s64 scheduled; |
11 | ktime_t last_update; | ||
12 | atomic_t pending; | 11 | atomic_t pending; |
13 | }; | 12 | }; |
14 | 13 | ||
@@ -34,8 +33,9 @@ struct kvm_kpit_state { | |||
34 | u32 speaker_data_on; | 33 | u32 speaker_data_on; |
35 | struct mutex lock; | 34 | struct mutex lock; |
36 | struct kvm_pit *pit; | 35 | struct kvm_pit *pit; |
37 | bool inject_pending; /* if inject pending interrupts */ | 36 | spinlock_t inject_lock; |
38 | unsigned long last_injected_time; | 37 | unsigned long irq_ack; |
38 | struct kvm_irq_ack_notifier irq_ack_notifier; | ||
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct kvm_pit { | 41 | struct kvm_pit { |
@@ -54,7 +54,6 @@ struct kvm_pit { | |||
54 | #define KVM_PIT_CHANNEL_MASK 0x3 | 54 | #define KVM_PIT_CHANNEL_MASK 0x3 |
55 | 55 | ||
56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | 56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); |
57 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
58 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); | 57 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); |
59 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); | 58 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); |
60 | void kvm_free_pit(struct kvm *kvm); | 59 | void kvm_free_pit(struct kvm *kvm); |
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index c31164e8aa46..17e41e165f1a 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c | |||
@@ -30,6 +30,19 @@ | |||
30 | 30 | ||
31 | #include <linux/kvm_host.h> | 31 | #include <linux/kvm_host.h> |
32 | 32 | ||
33 | static void pic_clear_isr(struct kvm_kpic_state *s, int irq) | ||
34 | { | ||
35 | s->isr &= ~(1 << irq); | ||
36 | s->isr_ack |= (1 << irq); | ||
37 | } | ||
38 | |||
39 | void kvm_pic_clear_isr_ack(struct kvm *kvm) | ||
40 | { | ||
41 | struct kvm_pic *s = pic_irqchip(kvm); | ||
42 | s->pics[0].isr_ack = 0xff; | ||
43 | s->pics[1].isr_ack = 0xff; | ||
44 | } | ||
45 | |||
33 | /* | 46 | /* |
34 | * set irq level. If an edge is detected, then the IRR is set to 1 | 47 | * set irq level. If an edge is detected, then the IRR is set to 1 |
35 | */ | 48 | */ |
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) | |||
141 | */ | 154 | */ |
142 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) | 155 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) |
143 | { | 156 | { |
157 | s->isr |= 1 << irq; | ||
144 | if (s->auto_eoi) { | 158 | if (s->auto_eoi) { |
145 | if (s->rotate_on_auto_eoi) | 159 | if (s->rotate_on_auto_eoi) |
146 | s->priority_add = (irq + 1) & 7; | 160 | s->priority_add = (irq + 1) & 7; |
147 | } else | 161 | pic_clear_isr(s, irq); |
148 | s->isr |= (1 << irq); | 162 | } |
149 | /* | 163 | /* |
150 | * We don't clear a level sensitive interrupt here | 164 | * We don't clear a level sensitive interrupt here |
151 | */ | 165 | */ |
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) | |||
153 | s->irr &= ~(1 << irq); | 167 | s->irr &= ~(1 << irq); |
154 | } | 168 | } |
155 | 169 | ||
156 | int kvm_pic_read_irq(struct kvm_pic *s) | 170 | int kvm_pic_read_irq(struct kvm *kvm) |
157 | { | 171 | { |
158 | int irq, irq2, intno; | 172 | int irq, irq2, intno; |
173 | struct kvm_pic *s = pic_irqchip(kvm); | ||
159 | 174 | ||
160 | irq = pic_get_irq(&s->pics[0]); | 175 | irq = pic_get_irq(&s->pics[0]); |
161 | if (irq >= 0) { | 176 | if (irq >= 0) { |
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s) | |||
181 | intno = s->pics[0].irq_base + irq; | 196 | intno = s->pics[0].irq_base + irq; |
182 | } | 197 | } |
183 | pic_update_irq(s); | 198 | pic_update_irq(s); |
199 | kvm_notify_acked_irq(kvm, irq); | ||
184 | 200 | ||
185 | return intno; | 201 | return intno; |
186 | } | 202 | } |
187 | 203 | ||
188 | void kvm_pic_reset(struct kvm_kpic_state *s) | 204 | void kvm_pic_reset(struct kvm_kpic_state *s) |
189 | { | 205 | { |
206 | int irq, irqbase; | ||
207 | struct kvm *kvm = s->pics_state->irq_request_opaque; | ||
208 | struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; | ||
209 | |||
210 | if (s == &s->pics_state->pics[0]) | ||
211 | irqbase = 0; | ||
212 | else | ||
213 | irqbase = 8; | ||
214 | |||
215 | for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { | ||
216 | if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) | ||
217 | if (s->irr & (1 << irq) || s->isr & (1 << irq)) | ||
218 | kvm_notify_acked_irq(kvm, irq+irqbase); | ||
219 | } | ||
190 | s->last_irr = 0; | 220 | s->last_irr = 0; |
191 | s->irr = 0; | 221 | s->irr = 0; |
192 | s->imr = 0; | 222 | s->imr = 0; |
193 | s->isr = 0; | 223 | s->isr = 0; |
224 | s->isr_ack = 0xff; | ||
194 | s->priority_add = 0; | 225 | s->priority_add = 0; |
195 | s->irq_base = 0; | 226 | s->irq_base = 0; |
196 | s->read_reg_select = 0; | 227 | s->read_reg_select = 0; |
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
243 | priority = get_priority(s, s->isr); | 274 | priority = get_priority(s, s->isr); |
244 | if (priority != 8) { | 275 | if (priority != 8) { |
245 | irq = (priority + s->priority_add) & 7; | 276 | irq = (priority + s->priority_add) & 7; |
246 | s->isr &= ~(1 << irq); | 277 | pic_clear_isr(s, irq); |
247 | if (cmd == 5) | 278 | if (cmd == 5) |
248 | s->priority_add = (irq + 1) & 7; | 279 | s->priority_add = (irq + 1) & 7; |
249 | pic_update_irq(s->pics_state); | 280 | pic_update_irq(s->pics_state); |
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
251 | break; | 282 | break; |
252 | case 3: | 283 | case 3: |
253 | irq = val & 7; | 284 | irq = val & 7; |
254 | s->isr &= ~(1 << irq); | 285 | pic_clear_isr(s, irq); |
255 | pic_update_irq(s->pics_state); | 286 | pic_update_irq(s->pics_state); |
256 | break; | 287 | break; |
257 | case 6: | 288 | case 6: |
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) | |||
260 | break; | 291 | break; |
261 | case 7: | 292 | case 7: |
262 | irq = val & 7; | 293 | irq = val & 7; |
263 | s->isr &= ~(1 << irq); | ||
264 | s->priority_add = (irq + 1) & 7; | 294 | s->priority_add = (irq + 1) & 7; |
295 | pic_clear_isr(s, irq); | ||
265 | pic_update_irq(s->pics_state); | 296 | pic_update_irq(s->pics_state); |
266 | break; | 297 | break; |
267 | default: | 298 | default: |
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) | |||
303 | s->pics_state->pics[0].irr &= ~(1 << 2); | 334 | s->pics_state->pics[0].irr &= ~(1 << 2); |
304 | } | 335 | } |
305 | s->irr &= ~(1 << ret); | 336 | s->irr &= ~(1 << ret); |
306 | s->isr &= ~(1 << ret); | 337 | pic_clear_isr(s, ret); |
307 | if (addr1 >> 7 || ret != 2) | 338 | if (addr1 >> 7 || ret != 2) |
308 | pic_update_irq(s->pics_state); | 339 | pic_update_irq(s->pics_state); |
309 | } else { | 340 | } else { |
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) | |||
422 | { | 453 | { |
423 | struct kvm *kvm = opaque; | 454 | struct kvm *kvm = opaque; |
424 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | 455 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; |
456 | struct kvm_pic *s = pic_irqchip(kvm); | ||
457 | int irq = pic_get_irq(&s->pics[0]); | ||
425 | 458 | ||
426 | pic_irqchip(kvm)->output = level; | 459 | s->output = level; |
427 | if (vcpu) | 460 | if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { |
461 | s->pics[0].isr_ack &= ~(1 << irq); | ||
428 | kvm_vcpu_kick(vcpu); | 462 | kvm_vcpu_kick(vcpu); |
463 | } | ||
429 | } | 464 | } |
430 | 465 | ||
431 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | 466 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) |
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 76d736b5f664..c019b8edcdb7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | |||
72 | if (kvm_apic_accept_pic_intr(v)) { | 72 | if (kvm_apic_accept_pic_intr(v)) { |
73 | s = pic_irqchip(v->kvm); | 73 | s = pic_irqchip(v->kvm); |
74 | s->output = 0; /* PIC */ | 74 | s->output = 0; /* PIC */ |
75 | vector = kvm_pic_read_irq(s); | 75 | vector = kvm_pic_read_irq(v->kvm); |
76 | } | 76 | } |
77 | } | 77 | } |
78 | return vector; | 78 | return vector; |
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | |||
90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | 90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) |
91 | { | 91 | { |
92 | kvm_apic_timer_intr_post(vcpu, vec); | 92 | kvm_apic_timer_intr_post(vcpu, vec); |
93 | kvm_pit_timer_intr_post(vcpu, vec); | ||
94 | /* TODO: PIT, RTC etc. */ | 93 | /* TODO: PIT, RTC etc. */ |
95 | } | 94 | } |
96 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); | 95 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7ca47cbb48bb..f17c8f5bbf31 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -42,6 +42,7 @@ struct kvm_kpic_state { | |||
42 | u8 irr; /* interrupt request register */ | 42 | u8 irr; /* interrupt request register */ |
43 | u8 imr; /* interrupt mask register */ | 43 | u8 imr; /* interrupt mask register */ |
44 | u8 isr; /* interrupt service register */ | 44 | u8 isr; /* interrupt service register */ |
45 | u8 isr_ack; /* interrupt ack detection */ | ||
45 | u8 priority_add; /* highest irq priority */ | 46 | u8 priority_add; /* highest irq priority */ |
46 | u8 irq_base; | 47 | u8 irq_base; |
47 | u8 read_reg_select; | 48 | u8 read_reg_select; |
@@ -63,12 +64,13 @@ struct kvm_pic { | |||
63 | void *irq_request_opaque; | 64 | void *irq_request_opaque; |
64 | int output; /* intr from master PIC */ | 65 | int output; /* intr from master PIC */ |
65 | struct kvm_io_device dev; | 66 | struct kvm_io_device dev; |
67 | void (*ack_notifier)(void *opaque, int irq); | ||
66 | }; | 68 | }; |
67 | 69 | ||
68 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | 70 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); |
69 | void kvm_pic_set_irq(void *opaque, int irq, int level); | 71 | int kvm_pic_read_irq(struct kvm *kvm); |
70 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
71 | void kvm_pic_update_irq(struct kvm_pic *s); | 72 | void kvm_pic_update_irq(struct kvm_pic *s); |
73 | void kvm_pic_clear_isr_ack(struct kvm *kvm); | ||
72 | 74 | ||
73 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | 75 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) |
74 | { | 76 | { |
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 000000000000..1ff819dce7d3 --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h | |||
@@ -0,0 +1,32 @@ | |||
1 | #ifndef ASM_KVM_CACHE_REGS_H | ||
2 | #define ASM_KVM_CACHE_REGS_H | ||
3 | |||
4 | static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, | ||
5 | enum kvm_reg reg) | ||
6 | { | ||
7 | if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) | ||
8 | kvm_x86_ops->cache_reg(vcpu, reg); | ||
9 | |||
10 | return vcpu->arch.regs[reg]; | ||
11 | } | ||
12 | |||
13 | static inline void kvm_register_write(struct kvm_vcpu *vcpu, | ||
14 | enum kvm_reg reg, | ||
15 | unsigned long val) | ||
16 | { | ||
17 | vcpu->arch.regs[reg] = val; | ||
18 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); | ||
19 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); | ||
20 | } | ||
21 | |||
22 | static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) | ||
23 | { | ||
24 | return kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
25 | } | ||
26 | |||
27 | static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) | ||
28 | { | ||
29 | kvm_register_write(vcpu, VCPU_REGS_RIP, val); | ||
30 | } | ||
31 | |||
32 | #endif | ||
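
The idea behind these helpers is lazy register access: a read fetches the register from the vendor state (VMCS/VMCB) only if it is not yet marked in regs_avail, and a write simply marks it dirty for a later flush. Callers drop the old whole-set cache_regs()/decache_regs() pattern in favour of direct accessors; schematically (illustrative only, insn_len is a placeholder):

	unsigned long rip = kvm_rip_read(vcpu);	/* pulled in on first use */
	kvm_rip_write(vcpu, rip + insn_len);	/* marked dirty, written back later */

The lapic.c hunk below shows the same conversion for reporting TPR accesses.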
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de69f67..6571926bfd33 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <asm/current.h> | 32 | #include <asm/current.h> |
33 | #include <asm/apicdef.h> | 33 | #include <asm/apicdef.h> |
34 | #include <asm/atomic.h> | 34 | #include <asm/atomic.h> |
35 | #include "kvm_cache_regs.h" | ||
35 | #include "irq.h" | 36 | #include "irq.h" |
36 | 37 | ||
37 | #define PRId64 "d" | 38 | #define PRId64 "d" |
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
338 | } else | 339 | } else |
339 | apic_clear_vector(vector, apic->regs + APIC_TMR); | 340 | apic_clear_vector(vector, apic->regs + APIC_TMR); |
340 | 341 | ||
341 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) | 342 | kvm_vcpu_kick(vcpu); |
342 | kvm_vcpu_kick(vcpu); | ||
343 | else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { | ||
344 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
345 | if (waitqueue_active(&vcpu->wq)) | ||
346 | wake_up_interruptible(&vcpu->wq); | ||
347 | } | ||
348 | 343 | ||
349 | result = (orig_irr == 0); | 344 | result = (orig_irr == 0); |
350 | break; | 345 | break; |
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
370 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; | 365 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
371 | kvm_vcpu_kick(vcpu); | 366 | kvm_vcpu_kick(vcpu); |
372 | } else { | 367 | } else { |
373 | printk(KERN_DEBUG | 368 | apic_debug("Ignoring de-assert INIT to vcpu %d\n", |
374 | "Ignoring de-assert INIT to vcpu %d\n", | 369 | vcpu->vcpu_id); |
375 | vcpu->vcpu_id); | ||
376 | } | 370 | } |
377 | |||
378 | break; | 371 | break; |
379 | 372 | ||
380 | case APIC_DM_STARTUP: | 373 | case APIC_DM_STARTUP: |
381 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", | 374 | apic_debug("SIPI to vcpu %d vector 0x%02x\n", |
382 | vcpu->vcpu_id, vector); | 375 | vcpu->vcpu_id, vector); |
383 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { | 376 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
384 | vcpu->arch.sipi_vector = vector; | 377 | vcpu->arch.sipi_vector = vector; |
385 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; | 378 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
386 | if (waitqueue_active(&vcpu->wq)) | 379 | kvm_vcpu_kick(vcpu); |
387 | wake_up_interruptible(&vcpu->wq); | ||
388 | } | 380 | } |
389 | break; | 381 | break; |
390 | 382 | ||
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | |||
438 | static void apic_set_eoi(struct kvm_lapic *apic) | 430 | static void apic_set_eoi(struct kvm_lapic *apic) |
439 | { | 431 | { |
440 | int vector = apic_find_highest_isr(apic); | 432 | int vector = apic_find_highest_isr(apic); |
441 | 433 | int trigger_mode; | |
442 | /* | 434 | /* |
443 | 	 * Not every EOI write will have a corresponding ISR entry; | 435 | 	 * Not every EOI write will have a corresponding ISR entry; |
444 | 	 * one example is when the kernel checks the timer in setup_IO_APIC | 436 | 	 * one example is when the kernel checks the timer in setup_IO_APIC |
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) | |||
450 | apic_update_ppr(apic); | 442 | apic_update_ppr(apic); |
451 | 443 | ||
452 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) | 444 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) |
453 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); | 445 | trigger_mode = IOAPIC_LEVEL_TRIG; |
446 | else | ||
447 | trigger_mode = IOAPIC_EDGE_TRIG; | ||
448 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); | ||
454 | } | 449 | } |
455 | 450 | ||
456 | static void apic_send_ipi(struct kvm_lapic *apic) | 451 | static void apic_send_ipi(struct kvm_lapic *apic) |
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) | |||
558 | struct kvm_run *run = vcpu->run; | 553 | struct kvm_run *run = vcpu->run; |
559 | 554 | ||
560 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); | 555 | set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); |
561 | kvm_x86_ops->cache_regs(vcpu); | 556 | run->tpr_access.rip = kvm_rip_read(vcpu); |
562 | run->tpr_access.rip = vcpu->arch.rip; | ||
563 | run->tpr_access.is_write = write; | 557 | run->tpr_access.is_write = write; |
564 | } | 558 | } |
565 | 559 | ||
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
683 | * Refer SDM 8.4.1 | 677 | * Refer SDM 8.4.1 |
684 | */ | 678 | */ |
685 | if (len != 4 || alignment) { | 679 | if (len != 4 || alignment) { |
686 | if (printk_ratelimit()) | 680 | /* Don't shout loud, $infamous_os would cause only noise. */ |
687 | printk(KERN_ERR "apic write: bad size=%d %lx\n", | 681 | apic_debug("apic write: bad size=%d %lx\n", |
688 | len, (long)address); | 682 | len, (long)address); |
689 | return; | 683 | return; |
690 | } | 684 | } |
691 | 685 | ||
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) | |||
947 | 941 | ||
948 | if(!atomic_inc_and_test(&apic->timer.pending)) | 942 | if(!atomic_inc_and_test(&apic->timer.pending)) |
949 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); | 943 | set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); |
950 | if (waitqueue_active(q)) { | 944 | if (waitqueue_active(q)) |
951 | apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
952 | wake_up_interruptible(q); | 945 | wake_up_interruptible(q); |
953 | } | 946 | |
954 | if (apic_lvtt_period(apic)) { | 947 | if (apic_lvtt_period(apic)) { |
955 | result = 1; | 948 | result = 1; |
956 | apic->timer.dev.expires = ktime_add_ns( | 949 | apic->timer.dev.expires = ktime_add_ns( |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3da2508eb22a..99c239c5c0ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -70,6 +70,9 @@ static int dbg = 0; | |||
70 | module_param(dbg, bool, 0644); | 70 | module_param(dbg, bool, 0644); |
71 | #endif | 71 | #endif |
72 | 72 | ||
73 | static int oos_shadow = 1; | ||
74 | module_param(oos_shadow, bool, 0644); | ||
75 | |||
73 | #ifndef MMU_DEBUG | 76 | #ifndef MMU_DEBUG |
74 | #define ASSERT(x) do { } while (0) | 77 | #define ASSERT(x) do { } while (0) |
75 | #else | 78 | #else |
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644); | |||
135 | #define ACC_USER_MASK PT_USER_MASK | 138 | #define ACC_USER_MASK PT_USER_MASK |
136 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | 139 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
137 | 140 | ||
138 | struct kvm_pv_mmu_op_buffer { | 141 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) |
139 | void *ptr; | ||
140 | unsigned len; | ||
141 | unsigned processed; | ||
142 | char buf[512] __aligned(sizeof(long)); | ||
143 | }; | ||
144 | 142 | ||
145 | struct kvm_rmap_desc { | 143 | struct kvm_rmap_desc { |
146 | u64 *shadow_ptes[RMAP_EXT]; | 144 | u64 *shadow_ptes[RMAP_EXT]; |
147 | struct kvm_rmap_desc *more; | 145 | struct kvm_rmap_desc *more; |
148 | }; | 146 | }; |
149 | 147 | ||
148 | struct kvm_shadow_walk { | ||
149 | int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, | ||
150 | u64 addr, u64 *spte, int level); | ||
151 | }; | ||
152 | |||
153 | struct kvm_unsync_walk { | ||
154 | int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); | ||
155 | }; | ||
156 | |||
157 | typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); | ||
158 | |||
150 | static struct kmem_cache *pte_chain_cache; | 159 | static struct kmem_cache *pte_chain_cache; |
151 | static struct kmem_cache *rmap_desc_cache; | 160 | static struct kmem_cache *rmap_desc_cache; |
152 | static struct kmem_cache *mmu_page_header_cache; | 161 | static struct kmem_cache *mmu_page_header_cache; |
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) | |||
405 | { | 414 | { |
406 | struct vm_area_struct *vma; | 415 | struct vm_area_struct *vma; |
407 | unsigned long addr; | 416 | unsigned long addr; |
417 | int ret = 0; | ||
408 | 418 | ||
409 | addr = gfn_to_hva(kvm, gfn); | 419 | addr = gfn_to_hva(kvm, gfn); |
410 | if (kvm_is_error_hva(addr)) | 420 | if (kvm_is_error_hva(addr)) |
411 | return 0; | 421 | return ret; |
412 | 422 | ||
423 | 	down_read(&current->mm->mmap_sem); | ||
413 | vma = find_vma(current->mm, addr); | 424 | vma = find_vma(current->mm, addr); |
414 | if (vma && is_vm_hugetlb_page(vma)) | 425 | if (vma && is_vm_hugetlb_page(vma)) |
415 | return 1; | 426 | ret = 1; |
427 | 	up_read(&current->mm->mmap_sem); | ||
416 | 428 | ||
417 | return 0; | 429 | return ret; |
418 | } | 430 | } |
419 | 431 | ||
420 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | 432 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) |
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
649 | 661 | ||
650 | if (write_protected) | 662 | if (write_protected) |
651 | kvm_flush_remote_tlbs(kvm); | 663 | kvm_flush_remote_tlbs(kvm); |
652 | |||
653 | account_shadowed(kvm, gfn); | ||
654 | } | 664 | } |
655 | 665 | ||
656 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) | 666 | static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) |
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | |||
859 | BUG(); | 869 | BUG(); |
860 | } | 870 | } |
861 | 871 | ||
872 | |||
873 | static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
874 | mmu_parent_walk_fn fn) | ||
875 | { | ||
876 | struct kvm_pte_chain *pte_chain; | ||
877 | struct hlist_node *node; | ||
878 | struct kvm_mmu_page *parent_sp; | ||
879 | int i; | ||
880 | |||
881 | if (!sp->multimapped && sp->parent_pte) { | ||
882 | parent_sp = page_header(__pa(sp->parent_pte)); | ||
883 | fn(vcpu, parent_sp); | ||
884 | mmu_parent_walk(vcpu, parent_sp, fn); | ||
885 | return; | ||
886 | } | ||
887 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
888 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
889 | if (!pte_chain->parent_ptes[i]) | ||
890 | break; | ||
891 | parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); | ||
892 | fn(vcpu, parent_sp); | ||
893 | mmu_parent_walk(vcpu, parent_sp, fn); | ||
894 | } | ||
895 | } | ||
896 | |||
897 | static void kvm_mmu_update_unsync_bitmap(u64 *spte) | ||
898 | { | ||
899 | unsigned int index; | ||
900 | struct kvm_mmu_page *sp = page_header(__pa(spte)); | ||
901 | |||
902 | index = spte - sp->spt; | ||
903 | __set_bit(index, sp->unsync_child_bitmap); | ||
904 | sp->unsync_children = 1; | ||
905 | } | ||
906 | |||
907 | static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) | ||
908 | { | ||
909 | struct kvm_pte_chain *pte_chain; | ||
910 | struct hlist_node *node; | ||
911 | int i; | ||
912 | |||
913 | if (!sp->parent_pte) | ||
914 | return; | ||
915 | |||
916 | if (!sp->multimapped) { | ||
917 | kvm_mmu_update_unsync_bitmap(sp->parent_pte); | ||
918 | return; | ||
919 | } | ||
920 | |||
921 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
922 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
923 | if (!pte_chain->parent_ptes[i]) | ||
924 | break; | ||
925 | kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); | ||
926 | } | ||
927 | } | ||
928 | |||
929 | static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
930 | { | ||
931 | sp->unsync_children = 1; | ||
932 | kvm_mmu_update_parents_unsync(sp); | ||
933 | return 1; | ||
934 | } | ||
935 | |||
936 | static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, | ||
937 | struct kvm_mmu_page *sp) | ||
938 | { | ||
939 | mmu_parent_walk(vcpu, sp, unsync_walk_fn); | ||
940 | kvm_mmu_update_parents_unsync(sp); | ||
941 | } | ||
942 | |||
862 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | 943 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, |
863 | struct kvm_mmu_page *sp) | 944 | struct kvm_mmu_page *sp) |
864 | { | 945 | { |
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | |||
868 | sp->spt[i] = shadow_trap_nonpresent_pte; | 949 | sp->spt[i] = shadow_trap_nonpresent_pte; |
869 | } | 950 | } |
870 | 951 | ||
952 | static int nonpaging_sync_page(struct kvm_vcpu *vcpu, | ||
953 | struct kvm_mmu_page *sp) | ||
954 | { | ||
955 | return 1; | ||
956 | } | ||
957 | |||
958 | static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | ||
959 | { | ||
960 | } | ||
961 | |||
962 | #define for_each_unsync_children(bitmap, idx) \ | ||
963 | for (idx = find_first_bit(bitmap, 512); \ | ||
964 | idx < 512; \ | ||
965 | idx = find_next_bit(bitmap, 512, idx+1)) | ||
966 | |||
967 | static int mmu_unsync_walk(struct kvm_mmu_page *sp, | ||
968 | struct kvm_unsync_walk *walker) | ||
969 | { | ||
970 | int i, ret; | ||
971 | |||
972 | if (!sp->unsync_children) | ||
973 | return 0; | ||
974 | |||
975 | for_each_unsync_children(sp->unsync_child_bitmap, i) { | ||
976 | u64 ent = sp->spt[i]; | ||
977 | |||
978 | if (is_shadow_present_pte(ent)) { | ||
979 | struct kvm_mmu_page *child; | ||
980 | child = page_header(ent & PT64_BASE_ADDR_MASK); | ||
981 | |||
982 | if (child->unsync_children) { | ||
983 | ret = mmu_unsync_walk(child, walker); | ||
984 | if (ret) | ||
985 | return ret; | ||
986 | __clear_bit(i, sp->unsync_child_bitmap); | ||
987 | } | ||
988 | |||
989 | if (child->unsync) { | ||
990 | ret = walker->entry(child, walker); | ||
991 | __clear_bit(i, sp->unsync_child_bitmap); | ||
992 | if (ret) | ||
993 | return ret; | ||
994 | } | ||
995 | } | ||
996 | } | ||
997 | |||
998 | if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) | ||
999 | sp->unsync_children = 0; | ||
1000 | |||
1001 | return 0; | ||
1002 | } | ||
1003 | |||
871 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | 1004 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) |
872 | { | 1005 | { |
873 | unsigned index; | 1006 | unsigned index; |
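mmu_unsync_walk() above scans the 512-bit unsync_child_bitmap with for_each_unsync_children(), recurses into children that themselves still have unsync children, hands each unsync child to walker->entry(), and clears sp->unsync_children once the bitmap is empty. A small stand-alone sketch of that set-bit iteration is below; the simplified find_next_bit() is a stand-in for the kernel helper, not its real implementation.

/* Stand-alone sketch of the for_each_unsync_children() iteration: scan a
 * fixed-size bitmap for set bits, visit each one and clear it once handled. */
#include <stdio.h>

#define NBITS		512
#define BITS_PER_LONG	((int)(8 * sizeof(unsigned long)))

static int find_next_bit(const unsigned long *map, int size, int start)
{
	for (int i = start; i < size; i++)
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return size;
}

#define for_each_set_bit(bitmap, idx)				\
	for (idx = find_next_bit(bitmap, NBITS, 0);		\
	     idx < NBITS;					\
	     idx = find_next_bit(bitmap, NBITS, idx + 1))

int main(void)
{
	unsigned long bitmap[NBITS / BITS_PER_LONG] = { 0 };
	int idx;

	/* pretend the sptes at index 3 and 70 point to unsync children */
	bitmap[3 / BITS_PER_LONG]  |= 1UL << (3 % BITS_PER_LONG);
	bitmap[70 / BITS_PER_LONG] |= 1UL << (70 % BITS_PER_LONG);

	for_each_set_bit(bitmap, idx) {
		printf("visit unsync child at index %d\n", idx);
		bitmap[idx / BITS_PER_LONG] &= ~(1UL << (idx % BITS_PER_LONG));
	}

	/* like mmu_unsync_walk(): once no bit is left, the parent can clear
	 * its unsync_children flag */
	printf("bitmap empty: %d\n", find_next_bit(bitmap, NBITS, 0) == NBITS);
	return 0;
}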
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | |||
888 | return NULL; | 1021 | return NULL; |
889 | } | 1022 | } |
890 | 1023 | ||
1024 | static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1025 | { | ||
1026 | WARN_ON(!sp->unsync); | ||
1027 | sp->unsync = 0; | ||
1028 | --kvm->stat.mmu_unsync; | ||
1029 | } | ||
1030 | |||
1031 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); | ||
1032 | |||
1033 | static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
1034 | { | ||
1035 | if (sp->role.glevels != vcpu->arch.mmu.root_level) { | ||
1036 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1037 | return 1; | ||
1038 | } | ||
1039 | |||
1040 | rmap_write_protect(vcpu->kvm, sp->gfn); | ||
1041 | if (vcpu->arch.mmu.sync_page(vcpu, sp)) { | ||
1042 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1043 | return 1; | ||
1044 | } | ||
1045 | |||
1046 | kvm_mmu_flush_tlb(vcpu); | ||
1047 | kvm_unlink_unsync_page(vcpu->kvm, sp); | ||
1048 | return 0; | ||
1049 | } | ||
1050 | |||
1051 | struct sync_walker { | ||
1052 | struct kvm_vcpu *vcpu; | ||
1053 | struct kvm_unsync_walk walker; | ||
1054 | }; | ||
1055 | |||
1056 | static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1057 | { | ||
1058 | struct sync_walker *sync_walk = container_of(walk, struct sync_walker, | ||
1059 | walker); | ||
1060 | struct kvm_vcpu *vcpu = sync_walk->vcpu; | ||
1061 | |||
1062 | kvm_sync_page(vcpu, sp); | ||
1063 | return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); | ||
1064 | } | ||
1065 | |||
1066 | static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
1067 | { | ||
1068 | struct sync_walker walker = { | ||
1069 | .walker = { .entry = mmu_sync_fn, }, | ||
1070 | .vcpu = vcpu, | ||
1071 | }; | ||
1072 | |||
1073 | while (mmu_unsync_walk(sp, &walker.walker)) | ||
1074 | cond_resched_lock(&vcpu->kvm->mmu_lock); | ||
1075 | } | ||
1076 | |||
891 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1077 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
892 | gfn_t gfn, | 1078 | gfn_t gfn, |
893 | gva_t gaddr, | 1079 | gva_t gaddr, |
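mmu_sync_children() pairs kvm_sync_page() with a lock-break loop: mmu_sync_fn() returns nonzero when need_resched() or spin_needbreak() says the walk should pause, and the caller then calls cond_resched_lock() and restarts the walk from the top. The userspace sketch below mimics that restartable walk-under-a-lock shape, with a pthread mutex standing in for kvm->mmu_lock and a plain counter standing in for the unsync child pages.

/* Sketch of the mmu_sync_children() pattern: the walk bails out when the
 * caller should yield, the caller drops and retakes the lock, then restarts
 * the walk until nothing is left. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int remaining = 10;

/* returns nonzero when it stopped early and wants the caller to yield */
static int walk_some(void)
{
	int done = 0;

	while (remaining > 0) {
		remaining--;			/* "sync one child page" */
		if (++done == 4)		/* pretend need_resched() fired */
			return remaining > 0;
	}
	return 0;
}

int main(void)
{
	pthread_mutex_lock(&lock);
	while (walk_some()) {			/* while (mmu_unsync_walk(...))   */
		pthread_mutex_unlock(&lock);	/*   cond_resched_lock(&mmu_lock) */
		sched_yield();
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	printf("remaining = %d\n", remaining);
	return 0;
}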
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
901 | unsigned quadrant; | 1087 | unsigned quadrant; |
902 | struct hlist_head *bucket; | 1088 | struct hlist_head *bucket; |
903 | struct kvm_mmu_page *sp; | 1089 | struct kvm_mmu_page *sp; |
904 | struct hlist_node *node; | 1090 | struct hlist_node *node, *tmp; |
905 | 1091 | ||
906 | role.word = 0; | 1092 | role.word = 0; |
907 | role.glevels = vcpu->arch.mmu.root_level; | 1093 | role.glevels = vcpu->arch.mmu.root_level; |
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
917 | gfn, role.word); | 1103 | gfn, role.word); |
918 | index = kvm_page_table_hashfn(gfn); | 1104 | index = kvm_page_table_hashfn(gfn); |
919 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1105 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
920 | hlist_for_each_entry(sp, node, bucket, hash_link) | 1106 | hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) |
921 | if (sp->gfn == gfn && sp->role.word == role.word) { | 1107 | if (sp->gfn == gfn) { |
1108 | if (sp->unsync) | ||
1109 | if (kvm_sync_page(vcpu, sp)) | ||
1110 | continue; | ||
1111 | |||
1112 | if (sp->role.word != role.word) | ||
1113 | continue; | ||
1114 | |||
922 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1115 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
1116 | if (sp->unsync_children) { | ||
1117 | set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); | ||
1118 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1119 | } | ||
923 | pgprintk("%s: found\n", __func__); | 1120 | pgprintk("%s: found\n", __func__); |
924 | return sp; | 1121 | return sp; |
925 | } | 1122 | } |
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
931 | sp->gfn = gfn; | 1128 | sp->gfn = gfn; |
932 | sp->role = role; | 1129 | sp->role = role; |
933 | hlist_add_head(&sp->hash_link, bucket); | 1130 | hlist_add_head(&sp->hash_link, bucket); |
934 | if (!metaphysical) | 1131 | if (!metaphysical) { |
935 | rmap_write_protect(vcpu->kvm, gfn); | 1132 | rmap_write_protect(vcpu->kvm, gfn); |
1133 | account_shadowed(vcpu->kvm, gfn); | ||
1134 | } | ||
936 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) | 1135 | if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) |
937 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | 1136 | vcpu->arch.mmu.prefetch_page(vcpu, sp); |
938 | else | 1137 | else |
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
940 | return sp; | 1139 | return sp; |
941 | } | 1140 | } |
942 | 1141 | ||
1142 | static int walk_shadow(struct kvm_shadow_walk *walker, | ||
1143 | struct kvm_vcpu *vcpu, u64 addr) | ||
1144 | { | ||
1145 | hpa_t shadow_addr; | ||
1146 | int level; | ||
1147 | int r; | ||
1148 | u64 *sptep; | ||
1149 | unsigned index; | ||
1150 | |||
1151 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
1152 | level = vcpu->arch.mmu.shadow_root_level; | ||
1153 | if (level == PT32E_ROOT_LEVEL) { | ||
1154 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
1155 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
1156 | --level; | ||
1157 | } | ||
1158 | |||
1159 | while (level >= PT_PAGE_TABLE_LEVEL) { | ||
1160 | index = SHADOW_PT_INDEX(addr, level); | ||
1161 | sptep = ((u64 *)__va(shadow_addr)) + index; | ||
1162 | r = walker->entry(walker, vcpu, addr, sptep, level); | ||
1163 | if (r) | ||
1164 | return r; | ||
1165 | shadow_addr = *sptep & PT64_BASE_ADDR_MASK; | ||
1166 | --level; | ||
1167 | } | ||
1168 | return 0; | ||
1169 | } | ||
1170 | |||
943 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | 1171 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, |
944 | struct kvm_mmu_page *sp) | 1172 | struct kvm_mmu_page *sp) |
945 | { | 1173 | { |
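walk_shadow() gives mmu.c a single generic descent through the shadow page tables: start from root_hpa (or the matching pae_root entry when the root is PAE), call walker->entry() at each level, and follow the address in the spte down one level until the callback reports it is finished. Both __direct_map() and the paging_tmpl.h fetch/invlpg paths are rewritten on top of it later in this patch. A toy version of that descend-and-callback loop, with a two-level array of ints standing in for the shadow tables, might look like this (as in the real callbacks, the leaf case must stop the walk):

/* Toy version of the walk_shadow() descent: call the walker at every level,
 * follow the entry down one level, stop when the callback says so.  Two bits
 * of the address select the slot at each level. */
#include <stdio.h>

#define ENTRIES 4

struct walk;
typedef int (*entry_fn)(struct walk *w, unsigned addr, int *entry, int level);
struct walk { entry_fn entry; };

static int tables[3][ENTRIES];		/* [0] is the root, [1..2] next level */

static int walk(struct walk *w, unsigned addr)
{
	int *table = tables[0];

	for (int level = 2; level >= 1; level--) {
		unsigned idx = (addr >> (2 * (level - 1))) & (ENTRIES - 1);
		int r = w->entry(w, addr, &table[idx], level);

		if (r)
			return r;
		if (level > 1)			/* follow the link downwards */
			table = tables[table[idx]];
	}
	return 0;
}

static int print_entry(struct walk *w, unsigned addr, int *entry, int level)
{
	printf("level %d: entry value %d\n", level, *entry);
	return level == 1;			/* the leaf case ends the walk */
}

int main(void)
{
	struct walk w = { .entry = print_entry };

	tables[0][1] = 2;			/* root slot 1 points at table 2 */
	tables[2][3] = 42;			/* leaf value */
	walk(&w, 0x7);				/* 0b0111: slot 1, then slot 3 */
	return 0;
}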
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
955 | rmap_remove(kvm, &pt[i]); | 1183 | rmap_remove(kvm, &pt[i]); |
956 | pt[i] = shadow_trap_nonpresent_pte; | 1184 | pt[i] = shadow_trap_nonpresent_pte; |
957 | } | 1185 | } |
958 | kvm_flush_remote_tlbs(kvm); | ||
959 | return; | 1186 | return; |
960 | } | 1187 | } |
961 | 1188 | ||
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
974 | } | 1201 | } |
975 | pt[i] = shadow_trap_nonpresent_pte; | 1202 | pt[i] = shadow_trap_nonpresent_pte; |
976 | } | 1203 | } |
977 | kvm_flush_remote_tlbs(kvm); | ||
978 | } | 1204 | } |
979 | 1205 | ||
980 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | 1206 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) |
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | |||
991 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | 1217 | kvm->vcpus[i]->arch.last_pte_updated = NULL; |
992 | } | 1218 | } |
993 | 1219 | ||
994 | static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | 1220 | static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) |
995 | { | 1221 | { |
996 | u64 *parent_pte; | 1222 | u64 *parent_pte; |
997 | 1223 | ||
998 | ++kvm->stat.mmu_shadow_zapped; | ||
999 | while (sp->multimapped || sp->parent_pte) { | 1224 | while (sp->multimapped || sp->parent_pte) { |
1000 | if (!sp->multimapped) | 1225 | if (!sp->multimapped) |
1001 | parent_pte = sp->parent_pte; | 1226 | parent_pte = sp->parent_pte; |
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
1010 | kvm_mmu_put_page(sp, parent_pte); | 1235 | kvm_mmu_put_page(sp, parent_pte); |
1011 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | 1236 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); |
1012 | } | 1237 | } |
1238 | } | ||
1239 | |||
1240 | struct zap_walker { | ||
1241 | struct kvm_unsync_walk walker; | ||
1242 | struct kvm *kvm; | ||
1243 | int zapped; | ||
1244 | }; | ||
1245 | |||
1246 | static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) | ||
1247 | { | ||
1248 | struct zap_walker *zap_walk = container_of(walk, struct zap_walker, | ||
1249 | walker); | ||
1250 | kvm_mmu_zap_page(zap_walk->kvm, sp); | ||
1251 | zap_walk->zapped = 1; | ||
1252 | return 0; | ||
1253 | } | ||
1254 | |||
1255 | static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1256 | { | ||
1257 | struct zap_walker walker = { | ||
1258 | .walker = { .entry = mmu_zap_fn, }, | ||
1259 | .kvm = kvm, | ||
1260 | .zapped = 0, | ||
1261 | }; | ||
1262 | |||
1263 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
1264 | return 0; | ||
1265 | mmu_unsync_walk(sp, &walker.walker); | ||
1266 | return walker.zapped; | ||
1267 | } | ||
1268 | |||
1269 | static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1270 | { | ||
1271 | int ret; | ||
1272 | ++kvm->stat.mmu_shadow_zapped; | ||
1273 | ret = mmu_zap_unsync_children(kvm, sp); | ||
1013 | kvm_mmu_page_unlink_children(kvm, sp); | 1274 | kvm_mmu_page_unlink_children(kvm, sp); |
1275 | kvm_mmu_unlink_parents(kvm, sp); | ||
1276 | kvm_flush_remote_tlbs(kvm); | ||
1277 | if (!sp->role.invalid && !sp->role.metaphysical) | ||
1278 | unaccount_shadowed(kvm, sp->gfn); | ||
1279 | if (sp->unsync) | ||
1280 | kvm_unlink_unsync_page(kvm, sp); | ||
1014 | if (!sp->root_count) { | 1281 | if (!sp->root_count) { |
1015 | if (!sp->role.metaphysical && !sp->role.invalid) | ||
1016 | unaccount_shadowed(kvm, sp->gfn); | ||
1017 | hlist_del(&sp->hash_link); | 1282 | hlist_del(&sp->hash_link); |
1018 | kvm_mmu_free_page(kvm, sp); | 1283 | kvm_mmu_free_page(kvm, sp); |
1019 | } else { | 1284 | } else { |
1020 | int invalid = sp->role.invalid; | ||
1021 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
1022 | sp->role.invalid = 1; | 1285 | sp->role.invalid = 1; |
1286 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
1023 | kvm_reload_remote_mmus(kvm); | 1287 | kvm_reload_remote_mmus(kvm); |
1024 | if (!sp->role.metaphysical && !invalid) | ||
1025 | unaccount_shadowed(kvm, sp->gfn); | ||
1026 | } | 1288 | } |
1027 | kvm_mmu_reset_last_pte_updated(kvm); | 1289 | kvm_mmu_reset_last_pte_updated(kvm); |
1290 | return ret; | ||
1028 | } | 1291 | } |
1029 | 1292 | ||
1030 | /* | 1293 | /* |
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
1077 | if (sp->gfn == gfn && !sp->role.metaphysical) { | 1340 | if (sp->gfn == gfn && !sp->role.metaphysical) { |
1078 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, | 1341 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, |
1079 | sp->role.word); | 1342 | sp->role.word); |
1080 | kvm_mmu_zap_page(kvm, sp); | ||
1081 | r = 1; | 1343 | r = 1; |
1344 | if (kvm_mmu_zap_page(kvm, sp)) | ||
1345 | n = bucket->first; | ||
1082 | } | 1346 | } |
1083 | return r; | 1347 | return r; |
1084 | } | 1348 | } |
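Since kvm_mmu_zap_page() can now zap unsync children as well, it returns nonzero when it freed pages beyond the one the caller named, and kvm_mmu_unprotect_page() here (like kvm_mmu_pte_write() and kvm_mmu_zap_all() further down) restarts the hash-bucket scan from bucket->first rather than trusting the cursor saved by hlist_for_each_entry_safe(). The stand-alone sketch below shows why the restart matters: the saved next pointer can refer to a node the zap just freed. A plain singly linked list and malloc/free stand in for the hlist bucket and the shadow-page cache.

/* Why the callers restart from bucket->first: zapping one node may free
 * other nodes too, including the one a safe-iterator already saved. */
#include <stdio.h>
#include <stdlib.h>

struct node { int key; struct node *next; };

static struct node *head;

static void push(int key)
{
	struct node *n = malloc(sizeof(*n));

	n->key = key;
	n->next = head;
	head = n;
}

static void unlink_and_free(struct node *victim)
{
	struct node **link = &head;

	while (*link != victim)
		link = &(*link)->next;
	*link = victim->next;
	free(victim);
}

/* zaps n; for odd keys it also zaps n's successor, so any pointer the
 * caller cached into the list may now dangle -- returns 1 in that case */
static int zap(struct node *n)
{
	int extra = 0;

	if ((n->key & 1) && n->next) {
		unlink_and_free(n->next);
		extra = 1;
	}
	unlink_and_free(n);
	return extra;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		push(i);			/* list: 5 4 3 2 1 0 */

	for (struct node *n = head, *next; n; n = next) {
		next = n->next;			/* like hlist_for_each_entry_safe */
		if (n->key % 3 == 0 && zap(n))
			next = head;		/* restart, like n = bucket->first */
	}

	for (struct node *n = head; n; n = n->next)
		printf("%d ", n->key);		/* prints: 5 4 1 */
	printf("\n");
	return 0;
}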
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | |||
1101 | __set_bit(slot, &sp->slot_bitmap); | 1365 | __set_bit(slot, &sp->slot_bitmap); |
1102 | } | 1366 | } |
1103 | 1367 | ||
1368 | static void mmu_convert_notrap(struct kvm_mmu_page *sp) | ||
1369 | { | ||
1370 | int i; | ||
1371 | u64 *pt = sp->spt; | ||
1372 | |||
1373 | if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) | ||
1374 | return; | ||
1375 | |||
1376 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1377 | if (pt[i] == shadow_notrap_nonpresent_pte) | ||
1378 | set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); | ||
1379 | } | ||
1380 | } | ||
1381 | |||
1104 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | 1382 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) |
1105 | { | 1383 | { |
1106 | struct page *page; | 1384 | struct page *page; |
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | |||
1110 | if (gpa == UNMAPPED_GVA) | 1388 | if (gpa == UNMAPPED_GVA) |
1111 | return NULL; | 1389 | return NULL; |
1112 | 1390 | ||
1113 | down_read(¤t->mm->mmap_sem); | ||
1114 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 1391 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
1115 | up_read(¤t->mm->mmap_sem); | ||
1116 | 1392 | ||
1117 | return page; | 1393 | return page; |
1118 | } | 1394 | } |
1119 | 1395 | ||
1120 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1396 | static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) |
1121 | unsigned pt_access, unsigned pte_access, | ||
1122 | int user_fault, int write_fault, int dirty, | ||
1123 | int *ptwrite, int largepage, gfn_t gfn, | ||
1124 | pfn_t pfn, bool speculative) | ||
1125 | { | 1397 | { |
1126 | u64 spte; | 1398 | unsigned index; |
1127 | int was_rmapped = 0; | 1399 | struct hlist_head *bucket; |
1128 | int was_writeble = is_writeble_pte(*shadow_pte); | 1400 | struct kvm_mmu_page *s; |
1401 | struct hlist_node *node, *n; | ||
1129 | 1402 | ||
1130 | pgprintk("%s: spte %llx access %x write_fault %d" | 1403 | index = kvm_page_table_hashfn(sp->gfn); |
1131 | " user_fault %d gfn %lx\n", | 1404 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1132 | __func__, *shadow_pte, pt_access, | 1405 | /* don't unsync if pagetable is shadowed with multiple roles */ |
1133 | write_fault, user_fault, gfn); | 1406 | hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { |
1407 | if (s->gfn != sp->gfn || s->role.metaphysical) | ||
1408 | continue; | ||
1409 | if (s->role.word != sp->role.word) | ||
1410 | return 1; | ||
1411 | } | ||
1412 | kvm_mmu_mark_parents_unsync(vcpu, sp); | ||
1413 | ++vcpu->kvm->stat.mmu_unsync; | ||
1414 | sp->unsync = 1; | ||
1415 | mmu_convert_notrap(sp); | ||
1416 | return 0; | ||
1417 | } | ||
1134 | 1418 | ||
1135 | if (is_rmap_pte(*shadow_pte)) { | 1419 | static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, |
1136 | /* | 1420 | bool can_unsync) |
1137 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | 1421 | { |
1138 | * the parent of the now unreachable PTE. | 1422 | struct kvm_mmu_page *shadow; |
1139 | */ | ||
1140 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1141 | struct kvm_mmu_page *child; | ||
1142 | u64 pte = *shadow_pte; | ||
1143 | 1423 | ||
1144 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1424 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); |
1145 | mmu_page_remove_parent_pte(child, shadow_pte); | 1425 | if (shadow) { |
1146 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | 1426 | if (shadow->role.level != PT_PAGE_TABLE_LEVEL) |
1147 | pgprintk("hfn old %lx new %lx\n", | 1427 | return 1; |
1148 | spte_to_pfn(*shadow_pte), pfn); | 1428 | if (shadow->unsync) |
1149 | rmap_remove(vcpu->kvm, shadow_pte); | 1429 | return 0; |
1150 | } else { | 1430 | if (can_unsync && oos_shadow) |
1151 | if (largepage) | 1431 | return kvm_unsync_page(vcpu, shadow); |
1152 | was_rmapped = is_large_pte(*shadow_pte); | 1432 | return 1; |
1153 | else | ||
1154 | was_rmapped = 1; | ||
1155 | } | ||
1156 | } | 1433 | } |
1434 | return 0; | ||
1435 | } | ||
1157 | 1436 | ||
1437 | static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
1438 | unsigned pte_access, int user_fault, | ||
1439 | int write_fault, int dirty, int largepage, | ||
1440 | gfn_t gfn, pfn_t pfn, bool speculative, | ||
1441 | bool can_unsync) | ||
1442 | { | ||
1443 | u64 spte; | ||
1444 | int ret = 0; | ||
1158 | /* | 1445 | /* |
1159 | * We don't set the accessed bit, since we sometimes want to see | 1446 | * We don't set the accessed bit, since we sometimes want to see |
1160 | * whether the guest actually used the pte (in order to detect | 1447 | * whether the guest actually used the pte (in order to detect |
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1162 | */ | 1449 | */ |
1163 | spte = shadow_base_present_pte | shadow_dirty_mask; | 1450 | spte = shadow_base_present_pte | shadow_dirty_mask; |
1164 | if (!speculative) | 1451 | if (!speculative) |
1165 | pte_access |= PT_ACCESSED_MASK; | 1452 | spte |= shadow_accessed_mask; |
1166 | if (!dirty) | 1453 | if (!dirty) |
1167 | pte_access &= ~ACC_WRITE_MASK; | 1454 | pte_access &= ~ACC_WRITE_MASK; |
1168 | if (pte_access & ACC_EXEC_MASK) | 1455 | if (pte_access & ACC_EXEC_MASK) |
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
1178 | 1465 | ||
1179 | if ((pte_access & ACC_WRITE_MASK) | 1466 | if ((pte_access & ACC_WRITE_MASK) |
1180 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1467 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { |
1181 | struct kvm_mmu_page *shadow; | 1468 | |
1469 | if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { | ||
1470 | ret = 1; | ||
1471 | spte = shadow_trap_nonpresent_pte; | ||
1472 | goto set_pte; | ||
1473 | } | ||
1182 | 1474 | ||
1183 | spte |= PT_WRITABLE_MASK; | 1475 | spte |= PT_WRITABLE_MASK; |
1184 | 1476 | ||
1185 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | 1477 | if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { |
1186 | if (shadow || | ||
1187 | (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { | ||
1188 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 1478 | pgprintk("%s: found shadow page for %lx, marking ro\n", |
1189 | __func__, gfn); | 1479 | __func__, gfn); |
1480 | ret = 1; | ||
1190 | pte_access &= ~ACC_WRITE_MASK; | 1481 | pte_access &= ~ACC_WRITE_MASK; |
1191 | if (is_writeble_pte(spte)) { | 1482 | if (is_writeble_pte(spte)) |
1192 | spte &= ~PT_WRITABLE_MASK; | 1483 | spte &= ~PT_WRITABLE_MASK; |
1193 | kvm_x86_ops->tlb_flush(vcpu); | ||
1194 | } | ||
1195 | if (write_fault) | ||
1196 | *ptwrite = 1; | ||
1197 | } | 1484 | } |
1198 | } | 1485 | } |
1199 | 1486 | ||
1200 | if (pte_access & ACC_WRITE_MASK) | 1487 | if (pte_access & ACC_WRITE_MASK) |
1201 | mark_page_dirty(vcpu->kvm, gfn); | 1488 | mark_page_dirty(vcpu->kvm, gfn); |
1202 | 1489 | ||
1203 | pgprintk("%s: setting spte %llx\n", __func__, spte); | 1490 | set_pte: |
1204 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | ||
1205 | (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", | ||
1206 | (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); | ||
1207 | set_shadow_pte(shadow_pte, spte); | 1491 | set_shadow_pte(shadow_pte, spte); |
1208 | if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) | 1492 | return ret; |
1209 | && (spte & PT_PRESENT_MASK)) | 1493 | } |
1494 | |||
1495 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
1496 | unsigned pt_access, unsigned pte_access, | ||
1497 | int user_fault, int write_fault, int dirty, | ||
1498 | int *ptwrite, int largepage, gfn_t gfn, | ||
1499 | pfn_t pfn, bool speculative) | ||
1500 | { | ||
1501 | int was_rmapped = 0; | ||
1502 | int was_writeble = is_writeble_pte(*shadow_pte); | ||
1503 | |||
1504 | pgprintk("%s: spte %llx access %x write_fault %d" | ||
1505 | " user_fault %d gfn %lx\n", | ||
1506 | __func__, *shadow_pte, pt_access, | ||
1507 | write_fault, user_fault, gfn); | ||
1508 | |||
1509 | if (is_rmap_pte(*shadow_pte)) { | ||
1510 | /* | ||
1511 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | ||
1512 | * the parent of the now unreachable PTE. | ||
1513 | */ | ||
1514 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1515 | struct kvm_mmu_page *child; | ||
1516 | u64 pte = *shadow_pte; | ||
1517 | |||
1518 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1519 | mmu_page_remove_parent_pte(child, shadow_pte); | ||
1520 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | ||
1521 | pgprintk("hfn old %lx new %lx\n", | ||
1522 | spte_to_pfn(*shadow_pte), pfn); | ||
1523 | rmap_remove(vcpu->kvm, shadow_pte); | ||
1524 | } else { | ||
1525 | if (largepage) | ||
1526 | was_rmapped = is_large_pte(*shadow_pte); | ||
1527 | else | ||
1528 | was_rmapped = 1; | ||
1529 | } | ||
1530 | } | ||
1531 | if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, | ||
1532 | dirty, largepage, gfn, pfn, speculative, true)) { | ||
1533 | if (write_fault) | ||
1534 | *ptwrite = 1; | ||
1535 | kvm_x86_ops->tlb_flush(vcpu); | ||
1536 | } | ||
1537 | |||
1538 | pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); | ||
1539 | pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", | ||
1540 | is_large_pte(*shadow_pte)? "2MB" : "4kB", | ||
1541 | is_present_pte(*shadow_pte)?"RW":"R", gfn, | ||
1542 | *shadow_pte, shadow_pte); | ||
1543 | if (!was_rmapped && is_large_pte(*shadow_pte)) | ||
1210 | ++vcpu->kvm->stat.lpages; | 1544 | ++vcpu->kvm->stat.lpages; |
1211 | 1545 | ||
1212 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1546 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); |
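This hunk splits spte construction out of mmu_set_spte() into set_spte() and replaces the old found-a-shadow-page-so-mark-read-only logic with mmu_need_write_protect(): a gfn shadowed above the last level still forces write protection, an already-unsync page stays writable, and otherwise the page may be unsynced through kvm_unsync_page() when the caller allows it and the oos_shadow parameter is set. The runnable sketch below condenses that decision; it is deliberately simplified, since the real kvm_unsync_page() can still demand protection when the gfn is shadowed with multiple roles.

/* Condensed, runnable restatement of the mmu_need_write_protect() decision.
 * The struct and the oos_shadow flag are stand-ins for the kernel types. */
#include <stdbool.h>
#include <stdio.h>

struct shadow_page { int level; bool unsync; };

static bool oos_shadow = true;		/* mirrors the module parameter */

/* returns true when the spte must stay read-only */
static bool need_write_protect(struct shadow_page *shadow, bool can_unsync)
{
	if (!shadow)
		return false;		/* gfn is not shadowed at all */
	if (shadow->level != 1)		/* 1 plays PT_PAGE_TABLE_LEVEL */
		return true;		/* higher-level tables stay protected */
	if (shadow->unsync)
		return false;		/* already out of sync, writes allowed */
	if (can_unsync && oos_shadow) {
		shadow->unsync = true;	/* stand-in for kvm_unsync_page() */
		return false;
	}
	return true;
}

int main(void)
{
	struct shadow_page leaf = { .level = 1, .unsync = false };
	struct shadow_page dir  = { .level = 2, .unsync = false };

	printf("last-level page, can unsync: %d\n", need_write_protect(&leaf, true));
	printf("directory page,  can unsync: %d\n", need_write_protect(&dir, true));
	return 0;
}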
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
1230 | { | 1564 | { |
1231 | } | 1565 | } |
1232 | 1566 | ||
1233 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | 1567 | struct direct_shadow_walk { |
1234 | int largepage, gfn_t gfn, pfn_t pfn, | 1568 | struct kvm_shadow_walk walker; |
1235 | int level) | 1569 | pfn_t pfn; |
1236 | { | 1570 | int write; |
1237 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | 1571 | int largepage; |
1238 | int pt_write = 0; | 1572 | int pt_write; |
1239 | 1573 | }; | |
1240 | for (; ; level--) { | ||
1241 | u32 index = PT64_INDEX(v, level); | ||
1242 | u64 *table; | ||
1243 | |||
1244 | ASSERT(VALID_PAGE(table_addr)); | ||
1245 | table = __va(table_addr); | ||
1246 | 1574 | ||
1247 | if (level == 1) { | 1575 | static int direct_map_entry(struct kvm_shadow_walk *_walk, |
1248 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1576 | struct kvm_vcpu *vcpu, |
1249 | 0, write, 1, &pt_write, 0, gfn, pfn, false); | 1577 | u64 addr, u64 *sptep, int level) |
1250 | return pt_write; | 1578 | { |
1251 | } | 1579 | struct direct_shadow_walk *walk = |
1580 | container_of(_walk, struct direct_shadow_walk, walker); | ||
1581 | struct kvm_mmu_page *sp; | ||
1582 | gfn_t pseudo_gfn; | ||
1583 | gfn_t gfn = addr >> PAGE_SHIFT; | ||
1584 | |||
1585 | if (level == PT_PAGE_TABLE_LEVEL | ||
1586 | || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { | ||
1587 | mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, | ||
1588 | 0, walk->write, 1, &walk->pt_write, | ||
1589 | walk->largepage, gfn, walk->pfn, false); | ||
1590 | ++vcpu->stat.pf_fixed; | ||
1591 | return 1; | ||
1592 | } | ||
1252 | 1593 | ||
1253 | if (largepage && level == 2) { | 1594 | if (*sptep == shadow_trap_nonpresent_pte) { |
1254 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1595 | pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1255 | 0, write, 1, &pt_write, 1, gfn, pfn, false); | 1596 | sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, |
1256 | return pt_write; | 1597 | 1, ACC_ALL, sptep); |
1598 | if (!sp) { | ||
1599 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1600 | kvm_release_pfn_clean(walk->pfn); | ||
1601 | return -ENOMEM; | ||
1257 | } | 1602 | } |
1258 | 1603 | ||
1259 | if (table[index] == shadow_trap_nonpresent_pte) { | 1604 | set_shadow_pte(sptep, |
1260 | struct kvm_mmu_page *new_table; | 1605 | __pa(sp->spt) |
1261 | gfn_t pseudo_gfn; | 1606 | | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1262 | 1607 | | shadow_user_mask | shadow_x_mask); | |
1263 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
1264 | >> PAGE_SHIFT; | ||
1265 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
1266 | v, level - 1, | ||
1267 | 1, ACC_ALL, &table[index]); | ||
1268 | if (!new_table) { | ||
1269 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1270 | kvm_release_pfn_clean(pfn); | ||
1271 | return -ENOMEM; | ||
1272 | } | ||
1273 | |||
1274 | set_shadow_pte(&table[index], | ||
1275 | __pa(new_table->spt) | ||
1276 | | PT_PRESENT_MASK | PT_WRITABLE_MASK | ||
1277 | | shadow_user_mask | shadow_x_mask); | ||
1278 | } | ||
1279 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
1280 | } | 1608 | } |
1609 | return 0; | ||
1610 | } | ||
1611 | |||
1612 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, | ||
1613 | int largepage, gfn_t gfn, pfn_t pfn) | ||
1614 | { | ||
1615 | int r; | ||
1616 | struct direct_shadow_walk walker = { | ||
1617 | .walker = { .entry = direct_map_entry, }, | ||
1618 | .pfn = pfn, | ||
1619 | .largepage = largepage, | ||
1620 | .write = write, | ||
1621 | .pt_write = 0, | ||
1622 | }; | ||
1623 | |||
1624 | r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); | ||
1625 | if (r < 0) | ||
1626 | return r; | ||
1627 | return walker.pt_write; | ||
1281 | } | 1628 | } |
1282 | 1629 | ||
1283 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 1630 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1287 | pfn_t pfn; | 1634 | pfn_t pfn; |
1288 | unsigned long mmu_seq; | 1635 | unsigned long mmu_seq; |
1289 | 1636 | ||
1290 | down_read(¤t->mm->mmap_sem); | ||
1291 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1637 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1292 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1638 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1293 | largepage = 1; | 1639 | largepage = 1; |
1294 | } | 1640 | } |
1295 | 1641 | ||
1296 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1642 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1297 | /* implicit mb(), we'll read before PT lock is unlocked */ | 1643 | smp_rmb(); |
1298 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1644 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1299 | up_read(¤t->mm->mmap_sem); | ||
1300 | 1645 | ||
1301 | /* mmio */ | 1646 | /* mmio */ |
1302 | if (is_error_pfn(pfn)) { | 1647 | if (is_error_pfn(pfn)) { |
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | |||
1308 | if (mmu_notifier_retry(vcpu, mmu_seq)) | 1653 | if (mmu_notifier_retry(vcpu, mmu_seq)) |
1309 | goto out_unlock; | 1654 | goto out_unlock; |
1310 | kvm_mmu_free_some_pages(vcpu); | 1655 | kvm_mmu_free_some_pages(vcpu); |
1311 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, | 1656 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn); |
1312 | PT32E_ROOT_LEVEL); | ||
1313 | spin_unlock(&vcpu->kvm->mmu_lock); | 1657 | spin_unlock(&vcpu->kvm->mmu_lock); |
1314 | 1658 | ||
1315 | 1659 | ||
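nonpaging_map() here, like tdp_page_fault(), FNAME(page_fault) and mmu_guess_page_from_pte_write() in later hunks, no longer holds mmap_sem across gfn_to_pfn(). Instead it samples kvm->mmu_notifier_seq, orders the read with smp_rmb(), performs the translation, and relies on mmu_notifier_retry() under mmu_lock to catch an invalidation that raced with it. The userspace sketch below mirrors that sample/barrier/recheck shape with C11 atomics standing in for the kernel barriers; run single-threaded it trivially succeeds, the point is only the ordering.

/* Userspace mock of the mmu_notifier_seq / smp_rmb() / retry shape: sample
 * the sequence, order the sample before the translation, redo the work if
 * an invalidation bumped the sequence meanwhile.  translate() is a dummy. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong notifier_seq;

static unsigned long translate(unsigned long gfn)
{
	return gfn + 0x1000;			/* pretend gfn_to_pfn() */
}

static unsigned long map_with_retry(unsigned long gfn)
{
	unsigned long seq, pfn;

	do {
		seq = atomic_load_explicit(&notifier_seq, memory_order_acquire);
		/* the acquire plays the role of smp_rmb(): the sample above
		 * is ordered before the translation below */
		pfn = translate(gfn);
		/* the kernel rechecks under mmu_lock via mmu_notifier_retry();
		 * retry if an invalidate ran since the sample */
	} while (seq != atomic_load_explicit(&notifier_seq, memory_order_acquire));

	return pfn;
}

int main(void)
{
	printf("pfn = 0x%lx\n", map_with_retry(0x42));
	return 0;
}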
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1405 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | 1749 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); |
1406 | } | 1750 | } |
1407 | 1751 | ||
1752 | static void mmu_sync_roots(struct kvm_vcpu *vcpu) | ||
1753 | { | ||
1754 | int i; | ||
1755 | struct kvm_mmu_page *sp; | ||
1756 | |||
1757 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1758 | return; | ||
1759 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1760 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1761 | sp = page_header(root); | ||
1762 | mmu_sync_children(vcpu, sp); | ||
1763 | return; | ||
1764 | } | ||
1765 | for (i = 0; i < 4; ++i) { | ||
1766 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1767 | |||
1768 | if (root) { | ||
1769 | root &= PT64_BASE_ADDR_MASK; | ||
1770 | sp = page_header(root); | ||
1771 | mmu_sync_children(vcpu, sp); | ||
1772 | } | ||
1773 | } | ||
1774 | } | ||
1775 | |||
1776 | void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) | ||
1777 | { | ||
1778 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1779 | mmu_sync_roots(vcpu); | ||
1780 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1781 | } | ||
1782 | |||
1408 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | 1783 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) |
1409 | { | 1784 | { |
1410 | return vaddr; | 1785 | return vaddr; |
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1446 | if (r) | 1821 | if (r) |
1447 | return r; | 1822 | return r; |
1448 | 1823 | ||
1449 | down_read(¤t->mm->mmap_sem); | ||
1450 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | 1824 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1451 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 1825 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1452 | largepage = 1; | 1826 | largepage = 1; |
1453 | } | 1827 | } |
1454 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 1828 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1455 | /* implicit mb(), we'll read before PT lock is unlocked */ | 1829 | smp_rmb(); |
1456 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 1830 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1457 | up_read(¤t->mm->mmap_sem); | ||
1458 | if (is_error_pfn(pfn)) { | 1831 | if (is_error_pfn(pfn)) { |
1459 | kvm_release_pfn_clean(pfn); | 1832 | kvm_release_pfn_clean(pfn); |
1460 | return 1; | 1833 | return 1; |
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | |||
1464 | goto out_unlock; | 1837 | goto out_unlock; |
1465 | kvm_mmu_free_some_pages(vcpu); | 1838 | kvm_mmu_free_some_pages(vcpu); |
1466 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | 1839 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, |
1467 | largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); | 1840 | largepage, gfn, pfn); |
1468 | spin_unlock(&vcpu->kvm->mmu_lock); | 1841 | spin_unlock(&vcpu->kvm->mmu_lock); |
1469 | 1842 | ||
1470 | return r; | 1843 | return r; |
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
1489 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 1862 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
1490 | context->free = nonpaging_free; | 1863 | context->free = nonpaging_free; |
1491 | context->prefetch_page = nonpaging_prefetch_page; | 1864 | context->prefetch_page = nonpaging_prefetch_page; |
1865 | context->sync_page = nonpaging_sync_page; | ||
1866 | context->invlpg = nonpaging_invlpg; | ||
1492 | context->root_level = 0; | 1867 | context->root_level = 0; |
1493 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1868 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1494 | context->root_hpa = INVALID_PAGE; | 1869 | context->root_hpa = INVALID_PAGE; |
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
1536 | context->page_fault = paging64_page_fault; | 1911 | context->page_fault = paging64_page_fault; |
1537 | context->gva_to_gpa = paging64_gva_to_gpa; | 1912 | context->gva_to_gpa = paging64_gva_to_gpa; |
1538 | context->prefetch_page = paging64_prefetch_page; | 1913 | context->prefetch_page = paging64_prefetch_page; |
1914 | context->sync_page = paging64_sync_page; | ||
1915 | context->invlpg = paging64_invlpg; | ||
1539 | context->free = paging_free; | 1916 | context->free = paging_free; |
1540 | context->root_level = level; | 1917 | context->root_level = level; |
1541 | context->shadow_root_level = level; | 1918 | context->shadow_root_level = level; |
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
1557 | context->gva_to_gpa = paging32_gva_to_gpa; | 1934 | context->gva_to_gpa = paging32_gva_to_gpa; |
1558 | context->free = paging_free; | 1935 | context->free = paging_free; |
1559 | context->prefetch_page = paging32_prefetch_page; | 1936 | context->prefetch_page = paging32_prefetch_page; |
1937 | context->sync_page = paging32_sync_page; | ||
1938 | context->invlpg = paging32_invlpg; | ||
1560 | context->root_level = PT32_ROOT_LEVEL; | 1939 | context->root_level = PT32_ROOT_LEVEL; |
1561 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1940 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1562 | context->root_hpa = INVALID_PAGE; | 1941 | context->root_hpa = INVALID_PAGE; |
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) | |||
1576 | context->page_fault = tdp_page_fault; | 1955 | context->page_fault = tdp_page_fault; |
1577 | context->free = nonpaging_free; | 1956 | context->free = nonpaging_free; |
1578 | context->prefetch_page = nonpaging_prefetch_page; | 1957 | context->prefetch_page = nonpaging_prefetch_page; |
1958 | context->sync_page = nonpaging_sync_page; | ||
1959 | context->invlpg = nonpaging_invlpg; | ||
1579 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); | 1960 | context->shadow_root_level = kvm_x86_ops->get_tdp_level(); |
1580 | context->root_hpa = INVALID_PAGE; | 1961 | context->root_hpa = INVALID_PAGE; |
1581 | 1962 | ||
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
1647 | spin_lock(&vcpu->kvm->mmu_lock); | 2028 | spin_lock(&vcpu->kvm->mmu_lock); |
1648 | kvm_mmu_free_some_pages(vcpu); | 2029 | kvm_mmu_free_some_pages(vcpu); |
1649 | mmu_alloc_roots(vcpu); | 2030 | mmu_alloc_roots(vcpu); |
2031 | mmu_sync_roots(vcpu); | ||
1650 | spin_unlock(&vcpu->kvm->mmu_lock); | 2032 | spin_unlock(&vcpu->kvm->mmu_lock); |
1651 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | 2033 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); |
1652 | kvm_mmu_flush_tlb(vcpu); | 2034 | kvm_mmu_flush_tlb(vcpu); |
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1767 | return; | 2149 | return; |
1768 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 2150 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1769 | 2151 | ||
1770 | down_read(¤t->mm->mmap_sem); | ||
1771 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { | 2152 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { |
1772 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | 2153 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); |
1773 | vcpu->arch.update_pte.largepage = 1; | 2154 | vcpu->arch.update_pte.largepage = 1; |
1774 | } | 2155 | } |
1775 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; | 2156 | vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; |
1776 | /* implicit mb(), we'll read before PT lock is unlocked */ | 2157 | smp_rmb(); |
1777 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | 2158 | pfn = gfn_to_pfn(vcpu->kvm, gfn); |
1778 | up_read(¤t->mm->mmap_sem); | ||
1779 | 2159 | ||
1780 | if (is_error_pfn(pfn)) { | 2160 | if (is_error_pfn(pfn)) { |
1781 | kvm_release_pfn_clean(pfn); | 2161 | kvm_release_pfn_clean(pfn); |
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1837 | index = kvm_page_table_hashfn(gfn); | 2217 | index = kvm_page_table_hashfn(gfn); |
1838 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 2218 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1839 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | 2219 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { |
1840 | if (sp->gfn != gfn || sp->role.metaphysical) | 2220 | if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) |
1841 | continue; | 2221 | continue; |
1842 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | 2222 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; |
1843 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | 2223 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); |
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1855 | */ | 2235 | */ |
1856 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | 2236 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", |
1857 | gpa, bytes, sp->role.word); | 2237 | gpa, bytes, sp->role.word); |
1858 | kvm_mmu_zap_page(vcpu->kvm, sp); | 2238 | if (kvm_mmu_zap_page(vcpu->kvm, sp)) |
2239 | n = bucket->first; | ||
1859 | ++vcpu->kvm->stat.mmu_flooded; | 2240 | ++vcpu->kvm->stat.mmu_flooded; |
1860 | continue; | 2241 | continue; |
1861 | } | 2242 | } |
@@ -1969,6 +2350,16 @@ out: | |||
1969 | } | 2350 | } |
1970 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | 2351 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); |
1971 | 2352 | ||
2353 | void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) | ||
2354 | { | ||
2355 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2356 | vcpu->arch.mmu.invlpg(vcpu, gva); | ||
2357 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2358 | kvm_mmu_flush_tlb(vcpu); | ||
2359 | ++vcpu->stat.invlpg; | ||
2360 | } | ||
2361 | EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); | ||
2362 | |||
1972 | void kvm_enable_tdp(void) | 2363 | void kvm_enable_tdp(void) |
1973 | { | 2364 | { |
1974 | tdp_enabled = true; | 2365 | tdp_enabled = true; |
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2055 | { | 2446 | { |
2056 | struct kvm_mmu_page *sp; | 2447 | struct kvm_mmu_page *sp; |
2057 | 2448 | ||
2449 | spin_lock(&kvm->mmu_lock); | ||
2058 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | 2450 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { |
2059 | int i; | 2451 | int i; |
2060 | u64 *pt; | 2452 | u64 *pt; |
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
2068 | if (pt[i] & PT_WRITABLE_MASK) | 2460 | if (pt[i] & PT_WRITABLE_MASK) |
2069 | pt[i] &= ~PT_WRITABLE_MASK; | 2461 | pt[i] &= ~PT_WRITABLE_MASK; |
2070 | } | 2462 | } |
2463 | kvm_flush_remote_tlbs(kvm); | ||
2464 | spin_unlock(&kvm->mmu_lock); | ||
2071 | } | 2465 | } |
2072 | 2466 | ||
2073 | void kvm_mmu_zap_all(struct kvm *kvm) | 2467 | void kvm_mmu_zap_all(struct kvm *kvm) |
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) | |||
2076 | 2470 | ||
2077 | spin_lock(&kvm->mmu_lock); | 2471 | spin_lock(&kvm->mmu_lock); |
2078 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | 2472 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) |
2079 | kvm_mmu_zap_page(kvm, sp); | 2473 | if (kvm_mmu_zap_page(kvm, sp)) |
2474 | node = container_of(kvm->arch.active_mmu_pages.next, | ||
2475 | struct kvm_mmu_page, link); | ||
2080 | spin_unlock(&kvm->mmu_lock); | 2476 | spin_unlock(&kvm->mmu_lock); |
2081 | 2477 | ||
2082 | kvm_flush_remote_tlbs(kvm); | 2478 | kvm_flush_remote_tlbs(kvm); |
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | |||
2291 | gpa_t addr, unsigned long *ret) | 2687 | gpa_t addr, unsigned long *ret) |
2292 | { | 2688 | { |
2293 | int r; | 2689 | int r; |
2294 | struct kvm_pv_mmu_op_buffer buffer; | 2690 | struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; |
2295 | 2691 | ||
2296 | buffer.ptr = buffer.buf; | 2692 | buffer->ptr = buffer->buf; |
2297 | buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); | 2693 | buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); |
2298 | buffer.processed = 0; | 2694 | buffer->processed = 0; |
2299 | 2695 | ||
2300 | r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); | 2696 | r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); |
2301 | if (r) | 2697 | if (r) |
2302 | goto out; | 2698 | goto out; |
2303 | 2699 | ||
2304 | while (buffer.len) { | 2700 | while (buffer->len) { |
2305 | r = kvm_pv_mmu_op_one(vcpu, &buffer); | 2701 | r = kvm_pv_mmu_op_one(vcpu, buffer); |
2306 | if (r < 0) | 2702 | if (r < 0) |
2307 | goto out; | 2703 | goto out; |
2308 | if (r == 0) | 2704 | if (r == 0) |
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | |||
2311 | 2707 | ||
2312 | r = 1; | 2708 | r = 1; |
2313 | out: | 2709 | out: |
2314 | *ret = buffer.processed; | 2710 | *ret = buffer->processed; |
2315 | return r; | 2711 | return r; |
2316 | } | 2712 | } |
2317 | 2713 | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4a814bff21f2..613ec9aa674a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -25,11 +25,11 @@ | |||
25 | #if PTTYPE == 64 | 25 | #if PTTYPE == 64 |
26 | #define pt_element_t u64 | 26 | #define pt_element_t u64 |
27 | #define guest_walker guest_walker64 | 27 | #define guest_walker guest_walker64 |
28 | #define shadow_walker shadow_walker64 | ||
28 | #define FNAME(name) paging##64_##name | 29 | #define FNAME(name) paging##64_##name |
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | 30 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK |
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | 31 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK |
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | 32 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) |
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | 33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) |
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | 34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS |
35 | #ifdef CONFIG_X86_64 | 35 | #ifdef CONFIG_X86_64 |
@@ -42,11 +42,11 @@ | |||
42 | #elif PTTYPE == 32 | 42 | #elif PTTYPE == 32 |
43 | #define pt_element_t u32 | 43 | #define pt_element_t u32 |
44 | #define guest_walker guest_walker32 | 44 | #define guest_walker guest_walker32 |
45 | #define shadow_walker shadow_walker32 | ||
45 | #define FNAME(name) paging##32_##name | 46 | #define FNAME(name) paging##32_##name |
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | 47 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK |
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | 48 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK |
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | 49 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) |
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | 50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) |
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | 51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS |
52 | #define PT_MAX_FULL_LEVELS 2 | 52 | #define PT_MAX_FULL_LEVELS 2 |
@@ -73,6 +73,17 @@ struct guest_walker { | |||
73 | u32 error_code; | 73 | u32 error_code; |
74 | }; | 74 | }; |
75 | 75 | ||
76 | struct shadow_walker { | ||
77 | struct kvm_shadow_walk walker; | ||
78 | struct guest_walker *guest_walker; | ||
79 | int user_fault; | ||
80 | int write_fault; | ||
81 | int largepage; | ||
82 | int *ptwrite; | ||
83 | pfn_t pfn; | ||
84 | u64 *sptep; | ||
85 | }; | ||
86 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | 87 | static gfn_t gpte_to_gfn(pt_element_t gpte) |
77 | { | 88 | { |
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | 89 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; |
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | |||
91 | pt_element_t *table; | 102 | pt_element_t *table; |
92 | struct page *page; | 103 | struct page *page; |
93 | 104 | ||
94 | down_read(¤t->mm->mmap_sem); | ||
95 | page = gfn_to_page(kvm, table_gfn); | 105 | page = gfn_to_page(kvm, table_gfn); |
96 | up_read(¤t->mm->mmap_sem); | ||
97 | 106 | ||
98 | table = kmap_atomic(page, KM_USER0); | 107 | table = kmap_atomic(page, KM_USER0); |
99 | |||
100 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | 108 | ret = CMPXCHG(&table[index], orig_pte, new_pte); |
101 | |||
102 | kunmap_atomic(table, KM_USER0); | 109 | kunmap_atomic(table, KM_USER0); |
103 | 110 | ||
104 | kvm_release_page_dirty(page); | 111 | kvm_release_page_dirty(page); |
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
274 | /* | 281 | /* |
275 | * Fetch a shadow pte for a specific level in the paging hierarchy. | 282 | * Fetch a shadow pte for a specific level in the paging hierarchy. |
276 | */ | 283 | */ |
277 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 284 | static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, |
278 | struct guest_walker *walker, | 285 | struct kvm_vcpu *vcpu, u64 addr, |
279 | int user_fault, int write_fault, int largepage, | 286 | u64 *sptep, int level) |
280 | int *ptwrite, pfn_t pfn) | ||
281 | { | 287 | { |
282 | hpa_t shadow_addr; | 288 | struct shadow_walker *sw = |
283 | int level; | 289 | container_of(_sw, struct shadow_walker, walker); |
284 | u64 *shadow_ent; | 290 | struct guest_walker *gw = sw->guest_walker; |
285 | unsigned access = walker->pt_access; | 291 | unsigned access = gw->pt_access; |
286 | 292 | struct kvm_mmu_page *shadow_page; | |
287 | if (!is_present_pte(walker->ptes[walker->level - 1])) | 293 | u64 spte; |
288 | return NULL; | 294 | int metaphysical; |
289 | 295 | gfn_t table_gfn; | |
290 | shadow_addr = vcpu->arch.mmu.root_hpa; | 296 | int r; |
291 | level = vcpu->arch.mmu.shadow_root_level; | 297 | pt_element_t curr_pte; |
292 | if (level == PT32E_ROOT_LEVEL) { | 298 | |
293 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | 299 | if (level == PT_PAGE_TABLE_LEVEL |
294 | shadow_addr &= PT64_BASE_ADDR_MASK; | 300 | || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { |
295 | --level; | 301 | mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, |
302 | sw->user_fault, sw->write_fault, | ||
303 | gw->ptes[gw->level-1] & PT_DIRTY_MASK, | ||
304 | sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, | ||
305 | false); | ||
306 | sw->sptep = sptep; | ||
307 | return 1; | ||
296 | } | 308 | } |
297 | 309 | ||
298 | for (; ; level--) { | 310 | if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) |
299 | u32 index = SHADOW_PT_INDEX(addr, level); | 311 | return 0; |
300 | struct kvm_mmu_page *shadow_page; | ||
301 | u64 shadow_pte; | ||
302 | int metaphysical; | ||
303 | gfn_t table_gfn; | ||
304 | |||
305 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
306 | if (level == PT_PAGE_TABLE_LEVEL) | ||
307 | break; | ||
308 | |||
309 | if (largepage && level == PT_DIRECTORY_LEVEL) | ||
310 | break; | ||
311 | 312 | ||
312 | if (is_shadow_present_pte(*shadow_ent) | 313 | if (is_large_pte(*sptep)) { |
313 | && !is_large_pte(*shadow_ent)) { | 314 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); |
314 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | 315 | kvm_flush_remote_tlbs(vcpu->kvm); |
315 | continue; | 316 | rmap_remove(vcpu->kvm, sptep); |
316 | } | 317 | } |
317 | 318 | ||
318 | if (is_large_pte(*shadow_ent)) | 319 | if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { |
319 | rmap_remove(vcpu->kvm, shadow_ent); | 320 | metaphysical = 1; |
320 | 321 | if (!is_dirty_pte(gw->ptes[level - 1])) | |
321 | if (level - 1 == PT_PAGE_TABLE_LEVEL | 322 | access &= ~ACC_WRITE_MASK; |
322 | && walker->level == PT_DIRECTORY_LEVEL) { | 323 | table_gfn = gpte_to_gfn(gw->ptes[level - 1]); |
323 | metaphysical = 1; | 324 | } else { |
324 | if (!is_dirty_pte(walker->ptes[level - 1])) | 325 | metaphysical = 0; |
325 | access &= ~ACC_WRITE_MASK; | 326 | table_gfn = gw->table_gfn[level - 2]; |
326 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | 327 | } |
327 | } else { | 328 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, |
328 | metaphysical = 0; | 329 | metaphysical, access, sptep); |
329 | table_gfn = walker->table_gfn[level - 2]; | 330 | if (!metaphysical) { |
330 | } | 331 | r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], |
331 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | 332 | &curr_pte, sizeof(curr_pte)); |
332 | metaphysical, access, | 333 | if (r || curr_pte != gw->ptes[level - 2]) { |
333 | shadow_ent); | 334 | kvm_release_pfn_clean(sw->pfn); |
334 | if (!metaphysical) { | 335 | sw->sptep = NULL; |
335 | int r; | 336 | return 1; |
336 | pt_element_t curr_pte; | ||
337 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
338 | walker->pte_gpa[level - 2], | ||
339 | &curr_pte, sizeof(curr_pte)); | ||
340 | if (r || curr_pte != walker->ptes[level - 2]) { | ||
341 | kvm_release_pfn_clean(pfn); | ||
342 | return NULL; | ||
343 | } | ||
344 | } | 337 | } |
345 | shadow_addr = __pa(shadow_page->spt); | ||
346 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
347 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
348 | set_shadow_pte(shadow_ent, shadow_pte); | ||
349 | } | 338 | } |
350 | 339 | ||
351 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | 340 | spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK |
352 | user_fault, write_fault, | 341 | | PT_WRITABLE_MASK | PT_USER_MASK; |
353 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | 342 | *sptep = spte; |
354 | ptwrite, largepage, walker->gfn, pfn, false); | 343 | return 0; |
344 | } | ||
345 | |||
346 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
347 | struct guest_walker *guest_walker, | ||
348 | int user_fault, int write_fault, int largepage, | ||
349 | int *ptwrite, pfn_t pfn) | ||
350 | { | ||
351 | struct shadow_walker walker = { | ||
352 | .walker = { .entry = FNAME(shadow_walk_entry), }, | ||
353 | .guest_walker = guest_walker, | ||
354 | .user_fault = user_fault, | ||
355 | .write_fault = write_fault, | ||
356 | .largepage = largepage, | ||
357 | .ptwrite = ptwrite, | ||
358 | .pfn = pfn, | ||
359 | }; | ||
360 | |||
361 | if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) | ||
362 | return NULL; | ||
363 | |||
364 | walk_shadow(&walker.walker, vcpu, addr); | ||
355 | 365 | ||
356 | return shadow_ent; | 366 | return walker.sptep; |
357 | } | 367 | } |
358 | 368 | ||
359 | /* | 369 | /* |
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
407 | return 0; | 417 | return 0; |
408 | } | 418 | } |
409 | 419 | ||
410 | down_read(¤t->mm->mmap_sem); | ||
411 | if (walker.level == PT_DIRECTORY_LEVEL) { | 420 | if (walker.level == PT_DIRECTORY_LEVEL) { |
412 | gfn_t large_gfn; | 421 | gfn_t large_gfn; |
413 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); | 422 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); |
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
417 | } | 426 | } |
418 | } | 427 | } |
419 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | 428 | mmu_seq = vcpu->kvm->mmu_notifier_seq; |
420 | /* implicit mb(), we'll read before PT lock is unlocked */ | 429 | smp_rmb(); |
421 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | 430 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); |
422 | up_read(¤t->mm->mmap_sem); | ||
423 | 431 | ||
424 | /* mmio */ | 432 | /* mmio */ |
425 | if (is_error_pfn(pfn)) { | 433 | if (is_error_pfn(pfn)) { |
@@ -453,6 +461,31 @@ out_unlock: | |||
453 | return 0; | 461 | return 0; |
454 | } | 462 | } |
455 | 463 | ||
464 | static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, | ||
465 | struct kvm_vcpu *vcpu, u64 addr, | ||
466 | u64 *sptep, int level) | ||
467 | { | ||
468 | |||
469 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
470 | if (is_shadow_present_pte(*sptep)) | ||
471 | rmap_remove(vcpu->kvm, sptep); | ||
472 | set_shadow_pte(sptep, shadow_trap_nonpresent_pte); | ||
473 | return 1; | ||
474 | } | ||
475 | if (!is_shadow_present_pte(*sptep)) | ||
476 | return 1; | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) | ||
481 | { | ||
482 | struct shadow_walker walker = { | ||
483 | .walker = { .entry = FNAME(shadow_invlpg_entry), }, | ||
484 | }; | ||
485 | |||
486 | walk_shadow(&walker.walker, vcpu, gva); | ||
487 | } | ||
488 | |||
456 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | 489 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) |
457 | { | 490 | { |
458 | struct guest_walker walker; | 491 | struct guest_walker walker; |
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | |||
499 | } | 532 | } |
500 | } | 533 | } |
501 | 534 | ||
535 | /* | ||
536 | * Using the cached information from sp->gfns is safe because: | ||
537 | * - The spte has a reference to the struct page, so the pfn for a given gfn | ||
538 | * can't change unless all sptes pointing to it are nuked first. | ||
539 | * - Alias changes zap the entire shadow cache. | ||
540 | */ | ||
541 | static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) | ||
542 | { | ||
543 | int i, offset, nr_present; | ||
544 | |||
545 | offset = nr_present = 0; | ||
546 | |||
547 | if (PTTYPE == 32) | ||
548 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
549 | |||
550 | for (i = 0; i < PT64_ENT_PER_PAGE; i++) { | ||
551 | unsigned pte_access; | ||
552 | pt_element_t gpte; | ||
553 | gpa_t pte_gpa; | ||
554 | gfn_t gfn = sp->gfns[i]; | ||
555 | |||
556 | if (!is_shadow_present_pte(sp->spt[i])) | ||
557 | continue; | ||
558 | |||
559 | pte_gpa = gfn_to_gpa(sp->gfn); | ||
560 | pte_gpa += (i+offset) * sizeof(pt_element_t); | ||
561 | |||
562 | if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, | ||
563 | sizeof(pt_element_t))) | ||
564 | return -EINVAL; | ||
565 | |||
566 | if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || | ||
567 | !(gpte & PT_ACCESSED_MASK)) { | ||
568 | u64 nonpresent; | ||
569 | |||
570 | rmap_remove(vcpu->kvm, &sp->spt[i]); | ||
571 | if (is_present_pte(gpte)) | ||
572 | nonpresent = shadow_trap_nonpresent_pte; | ||
573 | else | ||
574 | nonpresent = shadow_notrap_nonpresent_pte; | ||
575 | set_shadow_pte(&sp->spt[i], nonpresent); | ||
576 | continue; | ||
577 | } | ||
578 | |||
579 | nr_present++; | ||
580 | pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
581 | set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, | ||
582 | is_dirty_pte(gpte), 0, gfn, | ||
583 | spte_to_pfn(sp->spt[i]), true, false); | ||
584 | } | ||
585 | |||
586 | return !nr_present; | ||
587 | } | ||
588 | |||
502 | #undef pt_element_t | 589 | #undef pt_element_t |
503 | #undef guest_walker | 590 | #undef guest_walker |
591 | #undef shadow_walker | ||
504 | #undef FNAME | 592 | #undef FNAME |
505 | #undef PT_BASE_ADDR_MASK | 593 | #undef PT_BASE_ADDR_MASK |
506 | #undef PT_INDEX | 594 | #undef PT_INDEX |
507 | #undef SHADOW_PT_INDEX | ||
508 | #undef PT_LEVEL_MASK | 595 | #undef PT_LEVEL_MASK |
509 | #undef PT_DIR_BASE_ADDR_MASK | 596 | #undef PT_DIR_BASE_ADDR_MASK |
510 | #undef PT_LEVEL_BITS | 597 | #undef PT_LEVEL_BITS |
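One subtle line in the new FNAME(sync_page) is the 32-bit quadrant offset. With PTTYPE == 32 a guest page table holds 1024 4-byte entries, while a shadow page tracks only 512 sptes, so each shadow page covers half of the guest page and sp->role.quadrant selects which half. A hedged worked example of the gpa arithmetic for quadrant 1:

/* Sketch: quadrant 1, PT64_LEVEL_BITS == 9, so offset = 1 << 9 = 512.
 * Slot i of the shadow page resyncs guest entry (i + 512), i.e. byte
 * offset (i + 512) * sizeof(u32) inside the guest page table page.   */
gpa_t pte_gpa = gfn_to_gpa(sp->gfn)
		+ (i + (1 << PT64_LEVEL_BITS)) * sizeof(pt_element_t);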
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8233b86c778c..9c4ce657d963 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include "kvm_svm.h" | 18 | #include "kvm_svm.h" |
19 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | 20 | #include "mmu.h" |
21 | #include "kvm_cache_regs.h" | ||
21 | 22 | ||
22 | #include <linux/module.h> | 23 | #include <linux/module.h> |
23 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL"); | |||
35 | #define IOPM_ALLOC_ORDER 2 | 36 | #define IOPM_ALLOC_ORDER 2 |
36 | #define MSRPM_ALLOC_ORDER 1 | 37 | #define MSRPM_ALLOC_ORDER 1 |
37 | 38 | ||
38 | #define DB_VECTOR 1 | ||
39 | #define UD_VECTOR 6 | ||
40 | #define GP_VECTOR 13 | ||
41 | |||
42 | #define DR7_GD_MASK (1 << 13) | 39 | #define DR7_GD_MASK (1 << 13) |
43 | #define DR6_BD_MASK (1 << 13) | 40 | #define DR6_BD_MASK (1 << 13) |
44 | 41 | ||
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL"); | |||
47 | 44 | ||
48 | #define SVM_FEATURE_NPT (1 << 0) | 45 | #define SVM_FEATURE_NPT (1 << 0) |
49 | #define SVM_FEATURE_LBRV (1 << 1) | 46 | #define SVM_FEATURE_LBRV (1 << 1) |
50 | #define SVM_DEATURE_SVML (1 << 2) | 47 | #define SVM_FEATURE_SVML (1 << 2) |
51 | 48 | ||
52 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | 49 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) |
53 | 50 | ||
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
236 | printk(KERN_DEBUG "%s: NOP\n", __func__); | 233 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
237 | return; | 234 | return; |
238 | } | 235 | } |
239 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) | 236 | if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) |
240 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | 237 | printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", |
241 | __func__, | 238 | __func__, kvm_rip_read(vcpu), svm->next_rip); |
242 | svm->vmcb->save.rip, | ||
243 | svm->next_rip); | ||
244 | 239 | ||
245 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; | 240 | kvm_rip_write(vcpu, svm->next_rip); |
246 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 241 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; |
247 | 242 | ||
248 | vcpu->arch.interrupt_window_open = 1; | 243 | vcpu->arch.interrupt_window_open = 1; |
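Direct reads of svm->vmcb->save.rip give way to kvm_rip_read()/kvm_rip_write(), which come from the new "kvm_cache_regs.h" include and sit on a lazy register cache keyed by vcpu->arch.regs_avail and regs_dirty (also visible in the svm_vcpu_reset hunk below). Roughly, and hedged as a sketch rather than a verbatim copy of that header:

/* Sketch of the accessors kvm_cache_regs.h is assumed to provide. */
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
					      enum kvm_reg reg)
{
	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
		kvm_x86_ops->cache_reg(vcpu, reg);	/* pull from VMCS/VMCB */
	return vcpu->arch.regs[reg];
}

static inline void kvm_register_write(struct kvm_vcpu *vcpu,
				      enum kvm_reg reg, unsigned long val)
{
	vcpu->arch.regs[reg] = val;
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);	/* write back before entry */
}

static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
	return kvm_register_read(vcpu, VCPU_REGS_RIP);
}

static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
	kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}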
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
530 | (1ULL << INTERCEPT_CPUID) | | 525 | (1ULL << INTERCEPT_CPUID) | |
531 | (1ULL << INTERCEPT_INVD) | | 526 | (1ULL << INTERCEPT_INVD) | |
532 | (1ULL << INTERCEPT_HLT) | | 527 | (1ULL << INTERCEPT_HLT) | |
528 | (1ULL << INTERCEPT_INVLPG) | | ||
533 | (1ULL << INTERCEPT_INVLPGA) | | 529 | (1ULL << INTERCEPT_INVLPGA) | |
534 | (1ULL << INTERCEPT_IOIO_PROT) | | 530 | (1ULL << INTERCEPT_IOIO_PROT) | |
535 | (1ULL << INTERCEPT_MSR_PROT) | | 531 | (1ULL << INTERCEPT_MSR_PROT) | |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
581 | save->dr7 = 0x400; | 577 | save->dr7 = 0x400; |
582 | save->rflags = 2; | 578 | save->rflags = 2; |
583 | save->rip = 0x0000fff0; | 579 | save->rip = 0x0000fff0; |
580 | svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; | ||
584 | 581 | ||
585 | /* | 582 | /* |
586 | * cr0 val on cpu init should be 0x60000010, we enable cpu | 583 | * cr0 val on cpu init should be 0x60000010, we enable cpu |
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) | |||
593 | if (npt_enabled) { | 590 | if (npt_enabled) { |
594 | /* Setup VMCB for Nested Paging */ | 591 | /* Setup VMCB for Nested Paging */ |
595 | control->nested_ctl = 1; | 592 | control->nested_ctl = 1; |
596 | control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); | 593 | control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | |
594 | (1ULL << INTERCEPT_INVLPG)); | ||
597 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | 595 | control->intercept_exceptions &= ~(1 << PF_VECTOR); |
598 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| | 596 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| |
599 | INTERCEPT_CR3_MASK); | 597 | INTERCEPT_CR3_MASK); |
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
615 | init_vmcb(svm); | 613 | init_vmcb(svm); |
616 | 614 | ||
617 | if (vcpu->vcpu_id != 0) { | 615 | if (vcpu->vcpu_id != 0) { |
618 | svm->vmcb->save.rip = 0; | 616 | kvm_rip_write(vcpu, 0); |
619 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | 617 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
620 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | 618 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
621 | } | 619 | } |
620 | vcpu->arch.regs_avail = ~0; | ||
621 | vcpu->arch.regs_dirty = ~0; | ||
622 | 622 | ||
623 | return 0; | 623 | return 0; |
624 | } | 624 | } |
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
721 | rdtscll(vcpu->arch.host_tsc); | 721 | rdtscll(vcpu->arch.host_tsc); |
722 | } | 722 | } |
723 | 723 | ||
724 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
725 | { | ||
726 | struct vcpu_svm *svm = to_svm(vcpu); | ||
727 | |||
728 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
729 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
730 | vcpu->arch.rip = svm->vmcb->save.rip; | ||
731 | } | ||
732 | |||
733 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
734 | { | ||
735 | struct vcpu_svm *svm = to_svm(vcpu); | ||
736 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
737 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
738 | svm->vmcb->save.rip = vcpu->arch.rip; | ||
739 | } | ||
740 | |||
741 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 724 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
742 | { | 725 | { |
743 | return to_svm(vcpu)->vmcb->save.rflags; | 726 | return to_svm(vcpu)->vmcb->save.rflags; |
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1040 | if (npt_enabled) | 1023 | if (npt_enabled) |
1041 | svm_flush_tlb(&svm->vcpu); | 1024 | svm_flush_tlb(&svm->vcpu); |
1042 | 1025 | ||
1043 | if (event_injection) | 1026 | if (!npt_enabled && event_injection) |
1044 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); | 1027 | kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); |
1045 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 1028 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
1046 | } | 1029 | } |
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1139 | 1122 | ||
1140 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1123 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1141 | { | 1124 | { |
1142 | svm->next_rip = svm->vmcb->save.rip + 1; | 1125 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; |
1143 | skip_emulated_instruction(&svm->vcpu); | 1126 | skip_emulated_instruction(&svm->vcpu); |
1144 | return kvm_emulate_halt(&svm->vcpu); | 1127 | return kvm_emulate_halt(&svm->vcpu); |
1145 | } | 1128 | } |
1146 | 1129 | ||
1147 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1130 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1148 | { | 1131 | { |
1149 | svm->next_rip = svm->vmcb->save.rip + 3; | 1132 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; |
1150 | skip_emulated_instruction(&svm->vcpu); | 1133 | skip_emulated_instruction(&svm->vcpu); |
1151 | kvm_emulate_hypercall(&svm->vcpu); | 1134 | kvm_emulate_hypercall(&svm->vcpu); |
1152 | return 1; | 1135 | return 1; |
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm, | |||
1178 | 1161 | ||
1179 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1162 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1180 | { | 1163 | { |
1181 | svm->next_rip = svm->vmcb->save.rip + 2; | 1164 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1182 | kvm_emulate_cpuid(&svm->vcpu); | 1165 | kvm_emulate_cpuid(&svm->vcpu); |
1183 | return 1; | 1166 | return 1; |
1184 | } | 1167 | } |
1185 | 1168 | ||
1169 | static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1170 | { | ||
1171 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) | ||
1172 | pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); | ||
1173 | return 1; | ||
1174 | } | ||
1175 | |||
1186 | static int emulate_on_interception(struct vcpu_svm *svm, | 1176 | static int emulate_on_interception(struct vcpu_svm *svm, |
1187 | struct kvm_run *kvm_run) | 1177 | struct kvm_run *kvm_run) |
1188 | { | 1178 | { |
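With nested paging disabled, INVLPG is now intercepted (see the init_vmcb hunk above) and routed to invlpg_interception(), which simply re-runs the instruction through the emulator; the emulator is then expected to reach the MMU's new per-mode invlpg path (FNAME(invlpg) earlier in this diff). The glue is not part of the hunks shown here, but it presumably looks roughly like this:

/* Assumed glue, roughly: the emulator's invlpg hook calls into the MMU. */
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	vcpu->arch.mmu.invlpg(vcpu, gva);	/* FNAME(invlpg) from paging_tmpl.h */
	kvm_mmu_flush_tlb(vcpu);		/* hardware TLB still needs flushing */
}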
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1273 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, | 1263 | KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, |
1274 | (u32)(data >> 32), handler); | 1264 | (u32)(data >> 32), handler); |
1275 | 1265 | ||
1276 | svm->vmcb->save.rax = data & 0xffffffff; | 1266 | svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; |
1277 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | 1267 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
1278 | svm->next_rip = svm->vmcb->save.rip + 2; | 1268 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1279 | skip_emulated_instruction(&svm->vcpu); | 1269 | skip_emulated_instruction(&svm->vcpu); |
1280 | } | 1270 | } |
1281 | return 1; | 1271 | return 1; |
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
1359 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1349 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1360 | { | 1350 | { |
1361 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | 1351 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
1362 | u64 data = (svm->vmcb->save.rax & -1u) | 1352 | u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) |
1363 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 1353 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
1364 | 1354 | ||
1365 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), | 1355 | KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), |
1366 | handler); | 1356 | handler); |
1367 | 1357 | ||
1368 | svm->next_rip = svm->vmcb->save.rip + 2; | 1358 | svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; |
1369 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 1359 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
1370 | kvm_inject_gp(&svm->vcpu, 0); | 1360 | kvm_inject_gp(&svm->vcpu, 0); |
1371 | else | 1361 | else |
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1436 | [SVM_EXIT_CPUID] = cpuid_interception, | 1426 | [SVM_EXIT_CPUID] = cpuid_interception, |
1437 | [SVM_EXIT_INVD] = emulate_on_interception, | 1427 | [SVM_EXIT_INVD] = emulate_on_interception, |
1438 | [SVM_EXIT_HLT] = halt_interception, | 1428 | [SVM_EXIT_HLT] = halt_interception, |
1439 | [SVM_EXIT_INVLPG] = emulate_on_interception, | 1429 | [SVM_EXIT_INVLPG] = invlpg_interception, |
1440 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | 1430 | [SVM_EXIT_INVLPGA] = invalid_op_interception, |
1441 | [SVM_EXIT_IOIO] = io_interception, | 1431 | [SVM_EXIT_IOIO] = io_interception, |
1442 | [SVM_EXIT_MSR] = msr_interception, | 1432 | [SVM_EXIT_MSR] = msr_interception, |
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | |||
1538 | 1528 | ||
1539 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); | 1529 | KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); |
1540 | 1530 | ||
1531 | ++svm->vcpu.stat.irq_injections; | ||
1541 | control = &svm->vmcb->control; | 1532 | control = &svm->vmcb->control; |
1542 | control->int_vector = irq; | 1533 | control->int_vector = irq; |
1543 | control->int_ctl &= ~V_INTR_PRIO_MASK; | 1534 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | |||
1716 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; | 1707 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; |
1717 | } | 1708 | } |
1718 | 1709 | ||
1710 | #ifdef CONFIG_X86_64 | ||
1711 | #define R "r" | ||
1712 | #else | ||
1713 | #define R "e" | ||
1714 | #endif | ||
1715 | |||
1719 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1716 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1720 | { | 1717 | { |
1721 | struct vcpu_svm *svm = to_svm(vcpu); | 1718 | struct vcpu_svm *svm = to_svm(vcpu); |
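The new R macro collapses the duplicated 32/64-bit inline-asm blocks in svm_vcpu_run(): every "%%"R"bx" expands to %%rbx on x86_64 and %%ebx on i386 through C string concatenation, so a single asm body serves both builds. A small standalone illustration (not kernel code, just the string-pasting mechanism):

#include <stdio.h>

#ifdef __x86_64__
#define R "r"
#else
#define R "e"
#endif

int main(void)
{
	/* "mov %%"R"bx, %0" becomes "mov %%rbx, %0" or "mov %%ebx, %0". */
	puts("mov %%" R "bx, %0");
	return 0;
}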
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1723 | u16 gs_selector; | 1720 | u16 gs_selector; |
1724 | u16 ldt_selector; | 1721 | u16 ldt_selector; |
1725 | 1722 | ||
1723 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
1724 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
1725 | svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; | ||
1726 | |||
1726 | pre_svm_run(svm); | 1727 | pre_svm_run(svm); |
1727 | 1728 | ||
1728 | sync_lapic_to_cr8(vcpu); | 1729 | sync_lapic_to_cr8(vcpu); |
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1750 | local_irq_enable(); | 1751 | local_irq_enable(); |
1751 | 1752 | ||
1752 | asm volatile ( | 1753 | asm volatile ( |
1754 | "push %%"R"bp; \n\t" | ||
1755 | "mov %c[rbx](%[svm]), %%"R"bx \n\t" | ||
1756 | "mov %c[rcx](%[svm]), %%"R"cx \n\t" | ||
1757 | "mov %c[rdx](%[svm]), %%"R"dx \n\t" | ||
1758 | "mov %c[rsi](%[svm]), %%"R"si \n\t" | ||
1759 | "mov %c[rdi](%[svm]), %%"R"di \n\t" | ||
1760 | "mov %c[rbp](%[svm]), %%"R"bp \n\t" | ||
1753 | #ifdef CONFIG_X86_64 | 1761 | #ifdef CONFIG_X86_64 |
1754 | "push %%rbp; \n\t" | ||
1755 | #else | ||
1756 | "push %%ebp; \n\t" | ||
1757 | #endif | ||
1758 | |||
1759 | #ifdef CONFIG_X86_64 | ||
1760 | "mov %c[rbx](%[svm]), %%rbx \n\t" | ||
1761 | "mov %c[rcx](%[svm]), %%rcx \n\t" | ||
1762 | "mov %c[rdx](%[svm]), %%rdx \n\t" | ||
1763 | "mov %c[rsi](%[svm]), %%rsi \n\t" | ||
1764 | "mov %c[rdi](%[svm]), %%rdi \n\t" | ||
1765 | "mov %c[rbp](%[svm]), %%rbp \n\t" | ||
1766 | "mov %c[r8](%[svm]), %%r8 \n\t" | 1762 | "mov %c[r8](%[svm]), %%r8 \n\t" |
1767 | "mov %c[r9](%[svm]), %%r9 \n\t" | 1763 | "mov %c[r9](%[svm]), %%r9 \n\t" |
1768 | "mov %c[r10](%[svm]), %%r10 \n\t" | 1764 | "mov %c[r10](%[svm]), %%r10 \n\t" |
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1771 | "mov %c[r13](%[svm]), %%r13 \n\t" | 1767 | "mov %c[r13](%[svm]), %%r13 \n\t" |
1772 | "mov %c[r14](%[svm]), %%r14 \n\t" | 1768 | "mov %c[r14](%[svm]), %%r14 \n\t" |
1773 | "mov %c[r15](%[svm]), %%r15 \n\t" | 1769 | "mov %c[r15](%[svm]), %%r15 \n\t" |
1774 | #else | ||
1775 | "mov %c[rbx](%[svm]), %%ebx \n\t" | ||
1776 | "mov %c[rcx](%[svm]), %%ecx \n\t" | ||
1777 | "mov %c[rdx](%[svm]), %%edx \n\t" | ||
1778 | "mov %c[rsi](%[svm]), %%esi \n\t" | ||
1779 | "mov %c[rdi](%[svm]), %%edi \n\t" | ||
1780 | "mov %c[rbp](%[svm]), %%ebp \n\t" | ||
1781 | #endif | 1770 | #endif |
1782 | 1771 | ||
1783 | #ifdef CONFIG_X86_64 | ||
1784 | /* Enter guest mode */ | ||
1785 | "push %%rax \n\t" | ||
1786 | "mov %c[vmcb](%[svm]), %%rax \n\t" | ||
1787 | __ex(SVM_VMLOAD) "\n\t" | ||
1788 | __ex(SVM_VMRUN) "\n\t" | ||
1789 | __ex(SVM_VMSAVE) "\n\t" | ||
1790 | "pop %%rax \n\t" | ||
1791 | #else | ||
1792 | /* Enter guest mode */ | 1772 | /* Enter guest mode */ |
1793 | "push %%eax \n\t" | 1773 | "push %%"R"ax \n\t" |
1794 | "mov %c[vmcb](%[svm]), %%eax \n\t" | 1774 | "mov %c[vmcb](%[svm]), %%"R"ax \n\t" |
1795 | __ex(SVM_VMLOAD) "\n\t" | 1775 | __ex(SVM_VMLOAD) "\n\t" |
1796 | __ex(SVM_VMRUN) "\n\t" | 1776 | __ex(SVM_VMRUN) "\n\t" |
1797 | __ex(SVM_VMSAVE) "\n\t" | 1777 | __ex(SVM_VMSAVE) "\n\t" |
1798 | "pop %%eax \n\t" | 1778 | "pop %%"R"ax \n\t" |
1799 | #endif | ||
1800 | 1779 | ||
1801 | /* Save guest registers, load host registers */ | 1780 | /* Save guest registers, load host registers */ |
1781 | "mov %%"R"bx, %c[rbx](%[svm]) \n\t" | ||
1782 | "mov %%"R"cx, %c[rcx](%[svm]) \n\t" | ||
1783 | "mov %%"R"dx, %c[rdx](%[svm]) \n\t" | ||
1784 | "mov %%"R"si, %c[rsi](%[svm]) \n\t" | ||
1785 | "mov %%"R"di, %c[rdi](%[svm]) \n\t" | ||
1786 | "mov %%"R"bp, %c[rbp](%[svm]) \n\t" | ||
1802 | #ifdef CONFIG_X86_64 | 1787 | #ifdef CONFIG_X86_64 |
1803 | "mov %%rbx, %c[rbx](%[svm]) \n\t" | ||
1804 | "mov %%rcx, %c[rcx](%[svm]) \n\t" | ||
1805 | "mov %%rdx, %c[rdx](%[svm]) \n\t" | ||
1806 | "mov %%rsi, %c[rsi](%[svm]) \n\t" | ||
1807 | "mov %%rdi, %c[rdi](%[svm]) \n\t" | ||
1808 | "mov %%rbp, %c[rbp](%[svm]) \n\t" | ||
1809 | "mov %%r8, %c[r8](%[svm]) \n\t" | 1788 | "mov %%r8, %c[r8](%[svm]) \n\t" |
1810 | "mov %%r9, %c[r9](%[svm]) \n\t" | 1789 | "mov %%r9, %c[r9](%[svm]) \n\t" |
1811 | "mov %%r10, %c[r10](%[svm]) \n\t" | 1790 | "mov %%r10, %c[r10](%[svm]) \n\t" |
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1814 | "mov %%r13, %c[r13](%[svm]) \n\t" | 1793 | "mov %%r13, %c[r13](%[svm]) \n\t" |
1815 | "mov %%r14, %c[r14](%[svm]) \n\t" | 1794 | "mov %%r14, %c[r14](%[svm]) \n\t" |
1816 | "mov %%r15, %c[r15](%[svm]) \n\t" | 1795 | "mov %%r15, %c[r15](%[svm]) \n\t" |
1817 | |||
1818 | "pop %%rbp; \n\t" | ||
1819 | #else | ||
1820 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | ||
1821 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | ||
1822 | "mov %%edx, %c[rdx](%[svm]) \n\t" | ||
1823 | "mov %%esi, %c[rsi](%[svm]) \n\t" | ||
1824 | "mov %%edi, %c[rdi](%[svm]) \n\t" | ||
1825 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | ||
1826 | |||
1827 | "pop %%ebp; \n\t" | ||
1828 | #endif | 1796 | #endif |
1797 | "pop %%"R"bp" | ||
1829 | : | 1798 | : |
1830 | : [svm]"a"(svm), | 1799 | : [svm]"a"(svm), |
1831 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | 1800 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), |
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1846 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) | 1815 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) |
1847 | #endif | 1816 | #endif |
1848 | : "cc", "memory" | 1817 | : "cc", "memory" |
1818 | , R"bx", R"cx", R"dx", R"si", R"di" | ||
1849 | #ifdef CONFIG_X86_64 | 1819 | #ifdef CONFIG_X86_64 |
1850 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
1851 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" | 1820 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" |
1852 | #else | ||
1853 | , "ebx", "ecx", "edx" , "esi", "edi" | ||
1854 | #endif | 1821 | #endif |
1855 | ); | 1822 | ); |
1856 | 1823 | ||
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1858 | load_db_regs(svm->host_db_regs); | 1825 | load_db_regs(svm->host_db_regs); |
1859 | 1826 | ||
1860 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | 1827 | vcpu->arch.cr2 = svm->vmcb->save.cr2; |
1828 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
1829 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
1830 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; | ||
1861 | 1831 | ||
1862 | write_dr6(svm->host_dr6); | 1832 | write_dr6(svm->host_dr6); |
1863 | write_dr7(svm->host_dr7); | 1833 | write_dr7(svm->host_dr7); |
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1879 | svm->next_rip = 0; | 1849 | svm->next_rip = 0; |
1880 | } | 1850 | } |
1881 | 1851 | ||
1852 | #undef R | ||
1853 | |||
1882 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | 1854 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) |
1883 | { | 1855 | { |
1884 | struct vcpu_svm *svm = to_svm(vcpu); | 1856 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1977 | .set_gdt = svm_set_gdt, | 1949 | .set_gdt = svm_set_gdt, |
1978 | .get_dr = svm_get_dr, | 1950 | .get_dr = svm_get_dr, |
1979 | .set_dr = svm_set_dr, | 1951 | .set_dr = svm_set_dr, |
1980 | .cache_regs = svm_cache_regs, | ||
1981 | .decache_regs = svm_decache_regs, | ||
1982 | .get_rflags = svm_get_rflags, | 1952 | .get_rflags = svm_get_rflags, |
1983 | .set_rflags = svm_set_rflags, | 1953 | .set_rflags = svm_set_rflags, |
1984 | 1954 | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7041cc52b562..2643b430d83a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -26,6 +26,8 @@ | |||
26 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/moduleparam.h> | 28 | #include <linux/moduleparam.h> |
29 | #include "kvm_cache_regs.h" | ||
30 | #include "x86.h" | ||
29 | 31 | ||
30 | #include <asm/io.h> | 32 | #include <asm/io.h> |
31 | #include <asm/desc.h> | 33 | #include <asm/desc.h> |
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); | |||
47 | static int enable_ept = 1; | 49 | static int enable_ept = 1; |
48 | module_param(enable_ept, bool, 0); | 50 | module_param(enable_ept, bool, 0); |
49 | 51 | ||
52 | static int emulate_invalid_guest_state = 0; | ||
53 | module_param(emulate_invalid_guest_state, bool, 0); | ||
54 | |||
50 | struct vmcs { | 55 | struct vmcs { |
51 | u32 revision_id; | 56 | u32 revision_id; |
52 | u32 abort; | 57 | u32 abort; |
@@ -56,6 +61,7 @@ struct vmcs { | |||
56 | struct vcpu_vmx { | 61 | struct vcpu_vmx { |
57 | struct kvm_vcpu vcpu; | 62 | struct kvm_vcpu vcpu; |
58 | struct list_head local_vcpus_link; | 63 | struct list_head local_vcpus_link; |
64 | unsigned long host_rsp; | ||
59 | int launched; | 65 | int launched; |
60 | u8 fail; | 66 | u8 fail; |
61 | u32 idt_vectoring_info; | 67 | u32 idt_vectoring_info; |
@@ -83,6 +89,7 @@ struct vcpu_vmx { | |||
83 | } irq; | 89 | } irq; |
84 | } rmode; | 90 | } rmode; |
85 | int vpid; | 91 | int vpid; |
92 | bool emulation_required; | ||
86 | }; | 93 | }; |
87 | 94 | ||
88 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 95 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
468 | if (!vcpu->fpu_active) | 475 | if (!vcpu->fpu_active) |
469 | eb |= 1u << NM_VECTOR; | 476 | eb |= 1u << NM_VECTOR; |
470 | if (vcpu->guest_debug.enabled) | 477 | if (vcpu->guest_debug.enabled) |
471 | eb |= 1u << 1; | 478 | eb |= 1u << DB_VECTOR; |
472 | if (vcpu->arch.rmode.active) | 479 | if (vcpu->arch.rmode.active) |
473 | eb = ~0; | 480 | eb = ~0; |
474 | if (vm_need_ept()) | 481 | if (vm_need_ept()) |
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
715 | unsigned long rip; | 722 | unsigned long rip; |
716 | u32 interruptibility; | 723 | u32 interruptibility; |
717 | 724 | ||
718 | rip = vmcs_readl(GUEST_RIP); | 725 | rip = kvm_rip_read(vcpu); |
719 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | 726 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); |
720 | vmcs_writel(GUEST_RIP, rip); | 727 | kvm_rip_write(vcpu, rip); |
721 | 728 | ||
722 | /* | 729 | /* |
723 | * We emulated an instruction, so temporary interrupt blocking | 730 | * We emulated an instruction, so temporary interrupt blocking |
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
733 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | 740 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
734 | bool has_error_code, u32 error_code) | 741 | bool has_error_code, u32 error_code) |
735 | { | 742 | { |
743 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
744 | |||
745 | if (has_error_code) | ||
746 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
747 | |||
748 | if (vcpu->arch.rmode.active) { | ||
749 | vmx->rmode.irq.pending = true; | ||
750 | vmx->rmode.irq.vector = nr; | ||
751 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); | ||
752 | if (nr == BP_VECTOR) | ||
753 | vmx->rmode.irq.rip++; | ||
754 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
755 | nr | INTR_TYPE_SOFT_INTR | ||
756 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | ||
757 | | INTR_INFO_VALID_MASK); | ||
758 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
759 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); | ||
760 | return; | ||
761 | } | ||
762 | |||
736 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 763 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
737 | nr | INTR_TYPE_EXCEPTION | 764 | nr | INTR_TYPE_EXCEPTION |
738 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | 765 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) |
739 | | INTR_INFO_VALID_MASK); | 766 | | INTR_INFO_VALID_MASK); |
740 | if (has_error_code) | ||
741 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
742 | } | 767 | } |
743 | 768 | ||
744 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | 769 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) |
745 | { | 770 | { |
746 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 771 | return false; |
747 | |||
748 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
749 | } | 772 | } |
750 | 773 | ||
751 | /* | 774 | /* |
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
947 | return ret; | 970 | return ret; |
948 | } | 971 | } |
949 | 972 | ||
950 | /* | 973 | static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) |
951 | * Sync the rsp and rip registers into the vcpu structure. This allows | ||
952 | * registers to be accessed by indexing vcpu->arch.regs. | ||
953 | */ | ||
954 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | ||
955 | { | ||
956 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
957 | vcpu->arch.rip = vmcs_readl(GUEST_RIP); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Syncs rsp and rip back into the vmcs. Should be called after possible | ||
962 | * modification. | ||
963 | */ | ||
964 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | ||
965 | { | 974 | { |
966 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | 975 | __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); |
967 | vmcs_writel(GUEST_RIP, vcpu->arch.rip); | 976 | switch (reg) { |
977 | case VCPU_REGS_RSP: | ||
978 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
979 | break; | ||
980 | case VCPU_REGS_RIP: | ||
981 | vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); | ||
982 | break; | ||
983 | default: | ||
984 | break; | ||
985 | } | ||
968 | } | 986 | } |
969 | 987 | ||
970 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | 988 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) |
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | |||
1007 | 1025 | ||
1008 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | 1026 | static int vmx_get_irq(struct kvm_vcpu *vcpu) |
1009 | { | 1027 | { |
1010 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1028 | if (!vcpu->arch.interrupt.pending) |
1011 | u32 idtv_info_field; | 1029 | return -1; |
1012 | 1030 | return vcpu->arch.interrupt.nr; | |
1013 | idtv_info_field = vmx->idt_vectoring_info; | ||
1014 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
1015 | if (is_external_interrupt(idtv_info_field)) | ||
1016 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
1017 | else | ||
1018 | printk(KERN_DEBUG "pending exception: not handled yet\n"); | ||
1019 | } | ||
1020 | return -1; | ||
1021 | } | 1031 | } |
1022 | 1032 | ||
1023 | static __init int cpu_has_kvm_support(void) | 1033 | static __init int cpu_has_kvm_support(void) |
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) | |||
1031 | u64 msr; | 1041 | u64 msr; |
1032 | 1042 | ||
1033 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | 1043 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); |
1034 | return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1044 | return (msr & (FEATURE_CONTROL_LOCKED | |
1035 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1045 | FEATURE_CONTROL_VMXON_ENABLED)) |
1036 | == MSR_IA32_FEATURE_CONTROL_LOCKED; | 1046 | == FEATURE_CONTROL_LOCKED; |
1037 | /* locked but not enabled */ | 1047 | /* locked but not enabled */ |
1038 | } | 1048 | } |
1039 | 1049 | ||
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage) | |||
1045 | 1055 | ||
1046 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); | 1056 | INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); |
1047 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | 1057 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
1048 | if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1058 | if ((old & (FEATURE_CONTROL_LOCKED | |
1049 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1059 | FEATURE_CONTROL_VMXON_ENABLED)) |
1050 | != (MSR_IA32_FEATURE_CONTROL_LOCKED | | 1060 | != (FEATURE_CONTROL_LOCKED | |
1051 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | 1061 | FEATURE_CONTROL_VMXON_ENABLED)) |
1052 | /* enable and lock */ | 1062 | /* enable and lock */ |
1053 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | | 1063 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | |
1054 | MSR_IA32_FEATURE_CONTROL_LOCKED | | 1064 | FEATURE_CONTROL_LOCKED | |
1055 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); | 1065 | FEATURE_CONTROL_VMXON_ENABLED); |
1056 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ | 1066 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ |
1057 | asm volatile (ASM_VMX_VMXON_RAX | 1067 | asm volatile (ASM_VMX_VMXON_RAX |
1058 | : : "a"(&phys_addr), "m"(phys_addr) | 1068 | : : "a"(&phys_addr), "m"(phys_addr) |
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1120 | CPU_BASED_CR3_STORE_EXITING | | 1130 | CPU_BASED_CR3_STORE_EXITING | |
1121 | CPU_BASED_USE_IO_BITMAPS | | 1131 | CPU_BASED_USE_IO_BITMAPS | |
1122 | CPU_BASED_MOV_DR_EXITING | | 1132 | CPU_BASED_MOV_DR_EXITING | |
1123 | CPU_BASED_USE_TSC_OFFSETING; | 1133 | CPU_BASED_USE_TSC_OFFSETING | |
1134 | CPU_BASED_INVLPG_EXITING; | ||
1124 | opt = CPU_BASED_TPR_SHADOW | | 1135 | opt = CPU_BASED_TPR_SHADOW | |
1125 | CPU_BASED_USE_MSR_BITMAPS | | 1136 | CPU_BASED_USE_MSR_BITMAPS | |
1126 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1137 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
1149 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | 1160 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; |
1150 | #endif | 1161 | #endif |
1151 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { | 1162 | if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { |
1152 | /* CR3 accesses don't need to cause VM Exits when EPT enabled */ | 1163 | /* CR3 accesses and invlpg don't need to cause VM Exits when EPT |
1164 | enabled */ | ||
1153 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | | 1165 | min &= ~(CPU_BASED_CR3_LOAD_EXITING | |
1154 | CPU_BASED_CR3_STORE_EXITING); | 1166 | CPU_BASED_CR3_STORE_EXITING | |
1167 | CPU_BASED_INVLPG_EXITING); | ||
1155 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | 1168 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, |
1156 | &_cpu_based_exec_control) < 0) | 1169 | &_cpu_based_exec_control) < 0) |
1157 | return -EIO; | 1170 | return -EIO; |
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | |||
1288 | static void enter_pmode(struct kvm_vcpu *vcpu) | 1301 | static void enter_pmode(struct kvm_vcpu *vcpu) |
1289 | { | 1302 | { |
1290 | unsigned long flags; | 1303 | unsigned long flags; |
1304 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1291 | 1305 | ||
1306 | vmx->emulation_required = 1; | ||
1292 | vcpu->arch.rmode.active = 0; | 1307 | vcpu->arch.rmode.active = 0; |
1293 | 1308 | ||
1294 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | 1309 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); |
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1305 | 1320 | ||
1306 | update_exception_bitmap(vcpu); | 1321 | update_exception_bitmap(vcpu); |
1307 | 1322 | ||
1323 | if (emulate_invalid_guest_state) | ||
1324 | return; | ||
1325 | |||
1308 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | 1326 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
1309 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | 1327 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
1310 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1328 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1345 | static void enter_rmode(struct kvm_vcpu *vcpu) | 1363 | static void enter_rmode(struct kvm_vcpu *vcpu) |
1346 | { | 1364 | { |
1347 | unsigned long flags; | 1365 | unsigned long flags; |
1366 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1348 | 1367 | ||
1368 | vmx->emulation_required = 1; | ||
1349 | vcpu->arch.rmode.active = 1; | 1369 | vcpu->arch.rmode.active = 1; |
1350 | 1370 | ||
1351 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1371 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1367 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | 1387 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); |
1368 | update_exception_bitmap(vcpu); | 1388 | update_exception_bitmap(vcpu); |
1369 | 1389 | ||
1390 | if (emulate_invalid_guest_state) | ||
1391 | goto continue_rmode; | ||
1392 | |||
1370 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | 1393 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); |
1371 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | 1394 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); |
1372 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | 1395 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); |
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1382 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | 1405 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
1383 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | 1406 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
1384 | 1407 | ||
1408 | continue_rmode: | ||
1385 | kvm_mmu_reset_context(vcpu); | 1409 | kvm_mmu_reset_context(vcpu); |
1386 | init_rmode(vcpu->kvm); | 1410 | init_rmode(vcpu->kvm); |
1387 | } | 1411 | } |
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | |||
1715 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | 1739 | vmcs_writel(GUEST_GDTR_BASE, dt->base); |
1716 | } | 1740 | } |
1717 | 1741 | ||
1742 | static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
1743 | { | ||
1744 | struct kvm_segment var; | ||
1745 | u32 ar; | ||
1746 | |||
1747 | vmx_get_segment(vcpu, &var, seg); | ||
1748 | ar = vmx_segment_access_rights(&var); | ||
1749 | |||
1750 | if (var.base != (var.selector << 4)) | ||
1751 | return false; | ||
1752 | if (var.limit != 0xffff) | ||
1753 | return false; | ||
1754 | if (ar != 0xf3) | ||
1755 | return false; | ||
1756 | |||
1757 | return true; | ||
1758 | } | ||
1759 | |||
1760 | static bool code_segment_valid(struct kvm_vcpu *vcpu) | ||
1761 | { | ||
1762 | struct kvm_segment cs; | ||
1763 | unsigned int cs_rpl; | ||
1764 | |||
1765 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
1766 | cs_rpl = cs.selector & SELECTOR_RPL_MASK; | ||
1767 | |||
1768 | if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) | ||
1769 | return false; | ||
1770 | if (!cs.s) | ||
1771 | return false; | ||
1772 | if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { | ||
1773 | if (cs.dpl > cs_rpl) | ||
1774 | return false; | ||
1775 | } else if (cs.type & AR_TYPE_CODE_MASK) { | ||
1776 | if (cs.dpl != cs_rpl) | ||
1777 | return false; | ||
1778 | } | ||
1779 | if (!cs.present) | ||
1780 | return false; | ||
1781 | |||
1782 | /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ | ||
1783 | return true; | ||
1784 | } | ||
1785 | |||
1786 | static bool stack_segment_valid(struct kvm_vcpu *vcpu) | ||
1787 | { | ||
1788 | struct kvm_segment ss; | ||
1789 | unsigned int ss_rpl; | ||
1790 | |||
1791 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
1792 | ss_rpl = ss.selector & SELECTOR_RPL_MASK; | ||
1793 | |||
1794 | if (ss.type != 3 && ss.type != 7) | ||
1795 | return false; | ||
1796 | if (!ss.s) | ||
1797 | return false; | ||
1798 | if (ss.dpl != ss_rpl) /* DPL != RPL */ | ||
1799 | return false; | ||
1800 | if (!ss.present) | ||
1801 | return false; | ||
1802 | |||
1803 | return true; | ||
1804 | } | ||
1805 | |||
1806 | static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) | ||
1807 | { | ||
1808 | struct kvm_segment var; | ||
1809 | unsigned int rpl; | ||
1810 | |||
1811 | vmx_get_segment(vcpu, &var, seg); | ||
1812 | rpl = var.selector & SELECTOR_RPL_MASK; | ||
1813 | |||
1814 | if (!var.s) | ||
1815 | return false; | ||
1816 | if (!var.present) | ||
1817 | return false; | ||
1818 | if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { | ||
1819 | if (var.dpl < rpl) /* DPL < RPL */ | ||
1820 | return false; | ||
1821 | } | ||
1822 | |||
1823 | /* TODO: Add other members to kvm_segment_field to allow checking for other access | ||
1824 | * rights flags | ||
1825 | */ | ||
1826 | return true; | ||
1827 | } | ||
1828 | |||
1829 | static bool tr_valid(struct kvm_vcpu *vcpu) | ||
1830 | { | ||
1831 | struct kvm_segment tr; | ||
1832 | |||
1833 | vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); | ||
1834 | |||
1835 | if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | ||
1836 | return false; | ||
1837 | if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ | ||
1838 | return false; | ||
1839 | if (!tr.present) | ||
1840 | return false; | ||
1841 | |||
1842 | return true; | ||
1843 | } | ||
1844 | |||
1845 | static bool ldtr_valid(struct kvm_vcpu *vcpu) | ||
1846 | { | ||
1847 | struct kvm_segment ldtr; | ||
1848 | |||
1849 | vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); | ||
1850 | |||
1851 | if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ | ||
1852 | return false; | ||
1853 | if (ldtr.type != 2) | ||
1854 | return false; | ||
1855 | if (!ldtr.present) | ||
1856 | return false; | ||
1857 | |||
1858 | return true; | ||
1859 | } | ||
1860 | |||
1861 | static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) | ||
1862 | { | ||
1863 | struct kvm_segment cs, ss; | ||
1864 | |||
1865 | vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
1866 | vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); | ||
1867 | |||
1868 | return ((cs.selector & SELECTOR_RPL_MASK) == | ||
1869 | (ss.selector & SELECTOR_RPL_MASK)); | ||
1870 | } | ||
1871 | |||
1872 | /* | ||
1873 | * Check if guest state is valid. Returns true if valid, false if | ||
1874 | * not. | ||
1875 | * We assume that registers are always usable | ||
1876 | */ | ||
1877 | static bool guest_state_valid(struct kvm_vcpu *vcpu) | ||
1878 | { | ||
1879 | /* real mode guest state checks */ | ||
1880 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) { | ||
1881 | if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) | ||
1882 | return false; | ||
1883 | if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) | ||
1884 | return false; | ||
1885 | if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) | ||
1886 | return false; | ||
1887 | if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) | ||
1888 | return false; | ||
1889 | if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) | ||
1890 | return false; | ||
1891 | if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) | ||
1892 | return false; | ||
1893 | } else { | ||
1894 | /* protected mode guest state checks */ | ||
1895 | if (!cs_ss_rpl_check(vcpu)) | ||
1896 | return false; | ||
1897 | if (!code_segment_valid(vcpu)) | ||
1898 | return false; | ||
1899 | if (!stack_segment_valid(vcpu)) | ||
1900 | return false; | ||
1901 | if (!data_segment_valid(vcpu, VCPU_SREG_DS)) | ||
1902 | return false; | ||
1903 | if (!data_segment_valid(vcpu, VCPU_SREG_ES)) | ||
1904 | return false; | ||
1905 | if (!data_segment_valid(vcpu, VCPU_SREG_FS)) | ||
1906 | return false; | ||
1907 | if (!data_segment_valid(vcpu, VCPU_SREG_GS)) | ||
1908 | return false; | ||
1909 | if (!tr_valid(vcpu)) | ||
1910 | return false; | ||
1911 | if (!ldtr_valid(vcpu)) | ||
1912 | return false; | ||
1913 | } | ||
1914 | /* TODO: | ||
1915 | * - Add checks on RIP | ||
1916 | * - Add checks on RFLAGS | ||
1917 | */ | ||
1918 | |||
1919 | return true; | ||
1920 | } | ||
1921 | |||
1718 | static int init_rmode_tss(struct kvm *kvm) | 1922 | static int init_rmode_tss(struct kvm *kvm) |
1719 | { | 1923 | { |
1720 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | 1924 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; |
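guest_state_valid() encodes the segment checks VT enforces at VM entry; together with the new emulate_invalid_guest_state parameter and the vmx->emulation_required flag set in enter_pmode()/enter_rmode(), it lets the series fall back to the instruction emulator whenever the guest is in a state VMX cannot enter (big real mode, for instance). The consumer of the flag is not in the hunks shown here; a hedged sketch of how it would gate VM entry:

/* Sketch only: emulate until the guest state becomes VT-enterable. */
static void emulation_fallback_sketch(struct kvm_vcpu *vcpu,
				      struct kvm_run *kvm_run)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!emulate_invalid_guest_state || !vmx->emulation_required)
		return;				/* normal VMRESUME path */

	while (!guest_state_valid(vcpu)) {
		if (emulate_instruction(vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
			break;			/* give up, let the caller report it */
	}
}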
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm) | |||
1726 | if (r < 0) | 1930 | if (r < 0) |
1727 | goto out; | 1931 | goto out; |
1728 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | 1932 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; |
1729 | r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); | 1933 | r = kvm_write_guest_page(kvm, fn++, &data, |
1934 | TSS_IOPB_BASE_OFFSET, sizeof(u16)); | ||
1730 | if (r < 0) | 1935 | if (r < 0) |
1731 | goto out; | 1936 | goto out; |
1732 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | 1937 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); |
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg) | |||
1789 | vmcs_write16(sf->selector, 0); | 1994 | vmcs_write16(sf->selector, 0); |
1790 | vmcs_writel(sf->base, 0); | 1995 | vmcs_writel(sf->base, 0); |
1791 | vmcs_write32(sf->limit, 0xffff); | 1996 | vmcs_write32(sf->limit, 0xffff); |
1792 | vmcs_write32(sf->ar_bytes, 0x93); | 1997 | vmcs_write32(sf->ar_bytes, 0xf3); |
1793 | } | 1998 | } |
1794 | 1999 | ||
1795 | static int alloc_apic_access_page(struct kvm *kvm) | 2000 | static int alloc_apic_access_page(struct kvm *kvm) |
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm) | |||
1808 | if (r) | 2013 | if (r) |
1809 | goto out; | 2014 | goto out; |
1810 | 2015 | ||
1811 | down_read(¤t->mm->mmap_sem); | ||
1812 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); | 2016 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); |
1813 | up_read(¤t->mm->mmap_sem); | ||
1814 | out: | 2017 | out: |
1815 | up_write(&kvm->slots_lock); | 2018 | up_write(&kvm->slots_lock); |
1816 | return r; | 2019 | return r; |
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) | |||
1832 | if (r) | 2035 | if (r) |
1833 | goto out; | 2036 | goto out; |
1834 | 2037 | ||
1835 | down_read(¤t->mm->mmap_sem); | ||
1836 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, | 2038 | kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, |
1837 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); | 2039 | VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); |
1838 | up_read(¤t->mm->mmap_sem); | ||
1839 | out: | 2040 | out: |
1840 | up_write(&kvm->slots_lock); | 2041 | up_write(&kvm->slots_lock); |
1841 | return r; | 2042 | return r; |
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1917 | } | 2118 | } |
1918 | if (!vm_need_ept()) | 2119 | if (!vm_need_ept()) |
1919 | exec_control |= CPU_BASED_CR3_STORE_EXITING | | 2120 | exec_control |= CPU_BASED_CR3_STORE_EXITING | |
1920 | CPU_BASED_CR3_LOAD_EXITING; | 2121 | CPU_BASED_CR3_LOAD_EXITING | |
2122 | CPU_BASED_INVLPG_EXITING; | ||
1921 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | 2123 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); |
1922 | 2124 | ||
1923 | if (cpu_has_secondary_exec_ctrls()) { | 2125 | if (cpu_has_secondary_exec_ctrls()) { |
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2019 | u64 msr; | 2221 | u64 msr; |
2020 | int ret; | 2222 | int ret; |
2021 | 2223 | ||
2224 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
2022 | down_read(&vcpu->kvm->slots_lock); | 2225 | down_read(&vcpu->kvm->slots_lock); |
2023 | if (!init_rmode(vmx->vcpu.kvm)) { | 2226 | if (!init_rmode(vmx->vcpu.kvm)) { |
2024 | ret = -ENOMEM; | 2227 | ret = -ENOMEM; |
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2036 | 2239 | ||
2037 | fx_init(&vmx->vcpu); | 2240 | fx_init(&vmx->vcpu); |
2038 | 2241 | ||
2242 | seg_setup(VCPU_SREG_CS); | ||
2039 | /* | 2243 | /* |
2040 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | 2244 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode |
2041 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | 2245 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. |
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2047 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | 2251 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); |
2048 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | 2252 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); |
2049 | } | 2253 | } |
2050 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
2051 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
2052 | 2254 | ||
2053 | seg_setup(VCPU_SREG_DS); | 2255 | seg_setup(VCPU_SREG_DS); |
2054 | seg_setup(VCPU_SREG_ES); | 2256 | seg_setup(VCPU_SREG_ES); |
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2072 | 2274 | ||
2073 | vmcs_writel(GUEST_RFLAGS, 0x02); | 2275 | vmcs_writel(GUEST_RFLAGS, 0x02); |
2074 | if (vmx->vcpu.vcpu_id == 0) | 2276 | if (vmx->vcpu.vcpu_id == 0) |
2075 | vmcs_writel(GUEST_RIP, 0xfff0); | 2277 | kvm_rip_write(vcpu, 0xfff0); |
2076 | else | 2278 | else |
2077 | vmcs_writel(GUEST_RIP, 0); | 2279 | kvm_rip_write(vcpu, 0); |
2078 | vmcs_writel(GUEST_RSP, 0); | 2280 | kvm_register_write(vcpu, VCPU_REGS_RSP, 0); |
2079 | 2281 | ||
2080 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | 2282 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ |
2081 | vmcs_writel(GUEST_DR7, 0x400); | 2283 | vmcs_writel(GUEST_DR7, 0x400); |
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
2125 | 2327 | ||
2126 | ret = 0; | 2328 | ret = 0; |
2127 | 2329 | ||
2330 | /* HACK: Don't enable emulation on guest boot/reset */ | ||
2331 | vmx->emulation_required = 0; | ||
2332 | |||
2128 | out: | 2333 | out: |
2129 | up_read(&vcpu->kvm->slots_lock); | 2334 | up_read(&vcpu->kvm->slots_lock); |
2130 | return ret; | 2335 | return ret; |
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
2136 | 2341 | ||
2137 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | 2342 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); |
2138 | 2343 | ||
2344 | ++vcpu->stat.irq_injections; | ||
2139 | if (vcpu->arch.rmode.active) { | 2345 | if (vcpu->arch.rmode.active) { |
2140 | vmx->rmode.irq.pending = true; | 2346 | vmx->rmode.irq.pending = true; |
2141 | vmx->rmode.irq.vector = irq; | 2347 | vmx->rmode.irq.vector = irq; |
2142 | vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); | 2348 | vmx->rmode.irq.rip = kvm_rip_read(vcpu); |
2143 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2349 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2144 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | 2350 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); |
2145 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | 2351 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); |
2146 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); | 2352 | kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); |
2147 | return; | 2353 | return; |
2148 | } | 2354 | } |
2149 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2355 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) | |||
2154 | { | 2360 | { |
2155 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 2361 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
2156 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); | 2362 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); |
2157 | vcpu->arch.nmi_pending = 0; | ||
2158 | } | 2363 | } |
2159 | 2364 | ||
2160 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | 2365 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | |||
2166 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | 2371 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
2167 | if (!vcpu->arch.irq_pending[word_index]) | 2372 | if (!vcpu->arch.irq_pending[word_index]) |
2168 | clear_bit(word_index, &vcpu->arch.irq_summary); | 2373 | clear_bit(word_index, &vcpu->arch.irq_summary); |
2169 | vmx_inject_irq(vcpu, irq); | 2374 | kvm_queue_interrupt(vcpu, irq); |
2170 | } | 2375 | } |
2171 | 2376 | ||
2172 | 2377 | ||
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
2180 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | 2385 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); |
2181 | 2386 | ||
2182 | if (vcpu->arch.interrupt_window_open && | 2387 | if (vcpu->arch.interrupt_window_open && |
2183 | vcpu->arch.irq_summary && | 2388 | vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) |
2184 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
2185 | /* | ||
2186 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | ||
2187 | */ | ||
2188 | kvm_do_inject_irq(vcpu); | 2389 | kvm_do_inject_irq(vcpu); |
2189 | 2390 | ||
2391 | if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) | ||
2392 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); | ||
2393 | |||
2190 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2394 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2191 | if (!vcpu->arch.interrupt_window_open && | 2395 | if (!vcpu->arch.interrupt_window_open && |
2192 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) | 2396 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) |
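do_interrupt_requests() now separates "an interrupt has been picked from irq_pending" (kvm_queue_interrupt) from "it has actually been written into the VMCS" (vmx_inject_irq), which is also what makes the vmx_get_irq() change earlier in this file a trivial lookup. The queueing helpers are assumed to be the usual one-liners in x86.h:

/* Sketch of the assumed helpers backing vcpu->arch.interrupt. */
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
{
	vcpu->arch.interrupt.pending = true;
	vcpu->arch.interrupt.nr = vector;
}

static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
	vcpu->arch.interrupt.pending = false;
}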
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | |||
2237 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | 2441 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, |
2238 | int vec, u32 err_code) | 2442 | int vec, u32 err_code) |
2239 | { | 2443 | { |
2240 | if (!vcpu->arch.rmode.active) | ||
2241 | return 0; | ||
2242 | |||
2243 | /* | 2444 | /* |
2244 | * Instruction with address size override prefix opcode 0x67 | 2445 | * Instruction with address size override prefix opcode 0x67 |
2245 | * Cause the #SS fault with 0 error code in VM86 mode. | 2446 | * Cause the #SS fault with 0 error code in VM86 mode. |
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
2247 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 2448 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
2248 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) | 2449 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) |
2249 | return 1; | 2450 | return 1; |
2451 | /* | ||
2452 | * Forward all other exceptions that are valid in real mode. | ||
2453 | * FIXME: Breaks guest debugging in real mode, needs to be fixed with | ||
2454 | * the required debugging infrastructure rework. | ||
2455 | */ | ||
2456 | switch (vec) { | ||
2457 | case DE_VECTOR: | ||
2458 | case DB_VECTOR: | ||
2459 | case BP_VECTOR: | ||
2460 | case OF_VECTOR: | ||
2461 | case BR_VECTOR: | ||
2462 | case UD_VECTOR: | ||
2463 | case DF_VECTOR: | ||
2464 | case SS_VECTOR: | ||
2465 | case GP_VECTOR: | ||
2466 | case MF_VECTOR: | ||
2467 | kvm_queue_exception(vcpu, vec); | ||
2468 | return 1; | ||
2469 | } | ||
2250 | return 0; | 2470 | return 0; |
2251 | } | 2471 | } |
2252 | 2472 | ||
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2288 | } | 2508 | } |
2289 | 2509 | ||
2290 | error_code = 0; | 2510 | error_code = 0; |
2291 | rip = vmcs_readl(GUEST_RIP); | 2511 | rip = kvm_rip_read(vcpu); |
2292 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) | 2512 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) |
2293 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 2513 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
2294 | if (is_page_fault(intr_info)) { | 2514 | if (is_page_fault(intr_info)) { |
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2298 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 2518 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
2299 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | 2519 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, |
2300 | (u32)((u64)cr2 >> 32), handler); | 2520 | (u32)((u64)cr2 >> 32), handler); |
2301 | if (vect_info & VECTORING_INFO_VALID_MASK) | 2521 | if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) |
2302 | kvm_mmu_unprotect_page_virt(vcpu, cr2); | 2522 | kvm_mmu_unprotect_page_virt(vcpu, cr2); |
2303 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2523 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
2304 | } | 2524 | } |
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2386 | reg = (exit_qualification >> 8) & 15; | 2606 | reg = (exit_qualification >> 8) & 15; |
2387 | switch ((exit_qualification >> 4) & 3) { | 2607 | switch ((exit_qualification >> 4) & 3) { |
2388 | case 0: /* mov to cr */ | 2608 | case 0: /* mov to cr */ |
2389 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], | 2609 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, |
2390 | (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); | 2610 | (u32)kvm_register_read(vcpu, reg), |
2611 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), | ||
2612 | handler); | ||
2391 | switch (cr) { | 2613 | switch (cr) { |
2392 | case 0: | 2614 | case 0: |
2393 | vcpu_load_rsp_rip(vcpu); | 2615 | kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); |
2394 | kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); | ||
2395 | skip_emulated_instruction(vcpu); | 2616 | skip_emulated_instruction(vcpu); |
2396 | return 1; | 2617 | return 1; |
2397 | case 3: | 2618 | case 3: |
2398 | vcpu_load_rsp_rip(vcpu); | 2619 | kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); |
2399 | kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); | ||
2400 | skip_emulated_instruction(vcpu); | 2620 | skip_emulated_instruction(vcpu); |
2401 | return 1; | 2621 | return 1; |
2402 | case 4: | 2622 | case 4: |
2403 | vcpu_load_rsp_rip(vcpu); | 2623 | kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); |
2404 | kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); | ||
2405 | skip_emulated_instruction(vcpu); | 2624 | skip_emulated_instruction(vcpu); |
2406 | return 1; | 2625 | return 1; |
2407 | case 8: | 2626 | case 8: |
2408 | vcpu_load_rsp_rip(vcpu); | 2627 | kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); |
2409 | kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); | ||
2410 | skip_emulated_instruction(vcpu); | 2628 | skip_emulated_instruction(vcpu); |
2411 | if (irqchip_in_kernel(vcpu->kvm)) | 2629 | if (irqchip_in_kernel(vcpu->kvm)) |
2412 | return 1; | 2630 | return 1; |
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2415 | }; | 2633 | }; |
2416 | break; | 2634 | break; |
2417 | case 2: /* clts */ | 2635 | case 2: /* clts */ |
2418 | vcpu_load_rsp_rip(vcpu); | ||
2419 | vmx_fpu_deactivate(vcpu); | 2636 | vmx_fpu_deactivate(vcpu); |
2420 | vcpu->arch.cr0 &= ~X86_CR0_TS; | 2637 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
2421 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 2638 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2426 | case 1: /* mov from cr */ | 2643 | case 1: /* mov from cr */ |
2427 | switch (cr) { | 2644 | switch (cr) { |
2428 | case 3: | 2645 | case 3: |
2429 | vcpu_load_rsp_rip(vcpu); | 2646 | kvm_register_write(vcpu, reg, vcpu->arch.cr3); |
2430 | vcpu->arch.regs[reg] = vcpu->arch.cr3; | ||
2431 | vcpu_put_rsp_rip(vcpu); | ||
2432 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, | 2647 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, |
2433 | (u32)vcpu->arch.regs[reg], | 2648 | (u32)kvm_register_read(vcpu, reg), |
2434 | (u32)((u64)vcpu->arch.regs[reg] >> 32), | 2649 | (u32)((u64)kvm_register_read(vcpu, reg) >> 32), |
2435 | handler); | 2650 | handler); |
2436 | skip_emulated_instruction(vcpu); | 2651 | skip_emulated_instruction(vcpu); |
2437 | return 1; | 2652 | return 1; |
2438 | case 8: | 2653 | case 8: |
2439 | vcpu_load_rsp_rip(vcpu); | 2654 | kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); |
2440 | vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); | ||
2441 | vcpu_put_rsp_rip(vcpu); | ||
2442 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, | 2655 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, |
2443 | (u32)vcpu->arch.regs[reg], handler); | 2656 | (u32)kvm_register_read(vcpu, reg), handler); |
2444 | skip_emulated_instruction(vcpu); | 2657 | skip_emulated_instruction(vcpu); |
2445 | return 1; | 2658 | return 1; |
2446 | } | 2659 | } |
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2472 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | 2685 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); |
2473 | dr = exit_qualification & 7; | 2686 | dr = exit_qualification & 7; |
2474 | reg = (exit_qualification >> 8) & 15; | 2687 | reg = (exit_qualification >> 8) & 15; |
2475 | vcpu_load_rsp_rip(vcpu); | ||
2476 | if (exit_qualification & 16) { | 2688 | if (exit_qualification & 16) { |
2477 | /* mov from dr */ | 2689 | /* mov from dr */ |
2478 | switch (dr) { | 2690 | switch (dr) { |
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2485 | default: | 2697 | default: |
2486 | val = 0; | 2698 | val = 0; |
2487 | } | 2699 | } |
2488 | vcpu->arch.regs[reg] = val; | 2700 | kvm_register_write(vcpu, reg, val); |
2489 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | 2701 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); |
2490 | } else { | 2702 | } else { |
2491 | /* mov to dr */ | 2703 | /* mov to dr */ |
2492 | } | 2704 | } |
2493 | vcpu_put_rsp_rip(vcpu); | ||
2494 | skip_emulated_instruction(vcpu); | 2705 | skip_emulated_instruction(vcpu); |
2495 | return 1; | 2706 | return 1; |
2496 | } | 2707 | } |
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2583 | return 1; | 2794 | return 1; |
2584 | } | 2795 | } |
2585 | 2796 | ||
2797 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2798 | { | ||
2799 | u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
2800 | |||
2801 | kvm_mmu_invlpg(vcpu, exit_qualification); | ||
2802 | skip_emulated_instruction(vcpu); | ||
2803 | return 1; | ||
2804 | } | ||
2805 | |||
2586 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2806 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2587 | { | 2807 | { |
2588 | skip_emulated_instruction(vcpu); | 2808 | skip_emulated_instruction(vcpu); |
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2695 | return 1; | 2915 | return 1; |
2696 | } | 2916 | } |
2697 | 2917 | ||
2918 | static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, | ||
2919 | struct kvm_run *kvm_run) | ||
2920 | { | ||
2921 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2922 | int err; | ||
2923 | |||
2924 | preempt_enable(); | ||
2925 | local_irq_enable(); | ||
2926 | |||
2927 | while (!guest_state_valid(vcpu)) { | ||
2928 | err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
2929 | |||
2930 | switch (err) { | ||
2931 | case EMULATE_DONE: | ||
2932 | break; | ||
2933 | case EMULATE_DO_MMIO: | ||
2934 | kvm_report_emulation_failure(vcpu, "mmio"); | ||
2935 | /* TODO: Handle MMIO */ | ||
2936 | return; | ||
2937 | default: | ||
2938 | kvm_report_emulation_failure(vcpu, "emulation failure"); | ||
2939 | return; | ||
2940 | } | ||
2941 | |||
2942 | if (signal_pending(current)) | ||
2943 | break; | ||
2944 | if (need_resched()) | ||
2945 | schedule(); | ||
2946 | } | ||
2947 | |||
2948 | local_irq_disable(); | ||
2949 | preempt_disable(); | ||
2950 | |||
2951 | /* Guest state should be valid now, no more emulation should be needed */ | ||
2952 | vmx->emulation_required = 0; | ||
2953 | } | ||
2954 | |||
2698 | /* | 2955 | /* |
2699 | * The exit handlers return 1 if the exit was handled fully and guest execution | 2956 | * The exit handlers return 1 if the exit was handled fully and guest execution |
2700 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 2957 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
2714 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | 2971 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, |
2715 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 2972 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
2716 | [EXIT_REASON_HLT] = handle_halt, | 2973 | [EXIT_REASON_HLT] = handle_halt, |
2974 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
2717 | [EXIT_REASON_VMCALL] = handle_vmcall, | 2975 | [EXIT_REASON_VMCALL] = handle_vmcall, |
2718 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 2976 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
2719 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 2977 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2735 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2993 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2736 | u32 vectoring_info = vmx->idt_vectoring_info; | 2994 | u32 vectoring_info = vmx->idt_vectoring_info; |
2737 | 2995 | ||
2738 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), | 2996 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), |
2739 | (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); | 2997 | (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); |
2740 | 2998 | ||
2741 | /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need | 2999 | /* Accessing CR3 doesn't cause a VM exit in paging mode, so we need |
2742 | * to sync with the guest's real CR3. */ | 3000 | * to sync with the guest's real CR3. */ |
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) | |||
2829 | enable_irq_window(vcpu); | 3087 | enable_irq_window(vcpu); |
2830 | } | 3088 | } |
2831 | 3089 | ||
2832 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | 3090 | static void vmx_complete_interrupts(struct vcpu_vmx *vmx) |
2833 | { | 3091 | { |
2834 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3092 | u32 exit_intr_info; |
2835 | u32 idtv_info_field, intr_info_field, exit_intr_info_field; | 3093 | u32 idt_vectoring_info; |
2836 | int vector; | 3094 | bool unblock_nmi; |
3095 | u8 vector; | ||
3096 | int type; | ||
3097 | bool idtv_info_valid; | ||
3098 | u32 error; | ||
2837 | 3099 | ||
2838 | update_tpr_threshold(vcpu); | 3100 | exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
2839 | 3101 | if (cpu_has_virtual_nmis()) { | |
2840 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | 3102 | unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; |
2841 | exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); | 3103 | vector = exit_intr_info & INTR_INFO_VECTOR_MASK; |
2842 | idtv_info_field = vmx->idt_vectoring_info; | 3104 | /* |
2843 | if (intr_info_field & INTR_INFO_VALID_MASK) { | 3105 | * SDM 3: 25.7.1.2 |
2844 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 3106 | * Re-set bit "block by NMI" before VM entry if vmexit caused by |
2845 | /* TODO: fault when IDT_Vectoring */ | 3107 | * a guest IRET fault. |
2846 | if (printk_ratelimit()) | 3108 | */ |
2847 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | 3109 | if (unblock_nmi && vector != DF_VECTOR) |
2848 | } | 3110 | vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, |
2849 | enable_intr_window(vcpu); | 3111 | GUEST_INTR_STATE_NMI); |
2850 | return; | ||
2851 | } | 3112 | } |
2852 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | ||
2853 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | ||
2854 | == INTR_TYPE_EXT_INTR | ||
2855 | && vcpu->arch.rmode.active) { | ||
2856 | u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
2857 | |||
2858 | vmx_inject_irq(vcpu, vect); | ||
2859 | enable_intr_window(vcpu); | ||
2860 | return; | ||
2861 | } | ||
2862 | |||
2863 | KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); | ||
2864 | 3113 | ||
3114 | idt_vectoring_info = vmx->idt_vectoring_info; | ||
3115 | idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; | ||
3116 | vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; | ||
3117 | type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; | ||
3118 | if (vmx->vcpu.arch.nmi_injected) { | ||
2865 | /* | 3119 | /* |
2866 | * SDM 3: 25.7.1.2 | 3120 | * SDM 3: 25.7.1.2 |
2867 | * Clear bit "block by NMI" before VM entry if a NMI delivery | 3121 | * Clear bit "block by NMI" before VM entry if a NMI delivery |
2868 | * faulted. | 3122 | * faulted. |
2869 | */ | 3123 | */ |
2870 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | 3124 | if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) |
2871 | == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) | 3125 | vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, |
2872 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 3126 | GUEST_INTR_STATE_NMI); |
2873 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & | 3127 | else |
2874 | ~GUEST_INTR_STATE_NMI); | 3128 | vmx->vcpu.arch.nmi_injected = false; |
2875 | 3129 | } | |
2876 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field | 3130 | kvm_clear_exception_queue(&vmx->vcpu); |
2877 | & ~INTR_INFO_RESVD_BITS_MASK); | 3131 | if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { |
2878 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | 3132 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
2879 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 3133 | error = vmcs_read32(IDT_VECTORING_ERROR_CODE); |
2880 | 3134 | kvm_queue_exception_e(&vmx->vcpu, vector, error); | |
2881 | if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) | 3135 | } else |
2882 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | 3136 | kvm_queue_exception(&vmx->vcpu, vector); |
2883 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | 3137 | vmx->idt_vectoring_info = 0; |
2884 | enable_intr_window(vcpu); | ||
2885 | return; | ||
2886 | } | 3138 | } |
3139 | kvm_clear_interrupt_queue(&vmx->vcpu); | ||
3140 | if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { | ||
3141 | kvm_queue_interrupt(&vmx->vcpu, vector); | ||
3142 | vmx->idt_vectoring_info = 0; | ||
3143 | } | ||
3144 | } | ||
3145 | |||
3146 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | ||
3147 | { | ||
3148 | update_tpr_threshold(vcpu); | ||
3149 | |||
2887 | if (cpu_has_virtual_nmis()) { | 3150 | if (cpu_has_virtual_nmis()) { |
2888 | /* | 3151 | if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { |
2889 | * SDM 3: 25.7.1.2 | 3152 | if (vmx_nmi_enabled(vcpu)) { |
2890 | * Re-set bit "block by NMI" before VM entry if vmexit caused by | 3153 | vcpu->arch.nmi_pending = false; |
2891 | * a guest IRET fault. | 3154 | vcpu->arch.nmi_injected = true; |
2892 | */ | 3155 | } else { |
2893 | if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && | 3156 | enable_intr_window(vcpu); |
2894 | (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) | 3157 | return; |
2895 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 3158 | } |
2896 | vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | | 3159 | } |
2897 | GUEST_INTR_STATE_NMI); | 3160 | if (vcpu->arch.nmi_injected) { |
2898 | else if (vcpu->arch.nmi_pending) { | 3161 | vmx_inject_nmi(vcpu); |
2899 | if (vmx_nmi_enabled(vcpu)) | ||
2900 | vmx_inject_nmi(vcpu); | ||
2901 | enable_intr_window(vcpu); | 3162 | enable_intr_window(vcpu); |
2902 | return; | 3163 | return; |
2903 | } | 3164 | } |
2904 | |||
2905 | } | 3165 | } |
2906 | if (!kvm_cpu_has_interrupt(vcpu)) | 3166 | if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { |
2907 | return; | 3167 | if (vmx_irq_enabled(vcpu)) |
2908 | if (vmx_irq_enabled(vcpu)) { | 3168 | kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); |
2909 | vector = kvm_cpu_get_interrupt(vcpu); | 3169 | else |
2910 | vmx_inject_irq(vcpu, vector); | 3170 | enable_irq_window(vcpu); |
2911 | kvm_timer_intr_post(vcpu, vector); | 3171 | } |
2912 | } else | 3172 | if (vcpu->arch.interrupt.pending) { |
2913 | enable_irq_window(vcpu); | 3173 | vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); |
3174 | kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); | ||
3175 | } | ||
2914 | } | 3176 | } |
2915 | 3177 | ||
2916 | /* | 3178 | /* |
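vmx_complete_interrupts() above toggles the NMI-blocking bit of GUEST_INTERRUPTIBILITY_INFO through vmcs_set_bits()/vmcs_clear_bits() instead of the open-coded vmcs_read32()/vmcs_write32() pairs the removed lines used. Their definitions are not part of this hunk; a plausible sketch, assuming they are thin read-modify-write wrappers around the existing vmcs_readl()/vmcs_writel() helpers:

	/* Sketch only: not taken from this patch. */
	static void vmcs_clear_bits(unsigned long field, u32 mask)
	{
		vmcs_writel(field, vmcs_readl(field) & ~mask);
	}

	static void vmcs_set_bits(unsigned long field, u32 mask)
	{
		vmcs_writel(field, vmcs_readl(field) | mask);
	}

Either way, the call sites read more clearly than the explicit read/modify/write sequences they replace.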
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
2922 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | 3184 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) |
2923 | { | 3185 | { |
2924 | vmx->rmode.irq.pending = 0; | 3186 | vmx->rmode.irq.pending = 0; |
2925 | if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) | 3187 | if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) |
2926 | return; | 3188 | return; |
2927 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); | 3189 | kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); |
2928 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | 3190 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { |
2929 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | 3191 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; |
2930 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | 3192 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; |
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | |||
2936 | | vmx->rmode.irq.vector; | 3198 | | vmx->rmode.irq.vector; |
2937 | } | 3199 | } |
2938 | 3200 | ||
3201 | #ifdef CONFIG_X86_64 | ||
3202 | #define R "r" | ||
3203 | #define Q "q" | ||
3204 | #else | ||
3205 | #define R "e" | ||
3206 | #define Q "l" | ||
3207 | #endif | ||
3208 | |||
2939 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 3209 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2940 | { | 3210 | { |
2941 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3211 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2942 | u32 intr_info; | 3212 | u32 intr_info; |
2943 | 3213 | ||
3214 | /* Handle invalid guest state instead of entering VMX */ | ||
3215 | if (vmx->emulation_required && emulate_invalid_guest_state) { | ||
3216 | handle_invalid_guest_state(vcpu, kvm_run); | ||
3217 | return; | ||
3218 | } | ||
3219 | |||
3220 | if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
3221 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | ||
3222 | if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) | ||
3223 | vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); | ||
3224 | |||
2944 | /* | 3225 | /* |
2945 | * Loading guest fpu may have cleared host cr0.ts | 3226 | * Loading guest fpu may have cleared host cr0.ts |
2946 | */ | 3227 | */ |
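The R and Q prefix macros defined just above let the following inline-assembly hunks share one body between i386 and x86_64: adjacent string literals are pasted together, so "%%"R"ax" becomes "%%rax" or "%%eax", and "push"Q becomes pushq or pushl. A minimal, hypothetical illustration of the same trick outside the patch (stack_pointer() is invented for the example):

	#ifdef CONFIG_X86_64
	#define R "r"
	#define Q "q"
	#else
	#define R "e"
	#define Q "l"
	#endif

	static inline unsigned long stack_pointer(void)
	{
		unsigned long sp;

		/* expands to "mov %%rsp, %0" on x86_64, "mov %%esp, %0" on i386 */
		asm("mov %%"R"sp, %0" : "=r"(sp));
		return sp;
	}

	#undef R
	#undef Q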
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2948 | 3229 | ||
2949 | asm( | 3230 | asm( |
2950 | /* Store host registers */ | 3231 | /* Store host registers */ |
2951 | #ifdef CONFIG_X86_64 | 3232 | "push %%"R"dx; push %%"R"bp;" |
2952 | "push %%rdx; push %%rbp;" | 3233 | "push %%"R"cx \n\t" |
2953 | "push %%rcx \n\t" | 3234 | "cmp %%"R"sp, %c[host_rsp](%0) \n\t" |
2954 | #else | 3235 | "je 1f \n\t" |
2955 | "push %%edx; push %%ebp;" | 3236 | "mov %%"R"sp, %c[host_rsp](%0) \n\t" |
2956 | "push %%ecx \n\t" | ||
2957 | #endif | ||
2958 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" | 3237 | __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" |
3238 | "1: \n\t" | ||
2959 | /* Check if vmlaunch or vmresume is needed */ | 3239 | /* Check if vmlaunch or vmresume is needed */ |
2960 | "cmpl $0, %c[launched](%0) \n\t" | 3240 | "cmpl $0, %c[launched](%0) \n\t" |
2961 | /* Load guest registers. Don't clobber flags. */ | 3241 | /* Load guest registers. Don't clobber flags. */ |
3242 | "mov %c[cr2](%0), %%"R"ax \n\t" | ||
3243 | "mov %%"R"ax, %%cr2 \n\t" | ||
3244 | "mov %c[rax](%0), %%"R"ax \n\t" | ||
3245 | "mov %c[rbx](%0), %%"R"bx \n\t" | ||
3246 | "mov %c[rdx](%0), %%"R"dx \n\t" | ||
3247 | "mov %c[rsi](%0), %%"R"si \n\t" | ||
3248 | "mov %c[rdi](%0), %%"R"di \n\t" | ||
3249 | "mov %c[rbp](%0), %%"R"bp \n\t" | ||
2962 | #ifdef CONFIG_X86_64 | 3250 | #ifdef CONFIG_X86_64 |
2963 | "mov %c[cr2](%0), %%rax \n\t" | ||
2964 | "mov %%rax, %%cr2 \n\t" | ||
2965 | "mov %c[rax](%0), %%rax \n\t" | ||
2966 | "mov %c[rbx](%0), %%rbx \n\t" | ||
2967 | "mov %c[rdx](%0), %%rdx \n\t" | ||
2968 | "mov %c[rsi](%0), %%rsi \n\t" | ||
2969 | "mov %c[rdi](%0), %%rdi \n\t" | ||
2970 | "mov %c[rbp](%0), %%rbp \n\t" | ||
2971 | "mov %c[r8](%0), %%r8 \n\t" | 3251 | "mov %c[r8](%0), %%r8 \n\t" |
2972 | "mov %c[r9](%0), %%r9 \n\t" | 3252 | "mov %c[r9](%0), %%r9 \n\t" |
2973 | "mov %c[r10](%0), %%r10 \n\t" | 3253 | "mov %c[r10](%0), %%r10 \n\t" |
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2976 | "mov %c[r13](%0), %%r13 \n\t" | 3256 | "mov %c[r13](%0), %%r13 \n\t" |
2977 | "mov %c[r14](%0), %%r14 \n\t" | 3257 | "mov %c[r14](%0), %%r14 \n\t" |
2978 | "mov %c[r15](%0), %%r15 \n\t" | 3258 | "mov %c[r15](%0), %%r15 \n\t" |
2979 | "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ | ||
2980 | #else | ||
2981 | "mov %c[cr2](%0), %%eax \n\t" | ||
2982 | "mov %%eax, %%cr2 \n\t" | ||
2983 | "mov %c[rax](%0), %%eax \n\t" | ||
2984 | "mov %c[rbx](%0), %%ebx \n\t" | ||
2985 | "mov %c[rdx](%0), %%edx \n\t" | ||
2986 | "mov %c[rsi](%0), %%esi \n\t" | ||
2987 | "mov %c[rdi](%0), %%edi \n\t" | ||
2988 | "mov %c[rbp](%0), %%ebp \n\t" | ||
2989 | "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ | ||
2990 | #endif | 3259 | #endif |
3260 | "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ | ||
3261 | |||
2991 | /* Enter guest mode */ | 3262 | /* Enter guest mode */ |
2992 | "jne .Llaunched \n\t" | 3263 | "jne .Llaunched \n\t" |
2993 | __ex(ASM_VMX_VMLAUNCH) "\n\t" | 3264 | __ex(ASM_VMX_VMLAUNCH) "\n\t" |
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2995 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" | 3266 | ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" |
2996 | ".Lkvm_vmx_return: " | 3267 | ".Lkvm_vmx_return: " |
2997 | /* Save guest registers, load host registers, keep flags */ | 3268 | /* Save guest registers, load host registers, keep flags */ |
3269 | "xchg %0, (%%"R"sp) \n\t" | ||
3270 | "mov %%"R"ax, %c[rax](%0) \n\t" | ||
3271 | "mov %%"R"bx, %c[rbx](%0) \n\t" | ||
3272 | "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" | ||
3273 | "mov %%"R"dx, %c[rdx](%0) \n\t" | ||
3274 | "mov %%"R"si, %c[rsi](%0) \n\t" | ||
3275 | "mov %%"R"di, %c[rdi](%0) \n\t" | ||
3276 | "mov %%"R"bp, %c[rbp](%0) \n\t" | ||
2998 | #ifdef CONFIG_X86_64 | 3277 | #ifdef CONFIG_X86_64 |
2999 | "xchg %0, (%%rsp) \n\t" | ||
3000 | "mov %%rax, %c[rax](%0) \n\t" | ||
3001 | "mov %%rbx, %c[rbx](%0) \n\t" | ||
3002 | "pushq (%%rsp); popq %c[rcx](%0) \n\t" | ||
3003 | "mov %%rdx, %c[rdx](%0) \n\t" | ||
3004 | "mov %%rsi, %c[rsi](%0) \n\t" | ||
3005 | "mov %%rdi, %c[rdi](%0) \n\t" | ||
3006 | "mov %%rbp, %c[rbp](%0) \n\t" | ||
3007 | "mov %%r8, %c[r8](%0) \n\t" | 3278 | "mov %%r8, %c[r8](%0) \n\t" |
3008 | "mov %%r9, %c[r9](%0) \n\t" | 3279 | "mov %%r9, %c[r9](%0) \n\t" |
3009 | "mov %%r10, %c[r10](%0) \n\t" | 3280 | "mov %%r10, %c[r10](%0) \n\t" |
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3012 | "mov %%r13, %c[r13](%0) \n\t" | 3283 | "mov %%r13, %c[r13](%0) \n\t" |
3013 | "mov %%r14, %c[r14](%0) \n\t" | 3284 | "mov %%r14, %c[r14](%0) \n\t" |
3014 | "mov %%r15, %c[r15](%0) \n\t" | 3285 | "mov %%r15, %c[r15](%0) \n\t" |
3015 | "mov %%cr2, %%rax \n\t" | ||
3016 | "mov %%rax, %c[cr2](%0) \n\t" | ||
3017 | |||
3018 | "pop %%rbp; pop %%rbp; pop %%rdx \n\t" | ||
3019 | #else | ||
3020 | "xchg %0, (%%esp) \n\t" | ||
3021 | "mov %%eax, %c[rax](%0) \n\t" | ||
3022 | "mov %%ebx, %c[rbx](%0) \n\t" | ||
3023 | "pushl (%%esp); popl %c[rcx](%0) \n\t" | ||
3024 | "mov %%edx, %c[rdx](%0) \n\t" | ||
3025 | "mov %%esi, %c[rsi](%0) \n\t" | ||
3026 | "mov %%edi, %c[rdi](%0) \n\t" | ||
3027 | "mov %%ebp, %c[rbp](%0) \n\t" | ||
3028 | "mov %%cr2, %%eax \n\t" | ||
3029 | "mov %%eax, %c[cr2](%0) \n\t" | ||
3030 | |||
3031 | "pop %%ebp; pop %%ebp; pop %%edx \n\t" | ||
3032 | #endif | 3286 | #endif |
3287 | "mov %%cr2, %%"R"ax \n\t" | ||
3288 | "mov %%"R"ax, %c[cr2](%0) \n\t" | ||
3289 | |||
3290 | "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" | ||
3033 | "setbe %c[fail](%0) \n\t" | 3291 | "setbe %c[fail](%0) \n\t" |
3034 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | 3292 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), |
3035 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | 3293 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), |
3036 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | 3294 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), |
3295 | [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), | ||
3037 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | 3296 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), |
3038 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | 3297 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), |
3039 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | 3298 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), |
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3053 | #endif | 3312 | #endif |
3054 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) | 3313 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) |
3055 | : "cc", "memory" | 3314 | : "cc", "memory" |
3315 | , R"bx", R"di", R"si" | ||
3056 | #ifdef CONFIG_X86_64 | 3316 | #ifdef CONFIG_X86_64 |
3057 | , "rbx", "rdi", "rsi" | ||
3058 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | 3317 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
3059 | #else | ||
3060 | , "ebx", "edi", "rsi" | ||
3061 | #endif | 3318 | #endif |
3062 | ); | 3319 | ); |
3063 | 3320 | ||
3321 | vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); | ||
3322 | vcpu->arch.regs_dirty = 0; | ||
3323 | |||
3064 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 3324 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); |
3065 | if (vmx->rmode.irq.pending) | 3325 | if (vmx->rmode.irq.pending) |
3066 | fixup_rmode_irq(vmx); | 3326 | fixup_rmode_irq(vmx); |
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
3080 | KVMTRACE_0D(NMI, vcpu, handler); | 3340 | KVMTRACE_0D(NMI, vcpu, handler); |
3081 | asm("int $2"); | 3341 | asm("int $2"); |
3082 | } | 3342 | } |
3343 | |||
3344 | vmx_complete_interrupts(vmx); | ||
3083 | } | 3345 | } |
3084 | 3346 | ||
3347 | #undef R | ||
3348 | #undef Q | ||
3349 | |||
3085 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | 3350 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) |
3086 | { | 3351 | { |
3087 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 3352 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
3224 | .set_idt = vmx_set_idt, | 3489 | .set_idt = vmx_set_idt, |
3225 | .get_gdt = vmx_get_gdt, | 3490 | .get_gdt = vmx_get_gdt, |
3226 | .set_gdt = vmx_set_gdt, | 3491 | .set_gdt = vmx_set_gdt, |
3227 | .cache_regs = vcpu_load_rsp_rip, | 3492 | .cache_reg = vmx_cache_reg, |
3228 | .decache_regs = vcpu_put_rsp_rip, | ||
3229 | .get_rflags = vmx_get_rflags, | 3493 | .get_rflags = vmx_get_rflags, |
3230 | .set_rflags = vmx_set_rflags, | 3494 | .set_rflags = vmx_set_rflags, |
3231 | 3495 | ||
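Throughout the vmx.c changes above, direct vcpu->arch.regs[] accesses and the vcpu_load_rsp_rip()/vcpu_put_rsp_rip() pair give way to kvm_register_read()/kvm_register_write() and the new ->cache_reg hook, so RSP and RIP are fetched from the VMCS only on first use and written back only when dirty. The accessors live in kvm_cache_regs.h, which is not shown in this diff; a rough sketch of their likely shape, inferred from how regs_avail and regs_dirty are used in this patch:

	/* Sketch, assuming the accessors in kvm_cache_regs.h look roughly like this. */
	static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
	{
		if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
			kvm_x86_ops->cache_reg(vcpu, reg);	/* e.g. vmx_cache_reg() */
		return vcpu->arch.regs[reg];
	}

	static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
					      unsigned long val)
	{
		vcpu->arch.regs[reg] = val;
		__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
		__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
	}

kvm_rip_read()/kvm_rip_write() are presumably these helpers specialised for VCPU_REGS_RIP, which is why vmx_vcpu_run() above only rewrites GUEST_RSP and GUEST_RIP when the corresponding bit is set in regs_dirty.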
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 17e25995b65b..3e010d21fdd7 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h | |||
@@ -331,9 +331,6 @@ enum vmcs_field { | |||
331 | 331 | ||
332 | #define AR_RESERVD_MASK 0xfffe0f00 | 332 | #define AR_RESERVD_MASK 0xfffe0f00 |
333 | 333 | ||
334 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | ||
335 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | ||
336 | |||
337 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | 334 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 |
338 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 | 335 | #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 |
339 | 336 | ||
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb3..4f0677d1eae8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -4,10 +4,14 @@ | |||
4 | * derived from drivers/kvm/kvm_main.c | 4 | * derived from drivers/kvm/kvm_main.c |
5 | * | 5 | * |
6 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
7 | * Copyright (C) 2008 Qumranet, Inc. | ||
8 | * Copyright IBM Corporation, 2008 | ||
7 | * | 9 | * |
8 | * Authors: | 10 | * Authors: |
9 | * Avi Kivity <avi@qumranet.com> | 11 | * Avi Kivity <avi@qumranet.com> |
10 | * Yaniv Kamay <yaniv@qumranet.com> | 12 | * Yaniv Kamay <yaniv@qumranet.com> |
13 | * Amit Shah <amit.shah@qumranet.com> | ||
14 | * Ben-Ami Yassour <benami@il.ibm.com> | ||
11 | * | 15 | * |
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | 16 | * This work is licensed under the terms of the GNU GPL, version 2. See |
13 | * the COPYING file in the top-level directory. | 17 | * the COPYING file in the top-level directory. |
@@ -19,14 +23,18 @@ | |||
19 | #include "mmu.h" | 23 | #include "mmu.h" |
20 | #include "i8254.h" | 24 | #include "i8254.h" |
21 | #include "tss.h" | 25 | #include "tss.h" |
26 | #include "kvm_cache_regs.h" | ||
27 | #include "x86.h" | ||
22 | 28 | ||
23 | #include <linux/clocksource.h> | 29 | #include <linux/clocksource.h> |
30 | #include <linux/interrupt.h> | ||
24 | #include <linux/kvm.h> | 31 | #include <linux/kvm.h> |
25 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
26 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
27 | #include <linux/module.h> | 34 | #include <linux/module.h> |
28 | #include <linux/mman.h> | 35 | #include <linux/mman.h> |
29 | #include <linux/highmem.h> | 36 | #include <linux/highmem.h> |
37 | #include <linux/intel-iommu.h> | ||
30 | 38 | ||
31 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
32 | #include <asm/msr.h> | 40 | #include <asm/msr.h> |
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, | |||
61 | struct kvm_cpuid_entry2 __user *entries); | 69 | struct kvm_cpuid_entry2 __user *entries); |
62 | 70 | ||
63 | struct kvm_x86_ops *kvm_x86_ops; | 71 | struct kvm_x86_ops *kvm_x86_ops; |
72 | EXPORT_SYMBOL_GPL(kvm_x86_ops); | ||
64 | 73 | ||
65 | struct kvm_stats_debugfs_item debugfs_entries[] = { | 74 | struct kvm_stats_debugfs_item debugfs_entries[] = { |
66 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | 75 | { "pf_fixed", VCPU_STAT(pf_fixed) }, |
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
83 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | 92 | { "fpu_reload", VCPU_STAT(fpu_reload) }, |
84 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | 93 | { "insn_emulation", VCPU_STAT(insn_emulation) }, |
85 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | 94 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, |
95 | { "irq_injections", VCPU_STAT(irq_injections) }, | ||
86 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | 96 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, |
87 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | 97 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, |
88 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | 98 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, |
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
90 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | 100 | { "mmu_flooded", VM_STAT(mmu_flooded) }, |
91 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | 101 | { "mmu_recycled", VM_STAT(mmu_recycled) }, |
92 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | 102 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, |
103 | { "mmu_unsync", VM_STAT(mmu_unsync) }, | ||
93 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | 104 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, |
94 | { "largepages", VM_STAT(lpages) }, | 105 | { "largepages", VM_STAT(lpages) }, |
95 | { NULL } | 106 | { NULL } |
96 | }; | 107 | }; |
97 | 108 | ||
98 | |||
99 | unsigned long segment_base(u16 selector) | 109 | unsigned long segment_base(u16 selector) |
100 | { | 110 | { |
101 | struct descriptor_table gdt; | 111 | struct descriptor_table gdt; |
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); | |||
352 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 362 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
353 | { | 363 | { |
354 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 364 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { |
365 | kvm_mmu_sync_roots(vcpu); | ||
355 | kvm_mmu_flush_tlb(vcpu); | 366 | kvm_mmu_flush_tlb(vcpu); |
356 | return; | 367 | return; |
357 | } | 368 | } |
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * | |||
564 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); | 575 | hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); |
565 | 576 | ||
566 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", | 577 | pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", |
567 | __FUNCTION__, tsc_khz, hv_clock->tsc_shift, | 578 | __func__, tsc_khz, hv_clock->tsc_shift, |
568 | hv_clock->tsc_to_system_mul); | 579 | hv_clock->tsc_to_system_mul); |
569 | } | 580 | } |
570 | 581 | ||
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
662 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", | 673 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", |
663 | __func__, data); | 674 | __func__, data); |
664 | break; | 675 | break; |
676 | case MSR_IA32_DEBUGCTLMSR: | ||
677 | if (!data) { | ||
678 | /* We support the non-activated case already */ | ||
679 | break; | ||
680 | } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { | ||
681 | /* Values other than LBR and BTF are vendor-specific, | ||
682 | thus reserved and should throw a #GP */ | ||
683 | return 1; | ||
684 | } | ||
685 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", | ||
686 | __func__, data); | ||
687 | break; | ||
665 | case MSR_IA32_UCODE_REV: | 688 | case MSR_IA32_UCODE_REV: |
666 | case MSR_IA32_UCODE_WRITE: | 689 | case MSR_IA32_UCODE_WRITE: |
667 | break; | 690 | break; |
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
692 | /* ...but clean it before doing the actual write */ | 715 | /* ...but clean it before doing the actual write */ |
693 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); | 716 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); |
694 | 717 | ||
695 | down_read(¤t->mm->mmap_sem); | ||
696 | vcpu->arch.time_page = | 718 | vcpu->arch.time_page = |
697 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); | 719 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); |
698 | up_read(¤t->mm->mmap_sem); | ||
699 | 720 | ||
700 | if (is_error_page(vcpu->arch.time_page)) { | 721 | if (is_error_page(vcpu->arch.time_page)) { |
701 | kvm_release_page_clean(vcpu->arch.time_page); | 722 | kvm_release_page_clean(vcpu->arch.time_page); |
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
752 | case MSR_IA32_MC0_MISC+8: | 773 | case MSR_IA32_MC0_MISC+8: |
753 | case MSR_IA32_MC0_MISC+12: | 774 | case MSR_IA32_MC0_MISC+12: |
754 | case MSR_IA32_MC0_MISC+16: | 775 | case MSR_IA32_MC0_MISC+16: |
776 | case MSR_IA32_MC0_MISC+20: | ||
755 | case MSR_IA32_UCODE_REV: | 777 | case MSR_IA32_UCODE_REV: |
756 | case MSR_IA32_EBL_CR_POWERON: | 778 | case MSR_IA32_EBL_CR_POWERON: |
779 | case MSR_IA32_DEBUGCTLMSR: | ||
780 | case MSR_IA32_LASTBRANCHFROMIP: | ||
781 | case MSR_IA32_LASTBRANCHTOIP: | ||
782 | case MSR_IA32_LASTINTFROMIP: | ||
783 | case MSR_IA32_LASTINTTOIP: | ||
757 | data = 0; | 784 | data = 0; |
758 | break; | 785 | break; |
759 | case MSR_MTRRcap: | 786 | case MSR_MTRRcap: |
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
901 | case KVM_CAP_PV_MMU: | 928 | case KVM_CAP_PV_MMU: |
902 | r = !tdp_enabled; | 929 | r = !tdp_enabled; |
903 | break; | 930 | break; |
931 | case KVM_CAP_IOMMU: | ||
932 | r = intel_iommu_found(); | ||
933 | break; | ||
904 | default: | 934 | default: |
905 | r = 0; | 935 | r = 0; |
906 | break; | 936 | break; |
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1303 | struct kvm_vcpu *vcpu = filp->private_data; | 1333 | struct kvm_vcpu *vcpu = filp->private_data; |
1304 | void __user *argp = (void __user *)arg; | 1334 | void __user *argp = (void __user *)arg; |
1305 | int r; | 1335 | int r; |
1336 | struct kvm_lapic_state *lapic = NULL; | ||
1306 | 1337 | ||
1307 | switch (ioctl) { | 1338 | switch (ioctl) { |
1308 | case KVM_GET_LAPIC: { | 1339 | case KVM_GET_LAPIC: { |
1309 | struct kvm_lapic_state lapic; | 1340 | lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
1310 | 1341 | ||
1311 | memset(&lapic, 0, sizeof lapic); | 1342 | r = -ENOMEM; |
1312 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | 1343 | if (!lapic) |
1344 | goto out; | ||
1345 | r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); | ||
1313 | if (r) | 1346 | if (r) |
1314 | goto out; | 1347 | goto out; |
1315 | r = -EFAULT; | 1348 | r = -EFAULT; |
1316 | if (copy_to_user(argp, &lapic, sizeof lapic)) | 1349 | if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) |
1317 | goto out; | 1350 | goto out; |
1318 | r = 0; | 1351 | r = 0; |
1319 | break; | 1352 | break; |
1320 | } | 1353 | } |
1321 | case KVM_SET_LAPIC: { | 1354 | case KVM_SET_LAPIC: { |
1322 | struct kvm_lapic_state lapic; | 1355 | lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); |
1323 | 1356 | r = -ENOMEM; | |
1357 | if (!lapic) | ||
1358 | goto out; | ||
1324 | r = -EFAULT; | 1359 | r = -EFAULT; |
1325 | if (copy_from_user(&lapic, argp, sizeof lapic)) | 1360 | if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) |
1326 | goto out; | 1361 | goto out; |
1327 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; | 1362 | r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); |
1328 | if (r) | 1363 | if (r) |
1329 | goto out; | 1364 | goto out; |
1330 | r = 0; | 1365 | r = 0; |
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, | |||
1422 | r = -EINVAL; | 1457 | r = -EINVAL; |
1423 | } | 1458 | } |
1424 | out: | 1459 | out: |
1460 | if (lapic) | ||
1461 | kfree(lapic); | ||
1425 | return r; | 1462 | return r; |
1426 | } | 1463 | } |
1427 | 1464 | ||
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1630 | struct kvm *kvm = filp->private_data; | 1667 | struct kvm *kvm = filp->private_data; |
1631 | void __user *argp = (void __user *)arg; | 1668 | void __user *argp = (void __user *)arg; |
1632 | int r = -EINVAL; | 1669 | int r = -EINVAL; |
1670 | /* | ||
1671 | * This union makes it completely explicit to gcc-3.x | ||
1672 | * that these two variables' stack usage should be | ||
1673 | * combined, not added together. | ||
1674 | */ | ||
1675 | union { | ||
1676 | struct kvm_pit_state ps; | ||
1677 | struct kvm_memory_alias alias; | ||
1678 | } u; | ||
1633 | 1679 | ||
1634 | switch (ioctl) { | 1680 | switch (ioctl) { |
1635 | case KVM_SET_TSS_ADDR: | 1681 | case KVM_SET_TSS_ADDR: |
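The union added at the top of kvm_arch_vm_ioctl() in the hunk above works around gcc-3.x reserving a separate stack slot for each large local declared in the switch cases, so the frame grows with the sum of their sizes rather than the maximum. A stand-alone, hypothetical illustration of the effect (struct names and sizes invented for the example):

	struct big_a { char buf[512]; };
	struct big_b { char buf[768]; };

	void use_a(struct big_a *a);
	void use_b(struct big_b *b);

	void separate_locals(int which)	/* old gcc may reserve 512 + 768 bytes */
	{
		struct big_a a;
		struct big_b b;

		if (which)
			use_a(&a);
		else
			use_b(&b);
	}

	void shared_slot(int which)	/* frame only needs max(512, 768) bytes */
	{
		union {
			struct big_a a;
			struct big_b b;
		} u;

		if (which)
			use_a(&u.a);
		else
			use_b(&u.b);
	}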
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1661 | case KVM_GET_NR_MMU_PAGES: | 1707 | case KVM_GET_NR_MMU_PAGES: |
1662 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | 1708 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); |
1663 | break; | 1709 | break; |
1664 | case KVM_SET_MEMORY_ALIAS: { | 1710 | case KVM_SET_MEMORY_ALIAS: |
1665 | struct kvm_memory_alias alias; | ||
1666 | |||
1667 | r = -EFAULT; | 1711 | r = -EFAULT; |
1668 | if (copy_from_user(&alias, argp, sizeof alias)) | 1712 | if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) |
1669 | goto out; | 1713 | goto out; |
1670 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | 1714 | r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); |
1671 | if (r) | 1715 | if (r) |
1672 | goto out; | 1716 | goto out; |
1673 | break; | 1717 | break; |
1674 | } | ||
1675 | case KVM_CREATE_IRQCHIP: | 1718 | case KVM_CREATE_IRQCHIP: |
1676 | r = -ENOMEM; | 1719 | r = -ENOMEM; |
1677 | kvm->arch.vpic = kvm_create_pic(kvm); | 1720 | kvm->arch.vpic = kvm_create_pic(kvm); |
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1699 | goto out; | 1742 | goto out; |
1700 | if (irqchip_in_kernel(kvm)) { | 1743 | if (irqchip_in_kernel(kvm)) { |
1701 | mutex_lock(&kvm->lock); | 1744 | mutex_lock(&kvm->lock); |
1702 | if (irq_event.irq < 16) | 1745 | kvm_set_irq(kvm, irq_event.irq, irq_event.level); |
1703 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
1704 | irq_event.irq, | ||
1705 | irq_event.level); | ||
1706 | kvm_ioapic_set_irq(kvm->arch.vioapic, | ||
1707 | irq_event.irq, | ||
1708 | irq_event.level); | ||
1709 | mutex_unlock(&kvm->lock); | 1746 | mutex_unlock(&kvm->lock); |
1710 | r = 0; | 1747 | r = 0; |
1711 | } | 1748 | } |
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1713 | } | 1750 | } |
1714 | case KVM_GET_IRQCHIP: { | 1751 | case KVM_GET_IRQCHIP: { |
1715 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 1752 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
1716 | struct kvm_irqchip chip; | 1753 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); |
1717 | 1754 | ||
1718 | r = -EFAULT; | 1755 | r = -ENOMEM; |
1719 | if (copy_from_user(&chip, argp, sizeof chip)) | 1756 | if (!chip) |
1720 | goto out; | 1757 | goto out; |
1758 | r = -EFAULT; | ||
1759 | if (copy_from_user(chip, argp, sizeof *chip)) | ||
1760 | goto get_irqchip_out; | ||
1721 | r = -ENXIO; | 1761 | r = -ENXIO; |
1722 | if (!irqchip_in_kernel(kvm)) | 1762 | if (!irqchip_in_kernel(kvm)) |
1723 | goto out; | 1763 | goto get_irqchip_out; |
1724 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | 1764 | r = kvm_vm_ioctl_get_irqchip(kvm, chip); |
1725 | if (r) | 1765 | if (r) |
1726 | goto out; | 1766 | goto get_irqchip_out; |
1727 | r = -EFAULT; | 1767 | r = -EFAULT; |
1728 | if (copy_to_user(argp, &chip, sizeof chip)) | 1768 | if (copy_to_user(argp, chip, sizeof *chip)) |
1729 | goto out; | 1769 | goto get_irqchip_out; |
1730 | r = 0; | 1770 | r = 0; |
1771 | get_irqchip_out: | ||
1772 | kfree(chip); | ||
1773 | if (r) | ||
1774 | goto out; | ||
1731 | break; | 1775 | break; |
1732 | } | 1776 | } |
1733 | case KVM_SET_IRQCHIP: { | 1777 | case KVM_SET_IRQCHIP: { |
1734 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | 1778 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
1735 | struct kvm_irqchip chip; | 1779 | struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); |
1736 | 1780 | ||
1737 | r = -EFAULT; | 1781 | r = -ENOMEM; |
1738 | if (copy_from_user(&chip, argp, sizeof chip)) | 1782 | if (!chip) |
1739 | goto out; | 1783 | goto out; |
1784 | r = -EFAULT; | ||
1785 | if (copy_from_user(chip, argp, sizeof *chip)) | ||
1786 | goto set_irqchip_out; | ||
1740 | r = -ENXIO; | 1787 | r = -ENXIO; |
1741 | if (!irqchip_in_kernel(kvm)) | 1788 | if (!irqchip_in_kernel(kvm)) |
1742 | goto out; | 1789 | goto set_irqchip_out; |
1743 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | 1790 | r = kvm_vm_ioctl_set_irqchip(kvm, chip); |
1744 | if (r) | 1791 | if (r) |
1745 | goto out; | 1792 | goto set_irqchip_out; |
1746 | r = 0; | 1793 | r = 0; |
1794 | set_irqchip_out: | ||
1795 | kfree(chip); | ||
1796 | if (r) | ||
1797 | goto out; | ||
1747 | break; | 1798 | break; |
1748 | } | 1799 | } |
1749 | case KVM_GET_PIT: { | 1800 | case KVM_GET_PIT: { |
1750 | struct kvm_pit_state ps; | ||
1751 | r = -EFAULT; | 1801 | r = -EFAULT; |
1752 | if (copy_from_user(&ps, argp, sizeof ps)) | 1802 | if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) |
1753 | goto out; | 1803 | goto out; |
1754 | r = -ENXIO; | 1804 | r = -ENXIO; |
1755 | if (!kvm->arch.vpit) | 1805 | if (!kvm->arch.vpit) |
1756 | goto out; | 1806 | goto out; |
1757 | r = kvm_vm_ioctl_get_pit(kvm, &ps); | 1807 | r = kvm_vm_ioctl_get_pit(kvm, &u.ps); |
1758 | if (r) | 1808 | if (r) |
1759 | goto out; | 1809 | goto out; |
1760 | r = -EFAULT; | 1810 | r = -EFAULT; |
1761 | if (copy_to_user(argp, &ps, sizeof ps)) | 1811 | if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) |
1762 | goto out; | 1812 | goto out; |
1763 | r = 0; | 1813 | r = 0; |
1764 | break; | 1814 | break; |
1765 | } | 1815 | } |
1766 | case KVM_SET_PIT: { | 1816 | case KVM_SET_PIT: { |
1767 | struct kvm_pit_state ps; | ||
1768 | r = -EFAULT; | 1817 | r = -EFAULT; |
1769 | if (copy_from_user(&ps, argp, sizeof ps)) | 1818 | if (copy_from_user(&u.ps, argp, sizeof u.ps)) |
1770 | goto out; | 1819 | goto out; |
1771 | r = -ENXIO; | 1820 | r = -ENXIO; |
1772 | if (!kvm->arch.vpit) | 1821 | if (!kvm->arch.vpit) |
1773 | goto out; | 1822 | goto out; |
1774 | r = kvm_vm_ioctl_set_pit(kvm, &ps); | 1823 | r = kvm_vm_ioctl_set_pit(kvm, &u.ps); |
1775 | if (r) | 1824 | if (r) |
1776 | goto out; | 1825 | goto out; |
1777 | r = 0; | 1826 | r = 0; |
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
2018 | 2067 | ||
2019 | val = *(u64 *)new; | 2068 | val = *(u64 *)new; |
2020 | 2069 | ||
2021 | down_read(¤t->mm->mmap_sem); | ||
2022 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 2070 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
2023 | up_read(¤t->mm->mmap_sem); | ||
2024 | 2071 | ||
2025 | kaddr = kmap_atomic(page, KM_USER0); | 2072 | kaddr = kmap_atomic(page, KM_USER0); |
2026 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); | 2073 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); |
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | |||
2040 | 2087 | ||
2041 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | 2088 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) |
2042 | { | 2089 | { |
2090 | kvm_mmu_invlpg(vcpu, address); | ||
2043 | return X86EMUL_CONTINUE; | 2091 | return X86EMUL_CONTINUE; |
2044 | } | 2092 | } |
2045 | 2093 | ||
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | |||
2080 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | 2128 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) |
2081 | { | 2129 | { |
2082 | u8 opcodes[4]; | 2130 | u8 opcodes[4]; |
2083 | unsigned long rip = vcpu->arch.rip; | 2131 | unsigned long rip = kvm_rip_read(vcpu); |
2084 | unsigned long rip_linear; | 2132 | unsigned long rip_linear; |
2085 | 2133 | ||
2086 | if (!printk_ratelimit()) | 2134 | if (!printk_ratelimit()) |
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = { | |||
2102 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | 2150 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
2103 | }; | 2151 | }; |
2104 | 2152 | ||
2153 | static void cache_all_regs(struct kvm_vcpu *vcpu) | ||
2154 | { | ||
2155 | kvm_register_read(vcpu, VCPU_REGS_RAX); | ||
2156 | kvm_register_read(vcpu, VCPU_REGS_RSP); | ||
2157 | kvm_register_read(vcpu, VCPU_REGS_RIP); | ||
2158 | vcpu->arch.regs_dirty = ~0; | ||
2159 | } | ||
2160 | |||
2105 | int emulate_instruction(struct kvm_vcpu *vcpu, | 2161 | int emulate_instruction(struct kvm_vcpu *vcpu, |
2106 | struct kvm_run *run, | 2162 | struct kvm_run *run, |
2107 | unsigned long cr2, | 2163 | unsigned long cr2, |
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2111 | int r; | 2167 | int r; |
2112 | struct decode_cache *c; | 2168 | struct decode_cache *c; |
2113 | 2169 | ||
2170 | kvm_clear_exception_queue(vcpu); | ||
2114 | vcpu->arch.mmio_fault_cr2 = cr2; | 2171 | vcpu->arch.mmio_fault_cr2 = cr2; |
2115 | kvm_x86_ops->cache_regs(vcpu); | 2172 | /* |
2173 | * TODO: fix x86_emulate.c to use guest_read/write_register | ||
2174 | * instead of direct ->regs accesses, can save hundred cycles | ||
2175 | * on Intel for instructions that don't read/change RSP, for | ||
2176 | * for example. | ||
2177 | */ | ||
2178 | cache_all_regs(vcpu); | ||
2116 | 2179 | ||
2117 | vcpu->mmio_is_write = 0; | 2180 | vcpu->mmio_is_write = 0; |
2118 | vcpu->arch.pio.string = 0; | 2181 | vcpu->arch.pio.string = 0; |
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
2172 | return EMULATE_DO_MMIO; | 2235 | return EMULATE_DO_MMIO; |
2173 | } | 2236 | } |
2174 | 2237 | ||
2175 | kvm_x86_ops->decache_regs(vcpu); | ||
2176 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | 2238 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
2177 | 2239 | ||
2178 | if (vcpu->mmio_is_write) { | 2240 | if (vcpu->mmio_is_write) { |
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2225 | struct kvm_pio_request *io = &vcpu->arch.pio; | 2287 | struct kvm_pio_request *io = &vcpu->arch.pio; |
2226 | long delta; | 2288 | long delta; |
2227 | int r; | 2289 | int r; |
2228 | 2290 | unsigned long val; | |
2229 | kvm_x86_ops->cache_regs(vcpu); | ||
2230 | 2291 | ||
2231 | if (!io->string) { | 2292 | if (!io->string) { |
2232 | if (io->in) | 2293 | if (io->in) { |
2233 | memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, | 2294 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2234 | io->size); | 2295 | memcpy(&val, vcpu->arch.pio_data, io->size); |
2296 | kvm_register_write(vcpu, VCPU_REGS_RAX, val); | ||
2297 | } | ||
2235 | } else { | 2298 | } else { |
2236 | if (io->in) { | 2299 | if (io->in) { |
2237 | r = pio_copy_data(vcpu); | 2300 | r = pio_copy_data(vcpu); |
2238 | if (r) { | 2301 | if (r) |
2239 | kvm_x86_ops->cache_regs(vcpu); | ||
2240 | return r; | 2302 | return r; |
2241 | } | ||
2242 | } | 2303 | } |
2243 | 2304 | ||
2244 | delta = 1; | 2305 | delta = 1; |
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu) | |||
2248 | * The size of the register should really depend on | 2309 | * The size of the register should really depend on |
2249 | * current address size. | 2310 | * current address size. |
2250 | */ | 2311 | */ |
2251 | vcpu->arch.regs[VCPU_REGS_RCX] -= delta; | 2312 | val = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2313 | val -= delta; | ||
2314 | kvm_register_write(vcpu, VCPU_REGS_RCX, val); | ||
2252 | } | 2315 | } |
2253 | if (io->down) | 2316 | if (io->down) |
2254 | delta = -delta; | 2317 | delta = -delta; |
2255 | delta *= io->size; | 2318 | delta *= io->size; |
2256 | if (io->in) | 2319 | if (io->in) { |
2257 | vcpu->arch.regs[VCPU_REGS_RDI] += delta; | 2320 | val = kvm_register_read(vcpu, VCPU_REGS_RDI); |
2258 | else | 2321 | val += delta; |
2259 | vcpu->arch.regs[VCPU_REGS_RSI] += delta; | 2322 | kvm_register_write(vcpu, VCPU_REGS_RDI, val); |
2323 | } else { | ||
2324 | val = kvm_register_read(vcpu, VCPU_REGS_RSI); | ||
2325 | val += delta; | ||
2326 | kvm_register_write(vcpu, VCPU_REGS_RSI, val); | ||
2327 | } | ||
2260 | } | 2328 | } |
2261 | 2329 | ||
2262 | kvm_x86_ops->decache_regs(vcpu); | ||
2263 | |||
2264 | io->count -= io->cur_count; | 2330 | io->count -= io->cur_count; |
2265 | io->cur_count = 0; | 2331 | io->cur_count = 0; |
2266 | 2332 | ||
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2313 | int size, unsigned port) | 2379 | int size, unsigned port) |
2314 | { | 2380 | { |
2315 | struct kvm_io_device *pio_dev; | 2381 | struct kvm_io_device *pio_dev; |
2382 | unsigned long val; | ||
2316 | 2383 | ||
2317 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2384 | vcpu->run->exit_reason = KVM_EXIT_IO; |
2318 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2385 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2333 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | 2400 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, |
2334 | handler); | 2401 | handler); |
2335 | 2402 | ||
2336 | kvm_x86_ops->cache_regs(vcpu); | 2403 | val = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2337 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); | 2404 | memcpy(vcpu->arch.pio_data, &val, 4); |
2338 | 2405 | ||
2339 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2406 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2340 | 2407 | ||
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) | |||
2492 | KVMTRACE_0D(HLT, vcpu, handler); | 2559 | KVMTRACE_0D(HLT, vcpu, handler); |
2493 | if (irqchip_in_kernel(vcpu->kvm)) { | 2560 | if (irqchip_in_kernel(vcpu->kvm)) { |
2494 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; | 2561 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; |
2495 | up_read(&vcpu->kvm->slots_lock); | ||
2496 | kvm_vcpu_block(vcpu); | ||
2497 | down_read(&vcpu->kvm->slots_lock); | ||
2498 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2499 | return -EINTR; | ||
2500 | return 1; | 2562 | return 1; |
2501 | } else { | 2563 | } else { |
2502 | vcpu->run->exit_reason = KVM_EXIT_HLT; | 2564 | vcpu->run->exit_reason = KVM_EXIT_HLT; |
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2519 | unsigned long nr, a0, a1, a2, a3, ret; | 2581 | unsigned long nr, a0, a1, a2, a3, ret; |
2520 | int r = 1; | 2582 | int r = 1; |
2521 | 2583 | ||
2522 | kvm_x86_ops->cache_regs(vcpu); | 2584 | nr = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2523 | 2585 | a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); | |
2524 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | 2586 | a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2525 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | 2587 | a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); |
2526 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | 2588 | a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); |
2527 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2528 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2529 | 2589 | ||
2530 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); | 2590 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); |
2531 | 2591 | ||
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2548 | ret = -KVM_ENOSYS; | 2608 | ret = -KVM_ENOSYS; |
2549 | break; | 2609 | break; |
2550 | } | 2610 | } |
2551 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | 2611 | kvm_register_write(vcpu, VCPU_REGS_RAX, ret); |
2552 | kvm_x86_ops->decache_regs(vcpu); | ||
2553 | ++vcpu->stat.hypercalls; | 2612 | ++vcpu->stat.hypercalls; |
2554 | return r; | 2613 | return r; |
2555 | } | 2614 | } |
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
2559 | { | 2618 | { |
2560 | char instruction[3]; | 2619 | char instruction[3]; |
2561 | int ret = 0; | 2620 | int ret = 0; |
2621 | unsigned long rip = kvm_rip_read(vcpu); | ||
2562 | 2622 | ||
2563 | 2623 | ||
2564 | /* | 2624 | /* |
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | |||
2568 | */ | 2628 | */ |
2569 | kvm_mmu_zap_all(vcpu->kvm); | 2629 | kvm_mmu_zap_all(vcpu->kvm); |
2570 | 2630 | ||
2571 | kvm_x86_ops->cache_regs(vcpu); | ||
2572 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | 2631 | kvm_x86_ops->patch_hypercall(vcpu, instruction); |
2573 | if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) | 2632 | if (emulator_write_emulated(rip, instruction, 3, vcpu) |
2574 | != X86EMUL_CONTINUE) | 2633 | != X86EMUL_CONTINUE) |
2575 | ret = -EFAULT; | 2634 | ret = -EFAULT; |
2576 | 2635 | ||
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2700 | u32 function, index; | 2759 | u32 function, index; |
2701 | struct kvm_cpuid_entry2 *e, *best; | 2760 | struct kvm_cpuid_entry2 *e, *best; |
2702 | 2761 | ||
2703 | kvm_x86_ops->cache_regs(vcpu); | 2762 | function = kvm_register_read(vcpu, VCPU_REGS_RAX); |
2704 | function = vcpu->arch.regs[VCPU_REGS_RAX]; | 2763 | index = kvm_register_read(vcpu, VCPU_REGS_RCX); |
2705 | index = vcpu->arch.regs[VCPU_REGS_RCX]; | 2764 | kvm_register_write(vcpu, VCPU_REGS_RAX, 0); |
2706 | vcpu->arch.regs[VCPU_REGS_RAX] = 0; | 2765 | kvm_register_write(vcpu, VCPU_REGS_RBX, 0); |
2707 | vcpu->arch.regs[VCPU_REGS_RBX] = 0; | 2766 | kvm_register_write(vcpu, VCPU_REGS_RCX, 0); |
2708 | vcpu->arch.regs[VCPU_REGS_RCX] = 0; | 2767 | kvm_register_write(vcpu, VCPU_REGS_RDX, 0); |
2709 | vcpu->arch.regs[VCPU_REGS_RDX] = 0; | ||
2710 | best = NULL; | 2768 | best = NULL; |
2711 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | 2769 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { |
2712 | e = &vcpu->arch.cpuid_entries[i]; | 2770 | e = &vcpu->arch.cpuid_entries[i]; |
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2724 | best = e; | 2782 | best = e; |
2725 | } | 2783 | } |
2726 | if (best) { | 2784 | if (best) { |
2727 | vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; | 2785 | kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); |
2728 | vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; | 2786 | kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); |
2729 | vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; | 2787 | kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); |
2730 | vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; | 2788 | kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); |
2731 | } | 2789 | } |
2732 | kvm_x86_ops->decache_regs(vcpu); | ||
2733 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2790 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2734 | KVMTRACE_5D(CPUID, vcpu, function, | 2791 | KVMTRACE_5D(CPUID, vcpu, function, |
2735 | (u32)vcpu->arch.regs[VCPU_REGS_RAX], | 2792 | (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), |
2736 | (u32)vcpu->arch.regs[VCPU_REGS_RBX], | 2793 | (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), |
2737 | (u32)vcpu->arch.regs[VCPU_REGS_RCX], | 2794 | (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), |
2738 | (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); | 2795 | (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); |
2739 | } | 2796 | } |
2740 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 2797 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
2741 | 2798 | ||
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) | |||
2776 | if (!apic || !apic->vapic_addr) | 2833 | if (!apic || !apic->vapic_addr) |
2777 | return; | 2834 | return; |
2778 | 2835 | ||
2779 | down_read(¤t->mm->mmap_sem); | ||
2780 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | 2836 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); |
2781 | up_read(¤t->mm->mmap_sem); | ||
2782 | 2837 | ||
2783 | vcpu->arch.apic->vapic_page = page; | 2838 | vcpu->arch.apic->vapic_page = page; |
2784 | } | 2839 | } |
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) | |||
2796 | up_read(&vcpu->kvm->slots_lock); | 2851 | up_read(&vcpu->kvm->slots_lock); |
2797 | } | 2852 | } |
2798 | 2853 | ||
2799 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2854 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2800 | { | 2855 | { |
2801 | int r; | 2856 | int r; |
2802 | 2857 | ||
2803 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { | ||
2804 | pr_debug("vcpu %d received sipi with vector # %x\n", | ||
2805 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | ||
2806 | kvm_lapic_reset(vcpu); | ||
2807 | r = kvm_x86_ops->vcpu_reset(vcpu); | ||
2808 | if (r) | ||
2809 | return r; | ||
2810 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
2811 | } | ||
2812 | |||
2813 | down_read(&vcpu->kvm->slots_lock); | ||
2814 | vapic_enter(vcpu); | ||
2815 | |||
2816 | preempted: | ||
2817 | if (vcpu->guest_debug.enabled) | ||
2818 | kvm_x86_ops->guest_debug_pre(vcpu); | ||
2819 | |||
2820 | again: | ||
2821 | if (vcpu->requests) | 2858 | if (vcpu->requests) |
2822 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | 2859 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) |
2823 | kvm_mmu_unload(vcpu); | 2860 | kvm_mmu_unload(vcpu); |
@@ -2829,6 +2866,8 @@ again: | |||
2829 | if (vcpu->requests) { | 2866 | if (vcpu->requests) { |
2830 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | 2867 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) |
2831 | __kvm_migrate_timers(vcpu); | 2868 | __kvm_migrate_timers(vcpu); |
2869 | if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) | ||
2870 | kvm_mmu_sync_roots(vcpu); | ||
2832 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | 2871 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) |
2833 | kvm_x86_ops->tlb_flush(vcpu); | 2872 | kvm_x86_ops->tlb_flush(vcpu); |
2834 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | 2873 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, |
@@ -2854,21 +2893,15 @@ again: | |||
2854 | 2893 | ||
2855 | local_irq_disable(); | 2894 | local_irq_disable(); |
2856 | 2895 | ||
2857 | if (vcpu->requests || need_resched()) { | 2896 | if (vcpu->requests || need_resched() || signal_pending(current)) { |
2858 | local_irq_enable(); | 2897 | local_irq_enable(); |
2859 | preempt_enable(); | 2898 | preempt_enable(); |
2860 | r = 1; | 2899 | r = 1; |
2861 | goto out; | 2900 | goto out; |
2862 | } | 2901 | } |
2863 | 2902 | ||
2864 | if (signal_pending(current)) { | 2903 | if (vcpu->guest_debug.enabled) |
2865 | local_irq_enable(); | 2904 | kvm_x86_ops->guest_debug_pre(vcpu); |
2866 | preempt_enable(); | ||
2867 | r = -EINTR; | ||
2868 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2869 | ++vcpu->stat.signal_exits; | ||
2870 | goto out; | ||
2871 | } | ||
2872 | 2905 | ||
2873 | vcpu->guest_mode = 1; | 2906 | vcpu->guest_mode = 1; |
2874 | /* | 2907 | /* |
@@ -2917,8 +2950,8 @@ again: | |||
2917 | * Profile KVM exit RIPs: | 2950 | * Profile KVM exit RIPs: |
2918 | */ | 2951 | */ |
2919 | if (unlikely(prof_on == KVM_PROFILING)) { | 2952 | if (unlikely(prof_on == KVM_PROFILING)) { |
2920 | kvm_x86_ops->cache_regs(vcpu); | 2953 | unsigned long rip = kvm_rip_read(vcpu); |
2921 | profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); | 2954 | profile_hit(KVM_PROFILING, (void *)rip); |
2922 | } | 2955 | } |
2923 | 2956 | ||
2924 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | 2957 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) |
@@ -2927,26 +2960,63 @@ again: | |||
2927 | kvm_lapic_sync_from_vapic(vcpu); | 2960 | kvm_lapic_sync_from_vapic(vcpu); |
2928 | 2961 | ||
2929 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | 2962 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); |
2963 | out: | ||
2964 | return r; | ||
2965 | } | ||
2930 | 2966 | ||
2931 | if (r > 0) { | 2967 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2932 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | 2968 | { |
2933 | r = -EINTR; | 2969 | int r; |
2934 | kvm_run->exit_reason = KVM_EXIT_INTR; | 2970 | |
2935 | ++vcpu->stat.request_irq_exits; | 2971 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { |
2936 | goto out; | 2972 | pr_debug("vcpu %d received sipi with vector # %x\n", |
2937 | } | 2973 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
2938 | if (!need_resched()) | 2974 | kvm_lapic_reset(vcpu); |
2939 | goto again; | 2975 | r = kvm_x86_ops->vcpu_reset(vcpu); |
2976 | if (r) | ||
2977 | return r; | ||
2978 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
2940 | } | 2979 | } |
2941 | 2980 | ||
2942 | out: | 2981 | down_read(&vcpu->kvm->slots_lock); |
2943 | up_read(&vcpu->kvm->slots_lock); | 2982 | vapic_enter(vcpu); |
2944 | if (r > 0) { | 2983 | |
2945 | kvm_resched(vcpu); | 2984 | r = 1; |
2946 | down_read(&vcpu->kvm->slots_lock); | 2985 | while (r > 0) { |
2947 | goto preempted; | 2986 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) |
2987 | r = vcpu_enter_guest(vcpu, kvm_run); | ||
2988 | else { | ||
2989 | up_read(&vcpu->kvm->slots_lock); | ||
2990 | kvm_vcpu_block(vcpu); | ||
2991 | down_read(&vcpu->kvm->slots_lock); | ||
2992 | if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) | ||
2993 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) | ||
2994 | vcpu->arch.mp_state = | ||
2995 | KVM_MP_STATE_RUNNABLE; | ||
2996 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2997 | r = -EINTR; | ||
2998 | } | ||
2999 | |||
3000 | if (r > 0) { | ||
3001 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
3002 | r = -EINTR; | ||
3003 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
3004 | ++vcpu->stat.request_irq_exits; | ||
3005 | } | ||
3006 | if (signal_pending(current)) { | ||
3007 | r = -EINTR; | ||
3008 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
3009 | ++vcpu->stat.signal_exits; | ||
3010 | } | ||
3011 | if (need_resched()) { | ||
3012 | up_read(&vcpu->kvm->slots_lock); | ||
3013 | kvm_resched(vcpu); | ||
3014 | down_read(&vcpu->kvm->slots_lock); | ||
3015 | } | ||
3016 | } | ||
2948 | } | 3017 | } |
2949 | 3018 | ||
3019 | up_read(&vcpu->kvm->slots_lock); | ||
2950 | post_kvm_run_save(vcpu, kvm_run); | 3020 | post_kvm_run_save(vcpu, kvm_run); |
2951 | 3021 | ||
2952 | vapic_exit(vcpu); | 3022 | vapic_exit(vcpu); |
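Note on the restructuring above: the old goto-based loop is split in two. vcpu_enter_guest() performs exactly one request-check/entry/exit cycle, and __vcpu_run() becomes a plain loop that either enters the guest or blocks a halted vcpu, then handles signals, userspace interrupt-injection requests and rescheduling between iterations. A condensed view of the new control flow, with locking, the SIPI path and the UNHALT handling omitted for brevity:

    /* Condensed sketch of the new loop; not the full function above. */
    static int vcpu_run_sketch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
    {
            int r = 1;

            while (r > 0) {
                    if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
                            r = vcpu_enter_guest(vcpu, kvm_run); /* one entry/exit cycle */
                    else
                            kvm_vcpu_block(vcpu);                /* halted: wait for an event */

                    if (r > 0 && signal_pending(current)) {
                            r = -EINTR;                          /* drop back to userspace */
                            kvm_run->exit_reason = KVM_EXIT_INTR;
                    }
                    if (r > 0 && need_resched())
                            kvm_resched(vcpu);
            }
            return r;
    }

A pending signal inside vcpu_enter_guest() now only aborts the entry (r = 1); the decision to return -EINTR is made once, in the outer loop.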
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2966 | 3036 | ||
2967 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { | 3037 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { |
2968 | kvm_vcpu_block(vcpu); | 3038 | kvm_vcpu_block(vcpu); |
3039 | clear_bit(KVM_REQ_UNHALT, &vcpu->requests); | ||
2969 | r = -EAGAIN; | 3040 | r = -EAGAIN; |
2970 | goto out; | 3041 | goto out; |
2971 | } | 3042 | } |
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2999 | } | 3070 | } |
3000 | } | 3071 | } |
3001 | #endif | 3072 | #endif |
3002 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | 3073 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) |
3003 | kvm_x86_ops->cache_regs(vcpu); | 3074 | kvm_register_write(vcpu, VCPU_REGS_RAX, |
3004 | vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | 3075 | kvm_run->hypercall.ret); |
3005 | kvm_x86_ops->decache_regs(vcpu); | ||
3006 | } | ||
3007 | 3076 | ||
3008 | r = __vcpu_run(vcpu, kvm_run); | 3077 | r = __vcpu_run(vcpu, kvm_run); |
3009 | 3078 | ||
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
3019 | { | 3088 | { |
3020 | vcpu_load(vcpu); | 3089 | vcpu_load(vcpu); |
3021 | 3090 | ||
3022 | kvm_x86_ops->cache_regs(vcpu); | 3091 | regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3023 | 3092 | regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); | |
3024 | regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3093 | regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3025 | regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3094 | regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3026 | regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3095 | regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3027 | regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3096 | regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3028 | regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; | 3097 | regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3029 | regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; | 3098 | regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3030 | regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
3031 | regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; | ||
3032 | #ifdef CONFIG_X86_64 | 3099 | #ifdef CONFIG_X86_64 |
3033 | regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; | 3100 | regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); |
3034 | regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; | 3101 | regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); |
3035 | regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; | 3102 | regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); |
3036 | regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; | 3103 | regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); |
3037 | regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; | 3104 | regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); |
3038 | regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; | 3105 | regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); |
3039 | regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; | 3106 | regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); |
3040 | regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; | 3107 | regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); |
3041 | #endif | 3108 | #endif |
3042 | 3109 | ||
3043 | regs->rip = vcpu->arch.rip; | 3110 | regs->rip = kvm_rip_read(vcpu); |
3044 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | 3111 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); |
3045 | 3112 | ||
3046 | /* | 3113 | /* |
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
3058 | { | 3125 | { |
3059 | vcpu_load(vcpu); | 3126 | vcpu_load(vcpu); |
3060 | 3127 | ||
3061 | vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; | 3128 | kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); |
3062 | vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; | 3129 | kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); |
3063 | vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; | 3130 | kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); |
3064 | vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; | 3131 | kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); |
3065 | vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; | 3132 | kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); |
3066 | vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; | 3133 | kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); |
3067 | vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; | 3134 | kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); |
3068 | vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; | 3135 | kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); |
3069 | #ifdef CONFIG_X86_64 | 3136 | #ifdef CONFIG_X86_64 |
3070 | vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; | 3137 | kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); |
3071 | vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; | 3138 | kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); |
3072 | vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; | 3139 | kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); |
3073 | vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; | 3140 | kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); |
3074 | vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; | 3141 | kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); |
3075 | vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; | 3142 | kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); |
3076 | vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; | 3143 | kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); |
3077 | vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; | 3144 | kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); |
3145 | |||
3078 | #endif | 3146 | #endif |
3079 | 3147 | ||
3080 | vcpu->arch.rip = regs->rip; | 3148 | kvm_rip_write(vcpu, regs->rip); |
3081 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | 3149 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); |
3082 | 3150 | ||
3083 | kvm_x86_ops->decache_regs(vcpu); | ||
3084 | 3151 | ||
3085 | vcpu->arch.exception.pending = false; | 3152 | vcpu->arch.exception.pending = false; |
3086 | 3153 | ||
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, | |||
3294 | return 0; | 3361 | return 0; |
3295 | } | 3362 | } |
3296 | 3363 | ||
3364 | static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) | ||
3365 | { | ||
3366 | struct kvm_segment segvar = { | ||
3367 | .base = selector << 4, | ||
3368 | .limit = 0xffff, | ||
3369 | .selector = selector, | ||
3370 | .type = 3, | ||
3371 | .present = 1, | ||
3372 | .dpl = 3, | ||
3373 | .db = 0, | ||
3374 | .s = 1, | ||
3375 | .l = 0, | ||
3376 | .g = 0, | ||
3377 | .avl = 0, | ||
3378 | .unusable = 0, | ||
3379 | }; | ||
3380 | kvm_x86_ops->set_segment(vcpu, &segvar, seg); | ||
3381 | return 0; | ||
3382 | } | ||
3383 | |||
3297 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | 3384 | int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, |
3298 | int type_bits, int seg) | 3385 | int type_bits, int seg) |
3299 | { | 3386 | { |
3300 | struct kvm_segment kvm_seg; | 3387 | struct kvm_segment kvm_seg; |
3301 | 3388 | ||
3389 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) | ||
3390 | return kvm_load_realmode_segment(vcpu, selector, seg); | ||
3302 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) | 3391 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) |
3303 | return 1; | 3392 | return 1; |
3304 | kvm_seg.type |= type_bits; | 3393 | kvm_seg.type |= type_bits; |
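Note on kvm_load_realmode_segment(), added above: it short-circuits descriptor-table lookups when the guest is not in protected mode. A real-mode segment has no descriptor; its base is simply the selector shifted left by four and its limit is 64 KiB. A small worked example of the addressing rule (illustrative helper, not part of the patch):

    /* Real-mode linear address: base = selector << 4, then add the offset. */
    static unsigned long realmode_linear(unsigned short selector, unsigned short offset)
    {
            unsigned long base = (unsigned long)selector << 4;  /* 0xf000  -> 0xf0000  */
            return base + offset;                               /* f000:fff0 -> 0xffff0 */
    }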
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, | |||
3316 | struct tss_segment_32 *tss) | 3405 | struct tss_segment_32 *tss) |
3317 | { | 3406 | { |
3318 | tss->cr3 = vcpu->arch.cr3; | 3407 | tss->cr3 = vcpu->arch.cr3; |
3319 | tss->eip = vcpu->arch.rip; | 3408 | tss->eip = kvm_rip_read(vcpu); |
3320 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); | 3409 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); |
3321 | tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3410 | tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3322 | tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3411 | tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3323 | tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3412 | tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3324 | tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3413 | tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
3325 | tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3414 | tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3326 | tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; | 3415 | tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3327 | tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; | 3416 | tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3328 | tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; | 3417 | tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3329 | |||
3330 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | 3418 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); |
3331 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | 3419 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); |
3332 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); | 3420 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); |
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, | |||
3342 | { | 3430 | { |
3343 | kvm_set_cr3(vcpu, tss->cr3); | 3431 | kvm_set_cr3(vcpu, tss->cr3); |
3344 | 3432 | ||
3345 | vcpu->arch.rip = tss->eip; | 3433 | kvm_rip_write(vcpu, tss->eip); |
3346 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); | 3434 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); |
3347 | 3435 | ||
3348 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; | 3436 | kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); |
3349 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; | 3437 | kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); |
3350 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; | 3438 | kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); |
3351 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; | 3439 | kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); |
3352 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; | 3440 | kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); |
3353 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; | 3441 | kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); |
3354 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; | 3442 | kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); |
3355 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; | 3443 | kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); |
3356 | 3444 | ||
3357 | if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) | 3445 | if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) |
3358 | return 1; | 3446 | return 1; |
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, | |||
3380 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, | 3468 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, |
3381 | struct tss_segment_16 *tss) | 3469 | struct tss_segment_16 *tss) |
3382 | { | 3470 | { |
3383 | tss->ip = vcpu->arch.rip; | 3471 | tss->ip = kvm_rip_read(vcpu); |
3384 | tss->flag = kvm_x86_ops->get_rflags(vcpu); | 3472 | tss->flag = kvm_x86_ops->get_rflags(vcpu); |
3385 | tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; | 3473 | tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); |
3386 | tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; | 3474 | tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); |
3387 | tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; | 3475 | tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); |
3388 | tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; | 3476 | tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); |
3389 | tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; | 3477 | tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); |
3390 | tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; | 3478 | tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); |
3391 | tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; | 3479 | tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); |
3392 | tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; | 3480 | tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); |
3393 | 3481 | ||
3394 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | 3482 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); |
3395 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | 3483 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); |
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, | |||
3402 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, | 3490 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, |
3403 | struct tss_segment_16 *tss) | 3491 | struct tss_segment_16 *tss) |
3404 | { | 3492 | { |
3405 | vcpu->arch.rip = tss->ip; | 3493 | kvm_rip_write(vcpu, tss->ip); |
3406 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); | 3494 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); |
3407 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; | 3495 | kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); |
3408 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; | 3496 | kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); |
3409 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; | 3497 | kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); |
3410 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; | 3498 | kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); |
3411 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; | 3499 | kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); |
3412 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; | 3500 | kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); |
3413 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; | 3501 | kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); |
3414 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; | 3502 | kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); |
3415 | 3503 | ||
3416 | if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) | 3504 | if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) |
3417 | return 1; | 3505 | return 1; |
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3534 | } | 3622 | } |
3535 | 3623 | ||
3536 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 3624 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
3537 | kvm_x86_ops->cache_regs(vcpu); | ||
3538 | 3625 | ||
3539 | if (nseg_desc.type & 8) | 3626 | if (nseg_desc.type & 8) |
3540 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, | 3627 | ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, |
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | |||
3559 | tr_seg.type = 11; | 3646 | tr_seg.type = 11; |
3560 | kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); | 3647 | kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); |
3561 | out: | 3648 | out: |
3562 | kvm_x86_ops->decache_regs(vcpu); | ||
3563 | return ret; | 3649 | return ret; |
3564 | } | 3650 | } |
3565 | EXPORT_SYMBOL_GPL(kvm_task_switch); | 3651 | EXPORT_SYMBOL_GPL(kvm_task_switch); |
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3622 | pr_debug("Set back pending irq %d\n", | 3708 | pr_debug("Set back pending irq %d\n", |
3623 | pending_vec); | 3709 | pending_vec); |
3624 | } | 3710 | } |
3711 | kvm_pic_clear_isr_ack(vcpu->kvm); | ||
3625 | } | 3712 | } |
3626 | 3713 | ||
3627 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | 3714 | kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); |
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
3634 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | 3721 | kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); |
3635 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | 3722 | kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); |
3636 | 3723 | ||
3724 | /* Older userspace won't unhalt the vcpu on reset. */ | ||
3725 | if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && | ||
3726 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && | ||
3727 | !(vcpu->arch.cr0 & X86_CR0_PE)) | ||
3728 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
3729 | |||
3637 | vcpu_put(vcpu); | 3730 | vcpu_put(vcpu); |
3638 | 3731 | ||
3639 | return 0; | 3732 | return 0; |
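Note on the heuristic added above: it recognises the architectural power-on state, i.e. real mode, CS selector 0xf000 with the cached base 0xffff0000, and RIP 0xfff0, which places the first instruction fetch at physical 0xfffffff0, the top of the 4 GiB address space where the BIOS ROM is aliased. Userspace that programs exactly this state is performing a reset, so vcpu 0 is made runnable even if it was halted. The arithmetic, as a tiny standalone illustration (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned long cs_base = 0xffff0000UL;   /* cached base checked by the heuristic */
            unsigned long rip     = 0xfff0UL;       /* RIP value checked by the heuristic   */

            printf("first fetch after reset: %#lx\n", cs_base + rip);  /* 0xfffffff0 */
            return 0;
    }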
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void) | |||
3918 | return ERR_PTR(-ENOMEM); | 4011 | return ERR_PTR(-ENOMEM); |
3919 | 4012 | ||
3920 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 4013 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
4014 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | ||
3921 | 4015 | ||
3922 | return kvm; | 4016 | return kvm; |
3923 | } | 4017 | } |
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
3950 | 4044 | ||
3951 | void kvm_arch_destroy_vm(struct kvm *kvm) | 4045 | void kvm_arch_destroy_vm(struct kvm *kvm) |
3952 | { | 4046 | { |
4047 | kvm_iommu_unmap_guest(kvm); | ||
4048 | kvm_free_all_assigned_devices(kvm); | ||
3953 | kvm_free_pit(kvm); | 4049 | kvm_free_pit(kvm); |
3954 | kfree(kvm->arch.vpic); | 4050 | kfree(kvm->arch.vpic); |
3955 | kfree(kvm->arch.vioapic); | 4051 | kfree(kvm->arch.vioapic); |
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
3981 | userspace_addr = do_mmap(NULL, 0, | 4077 | userspace_addr = do_mmap(NULL, 0, |
3982 | npages * PAGE_SIZE, | 4078 | npages * PAGE_SIZE, |
3983 | PROT_READ | PROT_WRITE, | 4079 | PROT_READ | PROT_WRITE, |
3984 | MAP_SHARED | MAP_ANONYMOUS, | 4080 | MAP_PRIVATE | MAP_ANONYMOUS, |
3985 | 0); | 4081 | 0); |
3986 | up_write(¤t->mm->mmap_sem); | 4082 | up_write(¤t->mm->mmap_sem); |
3987 | 4083 | ||
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 000000000000..6a4be78a7384 --- /dev/null +++ b/arch/x86/kvm/x86.h | |||
@@ -0,0 +1,22 @@ | |||
1 | #ifndef ARCH_X86_KVM_X86_H | ||
2 | #define ARCH_X86_KVM_X86_H | ||
3 | |||
4 | #include <linux/kvm_host.h> | ||
5 | |||
6 | static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) | ||
7 | { | ||
8 | vcpu->arch.exception.pending = false; | ||
9 | } | ||
10 | |||
11 | static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) | ||
12 | { | ||
13 | vcpu->arch.interrupt.pending = true; | ||
14 | vcpu->arch.interrupt.nr = vector; | ||
15 | } | ||
16 | |||
17 | static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) | ||
18 | { | ||
19 | vcpu->arch.interrupt.pending = false; | ||
20 | } | ||
21 | |||
22 | #endif | ||
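Note on the new arch/x86/kvm/x86.h: it collects the tiny helpers for the exception and interrupt queues in one place so x86.c and the vendor modules stop open-coding the pending flags. A hypothetical caller might look like this (illustration only; the function name and context are assumptions):

    /* Hypothetical reset-time cleanup using the new helpers. */
    static void reset_pending_events(struct kvm_vcpu *vcpu)
    {
            kvm_clear_exception_queue(vcpu);
            kvm_clear_interrupt_queue(vcpu);
    }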
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2f90468f8b1..ea051173b0da 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | 26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) |
27 | #else | 27 | #else |
28 | #include <linux/kvm_host.h> | 28 | #include <linux/kvm_host.h> |
29 | #include "kvm_cache_regs.h" | ||
29 | #define DPRINTF(x...) do {} while (0) | 30 | #define DPRINTF(x...) do {} while (0) |
30 | #endif | 31 | #endif |
31 | #include <linux/module.h> | 32 | #include <linux/module.h> |
@@ -46,25 +47,26 @@ | |||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | 47 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ |
47 | #define DstReg (2<<1) /* Register operand. */ | 48 | #define DstReg (2<<1) /* Register operand. */ |
48 | #define DstMem (3<<1) /* Memory operand. */ | 49 | #define DstMem (3<<1) /* Memory operand. */ |
49 | #define DstMask (3<<1) | 50 | #define DstAcc (4<<1) /* Destination Accumulator */ |
51 | #define DstMask (7<<1) | ||
50 | /* Source operand type. */ | 52 | /* Source operand type. */ |
51 | #define SrcNone (0<<3) /* No source operand. */ | 53 | #define SrcNone (0<<4) /* No source operand. */ |
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | 54 | #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ |
53 | #define SrcReg (1<<3) /* Register operand. */ | 55 | #define SrcReg (1<<4) /* Register operand. */ |
54 | #define SrcMem (2<<3) /* Memory operand. */ | 56 | #define SrcMem (2<<4) /* Memory operand. */ |
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | 57 | #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ |
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | 58 | #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ |
57 | #define SrcImm (5<<3) /* Immediate operand. */ | 59 | #define SrcImm (5<<4) /* Immediate operand. */ |
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | 60 | #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ |
59 | #define SrcMask (7<<3) | 61 | #define SrcMask (7<<4) |
60 | /* Generic ModRM decode. */ | 62 | /* Generic ModRM decode. */ |
61 | #define ModRM (1<<6) | 63 | #define ModRM (1<<7) |
62 | /* Destination is only written; never read. */ | 64 | /* Destination is only written; never read. */ |
63 | #define Mov (1<<7) | 65 | #define Mov (1<<8) |
64 | #define BitOp (1<<8) | 66 | #define BitOp (1<<9) |
65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | 67 | #define MemAbs (1<<10) /* Memory operand is absolute displacement */ |
66 | #define String (1<<10) /* String instruction (rep capable) */ | 68 | #define String (1<<12) /* String instruction (rep capable) */ |
67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | 69 | #define Stack (1<<13) /* Stack instruction (push/pop) */ |
68 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | 70 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ |
69 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ | 71 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ |
70 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ | 72 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ |
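Note on the flag relayout above: adding DstAcc means the destination field now needs values 0-4, so DstMask widens from two bits to three and every later flag shifts up — source types move from bits 3-5 to 4-6, ModRM from bit 6 to bit 7, and so on. An illustration of how a descriptor decomposes under the new layout, using the new 0x3c entry (cmp al, imm8) as the example (helper is illustrative, not part of the patch):

    static void show_decode_layout(void)
    {
            u16 d = ByteOp | DstAcc | SrcImm;     /* the new entry for opcode 0x3c */
            unsigned dst = (d & DstMask) >> 1;    /* 4 -> DstAcc (the accumulator) */
            unsigned src = (d & SrcMask) >> 4;    /* 5 -> SrcImm                   */

            (void)dst;
            (void)src;
    }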
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = { | |||
94 | /* 0x20 - 0x27 */ | 96 | /* 0x20 - 0x27 */ |
95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 97 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 98 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
97 | SrcImmByte, SrcImm, 0, 0, | 99 | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, |
98 | /* 0x28 - 0x2F */ | 100 | /* 0x28 - 0x2F */ |
99 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 101 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
100 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 102 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = { | |||
106 | /* 0x38 - 0x3F */ | 108 | /* 0x38 - 0x3F */ |
107 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 109 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
108 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | 110 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, |
109 | 0, 0, 0, 0, | 111 | ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, |
112 | 0, 0, | ||
110 | /* 0x40 - 0x47 */ | 113 | /* 0x40 - 0x47 */ |
111 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | 114 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, |
112 | /* 0x48 - 0x4F */ | 115 | /* 0x48 - 0x4F */ |
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = { | |||
153 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | 156 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, |
154 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | 157 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, |
155 | ByteOp | ImplicitOps | String, ImplicitOps | String, | 158 | ByteOp | ImplicitOps | String, ImplicitOps | String, |
156 | /* 0xB0 - 0xBF */ | 159 | /* 0xB0 - 0xB7 */ |
157 | 0, 0, 0, 0, 0, 0, 0, 0, | 160 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, |
158 | DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, | 161 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, |
162 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
163 | ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, | ||
164 | /* 0xB8 - 0xBF */ | ||
165 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
166 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
167 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
168 | DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, | ||
159 | /* 0xC0 - 0xC7 */ | 169 | /* 0xC0 - 0xC7 */ |
160 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | 170 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, |
161 | 0, ImplicitOps | Stack, 0, 0, | 171 | 0, ImplicitOps | Stack, 0, 0, |
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = { | |||
169 | /* 0xD8 - 0xDF */ | 179 | /* 0xD8 - 0xDF */ |
170 | 0, 0, 0, 0, 0, 0, 0, 0, | 180 | 0, 0, 0, 0, 0, 0, 0, 0, |
171 | /* 0xE0 - 0xE7 */ | 181 | /* 0xE0 - 0xE7 */ |
172 | 0, 0, 0, 0, 0, 0, 0, 0, | 182 | 0, 0, 0, 0, |
183 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
184 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
173 | /* 0xE8 - 0xEF */ | 185 | /* 0xE8 - 0xEF */ |
174 | ImplicitOps | Stack, SrcImm | ImplicitOps, | 186 | ImplicitOps | Stack, SrcImm | ImplicitOps, |
175 | ImplicitOps, SrcImmByte | ImplicitOps, | 187 | ImplicitOps, SrcImmByte | ImplicitOps, |
176 | 0, 0, 0, 0, | 188 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, |
189 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, | ||
177 | /* 0xF0 - 0xF7 */ | 190 | /* 0xF0 - 0xF7 */ |
178 | 0, 0, 0, 0, | 191 | 0, 0, 0, 0, |
179 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, | 192 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, |
180 | /* 0xF8 - 0xFF */ | 193 | /* 0xF8 - 0xFF */ |
181 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | 194 | ImplicitOps, 0, ImplicitOps, ImplicitOps, |
182 | 0, 0, Group | Group4, Group | Group5, | 195 | ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, |
183 | }; | 196 | }; |
184 | 197 | ||
185 | static u16 twobyte_table[256] = { | 198 | static u16 twobyte_table[256] = { |
@@ -268,15 +281,16 @@ static u16 group_table[] = { | |||
268 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 281 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
269 | 0, 0, 0, 0, | 282 | 0, 0, 0, 0, |
270 | [Group3*8] = | 283 | [Group3*8] = |
271 | DstMem | SrcImm | ModRM | SrcImm, 0, | 284 | DstMem | SrcImm | ModRM, 0, |
272 | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 285 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
273 | 0, 0, 0, 0, | 286 | 0, 0, 0, 0, |
274 | [Group4*8] = | 287 | [Group4*8] = |
275 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | 288 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, |
276 | 0, 0, 0, 0, 0, 0, | 289 | 0, 0, 0, 0, 0, 0, |
277 | [Group5*8] = | 290 | [Group5*8] = |
278 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, | 291 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, |
279 | SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, | 292 | SrcMem | ModRM | Stack, 0, |
293 | SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, | ||
280 | [Group7*8] = | 294 | [Group7*8] = |
281 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, | 295 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, |
282 | SrcNone | ModRM | DstMem | Mov, 0, | 296 | SrcNone | ModRM | DstMem | Mov, 0, |
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
839 | /* Shadow copy of register state. Committed on successful emulation. */ | 853 | /* Shadow copy of register state. Committed on successful emulation. */ |
840 | 854 | ||
841 | memset(c, 0, sizeof(struct decode_cache)); | 855 | memset(c, 0, sizeof(struct decode_cache)); |
842 | c->eip = ctxt->vcpu->arch.rip; | 856 | c->eip = kvm_rip_read(ctxt->vcpu); |
843 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); | 857 | ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); |
844 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | 858 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); |
845 | 859 | ||
@@ -1048,6 +1062,23 @@ done_prefixes: | |||
1048 | } | 1062 | } |
1049 | c->dst.type = OP_MEM; | 1063 | c->dst.type = OP_MEM; |
1050 | break; | 1064 | break; |
1065 | case DstAcc: | ||
1066 | c->dst.type = OP_REG; | ||
1067 | c->dst.bytes = c->op_bytes; | ||
1068 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1069 | switch (c->op_bytes) { | ||
1070 | case 1: | ||
1071 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1072 | break; | ||
1073 | case 2: | ||
1074 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1075 | break; | ||
1076 | case 4: | ||
1077 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1078 | break; | ||
1079 | } | ||
1080 | c->dst.orig_val = c->dst.val; | ||
1081 | break; | ||
1051 | } | 1082 | } |
1052 | 1083 | ||
1053 | if (c->rip_relative) | 1084 | if (c->rip_relative) |
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1151 | case 1: /* dec */ | 1182 | case 1: /* dec */ |
1152 | emulate_1op("dec", c->dst, ctxt->eflags); | 1183 | emulate_1op("dec", c->dst, ctxt->eflags); |
1153 | break; | 1184 | break; |
1185 | case 2: /* call near abs */ { | ||
1186 | long int old_eip; | ||
1187 | old_eip = c->eip; | ||
1188 | c->eip = c->src.val; | ||
1189 | c->src.val = old_eip; | ||
1190 | emulate_push(ctxt); | ||
1191 | break; | ||
1192 | } | ||
1154 | case 4: /* jmp abs */ | 1193 | case 4: /* jmp abs */ |
1155 | c->eip = c->src.val; | 1194 | c->eip = c->src.val; |
1156 | break; | 1195 | break; |

@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1251 | u64 msr_data; | 1290 | u64 msr_data; |
1252 | unsigned long saved_eip = 0; | 1291 | unsigned long saved_eip = 0; |
1253 | struct decode_cache *c = &ctxt->decode; | 1292 | struct decode_cache *c = &ctxt->decode; |
1293 | unsigned int port; | ||
1294 | int io_dir_in; | ||
1254 | int rc = 0; | 1295 | int rc = 0; |
1255 | 1296 | ||
1256 | /* Shadow copy of register state. Committed on successful emulation. | 1297 | /* Shadow copy of register state. Committed on successful emulation. |
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1267 | if (c->rep_prefix && (c->d & String)) { | 1308 | if (c->rep_prefix && (c->d & String)) { |
1268 | /* All REP prefixes have the same first termination condition */ | 1309 | /* All REP prefixes have the same first termination condition */ |
1269 | if (c->regs[VCPU_REGS_RCX] == 0) { | 1310 | if (c->regs[VCPU_REGS_RCX] == 0) { |
1270 | ctxt->vcpu->arch.rip = c->eip; | 1311 | kvm_rip_write(ctxt->vcpu, c->eip); |
1271 | goto done; | 1312 | goto done; |
1272 | } | 1313 | } |
1273 | /* The second termination condition only applies for REPE | 1314 | /* The second termination condition only applies for REPE |
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
1281 | (c->b == 0xae) || (c->b == 0xaf)) { | 1322 | (c->b == 0xae) || (c->b == 0xaf)) { |
1282 | if ((c->rep_prefix == REPE_PREFIX) && | 1323 | if ((c->rep_prefix == REPE_PREFIX) && |
1283 | ((ctxt->eflags & EFLG_ZF) == 0)) { | 1324 | ((ctxt->eflags & EFLG_ZF) == 0)) { |
1284 | ctxt->vcpu->arch.rip = c->eip; | 1325 | kvm_rip_write(ctxt->vcpu, c->eip); |
1285 | goto done; | 1326 | goto done; |
1286 | } | 1327 | } |
1287 | if ((c->rep_prefix == REPNE_PREFIX) && | 1328 | if ((c->rep_prefix == REPNE_PREFIX) && |
1288 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | 1329 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { |
1289 | ctxt->vcpu->arch.rip = c->eip; | 1330 | kvm_rip_write(ctxt->vcpu, c->eip); |
1290 | goto done; | 1331 | goto done; |
1291 | } | 1332 | } |
1292 | } | 1333 | } |
1293 | c->regs[VCPU_REGS_RCX]--; | 1334 | c->regs[VCPU_REGS_RCX]--; |
1294 | c->eip = ctxt->vcpu->arch.rip; | 1335 | c->eip = kvm_rip_read(ctxt->vcpu); |
1295 | } | 1336 | } |
1296 | 1337 | ||
1297 | if (c->src.type == OP_MEM) { | 1338 | if (c->src.type == OP_MEM) { |
@@ -1351,27 +1392,10 @@ special_insn: | |||
1351 | sbb: /* sbb */ | 1392 | sbb: /* sbb */ |
1352 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | 1393 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); |
1353 | break; | 1394 | break; |
1354 | case 0x20 ... 0x23: | 1395 | case 0x20 ... 0x25: |
1355 | and: /* and */ | 1396 | and: /* and */ |
1356 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | 1397 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); |
1357 | break; | 1398 | break; |
1358 | case 0x24: /* and al imm8 */ | ||
1359 | c->dst.type = OP_REG; | ||
1360 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1361 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1362 | c->dst.bytes = 1; | ||
1363 | c->dst.orig_val = c->dst.val; | ||
1364 | goto and; | ||
1365 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
1366 | c->dst.type = OP_REG; | ||
1367 | c->dst.bytes = c->op_bytes; | ||
1368 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1369 | if (c->op_bytes == 2) | ||
1370 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1371 | else | ||
1372 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1373 | c->dst.orig_val = c->dst.val; | ||
1374 | goto and; | ||
1375 | case 0x28 ... 0x2d: | 1399 | case 0x28 ... 0x2d: |
1376 | sub: /* sub */ | 1400 | sub: /* sub */ |
1377 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | 1401 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); |
@@ -1659,7 +1683,7 @@ special_insn: | |||
1659 | case 0xae ... 0xaf: /* scas */ | 1683 | case 0xae ... 0xaf: /* scas */ |
1660 | DPRINTF("Urk! I don't handle SCAS.\n"); | 1684 | DPRINTF("Urk! I don't handle SCAS.\n"); |
1661 | goto cannot_emulate; | 1685 | goto cannot_emulate; |
1662 | case 0xb8: /* mov r, imm */ | 1686 | case 0xb0 ... 0xbf: /* mov r, imm */ |
1663 | goto mov; | 1687 | goto mov; |
1664 | case 0xc0 ... 0xc1: | 1688 | case 0xc0 ... 0xc1: |
1665 | emulate_grp2(ctxt); | 1689 | emulate_grp2(ctxt); |
@@ -1679,6 +1703,16 @@ special_insn: | |||
1679 | c->src.val = c->regs[VCPU_REGS_RCX]; | 1703 | c->src.val = c->regs[VCPU_REGS_RCX]; |
1680 | emulate_grp2(ctxt); | 1704 | emulate_grp2(ctxt); |
1681 | break; | 1705 | break; |
1706 | case 0xe4: /* inb */ | ||
1707 | case 0xe5: /* in */ | ||
1708 | port = insn_fetch(u8, 1, c->eip); | ||
1709 | io_dir_in = 1; | ||
1710 | goto do_io; | ||
1711 | case 0xe6: /* outb */ | ||
1712 | case 0xe7: /* out */ | ||
1713 | port = insn_fetch(u8, 1, c->eip); | ||
1714 | io_dir_in = 0; | ||
1715 | goto do_io; | ||
1682 | case 0xe8: /* call (near) */ { | 1716 | case 0xe8: /* call (near) */ { |
1683 | long int rel; | 1717 | long int rel; |
1684 | switch (c->op_bytes) { | 1718 | switch (c->op_bytes) { |
@@ -1729,6 +1763,22 @@ special_insn: | |||
1729 | jmp_rel(c, c->src.val); | 1763 | jmp_rel(c, c->src.val); |
1730 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1764 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1731 | break; | 1765 | break; |
1766 | case 0xec: /* in al,dx */ | ||
1767 | case 0xed: /* in (e/r)ax,dx */ | ||
1768 | port = c->regs[VCPU_REGS_RDX]; | ||
1769 | io_dir_in = 1; | ||
1770 | goto do_io; | ||
1771 | case 0xee: /* out al,dx */ | ||
1772 | case 0xef: /* out (e/r)ax,dx */ | ||
1773 | port = c->regs[VCPU_REGS_RDX]; | ||
1774 | io_dir_in = 0; | ||
1775 | do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, | ||
1776 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1777 | port) != 0) { | ||
1778 | c->eip = saved_eip; | ||
1779 | goto cannot_emulate; | ||
1780 | } | ||
1781 | return 0; | ||
1732 | case 0xf4: /* hlt */ | 1782 | case 0xf4: /* hlt */ |
1733 | ctxt->vcpu->arch.halt_request = 1; | 1783 | ctxt->vcpu->arch.halt_request = 1; |
1734 | break; | 1784 | break; |
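Note on the new 0xe4-0xe7 and 0xec-0xef cases above: they give the emulator IN/OUT support. The port number comes either from an immediate byte or from DX, the direction is encoded in the opcode, and the width is one byte for ByteOp entries and op_bytes otherwise, all forwarded to kvm_emulate_pio(). The mapping, summarised as a table (illustrative only, not part of the patch):

    /* opcode, direction (1 = in), port taken from DX, byte-sized access */
    struct pio_decode { u8 opcode; int in; int port_from_dx; int byteop; };

    static const struct pio_decode pio_map[] = {
            { 0xe4, 1, 0, 1 }, { 0xe5, 1, 0, 0 },  /* in  al/eax, imm8 */
            { 0xe6, 0, 0, 1 }, { 0xe7, 0, 0, 0 },  /* out imm8, al/eax */
            { 0xec, 1, 1, 1 }, { 0xed, 1, 1, 0 },  /* in  al/eax, dx   */
            { 0xee, 0, 1, 1 }, { 0xef, 0, 1, 0 },  /* out dx, al/eax   */
    };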
@@ -1754,6 +1804,14 @@ special_insn: | |||
1754 | ctxt->eflags |= X86_EFLAGS_IF; | 1804 | ctxt->eflags |= X86_EFLAGS_IF; |
1755 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1805 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1756 | break; | 1806 | break; |
1807 | case 0xfc: /* cld */ | ||
1808 | ctxt->eflags &= ~EFLG_DF; | ||
1809 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1810 | break; | ||
1811 | case 0xfd: /* std */ | ||
1812 | ctxt->eflags |= EFLG_DF; | ||
1813 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1814 | break; | ||
1757 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | 1815 | case 0xfe ... 0xff: /* Grp4/Grp5 */ |
1758 | rc = emulate_grp45(ctxt, ops); | 1816 | rc = emulate_grp45(ctxt, ops); |
1759 | if (rc != 0) | 1817 | if (rc != 0) |
@@ -1768,7 +1826,7 @@ writeback: | |||
1768 | 1826 | ||
1769 | /* Commit shadow register state. */ | 1827 | /* Commit shadow register state. */ |
1770 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | 1828 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); |
1771 | ctxt->vcpu->arch.rip = c->eip; | 1829 | kvm_rip_write(ctxt->vcpu, c->eip); |
1772 | 1830 | ||
1773 | done: | 1831 | done: |
1774 | if (rc == X86EMUL_UNHANDLEABLE) { | 1832 | if (rc == X86EMUL_UNHANDLEABLE) { |
@@ -1793,7 +1851,7 @@ twobyte_insn: | |||
1793 | goto done; | 1851 | goto done; |
1794 | 1852 | ||
1795 | /* Let the processor re-execute the fixed hypercall */ | 1853 | /* Let the processor re-execute the fixed hypercall */ |
1796 | c->eip = ctxt->vcpu->arch.rip; | 1854 | c->eip = kvm_rip_read(ctxt->vcpu); |
1797 | /* Disable writeback. */ | 1855 | /* Disable writeback. */ |
1798 | c->dst.type = OP_NONE; | 1856 | c->dst.type = OP_NONE; |
1799 | break; | 1857 | break; |
@@ -1889,7 +1947,7 @@ twobyte_insn: | |||
1889 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | 1947 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); |
1890 | if (rc) { | 1948 | if (rc) { |
1891 | kvm_inject_gp(ctxt->vcpu, 0); | 1949 | kvm_inject_gp(ctxt->vcpu, 0); |
1892 | c->eip = ctxt->vcpu->arch.rip; | 1950 | c->eip = kvm_rip_read(ctxt->vcpu); |
1893 | } | 1951 | } |
1894 | rc = X86EMUL_CONTINUE; | 1952 | rc = X86EMUL_CONTINUE; |
1895 | c->dst.type = OP_NONE; | 1953 | c->dst.type = OP_NONE; |
@@ -1899,7 +1957,7 @@ twobyte_insn: | |||
1899 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | 1957 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); |
1900 | if (rc) { | 1958 | if (rc) { |
1901 | kvm_inject_gp(ctxt->vcpu, 0); | 1959 | kvm_inject_gp(ctxt->vcpu, 0); |
1902 | c->eip = ctxt->vcpu->arch.rip; | 1960 | c->eip = kvm_rip_read(ctxt->vcpu); |
1903 | } else { | 1961 | } else { |
1904 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | 1962 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; |
1905 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | 1963 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 65f0b8a47bed..48ee4f9435f4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -582,7 +582,7 @@ static void __init lguest_init_IRQ(void) | |||
582 | for (i = 0; i < LGUEST_IRQS; i++) { | 582 | for (i = 0; i < LGUEST_IRQS; i++) { |
583 | int vector = FIRST_EXTERNAL_VECTOR + i; | 583 | int vector = FIRST_EXTERNAL_VECTOR + i; |
584 | if (vector != SYSCALL_VECTOR) { | 584 | if (vector != SYSCALL_VECTOR) { |
585 | set_intr_gate(vector, interrupt[i]); | 585 | set_intr_gate(vector, interrupt[vector]); |
586 | set_irq_chip_and_handler_name(i, &lguest_irq_controller, | 586 | set_irq_chip_and_handler_name(i, &lguest_irq_controller, |
587 | handle_level_irq, | 587 | handle_level_irq, |
588 | "level"); | 588 | "level"); |
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index df37fc9d6a26..3c3b471ea496 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c | |||
@@ -41,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { | |||
41 | { } | 41 | { } |
42 | }; | 42 | }; |
43 | 43 | ||
44 | static cpumask_t vector_allocation_domain(int cpu) | ||
45 | { | ||
46 | return cpumask_of_cpu(cpu); | ||
47 | } | ||
44 | 48 | ||
45 | static int probe_bigsmp(void) | 49 | static int probe_bigsmp(void) |
46 | { | 50 | { |
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 520cca0ee04e..28459cab3ddb 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c | |||
@@ -47,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, | |||
47 | /* Hook from generic ACPI tables.c */ | 47 | /* Hook from generic ACPI tables.c */ |
48 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 48 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
49 | { | 49 | { |
50 | unsigned long oem_addr; | 50 | unsigned long oem_addr = 0; |
51 | int check_dsdt; | ||
52 | int ret = 0; | ||
53 | |||
54 | /* check the DSDT first to avoid clearing the fixmap for oem_addr */ | ||
55 | check_dsdt = es7000_check_dsdt(); | ||
56 | |||
51 | if (!find_unisys_acpi_oem_table(&oem_addr)) { | 57 | if (!find_unisys_acpi_oem_table(&oem_addr)) { |
52 | if (es7000_check_dsdt()) | 58 | if (check_dsdt) |
53 | return parse_unisys_oem((char *)oem_addr); | 59 | ret = parse_unisys_oem((char *)oem_addr); |
54 | else { | 60 | else { |
55 | setup_unisys(); | 61 | setup_unisys(); |
56 | return 1; | 62 | ret = 1; |
57 | } | 63 | } |
64 | /* | ||
65 | * we need to unmap it | ||
66 | */ | ||
67 | unmap_unisys_acpi_oem_table(oem_addr); | ||
58 | } | 68 | } |
59 | return 0; | 69 | return ret; |
60 | } | 70 | } |
61 | #else | 71 | #else |
62 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) | 72 | static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) |
@@ -65,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
65 | } | 75 | } |
66 | #endif | 76 | #endif |
67 | 77 | ||
78 | static cpumask_t vector_allocation_domain(int cpu) | ||
79 | { | ||
80 | /* Careful. Some cpus do not strictly honor the set of cpus | ||
81 | * specified in the interrupt destination when using lowest | ||
82 | * priority interrupt delivery mode. | ||
83 | * | ||
84 | * In particular there was a hyperthreading cpu observed to | ||
85 | * deliver interrupts to the wrong hyperthread when only one | ||
86 | * hyperthread was specified in the interrupt desitination. | ||
87 | */ | ||
88 | cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; | ||
89 | return domain; | ||
90 | } | ||
91 | |||
68 | struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); | 92 | struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); |
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8cf58394975e..71a309b122e6 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c | |||
@@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) | |||
38 | return 0; | 38 | return 0; |
39 | } | 39 | } |
40 | 40 | ||
41 | static cpumask_t vector_allocation_domain(int cpu) | ||
42 | { | ||
43 | /* Careful. Some cpus do not strictly honor the set of cpus | ||
44 | * specified in the interrupt destination when using lowest | ||
45 | * priority interrupt delivery mode. | ||
46 | * | ||
47 | * In particular there was a hyperthreading cpu observed to | ||
48 | * deliver interrupts to the wrong hyperthread when only one | ||
49 | * hyperthread was specified in the interrupt destination. | ||
50 | */ | ||
51 | cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; | ||
52 | return domain; | ||
53 | } | ||
54 | |||
41 | struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); | 55 | struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); |
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6ad6b67a723d..6272b5e69da6 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c | |||
@@ -23,4 +23,18 @@ static int probe_summit(void) | |||
23 | return 0; | 23 | return 0; |
24 | } | 24 | } |
25 | 25 | ||
26 | static cpumask_t vector_allocation_domain(int cpu) | ||
27 | { | ||
28 | /* Careful. Some cpus do not strictly honor the set of cpus | ||
29 | * specified in the interrupt destination when using lowest | ||
30 | * priority interrupt delivery mode. | ||
31 | * | ||
32 | * In particular there was a hyperthreading cpu observed to | ||
33 | * deliver interrupts to the wrong hyperthread when only one | ||
34 | * hyperthread was specified in the interrupt destination. | ||
35 | */ | ||
36 | cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; | ||
37 | return domain; | ||
38 | } | ||
39 | |||
26 | struct genapic apic_summit = APIC_INIT("summit", probe_summit); | 40 | struct genapic apic_summit = APIC_INIT("summit", probe_summit); |
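Note on the vector_allocation_domain() additions: each 32-bit genapic template now supplies its own policy, and the two introduced in the hunks above differ by delivery mode. bigsmp uses physical destinations, so a vector only needs to exist on the single target CPU; es7000, numaq and summit use lowest-priority logical delivery and allocate the vector on every CPU, because some hyperthreaded parts may deliver to a sibling outside the requested mask. Side by side (condensed from the hunks above):

    static cpumask_t vector_domain_single(int cpu)   /* bigsmp: physical mode */
    {
            return cpumask_of_cpu(cpu);
    }

    static cpumask_t vector_domain_all(int cpu)      /* es7000/numaq/summit   */
    {
            cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
            return domain;                           /* tolerate stray lowest-priority delivery */
    }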
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 199a5f4a873c..0f6e8a6523ae 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c | |||
@@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq) | |||
1483 | * the interrupt off to another CPU */ | 1483 | * the interrupt off to another CPU */ |
1484 | static void before_handle_vic_irq(unsigned int irq) | 1484 | static void before_handle_vic_irq(unsigned int irq) |
1485 | { | 1485 | { |
1486 | irq_desc_t *desc = irq_desc + irq; | 1486 | irq_desc_t *desc = irq_to_desc(irq); |
1487 | __u8 cpu = smp_processor_id(); | 1487 | __u8 cpu = smp_processor_id(); |
1488 | 1488 | ||
1489 | _raw_spin_lock(&vic_irq_lock); | 1489 | _raw_spin_lock(&vic_irq_lock); |
@@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq) | |||
1518 | /* Finish the VIC interrupt: basically mask */ | 1518 | /* Finish the VIC interrupt: basically mask */ |
1519 | static void after_handle_vic_irq(unsigned int irq) | 1519 | static void after_handle_vic_irq(unsigned int irq) |
1520 | { | 1520 | { |
1521 | irq_desc_t *desc = irq_desc + irq; | 1521 | irq_desc_t *desc = irq_to_desc(irq); |
1522 | 1522 | ||
1523 | _raw_spin_lock(&vic_irq_lock); | 1523 | _raw_spin_lock(&vic_irq_lock); |
1524 | { | 1524 | { |
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index dfb932dcf136..59f89b434b45 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o | |||
13 | mmiotrace-y := pf_in.o mmio-mod.o | 13 | mmiotrace-y := pf_in.o mmio-mod.o |
14 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | 14 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o |
15 | 15 | ||
16 | ifeq ($(CONFIG_X86_32),y) | 16 | obj-$(CONFIG_NUMA) += numa_$(BITS).o |
17 | obj-$(CONFIG_NUMA) += discontig_32.o | ||
18 | else | ||
19 | obj-$(CONFIG_NUMA) += numa_64.o | ||
20 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o | 17 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o |
21 | endif | ||
22 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 18 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o |
23 | 19 | ||
24 | obj-$(CONFIG_MEMTEST) += memtest.o | 20 | obj-$(CONFIG_MEMTEST) += memtest.o |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a742d753d5b0..31e8730fa246 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -592,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
592 | unsigned long flags; | 592 | unsigned long flags; |
593 | #endif | 593 | #endif |
594 | 594 | ||
595 | /* | ||
596 | * We can fault from pretty much anywhere, with unknown IRQ state. | ||
597 | */ | ||
598 | trace_hardirqs_fixup(); | ||
599 | |||
600 | tsk = current; | 595 | tsk = current; |
601 | mm = tsk->mm; | 596 | mm = tsk->mm; |
602 | prefetchw(&mm->mmap_sem); | 597 | prefetchw(&mm->mmap_sem); |
@@ -645,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
645 | } | 640 | } |
646 | 641 | ||
647 | 642 | ||
648 | #ifdef CONFIG_X86_32 | ||
649 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
650 | fault has been handled. */ | ||
651 | if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) | ||
652 | local_irq_enable(); | ||
653 | |||
654 | /* | 643 | /* |
655 | * If we're in an interrupt, have no user context or are running in an | 644 | * It's safe to allow irq's after cr2 has been saved and the |
656 | * atomic region then we must not take the fault. | 645 | * vmalloc fault has been handled. |
646 | * | ||
647 | * User-mode registers count as a user access even for any | ||
648 | * potential system fault or CPU buglet. | ||
657 | */ | 649 | */ |
658 | if (in_atomic() || !mm) | 650 | if (user_mode_vm(regs)) { |
659 | goto bad_area_nosemaphore; | 651 | local_irq_enable(); |
660 | #else /* CONFIG_X86_64 */ | 652 | error_code |= PF_USER; |
661 | if (likely(regs->flags & X86_EFLAGS_IF)) | 653 | } else if (regs->flags & X86_EFLAGS_IF) |
662 | local_irq_enable(); | 654 | local_irq_enable(); |
663 | 655 | ||
656 | #ifdef CONFIG_X86_64 | ||
664 | if (unlikely(error_code & PF_RSVD)) | 657 | if (unlikely(error_code & PF_RSVD)) |
665 | pgtable_bad(address, regs, error_code); | 658 | pgtable_bad(address, regs, error_code); |
659 | #endif | ||
666 | 660 | ||
667 | /* | 661 | /* |
668 | * If we're in an interrupt, have no user context or are running in an | 662 | * If we're in an interrupt, have no user context or are running in an |
@@ -671,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
671 | if (unlikely(in_atomic() || !mm)) | 665 | if (unlikely(in_atomic() || !mm)) |
672 | goto bad_area_nosemaphore; | 666 | goto bad_area_nosemaphore; |
673 | 667 | ||
674 | /* | ||
675 | * User-mode registers count as a user access even for any | ||
676 | * potential system fault or CPU buglet. | ||
677 | */ | ||
678 | if (user_mode_vm(regs)) | ||
679 | error_code |= PF_USER; | ||
680 | again: | 668 | again: |
681 | #endif | 669 | /* |
682 | /* When running in the kernel we expect faults to occur only to | 670 | * When running in the kernel we expect faults to occur only to |
683 | * addresses in user space. All other faults represent errors in the | 671 | * addresses in user space. All other faults represent errors in the |
684 | * kernel and should generate an OOPS. Unfortunately, in the case of an | 672 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
685 | * erroneous fault occurring in a code path which already holds mmap_sem | 673 | * erroneous fault occurring in a code path which already holds mmap_sem |
@@ -742,9 +730,6 @@ good_area: | |||
742 | goto bad_area; | 730 | goto bad_area; |
743 | } | 731 | } |
744 | 732 | ||
745 | #ifdef CONFIG_X86_32 | ||
746 | survive: | ||
747 | #endif | ||
748 | /* | 733 | /* |
749 | * If for any reason at all we couldn't handle the fault, | 734 | * If for any reason at all we couldn't handle the fault, |
750 | * make sure we exit gracefully rather than endlessly redo | 735 | * make sure we exit gracefully rather than endlessly redo |
@@ -879,12 +864,11 @@ out_of_memory: | |||
879 | up_read(&mm->mmap_sem); | 864 | up_read(&mm->mmap_sem); |
880 | if (is_global_init(tsk)) { | 865 | if (is_global_init(tsk)) { |
881 | yield(); | 866 | yield(); |
882 | #ifdef CONFIG_X86_32 | 867 | /* |
883 | down_read(&mm->mmap_sem); | 868 | * Re-lookup the vma - in theory the vma tree might |
884 | goto survive; | 869 | * have changed: |
885 | #else | 870 | */ |
886 | goto again; | 871 | goto again; |
887 | #endif | ||
888 | } | 872 | } |
889 | 873 | ||
890 | printk("VM: killing process %s\n", tsk->comm); | 874 | printk("VM: killing process %s\n", tsk->comm); |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 007bb06c7504..4ba373c5b8c8 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
82 | pte_t pte = gup_get_pte(ptep); | 82 | pte_t pte = gup_get_pte(ptep); |
83 | struct page *page; | 83 | struct page *page; |
84 | 84 | ||
85 | if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { | 85 | if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { |
86 | pte_unmap(ptep); | 86 | pte_unmap(ptep); |
87 | return 0; | 87 | return 0; |
88 | } | 88 | } |
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | |||
116 | mask = _PAGE_PRESENT|_PAGE_USER; | 116 | mask = _PAGE_PRESENT|_PAGE_USER; |
117 | if (write) | 117 | if (write) |
118 | mask |= _PAGE_RW; | 118 | mask |= _PAGE_RW; |
119 | if ((pte_val(pte) & mask) != mask) | 119 | if ((pte_flags(pte) & mask) != mask) |
120 | return 0; | 120 | return 0; |
121 | /* hugepages are never "special" */ | 121 | /* hugepages are never "special" */ |
122 | VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); | 122 | VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); |
123 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 123 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
124 | 124 | ||
125 | refs = 0; | 125 | refs = 0; |
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, | |||
173 | mask = _PAGE_PRESENT|_PAGE_USER; | 173 | mask = _PAGE_PRESENT|_PAGE_USER; |
174 | if (write) | 174 | if (write) |
175 | mask |= _PAGE_RW; | 175 | mask |= _PAGE_RW; |
176 | if ((pte_val(pte) & mask) != mask) | 176 | if ((pte_flags(pte) & mask) != mask) |
177 | return 0; | 177 | return 0; |
178 | /* hugepages are never "special" */ | 178 | /* hugepages are never "special" */ |
179 | VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); | 179 | VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); |
180 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 180 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
181 | 181 | ||
182 | refs = 0; | 182 | refs = 0; |
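Switching the permission tests from pte_val() to pte_flags() keeps the physical frame number (and any paravirt encoding of it) out of the comparison, so only the attribute bits are checked against the mask. A stand-alone sketch of that idea with invented flag constants instead of the kernel's _PAGE_* values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PG_PRESENT 0x001
#define PG_RW      0x002
#define PG_USER    0x004
#define PG_SPECIAL 0x200
#define FLAG_MASK  0xfffULL          /* low attribute bits of a pte */

typedef uint64_t pteval;

static uint64_t my_pte_flags(pteval pte)
{
    return pte & FLAG_MASK;          /* drop the frame-number bits */
}

static bool gup_ok(pteval pte, bool write)
{
    uint64_t mask = PG_PRESENT | PG_USER;

    if (write)
        mask |= PG_RW;
    /* Fail if any required bit is missing or the page is "special". */
    return (my_pte_flags(pte) & (mask | PG_SPECIAL)) == mask;
}

int main(void)
{
    pteval pte = (0x1234ULL << 12) | PG_PRESENT | PG_USER | PG_RW;
    printf("write ok: %d\n", gup_ok(pte, true));
    printf("special page ok: %d\n", gup_ok(pte | PG_SPECIAL, false));
    return 0;
}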
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 165c871ba9af..bcc079c282dd 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c | |||
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) | |||
137 | 137 | ||
138 | return (void*) vaddr; | 138 | return (void*) vaddr; |
139 | } | 139 | } |
140 | EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ | ||
140 | 141 | ||
141 | struct page *kmap_atomic_to_page(void *ptr) | 142 | struct page *kmap_atomic_to_page(void *ptr) |
142 | { | 143 | { |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bbe044dbe014..8396868e82c5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -558,7 +558,7 @@ void zap_low_mappings(void) | |||
558 | 558 | ||
559 | int nx_enabled; | 559 | int nx_enabled; |
560 | 560 | ||
561 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); | 561 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); |
562 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 562 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
563 | 563 | ||
564 | #ifdef CONFIG_X86_PAE | 564 | #ifdef CONFIG_X86_PAE |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3e10054c5731..b8e461d49412 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -89,7 +89,7 @@ early_param("gbpages", parse_direct_gbpages_on); | |||
89 | 89 | ||
90 | int after_bootmem; | 90 | int after_bootmem; |
91 | 91 | ||
92 | unsigned long __supported_pte_mask __read_mostly = ~0UL; | 92 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; |
93 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 93 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
94 | 94 | ||
95 | static int do_not_nx __cpuinitdata; | 95 | static int do_not_nx __cpuinitdata; |
@@ -196,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) | |||
196 | } | 196 | } |
197 | 197 | ||
198 | pte = pte_offset_kernel(pmd, vaddr); | 198 | pte = pte_offset_kernel(pmd, vaddr); |
199 | if (!pte_none(*pte) && pte_val(new_pte) && | ||
200 | pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) | ||
201 | pte_ERROR(*pte); | ||
202 | set_pte(pte, new_pte); | 199 | set_pte(pte, new_pte); |
203 | 200 | ||
204 | /* | 201 | /* |
@@ -313,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
313 | if (pfn >= table_top) | 310 | if (pfn >= table_top) |
314 | panic("alloc_low_page: ran out of memory"); | 311 | panic("alloc_low_page: ran out of memory"); |
315 | 312 | ||
316 | adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); | 313 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
317 | memset(adr, 0, PAGE_SIZE); | 314 | memset(adr, 0, PAGE_SIZE); |
318 | *phys = pfn * PAGE_SIZE; | 315 | *phys = pfn * PAGE_SIZE; |
319 | return adr; | 316 | return adr; |
@@ -749,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
749 | old_start = mr[i].start; | 746 | old_start = mr[i].start; |
750 | memmove(&mr[i], &mr[i+1], | 747 | memmove(&mr[i], &mr[i+1], |
751 | (nr_range - 1 - i) * sizeof (struct map_range)); | 748 | (nr_range - 1 - i) * sizeof (struct map_range)); |
752 | mr[i].start = old_start; | 749 | mr[i--].start = old_start; |
753 | nr_range--; | 750 | nr_range--; |
754 | } | 751 | } |
755 | 752 | ||
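The last hunk above fixes the page-size range merge in init_memory_mapping(): after memmove() collapses entry i+1 onto entry i, the index must step back so the combined entry is compared with its new neighbour on the next iteration. A small self-contained sketch of the pattern on a plain array (the fields and the merge condition are illustrative, not the kernel's struct map_range):

#include <stdio.h>
#include <string.h>

struct range { unsigned long start, end; int page_size; };

/* Merge adjacent entries that use the same page size. */
static int merge_ranges(struct range *mr, int nr)
{
    int i;

    for (i = 0; nr > 1 && i < nr - 1; i++) {
        if (mr[i].end != mr[i + 1].start ||
            mr[i].page_size != mr[i + 1].page_size)
            continue;
        /* fold entry i into entry i+1, then slide everything down */
        unsigned long old_start = mr[i].start;
        memmove(&mr[i], &mr[i + 1], (nr - 1 - i) * sizeof(mr[0]));
        mr[i--].start = old_start;   /* step back: re-examine the merged entry */
        nr--;
    }
    return nr;
}

int main(void)
{
    struct range mr[] = {
        { 0x000, 0x200, 4 }, { 0x200, 0x400, 4 }, { 0x400, 0x800, 4 },
    };
    int nr = merge_ranges(mr, 3);
    printf("%d range(s), first: %#lx-%#lx\n", nr, mr[0].start, mr[0].end);
    return 0;
}

Without the i-- the example above stops after one merge and reports two ranges; with it, all three collapse into one.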
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8cbeda15cd29..ae71e11eb3e5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -45,6 +45,27 @@ unsigned long __phys_addr(unsigned long x) | |||
45 | } | 45 | } |
46 | EXPORT_SYMBOL(__phys_addr); | 46 | EXPORT_SYMBOL(__phys_addr); |
47 | 47 | ||
48 | bool __virt_addr_valid(unsigned long x) | ||
49 | { | ||
50 | if (x >= __START_KERNEL_map) { | ||
51 | x -= __START_KERNEL_map; | ||
52 | if (x >= KERNEL_IMAGE_SIZE) | ||
53 | return false; | ||
54 | x += phys_base; | ||
55 | } else { | ||
56 | if (x < PAGE_OFFSET) | ||
57 | return false; | ||
58 | x -= PAGE_OFFSET; | ||
59 | if (system_state == SYSTEM_BOOTING ? | ||
60 | x > MAXMEM : !phys_addr_valid(x)) { | ||
61 | return false; | ||
62 | } | ||
63 | } | ||
64 | |||
65 | return pfn_valid(x >> PAGE_SHIFT); | ||
66 | } | ||
67 | EXPORT_SYMBOL(__virt_addr_valid); | ||
68 | |||
48 | #else | 69 | #else |
49 | 70 | ||
50 | static inline int phys_addr_valid(unsigned long addr) | 71 | static inline int phys_addr_valid(unsigned long addr) |
@@ -56,13 +77,24 @@ static inline int phys_addr_valid(unsigned long addr) | |||
56 | unsigned long __phys_addr(unsigned long x) | 77 | unsigned long __phys_addr(unsigned long x) |
57 | { | 78 | { |
58 | /* VMALLOC_* aren't constants; not available at the boot time */ | 79 | /* VMALLOC_* aren't constants; not available at the boot time */ |
59 | VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING && | 80 | VIRTUAL_BUG_ON(x < PAGE_OFFSET); |
60 | is_vmalloc_addr((void *)x))); | 81 | VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && |
82 | is_vmalloc_addr((void *) x)); | ||
61 | return x - PAGE_OFFSET; | 83 | return x - PAGE_OFFSET; |
62 | } | 84 | } |
63 | EXPORT_SYMBOL(__phys_addr); | 85 | EXPORT_SYMBOL(__phys_addr); |
64 | #endif | 86 | #endif |
65 | 87 | ||
88 | bool __virt_addr_valid(unsigned long x) | ||
89 | { | ||
90 | if (x < PAGE_OFFSET) | ||
91 | return false; | ||
92 | if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) | ||
93 | return false; | ||
94 | return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); | ||
95 | } | ||
96 | EXPORT_SYMBOL(__virt_addr_valid); | ||
97 | |||
66 | #endif | 98 | #endif |
67 | 99 | ||
68 | int page_is_ram(unsigned long pagenr) | 100 | int page_is_ram(unsigned long pagenr) |
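__virt_addr_valid() only accepts addresses that translate back to a page frame the kernel actually owns: undo the direct-map (or, on 64-bit, kernel-image) offset the same way __phys_addr() does, then ask pfn_valid(). A user-space sketch of the 32-bit shape of the check, with made-up PAGE_OFFSET and pfn-limit values:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_OFFSET  0xc0000000UL   /* stand-in for the direct-map base */
#define MAX_PFN      0x20000UL      /* pretend 512 MB of RAM            */

static bool my_pfn_valid(unsigned long pfn)
{
    return pfn < MAX_PFN;           /* real kernels consult the memory map */
}

static bool my_virt_addr_valid(unsigned long x)
{
    if (x < PAGE_OFFSET)            /* below the direct map: not lowmem */
        return false;
    return my_pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}

int main(void)
{
    printf("%d\n", my_virt_addr_valid(PAGE_OFFSET + 0x1000));  /* 1 */
    printf("%d\n", my_virt_addr_valid(0x1000));                /* 0 */
    return 0;
}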
@@ -188,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
188 | return (__force void __iomem *)phys_to_virt(phys_addr); | 220 | return (__force void __iomem *)phys_to_virt(phys_addr); |
189 | 221 | ||
190 | /* | 222 | /* |
223 | * Check if the request spans more than any BAR in the iomem resource | ||
224 | * tree. | ||
225 | */ | ||
226 | WARN_ON(iomem_map_sanity_check(phys_addr, size)); | ||
227 | |||
228 | /* | ||
191 | * Don't allow anybody to remap normal RAM that we're using.. | 229 | * Don't allow anybody to remap normal RAM that we're using.. |
192 | */ | 230 | */ |
193 | for (pfn = phys_addr >> PAGE_SHIFT; | 231 | for (pfn = phys_addr >> PAGE_SHIFT; |
@@ -242,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
242 | switch (prot_val) { | 280 | switch (prot_val) { |
243 | case _PAGE_CACHE_UC: | 281 | case _PAGE_CACHE_UC: |
244 | default: | 282 | default: |
245 | prot = PAGE_KERNEL_NOCACHE; | 283 | prot = PAGE_KERNEL_IO_NOCACHE; |
246 | break; | 284 | break; |
247 | case _PAGE_CACHE_UC_MINUS: | 285 | case _PAGE_CACHE_UC_MINUS: |
248 | prot = PAGE_KERNEL_UC_MINUS; | 286 | prot = PAGE_KERNEL_IO_UC_MINUS; |
249 | break; | 287 | break; |
250 | case _PAGE_CACHE_WC: | 288 | case _PAGE_CACHE_WC: |
251 | prot = PAGE_KERNEL_WC; | 289 | prot = PAGE_KERNEL_IO_WC; |
252 | break; | 290 | break; |
253 | case _PAGE_CACHE_WB: | 291 | case _PAGE_CACHE_WB: |
254 | prot = PAGE_KERNEL; | 292 | prot = PAGE_KERNEL_IO; |
255 | break; | 293 | break; |
256 | } | 294 | } |
257 | 295 | ||
@@ -568,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, | |||
568 | } | 606 | } |
569 | 607 | ||
570 | static inline void __init early_set_fixmap(enum fixed_addresses idx, | 608 | static inline void __init early_set_fixmap(enum fixed_addresses idx, |
571 | unsigned long phys) | 609 | unsigned long phys, pgprot_t prot) |
572 | { | 610 | { |
573 | if (after_paging_init) | 611 | if (after_paging_init) |
574 | set_fixmap(idx, phys); | 612 | __set_fixmap(idx, phys, prot); |
575 | else | 613 | else |
576 | __early_set_fixmap(idx, phys, PAGE_KERNEL); | 614 | __early_set_fixmap(idx, phys, prot); |
577 | } | 615 | } |
578 | 616 | ||
579 | static inline void __init early_clear_fixmap(enum fixed_addresses idx) | 617 | static inline void __init early_clear_fixmap(enum fixed_addresses idx) |
@@ -584,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) | |||
584 | __early_set_fixmap(idx, 0, __pgprot(0)); | 622 | __early_set_fixmap(idx, 0, __pgprot(0)); |
585 | } | 623 | } |
586 | 624 | ||
587 | 625 | static void *prev_map[FIX_BTMAPS_SLOTS] __initdata; | |
588 | static int __initdata early_ioremap_nested; | 626 | static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; |
589 | |||
590 | static int __init check_early_ioremap_leak(void) | 627 | static int __init check_early_ioremap_leak(void) |
591 | { | 628 | { |
592 | if (!early_ioremap_nested) | 629 | int count = 0; |
630 | int i; | ||
631 | |||
632 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) | ||
633 | if (prev_map[i]) | ||
634 | count++; | ||
635 | |||
636 | if (!count) | ||
593 | return 0; | 637 | return 0; |
594 | WARN(1, KERN_WARNING | 638 | WARN(1, KERN_WARNING |
595 | "Debug warning: early ioremap leak of %d areas detected.\n", | 639 | "Debug warning: early ioremap leak of %d areas detected.\n", |
596 | early_ioremap_nested); | 640 | count); |
597 | printk(KERN_WARNING | 641 | printk(KERN_WARNING |
598 | "please boot with early_ioremap_debug and report the dmesg.\n"); | 642 | "please boot with early_ioremap_debug and report the dmesg.\n"); |
599 | 643 | ||
@@ -601,18 +645,33 @@ static int __init check_early_ioremap_leak(void) | |||
601 | } | 645 | } |
602 | late_initcall(check_early_ioremap_leak); | 646 | late_initcall(check_early_ioremap_leak); |
603 | 647 | ||
604 | void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | 648 | static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) |
605 | { | 649 | { |
606 | unsigned long offset, last_addr; | 650 | unsigned long offset, last_addr; |
607 | unsigned int nrpages, nesting; | 651 | unsigned int nrpages; |
608 | enum fixed_addresses idx0, idx; | 652 | enum fixed_addresses idx0, idx; |
653 | int i, slot; | ||
609 | 654 | ||
610 | WARN_ON(system_state != SYSTEM_BOOTING); | 655 | WARN_ON(system_state != SYSTEM_BOOTING); |
611 | 656 | ||
612 | nesting = early_ioremap_nested; | 657 | slot = -1; |
658 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { | ||
659 | if (!prev_map[i]) { | ||
660 | slot = i; | ||
661 | break; | ||
662 | } | ||
663 | } | ||
664 | |||
665 | if (slot < 0) { | ||
666 | printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", | ||
667 | phys_addr, size); | ||
668 | WARN_ON(1); | ||
669 | return NULL; | ||
670 | } | ||
671 | |||
613 | if (early_ioremap_debug) { | 672 | if (early_ioremap_debug) { |
614 | printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", | 673 | printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", |
615 | phys_addr, size, nesting); | 674 | phys_addr, size, slot); |
616 | dump_stack(); | 675 | dump_stack(); |
617 | } | 676 | } |
618 | 677 | ||
@@ -623,11 +682,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
623 | return NULL; | 682 | return NULL; |
624 | } | 683 | } |
625 | 684 | ||
626 | if (nesting >= FIX_BTMAPS_NESTING) { | 685 | prev_size[slot] = size; |
627 | WARN_ON(1); | ||
628 | return NULL; | ||
629 | } | ||
630 | early_ioremap_nested++; | ||
631 | /* | 686 | /* |
632 | * Mappings have to be page-aligned | 687 | * Mappings have to be page-aligned |
633 | */ | 688 | */ |
@@ -647,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
647 | /* | 702 | /* |
648 | * Ok, go for it.. | 703 | * Ok, go for it.. |
649 | */ | 704 | */ |
650 | idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; | 705 | idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; |
651 | idx = idx0; | 706 | idx = idx0; |
652 | while (nrpages > 0) { | 707 | while (nrpages > 0) { |
653 | early_set_fixmap(idx, phys_addr); | 708 | early_set_fixmap(idx, phys_addr, prot); |
654 | phys_addr += PAGE_SIZE; | 709 | phys_addr += PAGE_SIZE; |
655 | --idx; | 710 | --idx; |
656 | --nrpages; | 711 | --nrpages; |
@@ -658,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
658 | if (early_ioremap_debug) | 713 | if (early_ioremap_debug) |
659 | printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); | 714 | printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); |
660 | 715 | ||
661 | return (void *) (offset + fix_to_virt(idx0)); | 716 | prev_map[slot] = (void *) (offset + fix_to_virt(idx0)); |
717 | return prev_map[slot]; | ||
718 | } | ||
719 | |||
720 | /* Remap an IO device */ | ||
721 | void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | ||
722 | { | ||
723 | return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); | ||
724 | } | ||
725 | |||
726 | /* Remap memory */ | ||
727 | void __init *early_memremap(unsigned long phys_addr, unsigned long size) | ||
728 | { | ||
729 | return __early_ioremap(phys_addr, size, PAGE_KERNEL); | ||
662 | } | 730 | } |
663 | 731 | ||
664 | void __init early_iounmap(void *addr, unsigned long size) | 732 | void __init early_iounmap(void *addr, unsigned long size) |
@@ -667,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size) | |||
667 | unsigned long offset; | 735 | unsigned long offset; |
668 | unsigned int nrpages; | 736 | unsigned int nrpages; |
669 | enum fixed_addresses idx; | 737 | enum fixed_addresses idx; |
670 | int nesting; | 738 | int i, slot; |
739 | |||
740 | slot = -1; | ||
741 | for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { | ||
742 | if (prev_map[i] == addr) { | ||
743 | slot = i; | ||
744 | break; | ||
745 | } | ||
746 | } | ||
671 | 747 | ||
672 | nesting = --early_ioremap_nested; | 748 | if (slot < 0) { |
673 | if (WARN_ON(nesting < 0)) | 749 | printk(KERN_INFO "early_iounmap(%p, %08lx): no matching slot found\n",
750 | addr, size); | ||
751 | WARN_ON(1); | ||
752 | return; | ||
753 | } | ||
754 | |||
755 | if (prev_size[slot] != size) { | ||
756 | printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", | ||
757 | addr, size, slot, prev_size[slot]); | ||
758 | WARN_ON(1); | ||
674 | return; | 759 | return; |
760 | } | ||
675 | 761 | ||
676 | if (early_ioremap_debug) { | 762 | if (early_ioremap_debug) { |
677 | printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, | 763 | printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, |
678 | size, nesting); | 764 | size, slot); |
679 | dump_stack(); | 765 | dump_stack(); |
680 | } | 766 | } |
681 | 767 | ||
@@ -687,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size) | |||
687 | offset = virt_addr & ~PAGE_MASK; | 773 | offset = virt_addr & ~PAGE_MASK; |
688 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | 774 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; |
689 | 775 | ||
690 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; | 776 | idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; |
691 | while (nrpages > 0) { | 777 | while (nrpages > 0) { |
692 | early_clear_fixmap(idx); | 778 | early_clear_fixmap(idx); |
693 | --idx; | 779 | --idx; |
694 | --nrpages; | 780 | --nrpages; |
695 | } | 781 | } |
782 | prev_map[slot] = NULL; | ||
696 | } | 783 | } |
697 | 784 | ||
698 | void __this_fixmap_does_not_exist(void) | 785 | void __this_fixmap_does_not_exist(void) |
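The boot-time ioremap rework above replaces the single nesting counter with a small slot table: each mapping records the pointer it returned and its size, early_iounmap() finds the slot by pointer instead of relying on strict nesting order, and the late initcall counts occupied slots to report leaks. A stand-alone sketch of that bookkeeping (the slot count, backing storage and helper names are invented for illustration):

#include <stdio.h>

#define SLOTS 4

static void *prev_map[SLOTS];
static unsigned long prev_size[SLOTS];
static char backing[SLOTS][4096];        /* stand-in for the fixmap windows */

static void *boot_map(unsigned long size)
{
    int slot;

    for (slot = 0; slot < SLOTS; slot++)
        if (!prev_map[slot])
            break;
    if (slot == SLOTS)
        return NULL;                     /* all bootstrap windows busy */
    prev_map[slot] = backing[slot];
    prev_size[slot] = size;
    return prev_map[slot];
}

static int boot_unmap(void *addr, unsigned long size)
{
    int slot;

    for (slot = 0; slot < SLOTS; slot++)
        if (prev_map[slot] == addr)
            break;
    if (slot == SLOTS || prev_size[slot] != size)
        return -1;                       /* unknown pointer or size mismatch */
    prev_map[slot] = NULL;
    return 0;
}

int main(void)
{
    void *a = boot_map(128), *b = boot_map(256);
    printf("unmap b: %d\n", boot_unmap(b, 256));   /* 0: found by pointer */
    printf("unmap a: %d\n", boot_unmap(a, 64));    /* -1: size mismatch   */
    return 0;
}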
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 635b50e85581..2c4baa88f2cb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c | |||
@@ -56,13 +56,6 @@ struct remap_trace { | |||
56 | static DEFINE_PER_CPU(struct trap_reason, pf_reason); | 56 | static DEFINE_PER_CPU(struct trap_reason, pf_reason); |
57 | static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); | 57 | static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); |
58 | 58 | ||
59 | #if 0 /* XXX: no way gather this info anymore */ | ||
60 | /* Access to this is not per-cpu. */ | ||
61 | static DEFINE_PER_CPU(atomic_t, dropped); | ||
62 | #endif | ||
63 | |||
64 | static struct dentry *marker_file; | ||
65 | |||
66 | static DEFINE_MUTEX(mmiotrace_mutex); | 59 | static DEFINE_MUTEX(mmiotrace_mutex); |
67 | static DEFINE_SPINLOCK(trace_lock); | 60 | static DEFINE_SPINLOCK(trace_lock); |
68 | static atomic_t mmiotrace_enabled; | 61 | static atomic_t mmiotrace_enabled; |
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ | |||
75 | * and trace_lock. | 68 | * and trace_lock. |
76 | * - Routines depending on is_enabled() must take trace_lock. | 69 | * - Routines depending on is_enabled() must take trace_lock. |
77 | * - trace_list users must hold trace_lock. | 70 | * - trace_list users must hold trace_lock. |
78 | * - is_enabled() guarantees that mmio_trace_record is allowed. | 71 | * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. |
79 | * - pre/post callbacks assume the effect of is_enabled() being true. | 72 | * - pre/post callbacks assume the effect of is_enabled() being true. |
80 | */ | 73 | */ |
81 | 74 | ||
@@ -97,44 +90,6 @@ static bool is_enabled(void) | |||
97 | return atomic_read(&mmiotrace_enabled); | 90 | return atomic_read(&mmiotrace_enabled); |
98 | } | 91 | } |
99 | 92 | ||
100 | #if 0 /* XXX: needs rewrite */ | ||
101 | /* | ||
102 | * Write callback for the debugfs entry: | ||
103 | * Read a marker and write it to the mmio trace log | ||
104 | */ | ||
105 | static ssize_t write_marker(struct file *file, const char __user *buffer, | ||
106 | size_t count, loff_t *ppos) | ||
107 | { | ||
108 | char *event = NULL; | ||
109 | struct mm_io_header *headp; | ||
110 | ssize_t len = (count > 65535) ? 65535 : count; | ||
111 | |||
112 | event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); | ||
113 | if (!event) | ||
114 | return -ENOMEM; | ||
115 | |||
116 | headp = (struct mm_io_header *)event; | ||
117 | headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); | ||
118 | headp->data_len = len; | ||
119 | |||
120 | if (copy_from_user(event + sizeof(*headp), buffer, len)) { | ||
121 | kfree(event); | ||
122 | return -EFAULT; | ||
123 | } | ||
124 | |||
125 | spin_lock_irq(&trace_lock); | ||
126 | #if 0 /* XXX: convert this to use tracing */ | ||
127 | if (is_enabled()) | ||
128 | relay_write(chan, event, sizeof(*headp) + len); | ||
129 | else | ||
130 | #endif | ||
131 | len = -EINVAL; | ||
132 | spin_unlock_irq(&trace_lock); | ||
133 | kfree(event); | ||
134 | return len; | ||
135 | } | ||
136 | #endif | ||
137 | |||
138 | static void print_pte(unsigned long address) | 93 | static void print_pte(unsigned long address) |
139 | { | 94 | { |
140 | unsigned int level; | 95 | unsigned int level; |
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, | |||
307 | map.map_id = trace->id; | 262 | map.map_id = trace->id; |
308 | 263 | ||
309 | spin_lock_irq(&trace_lock); | 264 | spin_lock_irq(&trace_lock); |
310 | if (!is_enabled()) | 265 | if (!is_enabled()) { |
266 | kfree(trace); | ||
311 | goto not_enabled; | 267 | goto not_enabled; |
268 | } | ||
312 | 269 | ||
313 | mmio_trace_mapping(&map); | 270 | mmio_trace_mapping(&map); |
314 | list_add_tail(&trace->list, &trace_list); | 271 | list_add_tail(&trace->list, &trace_list); |
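The ioremap_trace_core() hunk also plugs a leak: the trace record is allocated before trace_lock is taken, so the early-exit path used when tracing was disabled in the meantime has to free it. A minimal sketch of the allocate-then-check pattern (the flag and names are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool enabled;

/* Allocate first, check the enable flag second - and free on the bail-out path. */
static int record_event(int id)
{
    int *rec = malloc(sizeof(*rec));

    if (!rec)
        return -1;
    *rec = id;

    /* the lock protecting 'enabled' would be taken here */
    if (!enabled) {
        free(rec);          /* without this, the record leaks */
        return 0;
    }
    /* ... hand rec over to the trace list here ... */
    free(rec);              /* simplified: this sketch keeps no list */
    return 1;
}

int main(void)
{
    printf("disabled: %d\n", record_event(1));
    enabled = true;
    printf("enabled:  %d\n", record_event(2));
    return 0;
}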
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr) | |||
377 | iounmap_trace_core(addr); | 334 | iounmap_trace_core(addr); |
378 | } | 335 | } |
379 | 336 | ||
337 | int mmiotrace_printk(const char *fmt, ...) | ||
338 | { | ||
339 | int ret = 0; | ||
340 | va_list args; | ||
341 | unsigned long flags; | ||
342 | va_start(args, fmt); | ||
343 | |||
344 | spin_lock_irqsave(&trace_lock, flags); | ||
345 | if (is_enabled()) | ||
346 | ret = mmio_trace_printk(fmt, args); | ||
347 | spin_unlock_irqrestore(&trace_lock, flags); | ||
348 | |||
349 | va_end(args); | ||
350 | return ret; | ||
351 | } | ||
352 | EXPORT_SYMBOL(mmiotrace_printk); | ||
353 | |||
380 | static void clear_trace_list(void) | 354 | static void clear_trace_list(void) |
381 | { | 355 | { |
382 | struct remap_trace *trace; | 356 | struct remap_trace *trace; |
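mmiotrace_printk() gives callers a way to drop free-form markers into the MMIO trace: the message is formatted from a va_list and emitted only while the tracer is enabled, under the same lock that guards the enable state. A user-space sketch of that shape, with a pthread mutex and stderr standing in for trace_lock and the trace buffer:

#include <pthread.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
static bool trace_enabled;

static int trace_printf(const char *fmt, ...)
{
    va_list args;
    int ret = 0;

    va_start(args, fmt);
    pthread_mutex_lock(&trace_lock);
    if (trace_enabled)
        ret = vfprintf(stderr, fmt, args);  /* stand-in for the trace buffer */
    pthread_mutex_unlock(&trace_lock);
    va_end(args);

    return ret;
}

int main(void)
{
    trace_printf("dropped: tracer off\n");  /* emits nothing */
    trace_enabled = true;
    trace_printf("marker %d\n", 42);
    return 0;
}

The testmmiotrace hunks further down use exactly this kind of marker to label the write, read and ioremap phases in the captured trace.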
@@ -462,26 +436,12 @@ static void leave_uniprocessor(void) | |||
462 | } | 436 | } |
463 | #endif | 437 | #endif |
464 | 438 | ||
465 | #if 0 /* XXX: out of order */ | ||
466 | static struct file_operations fops_marker = { | ||
467 | .owner = THIS_MODULE, | ||
468 | .write = write_marker | ||
469 | }; | ||
470 | #endif | ||
471 | |||
472 | void enable_mmiotrace(void) | 439 | void enable_mmiotrace(void) |
473 | { | 440 | { |
474 | mutex_lock(&mmiotrace_mutex); | 441 | mutex_lock(&mmiotrace_mutex); |
475 | if (is_enabled()) | 442 | if (is_enabled()) |
476 | goto out; | 443 | goto out; |
477 | 444 | ||
478 | #if 0 /* XXX: tracing does not support text entries */ | ||
479 | marker_file = debugfs_create_file("marker", 0660, dir, NULL, | ||
480 | &fops_marker); | ||
481 | if (!marker_file) | ||
482 | pr_err(NAME "marker file creation failed.\n"); | ||
483 | #endif | ||
484 | |||
485 | if (nommiotrace) | 445 | if (nommiotrace) |
486 | pr_info(NAME "MMIO tracing disabled.\n"); | 446 | pr_info(NAME "MMIO tracing disabled.\n"); |
487 | enter_uniprocessor(); | 447 | enter_uniprocessor(); |
@@ -506,11 +466,6 @@ void disable_mmiotrace(void) | |||
506 | 466 | ||
507 | clear_trace_list(); /* guarantees: no more kmmio callbacks */ | 467 | clear_trace_list(); /* guarantees: no more kmmio callbacks */ |
508 | leave_uniprocessor(); | 468 | leave_uniprocessor(); |
509 | if (marker_file) { | ||
510 | debugfs_remove(marker_file); | ||
511 | marker_file = NULL; | ||
512 | } | ||
513 | |||
514 | pr_info(NAME "disabled.\n"); | 469 | pr_info(NAME "disabled.\n"); |
515 | out: | 470 | out: |
516 | mutex_unlock(&mmiotrace_mutex); | 471 | mutex_unlock(&mmiotrace_mutex); |
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c index 847c164725f4..847c164725f4 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/numa_32.c | |||
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a9ec89c3fbca..407d8784f669 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, | |||
792 | /* Must avoid aliasing mappings in the highmem code */ | 792 | /* Must avoid aliasing mappings in the highmem code */ |
793 | kmap_flush_unused(); | 793 | kmap_flush_unused(); |
794 | 794 | ||
795 | vm_unmap_aliases(); | ||
796 | |||
795 | cpa.vaddr = addr; | 797 | cpa.vaddr = addr; |
796 | cpa.numpages = numpages; | 798 | cpa.numpages = numpages; |
797 | cpa.mask_set = mask_set; | 799 | cpa.mask_set = mask_set; |
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index efa1911e20ca..df3d5c861cda 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c | |||
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 }; | |||
79 | static unsigned int mw64[] = { 0x89, 0x8B }; | 79 | static unsigned int mw64[] = { 0x89, 0x8B }; |
80 | #endif /* not __i386__ */ | 80 | #endif /* not __i386__ */ |
81 | 81 | ||
82 | static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, | 82 | struct prefix_bits { |
83 | int *rexr) | 83 | unsigned shorted:1; |
84 | unsigned enlarged:1; | ||
85 | unsigned rexr:1; | ||
86 | unsigned rex:1; | ||
87 | }; | ||
88 | |||
89 | static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) | ||
84 | { | 90 | { |
85 | int i; | 91 | int i; |
86 | unsigned char *p = addr; | 92 | unsigned char *p = addr; |
87 | *shorted = 0; | 93 | prf->shorted = 0; |
88 | *enlarged = 0; | 94 | prf->enlarged = 0; |
89 | *rexr = 0; | 95 | prf->rexr = 0; |
96 | prf->rex = 0; | ||
90 | 97 | ||
91 | restart: | 98 | restart: |
92 | for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { | 99 | for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { |
93 | if (*p == prefix_codes[i]) { | 100 | if (*p == prefix_codes[i]) { |
94 | if (*p == 0x66) | 101 | if (*p == 0x66) |
95 | *shorted = 1; | 102 | prf->shorted = 1; |
96 | #ifdef __amd64__ | 103 | #ifdef __amd64__ |
97 | if ((*p & 0xf8) == 0x48) | 104 | if ((*p & 0xf8) == 0x48) |
98 | *enlarged = 1; | 105 | prf->enlarged = 1; |
99 | if ((*p & 0xf4) == 0x44) | 106 | if ((*p & 0xf4) == 0x44) |
100 | *rexr = 1; | 107 | prf->rexr = 1; |
108 | if ((*p & 0xf0) == 0x40) | ||
109 | prf->rex = 1; | ||
101 | #endif | 110 | #endif |
102 | p++; | 111 | p++; |
103 | goto restart; | 112 | goto restart; |
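Packing the prefix information into struct prefix_bits lets the decoder grow (the new rex bit) without threading yet another output pointer through every caller. A stand-alone sketch of scanning prefix bytes into such a bitfield; only the 0x66 operand-size prefix and the REX family are handled here, unlike the kernel's full prefix table:

#include <stdio.h>

struct prefix_bits {
    unsigned shorted:1;   /* 0x66: 16-bit operand size    */
    unsigned enlarged:1;  /* REX.W: 64-bit operand size   */
    unsigned rexr:1;      /* REX.R: extends the reg field */
    unsigned rex:1;       /* any REX prefix was present   */
};

/* Return the number of prefix bytes consumed. */
static int skip_prefix(const unsigned char *addr, struct prefix_bits *prf)
{
    const unsigned char *p = addr;

    prf->shorted = prf->enlarged = prf->rexr = prf->rex = 0;
    for (;;) {
        if (*p == 0x66) {
            prf->shorted = 1;
        } else if ((*p & 0xf0) == 0x40) {        /* REX.* (64-bit only) */
            prf->rex = 1;
            if (*p & 0x08)
                prf->enlarged = 1;               /* REX.W */
            if (*p & 0x04)
                prf->rexr = 1;                   /* REX.R */
        } else {
            break;                               /* first opcode byte */
        }
        p++;
    }
    return p - addr;
}

int main(void)
{
    /* 48 89 07: mov %rax,(%rdi) - REX.W followed by the opcode */
    unsigned char insn[] = { 0x48, 0x89, 0x07 };
    struct prefix_bits prf;
    int n = skip_prefix(insn, &prf);

    printf("prefix bytes=%d rex=%u w=%u r=%u 0x66=%u\n",
           n, prf.rex, prf.enlarged, prf.rexr, prf.shorted);
    return 0;
}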
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr) | |||
135 | { | 144 | { |
136 | unsigned int opcode; | 145 | unsigned int opcode; |
137 | unsigned char *p; | 146 | unsigned char *p; |
138 | int shorted, enlarged, rexr; | 147 | struct prefix_bits prf; |
139 | int i; | 148 | int i; |
140 | enum reason_type rv = OTHERS; | 149 | enum reason_type rv = OTHERS; |
141 | 150 | ||
142 | p = (unsigned char *)ins_addr; | 151 | p = (unsigned char *)ins_addr; |
143 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | 152 | p += skip_prefix(p, &prf); |
144 | p += get_opcode(p, &opcode); | 153 | p += get_opcode(p, &opcode); |
145 | 154 | ||
146 | CHECK_OP_TYPE(opcode, reg_rop, REG_READ); | 155 | CHECK_OP_TYPE(opcode, reg_rop, REG_READ); |
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) | |||
156 | { | 165 | { |
157 | unsigned int opcode; | 166 | unsigned int opcode; |
158 | unsigned char *p; | 167 | unsigned char *p; |
159 | int i, shorted, enlarged, rexr; | 168 | struct prefix_bits prf; |
169 | int i; | ||
160 | 170 | ||
161 | p = (unsigned char *)ins_addr; | 171 | p = (unsigned char *)ins_addr; |
162 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | 172 | p += skip_prefix(p, &prf); |
163 | p += get_opcode(p, &opcode); | 173 | p += get_opcode(p, &opcode); |
164 | 174 | ||
165 | for (i = 0; i < ARRAY_SIZE(rw8); i++) | 175 | for (i = 0; i < ARRAY_SIZE(rw8); i++) |
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) | |||
168 | 178 | ||
169 | for (i = 0; i < ARRAY_SIZE(rw32); i++) | 179 | for (i = 0; i < ARRAY_SIZE(rw32); i++) |
170 | if (rw32[i] == opcode) | 180 | if (rw32[i] == opcode) |
171 | return (shorted ? 2 : (enlarged ? 8 : 4)); | 181 | return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); |
172 | 182 | ||
173 | printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); | 183 | printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); |
174 | return 0; | 184 | return 0; |
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) | |||
178 | { | 188 | { |
179 | unsigned int opcode; | 189 | unsigned int opcode; |
180 | unsigned char *p; | 190 | unsigned char *p; |
181 | int i, shorted, enlarged, rexr; | 191 | struct prefix_bits prf; |
192 | int i; | ||
182 | 193 | ||
183 | p = (unsigned char *)ins_addr; | 194 | p = (unsigned char *)ins_addr; |
184 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | 195 | p += skip_prefix(p, &prf); |
185 | p += get_opcode(p, &opcode); | 196 | p += get_opcode(p, &opcode); |
186 | 197 | ||
187 | for (i = 0; i < ARRAY_SIZE(mw8); i++) | 198 | for (i = 0; i < ARRAY_SIZE(mw8); i++) |
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) | |||
194 | 205 | ||
195 | for (i = 0; i < ARRAY_SIZE(mw32); i++) | 206 | for (i = 0; i < ARRAY_SIZE(mw32); i++) |
196 | if (mw32[i] == opcode) | 207 | if (mw32[i] == opcode) |
197 | return shorted ? 2 : 4; | 208 | return prf.shorted ? 2 : 4; |
198 | 209 | ||
199 | for (i = 0; i < ARRAY_SIZE(mw64); i++) | 210 | for (i = 0; i < ARRAY_SIZE(mw64); i++) |
200 | if (mw64[i] == opcode) | 211 | if (mw64[i] == opcode) |
201 | return shorted ? 2 : (enlarged ? 8 : 4); | 212 | return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); |
202 | 213 | ||
203 | printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); | 214 | printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); |
204 | return 0; | 215 | return 0; |
@@ -238,7 +249,7 @@ enum { | |||
238 | #endif | 249 | #endif |
239 | }; | 250 | }; |
240 | 251 | ||
241 | static unsigned char *get_reg_w8(int no, struct pt_regs *regs) | 252 | static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) |
242 | { | 253 | { |
243 | unsigned char *rv = NULL; | 254 | unsigned char *rv = NULL; |
244 | 255 | ||
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) | |||
255 | case arg_DL: | 266 | case arg_DL: |
256 | rv = (unsigned char *)®s->dx; | 267 | rv = (unsigned char *)®s->dx; |
257 | break; | 268 | break; |
258 | case arg_AH: | ||
259 | rv = 1 + (unsigned char *)®s->ax; | ||
260 | break; | ||
261 | case arg_BH: | ||
262 | rv = 1 + (unsigned char *)®s->bx; | ||
263 | break; | ||
264 | case arg_CH: | ||
265 | rv = 1 + (unsigned char *)®s->cx; | ||
266 | break; | ||
267 | case arg_DH: | ||
268 | rv = 1 + (unsigned char *)®s->dx; | ||
269 | break; | ||
270 | #ifdef __amd64__ | 269 | #ifdef __amd64__ |
271 | case arg_R8: | 270 | case arg_R8: |
272 | rv = (unsigned char *)®s->r8; | 271 | rv = (unsigned char *)®s->r8; |
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) | |||
294 | break; | 293 | break; |
295 | #endif | 294 | #endif |
296 | default: | 295 | default: |
297 | printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); | ||
298 | break; | 296 | break; |
299 | } | 297 | } |
298 | |||
299 | if (rv) | ||
300 | return rv; | ||
301 | |||
302 | if (rex) { | ||
303 | /* | ||
304 | * If REX prefix exists, access low bytes of SI etc. | ||
305 | * instead of AH etc. | ||
306 | */ | ||
307 | switch (no) { | ||
308 | case arg_SI: | ||
309 | rv = (unsigned char *)®s->si; | ||
310 | break; | ||
311 | case arg_DI: | ||
312 | rv = (unsigned char *)®s->di; | ||
313 | break; | ||
314 | case arg_BP: | ||
315 | rv = (unsigned char *)®s->bp; | ||
316 | break; | ||
317 | case arg_SP: | ||
318 | rv = (unsigned char *)®s->sp; | ||
319 | break; | ||
320 | default: | ||
321 | break; | ||
322 | } | ||
323 | } else { | ||
324 | switch (no) { | ||
325 | case arg_AH: | ||
326 | rv = 1 + (unsigned char *)®s->ax; | ||
327 | break; | ||
328 | case arg_BH: | ||
329 | rv = 1 + (unsigned char *)®s->bx; | ||
330 | break; | ||
331 | case arg_CH: | ||
332 | rv = 1 + (unsigned char *)®s->cx; | ||
333 | break; | ||
334 | case arg_DH: | ||
335 | rv = 1 + (unsigned char *)®s->dx; | ||
336 | break; | ||
337 | default: | ||
338 | break; | ||
339 | } | ||
340 | } | ||
341 | |||
342 | if (!rv) | ||
343 | printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); | ||
344 | |||
300 | return rv; | 345 | return rv; |
301 | } | 346 | } |
302 | 347 | ||
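The reason get_reg_w8() now needs the rex flag: with any REX prefix present, byte-register encodings 4-7 name SPL, BPL, SIL and DIL (the low bytes of sp/bp/si/di) instead of the legacy AH, CH, DH and BH (byte 1 of ax/cx/dx/bx), so the decoder must pick a different register and byte offset. A sketch of that selection against a toy register file (the struct layout and helper name are invented, and the byte-1 trick assumes x86's little-endian layout):

#include <stdint.h>
#include <stdio.h>

struct regs { uint64_t ax, cx, dx, bx, sp, bp, si, di; };

/* Return a pointer to the byte named by encoding 'no' (0-7). */
static unsigned char *reg_w8(int no, int rex, struct regs *r)
{
    switch (no) {
    case 0: return (unsigned char *)&r->ax;              /* AL  */
    case 1: return (unsigned char *)&r->cx;              /* CL  */
    case 2: return (unsigned char *)&r->dx;              /* DL  */
    case 3: return (unsigned char *)&r->bx;              /* BL  */
    }
    if (rex) {                     /* REX present: low byte of sp/bp/si/di */
        switch (no) {
        case 4: return (unsigned char *)&r->sp;          /* SPL */
        case 5: return (unsigned char *)&r->bp;          /* BPL */
        case 6: return (unsigned char *)&r->si;          /* SIL */
        case 7: return (unsigned char *)&r->di;          /* DIL */
        }
    } else {                       /* legacy: high byte of ax/cx/dx/bx */
        switch (no) {
        case 4: return 1 + (unsigned char *)&r->ax;      /* AH  */
        case 5: return 1 + (unsigned char *)&r->cx;      /* CH  */
        case 6: return 1 + (unsigned char *)&r->dx;      /* DH  */
        case 7: return 1 + (unsigned char *)&r->bx;      /* BH  */
        }
    }
    return NULL;
}

int main(void)
{
    struct regs r = { .ax = 0xbeef, .si = 0x12 };
    printf("AH  = %#x\n", *reg_w8(4, 0, &r));   /* 0xbe */
    printf("SIL = %#x\n", *reg_w8(6, 1, &r));   /* 0x12 */
    return 0;
}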
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) | |||
368 | unsigned char mod_rm; | 413 | unsigned char mod_rm; |
369 | int reg; | 414 | int reg; |
370 | unsigned char *p; | 415 | unsigned char *p; |
371 | int i, shorted, enlarged, rexr; | 416 | struct prefix_bits prf; |
417 | int i; | ||
372 | unsigned long rv; | 418 | unsigned long rv; |
373 | 419 | ||
374 | p = (unsigned char *)ins_addr; | 420 | p = (unsigned char *)ins_addr; |
375 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | 421 | p += skip_prefix(p, &prf); |
376 | p += get_opcode(p, &opcode); | 422 | p += get_opcode(p, &opcode); |
377 | for (i = 0; i < ARRAY_SIZE(reg_rop); i++) | 423 | for (i = 0; i < ARRAY_SIZE(reg_rop); i++) |
378 | if (reg_rop[i] == opcode) { | 424 | if (reg_rop[i] == opcode) { |
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) | |||
392 | 438 | ||
393 | do_work: | 439 | do_work: |
394 | mod_rm = *p; | 440 | mod_rm = *p; |
395 | reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); | 441 | reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); |
396 | switch (get_ins_reg_width(ins_addr)) { | 442 | switch (get_ins_reg_width(ins_addr)) { |
397 | case 1: | 443 | case 1: |
398 | return *get_reg_w8(reg, regs); | 444 | return *get_reg_w8(reg, prf.rex, regs); |
399 | 445 | ||
400 | case 2: | 446 | case 2: |
401 | return *(unsigned short *)get_reg_w32(reg, regs); | 447 | return *(unsigned short *)get_reg_w32(reg, regs); |
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) | |||
422 | unsigned char mod_rm; | 468 | unsigned char mod_rm; |
423 | unsigned char mod; | 469 | unsigned char mod; |
424 | unsigned char *p; | 470 | unsigned char *p; |
425 | int i, shorted, enlarged, rexr; | 471 | struct prefix_bits prf; |
472 | int i; | ||
426 | unsigned long rv; | 473 | unsigned long rv; |
427 | 474 | ||
428 | p = (unsigned char *)ins_addr; | 475 | p = (unsigned char *)ins_addr; |
429 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | 476 | p += skip_prefix(p, &prf); |
430 | p += get_opcode(p, &opcode); | 477 | p += get_opcode(p, &opcode); |
431 | for (i = 0; i < ARRAY_SIZE(imm_wop); i++) | 478 | for (i = 0; i < ARRAY_SIZE(imm_wop); i++) |
432 | if (imm_wop[i] == opcode) { | 479 | if (imm_wop[i] == opcode) { |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1b4763e26ea9..51c0a2fc14fe 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | |||
138 | return; | 138 | return; |
139 | } | 139 | } |
140 | 140 | ||
141 | if (is_uv_system()) | 141 | if (get_uv_system_type() >= UV_X2APIC) |
142 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; | 142 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; |
143 | else | 143 | else |
144 | apic_id = pa->apic_id; | 144 | apic_id = pa->apic_id; |
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index d877c5b423ef..ab50a8d7402c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c | |||
@@ -3,6 +3,7 @@ | |||
3 | */ | 3 | */ |
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | #include <linux/io.h> | 5 | #include <linux/io.h> |
6 | #include <linux/mmiotrace.h> | ||
6 | 7 | ||
7 | #define MODULE_NAME "testmmiotrace" | 8 | #define MODULE_NAME "testmmiotrace" |
8 | 9 | ||
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); | |||
13 | static void do_write_test(void __iomem *p) | 14 | static void do_write_test(void __iomem *p) |
14 | { | 15 | { |
15 | unsigned int i; | 16 | unsigned int i; |
17 | mmiotrace_printk("Write test.\n"); | ||
16 | for (i = 0; i < 256; i++) | 18 | for (i = 0; i < 256; i++) |
17 | iowrite8(i, p + i); | 19 | iowrite8(i, p + i); |
18 | for (i = 1024; i < (5 * 1024); i += 2) | 20 | for (i = 1024; i < (5 * 1024); i += 2) |
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p) | |||
24 | static void do_read_test(void __iomem *p) | 26 | static void do_read_test(void __iomem *p) |
25 | { | 27 | { |
26 | unsigned int i; | 28 | unsigned int i; |
29 | mmiotrace_printk("Read test.\n"); | ||
27 | for (i = 0; i < 256; i++) | 30 | for (i = 0; i < 256; i++) |
28 | ioread8(p + i); | 31 | ioread8(p + i); |
29 | for (i = 1024; i < (5 * 1024); i += 2) | 32 | for (i = 1024; i < (5 * 1024); i += 2) |
@@ -39,6 +42,7 @@ static void do_test(void) | |||
39 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); | 42 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); |
40 | return; | 43 | return; |
41 | } | 44 | } |
45 | mmiotrace_printk("ioremap returned %p.\n", p); | ||
42 | do_write_test(p); | 46 | do_write_test(p); |
43 | do_read_test(p); | 47 | do_read_test(p); |
44 | iounmap(p); | 48 | iounmap(p); |
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 4bdaa590375d..3c27a809393b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c | |||
@@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size); | |||
511 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); | 511 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); |
512 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); | 512 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); |
513 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); | 513 | DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); |
514 | |||
515 | /* | ||
516 | * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from | ||
517 | * confusing the PCI engine: | ||
518 | */ | ||
519 | static void sb600_disable_hpet_bar(struct pci_dev *dev) | ||
520 | { | ||
521 | u8 val; | ||
522 | |||
523 | /* | ||
524 | * The SB600 and SB700 both share the same device | ||
525 | * ID, but the PM register 0x55 does something different | ||
526 | * for the SB700, so make sure we are dealing with the | ||
527 | * SB600 before touching the bit: | ||
528 | */ | ||
529 | |||
530 | pci_read_config_byte(dev, 0x08, &val); | ||
531 | |||
532 | if (val < 0x2F) { | ||
533 | outb(0x55, 0xCD6); | ||
534 | val = inb(0xCD7); | ||
535 | |||
536 | /* Set bit 7 in PM register 0x55 */ | ||
537 | outb(0x55, 0xCD6); | ||
538 | outb(val | 0x80, 0xCD7); | ||
539 | } | ||
540 | } | ||
541 | DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); | ||
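The quirk reaches the SB600's power-management block through an index/data port pair: write the register index to 0xCD6, then access the data at 0xCD7; the revision check keeps it away from the SB700, which shares the device ID but gives PM register 0x55 a different meaning. A hedged user-space sketch of that access pattern using the glibc port-I/O helpers; it needs root plus ioperm(), and it only reads the register rather than setting bit 7:

/* Build on Linux/x86 with: gcc -O2 sb600_pm.c -o sb600_pm */
#include <stdio.h>
#include <sys/io.h>

#define PM_INDEX 0xCD6
#define PM_DATA  0xCD7

static unsigned char pm_read(unsigned char reg)
{
    outb(reg, PM_INDEX);          /* select the PM register */
    return inb(PM_DATA);
}

int main(void)
{
    if (ioperm(PM_INDEX, 2, 1)) { /* grant access to ports 0xCD6-0xCD7 */
        perror("ioperm");
        return 1;
    }
    /* Read PM register 0x55; setting bit 7 is what the quirk itself does. */
    unsigned char val = pm_read(0x55);
    printf("PM reg 0x55 = %#x (quirk would write %#x)\n", val, val | 0x80);
    return 0;
}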
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 006599db0dc7..bf69dbe08bff 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c | |||
@@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq | |||
493 | if (pirq <= 4) | 493 | if (pirq <= 4) |
494 | irq = read_config_nybble(router, 0x56, pirq - 1); | 494 | irq = read_config_nybble(router, 0x56, pirq - 1); |
495 | dev_info(&dev->dev, | 495 | dev_info(&dev->dev, |
496 | "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", | 496 | "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n", |
497 | dev->vendor, dev->device, pirq, irq); | 497 | dev->vendor, dev->device, pirq, irq); |
498 | return irq; | 498 | return irq; |
499 | } | 499 | } |
@@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq | |||
501 | static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) | 501 | static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) |
502 | { | 502 | { |
503 | dev_info(&dev->dev, | 503 | dev_info(&dev->dev, |
504 | "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", | 504 | "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n", |
505 | dev->vendor, dev->device, pirq, irq); | 505 | dev->vendor, dev->device, pirq, irq); |
506 | if (pirq <= 4) | 506 | if (pirq <= 4) |
507 | write_config_nybble(router, 0x56, pirq - 1, irq); | 507 | write_config_nybble(router, 0x56, pirq - 1, irq); |
@@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route | |||
590 | case PCI_DEVICE_ID_INTEL_ICH10_1: | 590 | case PCI_DEVICE_ID_INTEL_ICH10_1: |
591 | case PCI_DEVICE_ID_INTEL_ICH10_2: | 591 | case PCI_DEVICE_ID_INTEL_ICH10_2: |
592 | case PCI_DEVICE_ID_INTEL_ICH10_3: | 592 | case PCI_DEVICE_ID_INTEL_ICH10_3: |
593 | case PCI_DEVICE_ID_INTEL_PCH_0: | ||
594 | case PCI_DEVICE_ID_INTEL_PCH_1: | ||
595 | r->name = "PIIX/ICH"; | 593 | r->name = "PIIX/ICH"; |
596 | r->get = pirq_piix_get; | 594 | r->get = pirq_piix_get; |
597 | r->set = pirq_piix_set; | 595 | r->set = pirq_piix_set; |
598 | return 1; | 596 | return 1; |
599 | } | 597 | } |
598 | |||
599 | if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && | ||
600 | (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { | ||
601 | r->name = "PIIX/ICH"; | ||
602 | r->get = pirq_piix_get; | ||
603 | r->set = pirq_piix_set; | ||
604 | return 1; | ||
605 | } | ||
606 | |||
600 | return 0; | 607 | return 0; |
601 | } | 608 | } |
602 | 609 | ||
@@ -823,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r) | |||
823 | r->get = NULL; | 830 | r->get = NULL; |
824 | r->set = NULL; | 831 | r->set = NULL; |
825 | 832 | ||
826 | DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", | 833 | DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", |
827 | rt->rtr_vendor, rt->rtr_device); | 834 | rt->rtr_vendor, rt->rtr_device); |
828 | 835 | ||
829 | pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); | 836 | pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); |
@@ -843,7 +850,7 @@ static void __init pirq_find_router(struct irq_router *r) | |||
843 | h->probe(r, pirq_router_dev, pirq_router_dev->device)) | 850 | h->probe(r, pirq_router_dev, pirq_router_dev->device)) |
844 | break; | 851 | break; |
845 | } | 852 | } |
846 | dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", | 853 | dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", |
847 | pirq_router.name, | 854 | pirq_router.name, |
848 | pirq_router_dev->vendor, pirq_router_dev->device); | 855 | pirq_router_dev->vendor, pirq_router_dev->device); |
849 | 856 | ||
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0013a729b41d..b61534c7a4c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c | |||
@@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l | |||
871 | /* make sure there are no stray mappings of | 871 | /* make sure there are no stray mappings of |
872 | this page */ | 872 | this page */ |
873 | kmap_flush_unused(); | 873 | kmap_flush_unused(); |
874 | vm_unmap_aliases(); | ||
874 | } | 875 | } |
875 | } | 876 | } |
876 | 877 | ||
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 28b85ab8422e..bb042608c602 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c | |||
@@ -21,7 +21,6 @@ void xen_force_evtchn_callback(void) | |||
21 | 21 | ||
22 | static void __init __xen_init_IRQ(void) | 22 | static void __init __xen_init_IRQ(void) |
23 | { | 23 | { |
24 | #ifdef CONFIG_X86_64 | ||
25 | int i; | 24 | int i; |
26 | 25 | ||
27 | /* Create identity vector->irq map */ | 26 | /* Create identity vector->irq map */ |
@@ -31,7 +30,6 @@ static void __init __xen_init_IRQ(void) | |||
31 | for_each_possible_cpu(cpu) | 30 | for_each_possible_cpu(cpu) |
32 | per_cpu(vector_irq, cpu)[i] = i; | 31 | per_cpu(vector_irq, cpu)[i] = i; |
33 | } | 32 | } |
34 | #endif /* CONFIG_X86_64 */ | ||
35 | 33 | ||
36 | xen_init_IRQ(); | 34 | xen_init_IRQ(); |
37 | } | 35 | } |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ae173f6edd8b..d4d52f5a1cf7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) | |||
846 | /* re-enable interrupts for kmap_flush_unused */ | 846 | /* re-enable interrupts for kmap_flush_unused */ |
847 | xen_mc_issue(0); | 847 | xen_mc_issue(0); |
848 | kmap_flush_unused(); | 848 | kmap_flush_unused(); |
849 | vm_unmap_aliases(); | ||
849 | xen_mc_batch(); | 850 | xen_mc_batch(); |
850 | } | 851 | } |
851 | 852 | ||
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index dd71e3a021cd..5601506f2dd9 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c | |||
@@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl | |||
241 | ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); | 241 | ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); |
242 | } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ | 242 | } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ |
243 | 243 | ||
244 | kstat_this_cpu.irqs[irq]++; | 244 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); |
245 | 245 | ||
246 | out: | 246 | out: |
247 | raw_local_irq_restore(flags); | 247 | raw_local_irq_restore(flags); |
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 004ba86326ae..c9f7cda48ed7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c | |||
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) | |||
198 | /* Get the TSC speed from Xen */ | 198 | /* Get the TSC speed from Xen */ |
199 | unsigned long xen_tsc_khz(void) | 199 | unsigned long xen_tsc_khz(void) |
200 | { | 200 | { |
201 | u64 xen_khz = 1000000ULL << 32; | 201 | struct pvclock_vcpu_time_info *info = |
202 | const struct pvclock_vcpu_time_info *info = | ||
203 | &HYPERVISOR_shared_info->vcpu_info[0].time; | 202 | &HYPERVISOR_shared_info->vcpu_info[0].time; |
204 | 203 | ||
205 | do_div(xen_khz, info->tsc_to_system_mul); | 204 | return pvclock_tsc_khz(info); |
206 | if (info->tsc_shift < 0) | ||
207 | xen_khz <<= -info->tsc_shift; | ||
208 | else | ||
209 | xen_khz >>= info->tsc_shift; | ||
210 | |||
211 | return xen_khz; | ||
212 | } | 205 | } |
213 | 206 | ||
214 | cycle_t xen_clocksource_read(void) | 207 | cycle_t xen_clocksource_read(void) |
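The deleted xen_tsc_khz() body was an open-coded inversion of the pvclock scaling parameters: the hypervisor publishes tsc_to_system_mul (a 32.32 fixed-point multiplier from TSC ticks to nanoseconds) and tsc_shift, so the TSC rate in kHz is (10^6 << 32) / mul corrected by the shift; the replacement simply calls the shared pvclock_tsc_khz() helper. A stand-alone sketch of the arithmetic being retired (the sample parameters are made up to model a 2.4 GHz TSC):

#include <stdint.h>
#include <stdio.h>

/*
 * Invert the pvclock scaling: ns = (tsc << shift) * mul >> 32, so
 * khz = (10^6 << 32) / mul, corrected by the shift.
 */
static unsigned long tsc_khz_from_pvclock(uint32_t tsc_to_system_mul,
                                          int8_t tsc_shift)
{
    uint64_t khz = 1000000ULL << 32;

    khz /= tsc_to_system_mul;
    if (tsc_shift < 0)
        khz <<= -tsc_shift;
    else
        khz >>= tsc_shift;
    return (unsigned long)khz;
}

int main(void)
{
    /* mul ~= (1/2.4) * 2^32 * 2 with shift = -1, i.e. a 2.4 GHz TSC */
    printf("%lu kHz\n", tsc_khz_from_pvclock(3579139413u, -1));
    return 0;
}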