aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig43
-rw-r--r--arch/x86/Kconfig.cpu29
-rw-r--r--arch/x86/Makefile_32.cpu1
-rw-r--r--arch/x86/boot/video-vesa.c9
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/ia32/ia32entry.S30
-rw-r--r--arch/x86/ia32/sys_ia32.c99
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/acpi/sleep.c3
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/kernel/amd_iommu.c9
-rw-r--r--arch/x86/kernel/amd_iommu_init.c2
-rw-r--r--arch/x86/kernel/apic.c (renamed from arch/x86/kernel/apic_32.c)627
-rw-r--r--arch/x86/kernel/apic_64.c1848
-rw-r--r--arch/x86/kernel/bios_uv.c137
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/common.c45
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c2
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c11
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/crash_dump_32.c3
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/doublefault_32.c2
-rw-r--r--arch/x86/kernel/dumpstack_32.c449
-rw-r--r--arch/x86/kernel/dumpstack_64.c575
-rw-r--r--arch/x86/kernel/e820.c4
-rw-r--r--arch/x86/kernel/efi.c4
-rw-r--r--arch/x86/kernel/entry_32.S38
-rw-r--r--arch/x86/kernel/entry_64.S41
-rw-r--r--arch/x86/kernel/es7000_32.c28
-rw-r--r--arch/x86/kernel/ftrace.c124
-rw-r--r--arch/x86/kernel/genapic_flat_64.c4
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c72
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/hpet.c459
-rw-r--r--arch/x86/kernel/io_apic.c (renamed from arch/x86/kernel/io_apic_64.c)1538
-rw-r--r--arch/x86/kernel/io_apic_32.c2908
-rw-r--r--arch/x86/kernel/irq.c189
-rw-r--r--arch/x86/kernel/irq_32.c194
-rw-r--r--arch/x86/kernel/irq_64.c169
-rw-r--r--arch/x86/kernel/irqinit_32.c47
-rw-r--r--arch/x86/kernel/irqinit_64.c69
-rw-r--r--arch/x86/kernel/kvmclock.c30
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c20
-rw-r--r--arch/x86/kernel/pci-gart_64.c8
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c7
-rw-r--r--arch/x86/kernel/pvclock.c12
-rw-r--r--arch/x86/kernel/quirks.c44
-rw-r--r--arch/x86/kernel/rtc.c42
-rw-r--r--arch/x86/kernel/setup.c22
-rw-r--r--arch/x86/kernel/setup_percpu.c17
-rw-r--r--arch/x86/kernel/smpboot.c107
-rw-r--r--arch/x86/kernel/time_32.c7
-rw-r--r--arch/x86/kernel/time_64.c23
-rw-r--r--arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c)893
-rw-r--r--arch/x86/kernel/traps_64.c1214
-rw-r--r--arch/x86/kernel/uv_irq.c79
-rw-r--r--arch/x86/kernel/uv_sysfs.c72
-rw-r--r--arch/x86/kernel/visws_quirks.c32
-rw-r--r--arch/x86/kernel/vmiclock_32.c3
-rw-r--r--arch/x86/kvm/Makefile5
-rw-r--r--arch/x86/kvm/i8254.c81
-rw-r--r--arch/x86/kvm/i8254.h7
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h32
-rw-r--r--arch/x86/kvm/lapic.c43
-rw-r--r--arch/x86/kvm/mmu.c680
-rw-r--r--arch/x86/kvm/paging_tmpl.h249
-rw-r--r--arch/x86/kvm/svm.c156
-rw-r--r--arch/x86/kvm/vmx.c712
-rw-r--r--arch/x86/kvm/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c554
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/kvm/x86_emulate.c170
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/mach-generic/bigsmp.c4
-rw-r--r--arch/x86/mach-generic/es7000.c34
-rw-r--r--arch/x86/mach-generic/numaq.c14
-rw-r--r--arch/x86/mach-generic/summit.c14
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c4
-rw-r--r--arch/x86/mm/Makefile6
-rw-r--r--arch/x86/mm/fault.c50
-rw-r--r--arch/x86/mm/gup.c10
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c9
-rw-r--r--arch/x86/mm/ioremap.c149
-rw-r--r--arch/x86/mm/mmio-mod.c87
-rw-r--r--arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c)0
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pf_in.c121
-rw-r--r--arch/x86/mm/srat_64.c2
-rw-r--r--arch/x86/mm/testmmiotrace.c4
-rw-r--r--arch/x86/pci/fixup.c28
-rw-r--r--arch/x86/pci/irq.c19
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c1
-rw-r--r--arch/x86/xen/spinlock.c2
-rw-r--r--arch/x86/xen/time.c11
116 files changed, 6863 insertions, 9012 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc8351f374fd..5b9b12321ad1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
18### Arch settings 18### Arch settings
19config X86 19config X86
20 def_bool y 20 def_bool y
21 select HAVE_AOUT if X86_32
21 select HAVE_UNSTABLE_SCHED_CLOCK 22 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 23 select HAVE_IDE
23 select HAVE_OPROFILE 24 select HAVE_OPROFILE
@@ -25,6 +26,7 @@ config X86
25 select HAVE_KPROBES 26 select HAVE_KPROBES
26 select ARCH_WANT_OPTIONAL_GPIOLIB 27 select ARCH_WANT_OPTIONAL_GPIOLIB
27 select HAVE_KRETPROBES 28 select HAVE_KRETPROBES
29 select HAVE_FTRACE_MCOUNT_RECORD
28 select HAVE_DYNAMIC_FTRACE 30 select HAVE_DYNAMIC_FTRACE
29 select HAVE_FTRACE 31 select HAVE_FTRACE
30 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 32 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
@@ -38,10 +40,6 @@ config ARCH_DEFCONFIG
38 default "arch/x86/configs/i386_defconfig" if X86_32 40 default "arch/x86/configs/i386_defconfig" if X86_32
39 default "arch/x86/configs/x86_64_defconfig" if X86_64 41 default "arch/x86/configs/x86_64_defconfig" if X86_64
40 42
41
42config GENERIC_LOCKBREAK
43 def_bool n
44
45config GENERIC_TIME 43config GENERIC_TIME
46 def_bool y 44 def_bool y
47 45
@@ -94,7 +92,7 @@ config GENERIC_HWEIGHT
94 def_bool y 92 def_bool y
95 93
96config GENERIC_GPIO 94config GENERIC_GPIO
97 def_bool n 95 bool
98 96
99config ARCH_MAY_HAVE_PC_FDC 97config ARCH_MAY_HAVE_PC_FDC
100 def_bool y 98 def_bool y
@@ -105,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK
105config RWSEM_XCHGADD_ALGORITHM 103config RWSEM_XCHGADD_ALGORITHM
106 def_bool X86_XADD 104 def_bool X86_XADD
107 105
108config ARCH_HAS_ILOG2_U32
109 def_bool n
110
111config ARCH_HAS_ILOG2_U64
112 def_bool n
113
114config ARCH_HAS_CPU_IDLE_WAIT 106config ARCH_HAS_CPU_IDLE_WAIT
115 def_bool y 107 def_bool y
116 108
@@ -152,9 +144,6 @@ config AUDIT_ARCH
152 bool 144 bool
153 default X86_64 145 default X86_64
154 146
155config ARCH_SUPPORTS_AOUT
156 def_bool y
157
158config ARCH_SUPPORTS_OPTIMIZED_INLINING 147config ARCH_SUPPORTS_OPTIMIZED_INLINING
159 def_bool y 148 def_bool y
160 149
@@ -205,6 +194,7 @@ config X86_TRAMPOLINE
205config KTIME_SCALAR 194config KTIME_SCALAR
206 def_bool X86_32 195 def_bool X86_32
207source "init/Kconfig" 196source "init/Kconfig"
197source "kernel/Kconfig.freezer"
208 198
209menu "Processor type and features" 199menu "Processor type and features"
210 200
@@ -760,9 +750,8 @@ config I8K
760 Say N otherwise. 750 Say N otherwise.
761 751
762config X86_REBOOTFIXUPS 752config X86_REBOOTFIXUPS
763 def_bool n 753 bool "Enable X86 board specific fixups for reboot"
764 prompt "Enable X86 board specific fixups for reboot" 754 depends on X86_32
765 depends on X86_32 && X86
766 ---help--- 755 ---help---
767 This enables chipset and/or board specific fixups to be done 756 This enables chipset and/or board specific fixups to be done
768 in order to get reboot to work correctly. This is only needed on 757 in order to get reboot to work correctly. This is only needed on
@@ -946,16 +935,17 @@ config HIGHMEM
946 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) 935 depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
947 936
948config X86_PAE 937config X86_PAE
949 def_bool n 938 bool "PAE (Physical Address Extension) Support"
950 prompt "PAE (Physical Address Extension) Support"
951 depends on X86_32 && !HIGHMEM4G 939 depends on X86_32 && !HIGHMEM4G
952 select RESOURCES_64BIT
953 help 940 help
954 PAE is required for NX support, and furthermore enables 941 PAE is required for NX support, and furthermore enables
955 larger swapspace support for non-overcommit purposes. It 942 larger swapspace support for non-overcommit purposes. It
956 has the cost of more pagetable lookup overhead, and also 943 has the cost of more pagetable lookup overhead, and also
957 consumes more pagetable space per process. 944 consumes more pagetable space per process.
958 945
946config ARCH_PHYS_ADDR_T_64BIT
947 def_bool X86_64 || X86_PAE
948
959# Common NUMA Features 949# Common NUMA Features
960config NUMA 950config NUMA
961 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" 951 bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
@@ -1240,8 +1230,7 @@ config X86_PAT
1240 If unsure, say Y. 1230 If unsure, say Y.
1241 1231
1242config EFI 1232config EFI
1243 def_bool n 1233 bool "EFI runtime service support"
1244 prompt "EFI runtime service support"
1245 depends on ACPI 1234 depends on ACPI
1246 ---help--- 1235 ---help---
1247 This enables the kernel to use EFI runtime services that are 1236 This enables the kernel to use EFI runtime services that are
@@ -1254,14 +1243,6 @@ config EFI
1254 resultant kernel should continue to boot on existing non-EFI 1243 resultant kernel should continue to boot on existing non-EFI
1255 platforms. 1244 platforms.
1256 1245
1257config IRQBALANCE
1258 def_bool y
1259 prompt "Enable kernel irq balancing"
1260 depends on X86_32 && SMP && X86_IO_APIC
1261 help
1262 The default yes will allow the kernel to do irq load balancing.
1263 Saying no will keep the kernel from doing irq load balancing.
1264
1265config SECCOMP 1246config SECCOMP
1266 def_bool y 1247 def_bool y
1267 prompt "Enable seccomp to safely compute untrusted bytecode" 1248 prompt "Enable seccomp to safely compute untrusted bytecode"
@@ -1885,7 +1866,7 @@ config IA32_EMULATION
1885 1866
1886config IA32_AOUT 1867config IA32_AOUT
1887 tristate "IA32 a.out support" 1868 tristate "IA32 a.out support"
1888 depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT 1869 depends on IA32_EMULATION
1889 help 1870 help
1890 Support old a.out binaries in the 32bit emulation. 1871 Support old a.out binaries in the 32bit emulation.
1891 1872
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c5f101360520..0b7c4a3f0651 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386
38 - "Crusoe" for the Transmeta Crusoe series. 38 - "Crusoe" for the Transmeta Crusoe series.
39 - "Efficeon" for the Transmeta Efficeon series. 39 - "Efficeon" for the Transmeta Efficeon series.
40 - "Winchip-C6" for original IDT Winchip. 40 - "Winchip-C6" for original IDT Winchip.
41 - "Winchip-2" for IDT Winchip 2. 41 - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
42 - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
43 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). 42 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
44 - "Geode GX/LX" For AMD Geode GX and LX processors. 43 - "Geode GX/LX" For AMD Geode GX and LX processors.
45 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. 44 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
@@ -194,19 +193,11 @@ config MWINCHIPC6
194 treat this chip as a 586TSC with some extended instructions 193 treat this chip as a 586TSC with some extended instructions
195 and alignment requirements. 194 and alignment requirements.
196 195
197config MWINCHIP2
198 bool "Winchip-2"
199 depends on X86_32
200 help
201 Select this for an IDT Winchip-2. Linux and GCC
202 treat this chip as a 586TSC with some extended instructions
203 and alignment requirements.
204
205config MWINCHIP3D 196config MWINCHIP3D
206 bool "Winchip-2A/Winchip-3" 197 bool "Winchip-2/Winchip-2A/Winchip-3"
207 depends on X86_32 198 depends on X86_32
208 help 199 help
209 Select this for an IDT Winchip-2A or 3. Linux and GCC 200 Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
210 treat this chip as a 586TSC with some extended instructions 201 treat this chip as a 586TSC with some extended instructions
211 and alignment requirements. Also enable out of order memory 202 and alignment requirements. Also enable out of order memory
212 stores for this CPU, which can increase performance of some 203 stores for this CPU, which can increase performance of some
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT
318 int 309 int
319 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC 310 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
323 314
324config X86_XADD 315config X86_XADD
@@ -360,7 +351,7 @@ config X86_POPAD_OK
360 351
361config X86_ALIGNMENT_16 352config X86_ALIGNMENT_16
362 def_bool y 353 def_bool y
363 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 354 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
364 355
365config X86_INTEL_USERCOPY 356config X86_INTEL_USERCOPY
366 def_bool y 357 def_bool y
@@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY
368 359
369config X86_USE_PPRO_CHECKSUM 360config X86_USE_PPRO_CHECKSUM
370 def_bool y 361 def_bool y
371 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 362 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
372 363
373config X86_USE_3DNOW 364config X86_USE_3DNOW
374 def_bool y 365 def_bool y
@@ -376,7 +367,7 @@ config X86_USE_3DNOW
376 367
377config X86_OOSTORE 368config X86_OOSTORE
378 def_bool y 369 def_bool y
379 depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR 370 depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
380 371
381# 372#
382# P6_NOPs are a relatively minor optimization that require a family >= 373# P6_NOPs are a relatively minor optimization that require a family >=
@@ -396,7 +387,7 @@ config X86_P6_NOP
396 387
397config X86_TSC 388config X86_TSC
398 def_bool y 389 def_bool y
399 depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 390 depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
400 391
401config X86_CMPXCHG64 392config X86_CMPXCHG64
402 def_bool y 393 def_bool y
@@ -406,7 +397,7 @@ config X86_CMPXCHG64
406# generates cmov. 397# generates cmov.
407config X86_CMOV 398config X86_CMOV
408 def_bool y 399 def_bool y
409 depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) 400 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
410 401
411config X86_MINIMUM_CPU_FAMILY 402config X86_MINIMUM_CPU_FAMILY
412 int 403 int
@@ -417,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY
417 408
418config X86_DEBUGCTLMSR 409config X86_DEBUGCTLMSR
419 def_bool y 410 def_bool y
420 depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) 411 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
421 412
422menuconfig PROCESSOR_SELECT 413menuconfig PROCESSOR_SELECT
423 bool "Supported processor vendors" if EMBEDDED 414 bool "Supported processor vendors" if EMBEDDED
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index b72b4f753113..80177ec052f0 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) 30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
31cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 31cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 32cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 1e6fe0214c85..99b3079dc6ab 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -88,14 +88,11 @@ static int vesa_probe(void)
88 (vminfo.memory_layout == 4 || 88 (vminfo.memory_layout == 4 ||
89 vminfo.memory_layout == 6) && 89 vminfo.memory_layout == 6) &&
90 vminfo.memory_planes == 1) { 90 vminfo.memory_planes == 1) {
91#ifdef CONFIG_FB 91#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
92 /* Graphics mode, color, linear frame buffer 92 /* Graphics mode, color, linear frame buffer
93 supported. Only register the mode if 93 supported. Only register the mode if
94 if framebuffer is configured, however, 94 if framebuffer is configured, however,
95 otherwise the user will be left without a screen. 95 otherwise the user will be left without a screen. */
96 We don't require CONFIG_FB_VESA, however, since
97 some of the other framebuffer drivers can use
98 this mode-setting, too. */
99 mi = GET_HEAP(struct mode_info, 1); 96 mi = GET_HEAP(struct mode_info, 1);
100 mi->mode = mode + VIDEO_FIRST_VESA; 97 mi->mode = mode + VIDEO_FIRST_VESA;
101 mi->depth = vminfo.bpp; 98 mi->depth = vminfo.bpp;
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode)
133 if ((vminfo.mode_attr & 0x15) == 0x05) { 130 if ((vminfo.mode_attr & 0x15) == 0x05) {
134 /* It's a supported text mode */ 131 /* It's a supported text mode */
135 is_graphic = 0; 132 is_graphic = 0;
133#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
136 } else if ((vminfo.mode_attr & 0x99) == 0x99) { 134 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
137 /* It's a graphics mode with linear frame buffer */ 135 /* It's a graphics mode with linear frame buffer */
138 is_graphic = 1; 136 is_graphic = 1;
139 vesa_mode |= 0x4000; /* Request linear frame buffer */ 137 vesa_mode |= 0x4000; /* Request linear frame buffer */
138#endif
140 } else { 139 } else {
141 return -1; /* Invalid mode */ 140 return -1; /* Invalid mode */
142 } 141 }
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index ca226ca31288..13b8c86ae985 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -213,7 +213,6 @@ CONFIG_M686=y
213# CONFIG_MCRUSOE is not set 213# CONFIG_MCRUSOE is not set
214# CONFIG_MEFFICEON is not set 214# CONFIG_MEFFICEON is not set
215# CONFIG_MWINCHIPC6 is not set 215# CONFIG_MWINCHIPC6 is not set
216# CONFIG_MWINCHIP2 is not set
217# CONFIG_MWINCHIP3D is not set 216# CONFIG_MWINCHIP3D is not set
218# CONFIG_MGEODEGX1 is not set 217# CONFIG_MGEODEGX1 is not set
219# CONFIG_MGEODE_LX is not set 218# CONFIG_MGEODE_LX is not set
@@ -288,7 +287,6 @@ CONFIG_MTRR=y
288# CONFIG_MTRR_SANITIZER is not set 287# CONFIG_MTRR_SANITIZER is not set
289CONFIG_X86_PAT=y 288CONFIG_X86_PAT=y
290CONFIG_EFI=y 289CONFIG_EFI=y
291# CONFIG_IRQBALANCE is not set
292CONFIG_SECCOMP=y 290CONFIG_SECCOMP=y
293# CONFIG_HZ_100 is not set 291# CONFIG_HZ_100 is not set
294# CONFIG_HZ_250 is not set 292# CONFIG_HZ_250 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 2c4b1c771e28..f0a03d7a7d63 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -210,7 +210,6 @@ CONFIG_X86_PC=y
210# CONFIG_MCRUSOE is not set 210# CONFIG_MCRUSOE is not set
211# CONFIG_MEFFICEON is not set 211# CONFIG_MEFFICEON is not set
212# CONFIG_MWINCHIPC6 is not set 212# CONFIG_MWINCHIPC6 is not set
213# CONFIG_MWINCHIP2 is not set
214# CONFIG_MWINCHIP3D is not set 213# CONFIG_MWINCHIP3D is not set
215# CONFIG_MGEODEGX1 is not set 214# CONFIG_MGEODEGX1 is not set
216# CONFIG_MGEODE_LX is not set 215# CONFIG_MGEODE_LX is not set
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffc1bb4fed7d..256b00b61892 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -39,11 +39,11 @@
39 .endm 39 .endm
40 40
41 /* clobbers %eax */ 41 /* clobbers %eax */
42 .macro CLEAR_RREGS 42 .macro CLEAR_RREGS _r9=rax
43 xorl %eax,%eax 43 xorl %eax,%eax
44 movq %rax,R11(%rsp) 44 movq %rax,R11(%rsp)
45 movq %rax,R10(%rsp) 45 movq %rax,R10(%rsp)
46 movq %rax,R9(%rsp) 46 movq %\_r9,R9(%rsp)
47 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
48 .endm 48 .endm
49 49
@@ -52,11 +52,10 @@
52 * We don't reload %eax because syscall_trace_enter() returned 52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup. 53 * the value it wants us to use in the table lookup.
54 */ 54 */
55 .macro LOAD_ARGS32 offset 55 .macro LOAD_ARGS32 offset, _r9=0
56 movl \offset(%rsp),%r11d 56 .if \_r9
57 movl \offset+8(%rsp),%r10d
58 movl \offset+16(%rsp),%r9d 57 movl \offset+16(%rsp),%r9d
59 movl \offset+24(%rsp),%r8d 58 .endif
60 movl \offset+40(%rsp),%ecx 59 movl \offset+40(%rsp),%ecx
61 movl \offset+48(%rsp),%edx 60 movl \offset+48(%rsp),%edx
62 movl \offset+56(%rsp),%esi 61 movl \offset+56(%rsp),%esi
@@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target)
145 SAVE_ARGS 0,0,1 144 SAVE_ARGS 0,0,1
146 /* no need to do an access_ok check here because rbp has been 145 /* no need to do an access_ok check here because rbp has been
147 32bit zero extended */ 146 32bit zero extended */
1481: movl (%rbp),%r9d 1471: movl (%rbp),%ebp
149 .section __ex_table,"a" 148 .section __ex_table,"a"
150 .quad 1b,ia32_badarg 149 .quad 1b,ia32_badarg
151 .previous 150 .previous
@@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target)
157 cmpl $(IA32_NR_syscalls-1),%eax 156 cmpl $(IA32_NR_syscalls-1),%eax
158 ja ia32_badsys 157 ja ia32_badsys
159sysenter_do_call: 158sysenter_do_call:
160 IA32_ARG_FIXUP 1 159 IA32_ARG_FIXUP
161sysenter_dispatch: 160sysenter_dispatch:
162 call *ia32_sys_call_table(,%rax,8) 161 call *ia32_sys_call_table(,%rax,8)
163 movq %rax,RAX-ARGOFFSET(%rsp) 162 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -234,20 +233,17 @@ sysexit_audit:
234#endif 233#endif
235 234
236sysenter_tracesys: 235sysenter_tracesys:
237 xchgl %r9d,%ebp
238#ifdef CONFIG_AUDITSYSCALL 236#ifdef CONFIG_AUDITSYSCALL
239 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 237 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
240 jz sysenter_auditsys 238 jz sysenter_auditsys
241#endif 239#endif
242 SAVE_REST 240 SAVE_REST
243 CLEAR_RREGS 241 CLEAR_RREGS
244 movq %r9,R9(%rsp)
245 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 242 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
246 movq %rsp,%rdi /* &pt_regs -> arg1 */ 243 movq %rsp,%rdi /* &pt_regs -> arg1 */
247 call syscall_trace_enter 244 call syscall_trace_enter
248 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 245 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
249 RESTORE_REST 246 RESTORE_REST
250 xchgl %ebp,%r9d
251 cmpl $(IA32_NR_syscalls-1),%eax 247 cmpl $(IA32_NR_syscalls-1),%eax
252 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 248 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
253 jmp sysenter_do_call 249 jmp sysenter_do_call
@@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target)
314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
315 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
316 jnz cstar_tracesys 312 jnz cstar_tracesys
317cstar_do_call:
318 cmpl $IA32_NR_syscalls-1,%eax 313 cmpl $IA32_NR_syscalls-1,%eax
319 ja ia32_badsys 314 ja ia32_badsys
315cstar_do_call:
320 IA32_ARG_FIXUP 1 316 IA32_ARG_FIXUP 1
321cstar_dispatch: 317cstar_dispatch:
322 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
@@ -357,15 +353,13 @@ cstar_tracesys:
357#endif 353#endif
358 xchgl %r9d,%ebp 354 xchgl %r9d,%ebp
359 SAVE_REST 355 SAVE_REST
360 CLEAR_RREGS 356 CLEAR_RREGS r9
361 movq %r9,R9(%rsp)
362 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
363 movq %rsp,%rdi /* &pt_regs -> arg1 */ 358 movq %rsp,%rdi /* &pt_regs -> arg1 */
364 call syscall_trace_enter 359 call syscall_trace_enter
365 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 360 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
366 RESTORE_REST 361 RESTORE_REST
367 xchgl %ebp,%r9d 362 xchgl %ebp,%r9d
368 movl RSP-ARGOFFSET(%rsp), %r8d
369 cmpl $(IA32_NR_syscalls-1),%eax 363 cmpl $(IA32_NR_syscalls-1),%eax
370 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 364 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
371 jmp cstar_do_call 365 jmp cstar_do_call
@@ -577,8 +571,8 @@ ia32_sys_call_table:
577 .quad compat_sys_setrlimit /* 75 */ 571 .quad compat_sys_setrlimit /* 75 */
578 .quad compat_sys_old_getrlimit /* old_getrlimit */ 572 .quad compat_sys_old_getrlimit /* old_getrlimit */
579 .quad compat_sys_getrusage 573 .quad compat_sys_getrusage
580 .quad sys32_gettimeofday 574 .quad compat_sys_gettimeofday
581 .quad sys32_settimeofday 575 .quad compat_sys_settimeofday
582 .quad sys_getgroups16 /* 80 */ 576 .quad sys_getgroups16 /* 80 */
583 .quad sys_setgroups16 577 .quad sys_setgroups16
584 .quad sys32_old_select 578 .quad sys32_old_select
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index beda4232ce69..2e09dcd3c0a6 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -49,41 +49,6 @@
49 49
50#define AA(__x) ((unsigned long)(__x)) 50#define AA(__x) ((unsigned long)(__x))
51 51
52int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
53{
54 compat_ino_t ino;
55
56 typeof(ubuf->st_uid) uid = 0;
57 typeof(ubuf->st_gid) gid = 0;
58 SET_UID(uid, kbuf->uid);
59 SET_GID(gid, kbuf->gid);
60 if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
61 return -EOVERFLOW;
62 if (kbuf->size >= 0x7fffffff)
63 return -EOVERFLOW;
64 ino = kbuf->ino;
65 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
66 return -EOVERFLOW;
67 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
68 __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
69 __put_user(ino, &ubuf->st_ino) ||
70 __put_user(kbuf->mode, &ubuf->st_mode) ||
71 __put_user(kbuf->nlink, &ubuf->st_nlink) ||
72 __put_user(uid, &ubuf->st_uid) ||
73 __put_user(gid, &ubuf->st_gid) ||
74 __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
75 __put_user(kbuf->size, &ubuf->st_size) ||
76 __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
77 __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
78 __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
79 __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
80 __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
81 __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
82 __put_user(kbuf->blksize, &ubuf->st_blksize) ||
83 __put_user(kbuf->blocks, &ubuf->st_blocks))
84 return -EFAULT;
85 return 0;
86}
87 52
88asmlinkage long sys32_truncate64(char __user *filename, 53asmlinkage long sys32_truncate64(char __user *filename,
89 unsigned long offset_low, 54 unsigned long offset_low,
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
402 return 0; 367 return 0;
403} 368}
404 369
405static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
406{
407 int err = -EFAULT;
408
409 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
410 err = __get_user(o->tv_sec, &i->tv_sec);
411 err |= __get_user(o->tv_usec, &i->tv_usec);
412 }
413 return err;
414}
415
416static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
417{
418 int err = -EFAULT;
419
420 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
421 err = __put_user(i->tv_sec, &o->tv_sec);
422 err |= __put_user(i->tv_usec, &o->tv_usec);
423 }
424 return err;
425}
426
427asmlinkage long sys32_alarm(unsigned int seconds) 370asmlinkage long sys32_alarm(unsigned int seconds)
428{ 371{
429 return alarm_setitimer(seconds); 372 return alarm_setitimer(seconds);
430} 373}
431 374
432/*
433 * Translations due to time_t size differences. Which affects all
434 * sorts of things, like timeval and itimerval.
435 */
436asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
437 struct timezone __user *tz)
438{
439 if (tv) {
440 struct timeval ktv;
441
442 do_gettimeofday(&ktv);
443 if (put_tv32(tv, &ktv))
444 return -EFAULT;
445 }
446 if (tz) {
447 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
448 return -EFAULT;
449 }
450 return 0;
451}
452
453asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
454 struct timezone __user *tz)
455{
456 struct timeval ktv;
457 struct timespec kts;
458 struct timezone ktz;
459
460 if (tv) {
461 if (get_tv32(&ktv, tv))
462 return -EFAULT;
463 kts.tv_sec = ktv.tv_sec;
464 kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
465 }
466 if (tz) {
467 if (copy_from_user(&ktz, tz, sizeof(ktz)))
468 return -EFAULT;
469 }
470
471 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
472}
473
474struct sel_arg_struct { 375struct sel_arg_struct {
475 unsigned int n; 376 unsigned int n;
476 unsigned int inp; 377 unsigned int inp;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5098585f87ce..d7e5a58ee22f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp)
23CFLAGS_tsc.o := $(nostackp) 23CFLAGS_tsc.o := $(nostackp)
24 24
25obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 25obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
26obj-y += traps_$(BITS).o irq_$(BITS).o 26obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
27obj-y += time_$(BITS).o ioport.o ldt.o 27obj-y += time_$(BITS).o ioport.o ldt.o
28obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 28obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
29obj-$(CONFIG_X86_VISWS) += visws_quirks.o 29obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -60,8 +60,8 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
60obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 60obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
61obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 61obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
62obj-$(CONFIG_X86_MPPARSE) += mpparse.o 62obj-$(CONFIG_X86_MPPARSE) += mpparse.o
63obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o 63obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
64obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o 64obj-$(CONFIG_X86_IO_APIC) += io_apic.o
65obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 65obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
66obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 66obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
@@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o
108# 64 bit specific files 108# 64 bit specific files
109ifeq ($(CONFIG_X86_64),y) 109ifeq ($(CONFIG_X86_64),y)
110 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 110 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
111 obj-y += bios_uv.o 111 obj-y += bios_uv.o uv_irq.o uv_sysfs.o
112 obj-y += genx2apic_cluster.o 112 obj-y += genx2apic_cluster.o
113 obj-y += genx2apic_phys.o 113 obj-y += genx2apic_phys.o
114 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 114 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index eb875cdc7367..0d1c26a583c5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1256,7 +1256,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1256 1256
1257 count = 1257 count =
1258 acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 1258 acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
1259 NR_IRQ_VECTORS); 1259 nr_irqs);
1260 if (count < 0) { 1260 if (count < 0) {
1261 printk(KERN_ERR PREFIX 1261 printk(KERN_ERR PREFIX
1262 "Error parsing interrupt source overrides entry\n"); 1262 "Error parsing interrupt source overrides entry\n");
@@ -1276,7 +1276,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1276 1276
1277 count = 1277 count =
1278 acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 1278 acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
1279 NR_IRQ_VECTORS); 1279 nr_irqs);
1280 if (count < 0) { 1280 if (count < 0) {
1281 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); 1281 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1282 /* TBD: Cleanup to allow fallback to MPS */ 1282 /* TBD: Cleanup to allow fallback to MPS */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 426e5d91b63a..c44cd6dbfa14 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,6 +10,7 @@
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h> 12#include <asm/segment.h>
13#include <asm/desc.h>
13 14
14#include "realmode/wakeup.h" 15#include "realmode/wakeup.h"
15#include "sleep.h" 16#include "sleep.h"
@@ -98,6 +99,8 @@ int acpi_save_state_mem(void)
98 header->trampoline_segment = setup_trampoline() >> 4; 99 header->trampoline_segment = setup_trampoline() >> 4;
99#ifdef CONFIG_SMP 100#ifdef CONFIG_SMP
100 stack_start.sp = temp_stack + 4096; 101 stack_start.sp = temp_stack + 4096;
102 early_gdt_descr.address =
103 (unsigned long)get_cpu_gdt_table(smp_processor_id());
101#endif 104#endif
102 initial_code = (unsigned long)wakeup_long64; 105 initial_code = (unsigned long)wakeup_long64;
103 saved_magic = 0x123456789abcdef0; 106 saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index fb04e49776ba..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -444,7 +444,7 @@ void __init alternative_instructions(void)
444 _text, _etext); 444 _text, _etext);
445 445
446 /* Only switch to UP mode if we don't immediately boot others */ 446 /* Only switch to UP mode if we don't immediately boot others */
447 if (num_possible_cpus() == 1 || setup_max_cpus <= 1) 447 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
448 alternatives_smp_switch(0); 448 alternatives_smp_switch(0);
449 } 449 }
450#endif 450#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 34e4d112b1ef..a8fd9ebdc8e2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
295 u64 address, size_t size) 295 u64 address, size_t size)
296{ 296{
297 int s = 0; 297 int s = 0;
298 unsigned pages = iommu_num_pages(address, size); 298 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
299 299
300 address &= PAGE_MASK; 300 address &= PAGE_MASK;
301 301
@@ -680,7 +680,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
680 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
681 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 681 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
682 int pages = iommu_num_pages(iommu->exclusion_start, 682 int pages = iommu_num_pages(iommu->exclusion_start,
683 iommu->exclusion_length); 683 iommu->exclusion_length,
684 PAGE_SIZE);
684 dma_ops_reserve_addresses(dma_dom, startpage, pages); 685 dma_ops_reserve_addresses(dma_dom, startpage, pages);
685 } 686 }
686 687
@@ -935,7 +936,7 @@ static dma_addr_t __map_single(struct device *dev,
935 unsigned long align_mask = 0; 936 unsigned long align_mask = 0;
936 int i; 937 int i;
937 938
938 pages = iommu_num_pages(paddr, size); 939 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
939 paddr &= PAGE_MASK; 940 paddr &= PAGE_MASK;
940 941
941 if (align) 942 if (align)
@@ -980,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu,
980 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 981 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
981 return; 982 return;
982 983
983 pages = iommu_num_pages(dma_addr, size); 984 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
984 dma_addr &= PAGE_MASK; 985 dma_addr &= PAGE_MASK;
985 start = dma_addr; 986 start = dma_addr;
986 987
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 4cd8083c58be..0cdcda35a05f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -212,7 +212,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
212/* Programs the physical address of the device table into the IOMMU hardware */ 212/* Programs the physical address of the device table into the IOMMU hardware */
213static void __init iommu_set_device_table(struct amd_iommu *iommu) 213static void __init iommu_set_device_table(struct amd_iommu *iommu)
214{ 214{
215 u32 entry; 215 u64 entry;
216 216
217 BUG_ON(iommu->mmio_base == NULL); 217 BUG_ON(iommu->mmio_base == NULL);
218 218
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index 21c831d96af3..04a7f960bbc0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/ioport.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
27#include <linux/clockchips.h> 28#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 29#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h>
31 33
32#include <asm/atomic.h> 34#include <asm/atomic.h>
33#include <asm/smp.h> 35#include <asm/smp.h>
@@ -36,8 +38,14 @@
36#include <asm/desc.h> 38#include <asm/desc.h>
37#include <asm/arch_hooks.h> 39#include <asm/arch_hooks.h>
38#include <asm/hpet.h> 40#include <asm/hpet.h>
41#include <asm/pgalloc.h>
39#include <asm/i8253.h> 42#include <asm/i8253.h>
40#include <asm/nmi.h> 43#include <asm/nmi.h>
44#include <asm/idle.h>
45#include <asm/proto.h>
46#include <asm/timex.h>
47#include <asm/apic.h>
48#include <asm/i8259.h>
41 49
42#include <mach_apic.h> 50#include <mach_apic.h>
43#include <mach_apicdef.h> 51#include <mach_apicdef.h>
@@ -50,16 +58,58 @@
50# error SPURIOUS_APIC_VECTOR definition error 58# error SPURIOUS_APIC_VECTOR definition error
51#endif 59#endif
52 60
53unsigned long mp_lapic_addr; 61#ifdef CONFIG_X86_32
54
55/* 62/*
56 * Knob to control our willingness to enable the local APIC. 63 * Knob to control our willingness to enable the local APIC.
57 * 64 *
58 * +1=force-enable 65 * +1=force-enable
59 */ 66 */
60static int force_enable_local_apic; 67static int force_enable_local_apic;
61int disable_apic; 68/*
69 * APIC command line parameters
70 */
71static int __init parse_lapic(char *arg)
72{
73 force_enable_local_apic = 1;
74 return 0;
75}
76early_param("lapic", parse_lapic);
77/* Local APIC was disabled by the BIOS and enabled by the kernel */
78static int enabled_via_apicbase;
79
80#endif
81
82#ifdef CONFIG_X86_64
83static int apic_calibrate_pmtmr __initdata;
84static __init int setup_apicpmtimer(char *s)
85{
86 apic_calibrate_pmtmr = 1;
87 notsc_setup(NULL);
88 return 0;
89}
90__setup("apicpmtimer", setup_apicpmtimer);
91#endif
92
93#ifdef CONFIG_X86_64
94#define HAVE_X2APIC
95#endif
96
97#ifdef HAVE_X2APIC
98int x2apic;
99/* x2apic enabled before OS handover */
100int x2apic_preenabled;
101int disable_x2apic;
102static __init int setup_nox2apic(char *str)
103{
104 disable_x2apic = 1;
105 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
106 return 0;
107}
108early_param("nox2apic", setup_nox2apic);
109#endif
62 110
111unsigned long mp_lapic_addr;
112int disable_apic;
63/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 113/* Disable local APIC timer from the kernel commandline or via dmi quirk */
64static int disable_apic_timer __cpuinitdata; 114static int disable_apic_timer __cpuinitdata;
65/* Local APIC timer works in C2 */ 115/* Local APIC timer works in C2 */
@@ -110,9 +160,6 @@ static struct clock_event_device lapic_clockevent = {
110}; 160};
111static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 161static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
112 162
113/* Local APIC was disabled by the BIOS and enabled by the kernel */
114static int enabled_via_apicbase;
115
116static unsigned long apic_phys; 163static unsigned long apic_phys;
117 164
118/* 165/*
@@ -202,6 +249,42 @@ static struct apic_ops xapic_ops = {
202struct apic_ops __read_mostly *apic_ops = &xapic_ops; 249struct apic_ops __read_mostly *apic_ops = &xapic_ops;
203EXPORT_SYMBOL_GPL(apic_ops); 250EXPORT_SYMBOL_GPL(apic_ops);
204 251
252#ifdef HAVE_X2APIC
253static void x2apic_wait_icr_idle(void)
254{
255 /* no need to wait for icr idle in x2apic */
256 return;
257}
258
259static u32 safe_x2apic_wait_icr_idle(void)
260{
261 /* no need to wait for icr idle in x2apic */
262 return 0;
263}
264
265void x2apic_icr_write(u32 low, u32 id)
266{
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268}
269
270u64 x2apic_icr_read(void)
271{
272 unsigned long val;
273
274 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
275 return val;
276}
277
278static struct apic_ops x2apic_ops = {
279 .read = native_apic_msr_read,
280 .write = native_apic_msr_write,
281 .icr_read = x2apic_icr_read,
282 .icr_write = x2apic_icr_write,
283 .wait_icr_idle = x2apic_wait_icr_idle,
284 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
285};
286#endif
287
205/** 288/**
206 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 289 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
207 */ 290 */
@@ -219,6 +302,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
219 apic_write(APIC_LVT0, v); 302 apic_write(APIC_LVT0, v);
220} 303}
221 304
305#ifdef CONFIG_X86_32
222/** 306/**
223 * get_physical_broadcast - Get number of physical broadcast IDs 307 * get_physical_broadcast - Get number of physical broadcast IDs
224 */ 308 */
@@ -226,6 +310,7 @@ int get_physical_broadcast(void)
226{ 310{
227 return modern_apic() ? 0xff : 0xf; 311 return modern_apic() ? 0xff : 0xf;
228} 312}
313#endif
229 314
230/** 315/**
231 * lapic_get_maxlvt - get the maximum number of local vector table entries 316 * lapic_get_maxlvt - get the maximum number of local vector table entries
@@ -247,11 +332,7 @@ int lapic_get_maxlvt(void)
247 */ 332 */
248 333
249/* Clock divisor */ 334/* Clock divisor */
250#ifdef CONFG_X86_64
251#define APIC_DIVISOR 1
252#else
253#define APIC_DIVISOR 16 335#define APIC_DIVISOR 16
254#endif
255 336
256/* 337/*
257 * This function sets up the local APIC timer, with a timeout of 338 * This function sets up the local APIC timer, with a timeout of
@@ -383,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
383 * Setup the local APIC timer for this CPU. Copy the initilized values 464 * Setup the local APIC timer for this CPU. Copy the initilized values
384 * of the boot CPU and register the clock event in the framework. 465 * of the boot CPU and register the clock event in the framework.
385 */ 466 */
386static void __devinit setup_APIC_timer(void) 467static void __cpuinit setup_APIC_timer(void)
387{ 468{
388 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 469 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
389 470
@@ -453,14 +534,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
453 } 534 }
454} 535}
455 536
537static int __init calibrate_by_pmtimer(long deltapm, long *delta)
538{
539 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
540 const long pm_thresh = pm_100ms / 100;
541 unsigned long mult;
542 u64 res;
543
544#ifndef CONFIG_X86_PM_TIMER
545 return -1;
546#endif
547
548 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
549
550 /* Check, if the PM timer is available */
551 if (!deltapm)
552 return -1;
553
554 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
555
556 if (deltapm > (pm_100ms - pm_thresh) &&
557 deltapm < (pm_100ms + pm_thresh)) {
558 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
559 } else {
560 res = (((u64)deltapm) * mult) >> 22;
561 do_div(res, 1000000);
562 printk(KERN_WARNING "APIC calibration not consistent "
563 "with PM Timer: %ldms instead of 100ms\n",
564 (long)res);
565 /* Correct the lapic counter value */
566 res = (((u64)(*delta)) * pm_100ms);
567 do_div(res, deltapm);
568 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
569 "%lu (%ld)\n", (unsigned long)res, *delta);
570 *delta = (long)res;
571 }
572
573 return 0;
574}
575
456static int __init calibrate_APIC_clock(void) 576static int __init calibrate_APIC_clock(void)
457{ 577{
458 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 578 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
459 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
460 const long pm_thresh = pm_100ms/100;
461 void (*real_handler)(struct clock_event_device *dev); 579 void (*real_handler)(struct clock_event_device *dev);
462 unsigned long deltaj; 580 unsigned long deltaj;
463 long delta, deltapm; 581 long delta;
464 int pm_referenced = 0; 582 int pm_referenced = 0;
465 583
466 local_irq_disable(); 584 local_irq_disable();
@@ -470,10 +588,10 @@ static int __init calibrate_APIC_clock(void)
470 global_clock_event->event_handler = lapic_cal_handler; 588 global_clock_event->event_handler = lapic_cal_handler;
471 589
472 /* 590 /*
473 * Setup the APIC counter to 1e9. There is no way the lapic 591 * Setup the APIC counter to maximum. There is no way the lapic
474 * can underflow in the 100ms detection time frame 592 * can underflow in the 100ms detection time frame
475 */ 593 */
476 __setup_APIC_LVTT(1000000000, 0, 0); 594 __setup_APIC_LVTT(0xffffffff, 0, 0);
477 595
478 /* Let the interrupts run */ 596 /* Let the interrupts run */
479 local_irq_enable(); 597 local_irq_enable();
@@ -490,34 +608,9 @@ static int __init calibrate_APIC_clock(void)
490 delta = lapic_cal_t1 - lapic_cal_t2; 608 delta = lapic_cal_t1 - lapic_cal_t2;
491 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 609 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
492 610
493 /* Check, if the PM timer is available */ 611 /* we trust the PM based calibration if possible */
494 deltapm = lapic_cal_pm2 - lapic_cal_pm1; 612 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
495 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); 613 &delta);
496
497 if (deltapm) {
498 unsigned long mult;
499 u64 res;
500
501 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
502
503 if (deltapm > (pm_100ms - pm_thresh) &&
504 deltapm < (pm_100ms + pm_thresh)) {
505 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
506 } else {
507 res = (((u64) deltapm) * mult) >> 22;
508 do_div(res, 1000000);
509 printk(KERN_WARNING "APIC calibration not consistent "
510 "with PM Timer: %ldms instead of 100ms\n",
511 (long)res);
512 /* Correct the lapic counter value */
513 res = (((u64) delta) * pm_100ms);
514 do_div(res, deltapm);
515 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
516 "%lu (%ld)\n", (unsigned long) res, delta);
517 delta = (long) res;
518 }
519 pm_referenced = 1;
520 }
521 614
522 /* Calculate the scaled math multiplication factor */ 615 /* Calculate the scaled math multiplication factor */
523 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 616 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -559,7 +652,10 @@ static int __init calibrate_APIC_clock(void)
559 652
560 levt->features &= ~CLOCK_EVT_FEAT_DUMMY; 653 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
561 654
562 /* We trust the pm timer based calibration */ 655 /*
656 * PM timer calibration failed or not turned on
657 * so lets try APIC timer based calibration
658 */
563 if (!pm_referenced) { 659 if (!pm_referenced) {
564 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 660 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
565 661
@@ -652,7 +748,7 @@ void __init setup_boot_APIC_clock(void)
652 setup_APIC_timer(); 748 setup_APIC_timer();
653} 749}
654 750
655void __devinit setup_secondary_APIC_clock(void) 751void __cpuinit setup_secondary_APIC_clock(void)
656{ 752{
657 setup_APIC_timer(); 753 setup_APIC_timer();
658} 754}
@@ -718,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
718 * Besides, if we don't timer interrupts ignore the global 814 * Besides, if we don't timer interrupts ignore the global
719 * interrupt lock, which is the WrongThing (tm) to do. 815 * interrupt lock, which is the WrongThing (tm) to do.
720 */ 816 */
817#ifdef CONFIG_X86_64
818 exit_idle();
819#endif
721 irq_enter(); 820 irq_enter();
722 local_apic_timer_interrupt(); 821 local_apic_timer_interrupt();
723 irq_exit(); 822 irq_exit();
@@ -991,40 +1090,43 @@ void __init init_bsp_APIC(void)
991 1090
992static void __cpuinit lapic_setup_esr(void) 1091static void __cpuinit lapic_setup_esr(void)
993{ 1092{
994 unsigned long oldvalue, value, maxlvt; 1093 unsigned int oldvalue, value, maxlvt;
995 if (lapic_is_integrated() && !esr_disable) { 1094
996 if (esr_disable) { 1095 if (!lapic_is_integrated()) {
997 /* 1096 printk(KERN_INFO "No ESR for 82489DX.\n");
998 * Something untraceable is creating bad interrupts on 1097 return;
999 * secondary quads ... for the moment, just leave the 1098 }
1000 * ESR disabled - we can't do anything useful with the
1001 * errors anyway - mbligh
1002 */
1003 printk(KERN_INFO "Leaving ESR disabled.\n");
1004 return;
1005 }
1006 /* !82489DX */
1007 maxlvt = lapic_get_maxlvt();
1008 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1009 apic_write(APIC_ESR, 0);
1010 oldvalue = apic_read(APIC_ESR);
1011 1099
1012 /* enables sending errors */ 1100 if (esr_disable) {
1013 value = ERROR_APIC_VECTOR;
1014 apic_write(APIC_LVTERR, value);
1015 /* 1101 /*
1016 * spec says clear errors after enabling vector. 1102 * Something untraceable is creating bad interrupts on
1103 * secondary quads ... for the moment, just leave the
1104 * ESR disabled - we can't do anything useful with the
1105 * errors anyway - mbligh
1017 */ 1106 */
1018 if (maxlvt > 3) 1107 printk(KERN_INFO "Leaving ESR disabled.\n");
1019 apic_write(APIC_ESR, 0); 1108 return;
1020 value = apic_read(APIC_ESR);
1021 if (value != oldvalue)
1022 apic_printk(APIC_VERBOSE, "ESR value before enabling "
1023 "vector: 0x%08lx after: 0x%08lx\n",
1024 oldvalue, value);
1025 } else {
1026 printk(KERN_INFO "No ESR for 82489DX.\n");
1027 } 1109 }
1110
1111 maxlvt = lapic_get_maxlvt();
1112 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1113 apic_write(APIC_ESR, 0);
1114 oldvalue = apic_read(APIC_ESR);
1115
1116 /* enables sending errors */
1117 value = ERROR_APIC_VECTOR;
1118 apic_write(APIC_LVTERR, value);
1119
1120 /*
1121 * spec says clear errors after enabling vector.
1122 */
1123 if (maxlvt > 3)
1124 apic_write(APIC_ESR, 0);
1125 value = apic_read(APIC_ESR);
1126 if (value != oldvalue)
1127 apic_printk(APIC_VERBOSE, "ESR value before enabling "
1128 "vector: 0x%08x after: 0x%08x\n",
1129 oldvalue, value);
1028} 1130}
1029 1131
1030 1132
@@ -1033,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void)
1033 */ 1135 */
1034void __cpuinit setup_local_APIC(void) 1136void __cpuinit setup_local_APIC(void)
1035{ 1137{
1036 unsigned long value, integrated; 1138 unsigned int value;
1037 int i, j; 1139 int i, j;
1038 1140
1141#ifdef CONFIG_X86_32
1039 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1142 /* Pound the ESR really hard over the head with a big hammer - mbligh */
1040 if (esr_disable) { 1143 if (lapic_is_integrated() && esr_disable) {
1041 apic_write(APIC_ESR, 0); 1144 apic_write(APIC_ESR, 0);
1042 apic_write(APIC_ESR, 0); 1145 apic_write(APIC_ESR, 0);
1043 apic_write(APIC_ESR, 0); 1146 apic_write(APIC_ESR, 0);
1044 apic_write(APIC_ESR, 0); 1147 apic_write(APIC_ESR, 0);
1045 } 1148 }
1149#endif
1046 1150
1047 integrated = lapic_is_integrated(); 1151 preempt_disable();
1048 1152
1049 /* 1153 /*
1050 * Double-check whether this APIC is really registered. 1154 * Double-check whether this APIC is really registered.
1155 * This is meaningless in clustered apic mode, so we skip it.
1051 */ 1156 */
1052 if (!apic_id_registered()) 1157 if (!apic_id_registered())
1053 WARN_ON_ONCE(1); 1158 BUG();
1054 1159
1055 /* 1160 /*
1056 * Intel recommends to set DFR, LDR and TPR before enabling 1161 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1096,6 +1201,7 @@ void __cpuinit setup_local_APIC(void)
1096 */ 1201 */
1097 value |= APIC_SPIV_APIC_ENABLED; 1202 value |= APIC_SPIV_APIC_ENABLED;
1098 1203
1204#ifdef CONFIG_X86_32
1099 /* 1205 /*
1100 * Some unknown Intel IO/APIC (or APIC) errata is biting us with 1206 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
1101 * certain networking cards. If high frequency interrupts are 1207 * certain networking cards. If high frequency interrupts are
@@ -1116,8 +1222,13 @@ void __cpuinit setup_local_APIC(void)
1116 * See also the comment in end_level_ioapic_irq(). --macro 1222 * See also the comment in end_level_ioapic_irq(). --macro
1117 */ 1223 */
1118 1224
1119 /* Enable focus processor (bit==0) */ 1225 /*
1226 * - enable focus processor (bit==0)
1227 * - 64bit mode always use processor focus
1228 * so no need to set it
1229 */
1120 value &= ~APIC_SPIV_FOCUS_DISABLED; 1230 value &= ~APIC_SPIV_FOCUS_DISABLED;
1231#endif
1121 1232
1122 /* 1233 /*
1123 * Set spurious IRQ vector 1234 * Set spurious IRQ vector
@@ -1154,9 +1265,11 @@ void __cpuinit setup_local_APIC(void)
1154 value = APIC_DM_NMI; 1265 value = APIC_DM_NMI;
1155 else 1266 else
1156 value = APIC_DM_NMI | APIC_LVT_MASKED; 1267 value = APIC_DM_NMI | APIC_LVT_MASKED;
1157 if (!integrated) /* 82489DX */ 1268 if (!lapic_is_integrated()) /* 82489DX */
1158 value |= APIC_LVT_LEVEL_TRIGGER; 1269 value |= APIC_LVT_LEVEL_TRIGGER;
1159 apic_write(APIC_LVT1, value); 1270 apic_write(APIC_LVT1, value);
1271
1272 preempt_enable();
1160} 1273}
1161 1274
1162void __cpuinit end_local_APIC_setup(void) 1275void __cpuinit end_local_APIC_setup(void)
@@ -1177,6 +1290,153 @@ void __cpuinit end_local_APIC_setup(void)
1177 apic_pm_activate(); 1290 apic_pm_activate();
1178} 1291}
1179 1292
1293#ifdef HAVE_X2APIC
1294void check_x2apic(void)
1295{
1296 int msr, msr2;
1297
1298 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1299
1300 if (msr & X2APIC_ENABLE) {
1301 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1302 x2apic_preenabled = x2apic = 1;
1303 apic_ops = &x2apic_ops;
1304 }
1305}
1306
1307void enable_x2apic(void)
1308{
1309 int msr, msr2;
1310
1311 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1312 if (!(msr & X2APIC_ENABLE)) {
1313 printk("Enabling x2apic\n");
1314 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1315 }
1316}
1317
1318void enable_IR_x2apic(void)
1319{
1320#ifdef CONFIG_INTR_REMAP
1321 int ret;
1322 unsigned long flags;
1323
1324 if (!cpu_has_x2apic)
1325 return;
1326
1327 if (!x2apic_preenabled && disable_x2apic) {
1328 printk(KERN_INFO
1329 "Skipped enabling x2apic and Interrupt-remapping "
1330 "because of nox2apic\n");
1331 return;
1332 }
1333
1334 if (x2apic_preenabled && disable_x2apic)
1335 panic("Bios already enabled x2apic, can't enforce nox2apic");
1336
1337 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 printk(KERN_INFO
1339 "Skipped enabling x2apic and Interrupt-remapping "
1340 "because of skipping io-apic setup\n");
1341 return;
1342 }
1343
1344 ret = dmar_table_init();
1345 if (ret) {
1346 printk(KERN_INFO
1347 "dmar_table_init() failed with %d:\n", ret);
1348
1349 if (x2apic_preenabled)
1350 panic("x2apic enabled by bios. But IR enabling failed");
1351 else
1352 printk(KERN_INFO
1353 "Not enabling x2apic,Intr-remapping\n");
1354 return;
1355 }
1356
1357 local_irq_save(flags);
1358 mask_8259A();
1359
1360 ret = save_mask_IO_APIC_setup();
1361 if (ret) {
1362 printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
1363 goto end;
1364 }
1365
1366 ret = enable_intr_remapping(1);
1367
1368 if (ret && x2apic_preenabled) {
1369 local_irq_restore(flags);
1370 panic("x2apic enabled by bios. But IR enabling failed");
1371 }
1372
1373 if (ret)
1374 goto end_restore;
1375
1376 if (!x2apic) {
1377 x2apic = 1;
1378 apic_ops = &x2apic_ops;
1379 enable_x2apic();
1380 }
1381
1382end_restore:
1383 if (ret)
1384 /*
1385 * IR enabling failed
1386 */
1387 restore_IO_APIC_setup();
1388 else
1389 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1390
1391end:
1392 unmask_8259A();
1393 local_irq_restore(flags);
1394
1395 if (!ret) {
1396 if (!x2apic_preenabled)
1397 printk(KERN_INFO
1398 "Enabled x2apic and interrupt-remapping\n");
1399 else
1400 printk(KERN_INFO
1401 "Enabled Interrupt-remapping\n");
1402 } else
1403 printk(KERN_ERR
1404 "Failed to enable Interrupt-remapping and x2apic\n");
1405#else
1406 if (!cpu_has_x2apic)
1407 return;
1408
1409 if (x2apic_preenabled)
1410 panic("x2apic enabled prior OS handover,"
1411 " enable CONFIG_INTR_REMAP");
1412
1413 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1414 " and x2apic\n");
1415#endif
1416
1417 return;
1418}
1419#endif /* HAVE_X2APIC */
1420
1421#ifdef CONFIG_X86_64
1422/*
1423 * Detect and enable local APICs on non-SMP boards.
1424 * Original code written by Keir Fraser.
1425 * On AMD64 we trust the BIOS - if it says no APIC it is likely
1426 * not correctly set up (usually the APIC timer won't work etc.)
1427 */
1428static int __init detect_init_APIC(void)
1429{
1430 if (!cpu_has_apic) {
1431 printk(KERN_INFO "No local APIC present\n");
1432 return -1;
1433 }
1434
1435 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1436 boot_cpu_physical_apicid = 0;
1437 return 0;
1438}
1439#else
1180/* 1440/*
1181 * Detect and initialize APIC 1441 * Detect and initialize APIC
1182 */ 1442 */
@@ -1255,12 +1515,46 @@ no_apic:
1255 printk(KERN_INFO "No local APIC present or hardware disabled\n"); 1515 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1256 return -1; 1516 return -1;
1257} 1517}
1518#endif
1519
1520#ifdef CONFIG_X86_64
1521void __init early_init_lapic_mapping(void)
1522{
1523 unsigned long phys_addr;
1524
1525 /*
1526 * If no local APIC can be found then go out
1527 * : it means there is no mpatable and MADT
1528 */
1529 if (!smp_found_config)
1530 return;
1531
1532 phys_addr = mp_lapic_addr;
1533
1534 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1535 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1536 APIC_BASE, phys_addr);
1537
1538 /*
1539 * Fetch the APIC ID of the BSP in case we have a
1540 * default configuration (or the MP table is broken).
1541 */
1542 boot_cpu_physical_apicid = read_apic_id();
1543}
1544#endif
1258 1545
1259/** 1546/**
1260 * init_apic_mappings - initialize APIC mappings 1547 * init_apic_mappings - initialize APIC mappings
1261 */ 1548 */
1262void __init init_apic_mappings(void) 1549void __init init_apic_mappings(void)
1263{ 1550{
1551#ifdef HAVE_X2APIC
1552 if (x2apic) {
1553 boot_cpu_physical_apicid = read_apic_id();
1554 return;
1555 }
1556#endif
1557
1264 /* 1558 /*
1265 * If no local APIC can be found then set up a fake all 1559 * If no local APIC can be found then set up a fake all
1266 * zeroes page to simulate the local APIC and another 1560 * zeroes page to simulate the local APIC and another
@@ -1273,8 +1567,8 @@ void __init init_apic_mappings(void)
1273 apic_phys = mp_lapic_addr; 1567 apic_phys = mp_lapic_addr;
1274 1568
1275 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1569 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1276 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, 1570 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1277 apic_phys); 1571 APIC_BASE, apic_phys);
1278 1572
1279 /* 1573 /*
1280 * Fetch the APIC ID of the BSP in case we have a 1574 * Fetch the APIC ID of the BSP in case we have a
@@ -1282,18 +1576,27 @@ void __init init_apic_mappings(void)
1282 */ 1576 */
1283 if (boot_cpu_physical_apicid == -1U) 1577 if (boot_cpu_physical_apicid == -1U)
1284 boot_cpu_physical_apicid = read_apic_id(); 1578 boot_cpu_physical_apicid = read_apic_id();
1285
1286} 1579}
1287 1580
1288/* 1581/*
1289 * This initializes the IO-APIC and APIC hardware if this is 1582 * This initializes the IO-APIC and APIC hardware if this is
1290 * a UP kernel. 1583 * a UP kernel.
1291 */ 1584 */
1292
1293int apic_version[MAX_APICS]; 1585int apic_version[MAX_APICS];
1294 1586
1295int __init APIC_init_uniprocessor(void) 1587int __init APIC_init_uniprocessor(void)
1296{ 1588{
1589#ifdef CONFIG_X86_64
1590 if (disable_apic) {
1591 printk(KERN_INFO "Apic disabled\n");
1592 return -1;
1593 }
1594 if (!cpu_has_apic) {
1595 disable_apic = 1;
1596 printk(KERN_INFO "Apic disabled by BIOS\n");
1597 return -1;
1598 }
1599#else
1297 if (!smp_found_config && !cpu_has_apic) 1600 if (!smp_found_config && !cpu_has_apic)
1298 return -1; 1601 return -1;
1299 1602
@@ -1302,39 +1605,68 @@ int __init APIC_init_uniprocessor(void)
1302 */ 1605 */
1303 if (!cpu_has_apic && 1606 if (!cpu_has_apic &&
1304 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1607 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1305 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1608 printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
1306 boot_cpu_physical_apicid); 1609 boot_cpu_physical_apicid);
1307 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1610 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1308 return -1; 1611 return -1;
1309 } 1612 }
1613#endif
1310 1614
1311 verify_local_APIC(); 1615#ifdef HAVE_X2APIC
1616 enable_IR_x2apic();
1617#endif
1618#ifdef CONFIG_X86_64
1619 setup_apic_routing();
1620#endif
1312 1621
1622 verify_local_APIC();
1313 connect_bsp_APIC(); 1623 connect_bsp_APIC();
1314 1624
1625#ifdef CONFIG_X86_64
1626 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
1627#else
1315 /* 1628 /*
1316 * Hack: In case of kdump, after a crash, kernel might be booting 1629 * Hack: In case of kdump, after a crash, kernel might be booting
1317 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid 1630 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1318 * might be zero if read from MP tables. Get it from LAPIC. 1631 * might be zero if read from MP tables. Get it from LAPIC.
1319 */ 1632 */
1320#ifdef CONFIG_CRASH_DUMP 1633# ifdef CONFIG_CRASH_DUMP
1321 boot_cpu_physical_apicid = read_apic_id(); 1634 boot_cpu_physical_apicid = read_apic_id();
1635# endif
1322#endif 1636#endif
1323 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1637 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1324
1325 setup_local_APIC(); 1638 setup_local_APIC();
1326 1639
1640#ifdef CONFIG_X86_64
1641 /*
1642 * Now enable IO-APICs, actually call clear_IO_APIC
1643 * We need clear_IO_APIC before enabling vector on BP
1644 */
1645 if (!skip_ioapic_setup && nr_ioapics)
1646 enable_IO_APIC();
1647#endif
1648
1327#ifdef CONFIG_X86_IO_APIC 1649#ifdef CONFIG_X86_IO_APIC
1328 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) 1650 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1329#endif 1651#endif
1330 localise_nmi_watchdog(); 1652 localise_nmi_watchdog();
1331 end_local_APIC_setup(); 1653 end_local_APIC_setup();
1654
1332#ifdef CONFIG_X86_IO_APIC 1655#ifdef CONFIG_X86_IO_APIC
1333 if (smp_found_config) 1656 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1334 if (!skip_ioapic_setup && nr_ioapics) 1657 setup_IO_APIC();
1335 setup_IO_APIC(); 1658# ifdef CONFIG_X86_64
1659 else
1660 nr_ioapics = 0;
1661# endif
1336#endif 1662#endif
1663
1664#ifdef CONFIG_X86_64
1665 setup_boot_APIC_clock();
1666 check_nmi_watchdog();
1667#else
1337 setup_boot_clock(); 1668 setup_boot_clock();
1669#endif
1338 1670
1339 return 0; 1671 return 0;
1340} 1672}
@@ -1348,8 +1680,11 @@ int __init APIC_init_uniprocessor(void)
1348 */ 1680 */
1349void smp_spurious_interrupt(struct pt_regs *regs) 1681void smp_spurious_interrupt(struct pt_regs *regs)
1350{ 1682{
1351 unsigned long v; 1683 u32 v;
1352 1684
1685#ifdef CONFIG_X86_64
1686 exit_idle();
1687#endif
1353 irq_enter(); 1688 irq_enter();
1354 /* 1689 /*
1355 * Check if this really is a spurious interrupt and ACK it 1690 * Check if this really is a spurious interrupt and ACK it
@@ -1360,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1360 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1695 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1361 ack_APIC_irq(); 1696 ack_APIC_irq();
1362 1697
1698#ifdef CONFIG_X86_64
1699 add_pda(irq_spurious_count, 1);
1700#else
1363 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1701 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1364 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " 1702 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1365 "should never happen.\n", smp_processor_id()); 1703 "should never happen.\n", smp_processor_id());
1366 __get_cpu_var(irq_stat).irq_spurious_count++; 1704 __get_cpu_var(irq_stat).irq_spurious_count++;
1705#endif
1367 irq_exit(); 1706 irq_exit();
1368} 1707}
1369 1708
@@ -1372,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1372 */ 1711 */
1373void smp_error_interrupt(struct pt_regs *regs) 1712void smp_error_interrupt(struct pt_regs *regs)
1374{ 1713{
1375 unsigned long v, v1; 1714 u32 v, v1;
1376 1715
1716#ifdef CONFIG_X86_64
1717 exit_idle();
1718#endif
1377 irq_enter(); 1719 irq_enter();
1378 /* First tickle the hardware, only then report what went on. -- REW */ 1720 /* First tickle the hardware, only then report what went on. -- REW */
1379 v = apic_read(APIC_ESR); 1721 v = apic_read(APIC_ESR);
@@ -1392,7 +1734,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1392 6: Received illegal vector 1734 6: Received illegal vector
1393 7: Illegal register address 1735 7: Illegal register address
1394 */ 1736 */
1395 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1737 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1396 smp_processor_id(), v , v1); 1738 smp_processor_id(), v , v1);
1397 irq_exit(); 1739 irq_exit();
1398} 1740}
@@ -1565,6 +1907,13 @@ void __cpuinit generic_processor_info(int apicid, int version)
1565 cpu_set(cpu, cpu_present_map); 1907 cpu_set(cpu, cpu_present_map);
1566} 1908}
1567 1909
1910#ifdef CONFIG_X86_64
1911int hard_smp_processor_id(void)
1912{
1913 return read_apic_id();
1914}
1915#endif
1916
1568/* 1917/*
1569 * Power management 1918 * Power management
1570 */ 1919 */
@@ -1640,7 +1989,7 @@ static int lapic_resume(struct sys_device *dev)
1640 1989
1641 local_irq_save(flags); 1990 local_irq_save(flags);
1642 1991
1643#ifdef CONFIG_X86_64 1992#ifdef HAVE_X2APIC
1644 if (x2apic) 1993 if (x2apic)
1645 enable_x2apic(); 1994 enable_x2apic();
1646 else 1995 else
@@ -1702,7 +2051,7 @@ static struct sys_device device_lapic = {
1702 .cls = &lapic_sysclass, 2051 .cls = &lapic_sysclass,
1703}; 2052};
1704 2053
1705static void __devinit apic_pm_activate(void) 2054static void __cpuinit apic_pm_activate(void)
1706{ 2055{
1707 apic_pm_state.active = 1; 2056 apic_pm_state.active = 1;
1708} 2057}
@@ -1728,16 +2077,87 @@ static void apic_pm_activate(void) { }
1728 2077
1729#endif /* CONFIG_PM */ 2078#endif /* CONFIG_PM */
1730 2079
2080#ifdef CONFIG_X86_64
1731/* 2081/*
1732 * APIC command line parameters 2082 * apic_is_clustered_box() -- Check if we can expect good TSC
2083 *
2084 * Thus far, the major user of this is IBM's Summit2 series:
2085 *
2086 * Clustered boxes may have unsynced TSC problems if they are
2087 * multi-chassis. Use available data to take a good guess.
2088 * If in doubt, go HPET.
1733 */ 2089 */
1734static int __init parse_lapic(char *arg) 2090__cpuinit int apic_is_clustered_box(void)
1735{ 2091{
1736 force_enable_local_apic = 1; 2092 int i, clusters, zeros;
1737 return 0; 2093 unsigned id;
2094 u16 *bios_cpu_apicid;
2095 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2096
2097 /*
2098 * there is not this kind of box with AMD CPU yet.
2099 * Some AMD box with quadcore cpu and 8 sockets apicid
2100 * will be [4, 0x23] or [8, 0x27] could be thought to
2101 * vsmp box still need checking...
2102 */
2103 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2104 return 0;
2105
2106 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2107 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2108
2109 for (i = 0; i < NR_CPUS; i++) {
2110 /* are we being called early in kernel startup? */
2111 if (bios_cpu_apicid) {
2112 id = bios_cpu_apicid[i];
2113 }
2114 else if (i < nr_cpu_ids) {
2115 if (cpu_present(i))
2116 id = per_cpu(x86_bios_cpu_apicid, i);
2117 else
2118 continue;
2119 }
2120 else
2121 break;
2122
2123 if (id != BAD_APICID)
2124 __set_bit(APIC_CLUSTERID(id), clustermap);
2125 }
2126
2127 /* Problem: Partially populated chassis may not have CPUs in some of
2128 * the APIC clusters they have been allocated. Only present CPUs have
2129 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
2130 * Since clusters are allocated sequentially, count zeros only if
2131 * they are bounded by ones.
2132 */
2133 clusters = 0;
2134 zeros = 0;
2135 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
2136 if (test_bit(i, clustermap)) {
2137 clusters += 1 + zeros;
2138 zeros = 0;
2139 } else
2140 ++zeros;
2141 }
2142
2143 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2144 * not guaranteed to be synced between boards
2145 */
2146 if (is_vsmp_box() && clusters > 1)
2147 return 1;
2148
2149 /*
2150 * If clusters > 2, then should be multi-chassis.
2151 * May have to revisit this when multi-core + hyperthreaded CPUs come
2152 * out, but AFAIK this will work even for them.
2153 */
2154 return (clusters > 2);
1738} 2155}
1739early_param("lapic", parse_lapic); 2156#endif
1740 2157
2158/*
2159 * APIC command line parameters
2160 */
1741static int __init setup_disableapic(char *arg) 2161static int __init setup_disableapic(char *arg)
1742{ 2162{
1743 disable_apic = 1; 2163 disable_apic = 1;
@@ -1779,7 +2199,6 @@ static int __init apic_set_verbosity(char *arg)
1779 if (!arg) { 2199 if (!arg) {
1780#ifdef CONFIG_X86_64 2200#ifdef CONFIG_X86_64
1781 skip_ioapic_setup = 0; 2201 skip_ioapic_setup = 0;
1782 ioapic_force = 1;
1783 return 0; 2202 return 0;
1784#endif 2203#endif
1785 return -EINVAL; 2204 return -EINVAL;
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 94ddb69ae15e..000000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1848 +0,0 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/ioport.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30#include <linux/dmar.h>
31
32#include <asm/atomic.h>
33#include <asm/smp.h>
34#include <asm/mtrr.h>
35#include <asm/mpspec.h>
36#include <asm/hpet.h>
37#include <asm/pgalloc.h>
38#include <asm/nmi.h>
39#include <asm/idle.h>
40#include <asm/proto.h>
41#include <asm/timex.h>
42#include <asm/apic.h>
43#include <asm/i8259.h>
44
45#include <mach_ipi.h>
46#include <mach_apic.h>
47
48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
49static int disable_apic_timer __cpuinitdata;
50static int apic_calibrate_pmtmr __initdata;
51int disable_apic;
52int disable_x2apic;
53int x2apic;
54
55/* x2apic enabled before OS handover */
56int x2apic_preenabled;
57
58/* Local APIC timer works in C2 */
59int local_apic_timer_c2_ok;
60EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
61
62/*
63 * Debug level, exported for io_apic.c
64 */
65unsigned int apic_verbosity;
66
67/* Have we found an MP table */
68int smp_found_config;
69
70static struct resource lapic_resource = {
71 .name = "Local APIC",
72 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
73};
74
75static unsigned int calibration_result;
76
77static int lapic_next_event(unsigned long delta,
78 struct clock_event_device *evt);
79static void lapic_timer_setup(enum clock_event_mode mode,
80 struct clock_event_device *evt);
81static void lapic_timer_broadcast(cpumask_t mask);
82static void apic_pm_activate(void);
83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
87static struct clock_event_device lapic_clockevent = {
88 .name = "lapic",
89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
90 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
91 .shift = 32,
92 .set_mode = lapic_timer_setup,
93 .set_next_event = lapic_next_event,
94 .broadcast = lapic_timer_broadcast,
95 .rating = 100,
96 .irq = -1,
97};
98static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
99
100static unsigned long apic_phys;
101
102unsigned long mp_lapic_addr;
103
104/*
105 * Get the LAPIC version
106 */
107static inline int lapic_get_version(void)
108{
109 return GET_APIC_VERSION(apic_read(APIC_LVR));
110}
111
112/*
113 * Check, if the APIC is integrated or a separate chip
114 */
115static inline int lapic_is_integrated(void)
116{
117#ifdef CONFIG_X86_64
118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
122}
123
124/*
125 * Check, whether this is a modern or a first generation APIC
126 */
127static int modern_apic(void)
128{
129 /* AMD systems use old APIC versions, so check the CPU */
130 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
131 boot_cpu_data.x86 >= 0xf)
132 return 1;
133 return lapic_get_version() >= 0x14;
134}
135
136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
141void xapic_wait_icr_idle(void)
142{
143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
144 cpu_relax();
145}
146
147u32 safe_xapic_wait_icr_idle(void)
148{
149 u32 send_status;
150 int timeout;
151
152 timeout = 0;
153 do {
154 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
155 if (!send_status)
156 break;
157 udelay(100);
158 } while (timeout++ < 1000);
159
160 return send_status;
161}
162
163void xapic_icr_write(u32 low, u32 id)
164{
165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
166 apic_write(APIC_ICR, low);
167}
168
169u64 xapic_icr_read(void)
170{
171 u32 icr1, icr2;
172
173 icr2 = apic_read(APIC_ICR2);
174 icr1 = apic_read(APIC_ICR);
175
176 return icr1 | ((u64)icr2 << 32);
177}
178
179static struct apic_ops xapic_ops = {
180 .read = native_apic_mem_read,
181 .write = native_apic_mem_write,
182 .icr_read = xapic_icr_read,
183 .icr_write = xapic_icr_write,
184 .wait_icr_idle = xapic_wait_icr_idle,
185 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
186};
187
188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
189EXPORT_SYMBOL_GPL(apic_ops);
190
191static void x2apic_wait_icr_idle(void)
192{
193 /* no need to wait for icr idle in x2apic */
194 return;
195}
196
197static u32 safe_x2apic_wait_icr_idle(void)
198{
199 /* no need to wait for icr idle in x2apic */
200 return 0;
201}
202
203void x2apic_icr_write(u32 low, u32 id)
204{
205 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
206}
207
208u64 x2apic_icr_read(void)
209{
210 unsigned long val;
211
212 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
213 return val;
214}
215
216static struct apic_ops x2apic_ops = {
217 .read = native_apic_msr_read,
218 .write = native_apic_msr_write,
219 .icr_read = x2apic_icr_read,
220 .icr_write = x2apic_icr_write,
221 .wait_icr_idle = x2apic_wait_icr_idle,
222 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
223};
224
225/**
226 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
227 */
228void __cpuinit enable_NMI_through_LVT0(void)
229{
230 unsigned int v;
231
232 /* unmask and set to NMI */
233 v = APIC_DM_NMI;
234
235 /* Level triggered for 82489DX (32bit mode) */
236 if (!lapic_is_integrated())
237 v |= APIC_LVT_LEVEL_TRIGGER;
238
239 apic_write(APIC_LVT0, v);
240}
241
242/**
243 * lapic_get_maxlvt - get the maximum number of local vector table entries
244 */
245int lapic_get_maxlvt(void)
246{
247 unsigned int v;
248
249 v = apic_read(APIC_LVR);
250 /*
251 * - we always have APIC integrated on 64bit mode
252 * - 82489DXs do not report # of LVT entries
253 */
254 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
255}
256
257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
262#ifdef CONFG_X86_64
263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
269 * This function sets up the local APIC timer, with a timeout of
270 * 'clocks' APIC bus clock. During calibration we actually call
271 * this function twice on the boot CPU, once with a bogus timeout
272 * value, second time for real. The other (noncalibrating) CPUs
273 * call this function only once, with the real, calibrated value.
274 *
275 * We do reads before writes even if unnecessary, to get around the
276 * P5 APIC double write bug.
277 */
278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
279{
280 unsigned int lvtt_value, tmp_value;
281
282 lvtt_value = LOCAL_TIMER_VECTOR;
283 if (!oneshot)
284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
288 if (!irqen)
289 lvtt_value |= APIC_LVT_MASKED;
290
291 apic_write(APIC_LVTT, lvtt_value);
292
293 /*
294 * Divide PICLK by 16
295 */
296 tmp_value = apic_read(APIC_TDCR);
297 apic_write(APIC_TDCR,
298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
299 APIC_TDR_DIV_16);
300
301 if (!oneshot)
302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
303}
304
305/*
306 * Setup extended LVT, AMD specific (K8, family 10h)
307 *
308 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
309 * MCE interrupts are supported. Thus MCE offset must be set to 0.
310 *
311 * If mask=1, the LVT entry does not generate interrupts while mask=0
312 * enables the vector. See also the BKDGs.
313 */
314
315#define APIC_EILVT_LVTOFF_MCE 0
316#define APIC_EILVT_LVTOFF_IBS 1
317
318static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
319{
320 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
321 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
322
323 apic_write(reg, v);
324}
325
326u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
327{
328 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
329 return APIC_EILVT_LVTOFF_MCE;
330}
331
332u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
333{
334 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
335 return APIC_EILVT_LVTOFF_IBS;
336}
337EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
338
339/*
340 * Program the next event, relative to now
341 */
342static int lapic_next_event(unsigned long delta,
343 struct clock_event_device *evt)
344{
345 apic_write(APIC_TMICT, delta);
346 return 0;
347}
348
349/*
350 * Setup the lapic timer in periodic or oneshot mode
351 */
352static void lapic_timer_setup(enum clock_event_mode mode,
353 struct clock_event_device *evt)
354{
355 unsigned long flags;
356 unsigned int v;
357
358 /* Lapic used as dummy for broadcast ? */
359 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
360 return;
361
362 local_irq_save(flags);
363
364 switch (mode) {
365 case CLOCK_EVT_MODE_PERIODIC:
366 case CLOCK_EVT_MODE_ONESHOT:
367 __setup_APIC_LVTT(calibration_result,
368 mode != CLOCK_EVT_MODE_PERIODIC, 1);
369 break;
370 case CLOCK_EVT_MODE_UNUSED:
371 case CLOCK_EVT_MODE_SHUTDOWN:
372 v = apic_read(APIC_LVTT);
373 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
374 apic_write(APIC_LVTT, v);
375 break;
376 case CLOCK_EVT_MODE_RESUME:
377 /* Nothing to do here */
378 break;
379 }
380
381 local_irq_restore(flags);
382}
383
384/*
385 * Local APIC timer broadcast function
386 */
387static void lapic_timer_broadcast(cpumask_t mask)
388{
389#ifdef CONFIG_SMP
390 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
391#endif
392}
393
394/*
395 * Setup the local APIC timer for this CPU. Copy the initilized values
396 * of the boot CPU and register the clock event in the framework.
397 */
398static void setup_APIC_timer(void)
399{
400 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
401
402 memcpy(levt, &lapic_clockevent, sizeof(*levt));
403 levt->cpumask = cpumask_of_cpu(smp_processor_id());
404
405 clockevents_register_device(levt);
406}
407
408/*
409 * In this function we calibrate APIC bus clocks to the external
410 * timer. Unfortunately we cannot use jiffies and the timer irq
411 * to calibrate, since some later bootup code depends on getting
412 * the first irq? Ugh.
413 *
414 * We want to do the calibration only once since we
415 * want to have local timer irqs syncron. CPUs connected
416 * by the same APIC bus have the very same bus frequency.
417 * And we want to have irqs off anyways, no accidental
418 * APIC irq that way.
419 */
420
421#define TICK_COUNT 100000000
422
423static int __init calibrate_APIC_clock(void)
424{
425 unsigned apic, apic_start;
426 unsigned long tsc, tsc_start;
427 int result;
428
429 local_irq_disable();
430
431 /*
432 * Put whatever arbitrary (but long enough) timeout
433 * value into the APIC clock, we just want to get the
434 * counter running for calibration.
435 *
436 * No interrupt enable !
437 */
438 __setup_APIC_LVTT(250000000, 0, 0);
439
440 apic_start = apic_read(APIC_TMCCT);
441#ifdef CONFIG_X86_PM_TIMER
442 if (apic_calibrate_pmtmr && pmtmr_ioport) {
443 pmtimer_wait(5000); /* 5ms wait */
444 apic = apic_read(APIC_TMCCT);
445 result = (apic_start - apic) * 1000L / 5;
446 } else
447#endif
448 {
449 rdtscll(tsc_start);
450
451 do {
452 apic = apic_read(APIC_TMCCT);
453 rdtscll(tsc);
454 } while ((tsc - tsc_start) < TICK_COUNT &&
455 (apic_start - apic) < TICK_COUNT);
456
457 result = (apic_start - apic) * 1000L * tsc_khz /
458 (tsc - tsc_start);
459 }
460
461 local_irq_enable();
462
463 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
464
465 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
466 result / 1000 / 1000, result / 1000 % 1000);
467
468 /* Calculate the scaled math multiplication factor */
469 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
470 lapic_clockevent.shift);
471 lapic_clockevent.max_delta_ns =
472 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
473 lapic_clockevent.min_delta_ns =
474 clockevent_delta2ns(0xF, &lapic_clockevent);
475
476 calibration_result = (result * APIC_DIVISOR) / HZ;
477
478 /*
479 * Do a sanity check on the APIC calibration result
480 */
481 if (calibration_result < (1000000 / HZ)) {
482 printk(KERN_WARNING
483 "APIC frequency too slow, disabling apic timer\n");
484 return -1;
485 }
486
487 return 0;
488}
489
490/*
491 * Setup the boot APIC
492 *
493 * Calibrate and verify the result.
494 */
495void __init setup_boot_APIC_clock(void)
496{
497 /*
498 * The local apic timer can be disabled via the kernel
499 * commandline or from the CPU detection code. Register the lapic
500 * timer as a dummy clock event source on SMP systems, so the
501 * broadcast mechanism is used. On UP systems simply ignore it.
502 */
503 if (disable_apic_timer) {
504 printk(KERN_INFO "Disabling APIC timer\n");
505 /* No broadcast on UP ! */
506 if (num_possible_cpus() > 1) {
507 lapic_clockevent.mult = 1;
508 setup_APIC_timer();
509 }
510 return;
511 }
512
513 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
514 "calibrating APIC timer ...\n");
515
516 if (calibrate_APIC_clock()) {
517 /* No broadcast on UP ! */
518 if (num_possible_cpus() > 1)
519 setup_APIC_timer();
520 return;
521 }
522
523 /*
524 * If nmi_watchdog is set to IO_APIC, we need the
525 * PIT/HPET going. Otherwise register lapic as a dummy
526 * device.
527 */
528 if (nmi_watchdog != NMI_IO_APIC)
529 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
530 else
531 printk(KERN_WARNING "APIC timer registered as dummy,"
532 " due to nmi_watchdog=%d!\n", nmi_watchdog);
533
534 /* Setup the lapic or request the broadcast */
535 setup_APIC_timer();
536}
537
538void __cpuinit setup_secondary_APIC_clock(void)
539{
540 setup_APIC_timer();
541}
542
543/*
544 * The guts of the apic timer interrupt
545 */
546static void local_apic_timer_interrupt(void)
547{
548 int cpu = smp_processor_id();
549 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
550
551 /*
552 * Normally we should not be here till LAPIC has been initialized but
553 * in some cases like kdump, its possible that there is a pending LAPIC
554 * timer interrupt from previous kernel's context and is delivered in
555 * new kernel the moment interrupts are enabled.
556 *
557 * Interrupts are enabled early and LAPIC is setup much later, hence
558 * its possible that when we get here evt->event_handler is NULL.
559 * Check for event_handler being NULL and discard the interrupt as
560 * spurious.
561 */
562 if (!evt->event_handler) {
563 printk(KERN_WARNING
564 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
565 /* Switch it off */
566 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
567 return;
568 }
569
570 /*
571 * the NMI deadlock-detector uses this.
572 */
573#ifdef CONFIG_X86_64
574 add_pda(apic_timer_irqs, 1);
575#else
576 per_cpu(irq_stat, cpu).apic_timer_irqs++;
577#endif
578
579 evt->event_handler(evt);
580}
581
582/*
583 * Local APIC timer interrupt. This is the most natural way for doing
584 * local interrupts, but local timer interrupts can be emulated by
585 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
586 *
587 * [ if a single-CPU system runs an SMP kernel then we call the local
588 * interrupt as well. Thus we cannot inline the local irq ... ]
589 */
590void smp_apic_timer_interrupt(struct pt_regs *regs)
591{
592 struct pt_regs *old_regs = set_irq_regs(regs);
593
594 /*
595 * NOTE! We'd better ACK the irq immediately,
596 * because timer handling can be slow.
597 */
598 ack_APIC_irq();
599 /*
600 * update_process_times() expects us to have done irq_enter().
601 * Besides, if we don't timer interrupts ignore the global
602 * interrupt lock, which is the WrongThing (tm) to do.
603 */
604 exit_idle();
605 irq_enter();
606 local_apic_timer_interrupt();
607 irq_exit();
608
609 set_irq_regs(old_regs);
610}
611
612int setup_profiling_timer(unsigned int multiplier)
613{
614 return -EINVAL;
615}
616
617
618/*
619 * Local APIC start and shutdown
620 */
621
622/**
623 * clear_local_APIC - shutdown the local APIC
624 *
625 * This is called, when a CPU is disabled and before rebooting, so the state of
626 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
627 * leftovers during boot.
628 */
629void clear_local_APIC(void)
630{
631 int maxlvt;
632 u32 v;
633
634 /* APIC hasn't been mapped yet */
635 if (!apic_phys)
636 return;
637
638 maxlvt = lapic_get_maxlvt();
639 /*
640 * Masking an LVT entry can trigger a local APIC error
641 * if the vector is zero. Mask LVTERR first to prevent this.
642 */
643 if (maxlvt >= 3) {
644 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
645 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
646 }
647 /*
648 * Careful: we have to set masks only first to deassert
649 * any level-triggered sources.
650 */
651 v = apic_read(APIC_LVTT);
652 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
653 v = apic_read(APIC_LVT0);
654 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
655 v = apic_read(APIC_LVT1);
656 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
657 if (maxlvt >= 4) {
658 v = apic_read(APIC_LVTPC);
659 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
660 }
661
662 /* lets not touch this if we didn't frob it */
663#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
664 if (maxlvt >= 5) {
665 v = apic_read(APIC_LVTTHMR);
666 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
667 }
668#endif
669 /*
670 * Clean APIC state for other OSs:
671 */
672 apic_write(APIC_LVTT, APIC_LVT_MASKED);
673 apic_write(APIC_LVT0, APIC_LVT_MASKED);
674 apic_write(APIC_LVT1, APIC_LVT_MASKED);
675 if (maxlvt >= 3)
676 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
677 if (maxlvt >= 4)
678 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
679
680 /* Integrated APIC (!82489DX) ? */
681 if (lapic_is_integrated()) {
682 if (maxlvt > 3)
683 /* Clear ESR due to Pentium errata 3AP and 11AP */
684 apic_write(APIC_ESR, 0);
685 apic_read(APIC_ESR);
686 }
687}
688
689/**
690 * disable_local_APIC - clear and disable the local APIC
691 */
692void disable_local_APIC(void)
693{
694 unsigned int value;
695
696 clear_local_APIC();
697
698 /*
699 * Disable APIC (implies clearing of registers
700 * for 82489DX!).
701 */
702 value = apic_read(APIC_SPIV);
703 value &= ~APIC_SPIV_APIC_ENABLED;
704 apic_write(APIC_SPIV, value);
705
706#ifdef CONFIG_X86_32
707 /*
708 * When LAPIC was disabled by the BIOS and enabled by the kernel,
709 * restore the disabled state.
710 */
711 if (enabled_via_apicbase) {
712 unsigned int l, h;
713
714 rdmsr(MSR_IA32_APICBASE, l, h);
715 l &= ~MSR_IA32_APICBASE_ENABLE;
716 wrmsr(MSR_IA32_APICBASE, l, h);
717 }
718#endif
719}
720
721/*
722 * If Linux enabled the LAPIC against the BIOS default disable it down before
723 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
724 * not power-off. Additionally clear all LVT entries before disable_local_APIC
725 * for the case where Linux didn't enable the LAPIC.
726 */
727void lapic_shutdown(void)
728{
729 unsigned long flags;
730
731 if (!cpu_has_apic)
732 return;
733
734 local_irq_save(flags);
735
736#ifdef CONFIG_X86_32
737 if (!enabled_via_apicbase)
738 clear_local_APIC();
739 else
740#endif
741 disable_local_APIC();
742
743
744 local_irq_restore(flags);
745}
746
747/*
748 * This is to verify that we're looking at a real local APIC.
749 * Check these against your board if the CPUs aren't getting
750 * started for no apparent reason.
751 */
752int __init verify_local_APIC(void)
753{
754 unsigned int reg0, reg1;
755
756 /*
757 * The version register is read-only in a real APIC.
758 */
759 reg0 = apic_read(APIC_LVR);
760 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
761 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
762 reg1 = apic_read(APIC_LVR);
763 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
764
765 /*
766 * The two version reads above should print the same
767 * numbers. If the second one is different, then we
768 * poke at a non-APIC.
769 */
770 if (reg1 != reg0)
771 return 0;
772
773 /*
774 * Check if the version looks reasonably.
775 */
776 reg1 = GET_APIC_VERSION(reg0);
777 if (reg1 == 0x00 || reg1 == 0xff)
778 return 0;
779 reg1 = lapic_get_maxlvt();
780 if (reg1 < 0x02 || reg1 == 0xff)
781 return 0;
782
783 /*
784 * The ID register is read/write in a real APIC.
785 */
786 reg0 = apic_read(APIC_ID);
787 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
788 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
789 reg1 = apic_read(APIC_ID);
790 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
791 apic_write(APIC_ID, reg0);
792 if (reg1 != (reg0 ^ APIC_ID_MASK))
793 return 0;
794
795 /*
796 * The next two are just to see if we have sane values.
797 * They're only really relevant if we're in Virtual Wire
798 * compatibility mode, but most boxes are anymore.
799 */
800 reg0 = apic_read(APIC_LVT0);
801 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
802 reg1 = apic_read(APIC_LVT1);
803 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
804
805 return 1;
806}
807
808/**
809 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
810 */
811void __init sync_Arb_IDs(void)
812{
813 /*
814 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
815 * needed on AMD.
816 */
817 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
818 return;
819
820 /*
821 * Wait for idle.
822 */
823 apic_wait_icr_idle();
824
825 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
826 apic_write(APIC_ICR, APIC_DEST_ALLINC |
827 APIC_INT_LEVELTRIG | APIC_DM_INIT);
828}
829
830/*
831 * An initial setup of the virtual wire mode.
832 */
833void __init init_bsp_APIC(void)
834{
835 unsigned int value;
836
837 /*
838 * Don't do the setup now if we have a SMP BIOS as the
839 * through-I/O-APIC virtual wire mode might be active.
840 */
841 if (smp_found_config || !cpu_has_apic)
842 return;
843
844 /*
845 * Do not trust the local APIC being empty at bootup.
846 */
847 clear_local_APIC();
848
849 /*
850 * Enable APIC.
851 */
852 value = apic_read(APIC_SPIV);
853 value &= ~APIC_VECTOR_MASK;
854 value |= APIC_SPIV_APIC_ENABLED;
855
856#ifdef CONFIG_X86_32
857 /* This bit is reserved on P4/Xeon and should be cleared */
858 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
859 (boot_cpu_data.x86 == 15))
860 value &= ~APIC_SPIV_FOCUS_DISABLED;
861 else
862#endif
863 value |= APIC_SPIV_FOCUS_DISABLED;
864 value |= SPURIOUS_APIC_VECTOR;
865 apic_write(APIC_SPIV, value);
866
867 /*
868 * Set up the virtual wire mode.
869 */
870 apic_write(APIC_LVT0, APIC_DM_EXTINT);
871 value = APIC_DM_NMI;
872 if (!lapic_is_integrated()) /* 82489DX */
873 value |= APIC_LVT_LEVEL_TRIGGER;
874 apic_write(APIC_LVT1, value);
875}
876
877static void __cpuinit lapic_setup_esr(void)
878{
879 unsigned long oldvalue, value, maxlvt;
880 if (lapic_is_integrated() && !esr_disable) {
881 if (esr_disable) {
882 /*
883 * Something untraceable is creating bad interrupts on
884 * secondary quads ... for the moment, just leave the
885 * ESR disabled - we can't do anything useful with the
886 * errors anyway - mbligh
887 */
888 printk(KERN_INFO "Leaving ESR disabled.\n");
889 return;
890 }
891 /* !82489DX */
892 maxlvt = lapic_get_maxlvt();
893 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
894 apic_write(APIC_ESR, 0);
895 oldvalue = apic_read(APIC_ESR);
896
897 /* enables sending errors */
898 value = ERROR_APIC_VECTOR;
899 apic_write(APIC_LVTERR, value);
900 /*
901 * spec says clear errors after enabling vector.
902 */
903 if (maxlvt > 3)
904 apic_write(APIC_ESR, 0);
905 value = apic_read(APIC_ESR);
906 if (value != oldvalue)
907 apic_printk(APIC_VERBOSE, "ESR value before enabling "
908 "vector: 0x%08lx after: 0x%08lx\n",
909 oldvalue, value);
910 } else {
911 printk(KERN_INFO "No ESR for 82489DX.\n");
912 }
913}
914
915
916/**
917 * setup_local_APIC - setup the local APIC
918 */
919void __cpuinit setup_local_APIC(void)
920{
921 unsigned int value;
922 int i, j;
923
924 preempt_disable();
925 value = apic_read(APIC_LVR);
926
927 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
928
929 /*
930 * Double-check whether this APIC is really registered.
931 * This is meaningless in clustered apic mode, so we skip it.
932 */
933 if (!apic_id_registered())
934 BUG();
935
936 /*
937 * Intel recommends to set DFR, LDR and TPR before enabling
938 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
939 * document number 292116). So here it goes...
940 */
941 init_apic_ldr();
942
943 /*
944 * Set Task Priority to 'accept all'. We never change this
945 * later on.
946 */
947 value = apic_read(APIC_TASKPRI);
948 value &= ~APIC_TPRI_MASK;
949 apic_write(APIC_TASKPRI, value);
950
951 /*
952 * After a crash, we no longer service the interrupts and a pending
953 * interrupt from previous kernel might still have ISR bit set.
954 *
955 * Most probably by now CPU has serviced that pending interrupt and
956 * it might not have done the ack_APIC_irq() because it thought,
957 * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
958 * does not clear the ISR bit and cpu thinks it has already serivced
959 * the interrupt. Hence a vector might get locked. It was noticed
960 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
961 */
962 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
963 value = apic_read(APIC_ISR + i*0x10);
964 for (j = 31; j >= 0; j--) {
965 if (value & (1<<j))
966 ack_APIC_irq();
967 }
968 }
969
970 /*
971 * Now that we are all set up, enable the APIC
972 */
973 value = apic_read(APIC_SPIV);
974 value &= ~APIC_VECTOR_MASK;
975 /*
976 * Enable APIC
977 */
978 value |= APIC_SPIV_APIC_ENABLED;
979
980 /* We always use processor focus */
981
982 /*
983 * Set spurious IRQ vector
984 */
985 value |= SPURIOUS_APIC_VECTOR;
986 apic_write(APIC_SPIV, value);
987
988 /*
989 * Set up LVT0, LVT1:
990 *
991 * set up through-local-APIC on the BP's LINT0. This is not
992 * strictly necessary in pure symmetric-IO mode, but sometimes
993 * we delegate interrupts to the 8259A.
994 */
995 /*
996 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
997 */
998 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
999 if (!smp_processor_id() && !value) {
1000 value = APIC_DM_EXTINT;
1001 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
1002 smp_processor_id());
1003 } else {
1004 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
1005 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1006 smp_processor_id());
1007 }
1008 apic_write(APIC_LVT0, value);
1009
1010 /*
1011 * only the BP should see the LINT1 NMI signal, obviously.
1012 */
1013 if (!smp_processor_id())
1014 value = APIC_DM_NMI;
1015 else
1016 value = APIC_DM_NMI | APIC_LVT_MASKED;
1017 apic_write(APIC_LVT1, value);
1018 preempt_enable();
1019}
1020
1021void __cpuinit end_local_APIC_setup(void)
1022{
1023 lapic_setup_esr();
1024
1025#ifdef CONFIG_X86_32
1026 {
1027 unsigned int value;
1028 /* Disable the local apic timer */
1029 value = apic_read(APIC_LVTT);
1030 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1031 apic_write(APIC_LVTT, value);
1032 }
1033#endif
1034
1035 setup_apic_nmi_watchdog(NULL);
1036 apic_pm_activate();
1037}
1038
1039void check_x2apic(void)
1040{
1041 int msr, msr2;
1042
1043 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1044
1045 if (msr & X2APIC_ENABLE) {
1046 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1047 x2apic_preenabled = x2apic = 1;
1048 apic_ops = &x2apic_ops;
1049 }
1050}
1051
1052void enable_x2apic(void)
1053{
1054 int msr, msr2;
1055
1056 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1057 if (!(msr & X2APIC_ENABLE)) {
1058 printk("Enabling x2apic\n");
1059 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1060 }
1061}
1062
1063void enable_IR_x2apic(void)
1064{
1065#ifdef CONFIG_INTR_REMAP
1066 int ret;
1067 unsigned long flags;
1068
1069 if (!cpu_has_x2apic)
1070 return;
1071
1072 if (!x2apic_preenabled && disable_x2apic) {
1073 printk(KERN_INFO
1074 "Skipped enabling x2apic and Interrupt-remapping "
1075 "because of nox2apic\n");
1076 return;
1077 }
1078
1079 if (x2apic_preenabled && disable_x2apic)
1080 panic("Bios already enabled x2apic, can't enforce nox2apic");
1081
1082 if (!x2apic_preenabled && skip_ioapic_setup) {
1083 printk(KERN_INFO
1084 "Skipped enabling x2apic and Interrupt-remapping "
1085 "because of skipping io-apic setup\n");
1086 return;
1087 }
1088
1089 ret = dmar_table_init();
1090 if (ret) {
1091 printk(KERN_INFO
1092 "dmar_table_init() failed with %d:\n", ret);
1093
1094 if (x2apic_preenabled)
1095 panic("x2apic enabled by bios. But IR enabling failed");
1096 else
1097 printk(KERN_INFO
1098 "Not enabling x2apic,Intr-remapping\n");
1099 return;
1100 }
1101
1102 local_irq_save(flags);
1103 mask_8259A();
1104 save_mask_IO_APIC_setup();
1105
1106 ret = enable_intr_remapping(1);
1107
1108 if (ret && x2apic_preenabled) {
1109 local_irq_restore(flags);
1110 panic("x2apic enabled by bios. But IR enabling failed");
1111 }
1112
1113 if (ret)
1114 goto end;
1115
1116 if (!x2apic) {
1117 x2apic = 1;
1118 apic_ops = &x2apic_ops;
1119 enable_x2apic();
1120 }
1121end:
1122 if (ret)
1123 /*
1124 * IR enabling failed
1125 */
1126 restore_IO_APIC_setup();
1127 else
1128 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1129
1130 unmask_8259A();
1131 local_irq_restore(flags);
1132
1133 if (!ret) {
1134 if (!x2apic_preenabled)
1135 printk(KERN_INFO
1136 "Enabled x2apic and interrupt-remapping\n");
1137 else
1138 printk(KERN_INFO
1139 "Enabled Interrupt-remapping\n");
1140 } else
1141 printk(KERN_ERR
1142 "Failed to enable Interrupt-remapping and x2apic\n");
1143#else
1144 if (!cpu_has_x2apic)
1145 return;
1146
1147 if (x2apic_preenabled)
1148 panic("x2apic enabled prior OS handover,"
1149 " enable CONFIG_INTR_REMAP");
1150
1151 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1152 " and x2apic\n");
1153#endif
1154
1155 return;
1156}
1157
1158/*
1159 * Detect and enable local APICs on non-SMP boards.
1160 * Original code written by Keir Fraser.
1161 * On AMD64 we trust the BIOS - if it says no APIC it is likely
1162 * not correctly set up (usually the APIC timer won't work etc.)
1163 */
1164static int __init detect_init_APIC(void)
1165{
1166 if (!cpu_has_apic) {
1167 printk(KERN_INFO "No local APIC present\n");
1168 return -1;
1169 }
1170
1171 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1172 boot_cpu_physical_apicid = 0;
1173 return 0;
1174}
1175
1176void __init early_init_lapic_mapping(void)
1177{
1178 unsigned long phys_addr;
1179
1180 /*
1181 * If no local APIC can be found then go out
1182 * : it means there is no mpatable and MADT
1183 */
1184 if (!smp_found_config)
1185 return;
1186
1187 phys_addr = mp_lapic_addr;
1188
1189 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1190 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1191 APIC_BASE, phys_addr);
1192
1193 /*
1194 * Fetch the APIC ID of the BSP in case we have a
1195 * default configuration (or the MP table is broken).
1196 */
1197 boot_cpu_physical_apicid = read_apic_id();
1198}
1199
1200/**
1201 * init_apic_mappings - initialize APIC mappings
1202 */
1203void __init init_apic_mappings(void)
1204{
1205 if (x2apic) {
1206 boot_cpu_physical_apicid = read_apic_id();
1207 return;
1208 }
1209
1210 /*
1211 * If no local APIC can be found then set up a fake all
1212 * zeroes page to simulate the local APIC and another
1213 * one for the IO-APIC.
1214 */
1215 if (!smp_found_config && detect_init_APIC()) {
1216 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
1217 apic_phys = __pa(apic_phys);
1218 } else
1219 apic_phys = mp_lapic_addr;
1220
1221 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1222 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1223 APIC_BASE, apic_phys);
1224
1225 /*
1226 * Fetch the APIC ID of the BSP in case we have a
1227 * default configuration (or the MP table is broken).
1228 */
1229 boot_cpu_physical_apicid = read_apic_id();
1230}
1231
1232/*
1233 * This initializes the IO-APIC and APIC hardware if this is
1234 * a UP kernel.
1235 */
1236int apic_version[MAX_APICS];
1237
1238int __init APIC_init_uniprocessor(void)
1239{
1240 if (disable_apic) {
1241 printk(KERN_INFO "Apic disabled\n");
1242 return -1;
1243 }
1244 if (!cpu_has_apic) {
1245 disable_apic = 1;
1246 printk(KERN_INFO "Apic disabled by BIOS\n");
1247 return -1;
1248 }
1249
1250 enable_IR_x2apic();
1251 setup_apic_routing();
1252
1253 verify_local_APIC();
1254
1255 connect_bsp_APIC();
1256
1257 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1258 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
1259
1260 setup_local_APIC();
1261
1262 /*
1263 * Now enable IO-APICs, actually call clear_IO_APIC
1264 * We need clear_IO_APIC before enabling vector on BP
1265 */
1266 if (!skip_ioapic_setup && nr_ioapics)
1267 enable_IO_APIC();
1268
1269 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1270 localise_nmi_watchdog();
1271 end_local_APIC_setup();
1272
1273 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1274 setup_IO_APIC();
1275 else
1276 nr_ioapics = 0;
1277 setup_boot_APIC_clock();
1278 check_nmi_watchdog();
1279 return 0;
1280}
1281
1282/*
1283 * Local APIC interrupts
1284 */
1285
1286/*
1287 * This interrupt should _never_ happen with our APIC/SMP architecture
1288 */
1289asmlinkage void smp_spurious_interrupt(void)
1290{
1291 unsigned int v;
1292 exit_idle();
1293 irq_enter();
1294 /*
1295 * Check if this really is a spurious interrupt and ACK it
1296 * if it is a vectored one. Just in case...
1297 * Spurious interrupts should not be ACKed.
1298 */
1299 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1300 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1301 ack_APIC_irq();
1302
1303 add_pda(irq_spurious_count, 1);
1304 irq_exit();
1305}
1306
1307/*
1308 * This interrupt should never happen with our APIC/SMP architecture
1309 */
1310asmlinkage void smp_error_interrupt(void)
1311{
1312 unsigned int v, v1;
1313
1314 exit_idle();
1315 irq_enter();
1316 /* First tickle the hardware, only then report what went on. -- REW */
1317 v = apic_read(APIC_ESR);
1318 apic_write(APIC_ESR, 0);
1319 v1 = apic_read(APIC_ESR);
1320 ack_APIC_irq();
1321 atomic_inc(&irq_err_count);
1322
1323 /* Here is what the APIC error bits mean:
1324 0: Send CS error
1325 1: Receive CS error
1326 2: Send accept error
1327 3: Receive accept error
1328 4: Reserved
1329 5: Send illegal vector
1330 6: Received illegal vector
1331 7: Illegal register address
1332 */
1333 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1334 smp_processor_id(), v , v1);
1335 irq_exit();
1336}
1337
1338/**
1339 * connect_bsp_APIC - attach the APIC to the interrupt system
1340 */
1341void __init connect_bsp_APIC(void)
1342{
1343#ifdef CONFIG_X86_32
1344 if (pic_mode) {
1345 /*
1346 * Do not trust the local APIC being empty at bootup.
1347 */
1348 clear_local_APIC();
1349 /*
1350 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1351 * local APIC to INT and NMI lines.
1352 */
1353 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1354 "enabling APIC mode.\n");
1355 outb(0x70, 0x22);
1356 outb(0x01, 0x23);
1357 }
1358#endif
1359 enable_apic_mode();
1360}
1361
1362/**
1363 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1364 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1365 *
1366 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1367 * APIC is disabled.
1368 */
1369void disconnect_bsp_APIC(int virt_wire_setup)
1370{
1371 unsigned int value;
1372
1373#ifdef CONFIG_X86_32
1374 if (pic_mode) {
1375 /*
1376 * Put the board back into PIC mode (has an effect only on
1377 * certain older boards). Note that APIC interrupts, including
1378 * IPIs, won't work beyond this point! The only exception are
1379 * INIT IPIs.
1380 */
1381 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1382 "entering PIC mode.\n");
1383 outb(0x70, 0x22);
1384 outb(0x00, 0x23);
1385 return;
1386 }
1387#endif
1388
1389 /* Go back to Virtual Wire compatibility mode */
1390
1391 /* For the spurious interrupt use vector F, and enable it */
1392 value = apic_read(APIC_SPIV);
1393 value &= ~APIC_VECTOR_MASK;
1394 value |= APIC_SPIV_APIC_ENABLED;
1395 value |= 0xf;
1396 apic_write(APIC_SPIV, value);
1397
1398 if (!virt_wire_setup) {
1399 /*
1400 * For LVT0 make it edge triggered, active high,
1401 * external and enabled
1402 */
1403 value = apic_read(APIC_LVT0);
1404 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1405 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1406 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1407 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1408 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1409 apic_write(APIC_LVT0, value);
1410 } else {
1411 /* Disable LVT0 */
1412 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1413 }
1414
1415 /*
1416 * For LVT1 make it edge triggered, active high,
1417 * nmi and enabled
1418 */
1419 value = apic_read(APIC_LVT1);
1420 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1421 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1422 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1423 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1424 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1425 apic_write(APIC_LVT1, value);
1426}
1427
1428void __cpuinit generic_processor_info(int apicid, int version)
1429{
1430 int cpu;
1431 cpumask_t tmp_map;
1432
1433 /*
1434 * Validate version
1435 */
1436 if (version == 0x0) {
1437 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1438 "fixing up to 0x10. (tell your hw vendor)\n",
1439 version);
1440 version = 0x10;
1441 }
1442 apic_version[apicid] = version;
1443
1444 if (num_processors >= NR_CPUS) {
1445 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1446 " Processor ignored.\n", NR_CPUS);
1447 return;
1448 }
1449
1450 num_processors++;
1451 cpus_complement(tmp_map, cpu_present_map);
1452 cpu = first_cpu(tmp_map);
1453
1454 physid_set(apicid, phys_cpu_present_map);
1455 if (apicid == boot_cpu_physical_apicid) {
1456 /*
1457 * x86_bios_cpu_apicid is required to have processors listed
1458 * in same order as logical cpu numbers. Hence the first
1459 * entry is BSP, and so on.
1460 */
1461 cpu = 0;
1462 }
1463 if (apicid > max_physical_apicid)
1464 max_physical_apicid = apicid;
1465
1466#ifdef CONFIG_X86_32
1467 /*
1468 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1469 * but we need to work other dependencies like SMP_SUSPEND etc
1470 * before this can be done without some confusion.
1471 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1472 * - Ashok Raj <ashok.raj@intel.com>
1473 */
1474 if (max_physical_apicid >= 8) {
1475 switch (boot_cpu_data.x86_vendor) {
1476 case X86_VENDOR_INTEL:
1477 if (!APIC_XAPIC(version)) {
1478 def_to_bigsmp = 0;
1479 break;
1480 }
1481 /* If P4 and above fall through */
1482 case X86_VENDOR_AMD:
1483 def_to_bigsmp = 1;
1484 }
1485 }
1486#endif
1487
1488#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1489 /* are we being called early in kernel startup? */
1490 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1491 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1492 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1493
1494 cpu_to_apicid[cpu] = apicid;
1495 bios_cpu_apicid[cpu] = apicid;
1496 } else {
1497 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1498 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1499 }
1500#endif
1501
1502 cpu_set(cpu, cpu_possible_map);
1503 cpu_set(cpu, cpu_present_map);
1504}
1505
1506int hard_smp_processor_id(void)
1507{
1508 return read_apic_id();
1509}
1510
1511/*
1512 * Power management
1513 */
1514#ifdef CONFIG_PM
1515
1516static struct {
1517 /*
1518 * 'active' is true if the local APIC was enabled by us and
1519 * not the BIOS; this signifies that we are also responsible
1520 * for disabling it before entering apm/acpi suspend
1521 */
1522 int active;
1523 /* r/w apic fields */
1524 unsigned int apic_id;
1525 unsigned int apic_taskpri;
1526 unsigned int apic_ldr;
1527 unsigned int apic_dfr;
1528 unsigned int apic_spiv;
1529 unsigned int apic_lvtt;
1530 unsigned int apic_lvtpc;
1531 unsigned int apic_lvt0;
1532 unsigned int apic_lvt1;
1533 unsigned int apic_lvterr;
1534 unsigned int apic_tmict;
1535 unsigned int apic_tdcr;
1536 unsigned int apic_thmr;
1537} apic_pm_state;
1538
1539static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1540{
1541 unsigned long flags;
1542 int maxlvt;
1543
1544 if (!apic_pm_state.active)
1545 return 0;
1546
1547 maxlvt = lapic_get_maxlvt();
1548
1549 apic_pm_state.apic_id = apic_read(APIC_ID);
1550 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1551 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1552 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1553 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1554 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1555 if (maxlvt >= 4)
1556 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1557 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1558 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1559 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1560 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1561 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1562#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1563 if (maxlvt >= 5)
1564 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1565#endif
1566
1567 local_irq_save(flags);
1568 disable_local_APIC();
1569 local_irq_restore(flags);
1570 return 0;
1571}
1572
1573static int lapic_resume(struct sys_device *dev)
1574{
1575 unsigned int l, h;
1576 unsigned long flags;
1577 int maxlvt;
1578
1579 if (!apic_pm_state.active)
1580 return 0;
1581
1582 maxlvt = lapic_get_maxlvt();
1583
1584 local_irq_save(flags);
1585
1586#ifdef CONFIG_X86_64
1587 if (x2apic)
1588 enable_x2apic();
1589 else
1590#endif
1591 {
1592 /*
1593 * Make sure the APICBASE points to the right address
1594 *
1595 * FIXME! This will be wrong if we ever support suspend on
1596 * SMP! We'll need to do this as part of the CPU restore!
1597 */
1598 rdmsr(MSR_IA32_APICBASE, l, h);
1599 l &= ~MSR_IA32_APICBASE_BASE;
1600 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1601 wrmsr(MSR_IA32_APICBASE, l, h);
1602 }
1603
1604 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1605 apic_write(APIC_ID, apic_pm_state.apic_id);
1606 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1607 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1608 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1609 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1610 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1611 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1612#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1613 if (maxlvt >= 5)
1614 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1615#endif
1616 if (maxlvt >= 4)
1617 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1618 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1619 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1620 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1621 apic_write(APIC_ESR, 0);
1622 apic_read(APIC_ESR);
1623 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1624 apic_write(APIC_ESR, 0);
1625 apic_read(APIC_ESR);
1626
1627 local_irq_restore(flags);
1628
1629 return 0;
1630}
1631
1632/*
1633 * This device has no shutdown method - fully functioning local APICs
1634 * are needed on every CPU up until machine_halt/restart/poweroff.
1635 */
1636
1637static struct sysdev_class lapic_sysclass = {
1638 .name = "lapic",
1639 .resume = lapic_resume,
1640 .suspend = lapic_suspend,
1641};
1642
1643static struct sys_device device_lapic = {
1644 .id = 0,
1645 .cls = &lapic_sysclass,
1646};
1647
1648static void __cpuinit apic_pm_activate(void)
1649{
1650 apic_pm_state.active = 1;
1651}
1652
1653static int __init init_lapic_sysfs(void)
1654{
1655 int error;
1656
1657 if (!cpu_has_apic)
1658 return 0;
1659 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1660
1661 error = sysdev_class_register(&lapic_sysclass);
1662 if (!error)
1663 error = sysdev_register(&device_lapic);
1664 return error;
1665}
1666device_initcall(init_lapic_sysfs);
1667
1668#else /* CONFIG_PM */
1669
1670static void apic_pm_activate(void) { }
1671
1672#endif /* CONFIG_PM */
1673
1674/*
1675 * apic_is_clustered_box() -- Check if we can expect good TSC
1676 *
1677 * Thus far, the major user of this is IBM's Summit2 series:
1678 *
1679 * Clustered boxes may have unsynced TSC problems if they are
1680 * multi-chassis. Use available data to take a good guess.
1681 * If in doubt, go HPET.
1682 */
1683__cpuinit int apic_is_clustered_box(void)
1684{
1685 int i, clusters, zeros;
1686 unsigned id;
1687 u16 *bios_cpu_apicid;
1688 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1689
1690 /*
1691 * there is not this kind of box with AMD CPU yet.
1692 * Some AMD box with quadcore cpu and 8 sockets apicid
1693 * will be [4, 0x23] or [8, 0x27] could be thought to
1694 * vsmp box still need checking...
1695 */
1696 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1697 return 0;
1698
1699 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1700 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1701
1702 for (i = 0; i < NR_CPUS; i++) {
1703 /* are we being called early in kernel startup? */
1704 if (bios_cpu_apicid) {
1705 id = bios_cpu_apicid[i];
1706 }
1707 else if (i < nr_cpu_ids) {
1708 if (cpu_present(i))
1709 id = per_cpu(x86_bios_cpu_apicid, i);
1710 else
1711 continue;
1712 }
1713 else
1714 break;
1715
1716 if (id != BAD_APICID)
1717 __set_bit(APIC_CLUSTERID(id), clustermap);
1718 }
1719
1720 /* Problem: Partially populated chassis may not have CPUs in some of
1721 * the APIC clusters they have been allocated. Only present CPUs have
1722 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1723 * Since clusters are allocated sequentially, count zeros only if
1724 * they are bounded by ones.
1725 */
1726 clusters = 0;
1727 zeros = 0;
1728 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1729 if (test_bit(i, clustermap)) {
1730 clusters += 1 + zeros;
1731 zeros = 0;
1732 } else
1733 ++zeros;
1734 }
1735
1736 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
1737 * not guaranteed to be synced between boards
1738 */
1739 if (is_vsmp_box() && clusters > 1)
1740 return 1;
1741
1742 /*
1743 * If clusters > 2, then should be multi-chassis.
1744 * May have to revisit this when multi-core + hyperthreaded CPUs come
1745 * out, but AFAIK this will work even for them.
1746 */
1747 return (clusters > 2);
1748}
1749
1750static __init int setup_nox2apic(char *str)
1751{
1752 disable_x2apic = 1;
1753 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC);
1754 return 0;
1755}
1756early_param("nox2apic", setup_nox2apic);
1757
1758
1759/*
1760 * APIC command line parameters
1761 */
1762static int __init setup_disableapic(char *arg)
1763{
1764 disable_apic = 1;
1765 setup_clear_cpu_cap(X86_FEATURE_APIC);
1766 return 0;
1767}
1768early_param("disableapic", setup_disableapic);
1769
1770/* same as disableapic, for compatibility */
1771static int __init setup_nolapic(char *arg)
1772{
1773 return setup_disableapic(arg);
1774}
1775early_param("nolapic", setup_nolapic);
1776
1777static int __init parse_lapic_timer_c2_ok(char *arg)
1778{
1779 local_apic_timer_c2_ok = 1;
1780 return 0;
1781}
1782early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1783
1784static int __init parse_disable_apic_timer(char *arg)
1785{
1786 disable_apic_timer = 1;
1787 return 0;
1788}
1789early_param("noapictimer", parse_disable_apic_timer);
1790
1791static int __init parse_nolapic_timer(char *arg)
1792{
1793 disable_apic_timer = 1;
1794 return 0;
1795}
1796early_param("nolapic_timer", parse_nolapic_timer);
1797
1798static __init int setup_apicpmtimer(char *s)
1799{
1800 apic_calibrate_pmtmr = 1;
1801 notsc_setup(NULL);
1802 return 0;
1803}
1804__setup("apicpmtimer", setup_apicpmtimer);
1805
1806static int __init apic_set_verbosity(char *arg)
1807{
1808 if (!arg) {
1809#ifdef CONFIG_X86_64
1810 skip_ioapic_setup = 0;
1811 ioapic_force = 1;
1812 return 0;
1813#endif
1814 return -EINVAL;
1815 }
1816
1817 if (strcmp("debug", arg) == 0)
1818 apic_verbosity = APIC_DEBUG;
1819 else if (strcmp("verbose", arg) == 0)
1820 apic_verbosity = APIC_VERBOSE;
1821 else {
1822 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1823 " use apic=verbose or apic=debug\n", arg);
1824 return -EINVAL;
1825 }
1826
1827 return 0;
1828}
1829early_param("apic", apic_set_verbosity);
1830
1831static int __init lapic_insert_resource(void)
1832{
1833 if (!apic_phys)
1834 return -1;
1835
1836 /* Put local APIC into the resource map. */
1837 lapic_resource.start = apic_phys;
1838 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1839 insert_resource(&iomem_resource, &lapic_resource);
1840
1841 return 0;
1842}
1843
1844/*
1845 * need call insert after e820_reserve_resources()
1846 * that is using request_resource
1847 */
1848late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index fdd585f9c53d..f0dfe6f17e7e 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * BIOS run time interface routines. 2 * BIOS run time interface routines.
3 * 3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
@@ -16,33 +14,128 @@
16 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
19 */ 20 */
20 21
22#include <linux/efi.h>
23#include <asm/efi.h>
24#include <linux/io.h>
21#include <asm/uv/bios.h> 25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h>
27
28struct uv_systab uv_systab;
22 29
23const char * 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
24x86_bios_strerror(long status)
25{ 31{
26 const char *str; 32 struct uv_systab *tab = &uv_systab;
27 switch (status) { 33
28 case 0: str = "Call completed without error"; break; 34 if (!tab->function)
29 case -1: str = "Not implemented"; break; 35 /*
30 case -2: str = "Invalid argument"; break; 36 * BIOS does not support UV systab
31 case -3: str = "Call completed with error"; break; 37 */
32 default: str = "Unknown BIOS status code"; break; 38 return BIOS_STATUS_UNIMPLEMENTED;
33 } 39
34 return str; 40 return efi_call6((void *)__va(tab->function),
41 (u64)which, a1, a2, a3, a4, a5);
35} 42}
36 43
37long 44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, 45 u64 a4, u64 a5)
39 unsigned long *drift_info)
40{ 46{
41 struct uv_bios_retval isrv; 47 unsigned long bios_flags;
48 s64 ret;
42 49
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); 50 local_irq_save(bios_flags);
44 *ticks_per_second = isrv.v0; 51 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
45 *drift_info = isrv.v1; 52 local_irq_restore(bios_flags);
46 return isrv.status; 53
54 return ret;
47} 55}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base); 56
57s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
58 u64 a4, u64 a5)
59{
60 s64 ret;
61
62 preempt_disable();
63 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
64 preempt_enable();
65
66 return ret;
67}
68
69
70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id);
74long uv_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size);
76int uv_type;
77
78
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region)
81{
82 s64 ret;
83 u64 v0, v1;
84 union partition_info_u part;
85
86 ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
87 (u64)(&v0), (u64)(&v1), 0, 0);
88 if (ret != BIOS_STATUS_SUCCESS)
89 return ret;
90
91 part.val = v0;
92 if (uvtype)
93 *uvtype = part.hub_version;
94 if (partid)
95 *partid = part.partition_id;
96 if (coher)
97 *coher = part.coherence_id;
98 if (region)
99 *region = part.region_size;
100 return ret;
101}
102
103
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{
106 return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
107 (u64)ticks_per_second, 0, 0, 0);
108}
109EXPORT_SYMBOL_GPL(uv_bios_freq_base);
110
111
112#ifdef CONFIG_EFI
113void uv_bios_init(void)
114{
115 struct uv_systab *tab;
116
117 if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
118 (efi.uv_systab == (unsigned long)NULL)) {
119 printk(KERN_CRIT "No EFI UV System Table.\n");
120 uv_systab.function = (unsigned long)NULL;
121 return;
122 }
123
124 tab = (struct uv_systab *)ioremap(efi.uv_systab,
125 sizeof(struct uv_systab));
126 if (strncmp(tab->signature, "UVST", 4) != 0)
127 printk(KERN_ERR "bad signature in UV system table!");
128
129 /*
130 * Copy table to permanent spot for later use.
131 */
132 memcpy(&uv_systab, tab, sizeof(struct uv_systab));
133 iounmap(tab);
134
135 printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
136}
137#else /* !CONFIG_EFI */
138
139void uv_bios_init(void) { }
140#endif
141
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 000000000000..667df55a4399
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 32e73520adf7..8f1e31db2ad5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -249,7 +249,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
249 } 249 }
250 numa_set_node(cpu, node); 250 numa_set_node(cpu, node);
251 251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); 252 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
253#endif 253#endif
254} 254}
255 255
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fb789dd9e691..25581dcb280e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -124,18 +124,25 @@ static inline int flag_is_changeable_p(u32 flag)
124{ 124{
125 u32 f1, f2; 125 u32 f1, f2;
126 126
127 asm("pushfl\n\t" 127 /*
128 "pushfl\n\t" 128 * Cyrix and IDT cpus allow disabling of CPUID
129 "popl %0\n\t" 129 * so the code below may return different results
130 "movl %0,%1\n\t" 130 * when it is executed before and after enabling
131 "xorl %2,%0\n\t" 131 * the CPUID. Add "volatile" to not allow gcc to
132 "pushl %0\n\t" 132 * optimize the subsequent calls to this function.
133 "popfl\n\t" 133 */
134 "pushfl\n\t" 134 asm volatile ("pushfl\n\t"
135 "popl %0\n\t" 135 "pushfl\n\t"
136 "popfl\n\t" 136 "popl %0\n\t"
137 : "=&r" (f1), "=&r" (f2) 137 "movl %0,%1\n\t"
138 : "ir" (flag)); 138 "xorl %2,%0\n\t"
139 "pushl %0\n\t"
140 "popfl\n\t"
141 "pushfl\n\t"
142 "popl %0\n\t"
143 "popfl\n\t"
144 : "=&r" (f1), "=&r" (f2)
145 : "ir" (flag));
139 146
140 return ((f1^f2) & flag) != 0; 147 return ((f1^f2) & flag) != 0;
141} 148}
@@ -719,12 +726,24 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
719#endif 726#endif
720} 727}
721 728
729#ifdef CONFIG_X86_64
730static void vgetcpu_set_mode(void)
731{
732 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
733 vgetcpu_mode = VGETCPU_RDTSCP;
734 else
735 vgetcpu_mode = VGETCPU_LSL;
736}
737#endif
738
722void __init identify_boot_cpu(void) 739void __init identify_boot_cpu(void)
723{ 740{
724 identify_cpu(&boot_cpu_data); 741 identify_cpu(&boot_cpu_data);
725#ifdef CONFIG_X86_32 742#ifdef CONFIG_X86_32
726 sysenter_setup(); 743 sysenter_setup();
727 enable_sep_cpu(); 744 enable_sep_cpu();
745#else
746 vgetcpu_set_mode();
728#endif 747#endif
729} 748}
730 749
@@ -797,7 +816,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
797 else if (c->cpuid_level >= 0) 816 else if (c->cpuid_level >= 0)
798 vendor = c->x86_vendor_id; 817 vendor = c->x86_vendor_id;
799 818
800 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 819 if (vendor && !strstr(c->x86_model_id, vendor))
801 printk(KERN_CONT "%s ", vendor); 820 printk(KERN_CONT "%s ", vendor);
802 821
803 if (c->x86_model_id[0]) 822 if (c->x86_model_id[0])
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d51..b0461856acfb 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> 2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com> 3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 * 4 *
5 * Licensed under the terms of the GNU GPL License version 2. 5 * Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1019module_param(revid_errata, int, 0644); 1019module_param(revid_errata, int, 0644);
1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); 1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1021 1021
1022MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 1022MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
1024MODULE_LICENSE ("GPL"); 1024MODULE_LICENSE ("GPL");
1025 1025
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index b5ced806a316..c1ac5790c63e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -246,7 +246,7 @@ static void __exit powernow_k6_exit(void)
246} 246}
247 247
248 248
249MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
251MODULE_LICENSE("GPL"); 251MODULE_LICENSE("GPL");
252 252
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b71..7c7d56b43136 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * AMD K7 Powernow driver. 2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. 3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com> 4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 * 5 *
6 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
692module_param(acpi_force, int, 0444); 692module_param(acpi_force, int, 0444);
693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
694 694
695MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 695MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); 696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
697MODULE_LICENSE ("GPL"); 697MODULE_LICENSE ("GPL");
698 698
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 84bb395038d8..008d23ba491b 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
7 * Support : mark.langsdorf@amd.com 7 * Support : mark.langsdorf@amd.com
8 * 8 *
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 191f7263c61d..04d0376b64b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
431} 431}
432 432
433 433
434MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 434MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); 435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
436MODULE_LICENSE ("GPL"); 436MODULE_LICENSE ("GPL");
437 437
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 99468dbd08da..cce0b6118d55 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -174,7 +174,7 @@ static void __cpuinit srat_detect_node(void)
174 node = first_node(node_online_map); 174 node = first_node(node_online_map);
175 numa_set_node(cpu, node); 175 numa_set_node(cpu, node);
176 176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); 177 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
178#endif 178#endif
179} 179}
180 180
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f66351..dd3af6e7b39a 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8cd..0ebf3fc6a610 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * mce.c - x86 Machine Check Exception Reporting 2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> 3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e0..a74af128efc9 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Non Fatal Machine Check Exception Reporting 2 * Non Fatal Machine Check Exception Reporting
3 * 3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> 4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 * 5 *
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6bff382094f5..9abd48b22674 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h>
21
20#include <asm/apic.h> 22#include <asm/apic.h>
21#include <asm/intel_arch_perfmon.h> 23#include <asm/intel_arch_perfmon.h>
22 24
@@ -336,7 +338,8 @@ static void single_msr_unreserve(void)
336 release_perfctr_nmi(wd_ops->perfctr); 338 release_perfctr_nmi(wd_ops->perfctr);
337} 339}
338 340
339static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 341static void __kprobes
342single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
340{ 343{
341 /* start the cycle over again */ 344 /* start the cycle over again */
342 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); 345 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -401,7 +404,7 @@ static int setup_p6_watchdog(unsigned nmi_hz)
401 return 1; 404 return 1;
402} 405}
403 406
404static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 407static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
405{ 408{
406 /* 409 /*
407 * P6 based Pentium M need to re-unmask 410 * P6 based Pentium M need to re-unmask
@@ -605,7 +608,7 @@ static void p4_unreserve(void)
605 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); 608 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
606} 609}
607 610
608static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 611static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
609{ 612{
610 unsigned dummy; 613 unsigned dummy;
611 /* 614 /*
@@ -784,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
784 return hz; 787 return hz;
785} 788}
786 789
787int lapic_wd_event(unsigned nmi_hz) 790int __kprobes lapic_wd_event(unsigned nmi_hz)
788{ 791{
789 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 792 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
790 u64 ctr; 793 u64 ctr;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a44d6465991..72cefd1e649b 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -147,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu)
147{ 147{
148 struct device *dev; 148 struct device *dev;
149 149
150 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 150 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
151 NULL, "cpu%d", cpu); 151 "cpu%d", cpu);
152 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 152 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
153} 153}
154 154
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
13 13
14static void *kdump_buf_page; 14static void *kdump_buf_page;
15 15
16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18
16/** 19/**
17 * copy_oldmem_page - copy one page from "oldmem" 20 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 21 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index e90a60ef10c2..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,9 @@
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/io.h> 11#include <linux/io.h>
12 12
13/* Stores the physical address of elf header of crash image. */
14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
15
13/** 16/**
14 * copy_oldmem_page - copy one page from "oldmem" 17 * copy_oldmem_page - copy one page from "oldmem"
15 * @pfn: page frame number to be copied 18 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 395acb12b0d1..b4f14c6c09d9 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir) 69 .__cr3 = __pa_nodebug(swapper_pg_dir),
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000000..1a78180f08d3
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,449 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 8
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78
79void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data)
82{
83 if (!task)
84 task = current;
85
86 if (!stack) {
87 unsigned long dummy;
88 stack = &dummy;
89 if (task && task != current)
90 stack = (unsigned long *)task->thread.sp;
91 }
92
93#ifdef CONFIG_FRAME_POINTER
94 if (!bp) {
95 if (task == current) {
96 /* Grab bp right from our regs */
97 get_bp(bp);
98 } else {
99 /* bp is the last reg pushed by switch_to */
100 bp = *(unsigned long *) task->thread.sp;
101 }
102 }
103#endif
104
105 for (;;) {
106 struct thread_info *context;
107
108 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL);
111
112 stack = (unsigned long *)context->previous_esp;
113 if (!stack)
114 break;
115 if (ops->stack(data, "IRQ") < 0)
116 break;
117 touch_nmi_watchdog();
118 }
119}
120EXPORT_SYMBOL(dump_trace);
121
122static void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entries per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl)
175{
176 unsigned long *stack;
177 int i;
178
179 if (sp == NULL) {
180 if (task)
181 sp = (unsigned long *)task->thread.sp;
182 else
183 sp = (unsigned long *)&sp;
184 }
185
186 stack = sp;
187 for (i = 0; i < kstack_depth_to_print; i++) {
188 if (kstack_end(stack))
189 break;
190 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
191 printk("\n%s", log_lvl);
192 printk(" %08lx", *stack++);
193 touch_nmi_watchdog();
194 }
195 printk("\n");
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197}
198
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226
227void show_registers(struct pt_regs *regs)
228{
229 int i;
230
231 print_modules();
232 __show_regs(regs, 0);
233
234 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
235 TASK_COMM_LEN, current->comm, task_pid_nr(current),
236 current_thread_info(), current, task_thread_info(current));
237 /*
238 * When in-kernel, we also print out the stack and code at the
239 * time of the fault..
240 */
241 if (!user_mode_vm(regs)) {
242 unsigned int code_prologue = code_bytes * 43 / 64;
243 unsigned int code_len = code_bytes;
244 unsigned char c;
245 u8 *ip;
246
247 printk(KERN_EMERG "Stack:\n");
248 show_stack_log_lvl(NULL, regs, &regs->sp,
249 0, KERN_EMERG);
250
251 printk(KERN_EMERG "Code: ");
252
253 ip = (u8 *)regs->ip - code_prologue;
254 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
255 /* try starting at IP */
256 ip = (u8 *)regs->ip;
257 code_len = code_len - code_prologue + 1;
258 }
259 for (i = 0; i < code_len; i++, ip++) {
260 if (ip < (u8 *)PAGE_OFFSET ||
261 probe_kernel_address(ip, c)) {
262 printk(" Bad EIP value.");
263 break;
264 }
265 if (ip == (u8 *)regs->ip)
266 printk("<%02x> ", c);
267 else
268 printk("%02x ", c);
269 }
270 }
271 printk("\n");
272}
273
274int is_valid_bugaddr(unsigned long ip)
275{
276 unsigned short ud2;
277
278 if (ip < PAGE_OFFSET)
279 return 0;
280 if (probe_kernel_address((unsigned short *)ip, ud2))
281 return 0;
282
283 return ud2 == 0x0b0f;
284}
285
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408 bust_spinlocks(0);
409
410 /*
411 * If we are in kernel we are probably nested up pretty bad
412 * and might aswell get out now while we still can:
413 */
414 if (!user_mode_vm(regs)) {
415 current->thread.trap_no = 2;
416 crash_kexec(regs);
417 }
418
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000000..96a5db7da8a7
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 4
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp)
36{
37 static char ids[][8] = {
38 [DEBUG_STACK - 1] = "#DB",
39 [NMI_STACK - 1] = "NMI",
40 [DOUBLEFAULT_STACK - 1] = "#DF",
41 [STACKFAULT_STACK - 1] = "#SS",
42 [MCE_STACK - 1] = "#MC",
43#if DEBUG_STKSZ > EXCEPTION_STKSZ
44 [N_EXCEPTION_STACKS ...
45 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
46#endif
47 };
48 unsigned k;
49
50 /*
51 * Iterate over all exception stacks, and figure out whether
52 * 'stack' is in one of them:
53 */
54 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
55 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
56 /*
57 * Is 'stack' above this exception frame's end?
58 * If yes then skip to the next frame.
59 */
60 if (stack >= end)
61 continue;
62 /*
63 * Is 'stack' above this exception frame's start address?
64 * If yes then we found the right frame.
65 */
66 if (stack >= end - EXCEPTION_STKSZ) {
67 /*
68 * Make sure we only iterate through an exception
69 * stack once. If it comes up for the second time
70 * then there's something wrong going on - just
71 * break out and return NULL:
72 */
73 if (*usedp & (1U << k))
74 break;
75 *usedp |= 1U << k;
76 *idp = ids[k];
77 return (unsigned long *)end;
78 }
79 /*
80 * If this is a debug stack, and if it has a larger size than
81 * the usual exception stacks, then 'stack' might still
82 * be within the lower portion of the debug stack:
83 */
84#if DEBUG_STKSZ > EXCEPTION_STKSZ
85 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
86 unsigned j = N_EXCEPTION_STACKS - 1;
87
88 /*
89 * Black magic. A large debug stack is composed of
90 * multiple exception stack entries, which we
91 * iterate through now. Dont look:
92 */
93 do {
94 ++j;
95 end -= EXCEPTION_STKSZ;
96 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
97 } while (stack < end - EXCEPTION_STKSZ);
98 if (*usedp & (1U << j))
99 break;
100 *usedp |= 1U << j;
101 *idp = ids[j];
102 return (unsigned long *)end;
103 }
104#endif
105 }
106 return NULL;
107}
108
109/*
110 * x86-64 can have up to three kernel stacks:
111 * process stack
112 * interrupt stack
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data)
164{
165 const unsigned cpu = get_cpu();
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0;
168 struct thread_info *tinfo;
169
170 if (!task)
171 task = current;
172
173 if (!stack) {
174 unsigned long dummy;
175 stack = &dummy;
176 if (task && task != current)
177 stack = (unsigned long *)task->thread.sp;
178 }
179
180#ifdef CONFIG_FRAME_POINTER
181 if (!bp) {
182 if (task == current) {
183 /* Grab bp right from our regs */
184 get_bp(bp);
185 } else {
186 /* bp is the last reg pushed by switch_to */
187 bp = *(unsigned long *) task->thread.sp;
188 }
189 }
190#endif
191
192 /*
193 * Print function call entries in all stacks, starting at the
194 * current stack address. If the stacks consist of nested
195 * exceptions
196 */
197 tinfo = task_thread_info(task);
198 for (;;) {
199 char *id;
200 unsigned long *estack_end;
201 estack_end = in_exception_stack(cpu, (unsigned long)stack,
202 &used, &id);
203
204 if (estack_end) {
205 if (ops->stack(data, id) < 0)
206 break;
207
208 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end);
210 ops->stack(data, "<EOE>");
211 /*
212 * We link to the next stack via the
213 * second-to-last pointer (index -2 to end) in the
214 * exception stack:
215 */
216 stack = (unsigned long *) estack_end[-2];
217 continue;
218 }
219 if (irqstack_end) {
220 unsigned long *irqstack;
221 irqstack = irqstack_end -
222 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
223
224 if (stack >= irqstack && stack < irqstack_end) {
225 if (ops->stack(data, "IRQ") < 0)
226 break;
227 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end);
229 /*
230 * We link to the next stack (which would be
231 * the process stack normally) the last
232 * pointer (index -1 to end) in the IRQ stack:
233 */
234 stack = (unsigned long *) (irqstack_end[-1]);
235 irqstack_end = NULL;
236 ops->stack(data, "EOI");
237 continue;
238 }
239 }
240 break;
241 }
242
243 /*
244 * This handles the process stack:
245 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
247 put_cpu();
248}
249EXPORT_SYMBOL(dump_trace);
250
251static void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entries per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl)
304{
305 unsigned long *stack;
306 int i;
307 const int cpu = smp_processor_id();
308 unsigned long *irqstack_end =
309 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
310 unsigned long *irqstack =
311 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
312
313 /*
314 * debugging aid: "show_stack(NULL, NULL);" prints the
315 * back trace for this cpu.
316 */
317
318 if (sp == NULL) {
319 if (task)
320 sp = (unsigned long *)task->thread.sp;
321 else
322 sp = (unsigned long *)&sp;
323 }
324
325 stack = sp;
326 for (i = 0; i < kstack_depth_to_print; i++) {
327 if (stack >= irqstack && stack <= irqstack_end) {
328 if (stack == irqstack_end) {
329 stack = (unsigned long *) (irqstack_end[-1]);
330 printk(" <EOI> ");
331 }
332 } else {
333 if (((long) stack & (THREAD_SIZE-1)) == 0)
334 break;
335 }
336 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
337 printk("\n%s", log_lvl);
338 printk(" %016lx", *stack++);
339 touch_nmi_watchdog();
340 }
341 printk("\n");
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343}
344
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs)
373{
374 int i;
375 unsigned long sp;
376 const int cpu = smp_processor_id();
377 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
378
379 sp = regs->sp;
380 printk("CPU %d ", cpu);
381 __show_regs(regs, 1);
382 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
383 cur->comm, cur->pid, task_thread_info(cur), cur);
384
385 /*
386 * When in-kernel, we also print out the stack and code at the
387 * time of the fault..
388 */
389 if (!user_mode(regs)) {
390 unsigned int code_prologue = code_bytes * 43 / 64;
391 unsigned int code_len = code_bytes;
392 unsigned char c;
393 u8 *ip;
394
395 printk(KERN_EMERG "Stack:\n");
396 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
397 regs->bp, KERN_EMERG);
398
399 printk(KERN_EMERG "Code: ");
400
401 ip = (u8 *)regs->ip - code_prologue;
402 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
403 /* try starting at IP */
404 ip = (u8 *)regs->ip;
405 code_len = code_len - code_prologue + 1;
406 }
407 for (i = 0; i < code_len; i++, ip++) {
408 if (ip < (u8 *)PAGE_OFFSET ||
409 probe_kernel_address(ip, c)) {
410 printk(" Bad RIP value.");
411 break;
412 }
413 if (ip == (u8 *)regs->ip)
414 printk("<%02x> ", c);
415 else
416 printk("%02x ", c);
417 }
418 }
419 printk("\n");
420}
421
422int is_valid_bugaddr(unsigned long ip)
423{
424 unsigned short ud2;
425
426 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
427 return 0;
428
429 return ud2 == 0x0b0f;
430}
431
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, lets at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 78e642feac30..ce97bf3bed12 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1282,12 +1282,10 @@ void __init e820_reserve_resources(void)
1282 e820_res = res; 1282 e820_res = res;
1283 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1284 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1285#ifndef CONFIG_RESOURCES_64BIT 1285 if (end != (resource_size_t)end) {
1286 if (end > 0x100000000ULL) {
1287 res++; 1286 res++;
1288 continue; 1287 continue;
1289 } 1288 }
1290#endif
1291 res->name = e820_type_to_string(e820.map[i].type); 1289 res->name = e820_type_to_string(e820.map[i].type);
1292 res->start = e820.map[i].addr; 1290 res->start = e820.map[i].addr;
1293 res->end = end; 1291 res->end = end;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 945a31cdd81f..1119d247fe11 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369 } else if (!efi_guidcmp(config_tables[i].guid, 369 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table);
373 } else if (!efi_guidcmp(config_tables[i].guid,
370 HCDP_TABLE_GUID)) { 374 HCDP_TABLE_GUID)) {
371 efi.hcdp = config_tables[i].table; 375 efi.hcdp = config_tables[i].table;
372 printk(" HCDP=0x%lx ", config_tables[i].table); 376 printk(" HCDP=0x%lx ", config_tables[i].table);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 109792bc7cfa..c356423a6026 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -629,7 +629,7 @@ ENTRY(interrupt)
629ENTRY(irq_entries_start) 629ENTRY(irq_entries_start)
630 RING0_INT_FRAME 630 RING0_INT_FRAME
631vector=0 631vector=0
632.rept NR_IRQS 632.rept NR_VECTORS
633 ALIGN 633 ALIGN
634 .if vector 634 .if vector
635 CFI_ADJUST_CFA_OFFSET -4 635 CFI_ADJUST_CFA_OFFSET -4
@@ -730,6 +730,7 @@ error_code:
730 movl $(__USER_DS), %ecx 730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds 731 movl %ecx, %ds
732 movl %ecx, %es 732 movl %ecx, %es
733 TRACE_IRQS_OFF
733 movl %esp,%eax # pt_regs pointer 734 movl %esp,%eax # pt_regs pointer
734 call *%edi 735 call *%edi
735 jmp ret_from_exception 736 jmp ret_from_exception
@@ -760,20 +761,9 @@ ENTRY(device_not_available)
760 RING0_INT_FRAME 761 RING0_INT_FRAME
761 pushl $-1 # mark this as an int 762 pushl $-1 # mark this as an int
762 CFI_ADJUST_CFA_OFFSET 4 763 CFI_ADJUST_CFA_OFFSET 4
763 SAVE_ALL 764 pushl $do_device_not_available
764 GET_CR0_INTO_EAX
765 testl $0x4, %eax # EM (math emulation bit)
766 jne device_not_available_emulate
767 preempt_stop(CLBR_ANY)
768 call math_state_restore
769 jmp ret_from_exception
770device_not_available_emulate:
771 pushl $0 # temporary storage for ORIG_EIP
772 CFI_ADJUST_CFA_OFFSET 4 765 CFI_ADJUST_CFA_OFFSET 4
773 call math_emulate 766 jmp error_code
774 addl $4, %esp
775 CFI_ADJUST_CFA_OFFSET -4
776 jmp ret_from_exception
777 CFI_ENDPROC 767 CFI_ENDPROC
778END(device_not_available) 768END(device_not_available)
779 769
@@ -814,6 +804,7 @@ debug_stack_correct:
814 pushl $-1 # mark this as an int 804 pushl $-1 # mark this as an int
815 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
816 SAVE_ALL 806 SAVE_ALL
807 TRACE_IRQS_OFF
817 xorl %edx,%edx # error code 0 808 xorl %edx,%edx # error code 0
818 movl %esp,%eax # pt_regs pointer 809 movl %esp,%eax # pt_regs pointer
819 call do_debug 810 call do_debug
@@ -858,6 +849,7 @@ nmi_stack_correct:
858 pushl %eax 849 pushl %eax
859 CFI_ADJUST_CFA_OFFSET 4 850 CFI_ADJUST_CFA_OFFSET 4
860 SAVE_ALL 851 SAVE_ALL
852 TRACE_IRQS_OFF
861 xorl %edx,%edx # zero error code 853 xorl %edx,%edx # zero error code
862 movl %esp,%eax # pt_regs pointer 854 movl %esp,%eax # pt_regs pointer
863 call do_nmi 855 call do_nmi
@@ -898,6 +890,7 @@ nmi_espfix_stack:
898 pushl %eax 890 pushl %eax
899 CFI_ADJUST_CFA_OFFSET 4 891 CFI_ADJUST_CFA_OFFSET 4
900 SAVE_ALL 892 SAVE_ALL
893 TRACE_IRQS_OFF
901 FIXUP_ESPFIX_STACK # %eax == %esp 894 FIXUP_ESPFIX_STACK # %eax == %esp
902 xorl %edx,%edx # zero error code 895 xorl %edx,%edx # zero error code
903 call do_nmi 896 call do_nmi
@@ -928,6 +921,7 @@ KPROBE_ENTRY(int3)
928 pushl $-1 # mark this as an int 921 pushl $-1 # mark this as an int
929 CFI_ADJUST_CFA_OFFSET 4 922 CFI_ADJUST_CFA_OFFSET 4
930 SAVE_ALL 923 SAVE_ALL
924 TRACE_IRQS_OFF
931 xorl %edx,%edx # zero error code 925 xorl %edx,%edx # zero error code
932 movl %esp,%eax # pt_regs pointer 926 movl %esp,%eax # pt_regs pointer
933 call do_int3 927 call do_int3
@@ -1030,7 +1024,7 @@ ENTRY(machine_check)
1030 RING0_INT_FRAME 1024 RING0_INT_FRAME
1031 pushl $0 1025 pushl $0
1032 CFI_ADJUST_CFA_OFFSET 4 1026 CFI_ADJUST_CFA_OFFSET 4
1033 pushl machine_check_vector 1027 pushl $do_machine_check
1034 CFI_ADJUST_CFA_OFFSET 4 1028 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 1029 jmp error_code
1036 CFI_ENDPROC 1030 CFI_ENDPROC
@@ -1159,20 +1153,6 @@ ENDPROC(xen_failsafe_callback)
1159#ifdef CONFIG_DYNAMIC_FTRACE 1153#ifdef CONFIG_DYNAMIC_FTRACE
1160 1154
1161ENTRY(mcount) 1155ENTRY(mcount)
1162 pushl %eax
1163 pushl %ecx
1164 pushl %edx
1165 movl 0xc(%esp), %eax
1166 subl $MCOUNT_INSN_SIZE, %eax
1167
1168.globl mcount_call
1169mcount_call:
1170 call ftrace_stub
1171
1172 popl %edx
1173 popl %ecx
1174 popl %eax
1175
1176 ret 1156 ret
1177END(mcount) 1157END(mcount)
1178 1158
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cf3a0b2d0059..09e7145484c5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -64,32 +64,6 @@
64#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
65#ifdef CONFIG_DYNAMIC_FTRACE 65#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 66ENTRY(mcount)
67
68 subq $0x38, %rsp
69 movq %rax, (%rsp)
70 movq %rcx, 8(%rsp)
71 movq %rdx, 16(%rsp)
72 movq %rsi, 24(%rsp)
73 movq %rdi, 32(%rsp)
74 movq %r8, 40(%rsp)
75 movq %r9, 48(%rsp)
76
77 movq 0x38(%rsp), %rdi
78 subq $MCOUNT_INSN_SIZE, %rdi
79
80.globl mcount_call
81mcount_call:
82 call ftrace_stub
83
84 movq 48(%rsp), %r9
85 movq 40(%rsp), %r8
86 movq 32(%rsp), %rdi
87 movq 24(%rsp), %rsi
88 movq 16(%rsp), %rdx
89 movq 8(%rsp), %rcx
90 movq (%rsp), %rax
91 addq $0x38, %rsp
92
93 retq 67 retq
94END(mcount) 68END(mcount)
95 69
@@ -667,6 +641,13 @@ END(stub_rt_sigreturn)
667 SAVE_ARGS 641 SAVE_ARGS
668 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 642 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
669 pushq %rbp 643 pushq %rbp
644 /*
645 * Save rbp twice: One is for marking the stack frame, as usual, and the
646 * other, to fill pt_regs properly. This is because bx comes right
647 * before the last saved register in that structure, and not bp. If the
648 * base pointer were in the place bx is today, this would not be needed.
649 */
650 movq %rbp, -8(%rsp)
670 CFI_ADJUST_CFA_OFFSET 8 651 CFI_ADJUST_CFA_OFFSET 8
671 CFI_REL_OFFSET rbp, 0 652 CFI_REL_OFFSET rbp, 0
672 movq %rsp,%rbp 653 movq %rsp,%rbp
@@ -932,6 +913,9 @@ END(spurious_interrupt)
932 .if \ist 913 .if \ist
933 movq %gs:pda_data_offset, %rbp 914 movq %gs:pda_data_offset, %rbp
934 .endif 915 .endif
916 .if \irqtrace
917 TRACE_IRQS_OFF
918 .endif
935 movq %rsp,%rdi 919 movq %rsp,%rdi
936 movq ORIG_RAX(%rsp),%rsi 920 movq ORIG_RAX(%rsp),%rsi
937 movq $-1,ORIG_RAX(%rsp) 921 movq $-1,ORIG_RAX(%rsp)
@@ -1058,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
1058 je error_kernelspace 1042 je error_kernelspace
1059error_swapgs: 1043error_swapgs:
1060 SWAPGS 1044 SWAPGS
1061error_sti: 1045error_sti:
1046 TRACE_IRQS_OFF
1062 movq %rdi,RDI(%rsp) 1047 movq %rdi,RDI(%rsp)
1063 CFI_REL_OFFSET rdi,RDI 1048 CFI_REL_OFFSET rdi,RDI
1064 movq %rsp,%rdi 1049 movq %rsp,%rdi
@@ -1232,7 +1217,7 @@ ENTRY(simd_coprocessor_error)
1232END(simd_coprocessor_error) 1217END(simd_coprocessor_error)
1233 1218
1234ENTRY(device_not_available) 1219ENTRY(device_not_available)
1235 zeroentry math_state_restore 1220 zeroentry do_device_not_available
1236END(device_not_available) 1221END(device_not_available)
1237 1222
1238 /* runs on exception stack */ 1223 /* runs on exception stack */
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 849e5cd485b8..f454c78fcef6 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -109,6 +109,7 @@ struct oem_table {
109}; 109};
110 110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); 111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
112#endif 113#endif
113 114
114struct mip_reg { 115struct mip_reg {
@@ -243,21 +244,38 @@ parse_unisys_oem (char *oemptr)
243} 244}
244 245
245#ifdef CONFIG_ACPI 246#ifdef CONFIG_ACPI
246int __init 247static unsigned long oem_addrX;
247find_unisys_acpi_oem_table(unsigned long *oem_addr) 248static unsigned long oem_size;
249int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
248{ 250{
249 struct acpi_table_header *header = NULL; 251 struct acpi_table_header *header = NULL;
250 int i = 0; 252 int i = 0;
251 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { 253 acpi_size tbl_size;
254
255 while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
252 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { 256 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
253 struct oem_table *t = (struct oem_table *)header; 257 struct oem_table *t = (struct oem_table *)header;
254 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, 258
255 t->OEMTableSize); 259 oem_addrX = t->OEMTableAddr;
260 oem_size = t->OEMTableSize;
261 early_acpi_os_unmap_memory(header, tbl_size);
262
263 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
264 oem_size);
256 return 0; 265 return 0;
257 } 266 }
267 early_acpi_os_unmap_memory(header, tbl_size);
258 } 268 }
259 return -1; 269 return -1;
260} 270}
271
272void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
273{
274 if (!oem_addr)
275 return;
276
277 __acpi_unmap_table((char *)oem_addr, oem_size);
278}
261#endif 279#endif
262 280
263static void 281static void
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fdf..d073d981a730 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
11 11
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
15#include <linux/percpu.h> 16#include <linux/percpu.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/list.h> 18#include <linux/list.h>
18 19
19#include <asm/alternative.h>
20#include <asm/ftrace.h> 20#include <asm/ftrace.h>
21#include <asm/nops.h>
21 22
22 23
23/* Long is fine, even if it is only 4 bytes ;-) */ 24/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop; 25static unsigned long *ftrace_nop;
25 26
26union ftrace_code_union { 27union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 28 char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@ notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 61ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 62 unsigned char *new_code)
62{ 63{
63 unsigned replaced; 64 unsigned char replaced[MCOUNT_INSN_SIZE];
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68 65
69 /* 66 /*
70 * Note: Due to modules and __init, code can 67 * Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
72 * as well as code changing. 69 * as well as code changing.
73 * 70 *
74 * No real locking needed, this code is run through 71 * No real locking needed, this code is run through
75 * kstop_machine. 72 * kstop_machine, or before SMP starts.
76 */ 73 */
77 asm volatile ( 74 if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
78 "1: lock\n" 75 return 1;
79 " cmpxchg %3, (%2)\n" 76
80 " jnz 2f\n" 77 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
81 " movb %b4, 4(%2)\n" 78 return 2;
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93 79
94 if (replaced != old && replaced != new) 80 WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
95 faulted = 2; 81 MCOUNT_INSN_SIZE));
96 82
97 return faulted; 83 sync_core();
84
85 return 0;
98} 86}
99 87
100notrace int ftrace_update_ftrace_func(ftrace_func_t func) 88notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
112 100
113notrace int ftrace_mcount_set(unsigned long *data) 101notrace int ftrace_mcount_set(unsigned long *data)
114{ 102{
115 unsigned long ip = (long)(&mcount_call); 103 /* mcount is initialized as a nop */
116 unsigned long *addr = data; 104 *data = 0;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0; 105 return 0;
128} 106}
129 107
130int __init ftrace_dyn_arch_init(void *data) 108int __init ftrace_dyn_arch_init(void *data)
131{ 109{
132 const unsigned char *const *noptable = find_nop_table(); 110 extern const unsigned char ftrace_test_p6nop[];
133 111 extern const unsigned char ftrace_test_nop5[];
134 /* This is running in kstop_machine */ 112 extern const unsigned char ftrace_test_jmp[];
135 113 int faulted = 0;
136 ftrace_mcount_set(data);
137 114
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; 115 /*
116 * There is no good nop for all x86 archs.
117 * We will default to using the P6_NOP5, but first we
118 * will test to make sure that the nop will actually
119 * work on this CPU. If it faults, we will then
120 * go to a lesser efficient 5 byte nop. If that fails
121 * we then just use a jmp as our nop. This isn't the most
122 * efficient nop, but we can not use a multi part nop
123 * since we would then risk being preempted in the middle
124 * of that nop, and if we enabled tracing then, it might
125 * cause a system crash.
126 *
127 * TODO: check the cpuid to determine the best nop.
128 */
129 asm volatile (
130 "jmp ftrace_test_jmp\n"
131 /* This code needs to stay around */
132 ".section .text, \"ax\"\n"
133 "ftrace_test_jmp:"
134 "jmp ftrace_test_p6nop\n"
135 "nop\n"
136 "nop\n"
137 "nop\n" /* 2 byte jmp + 3 bytes */
138 "ftrace_test_p6nop:"
139 P6_NOP5
140 "jmp 1f\n"
141 "ftrace_test_nop5:"
142 ".byte 0x66,0x66,0x66,0x66,0x90\n"
143 "jmp 1f\n"
144 ".previous\n"
145 "1:"
146 ".section .fixup, \"ax\"\n"
147 "2: movl $1, %0\n"
148 " jmp ftrace_test_nop5\n"
149 "3: movl $2, %0\n"
150 " jmp 1b\n"
151 ".previous\n"
152 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
153 _ASM_EXTABLE(ftrace_test_nop5, 3b)
154 : "=r"(faulted) : "0" (faulted));
155
156 switch (faulted) {
157 case 0:
158 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
159 ftrace_nop = (unsigned long *)ftrace_test_p6nop;
160 break;
161 case 1:
162 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
163 ftrace_nop = (unsigned long *)ftrace_test_nop5;
164 break;
165 case 2:
166 pr_info("ftrace: converting mcount calls to jmp . + 5\n");
167 ftrace_nop = (unsigned long *)ftrace_test_jmp;
168 break;
169 }
170
171 /* The return code is retured via data */
172 *(unsigned long *)data = 0;
139 173
140 return 0; 174 return 0;
141} 175}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 9eca5ba7a6b1..2ec2de8d8c46 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -179,8 +179,10 @@ static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
179 * is an example). 179 * is an example).
180 */ 180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && 181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) 182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
183 printk(KERN_DEBUG "system APIC only can use physical flat");
183 return 1; 184 return 1;
185 }
184#endif 186#endif
185 187
186 return 0; 188 return 0;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ae2ffc8a400c..bfd532843df6 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -114,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
114 unsigned long val, apicid, lapicid; 114 unsigned long val, apicid, lapicid;
115 int pnode; 115 int pnode;
116 116
117 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 117 apicid = per_cpu(x86_cpu_to_apicid, cpu);
118 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 118 lapicid = apicid & 0x3f; /* ZZZ macro needed */
119 pnode = uv_apicid_to_pnode(apicid); 119 pnode = uv_apicid_to_pnode(apicid);
120 val = 120 val =
@@ -202,12 +202,10 @@ static unsigned int phys_pkg_id(int index_msb)
202 return uv_read_apic_id() >> index_msb; 202 return uv_read_apic_id() >> index_msb;
203} 203}
204 204
205#ifdef ZZZ /* Needs x2apic patch */
206static void uv_send_IPI_self(int vector) 205static void uv_send_IPI_self(int vector)
207{ 206{
208 apic_write(APIC_SELF_IPI, vector); 207 apic_write(APIC_SELF_IPI, vector);
209} 208}
210#endif
211 209
212struct genapic apic_x2apic_uv_x = { 210struct genapic apic_x2apic_uv_x = {
213 .name = "UV large system", 211 .name = "UV large system",
@@ -215,15 +213,15 @@ struct genapic apic_x2apic_uv_x = {
215 .int_delivery_mode = dest_Fixed, 213 .int_delivery_mode = dest_Fixed,
216 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 214 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
217 .target_cpus = uv_target_cpus, 215 .target_cpus = uv_target_cpus,
218 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 216 .vector_allocation_domain = uv_vector_allocation_domain,
219 .apic_id_registered = uv_apic_id_registered, 217 .apic_id_registered = uv_apic_id_registered,
220 .init_apic_ldr = uv_init_apic_ldr, 218 .init_apic_ldr = uv_init_apic_ldr,
221 .send_IPI_all = uv_send_IPI_all, 219 .send_IPI_all = uv_send_IPI_all,
222 .send_IPI_allbutself = uv_send_IPI_allbutself, 220 .send_IPI_allbutself = uv_send_IPI_allbutself,
223 .send_IPI_mask = uv_send_IPI_mask, 221 .send_IPI_mask = uv_send_IPI_mask,
224 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 222 .send_IPI_self = uv_send_IPI_self,
225 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 223 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
226 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 224 .phys_pkg_id = phys_pkg_id,
227 .get_apic_id = get_apic_id, 225 .get_apic_id = get_apic_id,
228 .set_apic_id = set_apic_id, 226 .set_apic_id = set_apic_id,
229 .apic_id_mask = (0xFFFFFFFFu), 227 .apic_id_mask = (0xFFFFFFFFu),
@@ -286,12 +284,13 @@ static __init void map_low_mmrs(void)
286 284
287enum map_type {map_wb, map_uc}; 285enum map_type {map_wb, map_uc};
288 286
289static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 287static __init void map_high(char *id, unsigned long base, int shift,
288 int max_pnode, enum map_type map_type)
290{ 289{
291 unsigned long bytes, paddr; 290 unsigned long bytes, paddr;
292 291
293 paddr = base << shift; 292 paddr = base << shift;
294 bytes = (1UL << shift); 293 bytes = (1UL << shift) * (max_pnode + 1);
295 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 294 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
296 paddr + bytes); 295 paddr + bytes);
297 if (map_type == map_uc) 296 if (map_type == map_uc)
@@ -307,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
307 306
308 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 307 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
309 if (gru.s.enable) 308 if (gru.s.enable)
310 map_high("GRU", gru.s.base, shift, map_wb); 309 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
311} 310}
312 311
313static __init void map_config_high(int max_pnode) 312static __init void map_config_high(int max_pnode)
@@ -317,7 +316,7 @@ static __init void map_config_high(int max_pnode)
317 316
318 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); 317 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
319 if (cfg.s.enable) 318 if (cfg.s.enable)
320 map_high("CONFIG", cfg.s.base, shift, map_uc); 319 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
321} 320}
322 321
323static __init void map_mmr_high(int max_pnode) 322static __init void map_mmr_high(int max_pnode)
@@ -327,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
327 326
328 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 327 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
329 if (mmr.s.enable) 328 if (mmr.s.enable)
330 map_high("MMR", mmr.s.base, shift, map_uc); 329 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
331} 330}
332 331
333static __init void map_mmioh_high(int max_pnode) 332static __init void map_mmioh_high(int max_pnode)
@@ -337,17 +336,17 @@ static __init void map_mmioh_high(int max_pnode)
337 336
338 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 337 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
339 if (mmioh.s.enable) 338 if (mmioh.s.enable)
340 map_high("MMIOH", mmioh.s.base, shift, map_uc); 339 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
341} 340}
342 341
343static __init void uv_rtc_init(void) 342static __init void uv_rtc_init(void)
344{ 343{
345 long status, ticks_per_sec, drift; 344 long status;
345 u64 ticks_per_sec;
346 346
347 status = 347 status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
348 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, 348 &ticks_per_sec);
349 &drift); 349 if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
350 if (status != 0 || ticks_per_sec < 100000) {
351 printk(KERN_WARNING 350 printk(KERN_WARNING
352 "unable to determine platform RTC clock frequency, " 351 "unable to determine platform RTC clock frequency, "
353 "guessing.\n"); 352 "guessing.\n");
@@ -357,7 +356,22 @@ static __init void uv_rtc_init(void)
357 sn_rtc_cycles_per_second = ticks_per_sec; 356 sn_rtc_cycles_per_second = ticks_per_sec;
358} 357}
359 358
360static bool uv_system_inited; 359/*
360 * Called on each cpu to initialize the per_cpu UV data area.
361 * ZZZ hotplug not supported yet
362 */
363void __cpuinit uv_cpu_init(void)
364{
365 /* CPU 0 initilization will be done via uv_system_init. */
366 if (!uv_blade_info)
367 return;
368
369 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
370
371 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
372 set_x2apic_extra_bits(uv_hub_info->pnode);
373}
374
361 375
362void __init uv_system_init(void) 376void __init uv_system_init(void)
363{ 377{
@@ -413,6 +427,9 @@ void __init uv_system_init(void)
413 gnode_upper = (((unsigned long)node_id.s.node_id) & 427 gnode_upper = (((unsigned long)node_id.s.node_id) &
414 ~((1 << n_val) - 1)) << m_val; 428 ~((1 << n_val) - 1)) << m_val;
415 429
430 uv_bios_init();
431 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
432 &uv_coherency_id, &uv_region_size);
416 uv_rtc_init(); 433 uv_rtc_init();
417 434
418 for_each_present_cpu(cpu) { 435 for_each_present_cpu(cpu) {
@@ -434,7 +451,7 @@ void __init uv_system_init(void)
434 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 451 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
435 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 452 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
436 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 453 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
437 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 454 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
438 uv_node_to_blade[nid] = blade; 455 uv_node_to_blade[nid] = blade;
439 uv_cpu_to_blade[cpu] = blade; 456 uv_cpu_to_blade[cpu] = blade;
440 max_pnode = max(pnode, max_pnode); 457 max_pnode = max(pnode, max_pnode);
@@ -449,21 +466,6 @@ void __init uv_system_init(void)
449 map_mmr_high(max_pnode); 466 map_mmr_high(max_pnode);
450 map_config_high(max_pnode); 467 map_config_high(max_pnode);
451 map_mmioh_high(max_pnode); 468 map_mmioh_high(max_pnode);
452 uv_system_inited = true;
453}
454 469
455/* 470 uv_cpu_init();
456 * Called on each cpu to initialize the per_cpu UV data area.
457 * ZZZ hotplug not supported yet
458 */
459void __cpuinit uv_cpu_init(void)
460{
461 BUG_ON(!uv_system_inited);
462
463 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
464
465 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
466 set_x2apic_extra_bits(uv_hub_info->pnode);
467} 471}
468
469
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..1dcb0f13897e 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
38 39
39 /* Fixup: bios puts an EBDA in the top 64K segment */ 40 /* Fixup: bios puts an EBDA in the top 64K segment */
40 /* of conventional memory, but does not adjust lowmem. */ 41 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 73deaffadd03..77017e834cf7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h>
4#include <linux/sysdev.h>
3#include <linux/delay.h> 5#include <linux/delay.h>
4#include <linux/errno.h> 6#include <linux/errno.h>
5#include <linux/hpet.h> 7#include <linux/hpet.h>
6#include <linux/init.h> 8#include <linux/init.h>
7#include <linux/sysdev.h> 9#include <linux/cpu.h>
8#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/io.h>
9 12
10#include <asm/fixmap.h> 13#include <asm/fixmap.h>
11#include <asm/hpet.h>
12#include <asm/i8253.h> 14#include <asm/i8253.h>
13#include <asm/io.h> 15#include <asm/hpet.h>
14 16
15#define HPET_MASK CLOCKSOURCE_MASK(32) 17#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22 18#define HPET_SHIFT 22
17 19
18/* FSEC = 10^-15 20/* FSEC = 10^-15
19 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000L 22#define FSEC_PER_NSEC 1000000L
23
24#define HPET_DEV_USED_BIT 2
25#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
26#define HPET_DEV_VALID 0x8
27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000
29
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
21 31
22/* 32/*
23 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
24 */ 34 */
25unsigned long hpet_address; 35unsigned long hpet_address;
26static void __iomem *hpet_virt_address; 36unsigned long hpet_num_timers;
37static void __iomem *hpet_virt_address;
38
39struct hpet_dev {
40 struct clock_event_device evt;
41 unsigned int num;
42 int cpu;
43 unsigned int irq;
44 unsigned int flags;
45 char name[10];
46};
27 47
28unsigned long hpet_readl(unsigned long a) 48unsigned long hpet_readl(unsigned long a)
29{ 49{
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void)
59static int boot_hpet_disable; 79static int boot_hpet_disable;
60int hpet_force_user; 80int hpet_force_user;
61 81
62static int __init hpet_setup(char* str) 82static int __init hpet_setup(char *str)
63{ 83{
64 if (str) { 84 if (str) {
65 if (!strncmp("disable", str, 7)) 85 if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet);
80 100
81static inline int is_hpet_capable(void) 101static inline int is_hpet_capable(void)
82{ 102{
83 return (!boot_hpet_disable && hpet_address); 103 return !boot_hpet_disable && hpet_address;
84} 104}
85 105
86/* 106/*
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
102 * timer 0 and timer 1 in case of RTC emulation. 122 * timer 0 and timer 1 in case of RTC emulation.
103 */ 123 */
104#ifdef CONFIG_HPET 124#ifdef CONFIG_HPET
125
126static void hpet_reserve_msi_timers(struct hpet_data *hd);
127
105static void hpet_reserve_platform_timers(unsigned long id) 128static void hpet_reserve_platform_timers(unsigned long id)
106{ 129{
107 struct hpet __iomem *hpet = hpet_virt_address; 130 struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id)
111 134
112 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; 135 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
113 136
114 memset(&hd, 0, sizeof (hd)); 137 memset(&hd, 0, sizeof(hd));
115 hd.hd_phys_address = hpet_address; 138 hd.hd_phys_address = hpet_address;
116 hd.hd_address = hpet; 139 hd.hd_address = hpet;
117 hd.hd_nirqs = nrtimers; 140 hd.hd_nirqs = nrtimers;
118 hd.hd_flags = HPET_DATA_PLATFORM;
119 hpet_reserve_timer(&hd, 0); 141 hpet_reserve_timer(&hd, 0);
120 142
121#ifdef CONFIG_HPET_EMULATE_RTC 143#ifdef CONFIG_HPET_EMULATE_RTC
122 hpet_reserve_timer(&hd, 1); 144 hpet_reserve_timer(&hd, 1);
123#endif 145#endif
124 146
147 /*
148 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
149 * is wrong for i8259!) not the output IRQ. Many BIOS writers
150 * don't bother configuring *any* comparator interrupts.
151 */
125 hd.hd_irq[0] = HPET_LEGACY_8254; 152 hd.hd_irq[0] = HPET_LEGACY_8254;
126 hd.hd_irq[1] = HPET_LEGACY_RTC; 153 hd.hd_irq[1] = HPET_LEGACY_RTC;
127 154
128 for (i = 2; i < nrtimers; timer++, i++) { 155 for (i = 2; i < nrtimers; timer++, i++) {
129 hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> 156 hd.hd_irq[i] = (readl(&timer->hpet_config) &
130 Tn_INT_ROUTE_CNF_SHIFT; 157 Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
131 } 158 }
132 159
160 hpet_reserve_msi_timers(&hd);
161
133 hpet_alloc(&hd); 162 hpet_alloc(&hd);
134 163
135} 164}
@@ -223,60 +252,70 @@ static void hpet_legacy_clockevent_register(void)
223 printk(KERN_DEBUG "hpet clockevent registered\n"); 252 printk(KERN_DEBUG "hpet clockevent registered\n");
224} 253}
225 254
226static void hpet_legacy_set_mode(enum clock_event_mode mode, 255static int hpet_setup_msi_irq(unsigned int irq);
227 struct clock_event_device *evt) 256
257static void hpet_set_mode(enum clock_event_mode mode,
258 struct clock_event_device *evt, int timer)
228{ 259{
229 unsigned long cfg, cmp, now; 260 unsigned long cfg, cmp, now;
230 uint64_t delta; 261 uint64_t delta;
231 262
232 switch(mode) { 263 switch (mode) {
233 case CLOCK_EVT_MODE_PERIODIC: 264 case CLOCK_EVT_MODE_PERIODIC:
234 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; 265 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
235 delta >>= hpet_clockevent.shift; 266 delta >>= evt->shift;
236 now = hpet_readl(HPET_COUNTER); 267 now = hpet_readl(HPET_COUNTER);
237 cmp = now + (unsigned long) delta; 268 cmp = now + (unsigned long) delta;
238 cfg = hpet_readl(HPET_T0_CFG); 269 cfg = hpet_readl(HPET_Tn_CFG(timer));
239 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 270 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
240 HPET_TN_SETVAL | HPET_TN_32BIT; 271 HPET_TN_SETVAL | HPET_TN_32BIT;
241 hpet_writel(cfg, HPET_T0_CFG); 272 hpet_writel(cfg, HPET_Tn_CFG(timer));
242 /* 273 /*
243 * The first write after writing TN_SETVAL to the 274 * The first write after writing TN_SETVAL to the
244 * config register sets the counter value, the second 275 * config register sets the counter value, the second
245 * write sets the period. 276 * write sets the period.
246 */ 277 */
247 hpet_writel(cmp, HPET_T0_CMP); 278 hpet_writel(cmp, HPET_Tn_CMP(timer));
248 udelay(1); 279 udelay(1);
249 hpet_writel((unsigned long) delta, HPET_T0_CMP); 280 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
250 break; 281 break;
251 282
252 case CLOCK_EVT_MODE_ONESHOT: 283 case CLOCK_EVT_MODE_ONESHOT:
253 cfg = hpet_readl(HPET_T0_CFG); 284 cfg = hpet_readl(HPET_Tn_CFG(timer));
254 cfg &= ~HPET_TN_PERIODIC; 285 cfg &= ~HPET_TN_PERIODIC;
255 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; 286 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
256 hpet_writel(cfg, HPET_T0_CFG); 287 hpet_writel(cfg, HPET_Tn_CFG(timer));
257 break; 288 break;
258 289
259 case CLOCK_EVT_MODE_UNUSED: 290 case CLOCK_EVT_MODE_UNUSED:
260 case CLOCK_EVT_MODE_SHUTDOWN: 291 case CLOCK_EVT_MODE_SHUTDOWN:
261 cfg = hpet_readl(HPET_T0_CFG); 292 cfg = hpet_readl(HPET_Tn_CFG(timer));
262 cfg &= ~HPET_TN_ENABLE; 293 cfg &= ~HPET_TN_ENABLE;
263 hpet_writel(cfg, HPET_T0_CFG); 294 hpet_writel(cfg, HPET_Tn_CFG(timer));
264 break; 295 break;
265 296
266 case CLOCK_EVT_MODE_RESUME: 297 case CLOCK_EVT_MODE_RESUME:
267 hpet_enable_legacy_int(); 298 if (timer == 0) {
299 hpet_enable_legacy_int();
300 } else {
301 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
302 hpet_setup_msi_irq(hdev->irq);
303 disable_irq(hdev->irq);
304 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
305 enable_irq(hdev->irq);
306 }
268 break; 307 break;
269 } 308 }
270} 309}
271 310
272static int hpet_legacy_next_event(unsigned long delta, 311static int hpet_next_event(unsigned long delta,
273 struct clock_event_device *evt) 312 struct clock_event_device *evt, int timer)
274{ 313{
275 u32 cnt; 314 u32 cnt;
276 315
277 cnt = hpet_readl(HPET_COUNTER); 316 cnt = hpet_readl(HPET_COUNTER);
278 cnt += (u32) delta; 317 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 318 hpet_writel(cnt, HPET_Tn_CMP(timer));
280 319
281 /* 320 /*
282 * We need to read back the CMP register to make sure that 321 * We need to read back the CMP register to make sure that
@@ -288,6 +327,347 @@ static int hpet_legacy_next_event(unsigned long delta,
288 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 327 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
289} 328}
290 329
330static void hpet_legacy_set_mode(enum clock_event_mode mode,
331 struct clock_event_device *evt)
332{
333 hpet_set_mode(mode, evt, 0);
334}
335
336static int hpet_legacy_next_event(unsigned long delta,
337 struct clock_event_device *evt)
338{
339 return hpet_next_event(delta, evt, 0);
340}
341
342/*
343 * HPET MSI Support
344 */
345#ifdef CONFIG_PCI_MSI
346
347static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
348static struct hpet_dev *hpet_devs;
349
350void hpet_msi_unmask(unsigned int irq)
351{
352 struct hpet_dev *hdev = get_irq_data(irq);
353 unsigned long cfg;
354
355 /* unmask it */
356 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
357 cfg |= HPET_TN_FSB;
358 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
359}
360
361void hpet_msi_mask(unsigned int irq)
362{
363 unsigned long cfg;
364 struct hpet_dev *hdev = get_irq_data(irq);
365
366 /* mask it */
367 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
368 cfg &= ~HPET_TN_FSB;
369 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
370}
371
372void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
373{
374 struct hpet_dev *hdev = get_irq_data(irq);
375
376 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
377 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
378}
379
380void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
381{
382 struct hpet_dev *hdev = get_irq_data(irq);
383
384 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
385 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
386 msg->address_hi = 0;
387}
388
389static void hpet_msi_set_mode(enum clock_event_mode mode,
390 struct clock_event_device *evt)
391{
392 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
393 hpet_set_mode(mode, evt, hdev->num);
394}
395
396static int hpet_msi_next_event(unsigned long delta,
397 struct clock_event_device *evt)
398{
399 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
400 return hpet_next_event(delta, evt, hdev->num);
401}
402
403static int hpet_setup_msi_irq(unsigned int irq)
404{
405 if (arch_setup_hpet_msi(irq)) {
406 destroy_irq(irq);
407 return -EINVAL;
408 }
409 return 0;
410}
411
412static int hpet_assign_irq(struct hpet_dev *dev)
413{
414 unsigned int irq;
415
416 irq = create_irq();
417 if (!irq)
418 return -EINVAL;
419
420 set_irq_data(irq, dev);
421
422 if (hpet_setup_msi_irq(irq))
423 return -EINVAL;
424
425 dev->irq = irq;
426 return 0;
427}
428
429static irqreturn_t hpet_interrupt_handler(int irq, void *data)
430{
431 struct hpet_dev *dev = (struct hpet_dev *)data;
432 struct clock_event_device *hevt = &dev->evt;
433
434 if (!hevt->event_handler) {
435 printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
436 dev->num);
437 return IRQ_HANDLED;
438 }
439
440 hevt->event_handler(hevt);
441 return IRQ_HANDLED;
442}
443
444static int hpet_setup_irq(struct hpet_dev *dev)
445{
446
447 if (request_irq(dev->irq, hpet_interrupt_handler,
448 IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
449 return -1;
450
451 disable_irq(dev->irq);
452 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
453 enable_irq(dev->irq);
454
455 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
456 dev->name, dev->irq);
457
458 return 0;
459}
460
461/* This should be called in specific @cpu */
462static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
463{
464 struct clock_event_device *evt = &hdev->evt;
465 uint64_t hpet_freq;
466
467 WARN_ON(cpu != smp_processor_id());
468 if (!(hdev->flags & HPET_DEV_VALID))
469 return;
470
471 if (hpet_setup_msi_irq(hdev->irq))
472 return;
473
474 hdev->cpu = cpu;
475 per_cpu(cpu_hpet_dev, cpu) = hdev;
476 evt->name = hdev->name;
477 hpet_setup_irq(hdev);
478 evt->irq = hdev->irq;
479
480 evt->rating = 110;
481 evt->features = CLOCK_EVT_FEAT_ONESHOT;
482 if (hdev->flags & HPET_DEV_PERI_CAP)
483 evt->features |= CLOCK_EVT_FEAT_PERIODIC;
484
485 evt->set_mode = hpet_msi_set_mode;
486 evt->set_next_event = hpet_msi_next_event;
487 evt->shift = 32;
488
489 /*
490 * The period is a femto seconds value. We need to calculate the
491 * scaled math multiplication factor for nanosecond to hpet tick
492 * conversion.
493 */
494 hpet_freq = 1000000000000000ULL;
495 do_div(hpet_freq, hpet_period);
496 evt->mult = div_sc((unsigned long) hpet_freq,
497 NSEC_PER_SEC, evt->shift);
498 /* Calculate the max delta */
499 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
500 /* 5 usec minimum reprogramming delta. */
501 evt->min_delta_ns = 5000;
502
503 evt->cpumask = cpumask_of_cpu(hdev->cpu);
504 clockevents_register_device(evt);
505}
506
507#ifdef CONFIG_HPET
508/* Reserve at least one timer for userspace (/dev/hpet) */
509#define RESERVE_TIMERS 1
510#else
511#define RESERVE_TIMERS 0
512#endif
513
514static void hpet_msi_capability_lookup(unsigned int start_timer)
515{
516 unsigned int id;
517 unsigned int num_timers;
518 unsigned int num_timers_used = 0;
519 int i;
520
521 id = hpet_readl(HPET_ID);
522
523 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
524 num_timers++; /* Value read out starts from 0 */
525
526 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
527 if (!hpet_devs)
528 return;
529
530 hpet_num_timers = num_timers;
531
532 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
533 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
534 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
535
536 /* Only consider HPET timer with MSI support */
537 if (!(cfg & HPET_TN_FSB_CAP))
538 continue;
539
540 hdev->flags = 0;
541 if (cfg & HPET_TN_PERIODIC_CAP)
542 hdev->flags |= HPET_DEV_PERI_CAP;
543 hdev->num = i;
544
545 sprintf(hdev->name, "hpet%d", i);
546 if (hpet_assign_irq(hdev))
547 continue;
548
549 hdev->flags |= HPET_DEV_FSB_CAP;
550 hdev->flags |= HPET_DEV_VALID;
551 num_timers_used++;
552 if (num_timers_used == num_possible_cpus())
553 break;
554 }
555
556 printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
557 num_timers, num_timers_used);
558}
559
560#ifdef CONFIG_HPET
561static void hpet_reserve_msi_timers(struct hpet_data *hd)
562{
563 int i;
564
565 if (!hpet_devs)
566 return;
567
568 for (i = 0; i < hpet_num_timers; i++) {
569 struct hpet_dev *hdev = &hpet_devs[i];
570
571 if (!(hdev->flags & HPET_DEV_VALID))
572 continue;
573
574 hd->hd_irq[hdev->num] = hdev->irq;
575 hpet_reserve_timer(hd, hdev->num);
576 }
577}
578#endif
579
580static struct hpet_dev *hpet_get_unused_timer(void)
581{
582 int i;
583
584 if (!hpet_devs)
585 return NULL;
586
587 for (i = 0; i < hpet_num_timers; i++) {
588 struct hpet_dev *hdev = &hpet_devs[i];
589
590 if (!(hdev->flags & HPET_DEV_VALID))
591 continue;
592 if (test_and_set_bit(HPET_DEV_USED_BIT,
593 (unsigned long *)&hdev->flags))
594 continue;
595 return hdev;
596 }
597 return NULL;
598}
599
600struct hpet_work_struct {
601 struct delayed_work work;
602 struct completion complete;
603};
604
605static void hpet_work(struct work_struct *w)
606{
607 struct hpet_dev *hdev;
608 int cpu = smp_processor_id();
609 struct hpet_work_struct *hpet_work;
610
611 hpet_work = container_of(w, struct hpet_work_struct, work.work);
612
613 hdev = hpet_get_unused_timer();
614 if (hdev)
615 init_one_hpet_msi_clockevent(hdev, cpu);
616
617 complete(&hpet_work->complete);
618}
619
620static int hpet_cpuhp_notify(struct notifier_block *n,
621 unsigned long action, void *hcpu)
622{
623 unsigned long cpu = (unsigned long)hcpu;
624 struct hpet_work_struct work;
625 struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
626
627 switch (action & 0xf) {
628 case CPU_ONLINE:
629 INIT_DELAYED_WORK(&work.work, hpet_work);
630 init_completion(&work.complete);
631 /* FIXME: add schedule_work_on() */
632 schedule_delayed_work_on(cpu, &work.work, 0);
633 wait_for_completion(&work.complete);
634 break;
635 case CPU_DEAD:
636 if (hdev) {
637 free_irq(hdev->irq, hdev);
638 hdev->flags &= ~HPET_DEV_USED;
639 per_cpu(cpu_hpet_dev, cpu) = NULL;
640 }
641 break;
642 }
643 return NOTIFY_OK;
644}
645#else
646
647static int hpet_setup_msi_irq(unsigned int irq)
648{
649 return 0;
650}
651static void hpet_msi_capability_lookup(unsigned int start_timer)
652{
653 return;
654}
655
656#ifdef CONFIG_HPET
657static void hpet_reserve_msi_timers(struct hpet_data *hd)
658{
659 return;
660}
661#endif
662
663static int hpet_cpuhp_notify(struct notifier_block *n,
664 unsigned long action, void *hcpu)
665{
666 return NOTIFY_OK;
667}
668
669#endif
670
291/* 671/*
292 * Clock source related code 672 * Clock source related code
293 */ 673 */
@@ -423,8 +803,10 @@ int __init hpet_enable(void)
423 803
424 if (id & HPET_ID_LEGSUP) { 804 if (id & HPET_ID_LEGSUP) {
425 hpet_legacy_clockevent_register(); 805 hpet_legacy_clockevent_register();
806 hpet_msi_capability_lookup(2);
426 return 1; 807 return 1;
427 } 808 }
809 hpet_msi_capability_lookup(0);
428 return 0; 810 return 0;
429 811
430out_nohpet: 812out_nohpet:
@@ -441,6 +823,8 @@ out_nohpet:
441 */ 823 */
442static __init int hpet_late_init(void) 824static __init int hpet_late_init(void)
443{ 825{
826 int cpu;
827
444 if (boot_hpet_disable) 828 if (boot_hpet_disable)
445 return -ENODEV; 829 return -ENODEV;
446 830
@@ -456,6 +840,13 @@ static __init int hpet_late_init(void)
456 840
457 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 841 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
458 842
843 for_each_online_cpu(cpu) {
844 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
845 }
846
847 /* This notifier should be called after workqueue is ready */
848 hotcpu_notifier(hpet_cpuhp_notify, -20);
849
459 return 0; 850 return 0;
460} 851}
461fs_initcall(hpet_late_init); 852fs_initcall(hpet_late_init);
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 02063ae042f7..b764d7429c61 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,17 +27,21 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/pci.h> 28#include <linux/pci.h>
29#include <linux/mc146818rtc.h> 29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
30#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h>
31#include <linux/sysdev.h> 33#include <linux/sysdev.h>
32#include <linux/msi.h> 34#include <linux/msi.h>
33#include <linux/htirq.h> 35#include <linux/htirq.h>
34#include <linux/dmar.h> 36#include <linux/freezer.h>
35#include <linux/jiffies.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
36#ifdef CONFIG_ACPI 39#ifdef CONFIG_ACPI
37#include <acpi/acpi_bus.h> 40#include <acpi/acpi_bus.h>
38#endif 41#endif
39#include <linux/bootmem.h> 42#include <linux/bootmem.h>
40#include <linux/dmar.h> 43#include <linux/dmar.h>
44#include <linux/hpet.h>
41 45
42#include <asm/idle.h> 46#include <asm/idle.h>
43#include <asm/io.h> 47#include <asm/io.h>
@@ -46,61 +50,28 @@
46#include <asm/proto.h> 50#include <asm/proto.h>
47#include <asm/acpi.h> 51#include <asm/acpi.h>
48#include <asm/dma.h> 52#include <asm/dma.h>
53#include <asm/timer.h>
49#include <asm/i8259.h> 54#include <asm/i8259.h>
50#include <asm/nmi.h> 55#include <asm/nmi.h>
51#include <asm/msidef.h> 56#include <asm/msidef.h>
52#include <asm/hypertransport.h> 57#include <asm/hypertransport.h>
58#include <asm/setup.h>
53#include <asm/irq_remapping.h> 59#include <asm/irq_remapping.h>
60#include <asm/hpet.h>
61#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h>
54 63
55#include <mach_ipi.h> 64#include <mach_ipi.h>
56#include <mach_apic.h> 65#include <mach_apic.h>
66#include <mach_apicdef.h>
57 67
58#define __apicdebuginit(type) static type __init 68#define __apicdebuginit(type) static type __init
59 69
60struct irq_cfg { 70/*
61 cpumask_t domain; 71 * Is the SiS APIC rmw bug present ?
62 cpumask_t old_domain; 72 * -1 = don't know, 0 = no, 1 = yes
63 unsigned move_cleanup_count; 73 */
64 u8 vector; 74int sis_apic_bug = -1;
65 u8 move_in_progress : 1;
66};
67
68/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
69static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
70 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
71 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
72 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
73 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
74 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
75 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
76 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
77 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
78 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
79 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
80 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
81 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
82 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
83 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
84 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
85 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
86};
87
88static int assign_irq_vector(int irq, cpumask_t mask);
89
90int first_system_vector = 0xfe;
91
92char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
93
94int sis_apic_bug; /* not actually supported, dummy for compile */
95
96static int no_timer_check;
97
98static int disable_timer_pin_1 __initdata;
99
100int timer_through_8259 __initdata;
101
102/* Where if anywhere is the i8259 connect in external int mode */
103static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
104 75
105static DEFINE_SPINLOCK(ioapic_lock); 76static DEFINE_SPINLOCK(ioapic_lock);
106static DEFINE_SPINLOCK(vector_lock); 77static DEFINE_SPINLOCK(vector_lock);
@@ -110,9 +81,6 @@ static DEFINE_SPINLOCK(vector_lock);
110 */ 81 */
111int nr_ioapic_registers[MAX_IO_APICS]; 82int nr_ioapic_registers[MAX_IO_APICS];
112 83
113/* I/O APIC RTE contents at the OS boot up */
114struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
115
116/* I/O APIC entries */ 84/* I/O APIC entries */
117struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 85struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
118int nr_ioapics; 86int nr_ioapics;
@@ -123,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
123/* # of MP IRQ source entries */ 91/* # of MP IRQ source entries */
124int mp_irq_entries; 92int mp_irq_entries;
125 93
94#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
95int mp_bus_id_to_type[MAX_MP_BUSSES];
96#endif
97
126DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); 98DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
127 99
100int skip_ioapic_setup;
101
102static int __init parse_noapic(char *str)
103{
104 /* disable IO-APIC */
105 disable_ioapic_setup();
106 return 0;
107}
108early_param("noapic", parse_noapic);
109
110struct irq_pin_list;
111struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain;
115 cpumask_t old_domain;
116 unsigned move_cleanup_count;
117 u8 vector;
118 u8 move_in_progress : 1;
119};
120
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
122static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
139};
140
141#define for_each_irq_cfg(irq, cfg) \
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
143
144static struct irq_cfg *irq_cfg(unsigned int irq)
145{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL;
147}
148
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
150{
151 return irq_cfg(irq);
152}
153
128/* 154/*
129 * Rough estimation of how many shared IRQs there are, can 155 * Rough estimation of how many shared IRQs there are, can be changed
130 * be changed anytime. 156 * anytime.
131 */ 157 */
132#define MAX_PLUS_SHARED_IRQS NR_IRQS 158#define MAX_PLUS_SHARED_IRQS NR_IRQS
133#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) 159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -139,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
139 * between pins and IRQs. 165 * between pins and IRQs.
140 */ 166 */
141 167
142static struct irq_pin_list { 168struct irq_pin_list {
143 short apic, pin, next; 169 int apic, pin;
144} irq_2_pin[PIN_MAP_SIZE]; 170 struct irq_pin_list *next;
171};
172
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
174static struct irq_pin_list *irq_2_pin_ptr;
175
176static void __init irq_2_pin_init(void)
177{
178 struct irq_pin_list *pin = irq_2_pin_head;
179 int i;
180
181 for (i = 1; i < PIN_MAP_SIZE; i++)
182 pin[i-1].next = &pin[i];
183
184 irq_2_pin_ptr = &pin[0];
185}
186
187static struct irq_pin_list *get_one_free_irq_2_pin(void)
188{
189 struct irq_pin_list *pin = irq_2_pin_ptr;
190
191 if (!pin)
192 panic("can not get more irq_2_pin\n");
193
194 irq_2_pin_ptr = pin->next;
195 pin->next = NULL;
196 return pin;
197}
145 198
146struct io_apic { 199struct io_apic {
147 unsigned int index; 200 unsigned int index;
@@ -172,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
172/* 225/*
173 * Re-write a value: to be used for read-modify-write 226 * Re-write a value: to be used for read-modify-write
174 * cycles where the read already set up the index register. 227 * cycles where the read already set up the index register.
228 *
229 * Older SiS APIC requires we rewrite the index register
175 */ 230 */
176static inline void io_apic_modify(unsigned int apic, unsigned int value) 231static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
177{ 232{
178 struct io_apic __iomem *io_apic = io_apic_base(apic); 233 struct io_apic __iomem *io_apic = io_apic_base(apic);
234
235 if (sis_apic_bug)
236 writel(reg, &io_apic->index);
179 writel(value, &io_apic->data); 237 writel(value, &io_apic->data);
180} 238}
181 239
@@ -183,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq)
183{ 241{
184 struct irq_pin_list *entry; 242 struct irq_pin_list *entry;
185 unsigned long flags; 243 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
186 245
187 spin_lock_irqsave(&ioapic_lock, flags); 246 spin_lock_irqsave(&ioapic_lock, flags);
188 entry = irq_2_pin + irq; 247 entry = cfg->irq_2_pin;
189 for (;;) { 248 for (;;) {
190 unsigned int reg; 249 unsigned int reg;
191 int pin; 250 int pin;
192 251
193 pin = entry->pin; 252 if (!entry)
194 if (pin == -1)
195 break; 253 break;
254 pin = entry->pin;
196 reg = io_apic_read(entry->apic, 0x10 + pin*2); 255 reg = io_apic_read(entry->apic, 0x10 + pin*2);
197 /* Is the remote IRR bit set? */ 256 /* Is the remote IRR bit set? */
198 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 257 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -201,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
201 } 260 }
202 if (!entry->next) 261 if (!entry->next)
203 break; 262 break;
204 entry = irq_2_pin + entry->next; 263 entry = entry->next;
205 } 264 }
206 spin_unlock_irqrestore(&ioapic_lock, flags); 265 spin_unlock_irqrestore(&ioapic_lock, flags);
207 266
208 return false; 267 return false;
209} 268}
210 269
211/*
212 * Synchronize the IO-APIC and the CPU by doing
213 * a dummy read from the IO-APIC
214 */
215static inline void io_apic_sync(unsigned int apic)
216{
217 struct io_apic __iomem *io_apic = io_apic_base(apic);
218 readl(&io_apic->data);
219}
220
221#define __DO_ACTION(R, ACTION, FINAL) \
222 \
223{ \
224 int pin; \
225 struct irq_pin_list *entry = irq_2_pin + irq; \
226 \
227 BUG_ON(irq >= NR_IRQS); \
228 for (;;) { \
229 unsigned int reg; \
230 pin = entry->pin; \
231 if (pin == -1) \
232 break; \
233 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
234 reg ACTION; \
235 io_apic_modify(entry->apic, reg); \
236 FINAL; \
237 if (!entry->next) \
238 break; \
239 entry = irq_2_pin + entry->next; \
240 } \
241}
242
243union entry_union { 270union entry_union {
244 struct { u32 w1, w2; }; 271 struct { u32 w1, w2; };
245 struct IO_APIC_route_entry entry; 272 struct IO_APIC_route_entry entry;
@@ -299,59 +326,71 @@ static void ioapic_mask_entry(int apic, int pin)
299static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
300{ 327{
301 int apic, pin; 328 int apic, pin;
302 struct irq_pin_list *entry = irq_2_pin + irq; 329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry;
303 331
304 BUG_ON(irq >= NR_IRQS); 332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin;
305 for (;;) { 334 for (;;) {
306 unsigned int reg; 335 unsigned int reg;
336
337 if (!entry)
338 break;
339
307 apic = entry->apic; 340 apic = entry->apic;
308 pin = entry->pin; 341 pin = entry->pin;
309 if (pin == -1) 342#ifdef CONFIG_INTR_REMAP
310 break;
311 /* 343 /*
312 * With interrupt-remapping, destination information comes 344 * With interrupt-remapping, destination information comes
313 * from interrupt-remapping table entry. 345 * from interrupt-remapping table entry.
314 */ 346 */
315 if (!irq_remapped(irq)) 347 if (!irq_remapped(irq))
316 io_apic_write(apic, 0x11 + pin*2, dest); 348 io_apic_write(apic, 0x11 + pin*2, dest);
349#else
350 io_apic_write(apic, 0x11 + pin*2, dest);
351#endif
317 reg = io_apic_read(apic, 0x10 + pin*2); 352 reg = io_apic_read(apic, 0x10 + pin*2);
318 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 353 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
319 reg |= vector; 354 reg |= vector;
320 io_apic_modify(apic, reg); 355 io_apic_modify(apic, 0x10 + pin*2, reg);
321 if (!entry->next) 356 if (!entry->next)
322 break; 357 break;
323 entry = irq_2_pin + entry->next; 358 entry = entry->next;
324 } 359 }
325} 360}
326 361
362static int assign_irq_vector(int irq, cpumask_t mask);
363
327static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
328{ 365{
329 struct irq_cfg *cfg = irq_cfg + irq; 366 struct irq_cfg *cfg;
330 unsigned long flags; 367 unsigned long flags;
331 unsigned int dest; 368 unsigned int dest;
332 cpumask_t tmp; 369 cpumask_t tmp;
370 struct irq_desc *desc;
333 371
334 cpus_and(tmp, mask, cpu_online_map); 372 cpus_and(tmp, mask, cpu_online_map);
335 if (cpus_empty(tmp)) 373 if (cpus_empty(tmp))
336 return; 374 return;
337 375
376 cfg = irq_cfg(irq);
338 if (assign_irq_vector(irq, mask)) 377 if (assign_irq_vector(irq, mask))
339 return; 378 return;
340 379
341 cpus_and(tmp, cfg->domain, mask); 380 cpus_and(tmp, cfg->domain, mask);
342 dest = cpu_mask_to_apicid(tmp); 381 dest = cpu_mask_to_apicid(tmp);
343
344 /* 382 /*
345 * Only the high 8 bits are valid. 383 * Only the high 8 bits are valid.
346 */ 384 */
347 dest = SET_APIC_LOGICAL_ID(dest); 385 dest = SET_APIC_LOGICAL_ID(dest);
348 386
387 desc = irq_to_desc(irq);
349 spin_lock_irqsave(&ioapic_lock, flags); 388 spin_lock_irqsave(&ioapic_lock, flags);
350 __target_IO_APIC_irq(irq, dest, cfg->vector); 389 __target_IO_APIC_irq(irq, dest, cfg->vector);
351 irq_desc[irq].affinity = mask; 390 desc->affinity = mask;
352 spin_unlock_irqrestore(&ioapic_lock, flags); 391 spin_unlock_irqrestore(&ioapic_lock, flags);
353} 392}
354#endif 393#endif /* CONFIG_SMP */
355 394
356/* 395/*
357 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 396 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -360,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
360 */ 399 */
361static void add_pin_to_irq(unsigned int irq, int apic, int pin) 400static void add_pin_to_irq(unsigned int irq, int apic, int pin)
362{ 401{
363 static int first_free_entry = NR_IRQS; 402 struct irq_cfg *cfg;
364 struct irq_pin_list *entry = irq_2_pin + irq; 403 struct irq_pin_list *entry;
404
405 /* first time to refer irq_cfg, so with new */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin;
408 if (!entry) {
409 entry = get_one_free_irq_2_pin();
410 cfg->irq_2_pin = entry;
411 entry->apic = apic;
412 entry->pin = pin;
413 return;
414 }
365 415
366 BUG_ON(irq >= NR_IRQS); 416 while (entry->next) {
367 while (entry->next) 417 /* not again, please */
368 entry = irq_2_pin + entry->next; 418 if (entry->apic == apic && entry->pin == pin)
419 return;
369 420
370 if (entry->pin != -1) { 421 entry = entry->next;
371 entry->next = first_free_entry;
372 entry = irq_2_pin + entry->next;
373 if (++first_free_entry >= PIN_MAP_SIZE)
374 panic("io_apic.c: ran out of irq_2_pin entries!");
375 } 422 }
423
424 entry->next = get_one_free_irq_2_pin();
425 entry = entry->next;
376 entry->apic = apic; 426 entry->apic = apic;
377 entry->pin = pin; 427 entry->pin = pin;
378} 428}
@@ -384,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq,
384 int oldapic, int oldpin, 434 int oldapic, int oldpin,
385 int newapic, int newpin) 435 int newapic, int newpin)
386{ 436{
387 struct irq_pin_list *entry = irq_2_pin + irq; 437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0;
388 440
389 while (1) { 441 while (entry) {
390 if (entry->apic == oldapic && entry->pin == oldpin) { 442 if (entry->apic == oldapic && entry->pin == oldpin) {
391 entry->apic = newapic; 443 entry->apic = newapic;
392 entry->pin = newpin; 444 entry->pin = newpin;
393 } 445 replaced = 1;
394 if (!entry->next) 446 /* every one is different, right? */
395 break; 447 break;
396 entry = irq_2_pin + entry->next; 448 }
449 entry = entry->next;
450 }
451
452 /* why? call replace before add? */
453 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin);
455}
456
457static inline void io_apic_modify_irq(unsigned int irq,
458 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry))
460{
461 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry;
464
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg;
468 pin = entry->pin;
469 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
470 reg &= mask_and;
471 reg |= mask_or;
472 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
473 if (final)
474 final(entry);
397 } 475 }
398} 476}
399 477
478static void __unmask_IO_APIC_irq(unsigned int irq)
479{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
481}
400 482
401#define DO_ACTION(name,R,ACTION, FINAL) \ 483#ifdef CONFIG_X86_64
402 \ 484void io_apic_sync(struct irq_pin_list *entry)
403 static void name##_IO_APIC_irq (unsigned int irq) \ 485{
404 __DO_ACTION(R, ACTION, FINAL) 486 /*
487 * Synchronize the IO-APIC and the CPU by doing
488 * a dummy read from the IO-APIC
489 */
490 struct io_apic __iomem *io_apic;
491 io_apic = io_apic_base(entry->apic);
492 readl(&io_apic->data);
493}
405 494
406/* mask = 1 */ 495static void __mask_IO_APIC_irq(unsigned int irq)
407DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) 496{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498}
499#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq)
501{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
503}
408 504
409/* mask = 0 */ 505static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
410DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) 506{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL);
509}
510
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
512{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515}
516#endif /* CONFIG_X86_32 */
411 517
412static void mask_IO_APIC_irq (unsigned int irq) 518static void mask_IO_APIC_irq (unsigned int irq)
413{ 519{
@@ -450,6 +556,68 @@ static void clear_IO_APIC (void)
450 clear_IO_APIC_pin(apic, pin); 556 clear_IO_APIC_pin(apic, pin);
451} 557}
452 558
559#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
560void send_IPI_self(int vector)
561{
562 unsigned int cfg;
563
564 /*
565 * Wait for idle.
566 */
567 apic_wait_icr_idle();
568 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
569 /*
570 * Send the IPI. The write to APIC_ICR fires this off.
571 */
572 apic_write(APIC_ICR, cfg);
573}
574#endif /* !CONFIG_SMP && CONFIG_X86_32*/
575
576#ifdef CONFIG_X86_32
577/*
578 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
579 * specific CPU-side IRQs.
580 */
581
582#define MAX_PIRQS 8
583static int pirq_entries [MAX_PIRQS];
584static int pirqs_enabled;
585
586static int __init ioapic_pirq_setup(char *str)
587{
588 int i, max;
589 int ints[MAX_PIRQS+1];
590
591 get_options(str, ARRAY_SIZE(ints), ints);
592
593 for (i = 0; i < MAX_PIRQS; i++)
594 pirq_entries[i] = -1;
595
596 pirqs_enabled = 1;
597 apic_printk(APIC_VERBOSE, KERN_INFO
598 "PIRQ redirection, working around broken MP-BIOS.\n");
599 max = MAX_PIRQS;
600 if (ints[0] < MAX_PIRQS)
601 max = ints[0];
602
603 for (i = 0; i < max; i++) {
604 apic_printk(APIC_VERBOSE, KERN_DEBUG
605 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
606 /*
607 * PIRQs are mapped upside down, usually.
608 */
609 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
610 }
611 return 1;
612}
613
614__setup("pirq=", ioapic_pirq_setup);
615#endif /* CONFIG_X86_32 */
616
617#ifdef CONFIG_INTR_REMAP
618/* I/O APIC RTE contents at the OS boot up */
619static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
620
453/* 621/*
454 * Saves and masks all the unmasked IO-APIC RTE's 622 * Saves and masks all the unmasked IO-APIC RTE's
455 */ 623 */
@@ -474,7 +642,7 @@ int save_mask_IO_APIC_setup(void)
474 kzalloc(sizeof(struct IO_APIC_route_entry) * 642 kzalloc(sizeof(struct IO_APIC_route_entry) *
475 nr_ioapic_registers[apic], GFP_KERNEL); 643 nr_ioapic_registers[apic], GFP_KERNEL);
476 if (!early_ioapic_entries[apic]) 644 if (!early_ioapic_entries[apic])
477 return -ENOMEM; 645 goto nomem;
478 } 646 }
479 647
480 for (apic = 0; apic < nr_ioapics; apic++) 648 for (apic = 0; apic < nr_ioapics; apic++)
@@ -488,17 +656,31 @@ int save_mask_IO_APIC_setup(void)
488 ioapic_write_entry(apic, pin, entry); 656 ioapic_write_entry(apic, pin, entry);
489 } 657 }
490 } 658 }
659
491 return 0; 660 return 0;
661
662nomem:
663 while (apic >= 0)
664 kfree(early_ioapic_entries[apic--]);
665 memset(early_ioapic_entries, 0,
666 ARRAY_SIZE(early_ioapic_entries));
667
668 return -ENOMEM;
492} 669}
493 670
494void restore_IO_APIC_setup(void) 671void restore_IO_APIC_setup(void)
495{ 672{
496 int apic, pin; 673 int apic, pin;
497 674
498 for (apic = 0; apic < nr_ioapics; apic++) 675 for (apic = 0; apic < nr_ioapics; apic++) {
676 if (!early_ioapic_entries[apic])
677 break;
499 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 678 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
500 ioapic_write_entry(apic, pin, 679 ioapic_write_entry(apic, pin,
501 early_ioapic_entries[apic][pin]); 680 early_ioapic_entries[apic][pin]);
681 kfree(early_ioapic_entries[apic]);
682 early_ioapic_entries[apic] = NULL;
683 }
502} 684}
503 685
504void reinit_intr_remapped_IO_APIC(int intr_remapping) 686void reinit_intr_remapped_IO_APIC(int intr_remapping)
@@ -512,25 +694,7 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping)
512 */ 694 */
513 restore_IO_APIC_setup(); 695 restore_IO_APIC_setup();
514} 696}
515 697#endif
516int skip_ioapic_setup;
517int ioapic_force;
518
519static int __init parse_noapic(char *str)
520{
521 disable_ioapic_setup();
522 return 0;
523}
524early_param("noapic", parse_noapic);
525
526/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
527static int __init disable_timer_pin_setup(char *arg)
528{
529 disable_timer_pin_1 = 1;
530 return 1;
531}
532__setup("disable_timer_pin_1", disable_timer_pin_setup);
533
534 698
535/* 699/*
536 * Find the IRQ entry number of a certain pin. 700 * Find the IRQ entry number of a certain pin.
@@ -634,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
634 best_guess = irq; 798 best_guess = irq;
635 } 799 }
636 } 800 }
637 BUG_ON(best_guess >= NR_IRQS);
638 return best_guess; 801 return best_guess;
639} 802}
640 803
804EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
805
806#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
807/*
808 * EISA Edge/Level control register, ELCR
809 */
810static int EISA_ELCR(unsigned int irq)
811{
812 if (irq < 16) {
813 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1;
815 }
816 apic_printk(APIC_VERBOSE, KERN_INFO
817 "Broken MPtable reports ISA irq %d\n", irq);
818 return 0;
819}
820
821#endif
822
641/* ISA interrupts are always polarity zero edge triggered, 823/* ISA interrupts are always polarity zero edge triggered,
642 * when listed as conforming in the MP table. */ 824 * when listed as conforming in the MP table. */
643 825
644#define default_ISA_trigger(idx) (0) 826#define default_ISA_trigger(idx) (0)
645#define default_ISA_polarity(idx) (0) 827#define default_ISA_polarity(idx) (0)
646 828
829/* EISA interrupts are always polarity zero and can be edge or level
830 * trigger depending on the ELCR value. If an interrupt is listed as
831 * EISA conforming in the MP table, that means its trigger type must
832 * be read in from the ELCR */
833
834#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
835#define default_EISA_polarity(idx) default_ISA_polarity(idx)
836
647/* PCI interrupts are always polarity one level triggered, 837/* PCI interrupts are always polarity one level triggered,
648 * when listed as conforming in the MP table. */ 838 * when listed as conforming in the MP table. */
649 839
650#define default_PCI_trigger(idx) (1) 840#define default_PCI_trigger(idx) (1)
651#define default_PCI_polarity(idx) (1) 841#define default_PCI_polarity(idx) (1)
652 842
843/* MCA interrupts are always polarity zero level triggered,
844 * when listed as conforming in the MP table. */
845
846#define default_MCA_trigger(idx) (1)
847#define default_MCA_polarity(idx) default_ISA_polarity(idx)
848
653static int MPBIOS_polarity(int idx) 849static int MPBIOS_polarity(int idx)
654{ 850{
655 int bus = mp_irqs[idx].mp_srcbus; 851 int bus = mp_irqs[idx].mp_srcbus;
@@ -707,6 +903,36 @@ static int MPBIOS_trigger(int idx)
707 trigger = default_ISA_trigger(idx); 903 trigger = default_ISA_trigger(idx);
708 else 904 else
709 trigger = default_PCI_trigger(idx); 905 trigger = default_PCI_trigger(idx);
906#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
907 switch (mp_bus_id_to_type[bus]) {
908 case MP_BUS_ISA: /* ISA pin */
909 {
910 /* set before the switch */
911 break;
912 }
913 case MP_BUS_EISA: /* EISA pin */
914 {
915 trigger = default_EISA_trigger(idx);
916 break;
917 }
918 case MP_BUS_PCI: /* PCI pin */
919 {
920 /* set before the switch */
921 break;
922 }
923 case MP_BUS_MCA: /* MCA pin */
924 {
925 trigger = default_MCA_trigger(idx);
926 break;
927 }
928 default:
929 {
930 printk(KERN_WARNING "broken BIOS!!\n");
931 trigger = 1;
932 break;
933 }
934 }
935#endif
710 break; 936 break;
711 case 1: /* edge */ 937 case 1: /* edge */
712 { 938 {
@@ -744,6 +970,7 @@ static inline int irq_trigger(int idx)
744 return MPBIOS_trigger(idx); 970 return MPBIOS_trigger(idx);
745} 971}
746 972
973int (*ioapic_renumber_irq)(int ioapic, int irq);
747static int pin_2_irq(int idx, int apic, int pin) 974static int pin_2_irq(int idx, int apic, int pin)
748{ 975{
749 int irq, i; 976 int irq, i;
@@ -765,8 +992,32 @@ static int pin_2_irq(int idx, int apic, int pin)
765 while (i < apic) 992 while (i < apic)
766 irq += nr_ioapic_registers[i++]; 993 irq += nr_ioapic_registers[i++];
767 irq += pin; 994 irq += pin;
995 /*
996 * For MPS mode, so far only needed by ES7000 platform
997 */
998 if (ioapic_renumber_irq)
999 irq = ioapic_renumber_irq(apic, irq);
768 } 1000 }
769 BUG_ON(irq >= NR_IRQS); 1001
1002#ifdef CONFIG_X86_32
1003 /*
1004 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1005 */
1006 if ((pin >= 16) && (pin <= 23)) {
1007 if (pirq_entries[pin-16] != -1) {
1008 if (!pirq_entries[pin-16]) {
1009 apic_printk(APIC_VERBOSE, KERN_DEBUG
1010 "disabling PIRQ%d\n", pin-16);
1011 } else {
1012 irq = pirq_entries[pin-16];
1013 apic_printk(APIC_VERBOSE, KERN_DEBUG
1014 "using PIRQ%d -> IRQ %d\n",
1015 pin-16, irq);
1016 }
1017 }
1018 }
1019#endif
1020
770 return irq; 1021 return irq;
771} 1022}
772 1023
@@ -801,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
801 int cpu; 1052 int cpu;
802 struct irq_cfg *cfg; 1053 struct irq_cfg *cfg;
803 1054
804 BUG_ON((unsigned)irq >= NR_IRQS); 1055 cfg = irq_cfg(irq);
805 cfg = &irq_cfg[irq];
806 1056
807 /* Only try and allocate irqs on cpus that are present */ 1057 /* Only try and allocate irqs on cpus that are present */
808 cpus_and(mask, mask, cpu_online_map); 1058 cpus_and(mask, mask, cpu_online_map);
@@ -837,8 +1087,13 @@ next:
837 } 1087 }
838 if (unlikely(current_vector == vector)) 1088 if (unlikely(current_vector == vector))
839 continue; 1089 continue;
1090#ifdef CONFIG_X86_64
840 if (vector == IA32_SYSCALL_VECTOR) 1091 if (vector == IA32_SYSCALL_VECTOR)
841 goto next; 1092 goto next;
1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next;
1096#endif
842 for_each_cpu_mask_nr(new_cpu, new_mask) 1097 for_each_cpu_mask_nr(new_cpu, new_mask)
843 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
844 goto next; 1099 goto next;
@@ -875,8 +1130,7 @@ static void __clear_irq_vector(int irq)
875 cpumask_t mask; 1130 cpumask_t mask;
876 int cpu, vector; 1131 int cpu, vector;
877 1132
878 BUG_ON((unsigned)irq >= NR_IRQS); 1133 cfg = irq_cfg(irq);
879 cfg = &irq_cfg[irq];
880 BUG_ON(!cfg->vector); 1134 BUG_ON(!cfg->vector);
881 1135
882 vector = cfg->vector; 1136 vector = cfg->vector;
@@ -893,12 +1147,13 @@ void __setup_vector_irq(int cpu)
893 /* Initialize vector_irq on a new cpu */ 1147 /* Initialize vector_irq on a new cpu */
894 /* This function must be called with vector_lock held */ 1148 /* This function must be called with vector_lock held */
895 int irq, vector; 1149 int irq, vector;
1150 struct irq_cfg *cfg;
896 1151
897 /* Mark the inuse vectors */ 1152 /* Mark the inuse vectors */
898 for (irq = 0; irq < NR_IRQS; ++irq) { 1153 for_each_irq_cfg(irq, cfg) {
899 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1154 if (!cpu_isset(cpu, cfg->domain))
900 continue; 1155 continue;
901 vector = irq_cfg[irq].vector; 1156 vector = cfg->vector;
902 per_cpu(vector_irq, cpu)[vector] = irq; 1157 per_cpu(vector_irq, cpu)[vector] = irq;
903 } 1158 }
904 /* Mark the free vectors */ 1159 /* Mark the free vectors */
@@ -906,7 +1161,9 @@ void __setup_vector_irq(int cpu)
906 irq = per_cpu(vector_irq, cpu)[vector]; 1161 irq = per_cpu(vector_irq, cpu)[vector];
907 if (irq < 0) 1162 if (irq < 0)
908 continue; 1163 continue;
909 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1164
1165 cfg = irq_cfg(irq);
1166 if (!cpu_isset(cpu, cfg->domain))
910 per_cpu(vector_irq, cpu)[vector] = -1; 1167 per_cpu(vector_irq, cpu)[vector] = -1;
911 } 1168 }
912} 1169}
@@ -916,16 +1173,49 @@ static struct irq_chip ioapic_chip;
916static struct irq_chip ir_ioapic_chip; 1173static struct irq_chip ir_ioapic_chip;
917#endif 1174#endif
918 1175
1176#define IOAPIC_AUTO -1
1177#define IOAPIC_EDGE 0
1178#define IOAPIC_LEVEL 1
1179
1180#ifdef CONFIG_X86_32
1181static inline int IO_APIC_irq_trigger(int irq)
1182{
1183 int apic, idx, pin;
1184
1185 for (apic = 0; apic < nr_ioapics; apic++) {
1186 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1187 idx = find_irq_entry(apic, pin, mp_INT);
1188 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1189 return irq_trigger(idx);
1190 }
1191 }
1192 /*
1193 * nonexistent IRQs are edge default
1194 */
1195 return 0;
1196}
1197#else
1198static inline int IO_APIC_irq_trigger(int irq)
1199{
1200 return 1;
1201}
1202#endif
1203
919static void ioapic_register_intr(int irq, unsigned long trigger) 1204static void ioapic_register_intr(int irq, unsigned long trigger)
920{ 1205{
921 if (trigger) 1206 struct irq_desc *desc;
922 irq_desc[irq].status |= IRQ_LEVEL; 1207
1208 desc = irq_to_desc(irq);
1209
1210 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1211 trigger == IOAPIC_LEVEL)
1212 desc->status |= IRQ_LEVEL;
923 else 1213 else
924 irq_desc[irq].status &= ~IRQ_LEVEL; 1214 desc->status &= ~IRQ_LEVEL;
925 1215
926#ifdef CONFIG_INTR_REMAP 1216#ifdef CONFIG_INTR_REMAP
927 if (irq_remapped(irq)) { 1217 if (irq_remapped(irq)) {
928 irq_desc[irq].status |= IRQ_MOVE_PCNTXT; 1218 desc->status |= IRQ_MOVE_PCNTXT;
929 if (trigger) 1219 if (trigger)
930 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, 1220 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
931 handle_fasteoi_irq, 1221 handle_fasteoi_irq,
@@ -936,7 +1226,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger)
936 return; 1226 return;
937 } 1227 }
938#endif 1228#endif
939 if (trigger) 1229 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1230 trigger == IOAPIC_LEVEL)
940 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1231 set_irq_chip_and_handler_name(irq, &ioapic_chip,
941 handle_fasteoi_irq, 1232 handle_fasteoi_irq,
942 "fasteoi"); 1233 "fasteoi");
@@ -1009,13 +1300,15 @@ static int setup_ioapic_entry(int apic, int irq,
1009static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1300static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1010 int trigger, int polarity) 1301 int trigger, int polarity)
1011{ 1302{
1012 struct irq_cfg *cfg = irq_cfg + irq; 1303 struct irq_cfg *cfg;
1013 struct IO_APIC_route_entry entry; 1304 struct IO_APIC_route_entry entry;
1014 cpumask_t mask; 1305 cpumask_t mask;
1015 1306
1016 if (!IO_APIC_IRQ(irq)) 1307 if (!IO_APIC_IRQ(irq))
1017 return; 1308 return;
1018 1309
1310 cfg = irq_cfg(irq);
1311
1019 mask = TARGET_CPUS; 1312 mask = TARGET_CPUS;
1020 if (assign_irq_vector(irq, mask)) 1313 if (assign_irq_vector(irq, mask))
1021 return; 1314 return;
@@ -1047,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1047 1340
1048static void __init setup_IO_APIC_irqs(void) 1341static void __init setup_IO_APIC_irqs(void)
1049{ 1342{
1050 int apic, pin, idx, irq, first_notcon = 1; 1343 int apic, pin, idx, irq;
1344 int notcon = 0;
1051 1345
1052 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1346 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1053 1347
1054 for (apic = 0; apic < nr_ioapics; apic++) { 1348 for (apic = 0; apic < nr_ioapics; apic++) {
1055 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1349 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1056
1057 idx = find_irq_entry(apic,pin,mp_INT);
1058 if (idx == -1) {
1059 if (first_notcon) {
1060 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
1061 first_notcon = 0;
1062 } else
1063 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
1064 continue;
1065 }
1066 if (!first_notcon) {
1067 apic_printk(APIC_VERBOSE, " not connected.\n");
1068 first_notcon = 1;
1069 }
1070 1350
1071 irq = pin_2_irq(idx, apic, pin); 1351 idx = find_irq_entry(apic, pin, mp_INT);
1072 add_pin_to_irq(irq, apic, pin); 1352 if (idx == -1) {
1353 if (!notcon) {
1354 notcon = 1;
1355 apic_printk(APIC_VERBOSE,
1356 KERN_DEBUG " %d-%d",
1357 mp_ioapics[apic].mp_apicid,
1358 pin);
1359 } else
1360 apic_printk(APIC_VERBOSE, " %d-%d",
1361 mp_ioapics[apic].mp_apicid,
1362 pin);
1363 continue;
1364 }
1365 if (notcon) {
1366 apic_printk(APIC_VERBOSE,
1367 " (apicid-pin) not connected\n");
1368 notcon = 0;
1369 }
1073 1370
1074 setup_IO_APIC_irq(apic, pin, irq, 1371 irq = pin_2_irq(idx, apic, pin);
1075 irq_trigger(idx), irq_polarity(idx)); 1372#ifdef CONFIG_X86_32
1076 } 1373 if (multi_timer_check(apic, irq))
1374 continue;
1375#endif
1376 add_pin_to_irq(irq, apic, pin);
1377
1378 setup_IO_APIC_irq(apic, pin, irq,
1379 irq_trigger(idx), irq_polarity(idx));
1380 }
1077 } 1381 }
1078 1382
1079 if (!first_notcon) 1383 if (notcon)
1080 apic_printk(APIC_VERBOSE, " not connected.\n"); 1384 apic_printk(APIC_VERBOSE,
1385 " (apicid-pin) not connected\n");
1081} 1386}
1082 1387
1083/* 1388/*
@@ -1088,8 +1393,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1088{ 1393{
1089 struct IO_APIC_route_entry entry; 1394 struct IO_APIC_route_entry entry;
1090 1395
1396#ifdef CONFIG_INTR_REMAP
1091 if (intr_remapping_enabled) 1397 if (intr_remapping_enabled)
1092 return; 1398 return;
1399#endif
1093 1400
1094 memset(&entry, 0, sizeof(entry)); 1401 memset(&entry, 0, sizeof(entry));
1095 1402
@@ -1124,7 +1431,10 @@ __apicdebuginit(void) print_IO_APIC(void)
1124 union IO_APIC_reg_00 reg_00; 1431 union IO_APIC_reg_00 reg_00;
1125 union IO_APIC_reg_01 reg_01; 1432 union IO_APIC_reg_01 reg_01;
1126 union IO_APIC_reg_02 reg_02; 1433 union IO_APIC_reg_02 reg_02;
1434 union IO_APIC_reg_03 reg_03;
1127 unsigned long flags; 1435 unsigned long flags;
1436 struct irq_cfg *cfg;
1437 unsigned int irq;
1128 1438
1129 if (apic_verbosity == APIC_QUIET) 1439 if (apic_verbosity == APIC_QUIET)
1130 return; 1440 return;
@@ -1147,12 +1457,16 @@ __apicdebuginit(void) print_IO_APIC(void)
1147 reg_01.raw = io_apic_read(apic, 1); 1457 reg_01.raw = io_apic_read(apic, 1);
1148 if (reg_01.bits.version >= 0x10) 1458 if (reg_01.bits.version >= 0x10)
1149 reg_02.raw = io_apic_read(apic, 2); 1459 reg_02.raw = io_apic_read(apic, 2);
1460 if (reg_01.bits.version >= 0x20)
1461 reg_03.raw = io_apic_read(apic, 3);
1150 spin_unlock_irqrestore(&ioapic_lock, flags); 1462 spin_unlock_irqrestore(&ioapic_lock, flags);
1151 1463
1152 printk("\n"); 1464 printk("\n");
1153 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1465 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1154 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1466 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1155 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1467 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1468 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1469 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1156 1470
1157 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1471 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1158 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1472 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1160,11 +1474,27 @@ __apicdebuginit(void) print_IO_APIC(void)
1160 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1474 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1161 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1475 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1162 1476
1163 if (reg_01.bits.version >= 0x10) { 1477 /*
1478 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1479 * but the value of reg_02 is read as the previous read register
1480 * value, so ignore it if reg_02 == reg_01.
1481 */
1482 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1164 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 1483 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1165 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 1484 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1166 } 1485 }
1167 1486
1487 /*
1488 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1489 * or reg_03, but the value of reg_0[23] is read as the previous read
1490 * register value, so ignore it if reg_03 == reg_0[12].
1491 */
1492 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1493 reg_03.raw != reg_01.raw) {
1494 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1495 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1496 }
1497
1168 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1498 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1169 1499
1170 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1500 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1193,16 +1523,16 @@ __apicdebuginit(void) print_IO_APIC(void)
1193 } 1523 }
1194 } 1524 }
1195 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1525 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1196 for (i = 0; i < NR_IRQS; i++) { 1526 for_each_irq_cfg(irq, cfg) {
1197 struct irq_pin_list *entry = irq_2_pin + i; 1527 struct irq_pin_list *entry = cfg->irq_2_pin;
1198 if (entry->pin < 0) 1528 if (!entry)
1199 continue; 1529 continue;
1200 printk(KERN_DEBUG "IRQ%d ", i); 1530 printk(KERN_DEBUG "IRQ%d ", irq);
1201 for (;;) { 1531 for (;;) {
1202 printk("-> %d:%d", entry->apic, entry->pin); 1532 printk("-> %d:%d", entry->apic, entry->pin);
1203 if (!entry->next) 1533 if (!entry->next)
1204 break; 1534 break;
1205 entry = irq_2_pin + entry->next; 1535 entry = entry->next;
1206 } 1536 }
1207 printk("\n"); 1537 printk("\n");
1208 } 1538 }
@@ -1236,7 +1566,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
1236__apicdebuginit(void) print_local_APIC(void *dummy) 1566__apicdebuginit(void) print_local_APIC(void *dummy)
1237{ 1567{
1238 unsigned int v, ver, maxlvt; 1568 unsigned int v, ver, maxlvt;
1239 unsigned long icr; 1569 u64 icr;
1240 1570
1241 if (apic_verbosity == APIC_QUIET) 1571 if (apic_verbosity == APIC_QUIET)
1242 return; 1572 return;
@@ -1253,20 +1583,31 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1253 v = apic_read(APIC_TASKPRI); 1583 v = apic_read(APIC_TASKPRI);
1254 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1584 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1255 1585
1256 v = apic_read(APIC_ARBPRI); 1586 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1257 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, 1587 if (!APIC_XAPIC(ver)) {
1258 v & APIC_ARBPRI_MASK); 1588 v = apic_read(APIC_ARBPRI);
1259 v = apic_read(APIC_PROCPRI); 1589 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1260 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); 1590 v & APIC_ARBPRI_MASK);
1591 }
1592 v = apic_read(APIC_PROCPRI);
1593 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1594 }
1595
1596 /*
1597 * Remote read supported only in the 82489DX and local APIC for
1598 * Pentium processors.
1599 */
1600 if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
1601 v = apic_read(APIC_RRR);
1602 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1603 }
1261 1604
1262 v = apic_read(APIC_EOI);
1263 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1264 v = apic_read(APIC_RRR);
1265 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1266 v = apic_read(APIC_LDR); 1605 v = apic_read(APIC_LDR);
1267 printk(KERN_DEBUG "... APIC LDR: %08x\n", v); 1606 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1268 v = apic_read(APIC_DFR); 1607 if (!x2apic_enabled()) {
1269 printk(KERN_DEBUG "... APIC DFR: %08x\n", v); 1608 v = apic_read(APIC_DFR);
1609 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1610 }
1270 v = apic_read(APIC_SPIV); 1611 v = apic_read(APIC_SPIV);
1271 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1612 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1272 1613
@@ -1277,8 +1618,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1277 printk(KERN_DEBUG "... APIC IRR field:\n"); 1618 printk(KERN_DEBUG "... APIC IRR field:\n");
1278 print_APIC_bitfield(APIC_IRR); 1619 print_APIC_bitfield(APIC_IRR);
1279 1620
1280 v = apic_read(APIC_ESR); 1621 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1281 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1622 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1623 apic_write(APIC_ESR, 0);
1624
1625 v = apic_read(APIC_ESR);
1626 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1627 }
1282 1628
1283 icr = apic_icr_read(); 1629 icr = apic_icr_read();
1284 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); 1630 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
@@ -1312,7 +1658,12 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1312 1658
1313__apicdebuginit(void) print_all_local_APICs(void) 1659__apicdebuginit(void) print_all_local_APICs(void)
1314{ 1660{
1315 on_each_cpu(print_local_APIC, NULL, 1); 1661 int cpu;
1662
1663 preempt_disable();
1664 for_each_online_cpu(cpu)
1665 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1666 preempt_enable();
1316} 1667}
1317 1668
1318__apicdebuginit(void) print_PIC(void) 1669__apicdebuginit(void) print_PIC(void)
@@ -1359,17 +1710,22 @@ __apicdebuginit(int) print_all_ICs(void)
1359fs_initcall(print_all_ICs); 1710fs_initcall(print_all_ICs);
1360 1711
1361 1712
1713/* Where if anywhere is the i8259 connect in external int mode */
1714static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1715
1362void __init enable_IO_APIC(void) 1716void __init enable_IO_APIC(void)
1363{ 1717{
1364 union IO_APIC_reg_01 reg_01; 1718 union IO_APIC_reg_01 reg_01;
1365 int i8259_apic, i8259_pin; 1719 int i8259_apic, i8259_pin;
1366 int i, apic; 1720 int apic;
1367 unsigned long flags; 1721 unsigned long flags;
1368 1722
1369 for (i = 0; i < PIN_MAP_SIZE; i++) { 1723#ifdef CONFIG_X86_32
1370 irq_2_pin[i].pin = -1; 1724 int i;
1371 irq_2_pin[i].next = 0; 1725 if (!pirqs_enabled)
1372 } 1726 for (i = 0; i < MAX_PIRQS; i++)
1727 pirq_entries[i] = -1;
1728#endif
1373 1729
1374 /* 1730 /*
1375 * The number of IO-APIC IRQ registers (== #pins): 1731 * The number of IO-APIC IRQ registers (== #pins):
@@ -1399,6 +1755,10 @@ void __init enable_IO_APIC(void)
1399 } 1755 }
1400 found_i8259: 1756 found_i8259:
1401 /* Look to see what if the MP table has reported the ExtINT */ 1757 /* Look to see what if the MP table has reported the ExtINT */
1758 /* If we could not find the appropriate pin by looking at the ioapic
1759 * the i8259 probably is not connected the ioapic but give the
1760 * mptable a chance anyway.
1761 */
1402 i8259_pin = find_isa_irq_pin(0, mp_ExtINT); 1762 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1403 i8259_apic = find_isa_irq_apic(0, mp_ExtINT); 1763 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1404 /* Trust the MP table if nothing is setup in the hardware */ 1764 /* Trust the MP table if nothing is setup in the hardware */
@@ -1458,6 +1818,133 @@ void disable_IO_APIC(void)
1458 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 1818 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1459} 1819}
1460 1820
1821#ifdef CONFIG_X86_32
1822/*
1823 * function to set the IO-APIC physical IDs based on the
1824 * values stored in the MPC table.
1825 *
1826 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1827 */
1828
1829static void __init setup_ioapic_ids_from_mpc(void)
1830{
1831 union IO_APIC_reg_00 reg_00;
1832 physid_mask_t phys_id_present_map;
1833 int apic;
1834 int i;
1835 unsigned char old_id;
1836 unsigned long flags;
1837
1838 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1839 return;
1840
1841 /*
1842 * Don't check I/O APIC IDs for xAPIC systems. They have
1843 * no meaning without the serial APIC bus.
1844 */
1845 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1846 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1847 return;
1848 /*
1849 * This is broken; anything with a real cpu count has to
1850 * circumvent this idiocy regardless.
1851 */
1852 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1853
1854 /*
1855 * Set the IOAPIC ID to the value stored in the MPC table.
1856 */
1857 for (apic = 0; apic < nr_ioapics; apic++) {
1858
1859 /* Read the register 0 value */
1860 spin_lock_irqsave(&ioapic_lock, flags);
1861 reg_00.raw = io_apic_read(apic, 0);
1862 spin_unlock_irqrestore(&ioapic_lock, flags);
1863
1864 old_id = mp_ioapics[apic].mp_apicid;
1865
1866 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1867 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1868 apic, mp_ioapics[apic].mp_apicid);
1869 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1870 reg_00.bits.ID);
1871 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1872 }
1873
1874 /*
1875 * Sanity check, is the ID really free? Every APIC in a
1876 * system must have a unique ID or we get lots of nice
1877 * 'stuck on smp_invalidate_needed IPI wait' messages.
1878 */
1879 if (check_apicid_used(phys_id_present_map,
1880 mp_ioapics[apic].mp_apicid)) {
1881 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1882 apic, mp_ioapics[apic].mp_apicid);
1883 for (i = 0; i < get_physical_broadcast(); i++)
1884 if (!physid_isset(i, phys_id_present_map))
1885 break;
1886 if (i >= get_physical_broadcast())
1887 panic("Max APIC ID exceeded!\n");
1888 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1889 i);
1890 physid_set(i, phys_id_present_map);
1891 mp_ioapics[apic].mp_apicid = i;
1892 } else {
1893 physid_mask_t tmp;
1894 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1895 apic_printk(APIC_VERBOSE, "Setting %d in the "
1896 "phys_id_present_map\n",
1897 mp_ioapics[apic].mp_apicid);
1898 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1899 }
1900
1901
1902 /*
1903 * We need to adjust the IRQ routing table
1904 * if the ID changed.
1905 */
1906 if (old_id != mp_ioapics[apic].mp_apicid)
1907 for (i = 0; i < mp_irq_entries; i++)
1908 if (mp_irqs[i].mp_dstapic == old_id)
1909 mp_irqs[i].mp_dstapic
1910 = mp_ioapics[apic].mp_apicid;
1911
1912 /*
1913 * Read the right value from the MPC table and
1914 * write it into the ID register.
1915 */
1916 apic_printk(APIC_VERBOSE, KERN_INFO
1917 "...changing IO-APIC physical APIC ID to %d ...",
1918 mp_ioapics[apic].mp_apicid);
1919
1920 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1921 spin_lock_irqsave(&ioapic_lock, flags);
1922 io_apic_write(apic, 0, reg_00.raw);
1923 spin_unlock_irqrestore(&ioapic_lock, flags);
1924
1925 /*
1926 * Sanity check
1927 */
1928 spin_lock_irqsave(&ioapic_lock, flags);
1929 reg_00.raw = io_apic_read(apic, 0);
1930 spin_unlock_irqrestore(&ioapic_lock, flags);
1931 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1932 printk("could not set ID!\n");
1933 else
1934 apic_printk(APIC_VERBOSE, " ok.\n");
1935 }
1936}
1937#endif
1938
1939int no_timer_check __initdata;
1940
1941static int __init notimercheck(char *s)
1942{
1943 no_timer_check = 1;
1944 return 1;
1945}
1946__setup("no_timer_check", notimercheck);
1947
1461/* 1948/*
1462 * There is a nasty bug in some older SMP boards, their mptable lies 1949 * There is a nasty bug in some older SMP boards, their mptable lies
1463 * about the timer IRQ. We do the following to work around the situation: 1950 * about the timer IRQ. We do the following to work around the situation:
@@ -1471,6 +1958,9 @@ static int __init timer_irq_works(void)
1471 unsigned long t1 = jiffies; 1958 unsigned long t1 = jiffies;
1472 unsigned long flags; 1959 unsigned long flags;
1473 1960
1961 if (no_timer_check)
1962 return 1;
1963
1474 local_save_flags(flags); 1964 local_save_flags(flags);
1475 local_irq_enable(); 1965 local_irq_enable();
1476 /* Let ten ticks pass... */ 1966 /* Let ten ticks pass... */
@@ -1531,9 +2021,11 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1531 return was_pending; 2021 return was_pending;
1532} 2022}
1533 2023
2024#ifdef CONFIG_X86_64
1534static int ioapic_retrigger_irq(unsigned int irq) 2025static int ioapic_retrigger_irq(unsigned int irq)
1535{ 2026{
1536 struct irq_cfg *cfg = &irq_cfg[irq]; 2027
2028 struct irq_cfg *cfg = irq_cfg(irq);
1537 unsigned long flags; 2029 unsigned long flags;
1538 2030
1539 spin_lock_irqsave(&vector_lock, flags); 2031 spin_lock_irqsave(&vector_lock, flags);
@@ -1542,6 +2034,14 @@ static int ioapic_retrigger_irq(unsigned int irq)
1542 2034
1543 return 1; 2035 return 1;
1544} 2036}
2037#else
2038static int ioapic_retrigger_irq(unsigned int irq)
2039{
2040 send_IPI_self(irq_cfg(irq)->vector);
2041
2042 return 1;
2043}
2044#endif
1545 2045
1546/* 2046/*
1547 * Level and edge triggered IO-APIC interrupts need different handling, 2047 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1580,11 +2080,11 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
1580 */ 2080 */
1581static void migrate_ioapic_irq(int irq, cpumask_t mask) 2081static void migrate_ioapic_irq(int irq, cpumask_t mask)
1582{ 2082{
1583 struct irq_cfg *cfg = irq_cfg + irq; 2083 struct irq_cfg *cfg;
1584 struct irq_desc *desc = irq_desc + irq; 2084 struct irq_desc *desc;
1585 cpumask_t tmp, cleanup_mask; 2085 cpumask_t tmp, cleanup_mask;
1586 struct irte irte; 2086 struct irte irte;
1587 int modify_ioapic_rte = desc->status & IRQ_LEVEL; 2087 int modify_ioapic_rte;
1588 unsigned int dest; 2088 unsigned int dest;
1589 unsigned long flags; 2089 unsigned long flags;
1590 2090
@@ -1598,9 +2098,12 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
1598 if (assign_irq_vector(irq, mask)) 2098 if (assign_irq_vector(irq, mask))
1599 return; 2099 return;
1600 2100
2101 cfg = irq_cfg(irq);
1601 cpus_and(tmp, cfg->domain, mask); 2102 cpus_and(tmp, cfg->domain, mask);
1602 dest = cpu_mask_to_apicid(tmp); 2103 dest = cpu_mask_to_apicid(tmp);
1603 2104
2105 desc = irq_to_desc(irq);
2106 modify_ioapic_rte = desc->status & IRQ_LEVEL;
1604 if (modify_ioapic_rte) { 2107 if (modify_ioapic_rte) {
1605 spin_lock_irqsave(&ioapic_lock, flags); 2108 spin_lock_irqsave(&ioapic_lock, flags);
1606 __target_IO_APIC_irq(irq, dest, cfg->vector); 2109 __target_IO_APIC_irq(irq, dest, cfg->vector);
@@ -1622,18 +2125,19 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
1622 cfg->move_in_progress = 0; 2125 cfg->move_in_progress = 0;
1623 } 2126 }
1624 2127
1625 irq_desc[irq].affinity = mask; 2128 desc->affinity = mask;
1626} 2129}
1627 2130
1628static int migrate_irq_remapped_level(int irq) 2131static int migrate_irq_remapped_level(int irq)
1629{ 2132{
1630 int ret = -1; 2133 int ret = -1;
2134 struct irq_desc *desc = irq_to_desc(irq);
1631 2135
1632 mask_IO_APIC_irq(irq); 2136 mask_IO_APIC_irq(irq);
1633 2137
1634 if (io_apic_level_ack_pending(irq)) { 2138 if (io_apic_level_ack_pending(irq)) {
1635 /* 2139 /*
1636 * Interrupt in progress. Migrating irq now will change the 2140 * Interrupt in progress. Migrating irq now will change the
1637 * vector information in the IO-APIC RTE and that will confuse 2141 * vector information in the IO-APIC RTE and that will confuse
1638 * the EOI broadcast performed by cpu. 2142 * the EOI broadcast performed by cpu.
1639 * So, delay the irq migration to the next instance. 2143 * So, delay the irq migration to the next instance.
@@ -1643,11 +2147,11 @@ static int migrate_irq_remapped_level(int irq)
1643 } 2147 }
1644 2148
1645 /* everthing is clear. we have right of way */ 2149 /* everthing is clear. we have right of way */
1646 migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); 2150 migrate_ioapic_irq(irq, desc->pending_mask);
1647 2151
1648 ret = 0; 2152 ret = 0;
1649 irq_desc[irq].status &= ~IRQ_MOVE_PENDING; 2153 desc->status &= ~IRQ_MOVE_PENDING;
1650 cpus_clear(irq_desc[irq].pending_mask); 2154 cpus_clear(desc->pending_mask);
1651 2155
1652unmask: 2156unmask:
1653 unmask_IO_APIC_irq(irq); 2157 unmask_IO_APIC_irq(irq);
@@ -1656,10 +2160,10 @@ unmask:
1656 2160
1657static void ir_irq_migration(struct work_struct *work) 2161static void ir_irq_migration(struct work_struct *work)
1658{ 2162{
1659 int irq; 2163 unsigned int irq;
2164 struct irq_desc *desc;
1660 2165
1661 for (irq = 0; irq < NR_IRQS; irq++) { 2166 for_each_irq_desc(irq, desc) {
1662 struct irq_desc *desc = irq_desc + irq;
1663 if (desc->status & IRQ_MOVE_PENDING) { 2167 if (desc->status & IRQ_MOVE_PENDING) {
1664 unsigned long flags; 2168 unsigned long flags;
1665 2169
@@ -1671,8 +2175,7 @@ static void ir_irq_migration(struct work_struct *work)
1671 continue; 2175 continue;
1672 } 2176 }
1673 2177
1674 desc->chip->set_affinity(irq, 2178 desc->chip->set_affinity(irq, desc->pending_mask);
1675 irq_desc[irq].pending_mask);
1676 spin_unlock_irqrestore(&desc->lock, flags); 2179 spin_unlock_irqrestore(&desc->lock, flags);
1677 } 2180 }
1678 } 2181 }
@@ -1683,9 +2186,11 @@ static void ir_irq_migration(struct work_struct *work)
1683 */ 2186 */
1684static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 2187static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
1685{ 2188{
1686 if (irq_desc[irq].status & IRQ_LEVEL) { 2189 struct irq_desc *desc = irq_to_desc(irq);
1687 irq_desc[irq].status |= IRQ_MOVE_PENDING; 2190
1688 irq_desc[irq].pending_mask = mask; 2191 if (desc->status & IRQ_LEVEL) {
2192 desc->status |= IRQ_MOVE_PENDING;
2193 desc->pending_mask = mask;
1689 migrate_irq_remapped_level(irq); 2194 migrate_irq_remapped_level(irq);
1690 return; 2195 return;
1691 } 2196 }
@@ -1698,7 +2203,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
1698{ 2203{
1699 unsigned vector, me; 2204 unsigned vector, me;
1700 ack_APIC_irq(); 2205 ack_APIC_irq();
2206#ifdef CONFIG_X86_64
1701 exit_idle(); 2207 exit_idle();
2208#endif
1702 irq_enter(); 2209 irq_enter();
1703 2210
1704 me = smp_processor_id(); 2211 me = smp_processor_id();
@@ -1707,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
1707 struct irq_desc *desc; 2214 struct irq_desc *desc;
1708 struct irq_cfg *cfg; 2215 struct irq_cfg *cfg;
1709 irq = __get_cpu_var(vector_irq)[vector]; 2216 irq = __get_cpu_var(vector_irq)[vector];
1710 if (irq >= NR_IRQS) 2217
2218 desc = irq_to_desc(irq);
2219 if (!desc)
1711 continue; 2220 continue;
1712 2221
1713 desc = irq_desc + irq; 2222 cfg = irq_cfg(irq);
1714 cfg = irq_cfg + irq;
1715 spin_lock(&desc->lock); 2223 spin_lock(&desc->lock);
1716 if (!cfg->move_cleanup_count) 2224 if (!cfg->move_cleanup_count)
1717 goto unlock; 2225 goto unlock;
@@ -1730,7 +2238,7 @@ unlock:
1730 2238
1731static void irq_complete_move(unsigned int irq) 2239static void irq_complete_move(unsigned int irq)
1732{ 2240{
1733 struct irq_cfg *cfg = irq_cfg + irq; 2241 struct irq_cfg *cfg = irq_cfg(irq);
1734 unsigned vector, me; 2242 unsigned vector, me;
1735 2243
1736 if (likely(!cfg->move_in_progress)) 2244 if (likely(!cfg->move_in_progress))
@@ -1769,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq)
1769 ack_APIC_irq(); 2277 ack_APIC_irq();
1770} 2278}
1771 2279
2280atomic_t irq_mis_count;
2281
1772static void ack_apic_level(unsigned int irq) 2282static void ack_apic_level(unsigned int irq)
1773{ 2283{
2284#ifdef CONFIG_X86_32
2285 unsigned long v;
2286 int i;
2287#endif
1774 int do_unmask_irq = 0; 2288 int do_unmask_irq = 0;
1775 2289
1776 irq_complete_move(irq); 2290 irq_complete_move(irq);
1777#ifdef CONFIG_GENERIC_PENDING_IRQ 2291#ifdef CONFIG_GENERIC_PENDING_IRQ
1778 /* If we are moving the irq we need to mask it */ 2292 /* If we are moving the irq we need to mask it */
1779 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 2293 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
1780 do_unmask_irq = 1; 2294 do_unmask_irq = 1;
1781 mask_IO_APIC_irq(irq); 2295 mask_IO_APIC_irq(irq);
1782 } 2296 }
1783#endif 2297#endif
1784 2298
2299#ifdef CONFIG_X86_32
2300 /*
2301 * It appears there is an erratum which affects at least version 0x11
2302 * of I/O APIC (that's the 82093AA and cores integrated into various
2303 * chipsets). Under certain conditions a level-triggered interrupt is
2304 * erroneously delivered as edge-triggered one but the respective IRR
2305 * bit gets set nevertheless. As a result the I/O unit expects an EOI
2306 * message but it will never arrive and further interrupts are blocked
2307 * from the source. The exact reason is so far unknown, but the
2308 * phenomenon was observed when two consecutive interrupt requests
2309 * from a given source get delivered to the same CPU and the source is
2310 * temporarily disabled in between.
2311 *
2312 * A workaround is to simulate an EOI message manually. We achieve it
2313 * by setting the trigger mode to edge and then to level when the edge
2314 * trigger mode gets detected in the TMR of a local APIC for a
2315 * level-triggered interrupt. We mask the source for the time of the
2316 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2317 * The idea is from Manfred Spraul. --macro
2318 */
2319 i = irq_cfg(irq)->vector;
2320
2321 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2322#endif
2323
1785 /* 2324 /*
1786 * We must acknowledge the irq before we move it or the acknowledge will 2325 * We must acknowledge the irq before we move it or the acknowledge will
1787 * not propagate properly. 2326 * not propagate properly.
@@ -1820,31 +2359,41 @@ static void ack_apic_level(unsigned int irq)
1820 move_masked_irq(irq); 2359 move_masked_irq(irq);
1821 unmask_IO_APIC_irq(irq); 2360 unmask_IO_APIC_irq(irq);
1822 } 2361 }
2362
2363#ifdef CONFIG_X86_32
2364 if (!(v & (1 << (i & 0x1f)))) {
2365 atomic_inc(&irq_mis_count);
2366 spin_lock(&ioapic_lock);
2367 __mask_and_edge_IO_APIC_irq(irq);
2368 __unmask_and_level_IO_APIC_irq(irq);
2369 spin_unlock(&ioapic_lock);
2370 }
2371#endif
1823} 2372}
1824 2373
1825static struct irq_chip ioapic_chip __read_mostly = { 2374static struct irq_chip ioapic_chip __read_mostly = {
1826 .name = "IO-APIC", 2375 .name = "IO-APIC",
1827 .startup = startup_ioapic_irq, 2376 .startup = startup_ioapic_irq,
1828 .mask = mask_IO_APIC_irq, 2377 .mask = mask_IO_APIC_irq,
1829 .unmask = unmask_IO_APIC_irq, 2378 .unmask = unmask_IO_APIC_irq,
1830 .ack = ack_apic_edge, 2379 .ack = ack_apic_edge,
1831 .eoi = ack_apic_level, 2380 .eoi = ack_apic_level,
1832#ifdef CONFIG_SMP 2381#ifdef CONFIG_SMP
1833 .set_affinity = set_ioapic_affinity_irq, 2382 .set_affinity = set_ioapic_affinity_irq,
1834#endif 2383#endif
1835 .retrigger = ioapic_retrigger_irq, 2384 .retrigger = ioapic_retrigger_irq,
1836}; 2385};
1837 2386
1838#ifdef CONFIG_INTR_REMAP 2387#ifdef CONFIG_INTR_REMAP
1839static struct irq_chip ir_ioapic_chip __read_mostly = { 2388static struct irq_chip ir_ioapic_chip __read_mostly = {
1840 .name = "IR-IO-APIC", 2389 .name = "IR-IO-APIC",
1841 .startup = startup_ioapic_irq, 2390 .startup = startup_ioapic_irq,
1842 .mask = mask_IO_APIC_irq, 2391 .mask = mask_IO_APIC_irq,
1843 .unmask = unmask_IO_APIC_irq, 2392 .unmask = unmask_IO_APIC_irq,
1844 .ack = ack_x2apic_edge, 2393 .ack = ack_x2apic_edge,
1845 .eoi = ack_x2apic_level, 2394 .eoi = ack_x2apic_level,
1846#ifdef CONFIG_SMP 2395#ifdef CONFIG_SMP
1847 .set_affinity = set_ir_ioapic_affinity_irq, 2396 .set_affinity = set_ir_ioapic_affinity_irq,
1848#endif 2397#endif
1849 .retrigger = ioapic_retrigger_irq, 2398 .retrigger = ioapic_retrigger_irq,
1850}; 2399};
@@ -1853,6 +2402,8 @@ static struct irq_chip ir_ioapic_chip __read_mostly = {
1853static inline void init_IO_APIC_traps(void) 2402static inline void init_IO_APIC_traps(void)
1854{ 2403{
1855 int irq; 2404 int irq;
2405 struct irq_desc *desc;
2406 struct irq_cfg *cfg;
1856 2407
1857 /* 2408 /*
1858 * NOTE! The local APIC isn't very good at handling 2409 * NOTE! The local APIC isn't very good at handling
@@ -1865,8 +2416,8 @@ static inline void init_IO_APIC_traps(void)
1865 * Also, we've got to be careful not to trash gate 2416 * Also, we've got to be careful not to trash gate
1866 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2417 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1867 */ 2418 */
1868 for (irq = 0; irq < NR_IRQS ; irq++) { 2419 for_each_irq_cfg(irq, cfg) {
1869 if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { 2420 if (IO_APIC_IRQ(irq) && !cfg->vector) {
1870 /* 2421 /*
1871 * Hmm.. We don't have an entry for this, 2422 * Hmm.. We don't have an entry for this,
1872 * so default to an old-fashioned 8259 2423 * so default to an old-fashioned 8259
@@ -1874,27 +2425,33 @@ static inline void init_IO_APIC_traps(void)
1874 */ 2425 */
1875 if (irq < 16) 2426 if (irq < 16)
1876 make_8259A_irq(irq); 2427 make_8259A_irq(irq);
1877 else 2428 else {
2429 desc = irq_to_desc(irq);
1878 /* Strange. Oh, well.. */ 2430 /* Strange. Oh, well.. */
1879 irq_desc[irq].chip = &no_irq_chip; 2431 desc->chip = &no_irq_chip;
2432 }
1880 } 2433 }
1881 } 2434 }
1882} 2435}
1883 2436
1884static void unmask_lapic_irq(unsigned int irq) 2437/*
2438 * The local APIC irq-chip implementation:
2439 */
2440
2441static void mask_lapic_irq(unsigned int irq)
1885{ 2442{
1886 unsigned long v; 2443 unsigned long v;
1887 2444
1888 v = apic_read(APIC_LVT0); 2445 v = apic_read(APIC_LVT0);
1889 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2446 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1890} 2447}
1891 2448
1892static void mask_lapic_irq(unsigned int irq) 2449static void unmask_lapic_irq(unsigned int irq)
1893{ 2450{
1894 unsigned long v; 2451 unsigned long v;
1895 2452
1896 v = apic_read(APIC_LVT0); 2453 v = apic_read(APIC_LVT0);
1897 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2454 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1898} 2455}
1899 2456
1900static void ack_lapic_irq (unsigned int irq) 2457static void ack_lapic_irq (unsigned int irq)
@@ -1911,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = {
1911 2468
1912static void lapic_register_intr(int irq) 2469static void lapic_register_intr(int irq)
1913{ 2470{
1914 irq_desc[irq].status &= ~IRQ_LEVEL; 2471 struct irq_desc *desc;
2472
2473 desc = irq_to_desc(irq);
2474 desc->status &= ~IRQ_LEVEL;
1915 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2475 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1916 "edge"); 2476 "edge");
1917} 2477}
@@ -1919,19 +2479,19 @@ static void lapic_register_intr(int irq)
1919static void __init setup_nmi(void) 2479static void __init setup_nmi(void)
1920{ 2480{
1921 /* 2481 /*
1922 * Dirty trick to enable the NMI watchdog ... 2482 * Dirty trick to enable the NMI watchdog ...
1923 * We put the 8259A master into AEOI mode and 2483 * We put the 8259A master into AEOI mode and
1924 * unmask on all local APICs LVT0 as NMI. 2484 * unmask on all local APICs LVT0 as NMI.
1925 * 2485 *
1926 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') 2486 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1927 * is from Maciej W. Rozycki - so we do not have to EOI from 2487 * is from Maciej W. Rozycki - so we do not have to EOI from
1928 * the NMI handler or the timer interrupt. 2488 * the NMI handler or the timer interrupt.
1929 */ 2489 */
1930 printk(KERN_INFO "activating NMI Watchdog ..."); 2490 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
1931 2491
1932 enable_NMI_through_LVT0(); 2492 enable_NMI_through_LVT0();
1933 2493
1934 printk(" done.\n"); 2494 apic_printk(APIC_VERBOSE, " done.\n");
1935} 2495}
1936 2496
1937/* 2497/*
@@ -1948,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void)
1948 unsigned char save_control, save_freq_select; 2508 unsigned char save_control, save_freq_select;
1949 2509
1950 pin = find_isa_irq_pin(8, mp_INT); 2510 pin = find_isa_irq_pin(8, mp_INT);
2511 if (pin == -1) {
2512 WARN_ON_ONCE(1);
2513 return;
2514 }
1951 apic = find_isa_irq_apic(8, mp_INT); 2515 apic = find_isa_irq_apic(8, mp_INT);
1952 if (pin == -1) 2516 if (apic == -1) {
2517 WARN_ON_ONCE(1);
1953 return; 2518 return;
2519 }
1954 2520
1955 entry0 = ioapic_read_entry(apic, pin); 2521 entry0 = ioapic_read_entry(apic, pin);
1956
1957 clear_IO_APIC_pin(apic, pin); 2522 clear_IO_APIC_pin(apic, pin);
1958 2523
1959 memset(&entry1, 0, sizeof(entry1)); 2524 memset(&entry1, 0, sizeof(entry1));
@@ -1988,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void)
1988 ioapic_write_entry(apic, pin, entry0); 2553 ioapic_write_entry(apic, pin, entry0);
1989} 2554}
1990 2555
2556static int disable_timer_pin_1 __initdata;
2557/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
2558static int __init disable_timer_pin_setup(char *arg)
2559{
2560 disable_timer_pin_1 = 1;
2561 return 0;
2562}
2563early_param("disable_timer_pin_1", disable_timer_pin_setup);
2564
2565int timer_through_8259 __initdata;
2566
1991/* 2567/*
1992 * This code may look a bit paranoid, but it's supposed to cooperate with 2568 * This code may look a bit paranoid, but it's supposed to cooperate with
1993 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ 2569 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1994 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2570 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1995 * fanatically on his truly buggy board. 2571 * fanatically on his truly buggy board.
1996 * 2572 *
1997 * FIXME: really need to revamp this for modern platforms only. 2573 * FIXME: really need to revamp this for all platforms.
1998 */ 2574 */
1999static inline void __init check_timer(void) 2575static inline void __init check_timer(void)
2000{ 2576{
2001 struct irq_cfg *cfg = irq_cfg + 0; 2577 struct irq_cfg *cfg = irq_cfg(0);
2002 int apic1, pin1, apic2, pin2; 2578 int apic1, pin1, apic2, pin2;
2003 unsigned long flags; 2579 unsigned long flags;
2580 unsigned int ver;
2004 int no_pin1 = 0; 2581 int no_pin1 = 0;
2005 2582
2006 local_irq_save(flags); 2583 local_irq_save(flags);
2007 2584
2585 ver = apic_read(APIC_LVR);
2586 ver = GET_APIC_VERSION(ver);
2587
2008 /* 2588 /*
2009 * get/set the timer IRQ vector: 2589 * get/set the timer IRQ vector:
2010 */ 2590 */
@@ -2013,10 +2593,18 @@ static inline void __init check_timer(void)
2013 2593
2014 /* 2594 /*
2015 * As IRQ0 is to be enabled in the 8259A, the virtual 2595 * As IRQ0 is to be enabled in the 8259A, the virtual
2016 * wire has to be disabled in the local APIC. 2596 * wire has to be disabled in the local APIC. Also
2597 * timer interrupts need to be acknowledged manually in
2598 * the 8259A for the i82489DX when using the NMI
2599 * watchdog as that APIC treats NMIs as level-triggered.
2600 * The AEOI mode will finish them in the 8259A
2601 * automatically.
2017 */ 2602 */
2018 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2603 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2019 init_8259A(1); 2604 init_8259A(1);
2605#ifdef CONFIG_X86_32
2606 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2607#endif
2020 2608
2021 pin1 = find_isa_irq_pin(0, mp_INT); 2609 pin1 = find_isa_irq_pin(0, mp_INT);
2022 apic1 = find_isa_irq_apic(0, mp_INT); 2610 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2035,8 +2623,10 @@ static inline void __init check_timer(void)
2035 * 8259A. 2623 * 8259A.
2036 */ 2624 */
2037 if (pin1 == -1) { 2625 if (pin1 == -1) {
2626#ifdef CONFIG_INTR_REMAP
2038 if (intr_remapping_enabled) 2627 if (intr_remapping_enabled)
2039 panic("BIOS bug: timer not connected to IO-APIC"); 2628 panic("BIOS bug: timer not connected to IO-APIC");
2629#endif
2040 pin1 = pin2; 2630 pin1 = pin2;
2041 apic1 = apic2; 2631 apic1 = apic2;
2042 no_pin1 = 1; 2632 no_pin1 = 1;
@@ -2054,7 +2644,7 @@ static inline void __init check_timer(void)
2054 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2644 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2055 } 2645 }
2056 unmask_IO_APIC_irq(0); 2646 unmask_IO_APIC_irq(0);
2057 if (!no_timer_check && timer_irq_works()) { 2647 if (timer_irq_works()) {
2058 if (nmi_watchdog == NMI_IO_APIC) { 2648 if (nmi_watchdog == NMI_IO_APIC) {
2059 setup_nmi(); 2649 setup_nmi();
2060 enable_8259A_irq(0); 2650 enable_8259A_irq(0);
@@ -2063,8 +2653,10 @@ static inline void __init check_timer(void)
2063 clear_IO_APIC_pin(0, pin1); 2653 clear_IO_APIC_pin(0, pin1);
2064 goto out; 2654 goto out;
2065 } 2655 }
2656#ifdef CONFIG_INTR_REMAP
2066 if (intr_remapping_enabled) 2657 if (intr_remapping_enabled)
2067 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2658 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2659#endif
2068 clear_IO_APIC_pin(apic1, pin1); 2660 clear_IO_APIC_pin(apic1, pin1);
2069 if (!no_pin1) 2661 if (!no_pin1)
2070 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2662 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2104,6 +2696,9 @@ static inline void __init check_timer(void)
2104 "through the IO-APIC - disabling NMI Watchdog!\n"); 2696 "through the IO-APIC - disabling NMI Watchdog!\n");
2105 nmi_watchdog = NMI_NONE; 2697 nmi_watchdog = NMI_NONE;
2106 } 2698 }
2699#ifdef CONFIG_X86_32
2700 timer_ack = 0;
2701#endif
2107 2702
2108 apic_printk(APIC_QUIET, KERN_INFO 2703 apic_printk(APIC_QUIET, KERN_INFO
2109 "...trying to set up timer as Virtual Wire IRQ...\n"); 2704 "...trying to set up timer as Virtual Wire IRQ...\n");
@@ -2140,13 +2735,6 @@ out:
2140 local_irq_restore(flags); 2735 local_irq_restore(flags);
2141} 2736}
2142 2737
2143static int __init notimercheck(char *s)
2144{
2145 no_timer_check = 1;
2146 return 1;
2147}
2148__setup("no_timer_check", notimercheck);
2149
2150/* 2738/*
2151 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available 2739 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2152 * to devices. However there may be an I/O APIC pin available for 2740 * to devices. However there may be an I/O APIC pin available for
@@ -2164,25 +2752,49 @@ __setup("no_timer_check", notimercheck);
2164 * the I/O APIC in all cases now. No actual device should request 2752 * the I/O APIC in all cases now. No actual device should request
2165 * it anyway. --macro 2753 * it anyway. --macro
2166 */ 2754 */
2167#define PIC_IRQS (1<<2) 2755#define PIC_IRQS (1 << PIC_CASCADE_IR)
2168 2756
2169void __init setup_IO_APIC(void) 2757void __init setup_IO_APIC(void)
2170{ 2758{
2171 2759
2760#ifdef CONFIG_X86_32
2761 enable_IO_APIC();
2762#else
2172 /* 2763 /*
2173 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 2764 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
2174 */ 2765 */
2766#endif
2175 2767
2176 io_apic_irqs = ~PIC_IRQS; 2768 io_apic_irqs = ~PIC_IRQS;
2177 2769
2178 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 2770 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
2179 2771 /*
2772 * Set up IO-APIC IRQ routing.
2773 */
2774#ifdef CONFIG_X86_32
2775 if (!acpi_ioapic)
2776 setup_ioapic_ids_from_mpc();
2777#endif
2180 sync_Arb_IDs(); 2778 sync_Arb_IDs();
2181 setup_IO_APIC_irqs(); 2779 setup_IO_APIC_irqs();
2182 init_IO_APIC_traps(); 2780 init_IO_APIC_traps();
2183 check_timer(); 2781 check_timer();
2184} 2782}
2185 2783
2784/*
2785 * Called after all the initialization is done. If we didnt find any
2786 * APIC bugs then we can allow the modify fast path
2787 */
2788
2789static int __init io_apic_bug_finalize(void)
2790{
2791 if (sis_apic_bug == -1)
2792 sis_apic_bug = 0;
2793 return 0;
2794}
2795
2796late_initcall(io_apic_bug_finalize);
2797
2186struct sysfs_ioapic_data { 2798struct sysfs_ioapic_data {
2187 struct sys_device dev; 2799 struct sys_device dev;
2188 struct IO_APIC_route_entry entry[0]; 2800 struct IO_APIC_route_entry entry[0];
@@ -2270,32 +2882,51 @@ device_initcall(ioapic_init_sysfs);
2270/* 2882/*
2271 * Dynamic irq allocate and deallocation 2883 * Dynamic irq allocate and deallocation
2272 */ 2884 */
2273int create_irq(void) 2885unsigned int create_irq_nr(unsigned int irq_want)
2274{ 2886{
2275 /* Allocate an unused irq */ 2887 /* Allocate an unused irq */
2276 int irq; 2888 unsigned int irq;
2277 int new; 2889 unsigned int new;
2278 unsigned long flags; 2890 unsigned long flags;
2891 struct irq_cfg *cfg_new;
2892
2893 irq_want = nr_irqs - 1;
2279 2894
2280 irq = -ENOSPC; 2895 irq = 0;
2281 spin_lock_irqsave(&vector_lock, flags); 2896 spin_lock_irqsave(&vector_lock, flags);
2282 for (new = (NR_IRQS - 1); new >= 0; new--) { 2897 for (new = irq_want; new > 0; new--) {
2283 if (platform_legacy_irq(new)) 2898 if (platform_legacy_irq(new))
2284 continue; 2899 continue;
2285 if (irq_cfg[new].vector != 0) 2900 cfg_new = irq_cfg(new);
2901 if (cfg_new && cfg_new->vector != 0)
2286 continue; 2902 continue;
2903 /* check if need to create one */
2904 if (!cfg_new)
2905 cfg_new = irq_cfg_alloc(new);
2287 if (__assign_irq_vector(new, TARGET_CPUS) == 0) 2906 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
2288 irq = new; 2907 irq = new;
2289 break; 2908 break;
2290 } 2909 }
2291 spin_unlock_irqrestore(&vector_lock, flags); 2910 spin_unlock_irqrestore(&vector_lock, flags);
2292 2911
2293 if (irq >= 0) { 2912 if (irq > 0) {
2294 dynamic_irq_init(irq); 2913 dynamic_irq_init(irq);
2295 } 2914 }
2296 return irq; 2915 return irq;
2297} 2916}
2298 2917
2918int create_irq(void)
2919{
2920 int irq;
2921
2922 irq = create_irq_nr(nr_irqs - 1);
2923
2924 if (irq == 0)
2925 irq = -1;
2926
2927 return irq;
2928}
2929
2299void destroy_irq(unsigned int irq) 2930void destroy_irq(unsigned int irq)
2300{ 2931{
2301 unsigned long flags; 2932 unsigned long flags;
@@ -2316,7 +2947,7 @@ void destroy_irq(unsigned int irq)
2316#ifdef CONFIG_PCI_MSI 2947#ifdef CONFIG_PCI_MSI
2317static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 2948static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2318{ 2949{
2319 struct irq_cfg *cfg = irq_cfg + irq; 2950 struct irq_cfg *cfg;
2320 int err; 2951 int err;
2321 unsigned dest; 2952 unsigned dest;
2322 cpumask_t tmp; 2953 cpumask_t tmp;
@@ -2326,6 +2957,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2326 if (err) 2957 if (err)
2327 return err; 2958 return err;
2328 2959
2960 cfg = irq_cfg(irq);
2329 cpus_and(tmp, cfg->domain, tmp); 2961 cpus_and(tmp, cfg->domain, tmp);
2330 dest = cpu_mask_to_apicid(tmp); 2962 dest = cpu_mask_to_apicid(tmp);
2331 2963
@@ -2383,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2383#ifdef CONFIG_SMP 3015#ifdef CONFIG_SMP
2384static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3016static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2385{ 3017{
2386 struct irq_cfg *cfg = irq_cfg + irq; 3018 struct irq_cfg *cfg;
2387 struct msi_msg msg; 3019 struct msi_msg msg;
2388 unsigned int dest; 3020 unsigned int dest;
2389 cpumask_t tmp; 3021 cpumask_t tmp;
3022 struct irq_desc *desc;
2390 3023
2391 cpus_and(tmp, mask, cpu_online_map); 3024 cpus_and(tmp, mask, cpu_online_map);
2392 if (cpus_empty(tmp)) 3025 if (cpus_empty(tmp))
@@ -2395,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2395 if (assign_irq_vector(irq, mask)) 3028 if (assign_irq_vector(irq, mask))
2396 return; 3029 return;
2397 3030
3031 cfg = irq_cfg(irq);
2398 cpus_and(tmp, cfg->domain, mask); 3032 cpus_and(tmp, cfg->domain, mask);
2399 dest = cpu_mask_to_apicid(tmp); 3033 dest = cpu_mask_to_apicid(tmp);
2400 3034
@@ -2406,7 +3040,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2406 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3040 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2407 3041
2408 write_msi_msg(irq, &msg); 3042 write_msi_msg(irq, &msg);
2409 irq_desc[irq].affinity = mask; 3043 desc = irq_to_desc(irq);
3044 desc->affinity = mask;
2410} 3045}
2411 3046
2412#ifdef CONFIG_INTR_REMAP 3047#ifdef CONFIG_INTR_REMAP
@@ -2416,10 +3051,11 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2416 */ 3051 */
2417static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3052static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2418{ 3053{
2419 struct irq_cfg *cfg = irq_cfg + irq; 3054 struct irq_cfg *cfg;
2420 unsigned int dest; 3055 unsigned int dest;
2421 cpumask_t tmp, cleanup_mask; 3056 cpumask_t tmp, cleanup_mask;
2422 struct irte irte; 3057 struct irte irte;
3058 struct irq_desc *desc;
2423 3059
2424 cpus_and(tmp, mask, cpu_online_map); 3060 cpus_and(tmp, mask, cpu_online_map);
2425 if (cpus_empty(tmp)) 3061 if (cpus_empty(tmp))
@@ -2431,6 +3067,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2431 if (assign_irq_vector(irq, mask)) 3067 if (assign_irq_vector(irq, mask))
2432 return; 3068 return;
2433 3069
3070 cfg = irq_cfg(irq);
2434 cpus_and(tmp, cfg->domain, mask); 3071 cpus_and(tmp, cfg->domain, mask);
2435 dest = cpu_mask_to_apicid(tmp); 3072 dest = cpu_mask_to_apicid(tmp);
2436 3073
@@ -2454,7 +3091,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2454 cfg->move_in_progress = 0; 3091 cfg->move_in_progress = 0;
2455 } 3092 }
2456 3093
2457 irq_desc[irq].affinity = mask; 3094 desc = irq_to_desc(irq);
3095 desc->affinity = mask;
2458} 3096}
2459#endif 3097#endif
2460#endif /* CONFIG_SMP */ 3098#endif /* CONFIG_SMP */
@@ -2507,7 +3145,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
2507 if (index < 0) { 3145 if (index < 0) {
2508 printk(KERN_ERR 3146 printk(KERN_ERR
2509 "Unable to allocate %d IRTE for PCI %s\n", nvec, 3147 "Unable to allocate %d IRTE for PCI %s\n", nvec,
2510 pci_name(dev)); 3148 pci_name(dev));
2511 return -ENOSPC; 3149 return -ENOSPC;
2512 } 3150 }
2513 return index; 3151 return index;
@@ -2528,7 +3166,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
2528 3166
2529#ifdef CONFIG_INTR_REMAP 3167#ifdef CONFIG_INTR_REMAP
2530 if (irq_remapped(irq)) { 3168 if (irq_remapped(irq)) {
2531 struct irq_desc *desc = irq_desc + irq; 3169 struct irq_desc *desc = irq_to_desc(irq);
2532 /* 3170 /*
2533 * irq migration in process context 3171 * irq migration in process context
2534 */ 3172 */
@@ -2538,16 +3176,34 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
2538#endif 3176#endif
2539 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3177 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
2540 3178
3179 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3180
2541 return 0; 3181 return 0;
2542} 3182}
2543 3183
3184static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
3185{
3186 unsigned int irq;
3187
3188 irq = dev->bus->number;
3189 irq <<= 8;
3190 irq |= dev->devfn;
3191 irq <<= 12;
3192
3193 return irq;
3194}
3195
2544int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 3196int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2545{ 3197{
2546 int irq, ret; 3198 unsigned int irq;
3199 int ret;
3200 unsigned int irq_want;
2547 3201
2548 irq = create_irq(); 3202 irq_want = build_irq_for_pci_dev(dev) + 0x100;
2549 if (irq < 0) 3203
2550 return irq; 3204 irq = create_irq_nr(irq_want);
3205 if (irq == 0)
3206 return -1;
2551 3207
2552#ifdef CONFIG_INTR_REMAP 3208#ifdef CONFIG_INTR_REMAP
2553 if (!intr_remapping_enabled) 3209 if (!intr_remapping_enabled)
@@ -2574,18 +3230,22 @@ error:
2574 3230
2575int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3231int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
2576{ 3232{
2577 int irq, ret, sub_handle; 3233 unsigned int irq;
3234 int ret, sub_handle;
2578 struct msi_desc *desc; 3235 struct msi_desc *desc;
3236 unsigned int irq_want;
3237
2579#ifdef CONFIG_INTR_REMAP 3238#ifdef CONFIG_INTR_REMAP
2580 struct intel_iommu *iommu = 0; 3239 struct intel_iommu *iommu = 0;
2581 int index = 0; 3240 int index = 0;
2582#endif 3241#endif
2583 3242
3243 irq_want = build_irq_for_pci_dev(dev) + 0x100;
2584 sub_handle = 0; 3244 sub_handle = 0;
2585 list_for_each_entry(desc, &dev->msi_list, list) { 3245 list_for_each_entry(desc, &dev->msi_list, list) {
2586 irq = create_irq(); 3246 irq = create_irq_nr(irq_want--);
2587 if (irq < 0) 3247 if (irq == 0)
2588 return irq; 3248 return -1;
2589#ifdef CONFIG_INTR_REMAP 3249#ifdef CONFIG_INTR_REMAP
2590 if (!intr_remapping_enabled) 3250 if (!intr_remapping_enabled)
2591 goto no_ir; 3251 goto no_ir;
@@ -2636,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq)
2636#ifdef CONFIG_SMP 3296#ifdef CONFIG_SMP
2637static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3297static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2638{ 3298{
2639 struct irq_cfg *cfg = irq_cfg + irq; 3299 struct irq_cfg *cfg;
2640 struct msi_msg msg; 3300 struct msi_msg msg;
2641 unsigned int dest; 3301 unsigned int dest;
2642 cpumask_t tmp; 3302 cpumask_t tmp;
3303 struct irq_desc *desc;
2643 3304
2644 cpus_and(tmp, mask, cpu_online_map); 3305 cpus_and(tmp, mask, cpu_online_map);
2645 if (cpus_empty(tmp)) 3306 if (cpus_empty(tmp))
@@ -2648,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2648 if (assign_irq_vector(irq, mask)) 3309 if (assign_irq_vector(irq, mask))
2649 return; 3310 return;
2650 3311
3312 cfg = irq_cfg(irq);
2651 cpus_and(tmp, cfg->domain, mask); 3313 cpus_and(tmp, cfg->domain, mask);
2652 dest = cpu_mask_to_apicid(tmp); 3314 dest = cpu_mask_to_apicid(tmp);
2653 3315
@@ -2659,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2659 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3321 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2660 3322
2661 dmar_msi_write(irq, &msg); 3323 dmar_msi_write(irq, &msg);
2662 irq_desc[irq].affinity = mask; 3324 desc = irq_to_desc(irq);
3325 desc->affinity = mask;
2663} 3326}
2664#endif /* CONFIG_SMP */ 3327#endif /* CONFIG_SMP */
2665 3328
@@ -2689,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq)
2689} 3352}
2690#endif 3353#endif
2691 3354
3355#ifdef CONFIG_HPET_TIMER
3356
3357#ifdef CONFIG_SMP
3358static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3359{
3360 struct irq_cfg *cfg;
3361 struct irq_desc *desc;
3362 struct msi_msg msg;
3363 unsigned int dest;
3364 cpumask_t tmp;
3365
3366 cpus_and(tmp, mask, cpu_online_map);
3367 if (cpus_empty(tmp))
3368 return;
3369
3370 if (assign_irq_vector(irq, mask))
3371 return;
3372
3373 cfg = irq_cfg(irq);
3374 cpus_and(tmp, cfg->domain, mask);
3375 dest = cpu_mask_to_apicid(tmp);
3376
3377 hpet_msi_read(irq, &msg);
3378
3379 msg.data &= ~MSI_DATA_VECTOR_MASK;
3380 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3381 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3382 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3383
3384 hpet_msi_write(irq, &msg);
3385 desc = irq_to_desc(irq);
3386 desc->affinity = mask;
3387}
3388#endif /* CONFIG_SMP */
3389
3390struct irq_chip hpet_msi_type = {
3391 .name = "HPET_MSI",
3392 .unmask = hpet_msi_unmask,
3393 .mask = hpet_msi_mask,
3394 .ack = ack_apic_edge,
3395#ifdef CONFIG_SMP
3396 .set_affinity = hpet_msi_set_affinity,
3397#endif
3398 .retrigger = ioapic_retrigger_irq,
3399};
3400
3401int arch_setup_hpet_msi(unsigned int irq)
3402{
3403 int ret;
3404 struct msi_msg msg;
3405
3406 ret = msi_compose_msg(NULL, irq, &msg);
3407 if (ret < 0)
3408 return ret;
3409
3410 hpet_msi_write(irq, &msg);
3411 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
3412 "edge");
3413
3414 return 0;
3415}
3416#endif
3417
2692#endif /* CONFIG_PCI_MSI */ 3418#endif /* CONFIG_PCI_MSI */
2693/* 3419/*
2694 * Hypertransport interrupt support 3420 * Hypertransport interrupt support
@@ -2713,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2713 3439
2714static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3440static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2715{ 3441{
2716 struct irq_cfg *cfg = irq_cfg + irq; 3442 struct irq_cfg *cfg;
2717 unsigned int dest; 3443 unsigned int dest;
2718 cpumask_t tmp; 3444 cpumask_t tmp;
3445 struct irq_desc *desc;
2719 3446
2720 cpus_and(tmp, mask, cpu_online_map); 3447 cpus_and(tmp, mask, cpu_online_map);
2721 if (cpus_empty(tmp)) 3448 if (cpus_empty(tmp))
@@ -2724,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2724 if (assign_irq_vector(irq, mask)) 3451 if (assign_irq_vector(irq, mask))
2725 return; 3452 return;
2726 3453
3454 cfg = irq_cfg(irq);
2727 cpus_and(tmp, cfg->domain, mask); 3455 cpus_and(tmp, cfg->domain, mask);
2728 dest = cpu_mask_to_apicid(tmp); 3456 dest = cpu_mask_to_apicid(tmp);
2729 3457
2730 target_ht_irq(irq, dest, cfg->vector); 3458 target_ht_irq(irq, dest, cfg->vector);
2731 irq_desc[irq].affinity = mask; 3459 desc = irq_to_desc(irq);
3460 desc->affinity = mask;
2732} 3461}
2733#endif 3462#endif
2734 3463
@@ -2745,7 +3474,7 @@ static struct irq_chip ht_irq_chip = {
2745 3474
2746int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2747{ 3476{
2748 struct irq_cfg *cfg = irq_cfg + irq; 3477 struct irq_cfg *cfg;
2749 int err; 3478 int err;
2750 cpumask_t tmp; 3479 cpumask_t tmp;
2751 3480
@@ -2755,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2755 struct ht_irq_msg msg; 3484 struct ht_irq_msg msg;
2756 unsigned dest; 3485 unsigned dest;
2757 3486
3487 cfg = irq_cfg(irq);
2758 cpus_and(tmp, cfg->domain, tmp); 3488 cpus_and(tmp, cfg->domain, tmp);
2759 dest = cpu_mask_to_apicid(tmp); 3489 dest = cpu_mask_to_apicid(tmp);
2760 3490
@@ -2777,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2777 3507
2778 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3508 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2779 handle_edge_irq, "edge"); 3509 handle_edge_irq, "edge");
3510
3511 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
2780 } 3512 }
2781 return err; 3513 return err;
2782} 3514}
2783#endif /* CONFIG_HT_IRQ */ 3515#endif /* CONFIG_HT_IRQ */
2784 3516
3517#ifdef CONFIG_X86_64
3518/*
3519 * Re-target the irq to the specified CPU and enable the specified MMR located
3520 * on the specified blade to allow the sending of MSIs to the specified CPU.
3521 */
3522int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3523 unsigned long mmr_offset)
3524{
3525 const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
3526 struct irq_cfg *cfg;
3527 int mmr_pnode;
3528 unsigned long mmr_value;
3529 struct uv_IO_APIC_route_entry *entry;
3530 unsigned long flags;
3531 int err;
3532
3533 err = assign_irq_vector(irq, *eligible_cpu);
3534 if (err != 0)
3535 return err;
3536
3537 spin_lock_irqsave(&vector_lock, flags);
3538 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3539 irq_name);
3540 spin_unlock_irqrestore(&vector_lock, flags);
3541
3542 cfg = irq_cfg(irq);
3543
3544 mmr_value = 0;
3545 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3546 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3547
3548 entry->vector = cfg->vector;
3549 entry->delivery_mode = INT_DELIVERY_MODE;
3550 entry->dest_mode = INT_DEST_MODE;
3551 entry->polarity = 0;
3552 entry->trigger = 0;
3553 entry->mask = 0;
3554 entry->dest = cpu_mask_to_apicid(*eligible_cpu);
3555
3556 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3557 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3558
3559 return irq;
3560}
3561
3562/*
3563 * Disable the specified MMR located on the specified blade so that MSIs are
3564 * longer allowed to be sent.
3565 */
3566void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3567{
3568 unsigned long mmr_value;
3569 struct uv_IO_APIC_route_entry *entry;
3570 int mmr_pnode;
3571
3572 mmr_value = 0;
3573 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3574 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3575
3576 entry->mask = 1;
3577
3578 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3579 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3580}
3581#endif /* CONFIG_X86_64 */
3582
3583int __init io_apic_get_redir_entries (int ioapic)
3584{
3585 union IO_APIC_reg_01 reg_01;
3586 unsigned long flags;
3587
3588 spin_lock_irqsave(&ioapic_lock, flags);
3589 reg_01.raw = io_apic_read(ioapic, 1);
3590 spin_unlock_irqrestore(&ioapic_lock, flags);
3591
3592 return reg_01.bits.entries;
3593}
3594
3595int __init probe_nr_irqs(void)
3596{
3597 int idx;
3598 int nr = 0;
3599#ifndef CONFIG_XEN
3600 int nr_min = 32;
3601#else
3602 int nr_min = NR_IRQS;
3603#endif
3604
3605 for (idx = 0; idx < nr_ioapics; idx++)
3606 nr += io_apic_get_redir_entries(idx) + 1;
3607
3608 /* double it for hotplug and msi and nmi */
3609 nr <<= 1;
3610
3611 /* something wrong ? */
3612 if (nr < nr_min)
3613 nr = nr_min;
3614
3615 return nr;
3616}
3617
2785/* -------------------------------------------------------------------------- 3618/* --------------------------------------------------------------------------
2786 ACPI-based IOAPIC Configuration 3619 ACPI-based IOAPIC Configuration
2787 -------------------------------------------------------------------------- */ 3620 -------------------------------------------------------------------------- */
2788 3621
2789#ifdef CONFIG_ACPI 3622#ifdef CONFIG_ACPI
2790 3623
2791#define IO_APIC_MAX_ID 0xFE 3624#ifdef CONFIG_X86_32
3625int __init io_apic_get_unique_id(int ioapic, int apic_id)
3626{
3627 union IO_APIC_reg_00 reg_00;
3628 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
3629 physid_mask_t tmp;
3630 unsigned long flags;
3631 int i = 0;
2792 3632
2793int __init io_apic_get_redir_entries (int ioapic) 3633 /*
3634 * The P4 platform supports up to 256 APIC IDs on two separate APIC
3635 * buses (one for LAPICs, one for IOAPICs), where predecessors only
3636 * supports up to 16 on one shared APIC bus.
3637 *
3638 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
3639 * advantage of new APIC bus architecture.
3640 */
3641
3642 if (physids_empty(apic_id_map))
3643 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
3644
3645 spin_lock_irqsave(&ioapic_lock, flags);
3646 reg_00.raw = io_apic_read(ioapic, 0);
3647 spin_unlock_irqrestore(&ioapic_lock, flags);
3648
3649 if (apic_id >= get_physical_broadcast()) {
3650 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
3651 "%d\n", ioapic, apic_id, reg_00.bits.ID);
3652 apic_id = reg_00.bits.ID;
3653 }
3654
3655 /*
3656 * Every APIC in a system must have a unique ID or we get lots of nice
3657 * 'stuck on smp_invalidate_needed IPI wait' messages.
3658 */
3659 if (check_apicid_used(apic_id_map, apic_id)) {
3660
3661 for (i = 0; i < get_physical_broadcast(); i++) {
3662 if (!check_apicid_used(apic_id_map, i))
3663 break;
3664 }
3665
3666 if (i == get_physical_broadcast())
3667 panic("Max apic_id exceeded!\n");
3668
3669 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
3670 "trying %d\n", ioapic, apic_id, i);
3671
3672 apic_id = i;
3673 }
3674
3675 tmp = apicid_to_cpu_present(apic_id);
3676 physids_or(apic_id_map, apic_id_map, tmp);
3677
3678 if (reg_00.bits.ID != apic_id) {
3679 reg_00.bits.ID = apic_id;
3680
3681 spin_lock_irqsave(&ioapic_lock, flags);
3682 io_apic_write(ioapic, 0, reg_00.raw);
3683 reg_00.raw = io_apic_read(ioapic, 0);
3684 spin_unlock_irqrestore(&ioapic_lock, flags);
3685
3686 /* Sanity check */
3687 if (reg_00.bits.ID != apic_id) {
3688 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
3689 return -1;
3690 }
3691 }
3692
3693 apic_printk(APIC_VERBOSE, KERN_INFO
3694 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
3695
3696 return apic_id;
3697}
3698
3699int __init io_apic_get_version(int ioapic)
2794{ 3700{
2795 union IO_APIC_reg_01 reg_01; 3701 union IO_APIC_reg_01 reg_01;
2796 unsigned long flags; 3702 unsigned long flags;
@@ -2799,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic)
2799 reg_01.raw = io_apic_read(ioapic, 1); 3705 reg_01.raw = io_apic_read(ioapic, 1);
2800 spin_unlock_irqrestore(&ioapic_lock, flags); 3706 spin_unlock_irqrestore(&ioapic_lock, flags);
2801 3707
2802 return reg_01.bits.entries; 3708 return reg_01.bits.version;
2803} 3709}
2804 3710#endif
2805 3711
2806int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3712int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
2807{ 3713{
@@ -2853,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2853void __init setup_ioapic_dest(void) 3759void __init setup_ioapic_dest(void)
2854{ 3760{
2855 int pin, ioapic, irq, irq_entry; 3761 int pin, ioapic, irq, irq_entry;
3762 struct irq_cfg *cfg;
2856 3763
2857 if (skip_ioapic_setup == 1) 3764 if (skip_ioapic_setup == 1)
2858 return; 3765 return;
@@ -2868,7 +3775,8 @@ void __init setup_ioapic_dest(void)
2868 * when you have too many devices, because at that time only boot 3775 * when you have too many devices, because at that time only boot
2869 * cpu is online. 3776 * cpu is online.
2870 */ 3777 */
2871 if (!irq_cfg[irq].vector) 3778 cfg = irq_cfg(irq);
3779 if (!cfg->vector)
2872 setup_IO_APIC_irq(ioapic, pin, irq, 3780 setup_IO_APIC_irq(ioapic, pin, irq,
2873 irq_trigger(irq_entry), 3781 irq_trigger(irq_entry),
2874 irq_polarity(irq_entry)); 3782 irq_polarity(irq_entry));
@@ -2926,18 +3834,33 @@ void __init ioapic_init_mappings(void)
2926 struct resource *ioapic_res; 3834 struct resource *ioapic_res;
2927 int i; 3835 int i;
2928 3836
3837 irq_2_pin_init();
2929 ioapic_res = ioapic_setup_resources(); 3838 ioapic_res = ioapic_setup_resources();
2930 for (i = 0; i < nr_ioapics; i++) { 3839 for (i = 0; i < nr_ioapics; i++) {
2931 if (smp_found_config) { 3840 if (smp_found_config) {
2932 ioapic_phys = mp_ioapics[i].mp_apicaddr; 3841 ioapic_phys = mp_ioapics[i].mp_apicaddr;
3842#ifdef CONFIG_X86_32
3843 if (!ioapic_phys) {
3844 printk(KERN_ERR
3845 "WARNING: bogus zero IO-APIC "
3846 "address found in MPTABLE, "
3847 "disabling IO/APIC support!\n");
3848 smp_found_config = 0;
3849 skip_ioapic_setup = 1;
3850 goto fake_ioapic_page;
3851 }
3852#endif
2933 } else { 3853 } else {
3854#ifdef CONFIG_X86_32
3855fake_ioapic_page:
3856#endif
2934 ioapic_phys = (unsigned long) 3857 ioapic_phys = (unsigned long)
2935 alloc_bootmem_pages(PAGE_SIZE); 3858 alloc_bootmem_pages(PAGE_SIZE);
2936 ioapic_phys = __pa(ioapic_phys); 3859 ioapic_phys = __pa(ioapic_phys);
2937 } 3860 }
2938 set_fixmap_nocache(idx, ioapic_phys); 3861 set_fixmap_nocache(idx, ioapic_phys);
2939 apic_printk(APIC_VERBOSE, 3862 apic_printk(APIC_VERBOSE,
2940 "mapped IOAPIC to %016lx (%016lx)\n", 3863 "mapped IOAPIC to %08lx (%08lx)\n",
2941 __fix_to_virt(idx), ioapic_phys); 3864 __fix_to_virt(idx), ioapic_phys);
2942 idx++; 3865 idx++;
2943 3866
@@ -2971,4 +3894,3 @@ static int __init ioapic_insert_resources(void)
2971/* Insert the IO APIC resources after PCI initialization has occured to handle 3894/* Insert the IO APIC resources after PCI initialization has occured to handle
2972 * IO APICS that are mapped in on a BAR in PCI space. */ 3895 * IO APICS that are mapped in on a BAR in PCI space. */
2973late_initcall(ioapic_insert_resources); 3896late_initcall(ioapic_insert_resources);
2974
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index e710289f673e..000000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2908 +0,0 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/bootmem.h>
29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
31#include <linux/acpi.h>
32#include <linux/module.h>
33#include <linux/sysdev.h>
34#include <linux/pci.h>
35#include <linux/msi.h>
36#include <linux/htirq.h>
37#include <linux/freezer.h>
38#include <linux/kthread.h>
39#include <linux/jiffies.h> /* time_after() */
40
41#include <asm/io.h>
42#include <asm/smp.h>
43#include <asm/desc.h>
44#include <asm/timer.h>
45#include <asm/i8259.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49#include <asm/setup.h>
50
51#include <mach_apic.h>
52#include <mach_apicdef.h>
53
54#define __apicdebuginit(type) static type __init
55
56int (*ioapic_renumber_irq)(int ioapic, int irq);
57atomic_t irq_mis_count;
58
59/* Where if anywhere is the i8259 connect in external int mode */
60static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
61
62static DEFINE_SPINLOCK(ioapic_lock);
63DEFINE_SPINLOCK(vector_lock);
64
65int timer_through_8259 __initdata;
66
67/*
68 * Is the SiS APIC rmw bug present ?
69 * -1 = don't know, 0 = no, 1 = yes
70 */
71int sis_apic_bug = -1;
72
73/*
74 * # of IRQ routing registers
75 */
76int nr_ioapic_registers[MAX_IO_APICS];
77
78/* I/O APIC entries */
79struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
80int nr_ioapics;
81
82/* MP IRQ source entries */
83struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
84
85/* # of MP IRQ source entries */
86int mp_irq_entries;
87
88#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
89int mp_bus_id_to_type[MAX_MP_BUSSES];
90#endif
91
92DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
93
94static int disable_timer_pin_1 __initdata;
95
96/*
97 * Rough estimation of how many shared IRQs there are, can
98 * be changed anytime.
99 */
100#define MAX_PLUS_SHARED_IRQS NR_IRQS
101#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
102
103/*
104 * This is performance-critical, we want to do it O(1)
105 *
106 * the indexing order of this array favors 1:1 mappings
107 * between pins and IRQs.
108 */
109
110static struct irq_pin_list {
111 int apic, pin, next;
112} irq_2_pin[PIN_MAP_SIZE];
113
114struct io_apic {
115 unsigned int index;
116 unsigned int unused[3];
117 unsigned int data;
118};
119
120static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
121{
122 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
123 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
124}
125
126static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
127{
128 struct io_apic __iomem *io_apic = io_apic_base(apic);
129 writel(reg, &io_apic->index);
130 return readl(&io_apic->data);
131}
132
133static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
134{
135 struct io_apic __iomem *io_apic = io_apic_base(apic);
136 writel(reg, &io_apic->index);
137 writel(value, &io_apic->data);
138}
139
140/*
141 * Re-write a value: to be used for read-modify-write
142 * cycles where the read already set up the index register.
143 *
144 * Older SiS APIC requires we rewrite the index register
145 */
146static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
147{
148 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
149 if (sis_apic_bug)
150 writel(reg, &io_apic->index);
151 writel(value, &io_apic->data);
152}
153
154union entry_union {
155 struct { u32 w1, w2; };
156 struct IO_APIC_route_entry entry;
157};
158
159static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
160{
161 union entry_union eu;
162 unsigned long flags;
163 spin_lock_irqsave(&ioapic_lock, flags);
164 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
165 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
166 spin_unlock_irqrestore(&ioapic_lock, flags);
167 return eu.entry;
168}
169
170/*
171 * When we write a new IO APIC routing entry, we need to write the high
172 * word first! If the mask bit in the low word is clear, we will enable
173 * the interrupt, and we need to make sure the entry is fully populated
174 * before that happens.
175 */
176static void
177__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
178{
179 union entry_union eu;
180 eu.entry = e;
181 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
182 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
183}
184
185static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
186{
187 unsigned long flags;
188 spin_lock_irqsave(&ioapic_lock, flags);
189 __ioapic_write_entry(apic, pin, e);
190 spin_unlock_irqrestore(&ioapic_lock, flags);
191}
192
193/*
194 * When we mask an IO APIC routing entry, we need to write the low
195 * word first, in order to set the mask bit before we change the
196 * high bits!
197 */
198static void ioapic_mask_entry(int apic, int pin)
199{
200 unsigned long flags;
201 union entry_union eu = { .entry.mask = 1 };
202
203 spin_lock_irqsave(&ioapic_lock, flags);
204 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
205 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
206 spin_unlock_irqrestore(&ioapic_lock, flags);
207}
208
209/*
210 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
211 * shared ISA-space IRQs, so we have to support them. We are super
212 * fast in the common case, and fast for shared ISA-space IRQs.
213 */
214static void add_pin_to_irq(unsigned int irq, int apic, int pin)
215{
216 static int first_free_entry = NR_IRQS;
217 struct irq_pin_list *entry = irq_2_pin + irq;
218
219 while (entry->next)
220 entry = irq_2_pin + entry->next;
221
222 if (entry->pin != -1) {
223 entry->next = first_free_entry;
224 entry = irq_2_pin + entry->next;
225 if (++first_free_entry >= PIN_MAP_SIZE)
226 panic("io_apic.c: whoops");
227 }
228 entry->apic = apic;
229 entry->pin = pin;
230}
231
232/*
233 * Reroute an IRQ to a different pin.
234 */
235static void __init replace_pin_at_irq(unsigned int irq,
236 int oldapic, int oldpin,
237 int newapic, int newpin)
238{
239 struct irq_pin_list *entry = irq_2_pin + irq;
240
241 while (1) {
242 if (entry->apic == oldapic && entry->pin == oldpin) {
243 entry->apic = newapic;
244 entry->pin = newpin;
245 }
246 if (!entry->next)
247 break;
248 entry = irq_2_pin + entry->next;
249 }
250}
251
252static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
253{
254 struct irq_pin_list *entry = irq_2_pin + irq;
255 unsigned int pin, reg;
256
257 for (;;) {
258 pin = entry->pin;
259 if (pin == -1)
260 break;
261 reg = io_apic_read(entry->apic, 0x10 + pin*2);
262 reg &= ~disable;
263 reg |= enable;
264 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
265 if (!entry->next)
266 break;
267 entry = irq_2_pin + entry->next;
268 }
269}
270
271/* mask = 1 */
272static void __mask_IO_APIC_irq(unsigned int irq)
273{
274 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
275}
276
277/* mask = 0 */
278static void __unmask_IO_APIC_irq(unsigned int irq)
279{
280 __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
281}
282
283/* mask = 1, trigger = 0 */
284static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
285{
286 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
287 IO_APIC_REDIR_LEVEL_TRIGGER);
288}
289
290/* mask = 0, trigger = 1 */
291static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
292{
293 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
294 IO_APIC_REDIR_MASKED);
295}
296
297static void mask_IO_APIC_irq(unsigned int irq)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&ioapic_lock, flags);
302 __mask_IO_APIC_irq(irq);
303 spin_unlock_irqrestore(&ioapic_lock, flags);
304}
305
306static void unmask_IO_APIC_irq(unsigned int irq)
307{
308 unsigned long flags;
309
310 spin_lock_irqsave(&ioapic_lock, flags);
311 __unmask_IO_APIC_irq(irq);
312 spin_unlock_irqrestore(&ioapic_lock, flags);
313}
314
315static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
316{
317 struct IO_APIC_route_entry entry;
318
319 /* Check delivery_mode to be sure we're not clearing an SMI pin */
320 entry = ioapic_read_entry(apic, pin);
321 if (entry.delivery_mode == dest_SMI)
322 return;
323
324 /*
325 * Disable it in the IO-APIC irq-routing table:
326 */
327 ioapic_mask_entry(apic, pin);
328}
329
330static void clear_IO_APIC(void)
331{
332 int apic, pin;
333
334 for (apic = 0; apic < nr_ioapics; apic++)
335 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
336 clear_IO_APIC_pin(apic, pin);
337}
338
339#ifdef CONFIG_SMP
340static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
341{
342 unsigned long flags;
343 int pin;
344 struct irq_pin_list *entry = irq_2_pin + irq;
345 unsigned int apicid_value;
346 cpumask_t tmp;
347
348 cpus_and(tmp, cpumask, cpu_online_map);
349 if (cpus_empty(tmp))
350 tmp = TARGET_CPUS;
351
352 cpus_and(cpumask, tmp, CPU_MASK_ALL);
353
354 apicid_value = cpu_mask_to_apicid(cpumask);
355 /* Prepare to do the io_apic_write */
356 apicid_value = apicid_value << 24;
357 spin_lock_irqsave(&ioapic_lock, flags);
358 for (;;) {
359 pin = entry->pin;
360 if (pin == -1)
361 break;
362 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
363 if (!entry->next)
364 break;
365 entry = irq_2_pin + entry->next;
366 }
367 irq_desc[irq].affinity = cpumask;
368 spin_unlock_irqrestore(&ioapic_lock, flags);
369}
370
371#if defined(CONFIG_IRQBALANCE)
372# include <asm/processor.h> /* kernel_thread() */
373# include <linux/kernel_stat.h> /* kstat */
374# include <linux/slab.h> /* kmalloc() */
375# include <linux/timer.h>
376
377#define IRQBALANCE_CHECK_ARCH -999
378#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
379#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
380#define BALANCED_IRQ_MORE_DELTA (HZ/10)
381#define BALANCED_IRQ_LESS_DELTA (HZ)
382
383static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
384static int physical_balance __read_mostly;
385static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
386
387static struct irq_cpu_info {
388 unsigned long *last_irq;
389 unsigned long *irq_delta;
390 unsigned long irq;
391} irq_cpu_data[NR_CPUS];
392
393#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
394#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
395#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
396
397#define IDLE_ENOUGH(cpu,now) \
398 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
399
400#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
401
402#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
403
404static cpumask_t balance_irq_affinity[NR_IRQS] = {
405 [0 ... NR_IRQS-1] = CPU_MASK_ALL
406};
407
408void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
409{
410 balance_irq_affinity[irq] = mask;
411}
412
413static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
414 unsigned long now, int direction)
415{
416 int search_idle = 1;
417 int cpu = curr_cpu;
418
419 goto inside;
420
421 do {
422 if (unlikely(cpu == curr_cpu))
423 search_idle = 0;
424inside:
425 if (direction == 1) {
426 cpu++;
427 if (cpu >= NR_CPUS)
428 cpu = 0;
429 } else {
430 cpu--;
431 if (cpu == -1)
432 cpu = NR_CPUS-1;
433 }
434 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
435 (search_idle && !IDLE_ENOUGH(cpu, now)));
436
437 return cpu;
438}
439
440static inline void balance_irq(int cpu, int irq)
441{
442 unsigned long now = jiffies;
443 cpumask_t allowed_mask;
444 unsigned int new_cpu;
445
446 if (irqbalance_disabled)
447 return;
448
449 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
450 new_cpu = move(cpu, allowed_mask, now, 1);
451 if (cpu != new_cpu)
452 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
453}
454
455static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
456{
457 int i, j;
458
459 for_each_online_cpu(i) {
460 for (j = 0; j < NR_IRQS; j++) {
461 if (!irq_desc[j].action)
462 continue;
463 /* Is it a significant load ? */
464 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
465 useful_load_threshold)
466 continue;
467 balance_irq(i, j);
468 }
469 }
470 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
471 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
472 return;
473}
474
475static void do_irq_balance(void)
476{
477 int i, j;
478 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
479 unsigned long move_this_load = 0;
480 int max_loaded = 0, min_loaded = 0;
481 int load;
482 unsigned long useful_load_threshold = balanced_irq_interval + 10;
483 int selected_irq;
484 int tmp_loaded, first_attempt = 1;
485 unsigned long tmp_cpu_irq;
486 unsigned long imbalance = 0;
487 cpumask_t allowed_mask, target_cpu_mask, tmp;
488
489 for_each_possible_cpu(i) {
490 int package_index;
491 CPU_IRQ(i) = 0;
492 if (!cpu_online(i))
493 continue;
494 package_index = CPU_TO_PACKAGEINDEX(i);
495 for (j = 0; j < NR_IRQS; j++) {
496 unsigned long value_now, delta;
497 /* Is this an active IRQ or balancing disabled ? */
498 if (!irq_desc[j].action || irq_balancing_disabled(j))
499 continue;
500 if (package_index == i)
501 IRQ_DELTA(package_index, j) = 0;
502 /* Determine the total count per processor per IRQ */
503 value_now = (unsigned long) kstat_cpu(i).irqs[j];
504
505 /* Determine the activity per processor per IRQ */
506 delta = value_now - LAST_CPU_IRQ(i, j);
507
508 /* Update last_cpu_irq[][] for the next time */
509 LAST_CPU_IRQ(i, j) = value_now;
510
511 /* Ignore IRQs whose rate is less than the clock */
512 if (delta < useful_load_threshold)
513 continue;
514 /* update the load for the processor or package total */
515 IRQ_DELTA(package_index, j) += delta;
516
517 /* Keep track of the higher numbered sibling as well */
518 if (i != package_index)
519 CPU_IRQ(i) += delta;
520 /*
521 * We have sibling A and sibling B in the package
522 *
523 * cpu_irq[A] = load for cpu A + load for cpu B
524 * cpu_irq[B] = load for cpu B
525 */
526 CPU_IRQ(package_index) += delta;
527 }
528 }
529 /* Find the least loaded processor package */
530 for_each_online_cpu(i) {
531 if (i != CPU_TO_PACKAGEINDEX(i))
532 continue;
533 if (min_cpu_irq > CPU_IRQ(i)) {
534 min_cpu_irq = CPU_IRQ(i);
535 min_loaded = i;
536 }
537 }
538 max_cpu_irq = ULONG_MAX;
539
540tryanothercpu:
541 /*
542 * Look for heaviest loaded processor.
543 * We may come back to get the next heaviest loaded processor.
544 * Skip processors with trivial loads.
545 */
546 tmp_cpu_irq = 0;
547 tmp_loaded = -1;
548 for_each_online_cpu(i) {
549 if (i != CPU_TO_PACKAGEINDEX(i))
550 continue;
551 if (max_cpu_irq <= CPU_IRQ(i))
552 continue;
553 if (tmp_cpu_irq < CPU_IRQ(i)) {
554 tmp_cpu_irq = CPU_IRQ(i);
555 tmp_loaded = i;
556 }
557 }
558
559 if (tmp_loaded == -1) {
560 /*
561 * In the case of small number of heavy interrupt sources,
562 * loading some of the cpus too much. We use Ingo's original
563 * approach to rotate them around.
564 */
565 if (!first_attempt && imbalance >= useful_load_threshold) {
566 rotate_irqs_among_cpus(useful_load_threshold);
567 return;
568 }
569 goto not_worth_the_effort;
570 }
571
572 first_attempt = 0; /* heaviest search */
573 max_cpu_irq = tmp_cpu_irq; /* load */
574 max_loaded = tmp_loaded; /* processor */
575 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
576
577 /*
578 * if imbalance is less than approx 10% of max load, then
579 * observe diminishing returns action. - quit
580 */
581 if (imbalance < (max_cpu_irq >> 3))
582 goto not_worth_the_effort;
583
584tryanotherirq:
585 /* if we select an IRQ to move that can't go where we want, then
586 * see if there is another one to try.
587 */
588 move_this_load = 0;
589 selected_irq = -1;
590 for (j = 0; j < NR_IRQS; j++) {
591 /* Is this an active IRQ? */
592 if (!irq_desc[j].action)
593 continue;
594 if (imbalance <= IRQ_DELTA(max_loaded, j))
595 continue;
596 /* Try to find the IRQ that is closest to the imbalance
597 * without going over.
598 */
599 if (move_this_load < IRQ_DELTA(max_loaded, j)) {
600 move_this_load = IRQ_DELTA(max_loaded, j);
601 selected_irq = j;
602 }
603 }
604 if (selected_irq == -1)
605 goto tryanothercpu;
606
607 imbalance = move_this_load;
608
609 /* For physical_balance case, we accumulated both load
610 * values in the one of the siblings cpu_irq[],
611 * to use the same code for physical and logical processors
612 * as much as possible.
613 *
614 * NOTE: the cpu_irq[] array holds the sum of the load for
615 * sibling A and sibling B in the slot for the lowest numbered
616 * sibling (A), _AND_ the load for sibling B in the slot for
617 * the higher numbered sibling.
618 *
619 * We seek the least loaded sibling by making the comparison
620 * (A+B)/2 vs B
621 */
622 load = CPU_IRQ(min_loaded) >> 1;
623 for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
624 if (load > CPU_IRQ(j)) {
625 /* This won't change cpu_sibling_map[min_loaded] */
626 load = CPU_IRQ(j);
627 min_loaded = j;
628 }
629 }
630
631 cpus_and(allowed_mask,
632 cpu_online_map,
633 balance_irq_affinity[selected_irq]);
634 target_cpu_mask = cpumask_of_cpu(min_loaded);
635 cpus_and(tmp, target_cpu_mask, allowed_mask);
636
637 if (!cpus_empty(tmp)) {
638 /* mark for change destination */
639 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
640
641 /* Since we made a change, come back sooner to
642 * check for more variation.
643 */
644 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
645 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
646 return;
647 }
648 goto tryanotherirq;
649
650not_worth_the_effort:
651 /*
652 * if we did not find an IRQ to move, then adjust the time interval
653 * upward
654 */
655 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
656 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
657 return;
658}
659
660static int balanced_irq(void *unused)
661{
662 int i;
663 unsigned long prev_balance_time = jiffies;
664 long time_remaining = balanced_irq_interval;
665
666 /* push everything to CPU 0 to give us a starting point. */
667 for (i = 0 ; i < NR_IRQS ; i++) {
668 irq_desc[i].pending_mask = cpumask_of_cpu(0);
669 set_pending_irq(i, cpumask_of_cpu(0));
670 }
671
672 set_freezable();
673 for ( ; ; ) {
674 time_remaining = schedule_timeout_interruptible(time_remaining);
675 try_to_freeze();
676 if (time_after(jiffies,
677 prev_balance_time+balanced_irq_interval)) {
678 preempt_disable();
679 do_irq_balance();
680 prev_balance_time = jiffies;
681 time_remaining = balanced_irq_interval;
682 preempt_enable();
683 }
684 }
685 return 0;
686}
687
688static int __init balanced_irq_init(void)
689{
690 int i;
691 struct cpuinfo_x86 *c;
692 cpumask_t tmp;
693
694 cpus_shift_right(tmp, cpu_online_map, 2);
695 c = &boot_cpu_data;
696 /* When not overwritten by the command line ask subarchitecture. */
697 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
698 irqbalance_disabled = NO_BALANCE_IRQ;
699 if (irqbalance_disabled)
700 return 0;
701
702 /* disable irqbalance completely if there is only one processor online */
703 if (num_online_cpus() < 2) {
704 irqbalance_disabled = 1;
705 return 0;
706 }
707 /*
708 * Enable physical balance only if more than 1 physical processor
709 * is present
710 */
711 if (smp_num_siblings > 1 && !cpus_empty(tmp))
712 physical_balance = 1;
713
714 for_each_online_cpu(i) {
715 irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
716 irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
717 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
718 printk(KERN_ERR "balanced_irq_init: out of memory");
719 goto failed;
720 }
721 }
722
723 printk(KERN_INFO "Starting balanced_irq\n");
724 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
725 return 0;
726 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
727failed:
728 for_each_possible_cpu(i) {
729 kfree(irq_cpu_data[i].irq_delta);
730 irq_cpu_data[i].irq_delta = NULL;
731 kfree(irq_cpu_data[i].last_irq);
732 irq_cpu_data[i].last_irq = NULL;
733 }
734 return 0;
735}
736
737int __devinit irqbalance_disable(char *str)
738{
739 irqbalance_disabled = 1;
740 return 1;
741}
742
743__setup("noirqbalance", irqbalance_disable);
744
745late_initcall(balanced_irq_init);
746#endif /* CONFIG_IRQBALANCE */
747#endif /* CONFIG_SMP */
748
749#ifndef CONFIG_SMP
750void send_IPI_self(int vector)
751{
752 unsigned int cfg;
753
754 /*
755 * Wait for idle.
756 */
757 apic_wait_icr_idle();
758 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
759 /*
760 * Send the IPI. The write to APIC_ICR fires this off.
761 */
762 apic_write(APIC_ICR, cfg);
763}
764#endif /* !CONFIG_SMP */
765
766
767/*
768 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
769 * specific CPU-side IRQs.
770 */
771
772#define MAX_PIRQS 8
773static int pirq_entries [MAX_PIRQS];
774static int pirqs_enabled;
775int skip_ioapic_setup;
776
777static int __init ioapic_pirq_setup(char *str)
778{
779 int i, max;
780 int ints[MAX_PIRQS+1];
781
782 get_options(str, ARRAY_SIZE(ints), ints);
783
784 for (i = 0; i < MAX_PIRQS; i++)
785 pirq_entries[i] = -1;
786
787 pirqs_enabled = 1;
788 apic_printk(APIC_VERBOSE, KERN_INFO
789 "PIRQ redirection, working around broken MP-BIOS.\n");
790 max = MAX_PIRQS;
791 if (ints[0] < MAX_PIRQS)
792 max = ints[0];
793
794 for (i = 0; i < max; i++) {
795 apic_printk(APIC_VERBOSE, KERN_DEBUG
796 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
797 /*
798 * PIRQs are mapped upside down, usually.
799 */
800 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
801 }
802 return 1;
803}
804
805__setup("pirq=", ioapic_pirq_setup);
806
807/*
808 * Find the IRQ entry number of a certain pin.
809 */
810static int find_irq_entry(int apic, int pin, int type)
811{
812 int i;
813
814 for (i = 0; i < mp_irq_entries; i++)
815 if (mp_irqs[i].mp_irqtype == type &&
816 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
817 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
818 mp_irqs[i].mp_dstirq == pin)
819 return i;
820
821 return -1;
822}
823
824/*
825 * Find the pin to which IRQ[irq] (ISA) is connected
826 */
827static int __init find_isa_irq_pin(int irq, int type)
828{
829 int i;
830
831 for (i = 0; i < mp_irq_entries; i++) {
832 int lbus = mp_irqs[i].mp_srcbus;
833
834 if (test_bit(lbus, mp_bus_not_pci) &&
835 (mp_irqs[i].mp_irqtype == type) &&
836 (mp_irqs[i].mp_srcbusirq == irq))
837
838 return mp_irqs[i].mp_dstirq;
839 }
840 return -1;
841}
842
843static int __init find_isa_irq_apic(int irq, int type)
844{
845 int i;
846
847 for (i = 0; i < mp_irq_entries; i++) {
848 int lbus = mp_irqs[i].mp_srcbus;
849
850 if (test_bit(lbus, mp_bus_not_pci) &&
851 (mp_irqs[i].mp_irqtype == type) &&
852 (mp_irqs[i].mp_srcbusirq == irq))
853 break;
854 }
855 if (i < mp_irq_entries) {
856 int apic;
857 for (apic = 0; apic < nr_ioapics; apic++) {
858 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
859 return apic;
860 }
861 }
862
863 return -1;
864}
865
866/*
867 * Find a specific PCI IRQ entry.
868 * Not an __init, possibly needed by modules
869 */
870static int pin_2_irq(int idx, int apic, int pin);
871
872int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
873{
874 int apic, i, best_guess = -1;
875
876 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
877 "slot:%d, pin:%d.\n", bus, slot, pin);
878 if (test_bit(bus, mp_bus_not_pci)) {
879 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
880 return -1;
881 }
882 for (i = 0; i < mp_irq_entries; i++) {
883 int lbus = mp_irqs[i].mp_srcbus;
884
885 for (apic = 0; apic < nr_ioapics; apic++)
886 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
887 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
888 break;
889
890 if (!test_bit(lbus, mp_bus_not_pci) &&
891 !mp_irqs[i].mp_irqtype &&
892 (bus == lbus) &&
893 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
894 int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
895
896 if (!(apic || IO_APIC_IRQ(irq)))
897 continue;
898
899 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
900 return irq;
901 /*
902 * Use the first all-but-pin matching entry as a
903 * best-guess fuzzy result for broken mptables.
904 */
905 if (best_guess < 0)
906 best_guess = irq;
907 }
908 }
909 return best_guess;
910}
911EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
912
913/*
914 * This function currently is only a helper for the i386 smp boot process where
915 * we need to reprogram the ioredtbls to cater for the cpus which have come online
916 * so mask in all cases should simply be TARGET_CPUS
917 */
918#ifdef CONFIG_SMP
919void __init setup_ioapic_dest(void)
920{
921 int pin, ioapic, irq, irq_entry;
922
923 if (skip_ioapic_setup == 1)
924 return;
925
926 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
927 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
928 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
929 if (irq_entry == -1)
930 continue;
931 irq = pin_2_irq(irq_entry, ioapic, pin);
932 set_ioapic_affinity_irq(irq, TARGET_CPUS);
933 }
934
935 }
936}
937#endif
938
939#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
940/*
941 * EISA Edge/Level control register, ELCR
942 */
943static int EISA_ELCR(unsigned int irq)
944{
945 if (irq < 16) {
946 unsigned int port = 0x4d0 + (irq >> 3);
947 return (inb(port) >> (irq & 7)) & 1;
948 }
949 apic_printk(APIC_VERBOSE, KERN_INFO
950 "Broken MPtable reports ISA irq %d\n", irq);
951 return 0;
952}
953#endif
954
955/* ISA interrupts are always polarity zero edge triggered,
956 * when listed as conforming in the MP table. */
957
958#define default_ISA_trigger(idx) (0)
959#define default_ISA_polarity(idx) (0)
960
961/* EISA interrupts are always polarity zero and can be edge or level
962 * trigger depending on the ELCR value. If an interrupt is listed as
963 * EISA conforming in the MP table, that means its trigger type must
964 * be read in from the ELCR */
965
966#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
967#define default_EISA_polarity(idx) default_ISA_polarity(idx)
968
969/* PCI interrupts are always polarity one level triggered,
970 * when listed as conforming in the MP table. */
971
972#define default_PCI_trigger(idx) (1)
973#define default_PCI_polarity(idx) (1)
974
975/* MCA interrupts are always polarity zero level triggered,
976 * when listed as conforming in the MP table. */
977
978#define default_MCA_trigger(idx) (1)
979#define default_MCA_polarity(idx) default_ISA_polarity(idx)
980
981static int MPBIOS_polarity(int idx)
982{
983 int bus = mp_irqs[idx].mp_srcbus;
984 int polarity;
985
986 /*
987 * Determine IRQ line polarity (high active or low active):
988 */
989 switch (mp_irqs[idx].mp_irqflag & 3) {
990 case 0: /* conforms, ie. bus-type dependent polarity */
991 {
992 polarity = test_bit(bus, mp_bus_not_pci)?
993 default_ISA_polarity(idx):
994 default_PCI_polarity(idx);
995 break;
996 }
997 case 1: /* high active */
998 {
999 polarity = 0;
1000 break;
1001 }
1002 case 2: /* reserved */
1003 {
1004 printk(KERN_WARNING "broken BIOS!!\n");
1005 polarity = 1;
1006 break;
1007 }
1008 case 3: /* low active */
1009 {
1010 polarity = 1;
1011 break;
1012 }
1013 default: /* invalid */
1014 {
1015 printk(KERN_WARNING "broken BIOS!!\n");
1016 polarity = 1;
1017 break;
1018 }
1019 }
1020 return polarity;
1021}
1022
1023static int MPBIOS_trigger(int idx)
1024{
1025 int bus = mp_irqs[idx].mp_srcbus;
1026 int trigger;
1027
1028 /*
1029 * Determine IRQ trigger mode (edge or level sensitive):
1030 */
1031 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
1032 case 0: /* conforms, ie. bus-type dependent */
1033 {
1034 trigger = test_bit(bus, mp_bus_not_pci)?
1035 default_ISA_trigger(idx):
1036 default_PCI_trigger(idx);
1037#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1038 switch (mp_bus_id_to_type[bus]) {
1039 case MP_BUS_ISA: /* ISA pin */
1040 {
1041 /* set before the switch */
1042 break;
1043 }
1044 case MP_BUS_EISA: /* EISA pin */
1045 {
1046 trigger = default_EISA_trigger(idx);
1047 break;
1048 }
1049 case MP_BUS_PCI: /* PCI pin */
1050 {
1051 /* set before the switch */
1052 break;
1053 }
1054 case MP_BUS_MCA: /* MCA pin */
1055 {
1056 trigger = default_MCA_trigger(idx);
1057 break;
1058 }
1059 default:
1060 {
1061 printk(KERN_WARNING "broken BIOS!!\n");
1062 trigger = 1;
1063 break;
1064 }
1065 }
1066#endif
1067 break;
1068 }
1069 case 1: /* edge */
1070 {
1071 trigger = 0;
1072 break;
1073 }
1074 case 2: /* reserved */
1075 {
1076 printk(KERN_WARNING "broken BIOS!!\n");
1077 trigger = 1;
1078 break;
1079 }
1080 case 3: /* level */
1081 {
1082 trigger = 1;
1083 break;
1084 }
1085 default: /* invalid */
1086 {
1087 printk(KERN_WARNING "broken BIOS!!\n");
1088 trigger = 0;
1089 break;
1090 }
1091 }
1092 return trigger;
1093}
1094
1095static inline int irq_polarity(int idx)
1096{
1097 return MPBIOS_polarity(idx);
1098}
1099
1100static inline int irq_trigger(int idx)
1101{
1102 return MPBIOS_trigger(idx);
1103}
1104
1105static int pin_2_irq(int idx, int apic, int pin)
1106{
1107 int irq, i;
1108 int bus = mp_irqs[idx].mp_srcbus;
1109
1110 /*
1111 * Debugging check, we are in big trouble if this message pops up!
1112 */
1113 if (mp_irqs[idx].mp_dstirq != pin)
1114 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1115
1116 if (test_bit(bus, mp_bus_not_pci))
1117 irq = mp_irqs[idx].mp_srcbusirq;
1118 else {
1119 /*
1120 * PCI IRQs are mapped in order
1121 */
1122 i = irq = 0;
1123 while (i < apic)
1124 irq += nr_ioapic_registers[i++];
1125 irq += pin;
1126
1127 /*
1128 * For MPS mode, so far only needed by ES7000 platform
1129 */
1130 if (ioapic_renumber_irq)
1131 irq = ioapic_renumber_irq(apic, irq);
1132 }
1133
1134 /*
1135 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1136 */
1137 if ((pin >= 16) && (pin <= 23)) {
1138 if (pirq_entries[pin-16] != -1) {
1139 if (!pirq_entries[pin-16]) {
1140 apic_printk(APIC_VERBOSE, KERN_DEBUG
1141 "disabling PIRQ%d\n", pin-16);
1142 } else {
1143 irq = pirq_entries[pin-16];
1144 apic_printk(APIC_VERBOSE, KERN_DEBUG
1145 "using PIRQ%d -> IRQ %d\n",
1146 pin-16, irq);
1147 }
1148 }
1149 }
1150 return irq;
1151}
1152
1153static inline int IO_APIC_irq_trigger(int irq)
1154{
1155 int apic, idx, pin;
1156
1157 for (apic = 0; apic < nr_ioapics; apic++) {
1158 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1159 idx = find_irq_entry(apic, pin, mp_INT);
1160 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1161 return irq_trigger(idx);
1162 }
1163 }
1164 /*
1165 * nonexistent IRQs are edge default
1166 */
1167 return 0;
1168}
1169
1170/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1171static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1172
1173static int __assign_irq_vector(int irq)
1174{
1175 static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
1176 int vector, offset;
1177
1178 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1179
1180 if (irq_vector[irq] > 0)
1181 return irq_vector[irq];
1182
1183 vector = current_vector;
1184 offset = current_offset;
1185next:
1186 vector += 8;
1187 if (vector >= first_system_vector) {
1188 offset = (offset + 1) % 8;
1189 vector = FIRST_DEVICE_VECTOR + offset;
1190 }
1191 if (vector == current_vector)
1192 return -ENOSPC;
1193 if (test_and_set_bit(vector, used_vectors))
1194 goto next;
1195
1196 current_vector = vector;
1197 current_offset = offset;
1198 irq_vector[irq] = vector;
1199
1200 return vector;
1201}
1202
1203static int assign_irq_vector(int irq)
1204{
1205 unsigned long flags;
1206 int vector;
1207
1208 spin_lock_irqsave(&vector_lock, flags);
1209 vector = __assign_irq_vector(irq);
1210 spin_unlock_irqrestore(&vector_lock, flags);
1211
1212 return vector;
1213}
1214
1215static struct irq_chip ioapic_chip;
1216
1217#define IOAPIC_AUTO -1
1218#define IOAPIC_EDGE 0
1219#define IOAPIC_LEVEL 1
1220
1221static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1222{
1223 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1224 trigger == IOAPIC_LEVEL) {
1225 irq_desc[irq].status |= IRQ_LEVEL;
1226 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1227 handle_fasteoi_irq, "fasteoi");
1228 } else {
1229 irq_desc[irq].status &= ~IRQ_LEVEL;
1230 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1231 handle_edge_irq, "edge");
1232 }
1233 set_intr_gate(vector, interrupt[irq]);
1234}
1235
1236static void __init setup_IO_APIC_irqs(void)
1237{
1238 struct IO_APIC_route_entry entry;
1239 int apic, pin, idx, irq, first_notcon = 1, vector;
1240
1241 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1242
1243 for (apic = 0; apic < nr_ioapics; apic++) {
1244 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1245
1246 /*
1247 * add it to the IO-APIC irq-routing table:
1248 */
1249 memset(&entry, 0, sizeof(entry));
1250
1251 entry.delivery_mode = INT_DELIVERY_MODE;
1252 entry.dest_mode = INT_DEST_MODE;
1253 entry.mask = 0; /* enable IRQ */
1254 entry.dest.logical.logical_dest =
1255 cpu_mask_to_apicid(TARGET_CPUS);
1256
1257 idx = find_irq_entry(apic, pin, mp_INT);
1258 if (idx == -1) {
1259 if (first_notcon) {
1260 apic_printk(APIC_VERBOSE, KERN_DEBUG
1261 " IO-APIC (apicid-pin) %d-%d",
1262 mp_ioapics[apic].mp_apicid,
1263 pin);
1264 first_notcon = 0;
1265 } else
1266 apic_printk(APIC_VERBOSE, ", %d-%d",
1267 mp_ioapics[apic].mp_apicid, pin);
1268 continue;
1269 }
1270
1271 if (!first_notcon) {
1272 apic_printk(APIC_VERBOSE, " not connected.\n");
1273 first_notcon = 1;
1274 }
1275
1276 entry.trigger = irq_trigger(idx);
1277 entry.polarity = irq_polarity(idx);
1278
1279 if (irq_trigger(idx)) {
1280 entry.trigger = 1;
1281 entry.mask = 1;
1282 }
1283
1284 irq = pin_2_irq(idx, apic, pin);
1285 /*
1286 * skip adding the timer int on secondary nodes, which causes
1287 * a small but painful rift in the time-space continuum
1288 */
1289 if (multi_timer_check(apic, irq))
1290 continue;
1291 else
1292 add_pin_to_irq(irq, apic, pin);
1293
1294 if (!apic && !IO_APIC_IRQ(irq))
1295 continue;
1296
1297 if (IO_APIC_IRQ(irq)) {
1298 vector = assign_irq_vector(irq);
1299 entry.vector = vector;
1300 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1301
1302 if (!apic && (irq < 16))
1303 disable_8259A_irq(irq);
1304 }
1305 ioapic_write_entry(apic, pin, entry);
1306 }
1307 }
1308
1309 if (!first_notcon)
1310 apic_printk(APIC_VERBOSE, " not connected.\n");
1311}
1312
1313/*
1314 * Set up the timer pin, possibly with the 8259A-master behind.
1315 */
1316static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1317 int vector)
1318{
1319 struct IO_APIC_route_entry entry;
1320
1321 memset(&entry, 0, sizeof(entry));
1322
1323 /*
1324 * We use logical delivery to get the timer IRQ
1325 * to the first CPU.
1326 */
1327 entry.dest_mode = INT_DEST_MODE;
1328 entry.mask = 1; /* mask IRQ now */
1329 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1330 entry.delivery_mode = INT_DELIVERY_MODE;
1331 entry.polarity = 0;
1332 entry.trigger = 0;
1333 entry.vector = vector;
1334
1335 /*
1336 * The timer IRQ doesn't have to know that behind the
1337 * scene we may have a 8259A-master in AEOI mode ...
1338 */
1339 ioapic_register_intr(0, vector, IOAPIC_EDGE);
1340
1341 /*
1342 * Add it to the IO-APIC irq-routing table:
1343 */
1344 ioapic_write_entry(apic, pin, entry);
1345}
1346
1347
1348__apicdebuginit(void) print_IO_APIC(void)
1349{
1350 int apic, i;
1351 union IO_APIC_reg_00 reg_00;
1352 union IO_APIC_reg_01 reg_01;
1353 union IO_APIC_reg_02 reg_02;
1354 union IO_APIC_reg_03 reg_03;
1355 unsigned long flags;
1356
1357 if (apic_verbosity == APIC_QUIET)
1358 return;
1359
1360 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1361 for (i = 0; i < nr_ioapics; i++)
1362 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1363 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1364
1365 /*
1366 * We are a bit conservative about what we expect. We have to
1367 * know about every hardware change ASAP.
1368 */
1369 printk(KERN_INFO "testing the IO APIC.......................\n");
1370
1371 for (apic = 0; apic < nr_ioapics; apic++) {
1372
1373 spin_lock_irqsave(&ioapic_lock, flags);
1374 reg_00.raw = io_apic_read(apic, 0);
1375 reg_01.raw = io_apic_read(apic, 1);
1376 if (reg_01.bits.version >= 0x10)
1377 reg_02.raw = io_apic_read(apic, 2);
1378 if (reg_01.bits.version >= 0x20)
1379 reg_03.raw = io_apic_read(apic, 3);
1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1381
1382 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1383 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1384 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1385 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1386 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1387
1388 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1389 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1390
1391 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1392 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1393
1394 /*
1395 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1396 * but the value of reg_02 is read as the previous read register
1397 * value, so ignore it if reg_02 == reg_01.
1398 */
1399 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1400 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1401 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1402 }
1403
1404 /*
1405 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1406 * or reg_03, but the value of reg_0[23] is read as the previous read
1407 * register value, so ignore it if reg_03 == reg_0[12].
1408 */
1409 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1410 reg_03.raw != reg_01.raw) {
1411 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1412 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1413 }
1414
1415 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1416
1417 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1418 " Stat Dest Deli Vect: \n");
1419
1420 for (i = 0; i <= reg_01.bits.entries; i++) {
1421 struct IO_APIC_route_entry entry;
1422
1423 entry = ioapic_read_entry(apic, i);
1424
1425 printk(KERN_DEBUG " %02x %03X %02X ",
1426 i,
1427 entry.dest.logical.logical_dest,
1428 entry.dest.physical.physical_dest
1429 );
1430
1431 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1432 entry.mask,
1433 entry.trigger,
1434 entry.irr,
1435 entry.polarity,
1436 entry.delivery_status,
1437 entry.dest_mode,
1438 entry.delivery_mode,
1439 entry.vector
1440 );
1441 }
1442 }
1443 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1444 for (i = 0; i < NR_IRQS; i++) {
1445 struct irq_pin_list *entry = irq_2_pin + i;
1446 if (entry->pin < 0)
1447 continue;
1448 printk(KERN_DEBUG "IRQ%d ", i);
1449 for (;;) {
1450 printk("-> %d:%d", entry->apic, entry->pin);
1451 if (!entry->next)
1452 break;
1453 entry = irq_2_pin + entry->next;
1454 }
1455 printk("\n");
1456 }
1457
1458 printk(KERN_INFO ".................................... done.\n");
1459
1460 return;
1461}
1462
1463__apicdebuginit(void) print_APIC_bitfield(int base)
1464{
1465 unsigned int v;
1466 int i, j;
1467
1468 if (apic_verbosity == APIC_QUIET)
1469 return;
1470
1471 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1472 for (i = 0; i < 8; i++) {
1473 v = apic_read(base + i*0x10);
1474 for (j = 0; j < 32; j++) {
1475 if (v & (1<<j))
1476 printk("1");
1477 else
1478 printk("0");
1479 }
1480 printk("\n");
1481 }
1482}
1483
1484__apicdebuginit(void) print_local_APIC(void *dummy)
1485{
1486 unsigned int v, ver, maxlvt;
1487 u64 icr;
1488
1489 if (apic_verbosity == APIC_QUIET)
1490 return;
1491
1492 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1493 smp_processor_id(), hard_smp_processor_id());
1494 v = apic_read(APIC_ID);
1495 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1496 GET_APIC_ID(v));
1497 v = apic_read(APIC_LVR);
1498 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1499 ver = GET_APIC_VERSION(v);
1500 maxlvt = lapic_get_maxlvt();
1501
1502 v = apic_read(APIC_TASKPRI);
1503 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1504
1505 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1506 v = apic_read(APIC_ARBPRI);
1507 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1508 v & APIC_ARBPRI_MASK);
1509 v = apic_read(APIC_PROCPRI);
1510 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1511 }
1512
1513 v = apic_read(APIC_EOI);
1514 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1515 v = apic_read(APIC_RRR);
1516 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1517 v = apic_read(APIC_LDR);
1518 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1519 v = apic_read(APIC_DFR);
1520 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1521 v = apic_read(APIC_SPIV);
1522 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1523
1524 printk(KERN_DEBUG "... APIC ISR field:\n");
1525 print_APIC_bitfield(APIC_ISR);
1526 printk(KERN_DEBUG "... APIC TMR field:\n");
1527 print_APIC_bitfield(APIC_TMR);
1528 printk(KERN_DEBUG "... APIC IRR field:\n");
1529 print_APIC_bitfield(APIC_IRR);
1530
1531 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1532 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1533 apic_write(APIC_ESR, 0);
1534 v = apic_read(APIC_ESR);
1535 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1536 }
1537
1538 icr = apic_icr_read();
1539 printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
1540 printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
1541
1542 v = apic_read(APIC_LVTT);
1543 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1544
1545 if (maxlvt > 3) { /* PC is LVT#4. */
1546 v = apic_read(APIC_LVTPC);
1547 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1548 }
1549 v = apic_read(APIC_LVT0);
1550 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1551 v = apic_read(APIC_LVT1);
1552 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1553
1554 if (maxlvt > 2) { /* ERR is LVT#3. */
1555 v = apic_read(APIC_LVTERR);
1556 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1557 }
1558
1559 v = apic_read(APIC_TMICT);
1560 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1561 v = apic_read(APIC_TMCCT);
1562 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1563 v = apic_read(APIC_TDCR);
1564 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1565 printk("\n");
1566}
1567
1568__apicdebuginit(void) print_all_local_APICs(void)
1569{
1570 on_each_cpu(print_local_APIC, NULL, 1);
1571}
1572
1573__apicdebuginit(void) print_PIC(void)
1574{
1575 unsigned int v;
1576 unsigned long flags;
1577
1578 if (apic_verbosity == APIC_QUIET)
1579 return;
1580
1581 printk(KERN_DEBUG "\nprinting PIC contents\n");
1582
1583 spin_lock_irqsave(&i8259A_lock, flags);
1584
1585 v = inb(0xa1) << 8 | inb(0x21);
1586 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1587
1588 v = inb(0xa0) << 8 | inb(0x20);
1589 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1590
1591 outb(0x0b, 0xa0);
1592 outb(0x0b, 0x20);
1593 v = inb(0xa0) << 8 | inb(0x20);
1594 outb(0x0a, 0xa0);
1595 outb(0x0a, 0x20);
1596
1597 spin_unlock_irqrestore(&i8259A_lock, flags);
1598
1599 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1600
1601 v = inb(0x4d1) << 8 | inb(0x4d0);
1602 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1603}
1604
1605__apicdebuginit(int) print_all_ICs(void)
1606{
1607 print_PIC();
1608 print_all_local_APICs();
1609 print_IO_APIC();
1610
1611 return 0;
1612}
1613
1614fs_initcall(print_all_ICs);
1615
1616
1617static void __init enable_IO_APIC(void)
1618{
1619 union IO_APIC_reg_01 reg_01;
1620 int i8259_apic, i8259_pin;
1621 int i, apic;
1622 unsigned long flags;
1623
1624 for (i = 0; i < PIN_MAP_SIZE; i++) {
1625 irq_2_pin[i].pin = -1;
1626 irq_2_pin[i].next = 0;
1627 }
1628 if (!pirqs_enabled)
1629 for (i = 0; i < MAX_PIRQS; i++)
1630 pirq_entries[i] = -1;
1631
1632 /*
1633 * The number of IO-APIC IRQ registers (== #pins):
1634 */
1635 for (apic = 0; apic < nr_ioapics; apic++) {
1636 spin_lock_irqsave(&ioapic_lock, flags);
1637 reg_01.raw = io_apic_read(apic, 1);
1638 spin_unlock_irqrestore(&ioapic_lock, flags);
1639 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1640 }
1641 for (apic = 0; apic < nr_ioapics; apic++) {
1642 int pin;
1643 /* See if any of the pins is in ExtINT mode */
1644 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1645 struct IO_APIC_route_entry entry;
1646 entry = ioapic_read_entry(apic, pin);
1647
1648
1649 /* If the interrupt line is enabled and in ExtInt mode
1650 * I have found the pin where the i8259 is connected.
1651 */
1652 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1653 ioapic_i8259.apic = apic;
1654 ioapic_i8259.pin = pin;
1655 goto found_i8259;
1656 }
1657 }
1658 }
1659 found_i8259:
1660 /* Look to see what if the MP table has reported the ExtINT */
1661 /* If we could not find the appropriate pin by looking at the ioapic
1662 * the i8259 probably is not connected the ioapic but give the
1663 * mptable a chance anyway.
1664 */
1665 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1666 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1667 /* Trust the MP table if nothing is setup in the hardware */
1668 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1669 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1670 ioapic_i8259.pin = i8259_pin;
1671 ioapic_i8259.apic = i8259_apic;
1672 }
1673 /* Complain if the MP table and the hardware disagree */
1674 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1675 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1676 {
1677 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1678 }
1679
1680 /*
1681 * Do not trust the IO-APIC being empty at bootup
1682 */
1683 clear_IO_APIC();
1684}
1685
1686/*
1687 * Not an __init, needed by the reboot code
1688 */
1689void disable_IO_APIC(void)
1690{
1691 /*
1692 * Clear the IO-APIC before rebooting:
1693 */
1694 clear_IO_APIC();
1695
1696 /*
1697 * If the i8259 is routed through an IOAPIC
1698 * Put that IOAPIC in virtual wire mode
1699 * so legacy interrupts can be delivered.
1700 */
1701 if (ioapic_i8259.pin != -1) {
1702 struct IO_APIC_route_entry entry;
1703
1704 memset(&entry, 0, sizeof(entry));
1705 entry.mask = 0; /* Enabled */
1706 entry.trigger = 0; /* Edge */
1707 entry.irr = 0;
1708 entry.polarity = 0; /* High */
1709 entry.delivery_status = 0;
1710 entry.dest_mode = 0; /* Physical */
1711 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1712 entry.vector = 0;
1713 entry.dest.physical.physical_dest = read_apic_id();
1714
1715 /*
1716 * Add it to the IO-APIC irq-routing table:
1717 */
1718 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1719 }
1720 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1721}
1722
1723/*
1724 * function to set the IO-APIC physical IDs based on the
1725 * values stored in the MPC table.
1726 *
1727 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1728 */
1729
1730static void __init setup_ioapic_ids_from_mpc(void)
1731{
1732 union IO_APIC_reg_00 reg_00;
1733 physid_mask_t phys_id_present_map;
1734 int apic;
1735 int i;
1736 unsigned char old_id;
1737 unsigned long flags;
1738
1739 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1740 return;
1741
1742 /*
1743 * Don't check I/O APIC IDs for xAPIC systems. They have
1744 * no meaning without the serial APIC bus.
1745 */
1746 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1747 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1748 return;
1749 /*
1750 * This is broken; anything with a real cpu count has to
1751 * circumvent this idiocy regardless.
1752 */
1753 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1754
1755 /*
1756 * Set the IOAPIC ID to the value stored in the MPC table.
1757 */
1758 for (apic = 0; apic < nr_ioapics; apic++) {
1759
1760 /* Read the register 0 value */
1761 spin_lock_irqsave(&ioapic_lock, flags);
1762 reg_00.raw = io_apic_read(apic, 0);
1763 spin_unlock_irqrestore(&ioapic_lock, flags);
1764
1765 old_id = mp_ioapics[apic].mp_apicid;
1766
1767 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1768 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1769 apic, mp_ioapics[apic].mp_apicid);
1770 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1771 reg_00.bits.ID);
1772 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1773 }
1774
1775 /*
1776 * Sanity check, is the ID really free? Every APIC in a
1777 * system must have a unique ID or we get lots of nice
1778 * 'stuck on smp_invalidate_needed IPI wait' messages.
1779 */
1780 if (check_apicid_used(phys_id_present_map,
1781 mp_ioapics[apic].mp_apicid)) {
1782 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1783 apic, mp_ioapics[apic].mp_apicid);
1784 for (i = 0; i < get_physical_broadcast(); i++)
1785 if (!physid_isset(i, phys_id_present_map))
1786 break;
1787 if (i >= get_physical_broadcast())
1788 panic("Max APIC ID exceeded!\n");
1789 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1790 i);
1791 physid_set(i, phys_id_present_map);
1792 mp_ioapics[apic].mp_apicid = i;
1793 } else {
1794 physid_mask_t tmp;
1795 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1796 apic_printk(APIC_VERBOSE, "Setting %d in the "
1797 "phys_id_present_map\n",
1798 mp_ioapics[apic].mp_apicid);
1799 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1800 }
1801
1802
1803 /*
1804 * We need to adjust the IRQ routing table
1805 * if the ID changed.
1806 */
1807 if (old_id != mp_ioapics[apic].mp_apicid)
1808 for (i = 0; i < mp_irq_entries; i++)
1809 if (mp_irqs[i].mp_dstapic == old_id)
1810 mp_irqs[i].mp_dstapic
1811 = mp_ioapics[apic].mp_apicid;
1812
1813 /*
1814 * Read the right value from the MPC table and
1815 * write it into the ID register.
1816 */
1817 apic_printk(APIC_VERBOSE, KERN_INFO
1818 "...changing IO-APIC physical APIC ID to %d ...",
1819 mp_ioapics[apic].mp_apicid);
1820
1821 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1822 spin_lock_irqsave(&ioapic_lock, flags);
1823 io_apic_write(apic, 0, reg_00.raw);
1824 spin_unlock_irqrestore(&ioapic_lock, flags);
1825
1826 /*
1827 * Sanity check
1828 */
1829 spin_lock_irqsave(&ioapic_lock, flags);
1830 reg_00.raw = io_apic_read(apic, 0);
1831 spin_unlock_irqrestore(&ioapic_lock, flags);
1832 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1833 printk("could not set ID!\n");
1834 else
1835 apic_printk(APIC_VERBOSE, " ok.\n");
1836 }
1837}
1838
1839int no_timer_check __initdata;
1840
1841static int __init notimercheck(char *s)
1842{
1843 no_timer_check = 1;
1844 return 1;
1845}
1846__setup("no_timer_check", notimercheck);
1847
1848/*
1849 * There is a nasty bug in some older SMP boards, their mptable lies
1850 * about the timer IRQ. We do the following to work around the situation:
1851 *
1852 * - timer IRQ defaults to IO-APIC IRQ
1853 * - if this function detects that timer IRQs are defunct, then we fall
1854 * back to ISA timer IRQs
1855 */
1856static int __init timer_irq_works(void)
1857{
1858 unsigned long t1 = jiffies;
1859 unsigned long flags;
1860
1861 if (no_timer_check)
1862 return 1;
1863
1864 local_save_flags(flags);
1865 local_irq_enable();
1866 /* Let ten ticks pass... */
1867 mdelay((10 * 1000) / HZ);
1868 local_irq_restore(flags);
1869
1870 /*
1871 * Expect a few ticks at least, to be sure some possible
1872 * glue logic does not lock up after one or two first
1873 * ticks in a non-ExtINT mode. Also the local APIC
1874 * might have cached one ExtINT interrupt. Finally, at
1875 * least one tick may be lost due to delays.
1876 */
1877 if (time_after(jiffies, t1 + 4))
1878 return 1;
1879
1880 return 0;
1881}
1882
1883/*
1884 * In the SMP+IOAPIC case it might happen that there are an unspecified
1885 * number of pending IRQ events unhandled. These cases are very rare,
1886 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1887 * better to do it this way as thus we do not have to be aware of
1888 * 'pending' interrupts in the IRQ path, except at this point.
1889 */
1890/*
1891 * Edge triggered needs to resend any interrupt
1892 * that was delayed but this is now handled in the device
1893 * independent code.
1894 */
1895
1896/*
1897 * Startup quirk:
1898 *
1899 * Starting up a edge-triggered IO-APIC interrupt is
1900 * nasty - we need to make sure that we get the edge.
1901 * If it is already asserted for some reason, we need
1902 * return 1 to indicate that is was pending.
1903 *
1904 * This is not complete - we should be able to fake
1905 * an edge even if it isn't on the 8259A...
1906 *
1907 * (We do this for level-triggered IRQs too - it cannot hurt.)
1908 */
1909static unsigned int startup_ioapic_irq(unsigned int irq)
1910{
1911 int was_pending = 0;
1912 unsigned long flags;
1913
1914 spin_lock_irqsave(&ioapic_lock, flags);
1915 if (irq < 16) {
1916 disable_8259A_irq(irq);
1917 if (i8259A_irq_pending(irq))
1918 was_pending = 1;
1919 }
1920 __unmask_IO_APIC_irq(irq);
1921 spin_unlock_irqrestore(&ioapic_lock, flags);
1922
1923 return was_pending;
1924}
1925
1926static void ack_ioapic_irq(unsigned int irq)
1927{
1928 move_native_irq(irq);
1929 ack_APIC_irq();
1930}
1931
1932static void ack_ioapic_quirk_irq(unsigned int irq)
1933{
1934 unsigned long v;
1935 int i;
1936
1937 move_native_irq(irq);
1938/*
1939 * It appears there is an erratum which affects at least version 0x11
1940 * of I/O APIC (that's the 82093AA and cores integrated into various
1941 * chipsets). Under certain conditions a level-triggered interrupt is
1942 * erroneously delivered as edge-triggered one but the respective IRR
1943 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1944 * message but it will never arrive and further interrupts are blocked
1945 * from the source. The exact reason is so far unknown, but the
1946 * phenomenon was observed when two consecutive interrupt requests
1947 * from a given source get delivered to the same CPU and the source is
1948 * temporarily disabled in between.
1949 *
1950 * A workaround is to simulate an EOI message manually. We achieve it
1951 * by setting the trigger mode to edge and then to level when the edge
1952 * trigger mode gets detected in the TMR of a local APIC for a
1953 * level-triggered interrupt. We mask the source for the time of the
1954 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1955 * The idea is from Manfred Spraul. --macro
1956 */
1957 i = irq_vector[irq];
1958
1959 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1960
1961 ack_APIC_irq();
1962
1963 if (!(v & (1 << (i & 0x1f)))) {
1964 atomic_inc(&irq_mis_count);
1965 spin_lock(&ioapic_lock);
1966 __mask_and_edge_IO_APIC_irq(irq);
1967 __unmask_and_level_IO_APIC_irq(irq);
1968 spin_unlock(&ioapic_lock);
1969 }
1970}
1971
1972static int ioapic_retrigger_irq(unsigned int irq)
1973{
1974 send_IPI_self(irq_vector[irq]);
1975
1976 return 1;
1977}
1978
1979static struct irq_chip ioapic_chip __read_mostly = {
1980 .name = "IO-APIC",
1981 .startup = startup_ioapic_irq,
1982 .mask = mask_IO_APIC_irq,
1983 .unmask = unmask_IO_APIC_irq,
1984 .ack = ack_ioapic_irq,
1985 .eoi = ack_ioapic_quirk_irq,
1986#ifdef CONFIG_SMP
1987 .set_affinity = set_ioapic_affinity_irq,
1988#endif
1989 .retrigger = ioapic_retrigger_irq,
1990};
1991
1992
1993static inline void init_IO_APIC_traps(void)
1994{
1995 int irq;
1996
1997 /*
1998 * NOTE! The local APIC isn't very good at handling
1999 * multiple interrupts at the same interrupt level.
2000 * As the interrupt level is determined by taking the
2001 * vector number and shifting that right by 4, we
2002 * want to spread these out a bit so that they don't
2003 * all fall in the same interrupt level.
2004 *
2005 * Also, we've got to be careful not to trash gate
2006 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2007 */
2008 for (irq = 0; irq < NR_IRQS ; irq++) {
2009 if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2010 /*
2011 * Hmm.. We don't have an entry for this,
2012 * so default to an old-fashioned 8259
2013 * interrupt if we can..
2014 */
2015 if (irq < 16)
2016 make_8259A_irq(irq);
2017 else
2018 /* Strange. Oh, well.. */
2019 irq_desc[irq].chip = &no_irq_chip;
2020 }
2021 }
2022}
2023
2024/*
2025 * The local APIC irq-chip implementation:
2026 */
2027
2028static void ack_lapic_irq(unsigned int irq)
2029{
2030 ack_APIC_irq();
2031}
2032
2033static void mask_lapic_irq(unsigned int irq)
2034{
2035 unsigned long v;
2036
2037 v = apic_read(APIC_LVT0);
2038 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2039}
2040
2041static void unmask_lapic_irq(unsigned int irq)
2042{
2043 unsigned long v;
2044
2045 v = apic_read(APIC_LVT0);
2046 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2047}
2048
2049static struct irq_chip lapic_chip __read_mostly = {
2050 .name = "local-APIC",
2051 .mask = mask_lapic_irq,
2052 .unmask = unmask_lapic_irq,
2053 .ack = ack_lapic_irq,
2054};
2055
2056static void lapic_register_intr(int irq, int vector)
2057{
2058 irq_desc[irq].status &= ~IRQ_LEVEL;
2059 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2060 "edge");
2061 set_intr_gate(vector, interrupt[irq]);
2062}
2063
2064static void __init setup_nmi(void)
2065{
2066 /*
2067 * Dirty trick to enable the NMI watchdog ...
2068 * We put the 8259A master into AEOI mode and
2069 * unmask on all local APICs LVT0 as NMI.
2070 *
2071 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2072 * is from Maciej W. Rozycki - so we do not have to EOI from
2073 * the NMI handler or the timer interrupt.
2074 */
2075 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2076
2077 enable_NMI_through_LVT0();
2078
2079 apic_printk(APIC_VERBOSE, " done.\n");
2080}
2081
2082/*
2083 * This looks a bit hackish but it's about the only one way of sending
2084 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2085 * not support the ExtINT mode, unfortunately. We need to send these
2086 * cycles as some i82489DX-based boards have glue logic that keeps the
2087 * 8259A interrupt line asserted until INTA. --macro
2088 */
2089static inline void __init unlock_ExtINT_logic(void)
2090{
2091 int apic, pin, i;
2092 struct IO_APIC_route_entry entry0, entry1;
2093 unsigned char save_control, save_freq_select;
2094
2095 pin = find_isa_irq_pin(8, mp_INT);
2096 if (pin == -1) {
2097 WARN_ON_ONCE(1);
2098 return;
2099 }
2100 apic = find_isa_irq_apic(8, mp_INT);
2101 if (apic == -1) {
2102 WARN_ON_ONCE(1);
2103 return;
2104 }
2105
2106 entry0 = ioapic_read_entry(apic, pin);
2107 clear_IO_APIC_pin(apic, pin);
2108
2109 memset(&entry1, 0, sizeof(entry1));
2110
2111 entry1.dest_mode = 0; /* physical delivery */
2112 entry1.mask = 0; /* unmask IRQ now */
2113 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2114 entry1.delivery_mode = dest_ExtINT;
2115 entry1.polarity = entry0.polarity;
2116 entry1.trigger = 0;
2117 entry1.vector = 0;
2118
2119 ioapic_write_entry(apic, pin, entry1);
2120
2121 save_control = CMOS_READ(RTC_CONTROL);
2122 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2123 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2124 RTC_FREQ_SELECT);
2125 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2126
2127 i = 100;
2128 while (i-- > 0) {
2129 mdelay(10);
2130 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2131 i -= 10;
2132 }
2133
2134 CMOS_WRITE(save_control, RTC_CONTROL);
2135 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2136 clear_IO_APIC_pin(apic, pin);
2137
2138 ioapic_write_entry(apic, pin, entry0);
2139}
2140
2141/*
2142 * This code may look a bit paranoid, but it's supposed to cooperate with
2143 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2144 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2145 * fanatically on his truly buggy board.
2146 */
2147static inline void __init check_timer(void)
2148{
2149 int apic1, pin1, apic2, pin2;
2150 int no_pin1 = 0;
2151 int vector;
2152 unsigned int ver;
2153 unsigned long flags;
2154
2155 local_irq_save(flags);
2156
2157 ver = apic_read(APIC_LVR);
2158 ver = GET_APIC_VERSION(ver);
2159
2160 /*
2161 * get/set the timer IRQ vector:
2162 */
2163 disable_8259A_irq(0);
2164 vector = assign_irq_vector(0);
2165 set_intr_gate(vector, interrupt[0]);
2166
2167 /*
2168 * As IRQ0 is to be enabled in the 8259A, the virtual
2169 * wire has to be disabled in the local APIC. Also
2170 * timer interrupts need to be acknowledged manually in
2171 * the 8259A for the i82489DX when using the NMI
2172 * watchdog as that APIC treats NMIs as level-triggered.
2173 * The AEOI mode will finish them in the 8259A
2174 * automatically.
2175 */
2176 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2177 init_8259A(1);
2178 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2179
2180 pin1 = find_isa_irq_pin(0, mp_INT);
2181 apic1 = find_isa_irq_apic(0, mp_INT);
2182 pin2 = ioapic_i8259.pin;
2183 apic2 = ioapic_i8259.apic;
2184
2185 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2186 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2187 vector, apic1, pin1, apic2, pin2);
2188
2189 /*
2190 * Some BIOS writers are clueless and report the ExtINTA
2191 * I/O APIC input from the cascaded 8259A as the timer
2192 * interrupt input. So just in case, if only one pin
2193 * was found above, try it both directly and through the
2194 * 8259A.
2195 */
2196 if (pin1 == -1) {
2197 pin1 = pin2;
2198 apic1 = apic2;
2199 no_pin1 = 1;
2200 } else if (pin2 == -1) {
2201 pin2 = pin1;
2202 apic2 = apic1;
2203 }
2204
2205 if (pin1 != -1) {
2206 /*
2207 * Ok, does IRQ0 through the IOAPIC work?
2208 */
2209 if (no_pin1) {
2210 add_pin_to_irq(0, apic1, pin1);
2211 setup_timer_IRQ0_pin(apic1, pin1, vector);
2212 }
2213 unmask_IO_APIC_irq(0);
2214 if (timer_irq_works()) {
2215 if (nmi_watchdog == NMI_IO_APIC) {
2216 setup_nmi();
2217 enable_8259A_irq(0);
2218 }
2219 if (disable_timer_pin_1 > 0)
2220 clear_IO_APIC_pin(0, pin1);
2221 goto out;
2222 }
2223 clear_IO_APIC_pin(apic1, pin1);
2224 if (!no_pin1)
2225 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2226 "8254 timer not connected to IO-APIC\n");
2227
2228 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2229 "(IRQ0) through the 8259A ...\n");
2230 apic_printk(APIC_QUIET, KERN_INFO
2231 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2232 /*
2233 * legacy devices should be connected to IO APIC #0
2234 */
2235 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2236 setup_timer_IRQ0_pin(apic2, pin2, vector);
2237 unmask_IO_APIC_irq(0);
2238 enable_8259A_irq(0);
2239 if (timer_irq_works()) {
2240 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2241 timer_through_8259 = 1;
2242 if (nmi_watchdog == NMI_IO_APIC) {
2243 disable_8259A_irq(0);
2244 setup_nmi();
2245 enable_8259A_irq(0);
2246 }
2247 goto out;
2248 }
2249 /*
2250 * Cleanup, just in case ...
2251 */
2252 disable_8259A_irq(0);
2253 clear_IO_APIC_pin(apic2, pin2);
2254 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2255 }
2256
2257 if (nmi_watchdog == NMI_IO_APIC) {
2258 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2259 "through the IO-APIC - disabling NMI Watchdog!\n");
2260 nmi_watchdog = NMI_NONE;
2261 }
2262 timer_ack = 0;
2263
2264 apic_printk(APIC_QUIET, KERN_INFO
2265 "...trying to set up timer as Virtual Wire IRQ...\n");
2266
2267 lapic_register_intr(0, vector);
2268 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2269 enable_8259A_irq(0);
2270
2271 if (timer_irq_works()) {
2272 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2273 goto out;
2274 }
2275 disable_8259A_irq(0);
2276 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2277 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2278
2279 apic_printk(APIC_QUIET, KERN_INFO
2280 "...trying to set up timer as ExtINT IRQ...\n");
2281
2282 init_8259A(0);
2283 make_8259A_irq(0);
2284 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2285
2286 unlock_ExtINT_logic();
2287
2288 if (timer_irq_works()) {
2289 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2290 goto out;
2291 }
2292 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2293 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2294 "report. Then try booting with the 'noapic' option.\n");
2295out:
2296 local_irq_restore(flags);
2297}
2298
2299/*
2300 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2301 * to devices. However there may be an I/O APIC pin available for
2302 * this interrupt regardless. The pin may be left unconnected, but
2303 * typically it will be reused as an ExtINT cascade interrupt for
2304 * the master 8259A. In the MPS case such a pin will normally be
2305 * reported as an ExtINT interrupt in the MP table. With ACPI
2306 * there is no provision for ExtINT interrupts, and in the absence
2307 * of an override it would be treated as an ordinary ISA I/O APIC
2308 * interrupt, that is edge-triggered and unmasked by default. We
2309 * used to do this, but it caused problems on some systems because
2310 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2311 * the same ExtINT cascade interrupt to drive the local APIC of the
2312 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2313 * the I/O APIC in all cases now. No actual device should request
2314 * it anyway. --macro
2315 */
2316#define PIC_IRQS (1 << PIC_CASCADE_IR)
2317
2318void __init setup_IO_APIC(void)
2319{
2320 int i;
2321
2322 /* Reserve all the system vectors. */
2323 for (i = first_system_vector; i < NR_VECTORS; i++)
2324 set_bit(i, used_vectors);
2325
2326 enable_IO_APIC();
2327
2328 io_apic_irqs = ~PIC_IRQS;
2329
2330 printk("ENABLING IO-APIC IRQs\n");
2331
2332 /*
2333 * Set up IO-APIC IRQ routing.
2334 */
2335 if (!acpi_ioapic)
2336 setup_ioapic_ids_from_mpc();
2337 sync_Arb_IDs();
2338 setup_IO_APIC_irqs();
2339 init_IO_APIC_traps();
2340 check_timer();
2341}
2342
2343/*
2344 * Called after all the initialization is done. If we didnt find any
2345 * APIC bugs then we can allow the modify fast path
2346 */
2347
2348static int __init io_apic_bug_finalize(void)
2349{
2350 if (sis_apic_bug == -1)
2351 sis_apic_bug = 0;
2352 return 0;
2353}
2354
2355late_initcall(io_apic_bug_finalize);
2356
2357struct sysfs_ioapic_data {
2358 struct sys_device dev;
2359 struct IO_APIC_route_entry entry[0];
2360};
2361static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
2362
2363static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2364{
2365 struct IO_APIC_route_entry *entry;
2366 struct sysfs_ioapic_data *data;
2367 int i;
2368
2369 data = container_of(dev, struct sysfs_ioapic_data, dev);
2370 entry = data->entry;
2371 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2372 entry[i] = ioapic_read_entry(dev->id, i);
2373
2374 return 0;
2375}
2376
2377static int ioapic_resume(struct sys_device *dev)
2378{
2379 struct IO_APIC_route_entry *entry;
2380 struct sysfs_ioapic_data *data;
2381 unsigned long flags;
2382 union IO_APIC_reg_00 reg_00;
2383 int i;
2384
2385 data = container_of(dev, struct sysfs_ioapic_data, dev);
2386 entry = data->entry;
2387
2388 spin_lock_irqsave(&ioapic_lock, flags);
2389 reg_00.raw = io_apic_read(dev->id, 0);
2390 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2391 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2392 io_apic_write(dev->id, 0, reg_00.raw);
2393 }
2394 spin_unlock_irqrestore(&ioapic_lock, flags);
2395 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2396 ioapic_write_entry(dev->id, i, entry[i]);
2397
2398 return 0;
2399}
2400
2401static struct sysdev_class ioapic_sysdev_class = {
2402 .name = "ioapic",
2403 .suspend = ioapic_suspend,
2404 .resume = ioapic_resume,
2405};
2406
2407static int __init ioapic_init_sysfs(void)
2408{
2409 struct sys_device *dev;
2410 int i, size, error = 0;
2411
2412 error = sysdev_class_register(&ioapic_sysdev_class);
2413 if (error)
2414 return error;
2415
2416 for (i = 0; i < nr_ioapics; i++) {
2417 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2418 * sizeof(struct IO_APIC_route_entry);
2419 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
2420 if (!mp_ioapic_data[i]) {
2421 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2422 continue;
2423 }
2424 dev = &mp_ioapic_data[i]->dev;
2425 dev->id = i;
2426 dev->cls = &ioapic_sysdev_class;
2427 error = sysdev_register(dev);
2428 if (error) {
2429 kfree(mp_ioapic_data[i]);
2430 mp_ioapic_data[i] = NULL;
2431 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2432 continue;
2433 }
2434 }
2435
2436 return 0;
2437}
2438
2439device_initcall(ioapic_init_sysfs);
2440
2441/*
2442 * Dynamic irq allocate and deallocation
2443 */
2444int create_irq(void)
2445{
2446 /* Allocate an unused irq */
2447 int irq, new, vector = 0;
2448 unsigned long flags;
2449
2450 irq = -ENOSPC;
2451 spin_lock_irqsave(&vector_lock, flags);
2452 for (new = (NR_IRQS - 1); new >= 0; new--) {
2453 if (platform_legacy_irq(new))
2454 continue;
2455 if (irq_vector[new] != 0)
2456 continue;
2457 vector = __assign_irq_vector(new);
2458 if (likely(vector > 0))
2459 irq = new;
2460 break;
2461 }
2462 spin_unlock_irqrestore(&vector_lock, flags);
2463
2464 if (irq >= 0) {
2465 set_intr_gate(vector, interrupt[irq]);
2466 dynamic_irq_init(irq);
2467 }
2468 return irq;
2469}
2470
2471void destroy_irq(unsigned int irq)
2472{
2473 unsigned long flags;
2474
2475 dynamic_irq_cleanup(irq);
2476
2477 spin_lock_irqsave(&vector_lock, flags);
2478 clear_bit(irq_vector[irq], used_vectors);
2479 irq_vector[irq] = 0;
2480 spin_unlock_irqrestore(&vector_lock, flags);
2481}
2482
2483/*
2484 * MSI message composition
2485 */
2486#ifdef CONFIG_PCI_MSI
2487static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2488{
2489 int vector;
2490 unsigned dest;
2491
2492 vector = assign_irq_vector(irq);
2493 if (vector >= 0) {
2494 dest = cpu_mask_to_apicid(TARGET_CPUS);
2495
2496 msg->address_hi = MSI_ADDR_BASE_HI;
2497 msg->address_lo =
2498 MSI_ADDR_BASE_LO |
2499 ((INT_DEST_MODE == 0) ?
2500MSI_ADDR_DEST_MODE_PHYSICAL:
2501 MSI_ADDR_DEST_MODE_LOGICAL) |
2502 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2503 MSI_ADDR_REDIRECTION_CPU:
2504 MSI_ADDR_REDIRECTION_LOWPRI) |
2505 MSI_ADDR_DEST_ID(dest);
2506
2507 msg->data =
2508 MSI_DATA_TRIGGER_EDGE |
2509 MSI_DATA_LEVEL_ASSERT |
2510 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2511MSI_DATA_DELIVERY_FIXED:
2512 MSI_DATA_DELIVERY_LOWPRI) |
2513 MSI_DATA_VECTOR(vector);
2514 }
2515 return vector;
2516}
2517
2518#ifdef CONFIG_SMP
2519static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2520{
2521 struct msi_msg msg;
2522 unsigned int dest;
2523 cpumask_t tmp;
2524 int vector;
2525
2526 cpus_and(tmp, mask, cpu_online_map);
2527 if (cpus_empty(tmp))
2528 tmp = TARGET_CPUS;
2529
2530 vector = assign_irq_vector(irq);
2531 if (vector < 0)
2532 return;
2533
2534 dest = cpu_mask_to_apicid(mask);
2535
2536 read_msi_msg(irq, &msg);
2537
2538 msg.data &= ~MSI_DATA_VECTOR_MASK;
2539 msg.data |= MSI_DATA_VECTOR(vector);
2540 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2541 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2542
2543 write_msi_msg(irq, &msg);
2544 irq_desc[irq].affinity = mask;
2545}
2546#endif /* CONFIG_SMP */
2547
2548/*
2549 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2550 * which implement the MSI or MSI-X Capability Structure.
2551 */
2552static struct irq_chip msi_chip = {
2553 .name = "PCI-MSI",
2554 .unmask = unmask_msi_irq,
2555 .mask = mask_msi_irq,
2556 .ack = ack_ioapic_irq,
2557#ifdef CONFIG_SMP
2558 .set_affinity = set_msi_irq_affinity,
2559#endif
2560 .retrigger = ioapic_retrigger_irq,
2561};
2562
2563int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2564{
2565 struct msi_msg msg;
2566 int irq, ret;
2567 irq = create_irq();
2568 if (irq < 0)
2569 return irq;
2570
2571 ret = msi_compose_msg(dev, irq, &msg);
2572 if (ret < 0) {
2573 destroy_irq(irq);
2574 return ret;
2575 }
2576
2577 set_irq_msi(irq, desc);
2578 write_msi_msg(irq, &msg);
2579
2580 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2581 "edge");
2582
2583 return 0;
2584}
2585
2586void arch_teardown_msi_irq(unsigned int irq)
2587{
2588 destroy_irq(irq);
2589}
2590
2591#endif /* CONFIG_PCI_MSI */
2592
2593/*
2594 * Hypertransport interrupt support
2595 */
2596#ifdef CONFIG_HT_IRQ
2597
2598#ifdef CONFIG_SMP
2599
2600static void target_ht_irq(unsigned int irq, unsigned int dest)
2601{
2602 struct ht_irq_msg msg;
2603 fetch_ht_irq_msg(irq, &msg);
2604
2605 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2606 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2607
2608 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2609 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2610
2611 write_ht_irq_msg(irq, &msg);
2612}
2613
2614static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2615{
2616 unsigned int dest;
2617 cpumask_t tmp;
2618
2619 cpus_and(tmp, mask, cpu_online_map);
2620 if (cpus_empty(tmp))
2621 tmp = TARGET_CPUS;
2622
2623 cpus_and(mask, tmp, CPU_MASK_ALL);
2624
2625 dest = cpu_mask_to_apicid(mask);
2626
2627 target_ht_irq(irq, dest);
2628 irq_desc[irq].affinity = mask;
2629}
2630#endif
2631
2632static struct irq_chip ht_irq_chip = {
2633 .name = "PCI-HT",
2634 .mask = mask_ht_irq,
2635 .unmask = unmask_ht_irq,
2636 .ack = ack_ioapic_irq,
2637#ifdef CONFIG_SMP
2638 .set_affinity = set_ht_irq_affinity,
2639#endif
2640 .retrigger = ioapic_retrigger_irq,
2641};
2642
2643int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2644{
2645 int vector;
2646
2647 vector = assign_irq_vector(irq);
2648 if (vector >= 0) {
2649 struct ht_irq_msg msg;
2650 unsigned dest;
2651 cpumask_t tmp;
2652
2653 cpus_clear(tmp);
2654 cpu_set(vector >> 8, tmp);
2655 dest = cpu_mask_to_apicid(tmp);
2656
2657 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2658
2659 msg.address_lo =
2660 HT_IRQ_LOW_BASE |
2661 HT_IRQ_LOW_DEST_ID(dest) |
2662 HT_IRQ_LOW_VECTOR(vector) |
2663 ((INT_DEST_MODE == 0) ?
2664 HT_IRQ_LOW_DM_PHYSICAL :
2665 HT_IRQ_LOW_DM_LOGICAL) |
2666 HT_IRQ_LOW_RQEOI_EDGE |
2667 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2668 HT_IRQ_LOW_MT_FIXED :
2669 HT_IRQ_LOW_MT_ARBITRATED) |
2670 HT_IRQ_LOW_IRQ_MASKED;
2671
2672 write_ht_irq_msg(irq, &msg);
2673
2674 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2675 handle_edge_irq, "edge");
2676 }
2677 return vector;
2678}
2679#endif /* CONFIG_HT_IRQ */
2680
2681/* --------------------------------------------------------------------------
2682 ACPI-based IOAPIC Configuration
2683 -------------------------------------------------------------------------- */
2684
2685#ifdef CONFIG_ACPI
2686
2687int __init io_apic_get_unique_id(int ioapic, int apic_id)
2688{
2689 union IO_APIC_reg_00 reg_00;
2690 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2691 physid_mask_t tmp;
2692 unsigned long flags;
2693 int i = 0;
2694
2695 /*
2696 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2697 * buses (one for LAPICs, one for IOAPICs), where predecessors only
2698 * supports up to 16 on one shared APIC bus.
2699 *
2700 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2701 * advantage of new APIC bus architecture.
2702 */
2703
2704 if (physids_empty(apic_id_map))
2705 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2706
2707 spin_lock_irqsave(&ioapic_lock, flags);
2708 reg_00.raw = io_apic_read(ioapic, 0);
2709 spin_unlock_irqrestore(&ioapic_lock, flags);
2710
2711 if (apic_id >= get_physical_broadcast()) {
2712 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2713 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2714 apic_id = reg_00.bits.ID;
2715 }
2716
2717 /*
2718 * Every APIC in a system must have a unique ID or we get lots of nice
2719 * 'stuck on smp_invalidate_needed IPI wait' messages.
2720 */
2721 if (check_apicid_used(apic_id_map, apic_id)) {
2722
2723 for (i = 0; i < get_physical_broadcast(); i++) {
2724 if (!check_apicid_used(apic_id_map, i))
2725 break;
2726 }
2727
2728 if (i == get_physical_broadcast())
2729 panic("Max apic_id exceeded!\n");
2730
2731 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2732 "trying %d\n", ioapic, apic_id, i);
2733
2734 apic_id = i;
2735 }
2736
2737 tmp = apicid_to_cpu_present(apic_id);
2738 physids_or(apic_id_map, apic_id_map, tmp);
2739
2740 if (reg_00.bits.ID != apic_id) {
2741 reg_00.bits.ID = apic_id;
2742
2743 spin_lock_irqsave(&ioapic_lock, flags);
2744 io_apic_write(ioapic, 0, reg_00.raw);
2745 reg_00.raw = io_apic_read(ioapic, 0);
2746 spin_unlock_irqrestore(&ioapic_lock, flags);
2747
2748 /* Sanity check */
2749 if (reg_00.bits.ID != apic_id) {
2750 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2751 return -1;
2752 }
2753 }
2754
2755 apic_printk(APIC_VERBOSE, KERN_INFO
2756 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2757
2758 return apic_id;
2759}
2760
2761
2762int __init io_apic_get_version(int ioapic)
2763{
2764 union IO_APIC_reg_01 reg_01;
2765 unsigned long flags;
2766
2767 spin_lock_irqsave(&ioapic_lock, flags);
2768 reg_01.raw = io_apic_read(ioapic, 1);
2769 spin_unlock_irqrestore(&ioapic_lock, flags);
2770
2771 return reg_01.bits.version;
2772}
2773
2774
2775int __init io_apic_get_redir_entries(int ioapic)
2776{
2777 union IO_APIC_reg_01 reg_01;
2778 unsigned long flags;
2779
2780 spin_lock_irqsave(&ioapic_lock, flags);
2781 reg_01.raw = io_apic_read(ioapic, 1);
2782 spin_unlock_irqrestore(&ioapic_lock, flags);
2783
2784 return reg_01.bits.entries;
2785}
2786
2787
2788int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
2789{
2790 struct IO_APIC_route_entry entry;
2791
2792 if (!IO_APIC_IRQ(irq)) {
2793 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2794 ioapic);
2795 return -EINVAL;
2796 }
2797
2798 /*
2799 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2800 * Note that we mask (disable) IRQs now -- these get enabled when the
2801 * corresponding device driver registers for this IRQ.
2802 */
2803
2804 memset(&entry, 0, sizeof(entry));
2805
2806 entry.delivery_mode = INT_DELIVERY_MODE;
2807 entry.dest_mode = INT_DEST_MODE;
2808 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2809 entry.trigger = edge_level;
2810 entry.polarity = active_high_low;
2811 entry.mask = 1;
2812
2813 /*
2814 * IRQs < 16 are already in the irq_2_pin[] map
2815 */
2816 if (irq >= 16)
2817 add_pin_to_irq(irq, ioapic, pin);
2818
2819 entry.vector = assign_irq_vector(irq);
2820
2821 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2822 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2823 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2824 edge_level, active_high_low);
2825
2826 ioapic_register_intr(irq, entry.vector, edge_level);
2827
2828 if (!ioapic && (irq < 16))
2829 disable_8259A_irq(irq);
2830
2831 ioapic_write_entry(ioapic, pin, entry);
2832
2833 return 0;
2834}
2835
2836int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2837{
2838 int i;
2839
2840 if (skip_ioapic_setup)
2841 return -1;
2842
2843 for (i = 0; i < mp_irq_entries; i++)
2844 if (mp_irqs[i].mp_irqtype == mp_INT &&
2845 mp_irqs[i].mp_srcbusirq == bus_irq)
2846 break;
2847 if (i >= mp_irq_entries)
2848 return -1;
2849
2850 *trigger = irq_trigger(i);
2851 *polarity = irq_polarity(i);
2852 return 0;
2853}
2854
2855#endif /* CONFIG_ACPI */
2856
2857static int __init parse_disable_timer_pin_1(char *arg)
2858{
2859 disable_timer_pin_1 = 1;
2860 return 0;
2861}
2862early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2863
2864static int __init parse_enable_timer_pin_1(char *arg)
2865{
2866 disable_timer_pin_1 = -1;
2867 return 0;
2868}
2869early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2870
2871static int __init parse_noapic(char *arg)
2872{
2873 /* disable IO-APIC */
2874 disable_ioapic_setup();
2875 return 0;
2876}
2877early_param("noapic", parse_noapic);
2878
2879void __init ioapic_init_mappings(void)
2880{
2881 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2882 int i;
2883
2884 for (i = 0; i < nr_ioapics; i++) {
2885 if (smp_found_config) {
2886 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2887 if (!ioapic_phys) {
2888 printk(KERN_ERR
2889 "WARNING: bogus zero IO-APIC "
2890 "address found in MPTABLE, "
2891 "disabling IO/APIC support!\n");
2892 smp_found_config = 0;
2893 skip_ioapic_setup = 1;
2894 goto fake_ioapic_page;
2895 }
2896 } else {
2897fake_ioapic_page:
2898 ioapic_phys = (unsigned long)
2899 alloc_bootmem_pages(PAGE_SIZE);
2900 ioapic_phys = __pa(ioapic_phys);
2901 }
2902 set_fixmap_nocache(idx, ioapic_phys);
2903 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
2904 __fix_to_virt(idx), ioapic_phys);
2905 idx++;
2906 }
2907}
2908
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 000000000000..d1d4dc52f649
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
1/*
2 * Common interrupt code for 32 and 64 bit
3 */
4#include <linux/cpu.h>
5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h>
8
9#include <asm/apic.h>
10#include <asm/io_apic.h>
11#include <asm/smp.h>
12
13atomic_t irq_err_count;
14
15/*
16 * 'what should we do if we get a hw irq event on an illegal vector'.
17 * each architecture has to answer this themselves.
18 */
19void ack_bad_irq(unsigned int irq)
20{
21 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
22
23#ifdef CONFIG_X86_LOCAL_APIC
24 /*
25 * Currently unexpected vectors happen only on SMP and APIC.
26 * We _must_ ack these because every local APIC has only N
27 * irq slots per priority level, and a 'hanging, unacked' IRQ
28 * holds up an irq slot - in excessive cases (when multiple
29 * unexpected vectors occur) that might lock up the APIC
30 * completely.
31 * But only ack when the APIC is enabled -AK
32 */
33 if (cpu_has_apic)
34 ack_APIC_irq();
35#endif
36}
37
38#ifdef CONFIG_X86_32
39# define irq_stats(x) (&per_cpu(irq_stat, x))
40#else
41# define irq_stats(x) cpu_pda(x)
42#endif
43/*
44 * /proc/interrupts printing:
45 */
46static int show_other_interrupts(struct seq_file *p)
47{
48 int j;
49
50 seq_printf(p, "NMI: ");
51 for_each_online_cpu(j)
52 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
53 seq_printf(p, " Non-maskable interrupts\n");
54#ifdef CONFIG_X86_LOCAL_APIC
55 seq_printf(p, "LOC: ");
56 for_each_online_cpu(j)
57 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
58 seq_printf(p, " Local timer interrupts\n");
59#endif
60#ifdef CONFIG_SMP
61 seq_printf(p, "RES: ");
62 for_each_online_cpu(j)
63 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
64 seq_printf(p, " Rescheduling interrupts\n");
65 seq_printf(p, "CAL: ");
66 for_each_online_cpu(j)
67 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
68 seq_printf(p, " Function call interrupts\n");
69 seq_printf(p, "TLB: ");
70 for_each_online_cpu(j)
71 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
72 seq_printf(p, " TLB shootdowns\n");
73#endif
74#ifdef CONFIG_X86_MCE
75 seq_printf(p, "TRM: ");
76 for_each_online_cpu(j)
77 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
78 seq_printf(p, " Thermal event interrupts\n");
79# ifdef CONFIG_X86_64
80 seq_printf(p, "THR: ");
81 for_each_online_cpu(j)
82 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
83 seq_printf(p, " Threshold APIC interrupts\n");
84# endif
85#endif
86#ifdef CONFIG_X86_LOCAL_APIC
87 seq_printf(p, "SPU: ");
88 for_each_online_cpu(j)
89 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
90 seq_printf(p, " Spurious interrupts\n");
91#endif
92 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
93#if defined(CONFIG_X86_IO_APIC)
94 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
95#endif
96 return 0;
97}
98
99int show_interrupts(struct seq_file *p, void *v)
100{
101 unsigned long flags, any_count = 0;
102 int i = *(loff_t *) v, j;
103 struct irqaction *action;
104 struct irq_desc *desc;
105
106 if (i > nr_irqs)
107 return 0;
108
109 if (i == nr_irqs)
110 return show_other_interrupts(p);
111
112 /* print header */
113 if (i == 0) {
114 seq_printf(p, " ");
115 for_each_online_cpu(j)
116 seq_printf(p, "CPU%-8d", j);
117 seq_putc(p, '\n');
118 }
119
120 desc = irq_to_desc(i);
121 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i);
124#else
125 for_each_online_cpu(j)
126 any_count |= kstat_irqs_cpu(i, j);
127#endif
128 action = desc->action;
129 if (!action && !any_count)
130 goto out;
131
132 seq_printf(p, "%3d: ", i);
133#ifndef CONFIG_SMP
134 seq_printf(p, "%10u ", kstat_irqs(i));
135#else
136 for_each_online_cpu(j)
137 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
138#endif
139 seq_printf(p, " %8s", desc->chip->name);
140 seq_printf(p, "-%-8s", desc->name);
141
142 if (action) {
143 seq_printf(p, " %s", action->name);
144 while ((action = action->next) != NULL)
145 seq_printf(p, ", %s", action->name);
146 }
147
148 seq_putc(p, '\n');
149out:
150 spin_unlock_irqrestore(&desc->lock, flags);
151 return 0;
152}
153
154/*
155 * /proc/stat helpers
156 */
157u64 arch_irq_stat_cpu(unsigned int cpu)
158{
159 u64 sum = irq_stats(cpu)->__nmi_count;
160
161#ifdef CONFIG_X86_LOCAL_APIC
162 sum += irq_stats(cpu)->apic_timer_irqs;
163#endif
164#ifdef CONFIG_SMP
165 sum += irq_stats(cpu)->irq_resched_count;
166 sum += irq_stats(cpu)->irq_call_count;
167 sum += irq_stats(cpu)->irq_tlb_count;
168#endif
169#ifdef CONFIG_X86_MCE
170 sum += irq_stats(cpu)->irq_thermal_count;
171# ifdef CONFIG_X86_64
172 sum += irq_stats(cpu)->irq_threshold_count;
173#endif
174#endif
175#ifdef CONFIG_X86_LOCAL_APIC
176 sum += irq_stats(cpu)->irq_spurious_count;
177#endif
178 return sum;
179}
180
181u64 arch_irq_stat(void)
182{
183 u64 sum = atomic_read(&irq_err_count);
184
185#ifdef CONFIG_X86_IO_APIC
186 sum += atomic_read(&irq_mis_count);
187#endif
188 return sum;
189}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index b71e02d42f4f..a51382672de0 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
25DEFINE_PER_CPU(struct pt_regs *, irq_regs); 25DEFINE_PER_CPU(struct pt_regs *, irq_regs);
26EXPORT_PER_CPU_SYMBOL(irq_regs); 26EXPORT_PER_CPU_SYMBOL(irq_regs);
27 27
28/*
29 * 'what should we do if we get a hw irq event on an illegal vector'.
30 * each architecture has to answer this themselves.
31 */
32void ack_bad_irq(unsigned int irq)
33{
34 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
35
36#ifdef CONFIG_X86_LOCAL_APIC
37 /*
38 * Currently unexpected vectors happen only on SMP and APIC.
39 * We _must_ ack these because every local APIC has only N
40 * irq slots per priority level, and a 'hanging, unacked' IRQ
41 * holds up an irq slot - in excessive cases (when multiple
42 * unexpected vectors occur) that might lock up the APIC
43 * completely.
44 * But only ack when the APIC is enabled -AK
45 */
46 if (cpu_has_apic)
47 ack_APIC_irq();
48#endif
49}
50
51#ifdef CONFIG_DEBUG_STACKOVERFLOW 28#ifdef CONFIG_DEBUG_STACKOVERFLOW
52/* Debugging check for stack overflow: is there less than 1KB free? */ 29/* Debugging check for stack overflow: is there less than 1KB free? */
53static int check_stack_overflow(void) 30static int check_stack_overflow(void)
@@ -223,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
223{ 200{
224 struct pt_regs *old_regs; 201 struct pt_regs *old_regs;
225 /* high bit used in ret_from_ code */ 202 /* high bit used in ret_from_ code */
226 int overflow, irq = ~regs->orig_ax; 203 int overflow;
227 struct irq_desc *desc = irq_desc + irq; 204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc;
206 unsigned irq;
228 207
229 if (unlikely((unsigned)irq >= NR_IRQS)) {
230 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
231 __func__, irq);
232 BUG();
233 }
234 208
235 old_regs = set_irq_regs(regs); 209 old_regs = set_irq_regs(regs);
236 irq_enter(); 210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
237 212
238 overflow = check_stack_overflow(); 213 overflow = check_stack_overflow();
239 214
215 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) {
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221
240 if (!execute_on_irq_stack(overflow, desc, irq)) { 222 if (!execute_on_irq_stack(overflow, desc, irq)) {
241 if (unlikely(overflow)) 223 if (unlikely(overflow))
242 print_stack_overflow(); 224 print_stack_overflow();
@@ -248,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
248 return 1; 230 return 1;
249} 231}
250 232
251/*
252 * Interrupt statistics:
253 */
254
255atomic_t irq_err_count;
256
257/*
258 * /proc/interrupts printing:
259 */
260
261int show_interrupts(struct seq_file *p, void *v)
262{
263 int i = *(loff_t *) v, j;
264 struct irqaction * action;
265 unsigned long flags;
266
267 if (i == 0) {
268 seq_printf(p, " ");
269 for_each_online_cpu(j)
270 seq_printf(p, "CPU%-8d",j);
271 seq_putc(p, '\n');
272 }
273
274 if (i < NR_IRQS) {
275 unsigned any_count = 0;
276
277 spin_lock_irqsave(&irq_desc[i].lock, flags);
278#ifndef CONFIG_SMP
279 any_count = kstat_irqs(i);
280#else
281 for_each_online_cpu(j)
282 any_count |= kstat_cpu(j).irqs[i];
283#endif
284 action = irq_desc[i].action;
285 if (!action && !any_count)
286 goto skip;
287 seq_printf(p, "%3d: ",i);
288#ifndef CONFIG_SMP
289 seq_printf(p, "%10u ", kstat_irqs(i));
290#else
291 for_each_online_cpu(j)
292 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
293#endif
294 seq_printf(p, " %8s", irq_desc[i].chip->name);
295 seq_printf(p, "-%-8s", irq_desc[i].name);
296
297 if (action) {
298 seq_printf(p, " %s", action->name);
299 while ((action = action->next) != NULL)
300 seq_printf(p, ", %s", action->name);
301 }
302
303 seq_putc(p, '\n');
304skip:
305 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
306 } else if (i == NR_IRQS) {
307 seq_printf(p, "NMI: ");
308 for_each_online_cpu(j)
309 seq_printf(p, "%10u ", nmi_count(j));
310 seq_printf(p, " Non-maskable interrupts\n");
311#ifdef CONFIG_X86_LOCAL_APIC
312 seq_printf(p, "LOC: ");
313 for_each_online_cpu(j)
314 seq_printf(p, "%10u ",
315 per_cpu(irq_stat,j).apic_timer_irqs);
316 seq_printf(p, " Local timer interrupts\n");
317#endif
318#ifdef CONFIG_SMP
319 seq_printf(p, "RES: ");
320 for_each_online_cpu(j)
321 seq_printf(p, "%10u ",
322 per_cpu(irq_stat,j).irq_resched_count);
323 seq_printf(p, " Rescheduling interrupts\n");
324 seq_printf(p, "CAL: ");
325 for_each_online_cpu(j)
326 seq_printf(p, "%10u ",
327 per_cpu(irq_stat,j).irq_call_count);
328 seq_printf(p, " Function call interrupts\n");
329 seq_printf(p, "TLB: ");
330 for_each_online_cpu(j)
331 seq_printf(p, "%10u ",
332 per_cpu(irq_stat,j).irq_tlb_count);
333 seq_printf(p, " TLB shootdowns\n");
334#endif
335#ifdef CONFIG_X86_MCE
336 seq_printf(p, "TRM: ");
337 for_each_online_cpu(j)
338 seq_printf(p, "%10u ",
339 per_cpu(irq_stat,j).irq_thermal_count);
340 seq_printf(p, " Thermal event interrupts\n");
341#endif
342#ifdef CONFIG_X86_LOCAL_APIC
343 seq_printf(p, "SPU: ");
344 for_each_online_cpu(j)
345 seq_printf(p, "%10u ",
346 per_cpu(irq_stat,j).irq_spurious_count);
347 seq_printf(p, " Spurious interrupts\n");
348#endif
349 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
350#if defined(CONFIG_X86_IO_APIC)
351 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
352#endif
353 }
354 return 0;
355}
356
357/*
358 * /proc/stat helpers
359 */
360u64 arch_irq_stat_cpu(unsigned int cpu)
361{
362 u64 sum = nmi_count(cpu);
363
364#ifdef CONFIG_X86_LOCAL_APIC
365 sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
366#endif
367#ifdef CONFIG_SMP
368 sum += per_cpu(irq_stat, cpu).irq_resched_count;
369 sum += per_cpu(irq_stat, cpu).irq_call_count;
370 sum += per_cpu(irq_stat, cpu).irq_tlb_count;
371#endif
372#ifdef CONFIG_X86_MCE
373 sum += per_cpu(irq_stat, cpu).irq_thermal_count;
374#endif
375#ifdef CONFIG_X86_LOCAL_APIC
376 sum += per_cpu(irq_stat, cpu).irq_spurious_count;
377#endif
378 return sum;
379}
380
381u64 arch_irq_stat(void)
382{
383 u64 sum = atomic_read(&irq_err_count);
384
385#ifdef CONFIG_X86_IO_APIC
386 sum += atomic_read(&irq_mis_count);
387#endif
388 return sum;
389}
390
391#ifdef CONFIG_HOTPLUG_CPU 233#ifdef CONFIG_HOTPLUG_CPU
392#include <mach_apic.h> 234#include <mach_apic.h>
393 235
@@ -395,20 +237,22 @@ void fixup_irqs(cpumask_t map)
395{ 237{
396 unsigned int irq; 238 unsigned int irq;
397 static int warned; 239 static int warned;
240 struct irq_desc *desc;
398 241
399 for (irq = 0; irq < NR_IRQS; irq++) { 242 for_each_irq_desc(irq, desc) {
400 cpumask_t mask; 243 cpumask_t mask;
244
401 if (irq == 2) 245 if (irq == 2)
402 continue; 246 continue;
403 247
404 cpus_and(mask, irq_desc[irq].affinity, map); 248 cpus_and(mask, desc->affinity, map);
405 if (any_online_cpu(mask) == NR_CPUS) { 249 if (any_online_cpu(mask) == NR_CPUS) {
406 printk("Breaking affinity for irq %i\n", irq); 250 printk("Breaking affinity for irq %i\n", irq);
407 mask = map; 251 mask = map;
408 } 252 }
409 if (irq_desc[irq].chip->set_affinity) 253 if (desc->chip->set_affinity)
410 irq_desc[irq].chip->set_affinity(irq, mask); 254 desc->chip->set_affinity(irq, mask);
411 else if (irq_desc[irq].action && !(warned++)) 255 else if (desc->action && !(warned++))
412 printk("Cannot set affinity for irq %i\n", irq); 256 printk("Cannot set affinity for irq %i\n", irq);
413 } 257 }
414 258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index f065fe9071b9..60eb84eb77a0 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20 20
21atomic_t irq_err_count;
22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * each architecture has to answer this themselves.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
43#ifdef CONFIG_DEBUG_STACKOVERFLOW 21#ifdef CONFIG_DEBUG_STACKOVERFLOW
44/* 22/*
45 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs)
65#endif 43#endif
66 44
67/* 45/*
68 * Generic, controller-independent functions:
69 */
70
71int show_interrupts(struct seq_file *p, void *v)
72{
73 int i = *(loff_t *) v, j;
74 struct irqaction * action;
75 unsigned long flags;
76
77 if (i == 0) {
78 seq_printf(p, " ");
79 for_each_online_cpu(j)
80 seq_printf(p, "CPU%-8d",j);
81 seq_putc(p, '\n');
82 }
83
84 if (i < NR_IRQS) {
85 unsigned any_count = 0;
86
87 spin_lock_irqsave(&irq_desc[i].lock, flags);
88#ifndef CONFIG_SMP
89 any_count = kstat_irqs(i);
90#else
91 for_each_online_cpu(j)
92 any_count |= kstat_cpu(j).irqs[i];
93#endif
94 action = irq_desc[i].action;
95 if (!action && !any_count)
96 goto skip;
97 seq_printf(p, "%3d: ",i);
98#ifndef CONFIG_SMP
99 seq_printf(p, "%10u ", kstat_irqs(i));
100#else
101 for_each_online_cpu(j)
102 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
103#endif
104 seq_printf(p, " %8s", irq_desc[i].chip->name);
105 seq_printf(p, "-%-8s", irq_desc[i].name);
106
107 if (action) {
108 seq_printf(p, " %s", action->name);
109 while ((action = action->next) != NULL)
110 seq_printf(p, ", %s", action->name);
111 }
112 seq_putc(p, '\n');
113skip:
114 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
115 } else if (i == NR_IRQS) {
116 seq_printf(p, "NMI: ");
117 for_each_online_cpu(j)
118 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
119 seq_printf(p, " Non-maskable interrupts\n");
120 seq_printf(p, "LOC: ");
121 for_each_online_cpu(j)
122 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
123 seq_printf(p, " Local timer interrupts\n");
124#ifdef CONFIG_SMP
125 seq_printf(p, "RES: ");
126 for_each_online_cpu(j)
127 seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
128 seq_printf(p, " Rescheduling interrupts\n");
129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " Function call interrupts\n");
133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
136 seq_printf(p, " TLB shootdowns\n");
137#endif
138#ifdef CONFIG_X86_MCE
139 seq_printf(p, "TRM: ");
140 for_each_online_cpu(j)
141 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
142 seq_printf(p, " Thermal event interrupts\n");
143 seq_printf(p, "THR: ");
144 for_each_online_cpu(j)
145 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
146 seq_printf(p, " Threshold APIC interrupts\n");
147#endif
148 seq_printf(p, "SPU: ");
149 for_each_online_cpu(j)
150 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
151 seq_printf(p, " Spurious interrupts\n");
152 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
153 }
154 return 0;
155}
156
157/*
158 * /proc/stat helpers
159 */
160u64 arch_irq_stat_cpu(unsigned int cpu)
161{
162 u64 sum = cpu_pda(cpu)->__nmi_count;
163
164 sum += cpu_pda(cpu)->apic_timer_irqs;
165#ifdef CONFIG_SMP
166 sum += cpu_pda(cpu)->irq_resched_count;
167 sum += cpu_pda(cpu)->irq_call_count;
168 sum += cpu_pda(cpu)->irq_tlb_count;
169#endif
170#ifdef CONFIG_X86_MCE
171 sum += cpu_pda(cpu)->irq_thermal_count;
172 sum += cpu_pda(cpu)->irq_threshold_count;
173#endif
174 sum += cpu_pda(cpu)->irq_spurious_count;
175 return sum;
176}
177
178u64 arch_irq_stat(void)
179{
180 return atomic_read(&irq_err_count);
181}
182
183/*
184 * do_IRQ handles all normal device IRQ's (the special 46 * do_IRQ handles all normal device IRQ's (the special
185 * SMP cross-CPU interrupts have their own specific 47 * SMP cross-CPU interrupts have their own specific
186 * handlers). 48 * handlers).
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void)
188asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 50asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
189{ 51{
190 struct pt_regs *old_regs = set_irq_regs(regs); 52 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc;
191 54
192 /* high bit used in ret_from_ code */ 55 /* high bit used in ret_from_ code */
193 unsigned vector = ~regs->orig_ax; 56 unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
201 stack_overflow_check(regs); 64 stack_overflow_check(regs);
202#endif 65#endif
203 66
204 if (likely(irq < NR_IRQS)) 67 desc = irq_to_desc(irq);
205 generic_handle_irq(irq); 68 if (likely(desc))
69 generic_handle_irq_desc(irq, desc);
206 else { 70 else {
207 if (!disable_apic) 71 if (!disable_apic)
208 ack_APIC_irq(); 72 ack_APIC_irq();
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map)
223{ 87{
224 unsigned int irq; 88 unsigned int irq;
225 static int warned; 89 static int warned;
90 struct irq_desc *desc;
226 91
227 for (irq = 0; irq < NR_IRQS; irq++) { 92 for_each_irq_desc(irq, desc) {
228 cpumask_t mask; 93 cpumask_t mask;
229 int break_affinity = 0; 94 int break_affinity = 0;
230 int set_affinity = 1; 95 int set_affinity = 1;
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map)
233 continue; 98 continue;
234 99
235 /* interrupt's are disabled at this point */ 100 /* interrupt's are disabled at this point */
236 spin_lock(&irq_desc[irq].lock); 101 spin_lock(&desc->lock);
237 102
238 if (!irq_has_action(irq) || 103 if (!irq_has_action(irq) ||
239 cpus_equal(irq_desc[irq].affinity, map)) { 104 cpus_equal(desc->affinity, map)) {
240 spin_unlock(&irq_desc[irq].lock); 105 spin_unlock(&desc->lock);
241 continue; 106 continue;
242 } 107 }
243 108
244 cpus_and(mask, irq_desc[irq].affinity, map); 109 cpus_and(mask, desc->affinity, map);
245 if (cpus_empty(mask)) { 110 if (cpus_empty(mask)) {
246 break_affinity = 1; 111 break_affinity = 1;
247 mask = map; 112 mask = map;
248 } 113 }
249 114
250 if (irq_desc[irq].chip->mask) 115 if (desc->chip->mask)
251 irq_desc[irq].chip->mask(irq); 116 desc->chip->mask(irq);
252 117
253 if (irq_desc[irq].chip->set_affinity) 118 if (desc->chip->set_affinity)
254 irq_desc[irq].chip->set_affinity(irq, mask); 119 desc->chip->set_affinity(irq, mask);
255 else if (!(warned++)) 120 else if (!(warned++))
256 set_affinity = 0; 121 set_affinity = 0;
257 122
258 if (irq_desc[irq].chip->unmask) 123 if (desc->chip->unmask)
259 irq_desc[irq].chip->unmask(irq); 124 desc->chip->unmask(irq);
260 125
261 spin_unlock(&irq_desc[irq].lock); 126 spin_unlock(&desc->lock);
262 127
263 if (break_affinity && set_affinity) 128 if (break_affinity && set_affinity)
264 printk("Broke affinity for irq %i\n", irq); 129 printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 9200a1e2752d..845aa9803e80 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,6 +69,13 @@ void __init init_ISA_irqs (void)
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < 16; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i);
74
75 desc->status = IRQ_DISABLED;
76 desc->action = NULL;
77 desc->depth = 1;
78
72 set_irq_chip_and_handler_name(i, &i8259A_chip, 79 set_irq_chip_and_handler_name(i, &i8259A_chip,
73 handle_level_irq, "XT"); 80 handle_level_irq, "XT");
74 } 81 }
@@ -83,6 +90,27 @@ static struct irqaction irq2 = {
83 .name = "cascade", 90 .name = "cascade",
84}; 91};
85 92
93DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
94 [0 ... IRQ0_VECTOR - 1] = -1,
95 [IRQ0_VECTOR] = 0,
96 [IRQ1_VECTOR] = 1,
97 [IRQ2_VECTOR] = 2,
98 [IRQ3_VECTOR] = 3,
99 [IRQ4_VECTOR] = 4,
100 [IRQ5_VECTOR] = 5,
101 [IRQ6_VECTOR] = 6,
102 [IRQ7_VECTOR] = 7,
103 [IRQ8_VECTOR] = 8,
104 [IRQ9_VECTOR] = 9,
105 [IRQ10_VECTOR] = 10,
106 [IRQ11_VECTOR] = 11,
107 [IRQ12_VECTOR] = 12,
108 [IRQ13_VECTOR] = 13,
109 [IRQ14_VECTOR] = 14,
110 [IRQ15_VECTOR] = 15,
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112};
113
86/* Overridden in paravirt.c */ 114/* Overridden in paravirt.c */
87void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
88 116
@@ -98,22 +126,14 @@ void __init native_init_IRQ(void)
98 * us. (some of these will be overridden and become 126 * us. (some of these will be overridden and become
99 * 'special' SMP interrupts) 127 * 'special' SMP interrupts)
100 */ 128 */
101 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
102 int vector = FIRST_EXTERNAL_VECTOR + i;
103 if (i >= NR_IRQS)
104 break;
105 /* SYSCALL_VECTOR was reserved in trap_init. */ 130 /* SYSCALL_VECTOR was reserved in trap_init. */
106 if (!test_bit(vector, used_vectors)) 131 if (i != SYSCALL_VECTOR)
107 set_intr_gate(vector, interrupt[i]); 132 set_intr_gate(i, interrupt[i]);
108 } 133 }
109 134
110#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
111 /*
112 * IRQ0 must be given a fixed assignment and initialized,
113 * because it's used before the IO-APIC is set up.
114 */
115 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
116 135
136#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
117 /* 137 /*
118 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 138 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
119 * IPI, driven by wakeup. 139 * IPI, driven by wakeup.
@@ -128,6 +148,9 @@ void __init native_init_IRQ(void)
128 148
129 /* IPI for single call function */ 149 /* IPI for single call function */
130 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); 150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
151
152 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
131#endif 154#endif
132 155
133#ifdef CONFIG_X86_LOCAL_APIC 156#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1f26fd9ec4f4..ff0235391285 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -135,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136}; 136};
137 137
138static void __init init_ISA_irqs (void) 138void __init init_ISA_irqs(void)
139{ 139{
140 int i; 140 int i;
141 141
142 init_bsp_APIC(); 142 init_bsp_APIC();
143 init_8259A(0); 143 init_8259A(0);
144 144
145 for (i = 0; i < NR_IRQS; i++) { 145 for (i = 0; i < 16; i++) {
146 irq_desc[i].status = IRQ_DISABLED; 146 /* first time call this irq_desc */
147 irq_desc[i].action = NULL; 147 struct irq_desc *desc = irq_to_desc(i);
148 irq_desc[i].depth = 1;
149 148
150 if (i < 16) { 149 desc->status = IRQ_DISABLED;
151 /* 150 desc->action = NULL;
152 * 16 old-style INTA-cycle interrupts: 151 desc->depth = 1;
153 */ 152
154 set_irq_chip_and_handler_name(i, &i8259A_chip, 153 /*
154 * 16 old-style INTA-cycle interrupts:
155 */
156 set_irq_chip_and_handler_name(i, &i8259A_chip,
155 handle_level_irq, "XT"); 157 handle_level_irq, "XT");
156 } else {
157 /*
158 * 'high' PCI IRQs filled in on demand
159 */
160 irq_desc[i].chip = &no_irq_chip;
161 }
162 } 158 }
163} 159}
164 160
165void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 161void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
166 162
167void __init native_init_IRQ(void) 163static void __init smp_intr_init(void)
168{ 164{
169 int i;
170
171 init_ISA_irqs();
172 /*
173 * Cover the whole vector space, no vector can escape
174 * us. (some of these will be overridden and become
175 * 'special' SMP interrupts)
176 */
177 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
178 int vector = FIRST_EXTERNAL_VECTOR + i;
179 if (vector != IA32_SYSCALL_VECTOR)
180 set_intr_gate(vector, interrupt[i]);
181 }
182
183#ifdef CONFIG_SMP 165#ifdef CONFIG_SMP
184 /* 166 /*
185 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 167 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -207,6 +189,12 @@ void __init native_init_IRQ(void)
207 /* Low priority IPI to cleanup after moving an irq */ 189 /* Low priority IPI to cleanup after moving an irq */
208 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 190 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
209#endif 191#endif
192}
193
194static void __init apic_intr_init(void)
195{
196 smp_intr_init();
197
210 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 198 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
211 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 199 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
212 200
@@ -216,6 +204,25 @@ void __init native_init_IRQ(void)
216 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
217 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
218 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207}
208
209void __init native_init_IRQ(void)
210{
211 int i;
212
213 init_ISA_irqs();
214 /*
215 * Cover the whole vector space, no vector can escape
216 * us. (some of these will be overridden and become
217 * 'special' SMP interrupts)
218 */
219 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
220 int vector = FIRST_EXTERNAL_VECTOR + i;
221 if (vector != IA32_SYSCALL_VECTOR)
222 set_intr_gate(vector, interrupt[i]);
223 }
224
225 apic_intr_init();
219 226
220 if (!acpi_ioapic) 227 if (!acpi_ioapic)
221 setup_irq(2, &irq2); 228 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca91..774ac4991568 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
78 return ret; 78 return ret;
79} 79}
80 80
81/*
82 * If we don't do that, there is the possibility that the guest
83 * will calibrate under heavy load - thus, getting a lower lpj -
84 * and execute the delays themselves without load. This is wrong,
85 * because no delay loop can finish beforehand.
86 * Any heuristics is subject to fail, because ultimately, a large
87 * poll of guests can be running and trouble each other. So we preset
88 * lpj here
89 */
90static unsigned long kvm_get_tsc_khz(void)
91{
92 return preset_lpj;
93}
94
95static void kvm_get_preset_lpj(void)
96{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz;
99 u64 lpj;
100
101 src = &per_cpu(hv_clock, 0);
102 khz = pvclock_tsc_khz(src);
103
104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ);
106 preset_lpj = lpj;
107}
108
81static struct clocksource kvm_clock = { 109static struct clocksource kvm_clock = {
82 .name = "kvm-clock", 110 .name = "kvm-clock",
83 .read = kvm_clock_read, 111 .read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
153 pv_time_ops.get_wallclock = kvm_get_wallclock; 181 pv_time_ops.get_wallclock = kvm_get_wallclock;
154 pv_time_ops.set_wallclock = kvm_set_wallclock; 182 pv_time_ops.set_wallclock = kvm_set_wallclock;
155 pv_time_ops.sched_clock = kvm_clock_read; 183 pv_time_ops.sched_clock = kvm_clock_read;
184 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
156#ifdef CONFIG_X86_LOCAL_APIC 185#ifdef CONFIG_X86_LOCAL_APIC
157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 186 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
158#endif 187#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
163#ifdef CONFIG_KEXEC 192#ifdef CONFIG_KEXEC
164 machine_ops.crash_shutdown = kvm_crash_shutdown; 193 machine_ops.crash_shutdown = kvm_crash_shutdown;
165#endif 194#endif
195 kvm_get_preset_lpj();
166 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
167 } 197 }
168} 198}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 2e2af5d18191..82a7c7ed6d45 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu)
163{ 163{
164 struct device *dev; 164 struct device *dev;
165 165
166 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 166 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
167 NULL, "msr%d", cpu); 167 "msr%d", cpu);
168 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 168 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
169} 169}
170 170
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 080d1d27f37a..e1e731d78f38 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
217 217
218#endif /* CONFIG_IOMMU_DEBUG */ 218#endif /* CONFIG_IOMMU_DEBUG */
219 219
220static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
221{
222 unsigned int npages;
223
224 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
225 npages >>= PAGE_SHIFT;
226
227 return npages;
228}
229
230static inline int translation_enabled(struct iommu_table *tbl) 220static inline int translation_enabled(struct iommu_table *tbl)
231{ 221{
232 /* only PHBs with translation enabled have an IOMMU table */ 222 /* only PHBs with translation enabled have an IOMMU table */
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev,
408 if (dmalen == 0) 398 if (dmalen == 0)
409 break; 399 break;
410 400
411 npages = num_dma_pages(dma, dmalen); 401 npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
412 iommu_free(tbl, dma, npages); 402 iommu_free(tbl, dma, npages);
413 } 403 }
414} 404}
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
427 BUG_ON(!sg_page(s)); 417 BUG_ON(!sg_page(s));
428 418
429 vaddr = (unsigned long) sg_virt(s); 419 vaddr = (unsigned long) sg_virt(s);
430 npages = num_dma_pages(vaddr, s->length); 420 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
431 421
432 entry = iommu_range_alloc(dev, tbl, npages); 422 entry = iommu_range_alloc(dev, tbl, npages);
433 if (entry == bad_dma_address) { 423 if (entry == bad_dma_address) {
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
464 struct iommu_table *tbl = find_iommu_table(dev); 454 struct iommu_table *tbl = find_iommu_table(dev);
465 455
466 uaddr = (unsigned long)vaddr; 456 uaddr = (unsigned long)vaddr;
467 npages = num_dma_pages(uaddr, size); 457 npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
468 458
469 return iommu_alloc(dev, tbl, vaddr, npages, direction); 459 return iommu_alloc(dev, tbl, vaddr, npages, direction);
470} 460}
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
475 struct iommu_table *tbl = find_iommu_table(dev); 465 struct iommu_table *tbl = find_iommu_table(dev);
476 unsigned int npages; 466 unsigned int npages;
477 467
478 npages = num_dma_pages(dma_handle, size); 468 npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
479 iommu_free(tbl, dma_handle, npages); 469 iommu_free(tbl, dma_handle, npages);
480} 470}
481 471
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 0a3824e837b4..1972266e8ba5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -9,8 +9,6 @@
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12static int forbid_dac __read_mostly;
13
14struct dma_mapping_ops *dma_ops; 12struct dma_mapping_ops *dma_ops;
15EXPORT_SYMBOL(dma_ops); 13EXPORT_SYMBOL(dma_ops);
16 14
@@ -125,13 +123,13 @@ void __init pci_iommu_alloc(void)
125 pci_swiotlb_init(); 123 pci_swiotlb_init();
126} 124}
127 125
128unsigned long iommu_num_pages(unsigned long addr, unsigned long len) 126unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
129{ 127{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); 128 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131 129
132 return size >> PAGE_SHIFT; 130 return size >> PAGE_SHIFT;
133} 131}
134EXPORT_SYMBOL(iommu_num_pages); 132EXPORT_SYMBOL(iommu_nr_pages);
135#endif 133#endif
136 134
137void *dma_generic_alloc_coherent(struct device *dev, size_t size, 135void *dma_generic_alloc_coherent(struct device *dev, size_t size,
@@ -293,17 +291,3 @@ void pci_iommu_shutdown(void)
293} 291}
294/* Must execute after PCI subsystem */ 292/* Must execute after PCI subsystem */
295fs_initcall(pci_iommu_init); 293fs_initcall(pci_iommu_init);
296
297#ifdef CONFIG_PCI
298/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
299
300static __devinit void via_no_dac(struct pci_dev *dev)
301{
302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
303 printk(KERN_INFO "PCI: VIA PCI bridge detected."
304 "Disabling DAC.\n");
305 forbid_dac = 1;
306 }
307}
308DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
309#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 145f1c83369f..e3f75bbcedea 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
232 size_t size, int dir, unsigned long align_mask) 232 size_t size, int dir, unsigned long align_mask)
233{ 233{
234 unsigned long npages = iommu_num_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
236 int i; 236 int i;
237 237
@@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
285 return; 285 return;
286 286
287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
288 npages = iommu_num_pages(dma_addr, size); 288 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
289 for (i = 0; i < npages; i++) { 289 for (i = 0; i < npages; i++) {
290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
291 CLEAR_LEAK(iommu_page + i); 291 CLEAR_LEAK(iommu_page + i);
@@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
368 } 368 }
369 369
370 addr = phys_addr; 370 addr = phys_addr;
371 pages = iommu_num_pages(s->offset, s->length); 371 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
372 while (pages--) { 372 while (pages--) {
373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
374 SET_LEAK(iommu_page); 374 SET_LEAK(iommu_page);
@@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
451 451
452 seg_size += s->length; 452 seg_size += s->length;
453 need = nextneed; 453 need = nextneed;
454 pages += iommu_num_pages(s->offset, s->length); 454 pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
455 ps = s; 455 ps = s;
456 } 456 }
457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 922c14058f97..0a1302fe6d45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -123,7 +123,7 @@ void cpu_idle(void)
123 } 123 }
124} 124}
125 125
126void __show_registers(struct pt_regs *regs, int all) 126void __show_regs(struct pt_regs *regs, int all)
127{ 127{
128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
129 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
@@ -189,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
189 189
190void show_regs(struct pt_regs *regs) 190void show_regs(struct pt_regs *regs)
191{ 191{
192 __show_registers(regs, 1); 192 __show_regs(regs, 1);
193 show_trace(NULL, regs, &regs->sp, regs->bp); 193 show_trace(NULL, regs, &regs->sp, regs->bp);
194} 194}
195 195
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca80394ef5b8..cd8c0ed02b7e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -136,7 +136,7 @@ void cpu_idle(void)
136} 136}
137 137
138/* Prints also some state that isn't saved in the pt_regs */ 138/* Prints also some state that isn't saved in the pt_regs */
139void __show_regs(struct pt_regs *regs) 139void __show_regs(struct pt_regs *regs, int all)
140{ 140{
141 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 141 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
142 unsigned long d0, d1, d2, d3, d6, d7; 142 unsigned long d0, d1, d2, d3, d6, d7;
@@ -175,6 +175,9 @@ void __show_regs(struct pt_regs *regs)
175 rdmsrl(MSR_GS_BASE, gs); 175 rdmsrl(MSR_GS_BASE, gs);
176 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 176 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
177 177
178 if (!all)
179 return;
180
178 cr0 = read_cr0(); 181 cr0 = read_cr0();
179 cr2 = read_cr2(); 182 cr2 = read_cr2();
180 cr3 = read_cr3(); 183 cr3 = read_cr3();
@@ -200,7 +203,7 @@ void __show_regs(struct pt_regs *regs)
200void show_regs(struct pt_regs *regs) 203void show_regs(struct pt_regs *regs)
201{ 204{
202 printk(KERN_INFO "CPU %d:", smp_processor_id()); 205 printk(KERN_INFO "CPU %d:", smp_processor_id());
203 __show_regs(regs); 206 __show_regs(regs, 1);
204 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 207 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
205} 208}
206 209
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325a..4f9c55f3a7c0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
97 return dst->version; 97 return dst->version;
98} 98}
99 99
100unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
101{
102 u64 pv_tsc_khz = 1000000ULL << 32;
103
104 do_div(pv_tsc_khz, src->tsc_to_system_mul);
105 if (src->tsc_shift < 0)
106 pv_tsc_khz <<= -src->tsc_shift;
107 else
108 pv_tsc_khz >>= src->tsc_shift;
109 return pv_tsc_khz;
110}
111
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{ 113{
102 struct pvclock_shadow_time shadow; 114 struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d13858818100..67465ed89310 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
35 if (!(word & (1 << 13))) { 35 if (!(word & (1 << 13))) {
36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " 36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
37 "disabling irq balancing and affinity\n"); 37 "disabling irq balancing and affinity\n");
38#ifdef CONFIG_IRQBALANCE
39 irqbalance_disable("");
40#endif
41 noirqdebug_setup(""); 38 noirqdebug_setup("");
42#ifdef CONFIG_PROC_FS 39#ifdef CONFIG_PROC_FS
43 no_irq_affinity = 1; 40 no_irq_affinity = 1;
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void)
354 printk(KERN_DEBUG "Force enabled HPET at resume\n"); 351 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355} 352}
356 353
354static u32 ati_ixp4x0_rev(struct pci_dev *dev)
355{
356 u32 d;
357 u8 b;
358
359 pci_read_config_byte(dev, 0xac, &b);
360 b &= ~(1<<5);
361 pci_write_config_byte(dev, 0xac, b);
362 pci_read_config_dword(dev, 0x70, &d);
363 d |= 1<<8;
364 pci_write_config_dword(dev, 0x70, d);
365 pci_read_config_dword(dev, 0x8, &d);
366 d &= 0xff;
367 dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
368 return d;
369}
370
357static void ati_force_enable_hpet(struct pci_dev *dev) 371static void ati_force_enable_hpet(struct pci_dev *dev)
358{ 372{
359 u32 uninitialized_var(val); 373 u32 d, val;
374 u8 b;
360 375
361 if (hpet_address || force_hpet_address) 376 if (hpet_address || force_hpet_address)
362 return; 377 return;
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
366 return; 381 return;
367 } 382 }
368 383
384 d = ati_ixp4x0_rev(dev);
385 if (d < 0x82)
386 return;
387
388 /* base address */
369 pci_write_config_dword(dev, 0x14, 0xfed00000); 389 pci_write_config_dword(dev, 0x14, 0xfed00000);
370 pci_read_config_dword(dev, 0x14, &val); 390 pci_read_config_dword(dev, 0x14, &val);
391
392 /* enable interrupt */
393 outb(0x72, 0xcd6); b = inb(0xcd7);
394 b |= 0x1;
395 outb(0x72, 0xcd6); outb(b, 0xcd7);
396 outb(0x72, 0xcd6); b = inb(0xcd7);
397 if (!(b & 0x1))
398 return;
399 pci_read_config_dword(dev, 0x64, &d);
400 d |= (1<<10);
401 pci_write_config_dword(dev, 0x64, d);
402 pci_read_config_dword(dev, 0x64, &d);
403 if (!(d & (1<<10)))
404 return;
405
371 force_hpet_address = val; 406 force_hpet_address = val;
372 force_hpet_resume_type = ATI_FORCE_HPET_RESUME; 407 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
373 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", 408 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
374 force_hpet_address); 409 force_hpet_address);
375 cached_dev = dev; 410 cached_dev = dev;
376 return;
377} 411}
378DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, 412DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
379 ati_force_enable_hpet); 413 ati_force_enable_hpet);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b8..dd6f2b71561b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
52 52
53 cmos_minutes = CMOS_READ(RTC_MINUTES); 53 cmos_minutes = CMOS_READ(RTC_MINUTES);
54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
55 BCD_TO_BIN(cmos_minutes); 55 cmos_minutes = bcd2bin(cmos_minutes);
56 56
57 /* 57 /*
58 * since we're only adjusting minutes and seconds, 58 * since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
69 69
70 if (abs(real_minutes - cmos_minutes) < 30) { 70 if (abs(real_minutes - cmos_minutes) < 30) {
71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { 71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
72 BIN_TO_BCD(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 BIN_TO_BCD(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds,RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); 124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
125 125
126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { 126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec); 127 sec = bcd2bin(sec);
128 BCD_TO_BIN(min); 128 min = bcd2bin(min);
129 BCD_TO_BIN(hour); 129 hour = bcd2bin(hour);
130 BCD_TO_BIN(day); 130 day = bcd2bin(day);
131 BCD_TO_BIN(mon); 131 mon = bcd2bin(mon);
132 BCD_TO_BIN(year); 132 year = bcd2bin(year);
133 } 133 }
134 134
135 if (century) { 135 if (century) {
136 BCD_TO_BIN(century); 136 century = bcd2bin(century);
137 year += century * 100; 137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); 138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else 139 } else
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
223static __init int add_rtc_cmos(void) 223static __init int add_rtc_cmos(void)
224{ 224{
225#ifdef CONFIG_PNP 225#ifdef CONFIG_PNP
226 if (!pnp_platform_devices) 226 static const char *ids[] __initconst =
227 platform_device_register(&rtc_device); 227 { "PNP0b00", "PNP0b01", "PNP0b02", };
228#else 228 struct pnp_dev *dev;
229 struct pnp_id *id;
230 int i;
231
232 pnp_for_each_dev(dev) {
233 for (id = dev->id; id; id = id->next) {
234 for (i = 0; i < ARRAY_SIZE(ids); i++) {
235 if (compare_pnp_id(id, ids[i]) != 0)
236 return 0;
237 }
238 }
239 }
240#endif
241
229 platform_device_register(&rtc_device); 242 platform_device_register(&rtc_device);
230#endif /* CONFIG_PNP */ 243 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n");
231 return 0; 245 return 0;
232} 246}
233device_initcall(add_rtc_cmos); 247device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 21b8e0a59780..0fa6790c1dd3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -302,7 +302,7 @@ static void __init relocate_initrd(void)
302 if (clen > MAX_MAP_CHUNK-slop) 302 if (clen > MAX_MAP_CHUNK-slop)
303 clen = MAX_MAP_CHUNK-slop; 303 clen = MAX_MAP_CHUNK-slop;
304 mapaddr = ramdisk_image & PAGE_MASK; 304 mapaddr = ramdisk_image & PAGE_MASK;
305 p = early_ioremap(mapaddr, clen+slop); 305 p = early_memremap(mapaddr, clen+slop);
306 memcpy(q, p+slop, clen); 306 memcpy(q, p+slop, clen);
307 early_iounmap(p, clen+slop); 307 early_iounmap(p, clen+slop);
308 q += clen; 308 q += clen;
@@ -379,7 +379,7 @@ static void __init parse_setup_data(void)
379 return; 379 return;
380 pa_data = boot_params.hdr.setup_data; 380 pa_data = boot_params.hdr.setup_data;
381 while (pa_data) { 381 while (pa_data) {
382 data = early_ioremap(pa_data, PAGE_SIZE); 382 data = early_memremap(pa_data, PAGE_SIZE);
383 switch (data->type) { 383 switch (data->type) {
384 case SETUP_E820_EXT: 384 case SETUP_E820_EXT:
385 parse_e820_ext(data, pa_data); 385 parse_e820_ext(data, pa_data);
@@ -402,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
402 return; 402 return;
403 pa_data = boot_params.hdr.setup_data; 403 pa_data = boot_params.hdr.setup_data;
404 while (pa_data) { 404 while (pa_data) {
405 data = early_ioremap(pa_data, sizeof(*data)); 405 data = early_memremap(pa_data, sizeof(*data));
406 e820_update_range(pa_data, sizeof(*data)+data->len, 406 e820_update_range(pa_data, sizeof(*data)+data->len,
407 E820_RAM, E820_RESERVED_KERN); 407 E820_RAM, E820_RESERVED_KERN);
408 found = 1; 408 found = 1;
@@ -428,7 +428,7 @@ static void __init reserve_early_setup_data(void)
428 return; 428 return;
429 pa_data = boot_params.hdr.setup_data; 429 pa_data = boot_params.hdr.setup_data;
430 while (pa_data) { 430 while (pa_data) {
431 data = early_ioremap(pa_data, sizeof(*data)); 431 data = early_memremap(pa_data, sizeof(*data));
432 sprintf(buf, "setup data %x", data->type); 432 sprintf(buf, "setup data %x", data->type);
433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
434 pa_data = data->next; 434 pa_data = data->next;
@@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
561 561
562} 562}
563 563
564#ifdef CONFIG_PROC_VMCORE 564/*
565 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
566 * is_kdump_kernel() to determine if we are booting after a panic. Hence
567 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
568 */
569
570#ifdef CONFIG_CRASH_DUMP
565/* elfcorehdr= specifies the location of elf core header 571/* elfcorehdr= specifies the location of elf core header
566 * stored by the crashed kernel. This option will be passed 572 * stored by the crashed kernel. This option will be passed
567 * by kexec loader to the capture kernel. 573 * by kexec loader to the capture kernel.
@@ -998,6 +1004,8 @@ void __init setup_arch(char **cmdline_p)
998 */ 1004 */
999 acpi_boot_table_init(); 1005 acpi_boot_table_init();
1000 1006
1007 early_acpi_boot_init();
1008
1001#ifdef CONFIG_ACPI_NUMA 1009#ifdef CONFIG_ACPI_NUMA
1002 /* 1010 /*
1003 * Parse SRAT to discover nodes. 1011 * Parse SRAT to discover nodes.
@@ -1065,6 +1073,7 @@ void __init setup_arch(char **cmdline_p)
1065#endif 1073#endif
1066 1074
1067 prefill_possible_map(); 1075 prefill_possible_map();
1076
1068#ifdef CONFIG_X86_64 1077#ifdef CONFIG_X86_64
1069 init_cpu_to_node(); 1078 init_cpu_to_node();
1070#endif 1079#endif
@@ -1072,6 +1081,9 @@ void __init setup_arch(char **cmdline_p)
1072 init_apic_mappings(); 1081 init_apic_mappings();
1073 ioapic_init_mappings(); 1082 ioapic_init_mappings();
1074 1083
1084 /* need to wait for io_apic is mapped */
1085 nr_irqs = probe_nr_irqs();
1086
1075 kvm_guest_init(); 1087 kvm_guest_init();
1076 1088
1077 e820_reserve_resources(); 1089 e820_reserve_resources();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0e67f72d9316..410c88f0bfeb 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,25 +140,30 @@ static void __init setup_cpu_pda_map(void)
140 */ 140 */
141void __init setup_per_cpu_areas(void) 141void __init setup_per_cpu_areas(void)
142{ 142{
143 ssize_t size = PERCPU_ENOUGH_ROOM; 143 ssize_t size, old_size;
144 char *ptr; 144 char *ptr;
145 int cpu; 145 int cpu;
146 unsigned long align = 1;
146 147
147 /* Setup cpu_pda map */ 148 /* Setup cpu_pda map */
148 setup_cpu_pda_map(); 149 setup_cpu_pda_map();
149 150
150 /* Copy section for each CPU (we discard the original) */ 151 /* Copy section for each CPU (we discard the original) */
151 size = PERCPU_ENOUGH_ROOM; 152 old_size = PERCPU_ENOUGH_ROOM;
153 align = max_t(unsigned long, PAGE_SIZE, align);
154 size = roundup(old_size, align);
152 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", 155 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
153 size); 156 size);
154 157
155 for_each_possible_cpu(cpu) { 158 for_each_possible_cpu(cpu) {
156#ifndef CONFIG_NEED_MULTIPLE_NODES 159#ifndef CONFIG_NEED_MULTIPLE_NODES
157 ptr = alloc_bootmem_pages(size); 160 ptr = __alloc_bootmem(size, align,
161 __pa(MAX_DMA_ADDRESS));
158#else 162#else
159 int node = early_cpu_to_node(cpu); 163 int node = early_cpu_to_node(cpu);
160 if (!node_online(node) || !NODE_DATA(node)) { 164 if (!node_online(node) || !NODE_DATA(node)) {
161 ptr = alloc_bootmem_pages(size); 165 ptr = __alloc_bootmem(size, align,
166 __pa(MAX_DMA_ADDRESS));
162 printk(KERN_INFO 167 printk(KERN_INFO
163 "cpu %d has no node %d or node-local memory\n", 168 "cpu %d has no node %d or node-local memory\n",
164 cpu, node); 169 cpu, node);
@@ -167,7 +172,8 @@ void __init setup_per_cpu_areas(void)
167 cpu, __pa(ptr)); 172 cpu, __pa(ptr));
168 } 173 }
169 else { 174 else {
170 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); 175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
176 __pa(MAX_DMA_ADDRESS));
171 if (ptr) 177 if (ptr)
172 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", 178 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
173 cpu, node, __pa(ptr)); 179 cpu, node, __pa(ptr));
@@ -175,7 +181,6 @@ void __init setup_per_cpu_areas(void)
175#endif 181#endif
176 per_cpu_offset(cpu) = ptr - __per_cpu_start; 182 per_cpu_offset(cpu) = ptr - __per_cpu_start;
177 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 183 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
178
179 } 184 }
180 185
181 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", 186 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 76b6f50978f7..7ece815ea637 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -282,6 +282,8 @@ static void __cpuinit smp_callin(void)
282 cpu_set(cpuid, cpu_callin_map); 282 cpu_set(cpuid, cpu_callin_map);
283} 283}
284 284
285static int __cpuinitdata unsafe_smp;
286
285/* 287/*
286 * Activate a secondary processor. 288 * Activate a secondary processor.
287 */ 289 */
@@ -334,14 +336,17 @@ static void __cpuinit start_secondary(void *unused)
334 * does not change while we are assigning vectors to cpus. Holding 336 * does not change while we are assigning vectors to cpus. Holding
335 * this lock ensures we don't half assign or remove an irq from a cpu. 337 * this lock ensures we don't half assign or remove an irq from a cpu.
336 */ 338 */
337 ipi_call_lock_irq(); 339 ipi_call_lock();
338 lock_vector_lock(); 340 lock_vector_lock();
339 __setup_vector_irq(smp_processor_id()); 341 __setup_vector_irq(smp_processor_id());
340 cpu_set(smp_processor_id(), cpu_online_map); 342 cpu_set(smp_processor_id(), cpu_online_map);
341 unlock_vector_lock(); 343 unlock_vector_lock();
342 ipi_call_unlock_irq(); 344 ipi_call_unlock();
343 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 345 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
344 346
347 /* enable local interrupts */
348 local_irq_enable();
349
345 setup_secondary_clock(); 350 setup_secondary_clock();
346 351
347 wmb(); 352 wmb();
@@ -394,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
394 goto valid_k7; 399 goto valid_k7;
395 400
396 /* If we get here, not a certified SMP capable AMD system. */ 401 /* If we get here, not a certified SMP capable AMD system. */
397 add_taint(TAINT_UNSAFE_SMP); 402 unsafe_smp = 1;
398 } 403 }
399 404
400valid_k7: 405valid_k7:
@@ -411,12 +416,10 @@ static void __cpuinit smp_checks(void)
411 * Don't taint if we are running SMP kernel on a single non-MP 416 * Don't taint if we are running SMP kernel on a single non-MP
412 * approved Athlon 417 * approved Athlon
413 */ 418 */
414 if (tainted & TAINT_UNSAFE_SMP) { 419 if (unsafe_smp && num_online_cpus() > 1) {
415 if (num_online_cpus()) 420 printk(KERN_INFO "WARNING: This combination of AMD"
416 printk(KERN_INFO "WARNING: This combination of AMD" 421 "processors is not suitable for SMP.\n");
417 "processors is not suitable for SMP.\n"); 422 add_taint(TAINT_UNSAFE_SMP);
418 else
419 tainted &= ~TAINT_UNSAFE_SMP;
420 } 423 }
421} 424}
422 425
@@ -540,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid)
540 int timeout; 543 int timeout;
541 u32 status; 544 u32 status;
542 545
543 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 546 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
544 547
545 for (i = 0; i < ARRAY_SIZE(regs); i++) { 548 for (i = 0; i < ARRAY_SIZE(regs); i++) {
546 printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); 549 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
547 550
548 /* 551 /*
549 * Wait for idle. 552 * Wait for idle.
@@ -596,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
596 * Give the other CPU some time to accept the IPI. 599 * Give the other CPU some time to accept the IPI.
597 */ 600 */
598 udelay(200); 601 udelay(200);
599 maxlvt = lapic_get_maxlvt(); 602 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
600 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 603 maxlvt = lapic_get_maxlvt();
601 apic_write(APIC_ESR, 0); 604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
602 accept_status = (apic_read(APIC_ESR) & 0xEF); 605 apic_write(APIC_ESR, 0);
606 accept_status = (apic_read(APIC_ESR) & 0xEF);
607 }
603 pr_debug("NMI sent.\n"); 608 pr_debug("NMI sent.\n");
604 609
605 if (send_status) 610 if (send_status)
@@ -869,7 +874,7 @@ do_rest:
869 start_ip = setup_trampoline(); 874 start_ip = setup_trampoline();
870 875
871 /* So we see what's up */ 876 /* So we see what's up */
872 printk(KERN_INFO "Booting processor %d/%d ip %lx\n", 877 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
873 cpu, apicid, start_ip); 878 cpu, apicid, start_ip);
874 879
875 /* 880 /*
@@ -1256,39 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1256 check_nmi_watchdog(); 1261 check_nmi_watchdog();
1257} 1262}
1258 1263
1259#ifdef CONFIG_HOTPLUG_CPU
1260
1261static void remove_siblinginfo(int cpu)
1262{
1263 int sibling;
1264 struct cpuinfo_x86 *c = &cpu_data(cpu);
1265
1266 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1267 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1268 /*/
1269 * last thread sibling in this cpu core going down
1270 */
1271 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1272 cpu_data(sibling).booted_cores--;
1273 }
1274
1275 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1276 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1277 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1278 cpus_clear(per_cpu(cpu_core_map, cpu));
1279 c->phys_proc_id = 0;
1280 c->cpu_core_id = 0;
1281 cpu_clear(cpu, cpu_sibling_setup_map);
1282}
1283
1284static int additional_cpus __initdata = -1;
1285
1286static __init int setup_additional_cpus(char *s)
1287{
1288 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1289}
1290early_param("additional_cpus", setup_additional_cpus);
1291
1292/* 1264/*
1293 * cpu_possible_map should be static, it cannot change as cpu's 1265 * cpu_possible_map should be static, it cannot change as cpu's
1294 * are onlined, or offlined. The reason is per-cpu data-structures 1266 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1308,21 +1280,13 @@ early_param("additional_cpus", setup_additional_cpus);
1308 */ 1280 */
1309__init void prefill_possible_map(void) 1281__init void prefill_possible_map(void)
1310{ 1282{
1311 int i; 1283 int i, possible;
1312 int possible;
1313 1284
1314 /* no processor from mptable or madt */ 1285 /* no processor from mptable or madt */
1315 if (!num_processors) 1286 if (!num_processors)
1316 num_processors = 1; 1287 num_processors = 1;
1317 1288
1318 if (additional_cpus == -1) { 1289 possible = num_processors + disabled_cpus;
1319 if (disabled_cpus > 0)
1320 additional_cpus = disabled_cpus;
1321 else
1322 additional_cpus = 0;
1323 }
1324
1325 possible = num_processors + additional_cpus;
1326 if (possible > NR_CPUS) 1290 if (possible > NR_CPUS)
1327 possible = NR_CPUS; 1291 possible = NR_CPUS;
1328 1292
@@ -1335,6 +1299,31 @@ __init void prefill_possible_map(void)
1335 nr_cpu_ids = possible; 1299 nr_cpu_ids = possible;
1336} 1300}
1337 1301
1302#ifdef CONFIG_HOTPLUG_CPU
1303
1304static void remove_siblinginfo(int cpu)
1305{
1306 int sibling;
1307 struct cpuinfo_x86 *c = &cpu_data(cpu);
1308
1309 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1310 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1311 /*/
1312 * last thread sibling in this cpu core going down
1313 */
1314 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1315 cpu_data(sibling).booted_cores--;
1316 }
1317
1318 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1319 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1320 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1321 cpus_clear(per_cpu(cpu_core_map, cpu));
1322 c->phys_proc_id = 0;
1323 c->cpu_core_id = 0;
1324 cpu_clear(cpu, cpu_sibling_setup_map);
1325}
1326
1338static void __ref remove_cpu_from_maps(int cpu) 1327static void __ref remove_cpu_from_maps(int cpu)
1339{ 1328{
1340 cpu_clear(cpu, cpu_online_map); 1329 cpu_clear(cpu, cpu_online_map);
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index bbecf8b6bf96..77b400f06ea2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -47,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
47 unsigned long pc = instruction_pointer(regs); 47 unsigned long pc = instruction_pointer(regs);
48 48
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && 50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
51 in_lock_functions(pc)) {
52#ifdef CONFIG_FRAME_POINTER 51#ifdef CONFIG_FRAME_POINTER
53 return *(unsigned long *)(regs->bp + 4); 52 return *(unsigned long *)(regs->bp + sizeof(long));
54#else 53#else
55 unsigned long *sp = (unsigned long *)&regs->sp; 54 unsigned long *sp = (unsigned long *)&regs->sp;
56 55
@@ -95,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
95 94
96 do_timer_interrupt_hook(); 95 do_timer_interrupt_hook();
97 96
97#ifdef CONFIG_MCA
98 if (MCA_bus) { 98 if (MCA_bus) {
99 /* The PS/2 uses level-triggered interrupts. You can't 99 /* The PS/2 uses level-triggered interrupts. You can't
100 turn them off, nor would you want to (any attempt to 100 turn them off, nor would you want to (any attempt to
@@ -108,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
108 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p( 0x61 ); /* read the current state */
109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
110 } 110 }
111#endif
111 112
112 return IRQ_HANDLED; 113 return IRQ_HANDLED;
113} 114}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af2..cb19d650c216 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h>
19 20
20#include <asm/i8253.h> 21#include <asm/i8253.h>
21#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
33 /* Assume the lock function has either no stack frame or a copy 34 /* Assume the lock function has either no stack frame or a copy
34 of flags from PUSHF 35 of flags from PUSHF
35 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
36 if (!user_mode(regs) && in_lock_functions(pc)) { 37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
37 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp = (unsigned long *)regs->sp;
38 if (sp[0] >> 22) 42 if (sp[0] >> 22)
39 return sp[0]; 43 return sp[0];
40 if (sp[1] >> 22) 44 if (sp[1] >> 22)
41 return sp[1]; 45 return sp[1];
46#endif
42 } 47 }
43 return pc; 48 return pc;
44} 49}
45EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
46 51
47static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 52irqreturn_t timer_interrupt(int irq, void *dev_id)
48{ 53{
49 add_pda(irq0_irqs, 1); 54 add_pda(irq0_irqs, 1);
50 55
51 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
52 57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
53 return IRQ_HANDLED; 65 return IRQ_HANDLED;
54} 66}
55 67
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
100} 112}
101 113
102static struct irqaction irq0 = { 114static struct irqaction irq0 = {
103 .handler = timer_event_interrupt, 115 .handler = timer_interrupt,
104 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 116 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
105 .mask = CPU_MASK_NONE, 117 .mask = CPU_MASK_NONE,
106 .name = "timer" 118 .name = "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
111 if (!hpet_enable()) 123 if (!hpet_enable())
112 setup_pit_timer(); 124 setup_pit_timer();
113 125
126 irq0.mask = cpumask_of_cpu(0);
114 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
115} 128}
116 129
117void __init time_init(void) 130void __init time_init(void)
118{ 131{
119 tsc_init(); 132 tsc_init();
120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
121 vgetcpu_mode = VGETCPU_RDTSCP;
122 else
123 vgetcpu_mode = VGETCPU_LSL;
124 133
125 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
126} 135}
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 0429c5de5ea9..e062974cce34 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
7 */ 7 */
8 8
9/* 9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * Handle hardware traps and faults.
11 * state in 'asm.s'.
12 */ 11 */
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16#include <linux/highmem.h>
17#include <linux/kprobes.h> 15#include <linux/kprobes.h>
18#include <linux/uaccess.h> 16#include <linux/uaccess.h>
19#include <linux/utsname.h> 17#include <linux/utsname.h>
@@ -32,6 +30,8 @@
32#include <linux/bug.h> 30#include <linux/bug.h>
33#include <linux/nmi.h> 31#include <linux/nmi.h>
34#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/smp.h>
34#include <linux/io.h>
35 35
36#ifdef CONFIG_EISA 36#ifdef CONFIG_EISA
37#include <linux/ioport.h> 37#include <linux/ioport.h>
@@ -46,21 +46,31 @@
46#include <linux/edac.h> 46#include <linux/edac.h>
47#endif 47#endif
48 48
49#include <asm/arch_hooks.h>
50#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
51#include <asm/processor.h> 50#include <asm/processor.h>
52#include <asm/debugreg.h> 51#include <asm/debugreg.h>
53#include <asm/atomic.h> 52#include <asm/atomic.h>
54#include <asm/system.h> 53#include <asm/system.h>
55#include <asm/unwind.h> 54#include <asm/unwind.h>
55#include <asm/traps.h>
56#include <asm/desc.h> 56#include <asm/desc.h>
57#include <asm/i387.h> 57#include <asm/i387.h>
58
59#include <mach_traps.h>
60
61#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h>
63#include <asm/proto.h>
64#include <asm/pda.h>
65#else
66#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h>
58#include <asm/nmi.h> 68#include <asm/nmi.h>
59#include <asm/smp.h> 69#include <asm/smp.h>
60#include <asm/io.h> 70#include <asm/io.h>
61#include <asm/traps.h> 71#include <asm/traps.h>
62 72
63#include "mach_traps.h" 73#include "cpu/mcheck/mce.h"
64 74
65DECLARE_BITMAP(used_vectors, NR_VECTORS); 75DECLARE_BITMAP(used_vectors, NR_VECTORS);
66EXPORT_SYMBOL_GPL(used_vectors); 76EXPORT_SYMBOL_GPL(used_vectors);
@@ -77,418 +87,104 @@ char ignore_fpu_irq;
77 */ 87 */
78gate_desc idt_table[256] 88gate_desc idt_table[256]
79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80
81int panic_on_unrecovered_nmi;
82int kstack_depth_to_print = 24;
83static unsigned int code_bytes = 64;
84static int ignore_nmis;
85static int die_counter;
86
87void printk_address(unsigned long address, int reliable)
88{
89#ifdef CONFIG_KALLSYMS
90 unsigned long offset = 0;
91 unsigned long symsize;
92 const char *symname;
93 char *modname;
94 char *delim = ":";
95 char namebuf[KSYM_NAME_LEN];
96 char reliab[4] = "";
97
98 symname = kallsyms_lookup(address, &symsize, &offset,
99 &modname, namebuf);
100 if (!symname) {
101 printk(" [<%08lx>]\n", address);
102 return;
103 }
104 if (!reliable)
105 strcpy(reliab, "? ");
106
107 if (!modname)
108 modname = delim = "";
109 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
110 address, reliab, delim, modname, delim, symname, offset, symsize);
111#else
112 printk(" [<%08lx>]\n", address);
113#endif 90#endif
114}
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size)
118{
119 void *t = tinfo;
120 return p > t && p <= t + THREAD_SIZE - size;
121}
122
123/* The form of the top of the frame on the stack */
124struct stack_frame {
125 struct stack_frame *next_frame;
126 unsigned long return_address;
127};
128
129static inline unsigned long
130print_context_stack(struct thread_info *tinfo,
131 unsigned long *stack, unsigned long bp,
132 const struct stacktrace_ops *ops, void *data)
133{
134 struct stack_frame *frame = (struct stack_frame *)bp;
135
136 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
137 unsigned long addr;
138
139 addr = *stack;
140 if (__kernel_text_address(addr)) {
141 if ((unsigned long) stack == bp + 4) {
142 ops->address(data, addr, 1);
143 frame = frame->next_frame;
144 bp = (unsigned long) frame;
145 } else {
146 ops->address(data, addr, bp == 0);
147 }
148 }
149 stack++;
150 }
151 return bp;
152}
153
154void dump_trace(struct task_struct *task, struct pt_regs *regs,
155 unsigned long *stack, unsigned long bp,
156 const struct stacktrace_ops *ops, void *data)
157{
158 if (!task)
159 task = current;
160
161 if (!stack) {
162 unsigned long dummy;
163 stack = &dummy;
164 if (task != current)
165 stack = (unsigned long *)task->thread.sp;
166 }
167
168#ifdef CONFIG_FRAME_POINTER
169 if (!bp) {
170 if (task == current) {
171 /* Grab bp right from our regs */
172 asm("movl %%ebp, %0" : "=r" (bp) :);
173 } else {
174 /* bp is the last reg pushed by switch_to */
175 bp = *(unsigned long *) task->thread.sp;
176 }
177 }
178#endif
179
180 for (;;) {
181 struct thread_info *context;
182
183 context = (struct thread_info *)
184 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
185 bp = print_context_stack(context, stack, bp, ops, data);
186 /*
187 * Should be after the line below, but somewhere
188 * in early boot context comes out corrupted and we
189 * can't reference it:
190 */
191 if (ops->stack(data, "IRQ") < 0)
192 break;
193 stack = (unsigned long *)context->previous_esp;
194 if (!stack)
195 break;
196 touch_nmi_watchdog();
197 }
198}
199EXPORT_SYMBOL(dump_trace);
200
201static void
202print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
203{
204 printk(data);
205 print_symbol(msg, symbol);
206 printk("\n");
207}
208
209static void print_trace_warning(void *data, char *msg)
210{
211 printk("%s%s\n", (char *)data, msg);
212}
213 91
214static int print_trace_stack(void *data, char *name) 92static int ignore_nmis;
215{
216 return 0;
217}
218
219/*
220 * Print one address/symbol entries per line.
221 */
222static void print_trace_address(void *data, unsigned long addr, int reliable)
223{
224 printk("%s [<%08lx>] ", (char *)data, addr);
225 if (!reliable)
226 printk("? ");
227 print_symbol("%s\n", addr);
228 touch_nmi_watchdog();
229}
230
231static const struct stacktrace_ops print_trace_ops = {
232 .warning = print_trace_warning,
233 .warning_symbol = print_trace_warning_symbol,
234 .stack = print_trace_stack,
235 .address = print_trace_address,
236};
237 93
238static void 94static inline void conditional_sti(struct pt_regs *regs)
239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 unsigned long *stack, unsigned long bp, char *log_lvl)
241{ 95{
242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 96 if (regs->flags & X86_EFLAGS_IF)
243 printk("%s =======================\n", log_lvl); 97 local_irq_enable();
244} 98}
245 99
246void show_trace(struct task_struct *task, struct pt_regs *regs, 100static inline void preempt_conditional_sti(struct pt_regs *regs)
247 unsigned long *stack, unsigned long bp)
248{ 101{
249 show_trace_log_lvl(task, regs, stack, bp, ""); 102 inc_preempt_count();
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_enable();
250} 105}
251 106
252static void 107static inline void preempt_conditional_cli(struct pt_regs *regs)
253show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
254 unsigned long *sp, unsigned long bp, char *log_lvl)
255{ 108{
256 unsigned long *stack; 109 if (regs->flags & X86_EFLAGS_IF)
257 int i; 110 local_irq_disable();
258 111 dec_preempt_count();
259 if (sp == NULL) {
260 if (task)
261 sp = (unsigned long *)task->thread.sp;
262 else
263 sp = (unsigned long *)&sp;
264 }
265
266 stack = sp;
267 for (i = 0; i < kstack_depth_to_print; i++) {
268 if (kstack_end(stack))
269 break;
270 if (i && ((i % 8) == 0))
271 printk("\n%s ", log_lvl);
272 printk("%08lx ", *stack++);
273 }
274 printk("\n%sCall Trace:\n", log_lvl);
275
276 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
277} 112}
278 113
279void show_stack(struct task_struct *task, unsigned long *sp) 114#ifdef CONFIG_X86_32
115static inline void
116die_if_kernel(const char *str, struct pt_regs *regs, long err)
280{ 117{
281 printk(" "); 118 if (!user_mode_vm(regs))
282 show_stack_log_lvl(task, NULL, sp, 0, ""); 119 die(str, regs, err);
283} 120}
284 121
285/* 122/*
286 * The architecture-independent dump_stack generator 123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
126 * we set the offset field correctly and return 1.
287 */ 127 */
288void dump_stack(void) 128static int lazy_iobitmap_copy(void)
289{ 129{
290 unsigned long bp = 0; 130 struct thread_struct *thread;
291 unsigned long stack; 131 struct tss_struct *tss;
292 132 int cpu;
293#ifdef CONFIG_FRAME_POINTER
294 if (!bp)
295 asm("movl %%ebp, %0" : "=r" (bp):);
296#endif
297
298 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
299 current->pid, current->comm, print_tainted(),
300 init_utsname()->release,
301 (int)strcspn(init_utsname()->version, " "),
302 init_utsname()->version);
303
304 show_trace(current, NULL, &stack, bp);
305}
306
307EXPORT_SYMBOL(dump_stack);
308
309void show_registers(struct pt_regs *regs)
310{
311 int i;
312 133
313 print_modules(); 134 cpu = get_cpu();
314 __show_registers(regs, 0); 135 tss = &per_cpu(init_tss, cpu);
136 thread = &current->thread;
315 137
316 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
317 TASK_COMM_LEN, current->comm, task_pid_nr(current), 139 thread->io_bitmap_ptr) {
318 current_thread_info(), current, task_thread_info(current)); 140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
319 /* 141 thread->io_bitmap_max);
320 * When in-kernel, we also print out the stack and code at the 142 /*
321 * time of the fault.. 143 * If the previously set map was extending to higher ports
322 */ 144 * than the current one, pad extra space with 0xff (no access).
323 if (!user_mode_vm(regs)) { 145 */
324 unsigned int code_prologue = code_bytes * 43 / 64; 146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
325 unsigned int code_len = code_bytes; 147 memset((char *) tss->io_bitmap +
326 unsigned char c; 148 thread->io_bitmap_max, 0xff,
327 u8 *ip; 149 tss->io_bitmap_max - thread->io_bitmap_max);
328
329 printk("\n" KERN_EMERG "Stack: ");
330 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
331
332 printk(KERN_EMERG "Code: ");
333
334 ip = (u8 *)regs->ip - code_prologue;
335 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
336 /* try starting at EIP */
337 ip = (u8 *)regs->ip;
338 code_len = code_len - code_prologue + 1;
339 }
340 for (i = 0; i < code_len; i++, ip++) {
341 if (ip < (u8 *)PAGE_OFFSET ||
342 probe_kernel_address(ip, c)) {
343 printk(" Bad EIP value.");
344 break;
345 }
346 if (ip == (u8 *)regs->ip)
347 printk("<%02x> ", c);
348 else
349 printk("%02x ", c);
350 } 150 }
351 } 151 tss->io_bitmap_max = thread->io_bitmap_max;
352 printk("\n"); 152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
353} 153 tss->io_bitmap_owner = thread;
354 154 put_cpu();
355int is_valid_bugaddr(unsigned long ip)
356{
357 unsigned short ud2;
358
359 if (ip < PAGE_OFFSET)
360 return 0;
361 if (probe_kernel_address((unsigned short *)ip, ud2))
362 return 0;
363
364 return ud2 == 0x0b0f;
365}
366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
416{
417 unsigned short ss;
418 unsigned long sp;
419 155
420 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
421#ifdef CONFIG_PREEMPT
422 printk("PREEMPT ");
423#endif
424#ifdef CONFIG_SMP
425 printk("SMP ");
426#endif
427#ifdef CONFIG_DEBUG_PAGEALLOC
428 printk("DEBUG_PAGEALLOC");
429#endif
430 printk("\n");
431 if (notify_die(DIE_OOPS, str, regs, err,
432 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
433 return 1; 156 return 1;
434
435 show_registers(regs);
436 /* Executive summary in case the oops scrolled away */
437 sp = (unsigned long) (&regs->sp);
438 savesegment(ss, ss);
439 if (user_mode(regs)) {
440 sp = regs->sp;
441 ss = regs->ss & 0xffff;
442 } 157 }
443 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 158 put_cpu();
444 print_symbol("%s", regs->ip);
445 printk(" SS:ESP %04x:%08lx\n", ss, sp);
446 return 0;
447}
448
449/*
450 * This is gone through when something in the kernel has done something bad
451 * and is about to be terminated:
452 */
453void die(const char *str, struct pt_regs *regs, long err)
454{
455 unsigned long flags = oops_begin();
456
457 if (die_nest_count < 3) {
458 report_bug(regs->ip, regs);
459
460 if (__die(str, regs, err))
461 regs = NULL;
462 } else {
463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
464 }
465
466 oops_end(flags, regs, SIGSEGV);
467}
468 159
469static inline void 160 return 0;
470die_if_kernel(const char *str, struct pt_regs *regs, long err)
471{
472 if (!user_mode_vm(regs))
473 die(str, regs, err);
474} 161}
162#endif
475 163
476static void __kprobes 164static void __kprobes
477do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, 165do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
478 long error_code, siginfo_t *info) 166 long error_code, siginfo_t *info)
479{ 167{
480 struct task_struct *tsk = current; 168 struct task_struct *tsk = current;
481 169
170#ifdef CONFIG_X86_32
482 if (regs->flags & X86_VM_MASK) { 171 if (regs->flags & X86_VM_MASK) {
483 if (vm86) 172 /*
173 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
174 * On nmi (interrupt 2), do_trap should not be called.
175 */
176 if (trapnr < 6)
484 goto vm86_trap; 177 goto vm86_trap;
485 goto trap_signal; 178 goto trap_signal;
486 } 179 }
180#endif
487 181
488 if (!user_mode(regs)) 182 if (!user_mode(regs))
489 goto kernel_trap; 183 goto kernel_trap;
490 184
185#ifdef CONFIG_X86_32
491trap_signal: 186trap_signal:
187#endif
492 /* 188 /*
493 * We want error_code and trap_no set for userspace faults and 189 * We want error_code and trap_no set for userspace faults and
494 * kernelspace faults which result in die(), but not 190 * kernelspace faults which result in die(), but not
@@ -501,6 +197,18 @@ trap_signal:
501 tsk->thread.error_code = error_code; 197 tsk->thread.error_code = error_code;
502 tsk->thread.trap_no = trapnr; 198 tsk->thread.trap_no = trapnr;
503 199
200#ifdef CONFIG_X86_64
201 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
202 printk_ratelimit()) {
203 printk(KERN_INFO
204 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
205 tsk->comm, tsk->pid, str,
206 regs->ip, regs->sp, error_code);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
210#endif
211
504 if (info) 212 if (info)
505 force_sig_info(signr, info, tsk); 213 force_sig_info(signr, info, tsk);
506 else 214 else
@@ -515,29 +223,29 @@ kernel_trap:
515 } 223 }
516 return; 224 return;
517 225
226#ifdef CONFIG_X86_32
518vm86_trap: 227vm86_trap:
519 if (handle_vm86_trap((struct kernel_vm86_regs *) regs, 228 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
520 error_code, trapnr)) 229 error_code, trapnr))
521 goto trap_signal; 230 goto trap_signal;
522 return; 231 return;
232#endif
523} 233}
524 234
525#define DO_ERROR(trapnr, signr, str, name) \ 235#define DO_ERROR(trapnr, signr, str, name) \
526void do_##name(struct pt_regs *regs, long error_code) \ 236dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
527{ \ 237{ \
528 trace_hardirqs_fixup(); \
529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 238 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
530 == NOTIFY_STOP) \ 239 == NOTIFY_STOP) \
531 return; \ 240 return; \
532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ 241 conditional_sti(regs); \
242 do_trap(trapnr, signr, str, regs, error_code, NULL); \
533} 243}
534 244
535#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ 245#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
536void do_##name(struct pt_regs *regs, long error_code) \ 246dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
537{ \ 247{ \
538 siginfo_t info; \ 248 siginfo_t info; \
539 if (irq) \
540 local_irq_enable(); \
541 info.si_signo = signr; \ 249 info.si_signo = signr; \
542 info.si_errno = 0; \ 250 info.si_errno = 0; \
543 info.si_code = sicode; \ 251 info.si_code = sicode; \
@@ -545,90 +253,68 @@ void do_##name(struct pt_regs *regs, long error_code) \
545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 253 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
546 == NOTIFY_STOP) \ 254 == NOTIFY_STOP) \
547 return; \ 255 return; \
548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 256 conditional_sti(regs); \
257 do_trap(trapnr, signr, str, regs, error_code, &info); \
549} 258}
550 259
551#define DO_VM86_ERROR(trapnr, signr, str, name) \ 260DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
552void do_##name(struct pt_regs *regs, long error_code) \ 261DO_ERROR(4, SIGSEGV, "overflow", overflow)
553{ \ 262DO_ERROR(5, SIGSEGV, "bounds", bounds)
554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 263DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
555 == NOTIFY_STOP) \
556 return; \
557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
558}
559
560#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
561void do_##name(struct pt_regs *regs, long error_code) \
562{ \
563 siginfo_t info; \
564 info.si_signo = signr; \
565 info.si_errno = 0; \
566 info.si_code = sicode; \
567 info.si_addr = (void __user *)siaddr; \
568 trace_hardirqs_fixup(); \
569 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
570 == NOTIFY_STOP) \
571 return; \
572 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
573}
574
575DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
576#ifndef CONFIG_KPROBES
577DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
578#endif
579DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
580DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
581DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
582DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 264DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
583DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 265DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
584DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 266DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
267#ifdef CONFIG_X86_32
585DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 268DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
586DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 269#endif
587DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 270DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
271
272#ifdef CONFIG_X86_64
273/* Runs on IST stack */
274dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
275{
276 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
277 12, SIGBUS) == NOTIFY_STOP)
278 return;
279 preempt_conditional_sti(regs);
280 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
281 preempt_conditional_cli(regs);
282}
283
284dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
285{
286 static const char str[] = "double fault";
287 struct task_struct *tsk = current;
288
289 /* Return not checked because double check cannot be ignored */
290 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
588 291
589void __kprobes 292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8;
294
295 /* This is always a kernel trap and never fixable (and thus must
296 never return). */
297 for (;;)
298 die(str, regs, error_code);
299}
300#endif
301
302dotraplinkage void __kprobes
590do_general_protection(struct pt_regs *regs, long error_code) 303do_general_protection(struct pt_regs *regs, long error_code)
591{ 304{
592 struct task_struct *tsk; 305 struct task_struct *tsk;
593 struct thread_struct *thread;
594 struct tss_struct *tss;
595 int cpu;
596 306
597 cpu = get_cpu(); 307 conditional_sti(regs);
598 tss = &per_cpu(init_tss, cpu);
599 thread = &current->thread;
600
601 /*
602 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
603 * invalid offset set (the LAZY one) and the faulting thread has
604 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
605 * and we set the offset field correctly. Then we let the CPU to
606 * restart the faulting instruction.
607 */
608 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
609 thread->io_bitmap_ptr) {
610 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
611 thread->io_bitmap_max);
612 /*
613 * If the previously set map was extending to higher ports
614 * than the current one, pad extra space with 0xff (no access).
615 */
616 if (thread->io_bitmap_max < tss->io_bitmap_max) {
617 memset((char *) tss->io_bitmap +
618 thread->io_bitmap_max, 0xff,
619 tss->io_bitmap_max - thread->io_bitmap_max);
620 }
621 tss->io_bitmap_max = thread->io_bitmap_max;
622 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
623 tss->io_bitmap_owner = thread;
624 put_cpu();
625 308
309#ifdef CONFIG_X86_32
310 if (lazy_iobitmap_copy()) {
311 /* restart the faulting instruction */
626 return; 312 return;
627 } 313 }
628 put_cpu();
629 314
630 if (regs->flags & X86_VM_MASK) 315 if (regs->flags & X86_VM_MASK)
631 goto gp_in_vm86; 316 goto gp_in_vm86;
317#endif
632 318
633 tsk = current; 319 tsk = current;
634 if (!user_mode(regs)) 320 if (!user_mode(regs))
@@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
650 force_sig(SIGSEGV, tsk); 336 force_sig(SIGSEGV, tsk);
651 return; 337 return;
652 338
339#ifdef CONFIG_X86_32
653gp_in_vm86: 340gp_in_vm86:
654 local_irq_enable(); 341 local_irq_enable();
655 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 342 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
656 return; 343 return;
344#endif
657 345
658gp_in_kernel: 346gp_in_kernel:
659 if (fixup_exception(regs)) 347 if (fixup_exception(regs))
@@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
690 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 378 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
691 379
692 /* Clear and disable the memory parity error line. */ 380 /* Clear and disable the memory parity error line. */
693 clear_mem_error(reason); 381 reason = (reason & 0xf) | 4;
382 outb(reason, 0x61);
694} 383}
695 384
696static notrace __kprobes void 385static notrace __kprobes void
@@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
716static notrace __kprobes void 405static notrace __kprobes void
717unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 406unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
718{ 407{
719 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 408 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
409 NOTIFY_STOP)
720 return; 410 return;
721#ifdef CONFIG_MCA 411#ifdef CONFIG_MCA
722 /* 412 /*
@@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
739 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 429 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
740} 430}
741 431
742static DEFINE_SPINLOCK(nmi_print_lock);
743
744void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
745{
746 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
747 return;
748
749 spin_lock(&nmi_print_lock);
750 /*
751 * We are in trouble anyway, lets at least try
752 * to get a message out:
753 */
754 bust_spinlocks(1);
755 printk(KERN_EMERG "%s", str);
756 printk(" on CPU%d, ip %08lx, registers:\n",
757 smp_processor_id(), regs->ip);
758 show_registers(regs);
759 if (do_panic)
760 panic("Non maskable interrupt");
761 console_silent();
762 spin_unlock(&nmi_print_lock);
763 bust_spinlocks(0);
764
765 /*
766 * If we are in kernel we are probably nested up pretty bad
767 * and might aswell get out now while we still can:
768 */
769 if (!user_mode_vm(regs)) {
770 current->thread.trap_no = 2;
771 crash_kexec(regs);
772 }
773
774 do_exit(SIGSEGV);
775}
776
777static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 432static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
778{ 433{
779 unsigned char reason = 0; 434 unsigned char reason = 0;
@@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
812 mem_parity_error(reason, regs); 467 mem_parity_error(reason, regs);
813 if (reason & 0x40) 468 if (reason & 0x40)
814 io_check_error(reason, regs); 469 io_check_error(reason, regs);
470#ifdef CONFIG_X86_32
815 /* 471 /*
816 * Reassert NMI in case it became active meanwhile 472 * Reassert NMI in case it became active meanwhile
817 * as it's edge-triggered: 473 * as it's edge-triggered:
818 */ 474 */
819 reassert_nmi(); 475 reassert_nmi();
476#endif
820} 477}
821 478
822notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 479dotraplinkage notrace __kprobes void
480do_nmi(struct pt_regs *regs, long error_code)
823{ 481{
824 int cpu;
825
826 nmi_enter(); 482 nmi_enter();
827 483
828 cpu = smp_processor_id(); 484#ifdef CONFIG_X86_32
829 485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
830 ++nmi_count(cpu); 486#else
487 add_pda(__nmi_count, 1);
488#endif
831 489
832 if (!ignore_nmis) 490 if (!ignore_nmis)
833 default_do_nmi(regs); 491 default_do_nmi(regs);
@@ -847,21 +505,44 @@ void restart_nmi(void)
847 acpi_nmi_enable(); 505 acpi_nmi_enable();
848} 506}
849 507
850#ifdef CONFIG_KPROBES 508/* May run on IST stack. */
851void __kprobes do_int3(struct pt_regs *regs, long error_code) 509dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
852{ 510{
853 trace_hardirqs_fixup(); 511#ifdef CONFIG_KPROBES
854
855 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 512 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
856 == NOTIFY_STOP) 513 == NOTIFY_STOP)
857 return; 514 return;
858 /* 515#else
859 * This is an interrupt gate, because kprobes wants interrupts 516 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
860 * disabled. Normal trap handlers don't. 517 == NOTIFY_STOP)
861 */ 518 return;
862 restore_interrupts(regs); 519#endif
520
521 preempt_conditional_sti(regs);
522 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
523 preempt_conditional_cli(regs);
524}
863 525
864 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 526#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack
528 for scheduling or signal handling. The actual stack switch is done in
529 entry.S */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{
532 struct pt_regs *regs = eregs;
533 /* Did already sync */
534 if (eregs == (struct pt_regs *)eregs->sp)
535 ;
536 /* Exception from user space */
537 else if (user_mode(eregs))
538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to
540 kernel process stack. */
541 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs)
544 *regs = *eregs;
545 return regs;
865} 546}
866#endif 547#endif
867 548
@@ -886,15 +567,15 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
886 * about restoring all the debug state, and ptrace doesn't have to 567 * about restoring all the debug state, and ptrace doesn't have to
887 * find every occurrence of the TF bit that could be saved away even 568 * find every occurrence of the TF bit that could be saved away even
888 * by user code) 569 * by user code)
570 *
571 * May run on IST stack.
889 */ 572 */
890void __kprobes do_debug(struct pt_regs *regs, long error_code) 573dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
891{ 574{
892 struct task_struct *tsk = current; 575 struct task_struct *tsk = current;
893 unsigned int condition; 576 unsigned long condition;
894 int si_code; 577 int si_code;
895 578
896 trace_hardirqs_fixup();
897
898 get_debugreg(condition, 6); 579 get_debugreg(condition, 6);
899 580
900 /* 581 /*
@@ -906,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
906 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 587 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
907 SIGTRAP) == NOTIFY_STOP) 588 SIGTRAP) == NOTIFY_STOP)
908 return; 589 return;
590
909 /* It's safe to allow irq's after DR6 has been saved */ 591 /* It's safe to allow irq's after DR6 has been saved */
910 if (regs->flags & X86_EFLAGS_IF) 592 preempt_conditional_sti(regs);
911 local_irq_enable();
912 593
913 /* Mask out spurious debug traps due to lazy DR7 setting */ 594 /* Mask out spurious debug traps due to lazy DR7 setting */
914 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 595 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -916,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
916 goto clear_dr7; 597 goto clear_dr7;
917 } 598 }
918 599
600#ifdef CONFIG_X86_32
919 if (regs->flags & X86_VM_MASK) 601 if (regs->flags & X86_VM_MASK)
920 goto debug_vm86; 602 goto debug_vm86;
603#endif
921 604
922 /* Save debug status register where ptrace can see it */ 605 /* Save debug status register where ptrace can see it */
923 tsk->thread.debugreg6 = condition; 606 tsk->thread.debugreg6 = condition;
@@ -927,16 +610,11 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
927 * kernel space (but re-enable TF when returning to user mode). 610 * kernel space (but re-enable TF when returning to user mode).
928 */ 611 */
929 if (condition & DR_STEP) { 612 if (condition & DR_STEP) {
930 /*
931 * We already checked v86 mode above, so we can
932 * check for kernel mode by just checking the CPL
933 * of CS.
934 */
935 if (!user_mode(regs)) 613 if (!user_mode(regs))
936 goto clear_TF_reenable; 614 goto clear_TF_reenable;
937 } 615 }
938 616
939 si_code = get_si_code((unsigned long)condition); 617 si_code = get_si_code(condition);
940 /* Ok, finally something we can handle */ 618 /* Ok, finally something we can handle */
941 send_sigtrap(tsk, regs, error_code, si_code); 619 send_sigtrap(tsk, regs, error_code, si_code);
942 620
@@ -946,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
946 */ 624 */
947clear_dr7: 625clear_dr7:
948 set_debugreg(0, 7); 626 set_debugreg(0, 7);
627 preempt_conditional_cli(regs);
949 return; 628 return;
950 629
630#ifdef CONFIG_X86_32
951debug_vm86: 631debug_vm86:
952 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 632 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
633 preempt_conditional_cli(regs);
953 return; 634 return;
635#endif
954 636
955clear_TF_reenable: 637clear_TF_reenable:
956 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 638 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
957 regs->flags &= ~X86_EFLAGS_TF; 639 regs->flags &= ~X86_EFLAGS_TF;
640 preempt_conditional_cli(regs);
958 return; 641 return;
959} 642}
960 643
644#ifdef CONFIG_X86_64
645static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
646{
647 if (fixup_exception(regs))
648 return 1;
649
650 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
651 /* Illegal floating point operation in the kernel */
652 current->thread.trap_no = trapnr;
653 die(str, regs, 0);
654 return 0;
655}
656#endif
657
961/* 658/*
962 * Note that we play around with the 'TS' bit in an attempt to get 659 * Note that we play around with the 'TS' bit in an attempt to get
963 * the correct behaviour even in the presence of the asynchronous 660 * the correct behaviour even in the presence of the asynchronous
@@ -994,7 +691,9 @@ void math_error(void __user *ip)
994 swd = get_fpu_swd(task); 691 swd = get_fpu_swd(task);
995 switch (swd & ~cwd & 0x3f) { 692 switch (swd & ~cwd & 0x3f) {
996 case 0x000: /* No unmasked exception */ 693 case 0x000: /* No unmasked exception */
694#ifdef CONFIG_X86_32
997 return; 695 return;
696#endif
998 default: /* Multiple exceptions */ 697 default: /* Multiple exceptions */
999 break; 698 break;
1000 case 0x001: /* Invalid Op */ 699 case 0x001: /* Invalid Op */
@@ -1022,9 +721,18 @@ void math_error(void __user *ip)
1022 force_sig_info(SIGFPE, &info, task); 721 force_sig_info(SIGFPE, &info, task);
1023} 722}
1024 723
1025void do_coprocessor_error(struct pt_regs *regs, long error_code) 724dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1026{ 725{
726 conditional_sti(regs);
727
728#ifdef CONFIG_X86_32
1027 ignore_fpu_irq = 1; 729 ignore_fpu_irq = 1;
730#else
731 if (!user_mode(regs) &&
732 kernel_math_error(regs, "kernel x87 math error", 16))
733 return;
734#endif
735
1028 math_error((void __user *)regs->ip); 736 math_error((void __user *)regs->ip);
1029} 737}
1030 738
@@ -1076,8 +784,12 @@ static void simd_math_error(void __user *ip)
1076 force_sig_info(SIGFPE, &info, task); 784 force_sig_info(SIGFPE, &info, task);
1077} 785}
1078 786
1079void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 787dotraplinkage void
788do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1080{ 789{
790 conditional_sti(regs);
791
792#ifdef CONFIG_X86_32
1081 if (cpu_has_xmm) { 793 if (cpu_has_xmm) {
1082 /* Handle SIMD FPU exceptions on PIII+ processors. */ 794 /* Handle SIMD FPU exceptions on PIII+ processors. */
1083 ignore_fpu_irq = 1; 795 ignore_fpu_irq = 1;
@@ -1096,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1096 current->thread.error_code = error_code; 808 current->thread.error_code = error_code;
1097 die_if_kernel("cache flush denied", regs, error_code); 809 die_if_kernel("cache flush denied", regs, error_code);
1098 force_sig(SIGSEGV, current); 810 force_sig(SIGSEGV, current);
811#else
812 if (!user_mode(regs) &&
813 kernel_math_error(regs, "kernel simd math error", 19))
814 return;
815 simd_math_error((void __user *)regs->ip);
816#endif
1099} 817}
1100 818
1101void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 819dotraplinkage void
820do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1102{ 821{
822 conditional_sti(regs);
1103#if 0 823#if 0
1104 /* No need to warn about this any longer. */ 824 /* No need to warn about this any longer. */
1105 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 825 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1106#endif 826#endif
1107} 827}
1108 828
829#ifdef CONFIG_X86_32
1109unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 830unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1110{ 831{
1111 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); 832 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1124,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1124 845
1125 return new_kesp; 846 return new_kesp;
1126} 847}
848#else
849asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
850{
851}
852
853asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
854{
855}
856#endif
1127 857
1128/* 858/*
1129 * 'math_state_restore()' saves the current math information in the 859 * 'math_state_restore()' saves the current math information in the
@@ -1156,14 +886,24 @@ asmlinkage void math_state_restore(void)
1156 } 886 }
1157 887
1158 clts(); /* Allow maths ops (or we recurse) */ 888 clts(); /* Allow maths ops (or we recurse) */
889#ifdef CONFIG_X86_32
1159 restore_fpu(tsk); 890 restore_fpu(tsk);
891#else
892 /*
893 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
894 */
895 if (unlikely(restore_fpu_checking(tsk))) {
896 stts();
897 force_sig(SIGSEGV, tsk);
898 return;
899 }
900#endif
1160 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 901 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1161 tsk->fpu_counter++; 902 tsk->fpu_counter++;
1162} 903}
1163EXPORT_SYMBOL_GPL(math_state_restore); 904EXPORT_SYMBOL_GPL(math_state_restore);
1164 905
1165#ifndef CONFIG_MATH_EMULATION 906#ifndef CONFIG_MATH_EMULATION
1166
1167asmlinkage void math_emulate(long arg) 907asmlinkage void math_emulate(long arg)
1168{ 908{
1169 printk(KERN_EMERG 909 printk(KERN_EMERG
@@ -1172,12 +912,54 @@ asmlinkage void math_emulate(long arg)
1172 force_sig(SIGFPE, current); 912 force_sig(SIGFPE, current);
1173 schedule(); 913 schedule();
1174} 914}
1175
1176#endif /* CONFIG_MATH_EMULATION */ 915#endif /* CONFIG_MATH_EMULATION */
1177 916
917dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error)
919{
920#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) {
922 conditional_sti(regs);
923 math_emulate(0);
924 } else {
925 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs);
927 }
928#else
929 math_state_restore();
930#endif
931}
932
933#ifdef CONFIG_X86_32
934#ifdef CONFIG_X86_MCE
935dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error)
936{
937 conditional_sti(regs);
938 machine_check_vector(regs, error);
939}
940#endif
941
942dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
943{
944 siginfo_t info;
945 local_irq_enable();
946
947 info.si_signo = SIGILL;
948 info.si_errno = 0;
949 info.si_code = ILL_BADSTK;
950 info.si_addr = 0;
951 if (notify_die(DIE_TRAP, "iret exception",
952 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
953 return;
954 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
955}
956#endif
957
1178void __init trap_init(void) 958void __init trap_init(void)
1179{ 959{
960#ifdef CONFIG_X86_32
1180 int i; 961 int i;
962#endif
1181 963
1182#ifdef CONFIG_EISA 964#ifdef CONFIG_EISA
1183 void __iomem *p = early_ioremap(0x0FFFD9, 4); 965 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1187,29 +969,40 @@ void __init trap_init(void)
1187 early_iounmap(p, 4); 969 early_iounmap(p, 4);
1188#endif 970#endif
1189 971
1190 set_trap_gate(0, &divide_error); 972 set_intr_gate(0, &divide_error);
1191 set_intr_gate(1, &debug); 973 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1192 set_intr_gate(2, &nmi); 974 set_intr_gate_ist(2, &nmi, NMI_STACK);
1193 set_system_intr_gate(3, &int3); /* int3 can be called from all */ 975 /* int3 can be called from all */
1194 set_system_gate(4, &overflow); /* int4 can be called from all */ 976 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
1195 set_trap_gate(5, &bounds); 977 /* int4 can be called from all */
1196 set_trap_gate(6, &invalid_op); 978 set_system_intr_gate(4, &overflow);
1197 set_trap_gate(7, &device_not_available); 979 set_intr_gate(5, &bounds);
980 set_intr_gate(6, &invalid_op);
981 set_intr_gate(7, &device_not_available);
982#ifdef CONFIG_X86_32
1198 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); 983 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1199 set_trap_gate(9, &coprocessor_segment_overrun); 984#else
1200 set_trap_gate(10, &invalid_TSS); 985 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1201 set_trap_gate(11, &segment_not_present); 986#endif
1202 set_trap_gate(12, &stack_segment); 987 set_intr_gate(9, &coprocessor_segment_overrun);
1203 set_trap_gate(13, &general_protection); 988 set_intr_gate(10, &invalid_TSS);
989 set_intr_gate(11, &segment_not_present);
990 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
991 set_intr_gate(13, &general_protection);
1204 set_intr_gate(14, &page_fault); 992 set_intr_gate(14, &page_fault);
1205 set_trap_gate(15, &spurious_interrupt_bug); 993 set_intr_gate(15, &spurious_interrupt_bug);
1206 set_trap_gate(16, &coprocessor_error); 994 set_intr_gate(16, &coprocessor_error);
1207 set_trap_gate(17, &alignment_check); 995 set_intr_gate(17, &alignment_check);
1208#ifdef CONFIG_X86_MCE 996#ifdef CONFIG_X86_MCE
1209 set_trap_gate(18, &machine_check); 997 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1210#endif 998#endif
1211 set_trap_gate(19, &simd_coprocessor_error); 999 set_intr_gate(19, &simd_coprocessor_error);
1212 1000
1001#ifdef CONFIG_IA32_EMULATION
1002 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1003#endif
1004
1005#ifdef CONFIG_X86_32
1213 if (cpu_has_fxsr) { 1006 if (cpu_has_fxsr) {
1214 printk(KERN_INFO "Enabling fast FPU save and restore... "); 1007 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1215 set_in_cr4(X86_CR4_OSFXSR); 1008 set_in_cr4(X86_CR4_OSFXSR);
@@ -1222,36 +1015,20 @@ void __init trap_init(void)
1222 printk("done.\n"); 1015 printk("done.\n");
1223 } 1016 }
1224 1017
1225 set_system_gate(SYSCALL_VECTOR, &system_call); 1018 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1226 1019
1227 /* Reserve all the builtin and the syscall vector: */ 1020 /* Reserve all the builtin and the syscall vector: */
1228 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1021 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1229 set_bit(i, used_vectors); 1022 set_bit(i, used_vectors);
1230 1023
1231 set_bit(SYSCALL_VECTOR, used_vectors); 1024 set_bit(SYSCALL_VECTOR, used_vectors);
1232 1025#endif
1233 /* 1026 /*
1234 * Should be a barrier for any external CPU state: 1027 * Should be a barrier for any external CPU state:
1235 */ 1028 */
1236 cpu_init(); 1029 cpu_init();
1237 1030
1031#ifdef CONFIG_X86_32
1238 trap_init_hook(); 1032 trap_init_hook();
1033#endif
1239} 1034}
1240
1241static int __init kstack_setup(char *s)
1242{
1243 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1244
1245 return 1;
1246}
1247__setup("kstack=", kstack_setup);
1248
1249static int __init code_bytes_setup(char *s)
1250{
1251 code_bytes = simple_strtoul(s, NULL, 0);
1252 if (code_bytes > 8192)
1253 code_bytes = 8192;
1254
1255 return 1;
1256}
1257__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 9c0ac0cab013..000000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1214 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'.
12 */
13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
27#include <linux/errno.h>
28#include <linux/kexec.h>
29#include <linux/sched.h>
30#include <linux/timer.h>
31#include <linux/init.h>
32#include <linux/bug.h>
33#include <linux/nmi.h>
34#include <linux/mm.h>
35#include <linux/smp.h>
36#include <linux/io.h>
37
38#if defined(CONFIG_EDAC)
39#include <linux/edac.h>
40#endif
41
42#include <asm/stacktrace.h>
43#include <asm/processor.h>
44#include <asm/debugreg.h>
45#include <asm/atomic.h>
46#include <asm/system.h>
47#include <asm/unwind.h>
48#include <asm/desc.h>
49#include <asm/i387.h>
50#include <asm/pgalloc.h>
51#include <asm/proto.h>
52#include <asm/pda.h>
53#include <asm/traps.h>
54
55#include <mach_traps.h>
56
57int panic_on_unrecovered_nmi;
58int kstack_depth_to_print = 12;
59static unsigned int code_bytes = 64;
60static int ignore_nmis;
61static int die_counter;
62
63static inline void conditional_sti(struct pt_regs *regs)
64{
65 if (regs->flags & X86_EFLAGS_IF)
66 local_irq_enable();
67}
68
69static inline void preempt_conditional_sti(struct pt_regs *regs)
70{
71 inc_preempt_count();
72 if (regs->flags & X86_EFLAGS_IF)
73 local_irq_enable();
74}
75
76static inline void preempt_conditional_cli(struct pt_regs *regs)
77{
78 if (regs->flags & X86_EFLAGS_IF)
79 local_irq_disable();
80 /* Make sure to not schedule here because we could be running
81 on an exception stack. */
82 dec_preempt_count();
83}
84
85void printk_address(unsigned long address, int reliable)
86{
87 printk(" [<%016lx>] %s%pS\n",
88 address, reliable ? "" : "? ", (void *) address);
89}
90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
92 unsigned *usedp, char **idp)
93{
94 static char ids[][8] = {
95 [DEBUG_STACK - 1] = "#DB",
96 [NMI_STACK - 1] = "NMI",
97 [DOUBLEFAULT_STACK - 1] = "#DF",
98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ...
102 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
103#endif
104 };
105 unsigned k;
106
107 /*
108 * Iterate over all exception stacks, and figure out whether
109 * 'stack' is in one of them:
110 */
111 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
112 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
113 /*
114 * Is 'stack' above this exception frame's end?
115 * If yes then skip to the next frame.
116 */
117 if (stack >= end)
118 continue;
119 /*
120 * Is 'stack' above this exception frame's start address?
121 * If yes then we found the right frame.
122 */
123 if (stack >= end - EXCEPTION_STKSZ) {
124 /*
125 * Make sure we only iterate through an exception
126 * stack once. If it comes up for the second time
127 * then there's something wrong going on - just
128 * break out and return NULL:
129 */
130 if (*usedp & (1U << k))
131 break;
132 *usedp |= 1U << k;
133 *idp = ids[k];
134 return (unsigned long *)end;
135 }
136 /*
137 * If this is a debug stack, and if it has a larger size than
138 * the usual exception stacks, then 'stack' might still
139 * be within the lower portion of the debug stack:
140 */
141#if DEBUG_STKSZ > EXCEPTION_STKSZ
142 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
143 unsigned j = N_EXCEPTION_STACKS - 1;
144
145 /*
146 * Black magic. A large debug stack is composed of
147 * multiple exception stack entries, which we
148 * iterate through now. Dont look:
149 */
150 do {
151 ++j;
152 end -= EXCEPTION_STKSZ;
153 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
154 } while (stack < end - EXCEPTION_STKSZ);
155 if (*usedp & (1U << j))
156 break;
157 *usedp |= 1U << j;
158 *idp = ids[j];
159 return (unsigned long *)end;
160 }
161#endif
162 }
163 return NULL;
164}
165
166/*
167 * x86-64 can have up to three kernel stacks:
168 * process stack
169 * interrupt stack
170 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
171 */
172
173static inline int valid_stack_ptr(struct thread_info *tinfo,
174 void *p, unsigned int size, void *end)
175{
176 void *t = tinfo;
177 if (end) {
178 if (p < end && p >= (end-THREAD_SIZE))
179 return 1;
180 else
181 return 0;
182 }
183 return p > t && p < t + THREAD_SIZE - size;
184}
185
186/* The form of the top of the frame on the stack */
187struct stack_frame {
188 struct stack_frame *next_frame;
189 unsigned long return_address;
190};
191
192static inline unsigned long
193print_context_stack(struct thread_info *tinfo,
194 unsigned long *stack, unsigned long bp,
195 const struct stacktrace_ops *ops, void *data,
196 unsigned long *end)
197{
198 struct stack_frame *frame = (struct stack_frame *)bp;
199
200 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
201 unsigned long addr;
202
203 addr = *stack;
204 if (__kernel_text_address(addr)) {
205 if ((unsigned long) stack == bp + 8) {
206 ops->address(data, addr, 1);
207 frame = frame->next_frame;
208 bp = (unsigned long) frame;
209 } else {
210 ops->address(data, addr, bp == 0);
211 }
212 }
213 stack++;
214 }
215 return bp;
216}
217
218void dump_trace(struct task_struct *task, struct pt_regs *regs,
219 unsigned long *stack, unsigned long bp,
220 const struct stacktrace_ops *ops, void *data)
221{
222 const unsigned cpu = get_cpu();
223 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
224 unsigned used = 0;
225 struct thread_info *tinfo;
226
227 if (!task)
228 task = current;
229
230 if (!stack) {
231 unsigned long dummy;
232 stack = &dummy;
233 if (task && task != current)
234 stack = (unsigned long *)task->thread.sp;
235 }
236
237#ifdef CONFIG_FRAME_POINTER
238 if (!bp) {
239 if (task == current) {
240 /* Grab bp right from our regs */
241 asm("movq %%rbp, %0" : "=r" (bp) : );
242 } else {
243 /* bp is the last reg pushed by switch_to */
244 bp = *(unsigned long *) task->thread.sp;
245 }
246 }
247#endif
248
249 /*
250 * Print function call entries in all stacks, starting at the
251 * current stack address. If the stacks consist of nested
252 * exceptions
253 */
254 tinfo = task_thread_info(task);
255 for (;;) {
256 char *id;
257 unsigned long *estack_end;
258 estack_end = in_exception_stack(cpu, (unsigned long)stack,
259 &used, &id);
260
261 if (estack_end) {
262 if (ops->stack(data, id) < 0)
263 break;
264
265 bp = print_context_stack(tinfo, stack, bp, ops,
266 data, estack_end);
267 ops->stack(data, "<EOE>");
268 /*
269 * We link to the next stack via the
270 * second-to-last pointer (index -2 to end) in the
271 * exception stack:
272 */
273 stack = (unsigned long *) estack_end[-2];
274 continue;
275 }
276 if (irqstack_end) {
277 unsigned long *irqstack;
278 irqstack = irqstack_end -
279 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
280
281 if (stack >= irqstack && stack < irqstack_end) {
282 if (ops->stack(data, "IRQ") < 0)
283 break;
284 bp = print_context_stack(tinfo, stack, bp,
285 ops, data, irqstack_end);
286 /*
287 * We link to the next stack (which would be
288 * the process stack normally) the last
289 * pointer (index -1 to end) in the IRQ stack:
290 */
291 stack = (unsigned long *) (irqstack_end[-1]);
292 irqstack_end = NULL;
293 ops->stack(data, "EOI");
294 continue;
295 }
296 }
297 break;
298 }
299
300 /*
301 * This handles the process stack:
302 */
303 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
304 put_cpu();
305}
306EXPORT_SYMBOL(dump_trace);
307
308static void
309print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
310{
311 print_symbol(msg, symbol);
312 printk("\n");
313}
314
315static void print_trace_warning(void *data, char *msg)
316{
317 printk("%s\n", msg);
318}
319
320static int print_trace_stack(void *data, char *name)
321{
322 printk(" <%s> ", name);
323 return 0;
324}
325
326static void print_trace_address(void *data, unsigned long addr, int reliable)
327{
328 touch_nmi_watchdog();
329 printk_address(addr, reliable);
330}
331
332static const struct stacktrace_ops print_trace_ops = {
333 .warning = print_trace_warning,
334 .warning_symbol = print_trace_warning_symbol,
335 .stack = print_trace_stack,
336 .address = print_trace_address,
337};
338
339static void
340show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
341 unsigned long *stack, unsigned long bp, char *log_lvl)
342{
343 printk("Call Trace:\n");
344 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
345}
346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
353static void
354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
355 unsigned long *sp, unsigned long bp, char *log_lvl)
356{
357 unsigned long *stack;
358 int i;
359 const int cpu = smp_processor_id();
360 unsigned long *irqstack_end =
361 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
362 unsigned long *irqstack =
363 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
364
365 /*
366 * debugging aid: "show_stack(NULL, NULL);" prints the
367 * back trace for this cpu.
368 */
369
370 if (sp == NULL) {
371 if (task)
372 sp = (unsigned long *)task->thread.sp;
373 else
374 sp = (unsigned long *)&sp;
375 }
376
377 stack = sp;
378 for (i = 0; i < kstack_depth_to_print; i++) {
379 if (stack >= irqstack && stack <= irqstack_end) {
380 if (stack == irqstack_end) {
381 stack = (unsigned long *) (irqstack_end[-1]);
382 printk(" <EOI> ");
383 }
384 } else {
385 if (((long) stack & (THREAD_SIZE-1)) == 0)
386 break;
387 }
388 if (i && ((i % 4) == 0))
389 printk("\n");
390 printk(" %016lx", *stack++);
391 touch_nmi_watchdog();
392 }
393 printk("\n");
394 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
395}
396
397void show_stack(struct task_struct *task, unsigned long *sp)
398{
399 show_stack_log_lvl(task, NULL, sp, 0, "");
400}
401
402/*
403 * The architecture-independent dump_stack generator
404 */
405void dump_stack(void)
406{
407 unsigned long bp = 0;
408 unsigned long stack;
409
410#ifdef CONFIG_FRAME_POINTER
411 if (!bp)
412 asm("movq %%rbp, %0" : "=r" (bp) : );
413#endif
414
415 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
416 current->pid, current->comm, print_tainted(),
417 init_utsname()->release,
418 (int)strcspn(init_utsname()->version, " "),
419 init_utsname()->version);
420 show_trace(NULL, NULL, &stack, bp);
421}
422EXPORT_SYMBOL(dump_stack);
423
424void show_registers(struct pt_regs *regs)
425{
426 int i;
427 unsigned long sp;
428 const int cpu = smp_processor_id();
429 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
430
431 sp = regs->sp;
432 printk("CPU %d ", cpu);
433 __show_regs(regs);
434 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
435 cur->comm, cur->pid, task_thread_info(cur), cur);
436
437 /*
438 * When in-kernel, we also print out the stack and code at the
439 * time of the fault..
440 */
441 if (!user_mode(regs)) {
442 unsigned int code_prologue = code_bytes * 43 / 64;
443 unsigned int code_len = code_bytes;
444 unsigned char c;
445 u8 *ip;
446
447 printk("Stack: ");
448 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
449 regs->bp, "");
450
451 printk(KERN_EMERG "Code: ");
452
453 ip = (u8 *)regs->ip - code_prologue;
454 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
455 /* try starting at RIP */
456 ip = (u8 *)regs->ip;
457 code_len = code_len - code_prologue + 1;
458 }
459 for (i = 0; i < code_len; i++, ip++) {
460 if (ip < (u8 *)PAGE_OFFSET ||
461 probe_kernel_address(ip, c)) {
462 printk(" Bad RIP value.");
463 break;
464 }
465 if (ip == (u8 *)regs->ip)
466 printk("<%02x> ", c);
467 else
468 printk("%02x ", c);
469 }
470 }
471 printk("\n");
472}
473
474int is_valid_bugaddr(unsigned long ip)
475{
476 unsigned short ud2;
477
478 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
479 return 0;
480
481 return ud2 == 0x0b0f;
482}
483
484static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
485static int die_owner = -1;
486static unsigned int die_nest_count;
487
488unsigned __kprobes long oops_begin(void)
489{
490 int cpu;
491 unsigned long flags;
492
493 oops_enter();
494
495 /* racy, but better than risking deadlock. */
496 raw_local_irq_save(flags);
497 cpu = smp_processor_id();
498 if (!__raw_spin_trylock(&die_lock)) {
499 if (cpu == die_owner)
500 /* nested oops. should stop eventually */;
501 else
502 __raw_spin_lock(&die_lock);
503 }
504 die_nest_count++;
505 die_owner = cpu;
506 console_verbose();
507 bust_spinlocks(1);
508 return flags;
509}
510
511void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
512{
513 die_owner = -1;
514 bust_spinlocks(0);
515 die_nest_count--;
516 if (!die_nest_count)
517 /* Nest count reaches zero, release the lock. */
518 __raw_spin_unlock(&die_lock);
519 raw_local_irq_restore(flags);
520 if (!regs) {
521 oops_exit();
522 return;
523 }
524 if (panic_on_oops)
525 panic("Fatal exception");
526 oops_exit();
527 do_exit(signr);
528}
529
530int __kprobes __die(const char *str, struct pt_regs *regs, long err)
531{
532 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
533#ifdef CONFIG_PREEMPT
534 printk("PREEMPT ");
535#endif
536#ifdef CONFIG_SMP
537 printk("SMP ");
538#endif
539#ifdef CONFIG_DEBUG_PAGEALLOC
540 printk("DEBUG_PAGEALLOC");
541#endif
542 printk("\n");
543 if (notify_die(DIE_OOPS, str, regs, err,
544 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
545 return 1;
546
547 show_registers(regs);
548 add_taint(TAINT_DIE);
549 /* Executive summary in case the oops scrolled away */
550 printk(KERN_ALERT "RIP ");
551 printk_address(regs->ip, 1);
552 printk(" RSP <%016lx>\n", regs->sp);
553 if (kexec_should_crash(current))
554 crash_kexec(regs);
555 return 0;
556}
557
558void die(const char *str, struct pt_regs *regs, long err)
559{
560 unsigned long flags = oops_begin();
561
562 if (!user_mode(regs))
563 report_bug(regs->ip, regs);
564
565 if (__die(str, regs, err))
566 regs = NULL;
567 oops_end(flags, regs, SIGSEGV);
568}
569
570notrace __kprobes void
571die_nmi(char *str, struct pt_regs *regs, int do_panic)
572{
573 unsigned long flags;
574
575 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
576 return;
577
578 flags = oops_begin();
579 /*
580 * We are in trouble anyway, lets at least try
581 * to get a message out.
582 */
583 printk(KERN_EMERG "%s", str);
584 printk(" on CPU%d, ip %08lx, registers:\n",
585 smp_processor_id(), regs->ip);
586 show_registers(regs);
587 if (kexec_should_crash(current))
588 crash_kexec(regs);
589 if (do_panic || panic_on_oops)
590 panic("Non maskable interrupt");
591 oops_end(flags, NULL, SIGBUS);
592 nmi_exit();
593 local_irq_enable();
594 do_exit(SIGBUS);
595}
596
597static void __kprobes
598do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
599 long error_code, siginfo_t *info)
600{
601 struct task_struct *tsk = current;
602
603 if (!user_mode(regs))
604 goto kernel_trap;
605
606 /*
607 * We want error_code and trap_no set for userspace faults and
608 * kernelspace faults which result in die(), but not
609 * kernelspace faults which are fixed up. die() gives the
610 * process no chance to handle the signal and notice the
611 * kernel fault information, so that won't result in polluting
612 * the information about previously queued, but not yet
613 * delivered, faults. See also do_general_protection below.
614 */
615 tsk->thread.error_code = error_code;
616 tsk->thread.trap_no = trapnr;
617
618 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
619 printk_ratelimit()) {
620 printk(KERN_INFO
621 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
622 tsk->comm, tsk->pid, str,
623 regs->ip, regs->sp, error_code);
624 print_vma_addr(" in ", regs->ip);
625 printk("\n");
626 }
627
628 if (info)
629 force_sig_info(signr, info, tsk);
630 else
631 force_sig(signr, tsk);
632 return;
633
634kernel_trap:
635 if (!fixup_exception(regs)) {
636 tsk->thread.error_code = error_code;
637 tsk->thread.trap_no = trapnr;
638 die(str, regs, error_code);
639 }
640 return;
641}
642
643#define DO_ERROR(trapnr, signr, str, name) \
644asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
645{ \
646 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
647 == NOTIFY_STOP) \
648 return; \
649 conditional_sti(regs); \
650 do_trap(trapnr, signr, str, regs, error_code, NULL); \
651}
652
653#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
654asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
655{ \
656 siginfo_t info; \
657 info.si_signo = signr; \
658 info.si_errno = 0; \
659 info.si_code = sicode; \
660 info.si_addr = (void __user *)siaddr; \
661 trace_hardirqs_fixup(); \
662 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
663 == NOTIFY_STOP) \
664 return; \
665 conditional_sti(regs); \
666 do_trap(trapnr, signr, str, regs, error_code, &info); \
667}
668
669DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
670DO_ERROR(4, SIGSEGV, "overflow", overflow)
671DO_ERROR(5, SIGSEGV, "bounds", bounds)
672DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
673DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
674DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
675DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
676DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
677
678/* Runs on IST stack */
679asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
680{
681 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
682 12, SIGBUS) == NOTIFY_STOP)
683 return;
684 preempt_conditional_sti(regs);
685 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
686 preempt_conditional_cli(regs);
687}
688
689asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
690{
691 static const char str[] = "double fault";
692 struct task_struct *tsk = current;
693
694 /* Return not checked because double check cannot be ignored */
695 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
696
697 tsk->thread.error_code = error_code;
698 tsk->thread.trap_no = 8;
699
700 /* This is always a kernel trap and never fixable (and thus must
701 never return). */
702 for (;;)
703 die(str, regs, error_code);
704}
705
706asmlinkage void __kprobes
707do_general_protection(struct pt_regs *regs, long error_code)
708{
709 struct task_struct *tsk;
710
711 conditional_sti(regs);
712
713 tsk = current;
714 if (!user_mode(regs))
715 goto gp_in_kernel;
716
717 tsk->thread.error_code = error_code;
718 tsk->thread.trap_no = 13;
719
720 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
721 printk_ratelimit()) {
722 printk(KERN_INFO
723 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
724 tsk->comm, tsk->pid,
725 regs->ip, regs->sp, error_code);
726 print_vma_addr(" in ", regs->ip);
727 printk("\n");
728 }
729
730 force_sig(SIGSEGV, tsk);
731 return;
732
733gp_in_kernel:
734 if (fixup_exception(regs))
735 return;
736
737 tsk->thread.error_code = error_code;
738 tsk->thread.trap_no = 13;
739 if (notify_die(DIE_GPF, "general protection fault", regs,
740 error_code, 13, SIGSEGV) == NOTIFY_STOP)
741 return;
742 die("general protection fault", regs, error_code);
743}
744
745static notrace __kprobes void
746mem_parity_error(unsigned char reason, struct pt_regs *regs)
747{
748 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
749 reason);
750 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
751
752#if defined(CONFIG_EDAC)
753 if (edac_handler_set()) {
754 edac_atomic_assert_error();
755 return;
756 }
757#endif
758
759 if (panic_on_unrecovered_nmi)
760 panic("NMI: Not continuing");
761
762 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
763
764 /* Clear and disable the memory parity error line. */
765 reason = (reason & 0xf) | 4;
766 outb(reason, 0x61);
767}
768
769static notrace __kprobes void
770io_check_error(unsigned char reason, struct pt_regs *regs)
771{
772 printk("NMI: IOCK error (debug interrupt?)\n");
773 show_registers(regs);
774
775 /* Re-enable the IOCK line, wait for a few seconds */
776 reason = (reason & 0xf) | 8;
777 outb(reason, 0x61);
778 mdelay(2000);
779 reason &= ~8;
780 outb(reason, 0x61);
781}
782
783static notrace __kprobes void
784unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
785{
786 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
787 NOTIFY_STOP)
788 return;
789 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
790 reason);
791 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
792
793 if (panic_on_unrecovered_nmi)
794 panic("NMI: Not continuing");
795
796 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
797}
798
799/* Runs on IST stack. This code must keep interrupts off all the time.
800 Nested NMIs are prevented by the CPU. */
801asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
802{
803 unsigned char reason = 0;
804 int cpu;
805
806 cpu = smp_processor_id();
807
808 /* Only the BSP gets external NMIs from the system. */
809 if (!cpu)
810 reason = get_nmi_reason();
811
812 if (!(reason & 0xc0)) {
813 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
814 == NOTIFY_STOP)
815 return;
816 /*
817 * Ok, so this is none of the documented NMI sources,
818 * so it must be the NMI watchdog.
819 */
820 if (nmi_watchdog_tick(regs, reason))
821 return;
822 if (!do_nmi_callback(regs, cpu))
823 unknown_nmi_error(reason, regs);
824
825 return;
826 }
827 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
828 return;
829
830 /* AK: following checks seem to be broken on modern chipsets. FIXME */
831 if (reason & 0x80)
832 mem_parity_error(reason, regs);
833 if (reason & 0x40)
834 io_check_error(reason, regs);
835}
836
837asmlinkage notrace __kprobes void
838do_nmi(struct pt_regs *regs, long error_code)
839{
840 nmi_enter();
841
842 add_pda(__nmi_count, 1);
843
844 if (!ignore_nmis)
845 default_do_nmi(regs);
846
847 nmi_exit();
848}
849
850void stop_nmi(void)
851{
852 acpi_nmi_disable();
853 ignore_nmis++;
854}
855
856void restart_nmi(void)
857{
858 ignore_nmis--;
859 acpi_nmi_enable();
860}
861
862/* runs on IST stack. */
863asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
864{
865 trace_hardirqs_fixup();
866
867 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
868 == NOTIFY_STOP)
869 return;
870
871 preempt_conditional_sti(regs);
872 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
873 preempt_conditional_cli(regs);
874}
875
876/* Help handler running on IST stack to switch back to user stack
877 for scheduling or signal handling. The actual stack switch is done in
878 entry.S */
879asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
880{
881 struct pt_regs *regs = eregs;
882 /* Did already sync */
883 if (eregs == (struct pt_regs *)eregs->sp)
884 ;
885 /* Exception from user space */
886 else if (user_mode(eregs))
887 regs = task_pt_regs(current);
888 /* Exception from kernel and interrupts are enabled. Move to
889 kernel process stack. */
890 else if (eregs->flags & X86_EFLAGS_IF)
891 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
892 if (eregs != regs)
893 *regs = *eregs;
894 return regs;
895}
896
897/* runs on IST stack. */
898asmlinkage void __kprobes do_debug(struct pt_regs *regs,
899 unsigned long error_code)
900{
901 struct task_struct *tsk = current;
902 unsigned long condition;
903 siginfo_t info;
904
905 trace_hardirqs_fixup();
906
907 get_debugreg(condition, 6);
908
909 /*
910 * The processor cleared BTF, so don't mark that we need it set.
911 */
912 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
913 tsk->thread.debugctlmsr = 0;
914
915 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
916 SIGTRAP) == NOTIFY_STOP)
917 return;
918
919 preempt_conditional_sti(regs);
920
921 /* Mask out spurious debug traps due to lazy DR7 setting */
922 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
923 if (!tsk->thread.debugreg7)
924 goto clear_dr7;
925 }
926
927 tsk->thread.debugreg6 = condition;
928
929 /*
930 * Single-stepping through TF: make sure we ignore any events in
931 * kernel space (but re-enable TF when returning to user mode).
932 */
933 if (condition & DR_STEP) {
934 if (!user_mode(regs))
935 goto clear_TF_reenable;
936 }
937
938 /* Ok, finally something we can handle */
939 tsk->thread.trap_no = 1;
940 tsk->thread.error_code = error_code;
941 info.si_signo = SIGTRAP;
942 info.si_errno = 0;
943 info.si_code = get_si_code(condition);
944 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
945 force_sig_info(SIGTRAP, &info, tsk);
946
947clear_dr7:
948 set_debugreg(0, 7);
949 preempt_conditional_cli(regs);
950 return;
951
952clear_TF_reenable:
953 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
954 regs->flags &= ~X86_EFLAGS_TF;
955 preempt_conditional_cli(regs);
956 return;
957}
958
959static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
960{
961 if (fixup_exception(regs))
962 return 1;
963
964 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
965 /* Illegal floating point operation in the kernel */
966 current->thread.trap_no = trapnr;
967 die(str, regs, 0);
968 return 0;
969}
970
971/*
972 * Note that we play around with the 'TS' bit in an attempt to get
973 * the correct behaviour even in the presence of the asynchronous
974 * IRQ13 behaviour
975 */
976asmlinkage void do_coprocessor_error(struct pt_regs *regs)
977{
978 void __user *ip = (void __user *)(regs->ip);
979 struct task_struct *task;
980 siginfo_t info;
981 unsigned short cwd, swd;
982
983 conditional_sti(regs);
984 if (!user_mode(regs) &&
985 kernel_math_error(regs, "kernel x87 math error", 16))
986 return;
987
988 /*
989 * Save the info for the exception handler and clear the error.
990 */
991 task = current;
992 save_init_fpu(task);
993 task->thread.trap_no = 16;
994 task->thread.error_code = 0;
995 info.si_signo = SIGFPE;
996 info.si_errno = 0;
997 info.si_code = __SI_FAULT;
998 info.si_addr = ip;
999 /*
1000 * (~cwd & swd) will mask out exceptions that are not set to unmasked
1001 * status. 0x3f is the exception bits in these regs, 0x200 is the
1002 * C1 reg you need in case of a stack fault, 0x040 is the stack
1003 * fault bit. We should only be taking one exception at a time,
1004 * so if this combination doesn't produce any single exception,
1005 * then we have a bad program that isn't synchronizing its FPU usage
1006 * and it will suffer the consequences since we won't be able to
1007 * fully reproduce the context of the exception
1008 */
1009 cwd = get_fpu_cwd(task);
1010 swd = get_fpu_swd(task);
1011 switch (swd & ~cwd & 0x3f) {
1012 case 0x000: /* No unmasked exception */
1013 default: /* Multiple exceptions */
1014 break;
1015 case 0x001: /* Invalid Op */
1016 /*
1017 * swd & 0x240 == 0x040: Stack Underflow
1018 * swd & 0x240 == 0x240: Stack Overflow
1019 * User must clear the SF bit (0x40) if set
1020 */
1021 info.si_code = FPE_FLTINV;
1022 break;
1023 case 0x002: /* Denormalize */
1024 case 0x010: /* Underflow */
1025 info.si_code = FPE_FLTUND;
1026 break;
1027 case 0x004: /* Zero Divide */
1028 info.si_code = FPE_FLTDIV;
1029 break;
1030 case 0x008: /* Overflow */
1031 info.si_code = FPE_FLTOVF;
1032 break;
1033 case 0x020: /* Precision */
1034 info.si_code = FPE_FLTRES;
1035 break;
1036 }
1037 force_sig_info(SIGFPE, &info, task);
1038}
1039
1040asmlinkage void bad_intr(void)
1041{
1042 printk("bad interrupt");
1043}
1044
1045asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1046{
1047 void __user *ip = (void __user *)(regs->ip);
1048 struct task_struct *task;
1049 siginfo_t info;
1050 unsigned short mxcsr;
1051
1052 conditional_sti(regs);
1053 if (!user_mode(regs) &&
1054 kernel_math_error(regs, "kernel simd math error", 19))
1055 return;
1056
1057 /*
1058 * Save the info for the exception handler and clear the error.
1059 */
1060 task = current;
1061 save_init_fpu(task);
1062 task->thread.trap_no = 19;
1063 task->thread.error_code = 0;
1064 info.si_signo = SIGFPE;
1065 info.si_errno = 0;
1066 info.si_code = __SI_FAULT;
1067 info.si_addr = ip;
1068 /*
1069 * The SIMD FPU exceptions are handled a little differently, as there
1070 * is only a single status/control register. Thus, to determine which
1071 * unmasked exception was caught we must mask the exception mask bits
1072 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1073 */
1074 mxcsr = get_fpu_mxcsr(task);
1075 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1076 case 0x000:
1077 default:
1078 break;
1079 case 0x001: /* Invalid Op */
1080 info.si_code = FPE_FLTINV;
1081 break;
1082 case 0x002: /* Denormalize */
1083 case 0x010: /* Underflow */
1084 info.si_code = FPE_FLTUND;
1085 break;
1086 case 0x004: /* Zero Divide */
1087 info.si_code = FPE_FLTDIV;
1088 break;
1089 case 0x008: /* Overflow */
1090 info.si_code = FPE_FLTOVF;
1091 break;
1092 case 0x020: /* Precision */
1093 info.si_code = FPE_FLTRES;
1094 break;
1095 }
1096 force_sig_info(SIGFPE, &info, task);
1097}
1098
1099asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
1100{
1101}
1102
1103asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1104{
1105}
1106
1107asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1108{
1109}
1110
1111/*
1112 * 'math_state_restore()' saves the current math information in the
1113 * old math state array, and gets the new ones from the current task
1114 *
1115 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1116 * Don't touch unless you *really* know how it works.
1117 */
1118asmlinkage void math_state_restore(void)
1119{
1120 struct task_struct *me = current;
1121
1122 if (!used_math()) {
1123 local_irq_enable();
1124 /*
1125 * does a slab alloc which can sleep
1126 */
1127 if (init_fpu(me)) {
1128 /*
1129 * ran out of memory!
1130 */
1131 do_group_exit(SIGKILL);
1132 return;
1133 }
1134 local_irq_disable();
1135 }
1136
1137 clts(); /* Allow maths ops (or we recurse) */
1138 /*
1139 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
1140 */
1141 if (unlikely(restore_fpu_checking(me))) {
1142 stts();
1143 force_sig(SIGSEGV, me);
1144 return;
1145 }
1146 task_thread_info(me)->status |= TS_USEDFPU;
1147 me->fpu_counter++;
1148}
1149EXPORT_SYMBOL_GPL(math_state_restore);
1150
1151void __init trap_init(void)
1152{
1153 set_intr_gate(0, &divide_error);
1154 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1155 set_intr_gate_ist(2, &nmi, NMI_STACK);
1156 /* int3 can be called from all */
1157 set_system_gate_ist(3, &int3, DEBUG_STACK);
1158 /* int4 can be called from all */
1159 set_system_gate(4, &overflow);
1160 set_intr_gate(5, &bounds);
1161 set_intr_gate(6, &invalid_op);
1162 set_intr_gate(7, &device_not_available);
1163 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1164 set_intr_gate(9, &coprocessor_segment_overrun);
1165 set_intr_gate(10, &invalid_TSS);
1166 set_intr_gate(11, &segment_not_present);
1167 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1168 set_intr_gate(13, &general_protection);
1169 set_intr_gate(14, &page_fault);
1170 set_intr_gate(15, &spurious_interrupt_bug);
1171 set_intr_gate(16, &coprocessor_error);
1172 set_intr_gate(17, &alignment_check);
1173#ifdef CONFIG_X86_MCE
1174 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1175#endif
1176 set_intr_gate(19, &simd_coprocessor_error);
1177
1178#ifdef CONFIG_IA32_EMULATION
1179 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1180#endif
1181 /*
1182 * Should be a barrier for any external CPU state:
1183 */
1184 cpu_init();
1185}
1186
1187static int __init oops_setup(char *s)
1188{
1189 if (!s)
1190 return -EINVAL;
1191 if (!strcmp(s, "panic"))
1192 panic_on_oops = 1;
1193 return 0;
1194}
1195early_param("oops", oops_setup);
1196
1197static int __init kstack_setup(char *s)
1198{
1199 if (!s)
1200 return -EINVAL;
1201 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1202 return 0;
1203}
1204early_param("kstack", kstack_setup);
1205
1206static int __init code_bytes_setup(char *s)
1207{
1208 code_bytes = simple_strtoul(s, NULL, 0);
1209 if (code_bytes > 8192)
1210 code_bytes = 8192;
1211
1212 return 1;
1213}
1214__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 000000000000..aeef529917e4
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ functions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#include <linux/module.h>
12#include <linux/irq.h>
13
14#include <asm/apic.h>
15#include <asm/uv/uv_irq.h>
16
17static void uv_noop(unsigned int irq)
18{
19}
20
21static unsigned int uv_noop_ret(unsigned int irq)
22{
23 return 0;
24}
25
26static void uv_ack_apic(unsigned int irq)
27{
28 ack_APIC_irq();
29}
30
31struct irq_chip uv_irq_chip = {
32 .name = "UV-CORE",
33 .startup = uv_noop_ret,
34 .shutdown = uv_noop,
35 .enable = uv_noop,
36 .disable = uv_noop,
37 .ack = uv_noop,
38 .mask = uv_noop,
39 .unmask = uv_noop,
40 .eoi = uv_ack_apic,
41 .end = uv_noop,
42};
43
44/*
45 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised.
48 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset)
51{
52 int irq;
53 int ret;
54
55 irq = create_irq();
56 if (irq <= 0)
57 return -EBUSY;
58
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
60 if (ret != irq)
61 destroy_irq(irq);
62
63 return ret;
64}
65EXPORT_SYMBOL_GPL(uv_setup_irq);
66
67/*
68 * Tear down a mapping of an irq and vector, and disable the specified MMR that
69 * defined the MSI that was to be sent to the specified CPU when an interrupt
70 * was raised.
71 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
75{
76 arch_disable_uv_irq(mmr_blade, mmr_offset);
77 destroy_irq(irq);
78}
79EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 000000000000..67f9b9dbf800
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
1/*
2 * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/sysdev.h>
23#include <asm/uv/bios.h>
24
25struct kobject *sgi_uv_kobj;
26
27static ssize_t partition_id_show(struct kobject *kobj,
28 struct kobj_attribute *attr, char *buf)
29{
30 return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
31}
32
33static ssize_t coherence_id_show(struct kobject *kobj,
34 struct kobj_attribute *attr, char *buf)
35{
36 return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
37}
38
39static struct kobj_attribute partition_id_attr =
40 __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
41
42static struct kobj_attribute coherence_id_attr =
43 __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
44
45
46static int __init sgi_uv_sysfs_init(void)
47{
48 unsigned long ret;
49
50 if (!sgi_uv_kobj)
51 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
52 if (!sgi_uv_kobj) {
53 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
54 return -EINVAL;
55 }
56
57 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
58 if (ret) {
59 printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
60 return ret;
61 }
62
63 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
64 if (ret) {
65 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
66 return ret;
67 }
68
69 return 0;
70}
71
72device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 61a97e616f70..0c9667f0752a 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
484static unsigned int startup_cobalt_irq(unsigned int irq) 484static unsigned int startup_cobalt_irq(unsigned int irq)
485{ 485{
486 unsigned long flags; 486 unsigned long flags;
487 struct irq_desc *desc = irq_to_desc(irq);
487 488
488 spin_lock_irqsave(&cobalt_lock, flags); 489 spin_lock_irqsave(&cobalt_lock, flags);
489 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) 490 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
490 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); 491 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
491 enable_cobalt_irq(irq); 492 enable_cobalt_irq(irq);
492 spin_unlock_irqrestore(&cobalt_lock, flags); 493 spin_unlock_irqrestore(&cobalt_lock, flags);
493 return 0; 494 return 0;
@@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
506static void end_cobalt_irq(unsigned int irq) 507static void end_cobalt_irq(unsigned int irq)
507{ 508{
508 unsigned long flags; 509 unsigned long flags;
510 struct irq_desc *desc = irq_to_desc(irq);
509 511
510 spin_lock_irqsave(&cobalt_lock, flags); 512 spin_lock_irqsave(&cobalt_lock, flags);
511 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) 513 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
512 enable_cobalt_irq(irq); 514 enable_cobalt_irq(irq);
513 spin_unlock_irqrestore(&cobalt_lock, flags); 515 spin_unlock_irqrestore(&cobalt_lock, flags);
514} 516}
@@ -626,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
626 628
627 spin_unlock_irqrestore(&i8259A_lock, flags); 629 spin_unlock_irqrestore(&i8259A_lock, flags);
628 630
629 desc = irq_desc + realirq; 631 desc = irq_to_desc(realirq);
630 632
631 /* 633 /*
632 * handle this 'virtual interrupt' as a Cobalt one now. 634 * handle this 'virtual interrupt' as a Cobalt one now.
633 */ 635 */
634 kstat_cpu(smp_processor_id()).irqs[realirq]++; 636 kstat_incr_irqs_this_cpu(realirq, desc);
635 637
636 if (likely(desc->action != NULL)) 638 if (likely(desc->action != NULL))
637 handle_IRQ_event(realirq, desc->action); 639 handle_IRQ_event(realirq, desc->action);
@@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
662 int i; 664 int i;
663 665
664 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { 666 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
665 irq_desc[i].status = IRQ_DISABLED; 667 struct irq_desc *desc = irq_to_desc(i);
666 irq_desc[i].action = 0; 668
667 irq_desc[i].depth = 1; 669 desc->status = IRQ_DISABLED;
670 desc->action = 0;
671 desc->depth = 1;
668 672
669 if (i == 0) { 673 if (i == 0) {
670 irq_desc[i].chip = &cobalt_irq_type; 674 desc->chip = &cobalt_irq_type;
671 } 675 }
672 else if (i == CO_IRQ_IDE0) { 676 else if (i == CO_IRQ_IDE0) {
673 irq_desc[i].chip = &cobalt_irq_type; 677 desc->chip = &cobalt_irq_type;
674 } 678 }
675 else if (i == CO_IRQ_IDE1) { 679 else if (i == CO_IRQ_IDE1) {
676 irq_desc[i].chip = &cobalt_irq_type; 680 desc->chip = &cobalt_irq_type;
677 } 681 }
678 else if (i == CO_IRQ_8259) { 682 else if (i == CO_IRQ_8259) {
679 irq_desc[i].chip = &piix4_master_irq_type; 683 desc->chip = &piix4_master_irq_type;
680 } 684 }
681 else if (i < CO_IRQ_APIC0) { 685 else if (i < CO_IRQ_APIC0) {
682 irq_desc[i].chip = &piix4_virtual_irq_type; 686 desc->chip = &piix4_virtual_irq_type;
683 } 687 }
684 else if (IS_CO_APIC(i)) { 688 else if (IS_CO_APIC(i)) {
685 irq_desc[i].chip = &cobalt_irq_type; 689 desc->chip = &cobalt_irq_type;
686 } 690 }
687 } 691 }
688 692
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe289..254ee07f8635 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
235 235
236void __init vmi_time_init(void) 236void __init vmi_time_init(void)
237{ 237{
238 unsigned int cpu;
238 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ 239 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
239 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
240 241
241 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
242 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
244 for_each_possible_cpu(cpu)
245 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
243} 246}
244 247
245#ifdef CONFIG_X86_LOCAL_APIC 248#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f40..c02343594b4d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o) 6 coalesced_mmio.o irq_comm.o)
7ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
9endif 9endif
10ifeq ($(CONFIG_DMAR),y)
11common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
12endif
10 13
11EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
12 15
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a9124..634132a9a512 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 if (!atomic_inc_and_test(&pt->pending)) 201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 if (vcpu0 && waitqueue_active(&vcpu0->wq))
205 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
206 }
207 206
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 207 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
209 pt->scheduled = ktime_to_ns(pt->timer.expires); 208 pt->scheduled = ktime_to_ns(pt->timer.expires);
209 if (pt->period)
210 ps->channels[0].count_load_time = pt->timer.expires;
210 211
211 return (pt->period == 0 ? 0 : 1); 212 return (pt->period == 0 ? 0 : 1);
212} 213}
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
215{ 216{
216 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 217 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
217 218
218 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) 219 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
219 return atomic_read(&pit->pit_state.pit_timer.pending); 220 return atomic_read(&pit->pit_state.pit_timer.pending);
220
221 return 0; 221 return 0;
222} 222}
223 223
224static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
225{
226 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
227 irq_ack_notifier);
228 spin_lock(&ps->inject_lock);
229 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
230 atomic_inc(&ps->pit_timer.pending);
231 ps->irq_ack = 1;
232 spin_unlock(&ps->inject_lock);
233}
234
224static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 235static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
225{ 236{
226 struct kvm_kpit_state *ps; 237 struct kvm_kpit_state *ps;
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
255 hrtimer_cancel(&pt->timer); 266 hrtimer_cancel(&pt->timer);
256} 267}
257 268
258static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) 269static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
259{ 270{
271 struct kvm_kpit_timer *pt = &ps->pit_timer;
260 s64 interval; 272 s64 interval;
261 273
262 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 274 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
268 pt->period = (is_period == 0) ? 0 : interval; 280 pt->period = (is_period == 0) ? 0 : interval;
269 pt->timer.function = pit_timer_fn; 281 pt->timer.function = pit_timer_fn;
270 atomic_set(&pt->pending, 0); 282 atomic_set(&pt->pending, 0);
283 ps->irq_ack = 1;
271 284
272 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 285 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
273 HRTIMER_MODE_ABS); 286 HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
302 case 1: 315 case 1:
303 /* FIXME: enhance mode 4 precision */ 316 /* FIXME: enhance mode 4 precision */
304 case 4: 317 case 4:
305 create_pit_timer(&ps->pit_timer, val, 0); 318 create_pit_timer(ps, val, 0);
306 break; 319 break;
307 case 2: 320 case 2:
308 case 3: 321 case 3:
309 create_pit_timer(&ps->pit_timer, val, 1); 322 create_pit_timer(ps, val, 1);
310 break; 323 break;
311 default: 324 default:
312 destroy_pit_timer(&ps->pit_timer); 325 destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
520 mutex_unlock(&pit->pit_state.lock); 533 mutex_unlock(&pit->pit_state.lock);
521 534
522 atomic_set(&pit->pit_state.pit_timer.pending, 0); 535 atomic_set(&pit->pit_state.pit_timer.pending, 0);
523 pit->pit_state.inject_pending = 1; 536 pit->pit_state.irq_ack = 1;
524} 537}
525 538
526struct kvm_pit *kvm_create_pit(struct kvm *kvm) 539struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
534 547
535 mutex_init(&pit->pit_state.lock); 548 mutex_init(&pit->pit_state.lock);
536 mutex_lock(&pit->pit_state.lock); 549 mutex_lock(&pit->pit_state.lock);
550 spin_lock_init(&pit->pit_state.inject_lock);
537 551
538 /* Initialize PIO device */ 552 /* Initialize PIO device */
539 pit->dev.read = pit_ioport_read; 553 pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
555 pit_state->pit = pit; 569 pit_state->pit = pit;
556 hrtimer_init(&pit_state->pit_timer.timer, 570 hrtimer_init(&pit_state->pit_timer.timer,
557 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 571 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
572 pit_state->irq_ack_notifier.gsi = 0;
573 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
574 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
558 mutex_unlock(&pit->pit_state.lock); 575 mutex_unlock(&pit->pit_state.lock);
559 576
560 kvm_pit_reset(pit); 577 kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
578static void __inject_pit_timer_intr(struct kvm *kvm) 595static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 596{
580 mutex_lock(&kvm->lock); 597 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 598 kvm_set_irq(kvm, 0, 1);
582 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); 599 kvm_set_irq(kvm, 0, 0);
583 kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
584 kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
585 mutex_unlock(&kvm->lock); 600 mutex_unlock(&kvm->lock);
586} 601}
587 602
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
592 struct kvm_kpit_state *ps; 607 struct kvm_kpit_state *ps;
593 608
594 if (vcpu && pit) { 609 if (vcpu && pit) {
610 int inject = 0;
595 ps = &pit->pit_state; 611 ps = &pit->pit_state;
596 612
597 /* Try to inject pending interrupts when: 613 /* Try to inject pending interrupts when
598 * 1. Pending exists 614 * last one has been acked.
599 * 2. Last interrupt was accepted or waited for too long time*/ 615 */
600 if (atomic_read(&ps->pit_timer.pending) && 616 spin_lock(&ps->inject_lock);
601 (ps->inject_pending || 617 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
602 (jiffies - ps->last_injected_time 618 ps->irq_ack = 0;
603 >= KVM_MAX_PIT_INTR_INTERVAL))) { 619 inject = 1;
604 ps->inject_pending = 0;
605 __inject_pit_timer_intr(kvm);
606 ps->last_injected_time = jiffies;
607 }
608 }
609}
610
611void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
612{
613 struct kvm_arch *arch = &vcpu->kvm->arch;
614 struct kvm_kpit_state *ps;
615
616 if (vcpu && arch->vpit) {
617 ps = &arch->vpit->pit_state;
618 if (atomic_read(&ps->pit_timer.pending) &&
619 (((arch->vpic->pics[0].imr & 1) == 0 &&
620 arch->vpic->pics[0].irq_base == vec) ||
621 (arch->vioapic->redirtbl[0].fields.vector == vec &&
622 arch->vioapic->redirtbl[0].fields.mask != 1))) {
623 ps->inject_pending = 1;
624 atomic_dec(&ps->pit_timer.pending);
625 ps->channels[0].count_load_time = ktime_get();
626 } 620 }
621 spin_unlock(&ps->inject_lock);
622 if (inject)
623 __inject_pit_timer_intr(kvm);
627 } 624 }
628} 625}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c4..e436d4983aa1 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
8 int irq; 8 int irq;
9 s64 period; /* unit: ns */ 9 s64 period; /* unit: ns */
10 s64 scheduled; 10 s64 scheduled;
11 ktime_t last_update;
12 atomic_t pending; 11 atomic_t pending;
13}; 12};
14 13
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
34 u32 speaker_data_on; 33 u32 speaker_data_on;
35 struct mutex lock; 34 struct mutex lock;
36 struct kvm_pit *pit; 35 struct kvm_pit *pit;
37 bool inject_pending; /* if inject pending interrupts */ 36 spinlock_t inject_lock;
38 unsigned long last_injected_time; 37 unsigned long irq_ack;
38 struct kvm_irq_ack_notifier irq_ack_notifier;
39}; 39};
40 40
41struct kvm_pit { 41struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
54#define KVM_PIT_CHANNEL_MASK 0x3 54#define KVM_PIT_CHANNEL_MASK 0x3
55 55
56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); 56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
57void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
58void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); 57void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
59struct kvm_pit *kvm_create_pit(struct kvm *kvm); 58struct kvm_pit *kvm_create_pit(struct kvm *kvm);
60void kvm_free_pit(struct kvm *kvm); 59void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa46..17e41e165f1a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
30 30
31#include <linux/kvm_host.h> 31#include <linux/kvm_host.h>
32 32
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{
35 s->isr &= ~(1 << irq);
36 s->isr_ack |= (1 << irq);
37}
38
39void kvm_pic_clear_isr_ack(struct kvm *kvm)
40{
41 struct kvm_pic *s = pic_irqchip(kvm);
42 s->pics[0].isr_ack = 0xff;
43 s->pics[1].isr_ack = 0xff;
44}
45
33/* 46/*
34 * set irq level. If an edge is detected, then the IRR is set to 1 47 * set irq level. If an edge is detected, then the IRR is set to 1
35 */ 48 */
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
141 */ 154 */
142static inline void pic_intack(struct kvm_kpic_state *s, int irq) 155static inline void pic_intack(struct kvm_kpic_state *s, int irq)
143{ 156{
157 s->isr |= 1 << irq;
144 if (s->auto_eoi) { 158 if (s->auto_eoi) {
145 if (s->rotate_on_auto_eoi) 159 if (s->rotate_on_auto_eoi)
146 s->priority_add = (irq + 1) & 7; 160 s->priority_add = (irq + 1) & 7;
147 } else 161 pic_clear_isr(s, irq);
148 s->isr |= (1 << irq); 162 }
149 /* 163 /*
150 * We don't clear a level sensitive interrupt here 164 * We don't clear a level sensitive interrupt here
151 */ 165 */
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
153 s->irr &= ~(1 << irq); 167 s->irr &= ~(1 << irq);
154} 168}
155 169
156int kvm_pic_read_irq(struct kvm_pic *s) 170int kvm_pic_read_irq(struct kvm *kvm)
157{ 171{
158 int irq, irq2, intno; 172 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm);
159 174
160 irq = pic_get_irq(&s->pics[0]); 175 irq = pic_get_irq(&s->pics[0]);
161 if (irq >= 0) { 176 if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
181 intno = s->pics[0].irq_base + irq; 196 intno = s->pics[0].irq_base + irq;
182 } 197 }
183 pic_update_irq(s); 198 pic_update_irq(s);
199 kvm_notify_acked_irq(kvm, irq);
184 200
185 return intno; 201 return intno;
186} 202}
187 203
188void kvm_pic_reset(struct kvm_kpic_state *s) 204void kvm_pic_reset(struct kvm_kpic_state *s)
189{ 205{
206 int irq, irqbase;
207 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209
210 if (s == &s->pics_state->pics[0])
211 irqbase = 0;
212 else
213 irqbase = 8;
214
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq))
218 kvm_notify_acked_irq(kvm, irq+irqbase);
219 }
190 s->last_irr = 0; 220 s->last_irr = 0;
191 s->irr = 0; 221 s->irr = 0;
192 s->imr = 0; 222 s->imr = 0;
193 s->isr = 0; 223 s->isr = 0;
224 s->isr_ack = 0xff;
194 s->priority_add = 0; 225 s->priority_add = 0;
195 s->irq_base = 0; 226 s->irq_base = 0;
196 s->read_reg_select = 0; 227 s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
243 priority = get_priority(s, s->isr); 274 priority = get_priority(s, s->isr);
244 if (priority != 8) { 275 if (priority != 8) {
245 irq = (priority + s->priority_add) & 7; 276 irq = (priority + s->priority_add) & 7;
246 s->isr &= ~(1 << irq); 277 pic_clear_isr(s, irq);
247 if (cmd == 5) 278 if (cmd == 5)
248 s->priority_add = (irq + 1) & 7; 279 s->priority_add = (irq + 1) & 7;
249 pic_update_irq(s->pics_state); 280 pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
251 break; 282 break;
252 case 3: 283 case 3:
253 irq = val & 7; 284 irq = val & 7;
254 s->isr &= ~(1 << irq); 285 pic_clear_isr(s, irq);
255 pic_update_irq(s->pics_state); 286 pic_update_irq(s->pics_state);
256 break; 287 break;
257 case 6: 288 case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
260 break; 291 break;
261 case 7: 292 case 7:
262 irq = val & 7; 293 irq = val & 7;
263 s->isr &= ~(1 << irq);
264 s->priority_add = (irq + 1) & 7; 294 s->priority_add = (irq + 1) & 7;
295 pic_clear_isr(s, irq);
265 pic_update_irq(s->pics_state); 296 pic_update_irq(s->pics_state);
266 break; 297 break;
267 default: 298 default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
303 s->pics_state->pics[0].irr &= ~(1 << 2); 334 s->pics_state->pics[0].irr &= ~(1 << 2);
304 } 335 }
305 s->irr &= ~(1 << ret); 336 s->irr &= ~(1 << ret);
306 s->isr &= ~(1 << ret); 337 pic_clear_isr(s, ret);
307 if (addr1 >> 7 || ret != 2) 338 if (addr1 >> 7 || ret != 2)
308 pic_update_irq(s->pics_state); 339 pic_update_irq(s->pics_state);
309 } else { 340 } else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
422{ 453{
423 struct kvm *kvm = opaque; 454 struct kvm *kvm = opaque;
424 struct kvm_vcpu *vcpu = kvm->vcpus[0]; 455 struct kvm_vcpu *vcpu = kvm->vcpus[0];
456 struct kvm_pic *s = pic_irqchip(kvm);
457 int irq = pic_get_irq(&s->pics[0]);
425 458
426 pic_irqchip(kvm)->output = level; 459 s->output = level;
427 if (vcpu) 460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq);
428 kvm_vcpu_kick(vcpu); 462 kvm_vcpu_kick(vcpu);
463 }
429} 464}
430 465
431struct kvm_pic *kvm_create_pic(struct kvm *kvm) 466struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f664..c019b8edcdb7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
72 if (kvm_apic_accept_pic_intr(v)) { 72 if (kvm_apic_accept_pic_intr(v)) {
73 s = pic_irqchip(v->kvm); 73 s = pic_irqchip(v->kvm);
74 s->output = 0; /* PIC */ 74 s->output = 0; /* PIC */
75 vector = kvm_pic_read_irq(s); 75 vector = kvm_pic_read_irq(v->kvm);
76 } 76 }
77 } 77 }
78 return vector; 78 return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
91{ 91{
92 kvm_apic_timer_intr_post(vcpu, vec); 92 kvm_apic_timer_intr_post(vcpu, vec);
93 kvm_pit_timer_intr_post(vcpu, vec);
94 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
95} 94}
96EXPORT_SYMBOL_GPL(kvm_timer_intr_post); 95EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48bb..f17c8f5bbf31 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
42 u8 irr; /* interrupt request register */ 42 u8 irr; /* interrupt request register */
43 u8 imr; /* interrupt mask register */ 43 u8 imr; /* interrupt mask register */
44 u8 isr; /* interrupt service register */ 44 u8 isr; /* interrupt service register */
45 u8 isr_ack; /* interrupt ack detection */
45 u8 priority_add; /* highest irq priority */ 46 u8 priority_add; /* highest irq priority */
46 u8 irq_base; 47 u8 irq_base;
47 u8 read_reg_select; 48 u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
63 void *irq_request_opaque; 64 void *irq_request_opaque;
64 int output; /* intr from master PIC */ 65 int output; /* intr from master PIC */
65 struct kvm_io_device dev; 66 struct kvm_io_device dev;
67 void (*ack_notifier)(void *opaque, int irq);
66}; 68};
67 69
68struct kvm_pic *kvm_create_pic(struct kvm *kvm); 70struct kvm_pic *kvm_create_pic(struct kvm *kvm);
69void kvm_pic_set_irq(void *opaque, int irq, int level); 71int kvm_pic_read_irq(struct kvm *kvm);
70int kvm_pic_read_irq(struct kvm_pic *s);
71void kvm_pic_update_irq(struct kvm_pic *s); 72void kvm_pic_update_irq(struct kvm_pic *s);
73void kvm_pic_clear_isr_ack(struct kvm *kvm);
72 74
73static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 75static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
74{ 76{
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000000..1ff819dce7d3
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H
3
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg)
6{
7 if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
8 kvm_x86_ops->cache_reg(vcpu, reg);
9
10 return vcpu->arch.regs[reg];
11}
12
13static inline void kvm_register_write(struct kvm_vcpu *vcpu,
14 enum kvm_reg reg,
15 unsigned long val)
16{
17 vcpu->arch.regs[reg] = val;
18 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
19 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
20}
21
22static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
23{
24 return kvm_register_read(vcpu, VCPU_REGS_RIP);
25}
26
27static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
28{
29 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
30}
31
32#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f67..6571926bfd33 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include "kvm_cache_regs.h"
35#include "irq.h" 36#include "irq.h"
36 37
37#define PRId64 "d" 38#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
338 } else 339 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR); 340 apic_clear_vector(vector, apic->regs + APIC_TMR);
340 341
341 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 342 kvm_vcpu_kick(vcpu);
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348 343
349 result = (orig_irr == 0); 344 result = (orig_irr == 0);
350 break; 345 break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
370 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 365 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
371 kvm_vcpu_kick(vcpu); 366 kvm_vcpu_kick(vcpu);
372 } else { 367 } else {
373 printk(KERN_DEBUG 368 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
374 "Ignoring de-assert INIT to vcpu %d\n", 369 vcpu->vcpu_id);
375 vcpu->vcpu_id);
376 } 370 }
377
378 break; 371 break;
379 372
380 case APIC_DM_STARTUP: 373 case APIC_DM_STARTUP:
381 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 374 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
382 vcpu->vcpu_id, vector); 375 vcpu->vcpu_id, vector);
383 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 376 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
384 vcpu->arch.sipi_vector = vector; 377 vcpu->arch.sipi_vector = vector;
385 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 378 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
386 if (waitqueue_active(&vcpu->wq)) 379 kvm_vcpu_kick(vcpu);
387 wake_up_interruptible(&vcpu->wq);
388 } 380 }
389 break; 381 break;
390 382
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
438static void apic_set_eoi(struct kvm_lapic *apic) 430static void apic_set_eoi(struct kvm_lapic *apic)
439{ 431{
440 int vector = apic_find_highest_isr(apic); 432 int vector = apic_find_highest_isr(apic);
441 433 int trigger_mode;
442 /* 434 /*
443 * Not every write EOI will has corresponding ISR, 435 * Not every write EOI will has corresponding ISR,
444 * one example is when Kernel check timer on setup_IO_APIC 436 * one example is when Kernel check timer on setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
450 apic_update_ppr(apic); 442 apic_update_ppr(apic);
451 443
452 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
453 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); 445 trigger_mode = IOAPIC_LEVEL_TRIG;
446 else
447 trigger_mode = IOAPIC_EDGE_TRIG;
448 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
454} 449}
455 450
456static void apic_send_ipi(struct kvm_lapic *apic) 451static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
558 struct kvm_run *run = vcpu->run; 553 struct kvm_run *run = vcpu->run;
559 554
560 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 555 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
561 kvm_x86_ops->cache_regs(vcpu); 556 run->tpr_access.rip = kvm_rip_read(vcpu);
562 run->tpr_access.rip = vcpu->arch.rip;
563 run->tpr_access.is_write = write; 557 run->tpr_access.is_write = write;
564} 558}
565 559
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
683 * Refer SDM 8.4.1 677 * Refer SDM 8.4.1
684 */ 678 */
685 if (len != 4 || alignment) { 679 if (len != 4 || alignment) {
686 if (printk_ratelimit()) 680 /* Don't shout loud, $infamous_os would cause only noise. */
687 printk(KERN_ERR "apic write: bad size=%d %lx\n", 681 apic_debug("apic write: bad size=%d %lx\n",
688 len, (long)address); 682 len, (long)address);
689 return; 683 return;
690 } 684 }
691 685
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
947 941
948 if(!atomic_inc_and_test(&apic->timer.pending)) 942 if(!atomic_inc_and_test(&apic->timer.pending))
949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
950 if (waitqueue_active(q)) { 944 if (waitqueue_active(q))
951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
952 wake_up_interruptible(q); 945 wake_up_interruptible(q);
953 } 946
954 if (apic_lvtt_period(apic)) { 947 if (apic_lvtt_period(apic)) {
955 result = 1; 948 result = 1;
956 apic->timer.dev.expires = ktime_add_ns( 949 apic->timer.dev.expires = ktime_add_ns(
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22a..99c239c5c0ac 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
70module_param(dbg, bool, 0644); 70module_param(dbg, bool, 0644);
71#endif 71#endif
72 72
73static int oos_shadow = 1;
74module_param(oos_shadow, bool, 0644);
75
73#ifndef MMU_DEBUG 76#ifndef MMU_DEBUG
74#define ASSERT(x) do { } while (0) 77#define ASSERT(x) do { } while (0)
75#else 78#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
135#define ACC_USER_MASK PT_USER_MASK 138#define ACC_USER_MASK PT_USER_MASK
136#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 139#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
137 140
138struct kvm_pv_mmu_op_buffer { 141#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
139 void *ptr;
140 unsigned len;
141 unsigned processed;
142 char buf[512] __aligned(sizeof(long));
143};
144 142
145struct kvm_rmap_desc { 143struct kvm_rmap_desc {
146 u64 *shadow_ptes[RMAP_EXT]; 144 u64 *shadow_ptes[RMAP_EXT];
147 struct kvm_rmap_desc *more; 145 struct kvm_rmap_desc *more;
148}; 146};
149 147
148struct kvm_shadow_walk {
149 int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
150 u64 addr, u64 *spte, int level);
151};
152
153struct kvm_unsync_walk {
154 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
155};
156
157typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
158
150static struct kmem_cache *pte_chain_cache; 159static struct kmem_cache *pte_chain_cache;
151static struct kmem_cache *rmap_desc_cache; 160static struct kmem_cache *rmap_desc_cache;
152static struct kmem_cache *mmu_page_header_cache; 161static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
405{ 414{
406 struct vm_area_struct *vma; 415 struct vm_area_struct *vma;
407 unsigned long addr; 416 unsigned long addr;
417 int ret = 0;
408 418
409 addr = gfn_to_hva(kvm, gfn); 419 addr = gfn_to_hva(kvm, gfn);
410 if (kvm_is_error_hva(addr)) 420 if (kvm_is_error_hva(addr))
411 return 0; 421 return ret;
412 422
423 down_read(&current->mm->mmap_sem);
413 vma = find_vma(current->mm, addr); 424 vma = find_vma(current->mm, addr);
414 if (vma && is_vm_hugetlb_page(vma)) 425 if (vma && is_vm_hugetlb_page(vma))
415 return 1; 426 ret = 1;
427 up_read(&current->mm->mmap_sem);
416 428
417 return 0; 429 return ret;
418} 430}
419 431
420static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 432static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
649 661
650 if (write_protected) 662 if (write_protected)
651 kvm_flush_remote_tlbs(kvm); 663 kvm_flush_remote_tlbs(kvm);
652
653 account_shadowed(kvm, gfn);
654} 664}
655 665
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
859 BUG(); 869 BUG();
860} 870}
861 871
872
873static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
874 mmu_parent_walk_fn fn)
875{
876 struct kvm_pte_chain *pte_chain;
877 struct hlist_node *node;
878 struct kvm_mmu_page *parent_sp;
879 int i;
880
881 if (!sp->multimapped && sp->parent_pte) {
882 parent_sp = page_header(__pa(sp->parent_pte));
883 fn(vcpu, parent_sp);
884 mmu_parent_walk(vcpu, parent_sp, fn);
885 return;
886 }
887 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
888 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
889 if (!pte_chain->parent_ptes[i])
890 break;
891 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
892 fn(vcpu, parent_sp);
893 mmu_parent_walk(vcpu, parent_sp, fn);
894 }
895}
896
897static void kvm_mmu_update_unsync_bitmap(u64 *spte)
898{
899 unsigned int index;
900 struct kvm_mmu_page *sp = page_header(__pa(spte));
901
902 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap);
904 sp->unsync_children = 1;
905}
906
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
908{
909 struct kvm_pte_chain *pte_chain;
910 struct hlist_node *node;
911 int i;
912
913 if (!sp->parent_pte)
914 return;
915
916 if (!sp->multimapped) {
917 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
918 return;
919 }
920
921 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
922 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
923 if (!pte_chain->parent_ptes[i])
924 break;
925 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
926 }
927}
928
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp);
933 return 1;
934}
935
936static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
937 struct kvm_mmu_page *sp)
938{
939 mmu_parent_walk(vcpu, sp, unsync_walk_fn);
940 kvm_mmu_update_parents_unsync(sp);
941}
942
862static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 943static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
863 struct kvm_mmu_page *sp) 944 struct kvm_mmu_page *sp)
864{ 945{
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
868 sp->spt[i] = shadow_trap_nonpresent_pte; 949 sp->spt[i] = shadow_trap_nonpresent_pte;
869} 950}
870 951
952static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
953 struct kvm_mmu_page *sp)
954{
955 return 1;
956}
957
958static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{
960}
961
962#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1))
966
967static int mmu_unsync_walk(struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker)
969{
970 int i, ret;
971
972 if (!sp->unsync_children)
973 return 0;
974
975 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i];
977
978 if (is_shadow_present_pte(ent)) {
979 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK);
981
982 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker);
984 if (ret)
985 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 }
988
989 if (child->unsync) {
990 ret = walker->entry(child, walker);
991 __clear_bit(i, sp->unsync_child_bitmap);
992 if (ret)
993 return ret;
994 }
995 }
996 }
997
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0;
1000
1001 return 0;
1002}
1003
871static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
872{ 1005{
873 unsigned index; 1006 unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
888 return NULL; 1021 return NULL;
889} 1022}
890 1023
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{
1026 WARN_ON(!sp->unsync);
1027 sp->unsync = 0;
1028 --kvm->stat.mmu_unsync;
1029}
1030
1031static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1032
1033static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1034{
1035 if (sp->role.glevels != vcpu->arch.mmu.root_level) {
1036 kvm_mmu_zap_page(vcpu->kvm, sp);
1037 return 1;
1038 }
1039
1040 rmap_write_protect(vcpu->kvm, sp->gfn);
1041 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1042 kvm_mmu_zap_page(vcpu->kvm, sp);
1043 return 1;
1044 }
1045
1046 kvm_mmu_flush_tlb(vcpu);
1047 kvm_unlink_unsync_page(vcpu->kvm, sp);
1048 return 0;
1049}
1050
1051struct sync_walker {
1052 struct kvm_vcpu *vcpu;
1053 struct kvm_unsync_walk walker;
1054};
1055
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1057{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061
1062 kvm_sync_page(vcpu, sp);
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
1064}
1065
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1067{
1068 struct sync_walker walker = {
1069 .walker = { .entry = mmu_sync_fn, },
1070 .vcpu = vcpu,
1071 };
1072
1073 while (mmu_unsync_walk(sp, &walker.walker))
1074 cond_resched_lock(&vcpu->kvm->mmu_lock);
1075}
1076
891static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
892 gfn_t gfn, 1078 gfn_t gfn,
893 gva_t gaddr, 1079 gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
901 unsigned quadrant; 1087 unsigned quadrant;
902 struct hlist_head *bucket; 1088 struct hlist_head *bucket;
903 struct kvm_mmu_page *sp; 1089 struct kvm_mmu_page *sp;
904 struct hlist_node *node; 1090 struct hlist_node *node, *tmp;
905 1091
906 role.word = 0; 1092 role.word = 0;
907 role.glevels = vcpu->arch.mmu.root_level; 1093 role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
917 gfn, role.word); 1103 gfn, role.word);
918 index = kvm_page_table_hashfn(gfn); 1104 index = kvm_page_table_hashfn(gfn);
919 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1105 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
920 hlist_for_each_entry(sp, node, bucket, hash_link) 1106 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
921 if (sp->gfn == gfn && sp->role.word == role.word) { 1107 if (sp->gfn == gfn) {
1108 if (sp->unsync)
1109 if (kvm_sync_page(vcpu, sp))
1110 continue;
1111
1112 if (sp->role.word != role.word)
1113 continue;
1114
922 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1115 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1116 if (sp->unsync_children) {
1117 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1118 kvm_mmu_mark_parents_unsync(vcpu, sp);
1119 }
923 pgprintk("%s: found\n", __func__); 1120 pgprintk("%s: found\n", __func__);
924 return sp; 1121 return sp;
925 } 1122 }
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
931 sp->gfn = gfn; 1128 sp->gfn = gfn;
932 sp->role = role; 1129 sp->role = role;
933 hlist_add_head(&sp->hash_link, bucket); 1130 hlist_add_head(&sp->hash_link, bucket);
934 if (!metaphysical) 1131 if (!metaphysical) {
935 rmap_write_protect(vcpu->kvm, gfn); 1132 rmap_write_protect(vcpu->kvm, gfn);
1133 account_shadowed(vcpu->kvm, gfn);
1134 }
936 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
937 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1136 vcpu->arch.mmu.prefetch_page(vcpu, sp);
938 else 1137 else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
940 return sp; 1139 return sp;
941} 1140}
942 1141
1142static int walk_shadow(struct kvm_shadow_walk *walker,
1143 struct kvm_vcpu *vcpu, u64 addr)
1144{
1145 hpa_t shadow_addr;
1146 int level;
1147 int r;
1148 u64 *sptep;
1149 unsigned index;
1150
1151 shadow_addr = vcpu->arch.mmu.root_hpa;
1152 level = vcpu->arch.mmu.shadow_root_level;
1153 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK;
1156 --level;
1157 }
1158
1159 while (level >= PT_PAGE_TABLE_LEVEL) {
1160 index = SHADOW_PT_INDEX(addr, level);
1161 sptep = ((u64 *)__va(shadow_addr)) + index;
1162 r = walker->entry(walker, vcpu, addr, sptep, level);
1163 if (r)
1164 return r;
1165 shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
1166 --level;
1167 }
1168 return 0;
1169}
1170
943static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1171static void kvm_mmu_page_unlink_children(struct kvm *kvm,
944 struct kvm_mmu_page *sp) 1172 struct kvm_mmu_page *sp)
945{ 1173{
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
955 rmap_remove(kvm, &pt[i]); 1183 rmap_remove(kvm, &pt[i]);
956 pt[i] = shadow_trap_nonpresent_pte; 1184 pt[i] = shadow_trap_nonpresent_pte;
957 } 1185 }
958 kvm_flush_remote_tlbs(kvm);
959 return; 1186 return;
960 } 1187 }
961 1188
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
974 } 1201 }
975 pt[i] = shadow_trap_nonpresent_pte; 1202 pt[i] = shadow_trap_nonpresent_pte;
976 } 1203 }
977 kvm_flush_remote_tlbs(kvm);
978} 1204}
979 1205
980static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1206static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
991 kvm->vcpus[i]->arch.last_pte_updated = NULL; 1217 kvm->vcpus[i]->arch.last_pte_updated = NULL;
992} 1218}
993 1219
994static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1220static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
995{ 1221{
996 u64 *parent_pte; 1222 u64 *parent_pte;
997 1223
998 ++kvm->stat.mmu_shadow_zapped;
999 while (sp->multimapped || sp->parent_pte) { 1224 while (sp->multimapped || sp->parent_pte) {
1000 if (!sp->multimapped) 1225 if (!sp->multimapped)
1001 parent_pte = sp->parent_pte; 1226 parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1010 kvm_mmu_put_page(sp, parent_pte); 1235 kvm_mmu_put_page(sp, parent_pte);
1011 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1236 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
1012 } 1237 }
1238}
1239
1240struct zap_walker {
1241 struct kvm_unsync_walk walker;
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
1249 walker);
1250 kvm_mmu_zap_page(zap_walk->kvm, sp);
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0;
1265 mmu_unsync_walk(sp, &walker.walker);
1266 return walker.zapped;
1267}
1268
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1270{
1271 int ret;
1272 ++kvm->stat.mmu_shadow_zapped;
1273 ret = mmu_zap_unsync_children(kvm, sp);
1013 kvm_mmu_page_unlink_children(kvm, sp); 1274 kvm_mmu_page_unlink_children(kvm, sp);
1275 kvm_mmu_unlink_parents(kvm, sp);
1276 kvm_flush_remote_tlbs(kvm);
1277 if (!sp->role.invalid && !sp->role.metaphysical)
1278 unaccount_shadowed(kvm, sp->gfn);
1279 if (sp->unsync)
1280 kvm_unlink_unsync_page(kvm, sp);
1014 if (!sp->root_count) { 1281 if (!sp->root_count) {
1015 if (!sp->role.metaphysical && !sp->role.invalid)
1016 unaccount_shadowed(kvm, sp->gfn);
1017 hlist_del(&sp->hash_link); 1282 hlist_del(&sp->hash_link);
1018 kvm_mmu_free_page(kvm, sp); 1283 kvm_mmu_free_page(kvm, sp);
1019 } else { 1284 } else {
1020 int invalid = sp->role.invalid;
1021 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1022 sp->role.invalid = 1; 1285 sp->role.invalid = 1;
1286 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1023 kvm_reload_remote_mmus(kvm); 1287 kvm_reload_remote_mmus(kvm);
1024 if (!sp->role.metaphysical && !invalid)
1025 unaccount_shadowed(kvm, sp->gfn);
1026 } 1288 }
1027 kvm_mmu_reset_last_pte_updated(kvm); 1289 kvm_mmu_reset_last_pte_updated(kvm);
1290 return ret;
1028} 1291}
1029 1292
1030/* 1293/*
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1077 if (sp->gfn == gfn && !sp->role.metaphysical) { 1340 if (sp->gfn == gfn && !sp->role.metaphysical) {
1078 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1341 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1079 sp->role.word); 1342 sp->role.word);
1080 kvm_mmu_zap_page(kvm, sp);
1081 r = 1; 1343 r = 1;
1344 if (kvm_mmu_zap_page(kvm, sp))
1345 n = bucket->first;
1082 } 1346 }
1083 return r; 1347 return r;
1084} 1348}
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1101 __set_bit(slot, &sp->slot_bitmap); 1365 __set_bit(slot, &sp->slot_bitmap);
1102} 1366}
1103 1367
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1369{
1370 int i;
1371 u64 *pt = sp->spt;
1372
1373 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1374 return;
1375
1376 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1377 if (pt[i] == shadow_notrap_nonpresent_pte)
1378 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
1379 }
1380}
1381
1104struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) 1382struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1105{ 1383{
1106 struct page *page; 1384 struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1110 if (gpa == UNMAPPED_GVA) 1388 if (gpa == UNMAPPED_GVA)
1111 return NULL; 1389 return NULL;
1112 1390
1113 down_read(&current->mm->mmap_sem);
1114 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1391 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1115 up_read(&current->mm->mmap_sem);
1116 1392
1117 return page; 1393 return page;
1118} 1394}
1119 1395
1120static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1121 unsigned pt_access, unsigned pte_access,
1122 int user_fault, int write_fault, int dirty,
1123 int *ptwrite, int largepage, gfn_t gfn,
1124 pfn_t pfn, bool speculative)
1125{ 1397{
1126 u64 spte; 1398 unsigned index;
1127 int was_rmapped = 0; 1399 struct hlist_head *bucket;
1128 int was_writeble = is_writeble_pte(*shadow_pte); 1400 struct kvm_mmu_page *s;
1401 struct hlist_node *node, *n;
1129 1402
1130 pgprintk("%s: spte %llx access %x write_fault %d" 1403 index = kvm_page_table_hashfn(sp->gfn);
1131 " user_fault %d gfn %lx\n", 1404 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1132 __func__, *shadow_pte, pt_access, 1405 /* don't unsync if pagetable is shadowed with multiple roles */
1133 write_fault, user_fault, gfn); 1406 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1407 if (s->gfn != sp->gfn || s->role.metaphysical)
1408 continue;
1409 if (s->role.word != sp->role.word)
1410 return 1;
1411 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1;
1415 mmu_convert_notrap(sp);
1416 return 0;
1417}
1134 1418
1135 if (is_rmap_pte(*shadow_pte)) { 1419static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1136 /* 1420 bool can_unsync)
1137 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1421{
1138 * the parent of the now unreachable PTE. 1422 struct kvm_mmu_page *shadow;
1139 */
1140 if (largepage && !is_large_pte(*shadow_pte)) {
1141 struct kvm_mmu_page *child;
1142 u64 pte = *shadow_pte;
1143 1423
1144 child = page_header(pte & PT64_BASE_ADDR_MASK); 1424 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1145 mmu_page_remove_parent_pte(child, shadow_pte); 1425 if (shadow) {
1146 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1426 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1147 pgprintk("hfn old %lx new %lx\n", 1427 return 1;
1148 spte_to_pfn(*shadow_pte), pfn); 1428 if (shadow->unsync)
1149 rmap_remove(vcpu->kvm, shadow_pte); 1429 return 0;
1150 } else { 1430 if (can_unsync && oos_shadow)
1151 if (largepage) 1431 return kvm_unsync_page(vcpu, shadow);
1152 was_rmapped = is_large_pte(*shadow_pte); 1432 return 1;
1153 else
1154 was_rmapped = 1;
1155 }
1156 } 1433 }
1434 return 0;
1435}
1157 1436
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync)
1442{
1443 u64 spte;
1444 int ret = 0;
1158 /* 1445 /*
1159 * We don't set the accessed bit, since we sometimes want to see 1446 * We don't set the accessed bit, since we sometimes want to see
1160 * whether the guest actually used the pte (in order to detect 1447 * whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1162 */ 1449 */
1163 spte = shadow_base_present_pte | shadow_dirty_mask; 1450 spte = shadow_base_present_pte | shadow_dirty_mask;
1164 if (!speculative) 1451 if (!speculative)
1165 pte_access |= PT_ACCESSED_MASK; 1452 spte |= shadow_accessed_mask;
1166 if (!dirty) 1453 if (!dirty)
1167 pte_access &= ~ACC_WRITE_MASK; 1454 pte_access &= ~ACC_WRITE_MASK;
1168 if (pte_access & ACC_EXEC_MASK) 1455 if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1178 1465
1179 if ((pte_access & ACC_WRITE_MASK) 1466 if ((pte_access & ACC_WRITE_MASK)
1180 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1467 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1181 struct kvm_mmu_page *shadow; 1468
1469 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
1470 ret = 1;
1471 spte = shadow_trap_nonpresent_pte;
1472 goto set_pte;
1473 }
1182 1474
1183 spte |= PT_WRITABLE_MASK; 1475 spte |= PT_WRITABLE_MASK;
1184 1476
1185 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1186 if (shadow ||
1187 (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
1188 pgprintk("%s: found shadow page for %lx, marking ro\n", 1478 pgprintk("%s: found shadow page for %lx, marking ro\n",
1189 __func__, gfn); 1479 __func__, gfn);
1480 ret = 1;
1190 pte_access &= ~ACC_WRITE_MASK; 1481 pte_access &= ~ACC_WRITE_MASK;
1191 if (is_writeble_pte(spte)) { 1482 if (is_writeble_pte(spte))
1192 spte &= ~PT_WRITABLE_MASK; 1483 spte &= ~PT_WRITABLE_MASK;
1193 kvm_x86_ops->tlb_flush(vcpu);
1194 }
1195 if (write_fault)
1196 *ptwrite = 1;
1197 } 1484 }
1198 } 1485 }
1199 1486
1200 if (pte_access & ACC_WRITE_MASK) 1487 if (pte_access & ACC_WRITE_MASK)
1201 mark_page_dirty(vcpu->kvm, gfn); 1488 mark_page_dirty(vcpu->kvm, gfn);
1202 1489
1203 pgprintk("%s: setting spte %llx\n", __func__, spte); 1490set_pte:
1204 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1205 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1206 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1207 set_shadow_pte(shadow_pte, spte); 1491 set_shadow_pte(shadow_pte, spte);
1208 if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) 1492 return ret;
1209 && (spte & PT_PRESENT_MASK)) 1493}
1494
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn,
1499 pfn_t pfn, bool speculative)
1500{
1501 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte);
1503
1504 pgprintk("%s: spte %llx access %x write_fault %d"
1505 " user_fault %d gfn %lx\n",
1506 __func__, *shadow_pte, pt_access,
1507 write_fault, user_fault, gfn);
1508
1509 if (is_rmap_pte(*shadow_pte)) {
1510 /*
1511 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1512 * the parent of the now unreachable PTE.
1513 */
1514 if (largepage && !is_large_pte(*shadow_pte)) {
1515 struct kvm_mmu_page *child;
1516 u64 pte = *shadow_pte;
1517
1518 child = page_header(pte & PT64_BASE_ADDR_MASK);
1519 mmu_page_remove_parent_pte(child, shadow_pte);
1520 } else if (pfn != spte_to_pfn(*shadow_pte)) {
1521 pgprintk("hfn old %lx new %lx\n",
1522 spte_to_pfn(*shadow_pte), pfn);
1523 rmap_remove(vcpu->kvm, shadow_pte);
1524 } else {
1525 if (largepage)
1526 was_rmapped = is_large_pte(*shadow_pte);
1527 else
1528 was_rmapped = 1;
1529 }
1530 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) {
1533 if (write_fault)
1534 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu);
1536 }
1537
1538 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
1539 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1540 is_large_pte(*shadow_pte)? "2MB" : "4kB",
1541 is_present_pte(*shadow_pte)?"RW":"R", gfn,
1542 *shadow_pte, shadow_pte);
1543 if (!was_rmapped && is_large_pte(*shadow_pte))
1210 ++vcpu->kvm->stat.lpages; 1544 ++vcpu->kvm->stat.lpages;
1211 1545
1212 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1546 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1230{ 1564{
1231} 1565}
1232 1566
1233static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1567struct direct_shadow_walk {
1234 int largepage, gfn_t gfn, pfn_t pfn, 1568 struct kvm_shadow_walk walker;
1235 int level) 1569 pfn_t pfn;
1236{ 1570 int write;
1237 hpa_t table_addr = vcpu->arch.mmu.root_hpa; 1571 int largepage;
1238 int pt_write = 0; 1572 int pt_write;
1239 1573};
1240 for (; ; level--) {
1241 u32 index = PT64_INDEX(v, level);
1242 u64 *table;
1243
1244 ASSERT(VALID_PAGE(table_addr));
1245 table = __va(table_addr);
1246 1574
1247 if (level == 1) { 1575static int direct_map_entry(struct kvm_shadow_walk *_walk,
1248 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1576 struct kvm_vcpu *vcpu,
1249 0, write, 1, &pt_write, 0, gfn, pfn, false); 1577 u64 addr, u64 *sptep, int level)
1250 return pt_write; 1578{
1251 } 1579 struct direct_shadow_walk *walk =
1580 container_of(_walk, struct direct_shadow_walk, walker);
1581 struct kvm_mmu_page *sp;
1582 gfn_t pseudo_gfn;
1583 gfn_t gfn = addr >> PAGE_SHIFT;
1584
1585 if (level == PT_PAGE_TABLE_LEVEL
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed;
1591 return 1;
1592 }
1252 1593
1253 if (largepage && level == 2) { 1594 if (*sptep == shadow_trap_nonpresent_pte) {
1254 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1595 pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1255 0, write, 1, &pt_write, 1, gfn, pfn, false); 1596 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
1256 return pt_write; 1597 1, ACC_ALL, sptep);
1598 if (!sp) {
1599 pgprintk("nonpaging_map: ENOMEM\n");
1600 kvm_release_pfn_clean(walk->pfn);
1601 return -ENOMEM;
1257 } 1602 }
1258 1603
1259 if (table[index] == shadow_trap_nonpresent_pte) { 1604 set_shadow_pte(sptep,
1260 struct kvm_mmu_page *new_table; 1605 __pa(sp->spt)
1261 gfn_t pseudo_gfn; 1606 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1262 1607 | shadow_user_mask | shadow_x_mask);
1263 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
1264 >> PAGE_SHIFT;
1265 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1266 v, level - 1,
1267 1, ACC_ALL, &table[index]);
1268 if (!new_table) {
1269 pgprintk("nonpaging_map: ENOMEM\n");
1270 kvm_release_pfn_clean(pfn);
1271 return -ENOMEM;
1272 }
1273
1274 set_shadow_pte(&table[index],
1275 __pa(new_table->spt)
1276 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1277 | shadow_user_mask | shadow_x_mask);
1278 }
1279 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1280 } 1608 }
1609 return 0;
1610}
1611
1612static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1613 int largepage, gfn_t gfn, pfn_t pfn)
1614{
1615 int r;
1616 struct direct_shadow_walk walker = {
1617 .walker = { .entry = direct_map_entry, },
1618 .pfn = pfn,
1619 .largepage = largepage,
1620 .write = write,
1621 .pt_write = 0,
1622 };
1623
1624 r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1625 if (r < 0)
1626 return r;
1627 return walker.pt_write;
1281} 1628}
1282 1629
1283static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1630static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1287 pfn_t pfn; 1634 pfn_t pfn;
1288 unsigned long mmu_seq; 1635 unsigned long mmu_seq;
1289 1636
1290 down_read(&current->mm->mmap_sem);
1291 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1637 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1292 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1638 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1293 largepage = 1; 1639 largepage = 1;
1294 } 1640 }
1295 1641
1296 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1642 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1297 /* implicit mb(), we'll read before PT lock is unlocked */ 1643 smp_rmb();
1298 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1644 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1299 up_read(&current->mm->mmap_sem);
1300 1645
1301 /* mmio */ 1646 /* mmio */
1302 if (is_error_pfn(pfn)) { 1647 if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1308 if (mmu_notifier_retry(vcpu, mmu_seq)) 1653 if (mmu_notifier_retry(vcpu, mmu_seq))
1309 goto out_unlock; 1654 goto out_unlock;
1310 kvm_mmu_free_some_pages(vcpu); 1655 kvm_mmu_free_some_pages(vcpu);
1311 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1656 r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
1312 PT32E_ROOT_LEVEL);
1313 spin_unlock(&vcpu->kvm->mmu_lock); 1657 spin_unlock(&vcpu->kvm->mmu_lock);
1314 1658
1315 1659
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1405 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1749 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1406} 1750}
1407 1751
1752static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1753{
1754 int i;
1755 struct kvm_mmu_page *sp;
1756
1757 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1758 return;
1759 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1760 hpa_t root = vcpu->arch.mmu.root_hpa;
1761 sp = page_header(root);
1762 mmu_sync_children(vcpu, sp);
1763 return;
1764 }
1765 for (i = 0; i < 4; ++i) {
1766 hpa_t root = vcpu->arch.mmu.pae_root[i];
1767
1768 if (root) {
1769 root &= PT64_BASE_ADDR_MASK;
1770 sp = page_header(root);
1771 mmu_sync_children(vcpu, sp);
1772 }
1773 }
1774}
1775
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{
1778 spin_lock(&vcpu->kvm->mmu_lock);
1779 mmu_sync_roots(vcpu);
1780 spin_unlock(&vcpu->kvm->mmu_lock);
1781}
1782
1408static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1409{ 1784{
1410 return vaddr; 1785 return vaddr;
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1446 if (r) 1821 if (r)
1447 return r; 1822 return r;
1448 1823
1449 down_read(&current->mm->mmap_sem);
1450 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1824 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1451 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1825 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1452 largepage = 1; 1826 largepage = 1;
1453 } 1827 }
1454 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1828 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1455 /* implicit mb(), we'll read before PT lock is unlocked */ 1829 smp_rmb();
1456 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1830 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1457 up_read(&current->mm->mmap_sem);
1458 if (is_error_pfn(pfn)) { 1831 if (is_error_pfn(pfn)) {
1459 kvm_release_pfn_clean(pfn); 1832 kvm_release_pfn_clean(pfn);
1460 return 1; 1833 return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1464 goto out_unlock; 1837 goto out_unlock;
1465 kvm_mmu_free_some_pages(vcpu); 1838 kvm_mmu_free_some_pages(vcpu);
1466 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1839 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1467 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1840 largepage, gfn, pfn);
1468 spin_unlock(&vcpu->kvm->mmu_lock); 1841 spin_unlock(&vcpu->kvm->mmu_lock);
1469 1842
1470 return r; 1843 return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1489 context->gva_to_gpa = nonpaging_gva_to_gpa; 1862 context->gva_to_gpa = nonpaging_gva_to_gpa;
1490 context->free = nonpaging_free; 1863 context->free = nonpaging_free;
1491 context->prefetch_page = nonpaging_prefetch_page; 1864 context->prefetch_page = nonpaging_prefetch_page;
1865 context->sync_page = nonpaging_sync_page;
1866 context->invlpg = nonpaging_invlpg;
1492 context->root_level = 0; 1867 context->root_level = 0;
1493 context->shadow_root_level = PT32E_ROOT_LEVEL; 1868 context->shadow_root_level = PT32E_ROOT_LEVEL;
1494 context->root_hpa = INVALID_PAGE; 1869 context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1536 context->page_fault = paging64_page_fault; 1911 context->page_fault = paging64_page_fault;
1537 context->gva_to_gpa = paging64_gva_to_gpa; 1912 context->gva_to_gpa = paging64_gva_to_gpa;
1538 context->prefetch_page = paging64_prefetch_page; 1913 context->prefetch_page = paging64_prefetch_page;
1914 context->sync_page = paging64_sync_page;
1915 context->invlpg = paging64_invlpg;
1539 context->free = paging_free; 1916 context->free = paging_free;
1540 context->root_level = level; 1917 context->root_level = level;
1541 context->shadow_root_level = level; 1918 context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
1557 context->gva_to_gpa = paging32_gva_to_gpa; 1934 context->gva_to_gpa = paging32_gva_to_gpa;
1558 context->free = paging_free; 1935 context->free = paging_free;
1559 context->prefetch_page = paging32_prefetch_page; 1936 context->prefetch_page = paging32_prefetch_page;
1937 context->sync_page = paging32_sync_page;
1938 context->invlpg = paging32_invlpg;
1560 context->root_level = PT32_ROOT_LEVEL; 1939 context->root_level = PT32_ROOT_LEVEL;
1561 context->shadow_root_level = PT32E_ROOT_LEVEL; 1940 context->shadow_root_level = PT32E_ROOT_LEVEL;
1562 context->root_hpa = INVALID_PAGE; 1941 context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1576 context->page_fault = tdp_page_fault; 1955 context->page_fault = tdp_page_fault;
1577 context->free = nonpaging_free; 1956 context->free = nonpaging_free;
1578 context->prefetch_page = nonpaging_prefetch_page; 1957 context->prefetch_page = nonpaging_prefetch_page;
1958 context->sync_page = nonpaging_sync_page;
1959 context->invlpg = nonpaging_invlpg;
1579 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 1960 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1580 context->root_hpa = INVALID_PAGE; 1961 context->root_hpa = INVALID_PAGE;
1581 1962
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
1647 spin_lock(&vcpu->kvm->mmu_lock); 2028 spin_lock(&vcpu->kvm->mmu_lock);
1648 kvm_mmu_free_some_pages(vcpu); 2029 kvm_mmu_free_some_pages(vcpu);
1649 mmu_alloc_roots(vcpu); 2030 mmu_alloc_roots(vcpu);
2031 mmu_sync_roots(vcpu);
1650 spin_unlock(&vcpu->kvm->mmu_lock); 2032 spin_unlock(&vcpu->kvm->mmu_lock);
1651 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2033 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1652 kvm_mmu_flush_tlb(vcpu); 2034 kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1767 return; 2149 return;
1768 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2150 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1769 2151
1770 down_read(&current->mm->mmap_sem);
1771 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { 2152 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2153 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1773 vcpu->arch.update_pte.largepage = 1; 2154 vcpu->arch.update_pte.largepage = 1;
1774 } 2155 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2156 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */ 2157 smp_rmb();
1777 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2158 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1778 up_read(&current->mm->mmap_sem);
1779 2159
1780 if (is_error_pfn(pfn)) { 2160 if (is_error_pfn(pfn)) {
1781 kvm_release_pfn_clean(pfn); 2161 kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1837 index = kvm_page_table_hashfn(gfn); 2217 index = kvm_page_table_hashfn(gfn);
1838 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1839 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2219 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1840 if (sp->gfn != gfn || sp->role.metaphysical) 2220 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
1841 continue; 2221 continue;
1842 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2222 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1843 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2223 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1855 */ 2235 */
1856 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2236 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1857 gpa, bytes, sp->role.word); 2237 gpa, bytes, sp->role.word);
1858 kvm_mmu_zap_page(vcpu->kvm, sp); 2238 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2239 n = bucket->first;
1859 ++vcpu->kvm->stat.mmu_flooded; 2240 ++vcpu->kvm->stat.mmu_flooded;
1860 continue; 2241 continue;
1861 } 2242 }
@@ -1969,6 +2350,16 @@ out:
1969} 2350}
1970EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2351EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1971 2352
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg;
2360}
2361EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2362
1972void kvm_enable_tdp(void) 2363void kvm_enable_tdp(void)
1973{ 2364{
1974 tdp_enabled = true; 2365 tdp_enabled = true;
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2055{ 2446{
2056 struct kvm_mmu_page *sp; 2447 struct kvm_mmu_page *sp;
2057 2448
2449 spin_lock(&kvm->mmu_lock);
2058 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2450 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2059 int i; 2451 int i;
2060 u64 *pt; 2452 u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2068 if (pt[i] & PT_WRITABLE_MASK) 2460 if (pt[i] & PT_WRITABLE_MASK)
2069 pt[i] &= ~PT_WRITABLE_MASK; 2461 pt[i] &= ~PT_WRITABLE_MASK;
2070 } 2462 }
2463 kvm_flush_remote_tlbs(kvm);
2464 spin_unlock(&kvm->mmu_lock);
2071} 2465}
2072 2466
2073void kvm_mmu_zap_all(struct kvm *kvm) 2467void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2076 2470
2077 spin_lock(&kvm->mmu_lock); 2471 spin_lock(&kvm->mmu_lock);
2078 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2472 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2079 kvm_mmu_zap_page(kvm, sp); 2473 if (kvm_mmu_zap_page(kvm, sp))
2474 node = container_of(kvm->arch.active_mmu_pages.next,
2475 struct kvm_mmu_page, link);
2080 spin_unlock(&kvm->mmu_lock); 2476 spin_unlock(&kvm->mmu_lock);
2081 2477
2082 kvm_flush_remote_tlbs(kvm); 2478 kvm_flush_remote_tlbs(kvm);
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2291 gpa_t addr, unsigned long *ret) 2687 gpa_t addr, unsigned long *ret)
2292{ 2688{
2293 int r; 2689 int r;
2294 struct kvm_pv_mmu_op_buffer buffer; 2690 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2295 2691
2296 buffer.ptr = buffer.buf; 2692 buffer->ptr = buffer->buf;
2297 buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); 2693 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2298 buffer.processed = 0; 2694 buffer->processed = 0;
2299 2695
2300 r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); 2696 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2301 if (r) 2697 if (r)
2302 goto out; 2698 goto out;
2303 2699
2304 while (buffer.len) { 2700 while (buffer->len) {
2305 r = kvm_pv_mmu_op_one(vcpu, &buffer); 2701 r = kvm_pv_mmu_op_one(vcpu, buffer);
2306 if (r < 0) 2702 if (r < 0)
2307 goto out; 2703 goto out;
2308 if (r == 0) 2704 if (r == 0)
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2311 2707
2312 r = 1; 2708 r = 1;
2313out: 2709out:
2314 *ret = buffer.processed; 2710 *ret = buffer->processed;
2315 return r; 2711 return r;
2316} 2712}
2317 2713
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f2..613ec9aa674a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
25#if PTTYPE == 64 25#if PTTYPE == 64
26 #define pt_element_t u64 26 #define pt_element_t u64
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define shadow_walker shadow_walker64
28 #define FNAME(name) paging##64_##name 29 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 30 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 31 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
42#elif PTTYPE == 32 42#elif PTTYPE == 32
43 #define pt_element_t u32 43 #define pt_element_t u32
44 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
45 #define shadow_walker shadow_walker32
45 #define FNAME(name) paging##32_##name 46 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 47 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 48 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2 52 #define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
73 u32 error_code; 73 u32 error_code;
74}; 74};
75 75
76struct shadow_walker {
77 struct kvm_shadow_walk walker;
78 struct guest_walker *guest_walker;
79 int user_fault;
80 int write_fault;
81 int largepage;
82 int *ptwrite;
83 pfn_t pfn;
84 u64 *sptep;
85};
86
76static gfn_t gpte_to_gfn(pt_element_t gpte) 87static gfn_t gpte_to_gfn(pt_element_t gpte)
77{ 88{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 89 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
91 pt_element_t *table; 102 pt_element_t *table;
92 struct page *page; 103 struct page *page;
93 104
94 down_read(&current->mm->mmap_sem);
95 page = gfn_to_page(kvm, table_gfn); 105 page = gfn_to_page(kvm, table_gfn);
96 up_read(&current->mm->mmap_sem);
97 106
98 table = kmap_atomic(page, KM_USER0); 107 table = kmap_atomic(page, KM_USER0);
99
100 ret = CMPXCHG(&table[index], orig_pte, new_pte); 108 ret = CMPXCHG(&table[index], orig_pte, new_pte);
101
102 kunmap_atomic(table, KM_USER0); 109 kunmap_atomic(table, KM_USER0);
103 110
104 kvm_release_page_dirty(page); 111 kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274/* 281/*
275 * Fetch a shadow pte for a specific level in the paging hierarchy. 282 * Fetch a shadow pte for a specific level in the paging hierarchy.
276 */ 283 */
277static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
278 struct guest_walker *walker, 285 struct kvm_vcpu *vcpu, u64 addr,
279 int user_fault, int write_fault, int largepage, 286 u64 *sptep, int level)
280 int *ptwrite, pfn_t pfn)
281{ 287{
282 hpa_t shadow_addr; 288 struct shadow_walker *sw =
283 int level; 289 container_of(_sw, struct shadow_walker, walker);
284 u64 *shadow_ent; 290 struct guest_walker *gw = sw->guest_walker;
285 unsigned access = walker->pt_access; 291 unsigned access = gw->pt_access;
286 292 struct kvm_mmu_page *shadow_page;
287 if (!is_present_pte(walker->ptes[walker->level - 1])) 293 u64 spte;
288 return NULL; 294 int metaphysical;
289 295 gfn_t table_gfn;
290 shadow_addr = vcpu->arch.mmu.root_hpa; 296 int r;
291 level = vcpu->arch.mmu.shadow_root_level; 297 pt_element_t curr_pte;
292 if (level == PT32E_ROOT_LEVEL) { 298
293 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 299 if (level == PT_PAGE_TABLE_LEVEL
294 shadow_addr &= PT64_BASE_ADDR_MASK; 300 || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
295 --level; 301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
305 false);
306 sw->sptep = sptep;
307 return 1;
296 } 308 }
297 309
298 for (; ; level--) { 310 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
299 u32 index = SHADOW_PT_INDEX(addr, level); 311 return 0;
300 struct kvm_mmu_page *shadow_page;
301 u64 shadow_pte;
302 int metaphysical;
303 gfn_t table_gfn;
304
305 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
306 if (level == PT_PAGE_TABLE_LEVEL)
307 break;
308
309 if (largepage && level == PT_DIRECTORY_LEVEL)
310 break;
311 312
312 if (is_shadow_present_pte(*shadow_ent) 313 if (is_large_pte(*sptep)) {
313 && !is_large_pte(*shadow_ent)) { 314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
314 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 315 kvm_flush_remote_tlbs(vcpu->kvm);
315 continue; 316 rmap_remove(vcpu->kvm, sptep);
316 } 317 }
317 318
318 if (is_large_pte(*shadow_ent)) 319 if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
319 rmap_remove(vcpu->kvm, shadow_ent); 320 metaphysical = 1;
320 321 if (!is_dirty_pte(gw->ptes[level - 1]))
321 if (level - 1 == PT_PAGE_TABLE_LEVEL 322 access &= ~ACC_WRITE_MASK;
322 && walker->level == PT_DIRECTORY_LEVEL) { 323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
323 metaphysical = 1; 324 } else {
324 if (!is_dirty_pte(walker->ptes[level - 1])) 325 metaphysical = 0;
325 access &= ~ACC_WRITE_MASK; 326 table_gfn = gw->table_gfn[level - 2];
326 table_gfn = gpte_to_gfn(walker->ptes[level - 1]); 327 }
327 } else { 328 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
328 metaphysical = 0; 329 metaphysical, access, sptep);
329 table_gfn = walker->table_gfn[level - 2]; 330 if (!metaphysical) {
330 } 331 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
331 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 332 &curr_pte, sizeof(curr_pte));
332 metaphysical, access, 333 if (r || curr_pte != gw->ptes[level - 2]) {
333 shadow_ent); 334 kvm_release_pfn_clean(sw->pfn);
334 if (!metaphysical) { 335 sw->sptep = NULL;
335 int r; 336 return 1;
336 pt_element_t curr_pte;
337 r = kvm_read_guest_atomic(vcpu->kvm,
338 walker->pte_gpa[level - 2],
339 &curr_pte, sizeof(curr_pte));
340 if (r || curr_pte != walker->ptes[level - 2]) {
341 kvm_release_pfn_clean(pfn);
342 return NULL;
343 }
344 } 337 }
345 shadow_addr = __pa(shadow_page->spt);
346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
347 | PT_WRITABLE_MASK | PT_USER_MASK;
348 set_shadow_pte(shadow_ent, shadow_pte);
349 } 338 }
350 339
351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 340 spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
352 user_fault, write_fault, 341 | PT_WRITABLE_MASK | PT_USER_MASK;
353 walker->ptes[walker->level-1] & PT_DIRTY_MASK, 342 *sptep = spte;
354 ptwrite, largepage, walker->gfn, pfn, false); 343 return 0;
344}
345
346static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
347 struct guest_walker *guest_walker,
348 int user_fault, int write_fault, int largepage,
349 int *ptwrite, pfn_t pfn)
350{
351 struct shadow_walker walker = {
352 .walker = { .entry = FNAME(shadow_walk_entry), },
353 .guest_walker = guest_walker,
354 .user_fault = user_fault,
355 .write_fault = write_fault,
356 .largepage = largepage,
357 .ptwrite = ptwrite,
358 .pfn = pfn,
359 };
360
361 if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
362 return NULL;
363
364 walk_shadow(&walker.walker, vcpu, addr);
355 365
356 return shadow_ent; 366 return walker.sptep;
357} 367}
358 368
359/* 369/*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
407 return 0; 417 return 0;
408 } 418 }
409 419
410 down_read(&current->mm->mmap_sem);
411 if (walker.level == PT_DIRECTORY_LEVEL) { 420 if (walker.level == PT_DIRECTORY_LEVEL) {
412 gfn_t large_gfn; 421 gfn_t large_gfn;
413 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 422 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
417 } 426 }
418 } 427 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq; 428 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */ 429 smp_rmb();
421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 430 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
422 up_read(&current->mm->mmap_sem);
423 431
424 /* mmio */ 432 /* mmio */
425 if (is_error_pfn(pfn)) { 433 if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
453 return 0; 461 return 0;
454} 462}
455 463
464static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
465 struct kvm_vcpu *vcpu, u64 addr,
466 u64 *sptep, int level)
467{
468
469 if (level == PT_PAGE_TABLE_LEVEL) {
470 if (is_shadow_present_pte(*sptep))
471 rmap_remove(vcpu->kvm, sptep);
472 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
473 return 1;
474 }
475 if (!is_shadow_present_pte(*sptep))
476 return 1;
477 return 0;
478}
479
480static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
481{
482 struct shadow_walker walker = {
483 .walker = { .entry = FNAME(shadow_invlpg_entry), },
484 };
485
486 walk_shadow(&walker.walker, vcpu, gva);
487}
488
456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 489static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
457{ 490{
458 struct guest_walker walker; 491 struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
499 } 532 }
500} 533}
501 534
535/*
536 * Using the cached information from sp->gfns is safe because:
537 * - The spte has a reference to the struct page, so the pfn for a given gfn
538 * can't change unless all sptes pointing to it are nuked first.
539 * - Alias changes zap the entire shadow cache.
540 */
541static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
542{
543 int i, offset, nr_present;
544
545 offset = nr_present = 0;
546
547 if (PTTYPE == 32)
548 offset = sp->role.quadrant << PT64_LEVEL_BITS;
549
550 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
551 unsigned pte_access;
552 pt_element_t gpte;
553 gpa_t pte_gpa;
554 gfn_t gfn = sp->gfns[i];
555
556 if (!is_shadow_present_pte(sp->spt[i]))
557 continue;
558
559 pte_gpa = gfn_to_gpa(sp->gfn);
560 pte_gpa += (i+offset) * sizeof(pt_element_t);
561
562 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
563 sizeof(pt_element_t)))
564 return -EINVAL;
565
566 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
567 !(gpte & PT_ACCESSED_MASK)) {
568 u64 nonpresent;
569
570 rmap_remove(vcpu->kvm, &sp->spt[i]);
571 if (is_present_pte(gpte))
572 nonpresent = shadow_trap_nonpresent_pte;
573 else
574 nonpresent = shadow_notrap_nonpresent_pte;
575 set_shadow_pte(&sp->spt[i], nonpresent);
576 continue;
577 }
578
579 nr_present++;
580 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
581 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
582 is_dirty_pte(gpte), 0, gfn,
583 spte_to_pfn(sp->spt[i]), true, false);
584 }
585
586 return !nr_present;
587}
588
502#undef pt_element_t 589#undef pt_element_t
503#undef guest_walker 590#undef guest_walker
591#undef shadow_walker
504#undef FNAME 592#undef FNAME
505#undef PT_BASE_ADDR_MASK 593#undef PT_BASE_ADDR_MASK
506#undef PT_INDEX 594#undef PT_INDEX
507#undef SHADOW_PT_INDEX
508#undef PT_LEVEL_MASK 595#undef PT_LEVEL_MASK
509#undef PT_DIR_BASE_ADDR_MASK 596#undef PT_DIR_BASE_ADDR_MASK
510#undef PT_LEVEL_BITS 597#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778c..9c4ce657d963 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
18#include "kvm_svm.h" 18#include "kvm_svm.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
35#define IOPM_ALLOC_ORDER 2 36#define IOPM_ALLOC_ORDER 2
36#define MSRPM_ALLOC_ORDER 1 37#define MSRPM_ALLOC_ORDER 1
37 38
38#define DB_VECTOR 1
39#define UD_VECTOR 6
40#define GP_VECTOR 13
41
42#define DR7_GD_MASK (1 << 13) 39#define DR7_GD_MASK (1 << 13)
43#define DR6_BD_MASK (1 << 13) 40#define DR6_BD_MASK (1 << 13)
44 41
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
47 44
48#define SVM_FEATURE_NPT (1 << 0) 45#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 46#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 47#define SVM_FEATURE_SVML (1 << 2)
51 48
52#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
53 50
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
236 printk(KERN_DEBUG "%s: NOP\n", __func__); 233 printk(KERN_DEBUG "%s: NOP\n", __func__);
237 return; 234 return;
238 } 235 }
239 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
240 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 237 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
241 __func__, 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
242 svm->vmcb->save.rip,
243 svm->next_rip);
244 239
245 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; 240 kvm_rip_write(vcpu, svm->next_rip);
246 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
247 242
248 vcpu->arch.interrupt_window_open = 1; 243 vcpu->arch.interrupt_window_open = 1;
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
530 (1ULL << INTERCEPT_CPUID) | 525 (1ULL << INTERCEPT_CPUID) |
531 (1ULL << INTERCEPT_INVD) | 526 (1ULL << INTERCEPT_INVD) |
532 (1ULL << INTERCEPT_HLT) | 527 (1ULL << INTERCEPT_HLT) |
528 (1ULL << INTERCEPT_INVLPG) |
533 (1ULL << INTERCEPT_INVLPGA) | 529 (1ULL << INTERCEPT_INVLPGA) |
534 (1ULL << INTERCEPT_IOIO_PROT) | 530 (1ULL << INTERCEPT_IOIO_PROT) |
535 (1ULL << INTERCEPT_MSR_PROT) | 531 (1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
581 save->dr7 = 0x400; 577 save->dr7 = 0x400;
582 save->rflags = 2; 578 save->rflags = 2;
583 save->rip = 0x0000fff0; 579 save->rip = 0x0000fff0;
580 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
584 581
585 /* 582 /*
586 * cr0 val on cpu init should be 0x60000010, we enable cpu 583 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
593 if (npt_enabled) { 590 if (npt_enabled) {
594 /* Setup VMCB for Nested Paging */ 591 /* Setup VMCB for Nested Paging */
595 control->nested_ctl = 1; 592 control->nested_ctl = 1;
596 control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); 593 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
594 (1ULL << INTERCEPT_INVLPG));
597 control->intercept_exceptions &= ~(1 << PF_VECTOR); 595 control->intercept_exceptions &= ~(1 << PF_VECTOR);
598 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 596 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
599 INTERCEPT_CR3_MASK); 597 INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
615 init_vmcb(svm); 613 init_vmcb(svm);
616 614
617 if (vcpu->vcpu_id != 0) { 615 if (vcpu->vcpu_id != 0) {
618 svm->vmcb->save.rip = 0; 616 kvm_rip_write(vcpu, 0);
619 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 617 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
620 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 618 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
621 } 619 }
620 vcpu->arch.regs_avail = ~0;
621 vcpu->arch.regs_dirty = ~0;
622 622
623 return 0; 623 return 0;
624} 624}
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
721 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
722} 722}
723 723
724static void svm_cache_regs(struct kvm_vcpu *vcpu)
725{
726 struct vcpu_svm *svm = to_svm(vcpu);
727
728 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
729 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
730 vcpu->arch.rip = svm->vmcb->save.rip;
731}
732
733static void svm_decache_regs(struct kvm_vcpu *vcpu)
734{
735 struct vcpu_svm *svm = to_svm(vcpu);
736 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
737 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
738 svm->vmcb->save.rip = vcpu->arch.rip;
739}
740
741static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 724static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
742{ 725{
743 return to_svm(vcpu)->vmcb->save.rflags; 726 return to_svm(vcpu)->vmcb->save.rflags;
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1040 if (npt_enabled) 1023 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu); 1024 svm_flush_tlb(&svm->vcpu);
1042 1025
1043 if (event_injection) 1026 if (!npt_enabled && event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1027 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1028 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1046} 1029}
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1139 1122
1140static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1123static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1141{ 1124{
1142 svm->next_rip = svm->vmcb->save.rip + 1; 1125 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1143 skip_emulated_instruction(&svm->vcpu); 1126 skip_emulated_instruction(&svm->vcpu);
1144 return kvm_emulate_halt(&svm->vcpu); 1127 return kvm_emulate_halt(&svm->vcpu);
1145} 1128}
1146 1129
1147static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1130static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1148{ 1131{
1149 svm->next_rip = svm->vmcb->save.rip + 3; 1132 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1150 skip_emulated_instruction(&svm->vcpu); 1133 skip_emulated_instruction(&svm->vcpu);
1151 kvm_emulate_hypercall(&svm->vcpu); 1134 kvm_emulate_hypercall(&svm->vcpu);
1152 return 1; 1135 return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
1178 1161
1179static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1162static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1180{ 1163{
1181 svm->next_rip = svm->vmcb->save.rip + 2; 1164 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1182 kvm_emulate_cpuid(&svm->vcpu); 1165 kvm_emulate_cpuid(&svm->vcpu);
1183 return 1; 1166 return 1;
1184} 1167}
1185 1168
1169static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1170{
1171 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1172 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1173 return 1;
1174}
1175
1186static int emulate_on_interception(struct vcpu_svm *svm, 1176static int emulate_on_interception(struct vcpu_svm *svm,
1187 struct kvm_run *kvm_run) 1177 struct kvm_run *kvm_run)
1188{ 1178{
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1273 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, 1263 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1274 (u32)(data >> 32), handler); 1264 (u32)(data >> 32), handler);
1275 1265
1276 svm->vmcb->save.rax = data & 0xffffffff; 1266 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
1277 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1267 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1278 svm->next_rip = svm->vmcb->save.rip + 2; 1268 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1279 skip_emulated_instruction(&svm->vcpu); 1269 skip_emulated_instruction(&svm->vcpu);
1280 } 1270 }
1281 return 1; 1271 return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1359static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1349static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1360{ 1350{
1361 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1351 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1362 u64 data = (svm->vmcb->save.rax & -1u) 1352 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
1363 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1353 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1364 1354
1365 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), 1355 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1366 handler); 1356 handler);
1367 1357
1368 svm->next_rip = svm->vmcb->save.rip + 2; 1358 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1369 if (svm_set_msr(&svm->vcpu, ecx, data)) 1359 if (svm_set_msr(&svm->vcpu, ecx, data))
1370 kvm_inject_gp(&svm->vcpu, 0); 1360 kvm_inject_gp(&svm->vcpu, 0);
1371 else 1361 else
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1436 [SVM_EXIT_CPUID] = cpuid_interception, 1426 [SVM_EXIT_CPUID] = cpuid_interception,
1437 [SVM_EXIT_INVD] = emulate_on_interception, 1427 [SVM_EXIT_INVD] = emulate_on_interception,
1438 [SVM_EXIT_HLT] = halt_interception, 1428 [SVM_EXIT_HLT] = halt_interception,
1439 [SVM_EXIT_INVLPG] = emulate_on_interception, 1429 [SVM_EXIT_INVLPG] = invlpg_interception,
1440 [SVM_EXIT_INVLPGA] = invalid_op_interception, 1430 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1441 [SVM_EXIT_IOIO] = io_interception, 1431 [SVM_EXIT_IOIO] = io_interception,
1442 [SVM_EXIT_MSR] = msr_interception, 1432 [SVM_EXIT_MSR] = msr_interception,
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1538 1528
1539 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); 1529 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1540 1530
1531 ++svm->vcpu.stat.irq_injections;
1541 control = &svm->vmcb->control; 1532 control = &svm->vmcb->control;
1542 control->int_vector = irq; 1533 control->int_vector = irq;
1543 control->int_ctl &= ~V_INTR_PRIO_MASK; 1534 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
1716 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 1707 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
1717} 1708}
1718 1709
1710#ifdef CONFIG_X86_64
1711#define R "r"
1712#else
1713#define R "e"
1714#endif
1715
1719static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1716static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1720{ 1717{
1721 struct vcpu_svm *svm = to_svm(vcpu); 1718 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1723 u16 gs_selector; 1720 u16 gs_selector;
1724 u16 ldt_selector; 1721 u16 ldt_selector;
1725 1722
1723 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
1724 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
1725 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
1726
1726 pre_svm_run(svm); 1727 pre_svm_run(svm);
1727 1728
1728 sync_lapic_to_cr8(vcpu); 1729 sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1750 local_irq_enable(); 1751 local_irq_enable();
1751 1752
1752 asm volatile ( 1753 asm volatile (
1754 "push %%"R"bp; \n\t"
1755 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
1756 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
1757 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
1758 "mov %c[rsi](%[svm]), %%"R"si \n\t"
1759 "mov %c[rdi](%[svm]), %%"R"di \n\t"
1760 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
1753#ifdef CONFIG_X86_64 1761#ifdef CONFIG_X86_64
1754 "push %%rbp; \n\t"
1755#else
1756 "push %%ebp; \n\t"
1757#endif
1758
1759#ifdef CONFIG_X86_64
1760 "mov %c[rbx](%[svm]), %%rbx \n\t"
1761 "mov %c[rcx](%[svm]), %%rcx \n\t"
1762 "mov %c[rdx](%[svm]), %%rdx \n\t"
1763 "mov %c[rsi](%[svm]), %%rsi \n\t"
1764 "mov %c[rdi](%[svm]), %%rdi \n\t"
1765 "mov %c[rbp](%[svm]), %%rbp \n\t"
1766 "mov %c[r8](%[svm]), %%r8 \n\t" 1762 "mov %c[r8](%[svm]), %%r8 \n\t"
1767 "mov %c[r9](%[svm]), %%r9 \n\t" 1763 "mov %c[r9](%[svm]), %%r9 \n\t"
1768 "mov %c[r10](%[svm]), %%r10 \n\t" 1764 "mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 "mov %c[r13](%[svm]), %%r13 \n\t" 1767 "mov %c[r13](%[svm]), %%r13 \n\t"
1772 "mov %c[r14](%[svm]), %%r14 \n\t" 1768 "mov %c[r14](%[svm]), %%r14 \n\t"
1773 "mov %c[r15](%[svm]), %%r15 \n\t" 1769 "mov %c[r15](%[svm]), %%r15 \n\t"
1774#else
1775 "mov %c[rbx](%[svm]), %%ebx \n\t"
1776 "mov %c[rcx](%[svm]), %%ecx \n\t"
1777 "mov %c[rdx](%[svm]), %%edx \n\t"
1778 "mov %c[rsi](%[svm]), %%esi \n\t"
1779 "mov %c[rdi](%[svm]), %%edi \n\t"
1780 "mov %c[rbp](%[svm]), %%ebp \n\t"
1781#endif 1770#endif
1782 1771
1783#ifdef CONFIG_X86_64
1784 /* Enter guest mode */
1785 "push %%rax \n\t"
1786 "mov %c[vmcb](%[svm]), %%rax \n\t"
1787 __ex(SVM_VMLOAD) "\n\t"
1788 __ex(SVM_VMRUN) "\n\t"
1789 __ex(SVM_VMSAVE) "\n\t"
1790 "pop %%rax \n\t"
1791#else
1792 /* Enter guest mode */ 1772 /* Enter guest mode */
1793 "push %%eax \n\t" 1773 "push %%"R"ax \n\t"
1794 "mov %c[vmcb](%[svm]), %%eax \n\t" 1774 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
1795 __ex(SVM_VMLOAD) "\n\t" 1775 __ex(SVM_VMLOAD) "\n\t"
1796 __ex(SVM_VMRUN) "\n\t" 1776 __ex(SVM_VMRUN) "\n\t"
1797 __ex(SVM_VMSAVE) "\n\t" 1777 __ex(SVM_VMSAVE) "\n\t"
1798 "pop %%eax \n\t" 1778 "pop %%"R"ax \n\t"
1799#endif
1800 1779
1801 /* Save guest registers, load host registers */ 1780 /* Save guest registers, load host registers */
1781 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
1782 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
1783 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
1784 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
1785 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
1786 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
1802#ifdef CONFIG_X86_64 1787#ifdef CONFIG_X86_64
1803 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1804 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1805 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1806 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1807 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1808 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1809 "mov %%r8, %c[r8](%[svm]) \n\t" 1788 "mov %%r8, %c[r8](%[svm]) \n\t"
1810 "mov %%r9, %c[r9](%[svm]) \n\t" 1789 "mov %%r9, %c[r9](%[svm]) \n\t"
1811 "mov %%r10, %c[r10](%[svm]) \n\t" 1790 "mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1814 "mov %%r13, %c[r13](%[svm]) \n\t" 1793 "mov %%r13, %c[r13](%[svm]) \n\t"
1815 "mov %%r14, %c[r14](%[svm]) \n\t" 1794 "mov %%r14, %c[r14](%[svm]) \n\t"
1816 "mov %%r15, %c[r15](%[svm]) \n\t" 1795 "mov %%r15, %c[r15](%[svm]) \n\t"
1817
1818 "pop %%rbp; \n\t"
1819#else
1820 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1821 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1822 "mov %%edx, %c[rdx](%[svm]) \n\t"
1823 "mov %%esi, %c[rsi](%[svm]) \n\t"
1824 "mov %%edi, %c[rdi](%[svm]) \n\t"
1825 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1826
1827 "pop %%ebp; \n\t"
1828#endif 1796#endif
1797 "pop %%"R"bp"
1829 : 1798 :
1830 : [svm]"a"(svm), 1799 : [svm]"a"(svm),
1831 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1800 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1846 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 1815 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1847#endif 1816#endif
1848 : "cc", "memory" 1817 : "cc", "memory"
1818 , R"bx", R"cx", R"dx", R"si", R"di"
1849#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1850 , "rbx", "rcx", "rdx", "rsi", "rdi"
1851 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 1820 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1852#else
1853 , "ebx", "ecx", "edx" , "esi", "edi"
1854#endif 1821#endif
1855 ); 1822 );
1856 1823
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1858 load_db_regs(svm->host_db_regs); 1825 load_db_regs(svm->host_db_regs);
1859 1826
1860 vcpu->arch.cr2 = svm->vmcb->save.cr2; 1827 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1828 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
1829 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
1830 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
1861 1831
1862 write_dr6(svm->host_dr6); 1832 write_dr6(svm->host_dr6);
1863 write_dr7(svm->host_dr7); 1833 write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1879 svm->next_rip = 0; 1849 svm->next_rip = 0;
1880} 1850}
1881 1851
1852#undef R
1853
1882static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1854static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1883{ 1855{
1884 struct vcpu_svm *svm = to_svm(vcpu); 1856 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1977 .set_gdt = svm_set_gdt, 1949 .set_gdt = svm_set_gdt,
1978 .get_dr = svm_get_dr, 1950 .get_dr = svm_get_dr,
1979 .set_dr = svm_set_dr, 1951 .set_dr = svm_set_dr,
1980 .cache_regs = svm_cache_regs,
1981 .decache_regs = svm_decache_regs,
1982 .get_rflags = svm_get_rflags, 1952 .get_rflags = svm_get_rflags,
1983 .set_rflags = svm_set_rflags, 1953 .set_rflags = svm_set_rflags,
1984 1954
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b562..2643b430d83a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,8 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/moduleparam.h> 28#include <linux/moduleparam.h>
29#include "kvm_cache_regs.h"
30#include "x86.h"
29 31
30#include <asm/io.h> 32#include <asm/io.h>
31#include <asm/desc.h> 33#include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
47static int enable_ept = 1; 49static int enable_ept = 1;
48module_param(enable_ept, bool, 0); 50module_param(enable_ept, bool, 0);
49 51
52static int emulate_invalid_guest_state = 0;
53module_param(emulate_invalid_guest_state, bool, 0);
54
50struct vmcs { 55struct vmcs {
51 u32 revision_id; 56 u32 revision_id;
52 u32 abort; 57 u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
56struct vcpu_vmx { 61struct vcpu_vmx {
57 struct kvm_vcpu vcpu; 62 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link; 63 struct list_head local_vcpus_link;
64 unsigned long host_rsp;
59 int launched; 65 int launched;
60 u8 fail; 66 u8 fail;
61 u32 idt_vectoring_info; 67 u32 idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
83 } irq; 89 } irq;
84 } rmode; 90 } rmode;
85 int vpid; 91 int vpid;
92 bool emulation_required;
86}; 93};
87 94
88static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
468 if (!vcpu->fpu_active) 475 if (!vcpu->fpu_active)
469 eb |= 1u << NM_VECTOR; 476 eb |= 1u << NM_VECTOR;
470 if (vcpu->guest_debug.enabled) 477 if (vcpu->guest_debug.enabled)
471 eb |= 1u << 1; 478 eb |= 1u << DB_VECTOR;
472 if (vcpu->arch.rmode.active) 479 if (vcpu->arch.rmode.active)
473 eb = ~0; 480 eb = ~0;
474 if (vm_need_ept()) 481 if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
715 unsigned long rip; 722 unsigned long rip;
716 u32 interruptibility; 723 u32 interruptibility;
717 724
718 rip = vmcs_readl(GUEST_RIP); 725 rip = kvm_rip_read(vcpu);
719 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 726 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
720 vmcs_writel(GUEST_RIP, rip); 727 kvm_rip_write(vcpu, rip);
721 728
722 /* 729 /*
723 * We emulated an instruction, so temporary interrupt blocking 730 * We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 740static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
734 bool has_error_code, u32 error_code) 741 bool has_error_code, u32 error_code)
735{ 742{
743 struct vcpu_vmx *vmx = to_vmx(vcpu);
744
745 if (has_error_code)
746 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
747
748 if (vcpu->arch.rmode.active) {
749 vmx->rmode.irq.pending = true;
750 vmx->rmode.irq.vector = nr;
751 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
752 if (nr == BP_VECTOR)
753 vmx->rmode.irq.rip++;
754 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
755 nr | INTR_TYPE_SOFT_INTR
756 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
757 | INTR_INFO_VALID_MASK);
758 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
759 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
760 return;
761 }
762
736 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 763 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
737 nr | INTR_TYPE_EXCEPTION 764 nr | INTR_TYPE_EXCEPTION
738 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) 765 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
739 | INTR_INFO_VALID_MASK); 766 | INTR_INFO_VALID_MASK);
740 if (has_error_code)
741 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
742} 767}
743 768
744static bool vmx_exception_injected(struct kvm_vcpu *vcpu) 769static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
745{ 770{
746 struct vcpu_vmx *vmx = to_vmx(vcpu); 771 return false;
747
748 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
749} 772}
750 773
751/* 774/*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
947 return ret; 970 return ret;
948} 971}
949 972
950/* 973static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
951 * Sync the rsp and rip registers into the vcpu structure. This allows
952 * registers to be accessed by indexing vcpu->arch.regs.
953 */
954static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
955{
956 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
957 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
958}
959
960/*
961 * Syncs rsp and rip back into the vmcs. Should be called after possible
962 * modification.
963 */
964static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
965{ 974{
966 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 975 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
967 vmcs_writel(GUEST_RIP, vcpu->arch.rip); 976 switch (reg) {
977 case VCPU_REGS_RSP:
978 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
979 break;
980 case VCPU_REGS_RIP:
981 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
982 break;
983 default:
984 break;
985 }
968} 986}
969 987
970static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 988static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
1007 1025
1008static int vmx_get_irq(struct kvm_vcpu *vcpu) 1026static int vmx_get_irq(struct kvm_vcpu *vcpu)
1009{ 1027{
1010 struct vcpu_vmx *vmx = to_vmx(vcpu); 1028 if (!vcpu->arch.interrupt.pending)
1011 u32 idtv_info_field; 1029 return -1;
1012 1030 return vcpu->arch.interrupt.nr;
1013 idtv_info_field = vmx->idt_vectoring_info;
1014 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1015 if (is_external_interrupt(idtv_info_field))
1016 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
1017 else
1018 printk(KERN_DEBUG "pending exception: not handled yet\n");
1019 }
1020 return -1;
1021} 1031}
1022 1032
1023static __init int cpu_has_kvm_support(void) 1033static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
1031 u64 msr; 1041 u64 msr;
1032 1042
1033 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1043 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1034 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1044 return (msr & (FEATURE_CONTROL_LOCKED |
1035 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1045 FEATURE_CONTROL_VMXON_ENABLED))
1036 == MSR_IA32_FEATURE_CONTROL_LOCKED; 1046 == FEATURE_CONTROL_LOCKED;
1037 /* locked but not enabled */ 1047 /* locked but not enabled */
1038} 1048}
1039 1049
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
1045 1055
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1056 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1057 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1058 if ((old & (FEATURE_CONTROL_LOCKED |
1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1059 FEATURE_CONTROL_VMXON_ENABLED))
1050 != (MSR_IA32_FEATURE_CONTROL_LOCKED | 1060 != (FEATURE_CONTROL_LOCKED |
1051 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1061 FEATURE_CONTROL_VMXON_ENABLED))
1052 /* enable and lock */ 1062 /* enable and lock */
1053 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1063 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1054 MSR_IA32_FEATURE_CONTROL_LOCKED | 1064 FEATURE_CONTROL_LOCKED |
1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1065 FEATURE_CONTROL_VMXON_ENABLED);
1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1066 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1057 asm volatile (ASM_VMX_VMXON_RAX 1067 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr) 1068 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1120 CPU_BASED_CR3_STORE_EXITING | 1130 CPU_BASED_CR3_STORE_EXITING |
1121 CPU_BASED_USE_IO_BITMAPS | 1131 CPU_BASED_USE_IO_BITMAPS |
1122 CPU_BASED_MOV_DR_EXITING | 1132 CPU_BASED_MOV_DR_EXITING |
1123 CPU_BASED_USE_TSC_OFFSETING; 1133 CPU_BASED_USE_TSC_OFFSETING |
1134 CPU_BASED_INVLPG_EXITING;
1124 opt = CPU_BASED_TPR_SHADOW | 1135 opt = CPU_BASED_TPR_SHADOW |
1125 CPU_BASED_USE_MSR_BITMAPS | 1136 CPU_BASED_USE_MSR_BITMAPS |
1126 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1137 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1149 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 1160 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1150#endif 1161#endif
1151 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1162 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1152 /* CR3 accesses don't need to cause VM Exits when EPT enabled */ 1163 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1164 enabled */
1153 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1165 min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1154 CPU_BASED_CR3_STORE_EXITING); 1166 CPU_BASED_CR3_STORE_EXITING |
1167 CPU_BASED_INVLPG_EXITING);
1155 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 1168 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1156 &_cpu_based_exec_control) < 0) 1169 &_cpu_based_exec_control) < 0)
1157 return -EIO; 1170 return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1288static void enter_pmode(struct kvm_vcpu *vcpu) 1301static void enter_pmode(struct kvm_vcpu *vcpu)
1289{ 1302{
1290 unsigned long flags; 1303 unsigned long flags;
1304 struct vcpu_vmx *vmx = to_vmx(vcpu);
1291 1305
1306 vmx->emulation_required = 1;
1292 vcpu->arch.rmode.active = 0; 1307 vcpu->arch.rmode.active = 0;
1293 1308
1294 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1309 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1305 1320
1306 update_exception_bitmap(vcpu); 1321 update_exception_bitmap(vcpu);
1307 1322
1323 if (emulate_invalid_guest_state)
1324 return;
1325
1308 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1326 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1309 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1327 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1310 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1328 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1345static void enter_rmode(struct kvm_vcpu *vcpu) 1363static void enter_rmode(struct kvm_vcpu *vcpu)
1346{ 1364{
1347 unsigned long flags; 1365 unsigned long flags;
1366 struct vcpu_vmx *vmx = to_vmx(vcpu);
1348 1367
1368 vmx->emulation_required = 1;
1349 vcpu->arch.rmode.active = 1; 1369 vcpu->arch.rmode.active = 1;
1350 1370
1351 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1371 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1367 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 1387 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1368 update_exception_bitmap(vcpu); 1388 update_exception_bitmap(vcpu);
1369 1389
1390 if (emulate_invalid_guest_state)
1391 goto continue_rmode;
1392
1370 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 1393 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1371 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 1394 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1372 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 1395 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1382 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1405 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1383 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1406 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1384 1407
1408continue_rmode:
1385 kvm_mmu_reset_context(vcpu); 1409 kvm_mmu_reset_context(vcpu);
1386 init_rmode(vcpu->kvm); 1410 init_rmode(vcpu->kvm);
1387} 1411}
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1715 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1739 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1716} 1740}
1717 1741
1742static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1743{
1744 struct kvm_segment var;
1745 u32 ar;
1746
1747 vmx_get_segment(vcpu, &var, seg);
1748 ar = vmx_segment_access_rights(&var);
1749
1750 if (var.base != (var.selector << 4))
1751 return false;
1752 if (var.limit != 0xffff)
1753 return false;
1754 if (ar != 0xf3)
1755 return false;
1756
1757 return true;
1758}
1759
1760static bool code_segment_valid(struct kvm_vcpu *vcpu)
1761{
1762 struct kvm_segment cs;
1763 unsigned int cs_rpl;
1764
1765 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1766 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1767
1768 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1769 return false;
1770 if (!cs.s)
1771 return false;
1772 if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
1773 if (cs.dpl > cs_rpl)
1774 return false;
1775 } else if (cs.type & AR_TYPE_CODE_MASK) {
1776 if (cs.dpl != cs_rpl)
1777 return false;
1778 }
1779 if (!cs.present)
1780 return false;
1781
1782 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1783 return true;
1784}
1785
1786static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1787{
1788 struct kvm_segment ss;
1789 unsigned int ss_rpl;
1790
1791 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1792 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1793
1794 if ((ss.type != 3) || (ss.type != 7))
1795 return false;
1796 if (!ss.s)
1797 return false;
1798 if (ss.dpl != ss_rpl) /* DPL != RPL */
1799 return false;
1800 if (!ss.present)
1801 return false;
1802
1803 return true;
1804}
1805
1806static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1807{
1808 struct kvm_segment var;
1809 unsigned int rpl;
1810
1811 vmx_get_segment(vcpu, &var, seg);
1812 rpl = var.selector & SELECTOR_RPL_MASK;
1813
1814 if (!var.s)
1815 return false;
1816 if (!var.present)
1817 return false;
1818 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1819 if (var.dpl < rpl) /* DPL < RPL */
1820 return false;
1821 }
1822
1823 /* TODO: Add other members to kvm_segment_field to allow checking for other access
1824 * rights flags
1825 */
1826 return true;
1827}
1828
1829static bool tr_valid(struct kvm_vcpu *vcpu)
1830{
1831 struct kvm_segment tr;
1832
1833 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1834
1835 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1836 return false;
1837 if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
1838 return false;
1839 if (!tr.present)
1840 return false;
1841
1842 return true;
1843}
1844
1845static bool ldtr_valid(struct kvm_vcpu *vcpu)
1846{
1847 struct kvm_segment ldtr;
1848
1849 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1850
1851 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1852 return false;
1853 if (ldtr.type != 2)
1854 return false;
1855 if (!ldtr.present)
1856 return false;
1857
1858 return true;
1859}
1860
1861static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1862{
1863 struct kvm_segment cs, ss;
1864
1865 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1866 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1867
1868 return ((cs.selector & SELECTOR_RPL_MASK) ==
1869 (ss.selector & SELECTOR_RPL_MASK));
1870}
1871
1872/*
1873 * Check if guest state is valid. Returns true if valid, false if
1874 * not.
1875 * We assume that registers are always usable
1876 */
1877static bool guest_state_valid(struct kvm_vcpu *vcpu)
1878{
1879 /* real mode guest state checks */
1880 if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1881 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1882 return false;
1883 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1884 return false;
1885 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1886 return false;
1887 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1888 return false;
1889 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1890 return false;
1891 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1892 return false;
1893 } else {
1894 /* protected mode guest state checks */
1895 if (!cs_ss_rpl_check(vcpu))
1896 return false;
1897 if (!code_segment_valid(vcpu))
1898 return false;
1899 if (!stack_segment_valid(vcpu))
1900 return false;
1901 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1902 return false;
1903 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1904 return false;
1905 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1906 return false;
1907 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1908 return false;
1909 if (!tr_valid(vcpu))
1910 return false;
1911 if (!ldtr_valid(vcpu))
1912 return false;
1913 }
1914 /* TODO:
1915 * - Add checks on RIP
1916 * - Add checks on RFLAGS
1917 */
1918
1919 return true;
1920}
1921
1718static int init_rmode_tss(struct kvm *kvm) 1922static int init_rmode_tss(struct kvm *kvm)
1719{ 1923{
1720 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1924 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
1726 if (r < 0) 1930 if (r < 0)
1727 goto out; 1931 goto out;
1728 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1932 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1729 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); 1933 r = kvm_write_guest_page(kvm, fn++, &data,
1934 TSS_IOPB_BASE_OFFSET, sizeof(u16));
1730 if (r < 0) 1935 if (r < 0)
1731 goto out; 1936 goto out;
1732 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 1937 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
1789 vmcs_write16(sf->selector, 0); 1994 vmcs_write16(sf->selector, 0);
1790 vmcs_writel(sf->base, 0); 1995 vmcs_writel(sf->base, 0);
1791 vmcs_write32(sf->limit, 0xffff); 1996 vmcs_write32(sf->limit, 0xffff);
1792 vmcs_write32(sf->ar_bytes, 0x93); 1997 vmcs_write32(sf->ar_bytes, 0xf3);
1793} 1998}
1794 1999
1795static int alloc_apic_access_page(struct kvm *kvm) 2000static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
1808 if (r) 2013 if (r)
1809 goto out; 2014 goto out;
1810 2015
1811 down_read(&current->mm->mmap_sem);
1812 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2016 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1813 up_read(&current->mm->mmap_sem);
1814out: 2017out:
1815 up_write(&kvm->slots_lock); 2018 up_write(&kvm->slots_lock);
1816 return r; 2019 return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
1832 if (r) 2035 if (r)
1833 goto out; 2036 goto out;
1834 2037
1835 down_read(&current->mm->mmap_sem);
1836 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2038 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
1837 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2039 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
1838 up_read(&current->mm->mmap_sem);
1839out: 2040out:
1840 up_write(&kvm->slots_lock); 2041 up_write(&kvm->slots_lock);
1841 return r; 2042 return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1917 } 2118 }
1918 if (!vm_need_ept()) 2119 if (!vm_need_ept())
1919 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2120 exec_control |= CPU_BASED_CR3_STORE_EXITING |
1920 CPU_BASED_CR3_LOAD_EXITING; 2121 CPU_BASED_CR3_LOAD_EXITING |
2122 CPU_BASED_INVLPG_EXITING;
1921 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 2123 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1922 2124
1923 if (cpu_has_secondary_exec_ctrls()) { 2125 if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2019 u64 msr; 2221 u64 msr;
2020 int ret; 2222 int ret;
2021 2223
2224 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2022 down_read(&vcpu->kvm->slots_lock); 2225 down_read(&vcpu->kvm->slots_lock);
2023 if (!init_rmode(vmx->vcpu.kvm)) { 2226 if (!init_rmode(vmx->vcpu.kvm)) {
2024 ret = -ENOMEM; 2227 ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2036 2239
2037 fx_init(&vmx->vcpu); 2240 fx_init(&vmx->vcpu);
2038 2241
2242 seg_setup(VCPU_SREG_CS);
2039 /* 2243 /*
2040 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2244 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2041 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2245 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2047 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 2251 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2048 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 2252 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2049 } 2253 }
2050 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
2051 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2052 2254
2053 seg_setup(VCPU_SREG_DS); 2255 seg_setup(VCPU_SREG_DS);
2054 seg_setup(VCPU_SREG_ES); 2256 seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2072 2274
2073 vmcs_writel(GUEST_RFLAGS, 0x02); 2275 vmcs_writel(GUEST_RFLAGS, 0x02);
2074 if (vmx->vcpu.vcpu_id == 0) 2276 if (vmx->vcpu.vcpu_id == 0)
2075 vmcs_writel(GUEST_RIP, 0xfff0); 2277 kvm_rip_write(vcpu, 0xfff0);
2076 else 2278 else
2077 vmcs_writel(GUEST_RIP, 0); 2279 kvm_rip_write(vcpu, 0);
2078 vmcs_writel(GUEST_RSP, 0); 2280 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2079 2281
2080 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ 2282 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2081 vmcs_writel(GUEST_DR7, 0x400); 2283 vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2125 2327
2126 ret = 0; 2328 ret = 0;
2127 2329
2330 /* HACK: Don't enable emulation on guest boot/reset */
2331 vmx->emulation_required = 0;
2332
2128out: 2333out:
2129 up_read(&vcpu->kvm->slots_lock); 2334 up_read(&vcpu->kvm->slots_lock);
2130 return ret; 2335 return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2136 2341
2137 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2342 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2138 2343
2344 ++vcpu->stat.irq_injections;
2139 if (vcpu->arch.rmode.active) { 2345 if (vcpu->arch.rmode.active) {
2140 vmx->rmode.irq.pending = true; 2346 vmx->rmode.irq.pending = true;
2141 vmx->rmode.irq.vector = irq; 2347 vmx->rmode.irq.vector = irq;
2142 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); 2348 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2143 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2349 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2144 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2350 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2145 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2351 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2146 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); 2352 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2147 return; 2353 return;
2148 } 2354 }
2149 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2355 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{ 2360{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158} 2363}
2159 2364
2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2166 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2371 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2167 if (!vcpu->arch.irq_pending[word_index]) 2372 if (!vcpu->arch.irq_pending[word_index])
2168 clear_bit(word_index, &vcpu->arch.irq_summary); 2373 clear_bit(word_index, &vcpu->arch.irq_summary);
2169 vmx_inject_irq(vcpu, irq); 2374 kvm_queue_interrupt(vcpu, irq);
2170} 2375}
2171 2376
2172 2377
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2180 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2181 2386
2182 if (vcpu->arch.interrupt_window_open && 2387 if (vcpu->arch.interrupt_window_open &&
2183 vcpu->arch.irq_summary && 2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2184 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2185 /*
2186 * If interrupts enabled, and not blocked by sti or mov ss. Good.
2187 */
2188 kvm_do_inject_irq(vcpu); 2389 kvm_do_inject_irq(vcpu);
2189 2390
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2393
2190 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2191 if (!vcpu->arch.interrupt_window_open && 2395 if (!vcpu->arch.interrupt_window_open &&
2192 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2237static int handle_rmode_exception(struct kvm_vcpu *vcpu, 2441static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2238 int vec, u32 err_code) 2442 int vec, u32 err_code)
2239{ 2443{
2240 if (!vcpu->arch.rmode.active)
2241 return 0;
2242
2243 /* 2444 /*
2244 * Instruction with address size override prefix opcode 0x67 2445 * Instruction with address size override prefix opcode 0x67
2245 * Cause the #SS fault with 0 error code in VM86 mode. 2446 * Cause the #SS fault with 0 error code in VM86 mode.
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2247 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2448 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2248 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2449 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2249 return 1; 2450 return 1;
2451 /*
2452 * Forward all other exceptions that are valid in real mode.
2453 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2454 * the required debugging infrastructure rework.
2455 */
2456 switch (vec) {
2457 case DE_VECTOR:
2458 case DB_VECTOR:
2459 case BP_VECTOR:
2460 case OF_VECTOR:
2461 case BR_VECTOR:
2462 case UD_VECTOR:
2463 case DF_VECTOR:
2464 case SS_VECTOR:
2465 case GP_VECTOR:
2466 case MF_VECTOR:
2467 kvm_queue_exception(vcpu, vec);
2468 return 1;
2469 }
2250 return 0; 2470 return 0;
2251} 2471}
2252 2472
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2288 } 2508 }
2289 2509
2290 error_code = 0; 2510 error_code = 0;
2291 rip = vmcs_readl(GUEST_RIP); 2511 rip = kvm_rip_read(vcpu);
2292 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 2512 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2293 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2513 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2294 if (is_page_fault(intr_info)) { 2514 if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2298 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2518 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2519 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2300 (u32)((u64)cr2 >> 32), handler); 2520 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK) 2521 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2522 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2303 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2523 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2304 } 2524 }
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2386 reg = (exit_qualification >> 8) & 15; 2606 reg = (exit_qualification >> 8) & 15;
2387 switch ((exit_qualification >> 4) & 3) { 2607 switch ((exit_qualification >> 4) & 3) {
2388 case 0: /* mov to cr */ 2608 case 0: /* mov to cr */
2389 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], 2609 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2390 (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); 2610 (u32)kvm_register_read(vcpu, reg),
2611 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2612 handler);
2391 switch (cr) { 2613 switch (cr) {
2392 case 0: 2614 case 0:
2393 vcpu_load_rsp_rip(vcpu); 2615 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2394 kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
2395 skip_emulated_instruction(vcpu); 2616 skip_emulated_instruction(vcpu);
2396 return 1; 2617 return 1;
2397 case 3: 2618 case 3:
2398 vcpu_load_rsp_rip(vcpu); 2619 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2399 kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
2400 skip_emulated_instruction(vcpu); 2620 skip_emulated_instruction(vcpu);
2401 return 1; 2621 return 1;
2402 case 4: 2622 case 4:
2403 vcpu_load_rsp_rip(vcpu); 2623 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2404 kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
2405 skip_emulated_instruction(vcpu); 2624 skip_emulated_instruction(vcpu);
2406 return 1; 2625 return 1;
2407 case 8: 2626 case 8:
2408 vcpu_load_rsp_rip(vcpu); 2627 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
2409 kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
2410 skip_emulated_instruction(vcpu); 2628 skip_emulated_instruction(vcpu);
2411 if (irqchip_in_kernel(vcpu->kvm)) 2629 if (irqchip_in_kernel(vcpu->kvm))
2412 return 1; 2630 return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2415 }; 2633 };
2416 break; 2634 break;
2417 case 2: /* clts */ 2635 case 2: /* clts */
2418 vcpu_load_rsp_rip(vcpu);
2419 vmx_fpu_deactivate(vcpu); 2636 vmx_fpu_deactivate(vcpu);
2420 vcpu->arch.cr0 &= ~X86_CR0_TS; 2637 vcpu->arch.cr0 &= ~X86_CR0_TS;
2421 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2638 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2426 case 1: /*mov from cr*/ 2643 case 1: /*mov from cr*/
2427 switch (cr) { 2644 switch (cr) {
2428 case 3: 2645 case 3:
2429 vcpu_load_rsp_rip(vcpu); 2646 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2430 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2431 vcpu_put_rsp_rip(vcpu);
2432 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2647 KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2433 (u32)vcpu->arch.regs[reg], 2648 (u32)kvm_register_read(vcpu, reg),
2434 (u32)((u64)vcpu->arch.regs[reg] >> 32), 2649 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2435 handler); 2650 handler);
2436 skip_emulated_instruction(vcpu); 2651 skip_emulated_instruction(vcpu);
2437 return 1; 2652 return 1;
2438 case 8: 2653 case 8:
2439 vcpu_load_rsp_rip(vcpu); 2654 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2440 vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
2441 vcpu_put_rsp_rip(vcpu);
2442 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2655 KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2443 (u32)vcpu->arch.regs[reg], handler); 2656 (u32)kvm_register_read(vcpu, reg), handler);
2444 skip_emulated_instruction(vcpu); 2657 skip_emulated_instruction(vcpu);
2445 return 1; 2658 return 1;
2446 } 2659 }
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2472 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2685 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2473 dr = exit_qualification & 7; 2686 dr = exit_qualification & 7;
2474 reg = (exit_qualification >> 8) & 15; 2687 reg = (exit_qualification >> 8) & 15;
2475 vcpu_load_rsp_rip(vcpu);
2476 if (exit_qualification & 16) { 2688 if (exit_qualification & 16) {
2477 /* mov from dr */ 2689 /* mov from dr */
2478 switch (dr) { 2690 switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2485 default: 2697 default:
2486 val = 0; 2698 val = 0;
2487 } 2699 }
2488 vcpu->arch.regs[reg] = val; 2700 kvm_register_write(vcpu, reg, val);
2489 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 2701 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2490 } else { 2702 } else {
2491 /* mov to dr */ 2703 /* mov to dr */
2492 } 2704 }
2493 vcpu_put_rsp_rip(vcpu);
2494 skip_emulated_instruction(vcpu); 2705 skip_emulated_instruction(vcpu);
2495 return 1; 2706 return 1;
2496} 2707}
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2583 return 1; 2794 return 1;
2584} 2795}
2585 2796
2797static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798{
2799 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2800
2801 kvm_mmu_invlpg(vcpu, exit_qualification);
2802 skip_emulated_instruction(vcpu);
2803 return 1;
2804}
2805
2586static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2806static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2587{ 2807{
2588 skip_emulated_instruction(vcpu); 2808 skip_emulated_instruction(vcpu);
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2695 return 1; 2915 return 1;
2696} 2916}
2697 2917
2918static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2919 struct kvm_run *kvm_run)
2920{
2921 struct vcpu_vmx *vmx = to_vmx(vcpu);
2922 int err;
2923
2924 preempt_enable();
2925 local_irq_enable();
2926
2927 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929
2930 switch (err) {
2931 case EMULATE_DONE:
2932 break;
2933 case EMULATE_DO_MMIO:
2934 kvm_report_emulation_failure(vcpu, "mmio");
2935 /* TODO: Handle MMIO */
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 }
2941
2942 if (signal_pending(current))
2943 break;
2944 if (need_resched())
2945 schedule();
2946 }
2947
2948 local_irq_disable();
2949 preempt_disable();
2950
2951 /* Guest state should be valid now, no more emulation should be needed */
2952 vmx->emulation_required = 0;
2953}
2954
2698/* 2955/*
2699 * The exit handlers return 1 if the exit was handled fully and guest execution 2956 * The exit handlers return 1 if the exit was handled fully and guest execution
2700 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2957 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2714 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 2971 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2715 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2972 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2716 [EXIT_REASON_HLT] = handle_halt, 2973 [EXIT_REASON_HLT] = handle_halt,
2974 [EXIT_REASON_INVLPG] = handle_invlpg,
2717 [EXIT_REASON_VMCALL] = handle_vmcall, 2975 [EXIT_REASON_VMCALL] = handle_vmcall,
2718 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 2976 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2719 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 2977 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2735 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 struct vcpu_vmx *vmx = to_vmx(vcpu);
2736 u32 vectoring_info = vmx->idt_vectoring_info; 2994 u32 vectoring_info = vmx->idt_vectoring_info;
2737 2995
2738 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), 2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2739 (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); 2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2740 2998
2741 /* Access CR3 don't cause VMExit in paging mode, so we need 2999 /* Access CR3 don't cause VMExit in paging mode, so we need
2742 * to sync with guest real CR3. */ 3000 * to sync with guest real CR3. */
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
2829 enable_irq_window(vcpu); 3087 enable_irq_window(vcpu);
2830} 3088}
2831 3089
2832static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
2833{ 3091{
2834 struct vcpu_vmx *vmx = to_vmx(vcpu); 3092 u32 exit_intr_info;
2835 u32 idtv_info_field, intr_info_field, exit_intr_info_field; 3093 u32 idt_vectoring_info;
2836 int vector; 3094 bool unblock_nmi;
3095 u8 vector;
3096 int type;
3097 bool idtv_info_valid;
3098 u32 error;
2837 3099
2838 update_tpr_threshold(vcpu); 3100 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2839 3101 if (cpu_has_virtual_nmis()) {
2840 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 3102 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
2841 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); 3103 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
2842 idtv_info_field = vmx->idt_vectoring_info; 3104 /*
2843 if (intr_info_field & INTR_INFO_VALID_MASK) { 3105 * SDM 3: 25.7.1.2
2844 if (idtv_info_field & INTR_INFO_VALID_MASK) { 3106 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2845 /* TODO: fault when IDT_Vectoring */ 3107 * a guest IRET fault.
2846 if (printk_ratelimit()) 3108 */
2847 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 3109 if (unblock_nmi && vector != DF_VECTOR)
2848 } 3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2849 enable_intr_window(vcpu); 3111 GUEST_INTR_STATE_NMI);
2850 return;
2851 } 3112 }
2852 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2853 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2854 == INTR_TYPE_EXT_INTR
2855 && vcpu->arch.rmode.active) {
2856 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2857
2858 vmx_inject_irq(vcpu, vect);
2859 enable_intr_window(vcpu);
2860 return;
2861 }
2862
2863 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2864 3113
3114 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3116 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3117 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3118 if (vmx->vcpu.arch.nmi_injected) {
2865 /* 3119 /*
2866 * SDM 3: 25.7.1.2 3120 * SDM 3: 25.7.1.2
2867 * Clear bit "block by NMI" before VM entry if a NMI delivery 3121 * Clear bit "block by NMI" before VM entry if a NMI delivery
2868 * faulted. 3122 * faulted.
2869 */ 3123 */
2870 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) 3124 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
2871 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) 3125 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2872 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3126 GUEST_INTR_STATE_NMI);
2873 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3127 else
2874 ~GUEST_INTR_STATE_NMI); 3128 vmx->vcpu.arch.nmi_injected = false;
2875 3129 }
2876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field 3130 kvm_clear_exception_queue(&vmx->vcpu);
2877 & ~INTR_INFO_RESVD_BITS_MASK); 3131 if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
2878 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 3132 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
2879 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 3133 error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
2880 3134 kvm_queue_exception_e(&vmx->vcpu, vector, error);
2881 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 3135 } else
2882 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 3136 kvm_queue_exception(&vmx->vcpu, vector);
2883 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 3137 vmx->idt_vectoring_info = 0;
2884 enable_intr_window(vcpu);
2885 return;
2886 } 3138 }
3139 kvm_clear_interrupt_queue(&vmx->vcpu);
3140 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
3141 kvm_queue_interrupt(&vmx->vcpu, vector);
3142 vmx->idt_vectoring_info = 0;
3143 }
3144}
3145
3146static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{
3148 update_tpr_threshold(vcpu);
3149
2887 if (cpu_has_virtual_nmis()) { 3150 if (cpu_has_virtual_nmis()) {
2888 /* 3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2889 * SDM 3: 25.7.1.2 3152 if (vmx_nmi_enabled(vcpu)) {
2890 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3153 vcpu->arch.nmi_pending = false;
2891 * a guest IRET fault. 3154 vcpu->arch.nmi_injected = true;
2892 */ 3155 } else {
2893 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && 3156 enable_intr_window(vcpu);
2894 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) 3157 return;
2895 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3158 }
2896 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | 3159 }
2897 GUEST_INTR_STATE_NMI); 3160 if (vcpu->arch.nmi_injected) {
2898 else if (vcpu->arch.nmi_pending) { 3161 vmx_inject_nmi(vcpu);
2899 if (vmx_nmi_enabled(vcpu))
2900 vmx_inject_nmi(vcpu);
2901 enable_intr_window(vcpu); 3162 enable_intr_window(vcpu);
2902 return; 3163 return;
2903 } 3164 }
2904
2905 } 3165 }
2906 if (!kvm_cpu_has_interrupt(vcpu)) 3166 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
2907 return; 3167 if (vmx_irq_enabled(vcpu))
2908 if (vmx_irq_enabled(vcpu)) { 3168 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
2909 vector = kvm_cpu_get_interrupt(vcpu); 3169 else
2910 vmx_inject_irq(vcpu, vector); 3170 enable_irq_window(vcpu);
2911 kvm_timer_intr_post(vcpu, vector); 3171 }
2912 } else 3172 if (vcpu->arch.interrupt.pending) {
2913 enable_irq_window(vcpu); 3173 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3174 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3175 }
2914} 3176}
2915 3177
2916/* 3178/*
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2922static void fixup_rmode_irq(struct vcpu_vmx *vmx) 3184static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2923{ 3185{
2924 vmx->rmode.irq.pending = 0; 3186 vmx->rmode.irq.pending = 0;
2925 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) 3187 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
2926 return; 3188 return;
2927 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); 3189 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
2928 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3190 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2929 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3191 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2930 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3192 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2936 | vmx->rmode.irq.vector; 3198 | vmx->rmode.irq.vector;
2937} 3199}
2938 3200
3201#ifdef CONFIG_X86_64
3202#define R "r"
3203#define Q "q"
3204#else
3205#define R "e"
3206#define Q "l"
3207#endif
3208
2939static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3209static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2940{ 3210{
2941 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 struct vcpu_vmx *vmx = to_vmx(vcpu);
2942 u32 intr_info; 3212 u32 intr_info;
2943 3213
3214 /* Handle invalid guest state instead of entering VMX */
3215 if (vmx->emulation_required && emulate_invalid_guest_state) {
3216 handle_invalid_guest_state(vcpu, kvm_run);
3217 return;
3218 }
3219
3220 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3221 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3222 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3223 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3224
2944 /* 3225 /*
2945 * Loading guest fpu may have cleared host cr0.ts 3226 * Loading guest fpu may have cleared host cr0.ts
2946 */ 3227 */
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2948 3229
2949 asm( 3230 asm(
2950 /* Store host registers */ 3231 /* Store host registers */
2951#ifdef CONFIG_X86_64 3232 "push %%"R"dx; push %%"R"bp;"
2952 "push %%rdx; push %%rbp;" 3233 "push %%"R"cx \n\t"
2953 "push %%rcx \n\t" 3234 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
2954#else 3235 "je 1f \n\t"
2955 "push %%edx; push %%ebp;" 3236 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
2956 "push %%ecx \n\t"
2957#endif
2958 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 3237 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3238 "1: \n\t"
2959 /* Check if vmlaunch of vmresume is needed */ 3239 /* Check if vmlaunch of vmresume is needed */
2960 "cmpl $0, %c[launched](%0) \n\t" 3240 "cmpl $0, %c[launched](%0) \n\t"
2961 /* Load guest registers. Don't clobber flags. */ 3241 /* Load guest registers. Don't clobber flags. */
3242 "mov %c[cr2](%0), %%"R"ax \n\t"
3243 "mov %%"R"ax, %%cr2 \n\t"
3244 "mov %c[rax](%0), %%"R"ax \n\t"
3245 "mov %c[rbx](%0), %%"R"bx \n\t"
3246 "mov %c[rdx](%0), %%"R"dx \n\t"
3247 "mov %c[rsi](%0), %%"R"si \n\t"
3248 "mov %c[rdi](%0), %%"R"di \n\t"
3249 "mov %c[rbp](%0), %%"R"bp \n\t"
2962#ifdef CONFIG_X86_64 3250#ifdef CONFIG_X86_64
2963 "mov %c[cr2](%0), %%rax \n\t"
2964 "mov %%rax, %%cr2 \n\t"
2965 "mov %c[rax](%0), %%rax \n\t"
2966 "mov %c[rbx](%0), %%rbx \n\t"
2967 "mov %c[rdx](%0), %%rdx \n\t"
2968 "mov %c[rsi](%0), %%rsi \n\t"
2969 "mov %c[rdi](%0), %%rdi \n\t"
2970 "mov %c[rbp](%0), %%rbp \n\t"
2971 "mov %c[r8](%0), %%r8 \n\t" 3251 "mov %c[r8](%0), %%r8 \n\t"
2972 "mov %c[r9](%0), %%r9 \n\t" 3252 "mov %c[r9](%0), %%r9 \n\t"
2973 "mov %c[r10](%0), %%r10 \n\t" 3253 "mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2976 "mov %c[r13](%0), %%r13 \n\t" 3256 "mov %c[r13](%0), %%r13 \n\t"
2977 "mov %c[r14](%0), %%r14 \n\t" 3257 "mov %c[r14](%0), %%r14 \n\t"
2978 "mov %c[r15](%0), %%r15 \n\t" 3258 "mov %c[r15](%0), %%r15 \n\t"
2979 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2980#else
2981 "mov %c[cr2](%0), %%eax \n\t"
2982 "mov %%eax, %%cr2 \n\t"
2983 "mov %c[rax](%0), %%eax \n\t"
2984 "mov %c[rbx](%0), %%ebx \n\t"
2985 "mov %c[rdx](%0), %%edx \n\t"
2986 "mov %c[rsi](%0), %%esi \n\t"
2987 "mov %c[rdi](%0), %%edi \n\t"
2988 "mov %c[rbp](%0), %%ebp \n\t"
2989 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2990#endif 3259#endif
3260 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3261
2991 /* Enter guest mode */ 3262 /* Enter guest mode */
2992 "jne .Llaunched \n\t" 3263 "jne .Llaunched \n\t"
2993 __ex(ASM_VMX_VMLAUNCH) "\n\t" 3264 __ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2995 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 3266 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2996 ".Lkvm_vmx_return: " 3267 ".Lkvm_vmx_return: "
2997 /* Save guest registers, load host registers, keep flags */ 3268 /* Save guest registers, load host registers, keep flags */
3269 "xchg %0, (%%"R"sp) \n\t"
3270 "mov %%"R"ax, %c[rax](%0) \n\t"
3271 "mov %%"R"bx, %c[rbx](%0) \n\t"
3272 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3273 "mov %%"R"dx, %c[rdx](%0) \n\t"
3274 "mov %%"R"si, %c[rsi](%0) \n\t"
3275 "mov %%"R"di, %c[rdi](%0) \n\t"
3276 "mov %%"R"bp, %c[rbp](%0) \n\t"
2998#ifdef CONFIG_X86_64 3277#ifdef CONFIG_X86_64
2999 "xchg %0, (%%rsp) \n\t"
3000 "mov %%rax, %c[rax](%0) \n\t"
3001 "mov %%rbx, %c[rbx](%0) \n\t"
3002 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
3003 "mov %%rdx, %c[rdx](%0) \n\t"
3004 "mov %%rsi, %c[rsi](%0) \n\t"
3005 "mov %%rdi, %c[rdi](%0) \n\t"
3006 "mov %%rbp, %c[rbp](%0) \n\t"
3007 "mov %%r8, %c[r8](%0) \n\t" 3278 "mov %%r8, %c[r8](%0) \n\t"
3008 "mov %%r9, %c[r9](%0) \n\t" 3279 "mov %%r9, %c[r9](%0) \n\t"
3009 "mov %%r10, %c[r10](%0) \n\t" 3280 "mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012 "mov %%r13, %c[r13](%0) \n\t" 3283 "mov %%r13, %c[r13](%0) \n\t"
3013 "mov %%r14, %c[r14](%0) \n\t" 3284 "mov %%r14, %c[r14](%0) \n\t"
3014 "mov %%r15, %c[r15](%0) \n\t" 3285 "mov %%r15, %c[r15](%0) \n\t"
3015 "mov %%cr2, %%rax \n\t"
3016 "mov %%rax, %c[cr2](%0) \n\t"
3017
3018 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
3019#else
3020 "xchg %0, (%%esp) \n\t"
3021 "mov %%eax, %c[rax](%0) \n\t"
3022 "mov %%ebx, %c[rbx](%0) \n\t"
3023 "pushl (%%esp); popl %c[rcx](%0) \n\t"
3024 "mov %%edx, %c[rdx](%0) \n\t"
3025 "mov %%esi, %c[rsi](%0) \n\t"
3026 "mov %%edi, %c[rdi](%0) \n\t"
3027 "mov %%ebp, %c[rbp](%0) \n\t"
3028 "mov %%cr2, %%eax \n\t"
3029 "mov %%eax, %c[cr2](%0) \n\t"
3030
3031 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
3032#endif 3286#endif
3287 "mov %%cr2, %%"R"ax \n\t"
3288 "mov %%"R"ax, %c[cr2](%0) \n\t"
3289
3290 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
3033 "setbe %c[fail](%0) \n\t" 3291 "setbe %c[fail](%0) \n\t"
3034 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 3292 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3035 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 3293 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3036 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 3294 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3295 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
3037 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 3296 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3038 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 3297 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3039 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 3298 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053#endif 3312#endif
3054 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 3313 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
3055 : "cc", "memory" 3314 : "cc", "memory"
3315 , R"bx", R"di", R"si"
3056#ifdef CONFIG_X86_64 3316#ifdef CONFIG_X86_64
3057 , "rbx", "rdi", "rsi"
3058 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 3317 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3059#else
3060 , "ebx", "edi", "rsi"
3061#endif 3318#endif
3062 ); 3319 );
3063 3320
3321 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3322 vcpu->arch.regs_dirty = 0;
3323
3064 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3324 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3065 if (vmx->rmode.irq.pending) 3325 if (vmx->rmode.irq.pending)
3066 fixup_rmode_irq(vmx); 3326 fixup_rmode_irq(vmx);
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3080 KVMTRACE_0D(NMI, vcpu, handler); 3340 KVMTRACE_0D(NMI, vcpu, handler);
3081 asm("int $2"); 3341 asm("int $2");
3082 } 3342 }
3343
3344 vmx_complete_interrupts(vmx);
3083} 3345}
3084 3346
3347#undef R
3348#undef Q
3349
3085static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 3350static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3086{ 3351{
3087 struct vcpu_vmx *vmx = to_vmx(vcpu); 3352 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3224 .set_idt = vmx_set_idt, 3489 .set_idt = vmx_set_idt,
3225 .get_gdt = vmx_get_gdt, 3490 .get_gdt = vmx_get_gdt,
3226 .set_gdt = vmx_set_gdt, 3491 .set_gdt = vmx_set_gdt,
3227 .cache_regs = vcpu_load_rsp_rip, 3492 .cache_reg = vmx_cache_reg,
3228 .decache_regs = vcpu_put_rsp_rip,
3229 .get_rflags = vmx_get_rflags, 3493 .get_rflags = vmx_get_rflags,
3230 .set_rflags = vmx_set_rflags, 3494 .set_rflags = vmx_set_rflags,
3231 3495
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 17e25995b65b..3e010d21fdd7 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,9 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
335#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
336
337#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
338#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
339 336
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d682fc6aeb3..4f0677d1eae8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
7 * 9 *
8 * Authors: 10 * Authors:
9 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Amit Shah <amit.shah@qumranet.com>
14 * Ben-Ami Yassour <benami@il.ibm.com>
11 * 15 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See 16 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
19#include "mmu.h" 23#include "mmu.h"
20#include "i8254.h" 24#include "i8254.h"
21#include "tss.h" 25#include "tss.h"
26#include "kvm_cache_regs.h"
27#include "x86.h"
22 28
23#include <linux/clocksource.h> 29#include <linux/clocksource.h>
30#include <linux/interrupt.h>
24#include <linux/kvm.h> 31#include <linux/kvm.h>
25#include <linux/fs.h> 32#include <linux/fs.h>
26#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
27#include <linux/module.h> 34#include <linux/module.h>
28#include <linux/mman.h> 35#include <linux/mman.h>
29#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/intel-iommu.h>
30 38
31#include <asm/uaccess.h> 39#include <asm/uaccess.h>
32#include <asm/msr.h> 40#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
61 struct kvm_cpuid_entry2 __user *entries); 69 struct kvm_cpuid_entry2 __user *entries);
62 70
63struct kvm_x86_ops *kvm_x86_ops; 71struct kvm_x86_ops *kvm_x86_ops;
72EXPORT_SYMBOL_GPL(kvm_x86_ops);
64 73
65struct kvm_stats_debugfs_item debugfs_entries[] = { 74struct kvm_stats_debugfs_item debugfs_entries[] = {
66 { "pf_fixed", VCPU_STAT(pf_fixed) }, 75 { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
83 { "fpu_reload", VCPU_STAT(fpu_reload) }, 92 { "fpu_reload", VCPU_STAT(fpu_reload) },
84 { "insn_emulation", VCPU_STAT(insn_emulation) }, 93 { "insn_emulation", VCPU_STAT(insn_emulation) },
85 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) },
86 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
87 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 97 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
88 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
90 { "mmu_flooded", VM_STAT(mmu_flooded) }, 100 { "mmu_flooded", VM_STAT(mmu_flooded) },
91 { "mmu_recycled", VM_STAT(mmu_recycled) }, 101 { "mmu_recycled", VM_STAT(mmu_recycled) },
92 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) },
93 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
94 { "largepages", VM_STAT(lpages) }, 105 { "largepages", VM_STAT(lpages) },
95 { NULL } 106 { NULL }
96}; 107};
97 108
98
99unsigned long segment_base(u16 selector) 109unsigned long segment_base(u16 selector)
100{ 110{
101 struct descriptor_table gdt; 111 struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
352void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 362void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
353{ 363{
354 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 364 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
365 kvm_mmu_sync_roots(vcpu);
355 kvm_mmu_flush_tlb(vcpu); 366 kvm_mmu_flush_tlb(vcpu);
356 return; 367 return;
357 } 368 }
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
564 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 575 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
565 576
566 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 577 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
567 __FUNCTION__, tsc_khz, hv_clock->tsc_shift, 578 __func__, tsc_khz, hv_clock->tsc_shift,
568 hv_clock->tsc_to_system_mul); 579 hv_clock->tsc_to_system_mul);
569} 580}
570 581
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
662 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 673 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
663 __func__, data); 674 __func__, data);
664 break; 675 break;
676 case MSR_IA32_DEBUGCTLMSR:
677 if (!data) {
678 /* We support the non-activated case already */
679 break;
680 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
681 /* Values other than LBR and BTF are vendor-specific,
682 thus reserved and should throw a #GP */
683 return 1;
684 }
685 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
686 __func__, data);
687 break;
665 case MSR_IA32_UCODE_REV: 688 case MSR_IA32_UCODE_REV:
666 case MSR_IA32_UCODE_WRITE: 689 case MSR_IA32_UCODE_WRITE:
667 break; 690 break;
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
692 /* ...but clean it before doing the actual write */ 715 /* ...but clean it before doing the actual write */
693 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 716 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
694 717
695 down_read(&current->mm->mmap_sem);
696 vcpu->arch.time_page = 718 vcpu->arch.time_page =
697 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 719 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
698 up_read(&current->mm->mmap_sem);
699 720
700 if (is_error_page(vcpu->arch.time_page)) { 721 if (is_error_page(vcpu->arch.time_page)) {
701 kvm_release_page_clean(vcpu->arch.time_page); 722 kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
752 case MSR_IA32_MC0_MISC+8: 773 case MSR_IA32_MC0_MISC+8:
753 case MSR_IA32_MC0_MISC+12: 774 case MSR_IA32_MC0_MISC+12:
754 case MSR_IA32_MC0_MISC+16: 775 case MSR_IA32_MC0_MISC+16:
776 case MSR_IA32_MC0_MISC+20:
755 case MSR_IA32_UCODE_REV: 777 case MSR_IA32_UCODE_REV:
756 case MSR_IA32_EBL_CR_POWERON: 778 case MSR_IA32_EBL_CR_POWERON:
779 case MSR_IA32_DEBUGCTLMSR:
780 case MSR_IA32_LASTBRANCHFROMIP:
781 case MSR_IA32_LASTBRANCHTOIP:
782 case MSR_IA32_LASTINTFROMIP:
783 case MSR_IA32_LASTINTTOIP:
757 data = 0; 784 data = 0;
758 break; 785 break;
759 case MSR_MTRRcap: 786 case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
901 case KVM_CAP_PV_MMU: 928 case KVM_CAP_PV_MMU:
902 r = !tdp_enabled; 929 r = !tdp_enabled;
903 break; 930 break;
931 case KVM_CAP_IOMMU:
932 r = intel_iommu_found();
933 break;
904 default: 934 default:
905 r = 0; 935 r = 0;
906 break; 936 break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1303 struct kvm_vcpu *vcpu = filp->private_data; 1333 struct kvm_vcpu *vcpu = filp->private_data;
1304 void __user *argp = (void __user *)arg; 1334 void __user *argp = (void __user *)arg;
1305 int r; 1335 int r;
1336 struct kvm_lapic_state *lapic = NULL;
1306 1337
1307 switch (ioctl) { 1338 switch (ioctl) {
1308 case KVM_GET_LAPIC: { 1339 case KVM_GET_LAPIC: {
1309 struct kvm_lapic_state lapic; 1340 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1310 1341
1311 memset(&lapic, 0, sizeof lapic); 1342 r = -ENOMEM;
1312 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); 1343 if (!lapic)
1344 goto out;
1345 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1313 if (r) 1346 if (r)
1314 goto out; 1347 goto out;
1315 r = -EFAULT; 1348 r = -EFAULT;
1316 if (copy_to_user(argp, &lapic, sizeof lapic)) 1349 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1317 goto out; 1350 goto out;
1318 r = 0; 1351 r = 0;
1319 break; 1352 break;
1320 } 1353 }
1321 case KVM_SET_LAPIC: { 1354 case KVM_SET_LAPIC: {
1322 struct kvm_lapic_state lapic; 1355 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1323 1356 r = -ENOMEM;
1357 if (!lapic)
1358 goto out;
1324 r = -EFAULT; 1359 r = -EFAULT;
1325 if (copy_from_user(&lapic, argp, sizeof lapic)) 1360 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1326 goto out; 1361 goto out;
1327 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; 1362 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1328 if (r) 1363 if (r)
1329 goto out; 1364 goto out;
1330 r = 0; 1365 r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1422 r = -EINVAL; 1457 r = -EINVAL;
1423 } 1458 }
1424out: 1459out:
1460 if (lapic)
1461 kfree(lapic);
1425 return r; 1462 return r;
1426} 1463}
1427 1464
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
1630 struct kvm *kvm = filp->private_data; 1667 struct kvm *kvm = filp->private_data;
1631 void __user *argp = (void __user *)arg; 1668 void __user *argp = (void __user *)arg;
1632 int r = -EINVAL; 1669 int r = -EINVAL;
1670 /*
1671 * This union makes it completely explicit to gcc-3.x
1672 * that these two variables' stack usage should be
1673 * combined, not added together.
1674 */
1675 union {
1676 struct kvm_pit_state ps;
1677 struct kvm_memory_alias alias;
1678 } u;
1633 1679
1634 switch (ioctl) { 1680 switch (ioctl) {
1635 case KVM_SET_TSS_ADDR: 1681 case KVM_SET_TSS_ADDR:
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
1661 case KVM_GET_NR_MMU_PAGES: 1707 case KVM_GET_NR_MMU_PAGES:
1662 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1708 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1663 break; 1709 break;
1664 case KVM_SET_MEMORY_ALIAS: { 1710 case KVM_SET_MEMORY_ALIAS:
1665 struct kvm_memory_alias alias;
1666
1667 r = -EFAULT; 1711 r = -EFAULT;
1668 if (copy_from_user(&alias, argp, sizeof alias)) 1712 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1669 goto out; 1713 goto out;
1670 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 1714 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1671 if (r) 1715 if (r)
1672 goto out; 1716 goto out;
1673 break; 1717 break;
1674 }
1675 case KVM_CREATE_IRQCHIP: 1718 case KVM_CREATE_IRQCHIP:
1676 r = -ENOMEM; 1719 r = -ENOMEM;
1677 kvm->arch.vpic = kvm_create_pic(kvm); 1720 kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
1699 goto out; 1742 goto out;
1700 if (irqchip_in_kernel(kvm)) { 1743 if (irqchip_in_kernel(kvm)) {
1701 mutex_lock(&kvm->lock); 1744 mutex_lock(&kvm->lock);
1702 if (irq_event.irq < 16) 1745 kvm_set_irq(kvm, irq_event.irq, irq_event.level);
1703 kvm_pic_set_irq(pic_irqchip(kvm),
1704 irq_event.irq,
1705 irq_event.level);
1706 kvm_ioapic_set_irq(kvm->arch.vioapic,
1707 irq_event.irq,
1708 irq_event.level);
1709 mutex_unlock(&kvm->lock); 1746 mutex_unlock(&kvm->lock);
1710 r = 0; 1747 r = 0;
1711 } 1748 }
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
1713 } 1750 }
1714 case KVM_GET_IRQCHIP: { 1751 case KVM_GET_IRQCHIP: {
1715 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1752 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1716 struct kvm_irqchip chip; 1753 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1717 1754
1718 r = -EFAULT; 1755 r = -ENOMEM;
1719 if (copy_from_user(&chip, argp, sizeof chip)) 1756 if (!chip)
1720 goto out; 1757 goto out;
1758 r = -EFAULT;
1759 if (copy_from_user(chip, argp, sizeof *chip))
1760 goto get_irqchip_out;
1721 r = -ENXIO; 1761 r = -ENXIO;
1722 if (!irqchip_in_kernel(kvm)) 1762 if (!irqchip_in_kernel(kvm))
1723 goto out; 1763 goto get_irqchip_out;
1724 r = kvm_vm_ioctl_get_irqchip(kvm, &chip); 1764 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1725 if (r) 1765 if (r)
1726 goto out; 1766 goto get_irqchip_out;
1727 r = -EFAULT; 1767 r = -EFAULT;
1728 if (copy_to_user(argp, &chip, sizeof chip)) 1768 if (copy_to_user(argp, chip, sizeof *chip))
1729 goto out; 1769 goto get_irqchip_out;
1730 r = 0; 1770 r = 0;
1771 get_irqchip_out:
1772 kfree(chip);
1773 if (r)
1774 goto out;
1731 break; 1775 break;
1732 } 1776 }
1733 case KVM_SET_IRQCHIP: { 1777 case KVM_SET_IRQCHIP: {
1734 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1778 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1735 struct kvm_irqchip chip; 1779 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1736 1780
1737 r = -EFAULT; 1781 r = -ENOMEM;
1738 if (copy_from_user(&chip, argp, sizeof chip)) 1782 if (!chip)
1739 goto out; 1783 goto out;
1784 r = -EFAULT;
1785 if (copy_from_user(chip, argp, sizeof *chip))
1786 goto set_irqchip_out;
1740 r = -ENXIO; 1787 r = -ENXIO;
1741 if (!irqchip_in_kernel(kvm)) 1788 if (!irqchip_in_kernel(kvm))
1742 goto out; 1789 goto set_irqchip_out;
1743 r = kvm_vm_ioctl_set_irqchip(kvm, &chip); 1790 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1744 if (r) 1791 if (r)
1745 goto out; 1792 goto set_irqchip_out;
1746 r = 0; 1793 r = 0;
1794 set_irqchip_out:
1795 kfree(chip);
1796 if (r)
1797 goto out;
1747 break; 1798 break;
1748 } 1799 }
1749 case KVM_GET_PIT: { 1800 case KVM_GET_PIT: {
1750 struct kvm_pit_state ps;
1751 r = -EFAULT; 1801 r = -EFAULT;
1752 if (copy_from_user(&ps, argp, sizeof ps)) 1802 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1753 goto out; 1803 goto out;
1754 r = -ENXIO; 1804 r = -ENXIO;
1755 if (!kvm->arch.vpit) 1805 if (!kvm->arch.vpit)
1756 goto out; 1806 goto out;
1757 r = kvm_vm_ioctl_get_pit(kvm, &ps); 1807 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1758 if (r) 1808 if (r)
1759 goto out; 1809 goto out;
1760 r = -EFAULT; 1810 r = -EFAULT;
1761 if (copy_to_user(argp, &ps, sizeof ps)) 1811 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1762 goto out; 1812 goto out;
1763 r = 0; 1813 r = 0;
1764 break; 1814 break;
1765 } 1815 }
1766 case KVM_SET_PIT: { 1816 case KVM_SET_PIT: {
1767 struct kvm_pit_state ps;
1768 r = -EFAULT; 1817 r = -EFAULT;
1769 if (copy_from_user(&ps, argp, sizeof ps)) 1818 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1770 goto out; 1819 goto out;
1771 r = -ENXIO; 1820 r = -ENXIO;
1772 if (!kvm->arch.vpit) 1821 if (!kvm->arch.vpit)
1773 goto out; 1822 goto out;
1774 r = kvm_vm_ioctl_set_pit(kvm, &ps); 1823 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1775 if (r) 1824 if (r)
1776 goto out; 1825 goto out;
1777 r = 0; 1826 r = 0;
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2018 2067
2019 val = *(u64 *)new; 2068 val = *(u64 *)new;
2020 2069
2021 down_read(&current->mm->mmap_sem);
2022 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2070 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2023 up_read(&current->mm->mmap_sem);
2024 2071
2025 kaddr = kmap_atomic(page, KM_USER0); 2072 kaddr = kmap_atomic(page, KM_USER0);
2026 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2073 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2040 2087
2041int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2088int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2042{ 2089{
2090 kvm_mmu_invlpg(vcpu, address);
2043 return X86EMUL_CONTINUE; 2091 return X86EMUL_CONTINUE;
2044} 2092}
2045 2093
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2080void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2128void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2081{ 2129{
2082 u8 opcodes[4]; 2130 u8 opcodes[4];
2083 unsigned long rip = vcpu->arch.rip; 2131 unsigned long rip = kvm_rip_read(vcpu);
2084 unsigned long rip_linear; 2132 unsigned long rip_linear;
2085 2133
2086 if (!printk_ratelimit()) 2134 if (!printk_ratelimit())
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
2102 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2150 .cmpxchg_emulated = emulator_cmpxchg_emulated,
2103}; 2151};
2104 2152
2153static void cache_all_regs(struct kvm_vcpu *vcpu)
2154{
2155 kvm_register_read(vcpu, VCPU_REGS_RAX);
2156 kvm_register_read(vcpu, VCPU_REGS_RSP);
2157 kvm_register_read(vcpu, VCPU_REGS_RIP);
2158 vcpu->arch.regs_dirty = ~0;
2159}
2160
2105int emulate_instruction(struct kvm_vcpu *vcpu, 2161int emulate_instruction(struct kvm_vcpu *vcpu,
2106 struct kvm_run *run, 2162 struct kvm_run *run,
2107 unsigned long cr2, 2163 unsigned long cr2,
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2111 int r; 2167 int r;
2112 struct decode_cache *c; 2168 struct decode_cache *c;
2113 2169
2170 kvm_clear_exception_queue(vcpu);
2114 vcpu->arch.mmio_fault_cr2 = cr2; 2171 vcpu->arch.mmio_fault_cr2 = cr2;
2115 kvm_x86_ops->cache_regs(vcpu); 2172 /*
2173 * TODO: fix x86_emulate.c to use guest_read/write_register
2174 * instead of direct ->regs accesses, can save hundred cycles
2175 * on Intel for instructions that don't read/change RSP, for
2176 * for example.
2177 */
2178 cache_all_regs(vcpu);
2116 2179
2117 vcpu->mmio_is_write = 0; 2180 vcpu->mmio_is_write = 0;
2118 vcpu->arch.pio.string = 0; 2181 vcpu->arch.pio.string = 0;
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2172 return EMULATE_DO_MMIO; 2235 return EMULATE_DO_MMIO;
2173 } 2236 }
2174 2237
2175 kvm_x86_ops->decache_regs(vcpu);
2176 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2238 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2177 2239
2178 if (vcpu->mmio_is_write) { 2240 if (vcpu->mmio_is_write) {
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
2225 struct kvm_pio_request *io = &vcpu->arch.pio; 2287 struct kvm_pio_request *io = &vcpu->arch.pio;
2226 long delta; 2288 long delta;
2227 int r; 2289 int r;
2228 2290 unsigned long val;
2229 kvm_x86_ops->cache_regs(vcpu);
2230 2291
2231 if (!io->string) { 2292 if (!io->string) {
2232 if (io->in) 2293 if (io->in) {
2233 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, 2294 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2234 io->size); 2295 memcpy(&val, vcpu->arch.pio_data, io->size);
2296 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2297 }
2235 } else { 2298 } else {
2236 if (io->in) { 2299 if (io->in) {
2237 r = pio_copy_data(vcpu); 2300 r = pio_copy_data(vcpu);
2238 if (r) { 2301 if (r)
2239 kvm_x86_ops->cache_regs(vcpu);
2240 return r; 2302 return r;
2241 }
2242 } 2303 }
2243 2304
2244 delta = 1; 2305 delta = 1;
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
2248 * The size of the register should really depend on 2309 * The size of the register should really depend on
2249 * current address size. 2310 * current address size.
2250 */ 2311 */
2251 vcpu->arch.regs[VCPU_REGS_RCX] -= delta; 2312 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2313 val -= delta;
2314 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2252 } 2315 }
2253 if (io->down) 2316 if (io->down)
2254 delta = -delta; 2317 delta = -delta;
2255 delta *= io->size; 2318 delta *= io->size;
2256 if (io->in) 2319 if (io->in) {
2257 vcpu->arch.regs[VCPU_REGS_RDI] += delta; 2320 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2258 else 2321 val += delta;
2259 vcpu->arch.regs[VCPU_REGS_RSI] += delta; 2322 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2323 } else {
2324 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2325 val += delta;
2326 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2327 }
2260 } 2328 }
2261 2329
2262 kvm_x86_ops->decache_regs(vcpu);
2263
2264 io->count -= io->cur_count; 2330 io->count -= io->cur_count;
2265 io->cur_count = 0; 2331 io->cur_count = 0;
2266 2332
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2313 int size, unsigned port) 2379 int size, unsigned port)
2314{ 2380{
2315 struct kvm_io_device *pio_dev; 2381 struct kvm_io_device *pio_dev;
2382 unsigned long val;
2316 2383
2317 vcpu->run->exit_reason = KVM_EXIT_IO; 2384 vcpu->run->exit_reason = KVM_EXIT_IO;
2318 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2385 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2333 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2400 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2334 handler); 2401 handler);
2335 2402
2336 kvm_x86_ops->cache_regs(vcpu); 2403 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2337 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2404 memcpy(vcpu->arch.pio_data, &val, 4);
2338 2405
2339 kvm_x86_ops->skip_emulated_instruction(vcpu); 2406 kvm_x86_ops->skip_emulated_instruction(vcpu);
2340 2407
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2492 KVMTRACE_0D(HLT, vcpu, handler); 2559 KVMTRACE_0D(HLT, vcpu, handler);
2493 if (irqchip_in_kernel(vcpu->kvm)) { 2560 if (irqchip_in_kernel(vcpu->kvm)) {
2494 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2561 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2495 up_read(&vcpu->kvm->slots_lock);
2496 kvm_vcpu_block(vcpu);
2497 down_read(&vcpu->kvm->slots_lock);
2498 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2499 return -EINTR;
2500 return 1; 2562 return 1;
2501 } else { 2563 } else {
2502 vcpu->run->exit_reason = KVM_EXIT_HLT; 2564 vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2519 unsigned long nr, a0, a1, a2, a3, ret; 2581 unsigned long nr, a0, a1, a2, a3, ret;
2520 int r = 1; 2582 int r = 1;
2521 2583
2522 kvm_x86_ops->cache_regs(vcpu); 2584 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2523 2585 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2524 nr = vcpu->arch.regs[VCPU_REGS_RAX]; 2586 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2525 a0 = vcpu->arch.regs[VCPU_REGS_RBX]; 2587 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2526 a1 = vcpu->arch.regs[VCPU_REGS_RCX]; 2588 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2527 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2528 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2529 2589
2530 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2590 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2531 2591
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2548 ret = -KVM_ENOSYS; 2608 ret = -KVM_ENOSYS;
2549 break; 2609 break;
2550 } 2610 }
2551 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2611 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2552 kvm_x86_ops->decache_regs(vcpu);
2553 ++vcpu->stat.hypercalls; 2612 ++vcpu->stat.hypercalls;
2554 return r; 2613 return r;
2555} 2614}
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2559{ 2618{
2560 char instruction[3]; 2619 char instruction[3];
2561 int ret = 0; 2620 int ret = 0;
2621 unsigned long rip = kvm_rip_read(vcpu);
2562 2622
2563 2623
2564 /* 2624 /*
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2568 */ 2628 */
2569 kvm_mmu_zap_all(vcpu->kvm); 2629 kvm_mmu_zap_all(vcpu->kvm);
2570 2630
2571 kvm_x86_ops->cache_regs(vcpu);
2572 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2631 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2573 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) 2632 if (emulator_write_emulated(rip, instruction, 3, vcpu)
2574 != X86EMUL_CONTINUE) 2633 != X86EMUL_CONTINUE)
2575 ret = -EFAULT; 2634 ret = -EFAULT;
2576 2635
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2700 u32 function, index; 2759 u32 function, index;
2701 struct kvm_cpuid_entry2 *e, *best; 2760 struct kvm_cpuid_entry2 *e, *best;
2702 2761
2703 kvm_x86_ops->cache_regs(vcpu); 2762 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2704 function = vcpu->arch.regs[VCPU_REGS_RAX]; 2763 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2705 index = vcpu->arch.regs[VCPU_REGS_RCX]; 2764 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2706 vcpu->arch.regs[VCPU_REGS_RAX] = 0; 2765 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2707 vcpu->arch.regs[VCPU_REGS_RBX] = 0; 2766 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2708 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 2767 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2709 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2710 best = NULL; 2768 best = NULL;
2711 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2769 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2712 e = &vcpu->arch.cpuid_entries[i]; 2770 e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2724 best = e; 2782 best = e;
2725 } 2783 }
2726 if (best) { 2784 if (best) {
2727 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; 2785 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2728 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; 2786 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2729 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; 2787 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2730 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; 2788 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
2731 } 2789 }
2732 kvm_x86_ops->decache_regs(vcpu);
2733 kvm_x86_ops->skip_emulated_instruction(vcpu); 2790 kvm_x86_ops->skip_emulated_instruction(vcpu);
2734 KVMTRACE_5D(CPUID, vcpu, function, 2791 KVMTRACE_5D(CPUID, vcpu, function,
2735 (u32)vcpu->arch.regs[VCPU_REGS_RAX], 2792 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2736 (u32)vcpu->arch.regs[VCPU_REGS_RBX], 2793 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2737 (u32)vcpu->arch.regs[VCPU_REGS_RCX], 2794 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2738 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); 2795 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
2739} 2796}
2740EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2797EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2741 2798
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
2776 if (!apic || !apic->vapic_addr) 2833 if (!apic || !apic->vapic_addr)
2777 return; 2834 return;
2778 2835
2779 down_read(&current->mm->mmap_sem);
2780 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2836 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2781 up_read(&current->mm->mmap_sem);
2782 2837
2783 vcpu->arch.apic->vapic_page = page; 2838 vcpu->arch.apic->vapic_page = page;
2784} 2839}
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2796 up_read(&vcpu->kvm->slots_lock); 2851 up_read(&vcpu->kvm->slots_lock);
2797} 2852}
2798 2853
2799static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2854static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2800{ 2855{
2801 int r; 2856 int r;
2802 2857
2803 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2804 pr_debug("vcpu %d received sipi with vector # %x\n",
2805 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2806 kvm_lapic_reset(vcpu);
2807 r = kvm_x86_ops->vcpu_reset(vcpu);
2808 if (r)
2809 return r;
2810 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2811 }
2812
2813 down_read(&vcpu->kvm->slots_lock);
2814 vapic_enter(vcpu);
2815
2816preempted:
2817 if (vcpu->guest_debug.enabled)
2818 kvm_x86_ops->guest_debug_pre(vcpu);
2819
2820again:
2821 if (vcpu->requests) 2858 if (vcpu->requests)
2822 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2859 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2823 kvm_mmu_unload(vcpu); 2860 kvm_mmu_unload(vcpu);
@@ -2829,6 +2866,8 @@ again:
2829 if (vcpu->requests) { 2866 if (vcpu->requests) {
2830 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2867 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2831 __kvm_migrate_timers(vcpu); 2868 __kvm_migrate_timers(vcpu);
2869 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2870 kvm_mmu_sync_roots(vcpu);
2832 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2871 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2833 kvm_x86_ops->tlb_flush(vcpu); 2872 kvm_x86_ops->tlb_flush(vcpu);
2834 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2873 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2893,15 @@ again:
2854 2893
2855 local_irq_disable(); 2894 local_irq_disable();
2856 2895
2857 if (vcpu->requests || need_resched()) { 2896 if (vcpu->requests || need_resched() || signal_pending(current)) {
2858 local_irq_enable(); 2897 local_irq_enable();
2859 preempt_enable(); 2898 preempt_enable();
2860 r = 1; 2899 r = 1;
2861 goto out; 2900 goto out;
2862 } 2901 }
2863 2902
2864 if (signal_pending(current)) { 2903 if (vcpu->guest_debug.enabled)
2865 local_irq_enable(); 2904 kvm_x86_ops->guest_debug_pre(vcpu);
2866 preempt_enable();
2867 r = -EINTR;
2868 kvm_run->exit_reason = KVM_EXIT_INTR;
2869 ++vcpu->stat.signal_exits;
2870 goto out;
2871 }
2872 2905
2873 vcpu->guest_mode = 1; 2906 vcpu->guest_mode = 1;
2874 /* 2907 /*
@@ -2917,8 +2950,8 @@ again:
2917 * Profile KVM exit RIPs: 2950 * Profile KVM exit RIPs:
2918 */ 2951 */
2919 if (unlikely(prof_on == KVM_PROFILING)) { 2952 if (unlikely(prof_on == KVM_PROFILING)) {
2920 kvm_x86_ops->cache_regs(vcpu); 2953 unsigned long rip = kvm_rip_read(vcpu);
2921 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); 2954 profile_hit(KVM_PROFILING, (void *)rip);
2922 } 2955 }
2923 2956
2924 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2957 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2960,63 @@ again:
2927 kvm_lapic_sync_from_vapic(vcpu); 2960 kvm_lapic_sync_from_vapic(vcpu);
2928 2961
2929 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2962 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2963out:
2964 return r;
2965}
2930 2966
2931 if (r > 0) { 2967static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2932 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2968{
2933 r = -EINTR; 2969 int r;
2934 kvm_run->exit_reason = KVM_EXIT_INTR; 2970
2935 ++vcpu->stat.request_irq_exits; 2971 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2936 goto out; 2972 pr_debug("vcpu %d received sipi with vector # %x\n",
2937 } 2973 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2938 if (!need_resched()) 2974 kvm_lapic_reset(vcpu);
2939 goto again; 2975 r = kvm_x86_ops->vcpu_reset(vcpu);
2976 if (r)
2977 return r;
2978 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2940 } 2979 }
2941 2980
2942out: 2981 down_read(&vcpu->kvm->slots_lock);
2943 up_read(&vcpu->kvm->slots_lock); 2982 vapic_enter(vcpu);
2944 if (r > 0) { 2983
2945 kvm_resched(vcpu); 2984 r = 1;
2946 down_read(&vcpu->kvm->slots_lock); 2985 while (r > 0) {
2947 goto preempted; 2986 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
2987 r = vcpu_enter_guest(vcpu, kvm_run);
2988 else {
2989 up_read(&vcpu->kvm->slots_lock);
2990 kvm_vcpu_block(vcpu);
2991 down_read(&vcpu->kvm->slots_lock);
2992 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
2993 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
2994 vcpu->arch.mp_state =
2995 KVM_MP_STATE_RUNNABLE;
2996 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2997 r = -EINTR;
2998 }
2999
3000 if (r > 0) {
3001 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3002 r = -EINTR;
3003 kvm_run->exit_reason = KVM_EXIT_INTR;
3004 ++vcpu->stat.request_irq_exits;
3005 }
3006 if (signal_pending(current)) {
3007 r = -EINTR;
3008 kvm_run->exit_reason = KVM_EXIT_INTR;
3009 ++vcpu->stat.signal_exits;
3010 }
3011 if (need_resched()) {
3012 up_read(&vcpu->kvm->slots_lock);
3013 kvm_resched(vcpu);
3014 down_read(&vcpu->kvm->slots_lock);
3015 }
3016 }
2948 } 3017 }
2949 3018
3019 up_read(&vcpu->kvm->slots_lock);
2950 post_kvm_run_save(vcpu, kvm_run); 3020 post_kvm_run_save(vcpu, kvm_run);
2951 3021
2952 vapic_exit(vcpu); 3022 vapic_exit(vcpu);
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2966 3036
2967 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3037 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2968 kvm_vcpu_block(vcpu); 3038 kvm_vcpu_block(vcpu);
3039 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
2969 r = -EAGAIN; 3040 r = -EAGAIN;
2970 goto out; 3041 goto out;
2971 } 3042 }
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2999 } 3070 }
3000 } 3071 }
3001#endif 3072#endif
3002 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 3073 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3003 kvm_x86_ops->cache_regs(vcpu); 3074 kvm_register_write(vcpu, VCPU_REGS_RAX,
3004 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 3075 kvm_run->hypercall.ret);
3005 kvm_x86_ops->decache_regs(vcpu);
3006 }
3007 3076
3008 r = __vcpu_run(vcpu, kvm_run); 3077 r = __vcpu_run(vcpu, kvm_run);
3009 3078
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3019{ 3088{
3020 vcpu_load(vcpu); 3089 vcpu_load(vcpu);
3021 3090
3022 kvm_x86_ops->cache_regs(vcpu); 3091 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3023 3092 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3024 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3093 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3025 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; 3094 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3026 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; 3095 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3027 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; 3096 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3028 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; 3097 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3029 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; 3098 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3030 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3031 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
3032#ifdef CONFIG_X86_64 3099#ifdef CONFIG_X86_64
3033 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; 3100 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3034 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; 3101 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3035 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; 3102 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3036 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; 3103 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3037 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; 3104 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3038 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; 3105 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3039 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; 3106 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3040 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; 3107 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3041#endif 3108#endif
3042 3109
3043 regs->rip = vcpu->arch.rip; 3110 regs->rip = kvm_rip_read(vcpu);
3044 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3111 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3045 3112
3046 /* 3113 /*
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3058{ 3125{
3059 vcpu_load(vcpu); 3126 vcpu_load(vcpu);
3060 3127
3061 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; 3128 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3062 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; 3129 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3063 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; 3130 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3064 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; 3131 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3065 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; 3132 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3066 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; 3133 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3067 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; 3134 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3068 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; 3135 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3069#ifdef CONFIG_X86_64 3136#ifdef CONFIG_X86_64
3070 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; 3137 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3071 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; 3138 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3072 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; 3139 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3073 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; 3140 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3074 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; 3141 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3075 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; 3142 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3076 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; 3143 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3077 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; 3144 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3145
3078#endif 3146#endif
3079 3147
3080 vcpu->arch.rip = regs->rip; 3148 kvm_rip_write(vcpu, regs->rip);
3081 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3149 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3082 3150
3083 kvm_x86_ops->decache_regs(vcpu);
3084 3151
3085 vcpu->arch.exception.pending = false; 3152 vcpu->arch.exception.pending = false;
3086 3153
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3294 return 0; 3361 return 0;
3295} 3362}
3296 3363
3364static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3365{
3366 struct kvm_segment segvar = {
3367 .base = selector << 4,
3368 .limit = 0xffff,
3369 .selector = selector,
3370 .type = 3,
3371 .present = 1,
3372 .dpl = 3,
3373 .db = 0,
3374 .s = 1,
3375 .l = 0,
3376 .g = 0,
3377 .avl = 0,
3378 .unusable = 0,
3379 };
3380 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3381 return 0;
3382}
3383
3297int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3384int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3298 int type_bits, int seg) 3385 int type_bits, int seg)
3299{ 3386{
3300 struct kvm_segment kvm_seg; 3387 struct kvm_segment kvm_seg;
3301 3388
3389 if (!(vcpu->arch.cr0 & X86_CR0_PE))
3390 return kvm_load_realmode_segment(vcpu, selector, seg);
3302 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3391 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3303 return 1; 3392 return 1;
3304 kvm_seg.type |= type_bits; 3393 kvm_seg.type |= type_bits;
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3316 struct tss_segment_32 *tss) 3405 struct tss_segment_32 *tss)
3317{ 3406{
3318 tss->cr3 = vcpu->arch.cr3; 3407 tss->cr3 = vcpu->arch.cr3;
3319 tss->eip = vcpu->arch.rip; 3408 tss->eip = kvm_rip_read(vcpu);
3320 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3409 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3321 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; 3410 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3322 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3411 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3323 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; 3412 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3324 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; 3413 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3325 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; 3414 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3326 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; 3415 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3327 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; 3416 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3328 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; 3417 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3329
3330 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3418 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3331 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3419 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3332 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3420 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3342{ 3430{
3343 kvm_set_cr3(vcpu, tss->cr3); 3431 kvm_set_cr3(vcpu, tss->cr3);
3344 3432
3345 vcpu->arch.rip = tss->eip; 3433 kvm_rip_write(vcpu, tss->eip);
3346 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3434 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3347 3435
3348 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; 3436 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3349 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; 3437 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3350 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; 3438 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3351 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; 3439 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3352 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; 3440 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3353 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; 3441 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3354 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3442 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3355 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3443 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3356 3444
3357 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3445 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3358 return 1; 3446 return 1;
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3380static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3468static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3381 struct tss_segment_16 *tss) 3469 struct tss_segment_16 *tss)
3382{ 3470{
3383 tss->ip = vcpu->arch.rip; 3471 tss->ip = kvm_rip_read(vcpu);
3384 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3472 tss->flag = kvm_x86_ops->get_rflags(vcpu);
3385 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; 3473 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3386 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; 3474 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3387 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; 3475 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3388 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; 3476 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3389 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; 3477 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3390 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; 3478 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3391 tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; 3479 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3392 tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; 3480 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3393 3481
3394 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3482 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3395 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3483 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3402static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3490static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3403 struct tss_segment_16 *tss) 3491 struct tss_segment_16 *tss)
3404{ 3492{
3405 vcpu->arch.rip = tss->ip; 3493 kvm_rip_write(vcpu, tss->ip);
3406 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3494 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3407 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; 3495 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3408 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; 3496 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3409 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; 3497 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3410 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; 3498 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3411 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; 3499 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3412 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; 3500 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3413 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3501 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3414 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3502 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3415 3503
3416 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3504 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3417 return 1; 3505 return 1;
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3534 } 3622 }
3535 3623
3536 kvm_x86_ops->skip_emulated_instruction(vcpu); 3624 kvm_x86_ops->skip_emulated_instruction(vcpu);
3537 kvm_x86_ops->cache_regs(vcpu);
3538 3625
3539 if (nseg_desc.type & 8) 3626 if (nseg_desc.type & 8)
3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3627 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3559 tr_seg.type = 11; 3646 tr_seg.type = 11;
3560 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3647 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3561out: 3648out:
3562 kvm_x86_ops->decache_regs(vcpu);
3563 return ret; 3649 return ret;
3564} 3650}
3565EXPORT_SYMBOL_GPL(kvm_task_switch); 3651EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3622 pr_debug("Set back pending irq %d\n", 3708 pr_debug("Set back pending irq %d\n",
3623 pending_vec); 3709 pending_vec);
3624 } 3710 }
3711 kvm_pic_clear_isr_ack(vcpu->kvm);
3625 } 3712 }
3626 3713
3627 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3714 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3634 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3721 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3635 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3722 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3636 3723
3724 /* Older userspace won't unhalt the vcpu on reset. */
3725 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3726 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3727 !(vcpu->arch.cr0 & X86_CR0_PE))
3728 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3729
3637 vcpu_put(vcpu); 3730 vcpu_put(vcpu);
3638 3731
3639 return 0; 3732 return 0;
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
3918 return ERR_PTR(-ENOMEM); 4011 return ERR_PTR(-ENOMEM);
3919 4012
3920 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4013 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4014 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
3921 4015
3922 return kvm; 4016 return kvm;
3923} 4017}
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
3950 4044
3951void kvm_arch_destroy_vm(struct kvm *kvm) 4045void kvm_arch_destroy_vm(struct kvm *kvm)
3952{ 4046{
4047 kvm_iommu_unmap_guest(kvm);
4048 kvm_free_all_assigned_devices(kvm);
3953 kvm_free_pit(kvm); 4049 kvm_free_pit(kvm);
3954 kfree(kvm->arch.vpic); 4050 kfree(kvm->arch.vpic);
3955 kfree(kvm->arch.vioapic); 4051 kfree(kvm->arch.vioapic);
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3981 userspace_addr = do_mmap(NULL, 0, 4077 userspace_addr = do_mmap(NULL, 0,
3982 npages * PAGE_SIZE, 4078 npages * PAGE_SIZE,
3983 PROT_READ | PROT_WRITE, 4079 PROT_READ | PROT_WRITE,
3984 MAP_SHARED | MAP_ANONYMOUS, 4080 MAP_PRIVATE | MAP_ANONYMOUS,
3985 0); 4081 0);
3986 up_write(&current->mm->mmap_sem); 4082 up_write(&current->mm->mmap_sem);
3987 4083
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000000..6a4be78a7384
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
1#ifndef ARCH_X86_KVM_X86_H
2#define ARCH_X86_KVM_X86_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{
8 vcpu->arch.exception.pending = false;
9}
10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
12{
13 vcpu->arch.interrupt.pending = true;
14 vcpu->arch.interrupt.nr = vector;
15}
16
17static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
18{
19 vcpu->arch.interrupt.pending = false;
20}
21
22#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b1..ea051173b0da 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
26#define DPRINTF(_f, _a ...) printf(_f , ## _a) 26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else 27#else
28#include <linux/kvm_host.h> 28#include <linux/kvm_host.h>
29#include "kvm_cache_regs.h"
29#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
30#endif 31#endif
31#include <linux/module.h> 32#include <linux/module.h>
@@ -46,25 +47,26 @@
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 47#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */ 48#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */ 49#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1) 50#define DstAcc (4<<1) /* Destination Accumulator */
51#define DstMask (7<<1)
50/* Source operand type. */ 52/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */ 53#define SrcNone (0<<4) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ 54#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */ 55#define SrcReg (1<<4) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */ 56#define SrcMem (2<<4) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ 57#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3) 61#define SrcMask (7<<4)
60/* Generic ModRM decode. */ 62/* Generic ModRM decode. */
61#define ModRM (1<<6) 63#define ModRM (1<<7)
62/* Destination is only written; never read. */ 64/* Destination is only written; never read. */
63#define Mov (1<<7) 65#define Mov (1<<8)
64#define BitOp (1<<8) 66#define BitOp (1<<9)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */ 67#define MemAbs (1<<10) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */ 68#define String (1<<12) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */ 69#define Stack (1<<13) /* Stack instruction (push/pop) */
68#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
69#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
70#define GroupMask 0xff /* Group number stored in bits 0:7 */ 72#define GroupMask 0xff /* Group number stored in bits 0:7 */
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
94 /* 0x20 - 0x27 */ 96 /* 0x20 - 0x27 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 SrcImmByte, SrcImm, 0, 0, 99 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
98 /* 0x28 - 0x2F */ 100 /* 0x28 - 0x2F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
106 /* 0x38 - 0x3F */ 108 /* 0x38 - 0x3F */
107 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
108 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
109 0, 0, 0, 0, 111 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
112 0, 0,
110 /* 0x40 - 0x47 */ 113 /* 0x40 - 0x47 */
111 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 114 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
112 /* 0x48 - 0x4F */ 115 /* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
153 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 156 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
154 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 157 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
155 ByteOp | ImplicitOps | String, ImplicitOps | String, 158 ByteOp | ImplicitOps | String, ImplicitOps | String,
156 /* 0xB0 - 0xBF */ 159 /* 0xB0 - 0xB7 */
157 0, 0, 0, 0, 0, 0, 0, 0, 160 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
158 DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, 161 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
162 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
163 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
164 /* 0xB8 - 0xBF */
165 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
166 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
167 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
168 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
159 /* 0xC0 - 0xC7 */ 169 /* 0xC0 - 0xC7 */
160 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 170 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
161 0, ImplicitOps | Stack, 0, 0, 171 0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
169 /* 0xD8 - 0xDF */ 179 /* 0xD8 - 0xDF */
170 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0,
171 /* 0xE0 - 0xE7 */ 181 /* 0xE0 - 0xE7 */
172 0, 0, 0, 0, 0, 0, 0, 0, 182 0, 0, 0, 0,
183 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
184 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
173 /* 0xE8 - 0xEF */ 185 /* 0xE8 - 0xEF */
174 ImplicitOps | Stack, SrcImm | ImplicitOps, 186 ImplicitOps | Stack, SrcImm | ImplicitOps,
175 ImplicitOps, SrcImmByte | ImplicitOps, 187 ImplicitOps, SrcImmByte | ImplicitOps,
176 0, 0, 0, 0, 188 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
189 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
177 /* 0xF0 - 0xF7 */ 190 /* 0xF0 - 0xF7 */
178 0, 0, 0, 0, 191 0, 0, 0, 0,
179 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 192 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
180 /* 0xF8 - 0xFF */ 193 /* 0xF8 - 0xFF */
181 ImplicitOps, 0, ImplicitOps, ImplicitOps, 194 ImplicitOps, 0, ImplicitOps, ImplicitOps,
182 0, 0, Group | Group4, Group | Group5, 195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
183}; 196};
184 197
185static u16 twobyte_table[256] = { 198static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
268 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 281 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
269 0, 0, 0, 0, 282 0, 0, 0, 0,
270 [Group3*8] = 283 [Group3*8] =
271 DstMem | SrcImm | ModRM | SrcImm, 0, 284 DstMem | SrcImm | ModRM, 0,
272 DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 285 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
273 0, 0, 0, 0, 286 0, 0, 0, 0,
274 [Group4*8] = 287 [Group4*8] =
275 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 288 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
276 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0,
277 [Group5*8] = 290 [Group5*8] =
278 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 291 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
279 SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, 292 SrcMem | ModRM | Stack, 0,
293 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
280 [Group7*8] = 294 [Group7*8] =
281 0, 0, ModRM | SrcMem, ModRM | SrcMem, 295 0, 0, ModRM | SrcMem, ModRM | SrcMem,
282 SrcNone | ModRM | DstMem | Mov, 0, 296 SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
839 /* Shadow copy of register state. Committed on successful emulation. */ 853 /* Shadow copy of register state. Committed on successful emulation. */
840 854
841 memset(c, 0, sizeof(struct decode_cache)); 855 memset(c, 0, sizeof(struct decode_cache));
842 c->eip = ctxt->vcpu->arch.rip; 856 c->eip = kvm_rip_read(ctxt->vcpu);
843 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 857 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
844 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 858 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
845 859
@@ -1048,6 +1062,23 @@ done_prefixes:
1048 } 1062 }
1049 c->dst.type = OP_MEM; 1063 c->dst.type = OP_MEM;
1050 break; 1064 break;
1065 case DstAcc:
1066 c->dst.type = OP_REG;
1067 c->dst.bytes = c->op_bytes;
1068 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1069 switch (c->op_bytes) {
1070 case 1:
1071 c->dst.val = *(u8 *)c->dst.ptr;
1072 break;
1073 case 2:
1074 c->dst.val = *(u16 *)c->dst.ptr;
1075 break;
1076 case 4:
1077 c->dst.val = *(u32 *)c->dst.ptr;
1078 break;
1079 }
1080 c->dst.orig_val = c->dst.val;
1081 break;
1051 } 1082 }
1052 1083
1053 if (c->rip_relative) 1084 if (c->rip_relative)
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1151 case 1: /* dec */ 1182 case 1: /* dec */
1152 emulate_1op("dec", c->dst, ctxt->eflags); 1183 emulate_1op("dec", c->dst, ctxt->eflags);
1153 break; 1184 break;
1185 case 2: /* call near abs */ {
1186 long int old_eip;
1187 old_eip = c->eip;
1188 c->eip = c->src.val;
1189 c->src.val = old_eip;
1190 emulate_push(ctxt);
1191 break;
1192 }
1154 case 4: /* jmp abs */ 1193 case 4: /* jmp abs */
1155 c->eip = c->src.val; 1194 c->eip = c->src.val;
1156 break; 1195 break;
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1251 u64 msr_data; 1290 u64 msr_data;
1252 unsigned long saved_eip = 0; 1291 unsigned long saved_eip = 0;
1253 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
1293 unsigned int port;
1294 int io_dir_in;
1254 int rc = 0; 1295 int rc = 0;
1255 1296
1256 /* Shadow copy of register state. Committed on successful emulation. 1297 /* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1267 if (c->rep_prefix && (c->d & String)) { 1308 if (c->rep_prefix && (c->d & String)) {
1268 /* All REP prefixes have the same first termination condition */ 1309 /* All REP prefixes have the same first termination condition */
1269 if (c->regs[VCPU_REGS_RCX] == 0) { 1310 if (c->regs[VCPU_REGS_RCX] == 0) {
1270 ctxt->vcpu->arch.rip = c->eip; 1311 kvm_rip_write(ctxt->vcpu, c->eip);
1271 goto done; 1312 goto done;
1272 } 1313 }
1273 /* The second termination condition only applies for REPE 1314 /* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1281 (c->b == 0xae) || (c->b == 0xaf)) { 1322 (c->b == 0xae) || (c->b == 0xaf)) {
1282 if ((c->rep_prefix == REPE_PREFIX) && 1323 if ((c->rep_prefix == REPE_PREFIX) &&
1283 ((ctxt->eflags & EFLG_ZF) == 0)) { 1324 ((ctxt->eflags & EFLG_ZF) == 0)) {
1284 ctxt->vcpu->arch.rip = c->eip; 1325 kvm_rip_write(ctxt->vcpu, c->eip);
1285 goto done; 1326 goto done;
1286 } 1327 }
1287 if ((c->rep_prefix == REPNE_PREFIX) && 1328 if ((c->rep_prefix == REPNE_PREFIX) &&
1288 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 1329 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1289 ctxt->vcpu->arch.rip = c->eip; 1330 kvm_rip_write(ctxt->vcpu, c->eip);
1290 goto done; 1331 goto done;
1291 } 1332 }
1292 } 1333 }
1293 c->regs[VCPU_REGS_RCX]--; 1334 c->regs[VCPU_REGS_RCX]--;
1294 c->eip = ctxt->vcpu->arch.rip; 1335 c->eip = kvm_rip_read(ctxt->vcpu);
1295 } 1336 }
1296 1337
1297 if (c->src.type == OP_MEM) { 1338 if (c->src.type == OP_MEM) {
@@ -1351,27 +1392,10 @@ special_insn:
1351 sbb: /* sbb */ 1392 sbb: /* sbb */
1352 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1393 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1353 break; 1394 break;
1354 case 0x20 ... 0x23: 1395 case 0x20 ... 0x25:
1355 and: /* and */ 1396 and: /* and */
1356 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1397 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1357 break; 1398 break;
1358 case 0x24: /* and al imm8 */
1359 c->dst.type = OP_REG;
1360 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1361 c->dst.val = *(u8 *)c->dst.ptr;
1362 c->dst.bytes = 1;
1363 c->dst.orig_val = c->dst.val;
1364 goto and;
1365 case 0x25: /* and ax imm16, or eax imm32 */
1366 c->dst.type = OP_REG;
1367 c->dst.bytes = c->op_bytes;
1368 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1369 if (c->op_bytes == 2)
1370 c->dst.val = *(u16 *)c->dst.ptr;
1371 else
1372 c->dst.val = *(u32 *)c->dst.ptr;
1373 c->dst.orig_val = c->dst.val;
1374 goto and;
1375 case 0x28 ... 0x2d: 1399 case 0x28 ... 0x2d:
1376 sub: /* sub */ 1400 sub: /* sub */
1377 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); 1401 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
1659 case 0xae ... 0xaf: /* scas */ 1683 case 0xae ... 0xaf: /* scas */
1660 DPRINTF("Urk! I don't handle SCAS.\n"); 1684 DPRINTF("Urk! I don't handle SCAS.\n");
1661 goto cannot_emulate; 1685 goto cannot_emulate;
1662 case 0xb8: /* mov r, imm */ 1686 case 0xb0 ... 0xbf: /* mov r, imm */
1663 goto mov; 1687 goto mov;
1664 case 0xc0 ... 0xc1: 1688 case 0xc0 ... 0xc1:
1665 emulate_grp2(ctxt); 1689 emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
1679 c->src.val = c->regs[VCPU_REGS_RCX]; 1703 c->src.val = c->regs[VCPU_REGS_RCX];
1680 emulate_grp2(ctxt); 1704 emulate_grp2(ctxt);
1681 break; 1705 break;
1706 case 0xe4: /* inb */
1707 case 0xe5: /* in */
1708 port = insn_fetch(u8, 1, c->eip);
1709 io_dir_in = 1;
1710 goto do_io;
1711 case 0xe6: /* outb */
1712 case 0xe7: /* out */
1713 port = insn_fetch(u8, 1, c->eip);
1714 io_dir_in = 0;
1715 goto do_io;
1682 case 0xe8: /* call (near) */ { 1716 case 0xe8: /* call (near) */ {
1683 long int rel; 1717 long int rel;
1684 switch (c->op_bytes) { 1718 switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
1729 jmp_rel(c, c->src.val); 1763 jmp_rel(c, c->src.val);
1730 c->dst.type = OP_NONE; /* Disable writeback. */ 1764 c->dst.type = OP_NONE; /* Disable writeback. */
1731 break; 1765 break;
1766 case 0xec: /* in al,dx */
1767 case 0xed: /* in (e/r)ax,dx */
1768 port = c->regs[VCPU_REGS_RDX];
1769 io_dir_in = 1;
1770 goto do_io;
1771 case 0xee: /* out al,dx */
1772 case 0xef: /* out (e/r)ax,dx */
1773 port = c->regs[VCPU_REGS_RDX];
1774 io_dir_in = 0;
1775 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
1776 (c->d & ByteOp) ? 1 : c->op_bytes,
1777 port) != 0) {
1778 c->eip = saved_eip;
1779 goto cannot_emulate;
1780 }
1781 return 0;
1732 case 0xf4: /* hlt */ 1782 case 0xf4: /* hlt */
1733 ctxt->vcpu->arch.halt_request = 1; 1783 ctxt->vcpu->arch.halt_request = 1;
1734 break; 1784 break;
@@ -1754,6 +1804,14 @@ special_insn:
1754 ctxt->eflags |= X86_EFLAGS_IF; 1804 ctxt->eflags |= X86_EFLAGS_IF;
1755 c->dst.type = OP_NONE; /* Disable writeback. */ 1805 c->dst.type = OP_NONE; /* Disable writeback. */
1756 break; 1806 break;
1807 case 0xfc: /* cld */
1808 ctxt->eflags &= ~EFLG_DF;
1809 c->dst.type = OP_NONE; /* Disable writeback. */
1810 break;
1811 case 0xfd: /* std */
1812 ctxt->eflags |= EFLG_DF;
1813 c->dst.type = OP_NONE; /* Disable writeback. */
1814 break;
1757 case 0xfe ... 0xff: /* Grp4/Grp5 */ 1815 case 0xfe ... 0xff: /* Grp4/Grp5 */
1758 rc = emulate_grp45(ctxt, ops); 1816 rc = emulate_grp45(ctxt, ops);
1759 if (rc != 0) 1817 if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
1768 1826
1769 /* Commit shadow register state. */ 1827 /* Commit shadow register state. */
1770 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 1828 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1771 ctxt->vcpu->arch.rip = c->eip; 1829 kvm_rip_write(ctxt->vcpu, c->eip);
1772 1830
1773done: 1831done:
1774 if (rc == X86EMUL_UNHANDLEABLE) { 1832 if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
1793 goto done; 1851 goto done;
1794 1852
1795 /* Let the processor re-execute the fixed hypercall */ 1853 /* Let the processor re-execute the fixed hypercall */
1796 c->eip = ctxt->vcpu->arch.rip; 1854 c->eip = kvm_rip_read(ctxt->vcpu);
1797 /* Disable writeback. */ 1855 /* Disable writeback. */
1798 c->dst.type = OP_NONE; 1856 c->dst.type = OP_NONE;
1799 break; 1857 break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
1889 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 1947 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1890 if (rc) { 1948 if (rc) {
1891 kvm_inject_gp(ctxt->vcpu, 0); 1949 kvm_inject_gp(ctxt->vcpu, 0);
1892 c->eip = ctxt->vcpu->arch.rip; 1950 c->eip = kvm_rip_read(ctxt->vcpu);
1893 } 1951 }
1894 rc = X86EMUL_CONTINUE; 1952 rc = X86EMUL_CONTINUE;
1895 c->dst.type = OP_NONE; 1953 c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
1899 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 1957 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1900 if (rc) { 1958 if (rc) {
1901 kvm_inject_gp(ctxt->vcpu, 0); 1959 kvm_inject_gp(ctxt->vcpu, 0);
1902 c->eip = ctxt->vcpu->arch.rip; 1960 c->eip = kvm_rip_read(ctxt->vcpu);
1903 } else { 1961 } else {
1904 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 1962 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1905 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 1963 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 65f0b8a47bed..48ee4f9435f4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -582,7 +582,7 @@ static void __init lguest_init_IRQ(void)
582 for (i = 0; i < LGUEST_IRQS; i++) { 582 for (i = 0; i < LGUEST_IRQS; i++) {
583 int vector = FIRST_EXTERNAL_VECTOR + i; 583 int vector = FIRST_EXTERNAL_VECTOR + i;
584 if (vector != SYSCALL_VECTOR) { 584 if (vector != SYSCALL_VECTOR) {
585 set_intr_gate(vector, interrupt[i]); 585 set_intr_gate(vector, interrupt[vector]);
586 set_irq_chip_and_handler_name(i, &lguest_irq_controller, 586 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
587 handle_level_irq, 587 handle_level_irq,
588 "level"); 588 "level");
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index df37fc9d6a26..3c3b471ea496 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -41,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
41 { } 41 { }
42}; 42};
43 43
44static cpumask_t vector_allocation_domain(int cpu)
45{
46 return cpumask_of_cpu(cpu);
47}
44 48
45static int probe_bigsmp(void) 49static int probe_bigsmp(void)
46{ 50{
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 520cca0ee04e..28459cab3ddb 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -47,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
47/* Hook from generic ACPI tables.c */ 47/* Hook from generic ACPI tables.c */
48static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 48static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
49{ 49{
50 unsigned long oem_addr; 50 unsigned long oem_addr = 0;
51 int check_dsdt;
52 int ret = 0;
53
54 /* check dsdt at first to avoid clear fix_map for oem_addr */
55 check_dsdt = es7000_check_dsdt();
56
51 if (!find_unisys_acpi_oem_table(&oem_addr)) { 57 if (!find_unisys_acpi_oem_table(&oem_addr)) {
52 if (es7000_check_dsdt()) 58 if (check_dsdt)
53 return parse_unisys_oem((char *)oem_addr); 59 ret = parse_unisys_oem((char *)oem_addr);
54 else { 60 else {
55 setup_unisys(); 61 setup_unisys();
56 return 1; 62 ret = 1;
57 } 63 }
64 /*
65 * we need to unmap it
66 */
67 unmap_unisys_acpi_oem_table(oem_addr);
58 } 68 }
59 return 0; 69 return ret;
60} 70}
61#else 71#else
62static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 72static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -65,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
65} 75}
66#endif 76#endif
67 77
78static cpumask_t vector_allocation_domain(int cpu)
79{
80 /* Careful. Some cpus do not strictly honor the set of cpus
81 * specified in the interrupt destination when using lowest
82 * priority interrupt delivery mode.
83 *
84 * In particular there was a hyperthreading cpu observed to
85 * deliver interrupts to the wrong hyperthread when only one
86 * hyperthread was specified in the interrupt desitination.
87 */
88 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
89 return domain;
90}
91
68struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); 92struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8cf58394975e..71a309b122e6 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
38 return 0; 38 return 0;
39} 39}
40 40
41static cpumask_t vector_allocation_domain(int cpu)
42{
43 /* Careful. Some cpus do not strictly honor the set of cpus
44 * specified in the interrupt destination when using lowest
45 * priority interrupt delivery mode.
46 *
47 * In particular there was a hyperthreading cpu observed to
48 * deliver interrupts to the wrong hyperthread when only one
49 * hyperthread was specified in the interrupt desitination.
50 */
51 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
52 return domain;
53}
54
41struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); 55struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 6ad6b67a723d..6272b5e69da6 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -23,4 +23,18 @@ static int probe_summit(void)
23 return 0; 23 return 0;
24} 24}
25 25
26static cpumask_t vector_allocation_domain(int cpu)
27{
28 /* Careful. Some cpus do not strictly honor the set of cpus
29 * specified in the interrupt destination when using lowest
30 * priority interrupt delivery mode.
31 *
32 * In particular there was a hyperthreading cpu observed to
33 * deliver interrupts to the wrong hyperthread when only one
34 * hyperthread was specified in the interrupt desitination.
35 */
36 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
37 return domain;
38}
39
26struct genapic apic_summit = APIC_INIT("summit", probe_summit); 40struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 199a5f4a873c..0f6e8a6523ae 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq)
1483 * the interrupt off to another CPU */ 1483 * the interrupt off to another CPU */
1484static void before_handle_vic_irq(unsigned int irq) 1484static void before_handle_vic_irq(unsigned int irq)
1485{ 1485{
1486 irq_desc_t *desc = irq_desc + irq; 1486 irq_desc_t *desc = irq_to_desc(irq);
1487 __u8 cpu = smp_processor_id(); 1487 __u8 cpu = smp_processor_id();
1488 1488
1489 _raw_spin_lock(&vic_irq_lock); 1489 _raw_spin_lock(&vic_irq_lock);
@@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq)
1518/* Finish the VIC interrupt: basically mask */ 1518/* Finish the VIC interrupt: basically mask */
1519static void after_handle_vic_irq(unsigned int irq) 1519static void after_handle_vic_irq(unsigned int irq)
1520{ 1520{
1521 irq_desc_t *desc = irq_desc + irq; 1521 irq_desc_t *desc = irq_to_desc(irq);
1522 1522
1523 _raw_spin_lock(&vic_irq_lock); 1523 _raw_spin_lock(&vic_irq_lock);
1524 { 1524 {
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index dfb932dcf136..59f89b434b45 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 15
16ifeq ($(CONFIG_X86_32),y) 16obj-$(CONFIG_NUMA) += numa_$(BITS).o
17obj-$(CONFIG_NUMA) += discontig_32.o
18else
19obj-$(CONFIG_NUMA) += numa_64.o
20obj-$(CONFIG_K8_NUMA) += k8topology_64.o 17obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 18obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 19
24obj-$(CONFIG_MEMTEST) += memtest.o 20obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a742d753d5b0..31e8730fa246 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -592,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
592 unsigned long flags; 592 unsigned long flags;
593#endif 593#endif
594 594
595 /*
596 * We can fault from pretty much anywhere, with unknown IRQ state.
597 */
598 trace_hardirqs_fixup();
599
600 tsk = current; 595 tsk = current;
601 mm = tsk->mm; 596 mm = tsk->mm;
602 prefetchw(&mm->mmap_sem); 597 prefetchw(&mm->mmap_sem);
@@ -645,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
645 } 640 }
646 641
647 642
648#ifdef CONFIG_X86_32
649 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
650 fault has been handled. */
651 if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
652 local_irq_enable();
653
654 /* 643 /*
655 * If we're in an interrupt, have no user context or are running in an 644 * It's safe to allow irq's after cr2 has been saved and the
656 * atomic region then we must not take the fault. 645 * vmalloc fault has been handled.
646 *
647 * User-mode registers count as a user access even for any
648 * potential system fault or CPU buglet.
657 */ 649 */
658 if (in_atomic() || !mm) 650 if (user_mode_vm(regs)) {
659 goto bad_area_nosemaphore; 651 local_irq_enable();
660#else /* CONFIG_X86_64 */ 652 error_code |= PF_USER;
661 if (likely(regs->flags & X86_EFLAGS_IF)) 653 } else if (regs->flags & X86_EFLAGS_IF)
662 local_irq_enable(); 654 local_irq_enable();
663 655
656#ifdef CONFIG_X86_64
664 if (unlikely(error_code & PF_RSVD)) 657 if (unlikely(error_code & PF_RSVD))
665 pgtable_bad(address, regs, error_code); 658 pgtable_bad(address, regs, error_code);
659#endif
666 660
667 /* 661 /*
668 * If we're in an interrupt, have no user context or are running in an 662 * If we're in an interrupt, have no user context or are running in an
@@ -671,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
671 if (unlikely(in_atomic() || !mm)) 665 if (unlikely(in_atomic() || !mm))
672 goto bad_area_nosemaphore; 666 goto bad_area_nosemaphore;
673 667
674 /*
675 * User-mode registers count as a user access even for any
676 * potential system fault or CPU buglet.
677 */
678 if (user_mode_vm(regs))
679 error_code |= PF_USER;
680again: 668again:
681#endif 669 /*
682 /* When running in the kernel we expect faults to occur only to 670 * When running in the kernel we expect faults to occur only to
683 * addresses in user space. All other faults represent errors in the 671 * addresses in user space. All other faults represent errors in the
684 * kernel and should generate an OOPS. Unfortunately, in the case of an 672 * kernel and should generate an OOPS. Unfortunately, in the case of an
685 * erroneous fault occurring in a code path which already holds mmap_sem 673 * erroneous fault occurring in a code path which already holds mmap_sem
@@ -742,9 +730,6 @@ good_area:
742 goto bad_area; 730 goto bad_area;
743 } 731 }
744 732
745#ifdef CONFIG_X86_32
746survive:
747#endif
748 /* 733 /*
749 * If for any reason at all we couldn't handle the fault, 734 * If for any reason at all we couldn't handle the fault,
750 * make sure we exit gracefully rather than endlessly redo 735 * make sure we exit gracefully rather than endlessly redo
@@ -879,12 +864,11 @@ out_of_memory:
879 up_read(&mm->mmap_sem); 864 up_read(&mm->mmap_sem);
880 if (is_global_init(tsk)) { 865 if (is_global_init(tsk)) {
881 yield(); 866 yield();
882#ifdef CONFIG_X86_32 867 /*
883 down_read(&mm->mmap_sem); 868 * Re-lookup the vma - in theory the vma tree might
884 goto survive; 869 * have changed:
885#else 870 */
886 goto again; 871 goto again;
887#endif
888 } 872 }
889 873
890 printk("VM: killing process %s\n", tsk->comm); 874 printk("VM: killing process %s\n", tsk->comm);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 007bb06c7504..4ba373c5b8c8 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
82 pte_t pte = gup_get_pte(ptep); 82 pte_t pte = gup_get_pte(ptep);
83 struct page *page; 83 struct page *page;
84 84
85 if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { 85 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep); 86 pte_unmap(ptep);
87 return 0; 87 return 0;
88 } 88 }
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
116 mask = _PAGE_PRESENT|_PAGE_USER; 116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write) 117 if (write)
118 mask |= _PAGE_RW; 118 mask |= _PAGE_RW;
119 if ((pte_val(pte) & mask) != mask) 119 if ((pte_flags(pte) & mask) != mask)
120 return 0; 120 return 0;
121 /* hugepages are never "special" */ 121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); 122 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124 124
125 refs = 0; 125 refs = 0;
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
173 mask = _PAGE_PRESENT|_PAGE_USER; 173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write) 174 if (write)
175 mask |= _PAGE_RW; 175 mask |= _PAGE_RW;
176 if ((pte_val(pte) & mask) != mask) 176 if ((pte_flags(pte) & mask) != mask)
177 return 0; 177 return 0;
178 /* hugepages are never "special" */ 178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); 179 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181 181
182 refs = 0; 182 refs = 0;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9af..bcc079c282dd 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
137 137
138 return (void*) vaddr; 138 return (void*) vaddr;
139} 139}
140EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
140 141
141struct page *kmap_atomic_to_page(void *ptr) 142struct page *kmap_atomic_to_page(void *ptr)
142{ 143{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bbe044dbe014..8396868e82c5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -558,7 +558,7 @@ void zap_low_mappings(void)
558 558
559int nx_enabled; 559int nx_enabled;
560 560
561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
562EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
563 563
564#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3e10054c5731..b8e461d49412 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -89,7 +89,7 @@ early_param("gbpages", parse_direct_gbpages_on);
89 89
90int after_bootmem; 90int after_bootmem;
91 91
92unsigned long __supported_pte_mask __read_mostly = ~0UL; 92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 93EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 94
95static int do_not_nx __cpuinitdata; 95static int do_not_nx __cpuinitdata;
@@ -196,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
196 } 196 }
197 197
198 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
199 if (!pte_none(*pte) && pte_val(new_pte) &&
200 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
201 pte_ERROR(*pte);
202 set_pte(pte, new_pte); 199 set_pte(pte, new_pte);
203 200
204 /* 201 /*
@@ -313,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
313 if (pfn >= table_top) 310 if (pfn >= table_top)
314 panic("alloc_low_page: ran out of memory"); 311 panic("alloc_low_page: ran out of memory");
315 312
316 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
317 memset(adr, 0, PAGE_SIZE); 314 memset(adr, 0, PAGE_SIZE);
318 *phys = pfn * PAGE_SIZE; 315 *phys = pfn * PAGE_SIZE;
319 return adr; 316 return adr;
@@ -749,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
749 old_start = mr[i].start; 746 old_start = mr[i].start;
750 memmove(&mr[i], &mr[i+1], 747 memmove(&mr[i], &mr[i+1],
751 (nr_range - 1 - i) * sizeof (struct map_range)); 748 (nr_range - 1 - i) * sizeof (struct map_range));
752 mr[i].start = old_start; 749 mr[i--].start = old_start;
753 nr_range--; 750 nr_range--;
754 } 751 }
755 752
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 8cbeda15cd29..ae71e11eb3e5 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -45,6 +45,27 @@ unsigned long __phys_addr(unsigned long x)
45} 45}
46EXPORT_SYMBOL(__phys_addr); 46EXPORT_SYMBOL(__phys_addr);
47 47
48bool __virt_addr_valid(unsigned long x)
49{
50 if (x >= __START_KERNEL_map) {
51 x -= __START_KERNEL_map;
52 if (x >= KERNEL_IMAGE_SIZE)
53 return false;
54 x += phys_base;
55 } else {
56 if (x < PAGE_OFFSET)
57 return false;
58 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ?
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false;
62 }
63 }
64
65 return pfn_valid(x >> PAGE_SHIFT);
66}
67EXPORT_SYMBOL(__virt_addr_valid);
68
48#else 69#else
49 70
50static inline int phys_addr_valid(unsigned long addr) 71static inline int phys_addr_valid(unsigned long addr)
@@ -56,13 +77,24 @@ static inline int phys_addr_valid(unsigned long addr)
56unsigned long __phys_addr(unsigned long x) 77unsigned long __phys_addr(unsigned long x)
57{ 78{
58 /* VMALLOC_* aren't constants; not available at the boot time */ 79 /* VMALLOC_* aren't constants; not available at the boot time */
59 VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING && 80 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
60 is_vmalloc_addr((void *)x))); 81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
82 is_vmalloc_addr((void *) x));
61 return x - PAGE_OFFSET; 83 return x - PAGE_OFFSET;
62} 84}
63EXPORT_SYMBOL(__phys_addr); 85EXPORT_SYMBOL(__phys_addr);
64#endif 86#endif
65 87
88bool __virt_addr_valid(unsigned long x)
89{
90 if (x < PAGE_OFFSET)
91 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
93 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95}
96EXPORT_SYMBOL(__virt_addr_valid);
97
66#endif 98#endif
67 99
68int page_is_ram(unsigned long pagenr) 100int page_is_ram(unsigned long pagenr)
@@ -188,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
188 return (__force void __iomem *)phys_to_virt(phys_addr); 220 return (__force void __iomem *)phys_to_virt(phys_addr);
189 221
190 /* 222 /*
223 * Check if the request spans more than any BAR in the iomem resource
224 * tree.
225 */
226 WARN_ON(iomem_map_sanity_check(phys_addr, size));
227
228 /*
191 * Don't allow anybody to remap normal RAM that we're using.. 229 * Don't allow anybody to remap normal RAM that we're using..
192 */ 230 */
193 for (pfn = phys_addr >> PAGE_SHIFT; 231 for (pfn = phys_addr >> PAGE_SHIFT;
@@ -242,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
242 switch (prot_val) { 280 switch (prot_val) {
243 case _PAGE_CACHE_UC: 281 case _PAGE_CACHE_UC:
244 default: 282 default:
245 prot = PAGE_KERNEL_NOCACHE; 283 prot = PAGE_KERNEL_IO_NOCACHE;
246 break; 284 break;
247 case _PAGE_CACHE_UC_MINUS: 285 case _PAGE_CACHE_UC_MINUS:
248 prot = PAGE_KERNEL_UC_MINUS; 286 prot = PAGE_KERNEL_IO_UC_MINUS;
249 break; 287 break;
250 case _PAGE_CACHE_WC: 288 case _PAGE_CACHE_WC:
251 prot = PAGE_KERNEL_WC; 289 prot = PAGE_KERNEL_IO_WC;
252 break; 290 break;
253 case _PAGE_CACHE_WB: 291 case _PAGE_CACHE_WB:
254 prot = PAGE_KERNEL; 292 prot = PAGE_KERNEL_IO;
255 break; 293 break;
256 } 294 }
257 295
@@ -568,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
568} 606}
569 607
570static inline void __init early_set_fixmap(enum fixed_addresses idx, 608static inline void __init early_set_fixmap(enum fixed_addresses idx,
571 unsigned long phys) 609 unsigned long phys, pgprot_t prot)
572{ 610{
573 if (after_paging_init) 611 if (after_paging_init)
574 set_fixmap(idx, phys); 612 __set_fixmap(idx, phys, prot);
575 else 613 else
576 __early_set_fixmap(idx, phys, PAGE_KERNEL); 614 __early_set_fixmap(idx, phys, prot);
577} 615}
578 616
579static inline void __init early_clear_fixmap(enum fixed_addresses idx) 617static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -584,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
584 __early_set_fixmap(idx, 0, __pgprot(0)); 622 __early_set_fixmap(idx, 0, __pgprot(0));
585} 623}
586 624
587 625static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
588static int __initdata early_ioremap_nested; 626static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
589
590static int __init check_early_ioremap_leak(void) 627static int __init check_early_ioremap_leak(void)
591{ 628{
592 if (!early_ioremap_nested) 629 int count = 0;
630 int i;
631
632 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
633 if (prev_map[i])
634 count++;
635
636 if (!count)
593 return 0; 637 return 0;
594 WARN(1, KERN_WARNING 638 WARN(1, KERN_WARNING
595 "Debug warning: early ioremap leak of %d areas detected.\n", 639 "Debug warning: early ioremap leak of %d areas detected.\n",
596 early_ioremap_nested); 640 count);
597 printk(KERN_WARNING 641 printk(KERN_WARNING
598 "please boot with early_ioremap_debug and report the dmesg.\n"); 642 "please boot with early_ioremap_debug and report the dmesg.\n");
599 643
@@ -601,18 +645,33 @@ static int __init check_early_ioremap_leak(void)
601} 645}
602late_initcall(check_early_ioremap_leak); 646late_initcall(check_early_ioremap_leak);
603 647
604void __init *early_ioremap(unsigned long phys_addr, unsigned long size) 648static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
605{ 649{
606 unsigned long offset, last_addr; 650 unsigned long offset, last_addr;
607 unsigned int nrpages, nesting; 651 unsigned int nrpages;
608 enum fixed_addresses idx0, idx; 652 enum fixed_addresses idx0, idx;
653 int i, slot;
609 654
610 WARN_ON(system_state != SYSTEM_BOOTING); 655 WARN_ON(system_state != SYSTEM_BOOTING);
611 656
612 nesting = early_ioremap_nested; 657 slot = -1;
658 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
659 if (!prev_map[i]) {
660 slot = i;
661 break;
662 }
663 }
664
665 if (slot < 0) {
666 printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
667 phys_addr, size);
668 WARN_ON(1);
669 return NULL;
670 }
671
613 if (early_ioremap_debug) { 672 if (early_ioremap_debug) {
614 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", 673 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
615 phys_addr, size, nesting); 674 phys_addr, size, slot);
616 dump_stack(); 675 dump_stack();
617 } 676 }
618 677
@@ -623,11 +682,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
623 return NULL; 682 return NULL;
624 } 683 }
625 684
626 if (nesting >= FIX_BTMAPS_NESTING) { 685 prev_size[slot] = size;
627 WARN_ON(1);
628 return NULL;
629 }
630 early_ioremap_nested++;
631 /* 686 /*
632 * Mappings have to be page-aligned 687 * Mappings have to be page-aligned
633 */ 688 */
@@ -647,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
647 /* 702 /*
648 * Ok, go for it.. 703 * Ok, go for it..
649 */ 704 */
650 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 705 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
651 idx = idx0; 706 idx = idx0;
652 while (nrpages > 0) { 707 while (nrpages > 0) {
653 early_set_fixmap(idx, phys_addr); 708 early_set_fixmap(idx, phys_addr, prot);
654 phys_addr += PAGE_SIZE; 709 phys_addr += PAGE_SIZE;
655 --idx; 710 --idx;
656 --nrpages; 711 --nrpages;
@@ -658,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
658 if (early_ioremap_debug) 713 if (early_ioremap_debug)
659 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 714 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
660 715
661 return (void *) (offset + fix_to_virt(idx0)); 716 prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
717 return prev_map[slot];
718}
719
720/* Remap an IO device */
721void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
722{
723 return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
724}
725
726/* Remap memory */
727void __init *early_memremap(unsigned long phys_addr, unsigned long size)
728{
729 return __early_ioremap(phys_addr, size, PAGE_KERNEL);
662} 730}
663 731
664void __init early_iounmap(void *addr, unsigned long size) 732void __init early_iounmap(void *addr, unsigned long size)
@@ -667,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size)
667 unsigned long offset; 735 unsigned long offset;
668 unsigned int nrpages; 736 unsigned int nrpages;
669 enum fixed_addresses idx; 737 enum fixed_addresses idx;
670 int nesting; 738 int i, slot;
739
740 slot = -1;
741 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
742 if (prev_map[i] == addr) {
743 slot = i;
744 break;
745 }
746 }
671 747
672 nesting = --early_ioremap_nested; 748 if (slot < 0) {
673 if (WARN_ON(nesting < 0)) 749 printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
750 addr, size);
751 WARN_ON(1);
752 return;
753 }
754
755 if (prev_size[slot] != size) {
756 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
757 addr, size, slot, prev_size[slot]);
758 WARN_ON(1);
674 return; 759 return;
760 }
675 761
676 if (early_ioremap_debug) { 762 if (early_ioremap_debug) {
677 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 763 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
678 size, nesting); 764 size, slot);
679 dump_stack(); 765 dump_stack();
680 } 766 }
681 767
@@ -687,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size)
687 offset = virt_addr & ~PAGE_MASK; 773 offset = virt_addr & ~PAGE_MASK;
688 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 774 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
689 775
690 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 776 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
691 while (nrpages > 0) { 777 while (nrpages > 0) {
692 early_clear_fixmap(idx); 778 early_clear_fixmap(idx);
693 --idx; 779 --idx;
694 --nrpages; 780 --nrpages;
695 } 781 }
782 prev_map[slot] = 0;
696} 783}
697 784
698void __this_fixmap_does_not_exist(void) 785void __this_fixmap_does_not_exist(void)
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 635b50e85581..2c4baa88f2cb 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
56static DEFINE_PER_CPU(struct trap_reason, pf_reason); 56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); 57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58 58
59#if 0 /* XXX: no way gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex); 59static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock); 60static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled; 61static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
75 * and trace_lock. 68 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock. 69 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock. 70 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed. 71 * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true. 72 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */ 73 */
81 74
@@ -97,44 +90,6 @@ static bool is_enabled(void)
97 return atomic_read(&mmiotrace_enabled); 90 return atomic_read(&mmiotrace_enabled);
98} 91}
99 92
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address) 93static void print_pte(unsigned long address)
139{ 94{
140 unsigned int level; 95 unsigned int level;
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
307 map.map_id = trace->id; 262 map.map_id = trace->id;
308 263
309 spin_lock_irq(&trace_lock); 264 spin_lock_irq(&trace_lock);
310 if (!is_enabled()) 265 if (!is_enabled()) {
266 kfree(trace);
311 goto not_enabled; 267 goto not_enabled;
268 }
312 269
313 mmio_trace_mapping(&map); 270 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list); 271 list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
377 iounmap_trace_core(addr); 334 iounmap_trace_core(addr);
378} 335}
379 336
337int mmiotrace_printk(const char *fmt, ...)
338{
339 int ret = 0;
340 va_list args;
341 unsigned long flags;
342 va_start(args, fmt);
343
344 spin_lock_irqsave(&trace_lock, flags);
345 if (is_enabled())
346 ret = mmio_trace_printk(fmt, args);
347 spin_unlock_irqrestore(&trace_lock, flags);
348
349 va_end(args);
350 return ret;
351}
352EXPORT_SYMBOL(mmiotrace_printk);
353
380static void clear_trace_list(void) 354static void clear_trace_list(void)
381{ 355{
382 struct remap_trace *trace; 356 struct remap_trace *trace;
@@ -462,26 +436,12 @@ static void leave_uniprocessor(void)
462} 436}
463#endif 437#endif
464 438
465#if 0 /* XXX: out of order */
466static struct file_operations fops_marker = {
467 .owner = THIS_MODULE,
468 .write = write_marker
469};
470#endif
471
472void enable_mmiotrace(void) 439void enable_mmiotrace(void)
473{ 440{
474 mutex_lock(&mmiotrace_mutex); 441 mutex_lock(&mmiotrace_mutex);
475 if (is_enabled()) 442 if (is_enabled())
476 goto out; 443 goto out;
477 444
478#if 0 /* XXX: tracing does not support text entries */
479 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
480 &fops_marker);
481 if (!marker_file)
482 pr_err(NAME "marker file creation failed.\n");
483#endif
484
485 if (nommiotrace) 445 if (nommiotrace)
486 pr_info(NAME "MMIO tracing disabled.\n"); 446 pr_info(NAME "MMIO tracing disabled.\n");
487 enter_uniprocessor(); 447 enter_uniprocessor();
@@ -506,11 +466,6 @@ void disable_mmiotrace(void)
506 466
507 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 467 clear_trace_list(); /* guarantees: no more kmmio callbacks */
508 leave_uniprocessor(); 468 leave_uniprocessor();
509 if (marker_file) {
510 debugfs_remove(marker_file);
511 marker_file = NULL;
512 }
513
514 pr_info(NAME "disabled.\n"); 469 pr_info(NAME "disabled.\n");
515out: 470out:
516 mutex_unlock(&mmiotrace_mutex); 471 mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 847c164725f4..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a9ec89c3fbca..407d8784f669 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
792 /* Must avoid aliasing mappings in the highmem code */ 792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused(); 793 kmap_flush_unused();
794 794
795 vm_unmap_aliases();
796
795 cpa.vaddr = addr; 797 cpa.vaddr = addr;
796 cpa.numpages = numpages; 798 cpa.numpages = numpages;
797 cpa.mask_set = mask_set; 799 cpa.mask_set = mask_set;
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911e20ca..df3d5c861cda 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 };
79static unsigned int mw64[] = { 0x89, 0x8B }; 79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */ 80#endif /* not __i386__ */
81 81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, 82struct prefix_bits {
83 int *rexr) 83 unsigned shorted:1;
84 unsigned enlarged:1;
85 unsigned rexr:1;
86 unsigned rex:1;
87};
88
89static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
84{ 90{
85 int i; 91 int i;
86 unsigned char *p = addr; 92 unsigned char *p = addr;
87 *shorted = 0; 93 prf->shorted = 0;
88 *enlarged = 0; 94 prf->enlarged = 0;
89 *rexr = 0; 95 prf->rexr = 0;
96 prf->rex = 0;
90 97
91restart: 98restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { 99 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) { 100 if (*p == prefix_codes[i]) {
94 if (*p == 0x66) 101 if (*p == 0x66)
95 *shorted = 1; 102 prf->shorted = 1;
96#ifdef __amd64__ 103#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48) 104 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1; 105 prf->enlarged = 1;
99 if ((*p & 0xf4) == 0x44) 106 if ((*p & 0xf4) == 0x44)
100 *rexr = 1; 107 prf->rexr = 1;
108 if ((*p & 0xf0) == 0x40)
109 prf->rex = 1;
101#endif 110#endif
102 p++; 111 p++;
103 goto restart; 112 goto restart;
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr)
135{ 144{
136 unsigned int opcode; 145 unsigned int opcode;
137 unsigned char *p; 146 unsigned char *p;
138 int shorted, enlarged, rexr; 147 struct prefix_bits prf;
139 int i; 148 int i;
140 enum reason_type rv = OTHERS; 149 enum reason_type rv = OTHERS;
141 150
142 p = (unsigned char *)ins_addr; 151 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr); 152 p += skip_prefix(p, &prf);
144 p += get_opcode(p, &opcode); 153 p += get_opcode(p, &opcode);
145 154
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ); 155 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{ 165{
157 unsigned int opcode; 166 unsigned int opcode;
158 unsigned char *p; 167 unsigned char *p;
159 int i, shorted, enlarged, rexr; 168 struct prefix_bits prf;
169 int i;
160 170
161 p = (unsigned char *)ins_addr; 171 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr); 172 p += skip_prefix(p, &prf);
163 p += get_opcode(p, &opcode); 173 p += get_opcode(p, &opcode);
164 174
165 for (i = 0; i < ARRAY_SIZE(rw8); i++) 175 for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
168 178
169 for (i = 0; i < ARRAY_SIZE(rw32); i++) 179 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode) 180 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4)); 181 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
172 182
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 183 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0; 184 return 0;
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
178{ 188{
179 unsigned int opcode; 189 unsigned int opcode;
180 unsigned char *p; 190 unsigned char *p;
181 int i, shorted, enlarged, rexr; 191 struct prefix_bits prf;
192 int i;
182 193
183 p = (unsigned char *)ins_addr; 194 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr); 195 p += skip_prefix(p, &prf);
185 p += get_opcode(p, &opcode); 196 p += get_opcode(p, &opcode);
186 197
187 for (i = 0; i < ARRAY_SIZE(mw8); i++) 198 for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
194 205
195 for (i = 0; i < ARRAY_SIZE(mw32); i++) 206 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode) 207 if (mw32[i] == opcode)
197 return shorted ? 2 : 4; 208 return prf.shorted ? 2 : 4;
198 209
199 for (i = 0; i < ARRAY_SIZE(mw64); i++) 210 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode) 211 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4); 212 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
202 213
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 214 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0; 215 return 0;
@@ -238,7 +249,7 @@ enum {
238#endif 249#endif
239}; 250};
240 251
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs) 252static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
242{ 253{
243 unsigned char *rv = NULL; 254 unsigned char *rv = NULL;
244 255
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
255 case arg_DL: 266 case arg_DL:
256 rv = (unsigned char *)&regs->dx; 267 rv = (unsigned char *)&regs->dx;
257 break; 268 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__ 269#ifdef __amd64__
271 case arg_R8: 270 case arg_R8:
272 rv = (unsigned char *)&regs->r8; 271 rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
294 break; 293 break;
295#endif 294#endif
296 default: 295 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break; 296 break;
299 } 297 }
298
299 if (rv)
300 return rv;
301
302 if (rex) {
303 /*
304 * If REX prefix exists, access low bytes of SI etc.
305 * instead of AH etc.
306 */
307 switch (no) {
308 case arg_SI:
309 rv = (unsigned char *)&regs->si;
310 break;
311 case arg_DI:
312 rv = (unsigned char *)&regs->di;
313 break;
314 case arg_BP:
315 rv = (unsigned char *)&regs->bp;
316 break;
317 case arg_SP:
318 rv = (unsigned char *)&regs->sp;
319 break;
320 default:
321 break;
322 }
323 } else {
324 switch (no) {
325 case arg_AH:
326 rv = 1 + (unsigned char *)&regs->ax;
327 break;
328 case arg_BH:
329 rv = 1 + (unsigned char *)&regs->bx;
330 break;
331 case arg_CH:
332 rv = 1 + (unsigned char *)&regs->cx;
333 break;
334 case arg_DH:
335 rv = 1 + (unsigned char *)&regs->dx;
336 break;
337 default:
338 break;
339 }
340 }
341
342 if (!rv)
343 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
344
300 return rv; 345 return rv;
301} 346}
302 347
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
368 unsigned char mod_rm; 413 unsigned char mod_rm;
369 int reg; 414 int reg;
370 unsigned char *p; 415 unsigned char *p;
371 int i, shorted, enlarged, rexr; 416 struct prefix_bits prf;
417 int i;
372 unsigned long rv; 418 unsigned long rv;
373 419
374 p = (unsigned char *)ins_addr; 420 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr); 421 p += skip_prefix(p, &prf);
376 p += get_opcode(p, &opcode); 422 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 423 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) { 424 if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
392 438
393do_work: 439do_work:
394 mod_rm = *p; 440 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); 441 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) { 442 switch (get_ins_reg_width(ins_addr)) {
397 case 1: 443 case 1:
398 return *get_reg_w8(reg, regs); 444 return *get_reg_w8(reg, prf.rex, regs);
399 445
400 case 2: 446 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs); 447 return *(unsigned short *)get_reg_w32(reg, regs);
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
422 unsigned char mod_rm; 468 unsigned char mod_rm;
423 unsigned char mod; 469 unsigned char mod;
424 unsigned char *p; 470 unsigned char *p;
425 int i, shorted, enlarged, rexr; 471 struct prefix_bits prf;
472 int i;
426 unsigned long rv; 473 unsigned long rv;
427 474
428 p = (unsigned char *)ins_addr; 475 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr); 476 p += skip_prefix(p, &prf);
430 p += get_opcode(p, &opcode); 477 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 478 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) { 479 if (imm_wop[i] == opcode) {
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea9..51c0a2fc14fe 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
138 return; 138 return;
139 } 139 }
140 140
141 if (is_uv_system()) 141 if (get_uv_system_type() >= UV_X2APIC)
142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
143 else 143 else
144 apic_id = pa->apic_id; 144 apic_id = pa->apic_id;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423ef..ab50a8d7402c 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/io.h> 5#include <linux/io.h>
6#include <linux/mmiotrace.h>
6 7
7#define MODULE_NAME "testmmiotrace" 8#define MODULE_NAME "testmmiotrace"
8 9
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
13static void do_write_test(void __iomem *p) 14static void do_write_test(void __iomem *p)
14{ 15{
15 unsigned int i; 16 unsigned int i;
17 mmiotrace_printk("Write test.\n");
16 for (i = 0; i < 256; i++) 18 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i); 19 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2) 20 for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
24static void do_read_test(void __iomem *p) 26static void do_read_test(void __iomem *p)
25{ 27{
26 unsigned int i; 28 unsigned int i;
29 mmiotrace_printk("Read test.\n");
27 for (i = 0; i < 256; i++) 30 for (i = 0; i < 256; i++)
28 ioread8(p + i); 31 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2) 32 for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 42 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return; 43 return;
41 } 44 }
45 mmiotrace_printk("ioremap returned %p.\n", p);
42 do_write_test(p); 46 do_write_test(p);
43 do_read_test(p); 47 do_read_test(p);
44 iounmap(p); 48 iounmap(p);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 4bdaa590375d..3c27a809393b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); 511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); 512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); 513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
514
515/*
516 * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
517 * confusing the PCI engine:
518 */
519static void sb600_disable_hpet_bar(struct pci_dev *dev)
520{
521 u8 val;
522
523 /*
524 * The SB600 and SB700 both share the same device
525 * ID, but the PM register 0x55 does something different
526 * for the SB700, so make sure we are dealing with the
527 * SB600 before touching the bit:
528 */
529
530 pci_read_config_byte(dev, 0x08, &val);
531
532 if (val < 0x2F) {
533 outb(0x55, 0xCD6);
534 val = inb(0xCD7);
535
536 /* Set bit 7 in PM register 0x55 */
537 outb(0x55, 0xCD6);
538 outb(val | 0x80, 0xCD7);
539 }
540}
541DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 006599db0dc7..bf69dbe08bff 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
493 if (pirq <= 4) 493 if (pirq <= 4)
494 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
495 dev_info(&dev->dev, 495 dev_info(&dev->dev,
496 "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", 496 "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
497 dev->vendor, dev->device, pirq, irq); 497 dev->vendor, dev->device, pirq, irq);
498 return irq; 498 return irq;
499} 499}
@@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
502{ 502{
503 dev_info(&dev->dev, 503 dev_info(&dev->dev,
504 "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", 504 "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq); 505 dev->vendor, dev->device, pirq, irq);
506 if (pirq <= 4) 506 if (pirq <= 4)
507 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
@@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
590 case PCI_DEVICE_ID_INTEL_ICH10_1: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
591 case PCI_DEVICE_ID_INTEL_ICH10_2: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
592 case PCI_DEVICE_ID_INTEL_ICH10_3: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
593 case PCI_DEVICE_ID_INTEL_PCH_0:
594 case PCI_DEVICE_ID_INTEL_PCH_1:
595 r->name = "PIIX/ICH"; 593 r->name = "PIIX/ICH";
596 r->get = pirq_piix_get; 594 r->get = pirq_piix_get;
597 r->set = pirq_piix_set; 595 r->set = pirq_piix_set;
598 return 1; 596 return 1;
599 } 597 }
598
599 if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) &&
600 (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
601 r->name = "PIIX/ICH";
602 r->get = pirq_piix_get;
603 r->set = pirq_piix_set;
604 return 1;
605 }
606
600 return 0; 607 return 0;
601} 608}
602 609
@@ -823,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r)
823 r->get = NULL; 830 r->get = NULL;
824 r->set = NULL; 831 r->set = NULL;
825 832
826 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 833 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
827 rt->rtr_vendor, rt->rtr_device); 834 rt->rtr_vendor, rt->rtr_device);
828 835
829 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); 836 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
@@ -843,7 +850,7 @@ static void __init pirq_find_router(struct irq_router *r)
843 h->probe(r, pirq_router_dev, pirq_router_dev->device)) 850 h->probe(r, pirq_router_dev, pirq_router_dev->device))
844 break; 851 break;
845 } 852 }
846 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", 853 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
847 pirq_router.name, 854 pirq_router.name,
848 pirq_router_dev->vendor, pirq_router_dev->device); 855 pirq_router_dev->vendor, pirq_router_dev->device);
849 856
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0013a729b41d..b61534c7a4c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
871 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
872 this page */ 872 this page */
873 kmap_flush_unused(); 873 kmap_flush_unused();
874 vm_unmap_aliases();
874 } 875 }
875} 876}
876 877
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 28b85ab8422e..bb042608c602 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -21,7 +21,6 @@ void xen_force_evtchn_callback(void)
21 21
22static void __init __xen_init_IRQ(void) 22static void __init __xen_init_IRQ(void)
23{ 23{
24#ifdef CONFIG_X86_64
25 int i; 24 int i;
26 25
27 /* Create identity vector->irq map */ 26 /* Create identity vector->irq map */
@@ -31,7 +30,6 @@ static void __init __xen_init_IRQ(void)
31 for_each_possible_cpu(cpu) 30 for_each_possible_cpu(cpu)
32 per_cpu(vector_irq, cpu)[i] = i; 31 per_cpu(vector_irq, cpu)[i] = i;
33 } 32 }
34#endif /* CONFIG_X86_64 */
35 33
36 xen_init_IRQ(); 34 xen_init_IRQ();
37} 35}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ae173f6edd8b..d4d52f5a1cf7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
846 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
847 xen_mc_issue(0); 847 xen_mc_issue(0);
848 kmap_flush_unused(); 848 kmap_flush_unused();
849 vm_unmap_aliases();
849 xen_mc_batch(); 850 xen_mc_batch();
850 } 851 }
851 852
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index dd71e3a021cd..5601506f2dd9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); 241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ 242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243 243
244 kstat_this_cpu.irqs[irq]++; 244 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
245 245
246out: 246out:
247 raw_local_irq_restore(flags); 247 raw_local_irq_restore(flags);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 004ba86326ae..c9f7cda48ed7 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
198/* Get the TSC speed from Xen */ 198/* Get the TSC speed from Xen */
199unsigned long xen_tsc_khz(void) 199unsigned long xen_tsc_khz(void)
200{ 200{
201 u64 xen_khz = 1000000ULL << 32; 201 struct pvclock_vcpu_time_info *info =
202 const struct pvclock_vcpu_time_info *info =
203 &HYPERVISOR_shared_info->vcpu_info[0].time; 202 &HYPERVISOR_shared_info->vcpu_info[0].time;
204 203
205 do_div(xen_khz, info->tsc_to_system_mul); 204 return pvclock_tsc_khz(info);
206 if (info->tsc_shift < 0)
207 xen_khz <<= -info->tsc_shift;
208 else
209 xen_khz >>= info->tsc_shift;
210
211 return xen_khz;
212} 205}
213 206
214cycle_t xen_clocksource_read(void) 207cycle_t xen_clocksource_read(void)