aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig11
-rw-r--r--arch/x86/Kconfig.cpu16
-rw-r--r--arch/x86/Kconfig.debug23
-rw-r--r--arch/x86/Makefile3
-rw-r--r--arch/x86/Makefile_32.cpu9
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S3
-rw-r--r--arch/x86/boot/video.c6
-rw-r--r--arch/x86/ia32/ia32entry.S3
-rw-r--r--arch/x86/ia32/sys_ia32.c56
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/a.out-core.h10
-rw-r--r--arch/x86/include/asm/acpi.h3
-rw-r--r--arch/x86/include/asm/alternative-asm.h10
-rw-r--r--arch/x86/include/asm/alternative.h1
-rw-r--r--arch/x86/include/asm/amd_iommu.h15
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h38
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h54
-rw-r--r--arch/x86/include/asm/apic.h21
-rw-r--r--arch/x86/include/asm/apicdef.h6
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/bug.h4
-rw-r--r--arch/x86/include/asm/cache.h7
-rw-r--r--arch/x86/include/asm/cacheflush.h2
-rw-r--r--arch/x86/include/asm/calgary.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h218
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h234
-rw-r--r--arch/x86/include/asm/debugreg.h33
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/device.h2
-rw-r--r--arch/x86/include/asm/dma-mapping.h15
-rw-r--r--arch/x86/include/asm/e820.h23
-rw-r--r--arch/x86/include/asm/elf.h20
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/gart.h9
-rw-r--r--arch/x86/include/asm/hardirq.h8
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h73
-rw-r--r--arch/x86/include/asm/hw_irq.h34
-rw-r--r--arch/x86/include/asm/inat.h220
-rw-r--r--arch/x86/include/asm/inat_types.h29
-rw-r--r--arch/x86/include/asm/insn.h184
-rw-r--r--arch/x86/include/asm/iommu.h2
-rw-r--r--arch/x86/include/asm/irq.h3
-rw-r--r--arch/x86/include/asm/irq_vectors.h2
-rw-r--r--arch/x86/include/asm/k8.h5
-rw-r--r--arch/x86/include/asm/kvm.h30
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h34
-rw-r--r--arch/x86/include/asm/mce.h14
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mpspec.h27
-rw-r--r--arch/x86/include/asm/msr.h8
-rw-r--r--arch/x86/include/asm/page_types.h3
-rw-r--r--arch/x86/include/asm/perf_event.h13
-rw-r--r--arch/x86/include/asm/pgtable.h6
-rw-r--r--arch/x86/include/asm/processor.h16
-rw-r--r--arch/x86/include/asm/proto.h17
-rw-r--r--arch/x86/include/asm/ptrace.h62
-rw-r--r--arch/x86/include/asm/sections.h6
-rw-r--r--arch/x86/include/asm/string_32.h9
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/include/asm/swiotlb.h9
-rw-r--r--arch/x86/include/asm/sys_ia32.h5
-rw-r--r--arch/x86/include/asm/system.h31
-rw-r--r--arch/x86/include/asm/thread_info.h7
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/uaccess.h1
-rw-r--r--arch/x86/include/asm/uaccess_32.h27
-rw-r--r--arch/x86/include/asm/uaccess_64.h36
-rw-r--r--arch/x86/include/asm/unistd_32.h3
-rw-r--r--arch/x86/include/asm/unistd_64.h2
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h14
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/x86_init.h14
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/processor.c3
-rw-r--r--arch/x86/kernel/acpi/sleep.c24
-rw-r--r--arch/x86/kernel/amd_iommu.c1245
-rw-r--r--arch/x86/kernel/amd_iommu_init.c118
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c34
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c13
-rw-r--r--arch/x86/kernel/apic/es7000_32.c16
-rw-r--r--arch/x86/kernel/apic/io_apic.c364
-rw-r--r--arch/x86/kernel/apic/nmi.c11
-rw-r--r--arch/x86/kernel/apic/numaq_32.c18
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c27
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c36
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c23
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c19
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c21
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c109
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c53
-rw-r--r--arch/x86/kernel/cpu/perf_event.c205
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpuid.c17
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/dumpstack.c7
-rw-r--r--arch/x86/kernel/dumpstack_32.c9
-rw-r--r--arch/x86/kernel/dumpstack_64.c46
-rw-r--r--arch/x86/kernel/entry_32.S31
-rw-r--r--arch/x86/kernel/entry_64.S29
-rw-r--r--arch/x86/kernel/ftrace.c101
-rw-r--r--arch/x86/kernel/head_32.S18
-rw-r--r--arch/x86/kernel/head_64.S7
-rw-r--r--arch/x86/kernel/hw_breakpoint.c555
-rw-r--r--arch/x86/kernel/irq.c122
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kgdb.c9
-rw-r--r--arch/x86/kernel/kprobes.c257
-rw-r--r--arch/x86/kernel/machine_kexec_32.c8
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c89
-rw-r--r--arch/x86/kernel/microcode_core.c10
-rw-r--r--arch/x86/kernel/mpparse.c44
-rw-r--r--arch/x86/kernel/msr.c16
-rw-r--r--arch/x86/kernel/pci-calgary_64.c94
-rw-r--r--arch/x86/kernel/pci-dma.c45
-rw-r--r--arch/x86/kernel/pci-gart_64.c156
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c18
-rw-r--r--arch/x86/kernel/process.c23
-rw-r--r--arch/x86/kernel/process_32.c10
-rw-r--r--arch/x86/kernel/process_64.c47
-rw-r--r--arch/x86/kernel/ptrace.c415
-rw-r--r--arch/x86/kernel/quirks.c9
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c1
-rw-r--r--arch/x86/kernel/setup.c120
-rw-r--r--arch/x86/kernel/signal.c12
-rw-r--r--arch/x86/kernel/smpboot.c9
-rw-r--r--arch/x86/kernel/syscall_table_32.S1
-rw-r--r--arch/x86/kernel/tlb_uv.c4
-rw-r--r--arch/x86/kernel/traps.c73
-rw-r--r--arch/x86/kernel/tsc_sync.c13
-rw-r--r--arch/x86/kernel/uv_irq.c239
-rw-r--r--arch/x86/kernel/uv_time.c80
-rw-r--r--arch/x86/kernel/visws_quirks.c10
-rw-r--r--arch/x86/kernel/vmlinux.lds.S38
-rw-r--r--arch/x86/kernel/vsyscall_64.c2
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c5
-rw-r--r--arch/x86/kernel/x86_init.c10
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c159
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/i8259.c44
-rw-r--r--arch/x86/kvm/irq.h7
-rw-r--r--arch/x86/kvm/lapic.c8
-rw-r--r--arch/x86/kvm/mmu.c3
-rw-r--r--arch/x86/kvm/paging_tmpl.h1
-rw-r--r--arch/x86/kvm/svm.c331
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c448
-rw-r--r--arch/x86/kvm/x86.c572
-rw-r--r--arch/x86/lib/.gitignore1
-rw-r--r--arch/x86/lib/Makefile13
-rw-r--r--arch/x86/lib/copy_user_64.S14
-rw-r--r--arch/x86/lib/inat.c90
-rw-r--r--arch/x86/lib/insn.c516
-rw-r--r--arch/x86/lib/msr.c46
-rw-r--r--arch/x86/lib/usercopy_32.c10
-rw-r--r--arch/x86/lib/x86-opcode-map.txt893
-rw-r--r--arch/x86/mm/extable.c31
-rw-r--r--arch/x86/mm/fault.c13
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_32.c10
-rw-r--r--arch/x86/mm/init_64.c35
-rw-r--r--arch/x86/mm/ioremap.c50
-rw-r--r--arch/x86/mm/k8topology_64.c101
-rw-r--r--arch/x86/mm/kmmio.c8
-rw-r--r--arch/x86/mm/numa_32.c4
-rw-r--r--arch/x86/mm/numa_64.c252
-rw-r--r--arch/x86/mm/pageattr.c22
-rw-r--r--arch/x86/mm/pat.c20
-rw-r--r--arch/x86/mm/setup_nx.c59
-rw-r--r--arch/x86/mm/srat_64.c33
-rw-r--r--arch/x86/mm/testmmiotrace.c29
-rw-r--r--arch/x86/mm/tlb.c3
-rw-r--r--arch/x86/power/cpu.c26
-rw-r--r--arch/x86/tools/Makefile31
-rw-r--r--arch/x86/tools/chkobjdump.awk23
-rw-r--r--arch/x86/tools/distill.awk47
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk380
-rw-r--r--arch/x86/tools/test_get_len.c173
-rw-r--r--arch/x86/vdso/vdso32-setup.c1
-rw-r--r--arch/x86/xen/enlighten.c15
204 files changed, 8699 insertions, 3413 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 72ace9515a07..32a1918e1b88 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -49,7 +49,9 @@ config X86
49 select HAVE_KERNEL_GZIP 49 select HAVE_KERNEL_GZIP
50 select HAVE_KERNEL_BZIP2 50 select HAVE_KERNEL_BZIP2
51 select HAVE_KERNEL_LZMA 51 select HAVE_KERNEL_LZMA
52 select HAVE_HW_BREAKPOINT
52 select HAVE_ARCH_KMEMCHECK 53 select HAVE_ARCH_KMEMCHECK
54 select HAVE_USER_RETURN_NOTIFIER
53 55
54config OUTPUT_FORMAT 56config OUTPUT_FORMAT
55 string 57 string
@@ -1330,7 +1332,9 @@ config MATH_EMULATION
1330 kernel, it won't hurt. 1332 kernel, it won't hurt.
1331 1333
1332config MTRR 1334config MTRR
1333 bool "MTRR (Memory Type Range Register) support" 1335 bool
1336 default y
1337 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1334 ---help--- 1338 ---help---
1335 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1339 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1336 the Memory Type Range Registers (MTRRs) may be used to control 1340 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1396,7 +1400,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1396 1400
1397config X86_PAT 1401config X86_PAT
1398 bool 1402 bool
1399 prompt "x86 PAT support" 1403 default y
1404 prompt "x86 PAT support" if EMBEDDED
1400 depends on MTRR 1405 depends on MTRR
1401 ---help--- 1406 ---help---
1402 Use PAT attributes to setup page level cache control. 1407 Use PAT attributes to setup page level cache control.
@@ -1602,7 +1607,7 @@ config COMPAT_VDSO
1602 depends on X86_32 || IA32_EMULATION 1607 depends on X86_32 || IA32_EMULATION
1603 ---help--- 1608 ---help---
1604 Map the 32-bit VDSO to the predictable old-style address too. 1609 Map the 32-bit VDSO to the predictable old-style address too.
1605 ---help--- 1610
1606 Say N here if you are running a sufficiently recent glibc 1611 Say N here if you are running a sufficiently recent glibc
1607 version (2.3.3 or later), to remove the high-mapped 1612 version (2.3.3 or later), to remove the high-mapped
1608 VDSO mapping and to exclusively use the randomized VDSO. 1613 VDSO mapping and to exclusively use the randomized VDSO.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2649840d888f..08e442bc3ab9 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
301 301
302# 302#
303# Define implied options from the CPU selection here 303# Define implied options from the CPU selection here
304config X86_L1_CACHE_BYTES 304config X86_INTERNODE_CACHE_SHIFT
305 int 305 int
306 default "128" if MPSC 306 default "12" if X86_VSMP
307 default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 307 default "7" if NUMA
308 308 default X86_L1_CACHE_SHIFT
309config X86_INTERNODE_CACHE_BYTES
310 int
311 default "4096" if X86_VSMP
312 default X86_L1_CACHE_BYTES if !X86_VSMP
313 309
314config X86_CMPXCHG 310config X86_CMPXCHG
315 def_bool X86_64 || (X86_32 && !M386) 311 def_bool X86_64 || (X86_32 && !M386)
@@ -317,9 +313,9 @@ config X86_CMPXCHG
317config X86_L1_CACHE_SHIFT 313config X86_L1_CACHE_SHIFT
318 int 314 int
319 default "7" if MPENTIUM4 || MPSC 315 default "7" if MPENTIUM4 || MPSC
316 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 317 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 318 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
323 319
324config X86_XADD 320config X86_XADD
325 def_bool y 321 def_bool y
@@ -406,7 +402,7 @@ config X86_CMPXCHG64
406# generates cmov. 402# generates cmov.
407config X86_CMOV 403config X86_CMOV
408 def_bool y 404 def_bool y
409 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) 405 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
410 406
411config X86_MINIMUM_CPU_FAMILY 407config X86_MINIMUM_CPU_FAMILY
412 int 408 int
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..731318e5ac1d 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
186config HAVE_MMIOTRACE_SUPPORT 186config HAVE_MMIOTRACE_SUPPORT
187 def_bool y 187 def_bool y
188 188
189config X86_DECODER_SELFTEST
190 bool "x86 instruction decoder selftest"
191 depends on DEBUG_KERNEL
192 ---help---
193 Perform x86 instruction decoder selftests at build time.
194 This option is useful for checking the sanity of x86 instruction
195 decoder code.
196 If unsure, say "N".
197
189# 198#
190# IO delay types: 199# IO delay types:
191# 200#
@@ -287,4 +296,18 @@ config OPTIMIZE_INLINING
287 296
288 If unsure, say N. 297 If unsure, say N.
289 298
299config DEBUG_STRICT_USER_COPY_CHECKS
300 bool "Strict copy size checks"
301 depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
302 ---help---
303 Enabling this option turns a certain set of sanity checks for user
304 copy operations into compile time failures.
305
306 The copy_from_user() etc checks are there to help test if there
307 are sufficient security checks on the length argument of
308 the copy operation, by having gcc prove that the argument is
309 within bounds.
310
311 If unsure, or if you run an older (pre 4.4) gcc, say N.
312
290endmenu 313endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d2d24c9ee64d..78b32be55e9e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -155,6 +155,9 @@ all: bzImage
155KBUILD_IMAGE := $(boot)/bzImage 155KBUILD_IMAGE := $(boot)/bzImage
156 156
157bzImage: vmlinux 157bzImage: vmlinux
158ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
159 $(Q)$(MAKE) $(build)=arch/x86/tools posttest
160endif
158 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 161 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
159 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 162 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
160 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ 163 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 30e9a264f69d..1255d953c65d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -41,11 +41,18 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486
41 41
42# Geode GX1 support 42# Geode GX1 support
43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx 43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
44 44cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
45# add at the end to overwrite eventual tuning options from earlier 45# add at the end to overwrite eventual tuning options from earlier
46# cpu entries 46# cpu entries
47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
48 48
49# Work around the pentium-mmx code generator madness of gcc4.4.x which
50# does stack alignment by generating horrible code _before_ the mcount
51# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
52# tracer assumptions. For i686, generic, core2 this is set by the
53# compiler anyway
54cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
55
49# Bug fix for binutils: this option is required in order to keep 56# Bug fix for binutils: this option is required in order to keep
50# binutils from generating NOPL instructions against our will. 57# binutils from generating NOPL instructions against our will.
51ifneq ($(CONFIG_X86_P6_NOP),y) 58ifneq ($(CONFIG_X86_P6_NOP),y)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
107 lgdt gdt(%ebp) 107 lgdt gdt(%ebp)
108 108
109 /* Enable PAE mode */ 109 /* Enable PAE mode */
110 xorl %eax, %eax 110 movl $(X86_CR4_PAE), %eax
111 orl $(X86_CR4_PAE), %eax
112 movl %eax, %cr4 111 movl %eax, %cr4
113 112
114 /* 113 /*
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
4 4
5#undef i386 5#undef i386
6 6
7#include <asm/cache.h>
7#include <asm/page_types.h> 8#include <asm/page_types.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
46 *(.data.*) 47 *(.data.*)
47 _edata = . ; 48 _edata = . ;
48 } 49 }
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 50 . = ALIGN(L1_CACHE_BYTES);
50 .bss : { 51 .bss : {
51 _bss = . ; 52 _bss = . ;
52 *(.bss) 53 *(.bss)
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..f767164cd5df 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
27 27
28 boot_params.screen_info.orig_x = oreg.dl; 28 boot_params.screen_info.orig_x = oreg.dl;
29 boot_params.screen_info.orig_y = oreg.dh; 29 boot_params.screen_info.orig_y = oreg.dh;
30
31 if (oreg.ch & 0x20)
32 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
33
34 if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
35 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
30} 36}
31 37
32static void store_video_mode(void) 38static void store_video_mode(void)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 581b0568fe19..4eefdca9832b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -653,7 +653,7 @@ ia32_sys_call_table:
653 .quad compat_sys_writev 653 .quad compat_sys_writev
654 .quad sys_getsid 654 .quad sys_getsid
655 .quad sys_fdatasync 655 .quad sys_fdatasync
656 .quad sys32_sysctl /* sysctl */ 656 .quad compat_sys_sysctl /* sysctl */
657 .quad sys_mlock /* 150 */ 657 .quad sys_mlock /* 150 */
658 .quad sys_munlock 658 .quad sys_munlock
659 .quad sys_mlockall 659 .quad sys_mlockall
@@ -841,4 +841,5 @@ ia32_sys_call_table:
841 .quad compat_sys_pwritev 841 .quad compat_sys_pwritev
842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
843 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg
844ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..df82c0e48ded 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -434,62 +434,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
434 return ret; 434 return ret;
435} 435}
436 436
437#ifdef CONFIG_SYSCTL_SYSCALL
438struct sysctl_ia32 {
439 unsigned int name;
440 int nlen;
441 unsigned int oldval;
442 unsigned int oldlenp;
443 unsigned int newval;
444 unsigned int newlen;
445 unsigned int __unused[4];
446};
447
448
449asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
450{
451 struct sysctl_ia32 a32;
452 mm_segment_t old_fs = get_fs();
453 void __user *oldvalp, *newvalp;
454 size_t oldlen;
455 int __user *namep;
456 long ret;
457
458 if (copy_from_user(&a32, args32, sizeof(a32)))
459 return -EFAULT;
460
461 /*
462 * We need to pre-validate these because we have to disable
463 * address checking before calling do_sysctl() because of
464 * OLDLEN but we can't run the risk of the user specifying bad
465 * addresses here. Well, since we're dealing with 32 bit
466 * addresses, we KNOW that access_ok() will always succeed, so
467 * this is an expensive NOP, but so what...
468 */
469 namep = compat_ptr(a32.name);
470 oldvalp = compat_ptr(a32.oldval);
471 newvalp = compat_ptr(a32.newval);
472
473 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
474 || !access_ok(VERIFY_WRITE, namep, 0)
475 || !access_ok(VERIFY_WRITE, oldvalp, 0)
476 || !access_ok(VERIFY_WRITE, newvalp, 0))
477 return -EFAULT;
478
479 set_fs(KERNEL_DS);
480 lock_kernel();
481 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
482 newvalp, (size_t) a32.newlen);
483 unlock_kernel();
484 set_fs(old_fs);
485
486 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
487 return -EFAULT;
488
489 return ret;
490}
491#endif
492
493/* warning: next two assume little endian */ 437/* warning: next two assume little endian */
494asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, 438asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
495 u32 poslo, u32 poshi) 439 u32 poslo, u32 poshi)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..9f828f87ca35 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
10header-y += sigcontext32.h 10header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h
13 14
14unifdef-y += e820.h 15unifdef-y += e820.h
15unifdef-y += ist.h 16unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa84..7a15588e45d4 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
17 17
18#include <linux/user.h> 18#include <linux/user.h>
19#include <linux/elfcore.h> 19#include <linux/elfcore.h>
20#include <asm/debugreg.h>
20 21
21/* 22/*
22 * fill in the user structure for an a.out core dump 23 * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
32 >> PAGE_SHIFT; 33 >> PAGE_SHIFT;
33 dump->u_dsize -= dump->u_tsize; 34 dump->u_dsize -= dump->u_tsize;
34 dump->u_ssize = 0; 35 dump->u_ssize = 0;
35 dump->u_debugreg[0] = current->thread.debugreg0; 36 aout_dump_debugregs(dump);
36 dump->u_debugreg[1] = current->thread.debugreg1;
37 dump->u_debugreg[2] = current->thread.debugreg2;
38 dump->u_debugreg[3] = current->thread.debugreg3;
39 dump->u_debugreg[4] = 0;
40 dump->u_debugreg[5] = 0;
41 dump->u_debugreg[6] = current->thread.debugreg6;
42 dump->u_debugreg[7] = current->thread.debugreg7;
43 37
44 if (dump->start_stack < TASK_SIZE) 38 if (dump->start_stack < TASK_SIZE)
45 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) 39 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..60d2b2db0bc5 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
118extern unsigned long acpi_wakeup_address; 118extern unsigned long acpi_wakeup_address;
119 119
120/* early initialization routine */ 120/* early initialization routine */
121extern void acpi_reserve_bootmem(void); 121extern void acpi_reserve_wakeup_memory(void);
122 122
123/* 123/*
124 * Check if the CPU can handle C2 and deeper 124 * Check if the CPU can handle C2 and deeper
@@ -158,6 +158,7 @@ struct bootnode;
158 158
159#ifdef CONFIG_ACPI_NUMA 159#ifdef CONFIG_ACPI_NUMA
160extern int acpi_numa; 160extern int acpi_numa;
161extern int acpi_get_nodes(struct bootnode *physnodes);
161extern int acpi_scan_nodes(unsigned long start, unsigned long end); 162extern int acpi_scan_nodes(unsigned long start, unsigned long end);
162#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 163#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
163extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 164extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e2077d343c33..b97f786a48d5 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -1,17 +1,13 @@
1#ifdef __ASSEMBLY__ 1#ifdef __ASSEMBLY__
2 2
3#ifdef CONFIG_X86_32 3#include <asm/asm.h>
4# define X86_ALIGN .long
5#else
6# define X86_ALIGN .quad
7#endif
8 4
9#ifdef CONFIG_SMP 5#ifdef CONFIG_SMP
10 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
111: lock 71: lock
12 .section .smp_locks,"a" 8 .section .smp_locks,"a"
13 .align 4 9 _ASM_ALIGN
14 X86_ALIGN 1b 10 _ASM_PTR 1b
15 .previous 11 .previous
16 .endm 12 .endm
17#else 13#else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c240efc74e00..69b74a7b877f 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -84,6 +84,7 @@ static inline void alternatives_smp_switch(int smp) {}
84 " .byte " __stringify(feature) "\n" /* feature bit */ \ 84 " .byte " __stringify(feature) "\n" /* feature bit */ \
85 " .byte 662b-661b\n" /* sourcelen */ \ 85 " .byte 662b-661b\n" /* sourcelen */ \
86 " .byte 664f-663f\n" /* replacementlen */ \ 86 " .byte 664f-663f\n" /* replacementlen */ \
87 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
87 ".previous\n" \ 88 ".previous\n" \
88 ".section .altinstr_replacement, \"ax\"\n" \ 89 ".section .altinstr_replacement, \"ax\"\n" \
89 "663:\n\t" newinstr "\n664:\n" /* replacement */ \ 90 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index ac95995b7bad..5af2982133b5 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -23,18 +23,13 @@
23#include <linux/irqreturn.h> 23#include <linux/irqreturn.h>
24 24
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void); 26
27extern int amd_iommu_init_dma_ops(void);
28extern int amd_iommu_init_passthrough(void);
29extern void amd_iommu_detect(void); 27extern void amd_iommu_detect(void);
30extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 28
31extern void amd_iommu_flush_all_domains(void);
32extern void amd_iommu_flush_all_devices(void);
33extern void amd_iommu_shutdown(void);
34#else 29#else
35static inline int amd_iommu_init(void) { return -ENODEV; } 30
36static inline void amd_iommu_detect(void) { } 31static inline void amd_iommu_detect(void) { }
37static inline void amd_iommu_shutdown(void) { } 32
38#endif 33#endif
39 34
40#endif /* _ASM_X86_AMD_IOMMU_H */ 35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
new file mode 100644
index 000000000000..84786fb9a23b
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -0,0 +1,38 @@
1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H
21
22struct amd_iommu;
23
24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31
32#ifndef CONFIG_AMD_IOMMU_STATS
33
34static inline void amd_iommu_stats_init(void) { }
35
36#endif /* !CONFIG_AMD_IOMMU_STATS */
37
38#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 2a2cc7a78a81..ba19ad4c47d0 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,6 +25,11 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26 26
27/* 27/*
28 * Maximum number of IOMMUs supported
29 */
30#define MAX_IOMMUS 32
31
32/*
28 * some size calculation constants 33 * some size calculation constants
29 */ 34 */
30#define DEV_TABLE_ENTRY_SIZE 32 35#define DEV_TABLE_ENTRY_SIZE 32
@@ -206,6 +211,9 @@ extern bool amd_iommu_dump;
206 printk(KERN_INFO "AMD-Vi: " format, ## arg); \ 211 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
207 } while(0); 212 } while(0);
208 213
214/* global flag if IOMMUs cache non-present entries */
215extern bool amd_iommu_np_cache;
216
209/* 217/*
210 * Make iterating over all IOMMUs easier 218 * Make iterating over all IOMMUs easier
211 */ 219 */
@@ -226,6 +234,8 @@ extern bool amd_iommu_dump;
226 * independent of their use. 234 * independent of their use.
227 */ 235 */
228struct protection_domain { 236struct protection_domain {
237 struct list_head list; /* for list of all protection domains */
238 struct list_head dev_list; /* List of all devices in this domain */
229 spinlock_t lock; /* mostly used to lock the page table*/ 239 spinlock_t lock; /* mostly used to lock the page table*/
230 u16 id; /* the domain id written to the device table */ 240 u16 id; /* the domain id written to the device table */
231 int mode; /* paging mode (0-6 levels) */ 241 int mode; /* paging mode (0-6 levels) */
@@ -233,7 +243,20 @@ struct protection_domain {
233 unsigned long flags; /* flags to find out type of domain */ 243 unsigned long flags; /* flags to find out type of domain */
234 bool updated; /* complete domain flush required */ 244 bool updated; /* complete domain flush required */
235 unsigned dev_cnt; /* devices assigned to this domain */ 245 unsigned dev_cnt; /* devices assigned to this domain */
246 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
236 void *priv; /* private data */ 247 void *priv; /* private data */
248
249};
250
251/*
252 * This struct contains device specific data for the IOMMU
253 */
254struct iommu_dev_data {
255 struct list_head list; /* For domain->dev_list */
256 struct device *dev; /* Device this data belong to */
257 struct device *alias; /* The Alias Device */
258 struct protection_domain *domain; /* Domain the device is bound to */
259 atomic_t bind; /* Domain attach reverent count */
237}; 260};
238 261
239/* 262/*
@@ -291,6 +314,9 @@ struct dma_ops_domain {
291struct amd_iommu { 314struct amd_iommu {
292 struct list_head list; 315 struct list_head list;
293 316
317 /* Index within the IOMMU array */
318 int index;
319
294 /* locks the accesses to the hardware */ 320 /* locks the accesses to the hardware */
295 spinlock_t lock; 321 spinlock_t lock;
296 322
@@ -357,6 +383,21 @@ struct amd_iommu {
357extern struct list_head amd_iommu_list; 383extern struct list_head amd_iommu_list;
358 384
359/* 385/*
386 * Array with pointers to each IOMMU struct
387 * The indices are referenced in the protection domains
388 */
389extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
390
391/* Number of IOMMUs present in the system */
392extern int amd_iommus_present;
393
394/*
395 * Declarations for the global list of all protection domains
396 */
397extern spinlock_t amd_iommu_pd_lock;
398extern struct list_head amd_iommu_pd_list;
399
400/*
360 * Structure defining one entry in the device table 401 * Structure defining one entry in the device table
361 */ 402 */
362struct dev_table_entry { 403struct dev_table_entry {
@@ -416,15 +457,9 @@ extern unsigned amd_iommu_aperture_order;
416/* largest PCI device id we expect translation requests for */ 457/* largest PCI device id we expect translation requests for */
417extern u16 amd_iommu_last_bdf; 458extern u16 amd_iommu_last_bdf;
418 459
419/* data structures for protection domain handling */
420extern struct protection_domain **amd_iommu_pd_table;
421
422/* allocation bitmap for domain ids */ 460/* allocation bitmap for domain ids */
423extern unsigned long *amd_iommu_pd_alloc_bitmap; 461extern unsigned long *amd_iommu_pd_alloc_bitmap;
424 462
425/* will be 1 if device isolation is enabled */
426extern bool amd_iommu_isolate;
427
428/* 463/*
429 * If true, the addresses will be flushed on unmap time, not when 464 * If true, the addresses will be flushed on unmap time, not when
430 * they are reused 465 * they are reused
@@ -462,11 +497,6 @@ struct __iommu_counter {
462#define ADD_STATS_COUNTER(name, x) 497#define ADD_STATS_COUNTER(name, x)
463#define SUB_STATS_COUNTER(name, x) 498#define SUB_STATS_COUNTER(name, x)
464 499
465static inline void amd_iommu_stats_init(void) { }
466
467#endif /* CONFIG_AMD_IOMMU_STATS */ 500#endif /* CONFIG_AMD_IOMMU_STATS */
468 501
469/* some function prototypes */
470extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
471
472#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 502#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 474d80d3e6cc..b4ac2cdcb64f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -297,20 +297,20 @@ struct apic {
297 int disable_esr; 297 int disable_esr;
298 298
299 int dest_logical; 299 int dest_logical;
300 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); 300 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
301 unsigned long (*check_apicid_present)(int apicid); 301 unsigned long (*check_apicid_present)(int apicid);
302 302
303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); 303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
304 void (*init_apic_ldr)(void); 304 void (*init_apic_ldr)(void);
305 305
306 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); 306 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
307 307
308 void (*setup_apic_routing)(void); 308 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 309 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid); 310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu); 311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 312 int (*cpu_present_to_apicid)(int mps_cpu);
313 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); 313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 314 void (*setup_portio_remap)(void);
315 int (*check_phys_apicid_present)(int phys_apicid); 315 int (*check_phys_apicid_present)(int phys_apicid);
316 void (*enable_apic_mode)(void); 316 void (*enable_apic_mode)(void);
@@ -488,6 +488,8 @@ static inline unsigned int read_apic_id(void)
488 488
489extern void default_setup_apic_routing(void); 489extern void default_setup_apic_routing(void);
490 490
491extern struct apic apic_noop;
492
491#ifdef CONFIG_X86_32 493#ifdef CONFIG_X86_32
492 494
493extern struct apic apic_default; 495extern struct apic apic_default;
@@ -532,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
532 return (unsigned int)(mask1 & mask2 & mask3); 534 return (unsigned int)(mask1 & mask2 & mask3);
533} 535}
534 536
535static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) 537static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
536{ 538{
537 return physid_isset(apicid, bitmap); 539 return physid_isset(apicid, *map);
538} 540}
539 541
540static inline unsigned long default_check_apicid_present(int bit) 542static inline unsigned long default_check_apicid_present(int bit)
@@ -542,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit)
542 return physid_isset(bit, phys_cpu_present_map); 544 return physid_isset(bit, phys_cpu_present_map);
543} 545}
544 546
545static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) 547static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
546{ 548{
547 return phys_map; 549 *retmap = *phys_map;
548} 550}
549 551
550/* Mapping from cpu number to logical apicid */ 552/* Mapping from cpu number to logical apicid */
@@ -583,11 +585,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
583extern int default_check_phys_apicid_present(int phys_apicid); 585extern int default_check_phys_apicid_present(int phys_apicid);
584#endif 586#endif
585 587
586static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
587{
588 return physid_mask_of_physid(phys_apicid);
589}
590
591#endif /* CONFIG_X86_LOCAL_APIC */ 588#endif /* CONFIG_X86_LOCAL_APIC */
592 589
593#ifdef CONFIG_X86_32 590#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3b62da926de9..7fe3b3060f08 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -11,6 +11,12 @@
11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
12#define APIC_DEFAULT_PHYS_BASE 0xfee00000 12#define APIC_DEFAULT_PHYS_BASE 0xfee00000
13 13
14/*
15 * This is the IO-APIC register space as specified
16 * by Intel docs:
17 */
18#define IO_APIC_SLOT_SIZE 1024
19
14#define APIC_ID 0x20 20#define APIC_ID 0x20
15 21
16#define APIC_LVR 0x30 22#define APIC_LVR 0x30
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607ce..000000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d2..f654d1bb17fb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,14 +22,14 @@ do { \
22 ".popsection" \ 22 ".popsection" \
23 : : "i" (__FILE__), "i" (__LINE__), \ 23 : : "i" (__FILE__), "i" (__LINE__), \
24 "i" (sizeof(struct bug_entry))); \ 24 "i" (sizeof(struct bug_entry))); \
25 for (;;) ; \ 25 unreachable(); \
26} while (0) 26} while (0)
27 27
28#else 28#else
29#define BUG() \ 29#define BUG() \
30do { \ 30do { \
31 asm volatile("ud2"); \ 31 asm volatile("ud2"); \
32 for (;;) ; \ 32 unreachable(); \
33} while (0) 33} while (0)
34#endif 34#endif
35 35
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
9 9
10#define __read_mostly __attribute__((__section__(".data.read_mostly"))) 10#define __read_mostly __attribute__((__section__(".data.read_mostly")))
11 11
12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
14
12#ifdef CONFIG_X86_VSMP 15#ifdef CONFIG_X86_VSMP
13/* vSMP Internode cacheline shift */
14#define INTERNODE_CACHE_SHIFT (12)
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16#define __cacheline_aligned_in_smp \ 17#define __cacheline_aligned_in_smp \
17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 18 __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
18 __page_aligned_data 19 __page_aligned_data
19#endif 20#endif
20#endif 21#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
@@ -176,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
176#ifdef CONFIG_DEBUG_RODATA 177#ifdef CONFIG_DEBUG_RODATA
177void mark_rodata_ro(void); 178void mark_rodata_ro(void);
178extern const int rodata_test_data; 179extern const int rodata_test_data;
180extern int kernel_set_to_readonly;
179void set_kernel_text_rw(void); 181void set_kernel_text_rw(void);
180void set_kernel_text_ro(void); 182void set_kernel_text_ro(void);
181#else 183#else
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb62aa7..0918654305af 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
62extern int use_calgary; 62extern int use_calgary;
63 63
64#ifdef CONFIG_CALGARY_IOMMU 64#ifdef CONFIG_CALGARY_IOMMU
65extern int calgary_iommu_init(void);
66extern void detect_calgary(void); 65extern void detect_calgary(void);
67#else 66#else
68static inline int calgary_iommu_init(void) { return 1; }
69static inline void detect_calgary(void) { return; } 67static inline void detect_calgary(void) { return; }
70#endif 68#endif
71 69
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ee1931be6593..ffb9bb6b6c37 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -8,14 +8,50 @@
8 * you need to test for the feature in boot_cpu_data. 8 * you need to test for the feature in boot_cpu_data.
9 */ 9 */
10 10
11#define xchg(ptr, v) \ 11extern void __xchg_wrong_size(void);
12 ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) 12
13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
15 * Note 2: xchg has side effect, so that attribute volatile is necessary,
16 * but generally the primitive is invalid, *ptr is output argument. --ANK
17 */
13 18
14struct __xchg_dummy { 19struct __xchg_dummy {
15 unsigned long a[100]; 20 unsigned long a[100];
16}; 21};
17#define __xg(x) ((struct __xchg_dummy *)(x)) 22#define __xg(x) ((struct __xchg_dummy *)(x))
18 23
24#define __xchg(x, ptr, size) \
25({ \
26 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \
28 case 1: \
29 asm volatile("xchgb %b0,%1" \
30 : "=q" (__x) \
31 : "m" (*__xg(ptr)), "0" (__x) \
32 : "memory"); \
33 break; \
34 case 2: \
35 asm volatile("xchgw %w0,%1" \
36 : "=r" (__x) \
37 : "m" (*__xg(ptr)), "0" (__x) \
38 : "memory"); \
39 break; \
40 case 4: \
41 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \
43 : "m" (*__xg(ptr)), "0" (__x) \
44 : "memory"); \
45 break; \
46 default: \
47 __xchg_wrong_size(); \
48 } \
49 __x; \
50})
51
52#define xchg(ptr, v) \
53 __xchg((v), (ptr), sizeof(*ptr))
54
19/* 55/*
20 * The semantics of XCHGCMP8B are a bit strange, this is why 56 * The semantics of XCHGCMP8B are a bit strange, this is why
21 * there is a loop and the loading of %%eax and %%edx has to 57 * there is a loop and the loading of %%eax and %%edx has to
@@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr,
71 (unsigned int)((value) >> 32)) \ 107 (unsigned int)((value) >> 32)) \
72 : __set_64bit(ptr, ll_low((value)), ll_high((value)))) 108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
73 109
74/* 110extern void __cmpxchg_wrong_size(void);
75 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
76 * Note 2: xchg has side effect, so that attribute volatile is necessary,
77 * but generally the primitive is invalid, *ptr is output argument. --ANK
78 */
79static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
80 int size)
81{
82 switch (size) {
83 case 1:
84 asm volatile("xchgb %b0,%1"
85 : "=q" (x)
86 : "m" (*__xg(ptr)), "0" (x)
87 : "memory");
88 break;
89 case 2:
90 asm volatile("xchgw %w0,%1"
91 : "=r" (x)
92 : "m" (*__xg(ptr)), "0" (x)
93 : "memory");
94 break;
95 case 4:
96 asm volatile("xchgl %0,%1"
97 : "=r" (x)
98 : "m" (*__xg(ptr)), "0" (x)
99 : "memory");
100 break;
101 }
102 return x;
103}
104 111
105/* 112/*
106 * Atomic compare and exchange. Compare OLD with MEM, if identical, 113 * Atomic compare and exchange. Compare OLD with MEM, if identical,
107 * store NEW in MEM. Return the initial value in MEM. Success is 114 * store NEW in MEM. Return the initial value in MEM. Success is
108 * indicated by comparing RETURN with OLD. 115 * indicated by comparing RETURN with OLD.
109 */ 116 */
117#define __raw_cmpxchg(ptr, old, new, size, lock) \
118({ \
119 __typeof__(*(ptr)) __ret; \
120 __typeof__(*(ptr)) __old = (old); \
121 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \
123 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \
125 : "=a"(__ret) \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
127 : "memory"); \
128 break; \
129 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \
131 : "=a"(__ret) \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
133 : "memory"); \
134 break; \
135 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \
137 : "=a"(__ret) \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
139 : "memory"); \
140 break; \
141 default: \
142 __cmpxchg_wrong_size(); \
143 } \
144 __ret; \
145})
146
147#define __cmpxchg(ptr, old, new, size) \
148 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
149
150#define __sync_cmpxchg(ptr, old, new, size) \
151 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
152
153#define __cmpxchg_local(ptr, old, new, size) \
154 __raw_cmpxchg((ptr), (old), (new), (size), "")
110 155
111#ifdef CONFIG_X86_CMPXCHG 156#ifdef CONFIG_X86_CMPXCHG
112#define __HAVE_ARCH_CMPXCHG 1 157#define __HAVE_ARCH_CMPXCHG 1
113#define cmpxchg(ptr, o, n) \ 158
114 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ 159#define cmpxchg(ptr, old, new) \
115 (unsigned long)(n), \ 160 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
116 sizeof(*(ptr)))) 161
117#define sync_cmpxchg(ptr, o, n) \ 162#define sync_cmpxchg(ptr, old, new) \
118 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ 163 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
119 (unsigned long)(n), \ 164
120 sizeof(*(ptr)))) 165#define cmpxchg_local(ptr, old, new) \
121#define cmpxchg_local(ptr, o, n) \ 166 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
122 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
123 (unsigned long)(n), \
124 sizeof(*(ptr))))
125#endif 167#endif
126 168
127#ifdef CONFIG_X86_CMPXCHG64 169#ifdef CONFIG_X86_CMPXCHG64
@@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
133 (unsigned long long)(n))) 175 (unsigned long long)(n)))
134#endif 176#endif
135 177
136static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
137 unsigned long new, int size)
138{
139 unsigned long prev;
140 switch (size) {
141 case 1:
142 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
143 : "=a"(prev)
144 : "q"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 2:
148 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 4:
154 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163/*
164 * Always use locked operations when touching memory shared with a
165 * hypervisor, since the system may be SMP even if the guest kernel
166 * isn't.
167 */
168static inline unsigned long __sync_cmpxchg(volatile void *ptr,
169 unsigned long old,
170 unsigned long new, int size)
171{
172 unsigned long prev;
173 switch (size) {
174 case 1:
175 asm volatile("lock; cmpxchgb %b1,%2"
176 : "=a"(prev)
177 : "q"(new), "m"(*__xg(ptr)), "0"(old)
178 : "memory");
179 return prev;
180 case 2:
181 asm volatile("lock; cmpxchgw %w1,%2"
182 : "=a"(prev)
183 : "r"(new), "m"(*__xg(ptr)), "0"(old)
184 : "memory");
185 return prev;
186 case 4:
187 asm volatile("lock; cmpxchgl %1,%2"
188 : "=a"(prev)
189 : "r"(new), "m"(*__xg(ptr)), "0"(old)
190 : "memory");
191 return prev;
192 }
193 return old;
194}
195
196static inline unsigned long __cmpxchg_local(volatile void *ptr,
197 unsigned long old,
198 unsigned long new, int size)
199{
200 unsigned long prev;
201 switch (size) {
202 case 1:
203 asm volatile("cmpxchgb %b1,%2"
204 : "=a"(prev)
205 : "q"(new), "m"(*__xg(ptr)), "0"(old)
206 : "memory");
207 return prev;
208 case 2:
209 asm volatile("cmpxchgw %w1,%2"
210 : "=a"(prev)
211 : "r"(new), "m"(*__xg(ptr)), "0"(old)
212 : "memory");
213 return prev;
214 case 4:
215 asm volatile("cmpxchgl %1,%2"
216 : "=a"(prev)
217 : "r"(new), "m"(*__xg(ptr)), "0"(old)
218 : "memory");
219 return prev;
220 }
221 return old;
222}
223
224static inline unsigned long long __cmpxchg64(volatile void *ptr, 178static inline unsigned long long __cmpxchg64(volatile void *ptr,
225 unsigned long long old, 179 unsigned long long old,
226 unsigned long long new) 180 unsigned long long new)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8c..485ae415faec 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,9 +3,6 @@
3 3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5 5
6#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
7 (ptr), sizeof(*(ptr))))
8
9#define __xg(x) ((volatile long *)(x)) 6#define __xg(x) ((volatile long *)(x))
10 7
11static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) 8static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
@@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
15 12
16#define _set_64bit set_64bit 13#define _set_64bit set_64bit
17 14
15extern void __xchg_wrong_size(void);
16extern void __cmpxchg_wrong_size(void);
17
18/* 18/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
20 * Note 2: xchg has side effect, so that attribute volatile is necessary, 20 * Note 2: xchg has side effect, so that attribute volatile is necessary,
21 * but generally the primitive is invalid, *ptr is output argument. --ANK 21 * but generally the primitive is invalid, *ptr is output argument. --ANK
22 */ 22 */
23static inline unsigned long __xchg(unsigned long x, volatile void *ptr, 23#define __xchg(x, ptr, size) \
24 int size) 24({ \
25{ 25 __typeof(*(ptr)) __x = (x); \
26 switch (size) { 26 switch (size) { \
27 case 1: 27 case 1: \
28 asm volatile("xchgb %b0,%1" 28 asm volatile("xchgb %b0,%1" \
29 : "=q" (x) 29 : "=q" (__x) \
30 : "m" (*__xg(ptr)), "0" (x) 30 : "m" (*__xg(ptr)), "0" (__x) \
31 : "memory"); 31 : "memory"); \
32 break; 32 break; \
33 case 2: 33 case 2: \
34 asm volatile("xchgw %w0,%1" 34 asm volatile("xchgw %w0,%1" \
35 : "=r" (x) 35 : "=r" (__x) \
36 : "m" (*__xg(ptr)), "0" (x) 36 : "m" (*__xg(ptr)), "0" (__x) \
37 : "memory"); 37 : "memory"); \
38 break; 38 break; \
39 case 4: 39 case 4: \
40 asm volatile("xchgl %k0,%1" 40 asm volatile("xchgl %k0,%1" \
41 : "=r" (x) 41 : "=r" (__x) \
42 : "m" (*__xg(ptr)), "0" (x) 42 : "m" (*__xg(ptr)), "0" (__x) \
43 : "memory"); 43 : "memory"); \
44 break; 44 break; \
45 case 8: 45 case 8: \
46 asm volatile("xchgq %0,%1" 46 asm volatile("xchgq %0,%1" \
47 : "=r" (x) 47 : "=r" (__x) \
48 : "m" (*__xg(ptr)), "0" (x) 48 : "m" (*__xg(ptr)), "0" (__x) \
49 : "memory"); 49 : "memory"); \
50 break; 50 break; \
51 } 51 default: \
52 return x; 52 __xchg_wrong_size(); \
53} 53 } \
54 __x; \
55})
56
57#define xchg(ptr, v) \
58 __xchg((v), (ptr), sizeof(*ptr))
59
60#define __HAVE_ARCH_CMPXCHG 1
54 61
55/* 62/*
56 * Atomic compare and exchange. Compare OLD with MEM, if identical, 63 * Atomic compare and exchange. Compare OLD with MEM, if identical,
57 * store NEW in MEM. Return the initial value in MEM. Success is 64 * store NEW in MEM. Return the initial value in MEM. Success is
58 * indicated by comparing RETURN with OLD. 65 * indicated by comparing RETURN with OLD.
59 */ 66 */
67#define __raw_cmpxchg(ptr, old, new, size, lock) \
68({ \
69 __typeof__(*(ptr)) __ret; \
70 __typeof__(*(ptr)) __old = (old); \
71 __typeof__(*(ptr)) __new = (new); \
72 switch (size) { \
73 case 1: \
74 asm volatile(lock "cmpxchgb %b1,%2" \
75 : "=a"(__ret) \
76 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
77 : "memory"); \
78 break; \
79 case 2: \
80 asm volatile(lock "cmpxchgw %w1,%2" \
81 : "=a"(__ret) \
82 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
83 : "memory"); \
84 break; \
85 case 4: \
86 asm volatile(lock "cmpxchgl %k1,%2" \
87 : "=a"(__ret) \
88 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
89 : "memory"); \
90 break; \
91 case 8: \
92 asm volatile(lock "cmpxchgq %1,%2" \
93 : "=a"(__ret) \
94 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
95 : "memory"); \
96 break; \
97 default: \
98 __cmpxchg_wrong_size(); \
99 } \
100 __ret; \
101})
60 102
61#define __HAVE_ARCH_CMPXCHG 1 103#define __cmpxchg(ptr, old, new, size) \
104 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
62 105
63static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 106#define __sync_cmpxchg(ptr, old, new, size) \
64 unsigned long new, int size) 107 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
65{
66 unsigned long prev;
67 switch (size) {
68 case 1:
69 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
70 : "=a"(prev)
71 : "q"(new), "m"(*__xg(ptr)), "0"(old)
72 : "memory");
73 return prev;
74 case 2:
75 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
76 : "=a"(prev)
77 : "r"(new), "m"(*__xg(ptr)), "0"(old)
78 : "memory");
79 return prev;
80 case 4:
81 asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
82 : "=a"(prev)
83 : "r"(new), "m"(*__xg(ptr)), "0"(old)
84 : "memory");
85 return prev;
86 case 8:
87 asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
88 : "=a"(prev)
89 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90 : "memory");
91 return prev;
92 }
93 return old;
94}
95 108
96/* 109#define __cmpxchg_local(ptr, old, new, size) \
97 * Always use locked operations when touching memory shared with a 110 __raw_cmpxchg((ptr), (old), (new), (size), "")
98 * hypervisor, since the system may be SMP even if the guest kernel
99 * isn't.
100 */
101static inline unsigned long __sync_cmpxchg(volatile void *ptr,
102 unsigned long old,
103 unsigned long new, int size)
104{
105 unsigned long prev;
106 switch (size) {
107 case 1:
108 asm volatile("lock; cmpxchgb %b1,%2"
109 : "=a"(prev)
110 : "q"(new), "m"(*__xg(ptr)), "0"(old)
111 : "memory");
112 return prev;
113 case 2:
114 asm volatile("lock; cmpxchgw %w1,%2"
115 : "=a"(prev)
116 : "r"(new), "m"(*__xg(ptr)), "0"(old)
117 : "memory");
118 return prev;
119 case 4:
120 asm volatile("lock; cmpxchgl %1,%2"
121 : "=a"(prev)
122 : "r"(new), "m"(*__xg(ptr)), "0"(old)
123 : "memory");
124 return prev;
125 }
126 return old;
127}
128 111
129static inline unsigned long __cmpxchg_local(volatile void *ptr, 112#define cmpxchg(ptr, old, new) \
130 unsigned long old, 113 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
131 unsigned long new, int size) 114
132{ 115#define sync_cmpxchg(ptr, old, new) \
133 unsigned long prev; 116 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
134 switch (size) { 117
135 case 1: 118#define cmpxchg_local(ptr, old, new) \
136 asm volatile("cmpxchgb %b1,%2" 119 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
137 : "=a"(prev)
138 : "q"(new), "m"(*__xg(ptr)), "0"(old)
139 : "memory");
140 return prev;
141 case 2:
142 asm volatile("cmpxchgw %w1,%2"
143 : "=a"(prev)
144 : "r"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 4:
148 asm volatile("cmpxchgl %k1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 8:
154 asm volatile("cmpxchgq %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162 120
163#define cmpxchg(ptr, o, n) \
164 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
165 (unsigned long)(n), sizeof(*(ptr))))
166#define cmpxchg64(ptr, o, n) \ 121#define cmpxchg64(ptr, o, n) \
167({ \ 122({ \
168 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ 123 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
169 cmpxchg((ptr), (o), (n)); \ 124 cmpxchg((ptr), (o), (n)); \
170}) 125})
171#define cmpxchg_local(ptr, o, n) \ 126
172 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
173 (unsigned long)(n), \
174 sizeof(*(ptr))))
175#define sync_cmpxchg(ptr, o, n) \
176 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
177 (unsigned long)(n), \
178 sizeof(*(ptr))))
179#define cmpxchg64_local(ptr, o, n) \ 127#define cmpxchg64_local(ptr, o, n) \
180({ \ 128({ \
181 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ 129 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..8240f76b531e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -18,6 +18,7 @@
18#define DR_TRAP1 (0x2) /* db1 */ 18#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */ 19#define DR_TRAP2 (0x4) /* db2 */
20#define DR_TRAP3 (0x8) /* db3 */ 20#define DR_TRAP3 (0x8) /* db3 */
21#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
21 22
22#define DR_STEP (0x4000) /* single-step */ 23#define DR_STEP (0x4000) /* single-step */
23#define DR_SWITCH (0x8000) /* task switch */ 24#define DR_SWITCH (0x8000) /* task switch */
@@ -49,6 +50,8 @@
49 50
50#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ 51#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
51#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ 52#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
53#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
54#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
52#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ 55#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
53 56
54#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ 57#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
@@ -67,4 +70,34 @@
67#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ 70#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
68#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ 71#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
69 72
73/*
74 * HW breakpoint additions
75 */
76#ifdef __KERNEL__
77
78DECLARE_PER_CPU(unsigned long, cpu_dr7);
79
80static inline void hw_breakpoint_disable(void)
81{
82 /* Zero the control register for HW Breakpoint */
83 set_debugreg(0UL, 7);
84
85 /* Zero-out the individual HW breakpoint address registers */
86 set_debugreg(0UL, 0);
87 set_debugreg(0UL, 1);
88 set_debugreg(0UL, 2);
89 set_debugreg(0UL, 3);
90}
91
92static inline int hw_breakpoint_active(void)
93{
94 return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
95}
96
97extern void aout_dump_debugregs(struct user *dump);
98
99extern void hw_breakpoint_restore(void);
100
101#endif /* __KERNEL__ */
102
70#endif /* _ASM_X86_DEBUGREG_H */ 103#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index e8de2f6f5ca5..617bd56b3070 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -288,7 +288,7 @@ static inline void load_LDT(mm_context_t *pc)
288 288
289static inline unsigned long get_desc_base(const struct desc_struct *desc) 289static inline unsigned long get_desc_base(const struct desc_struct *desc)
290{ 290{
291 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); 291 return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
292} 292}
293 293
294static inline void set_desc_base(struct desc_struct *desc, unsigned long base) 294static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index cee34e9ca45b..029f230ab637 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,7 +8,7 @@ struct dev_archdata {
8#ifdef CONFIG_X86_64 8#ifdef CONFIG_X86_64
9struct dma_map_ops *dma_ops; 9struct dma_map_ops *dma_ops;
10#endif 10#endif
11#ifdef CONFIG_DMAR 11#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU)
12 void *iommu; /* hook for IOMMU specific extension */ 12 void *iommu; /* hook for IOMMU specific extension */
13#endif 13#endif
14}; 14};
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 0ee770d23d0e..0f6c02f3b7d4 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -14,7 +14,14 @@
14#include <asm/swiotlb.h> 14#include <asm/swiotlb.h>
15#include <asm-generic/dma-coherent.h> 15#include <asm-generic/dma-coherent.h>
16 16
17extern dma_addr_t bad_dma_address; 17#ifdef CONFIG_ISA
18# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
19#else
20# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
21#endif
22
23#define DMA_ERROR_CODE 0
24
18extern int iommu_merge; 25extern int iommu_merge;
19extern struct device x86_dma_fallback_dev; 26extern struct device x86_dma_fallback_dev;
20extern int panic_on_overflow; 27extern int panic_on_overflow;
@@ -42,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
42 if (ops->mapping_error) 49 if (ops->mapping_error)
43 return ops->mapping_error(dev, dma_addr); 50 return ops->mapping_error(dev, dma_addr);
44 51
45 return (dma_addr == bad_dma_address); 52 return (dma_addr == DMA_ERROR_CODE);
46} 53}
47 54
48#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -124,10 +131,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
124 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) 131 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
125 return memory; 132 return memory;
126 133
127 if (!dev) { 134 if (!dev)
128 dev = &x86_dma_fallback_dev; 135 dev = &x86_dma_fallback_dev;
129 gfp |= GFP_DMA;
130 }
131 136
132 if (!is_device_dma_capable(dev)) 137 if (!is_device_dma_capable(dev))
133 return NULL; 138 return NULL;
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..761249e396fe 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
61 struct e820entry map[E820_X_MAX]; 61 struct e820entry map[E820_X_MAX];
62}; 62};
63 63
64#define ISA_START_ADDRESS 0xa0000
65#define ISA_END_ADDRESS 0x100000
66
67#define BIOS_BEGIN 0x000a0000
68#define BIOS_END 0x00100000
69
64#ifdef __KERNEL__ 70#ifdef __KERNEL__
65/* see comment in arch/x86/kernel/e820.c */ 71/* see comment in arch/x86/kernel/e820.c */
66extern struct e820map e820; 72extern struct e820map e820;
@@ -126,15 +132,18 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 132extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 133extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 134extern char *default_machine_specific_memory_setup(void);
129#endif /* __KERNEL__ */
130#endif /* __ASSEMBLY__ */
131 135
132#define ISA_START_ADDRESS 0xa0000 136/*
133#define ISA_END_ADDRESS 0x100000 137 * Returns true iff the specified range [s,e) is completely contained inside
134#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) 138 * the ISA region.
139 */
140static inline bool is_ISA_range(u64 s, u64 e)
141{
142 return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
143}
135 144
136#define BIOS_BEGIN 0x000a0000 145#endif /* __KERNEL__ */
137#define BIOS_END 0x00100000 146#endif /* __ASSEMBLY__ */
138 147
139#ifdef __KERNEL__ 148#ifdef __KERNEL__
140#include <linux/ioport.h> 149#include <linux/ioport.h>
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..8a024babe5e6 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
157 157
158#define compat_elf_check_arch(x) elf_check_arch_ia32(x) 158#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
159 159
160static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
161{
162 loadsegment(fs, 0);
163 loadsegment(ds, __USER32_DS);
164 loadsegment(es, __USER32_DS);
165 load_gs_index(0);
166 regs->ip = ip;
167 regs->sp = sp;
168 regs->flags = X86_EFLAGS_IF;
169 regs->cs = __USER32_CS;
170 regs->ss = __USER32_DS;
171}
172
173static inline void elf_common_init(struct thread_struct *t, 160static inline void elf_common_init(struct thread_struct *t,
174 struct pt_regs *regs, const u16 ds) 161 struct pt_regs *regs, const u16 ds)
175{ 162{
@@ -191,11 +178,8 @@ do { \
191#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 178#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
192 elf_common_init(&current->thread, regs, __USER_DS) 179 elf_common_init(&current->thread, regs, __USER_DS)
193 180
194#define compat_start_thread(regs, ip, sp) \ 181void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
195do { \ 182#define compat_start_thread start_thread_ia32
196 start_ia32_thread(regs, ip, sp); \
197 set_fs(USER_DS); \
198} while (0)
199 183
200#define COMPAT_SET_PERSONALITY(ex) \ 184#define COMPAT_SET_PERSONALITY(ex) \
201do { \ 185do { \
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index f5693c81a1db..8e8ec663a98f 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -34,7 +34,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
34 smp_invalidate_interrupt) 34 smp_invalidate_interrupt)
35#endif 35#endif
36 36
37BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) 37BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
38 38
39/* 39/*
40 * every pentium local APIC has two 'local interrupts', with a 40 * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d8..4ac5b0f33fc1 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
35extern int gart_iommu_aperture_disabled; 35extern int gart_iommu_aperture_disabled;
36 36
37extern void early_gart_iommu_check(void); 37extern void early_gart_iommu_check(void);
38extern void gart_iommu_init(void); 38extern int gart_iommu_init(void);
39extern void gart_iommu_shutdown(void);
40extern void __init gart_parse_options(char *); 39extern void __init gart_parse_options(char *);
41extern void gart_iommu_hole_init(void); 40extern void gart_iommu_hole_init(void);
42 41
@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
48static inline void early_gart_iommu_check(void) 47static inline void early_gart_iommu_check(void)
49{ 48{
50} 49}
51static inline void gart_iommu_init(void)
52{
53}
54static inline void gart_iommu_shutdown(void)
55{
56}
57static inline void gart_parse_options(char *options) 50static inline void gart_parse_options(char *options)
58{ 51{
59} 52}
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f01043..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_pending_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
@@ -20,11 +20,11 @@ typedef struct {
20 unsigned int irq_call_count; 20 unsigned int irq_call_count;
21 unsigned int irq_tlb_count; 21 unsigned int irq_tlb_count;
22#endif 22#endif
23#ifdef CONFIG_X86_MCE 23#ifdef CONFIG_X86_THERMAL_VECTOR
24 unsigned int irq_thermal_count; 24 unsigned int irq_thermal_count;
25# ifdef CONFIG_X86_MCE_THRESHOLD 25#endif
26#ifdef CONFIG_X86_MCE_THRESHOLD
26 unsigned int irq_threshold_count; 27 unsigned int irq_threshold_count;
27# endif
28#endif 28#endif
29} ____cacheline_aligned irq_cpustat_t; 29} ____cacheline_aligned irq_cpustat_t;
30 30
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..0675a7c4c20e
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,73 @@
1#ifndef _I386_HW_BREAKPOINT_H
2#define _I386_HW_BREAKPOINT_H
3
4#ifdef __KERNEL__
5#define __ARCH_HW_BREAKPOINT_H
6
7/*
8 * The name should probably be something dealt in
9 * a higher level. While dealing with the user
10 * (display/resolving)
11 */
12struct arch_hw_breakpoint {
13 char *name; /* Contains name of the symbol to set bkpt */
14 unsigned long address;
15 u8 len;
16 u8 type;
17};
18
19#include <linux/kdebug.h>
20#include <linux/percpu.h>
21#include <linux/list.h>
22
23/* Available HW breakpoint length encodings */
24#define X86_BREAKPOINT_LEN_1 0x40
25#define X86_BREAKPOINT_LEN_2 0x44
26#define X86_BREAKPOINT_LEN_4 0x4c
27#define X86_BREAKPOINT_LEN_EXECUTE 0x40
28
29#ifdef CONFIG_X86_64
30#define X86_BREAKPOINT_LEN_8 0x48
31#endif
32
33/* Available HW breakpoint type encodings */
34
35/* trigger on instruction execute */
36#define X86_BREAKPOINT_EXECUTE 0x80
37/* trigger on memory write */
38#define X86_BREAKPOINT_WRITE 0x81
39/* trigger on memory read or write */
40#define X86_BREAKPOINT_RW 0x83
41
42/* Total number of available HW breakpoint registers */
43#define HBP_NUM 4
44
45struct perf_event;
46struct pmu;
47
48extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
49extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
50 struct task_struct *tsk);
51extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
52 unsigned long val, void *data);
53
54
55int arch_install_hw_breakpoint(struct perf_event *bp);
56void arch_uninstall_hw_breakpoint(struct perf_event *bp);
57void hw_breakpoint_pmu_read(struct perf_event *bp);
58void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
59
60extern void
61arch_fill_perf_breakpoint(struct perf_event *bp);
62
63unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
64int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
65
66extern int arch_bp_generic_fields(int x86_len, int x86_type,
67 int *gen_len, int *gen_type);
68
69extern struct pmu perf_ops_bp;
70
71#endif /* __KERNEL__ */
72#endif /* _I386_HW_BREAKPOINT_H */
73
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ba180d93b08c..08c48a81841f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void perf_pending_interrupt(void);
33 33
@@ -79,14 +79,32 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
79 int ioapic, int ioapic_pin, 79 int ioapic, int ioapic_pin,
80 int trigger, int polarity) 80 int trigger, int polarity)
81{ 81{
82 irq_attr->ioapic = ioapic; 82 irq_attr->ioapic = ioapic;
83 irq_attr->ioapic_pin = ioapic_pin; 83 irq_attr->ioapic_pin = ioapic_pin;
84 irq_attr->trigger = trigger; 84 irq_attr->trigger = trigger;
85 irq_attr->polarity = polarity; 85 irq_attr->polarity = polarity;
86} 86}
87 87
88extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, 88/*
89 struct io_apic_irq_attr *irq_attr); 89 * This is performance-critical, we want to do it O(1)
90 *
91 * Most irqs are mapped 1:1 with pins.
92 */
93struct irq_cfg {
94 struct irq_pin_list *irq_2_pin;
95 cpumask_var_t domain;
96 cpumask_var_t old_domain;
97 u8 vector;
98 u8 move_in_progress : 1;
99};
100
101extern struct irq_cfg *irq_cfg(unsigned int);
102extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
103extern void send_cleanup_vector(struct irq_cfg *);
104
105struct irq_desc;
106extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *);
107extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
90extern void setup_ioapic_dest(void); 108extern void setup_ioapic_dest(void);
91 109
92extern void enable_IO_APIC(void); 110extern void enable_IO_APIC(void);
@@ -101,7 +119,7 @@ extern void eisa_set_level_irq(unsigned int irq);
101/* SMP */ 119/* SMP */
102extern void smp_apic_timer_interrupt(struct pt_regs *); 120extern void smp_apic_timer_interrupt(struct pt_regs *);
103extern void smp_spurious_interrupt(struct pt_regs *); 121extern void smp_spurious_interrupt(struct pt_regs *);
104extern void smp_generic_interrupt(struct pt_regs *); 122extern void smp_x86_platform_ipi(struct pt_regs *);
105extern void smp_error_interrupt(struct pt_regs *); 123extern void smp_error_interrupt(struct pt_regs *);
106#ifdef CONFIG_X86_IO_APIC 124#ifdef CONFIG_X86_IO_APIC
107extern asmlinkage void smp_irq_move_cleanup_interrupt(void); 125extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000000..205b063e3e32
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,220 @@
1#ifndef _ASM_X86_INAT_H
2#define _ASM_X86_INAT_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23#include <asm/inat_types.h>
24
25/*
26 * Internal bits. Don't use bitmasks directly, because these bits are
27 * unstable. You should use checking functions.
28 */
29
30#define INAT_OPCODE_TABLE_SIZE 256
31#define INAT_GROUP_TABLE_SIZE 8
32
33/* Legacy last prefixes */
34#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
35#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
36#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
37/* Other Legacy prefixes */
38#define INAT_PFX_LOCK 4 /* 0xF0 */
39#define INAT_PFX_CS 5 /* 0x2E */
40#define INAT_PFX_DS 6 /* 0x3E */
41#define INAT_PFX_ES 7 /* 0x26 */
42#define INAT_PFX_FS 8 /* 0x64 */
43#define INAT_PFX_GS 9 /* 0x65 */
44#define INAT_PFX_SS 10 /* 0x36 */
45#define INAT_PFX_ADDRSZ 11 /* 0x67 */
46/* x86-64 REX prefix */
47#define INAT_PFX_REX 12 /* 0x4X */
48/* AVX VEX prefixes */
49#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
50#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
51
52#define INAT_LSTPFX_MAX 3
53#define INAT_LGCPFX_MAX 11
54
55/* Immediate size */
56#define INAT_IMM_BYTE 1
57#define INAT_IMM_WORD 2
58#define INAT_IMM_DWORD 3
59#define INAT_IMM_QWORD 4
60#define INAT_IMM_PTR 5
61#define INAT_IMM_VWORD32 6
62#define INAT_IMM_VWORD 7
63
64/* Legacy prefix */
65#define INAT_PFX_OFFS 0
66#define INAT_PFX_BITS 4
67#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
68#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
69/* Escape opcodes */
70#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
71#define INAT_ESC_BITS 2
72#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
73#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
74/* Group opcodes (1-16) */
75#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
76#define INAT_GRP_BITS 5
77#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
78#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
79/* Immediates */
80#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
81#define INAT_IMM_BITS 3
82#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
83/* Flags */
84#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
85#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
86#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
87#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
88#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
89#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
90#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
91#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
92/* Attribute making macros for attribute tables */
93#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
94#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
95#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
96#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
97
98/* Attribute search APIs */
99extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
100extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
101 insn_byte_t last_pfx,
102 insn_attr_t esc_attr);
103extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
104 insn_byte_t last_pfx,
105 insn_attr_t esc_attr);
106extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
107 insn_byte_t vex_m,
108 insn_byte_t vex_pp);
109
110/* Attribute checking functions */
111static inline int inat_is_legacy_prefix(insn_attr_t attr)
112{
113 attr &= INAT_PFX_MASK;
114 return attr && attr <= INAT_LGCPFX_MAX;
115}
116
117static inline int inat_is_address_size_prefix(insn_attr_t attr)
118{
119 return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
120}
121
122static inline int inat_is_operand_size_prefix(insn_attr_t attr)
123{
124 return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
125}
126
127static inline int inat_is_rex_prefix(insn_attr_t attr)
128{
129 return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
130}
131
132static inline int inat_last_prefix_id(insn_attr_t attr)
133{
134 if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
135 return 0;
136 else
137 return attr & INAT_PFX_MASK;
138}
139
140static inline int inat_is_vex_prefix(insn_attr_t attr)
141{
142 attr &= INAT_PFX_MASK;
143 return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
144}
145
146static inline int inat_is_vex3_prefix(insn_attr_t attr)
147{
148 return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
149}
150
151static inline int inat_is_escape(insn_attr_t attr)
152{
153 return attr & INAT_ESC_MASK;
154}
155
156static inline int inat_escape_id(insn_attr_t attr)
157{
158 return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
159}
160
161static inline int inat_is_group(insn_attr_t attr)
162{
163 return attr & INAT_GRP_MASK;
164}
165
166static inline int inat_group_id(insn_attr_t attr)
167{
168 return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
169}
170
171static inline int inat_group_common_attribute(insn_attr_t attr)
172{
173 return attr & ~INAT_GRP_MASK;
174}
175
176static inline int inat_has_immediate(insn_attr_t attr)
177{
178 return attr & INAT_IMM_MASK;
179}
180
181static inline int inat_immediate_size(insn_attr_t attr)
182{
183 return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
184}
185
186static inline int inat_has_modrm(insn_attr_t attr)
187{
188 return attr & INAT_MODRM;
189}
190
191static inline int inat_is_force64(insn_attr_t attr)
192{
193 return attr & INAT_FORCE64;
194}
195
196static inline int inat_has_second_immediate(insn_attr_t attr)
197{
198 return attr & INAT_SCNDIMM;
199}
200
201static inline int inat_has_moffset(insn_attr_t attr)
202{
203 return attr & INAT_MOFFSET;
204}
205
206static inline int inat_has_variant(insn_attr_t attr)
207{
208 return attr & INAT_VARIANT;
209}
210
211static inline int inat_accept_vex(insn_attr_t attr)
212{
213 return attr & INAT_VEXOK;
214}
215
216static inline int inat_must_vex(insn_attr_t attr)
217{
218 return attr & INAT_VEXONLY;
219}
220#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000000..cb3c20ce39cf
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_INAT_TYPES_H
2#define _ASM_X86_INAT_TYPES_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23
24/* Instruction attributes */
25typedef unsigned int insn_attr_t;
26typedef unsigned char insn_byte_t;
27typedef signed int insn_value_t;
28
29#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000000..96c2e0ad04ca
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,184 @@
1#ifndef _ASM_X86_INSN_H
2#define _ASM_X86_INSN_H
3/*
4 * x86 instruction analysis
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2009
21 */
22
23/* insn_attr_t is defined in inat.h */
24#include <asm/inat.h>
25
26struct insn_field {
27 union {
28 insn_value_t value;
29 insn_byte_t bytes[4];
30 };
31 /* !0 if we've run insn_get_xxx() for this field */
32 unsigned char got;
33 unsigned char nbytes;
34};
35
36struct insn {
37 struct insn_field prefixes; /*
38 * Prefixes
39 * prefixes.bytes[3]: last prefix
40 */
41 struct insn_field rex_prefix; /* REX prefix */
42 struct insn_field vex_prefix; /* VEX prefix */
43 struct insn_field opcode; /*
44 * opcode.bytes[0]: opcode1
45 * opcode.bytes[1]: opcode2
46 * opcode.bytes[2]: opcode3
47 */
48 struct insn_field modrm;
49 struct insn_field sib;
50 struct insn_field displacement;
51 union {
52 struct insn_field immediate;
53 struct insn_field moffset1; /* for 64bit MOV */
54 struct insn_field immediate1; /* for 64bit imm or off16/32 */
55 };
56 union {
57 struct insn_field moffset2; /* for 64bit MOV */
58 struct insn_field immediate2; /* for 64bit imm or seg16 */
59 };
60
61 insn_attr_t attr;
62 unsigned char opnd_bytes;
63 unsigned char addr_bytes;
64 unsigned char length;
65 unsigned char x86_64;
66
67 const insn_byte_t *kaddr; /* kernel address of insn to analyze */
68 const insn_byte_t *next_byte;
69};
70
71#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
72#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
73#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
74
75#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
76#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
77#define X86_SIB_BASE(sib) ((sib) & 0x07)
78
79#define X86_REX_W(rex) ((rex) & 8)
80#define X86_REX_R(rex) ((rex) & 4)
81#define X86_REX_X(rex) ((rex) & 2)
82#define X86_REX_B(rex) ((rex) & 1)
83
84/* VEX bit flags */
85#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
86#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
87#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
88#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
89#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
90/* VEX bit fields */
91#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
92#define X86_VEX2_M 1 /* VEX2.M always 1 */
93#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
94#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
95#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
96
97/* The last prefix is needed for two-byte and three-byte opcodes */
98static inline insn_byte_t insn_last_prefix(struct insn *insn)
99{
100 return insn->prefixes.bytes[3];
101}
102
103extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
104extern void insn_get_prefixes(struct insn *insn);
105extern void insn_get_opcode(struct insn *insn);
106extern void insn_get_modrm(struct insn *insn);
107extern void insn_get_sib(struct insn *insn);
108extern void insn_get_displacement(struct insn *insn);
109extern void insn_get_immediate(struct insn *insn);
110extern void insn_get_length(struct insn *insn);
111
112/* Attribute will be determined after getting ModRM (for opcode groups) */
113static inline void insn_get_attribute(struct insn *insn)
114{
115 insn_get_modrm(insn);
116}
117
118/* Instruction uses RIP-relative addressing */
119extern int insn_rip_relative(struct insn *insn);
120
121/* Init insn for kernel text */
122static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
123{
124#ifdef CONFIG_X86_64
125 insn_init(insn, kaddr, 1);
126#else /* CONFIG_X86_32 */
127 insn_init(insn, kaddr, 0);
128#endif
129}
130
131static inline int insn_is_avx(struct insn *insn)
132{
133 if (!insn->prefixes.got)
134 insn_get_prefixes(insn);
135 return (insn->vex_prefix.value != 0);
136}
137
138static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
139{
140 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
141 return X86_VEX2_M;
142 else
143 return X86_VEX3_M(insn->vex_prefix.bytes[1]);
144}
145
146static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
147{
148 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
149 return X86_VEX_P(insn->vex_prefix.bytes[1]);
150 else
151 return X86_VEX_P(insn->vex_prefix.bytes[2]);
152}
153
154/* Offset of each field from kaddr */
155static inline int insn_offset_rex_prefix(struct insn *insn)
156{
157 return insn->prefixes.nbytes;
158}
159static inline int insn_offset_vex_prefix(struct insn *insn)
160{
161 return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
162}
163static inline int insn_offset_opcode(struct insn *insn)
164{
165 return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
166}
167static inline int insn_offset_modrm(struct insn *insn)
168{
169 return insn_offset_opcode(insn) + insn->opcode.nbytes;
170}
171static inline int insn_offset_sib(struct insn *insn)
172{
173 return insn_offset_modrm(insn) + insn->modrm.nbytes;
174}
175static inline int insn_offset_displacement(struct insn *insn)
176{
177 return insn_offset_sib(insn) + insn->sib.nbytes;
178}
179static inline int insn_offset_immediate(struct insn *insn)
180{
181 return insn_offset_displacement(insn) + insn->displacement.nbytes;
182}
183
184#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6c..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IOMMU_H 1#ifndef _ASM_X86_IOMMU_H
2#define _ASM_X86_IOMMU_H 2#define _ASM_X86_IOMMU_H
3 3
4extern void pci_iommu_shutdown(void);
5extern void no_iommu_init(void);
6extern struct dma_map_ops nommu_dma_ops; 4extern struct dma_map_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
8extern int iommu_detected; 6extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ddda6cbed6f4..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,9 +34,10 @@ static inline int irq_canonicalize(int irq)
34#ifdef CONFIG_HOTPLUG_CPU 34#ifdef CONFIG_HOTPLUG_CPU
35#include <linux/cpumask.h> 35#include <linux/cpumask.h>
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37extern void irq_force_complete_move(int);
37#endif 38#endif
38 39
39extern void (*generic_interrupt_extension)(void); 40extern void (*x86_platform_ipi_callback)(void);
40extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
41extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
42 43
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5b21f0ec3df2..6a635bd39867 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,7 +106,7 @@
106/* 106/*
107 * Generic system vector for platform specific use 107 * Generic system vector for platform specific use
108 */ 108 */
109#define GENERIC_INTERRUPT_VECTOR 0xed 109#define X86_PLATFORM_IPI_VECTOR 0xed
110 110
111/* 111/*
112 * Performance monitoring pending work vector: 112 * Performance monitoring pending work vector:
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..f70e60071fe8 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,13 +4,16 @@
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6extern struct pci_device_id k8_nb_ids[]; 6extern struct pci_device_id k8_nb_ids[];
7struct bootnode;
7 8
8extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges; 10extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges; 11extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void); 12extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 13extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 14extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void);
14 17
15#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node) 19static inline struct pci_dev *node_to_k8_nb_misc(int node)
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..950df434763f 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE 20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS
22 24
23/* Architectural interrupt line count. */ 25/* Architectural interrupt line count. */
24#define KVM_NR_INTERRUPTS 256 26#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
79#define KVM_IRQCHIP_PIC_MASTER 0 81#define KVM_IRQCHIP_PIC_MASTER 0
80#define KVM_IRQCHIP_PIC_SLAVE 1 82#define KVM_IRQCHIP_PIC_SLAVE 1
81#define KVM_IRQCHIP_IOAPIC 2 83#define KVM_IRQCHIP_IOAPIC 2
84#define KVM_NR_IRQCHIPS 3
82 85
83/* for KVM_GET_REGS and KVM_SET_REGS */ 86/* for KVM_GET_REGS and KVM_SET_REGS */
84struct kvm_regs { 87struct kvm_regs {
@@ -250,4 +253,31 @@ struct kvm_reinject_control {
250 __u8 pit_reinject; 253 __u8 pit_reinject;
251 __u8 reserved[31]; 254 __u8 reserved[31];
252}; 255};
256
257/* for KVM_GET/SET_VCPU_EVENTS */
258struct kvm_vcpu_events {
259 struct {
260 __u8 injected;
261 __u8 nr;
262 __u8 has_error_code;
263 __u8 pad;
264 __u32 error_code;
265 } exception;
266 struct {
267 __u8 injected;
268 __u8 nr;
269 __u8 soft;
270 __u8 pad;
271 } interrupt;
272 struct {
273 __u8 injected;
274 __u8 pending;
275 __u8 masked;
276 __u8 pad;
277 } nmi;
278 __u32 sipi_vector;
279 __u32 flags;
280 __u32 reserved[10];
281};
282
253#endif /* _ASM_X86_KVM_H */ 283#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7c18e1230f54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -129,7 +129,7 @@ struct decode_cache {
129 u8 seg_override; 129 u8 seg_override;
130 unsigned int d; 130 unsigned int d;
131 unsigned long regs[NR_VCPU_REGS]; 131 unsigned long regs[NR_VCPU_REGS];
132 unsigned long eip; 132 unsigned long eip, eip_orig;
133 /* modrm */ 133 /* modrm */
134 u8 modrm; 134 u8 modrm;
135 u8 modrm_mod; 135 u8 modrm_mod;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d83892226f73..4f865e8b8540 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,7 +354,6 @@ struct kvm_vcpu_arch {
354 unsigned int time_offset; 354 unsigned int time_offset;
355 struct page *time_page; 355 struct page *time_page;
356 356
357 bool singlestep; /* guest is single stepped by KVM */
358 bool nmi_pending; 357 bool nmi_pending;
359 bool nmi_injected; 358 bool nmi_injected;
360 359
@@ -371,6 +370,10 @@ struct kvm_vcpu_arch {
371 u64 mcg_status; 370 u64 mcg_status;
372 u64 mcg_ctl; 371 u64 mcg_ctl;
373 u64 *mce_banks; 372 u64 *mce_banks;
373
374 /* used for guest single stepping over the given code position */
375 u16 singlestep_cs;
376 unsigned long singlestep_rip;
374}; 377};
375 378
376struct kvm_mem_alias { 379struct kvm_mem_alias {
@@ -397,7 +400,6 @@ struct kvm_arch{
397 struct kvm_pic *vpic; 400 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 401 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 402 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 unsigned int tss_addr; 405 unsigned int tss_addr;
@@ -410,8 +412,10 @@ struct kvm_arch{
410 gpa_t ept_identity_map_addr; 412 gpa_t ept_identity_map_addr;
411 413
412 unsigned long irq_sources_bitmap; 414 unsigned long irq_sources_bitmap;
413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc; 415 u64 vm_init_tsc;
416 s64 kvmclock_offset;
417
418 struct kvm_xen_hvm_config xen_hvm_config;
415}; 419};
416 420
417struct kvm_vm_stat { 421struct kvm_vm_stat {
@@ -461,7 +465,7 @@ struct descriptor_table {
461struct kvm_x86_ops { 465struct kvm_x86_ops {
462 int (*cpu_has_kvm_support)(void); /* __init */ 466 int (*cpu_has_kvm_support)(void); /* __init */
463 int (*disabled_by_bios)(void); /* __init */ 467 int (*disabled_by_bios)(void); /* __init */
464 void (*hardware_enable)(void *dummy); /* __init */ 468 int (*hardware_enable)(void *dummy);
465 void (*hardware_disable)(void *dummy); 469 void (*hardware_disable)(void *dummy);
466 void (*check_processor_compatibility)(void *rtn); 470 void (*check_processor_compatibility)(void *rtn);
467 int (*hardware_setup)(void); /* __init */ 471 int (*hardware_setup)(void); /* __init */
@@ -477,8 +481,8 @@ struct kvm_x86_ops {
477 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 481 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 482 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 483
480 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 484 void (*set_guest_debug)(struct kvm_vcpu *vcpu,
481 struct kvm_guest_debug *dbg); 485 struct kvm_guest_debug *dbg);
482 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 486 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
483 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 487 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
484 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 488 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -506,8 +510,8 @@ struct kvm_x86_ops {
506 510
507 void (*tlb_flush)(struct kvm_vcpu *vcpu); 511 void (*tlb_flush)(struct kvm_vcpu *vcpu);
508 512
509 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 513 void (*run)(struct kvm_vcpu *vcpu);
510 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 514 int (*handle_exit)(struct kvm_vcpu *vcpu);
511 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 515 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
512 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 516 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
513 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 517 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,6 +523,8 @@ struct kvm_x86_ops {
519 bool has_error_code, u32 error_code); 523 bool has_error_code, u32 error_code);
520 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 524 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 525 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
526 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
527 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
522 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 528 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 529 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 530 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
@@ -568,7 +574,7 @@ enum emulation_result {
568#define EMULTYPE_NO_DECODE (1 << 0) 574#define EMULTYPE_NO_DECODE (1 << 0)
569#define EMULTYPE_TRAP_UD (1 << 1) 575#define EMULTYPE_TRAP_UD (1 << 1)
570#define EMULTYPE_SKIP (1 << 2) 576#define EMULTYPE_SKIP (1 << 2)
571int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 577int emulate_instruction(struct kvm_vcpu *vcpu,
572 unsigned long cr2, u16 error_code, int emulation_type); 578 unsigned long cr2, u16 error_code, int emulation_type);
573void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
574void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +591,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
585 591
586struct x86_emulate_ctxt; 592struct x86_emulate_ctxt;
587 593
588int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 594int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
589 int size, unsigned port); 595 int size, unsigned port);
590int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 596int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
591 int size, unsigned long count, int down, 597 int size, unsigned long count, int down,
592 gva_t address, int rep, unsigned port); 598 gva_t address, int rep, unsigned port);
593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 599void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -616,6 +622,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
616int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 622int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
617int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 623int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
618 624
625unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
626void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
627
619void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 628void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 629void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 630void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -802,4 +811,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
802int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 811int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
803int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 812int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
804 813
814void kvm_define_shared_msr(unsigned index, u32 msr);
815void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
816
805#endif /* _ASM_X86_KVM_HOST_H */ 817#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f1363b72364f..858baa061cfc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,6 +108,8 @@ struct mce_log {
108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
110 110
111extern struct atomic_notifier_head x86_mce_decoder_chain;
112
111#ifdef __KERNEL__ 113#ifdef __KERNEL__
112 114
113#include <linux/percpu.h> 115#include <linux/percpu.h>
@@ -118,9 +120,11 @@ extern int mce_disabled;
118extern int mce_p5_enabled; 120extern int mce_p5_enabled;
119 121
120#ifdef CONFIG_X86_MCE 122#ifdef CONFIG_X86_MCE
121void mcheck_init(struct cpuinfo_x86 *c); 123int mcheck_init(void);
124void mcheck_cpu_init(struct cpuinfo_x86 *c);
122#else 125#else
123static inline void mcheck_init(struct cpuinfo_x86 *c) {} 126static inline int mcheck_init(void) { return 0; }
127static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
124#endif 128#endif
125 129
126#ifdef CONFIG_X86_ANCIENT_MCE 130#ifdef CONFIG_X86_ANCIENT_MCE
@@ -214,5 +218,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
214 218
215void mce_log_therm_throt_event(__u64 status); 219void mce_log_therm_throt_event(__u64 status);
216 220
221#ifdef CONFIG_X86_THERMAL_VECTOR
222extern void mcheck_intel_therm_init(void);
223#else
224static inline void mcheck_intel_therm_init(void) { }
225#endif
226
217#endif /* __KERNEL__ */ 227#endif /* __KERNEL__ */
218#endif /* _ASM_X86_MCE_H */ 228#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..c24ca9a56458 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -12,6 +12,8 @@ struct device;
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; 12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13 13
14struct microcode_ops { 14struct microcode_ops {
15 void (*init)(struct device *device);
16 void (*fini)(void);
15 enum ucode_state (*request_microcode_user) (int cpu, 17 enum ucode_state (*request_microcode_user) (int cpu,
16 const void __user *buf, size_t size); 18 const void __user *buf, size_t size);
17 19
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 79c94500c0bb..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
71 71
72static inline void find_smp_config(void) 72static inline void find_smp_config(void)
73{ 73{
74 x86_init.mpparse.find_smp_config(1); 74 x86_init.mpparse.find_smp_config();
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80} 75}
81 76
82#ifdef CONFIG_X86_MPPARSE 77#ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else 84# else
90# define default_mpc_oem_bus_info NULL 85# define default_mpc_oem_bus_info NULL
91# endif 86# endif
92extern void default_find_smp_config(unsigned int reserve); 87extern void default_find_smp_config(void);
93extern void default_get_smp_config(unsigned int early); 88extern void default_get_smp_config(unsigned int early);
94#else 89#else
95static inline void early_reserve_e820_mpc_new(void) { } 90static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
97#define default_mpc_apic_id NULL 92#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL 93#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL 94#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop 95#define default_find_smp_config x86_init_noop
101#define default_get_smp_config x86_init_uint_noop 96#define default_get_smp_config x86_init_uint_noop
102#endif 97#endif
103 98
@@ -163,14 +158,16 @@ typedef struct physid_mask physid_mask_t;
163#define physids_shift_left(d, s, n) \ 158#define physids_shift_left(d, s, n) \
164 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 159 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
165 160
166#define physids_coerce(map) ((map).mask[0]) 161static inline unsigned long physids_coerce(physid_mask_t *map)
162{
163 return map->mask[0];
164}
167 165
168#define physids_promote(physids) \ 166static inline void physids_promote(unsigned long physids, physid_mask_t *map)
169 ({ \ 167{
170 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 168 physids_clear(*map);
171 __physid_mask.mask[0] = physids; \ 169 map->mask[0] = physids;
172 __physid_mask; \ 170}
173 })
174 171
175/* Note: will create very large stack frames if physid_mask_t is big */ 172/* Note: will create very large stack frames if physid_mask_t is big */
176#define physid_mask_of_physid(physid) \ 173#define physid_mask_of_physid(physid) \
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 7e2b6ba962ff..5bef931f8b14 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -247,8 +247,8 @@ do { \
247#ifdef CONFIG_SMP 247#ifdef CONFIG_SMP
248int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 248int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
249int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 249int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
250void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 250void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
251void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 251void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
252int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 252int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
253int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 253int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
254int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); 254int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
@@ -264,12 +264,12 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
264 wrmsr(msr_no, l, h); 264 wrmsr(msr_no, l, h);
265 return 0; 265 return 0;
266} 266}
267static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, 267static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
268 struct msr *msrs) 268 struct msr *msrs)
269{ 269{
270 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); 270 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
271} 271}
272static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no, 272static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
273 struct msr *msrs) 273 struct msr *msrs)
274{ 274{
275 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); 275 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..642fe34b36a2 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
49extern unsigned long init_memory_mapping(unsigned long start, 49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end); 50 unsigned long end);
51 51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); 52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
53 int acpi, int k8);
53extern void free_initmem(void); 54extern void free_initmem(void);
54 55
55#endif /* !__ASSEMBLY__ */ 56#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ad7ce3fd5065..8d9f8548a870 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -28,9 +28,20 @@
28 */ 28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff 29#define ARCH_PERFMON_EVENT_MASK 0xffff
30 30
31/*
32 * filter mask to validate fixed counter events.
33 * the following filters disqualify for fixed counters:
34 * - inv
35 * - edge
36 * - cnt-mask
37 * The other filters are supported by fixed counters.
38 * The any-thread option is supported starting with v3.
39 */
40#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000
41
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 42#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 43#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 44#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ 45#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) 46 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36 47
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
16 16
17#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
18 18
19#include <asm/x86_init.h>
20
19/* 21/*
20 * ZERO_PAGE is a global shared page that is always zero: used 22 * ZERO_PAGE is a global shared page that is always zero: used
21 * for zero-mapped memory areas etc.. 23 * for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
270 unsigned long new_flags) 272 unsigned long new_flags)
271{ 273{
272 /* 274 /*
273 * PAT type is always WB for ISA. So no need to check. 275 * PAT type is always WB for untracked ranges, so no need to check.
274 */ 276 */
275 if (is_ISA_range(paddr, paddr + size - 1)) 277 if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
276 return 1; 278 return 1;
277 279
278 /* 280 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c3429e8b2424..6f8ec1c37e0a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct mm_struct;
30#include <linux/math64.h> 30#include <linux/math64.h>
31#include <linux/init.h> 31#include <linux/init.h>
32 32
33#define HBP_NUM 4
33/* 34/*
34 * Default implementation of macro that returns current 35 * Default implementation of macro that returns current
35 * instruction pointer ("program counter"). 36 * instruction pointer ("program counter").
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
422extern void free_thread_xstate(struct task_struct *); 423extern void free_thread_xstate(struct task_struct *);
423extern struct kmem_cache *task_xstate_cachep; 424extern struct kmem_cache *task_xstate_cachep;
424 425
426struct perf_event;
427
425struct thread_struct { 428struct thread_struct {
426 /* Cached TLS descriptors: */ 429 /* Cached TLS descriptors: */
427 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; 430 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@ struct thread_struct {
443 unsigned long fs; 446 unsigned long fs;
444#endif 447#endif
445 unsigned long gs; 448 unsigned long gs;
446 /* Hardware debugging registers: */ 449 /* Save middle states of ptrace breakpoints */
447 unsigned long debugreg0; 450 struct perf_event *ptrace_bps[HBP_NUM];
448 unsigned long debugreg1; 451 /* Debug status used for traps, single steps, etc... */
449 unsigned long debugreg2; 452 unsigned long debugreg6;
450 unsigned long debugreg3;
451 unsigned long debugreg6;
452 unsigned long debugreg7;
453 /* Fault info: */ 453 /* Fault info: */
454 unsigned long cr2; 454 unsigned long cr2;
455 unsigned long trap_no; 455 unsigned long trap_no;
@@ -1000,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
1000#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) 1000#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
1001 1001
1002#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) 1002#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
1003#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ 1003extern unsigned long KSTK_ESP(struct task_struct *task);
1004#endif /* CONFIG_X86_64 */ 1004#endif /* CONFIG_X86_64 */
1005 1005
1006extern void start_thread(struct pt_regs *regs, unsigned long new_ip, 1006extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..4009f6534f52 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,18 +5,19 @@
5 5
6/* misc architecture specific prototypes */ 6/* misc architecture specific prototypes */
7 7
8extern void early_idt_handler(void); 8void early_idt_handler(void);
9 9
10extern void system_call(void); 10void system_call(void);
11extern void syscall_init(void); 11void syscall_init(void);
12 12
13extern void ia32_syscall(void); 13void ia32_syscall(void);
14extern void ia32_cstar_target(void); 14void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void); 15void ia32_sysenter_target(void);
16 16
17extern void syscall32_cpu_init(void); 17void syscall32_cpu_init(void);
18 18
19extern void check_efer(void); 19void x86_configure_nx(void);
20void x86_report_nx(void);
20 21
21extern int reboot_force; 22extern int reboot_force;
22 23
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349aa..3d11fd0f44c5 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
7 7
8#ifdef __KERNEL__ 8#ifdef __KERNEL__
9#include <asm/segment.h> 9#include <asm/segment.h>
10#include <asm/page_types.h>
10#endif 11#endif
11 12
12#ifndef __ASSEMBLY__ 13#ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
216 return regs->sp; 217 return regs->sp;
217} 218}
218 219
220/* Query offset/name of register from its name/offset */
221extern int regs_query_register_offset(const char *name);
222extern const char *regs_query_register_name(unsigned int offset);
223#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
224
225/**
226 * regs_get_register() - get register value from its offset
227 * @regs: pt_regs from which register value is gotten.
228 * @offset: offset number of the register.
229 *
230 * regs_get_register returns the value of a register. The @offset is the
231 * offset of the register in struct pt_regs address which specified by @regs.
232 * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
233 */
234static inline unsigned long regs_get_register(struct pt_regs *regs,
235 unsigned int offset)
236{
237 if (unlikely(offset > MAX_REG_OFFSET))
238 return 0;
239 return *(unsigned long *)((unsigned long)regs + offset);
240}
241
242/**
243 * regs_within_kernel_stack() - check the address in the stack
244 * @regs: pt_regs which contains kernel stack pointer.
245 * @addr: address which is checked.
246 *
247 * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
248 * If @addr is within the kernel stack, it returns true. If not, returns false.
249 */
250static inline int regs_within_kernel_stack(struct pt_regs *regs,
251 unsigned long addr)
252{
253 return ((addr & ~(THREAD_SIZE - 1)) ==
254 (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
255}
256
257/**
258 * regs_get_kernel_stack_nth() - get Nth entry of the stack
259 * @regs: pt_regs which contains kernel stack pointer.
260 * @n: stack entry number.
261 *
262 * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
263 * is specified by @regs. If the @n th entry is NOT in the kernel stack,
264 * this returns 0.
265 */
266static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
267 unsigned int n)
268{
269 unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
270 addr += n;
271 if (regs_within_kernel_stack(regs, (unsigned long)addr))
272 return *addr;
273 else
274 return 0;
275}
276
277/* Get Nth argument at function call */
278extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
279 unsigned int n);
280
219/* 281/*
220 * These are defined as per linux/ptrace.h, which see. 282 * These are defined as per linux/ptrace.h, which see.
221 */ 283 */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
2#define _ASM_X86_SECTIONS_H 2#define _ASM_X86_SECTIONS_H
3 3
4#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5#include <asm/uaccess.h>
5 6
6extern char __brk_base[], __brk_limit[]; 7extern char __brk_base[], __brk_limit[];
8extern struct exception_table_entry __stop___ex_table[];
9
10#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
11extern char __end_rodata_hpage_align[];
12#endif
7 13
8#endif /* _ASM_X86_SECTIONS_H */ 14#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index ae907e617181..3d3e8353ee5c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 */ 177 */
178 178
179#ifndef CONFIG_KMEMCHECK 179#ifndef CONFIG_KMEMCHECK
180
181#if (__GNUC__ >= 4)
182#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
183#else
180#define memcpy(t, f, n) \ 184#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 185 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 186 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 187 : __memcpy((t), (f), (n)))
188#endif
184#else 189#else
185/* 190/*
186 * kmemcheck becomes very happy if we use the REP instructions unconditionally, 191 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
@@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
316 : __memset_generic((s), (c), (count))) 321 : __memset_generic((s), (c), (count)))
317 322
318#define __HAVE_ARCH_MEMSET 323#define __HAVE_ARCH_MEMSET
324#if (__GNUC__ >= 4)
325#define memset(s, c, count) __builtin_memset(s, c, count)
326#else
319#define memset(s, c, count) \ 327#define memset(s, c, count) \
320 (__builtin_constant_p(c) \ 328 (__builtin_constant_p(c) \
321 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ 329 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
322 (count)) \ 330 (count)) \
323 : __memset((s), (c), (count))) 331 : __memset((s), (c), (count)))
332#endif
324 333
325/* 334/*
326 * find the first occurrence of byte 'c', or 1 past the area if none 335 * find the first occurrence of byte 'c', or 1 past the area if none
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..1fecb7e61130 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
57 u16 intercept_dr_write; 57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 58 u32 intercept_exceptions;
59 u64 intercept; 59 u64 intercept;
60 u8 reserved_1[44]; 60 u8 reserved_1[42];
61 u16 pause_filter_count;
61 u64 iopm_base_pa; 62 u64 iopm_base_pa;
62 u64 msrpm_base_pa; 63 u64 msrpm_base_pa;
63 u64 tsc_offset; 64 u64 tsc_offset;
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20174fb..87ffcb12a1b8 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,17 +3,14 @@
3 3
4#include <linux/swiotlb.h> 4#include <linux/swiotlb.h>
5 5
6/* SWIOTLB interface */
7
8extern int swiotlb_force;
9
10#ifdef CONFIG_SWIOTLB 6#ifdef CONFIG_SWIOTLB
11extern int swiotlb; 7extern int swiotlb;
12extern void pci_swiotlb_init(void); 8extern int pci_swiotlb_init(void);
13#else 9#else
14#define swiotlb 0 10#define swiotlb 0
15static inline void pci_swiotlb_init(void) 11static inline int pci_swiotlb_init(void)
16{ 12{
13 return 0;
17} 14}
18#endif 15#endif
19 16
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..9af9decb38c3 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -51,11 +51,6 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); 51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); 52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
53 53
54#ifdef CONFIG_SYSCTL_SYSCALL
55struct sysctl_ia32;
56asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
57#endif
58
59asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); 54asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
60asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); 55asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
61 56
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index f08f97374892..022a84386de8 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -128,8 +128,6 @@ do { \
128 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 128 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
129 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 129 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
130 "call __switch_to\n\t" \ 130 "call __switch_to\n\t" \
131 ".globl thread_return\n" \
132 "thread_return:\n\t" \
133 "movq "__percpu_arg([current_task])",%%rsi\n\t" \ 131 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
134 __switch_canary \ 132 __switch_canary \
135 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 133 "movq %P[thread_info](%%rsi),%%r8\n\t" \
@@ -157,19 +155,22 @@ extern void native_load_gs_index(unsigned);
157 * Load a segment. Fall back on loading the zero 155 * Load a segment. Fall back on loading the zero
158 * segment if something goes wrong.. 156 * segment if something goes wrong..
159 */ 157 */
160#define loadsegment(seg, value) \ 158#define loadsegment(seg, value) \
161 asm volatile("\n" \ 159do { \
162 "1:\t" \ 160 unsigned short __val = (value); \
163 "movl %k0,%%" #seg "\n" \ 161 \
164 "2:\n" \ 162 asm volatile(" \n" \
165 ".section .fixup,\"ax\"\n" \ 163 "1: movl %k0,%%" #seg " \n" \
166 "3:\t" \ 164 \
167 "movl %k1, %%" #seg "\n\t" \ 165 ".section .fixup,\"ax\" \n" \
168 "jmp 2b\n" \ 166 "2: xorl %k0,%k0 \n" \
169 ".previous\n" \ 167 " jmp 1b \n" \
170 _ASM_EXTABLE(1b,3b) \ 168 ".previous \n" \
171 : :"r" (value), "r" (0) : "memory") 169 \
172 170 _ASM_EXTABLE(1b, 2b) \
171 \
172 : "+r" (__val) : : "memory"); \
173} while (0)
173 174
174/* 175/*
175 * Save a segment register away 176 * Save a segment register away
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..375c917c37d2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
84#define TIF_SECCOMP 8 /* secure computing */ 84#define TIF_SECCOMP 8 /* secure computing */
85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
86#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
86#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
87#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
88#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 108#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
108#define _TIF_SECCOMP (1 << TIF_SECCOMP) 109#define _TIF_SECCOMP (1 << TIF_SECCOMP)
109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 110#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
111#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
110#define _TIF_NOTSC (1 << TIF_NOTSC) 112#define _TIF_NOTSC (1 << TIF_NOTSC)
111#define _TIF_IA32 (1 << TIF_IA32) 113#define _TIF_IA32 (1 << TIF_IA32)
112#define _TIF_FORK (1 << TIF_FORK) 114#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
142 144
143/* Only used for 64 bit */ 145/* Only used for 64 bit */
144#define _TIF_DO_NOTIFY_MASK \ 146#define _TIF_DO_NOTIFY_MASK \
145 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 147 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
148 _TIF_USER_RETURN_NOTIFY)
146 149
147/* flags to check in __switch_to() */ 150/* flags to check in __switch_to() */
148#define _TIF_WORK_CTXSW \ 151#define _TIF_WORK_CTXSW \
149 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 152 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
150 153
151#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 154#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
152#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 155#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
153 156
154#define PREEMPT_ACTIVE 0x10000000 157#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index d823c245f63b..40e37b10c6c0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -143,7 +143,7 @@ extern unsigned long node_remap_size[];
143 | 1*SD_BALANCE_FORK \ 143 | 1*SD_BALANCE_FORK \
144 | 0*SD_BALANCE_WAKE \ 144 | 0*SD_BALANCE_WAKE \
145 | 1*SD_WAKE_AFFINE \ 145 | 1*SD_WAKE_AFFINE \
146 | 1*SD_PREFER_LOCAL \ 146 | 0*SD_PREFER_LOCAL \
147 | 0*SD_SHARE_CPUPOWER \ 147 | 0*SD_SHARE_CPUPOWER \
148 | 0*SD_POWERSAVINGS_BALANCE \ 148 | 0*SD_POWERSAVINGS_BALANCE \
149 | 0*SD_SHARE_PKG_RESOURCES \ 149 | 0*SD_SHARE_PKG_RESOURCES \
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b491..abd3e0ea762a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -570,7 +570,6 @@ extern struct movsl_mask {
570#ifdef CONFIG_X86_32 570#ifdef CONFIG_X86_32
571# include "uaccess_32.h" 571# include "uaccess_32.h"
572#else 572#else
573# define ARCH_HAS_SEARCH_EXTABLE
574# include "uaccess_64.h" 573# include "uaccess_64.h"
575#endif 574#endif
576 575
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 632fb44b4cb5..0c9825e97f36 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -187,9 +187,34 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
187 187
188unsigned long __must_check copy_to_user(void __user *to, 188unsigned long __must_check copy_to_user(void __user *to,
189 const void *from, unsigned long n); 189 const void *from, unsigned long n);
190unsigned long __must_check copy_from_user(void *to, 190unsigned long __must_check _copy_from_user(void *to,
191 const void __user *from, 191 const void __user *from,
192 unsigned long n); 192 unsigned long n);
193
194
195extern void copy_from_user_overflow(void)
196#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
197 __compiletime_error("copy_from_user() buffer size is not provably correct")
198#else
199 __compiletime_warning("copy_from_user() buffer size is not provably correct")
200#endif
201;
202
203static inline unsigned long __must_check copy_from_user(void *to,
204 const void __user *from,
205 unsigned long n)
206{
207 int sz = __compiletime_object_size(to);
208 int ret = -EFAULT;
209
210 if (likely(sz == -1 || sz >= n))
211 ret = _copy_from_user(to, from, n);
212 else
213 copy_from_user_overflow();
214
215 return ret;
216}
217
193long __must_check strncpy_from_user(char *dst, const char __user *src, 218long __must_check strncpy_from_user(char *dst, const char __user *src,
194 long count); 219 long count);
195long __must_check __strncpy_from_user(char *dst, 220long __must_check __strncpy_from_user(char *dst,
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..46324c6a4f6e 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -19,12 +19,37 @@ __must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len); 19copy_user_generic(void *to, const void *from, unsigned len);
20 20
21__must_check unsigned long 21__must_check unsigned long
22copy_to_user(void __user *to, const void *from, unsigned len); 22_copy_to_user(void __user *to, const void *from, unsigned len);
23__must_check unsigned long 23__must_check unsigned long
24copy_from_user(void *to, const void __user *from, unsigned len); 24_copy_from_user(void *to, const void __user *from, unsigned len);
25__must_check unsigned long 25__must_check unsigned long
26copy_in_user(void __user *to, const void __user *from, unsigned len); 26copy_in_user(void __user *to, const void __user *from, unsigned len);
27 27
28static inline unsigned long __must_check copy_from_user(void *to,
29 const void __user *from,
30 unsigned long n)
31{
32 int sz = __compiletime_object_size(to);
33 int ret = -EFAULT;
34
35 might_fault();
36 if (likely(sz == -1 || sz >= n))
37 ret = _copy_from_user(to, from, n);
38#ifdef CONFIG_DEBUG_VM
39 else
40 WARN(1, "Buffer overflow detected!\n");
41#endif
42 return ret;
43}
44
45static __always_inline __must_check
46int copy_to_user(void __user *dst, const void *src, unsigned size)
47{
48 might_fault();
49
50 return _copy_to_user(dst, src, size);
51}
52
28static __always_inline __must_check 53static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 54int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 55{
@@ -176,8 +201,11 @@ __must_check long strlen_user(const char __user *str);
176__must_check unsigned long clear_user(void __user *mem, unsigned long len); 201__must_check unsigned long clear_user(void __user *mem, unsigned long len);
177__must_check unsigned long __clear_user(void __user *mem, unsigned long len); 202__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
178 203
179__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, 204static __must_check __always_inline int
180 unsigned size); 205__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
206{
207 return copy_user_generic(dst, (__force const void *)src, size);
208}
181 209
182static __must_check __always_inline int 210static __must_check __always_inline int
183__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) 211__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c209a7e3..3baf379fa840 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,11 @@
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
345 346
346#ifdef __KERNEL__ 347#ifdef __KERNEL__
347 348
348#define NR_syscalls 337 349#define NR_syscalls 338
349 350
350#define __ARCH_WANT_IPC_PARSE_VERSION 351#define __ARCH_WANT_IPC_PARSE_VERSION
351#define __ARCH_WANT_OLD_READDIR 352#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0adbc68..4843f7ba754a 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_event_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
664 666
665#ifndef __NO_STUBS 667#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR 668#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b647..d6b17c760622 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry {
25 dest : 32; 25 dest : 32;
26}; 26};
27 27
28extern struct irq_chip uv_irq_chip; 28enum {
29 29 UV_AFFINITY_ALL,
30extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); 30 UV_AFFINITY_NODE,
31extern void arch_disable_uv_irq(int, unsigned long); 31 UV_AFFINITY_CPU
32};
32 33
33extern int uv_setup_irq(char *, int, int, unsigned long); 34extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
34extern void uv_teardown_irq(unsigned int, int, unsigned long); 35extern int uv_setup_irq(char *, int, int, unsigned long, int);
36extern void uv_teardown_irq(unsigned int);
35 37
36#endif /* _ASM_X86_UV_UV_IRQ_H */ 38#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..2b4945419a84 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -56,6 +56,7 @@
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
59#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
59 60
60 61
61#define PIN_BASED_EXT_INTR_MASK 0x00000001 62#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +145,8 @@ enum vmcs_field {
144 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, 145 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
145 TPR_THRESHOLD = 0x0000401c, 146 TPR_THRESHOLD = 0x0000401c,
146 SECONDARY_VM_EXEC_CONTROL = 0x0000401e, 147 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
148 PLE_GAP = 0x00004020,
149 PLE_WINDOW = 0x00004022,
147 VM_INSTRUCTION_ERROR = 0x00004400, 150 VM_INSTRUCTION_ERROR = 0x00004400,
148 VM_EXIT_REASON = 0x00004402, 151 VM_EXIT_REASON = 0x00004402,
149 VM_EXIT_INTR_INFO = 0x00004404, 152 VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +251,7 @@ enum vmcs_field {
248#define EXIT_REASON_MSR_READ 31 251#define EXIT_REASON_MSR_READ 31
249#define EXIT_REASON_MSR_WRITE 32 252#define EXIT_REASON_MSR_WRITE 32
250#define EXIT_REASON_MWAIT_INSTRUCTION 36 253#define EXIT_REASON_MWAIT_INSTRUCTION 36
254#define EXIT_REASON_PAUSE_INSTRUCTION 40
251#define EXIT_REASON_MCE_DURING_VMENTRY 41 255#define EXIT_REASON_MCE_DURING_VMENTRY 41
252#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 256#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
253#define EXIT_REASON_APIC_ACCESS 44 257#define EXIT_REASON_APIC_ACCESS 44
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd4ab0e..ea0e8ea15e15 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc); 26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m); 27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); 28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve); 29 void (*find_smp_config)(void);
30 void (*get_smp_config)(unsigned int early); 30 void (*get_smp_config)(unsigned int early);
31}; 31};
32 32
@@ -91,6 +91,14 @@ struct x86_init_timers {
91}; 91};
92 92
93/** 93/**
94 * struct x86_init_iommu - platform specific iommu setup
95 * @iommu_init: platform specific iommu setup
96 */
97struct x86_init_iommu {
98 int (*iommu_init)(void);
99};
100
101/**
94 * struct x86_init_ops - functions for platform specific setup 102 * struct x86_init_ops - functions for platform specific setup
95 * 103 *
96 */ 104 */
@@ -101,6 +109,7 @@ struct x86_init_ops {
101 struct x86_init_oem oem; 109 struct x86_init_oem oem;
102 struct x86_init_paging paging; 110 struct x86_init_paging paging;
103 struct x86_init_timers timers; 111 struct x86_init_timers timers;
112 struct x86_init_iommu iommu;
104}; 113};
105 114
106/** 115/**
@@ -116,11 +125,14 @@ struct x86_cpuinit_ops {
116 * @calibrate_tsc: calibrate TSC 125 * @calibrate_tsc: calibrate TSC
117 * @get_wallclock: get time from HW clock like RTC etc. 126 * @get_wallclock: get time from HW clock like RTC etc.
118 * @set_wallclock: set time back to HW clock 127 * @set_wallclock: set time back to HW clock
128 * @is_untracked_pat_range exclude from PAT logic
119 */ 129 */
120struct x86_platform_ops { 130struct x86_platform_ops {
121 unsigned long (*calibrate_tsc)(void); 131 unsigned long (*calibrate_tsc)(void);
122 unsigned long (*get_wallclock)(void); 132 unsigned long (*get_wallclock)(void);
123 int (*set_wallclock)(unsigned long nowtime); 133 int (*set_wallclock)(unsigned long nowtime);
134 void (*iommu_shutdown)(void);
135 bool (*is_untracked_pat_range)(u64 start, u64 end);
124}; 136};
125 137
126extern struct x86_init_ops x86_init; 138extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..4f2e66e29ecc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
41obj-y += bootflag.o e820.o 41obj-y += bootflag.o e820.o
42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
43obj-y += alternative.o i8253.o pci-nommu.o 43obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
44obj-y += tsc.o io_delay.o rtc.o 44obj-y += tsc.o io_delay.o rtc.o
45 45
46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index d296f4a195c9..d85d1b2432ba 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
79 struct cpuinfo_x86 *c = &cpu_data(pr->id); 79 struct cpuinfo_x86 *c = &cpu_data(pr->id);
80 80
81 pr->pdc = NULL; 81 pr->pdc = NULL;
82 if (c->x86_vendor == X86_VENDOR_INTEL) 82 if (c->x86_vendor == X86_VENDOR_INTEL ||
83 c->x86_vendor == X86_VENDOR_CENTAUR)
83 init_intel_pdc(pr, c); 84 init_intel_pdc(pr, c);
84 85
85 return; 86 return;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..82e508677b91 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28d..32fb09102a13 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,115 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
73#ifdef CONFIG_AMD_IOMMU_STATS 169#ifdef CONFIG_AMD_IOMMU_STATS
74 170
75/* 171/*
@@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 186DECLARE_STATS_COUNTER(total_map_requests);
91 187
92static struct dentry *stats_dir; 188static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 189static struct dentry *de_fflush;
95 190
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 191static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 203 if (stats_dir == NULL)
109 return; 204 return;
110 205
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 206 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 207 (u32 *)&amd_iommu_unmap_flush);
116 208
@@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void)
130 222
131#endif 223#endif
132 224
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 225/****************************************************************************
140 * 226 *
141 * Interrupt handling functions 227 * Interrupt handling functions
@@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 285 break;
200 case EVENT_TYPE_ILL_CMD: 286 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 287 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
288 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 289 reset_iommu_command_buffer(iommu);
203 dump_command(address); 290 dump_command(address);
204 break; 291 break;
@@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 408 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 409 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 410
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 411 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 412 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 413}
330 414
331/* 415/*
@@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 456out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 457 spin_unlock_irqrestore(&iommu->lock, flags);
374 458
459 if (iommu->reset_in_progress)
460 reset_iommu_command_buffer(iommu);
461
375 return 0; 462 return 0;
376} 463}
377 464
465static void iommu_flush_complete(struct protection_domain *domain)
466{
467 int i;
468
469 for (i = 0; i < amd_iommus_present; ++i) {
470 if (!domain->dev_iommu[i])
471 continue;
472
473 /*
474 * Devices of this domain are behind this IOMMU
475 * We need to wait for completion of all commands.
476 */
477 iommu_completion_wait(amd_iommus[i]);
478 }
479}
480
378/* 481/*
379 * Command send function for invalidating a device table entry 482 * Command send function for invalidating a device table entry
380 */ 483 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 484static int iommu_flush_device(struct device *dev)
382{ 485{
486 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 487 struct iommu_cmd cmd;
384 int ret; 488 u16 devid;
385 489
386 BUG_ON(iommu == NULL); 490 devid = get_device_id(dev);
491 iommu = amd_iommu_rlookup_table[devid];
387 492
493 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 494 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 495 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 496 cmd.data[0] = devid;
391 497
392 ret = iommu_queue_command(iommu, &cmd); 498 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 499}
396 500
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 501static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 534 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 535 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 536 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 537static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 538 u64 address, size_t size, int pde)
435{ 539{
436 int s = 0; 540 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 541 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 542
439 address &= PAGE_MASK; 543 address &= PAGE_MASK;
440 544
@@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 551 s = 1;
448 } 552 }
449 553
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 554
452 return 0; 555 for (i = 0; i < amd_iommus_present; ++i) {
556 if (!domain->dev_iommu[i])
557 continue;
558
559 /*
560 * Devices of this domain are behind this IOMMU
561 * We need a TLB flush
562 */
563 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
564 domain->id, pde, s);
565 }
566
567 return;
453} 568}
454 569
455/* Flush the whole IO/TLB for a given protection domain */ 570static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 571 u64 address, size_t size)
457{ 572{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 573 __iommu_flush_pages(domain, address, size, 0);
459 574}
460 INC_STATS_COUNTER(domain_flush_single);
461 575
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 576/* Flush the whole IO/TLB for a given protection domain */
577static void iommu_flush_tlb(struct protection_domain *domain)
578{
579 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 580}
464 581
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 582/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 583static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 584{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 585 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 586}
474 587
588
475/* 589/*
476 * This function flushes one domain on one IOMMU 590 * This function flushes the DTEs for all devices in domain
477 */ 591 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 592static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 593{
480 struct iommu_cmd cmd; 594 struct iommu_dev_data *dev_data;
481 unsigned long flags; 595 unsigned long flags;
482 596
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 597 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 598
486 spin_lock_irqsave(&iommu->lock, flags); 599 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 600 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 601
489 __iommu_wait_for_completion(iommu); 602 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 603}
492 604
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 605static void iommu_flush_all_domain_devices(void)
494{ 606{
495 int i; 607 struct protection_domain *domain;
608 unsigned long flags;
496 609
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 610 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 611
499 continue; 612 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 613 iommu_flush_domain_devices(domain);
614 iommu_flush_complete(domain);
501 } 615 }
502 616
617 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
618}
619
620void amd_iommu_flush_all_devices(void)
621{
622 iommu_flush_all_domain_devices();
503} 623}
504 624
505/* 625/*
506 * This function is used to flush the IO/TLB for a given protection domain 626 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 627 * this is no issue because it is only called during resume.
508 */ 628 */
509static void iommu_flush_domain(u16 domid) 629void amd_iommu_flush_all_domains(void)
510{ 630{
511 struct amd_iommu *iommu; 631 struct protection_domain *domain;
632 unsigned long flags;
512 633
513 INC_STATS_COUNTER(domain_flush_all); 634 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 635
515 for_each_iommu(iommu) 636 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 637 spin_lock(&domain->lock);
638 iommu_flush_tlb_pde(domain);
639 iommu_flush_complete(domain);
640 spin_unlock(&domain->lock);
641 }
642
643 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 644}
518 645
519void amd_iommu_flush_all_domains(void) 646static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 647{
521 struct amd_iommu *iommu; 648 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 649
523 for_each_iommu(iommu) 650 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 651 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
652
653 amd_iommu_reset_cmd_buffer(iommu);
654 amd_iommu_flush_all_devices();
655 amd_iommu_flush_all_domains();
656
657 iommu->reset_in_progress = false;
525} 658}
526 659
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 660/****************************************************************************
661 *
662 * The functions below are used the create the page table mappings for
663 * unity mapped regions.
664 *
665 ****************************************************************************/
666
667/*
668 * This function is used to add another level to an IO page table. Adding
669 * another level increases the size of the address space by 9 bits to a size up
670 * to 64 bits.
671 */
672static bool increase_address_space(struct protection_domain *domain,
673 gfp_t gfp)
528{ 674{
529 int i; 675 u64 *pte;
530 676
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 677 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 678 /* address space already 64 bit large */
533 continue; 679 return false;
534 680
535 iommu_queue_inv_dev_entry(iommu, i); 681 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 682 if (!pte)
537 } 683 return false;
684
685 *pte = PM_LEVEL_PDE(domain->mode,
686 virt_to_phys(domain->pt_root));
687 domain->pt_root = pte;
688 domain->mode += 1;
689 domain->updated = true;
690
691 return true;
538} 692}
539 693
540static void flush_devices_by_domain(struct protection_domain *domain) 694static u64 *alloc_pte(struct protection_domain *domain,
695 unsigned long address,
696 int end_lvl,
697 u64 **pte_page,
698 gfp_t gfp)
541{ 699{
542 struct amd_iommu *iommu; 700 u64 *pte, *page;
543 int i; 701 int level;
544 702
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 703 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 704 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 705
550 iommu = amd_iommu_rlookup_table[i]; 706 level = domain->mode - 1;
551 if (!iommu) 707 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue; 708
709 while (level > end_lvl) {
710 if (!IOMMU_PTE_PRESENT(*pte)) {
711 page = (u64 *)get_zeroed_page(gfp);
712 if (!page)
713 return NULL;
714 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
715 }
553 716
554 iommu_queue_inv_dev_entry(iommu, i); 717 level -= 1;
555 iommu_completion_wait(iommu); 718
719 pte = IOMMU_PTE_PAGE(*pte);
720
721 if (pte_page && level == end_lvl)
722 *pte_page = pte;
723
724 pte = &pte[PM_LEVEL_INDEX(level, address)];
556 } 725 }
726
727 return pte;
557} 728}
558 729
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 730/*
731 * This function checks if there is a PTE for a given dma address. If
732 * there is one, it returns the pointer to it.
733 */
734static u64 *fetch_pte(struct protection_domain *domain,
735 unsigned long address, int map_size)
560{ 736{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 737 int level;
738 u64 *pte;
562 739
563 if (iommu->reset_in_progress) 740 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 741 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 742
566 iommu->reset_in_progress = true; 743 while (level > map_size) {
744 if (!IOMMU_PTE_PRESENT(*pte))
745 return NULL;
567 746
568 amd_iommu_reset_cmd_buffer(iommu); 747 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 748
572 iommu->reset_in_progress = false; 749 pte = IOMMU_PTE_PAGE(*pte);
573} 750 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 751
575void amd_iommu_flush_all_devices(void) 752 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 753 pte = NULL;
577 flush_devices_by_domain(NULL); 754 break;
578} 755 }
756 }
579 757
580/**************************************************************************** 758 return pte;
581 * 759}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 760
587/* 761/*
588 * Generic mapping functions. It maps a physical address into a DMA 762 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 828}
655 829
656/* 830/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 831 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 832 * dma_ops domain.
681 */ 833 */
@@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 856}
705 857
706/* 858/*
859 * Init the unity mappings for a specific IOMMU in the system
860 *
861 * Basically iterates over all unity mapping entries and applies them to
862 * the default domain DMA of that IOMMU if necessary.
863 */
864static int iommu_init_unity_mappings(struct amd_iommu *iommu)
865{
866 struct unity_map_entry *entry;
867 int ret;
868
869 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
870 if (!iommu_for_unity_map(iommu, entry))
871 continue;
872 ret = dma_ops_unity_map(iommu->default_dom, entry);
873 if (ret)
874 return ret;
875 }
876
877 return 0;
878}
879
880/*
707 * Inits the unity mappings required for a specific device 881 * Inits the unity mappings required for a specific device
708 */ 882 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 883static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 914 */
741 915
742/* 916/*
743 * This function checks if there is a PTE for a given dma address. If 917 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 918 * ranges.
745 */ 919 */
746static u64 *fetch_pte(struct protection_domain *domain, 920static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 921 unsigned long start_page,
922 unsigned int pages)
748{ 923{
749 int level; 924 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 925
761 pte = IOMMU_PTE_PAGE(*pte); 926 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 927 pages = last_page - start_page;
763 928
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 929 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 930 int index = i / APERTURE_RANGE_PAGES;
766 break; 931 int page = i % APERTURE_RANGE_PAGES;
767 } 932 __set_bit(page, dom->aperture[index]->bitmap);
768 } 933 }
769
770 return pte;
771} 934}
772 935
773/* 936/*
@@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 938 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 939 * failure.
777 */ 940 */
778static int alloc_new_range(struct amd_iommu *iommu, 941static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 942 bool populate, gfp_t gfp)
781{ 943{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 944 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
945 struct amd_iommu *iommu;
783 int i; 946 int i;
784 947
785#ifdef CONFIG_IOMMU_STRESS 948#ifdef CONFIG_IOMMU_STRESS
@@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 982 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 983
821 /* Intialize the exclusion range if necessary */ 984 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 985 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 986 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 987 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 988 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 989 unsigned long startpage;
827 iommu->exclusion_length, 990 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 991 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 992 PAGE_SIZE);
993 startpage = iommu->exclusion_start >> PAGE_SHIFT;
994 dma_ops_reserve_addresses(dma_dom, startpage, pages);
995 }
830 } 996 }
831 997
832 /* 998 /*
@@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1094 }
929 1095
930 if (unlikely(address == -1)) 1096 if (unlikely(address == -1))
931 address = bad_dma_address; 1097 address = DMA_ERROR_CODE;
932 1098
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1099 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1100
@@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1139 *
974 ****************************************************************************/ 1140 ****************************************************************************/
975 1141
1142/*
1143 * This function adds a protection domain to the global protection domain list
1144 */
1145static void add_domain_to_list(struct protection_domain *domain)
1146{
1147 unsigned long flags;
1148
1149 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1150 list_add(&domain->list, &amd_iommu_pd_list);
1151 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1152}
1153
1154/*
1155 * This function removes a protection domain to the global
1156 * protection domain list
1157 */
1158static void del_domain_from_list(struct protection_domain *domain)
1159{
1160 unsigned long flags;
1161
1162 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1163 list_del(&domain->list);
1164 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1165}
1166
976static u16 domain_id_alloc(void) 1167static u16 domain_id_alloc(void)
977{ 1168{
978 unsigned long flags; 1169 unsigned long flags;
@@ -1000,26 +1191,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1191 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1192}
1002 1193
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1194static void free_pagetable(struct protection_domain *domain)
1024{ 1195{
1025 int i, j; 1196 int i, j;
@@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1232 if (!dom)
1062 return; 1233 return;
1063 1234
1235 del_domain_from_list(&dom->domain);
1236
1064 free_pagetable(&dom->domain); 1237 free_pagetable(&dom->domain);
1065 1238
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1239 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1251 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1252 * structures required for the dma_ops interface
1080 */ 1253 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1254static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1255{
1083 struct dma_ops_domain *dma_dom; 1256 struct dma_ops_domain *dma_dom;
1084 1257
@@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1264 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1265 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1266 goto free_dma_dom;
1267 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1268 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1269 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1270 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1275 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1276 dma_dom->target_dev = 0xffff;
1103 1277
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1278 add_domain_to_list(&dma_dom->domain);
1279
1280 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1281 goto free_dma_dom;
1106 1282
1107 /* 1283 /*
@@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1305 return domain->flags & PD_DMA_OPS_MASK;
1130} 1306}
1131 1307
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1308static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1309{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1310 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1316 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1317 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1318 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1319}
1320
1321static void clear_dte_entry(u16 devid)
1322{
1323 /* remove entry from the device table seen by the hardware */
1324 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1325 amd_iommu_dev_table[devid].data[1] = 0;
1326 amd_iommu_dev_table[devid].data[2] = 0;
1159 1327
1160 amd_iommu_pd_table[devid] = domain; 1328 amd_iommu_apply_erratum_63(devid);
1329}
1330
1331static void do_attach(struct device *dev, struct protection_domain *domain)
1332{
1333 struct iommu_dev_data *dev_data;
1334 struct amd_iommu *iommu;
1335 u16 devid;
1336
1337 devid = get_device_id(dev);
1338 iommu = amd_iommu_rlookup_table[devid];
1339 dev_data = get_dev_data(dev);
1340
1341 /* Update data structures */
1342 dev_data->domain = domain;
1343 list_add(&dev_data->list, &domain->dev_list);
1344 set_dte_entry(devid, domain);
1345
1346 /* Do reference counting */
1347 domain->dev_iommu[iommu->index] += 1;
1348 domain->dev_cnt += 1;
1349
1350 /* Flush the DTE entry */
1351 iommu_flush_device(dev);
1352}
1353
1354static void do_detach(struct device *dev)
1355{
1356 struct iommu_dev_data *dev_data;
1357 struct amd_iommu *iommu;
1358 u16 devid;
1359
1360 devid = get_device_id(dev);
1361 iommu = amd_iommu_rlookup_table[devid];
1362 dev_data = get_dev_data(dev);
1363
1364 /* decrease reference counters */
1365 dev_data->domain->dev_iommu[iommu->index] -= 1;
1366 dev_data->domain->dev_cnt -= 1;
1367
1368 /* Update data structures */
1369 dev_data->domain = NULL;
1370 list_del(&dev_data->list);
1371 clear_dte_entry(devid);
1372
1373 /* Flush the DTE entry */
1374 iommu_flush_device(dev);
1161} 1375}
1162 1376
1163/* 1377/*
1164 * If a device is not yet associated with a domain, this function does 1378 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware 1379 * assigns it visible for the hardware
1166 */ 1380 */
1167static void __attach_device(struct amd_iommu *iommu, 1381static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1382 struct protection_domain *domain)
1169 u16 devid)
1170{ 1383{
1384 struct iommu_dev_data *dev_data, *alias_data;
1385
1386 dev_data = get_dev_data(dev);
1387 alias_data = get_dev_data(dev_data->alias);
1388
1389 if (!alias_data)
1390 return -EINVAL;
1391
1171 /* lock domain */ 1392 /* lock domain */
1172 spin_lock(&domain->lock); 1393 spin_lock(&domain->lock);
1173 1394
1174 /* update DTE entry */ 1395 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1396 if (alias_data->domain != NULL &&
1397 alias_data->domain != domain)
1398 return -EBUSY;
1399
1400 if (dev_data->domain != NULL &&
1401 dev_data->domain != domain)
1402 return -EBUSY;
1176 1403
1177 domain->dev_cnt += 1; 1404 /* Do real assignment */
1405 if (dev_data->alias != dev) {
1406 alias_data = get_dev_data(dev_data->alias);
1407 if (alias_data->domain == NULL)
1408 do_attach(dev_data->alias, domain);
1409
1410 atomic_inc(&alias_data->bind);
1411 }
1412
1413 if (dev_data->domain == NULL)
1414 do_attach(dev, domain);
1415
1416 atomic_inc(&dev_data->bind);
1178 1417
1179 /* ready */ 1418 /* ready */
1180 spin_unlock(&domain->lock); 1419 spin_unlock(&domain->lock);
1420
1421 return 0;
1181} 1422}
1182 1423
1183/* 1424/*
1184 * If a device is not yet associated with a domain, this function does 1425 * If a device is not yet associated with a domain, this function does
1185 * assigns it visible for the hardware 1426 * assigns it visible for the hardware
1186 */ 1427 */
1187static void attach_device(struct amd_iommu *iommu, 1428static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1429 struct protection_domain *domain)
1189 u16 devid)
1190{ 1430{
1191 unsigned long flags; 1431 unsigned long flags;
1432 int ret;
1192 1433
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1434 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1435 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1436 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1437
1197 /* 1438 /*
@@ -1199,96 +1440,125 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1440 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1441 * here to evict all dirty stuff.
1201 */ 1442 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1443 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1444
1445 return ret;
1204} 1446}
1205 1447
1206/* 1448/*
1207 * Removes a device from a protection domain (unlocked) 1449 * Removes a device from a protection domain (unlocked)
1208 */ 1450 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1451static void __detach_device(struct device *dev)
1210{ 1452{
1453 struct iommu_dev_data *dev_data = get_dev_data(dev);
1454 struct iommu_dev_data *alias_data;
1455 unsigned long flags;
1211 1456
1212 /* lock domain */ 1457 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1458
1215 /* remove domain from the lookup table */ 1459 spin_lock_irqsave(&dev_data->domain->lock, flags);
1216 amd_iommu_pd_table[devid] = NULL;
1217 1460
1218 /* remove entry from the device table seen by the hardware */ 1461 if (dev_data->alias != dev) {
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; 1462 alias_data = get_dev_data(dev_data->alias);
1220 amd_iommu_dev_table[devid].data[1] = 0; 1463 if (atomic_dec_and_test(&alias_data->bind))
1221 amd_iommu_dev_table[devid].data[2] = 0; 1464 do_detach(dev_data->alias);
1465 }
1222 1466
1223 /* decrease reference counter */ 1467 if (atomic_dec_and_test(&dev_data->bind))
1224 domain->dev_cnt -= 1; 1468 do_detach(dev);
1225 1469
1226 /* ready */ 1470 spin_unlock_irqrestore(&dev_data->domain->lock, flags);
1227 spin_unlock(&domain->lock);
1228 1471
1229 /* 1472 /*
1230 * If we run in passthrough mode the device must be assigned to the 1473 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain 1474 * passthrough domain if it is detached from any other domain
1232 */ 1475 */
1233 if (iommu_pass_through) { 1476 if (iommu_pass_through && dev_data->domain == NULL)
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1477 __attach_device(dev, pt_domain);
1235 __attach_device(iommu, pt_domain, devid);
1236 }
1237} 1478}
1238 1479
1239/* 1480/*
1240 * Removes a device from a protection domain (with devtable_lock held) 1481 * Removes a device from a protection domain (with devtable_lock held)
1241 */ 1482 */
1242static void detach_device(struct protection_domain *domain, u16 devid) 1483static void detach_device(struct device *dev)
1243{ 1484{
1244 unsigned long flags; 1485 unsigned long flags;
1245 1486
1246 /* lock device table */ 1487 /* lock device table */
1247 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1488 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1248 __detach_device(domain, devid); 1489 __detach_device(dev);
1249 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1490 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1250} 1491}
1251 1492
1493/*
1494 * Find out the protection domain structure for a given PCI device. This
1495 * will give us the pointer to the page table root for example.
1496 */
1497static struct protection_domain *domain_for_device(struct device *dev)
1498{
1499 struct protection_domain *dom;
1500 struct iommu_dev_data *dev_data, *alias_data;
1501 unsigned long flags;
1502 u16 devid, alias;
1503
1504 devid = get_device_id(dev);
1505 alias = amd_iommu_alias_table[devid];
1506 dev_data = get_dev_data(dev);
1507 alias_data = get_dev_data(dev_data->alias);
1508 if (!alias_data)
1509 return NULL;
1510
1511 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1512 dom = dev_data->domain;
1513 if (dom == NULL &&
1514 alias_data->domain != NULL) {
1515 __attach_device(dev, alias_data->domain);
1516 dom = alias_data->domain;
1517 }
1518
1519 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1520
1521 return dom;
1522}
1523
1252static int device_change_notifier(struct notifier_block *nb, 1524static int device_change_notifier(struct notifier_block *nb,
1253 unsigned long action, void *data) 1525 unsigned long action, void *data)
1254{ 1526{
1255 struct device *dev = data; 1527 struct device *dev = data;
1256 struct pci_dev *pdev = to_pci_dev(dev); 1528 u16 devid;
1257 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1258 struct protection_domain *domain; 1529 struct protection_domain *domain;
1259 struct dma_ops_domain *dma_domain; 1530 struct dma_ops_domain *dma_domain;
1260 struct amd_iommu *iommu; 1531 struct amd_iommu *iommu;
1261 unsigned long flags; 1532 unsigned long flags;
1262 1533
1263 if (devid > amd_iommu_last_bdf) 1534 if (!check_device(dev))
1264 goto out; 1535 return 0;
1265
1266 devid = amd_iommu_alias_table[devid];
1267
1268 iommu = amd_iommu_rlookup_table[devid];
1269 if (iommu == NULL)
1270 goto out;
1271
1272 domain = domain_for_device(devid);
1273 1536
1274 if (domain && !dma_ops_domain(domain)) 1537 devid = get_device_id(dev);
1275 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1538 iommu = amd_iommu_rlookup_table[devid];
1276 "to a non-dma-ops domain\n", dev_name(dev));
1277 1539
1278 switch (action) { 1540 switch (action) {
1279 case BUS_NOTIFY_UNBOUND_DRIVER: 1541 case BUS_NOTIFY_UNBOUND_DRIVER:
1542
1543 domain = domain_for_device(dev);
1544
1280 if (!domain) 1545 if (!domain)
1281 goto out; 1546 goto out;
1282 if (iommu_pass_through) 1547 if (iommu_pass_through)
1283 break; 1548 break;
1284 detach_device(domain, devid); 1549 detach_device(dev);
1285 break; 1550 break;
1286 case BUS_NOTIFY_ADD_DEVICE: 1551 case BUS_NOTIFY_ADD_DEVICE:
1552
1553 iommu_init_device(dev);
1554
1555 domain = domain_for_device(dev);
1556
1287 /* allocate a protection domain if a device is added */ 1557 /* allocate a protection domain if a device is added */
1288 dma_domain = find_protection_domain(devid); 1558 dma_domain = find_protection_domain(devid);
1289 if (dma_domain) 1559 if (dma_domain)
1290 goto out; 1560 goto out;
1291 dma_domain = dma_ops_domain_alloc(iommu); 1561 dma_domain = dma_ops_domain_alloc();
1292 if (!dma_domain) 1562 if (!dma_domain)
1293 goto out; 1563 goto out;
1294 dma_domain->target_dev = devid; 1564 dma_domain->target_dev = devid;
@@ -1298,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb,
1298 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1568 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1299 1569
1300 break; 1570 break;
1571 case BUS_NOTIFY_DEL_DEVICE:
1572
1573 iommu_uninit_device(dev);
1574
1301 default: 1575 default:
1302 goto out; 1576 goto out;
1303 } 1577 }
1304 1578
1305 iommu_queue_inv_dev_entry(iommu, devid); 1579 iommu_flush_device(dev);
1306 iommu_completion_wait(iommu); 1580 iommu_completion_wait(iommu);
1307 1581
1308out: 1582out:
@@ -1320,106 +1594,46 @@ static struct notifier_block device_nb = {
1320 *****************************************************************************/ 1594 *****************************************************************************/
1321 1595
1322/* 1596/*
1323 * This function checks if the driver got a valid device from the caller to
1324 * avoid dereferencing invalid pointers.
1325 */
1326static bool check_device(struct device *dev)
1327{
1328 if (!dev || !dev->dma_mask)
1329 return false;
1330
1331 return true;
1332}
1333
1334/*
1335 * In this function the list of preallocated protection domains is traversed to
1336 * find the domain for a specific device
1337 */
1338static struct dma_ops_domain *find_protection_domain(u16 devid)
1339{
1340 struct dma_ops_domain *entry, *ret = NULL;
1341 unsigned long flags;
1342
1343 if (list_empty(&iommu_pd_list))
1344 return NULL;
1345
1346 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1347
1348 list_for_each_entry(entry, &iommu_pd_list, list) {
1349 if (entry->target_dev == devid) {
1350 ret = entry;
1351 break;
1352 }
1353 }
1354
1355 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1356
1357 return ret;
1358}
1359
1360/*
1361 * In the dma_ops path we only have the struct device. This function 1597 * In the dma_ops path we only have the struct device. This function
1362 * finds the corresponding IOMMU, the protection domain and the 1598 * finds the corresponding IOMMU, the protection domain and the
1363 * requestor id for a given device. 1599 * requestor id for a given device.
1364 * If the device is not yet associated with a domain this is also done 1600 * If the device is not yet associated with a domain this is also done
1365 * in this function. 1601 * in this function.
1366 */ 1602 */
1367static int get_device_resources(struct device *dev, 1603static struct protection_domain *get_domain(struct device *dev)
1368 struct amd_iommu **iommu,
1369 struct protection_domain **domain,
1370 u16 *bdf)
1371{ 1604{
1605 struct protection_domain *domain;
1372 struct dma_ops_domain *dma_dom; 1606 struct dma_ops_domain *dma_dom;
1373 struct pci_dev *pcidev; 1607 u16 devid = get_device_id(dev);
1374 u16 _bdf;
1375
1376 *iommu = NULL;
1377 *domain = NULL;
1378 *bdf = 0xffff;
1379
1380 if (dev->bus != &pci_bus_type)
1381 return 0;
1382 1608
1383 pcidev = to_pci_dev(dev); 1609 if (!check_device(dev))
1384 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 1610 return ERR_PTR(-EINVAL);
1385
1386 /* device not translated by any IOMMU in the system? */
1387 if (_bdf > amd_iommu_last_bdf)
1388 return 0;
1389 1611
1390 *bdf = amd_iommu_alias_table[_bdf]; 1612 domain = domain_for_device(dev);
1613 if (domain != NULL && !dma_ops_domain(domain))
1614 return ERR_PTR(-EBUSY);
1391 1615
1392 *iommu = amd_iommu_rlookup_table[*bdf]; 1616 if (domain != NULL)
1393 if (*iommu == NULL) 1617 return domain;
1394 return 0;
1395 *domain = domain_for_device(*bdf);
1396 if (*domain == NULL) {
1397 dma_dom = find_protection_domain(*bdf);
1398 if (!dma_dom)
1399 dma_dom = (*iommu)->default_dom;
1400 *domain = &dma_dom->domain;
1401 attach_device(*iommu, *domain, *bdf);
1402 DUMP_printk("Using protection domain %d for device %s\n",
1403 (*domain)->id, dev_name(dev));
1404 }
1405 1618
1406 if (domain_for_device(_bdf) == NULL) 1619 /* Device not bount yet - bind it */
1407 attach_device(*iommu, *domain, _bdf); 1620 dma_dom = find_protection_domain(devid);
1621 if (!dma_dom)
1622 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1623 attach_device(dev, &dma_dom->domain);
1624 DUMP_printk("Using protection domain %d for device %s\n",
1625 dma_dom->domain.id, dev_name(dev));
1408 1626
1409 return 1; 1627 return &dma_dom->domain;
1410} 1628}
1411 1629
1412static void update_device_table(struct protection_domain *domain) 1630static void update_device_table(struct protection_domain *domain)
1413{ 1631{
1414 unsigned long flags; 1632 struct iommu_dev_data *dev_data;
1415 int i;
1416 1633
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1634 list_for_each_entry(dev_data, &domain->dev_list, list) {
1418 if (amd_iommu_pd_table[i] != domain) 1635 u16 devid = get_device_id(dev_data->dev);
1419 continue; 1636 set_dte_entry(devid, domain);
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 } 1637 }
1424} 1638}
1425 1639
@@ -1429,76 +1643,13 @@ static void update_domain(struct protection_domain *domain)
1429 return; 1643 return;
1430 1644
1431 update_device_table(domain); 1645 update_device_table(domain);
1432 flush_devices_by_domain(domain); 1646 iommu_flush_domain_devices(domain);
1433 iommu_flush_domain(domain->id); 1647 iommu_flush_tlb_pde(domain);
1434 1648
1435 domain->updated = false; 1649 domain->updated = false;
1436} 1650}
1437 1651
1438/* 1652/*
1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1442 */
1443static bool increase_address_space(struct protection_domain *domain,
1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1470{
1471 u64 *pte, *page;
1472 int level;
1473
1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1476
1477 level = domain->mode - 1;
1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1479
1480 while (level > end_lvl) {
1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1487
1488 level -= 1;
1489
1490 pte = IOMMU_PTE_PAGE(*pte);
1491
1492 if (pte_page && level == end_lvl)
1493 *pte_page = pte;
1494
1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1497
1498 return pte;
1499}
1500
1501/*
1502 * This function fetches the PTE for a given address in the aperture 1653 * This function fetches the PTE for a given address in the aperture
1503 */ 1654 */
1504static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1655static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1528 * This is the generic map function. It maps one 4kb page at paddr to 1679 * This is the generic map function. It maps one 4kb page at paddr to
1529 * the given address in the DMA address space for the domain. 1680 * the given address in the DMA address space for the domain.
1530 */ 1681 */
1531static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1682static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1532 struct dma_ops_domain *dom,
1533 unsigned long address, 1683 unsigned long address,
1534 phys_addr_t paddr, 1684 phys_addr_t paddr,
1535 int direction) 1685 int direction)
@@ -1542,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1542 1692
1543 pte = dma_ops_get_pte(dom, address); 1693 pte = dma_ops_get_pte(dom, address);
1544 if (!pte) 1694 if (!pte)
1545 return bad_dma_address; 1695 return DMA_ERROR_CODE;
1546 1696
1547 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1697 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1548 1698
@@ -1563,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1563/* 1713/*
1564 * The generic unmapping function for on page in the DMA address space. 1714 * The generic unmapping function for on page in the DMA address space.
1565 */ 1715 */
1566static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1716static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1567 struct dma_ops_domain *dom,
1568 unsigned long address) 1717 unsigned long address)
1569{ 1718{
1570 struct aperture_range *aperture; 1719 struct aperture_range *aperture;
@@ -1595,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1595 * Must be called with the domain lock held. 1744 * Must be called with the domain lock held.
1596 */ 1745 */
1597static dma_addr_t __map_single(struct device *dev, 1746static dma_addr_t __map_single(struct device *dev,
1598 struct amd_iommu *iommu,
1599 struct dma_ops_domain *dma_dom, 1747 struct dma_ops_domain *dma_dom,
1600 phys_addr_t paddr, 1748 phys_addr_t paddr,
1601 size_t size, 1749 size_t size,
@@ -1623,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev,
1623retry: 1771retry:
1624 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1772 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1625 dma_mask); 1773 dma_mask);
1626 if (unlikely(address == bad_dma_address)) { 1774 if (unlikely(address == DMA_ERROR_CODE)) {
1627 /* 1775 /*
1628 * setting next_address here will let the address 1776 * setting next_address here will let the address
1629 * allocator only scan the new allocated range in the 1777 * allocator only scan the new allocated range in the
@@ -1631,7 +1779,7 @@ retry:
1631 */ 1779 */
1632 dma_dom->next_address = dma_dom->aperture_size; 1780 dma_dom->next_address = dma_dom->aperture_size;
1633 1781
1634 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1782 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1635 goto out; 1783 goto out;
1636 1784
1637 /* 1785 /*
@@ -1643,8 +1791,8 @@ retry:
1643 1791
1644 start = address; 1792 start = address;
1645 for (i = 0; i < pages; ++i) { 1793 for (i = 0; i < pages; ++i) {
1646 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1794 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1647 if (ret == bad_dma_address) 1795 if (ret == DMA_ERROR_CODE)
1648 goto out_unmap; 1796 goto out_unmap;
1649 1797
1650 paddr += PAGE_SIZE; 1798 paddr += PAGE_SIZE;
@@ -1655,10 +1803,10 @@ retry:
1655 ADD_STATS_COUNTER(alloced_io_mem, size); 1803 ADD_STATS_COUNTER(alloced_io_mem, size);
1656 1804
1657 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1805 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1658 iommu_flush_tlb(iommu, dma_dom->domain.id); 1806 iommu_flush_tlb(&dma_dom->domain);
1659 dma_dom->need_flush = false; 1807 dma_dom->need_flush = false;
1660 } else if (unlikely(iommu_has_npcache(iommu))) 1808 } else if (unlikely(amd_iommu_np_cache))
1661 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1809 iommu_flush_pages(&dma_dom->domain, address, size);
1662 1810
1663out: 1811out:
1664 return address; 1812 return address;
@@ -1667,20 +1815,19 @@ out_unmap:
1667 1815
1668 for (--i; i >= 0; --i) { 1816 for (--i; i >= 0; --i) {
1669 start -= PAGE_SIZE; 1817 start -= PAGE_SIZE;
1670 dma_ops_domain_unmap(iommu, dma_dom, start); 1818 dma_ops_domain_unmap(dma_dom, start);
1671 } 1819 }
1672 1820
1673 dma_ops_free_addresses(dma_dom, address, pages); 1821 dma_ops_free_addresses(dma_dom, address, pages);
1674 1822
1675 return bad_dma_address; 1823 return DMA_ERROR_CODE;
1676} 1824}
1677 1825
1678/* 1826/*
1679 * Does the reverse of the __map_single function. Must be called with 1827 * Does the reverse of the __map_single function. Must be called with
1680 * the domain lock held too 1828 * the domain lock held too
1681 */ 1829 */
1682static void __unmap_single(struct amd_iommu *iommu, 1830static void __unmap_single(struct dma_ops_domain *dma_dom,
1683 struct dma_ops_domain *dma_dom,
1684 dma_addr_t dma_addr, 1831 dma_addr_t dma_addr,
1685 size_t size, 1832 size_t size,
1686 int dir) 1833 int dir)
@@ -1688,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1688 dma_addr_t i, start; 1835 dma_addr_t i, start;
1689 unsigned int pages; 1836 unsigned int pages;
1690 1837
1691 if ((dma_addr == bad_dma_address) || 1838 if ((dma_addr == DMA_ERROR_CODE) ||
1692 (dma_addr + size > dma_dom->aperture_size)) 1839 (dma_addr + size > dma_dom->aperture_size))
1693 return; 1840 return;
1694 1841
@@ -1697,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1697 start = dma_addr; 1844 start = dma_addr;
1698 1845
1699 for (i = 0; i < pages; ++i) { 1846 for (i = 0; i < pages; ++i) {
1700 dma_ops_domain_unmap(iommu, dma_dom, start); 1847 dma_ops_domain_unmap(dma_dom, start);
1701 start += PAGE_SIZE; 1848 start += PAGE_SIZE;
1702 } 1849 }
1703 1850
@@ -1706,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1706 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1853 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1707 1854
1708 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1855 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1709 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1856 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1710 dma_dom->need_flush = false; 1857 dma_dom->need_flush = false;
1711 } 1858 }
1712} 1859}
@@ -1720,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1720 struct dma_attrs *attrs) 1867 struct dma_attrs *attrs)
1721{ 1868{
1722 unsigned long flags; 1869 unsigned long flags;
1723 struct amd_iommu *iommu;
1724 struct protection_domain *domain; 1870 struct protection_domain *domain;
1725 u16 devid;
1726 dma_addr_t addr; 1871 dma_addr_t addr;
1727 u64 dma_mask; 1872 u64 dma_mask;
1728 phys_addr_t paddr = page_to_phys(page) + offset; 1873 phys_addr_t paddr = page_to_phys(page) + offset;
1729 1874
1730 INC_STATS_COUNTER(cnt_map_single); 1875 INC_STATS_COUNTER(cnt_map_single);
1731 1876
1732 if (!check_device(dev)) 1877 domain = get_domain(dev);
1733 return bad_dma_address; 1878 if (PTR_ERR(domain) == -EINVAL)
1734
1735 dma_mask = *dev->dma_mask;
1736
1737 get_device_resources(dev, &iommu, &domain, &devid);
1738
1739 if (iommu == NULL || domain == NULL)
1740 /* device not handled by any AMD IOMMU */
1741 return (dma_addr_t)paddr; 1879 return (dma_addr_t)paddr;
1880 else if (IS_ERR(domain))
1881 return DMA_ERROR_CODE;
1742 1882
1743 if (!dma_ops_domain(domain)) 1883 dma_mask = *dev->dma_mask;
1744 return bad_dma_address;
1745 1884
1746 spin_lock_irqsave(&domain->lock, flags); 1885 spin_lock_irqsave(&domain->lock, flags);
1747 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1886
1887 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1748 dma_mask); 1888 dma_mask);
1749 if (addr == bad_dma_address) 1889 if (addr == DMA_ERROR_CODE)
1750 goto out; 1890 goto out;
1751 1891
1752 iommu_completion_wait(iommu); 1892 iommu_flush_complete(domain);
1753 1893
1754out: 1894out:
1755 spin_unlock_irqrestore(&domain->lock, flags); 1895 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1764 enum dma_data_direction dir, struct dma_attrs *attrs) 1904 enum dma_data_direction dir, struct dma_attrs *attrs)
1765{ 1905{
1766 unsigned long flags; 1906 unsigned long flags;
1767 struct amd_iommu *iommu;
1768 struct protection_domain *domain; 1907 struct protection_domain *domain;
1769 u16 devid;
1770 1908
1771 INC_STATS_COUNTER(cnt_unmap_single); 1909 INC_STATS_COUNTER(cnt_unmap_single);
1772 1910
1773 if (!check_device(dev) || 1911 domain = get_domain(dev);
1774 !get_device_resources(dev, &iommu, &domain, &devid)) 1912 if (IS_ERR(domain))
1775 /* device not handled by any AMD IOMMU */
1776 return;
1777
1778 if (!dma_ops_domain(domain))
1779 return; 1913 return;
1780 1914
1781 spin_lock_irqsave(&domain->lock, flags); 1915 spin_lock_irqsave(&domain->lock, flags);
1782 1916
1783 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1917 __unmap_single(domain->priv, dma_addr, size, dir);
1784 1918
1785 iommu_completion_wait(iommu); 1919 iommu_flush_complete(domain);
1786 1920
1787 spin_unlock_irqrestore(&domain->lock, flags); 1921 spin_unlock_irqrestore(&domain->lock, flags);
1788} 1922}
@@ -1814,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1814 struct dma_attrs *attrs) 1948 struct dma_attrs *attrs)
1815{ 1949{
1816 unsigned long flags; 1950 unsigned long flags;
1817 struct amd_iommu *iommu;
1818 struct protection_domain *domain; 1951 struct protection_domain *domain;
1819 u16 devid;
1820 int i; 1952 int i;
1821 struct scatterlist *s; 1953 struct scatterlist *s;
1822 phys_addr_t paddr; 1954 phys_addr_t paddr;
@@ -1825,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1825 1957
1826 INC_STATS_COUNTER(cnt_map_sg); 1958 INC_STATS_COUNTER(cnt_map_sg);
1827 1959
1828 if (!check_device(dev)) 1960 domain = get_domain(dev);
1961 if (PTR_ERR(domain) == -EINVAL)
1962 return map_sg_no_iommu(dev, sglist, nelems, dir);
1963 else if (IS_ERR(domain))
1829 return 0; 1964 return 0;
1830 1965
1831 dma_mask = *dev->dma_mask; 1966 dma_mask = *dev->dma_mask;
1832 1967
1833 get_device_resources(dev, &iommu, &domain, &devid);
1834
1835 if (!iommu || !domain)
1836 return map_sg_no_iommu(dev, sglist, nelems, dir);
1837
1838 if (!dma_ops_domain(domain))
1839 return 0;
1840
1841 spin_lock_irqsave(&domain->lock, flags); 1968 spin_lock_irqsave(&domain->lock, flags);
1842 1969
1843 for_each_sg(sglist, s, nelems, i) { 1970 for_each_sg(sglist, s, nelems, i) {
1844 paddr = sg_phys(s); 1971 paddr = sg_phys(s);
1845 1972
1846 s->dma_address = __map_single(dev, iommu, domain->priv, 1973 s->dma_address = __map_single(dev, domain->priv,
1847 paddr, s->length, dir, false, 1974 paddr, s->length, dir, false,
1848 dma_mask); 1975 dma_mask);
1849 1976
@@ -1854,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1854 goto unmap; 1981 goto unmap;
1855 } 1982 }
1856 1983
1857 iommu_completion_wait(iommu); 1984 iommu_flush_complete(domain);
1858 1985
1859out: 1986out:
1860 spin_unlock_irqrestore(&domain->lock, flags); 1987 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +1990,7 @@ out:
1863unmap: 1990unmap:
1864 for_each_sg(sglist, s, mapped_elems, i) { 1991 for_each_sg(sglist, s, mapped_elems, i) {
1865 if (s->dma_address) 1992 if (s->dma_address)
1866 __unmap_single(iommu, domain->priv, s->dma_address, 1993 __unmap_single(domain->priv, s->dma_address,
1867 s->dma_length, dir); 1994 s->dma_length, dir);
1868 s->dma_address = s->dma_length = 0; 1995 s->dma_address = s->dma_length = 0;
1869 } 1996 }
@@ -1882,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1882 struct dma_attrs *attrs) 2009 struct dma_attrs *attrs)
1883{ 2010{
1884 unsigned long flags; 2011 unsigned long flags;
1885 struct amd_iommu *iommu;
1886 struct protection_domain *domain; 2012 struct protection_domain *domain;
1887 struct scatterlist *s; 2013 struct scatterlist *s;
1888 u16 devid;
1889 int i; 2014 int i;
1890 2015
1891 INC_STATS_COUNTER(cnt_unmap_sg); 2016 INC_STATS_COUNTER(cnt_unmap_sg);
1892 2017
1893 if (!check_device(dev) || 2018 domain = get_domain(dev);
1894 !get_device_resources(dev, &iommu, &domain, &devid)) 2019 if (IS_ERR(domain))
1895 return;
1896
1897 if (!dma_ops_domain(domain))
1898 return; 2020 return;
1899 2021
1900 spin_lock_irqsave(&domain->lock, flags); 2022 spin_lock_irqsave(&domain->lock, flags);
1901 2023
1902 for_each_sg(sglist, s, nelems, i) { 2024 for_each_sg(sglist, s, nelems, i) {
1903 __unmap_single(iommu, domain->priv, s->dma_address, 2025 __unmap_single(domain->priv, s->dma_address,
1904 s->dma_length, dir); 2026 s->dma_length, dir);
1905 s->dma_address = s->dma_length = 0; 2027 s->dma_address = s->dma_length = 0;
1906 } 2028 }
1907 2029
1908 iommu_completion_wait(iommu); 2030 iommu_flush_complete(domain);
1909 2031
1910 spin_unlock_irqrestore(&domain->lock, flags); 2032 spin_unlock_irqrestore(&domain->lock, flags);
1911} 2033}
@@ -1918,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1918{ 2040{
1919 unsigned long flags; 2041 unsigned long flags;
1920 void *virt_addr; 2042 void *virt_addr;
1921 struct amd_iommu *iommu;
1922 struct protection_domain *domain; 2043 struct protection_domain *domain;
1923 u16 devid;
1924 phys_addr_t paddr; 2044 phys_addr_t paddr;
1925 u64 dma_mask = dev->coherent_dma_mask; 2045 u64 dma_mask = dev->coherent_dma_mask;
1926 2046
1927 INC_STATS_COUNTER(cnt_alloc_coherent); 2047 INC_STATS_COUNTER(cnt_alloc_coherent);
1928 2048
1929 if (!check_device(dev)) 2049 domain = get_domain(dev);
2050 if (PTR_ERR(domain) == -EINVAL) {
2051 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2052 *dma_addr = __pa(virt_addr);
2053 return virt_addr;
2054 } else if (IS_ERR(domain))
1930 return NULL; 2055 return NULL;
1931 2056
1932 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2057 dma_mask = dev->coherent_dma_mask;
1933 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2058 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2059 flag |= __GFP_ZERO;
1934 2060
1935 flag |= __GFP_ZERO;
1936 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2061 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1937 if (!virt_addr) 2062 if (!virt_addr)
1938 return NULL; 2063 return NULL;
1939 2064
1940 paddr = virt_to_phys(virt_addr); 2065 paddr = virt_to_phys(virt_addr);
1941 2066
1942 if (!iommu || !domain) {
1943 *dma_addr = (dma_addr_t)paddr;
1944 return virt_addr;
1945 }
1946
1947 if (!dma_ops_domain(domain))
1948 goto out_free;
1949
1950 if (!dma_mask) 2067 if (!dma_mask)
1951 dma_mask = *dev->dma_mask; 2068 dma_mask = *dev->dma_mask;
1952 2069
1953 spin_lock_irqsave(&domain->lock, flags); 2070 spin_lock_irqsave(&domain->lock, flags);
1954 2071
1955 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2072 *dma_addr = __map_single(dev, domain->priv, paddr,
1956 size, DMA_BIDIRECTIONAL, true, dma_mask); 2073 size, DMA_BIDIRECTIONAL, true, dma_mask);
1957 2074
1958 if (*dma_addr == bad_dma_address) { 2075 if (*dma_addr == DMA_ERROR_CODE) {
1959 spin_unlock_irqrestore(&domain->lock, flags); 2076 spin_unlock_irqrestore(&domain->lock, flags);
1960 goto out_free; 2077 goto out_free;
1961 } 2078 }
1962 2079
1963 iommu_completion_wait(iommu); 2080 iommu_flush_complete(domain);
1964 2081
1965 spin_unlock_irqrestore(&domain->lock, flags); 2082 spin_unlock_irqrestore(&domain->lock, flags);
1966 2083
@@ -1980,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size,
1980 void *virt_addr, dma_addr_t dma_addr) 2097 void *virt_addr, dma_addr_t dma_addr)
1981{ 2098{
1982 unsigned long flags; 2099 unsigned long flags;
1983 struct amd_iommu *iommu;
1984 struct protection_domain *domain; 2100 struct protection_domain *domain;
1985 u16 devid;
1986 2101
1987 INC_STATS_COUNTER(cnt_free_coherent); 2102 INC_STATS_COUNTER(cnt_free_coherent);
1988 2103
1989 if (!check_device(dev)) 2104 domain = get_domain(dev);
1990 return; 2105 if (IS_ERR(domain))
1991
1992 get_device_resources(dev, &iommu, &domain, &devid);
1993
1994 if (!iommu || !domain)
1995 goto free_mem;
1996
1997 if (!dma_ops_domain(domain))
1998 goto free_mem; 2106 goto free_mem;
1999 2107
2000 spin_lock_irqsave(&domain->lock, flags); 2108 spin_lock_irqsave(&domain->lock, flags);
2001 2109
2002 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2110 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2003 2111
2004 iommu_completion_wait(iommu); 2112 iommu_flush_complete(domain);
2005 2113
2006 spin_unlock_irqrestore(&domain->lock, flags); 2114 spin_unlock_irqrestore(&domain->lock, flags);
2007 2115
@@ -2015,22 +2123,7 @@ free_mem:
2015 */ 2123 */
2016static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2124static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2017{ 2125{
2018 u16 bdf; 2126 return check_device(dev);
2019 struct pci_dev *pcidev;
2020
2021 /* No device or no PCI device */
2022 if (!dev || dev->bus != &pci_bus_type)
2023 return 0;
2024
2025 pcidev = to_pci_dev(dev);
2026
2027 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2028
2029 /* Out of our scope? */
2030 if (bdf > amd_iommu_last_bdf)
2031 return 0;
2032
2033 return 1;
2034} 2127}
2035 2128
2036/* 2129/*
@@ -2044,25 +2137,30 @@ static void prealloc_protection_domains(void)
2044{ 2137{
2045 struct pci_dev *dev = NULL; 2138 struct pci_dev *dev = NULL;
2046 struct dma_ops_domain *dma_dom; 2139 struct dma_ops_domain *dma_dom;
2047 struct amd_iommu *iommu;
2048 u16 devid; 2140 u16 devid;
2049 2141
2050 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2142 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2051 devid = calc_devid(dev->bus->number, dev->devfn); 2143
2052 if (devid > amd_iommu_last_bdf) 2144 /* Do we handle this device? */
2053 continue; 2145 if (!check_device(&dev->dev))
2054 devid = amd_iommu_alias_table[devid];
2055 if (domain_for_device(devid))
2056 continue; 2146 continue;
2057 iommu = amd_iommu_rlookup_table[devid]; 2147
2058 if (!iommu) 2148 iommu_init_device(&dev->dev);
2149
2150 /* Is there already any domain for it? */
2151 if (domain_for_device(&dev->dev))
2059 continue; 2152 continue;
2060 dma_dom = dma_ops_domain_alloc(iommu); 2153
2154 devid = get_device_id(&dev->dev);
2155
2156 dma_dom = dma_ops_domain_alloc();
2061 if (!dma_dom) 2157 if (!dma_dom)
2062 continue; 2158 continue;
2063 init_unity_mappings_for_device(dma_dom, devid); 2159 init_unity_mappings_for_device(dma_dom, devid);
2064 dma_dom->target_dev = devid; 2160 dma_dom->target_dev = devid;
2065 2161
2162 attach_device(&dev->dev, &dma_dom->domain);
2163
2066 list_add_tail(&dma_dom->list, &iommu_pd_list); 2164 list_add_tail(&dma_dom->list, &iommu_pd_list);
2067 } 2165 }
2068} 2166}
@@ -2091,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void)
2091 * protection domain will be assigned to the default one. 2189 * protection domain will be assigned to the default one.
2092 */ 2190 */
2093 for_each_iommu(iommu) { 2191 for_each_iommu(iommu) {
2094 iommu->default_dom = dma_ops_domain_alloc(iommu); 2192 iommu->default_dom = dma_ops_domain_alloc();
2095 if (iommu->default_dom == NULL) 2193 if (iommu->default_dom == NULL)
2096 return -ENOMEM; 2194 return -ENOMEM;
2097 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2195 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void)
2101 } 2199 }
2102 2200
2103 /* 2201 /*
2104 * If device isolation is enabled, pre-allocate the protection 2202 * Pre-allocate the protection domains for each device.
2105 * domains for each device.
2106 */ 2203 */
2107 if (amd_iommu_isolate) 2204 prealloc_protection_domains();
2108 prealloc_protection_domains();
2109 2205
2110 iommu_detected = 1; 2206 iommu_detected = 1;
2111 force_iommu = 1; 2207 swiotlb = 0;
2112 bad_dma_address = 0;
2113#ifdef CONFIG_GART_IOMMU 2208#ifdef CONFIG_GART_IOMMU
2114 gart_iommu_aperture_disabled = 1; 2209 gart_iommu_aperture_disabled = 1;
2115 gart_iommu_aperture = 0; 2210 gart_iommu_aperture = 0;
@@ -2148,14 +2243,17 @@ free_domains:
2148 2243
2149static void cleanup_domain(struct protection_domain *domain) 2244static void cleanup_domain(struct protection_domain *domain)
2150{ 2245{
2246 struct iommu_dev_data *dev_data, *next;
2151 unsigned long flags; 2247 unsigned long flags;
2152 u16 devid;
2153 2248
2154 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2249 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2155 2250
2156 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2251 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2157 if (amd_iommu_pd_table[devid] == domain) 2252 struct device *dev = dev_data->dev;
2158 __detach_device(domain, devid); 2253
2254 do_detach(dev);
2255 atomic_set(&dev_data->bind, 0);
2256 }
2159 2257
2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2258 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2161} 2259}
@@ -2165,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain)
2165 if (!domain) 2263 if (!domain)
2166 return; 2264 return;
2167 2265
2266 del_domain_from_list(domain);
2267
2168 if (domain->id) 2268 if (domain->id)
2169 domain_id_free(domain->id); 2269 domain_id_free(domain->id);
2170 2270
@@ -2183,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void)
2183 domain->id = domain_id_alloc(); 2283 domain->id = domain_id_alloc();
2184 if (!domain->id) 2284 if (!domain->id)
2185 goto out_err; 2285 goto out_err;
2286 INIT_LIST_HEAD(&domain->dev_list);
2287
2288 add_domain_to_list(domain);
2186 2289
2187 return domain; 2290 return domain;
2188 2291
@@ -2239,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2239static void amd_iommu_detach_device(struct iommu_domain *dom, 2342static void amd_iommu_detach_device(struct iommu_domain *dom,
2240 struct device *dev) 2343 struct device *dev)
2241{ 2344{
2242 struct protection_domain *domain = dom->priv; 2345 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2243 struct amd_iommu *iommu; 2346 struct amd_iommu *iommu;
2244 struct pci_dev *pdev;
2245 u16 devid; 2347 u16 devid;
2246 2348
2247 if (dev->bus != &pci_bus_type) 2349 if (!check_device(dev))
2248 return; 2350 return;
2249 2351
2250 pdev = to_pci_dev(dev); 2352 devid = get_device_id(dev);
2251
2252 devid = calc_devid(pdev->bus->number, pdev->devfn);
2253 2353
2254 if (devid > 0) 2354 if (dev_data->domain != NULL)
2255 detach_device(domain, devid); 2355 detach_device(dev);
2256 2356
2257 iommu = amd_iommu_rlookup_table[devid]; 2357 iommu = amd_iommu_rlookup_table[devid];
2258 if (!iommu) 2358 if (!iommu)
2259 return; 2359 return;
2260 2360
2261 iommu_queue_inv_dev_entry(iommu, devid); 2361 iommu_flush_device(dev);
2262 iommu_completion_wait(iommu); 2362 iommu_completion_wait(iommu);
2263} 2363}
2264 2364
@@ -2266,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2266 struct device *dev) 2366 struct device *dev)
2267{ 2367{
2268 struct protection_domain *domain = dom->priv; 2368 struct protection_domain *domain = dom->priv;
2269 struct protection_domain *old_domain; 2369 struct iommu_dev_data *dev_data;
2270 struct amd_iommu *iommu; 2370 struct amd_iommu *iommu;
2271 struct pci_dev *pdev; 2371 int ret;
2272 u16 devid; 2372 u16 devid;
2273 2373
2274 if (dev->bus != &pci_bus_type) 2374 if (!check_device(dev))
2275 return -EINVAL; 2375 return -EINVAL;
2276 2376
2277 pdev = to_pci_dev(dev); 2377 dev_data = dev->archdata.iommu;
2278 2378
2279 devid = calc_devid(pdev->bus->number, pdev->devfn); 2379 devid = get_device_id(dev);
2280
2281 if (devid >= amd_iommu_last_bdf ||
2282 devid != amd_iommu_alias_table[devid])
2283 return -EINVAL;
2284 2380
2285 iommu = amd_iommu_rlookup_table[devid]; 2381 iommu = amd_iommu_rlookup_table[devid];
2286 if (!iommu) 2382 if (!iommu)
2287 return -EINVAL; 2383 return -EINVAL;
2288 2384
2289 old_domain = domain_for_device(devid); 2385 if (dev_data->domain)
2290 if (old_domain) 2386 detach_device(dev);
2291 detach_device(old_domain, devid);
2292 2387
2293 attach_device(iommu, domain, devid); 2388 ret = attach_device(dev, domain);
2294 2389
2295 iommu_completion_wait(iommu); 2390 iommu_completion_wait(iommu);
2296 2391
2297 return 0; 2392 return ret;
2298} 2393}
2299 2394
2300static int amd_iommu_map_range(struct iommu_domain *dom, 2395static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2340 iova += PAGE_SIZE; 2435 iova += PAGE_SIZE;
2341 } 2436 }
2342 2437
2343 iommu_flush_domain(domain->id); 2438 iommu_flush_tlb_pde(domain);
2344} 2439}
2345 2440
2346static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2441static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = {
2391 2486
2392int __init amd_iommu_init_passthrough(void) 2487int __init amd_iommu_init_passthrough(void)
2393{ 2488{
2489 struct amd_iommu *iommu;
2394 struct pci_dev *dev = NULL; 2490 struct pci_dev *dev = NULL;
2395 u16 devid, devid2; 2491 u16 devid;
2396 2492
2397 /* allocate passthroug domain */ 2493 /* allocate passthroug domain */
2398 pt_domain = protection_domain_alloc(); 2494 pt_domain = protection_domain_alloc();
@@ -2402,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void)
2402 pt_domain->mode |= PAGE_MODE_NONE; 2498 pt_domain->mode |= PAGE_MODE_NONE;
2403 2499
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2500 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406 2501
2407 devid = calc_devid(dev->bus->number, dev->devfn); 2502 if (!check_device(&dev->dev))
2408 if (devid > amd_iommu_last_bdf)
2409 continue; 2503 continue;
2410 2504
2411 devid2 = amd_iommu_alias_table[devid]; 2505 devid = get_device_id(&dev->dev);
2412 2506
2413 iommu = amd_iommu_rlookup_table[devid2]; 2507 iommu = amd_iommu_rlookup_table[devid];
2414 if (!iommu) 2508 if (!iommu)
2415 continue; 2509 continue;
2416 2510
2417 __attach_device(iommu, pt_domain, devid); 2511 attach_device(&dev->dev, pt_domain);
2418 __attach_device(iommu, pt_domain, devid2);
2419 } 2512 }
2420 2513
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2514 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dcc..7ffc39965233 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,10 +25,12 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs*/
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * List of protection domains - used during resume
142 */
143LIST_HEAD(amd_iommu_pd_list);
144spinlock_t amd_iommu_pd_lock;
145
138/* 146/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 147 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 148 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 165struct amd_iommu **amd_iommu_rlookup_table;
158 166
159/* 167/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap 168 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
167 * to know which ones are already in use. 169 * to know which ones are already in use.
168 */ 170 */
@@ -240,7 +242,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
240 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 242 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
241} 243}
242 244
243static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) 245static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244{ 246{
245 u32 ctrl; 247 u32 ctrl;
246 248
@@ -519,6 +521,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
519 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 521 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
520} 522}
521 523
524static int get_dev_entry_bit(u16 devid, u8 bit)
525{
526 int i = (bit >> 5) & 0x07;
527 int _bit = bit & 0x1f;
528
529 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
530}
531
532
533void amd_iommu_apply_erratum_63(u16 devid)
534{
535 int sysmgt;
536
537 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
538 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
539
540 if (sysmgt == 0x01)
541 set_dev_entry_bit(devid, DEV_ENTRY_IW);
542}
543
522/* Writes the specific IOMMU for a device into the rlookup table */ 544/* Writes the specific IOMMU for a device into the rlookup table */
523static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 545static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
524{ 546{
@@ -547,6 +569,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
547 if (flags & ACPI_DEVFLAG_LINT1) 569 if (flags & ACPI_DEVFLAG_LINT1)
548 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 570 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
549 571
572 amd_iommu_apply_erratum_63(devid);
573
550 set_iommu_for_device(iommu, devid); 574 set_iommu_for_device(iommu, devid);
551} 575}
552 576
@@ -816,7 +840,18 @@ static void __init free_iommu_all(void)
816static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 840static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
817{ 841{
818 spin_lock_init(&iommu->lock); 842 spin_lock_init(&iommu->lock);
843
844 /* Add IOMMU to internal data structures */
819 list_add_tail(&iommu->list, &amd_iommu_list); 845 list_add_tail(&iommu->list, &amd_iommu_list);
846 iommu->index = amd_iommus_present++;
847
848 if (unlikely(iommu->index >= MAX_IOMMUS)) {
849 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
850 return -ENOSYS;
851 }
852
853 /* Index is fine - add IOMMU to the array */
854 amd_iommus[iommu->index] = iommu;
820 855
821 /* 856 /*
822 * Copy data from ACPI table entry to the iommu struct 857 * Copy data from ACPI table entry to the iommu struct
@@ -846,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
846 init_iommu_from_acpi(iommu, h); 881 init_iommu_from_acpi(iommu, h);
847 init_iommu_devices(iommu); 882 init_iommu_devices(iommu);
848 883
884 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
885 amd_iommu_np_cache = true;
886
849 return pci_enable_device(iommu->dev); 887 return pci_enable_device(iommu->dev);
850} 888}
851 889
@@ -903,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
903 * 941 *
904 ****************************************************************************/ 942 ****************************************************************************/
905 943
906static int __init iommu_setup_msi(struct amd_iommu *iommu) 944static int iommu_setup_msi(struct amd_iommu *iommu)
907{ 945{
908 int r; 946 int r;
909 947
@@ -1154,19 +1192,10 @@ static struct sys_device device_amd_iommu = {
1154 * functions. Finally it prints some information about AMD IOMMUs and 1192 * functions. Finally it prints some information about AMD IOMMUs and
1155 * the driver state and enables the hardware. 1193 * the driver state and enables the hardware.
1156 */ 1194 */
1157int __init amd_iommu_init(void) 1195static int __init amd_iommu_init(void)
1158{ 1196{
1159 int i, ret = 0; 1197 int i, ret = 0;
1160 1198
1161
1162 if (no_iommu) {
1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1164 return 0;
1165 }
1166
1167 if (!amd_iommu_detected)
1168 return -ENODEV;
1169
1170 /* 1199 /*
1171 * First parse ACPI tables to find the largest Bus/Dev/Func 1200 * First parse ACPI tables to find the largest Bus/Dev/Func
1172 * we need to handle. Upon this information the shared data 1201 * we need to handle. Upon this information the shared data
@@ -1203,15 +1232,6 @@ int __init amd_iommu_init(void)
1203 if (amd_iommu_rlookup_table == NULL) 1232 if (amd_iommu_rlookup_table == NULL)
1204 goto free; 1233 goto free;
1205 1234
1206 /*
1207 * Protection Domain table - maps devices to protection domains
1208 * This table has the same size as the rlookup_table
1209 */
1210 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1211 get_order(rlookup_table_size));
1212 if (amd_iommu_pd_table == NULL)
1213 goto free;
1214
1215 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1235 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1216 GFP_KERNEL | __GFP_ZERO, 1236 GFP_KERNEL | __GFP_ZERO,
1217 get_order(MAX_DOMAIN_ID/8)); 1237 get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1253,8 @@ int __init amd_iommu_init(void)
1233 */ 1253 */
1234 amd_iommu_pd_alloc_bitmap[0] = 1; 1254 amd_iommu_pd_alloc_bitmap[0] = 1;
1235 1255
1256 spin_lock_init(&amd_iommu_pd_lock);
1257
1236 /* 1258 /*
1237 * now the data structures are allocated and basically initialized 1259 * now the data structures are allocated and basically initialized
1238 * start the real acpi table scan 1260 * start the real acpi table scan
@@ -1264,17 +1286,12 @@ int __init amd_iommu_init(void)
1264 if (iommu_pass_through) 1286 if (iommu_pass_through)
1265 goto out; 1287 goto out;
1266 1288
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1268 if (amd_iommu_isolate)
1269 printk("enabled\n");
1270 else
1271 printk("disabled\n");
1272
1273 if (amd_iommu_unmap_flush) 1289 if (amd_iommu_unmap_flush)
1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1290 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1275 else 1291 else
1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1292 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1277 1293
1294 x86_platform.iommu_shutdown = disable_iommus;
1278out: 1295out:
1279 return ret; 1296 return ret;
1280 1297
@@ -1282,9 +1299,6 @@ free:
1282 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1299 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1283 get_order(MAX_DOMAIN_ID/8)); 1300 get_order(MAX_DOMAIN_ID/8));
1284 1301
1285 free_pages((unsigned long)amd_iommu_pd_table,
1286 get_order(rlookup_table_size));
1287
1288 free_pages((unsigned long)amd_iommu_rlookup_table, 1302 free_pages((unsigned long)amd_iommu_rlookup_table,
1289 get_order(rlookup_table_size)); 1303 get_order(rlookup_table_size));
1290 1304
@@ -1301,11 +1315,6 @@ free:
1301 goto out; 1315 goto out;
1302} 1316}
1303 1317
1304void amd_iommu_shutdown(void)
1305{
1306 disable_iommus();
1307}
1308
1309/**************************************************************************** 1318/****************************************************************************
1310 * 1319 *
1311 * Early detect code. This code runs at IOMMU detection time in the DMA 1320 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1320 1329
1321void __init amd_iommu_detect(void) 1330void __init amd_iommu_detect(void)
1322{ 1331{
1323 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1332 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1324 return; 1333 return;
1325 1334
1326 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1335 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1327 iommu_detected = 1; 1336 iommu_detected = 1;
1328 amd_iommu_detected = 1; 1337 amd_iommu_detected = 1;
1329#ifdef CONFIG_GART_IOMMU 1338 x86_init.iommu.iommu_init = amd_iommu_init;
1330 gart_iommu_aperture_disabled = 1;
1331 gart_iommu_aperture = 0;
1332#endif
1333 } 1339 }
1334} 1340}
1335 1341
@@ -1350,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str)
1350static int __init parse_amd_iommu_options(char *str) 1356static int __init parse_amd_iommu_options(char *str)
1351{ 1357{
1352 for (; *str; ++str) { 1358 for (; *str; ++str) {
1353 if (strncmp(str, "isolate", 7) == 0)
1354 amd_iommu_isolate = true;
1355 if (strncmp(str, "share", 5) == 0)
1356 amd_iommu_isolate = false;
1357 if (strncmp(str, "fullflush", 9) == 0) 1359 if (strncmp(str, "fullflush", 9) == 0)
1358 amd_iommu_unmap_flush = true; 1360 amd_iommu_unmap_flush = true;
1359 } 1361 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..e0dfb6856aa2 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
33int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
@@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void)
400 401
401 iommu_detected = 1; 402 iommu_detected = 1;
402 gart_iommu_aperture = 1; 403 gart_iommu_aperture = 1;
404 x86_init.iommu.iommu_init = gart_iommu_init;
403 405
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 406 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 407 aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,7 +458,7 @@ out:
456 458
457 if (aper_alloc) { 459 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 460 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) { 461 } else if (!valid_agp) {
460 /* Do nothing */ 462 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 463 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 464 force_iommu ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..ad8c75b9e453 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -241,28 +241,13 @@ static int modern_apic(void)
241} 241}
242 242
243/* 243/*
244 * bare function to substitute write operation 244 * right after this call apic become NOOP driven
245 * and it's _that_ fast :) 245 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 246 */
262void apic_disable(void) 247void apic_disable(void)
263{ 248{
264 apic->read = native_apic_read_dummy; 249 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 250 apic = &apic_noop;
266} 251}
267 252
268void native_apic_wait_icr_idle(void) 253void native_apic_wait_icr_idle(void)
@@ -459,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 444 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 445 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 446 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 447 apic_write(APIC_TMICT, 0);
463 break; 448 break;
464 case CLOCK_EVT_MODE_RESUME: 449 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 450 /* Nothing to do here */
@@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1377 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1378 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1379 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1380 int dmar_table_init_ret;
1396 1381
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1382 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1383 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1384 return;
1401 dmar_table_init_ret);
1402#endif
1403 1385
1404 ioapic_entries = alloc_ioapic_entries(); 1386 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1387 if (!ioapic_entries) {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..d9acc3bee0f4
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though in case if apic is disabled (for some reason) we try
8 * to not uglify the caller's code and allow to call (some) apic routines
9 * like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we would be really "pedantic"
85 * we should pass read_apic_id() here
86 * but since NOOP suppose APIC ID = 0
87 * lets save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..38dcecfa5818 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -230,7 +225,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 225 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 226 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 227 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 228 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 229 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 230 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 231 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..e85f8fb7f8e7 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 466 return cpumask_of(smp_processor_id());
467} 467}
468 468
469static unsigned long 469static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 470{
472 return 0; 471 return 0;
473} 472}
473
474static unsigned long es7000_check_apicid_present(int bit) 474static unsigned long es7000_check_apicid_present(int bit)
475{ 475{
476 return physid_isset(bit, phys_cpu_present_map); 476 return physid_isset(bit, phys_cpu_present_map);
@@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 539
540static int cpu_id; 540static int cpu_id;
541 541
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 542static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 543{
544 physid_mask_t mask; 544 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 545 ++cpu_id;
548
549 return mask;
550} 546}
551 547
552/* Mapping from cpu number to logical apicid */ 548/* Mapping from cpu number to logical apicid */
@@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 557#endif
562} 558}
563 559
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 560static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 561{
566 /* For clustered we don't have a good way to do this yet - hack */ 562 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 563 physids_promote(0xFFL, retmap);
568} 564}
569 565
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 566static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..c0b4468683f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h> 62#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 63
66#include <asm/apic.h> 64#include <asm/apic.h>
67 65
@@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 138 return pin;
141} 139}
142 140
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 142#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 143static struct irq_cfg irq_cfgx[] = {
@@ -209,7 +193,7 @@ int __init arch_early_irq_init(void)
209} 193}
210 194
211#ifdef CONFIG_SPARSE_IRQ 195#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 196struct irq_cfg *irq_cfg(unsigned int irq)
213{ 197{
214 struct irq_cfg *cfg = NULL; 198 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 199 struct irq_desc *desc;
@@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 345/* end for move_irq_desc */
362 346
363#else 347#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 348struct irq_cfg *irq_cfg(unsigned int irq)
365{ 349{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 350 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 351}
@@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 539 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 540}
557 541
542static void __io_apic_modify_irq(struct irq_pin_list *entry,
543 int mask_and, int mask_or,
544 void (*final)(struct irq_pin_list *entry))
545{
546 unsigned int reg, pin;
547
548 pin = entry->pin;
549 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
550 reg &= mask_and;
551 reg |= mask_or;
552 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
553 if (final)
554 final(entry);
555}
556
558static void io_apic_modify_irq(struct irq_cfg *cfg, 557static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 558 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 559 void (*final)(struct irq_pin_list *entry))
561{ 560{
562 int pin;
563 struct irq_pin_list *entry; 561 struct irq_pin_list *entry;
564 562
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 563 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 564 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 565}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 566
569 reg &= mask_and; 567static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 568{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 569 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 570 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 571}
574 } 572
573static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
574{
575 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
576 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 577}
576 578
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 579static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 598}
597 599
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 600static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 601{
612 struct irq_cfg *cfg = desc->chip_data; 602 struct irq_cfg *cfg = desc->chip_data;
@@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1177 int cpu, err; 1167 int cpu, err;
1178 cpumask_var_t tmp_mask; 1168 cpumask_var_t tmp_mask;
1179 1169
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1170 if (cfg->move_in_progress)
1181 return -EBUSY; 1171 return -EBUSY;
1182 1172
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1173 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1237,8 +1227,7 @@ next:
1237 return err; 1227 return err;
1238} 1228}
1239 1229
1240static int 1230int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1231{
1243 int err; 1232 int err;
1244 unsigned long flags; 1233 unsigned long flags;
@@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1588 struct irq_desc *desc;
1600 unsigned int irq; 1589 unsigned int irq;
1601 1590
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1591 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1592 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1593 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1694{
1709 int i; 1695 int i;
1710 1696
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1697 printk(KERN_DEBUG);
1715 1698
1716 for (i = 0; i < 8; i++) 1699 for (i = 0; i < 8; i++)
@@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1707 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1708 u64 icr;
1726 1709
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1710 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1711 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1712 v = apic_read(APIC_ID);
@@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1804 printk("\n");
1825} 1805}
1826 1806
1827__apicdebuginit(void) print_all_local_APICs(void) 1807__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1808{
1829 int cpu; 1809 int cpu;
1830 1810
1811 if (!maxcpu)
1812 return;
1813
1831 preempt_disable(); 1814 preempt_disable();
1832 for_each_online_cpu(cpu) 1815 for_each_online_cpu(cpu) {
1816 if (cpu >= maxcpu)
1817 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1818 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1819 }
1834 preempt_enable(); 1820 preempt_enable();
1835} 1821}
1836 1822
@@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1825 unsigned int v;
1840 unsigned long flags; 1826 unsigned long flags;
1841 1827
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1828 if (!nr_legacy_irqs)
1843 return; 1829 return;
1844 1830
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1831 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1852 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1853}
1868 1854
1869__apicdebuginit(int) print_all_ICs(void) 1855static int __initdata show_lapic = 1;
1856static __init int setup_show_lapic(char *arg)
1870{ 1857{
1858 int num = -1;
1859
1860 if (strcmp(arg, "all") == 0) {
1861 show_lapic = CONFIG_NR_CPUS;
1862 } else {
1863 get_option(&arg, &num);
1864 if (num >= 0)
1865 show_lapic = num;
1866 }
1867
1868 return 1;
1869}
1870__setup("show_lapic=", setup_show_lapic);
1871
1872__apicdebuginit(int) print_ICs(void)
1873{
1874 if (apic_verbosity == APIC_QUIET)
1875 return 0;
1876
1871 print_PIC(); 1877 print_PIC();
1872 1878
1873 /* don't print out if apic is not there */ 1879 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1880 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1881 return 0;
1876 1882
1877 print_all_local_APICs(); 1883 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1884 print_IO_APIC();
1879 1885
1880 return 0; 1886 return 0;
1881} 1887}
1882 1888
1883fs_initcall(print_all_ICs); 1889fs_initcall(print_ICs);
1884 1890
1885 1891
1886/* Where if anywhere is the i8259 connect in external int mode */ 1892/* Where if anywhere is the i8259 connect in external int mode */
@@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2037 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2038 * circumvent this idiocy regardless.
2033 */ 2039 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2040 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2041
2036 /* 2042 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2043 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2064 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2065 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2066 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2067 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2068 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2069 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2070 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2079 mp_ioapics[apic_id].apicid = i;
2074 } else { 2080 } else {
2075 physid_mask_t tmp; 2081 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2082 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2083 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2084 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2085 mp_ioapics[apic_id].apicid);
@@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2234 */
2229 2235
2230#ifdef CONFIG_SMP 2236#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2237void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2238{
2233 cpumask_var_t cleanup_mask; 2239 cpumask_var_t cleanup_mask;
2234 2240
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2241 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2242 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2243 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2244 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2245 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2246 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2247 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2248 free_cpumask_var(cleanup_mask);
2247 } 2249 }
@@ -2272,15 +2274,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2274 }
2273} 2275}
2274 2276
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2277/*
2279 * Either sets desc->affinity to a valid value, and returns 2278 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2279 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
2281 * leaves desc->affinity untouched. 2280 * leaves desc->affinity untouched.
2282 */ 2281 */
2283static unsigned int 2282unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2283set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
2285{ 2284{
2286 struct irq_cfg *cfg; 2285 struct irq_cfg *cfg;
@@ -2433,8 +2432,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2433 2432
2434 cfg = irq_cfg(irq); 2433 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2434 spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count)
2437 goto unlock;
2438 2435
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2436 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2440 goto unlock; 2437 goto unlock;
@@ -2452,7 +2449,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2449 goto unlock;
2453 } 2450 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2451 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2452unlock:
2457 spin_unlock(&desc->lock); 2453 spin_unlock(&desc->lock);
2458 } 2454 }
@@ -2460,21 +2456,33 @@ unlock:
2460 irq_exit(); 2456 irq_exit();
2461} 2457}
2462 2458
2463static void irq_complete_move(struct irq_desc **descp) 2459static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2460{
2465 struct irq_desc *desc = *descp; 2461 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2462 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2463 unsigned me;
2468 2464
2469 if (likely(!cfg->move_in_progress)) 2465 if (likely(!cfg->move_in_progress))
2470 return; 2466 return;
2471 2467
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2468 me = smp_processor_id();
2474 2469
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2470 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2471 send_cleanup_vector(cfg);
2477} 2472}
2473
2474static void irq_complete_move(struct irq_desc **descp)
2475{
2476 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2477}
2478
2479void irq_force_complete_move(int irq)
2480{
2481 struct irq_desc *desc = irq_to_desc(irq);
2482 struct irq_cfg *cfg = desc->chip_data;
2483
2484 __irq_complete_move(&desc, cfg->vector);
2485}
2478#else 2486#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2487static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2488#endif
@@ -2490,6 +2498,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2498
2491atomic_t irq_mis_count; 2499atomic_t irq_mis_count;
2492 2500
2501/*
2502 * IO-APIC versions below 0x20 don't support EOI register.
2503 * For the record, here is the information about various versions:
2504 * 0Xh 82489DX
2505 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2506 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2507 * 30h-FFh Reserved
2508 *
2509 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2510 * version as 0x2. This is an error with documentation and these ICH chips
2511 * use io-apic's of version 0x20.
2512 *
2513 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2514 * Otherwise, we simulate the EOI message manually by changing the trigger
2515 * mode to edge and then back to level, with RTE being masked during this.
2516*/
2517static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2518{
2519 struct irq_pin_list *entry;
2520
2521 for_each_irq_pin(entry, cfg->irq_2_pin) {
2522 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2523 /*
2524 * Intr-remapping uses pin number as the virtual vector
2525 * in the RTE. Actual vector is programmed in
2526 * intr-remapping table entry. Hence for the io-apic
2527 * EOI we use the pin number.
2528 */
2529 if (irq_remapped(irq))
2530 io_apic_eoi(entry->apic, entry->pin);
2531 else
2532 io_apic_eoi(entry->apic, cfg->vector);
2533 } else {
2534 __mask_and_edge_IO_APIC_irq(entry);
2535 __unmask_and_level_IO_APIC_irq(entry);
2536 }
2537 }
2538}
2539
2540static void eoi_ioapic_irq(struct irq_desc *desc)
2541{
2542 struct irq_cfg *cfg;
2543 unsigned long flags;
2544 unsigned int irq;
2545
2546 irq = desc->irq;
2547 cfg = desc->chip_data;
2548
2549 spin_lock_irqsave(&ioapic_lock, flags);
2550 __eoi_ioapic_irq(irq, cfg);
2551 spin_unlock_irqrestore(&ioapic_lock, flags);
2552}
2553
2493static void ack_apic_level(unsigned int irq) 2554static void ack_apic_level(unsigned int irq)
2494{ 2555{
2495 struct irq_desc *desc = irq_to_desc(irq); 2556 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2586,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2586 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2587 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2588 * The idea is from Manfred Spraul. --macro
2589 *
2590 * Also in the case when cpu goes offline, fixup_irqs() will forward
2591 * any unhandled interrupt on the offlined cpu to the new cpu
2592 * destination that is handling the corresponding interrupt. This
2593 * interrupt forwarding is done via IPI's. Hence, in this case also
2594 * level-triggered io-apic interrupt will be seen as an edge
2595 * interrupt in the IRR. And we can't rely on the cpu's EOI
2596 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2597 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2598 * supporting EOI register, we do an explicit EOI to clear the
2599 * remote IRR and on IO-APIC's which don't have an EOI register,
2600 * we use the above logic (mask+edge followed by unmask+level) from
2601 * Manfred Spraul to clear the remote IRR.
2528 */ 2602 */
2529 cfg = desc->chip_data; 2603 cfg = desc->chip_data;
2530 i = cfg->vector; 2604 i = cfg->vector;
@@ -2536,6 +2610,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2610 */
2537 ack_APIC_irq(); 2611 ack_APIC_irq();
2538 2612
2613 /*
2614 * Tail end of clearing remote IRR bit (either by delivering the EOI
2615 * message via io-apic EOI register write or simulating it using
2616 * mask+edge followed by unnask+level logic) manually when the
2617 * level triggered interrupt is seen as the edge triggered interrupt
2618 * at the cpu.
2619 */
2620 if (!(v & (1 << (i & 0x1f)))) {
2621 atomic_inc(&irq_mis_count);
2622
2623 eoi_ioapic_irq(desc);
2624 }
2625
2539 /* Now we can move and renable the irq */ 2626 /* Now we can move and renable the irq */
2540 if (unlikely(do_unmask_irq)) { 2627 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2628 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2656,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2656 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2657 unmask_IO_APIC_irq_desc(desc);
2571 } 2658 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2659}
2582 2660
2583#ifdef CONFIG_INTR_REMAP 2661#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2662static void ir_ack_apic_edge(unsigned int irq)
2608{ 2663{
2609 ack_APIC_irq(); 2664 ack_APIC_irq();
@@ -3157,6 +3212,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3212 continue;
3158 3213
3159 desc_new = move_irq_desc(desc_new, node); 3214 desc_new = move_irq_desc(desc_new, node);
3215 cfg_new = desc_new->chip_data;
3160 3216
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3217 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3218 irq = new;
@@ -3708,75 +3764,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3764}
3709#endif /* CONFIG_HT_IRQ */ 3765#endif /* CONFIG_HT_IRQ */
3710 3766
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3767int __init io_apic_get_redir_entries (int ioapic)
3781{ 3768{
3782 union IO_APIC_reg_01 reg_01; 3769 union IO_APIC_reg_01 reg_01;
@@ -3944,7 +3931,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 3931 */
3945 3932
3946 if (physids_empty(apic_id_map)) 3933 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 3934 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 3935
3949 spin_lock_irqsave(&ioapic_lock, flags); 3936 spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 3937 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3960,10 +3947,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 3947 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 3948 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 3949 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 3950 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 3951
3965 for (i = 0; i < get_physical_broadcast(); i++) { 3952 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 3953 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 3954 break;
3968 } 3955 }
3969 3956
@@ -3976,7 +3963,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 3963 apic_id = i;
3977 } 3964 }
3978 3965
3979 tmp = apic->apicid_to_cpu_present(apic_id); 3966 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 3967 physids_or(apic_id_map, apic_id_map, tmp);
3981 3968
3982 if (reg_00.bits.ID != apic_id) { 3969 if (reg_00.bits.ID != apic_id) {
@@ -4106,7 +4093,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4093 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4094 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4095 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4096 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4097 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4098 }
4112 4099
@@ -4140,18 +4127,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4127#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4128fake_ioapic_page:
4142#endif 4129#endif
4143 ioapic_phys = (unsigned long) 4130 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4131 ioapic_phys = __pa(ioapic_phys);
4146 } 4132 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4133 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4134 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4135 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4136 ioapic_phys);
4151 idx++; 4137 idx++;
4152 4138
4153 ioapic_res->start = ioapic_phys; 4139 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4140 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4141 ioapic_res++;
4156 } 4142 }
4157} 4143}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..6389432a9dbf 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask __read_mostly; 42/* For reliability, we're prepared to waste bits here. */
43static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 44
44/* nmi_active: 45/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 415 }
415 416
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 417 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
419 420
420 spin_lock(&lock); 421 spin_lock(&lock);
@@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
422 show_regs(regs); 423 show_regs(regs);
423 dump_stack(); 424 dump_stack();
424 spin_unlock(&lock); 425 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 427
427 rc = 1; 428 rc = 1;
428 } 429 }
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 559{
559 int i; 560 int i;
560 561
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 562 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 563
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 564 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 565 apic->send_IPI_all(NMI_VECTOR);
565 566
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 567 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 568 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 569 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 570 break;
570 mdelay(1); 571 mdelay(1);
571 } 572 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
@@ -334,10 +329,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
334 return cpu_all_mask; 329 return cpu_all_mask;
335} 330}
336 331
337static inline unsigned long 332static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
338numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
339{ 333{
340 return physid_isset(apicid, bitmap); 334 return physid_isset(apicid, *map);
341} 335}
342 336
343static inline unsigned long numaq_check_apicid_present(int bit) 337static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +365,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
371 return apic != 0 && irq == 0; 365 return apic != 0 && irq == 0;
372} 366}
373 367
374static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 368static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
375{ 369{
376 /* We don't have a good way to do this yet - hack */ 370 /* We don't have a good way to do this yet - hack */
377 return physids_promote(0xFUL); 371 return physids_promote(0xFUL, retmap);
378} 372}
379 373
380static inline int numaq_cpu_to_logical_apicid(int cpu) 374static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +396,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
402 return logical_apicid >> 4; 396 return logical_apicid >> 4;
403} 397}
404 398
405static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 399static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
406{ 400{
407 int node = numaq_apicid_to_node(logical_apicid); 401 int node = numaq_apicid_to_node(logical_apicid);
408 int cpu = __ffs(logical_apicid & 0xf); 402 int cpu = __ffs(logical_apicid & 0xf);
409 403
410 return physid_mask_of_physid(cpu + 4*node); 404 physid_set_mask_of_physid(cpu + 4*node, retmap);
411} 405}
412 406
413/* Where the IO area was mapped on multiquad, always 0 otherwise */ 407/* Where the IO area was mapped on multiquad, always 0 otherwise */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..1a6559f6768c 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -108,7 +108,7 @@ struct apic apic_default = {
108 .apicid_to_node = default_apicid_to_node, 108 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 110 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 111 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 112 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 113 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 114 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 326c25477d3d..b684bb303cbf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/ipi.h> 31#include <asm/ipi.h>
32#include <asm/smp.h> 32#include <asm/smp.h>
33#include <asm/x86_init.h>
33 34
34DEFINE_PER_CPU(int, x2apic_extra_bits); 35DEFINE_PER_CPU(int, x2apic_extra_bits);
35 36
36static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr;
39
40static inline bool is_GRU_range(u64 start, u64 end)
41{
42 return start >= gru_start_paddr && end <= gru_end_paddr;
43}
44
45static bool uv_is_untracked_pat_range(u64 start, u64 end)
46{
47 return is_ISA_range(start, end) || is_GRU_range(start, end);
48}
37 49
38static int early_get_nodeid(void) 50static int early_get_nodeid(void)
39{ 51{
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 62{
51 if (!strcmp(oem_id, "SGI")) { 63 if (!strcmp(oem_id, "SGI")) {
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
52 if (!strcmp(oem_table_id, "UVL")) 65 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 66 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 67 else if (!strcmp(oem_table_id, "UVX"))
@@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 398 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 399
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 400 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 401 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 402 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
403 gru_start_paddr = ((u64)gru.s.base << shift);
404 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
405
406 }
390} 407}
391 408
392static __init void map_mmr_high(int max_pnode) 409static __init void map_mmr_high(int max_pnode)
@@ -409,6 +426,12 @@ static __init void map_mmioh_high(int max_pnode)
409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 426 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
410} 427}
411 428
429static __init void map_low_mmrs(void)
430{
431 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
432 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
433}
434
412static __init void uv_rtc_init(void) 435static __init void uv_rtc_init(void)
413{ 436{
414 long status; 437 long status;
@@ -550,6 +573,8 @@ void __init uv_system_init(void)
550 unsigned long mmr_base, present, paddr; 573 unsigned long mmr_base, present, paddr;
551 unsigned short pnode_mask; 574 unsigned short pnode_mask;
552 575
576 map_low_mmrs();
577
553 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 578 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
554 m_val = m_n_config.s.m_skt; 579 m_val = m_n_config.s.m_skt;
555 n_val = m_n_config.s.n_skt; 580 n_val = m_n_config.s.n_skt;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..b5b6b23bce53 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..7128b3799cec 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -535,7 +535,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 535 }
536 } 536 }
537 537
538 display_cacheinfo(c); 538 cpu_detect_cache_sizes(c);
539 539
540 /* Multi core CPU? */ 540 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 541 if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b4a567..c1afa990a6c8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -659,24 +654,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 654 const struct cpu_dev *const *cdev;
660 int count = 0; 655 int count = 0;
661 656
657#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 658 printk(KERN_INFO "KERNEL supported cpus:\n");
659#endif
660
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 661 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 662 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 663
667 if (count >= X86_VENDOR_NUM) 664 if (count >= X86_VENDOR_NUM)
668 break; 665 break;
669 cpu_devs[count] = cpudev; 666 cpu_devs[count] = cpudev;
670 count++; 667 count++;
671 668
672 for (j = 0; j < 2; j++) { 669#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 670 {
674 continue; 671 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 672
676 cpudev->c_ident[j]); 673 for (j = 0; j < 2; j++) {
674 if (!cpudev->c_ident[j])
675 continue;
676 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
677 cpudev->c_ident[j]);
678 }
677 } 679 }
680#endif
678 } 681 }
679
680 early_identify_cpu(&boot_cpu_data); 682 early_identify_cpu(&boot_cpu_data);
681} 683}
682 684
@@ -837,10 +839,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 839 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 840 }
839 841
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 842 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 843 mcheck_cpu_init(c);
843#endif
844 844
845 select_idle_routine(c); 845 select_idle_routine(c);
846 846
@@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1136 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1137 barrier();
1138 1138
1139 check_efer(); 1139 x86_configure_nx();
1140 if (cpu != 0) 1140 if (cpu != 0)
1141 enable_x2apic(); 1141 enable_x2apic();
1142 1142
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7d5c3b0ea8da..8b581d3905cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -526,15 +526,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
526 526
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) 527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{ 528{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf 529 /* Intel Xeon Processor 7100 Series Specification Update
530 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an 531 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause 532 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/ 533 * Both Processor Cores to Lock Up. */
533 if (c->x86_vendor == X86_VENDOR_INTEL) { 534 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) && 535 if ((c->x86 == 15) &&
535 (c->x86_model == 6) && 536 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable()) 537 (c->x86_mask == 8)) {
538 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
539 "Xeon(R) 7100 Errata AL30, processors may "
540 "lock up on frequency changes: disabling "
541 "acpi-cpufreq.\n");
537 return -ENODEV; 542 return -ENODEV;
543 }
538 } 544 }
539 return 0; 545 return 0;
540} 546}
@@ -549,13 +555,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
549 unsigned int result = 0; 555 unsigned int result = 0;
550 struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 556 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
551 struct acpi_processor_performance *perf; 557 struct acpi_processor_performance *perf;
558#ifdef CONFIG_SMP
559 static int blacklisted;
560#endif
552 561
553 dprintk("acpi_cpufreq_cpu_init\n"); 562 dprintk("acpi_cpufreq_cpu_init\n");
554 563
555#ifdef CONFIG_SMP 564#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c); 565 if (blacklisted)
557 if (result) 566 return blacklisted;
558 return result; 567 blacklisted = acpi_cpufreq_blacklist(c);
568 if (blacklisted)
569 return blacklisted;
559#endif 570#endif
560 571
561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 572 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..cabd2fa3fc93 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); 813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break; 814 break;
815 case 1 ... 15: 815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V1; 816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) { 817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2; 818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]"; 819 cpuname = "C3 'Samuel 2' [C5B]";
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..3f12dabeab52 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
1022 * set it to 1 to avoid problems in the future. 1022 * set it to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug. 1023 * For all others it's a BIOS bug.
1024 */ 1024 */
1025 if (!boot_cpu_data.x86 == 0x11) 1025 if (boot_cpu_data.x86 != 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1027 "latency\n");
1028 max_latency = 1; 1028 max_latency = 1;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..3ae5a7a3a500 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
232 return 0; 232 return 0;
233} 233}
234 234
235struct get_freq_data { 235static void get_freq_data(void *_speed)
236 unsigned int speed;
237 unsigned int processor;
238};
239
240static void get_freq_data(void *_data)
241{ 236{
242 struct get_freq_data *data = _data; 237 unsigned int *speed = _speed;
243 238
244 data->speed = speedstep_get_frequency(data->processor); 239 *speed = speedstep_get_frequency(speedstep_processor);
245} 240}
246 241
247static unsigned int speedstep_get(unsigned int cpu) 242static unsigned int speedstep_get(unsigned int cpu)
248{ 243{
249 struct get_freq_data data = { .processor = cpu }; 244 unsigned int speed;
250 245
251 /* You're supposed to ensure CPU is online. */ 246 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) 247 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
253 BUG(); 248 BUG();
254 249
255 dprintk("detected %u kHz as current frequency\n", data.speed); 250 dprintk("detected %u kHz as current frequency\n", speed);
256 return data.speed; 251 return speed;
257} 252}
258 253
259/** 254/**
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formally known as the GX2) */ 373 /* Handle the GX (Formally known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..c900b73f9224 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 263 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 264 for now. */
265 node = apicid_to_node[apicid]; 265 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 266 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 267 node = first_node(node_online_map);
268 else if (!node_online(node)) {
269 /* reuse the value from init_cpu_to_node() */
270 node = cpu_to_node(cpu);
271 }
268 numa_set_node(cpu, node); 272 numa_set_node(cpu, node);
269 273
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); 274 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 804c40e2bc3e..6c40f6b5b340 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
@@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 108 { 0x00, 0, 0}
106}; 109};
107 110
@@ -488,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 491#endif
489 } 492 }
490 493
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 494 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 495
509 return l2; 496 return l2;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 721a77ca8115..d7ebf25d10ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 88static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 89static int cpu_missing;
87 90
88static void default_decode_mce(struct mce *m) 91/*
92 * CPU/chipset specific EDAC code can register a notifier call here to print
93 * MCE errors in a human-readable form.
94 */
95ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
96EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
97
98static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data)
89{ 100{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 101 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
103
104 return NOTIFY_STOP;
92} 105}
93 106
94/* 107static struct notifier_block mce_dec_nb = {
95 * CPU/chipset specific EDAC code can register a callback here to print 108 .notifier_call = default_decode_mce,
96 * MCE errors in a human-readable form: 109 .priority = -1,
97 */ 110};
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
100 111
101/* MCA banks polled by the period polling timer for corrected events */ 112/* MCA banks polled by the period polling timer for corrected events */
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 113DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
141{ 152{
142 unsigned next, entry; 153 unsigned next, entry;
143 154
155 /* Emit the trace record: */
156 trace_mce_record(mce);
157
144 mce->finished = 0; 158 mce->finished = 0;
145 wmb(); 159 wmb();
146 for (;;) { 160 for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
204 218
205 /* 219 /*
206 * Print out human-readable details about the MCE error, 220 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that): 221 * (if the CPU has an implementation for that)
208 */ 222 */
209 x86_mce_decode_callback(m); 223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
210} 224}
211 225
212static void print_mce_head(void) 226static void print_mce_head(void)
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1136static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1123static DEFINE_PER_CPU(struct timer_list, mce_timer); 1137static DEFINE_PER_CPU(struct timer_list, mce_timer);
1124 1138
1125static void mcheck_timer(unsigned long data) 1139static void mce_start_timer(unsigned long data)
1126{ 1140{
1127 struct timer_list *t = &per_cpu(mce_timer, data); 1141 struct timer_list *t = &per_cpu(mce_timer, data);
1128 int *n; 1142 int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
1187} 1201}
1188EXPORT_SYMBOL_GPL(mce_notify_irq); 1202EXPORT_SYMBOL_GPL(mce_notify_irq);
1189 1203
1190static int mce_banks_init(void) 1204static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1191{ 1205{
1192 int i; 1206 int i;
1193 1207
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
1206/* 1220/*
1207 * Initialize Machine Checks for a CPU. 1221 * Initialize Machine Checks for a CPU.
1208 */ 1222 */
1209static int __cpuinit mce_cap_init(void) 1223static int __cpuinit __mcheck_cpu_cap_init(void)
1210{ 1224{
1211 unsigned b; 1225 unsigned b;
1212 u64 cap; 1226 u64 cap;
@@ -1228,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
1228 WARN_ON(banks != 0 && b != banks); 1242 WARN_ON(banks != 0 && b != banks);
1229 banks = b; 1243 banks = b;
1230 if (!mce_banks) { 1244 if (!mce_banks) {
1231 int err = mce_banks_init(); 1245 int err = __mcheck_cpu_mce_banks_init();
1232 1246
1233 if (err) 1247 if (err)
1234 return err; 1248 return err;
@@ -1244,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
1244 return 0; 1258 return 0;
1245} 1259}
1246 1260
1247static void mce_init(void) 1261static void __mcheck_cpu_init_generic(void)
1248{ 1262{
1249 mce_banks_t all_banks; 1263 mce_banks_t all_banks;
1250 u64 cap; 1264 u64 cap;
@@ -1273,7 +1287,7 @@ static void mce_init(void)
1273} 1287}
1274 1288
1275/* Add per CPU specific workarounds here */ 1289/* Add per CPU specific workarounds here */
1276static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1290static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1277{ 1291{
1278 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1292 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1279 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1293 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1341 return 0; 1355 return 0;
1342} 1356}
1343 1357
1344static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1358static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1345{ 1359{
1346 if (c->x86 != 5) 1360 if (c->x86 != 5)
1347 return; 1361 return;
@@ -1355,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1355 } 1369 }
1356} 1370}
1357 1371
1358static void mce_cpu_features(struct cpuinfo_x86 *c) 1372static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1359{ 1373{
1360 switch (c->x86_vendor) { 1374 switch (c->x86_vendor) {
1361 case X86_VENDOR_INTEL: 1375 case X86_VENDOR_INTEL:
@@ -1369,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1369 } 1383 }
1370} 1384}
1371 1385
1372static void mce_init_timer(void) 1386static void __mcheck_cpu_init_timer(void)
1373{ 1387{
1374 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1375 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
@@ -1380,7 +1394,7 @@ static void mce_init_timer(void)
1380 *n = check_interval * HZ; 1394 *n = check_interval * HZ;
1381 if (!*n) 1395 if (!*n)
1382 return; 1396 return;
1383 setup_timer(t, mcheck_timer, smp_processor_id()); 1397 setup_timer(t, mce_start_timer, smp_processor_id());
1384 t->expires = round_jiffies(jiffies + *n); 1398 t->expires = round_jiffies(jiffies + *n);
1385 add_timer_on(t, smp_processor_id()); 1399 add_timer_on(t, smp_processor_id());
1386} 1400}
@@ -1400,27 +1414,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1400 * Called for each booted CPU to set up machine checks. 1414 * Called for each booted CPU to set up machine checks.
1401 * Must be called with preempt off: 1415 * Must be called with preempt off:
1402 */ 1416 */
1403void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1417void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1404{ 1418{
1405 if (mce_disabled) 1419 if (mce_disabled)
1406 return; 1420 return;
1407 1421
1408 mce_ancient_init(c); 1422 __mcheck_cpu_ancient_init(c);
1409 1423
1410 if (!mce_available(c)) 1424 if (!mce_available(c))
1411 return; 1425 return;
1412 1426
1413 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1427 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1414 mce_disabled = 1; 1428 mce_disabled = 1;
1415 return; 1429 return;
1416 } 1430 }
1417 1431
1418 machine_check_vector = do_machine_check; 1432 machine_check_vector = do_machine_check;
1419 1433
1420 mce_init(); 1434 __mcheck_cpu_init_generic();
1421 mce_cpu_features(c); 1435 __mcheck_cpu_init_vendor(c);
1422 mce_init_timer(); 1436 __mcheck_cpu_init_timer();
1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1437 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1438
1424} 1439}
1425 1440
1426/* 1441/*
@@ -1640,6 +1655,15 @@ static int __init mcheck_enable(char *str)
1640} 1655}
1641__setup("mce", mcheck_enable); 1656__setup("mce", mcheck_enable);
1642 1657
1658int __init mcheck_init(void)
1659{
1660 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1661
1662 mcheck_intel_therm_init();
1663
1664 return 0;
1665}
1666
1643/* 1667/*
1644 * Sysfs support 1668 * Sysfs support
1645 */ 1669 */
@@ -1648,7 +1672,7 @@ __setup("mce", mcheck_enable);
1648 * Disable machine checks on suspend and shutdown. We can't really handle 1672 * Disable machine checks on suspend and shutdown. We can't really handle
1649 * them later. 1673 * them later.
1650 */ 1674 */
1651static int mce_disable(void) 1675static int mce_disable_error_reporting(void)
1652{ 1676{
1653 int i; 1677 int i;
1654 1678
@@ -1663,12 +1687,12 @@ static int mce_disable(void)
1663 1687
1664static int mce_suspend(struct sys_device *dev, pm_message_t state) 1688static int mce_suspend(struct sys_device *dev, pm_message_t state)
1665{ 1689{
1666 return mce_disable(); 1690 return mce_disable_error_reporting();
1667} 1691}
1668 1692
1669static int mce_shutdown(struct sys_device *dev) 1693static int mce_shutdown(struct sys_device *dev)
1670{ 1694{
1671 return mce_disable(); 1695 return mce_disable_error_reporting();
1672} 1696}
1673 1697
1674/* 1698/*
@@ -1678,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev)
1678 */ 1702 */
1679static int mce_resume(struct sys_device *dev) 1703static int mce_resume(struct sys_device *dev)
1680{ 1704{
1681 mce_init(); 1705 __mcheck_cpu_init_generic();
1682 mce_cpu_features(&current_cpu_data); 1706 __mcheck_cpu_init_vendor(&current_cpu_data);
1683 1707
1684 return 0; 1708 return 0;
1685} 1709}
@@ -1689,8 +1713,8 @@ static void mce_cpu_restart(void *data)
1689 del_timer_sync(&__get_cpu_var(mce_timer)); 1713 del_timer_sync(&__get_cpu_var(mce_timer));
1690 if (!mce_available(&current_cpu_data)) 1714 if (!mce_available(&current_cpu_data))
1691 return; 1715 return;
1692 mce_init(); 1716 __mcheck_cpu_init_generic();
1693 mce_init_timer(); 1717 __mcheck_cpu_init_timer();
1694} 1718}
1695 1719
1696/* Reinit MCEs after user configuration changes */ 1720/* Reinit MCEs after user configuration changes */
@@ -1716,7 +1740,7 @@ static void mce_enable_ce(void *all)
1716 cmci_reenable(); 1740 cmci_reenable();
1717 cmci_recheck(); 1741 cmci_recheck();
1718 if (all) 1742 if (all)
1719 mce_init_timer(); 1743 __mcheck_cpu_init_timer();
1720} 1744}
1721 1745
1722static struct sysdev_class mce_sysclass = { 1746static struct sysdev_class mce_sysclass = {
@@ -1929,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1929} 1953}
1930 1954
1931/* Make sure there are no machine checks on offlined CPUs. */ 1955/* Make sure there are no machine checks on offlined CPUs. */
1932static void mce_disable_cpu(void *h) 1956static void __cpuinit mce_disable_cpu(void *h)
1933{ 1957{
1934 unsigned long action = *(unsigned long *)h; 1958 unsigned long action = *(unsigned long *)h;
1935 int i; 1959 int i;
1936 1960
1937 if (!mce_available(&current_cpu_data)) 1961 if (!mce_available(&current_cpu_data))
1938 return; 1962 return;
1963
1939 if (!(action & CPU_TASKS_FROZEN)) 1964 if (!(action & CPU_TASKS_FROZEN))
1940 cmci_clear(); 1965 cmci_clear();
1941 for (i = 0; i < banks; i++) { 1966 for (i = 0; i < banks; i++) {
@@ -1946,7 +1971,7 @@ static void mce_disable_cpu(void *h)
1946 } 1971 }
1947} 1972}
1948 1973
1949static void mce_reenable_cpu(void *h) 1974static void __cpuinit mce_reenable_cpu(void *h)
1950{ 1975{
1951 unsigned long action = *(unsigned long *)h; 1976 unsigned long action = *(unsigned long *)h;
1952 int i; 1977 int i;
@@ -1991,9 +2016,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1991 break; 2016 break;
1992 case CPU_DOWN_FAILED: 2017 case CPU_DOWN_FAILED:
1993 case CPU_DOWN_FAILED_FROZEN: 2018 case CPU_DOWN_FAILED_FROZEN:
1994 t->expires = round_jiffies(jiffies + 2019 if (!mce_ignore_ce && check_interval) {
2020 t->expires = round_jiffies(jiffies +
1995 __get_cpu_var(mce_next_interval)); 2021 __get_cpu_var(mce_next_interval));
1996 add_timer_on(t, cpu); 2022 add_timer_on(t, cpu);
2023 }
1997 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2024 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1998 break; 2025 break;
1999 case CPU_POST_DEAD: 2026 case CPU_POST_DEAD:
@@ -2025,7 +2052,7 @@ static __init void mce_init_banks(void)
2025 } 2052 }
2026} 2053}
2027 2054
2028static __init int mce_init_device(void) 2055static __init int mcheck_init_device(void)
2029{ 2056{
2030 int err; 2057 int err;
2031 int i = 0; 2058 int i = 0;
@@ -2053,7 +2080,7 @@ static __init int mce_init_device(void)
2053 return err; 2080 return err;
2054} 2081}
2055 2082
2056device_initcall(mce_init_device); 2083device_initcall(mcheck_init_device);
2057 2084
2058/* 2085/*
2059 * Old style boot options parsing. Only for compatibility. 2086 * Old style boot options parsing. Only for compatibility.
@@ -2101,7 +2128,7 @@ static int fake_panic_set(void *data, u64 val)
2101DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2128DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2102 fake_panic_set, "%llu\n"); 2129 fake_panic_set, "%llu\n");
2103 2130
2104static int __init mce_debugfs_init(void) 2131static int __init mcheck_debugfs_init(void)
2105{ 2132{
2106 struct dentry *dmce, *ffake_panic; 2133 struct dentry *dmce, *ffake_panic;
2107 2134
@@ -2115,5 +2142,5 @@ static int __init mce_debugfs_init(void)
2115 2142
2116 return 0; 2143 return 0;
2117} 2144}
2118late_initcall(mce_debugfs_init); 2145late_initcall(mcheck_debugfs_init);
2119#endif 2146#endif
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..4fef985fc221 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259void __init mcheck_intel_therm_init(void)
260{
261 /*
262 * This function is only called on boot CPU. Save the init thermal
263 * LVT value on BSP and use that value to restore APs' thermal LVT
264 * entry BIOS programmed later
265 */
266 if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) &&
267 cpu_has(&boot_cpu_data, X86_FEATURE_ACC))
268 lvtthmr_init = apic_read(APIC_LVTTHMR);
269}
270
257void intel_init_thermal(struct cpuinfo_x86 *c) 271void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 272{
259 unsigned int cpu = smp_processor_id(); 273 unsigned int cpu = smp_processor_id();
@@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 284 * since it might be delivered via SMI already:
271 */ 285 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 286 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 287
288 /*
289 * The initial value of thermal LVT entries on all APs always reads
290 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
291 * sequence to them and LVT registers are reset to 0s except for
292 * the mask bits which are set to 1s when APs receive INIT IPI.
293 * Always restore the value that BIOS has programmed on AP based on
294 * BSP's info we saved since BIOS is always setting the same value
295 * for all threads/cores
296 */
297 apic_write(APIC_LVTTHMR, lvtthmr_init);
298
299 h = lvtthmr_init;
300
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 301 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 302 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 303 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 315738c74aad..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
170 return start1 - start2; 170 return start1 - start2;
171} 171}
172 172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
173#define BIOS_BUG_MSG KERN_WARNING \ 208#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 210
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 subtract_range(range, extra_remove_base, 258 subtract_range(range, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 259 extra_remove_base + extra_remove_size - 1);
225 260
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 261 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 262 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end)
265 continue;
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
237 range[i].start, range[i].end + 1); 267 range[i].start, range[i].end + 1);
268 }
238 } 269 }
239 270
240 /* sort the ranges */ 271 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 272 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 273 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 274 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 275 for (i = 0; i < nr_range; i++)
@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 720 continue;
690 if (!size) 721 if (!size)
691 type = MTRR_NUM_TYPES; 722 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 723 num[type]++;
695 } 724 }
696 725
@@ -846,7 +875,7 @@ int __init mtrr_cleanup(unsigned address_bits)
846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
847 876
848 range_sums = sum_ranges(range, nr_range); 877 range_sums = sum_ranges(range, nr_range);
849 printk(KERN_INFO "total RAM coverred: %ldM\n", 878 printk(KERN_INFO "total RAM covered: %ldM\n",
850 range_sums >> (20 - PAGE_SHIFT)); 879 range_sums >> (20 - PAGE_SHIFT));
851 880
852 if (mtrr_chunk_size && mtrr_gran_size) { 881 if (mtrr_chunk_size && mtrr_gran_size) {
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..c1bbed1021d9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -102,6 +114,8 @@ struct x86_pmu {
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
155 return hw_event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
173static u64 intel_pmu_event_map(int hw_event) 228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[hw_event]; 230 return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 245 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 246 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 247
193static const u64 nehalem_hw_cache_event_ids 248static __initconst u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX] 249 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX] 250 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
281 }, 336 },
282}; 337};
283 338
284static const u64 core2_hw_cache_event_ids 339static __initconst u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX] 340 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX] 341 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
372 }, 427 },
373}; 428};
374 429
375static const u64 atom_hw_cache_event_ids 430static __initconst u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX] 431 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX] 432 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
481 return hw_event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static __initconst u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX] 540 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX] 541 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 987 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 989
990 hwc->idx = -1;
991
935 /* 992 /*
936 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
937 */ 994 */
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1334 x86_pmu_enable_event(hwc, idx); 1391 x86_pmu_enable_event(hwc, idx);
1335} 1392}
1336 1393
1337static int 1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{ 1395{
1340 unsigned int hw_event; 1396 unsigned int hw_event;
1341 1397
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1349 if (!x86_pmu.num_events_fixed) 1405 if (!x86_pmu.num_events_fixed)
1350 return -1; 1406 return -1;
1351 1407
1408 /*
1409 * fixed counters do not take all possible filters
1410 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1412 return -1;
1413
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1360} 1422}
1361 1423
1362/* 1424/*
1363 * Find a PMC slot for the freshly enabled / scheduled in event: 1425 * generic counter allocator: get next free counter
1364 */ 1426 */
1365static int x86_pmu_enable(struct perf_event *event) 1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429{
1430 int idx;
1431
1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435
1436/*
1437 * intel-specific counter allocator: check event constraints
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444
1445 if (!event_constraints)
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1465{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw;
1369 int idx; 1466 int idx;
1370 1467
1371 idx = fixed_mode_idx(event, hwc); 1468 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1470 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1471 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1472 return -EAGAIN;
1376 1473
1377 hwc->config_base = 0; 1474 hwc->config_base = 0;
1378 hwc->event_base = 0; 1475 hwc->event_base = 0;
1379 hwc->idx = idx; 1476 hwc->idx = idx;
1380 } else if (idx >= 0) { 1477 } else if (idx >= 0) {
1381 /* 1478 /*
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
1396 } else { 1493 } else {
1397 idx = hwc->idx; 1494 idx = hwc->idx;
1398 /* Try to get the previous generic event again */ 1495 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1497try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_events); 1499 if (idx == -1)
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN; 1500 return -EAGAIN;
1405 1501
1406 set_bit(idx, cpuc->used_mask); 1502 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1503 hwc->idx = idx;
1408 } 1504 }
1409 hwc->config_base = x86_pmu.eventsel; 1505 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr; 1506 hwc->event_base = x86_pmu.perfctr;
1411 } 1507 }
1412 1508
1509 return idx;
1510}
1511
1512/*
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1413 perf_events_lapic_init(); 1525 perf_events_lapic_init();
1414 1526
1415 x86_pmu.disable(hwc, idx); 1527 x86_pmu.disable(hwc, idx);
@@ -1852,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1964 .priority = 1
1853}; 1965};
1854 1966
1855static struct x86_pmu p6_pmu = { 1967static __initconst struct x86_pmu p6_pmu = {
1856 .name = "p6", 1968 .name = "p6",
1857 .handle_irq = p6_pmu_handle_irq, 1969 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all, 1970 .disable_all = p6_pmu_disable_all,
@@ -1877,9 +1989,10 @@ static struct x86_pmu p6_pmu = {
1877 */ 1989 */
1878 .event_bits = 32, 1990 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1, 1991 .event_mask = (1ULL << 32) - 1,
1992 .get_event_idx = intel_get_event_idx,
1880}; 1993};
1881 1994
1882static struct x86_pmu intel_pmu = { 1995static __initconst struct x86_pmu intel_pmu = {
1883 .name = "Intel", 1996 .name = "Intel",
1884 .handle_irq = intel_pmu_handle_irq, 1997 .handle_irq = intel_pmu_handle_irq,
1885 .disable_all = intel_pmu_disable_all, 1998 .disable_all = intel_pmu_disable_all,
@@ -1900,9 +2013,10 @@ static struct x86_pmu intel_pmu = {
1900 .max_period = (1ULL << 31) - 1, 2013 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts, 2014 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts, 2015 .disable_bts = intel_pmu_disable_bts,
2016 .get_event_idx = intel_get_event_idx,
1903}; 2017};
1904 2018
1905static struct x86_pmu amd_pmu = { 2019static __initconst struct x86_pmu amd_pmu = {
1906 .name = "AMD", 2020 .name = "AMD",
1907 .handle_irq = amd_pmu_handle_irq, 2021 .handle_irq = amd_pmu_handle_irq,
1908 .disable_all = amd_pmu_disable_all, 2022 .disable_all = amd_pmu_disable_all,
@@ -1920,9 +2034,10 @@ static struct x86_pmu amd_pmu = {
1920 .apic = 1, 2034 .apic = 1,
1921 /* use highest bit to detect overflow */ 2035 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1, 2036 .max_period = (1ULL << 47) - 1,
2037 .get_event_idx = gen_get_event_idx,
1923}; 2038};
1924 2039
1925static int p6_pmu_init(void) 2040static __init int p6_pmu_init(void)
1926{ 2041{
1927 switch (boot_cpu_data.x86_model) { 2042 switch (boot_cpu_data.x86_model) {
1928 case 1: 2043 case 1:
@@ -1932,10 +2047,12 @@ static int p6_pmu_init(void)
1932 case 7: 2047 case 7:
1933 case 8: 2048 case 8:
1934 case 11: /* Pentium III */ 2049 case 11: /* Pentium III */
2050 event_constraints = intel_p6_event_constraints;
1935 break; 2051 break;
1936 case 9: 2052 case 9:
1937 case 13: 2053 case 13:
1938 /* Pentium M */ 2054 /* Pentium M */
2055 event_constraints = intel_p6_event_constraints;
1939 break; 2056 break;
1940 default: 2057 default:
1941 pr_cont("unsupported p6 CPU model %d ", 2058 pr_cont("unsupported p6 CPU model %d ",
@@ -1954,7 +2071,7 @@ static int p6_pmu_init(void)
1954 return 0; 2071 return 0;
1955} 2072}
1956 2073
1957static int intel_pmu_init(void) 2074static __init int intel_pmu_init(void)
1958{ 2075{
1959 union cpuid10_edx edx; 2076 union cpuid10_edx edx;
1960 union cpuid10_eax eax; 2077 union cpuid10_eax eax;
@@ -2007,12 +2124,14 @@ static int intel_pmu_init(void)
2007 sizeof(hw_cache_event_ids)); 2124 sizeof(hw_cache_event_ids));
2008 2125
2009 pr_cont("Core2 events, "); 2126 pr_cont("Core2 events, ");
2127 event_constraints = intel_core_event_constraints;
2010 break; 2128 break;
2011 default: 2129 default:
2012 case 26: 2130 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2131 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids)); 2132 sizeof(hw_cache_event_ids));
2015 2133
2134 event_constraints = intel_nehalem_event_constraints;
2016 pr_cont("Nehalem/Corei7 events, "); 2135 pr_cont("Nehalem/Corei7 events, ");
2017 break; 2136 break;
2018 case 28: 2137 case 28:
@@ -2025,7 +2144,7 @@ static int intel_pmu_init(void)
2025 return 0; 2144 return 0;
2026} 2145}
2027 2146
2028static int amd_pmu_init(void) 2147static __init int amd_pmu_init(void)
2029{ 2148{
2030 /* Performance-monitoring supported from K7 and later: */ 2149 /* Performance-monitoring supported from K7 and later: */
2031 if (boot_cpu_data.x86 < 6) 2150 if (boot_cpu_data.x86 < 6)
@@ -2105,11 +2224,47 @@ static const struct pmu pmu = {
2105 .unthrottle = x86_pmu_unthrottle, 2224 .unthrottle = x86_pmu_unthrottle,
2106}; 2225};
2107 2226
2227static int
2228validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2229{
2230 struct hw_perf_event fake_event = event->hw;
2231
2232 if (event->pmu && event->pmu != &pmu)
2233 return 0;
2234
2235 return x86_schedule_event(cpuc, &fake_event) >= 0;
2236}
2237
2238static int validate_group(struct perf_event *event)
2239{
2240 struct perf_event *sibling, *leader = event->group_leader;
2241 struct cpu_hw_events fake_pmu;
2242
2243 memset(&fake_pmu, 0, sizeof(fake_pmu));
2244
2245 if (!validate_event(&fake_pmu, leader))
2246 return -ENOSPC;
2247
2248 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2249 if (!validate_event(&fake_pmu, sibling))
2250 return -ENOSPC;
2251 }
2252
2253 if (!validate_event(&fake_pmu, event))
2254 return -ENOSPC;
2255
2256 return 0;
2257}
2258
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 2259const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 2260{
2110 int err; 2261 int err;
2111 2262
2112 err = __hw_perf_event_init(event); 2263 err = __hw_perf_event_init(event);
2264 if (!err) {
2265 if (event->group_leader != event)
2266 err = validate_group(event);
2267 }
2113 if (err) { 2268 if (err) {
2114 if (event->destroy) 2269 if (event->destroy)
2115 event->destroy(event); 2270 event->destroy(event);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..898df9719afb 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 712 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 713 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 715 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 716 return;
717 wd_ops = &k7_wd_ops; 717 wd_ops = &k7_wd_ops;
718 break; 718 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..7ef24a796992 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 116{
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 119
123 cpu = iminor(file->f_path.dentry->d_inode); 120 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 121 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 122 return -ENXIO; /* No such CPU */
126 goto out; 123
127 }
128 c = &cpu_data(cpu); 124 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 125 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 126 return -EIO; /* CPUID not supported */
131out: 127
132 unlock_kernel(); 128 return 0;
133 return ret;
134} 129}
135 130
136/* 131/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h> 30#include <asm/x86_init.h>
31
32 31
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 33
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
106#endif 105#endif
107 106
108#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
109 pci_iommu_shutdown(); 108 x86_platform.iommu_shutdown();
110#endif 109#endif
111 110
112 crash_save_cpu(regs, safe_smp_processor_id()); 111 crash_save_cpu(regs, safe_smp_processor_id());
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..b8ce165dde5d 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 268
269 show_registers(regs); 269 show_registers(regs);
270#ifdef CONFIG_X86_32 270#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 271 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 272 sp = regs->sp;
275 ss = regs->ss & 0xffff; 273 ss = regs->ss & 0xffff;
274 } else {
275 sp = kernel_stack_pointer(regs);
276 savesegment(ss, ss);
276 } 277 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 278 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 279 print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..e0ed4c7abb62 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
@@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 35
36 if (!stack) { 36 if (!stack) {
37 unsigned long dummy; 37 unsigned long dummy;
38
38 stack = &dummy; 39 stack = &dummy;
39 if (task && task != current) 40 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 41 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 58
58 context = (struct thread_info *) 59 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 60 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 61 bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 62
63 stack = (unsigned long *)context->previous_esp; 63 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 64 if (!stack)
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace);
72 72
73void 73void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 75 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 76{
77 unsigned long *stack; 77 unsigned long *stack;
78 int i; 78 int i;
@@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip)
156 156
157 return ud2 == 0x0b0f; 157 return ud2 == 0x0b0f;
158} 158}
159
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..8e740934bd1f 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,26 +10,28 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33 35
34int x86_is_stack_id(int id, char *name) 36int x86_is_stack_id(int id, char *name)
35{ 37{
@@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name)
37} 39}
38 40
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 42 unsigned *usedp, char **idp)
41{ 43{
42 unsigned k; 44 unsigned k;
43 45
@@ -202,21 +204,24 @@ EXPORT_SYMBOL(dump_trace);
202 204
203void 205void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 206show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 207 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 208{
209 unsigned long *irq_stack_end;
210 unsigned long *irq_stack;
207 unsigned long *stack; 211 unsigned long *stack;
212 int cpu;
208 int i; 213 int i;
209 const int cpu = smp_processor_id(); 214
210 unsigned long *irq_stack_end = 215 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 216 cpu = smp_processor_id();
212 unsigned long *irq_stack = 217
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 218 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
219 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 220
215 /* 221 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 222 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 223 * back trace for this cpu:
218 */ 224 */
219
220 if (sp == NULL) { 225 if (sp == NULL) {
221 if (task) 226 if (task)
222 sp = (unsigned long *)task->thread.sp; 227 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +245,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 245 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 246 touch_nmi_watchdog();
242 } 247 }
248 preempt_enable();
249
243 printk("\n"); 250 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 251 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 252}
@@ -303,4 +310,3 @@ int is_valid_bugaddr(unsigned long ip)
303 310
304 return ud2 == 0x0b0f; 311 return ud2 == 0x0b0f;
305} 312}
306
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..50b9c220e121 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 830ENDPROC(common_interrupt)
815 CFI_ENDPROC 831 CFI_ENDPROC
816 832
833/*
834 * Irq entries should be protected against kprobes
835 */
836 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 837#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 838ENTRY(name) \
819 RING0_INT_FRAME; \ 839 RING0_INT_FRAME; \
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1000 jmp error_code
981 CFI_ENDPROC 1001 CFI_ENDPROC
982END(spurious_interrupt_bug) 1002END(spurious_interrupt_bug)
1003/*
1004 * End of kprobes section
1005 */
1006 .popsection
983 1007
984ENTRY(kernel_thread_helper) 1008ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1009 pushl $0 # fake return address for unwinder
@@ -1185,17 +1209,14 @@ END(ftrace_graph_caller)
1185 1209
1186.globl return_to_handler 1210.globl return_to_handler
1187return_to_handler: 1211return_to_handler:
1188 pushl $0
1189 pushl %eax 1212 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1213 pushl %edx
1192 movl %ebp, %eax 1214 movl %ebp, %eax
1193 call ftrace_return_to_handler 1215 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1216 movl %eax, %ecx
1195 popl %edx 1217 popl %edx
1196 popl %ecx
1197 popl %eax 1218 popl %eax
1198 ret 1219 jmp *%ecx
1199#endif 1220#endif
1200 1221
1201.section .rodata,"a" 1222.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..63bca794c8f9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1491,12 +1499,17 @@ error_kernelspace:
1491 leaq irq_return(%rip),%rcx 1499 leaq irq_return(%rip),%rcx
1492 cmpq %rcx,RIP+8(%rsp) 1500 cmpq %rcx,RIP+8(%rsp)
1493 je error_swapgs 1501 je error_swapgs
1494 movl %ecx,%ecx /* zero extend */ 1502 movl %ecx,%eax /* zero extend */
1495 cmpq %rcx,RIP+8(%rsp) 1503 cmpq %rax,RIP+8(%rsp)
1496 je error_swapgs 1504 je bstep_iret
1497 cmpq $gs_change,RIP+8(%rsp) 1505 cmpq $gs_change,RIP+8(%rsp)
1498 je error_swapgs 1506 je error_swapgs
1499 jmp error_sti 1507 jmp error_sti
1508
1509bstep_iret:
1510 /* Fix truncated RIP */
1511 movq %rcx,RIP+8(%rsp)
1512 jmp error_swapgs
1500END(error_entry) 1513END(error_entry)
1501 1514
1502 1515
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -187,9 +189,26 @@ static void wait_for_nmi(void)
187 nmi_wait_count++; 189 nmi_wait_count++;
188} 190}
189 191
192static inline int
193within(unsigned long addr, unsigned long start, unsigned long end)
194{
195 return addr >= start && addr < end;
196}
197
190static int 198static int
191do_ftrace_mod_code(unsigned long ip, void *new_code) 199do_ftrace_mod_code(unsigned long ip, void *new_code)
192{ 200{
201 /*
202 * On x86_64, kernel text mappings are mapped read-only with
203 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
204 * of the kernel text mapping to modify the kernel text.
205 *
206 * For 32bit kernels, these mappings are same and we can use
207 * kernel identity mapping to modify code.
208 */
209 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
210 ip = (unsigned long)__va(__pa(ip));
211
193 mod_code_ip = (void *)ip; 212 mod_code_ip = (void *)ip;
194 mod_code_newcode = new_code; 213 mod_code_newcode = new_code;
195 214
@@ -336,15 +355,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 355
337 switch (faulted) { 356 switch (faulted) {
338 case 0: 357 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 358 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 359 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 360 break;
342 case 1: 361 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 362 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 363 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 364 break;
346 case 2: 365 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 366 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 367 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 368 break;
350 } 369 }
@@ -468,82 +487,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
468 487
469#ifdef CONFIG_FTRACE_SYSCALLS 488#ifdef CONFIG_FTRACE_SYSCALLS
470 489
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table; 490extern unsigned long *sys_call_table;
474 491
475static struct syscall_metadata **syscalls_metadata; 492unsigned long __init arch_syscall_addr(int nr)
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{ 493{
526 syscalls_metadata[num]->exit_id = id; 494 return (unsigned long)(&sys_call_table)[nr];
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547} 495}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif 496#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..7fd318bac59c 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..d42f65ac4927
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,555 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpus
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type)
236{
237 /* Len */
238 switch (x86_len) {
239 case X86_BREAKPOINT_LEN_1:
240 *gen_len = HW_BREAKPOINT_LEN_1;
241 break;
242 case X86_BREAKPOINT_LEN_2:
243 *gen_len = HW_BREAKPOINT_LEN_2;
244 break;
245 case X86_BREAKPOINT_LEN_4:
246 *gen_len = HW_BREAKPOINT_LEN_4;
247 break;
248#ifdef CONFIG_X86_64
249 case X86_BREAKPOINT_LEN_8:
250 *gen_len = HW_BREAKPOINT_LEN_8;
251 break;
252#endif
253 default:
254 return -EINVAL;
255 }
256
257 /* Type */
258 switch (x86_type) {
259 case X86_BREAKPOINT_EXECUTE:
260 *gen_type = HW_BREAKPOINT_X;
261 break;
262 case X86_BREAKPOINT_WRITE:
263 *gen_type = HW_BREAKPOINT_W;
264 break;
265 case X86_BREAKPOINT_RW:
266 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
267 break;
268 default:
269 return -EINVAL;
270 }
271
272 return 0;
273}
274
275
276static int arch_build_bp_info(struct perf_event *bp)
277{
278 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
279
280 info->address = bp->attr.bp_addr;
281
282 /* Len */
283 switch (bp->attr.bp_len) {
284 case HW_BREAKPOINT_LEN_1:
285 info->len = X86_BREAKPOINT_LEN_1;
286 break;
287 case HW_BREAKPOINT_LEN_2:
288 info->len = X86_BREAKPOINT_LEN_2;
289 break;
290 case HW_BREAKPOINT_LEN_4:
291 info->len = X86_BREAKPOINT_LEN_4;
292 break;
293#ifdef CONFIG_X86_64
294 case HW_BREAKPOINT_LEN_8:
295 info->len = X86_BREAKPOINT_LEN_8;
296 break;
297#endif
298 default:
299 return -EINVAL;
300 }
301
302 /* Type */
303 switch (bp->attr.bp_type) {
304 case HW_BREAKPOINT_W:
305 info->type = X86_BREAKPOINT_WRITE;
306 break;
307 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
308 info->type = X86_BREAKPOINT_RW;
309 break;
310 case HW_BREAKPOINT_X:
311 info->type = X86_BREAKPOINT_EXECUTE;
312 break;
313 default:
314 return -EINVAL;
315 }
316
317 return 0;
318}
319/*
320 * Validate the arch-specific HW Breakpoint register settings
321 */
322int arch_validate_hwbkpt_settings(struct perf_event *bp,
323 struct task_struct *tsk)
324{
325 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
326 unsigned int align;
327 int ret;
328
329
330 ret = arch_build_bp_info(bp);
331 if (ret)
332 return ret;
333
334 ret = -EINVAL;
335
336 if (info->type == X86_BREAKPOINT_EXECUTE)
337 /*
338 * Ptrace-refactoring code
339 * For now, we'll allow instruction breakpoint only for user-space
340 * addresses
341 */
342 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
343 info->len != X86_BREAKPOINT_EXECUTE)
344 return ret;
345
346 switch (info->len) {
347 case X86_BREAKPOINT_LEN_1:
348 align = 0;
349 break;
350 case X86_BREAKPOINT_LEN_2:
351 align = 1;
352 break;
353 case X86_BREAKPOINT_LEN_4:
354 align = 3;
355 break;
356#ifdef CONFIG_X86_64
357 case X86_BREAKPOINT_LEN_8:
358 align = 7;
359 break;
360#endif
361 default:
362 return ret;
363 }
364
365 if (bp->callback)
366 ret = arch_store_info(bp);
367
368 if (ret < 0)
369 return ret;
370 /*
371 * Check that the low-order bits of the address are appropriate
372 * for the alignment implied by len.
373 */
374 if (info->address & align)
375 return -EINVAL;
376
377 /* Check that the virtual address is in the proper range */
378 if (tsk) {
379 if (!arch_check_va_in_userspace(info->address, info->len))
380 return -EFAULT;
381 } else {
382 if (!arch_check_va_in_kernelspace(info->address, info->len))
383 return -EFAULT;
384 }
385
386 return 0;
387}
388
389/*
390 * Dump the debug register contents to the user.
391 * We can't dump our per cpu values because it
392 * may contain cpu wide breakpoint, something that
393 * doesn't belong to the current task.
394 *
395 * TODO: include non-ptrace user breakpoints (perf)
396 */
397void aout_dump_debugregs(struct user *dump)
398{
399 int i;
400 int dr7 = 0;
401 struct perf_event *bp;
402 struct arch_hw_breakpoint *info;
403 struct thread_struct *thread = &current->thread;
404
405 for (i = 0; i < HBP_NUM; i++) {
406 bp = thread->ptrace_bps[i];
407
408 if (bp && !bp->attr.disabled) {
409 dump->u_debugreg[i] = bp->attr.bp_addr;
410 info = counter_arch_bp(bp);
411 dr7 |= encode_dr7(i, info->len, info->type);
412 } else {
413 dump->u_debugreg[i] = 0;
414 }
415 }
416
417 dump->u_debugreg[4] = 0;
418 dump->u_debugreg[5] = 0;
419 dump->u_debugreg[6] = current->thread.debugreg6;
420
421 dump->u_debugreg[7] = dr7;
422}
423EXPORT_SYMBOL_GPL(aout_dump_debugregs);
424
425/*
426 * Release the user breakpoints used by ptrace
427 */
428void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
429{
430 int i;
431 struct thread_struct *t = &tsk->thread;
432
433 for (i = 0; i < HBP_NUM; i++) {
434 unregister_hw_breakpoint(t->ptrace_bps[i]);
435 t->ptrace_bps[i] = NULL;
436 }
437}
438
439void hw_breakpoint_restore(void)
440{
441 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
442 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
443 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
444 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
445 set_debugreg(current->thread.debugreg6, 6);
446 set_debugreg(__get_cpu_var(cpu_dr7), 7);
447}
448EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
449
450/*
451 * Handle debug exception notifications.
452 *
453 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
454 *
455 * NOTIFY_DONE returned if one of the following conditions is true.
456 * i) When the causative address is from user-space and the exception
457 * is a valid one, i.e. not triggered as a result of lazy debug register
458 * switching
459 * ii) When there are more bits than trap<n> set in DR6 register (such
460 * as BD, BS or BT) indicating that more than one debug condition is
461 * met and requires some more action in do_debug().
462 *
463 * NOTIFY_STOP returned for all other cases
464 *
465 */
466static int __kprobes hw_breakpoint_handler(struct die_args *args)
467{
468 int i, cpu, rc = NOTIFY_STOP;
469 struct perf_event *bp;
470 unsigned long dr7, dr6;
471 unsigned long *dr6_p;
472
473 /* The DR6 value is pointed by args->err */
474 dr6_p = (unsigned long *)ERR_PTR(args->err);
475 dr6 = *dr6_p;
476
477 /* Do an early return if no trap bits are set in DR6 */
478 if ((dr6 & DR_TRAP_BITS) == 0)
479 return NOTIFY_DONE;
480
481 get_debugreg(dr7, 7);
482 /* Disable breakpoints during exception handling */
483 set_debugreg(0UL, 7);
484 /*
485 * Assert that local interrupts are disabled
486 * Reset the DRn bits in the virtualized register value.
487 * The ptrace trigger routine will add in whatever is needed.
488 */
489 current->thread.debugreg6 &= ~DR_TRAP_BITS;
490 cpu = get_cpu();
491
492 /* Handle all the breakpoints that were triggered */
493 for (i = 0; i < HBP_NUM; ++i) {
494 if (likely(!(dr6 & (DR_TRAP0 << i))))
495 continue;
496
497 /*
498 * The counter may be concurrently released but that can only
499 * occur from a call_rcu() path. We can then safely fetch
500 * the breakpoint, use its callback, touch its counter
501 * while we are in an rcu_read_lock() path.
502 */
503 rcu_read_lock();
504
505 bp = per_cpu(bp_per_reg[i], cpu);
506 if (bp)
507 rc = NOTIFY_DONE;
508 /*
509 * Reset the 'i'th TRAP bit in dr6 to denote completion of
510 * exception handling
511 */
512 (*dr6_p) &= ~(DR_TRAP0 << i);
513 /*
514 * bp can be NULL due to lazy debug register switching
515 * or due to concurrent perf counter removing.
516 */
517 if (!bp) {
518 rcu_read_unlock();
519 break;
520 }
521
522 (bp->callback)(bp, args->regs);
523
524 rcu_read_unlock();
525 }
526 if (dr6 & (~DR_TRAP_BITS))
527 rc = NOTIFY_DONE;
528
529 set_debugreg(dr7, 7);
530 put_cpu();
531
532 return rc;
533}
534
535/*
536 * Handle debug exception notifications.
537 */
538int __kprobes hw_breakpoint_exceptions_notify(
539 struct notifier_block *unused, unsigned long val, void *data)
540{
541 if (val != DIE_DEBUG)
542 return NOTIFY_DONE;
543
544 return hw_breakpoint_handler(data);
545}
546
547void hw_breakpoint_pmu_read(struct perf_event *bp)
548{
549 /* TODO */
550}
551
552void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
553{
554 /* TODO */
555}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 04bbd5278568..664bcb7384ac 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
274} 274}
275 275
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupt's are disabled at this point */
297 spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spuriuous interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..20a5b3689463 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45 45
46#include <asm/debugreg.h>
46#include <asm/apicdef.h> 47#include <asm/apicdef.h>
47#include <asm/system.h> 48#include <asm/system.h>
48 49
@@ -88,7 +89,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
88 gdb_regs[GDB_SS] = __KERNEL_DS; 89 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 90 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 91 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp;
92#else 92#else
93 gdb_regs[GDB_R8] = regs->r8; 93 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 94 gdb_regs[GDB_R9] = regs->r9;
@@ -101,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 101 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 102 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 103 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp;
105#endif 104#endif
105 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
106} 106}
107 107
108/** 108/**
@@ -434,6 +434,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 434 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 435 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 436 args->err, "c", "", regs);
437 /*
438 * Reset the BS bit in dr6 (pointed by args->err) to
439 * denote completion of processing
440 */
441 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 442
438 return NOTIFY_STOP; 443 return NOTIFY_STOP;
439} 444}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..1f3186ce213c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 63DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 64DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 65
63#ifdef CONFIG_X86_64 66#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 67
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 68#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 97 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 99};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 100#undef W
154 101
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 102struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
244 } 191 }
245} 192}
246 193
194/* Recover the probed instruction at addr for further analysis. */
195static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
196{
197 struct kprobe *kp;
198 kp = get_kprobe((void *)addr);
199 if (!kp)
200 return -EINVAL;
201
202 /*
203 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn.
208 *
209 * On the other hand, kp->opcode has a copy of the first byte of
210 * the probed instruction, which is overwritten by int3. And
211 * the instruction at kp->addr is not modified by kprobes except
212 * for the first byte, we can recover the original instruction
213 * from it and kp->opcode.
214 */
215 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
216 buf[0] = kp->opcode;
217 return 0;
218}
219
220/* Dummy buffers for kallsyms_lookup */
221static char __dummy_buf[KSYM_NAME_LEN];
222
223/* Check if paddr is at an instruction boundary */
224static int __kprobes can_probe(unsigned long paddr)
225{
226 int ret;
227 unsigned long addr, offset = 0;
228 struct insn insn;
229 kprobe_opcode_t buf[MAX_INSN_SIZE];
230
231 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
232 return 0;
233
234 /* Decode instructions */
235 addr = paddr - offset;
236 while (addr < paddr) {
237 kernel_insn_init(&insn, (void *)addr);
238 insn_get_opcode(&insn);
239
240 /*
241 * Check if the instruction has been modified by another
242 * kprobe, in which case we replace the breakpoint by the
243 * original instruction in our buffer.
244 */
245 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
246 ret = recover_probed_instruction(buf, addr);
247 if (ret)
248 /*
249 * Another debugging subsystem might insert
250 * this breakpoint. In that case, we can't
251 * recover it.
252 */
253 return 0;
254 kernel_insn_init(&insn, buf);
255 }
256 insn_get_length(&insn);
257 addr += insn.length;
258 }
259
260 return (addr == paddr);
261}
262
247/* 263/*
248 * Returns non-zero if opcode modifies the interrupt flag. 264 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 265 */
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 293static void __kprobes fix_riprel(struct kprobe *p)
278{ 294{
279#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 296 struct insn insn;
281 s64 disp; 297 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 298
304 /* Skip REX instruction prefix. */ 299 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 300 s64 newdisp;
306 ++insn; 301 u8 *disp;
307 302 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 303 /*
309 /* Two-byte opcode. */ 304 * The copied instruction uses the %rip-relative addressing
310 ++insn; 305 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 306 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 307 * of the copy that will actually be run. The tricky bit here
313 } else 308 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 309 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 310 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 311 * value and yield the same 64-bit result that the sign-
317 312 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 313 * have given.
319 u8 modrm = *++insn; 314 */
320 if ((modrm & 0xc7) == 0x05) { 315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 316 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 319 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 320 }
343#endif 321#endif
344} 322}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 337
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 338int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 339{
340 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 342 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 343 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 344 if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 452{
473 switch (kcb->kprobe_status) { 453 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 454 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 455 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 456 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 457 set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 460 kcb->kprobe_status = KPROBE_REENTER;
492 break; 461 break;
493 case KPROBE_HIT_SS: 462 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 463 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 464 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 465 * codepath should strictly reside in .kprobes.text section.
497 return 0; 466 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 467 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 468 */
500 * to, or just after, single-stepping of a probed 469 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 470 p->addr);
502 * reside in .kprobes.text section. Raise a warning 471 dump_kprobe(p);
503 * to highlight this peculiar case. 472 BUG();
504 */
505 }
506 default: 473 default:
507 /* impossible cases */ 474 /* impossible cases */
508 WARN_ON(1); 475 WARN_ON(1);
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 934 ret = NOTIFY_STOP;
968 break; 935 break;
969 case DIE_DEBUG: 936 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 937 if (post_kprobe_handler(args->regs)) {
938 /*
939 * Reset the BS bit in dr6 (pointed by args->err) to
940 * denote completion of processing
941 */
942 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 943 ret = NOTIFY_STOP;
944 }
972 break; 945 break;
973 case DIE_GPF: 946 case DIE_GPF:
974 /* 947 /*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
157{ 158{
158 int error; 159 int error;
159 160
160 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
161 set_pages_x(image->control_code_page, 1);
162 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
163 if (error) 163 if (error)
164 return error; 164 return error;
@@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
172 */ 172 */
173void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
174{ 174{
175 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
176 set_pages_nx(image->control_code_page, 1);
177 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
178} 177}
179 178
@@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image)
202 201
203 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa179913..63123d902103 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -33,6 +33,9 @@ MODULE_LICENSE("GPL v2");
33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
34#define UCODE_UCODE_TYPE 0x00000001 34#define UCODE_UCODE_TYPE 0x00000001
35 35
36const struct firmware *firmware;
37static int supported_cpu;
38
36struct equiv_cpu_entry { 39struct equiv_cpu_entry {
37 u32 installed_cpu; 40 u32 installed_cpu;
38 u32 fixed_errata_mask; 41 u32 fixed_errata_mask;
@@ -71,17 +74,14 @@ static struct equiv_cpu_entry *equiv_cpu_table;
71 74
72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 75static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
73{ 76{
74 struct cpuinfo_x86 *c = &cpu_data(cpu);
75 u32 dummy; 77 u32 dummy;
76 78
77 memset(csig, 0, sizeof(*csig)); 79 if (!supported_cpu)
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86);
81 return -1; 80 return -1;
82 } 81
82 memset(csig, 0, sizeof(*csig));
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 84 pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 85 return 0;
86} 86}
87 87
@@ -103,22 +103,15 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 103 i++;
104 } 104 }
105 105
106 if (!equiv_cpu_id) { 106 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 107 return 0;
110 }
111 108
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 109 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 110 return 0;
117 }
118 111
119 /* ucode might be chipset specific -- currently we don't support this */ 112 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 113 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 114 pr_err(KERN_ERR "microcode: CPU%d: loading of chipset "
122 "specific code not yet supported\n", cpu); 115 "specific code not yet supported\n", cpu);
123 return 0; 116 return 0;
124 } 117 }
@@ -148,14 +141,12 @@ static int apply_microcode_amd(int cpu)
148 141
149 /* check current patch id and patch's id for match */ 142 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 143 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 144 pr_err("microcode: CPU%d: update failed "
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 145 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
153 return -1; 146 return -1;
154 } 147 }
155 148
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 149 pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 150 uci->cpu_sig.rev = rev;
160 151
161 return 0; 152 return 0;
@@ -178,18 +169,15 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 169 return NULL;
179 170
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 171 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 172 pr_err("microcode: error: invalid type field in "
182 "container file section header\n"); 173 "container file section header\n");
183 return NULL; 174 return NULL;
184 } 175 }
185 176
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 177 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 178
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 179 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 180 pr_err("microcode: error: size mismatch\n");
193 return NULL; 181 return NULL;
194 } 182 }
195 183
@@ -218,15 +206,14 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 206 size = buf_pos[2];
219 207
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 208 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 209 pr_err("microcode: error: invalid type field in "
222 "container file section header\n"); 210 "container file section header\n");
223 return 0; 211 return 0;
224 } 212 }
225 213
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 214 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 215 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 216 pr_err("microcode: failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 217 return 0;
231 } 218 }
232 219
@@ -259,8 +246,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 246
260 offset = install_equiv_cpu_table(ucode_ptr); 247 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 248 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 249 pr_err("microcode: failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 250 return UCODE_ERROR;
265 } 251 }
266 252
@@ -308,27 +294,27 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
308 294
309static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
310{ 296{
311 const char *fw_name = "amd-ucode/microcode_amd.bin";
312 const struct firmware *firmware;
313 enum ucode_state ret; 297 enum ucode_state ret;
314 298
315 if (request_firmware(&firmware, fw_name, device)) { 299 if (firmware == NULL)
316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
317 return UCODE_NFOUND; 300 return UCODE_NFOUND;
301
302 if (*(u32 *)firmware->data != UCODE_MAGIC) {
303 pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n",
304 *(u32 *)firmware->data);
305 return UCODE_ERROR;
318 } 306 }
319 307
320 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 308 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
321 309
322 release_firmware(firmware);
323
324 return ret; 310 return ret;
325} 311}
326 312
327static enum ucode_state 313static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size) 314request_microcode_user(int cpu, const void __user *buf, size_t size)
329{ 315{
330 printk(KERN_INFO "microcode: AMD microcode update via " 316 pr_info("microcode: AMD microcode update via "
331 "/dev/cpu/microcode not supported\n"); 317 "/dev/cpu/microcode not supported\n");
332 return UCODE_ERROR; 318 return UCODE_ERROR;
333} 319}
334 320
@@ -340,7 +326,32 @@ static void microcode_fini_cpu_amd(int cpu)
340 uci->mc = NULL; 326 uci->mc = NULL;
341} 327}
342 328
329void init_microcode_amd(struct device *device)
330{
331 const char *fw_name = "amd-ucode/microcode_amd.bin";
332 struct cpuinfo_x86 *c = &boot_cpu_data;
333
334 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
335
336 if (c->x86 < 0x10) {
337 pr_warning("microcode: AMD CPU family 0x%x not supported\n",
338 c->x86);
339 return;
340 }
341 supported_cpu = 1;
342
343 if (request_firmware(&firmware, fw_name, device))
344 pr_err("microcode: failed to load file %s\n", fw_name);
345}
346
347void fini_microcode_amd(void)
348{
349 release_firmware(firmware);
350}
351
343static struct microcode_ops microcode_amd_ops = { 352static struct microcode_ops microcode_amd_ops = {
353 .init = init_microcode_amd,
354 .fini = fini_microcode_amd,
344 .request_microcode_user = request_microcode_user, 355 .request_microcode_user = request_microcode_user,
345 .request_microcode_fw = request_microcode_fw, 356 .request_microcode_fw = request_microcode_fw,
346 .collect_cpu_info = collect_cpu_info_amd, 357 .collect_cpu_info = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..e68aae397869 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -73,7 +73,6 @@
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
75#include <linux/capability.h> 75#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 76#include <linux/kernel.h>
78#include <linux/module.h> 77#include <linux/module.h>
79#include <linux/mutex.h> 78#include <linux/mutex.h>
@@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 200
202static int microcode_open(struct inode *unused1, struct file *unused2) 201static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 202{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 203 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 204}
207 205
@@ -393,7 +391,7 @@ static enum ucode_state microcode_update_cpu(int cpu)
393 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 391 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
394 enum ucode_state ustate; 392 enum ucode_state ustate;
395 393
396 if (uci->valid) 394 if (uci->valid && uci->mc)
397 ustate = microcode_resume_cpu(cpu); 395 ustate = microcode_resume_cpu(cpu);
398 else 396 else
399 ustate = microcode_init_cpu(cpu); 397 ustate = microcode_init_cpu(cpu);
@@ -520,6 +518,9 @@ static int __init microcode_init(void)
520 return PTR_ERR(microcode_pdev); 518 return PTR_ERR(microcode_pdev);
521 } 519 }
522 520
521 if (microcode_ops->init)
522 microcode_ops->init(&microcode_pdev->dev);
523
523 get_online_cpus(); 524 get_online_cpus();
524 mutex_lock(&microcode_mutex); 525 mutex_lock(&microcode_mutex);
525 526
@@ -563,6 +564,9 @@ static void __exit microcode_exit(void)
563 564
564 platform_device_unregister(microcode_pdev); 565 platform_device_unregister(microcode_pdev);
565 566
567 if (microcode_ops->fini)
568 microcode_ops->fini();
569
566 microcode_ops = NULL; 570 microcode_ops = NULL;
567 571
568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 572 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..35a57c963df9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 667 */
668} 668}
669 669
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 671{
672 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 673
685 if (mpf->physptr < end) { 674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 675}
694 676
695static int __init smp_scan_config(unsigned long base, unsigned long length, 677static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 678{
698 unsigned int *bp = phys_to_virt(base); 679 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 680 struct mpf_intel *mpf;
681 unsigned long mem;
700 682
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 683 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 684 bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 699 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 700 mpf, (u64)virt_to_phys(mpf));
719 701
720 if (!reserve) 702 mem = virt_to_phys(mpf);
721 return 1; 703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 704 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 705 smp_reserve_memory(mpf);
726 706
727 return 1; 707 return 1;
728 } 708 }
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 712 return 0;
733} 713}
734 714
735void __init default_find_smp_config(unsigned int reserve) 715void __init default_find_smp_config(void)
736{ 716{
737 unsigned int address; 717 unsigned int address;
738 718
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 724 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 725 * 3) Scan the 64K of bios
746 */ 726 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 727 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 728 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 729 smp_scan_config(0xF0000, 0x10000))
750 return; 730 return;
751 /* 731 /*
752 * If it is an SMP machine we should know now, unless the 732 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 747
768 address = get_bios_ebda(); 748 address = get_bios_ebda();
769 if (address) 749 if (address)
770 smp_scan_config(address, 0x400, reserve); 750 smp_scan_config(address, 0x400);
771} 751}
772 752
773#ifdef CONFIG_X86_IO_APIC 753#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..553449951b84 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -174,21 +174,17 @@ static int msr_open(struct inode *inode, struct file *file)
174{ 174{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 176 struct cpuinfo_x86 *c = &cpu_data(cpu);
177 int ret = 0;
178 177
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 178 cpu = iminor(file->f_path.dentry->d_inode);
181 179
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 180 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
183 ret = -ENXIO; /* No such CPU */ 181 return -ENXIO; /* No such CPU */
184 goto out; 182
185 }
186 c = &cpu_data(cpu); 183 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 184 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 185 return -EIO; /* MSR not supported */
189out: 186
190 unlock_kernel(); 187 return 0;
191 return ret;
192} 188}
193 189
194/* 190/*
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..c563e4c8ff39 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b2a71dca5642..afcc58b69c7c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
11#include <asm/gart.h> 11#include <asm/gart.h>
12#include <asm/calgary.h> 12#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 13#include <asm/amd_iommu.h>
14#include <asm/x86_init.h>
14 15
15static int forbid_dac __read_mostly; 16static int forbid_dac __read_mostly;
16 17
17struct dma_map_ops *dma_ops; 18struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 19EXPORT_SYMBOL(dma_ops);
19 20
20static int iommu_sac_force __read_mostly; 21static int iommu_sac_force __read_mostly;
@@ -42,15 +43,10 @@ int iommu_detected __read_mostly = 0;
42 */ 43 */
43int iommu_pass_through __read_mostly; 44int iommu_pass_through __read_mostly;
44 45
45dma_addr_t bad_dma_address __read_mostly = 0; 46/* Dummy device used for NULL arguments (normally ISA). */
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). Better would
49 be probably a smaller DMA mask, but this is bug-to-bug compatible
50 to older i386. */
51struct device x86_dma_fallback_dev = { 47struct device x86_dma_fallback_dev = {
52 .init_name = "fallback device", 48 .init_name = "fallback device",
53 .coherent_dma_mask = DMA_BIT_MASK(32), 49 .coherent_dma_mask = ISA_DMA_BIT_MASK,
54 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, 50 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
55}; 51};
56EXPORT_SYMBOL(x86_dma_fallback_dev); 52EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,20 +124,17 @@ void __init pci_iommu_alloc(void)
128 /* free the range so iommu could get some range less than 4G */ 124 /* free the range so iommu could get some range less than 4G */
129 dma32_free_bootmem(); 125 dma32_free_bootmem();
130#endif 126#endif
127 if (pci_swiotlb_init())
128 return;
131 129
132 /*
133 * The order of these functions is important for
134 * fall-back/fail-over reasons
135 */
136 gart_iommu_hole_init(); 130 gart_iommu_hole_init();
137 131
138 detect_calgary(); 132 detect_calgary();
139 133
140 detect_intel_iommu(); 134 detect_intel_iommu();
141 135
136 /* needs to be called after gart_iommu_hole_init */
142 amd_iommu_detect(); 137 amd_iommu_detect();
143
144 pci_swiotlb_init();
145} 138}
146 139
147void *dma_generic_alloc_coherent(struct device *dev, size_t size, 140void *dma_generic_alloc_coherent(struct device *dev, size_t size,
@@ -216,7 +209,7 @@ static __init int iommu_setup(char *p)
216 if (!strncmp(p, "allowdac", 8)) 209 if (!strncmp(p, "allowdac", 8))
217 forbid_dac = 0; 210 forbid_dac = 0;
218 if (!strncmp(p, "nodac", 5)) 211 if (!strncmp(p, "nodac", 5))
219 forbid_dac = -1; 212 forbid_dac = 1;
220 if (!strncmp(p, "usedac", 6)) { 213 if (!strncmp(p, "usedac", 6)) {
221 forbid_dac = -1; 214 forbid_dac = -1;
222 return 1; 215 return 1;
@@ -291,25 +284,17 @@ static int __init pci_iommu_init(void)
291#ifdef CONFIG_PCI 284#ifdef CONFIG_PCI
292 dma_debug_add_bus(&pci_bus_type); 285 dma_debug_add_bus(&pci_bus_type);
293#endif 286#endif
287 x86_init.iommu.iommu_init();
294 288
295 calgary_iommu_init(); 289 if (swiotlb) {
296 290 printk(KERN_INFO "PCI-DMA: "
297 intel_iommu_init(); 291 "Using software bounce buffering for IO (SWIOTLB)\n");
292 swiotlb_print_info();
293 } else
294 swiotlb_free();
298 295
299 amd_iommu_init();
300
301 gart_iommu_init();
302
303 no_iommu_init();
304 return 0; 296 return 0;
305} 297}
306
307void pci_iommu_shutdown(void)
308{
309 gart_iommu_shutdown();
310
311 amd_iommu_shutdown();
312}
313/* Must execute after PCI subsystem */ 298/* Must execute after PCI subsystem */
314rootfs_initcall(pci_iommu_init); 299rootfs_initcall(pci_iommu_init);
315 300
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64f86e0..e6a0d402f171 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,6 +39,7 @@
39#include <asm/swiotlb.h> 39#include <asm/swiotlb.h>
40#include <asm/dma.h> 40#include <asm/dma.h>
41#include <asm/k8.h> 41#include <asm/k8.h>
42#include <asm/x86_init.h>
42 43
43static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 44static unsigned long iommu_bus_base; /* GART remapping area (physical) */
44static unsigned long iommu_size; /* size of remapping area bytes */ 45static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
46 47
47static u32 *iommu_gatt_base; /* Remapping table */ 48static u32 *iommu_gatt_base; /* Remapping table */
48 49
50static dma_addr_t bad_dma_addr;
51
49/* 52/*
50 * If this is disabled the IOMMU will use an optimized flushing strategy 53 * If this is disabled the IOMMU will use an optimized flushing strategy
51 * of only flushing when an mapping is reused. With it true the GART is 54 * of only flushing when an mapping is reused. With it true the GART is
@@ -92,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
92 95
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 96 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 97 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 98 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 99 PAGE_SIZE) >> PAGE_SHIFT;
97 100
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 101 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
216 if (panic_on_overflow) 219 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size); 220 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir); 221 iommu_full(dev, size, dir);
219 return bad_dma_address; 222 return bad_dma_addr;
220 } 223 }
221 224
222 for (i = 0; i < npages; i++) { 225 for (i = 0; i < npages; i++) {
@@ -294,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
294 int i; 297 int i;
295 298
296#ifdef CONFIG_IOMMU_DEBUG 299#ifdef CONFIG_IOMMU_DEBUG
297 printk(KERN_DEBUG "dma_map_sg overflow\n"); 300 pr_debug("dma_map_sg overflow\n");
298#endif 301#endif
299 302
300 for_each_sg(sg, s, nents, i) { 303 for_each_sg(sg, s, nents, i) {
@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
302 305
303 if (nonforced_iommu(dev, addr, s->length)) { 306 if (nonforced_iommu(dev, addr, s->length)) {
304 addr = dma_map_area(dev, addr, s->length, dir, 0); 307 addr = dma_map_area(dev, addr, s->length, dir, 0);
305 if (addr == bad_dma_address) { 308 if (addr == bad_dma_addr) {
306 if (i > 0) 309 if (i > 0)
307 gart_unmap_sg(dev, sg, i, dir, NULL); 310 gart_unmap_sg(dev, sg, i, dir, NULL);
308 nents = 0; 311 nents = 0;
@@ -389,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
389 if (!dev) 392 if (!dev)
390 dev = &x86_dma_fallback_dev; 393 dev = &x86_dma_fallback_dev;
391 394
392 out = 0; 395 out = 0;
393 start = 0; 396 start = 0;
394 start_sg = sgmap = sg; 397 start_sg = sg;
395 seg_size = 0; 398 sgmap = sg;
396 max_seg_size = dma_get_max_seg_size(dev); 399 seg_size = 0;
397 ps = NULL; /* shut up gcc */ 400 max_seg_size = dma_get_max_seg_size(dev);
401 ps = NULL; /* shut up gcc */
402
398 for_each_sg(sg, s, nents, i) { 403 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 404 dma_addr_t addr = sg_phys(s);
400 405
@@ -417,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
417 sgmap, pages, need) < 0) 422 sgmap, pages, need) < 0)
418 goto error; 423 goto error;
419 out++; 424 out++;
420 seg_size = 0; 425
421 sgmap = sg_next(sgmap); 426 seg_size = 0;
422 pages = 0; 427 sgmap = sg_next(sgmap);
423 start = i; 428 pages = 0;
424 start_sg = s; 429 start = i;
430 start_sg = s;
425 } 431 }
426 } 432 }
427 433
@@ -455,7 +461,7 @@ error:
455 461
456 iommu_full(dev, pages << PAGE_SHIFT, dir); 462 iommu_full(dev, pages << PAGE_SHIFT, dir);
457 for_each_sg(sg, s, nents, i) 463 for_each_sg(sg, s, nents, i)
458 s->dma_address = bad_dma_address; 464 s->dma_address = bad_dma_addr;
459 return 0; 465 return 0;
460} 466}
461 467
@@ -479,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
479 DMA_BIDIRECTIONAL, align_mask); 485 DMA_BIDIRECTIONAL, align_mask);
480 486
481 flush_gart(); 487 flush_gart();
482 if (paddr != bad_dma_address) { 488 if (paddr != bad_dma_addr) {
483 *dma_addr = paddr; 489 *dma_addr = paddr;
484 return page_address(page); 490 return page_address(page);
485 } 491 }
@@ -499,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
499 free_pages((unsigned long)vaddr, get_order(size)); 505 free_pages((unsigned long)vaddr, get_order(size));
500} 506}
501 507
508static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
509{
510 return (dma_addr == bad_dma_addr);
511}
512
502static int no_agp; 513static int no_agp;
503 514
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 515static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -515,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
515 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 526 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
516 527
517 if (iommu_size < 64*1024*1024) { 528 if (iommu_size < 64*1024*1024) {
518 printk(KERN_WARNING 529 pr_warning(
519 "PCI-DMA: Warning: Small IOMMU %luMB." 530 "PCI-DMA: Warning: Small IOMMU %luMB."
520 " Consider increasing the AGP aperture in BIOS\n", 531 " Consider increasing the AGP aperture in BIOS\n",
521 iommu_size >> 20); 532 iommu_size >> 20);
@@ -570,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570 aperture_alloc = aper_alloc; 581 aperture_alloc = aper_alloc;
571} 582}
572 583
573static int gart_resume(struct sys_device *dev) 584static void gart_fixup_northbridges(struct sys_device *dev)
574{ 585{
575 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 586 int i;
576 587
577 if (fix_up_north_bridges) { 588 if (!fix_up_north_bridges)
578 int i; 589 return;
579 590
580 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 591 pr_info("PCI-DMA: Restoring GART aperture settings\n");
581 592
582 for (i = 0; i < num_k8_northbridges; i++) { 593 for (i = 0; i < num_k8_northbridges; i++) {
583 struct pci_dev *dev = k8_northbridges[i]; 594 struct pci_dev *dev = k8_northbridges[i];
584 595
585 /* 596 /*
586 * Don't enable translations just yet. That is the next 597 * Don't enable translations just yet. That is the next
587 * step. Restore the pre-suspend aperture settings. 598 * step. Restore the pre-suspend aperture settings.
588 */ 599 */
589 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 600 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
590 aperture_order << 1); 601 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
591 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
592 aperture_alloc >> 25);
593 }
594 } 602 }
603}
604
605static int gart_resume(struct sys_device *dev)
606{
607 pr_info("PCI-DMA: Resuming GART IOMMU\n");
608
609 gart_fixup_northbridges(dev);
595 610
596 enable_gart_translations(); 611 enable_gart_translations();
597 612
@@ -604,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
604} 619}
605 620
606static struct sysdev_class gart_sysdev_class = { 621static struct sysdev_class gart_sysdev_class = {
607 .name = "gart", 622 .name = "gart",
608 .suspend = gart_suspend, 623 .suspend = gart_suspend,
609 .resume = gart_resume, 624 .resume = gart_resume,
610 625
611}; 626};
612 627
613static struct sys_device device_gart = { 628static struct sys_device device_gart = {
614 .id = 0, 629 .cls = &gart_sysdev_class,
615 .cls = &gart_sysdev_class,
616}; 630};
617 631
618/* 632/*
@@ -627,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
627 void *gatt; 641 void *gatt;
628 int i, error; 642 int i, error;
629 643
630 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 644 pr_info("PCI-DMA: Disabling AGP.\n");
645
631 aper_size = aper_base = info->aper_size = 0; 646 aper_size = aper_base = info->aper_size = 0;
632 dev = NULL; 647 dev = NULL;
633 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
@@ -645,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
645 } 660 }
646 if (!aper_base) 661 if (!aper_base)
647 goto nommu; 662 goto nommu;
663
648 info->aper_base = aper_base; 664 info->aper_base = aper_base;
649 info->aper_size = aper_size >> 20; 665 info->aper_size = aper_size >> 20;
650 666
@@ -667,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
667 683
668 flush_gart(); 684 flush_gart();
669 685
670 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 686 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
671 aper_base, aper_size>>10); 687 aper_base, aper_size>>10);
672 688
673 return 0; 689 return 0;
674 690
675 nommu: 691 nommu:
676 /* Should not happen anymore */ 692 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 693 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 "falling back to iommu=soft.\n"); 694 "falling back to iommu=soft.\n");
679 return -1; 695 return -1;
680} 696}
@@ -686,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = {
686 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
687 .alloc_coherent = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
688 .free_coherent = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
705 .mapping_error = gart_mapping_error,
689}; 706};
690 707
691void gart_iommu_shutdown(void) 708static void gart_iommu_shutdown(void)
692{ 709{
693 struct pci_dev *dev; 710 struct pci_dev *dev;
694 int i; 711 int i;
695 712
696 if (no_agp && (dma_ops != &gart_dma_ops)) 713 if (no_agp)
697 return; 714 return;
698 715
699 for (i = 0; i < num_k8_northbridges; i++) { 716 for (i = 0; i < num_k8_northbridges; i++) {
@@ -708,7 +725,7 @@ void gart_iommu_shutdown(void)
708 } 725 }
709} 726}
710 727
711void __init gart_iommu_init(void) 728int __init gart_iommu_init(void)
712{ 729{
713 struct agp_kern_info info; 730 struct agp_kern_info info;
714 unsigned long iommu_start; 731 unsigned long iommu_start;
@@ -718,7 +735,7 @@ void __init gart_iommu_init(void)
718 long i; 735 long i;
719 736
720 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 737 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
721 return; 738 return 0;
722 739
723#ifndef CONFIG_AGP_AMD64 740#ifndef CONFIG_AGP_AMD64
724 no_agp = 1; 741 no_agp = 1;
@@ -730,35 +747,28 @@ void __init gart_iommu_init(void)
730 (agp_copy_info(agp_bridge, &info) < 0); 747 (agp_copy_info(agp_bridge, &info) < 0);
731#endif 748#endif
732 749
733 if (swiotlb)
734 return;
735
736 /* Did we detect a different HW IOMMU? */
737 if (iommu_detected && !gart_iommu_aperture)
738 return;
739
740 if (no_iommu || 750 if (no_iommu ||
741 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 751 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
742 !gart_iommu_aperture || 752 !gart_iommu_aperture ||
743 (no_agp && init_k8_gatt(&info) < 0)) { 753 (no_agp && init_k8_gatt(&info) < 0)) {
744 if (max_pfn > MAX_DMA32_PFN) { 754 if (max_pfn > MAX_DMA32_PFN) {
745 printk(KERN_WARNING "More than 4GB of memory " 755 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
746 "but GART IOMMU not available.\n"); 756 pr_warning("falling back to iommu=soft.\n");
747 printk(KERN_WARNING "falling back to iommu=soft.\n");
748 } 757 }
749 return; 758 return 0;
750 } 759 }
751 760
752 /* need to map that range */ 761 /* need to map that range */
753 aper_size = info.aper_size << 20; 762 aper_size = info.aper_size << 20;
754 aper_base = info.aper_base; 763 aper_base = info.aper_base;
755 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 764 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
765
756 if (end_pfn > max_low_pfn_mapped) { 766 if (end_pfn > max_low_pfn_mapped) {
757 start_pfn = (aper_base>>PAGE_SHIFT); 767 start_pfn = (aper_base>>PAGE_SHIFT);
758 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 768 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
759 } 769 }
760 770
761 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 771 pr_info("PCI-DMA: using GART IOMMU.\n");
762 iommu_size = check_iommu_size(info.aper_base, aper_size); 772 iommu_size = check_iommu_size(info.aper_base, aper_size);
763 iommu_pages = iommu_size >> PAGE_SHIFT; 773 iommu_pages = iommu_size >> PAGE_SHIFT;
764 774
@@ -773,8 +783,7 @@ void __init gart_iommu_init(void)
773 783
774 ret = dma_debug_resize_entries(iommu_pages); 784 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret) 785 if (ret)
776 printk(KERN_DEBUG 786 pr_debug("PCI-DMA: Cannot trace all the entries\n");
777 "PCI-DMA: Cannot trace all the entries\n");
778 } 787 }
779#endif 788#endif
780 789
@@ -784,15 +793,14 @@ void __init gart_iommu_init(void)
784 */ 793 */
785 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 794 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
786 795
787 agp_memory_reserved = iommu_size; 796 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
788 printk(KERN_INFO
789 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
790 iommu_size >> 20); 797 iommu_size >> 20);
791 798
792 iommu_start = aper_size - iommu_size; 799 agp_memory_reserved = iommu_size;
793 iommu_bus_base = info.aper_base + iommu_start; 800 iommu_start = aper_size - iommu_size;
794 bad_dma_address = iommu_bus_base; 801 iommu_bus_base = info.aper_base + iommu_start;
795 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 802 bad_dma_addr = iommu_bus_base;
803 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
796 804
797 /* 805 /*
798 * Unmap the IOMMU part of the GART. The alias of the page is 806 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -814,7 +822,7 @@ void __init gart_iommu_init(void)
814 * the pages as Not-Present: 822 * the pages as Not-Present:
815 */ 823 */
816 wbinvd(); 824 wbinvd();
817 825
818 /* 826 /*
819 * Now all caches are flushed and we can safely enable 827 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility 828 * GART hardware. Doing it early leaves the possibility
@@ -838,6 +846,10 @@ void __init gart_iommu_init(void)
838 846
839 flush_gart(); 847 flush_gart();
840 dma_ops = &gart_dma_ops; 848 dma_ops = &gart_dma_ops;
849 x86_platform.iommu_shutdown = gart_iommu_shutdown;
850 swiotlb = 0;
851
852 return 0;
841} 853}
842 854
843void __init gart_parse_options(char *p) 855void __init gart_parse_options(char *p)
@@ -856,7 +868,7 @@ void __init gart_parse_options(char *p)
856#endif 868#endif
857 if (isdigit(*p) && get_option(&p, &arg)) 869 if (isdigit(*p) && get_option(&p, &arg))
858 iommu_size = arg; 870 iommu_size = arg;
859 if (!strncmp(p, "fullflush", 8)) 871 if (!strncmp(p, "fullflush", 9))
860 iommu_fullflush = 1; 872 iommu_fullflush = 1;
861 if (!strncmp(p, "nofullflush", 11)) 873 if (!strncmp(p, "nofullflush", 11))
862 iommu_fullflush = 0; 874 iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 33 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 34 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 36 return DMA_ERROR_CODE;
37 flush_write_buffers(); 37 flush_write_buffers();
38 return bus; 38 return bus;
39} 39}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 104 .is_phys = 1,
105}; 105};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..e3c0a66b9e77 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_init - initialize swiotlb if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_init(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
54 if (swiotlb) { 63 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 64 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 65 dma_ops = &swiotlb_dma_ops;
58 } 66 }
67
68 return use_swiotlb;
59} 69}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..5e2ba634ea15 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,9 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
12#include <trace/events/power.h> 13#include <trace/events/power.h>
14#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 15#include <asm/system.h>
14#include <asm/apic.h> 16#include <asm/apic.h>
15#include <asm/syscalls.h> 17#include <asm/syscalls.h>
@@ -17,6 +19,7 @@
17#include <asm/uaccess.h> 19#include <asm/uaccess.h>
18#include <asm/i387.h> 20#include <asm/i387.h>
19#include <asm/ds.h> 21#include <asm/ds.h>
22#include <asm/debugreg.h>
20 23
21unsigned long idle_halt; 24unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 25EXPORT_SYMBOL(idle_halt);
@@ -103,14 +106,7 @@ void flush_thread(void)
103 } 106 }
104#endif 107#endif
105 108
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 109 flush_ptrace_hw_breakpoint(tsk);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 110 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 111 /*
116 * Forget coprocessor state.. 112 * Forget coprocessor state..
@@ -192,16 +188,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 188 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 189 update_debugctlmsr(next->debugctlmsr);
194 190
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 191 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 192 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 193 /* prev and next are different */
@@ -224,6 +210,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 210 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 211 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 212 }
213 propagate_user_return_notify(prev_p, next_p);
227} 214}
228 215
229int sys_fork(struct pt_regs *regs) 216int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..075580b35682 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,7 @@
58#include <asm/idle.h> 58#include <asm/idle.h>
59#include <asm/syscalls.h> 59#include <asm/syscalls.h>
60#include <asm/ds.h> 60#include <asm/ds.h>
61#include <asm/debugreg.h>
61 62
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 64
@@ -134,7 +135,7 @@ void __show_regs(struct pt_regs *regs, int all)
134 ss = regs->ss & 0xffff; 135 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 136 gs = get_user_gs(regs);
136 } else { 137 } else {
137 sp = (unsigned long) (&regs->sp); 138 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 139 savesegment(ss, ss);
139 savesegment(gs, gs); 140 savesegment(gs, gs);
140 } 141 }
@@ -187,7 +188,7 @@ void __show_regs(struct pt_regs *regs, int all)
187 188
188void show_regs(struct pt_regs *regs) 189void show_regs(struct pt_regs *regs)
189{ 190{
190 __show_regs(regs, 1); 191 show_registers(regs);
191 show_trace(NULL, regs, &regs->sp, regs->bp); 192 show_trace(NULL, regs, &regs->sp, regs->bp);
192} 193}
193 194
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 260
260 task_user_gs(p) = get_user_gs(regs); 261 task_user_gs(p) = get_user_gs(regs);
261 262
263 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 264 tsk = current;
265 err = -ENOMEM;
266
267 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
268
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 269 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 270 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 271 IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b683170..c95c8f4e790a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,6 +52,7 @@
52#include <asm/idle.h> 52#include <asm/idle.h>
53#include <asm/syscalls.h> 53#include <asm/syscalls.h>
54#include <asm/ds.h> 54#include <asm/ds.h>
55#include <asm/debugreg.h>
55 56
56asmlinkage extern void ret_from_fork(void); 57asmlinkage extern void ret_from_fork(void);
57 58
@@ -226,8 +227,7 @@ void __show_regs(struct pt_regs *regs, int all)
226 227
227void show_regs(struct pt_regs *regs) 228void show_regs(struct pt_regs *regs)
228{ 229{
229 printk(KERN_INFO "CPU %d:", smp_processor_id()); 230 show_registers(regs);
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 232}
233 233
@@ -297,12 +297,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 297
298 p->thread.fs = me->thread.fs; 298 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 299 p->thread.gs = me->thread.gs;
300 p->thread.io_bitmap_ptr = NULL;
300 301
301 savesegment(gs, p->thread.gsindex); 302 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 303 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 304 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 305 savesegment(ds, p->thread.ds);
305 306
307 err = -ENOMEM;
308 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
309
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 310 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 311 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 312 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +345,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 345 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 346 p->thread.io_bitmap_max = 0;
343 } 347 }
348
344 return err; 349 return err;
345} 350}
346 351
347void 352static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 353start_thread_common(struct pt_regs *regs, unsigned long new_ip,
354 unsigned long new_sp,
355 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 356{
350 loadsegment(fs, 0); 357 loadsegment(fs, 0);
351 loadsegment(es, 0); 358 loadsegment(es, _ds);
352 loadsegment(ds, 0); 359 loadsegment(ds, _ds);
353 load_gs_index(0); 360 load_gs_index(0);
354 regs->ip = new_ip; 361 regs->ip = new_ip;
355 regs->sp = new_sp; 362 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 363 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 364 regs->cs = _cs;
358 regs->ss = __USER_DS; 365 regs->ss = _ss;
359 regs->flags = 0x200; 366 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 367 set_fs(USER_DS);
361 /* 368 /*
362 * Free the old FP and other extended state 369 * Free the old FP and other extended state
363 */ 370 */
364 free_thread_xstate(current); 371 free_thread_xstate(current);
365} 372}
366EXPORT_SYMBOL_GPL(start_thread); 373
374void
375start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
376{
377 start_thread_common(regs, new_ip, new_sp,
378 __USER_CS, __USER_DS, 0);
379}
380
381#ifdef CONFIG_IA32_EMULATION
382void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
383{
384 start_thread_common(regs, new_ip, new_sp,
385 __USER32_CS, __USER32_DS, __USER32_DS);
386}
387#endif
367 388
368/* 389/*
369 * switch_to(x,y) should switch tasks from x to y. 390 * switch_to(x,y) should switch tasks from x to y.
@@ -495,6 +516,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 516 */
496 if (preload_fpu) 517 if (preload_fpu)
497 __math_state_restore(); 518 __math_state_restore();
519
498 return prev_p; 520 return prev_p;
499} 521}
500 522
@@ -664,3 +686,8 @@ long sys_arch_prctl(int code, unsigned long addr)
664 return do_arch_prctl(current, code, addr); 686 return do_arch_prctl(current, code, addr);
665} 687}
666 688
689unsigned long KSTK_ESP(struct task_struct *task)
690{
691 return (test_tsk_thread_flag(task, TIF_IA32)) ?
692 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
693}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..04d182a7cfdb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -454,99 +555,239 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, void *data)
559{
560 int i;
561 struct thread_struct *thread = &(current->thread);
562
563 /*
564 * Store in the virtual DR6 register the fact that the breakpoint
565 * was hit so the thread's debugger will see it.
566 */
567 for (i = 0; i < HBP_NUM; i++) {
568 if (thread->ptrace_bps[i] == bp)
569 break;
570 }
571
572 thread->debugreg6 |= (DR_TRAP0 << i);
573}
574
457/* 575/*
458 * This function is trivial and will be inlined by the compiler. 576 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 577 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 578 *
461 */ 579 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 580static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 581{
464 switch (n) { 582 int i;
465 case 0: return child->thread.debugreg0; 583 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 584 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 585
468 case 3: return child->thread.debugreg3; 586 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 587 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 588 info = counter_arch_bp(bp[i]);
589 dr7 |= encode_dr7(i, info->len, info->type);
590 }
471 } 591 }
472 return 0; 592
593 return dr7;
473} 594}
474 595
475static int ptrace_set_debugreg(struct task_struct *child, 596static struct perf_event *
476 int n, unsigned long data) 597ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
598 struct task_struct *tsk, int disabled)
477{ 599{
478 int i; 600 int err;
601 int gen_len, gen_type;
602 DEFINE_BREAKPOINT_ATTR(attr);
479 603
480 if (unlikely(n == 4 || n == 5)) 604 /*
481 return -EIO; 605 * We shoud have at least an inactive breakpoint at this
606 * slot. It means the user is writing dr7 without having
607 * written the address register first
608 */
609 if (!bp)
610 return ERR_PTR(-EINVAL);
482 611
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 612 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 613 if (err)
614 return ERR_PTR(err);
485 615
486 switch (n) { 616 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 617 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 618 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 619 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 620
492 case 6: 621 return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
493 if ((data & ~0xffffffffUL) != 0) 622}
494 return -EIO; 623
495 child->thread.debugreg6 = data; 624/*
496 break; 625 * Handle ptrace writes to debug register 7.
626 */
627static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
628{
629 struct thread_struct *thread = &(tsk->thread);
630 unsigned long old_dr7;
631 int i, orig_ret = 0, rc = 0;
632 int enabled, second_pass = 0;
633 unsigned len, type;
634 struct perf_event *bp;
635
636 data &= ~DR_CONTROL_RESERVED;
637 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
638restore:
639 /*
640 * Loop through all the hardware breakpoints, making the
641 * appropriate changes to each.
642 */
643 for (i = 0; i < HBP_NUM; i++) {
644 enabled = decode_dr7(data, i, &len, &type);
645 bp = thread->ptrace_bps[i];
646
647 if (!enabled) {
648 if (bp) {
649 /*
650 * Don't unregister the breakpoints right-away,
651 * unless all register_user_hw_breakpoint()
652 * requests have succeeded. This prevents
653 * any window of opportunity for debug
654 * register grabbing by other users.
655 */
656 if (!second_pass)
657 continue;
658
659 thread->ptrace_bps[i] = NULL;
660 bp = ptrace_modify_breakpoint(bp, len, type,
661 tsk, 1);
662 if (IS_ERR(bp)) {
663 rc = PTR_ERR(bp);
664 thread->ptrace_bps[i] = NULL;
665 break;
666 }
667 thread->ptrace_bps[i] = bp;
668 }
669 continue;
670 }
671
672 bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
673
674 /* Incorrect bp, or we have a bug in bp API */
675 if (IS_ERR(bp)) {
676 rc = PTR_ERR(bp);
677 thread->ptrace_bps[i] = NULL;
678 break;
679 }
680 thread->ptrace_bps[i] = bp;
681 }
682 /*
683 * Make a second pass to free the remaining unused breakpoints
684 * or to restore the original breakpoints if an error occurred.
685 */
686 if (!second_pass) {
687 second_pass = 1;
688 if (rc < 0) {
689 orig_ret = rc;
690 data = old_dr7;
691 }
692 goto restore;
693 }
694 return ((orig_ret < 0) ? orig_ret : rc);
695}
696
697/*
698 * Handle PTRACE_PEEKUSR calls for the debug register area.
699 */
700static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
701{
702 struct thread_struct *thread = &(tsk->thread);
703 unsigned long val = 0;
497 704
498 case 7: 705 if (n < HBP_NUM) {
706 struct perf_event *bp;
707 bp = thread->ptrace_bps[n];
708 if (!bp)
709 return 0;
710 val = bp->hw.info.address;
711 } else if (n == 6) {
712 val = thread->debugreg6;
713 } else if (n == 7) {
714 val = ptrace_get_dr7(thread->ptrace_bps);
715 }
716 return val;
717}
718
719static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
720 unsigned long addr)
721{
722 struct perf_event *bp;
723 struct thread_struct *t = &tsk->thread;
724 DEFINE_BREAKPOINT_ATTR(attr);
725
726 if (!t->ptrace_bps[nr]) {
499 /* 727 /*
500 * Sanity-check data. Take one half-byte at once with 728 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 729 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 730 */
529#ifdef CONFIG_X86_32 731 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 732 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 733 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 734 attr.disabled = 1;
533#endif 735
534 data &= ~DR_CONTROL_RESERVED; 736 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 737 } else {
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 738 bp = t->ptrace_bps[nr];
537 return -EIO; 739 t->ptrace_bps[nr] = NULL;
538 child->thread.debugreg7 = data; 740
539 if (data) 741 attr = bp->attr;
540 set_tsk_thread_flag(child, TIF_DEBUG); 742 attr.bp_addr = addr;
541 else 743 bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
542 clear_tsk_thread_flag(child, TIF_DEBUG);
543 break;
544 } 744 }
745 /*
746 * CHECKME: the previous code returned -EIO if the addr wasn't a
747 * valid task virtual addr. The new one will return -EINVAL in this
748 * case.
749 * -EINVAL may be what we want for in-kernel breakpoints users, but
750 * -EIO looks better for ptrace, since we refuse a register writing
751 * for the user. And anyway this is the previous behaviour.
752 */
753 if (IS_ERR(bp))
754 return PTR_ERR(bp);
755
756 t->ptrace_bps[nr] = bp;
545 757
546 return 0; 758 return 0;
547} 759}
548 760
549/* 761/*
762 * Handle PTRACE_POKEUSR calls for the debug register area.
763 */
764int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
765{
766 struct thread_struct *thread = &(tsk->thread);
767 int rc = 0;
768
769 /* There are no DR4 or DR5 registers */
770 if (n == 4 || n == 5)
771 return -EIO;
772
773 if (n == 6) {
774 thread->debugreg6 = val;
775 goto ret_path;
776 }
777 if (n < HBP_NUM) {
778 rc = ptrace_set_breakpoint_addr(tsk, n, val);
779 if (rc)
780 return rc;
781 }
782 /* All that's left is DR7 */
783 if (n == 7)
784 rc = ptrace_write_dr7(tsk, val);
785
786ret_path:
787 return rc;
788}
789
790/*
550 * These access the current or another (stopped) task's io permission 791 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 792 * bitmap for debugging or core dump.
552 */ 793 */
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..18093d7498f0 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 499{
500 struct pci_dev *nb_ht; 500 struct pci_dev *nb_ht;
501 unsigned int devfn; 501 unsigned int devfn;
502 u32 node;
502 u32 val; 503 u32 val;
503 504
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 505 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 508 return;
508 509
509 pci_read_config_dword(nb_ht, 0x60, &val); 510 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 511 node = val & 7;
512 /*
513 * Some hardware may return an invalid node ID,
514 * so check it first:
515 */
516 if (node_online(node))
517 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 518 pci_dev_put(nb_ht);
512} 519}
513 520
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a1a3cdda06e1..2b97fc5b124e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
23# include <linux/ctype.h> 23# include <linux/ctype.h>
24# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
25#else 25#else
26# include <asm/iommu.h> 26# include <asm/x86_init.h>
27#endif 27#endif
28 28
29/* 29/*
@@ -436,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
436 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), 436 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
437 }, 437 },
438 }, 438 },
439 { /* Handle problems with rebooting on Apple Macmini3,1 */
440 .callback = set_pci_reboot,
441 .ident = "Apple Macmini3,1",
442 .matches = {
443 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
444 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
445 },
446 },
439 { } 447 { }
440}; 448};
441 449
@@ -614,7 +622,7 @@ void native_machine_shutdown(void)
614#endif 622#endif
615 623
616#ifdef CONFIG_X86_64 624#ifdef CONFIG_X86_64
617 pci_iommu_shutdown(); 625 x86_platform.iommu_shutdown();
618#endif 626#endif
619} 627}
620 628
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..201eab63b05f 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e09f0e2c14b5..946a311a25c9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -106,9 +106,11 @@
106#include <asm/percpu.h> 106#include <asm/percpu.h>
107#include <asm/topology.h> 107#include <asm/topology.h>
108#include <asm/apicdef.h> 108#include <asm/apicdef.h>
109#include <asm/k8.h>
109#ifdef CONFIG_X86_64 110#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 111#include <asm/numa_64.h>
111#endif 112#endif
113#include <asm/mce.h>
112 114
113/* 115/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -247,7 +249,7 @@ EXPORT_SYMBOL(edd);
247 * from boot_params into a safe place. 249 * from boot_params into a safe place.
248 * 250 *
249 */ 251 */
250static inline void copy_edd(void) 252static inline void __init copy_edd(void)
251{ 253{
252 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 254 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
253 sizeof(edd.mbr_signature)); 255 sizeof(edd.mbr_signature));
@@ -256,7 +258,7 @@ static inline void copy_edd(void)
256 edd.edd_info_nr = boot_params.eddbuf_entries; 258 edd.edd_info_nr = boot_params.eddbuf_entries;
257} 259}
258#else 260#else
259static inline void copy_edd(void) 261static inline void __init copy_edd(void)
260{ 262{
261} 263}
262#endif 264#endif
@@ -486,42 +488,11 @@ static void __init reserve_early_setup_data(void)
486 488
487#ifdef CONFIG_KEXEC 489#ifdef CONFIG_KEXEC
488 490
489/**
490 * Reserve @size bytes of crashkernel memory at any suitable offset.
491 *
492 * @size: Size of the crashkernel memory to reserve.
493 * Returns the base address on success, and -1ULL on failure.
494 */
495static
496unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
497{
498 const unsigned long long alignment = 16<<20; /* 16M */
499 unsigned long long start = 0LL;
500
501 while (1) {
502 int ret;
503
504 start = find_e820_area(start, ULONG_MAX, size, alignment);
505 if (start == -1ULL)
506 return start;
507
508 /* try to reserve it */
509 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
510 if (ret >= 0)
511 return start;
512
513 start += alignment;
514 }
515}
516
517static inline unsigned long long get_total_mem(void) 491static inline unsigned long long get_total_mem(void)
518{ 492{
519 unsigned long long total; 493 unsigned long long total;
520 494
521 total = max_low_pfn - min_low_pfn; 495 total = max_pfn - min_low_pfn;
522#ifdef CONFIG_HIGHMEM
523 total += highend_pfn - highstart_pfn;
524#endif
525 496
526 return total << PAGE_SHIFT; 497 return total << PAGE_SHIFT;
527} 498}
@@ -541,21 +512,25 @@ static void __init reserve_crashkernel(void)
541 512
542 /* 0 means: find the address automatically */ 513 /* 0 means: find the address automatically */
543 if (crash_base <= 0) { 514 if (crash_base <= 0) {
544 crash_base = find_and_reserve_crashkernel(crash_size); 515 const unsigned long long alignment = 16<<20; /* 16M */
516
517 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
518 alignment);
545 if (crash_base == -1ULL) { 519 if (crash_base == -1ULL) {
546 pr_info("crashkernel reservation failed. " 520 pr_info("crashkernel reservation failed - No suitable area found.\n");
547 "No suitable area found.\n");
548 return; 521 return;
549 } 522 }
550 } else { 523 } else {
551 ret = reserve_bootmem_generic(crash_base, crash_size, 524 unsigned long long start;
552 BOOTMEM_EXCLUSIVE); 525
553 if (ret < 0) { 526 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
554 pr_info("crashkernel reservation failed - " 527 1<<20);
555 "memory is in use\n"); 528 if (start != crash_base) {
529 pr_info("crashkernel reservation failed - memory is in use.\n");
556 return; 530 return;
557 } 531 }
558 } 532 }
533 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
559 534
560 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 535 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
561 "for crashkernel (System RAM: %ldMB)\n", 536 "for crashkernel (System RAM: %ldMB)\n",
@@ -660,6 +635,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
660 }, 635 },
661 }, 636 },
662 { 637 {
638 .callback = dmi_low_memory_corruption,
639 .ident = "Phoenix/MSC BIOS",
640 .matches = {
641 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
642 },
643 },
644 {
663 /* 645 /*
664 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 646 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
665 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 647 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
@@ -691,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
691 673
692void __init setup_arch(char **cmdline_p) 674void __init setup_arch(char **cmdline_p)
693{ 675{
676 int acpi = 0;
677 int k8 = 0;
678
694#ifdef CONFIG_X86_32 679#ifdef CONFIG_X86_32
695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 680 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
696 visws_early_detect(); 681 visws_early_detect();
@@ -783,21 +768,18 @@ void __init setup_arch(char **cmdline_p)
783 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 768 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
784 *cmdline_p = command_line; 769 *cmdline_p = command_line;
785 770
786#ifdef CONFIG_X86_64
787 /* 771 /*
788 * Must call this twice: Once just to detect whether hardware doesn't 772 * x86_configure_nx() is called before parse_early_param() to detect
789 * support NX (so that the early EHCI debug console setup can safely 773 * whether hardware doesn't support NX (so that the early EHCI debug
790 * call set_fixmap(), and then again after parsing early parameters to 774 * console setup can safely call set_fixmap()). It may then be called
791 * honor the respective command line option. 775 * again from within noexec_setup() during parsing early parameters
776 * to honor the respective command line option.
792 */ 777 */
793 check_efer(); 778 x86_configure_nx();
794#endif
795 779
796 parse_early_param(); 780 parse_early_param();
797 781
798#ifdef CONFIG_X86_64 782 x86_report_nx();
799 check_efer();
800#endif
801 783
802 /* Must be before kernel pagetables are setup */ 784 /* Must be before kernel pagetables are setup */
803 vmi_activate(); 785 vmi_activate();
@@ -893,6 +875,13 @@ void __init setup_arch(char **cmdline_p)
893 875
894 reserve_brk(); 876 reserve_brk();
895 877
878#ifdef CONFIG_ACPI_SLEEP
879 /*
880 * Reserve low memory region for sleep support.
881 * even before init_memory_mapping
882 */
883 acpi_reserve_wakeup_memory();
884#endif
896 init_gbpages(); 885 init_gbpages();
897 886
898 /* max_pfn_mapped is updated here */ 887 /* max_pfn_mapped is updated here */
@@ -919,6 +908,8 @@ void __init setup_arch(char **cmdline_p)
919 908
920 reserve_initrd(); 909 reserve_initrd();
921 910
911 reserve_crashkernel();
912
922 vsmp_init(); 913 vsmp_init();
923 914
924 io_delay_init(); 915 io_delay_init();
@@ -930,27 +921,24 @@ void __init setup_arch(char **cmdline_p)
930 921
931 early_acpi_boot_init(); 922 early_acpi_boot_init();
932 923
924 /*
925 * Find and reserve possible boot-time SMP configuration:
926 */
927 find_smp_config();
928
933#ifdef CONFIG_ACPI_NUMA 929#ifdef CONFIG_ACPI_NUMA
934 /* 930 /*
935 * Parse SRAT to discover nodes. 931 * Parse SRAT to discover nodes.
936 */ 932 */
937 acpi_numa_init(); 933 acpi = acpi_numa_init();
938#endif 934#endif
939 935
940 initmem_init(0, max_pfn); 936#ifdef CONFIG_K8_NUMA
941 937 if (!acpi)
942#ifdef CONFIG_ACPI_SLEEP 938 k8 = !k8_numa_init(0, max_pfn);
943 /*
944 * Reserve low memory region for sleep support.
945 */
946 acpi_reserve_bootmem();
947#endif 939#endif
948 /*
949 * Find and reserve possible boot-time SMP configuration:
950 */
951 find_smp_config();
952 940
953 reserve_crashkernel(); 941 initmem_init(0, max_pfn, acpi, k8);
954 942
955#ifdef CONFIG_X86_64 943#ifdef CONFIG_X86_64
956 /* 944 /*
@@ -1024,6 +1012,8 @@ void __init setup_arch(char **cmdline_p)
1024#endif 1012#endif
1025#endif 1013#endif
1026 x86_init.oem.banner(); 1014 x86_init.oem.banner();
1015
1016 mcheck_init();
1027} 1017}
1028 1018
1029#ifdef CONFIG_X86_32 1019#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..74fe6d86dc5d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -799,15 +800,6 @@ static void do_signal(struct pt_regs *regs)
799 800
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 801 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 802 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 803 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 804 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 805 /*
@@ -872,6 +864,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 864 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 865 key_replace_session_keyring();
874 } 866 }
867 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
868 fire_user_return_notifiers();
875 869
876#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 871 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..324f2a44c221 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1250,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1250void cpu_disable_common(void)
1251{ 1251{
1252 int cpu = smp_processor_id(); 1252 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1253
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1254 remove_siblinginfo(cpu);
1264 1255
1265 /* It's now safe to remove this processor from the online map */ 1256 /* It's now safe to remove this processor from the online map */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..70c2125d55b9 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 1740c85e24bb..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade)
817 */ 817 */
818 apicid = blade_to_first_apicid(blade); 818 apicid = blade_to_first_apicid(blade);
819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
820 if ((pa & 0xff) != UV_BAU_MESSAGE) { 820 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 821 ((apicid << 32) | UV_BAU_MESSAGE));
823 }
824 return 0; 822 return 0;
825} 823}
826 824
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..eed156851f5d 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..61d805df4c91 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
12#include <linux/irq.h> 13#include <linux/irq.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 16#include <asm/uv/uv_irq.h>
17#include <asm/uv/uv_hub.h>
18
19/* MMR offset and pnode of hub sourcing interrupts for a given irq */
20struct uv_irq_2_mmr_pnode{
21 struct rb_node list;
22 unsigned long offset;
23 int pnode;
24 int irq;
25};
26
27static spinlock_t uv_irq_lock;
28static struct rb_root uv_irq_root;
29
30static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 31
17static void uv_noop(unsigned int irq) 32static void uv_noop(unsigned int irq)
18{ 33{
@@ -39,25 +54,214 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 54 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 55 .eoi = uv_ack_apic,
41 .end = uv_noop, 56 .end = uv_noop,
57 .set_affinity = uv_set_irq_affinity,
42}; 58};
43 59
44/* 60/*
61 * Add offset and pnode information of the hub sourcing interrupts to the
62 * rb tree for a specific irq.
63 */
64static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
65{
66 struct rb_node **link = &uv_irq_root.rb_node;
67 struct rb_node *parent = NULL;
68 struct uv_irq_2_mmr_pnode *n;
69 struct uv_irq_2_mmr_pnode *e;
70 unsigned long irqflags;
71
72 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
73 uv_blade_to_memory_nid(blade));
74 if (!n)
75 return -ENOMEM;
76
77 n->irq = irq;
78 n->offset = offset;
79 n->pnode = uv_blade_to_pnode(blade);
80 spin_lock_irqsave(&uv_irq_lock, irqflags);
81 /* Find the right place in the rbtree: */
82 while (*link) {
83 parent = *link;
84 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
85
86 if (unlikely(irq == e->irq)) {
87 /* irq entry exists */
88 e->pnode = uv_blade_to_pnode(blade);
89 e->offset = offset;
90 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
91 kfree(n);
92 return 0;
93 }
94
95 if (irq < e->irq)
96 link = &(*link)->rb_left;
97 else
98 link = &(*link)->rb_right;
99 }
100
101 /* Insert the node into the rbtree. */
102 rb_link_node(&n->list, parent, link);
103 rb_insert_color(&n->list, &uv_irq_root);
104
105 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
106 return 0;
107}
108
109/* Retrieve offset and pnode information from the rb tree for a specific irq */
110int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
111{
112 struct uv_irq_2_mmr_pnode *e;
113 struct rb_node *n;
114 unsigned long irqflags;
115
116 spin_lock_irqsave(&uv_irq_lock, irqflags);
117 n = uv_irq_root.rb_node;
118 while (n) {
119 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
120
121 if (e->irq == irq) {
122 *offset = e->offset;
123 *pnode = e->pnode;
124 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
125 return 0;
126 }
127
128 if (irq < e->irq)
129 n = n->rb_left;
130 else
131 n = n->rb_right;
132 }
133 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
134 return -1;
135}
136
137/*
138 * Re-target the irq to the specified CPU and enable the specified MMR located
139 * on the specified blade to allow the sending of MSIs to the specified CPU.
140 */
141static int
142arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 unsigned long mmr_offset, int restrict)
144{
145 const struct cpumask *eligible_cpu = cpumask_of(cpu);
146 struct irq_desc *desc = irq_to_desc(irq);
147 struct irq_cfg *cfg;
148 int mmr_pnode;
149 unsigned long mmr_value;
150 struct uv_IO_APIC_route_entry *entry;
151 int err;
152
153 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
154 sizeof(unsigned long));
155
156 cfg = irq_cfg(irq);
157
158 err = assign_irq_vector(irq, cfg, eligible_cpu);
159 if (err != 0)
160 return err;
161
162 if (restrict == UV_AFFINITY_CPU)
163 desc->status |= IRQ_NO_BALANCING;
164 else
165 desc->status |= IRQ_MOVE_PCNTXT;
166
167 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
168 irq_name);
169
170 mmr_value = 0;
171 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
172 entry->vector = cfg->vector;
173 entry->delivery_mode = apic->irq_delivery_mode;
174 entry->dest_mode = apic->irq_dest_mode;
175 entry->polarity = 0;
176 entry->trigger = 0;
177 entry->mask = 0;
178 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
179
180 mmr_pnode = uv_blade_to_pnode(mmr_blade);
181 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
182
183 if (cfg->move_in_progress)
184 send_cleanup_vector(cfg);
185
186 return irq;
187}
188
189/*
190 * Disable the specified MMR located on the specified blade so that MSIs are
191 * longer allowed to be sent.
192 */
193static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
194{
195 unsigned long mmr_value;
196 struct uv_IO_APIC_route_entry *entry;
197
198 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
199 sizeof(unsigned long));
200
201 mmr_value = 0;
202 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
203 entry->mask = 1;
204
205 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
206}
207
208static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
209{
210 struct irq_desc *desc = irq_to_desc(irq);
211 struct irq_cfg *cfg = desc->chip_data;
212 unsigned int dest;
213 unsigned long mmr_value;
214 struct uv_IO_APIC_route_entry *entry;
215 unsigned long mmr_offset;
216 unsigned mmr_pnode;
217
218 dest = set_desc_affinity(desc, mask);
219 if (dest == BAD_APICID)
220 return -1;
221
222 mmr_value = 0;
223 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
224
225 entry->vector = cfg->vector;
226 entry->delivery_mode = apic->irq_delivery_mode;
227 entry->dest_mode = apic->irq_dest_mode;
228 entry->polarity = 0;
229 entry->trigger = 0;
230 entry->mask = 0;
231 entry->dest = dest;
232
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
235 return -1;
236
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
238
239 if (cfg->move_in_progress)
240 send_cleanup_vector(cfg);
241
242 return 0;
243}
244
245/*
45 * Set up a mapping of an available irq and vector, and enable the specified 246 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 247 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 248 * interrupt is raised.
48 */ 249 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 251 unsigned long mmr_offset, int restrict)
51{ 252{
52 int irq; 253 int irq, ret;
53 int ret; 254
255 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 256
55 irq = create_irq();
56 if (irq <= 0) 257 if (irq <= 0)
57 return -EBUSY; 258 return -EBUSY;
58 259
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 261 restrict);
262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else
61 destroy_irq(irq); 265 destroy_irq(irq);
62 266
63 return ret; 267 return ret;
@@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 275 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 276 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 277 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 278void uv_teardown_irq(unsigned int irq)
75{ 279{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 280 struct uv_irq_2_mmr_pnode *e;
281 struct rb_node *n;
282 unsigned long irqflags;
283
284 spin_lock_irqsave(&uv_irq_lock, irqflags);
285 n = uv_irq_root.rb_node;
286 while (n) {
287 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
288 if (e->irq == irq) {
289 arch_disable_uv_irq(e->pnode, e->offset);
290 rb_erase(n, &uv_irq_root);
291 kfree(e);
292 break;
293 }
294 if (irq < e->irq)
295 n = n->rb_left;
296 else
297 n = n->rb_right;
298 }
299 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 300 destroy_irq(irq);
78} 301}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 302EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..3c84aa001c11 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 315 break;
311 case CLOCK_EVT_MODE_UNUSED: 316 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 317 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 318 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 319 break;
315 } 320 }
316} 321}
317 322
318static void uv_rtc_interrupt(void) 323static void uv_rtc_interrupt(void)
319{ 324{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 325 int cpu = smp_processor_id();
326 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 327
323 if (!ced || !ced->event_handler) 328 if (!ced || !ced->event_handler)
324 return; 329 return;
325 330
326 if (uv_rtc_unset_timer(cpu) != 1) 331 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 332 return;
328 333
329 ced->event_handler(ced); 334 ced->event_handler(ced);
330} 335}
331 336
332static int __init uv_enable_rtc(char *str) 337static int __init uv_enable_evt_rtc(char *str)
333{ 338{
334 uv_rtc_enable = 1; 339 uv_rtc_evt_enable = 1;
335 340
336 return 1; 341 return 1;
337} 342}
338__setup("uvrtc", uv_enable_rtc); 343__setup("uvrtcevt", uv_enable_evt_rtc);
339 344
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 345static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 346{
@@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 355{
351 int rc; 356 int rc;
352 357
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 358 if (!is_uv_system())
354 return -ENODEV; 359 return -ENODEV;
355 360
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 361 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 362 clocksource_uv.shift);
360 363
364 /* If single blade, prefer tsc */
365 if (uv_num_possible_blades() == 1)
366 clocksource_uv.rating = 250;
367
361 rc = clocksource_register(&clocksource_uv); 368 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 369 if (rc)
363 generic_interrupt_extension = NULL; 370 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
371 else
372 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
373 sn_rtc_cycles_per_second/(unsigned long)1E6);
374
375 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 376 return rc;
365 }
366 377
367 /* Setup and register clockevents */ 378 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 379 rc = uv_rtc_allocate_timers();
369 if (rc) { 380 if (rc)
370 clocksource_unregister(&clocksource_uv); 381 goto error;
371 generic_interrupt_extension = NULL; 382
372 return rc; 383 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 384
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 385 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 386 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void)
383 393
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 394 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 395 if (rc) {
386 clocksource_unregister(&clocksource_uv); 396 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 397 uv_rtc_deallocate_timers();
398 goto error;
389 } 399 }
390 400
401 printk(KERN_INFO "UV RTC clockevents registered\n");
402
403 return 0;
404
405error:
406 clocksource_unregister(&clocksource_uv);
407 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
408
391 return rc; 409 return rc;
392} 410}
393arch_initcall(uv_rtc_setup_clock); 411arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 183 return;
184 } 184 }
185 185
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 186 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 188 /*
189 * Validate version 189 * Validate version
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
198} 198}
199 199
200static void __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(void)
201{ 201{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 486}
487 487
488static struct irq_chip cobalt_irq_type = { 488static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 489 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 490 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 491 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 492 .enable = enable_cobalt_irq,
@@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 523}
524 524
525static struct irq_chip piix4_master_irq_type = { 525static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 526 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 527 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 528 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 529 .end = end_piix4_master_irq,
@@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
531 531
532 532
533static struct irq_chip piix4_virtual_irq_type = { 533static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 534 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq, 535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq, 536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq, 537 .disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3c68fe2d46cf..f3f2104408d9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..e02d92d12bcd 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -237,7 +237,7 @@ static ctl_table kernel_table2[] = {
237}; 237};
238 238
239static ctl_table kernel_root_table2[] = { 239static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 240 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 241 .child = kernel_table2 },
242 {} 242 {}
243}; 243};
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..a1029769b6f2 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -30,9 +30,8 @@ EXPORT_SYMBOL(__put_user_8);
30 30
31EXPORT_SYMBOL(copy_user_generic); 31EXPORT_SYMBOL(copy_user_generic);
32EXPORT_SYMBOL(__copy_user_nocache); 32EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 33EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 34EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 35
37EXPORT_SYMBOL(copy_page); 36EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 37EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,11 +13,15 @@
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h>
16#include <asm/tsc.h> 17#include <asm/tsc.h>
18#include <asm/iommu.h>
17 19
18void __cpuinit x86_init_noop(void) { } 20void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { } 21void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { } 22void __init x86_init_pgd_noop(pgd_t *unused) { }
23int __init iommu_init_noop(void) { return 0; }
24void iommu_shutdown_noop(void) { }
21 25
22/* 26/*
23 * The platform setup functions are preset with the default functions 27 * The platform setup functions are preset with the default functions
@@ -62,6 +66,10 @@ struct x86_init_ops x86_init __initdata = {
62 .tsc_pre_init = x86_init_noop, 66 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init, 67 .timer_init = hpet_time_init,
64 }, 68 },
69
70 .iommu = {
71 .iommu_init = iommu_init_noop,
72 },
65}; 73};
66 74
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -72,4 +80,6 @@ struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc, 80 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range,
75}; 85};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..4cd498332466 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
31 ---help--- 32 ---help---
32 Support hosting fully virtualized guest machines using hardware 33 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 34 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..7e8faea4651e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -75,6 +75,8 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define No64 (1<<28)
78/* Source 2 operand type */ 80/* Source 2 operand type */
79#define Src2None (0<<29) 81#define Src2None (0<<29)
80#define Src2CL (1<<29) 82#define Src2CL (1<<29)
@@ -92,19 +94,23 @@ static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 94 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 97 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
98 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 99 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 102 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
103 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 104 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 109 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 114 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 115 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -133,7 +139,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 139 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 140 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 141 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 142 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
143 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 144 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 145 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 146 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -158,7 +165,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 165 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 166 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 167 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 168 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 169 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 170 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 171 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +192,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 192 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 193 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 194 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 195 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 196 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 197 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 198 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,7 +205,7 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 205 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 206 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 207 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 208 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 209 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 210 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 211 /* 0xF0 - 0xF7 */
@@ -244,11 +251,13 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 251 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 253 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 254 ImplicitOps | Stack, ImplicitOps | Stack,
255 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 256 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 257 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 258 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 259 ImplicitOps | Stack, ImplicitOps | Stack,
260 0, DstMem | SrcReg | ModRM | BitOp,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 261 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 262 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 263 ModRM, 0,
@@ -613,6 +622,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 622{
614 int rc = 0; 623 int rc = 0;
615 624
625 /* x86 instructions are limited to 15 bytes. */
626 if (eip + size - ctxt->decode.eip_orig > 15)
627 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 628 eip += ctxt->cs_base;
617 while (size--) { 629 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 630 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
@@ -871,7 +883,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 883 /* Shadow copy of register state. Committed on successful emulation. */
872 884
873 memset(c, 0, sizeof(struct decode_cache)); 885 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 886 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 887 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 888 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 889
@@ -962,6 +974,11 @@ done_prefixes:
962 } 974 }
963 } 975 }
964 976
977 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
978 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");;
979 return -1;
980 }
981
965 if (c->d & Group) { 982 if (c->d & Group) {
966 group = c->d & GroupMask; 983 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 984 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1186,6 +1203,69 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1186 return rc; 1203 return rc;
1187} 1204}
1188 1205
1206static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1207{
1208 struct decode_cache *c = &ctxt->decode;
1209 struct kvm_segment segment;
1210
1211 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1212
1213 c->src.val = segment.selector;
1214 emulate_push(ctxt);
1215}
1216
1217static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1218 struct x86_emulate_ops *ops, int seg)
1219{
1220 struct decode_cache *c = &ctxt->decode;
1221 unsigned long selector;
1222 int rc;
1223
1224 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1225 if (rc != 0)
1226 return rc;
1227
1228 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg);
1229 return rc;
1230}
1231
1232static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1233{
1234 struct decode_cache *c = &ctxt->decode;
1235 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1236 int reg = VCPU_REGS_RAX;
1237
1238 while (reg <= VCPU_REGS_RDI) {
1239 (reg == VCPU_REGS_RSP) ?
1240 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1241
1242 emulate_push(ctxt);
1243 ++reg;
1244 }
1245}
1246
1247static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops)
1249{
1250 struct decode_cache *c = &ctxt->decode;
1251 int rc = 0;
1252 int reg = VCPU_REGS_RDI;
1253
1254 while (reg >= VCPU_REGS_RAX) {
1255 if (reg == VCPU_REGS_RSP) {
1256 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1257 c->op_bytes);
1258 --reg;
1259 }
1260
1261 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1262 if (rc != 0)
1263 break;
1264 --reg;
1265 }
1266 return rc;
1267}
1268
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1269static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1270 struct x86_emulate_ops *ops)
1191{ 1271{
@@ -1707,18 +1787,45 @@ special_insn:
1707 add: /* add */ 1787 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1788 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1789 break;
1790 case 0x06: /* push es */
1791 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1792 break;
1793 case 0x07: /* pop es */
1794 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1795 if (rc != 0)
1796 goto done;
1797 break;
1710 case 0x08 ... 0x0d: 1798 case 0x08 ... 0x0d:
1711 or: /* or */ 1799 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1800 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1801 break;
1802 case 0x0e: /* push cs */
1803 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1804 break;
1714 case 0x10 ... 0x15: 1805 case 0x10 ... 0x15:
1715 adc: /* adc */ 1806 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1807 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1808 break;
1809 case 0x16: /* push ss */
1810 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1811 break;
1812 case 0x17: /* pop ss */
1813 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1814 if (rc != 0)
1815 goto done;
1816 break;
1718 case 0x18 ... 0x1d: 1817 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1818 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1819 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1820 break;
1821 case 0x1e: /* push ds */
1822 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1823 break;
1824 case 0x1f: /* pop ds */
1825 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1826 if (rc != 0)
1827 goto done;
1828 break;
1722 case 0x20 ... 0x25: 1829 case 0x20 ... 0x25:
1723 and: /* and */ 1830 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1831 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1857,14 @@ special_insn:
1750 if (rc != 0) 1857 if (rc != 0)
1751 goto done; 1858 goto done;
1752 break; 1859 break;
1860 case 0x60: /* pusha */
1861 emulate_pusha(ctxt);
1862 break;
1863 case 0x61: /* popa */
1864 rc = emulate_popa(ctxt, ops);
1865 if (rc != 0)
1866 goto done;
1867 break;
1753 case 0x63: /* movsxd */ 1868 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 1869 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 1870 goto cannot_emulate;
@@ -1761,7 +1876,7 @@ special_insn:
1761 break; 1876 break;
1762 case 0x6c: /* insb */ 1877 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 1878 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1879 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 1880 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 1881 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 1882 c->rep_prefix ?
@@ -1777,7 +1892,7 @@ special_insn:
1777 return 0; 1892 return 0;
1778 case 0x6e: /* outsb */ 1893 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 1894 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 1895 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 1896 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 1897 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 1898 c->rep_prefix ?
@@ -2070,7 +2185,7 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2185 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2186 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2187 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2188 do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2189 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2190 port) != 0) {
2076 c->eip = saved_eip; 2191 c->eip = saved_eip;
@@ -2297,6 +2412,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2412 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2413 c->dst.type = OP_NONE;
2299 break; 2414 break;
2415 case 0xa0: /* push fs */
2416 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2417 break;
2418 case 0xa1: /* pop fs */
2419 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2420 if (rc != 0)
2421 goto done;
2422 break;
2300 case 0xa3: 2423 case 0xa3:
2301 bt: /* bt */ 2424 bt: /* bt */
2302 c->dst.type = OP_NONE; 2425 c->dst.type = OP_NONE;
@@ -2308,6 +2431,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2431 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2432 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2433 break;
2434 case 0xa8: /* push gs */
2435 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2436 break;
2437 case 0xa9: /* pop gs */
2438 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2439 if (rc != 0)
2440 goto done;
2441 break;
2311 case 0xab: 2442 case 0xab:
2312 bts: /* bts */ 2443 bts: /* bts */
2313 /* only subword offset */ 2444 /* only subword offset */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 144e7f60b5e2..fab7440c9bb2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -688,10 +688,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 688 struct kvm_vcpu *vcpu;
689 int i; 689 int i;
690 690
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 691 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 693
696 /* 694 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 695 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..d057c0cbd245 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 39 if (s != &s->pics_state->pics[0])
40 irq += 8; 40 irq += 8;
41 /*
42 * We are dropping lock while calling ack notifiers since ack
43 * notifier callbacks for assigned devices call into PIC recursively.
44 * Other interrupt may be delivered to PIC while lock is dropped but
45 * it should be safe since PIC state is already updated at this stage.
46 */
47 spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 48 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
49 spin_lock(&s->pics_state->lock);
42} 50}
43 51
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 52void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 184static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 185{
178 s->isr |= 1 << irq; 186 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 187 /*
185 * We don't clear a level sensitive interrupt here 188 * We don't clear a level sensitive interrupt here
186 */ 189 */
187 if (!(s->elcr & (1 << irq))) 190 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 191 s->irr &= ~(1 << irq);
192
193 if (s->auto_eoi) {
194 if (s->rotate_on_auto_eoi)
195 s->priority_add = (irq + 1) & 7;
196 pic_clear_isr(s, irq);
197 }
198
189} 199}
190 200
191int kvm_pic_read_irq(struct kvm *kvm) 201int kvm_pic_read_irq(struct kvm *kvm)
@@ -225,22 +235,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
225 235
226void kvm_pic_reset(struct kvm_kpic_state *s) 236void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 237{
228 int irq, irqbase, n; 238 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 239 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 240 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
241 u8 irr = s->irr, isr = s->imr;
231 242
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 243 s->last_irr = 0;
245 s->irr = 0; 244 s->irr = 0;
246 s->imr = 0; 245 s->imr = 0;
@@ -256,6 +255,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 255 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 256 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 257 s->init4 = 0;
258
259 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
260 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
261 if (irr & (1 << irq) || isr & (1 << irq)) {
262 pic_clear_isr(s, irq);
263 }
264 }
259} 265}
260 266
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 267static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 304 priority = get_priority(s, s->isr);
299 if (priority != 8) { 305 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 306 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 307 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 308 s->priority_add = (irq + 1) & 7;
309 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 310 pic_update_irq(s->pics_state);
305 } 311 }
306 break; 312 break;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..be399e207d57 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -71,6 +71,7 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
@@ -85,7 +86,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 86
86static inline int irqchip_in_kernel(struct kvm *kvm) 87static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 88{
88 return pic_irqchip(kvm) != NULL; 89 int ret;
90
91 ret = (pic_irqchip(kvm) != NULL);
92 smp_rmb();
93 return ret;
89} 94}
90 95
91void kvm_pic_reset(struct kvm_kpic_state *s); 96void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 23c217692ea9..cd60c0bd1b32 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,7 +32,6 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 35#include "kvm_cache_regs.h"
37#include "irq.h" 36#include "irq.h"
38#include "trace.h" 37#include "trace.h"
@@ -471,11 +470,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 470 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 471 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 472 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 473 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 475}
480 476
481static void apic_send_ipi(struct kvm_lapic *apic) 477static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +500,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 500 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 501 irq.vector);
506 502
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 503 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 504}
511 505
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 506static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 818b92ad82cf..4c3e5b2314cb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2789,7 +2789,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2789 if (r) 2789 if (r)
2790 goto out; 2790 goto out;
2791 2791
2792 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2792 er = emulate_instruction(vcpu, cr2, error_code, 0);
2793 2793
2794 switch (er) { 2794 switch (er) {
2795 case EMULATE_DONE: 2795 case EMULATE_DONE:
@@ -2800,6 +2800,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2800 case EMULATE_FAIL: 2800 case EMULATE_FAIL:
2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2803 vcpu->run->internal.ndata = 0;
2803 return 0; 2804 return 0;
2804 default: 2805 default:
2805 BUG(); 2806 BUG();
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..a6017132fba8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -467,7 +467,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..3de0b37ec038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
49#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 50
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 51#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 52#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
53 54
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 55#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 56
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 57static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 59 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 77 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 78 u64 vmcb_msrpm;
87 79
80 /* A VMEXIT is required but not yet emulated */
81 bool exit_required;
82
88 /* cache for intercepts of the guest */ 83 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 84 u16 intercept_cr_read;
90 u16 intercept_cr_write; 85 u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
112 u32 *msrpm; 107 u32 *msrpm;
113 108
114 struct nested_state nested; 109 struct nested_state nested;
110
111 bool nmi_singlestep;
115}; 112};
116 113
117/* enable NPT for AMD64 and X86 with PAE */ 114/* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 283 struct vcpu_svm *svm = to_svm(vcpu);
287 284
288 if (!svm->next_rip) { 285 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 286 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 287 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 288 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 289 return;
@@ -316,7 +313,7 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 313 cpu_svm_disable();
317} 314}
318 315
319static void svm_hardware_enable(void *garbage) 316static int svm_hardware_enable(void *garbage)
320{ 317{
321 318
322 struct svm_cpu_data *svm_data; 319 struct svm_cpu_data *svm_data;
@@ -325,16 +322,21 @@ static void svm_hardware_enable(void *garbage)
325 struct desc_struct *gdt; 322 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 323 int me = raw_smp_processor_id();
327 324
325 rdmsrl(MSR_EFER, efer);
326 if (efer & EFER_SVME)
327 return -EBUSY;
328
328 if (!has_svm()) { 329 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 330 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 331 me);
332 return -EINVAL;
331 } 333 }
332 svm_data = per_cpu(svm_data, me); 334 svm_data = per_cpu(svm_data, me);
333 335
334 if (!svm_data) { 336 if (!svm_data) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 337 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 338 me);
337 return; 339 return -EINVAL;
338 } 340 }
339 341
340 svm_data->asid_generation = 1; 342 svm_data->asid_generation = 1;
@@ -345,11 +347,12 @@ static void svm_hardware_enable(void *garbage)
345 gdt = (struct desc_struct *)gdt_descr.base; 347 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 348 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 349
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 350 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 351
351 wrmsrl(MSR_VM_HSAVE_PA, 352 wrmsrl(MSR_VM_HSAVE_PA,
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 353 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
354
355 return 0;
353} 356}
354 357
355static void svm_cpu_uninit(int cpu) 358static void svm_cpu_uninit(int cpu)
@@ -476,7 +479,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 479 kvm_enable_efer_bits(EFER_SVME);
477 } 480 }
478 481
479 for_each_online_cpu(cpu) { 482 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 483 r = svm_cpu_init(cpu);
481 if (r) 484 if (r)
482 goto err; 485 goto err;
@@ -510,7 +513,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 513{
511 int cpu; 514 int cpu;
512 515
513 for_each_online_cpu(cpu) 516 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 517 svm_cpu_uninit(cpu);
515 518
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 519 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -625,11 +628,12 @@ static void init_vmcb(struct vcpu_svm *svm)
625 save->rip = 0x0000fff0; 628 save->rip = 0x0000fff0;
626 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 629 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
627 630
628 /* 631 /* This is the guest-visible cr0 value.
629 * cr0 val on cpu init should be 0x60000010, we enable cpu 632 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
630 * cache by default. the orderly way is to enable cache in bios.
631 */ 633 */
632 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 634 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
635 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
636
633 save->cr4 = X86_CR4_PAE; 637 save->cr4 = X86_CR4_PAE;
634 /* rdx = ?? */ 638 /* rdx = ?? */
635 639
@@ -644,8 +648,6 @@ static void init_vmcb(struct vcpu_svm *svm)
644 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| 648 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
645 INTERCEPT_CR3_MASK); 649 INTERCEPT_CR3_MASK);
646 save->g_pat = 0x0007040600070406ULL; 650 save->g_pat = 0x0007040600070406ULL;
647 /* enable caching because the QEMU Bios doesn't enable it */
648 save->cr0 = X86_CR0_ET;
649 save->cr3 = 0; 651 save->cr3 = 0;
650 save->cr4 = 0; 652 save->cr4 = 0;
651 } 653 }
@@ -654,6 +656,11 @@ static void init_vmcb(struct vcpu_svm *svm)
654 svm->nested.vmcb = 0; 656 svm->nested.vmcb = 0;
655 svm->vcpu.arch.hflags = 0; 657 svm->vcpu.arch.hflags = 0;
656 658
659 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
660 control->pause_filter_count = 3000;
661 control->intercept |= (1ULL << INTERCEPT_PAUSE);
662 }
663
657 enable_gif(svm); 664 enable_gif(svm);
658} 665}
659 666
@@ -758,14 +765,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 int i; 765 int i;
759 766
760 if (unlikely(cpu != vcpu->cpu)) { 767 if (unlikely(cpu != vcpu->cpu)) {
761 u64 tsc_this, delta; 768 u64 delta;
762 769
763 /* 770 /*
764 * Make sure that the guest sees a monotonically 771 * Make sure that the guest sees a monotonically
765 * increasing TSC. 772 * increasing TSC.
766 */ 773 */
767 rdtscll(tsc_this); 774 delta = vcpu->arch.host_tsc - native_read_tsc();
768 delta = vcpu->arch.host_tsc - tsc_this;
769 svm->vmcb->control.tsc_offset += delta; 775 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm)) 776 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta; 777 svm->nested.hsave->control.tsc_offset += delta;
@@ -787,7 +793,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
787 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 793 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
788 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 794 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
789 795
790 rdtscll(vcpu->arch.host_tsc); 796 vcpu->arch.host_tsc = native_read_tsc();
791} 797}
792 798
793static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 799static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1045,7 +1051,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1045 svm->vmcb->control.intercept_exceptions &= 1051 svm->vmcb->control.intercept_exceptions &=
1046 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1052 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1047 1053
1048 if (vcpu->arch.singlestep) 1054 if (svm->nmi_singlestep)
1049 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1055 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1050 1056
1051 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1057 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1066,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1060 vcpu->guest_debug = 0; 1066 vcpu->guest_debug = 0;
1061} 1067}
1062 1068
1063static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1069static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1064{ 1070{
1065 int old_debug = vcpu->guest_debug;
1066 struct vcpu_svm *svm = to_svm(vcpu); 1071 struct vcpu_svm *svm = to_svm(vcpu);
1067 1072
1068 vcpu->guest_debug = dbg->control;
1069
1070 update_db_intercept(vcpu);
1071
1072 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1073 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1073 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1074 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1074 else 1075 else
1075 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1076 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1076 1077
1077 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1078 update_db_intercept(vcpu);
1078 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1079 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1080 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1081
1082 return 0;
1083} 1079}
1084 1080
1085static void load_host_msrs(struct kvm_vcpu *vcpu) 1081static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1180,7 +1176,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1180 } 1176 }
1181} 1177}
1182 1178
1183static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1179static int pf_interception(struct vcpu_svm *svm)
1184{ 1180{
1185 u64 fault_address; 1181 u64 fault_address;
1186 u32 error_code; 1182 u32 error_code;
@@ -1194,17 +1190,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1194 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1190 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1195} 1191}
1196 1192
1197static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1193static int db_interception(struct vcpu_svm *svm)
1198{ 1194{
1195 struct kvm_run *kvm_run = svm->vcpu.run;
1196
1199 if (!(svm->vcpu.guest_debug & 1197 if (!(svm->vcpu.guest_debug &
1200 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1198 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1201 !svm->vcpu.arch.singlestep) { 1199 !svm->nmi_singlestep) {
1202 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1200 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1203 return 1; 1201 return 1;
1204 } 1202 }
1205 1203
1206 if (svm->vcpu.arch.singlestep) { 1204 if (svm->nmi_singlestep) {
1207 svm->vcpu.arch.singlestep = false; 1205 svm->nmi_singlestep = false;
1208 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1206 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1209 svm->vmcb->save.rflags &= 1207 svm->vmcb->save.rflags &=
1210 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1208 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,25 +1221,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1223 return 1; 1221 return 1;
1224} 1222}
1225 1223
1226static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1224static int bp_interception(struct vcpu_svm *svm)
1227{ 1225{
1226 struct kvm_run *kvm_run = svm->vcpu.run;
1227
1228 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1228 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1230 kvm_run->debug.arch.exception = BP_VECTOR; 1230 kvm_run->debug.arch.exception = BP_VECTOR;
1231 return 0; 1231 return 0;
1232} 1232}
1233 1233
1234static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1234static int ud_interception(struct vcpu_svm *svm)
1235{ 1235{
1236 int er; 1236 int er;
1237 1237
1238 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1238 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1239 if (er != EMULATE_DONE) 1239 if (er != EMULATE_DONE)
1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1241 return 1; 1241 return 1;
1242} 1242}
1243 1243
1244static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1244static int nm_interception(struct vcpu_svm *svm)
1245{ 1245{
1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) 1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1251,7 +1251,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1251 return 1; 1251 return 1;
1252} 1252}
1253 1253
1254static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1254static int mc_interception(struct vcpu_svm *svm)
1255{ 1255{
1256 /* 1256 /*
1257 * On an #MC intercept the MCE handler is not called automatically in 1257 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1264,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1264 return 1; 1264 return 1;
1265} 1265}
1266 1266
1267static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1267static int shutdown_interception(struct vcpu_svm *svm)
1268{ 1268{
1269 struct kvm_run *kvm_run = svm->vcpu.run;
1270
1269 /* 1271 /*
1270 * VMCB is undefined after a SHUTDOWN intercept 1272 * VMCB is undefined after a SHUTDOWN intercept
1271 * so reinitialize it. 1273 * so reinitialize it.
@@ -1277,7 +1279,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1277 return 0; 1279 return 0;
1278} 1280}
1279 1281
1280static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1282static int io_interception(struct vcpu_svm *svm)
1281{ 1283{
1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1284 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1283 int size, in, string; 1285 int size, in, string;
@@ -1291,7 +1293,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1291 1293
1292 if (string) { 1294 if (string) {
1293 if (emulate_instruction(&svm->vcpu, 1295 if (emulate_instruction(&svm->vcpu,
1294 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1296 0, 0, 0) == EMULATE_DO_MMIO)
1295 return 0; 1297 return 0;
1296 return 1; 1298 return 1;
1297 } 1299 }
@@ -1301,33 +1303,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1303 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1302 1304
1303 skip_emulated_instruction(&svm->vcpu); 1305 skip_emulated_instruction(&svm->vcpu);
1304 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1306 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1305} 1307}
1306 1308
1307static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1309static int nmi_interception(struct vcpu_svm *svm)
1308{ 1310{
1309 return 1; 1311 return 1;
1310} 1312}
1311 1313
1312static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1314static int intr_interception(struct vcpu_svm *svm)
1313{ 1315{
1314 ++svm->vcpu.stat.irq_exits; 1316 ++svm->vcpu.stat.irq_exits;
1315 return 1; 1317 return 1;
1316} 1318}
1317 1319
1318static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1320static int nop_on_interception(struct vcpu_svm *svm)
1319{ 1321{
1320 return 1; 1322 return 1;
1321} 1323}
1322 1324
1323static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1325static int halt_interception(struct vcpu_svm *svm)
1324{ 1326{
1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1327 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1326 skip_emulated_instruction(&svm->vcpu); 1328 skip_emulated_instruction(&svm->vcpu);
1327 return kvm_emulate_halt(&svm->vcpu); 1329 return kvm_emulate_halt(&svm->vcpu);
1328} 1330}
1329 1331
1330static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1332static int vmmcall_interception(struct vcpu_svm *svm)
1331{ 1333{
1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1334 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1333 skip_emulated_instruction(&svm->vcpu); 1335 skip_emulated_instruction(&svm->vcpu);
@@ -1378,8 +1380,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1378 1380
1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1381 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1380 1382
1381 if (nested_svm_exit_handled(svm)) { 1383 if (svm->nested.intercept & 1ULL) {
1382 nsvm_printk("VMexit -> INTR\n"); 1384 /*
1385 * The #vmexit can't be emulated here directly because this
1386 * code path runs with irqs and preemtion disabled. A
1387 * #vmexit emulation might sleep. Only signal request for
1388 * the #vmexit here.
1389 */
1390 svm->nested.exit_required = true;
1391 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1383 return 1; 1392 return 1;
1384 } 1393 }
1385 1394
@@ -1390,10 +1399,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1390{ 1399{
1391 struct page *page; 1400 struct page *page;
1392 1401
1393 down_read(&current->mm->mmap_sem);
1394 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1402 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1395 up_read(&current->mm->mmap_sem);
1396
1397 if (is_error_page(page)) 1403 if (is_error_page(page))
1398 goto error; 1404 goto error;
1399 1405
@@ -1532,14 +1538,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1532 } 1538 }
1533 default: { 1539 default: {
1534 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1540 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1535 nsvm_printk("exit code: 0x%x\n", exit_code);
1536 if (svm->nested.intercept & exit_bits) 1541 if (svm->nested.intercept & exit_bits)
1537 vmexit = NESTED_EXIT_DONE; 1542 vmexit = NESTED_EXIT_DONE;
1538 } 1543 }
1539 } 1544 }
1540 1545
1541 if (vmexit == NESTED_EXIT_DONE) { 1546 if (vmexit == NESTED_EXIT_DONE) {
1542 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1543 nested_svm_vmexit(svm); 1547 nested_svm_vmexit(svm);
1544 } 1548 }
1545 1549
@@ -1584,6 +1588,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1584 struct vmcb *hsave = svm->nested.hsave; 1588 struct vmcb *hsave = svm->nested.hsave;
1585 struct vmcb *vmcb = svm->vmcb; 1589 struct vmcb *vmcb = svm->vmcb;
1586 1590
1591 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1592 vmcb->control.exit_info_1,
1593 vmcb->control.exit_info_2,
1594 vmcb->control.exit_int_info,
1595 vmcb->control.exit_int_info_err);
1596
1587 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1597 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1588 if (!nested_vmcb) 1598 if (!nested_vmcb)
1589 return 1; 1599 return 1;
@@ -1617,6 +1627,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1627 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1618 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1628 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1619 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1629 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1630
1631 /*
1632 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1633 * to make sure that we do not lose injected events. So check event_inj
1634 * here and copy it to exit_int_info if it is valid.
1635 * Exit_int_info and event_inj can't be both valid because the case
1636 * below only happens on a VMRUN instruction intercept which has
1637 * no valid exit_int_info set.
1638 */
1639 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1640 struct vmcb_control_area *nc = &nested_vmcb->control;
1641
1642 nc->exit_int_info = vmcb->control.event_inj;
1643 nc->exit_int_info_err = vmcb->control.event_inj_err;
1644 }
1645
1620 nested_vmcb->control.tlb_ctl = 0; 1646 nested_vmcb->control.tlb_ctl = 0;
1621 nested_vmcb->control.event_inj = 0; 1647 nested_vmcb->control.event_inj = 0;
1622 nested_vmcb->control.event_inj_err = 0; 1648 nested_vmcb->control.event_inj_err = 0;
@@ -1628,10 +1654,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1628 /* Restore the original control entries */ 1654 /* Restore the original control entries */
1629 copy_vmcb_control_area(vmcb, hsave); 1655 copy_vmcb_control_area(vmcb, hsave);
1630 1656
1631 /* Kill any pending exceptions */
1632 if (svm->vcpu.arch.exception.pending == true)
1633 nsvm_printk("WARNING: Pending Exception\n");
1634
1635 kvm_clear_exception_queue(&svm->vcpu); 1657 kvm_clear_exception_queue(&svm->vcpu);
1636 kvm_clear_interrupt_queue(&svm->vcpu); 1658 kvm_clear_interrupt_queue(&svm->vcpu);
1637 1659
@@ -1702,6 +1724,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1702 /* nested_vmcb is our indicator if nested SVM is activated */ 1724 /* nested_vmcb is our indicator if nested SVM is activated */
1703 svm->nested.vmcb = svm->vmcb->save.rax; 1725 svm->nested.vmcb = svm->vmcb->save.rax;
1704 1726
1727 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1728 nested_vmcb->save.rip,
1729 nested_vmcb->control.int_ctl,
1730 nested_vmcb->control.event_inj,
1731 nested_vmcb->control.nested_ctl);
1732
1705 /* Clear internal status */ 1733 /* Clear internal status */
1706 kvm_clear_exception_queue(&svm->vcpu); 1734 kvm_clear_exception_queue(&svm->vcpu);
1707 kvm_clear_interrupt_queue(&svm->vcpu); 1735 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1789,28 +1817,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1789 svm->nested.intercept = nested_vmcb->control.intercept; 1817 svm->nested.intercept = nested_vmcb->control.intercept;
1790 1818
1791 force_new_asid(&svm->vcpu); 1819 force_new_asid(&svm->vcpu);
1792 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1793 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1794 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1820 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1795 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1796 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1797 nested_vmcb->control.int_ctl);
1798 }
1799 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1821 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1800 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1822 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1801 else 1823 else
1802 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1824 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1803 1825
1804 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1805 nested_vmcb->control.exit_int_info,
1806 nested_vmcb->control.int_state);
1807
1808 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1826 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1809 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1827 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1810 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1828 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1811 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1812 nsvm_printk("Injecting Event: 0x%x\n",
1813 nested_vmcb->control.event_inj);
1814 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1829 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1815 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1830 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1816 1831
@@ -1837,7 +1852,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1837 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1852 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1838} 1853}
1839 1854
1840static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1855static int vmload_interception(struct vcpu_svm *svm)
1841{ 1856{
1842 struct vmcb *nested_vmcb; 1857 struct vmcb *nested_vmcb;
1843 1858
@@ -1857,7 +1872,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1857 return 1; 1872 return 1;
1858} 1873}
1859 1874
1860static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1875static int vmsave_interception(struct vcpu_svm *svm)
1861{ 1876{
1862 struct vmcb *nested_vmcb; 1877 struct vmcb *nested_vmcb;
1863 1878
@@ -1877,10 +1892,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1877 return 1; 1892 return 1;
1878} 1893}
1879 1894
1880static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1895static int vmrun_interception(struct vcpu_svm *svm)
1881{ 1896{
1882 nsvm_printk("VMrun\n");
1883
1884 if (nested_svm_check_permissions(svm)) 1897 if (nested_svm_check_permissions(svm))
1885 return 1; 1898 return 1;
1886 1899
@@ -1907,7 +1920,7 @@ failed:
1907 return 1; 1920 return 1;
1908} 1921}
1909 1922
1910static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1923static int stgi_interception(struct vcpu_svm *svm)
1911{ 1924{
1912 if (nested_svm_check_permissions(svm)) 1925 if (nested_svm_check_permissions(svm))
1913 return 1; 1926 return 1;
@@ -1920,7 +1933,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1920 return 1; 1933 return 1;
1921} 1934}
1922 1935
1923static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int clgi_interception(struct vcpu_svm *svm)
1924{ 1937{
1925 if (nested_svm_check_permissions(svm)) 1938 if (nested_svm_check_permissions(svm))
1926 return 1; 1939 return 1;
@@ -1937,10 +1950,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1937 return 1; 1950 return 1;
1938} 1951}
1939 1952
1940static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1953static int invlpga_interception(struct vcpu_svm *svm)
1941{ 1954{
1942 struct kvm_vcpu *vcpu = &svm->vcpu; 1955 struct kvm_vcpu *vcpu = &svm->vcpu;
1943 nsvm_printk("INVLPGA\n"); 1956
1957 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1958 vcpu->arch.regs[VCPU_REGS_RAX]);
1944 1959
1945 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1960 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1946 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1961 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1965,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1950 return 1; 1965 return 1;
1951} 1966}
1952 1967
1953static int invalid_op_interception(struct vcpu_svm *svm, 1968static int skinit_interception(struct vcpu_svm *svm)
1954 struct kvm_run *kvm_run)
1955{ 1969{
1970 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1971
1956 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1972 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1957 return 1; 1973 return 1;
1958} 1974}
1959 1975
1960static int task_switch_interception(struct vcpu_svm *svm, 1976static int invalid_op_interception(struct vcpu_svm *svm)
1961 struct kvm_run *kvm_run) 1977{
1978 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1979 return 1;
1980}
1981
1982static int task_switch_interception(struct vcpu_svm *svm)
1962{ 1983{
1963 u16 tss_selector; 1984 u16 tss_selector;
1964 int reason; 1985 int reason;
@@ -2008,14 +2029,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
2008 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2029 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2009} 2030}
2010 2031
2011static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2032static int cpuid_interception(struct vcpu_svm *svm)
2012{ 2033{
2013 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2034 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2014 kvm_emulate_cpuid(&svm->vcpu); 2035 kvm_emulate_cpuid(&svm->vcpu);
2015 return 1; 2036 return 1;
2016} 2037}
2017 2038
2018static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2039static int iret_interception(struct vcpu_svm *svm)
2019{ 2040{
2020 ++svm->vcpu.stat.nmi_window_exits; 2041 ++svm->vcpu.stat.nmi_window_exits;
2021 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2042 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2023,26 +2044,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2023 return 1; 2044 return 1;
2024} 2045}
2025 2046
2026static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2047static int invlpg_interception(struct vcpu_svm *svm)
2027{ 2048{
2028 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2049 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2029 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2050 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2030 return 1; 2051 return 1;
2031} 2052}
2032 2053
2033static int emulate_on_interception(struct vcpu_svm *svm, 2054static int emulate_on_interception(struct vcpu_svm *svm)
2034 struct kvm_run *kvm_run)
2035{ 2055{
2036 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2056 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2037 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2057 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2038 return 1; 2058 return 1;
2039} 2059}
2040 2060
2041static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2061static int cr8_write_interception(struct vcpu_svm *svm)
2042{ 2062{
2063 struct kvm_run *kvm_run = svm->vcpu.run;
2064
2043 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2065 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2044 /* instruction emulation calls kvm_set_cr8() */ 2066 /* instruction emulation calls kvm_set_cr8() */
2045 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2067 emulate_instruction(&svm->vcpu, 0, 0, 0);
2046 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2068 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2047 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2069 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2048 return 1; 2070 return 1;
@@ -2128,7 +2150,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2128 return 0; 2150 return 0;
2129} 2151}
2130 2152
2131static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2153static int rdmsr_interception(struct vcpu_svm *svm)
2132{ 2154{
2133 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2155 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2134 u64 data; 2156 u64 data;
@@ -2221,7 +2243,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2221 return 0; 2243 return 0;
2222} 2244}
2223 2245
2224static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2246static int wrmsr_interception(struct vcpu_svm *svm)
2225{ 2247{
2226 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2248 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2227 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2249 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2237,17 +2259,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2237 return 1; 2259 return 1;
2238} 2260}
2239 2261
2240static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2262static int msr_interception(struct vcpu_svm *svm)
2241{ 2263{
2242 if (svm->vmcb->control.exit_info_1) 2264 if (svm->vmcb->control.exit_info_1)
2243 return wrmsr_interception(svm, kvm_run); 2265 return wrmsr_interception(svm);
2244 else 2266 else
2245 return rdmsr_interception(svm, kvm_run); 2267 return rdmsr_interception(svm);
2246} 2268}
2247 2269
2248static int interrupt_window_interception(struct vcpu_svm *svm, 2270static int interrupt_window_interception(struct vcpu_svm *svm)
2249 struct kvm_run *kvm_run)
2250{ 2271{
2272 struct kvm_run *kvm_run = svm->vcpu.run;
2273
2251 svm_clear_vintr(svm); 2274 svm_clear_vintr(svm);
2252 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2275 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2253 /* 2276 /*
@@ -2265,8 +2288,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2265 return 1; 2288 return 1;
2266} 2289}
2267 2290
2268static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2291static int pause_interception(struct vcpu_svm *svm)
2269 struct kvm_run *kvm_run) = { 2292{
2293 kvm_vcpu_on_spin(&(svm->vcpu));
2294 return 1;
2295}
2296
2297static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2270 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2298 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2271 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2299 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2272 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2300 [SVM_EXIT_READ_CR4] = emulate_on_interception,
@@ -2301,6 +2329,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2301 [SVM_EXIT_CPUID] = cpuid_interception, 2329 [SVM_EXIT_CPUID] = cpuid_interception,
2302 [SVM_EXIT_IRET] = iret_interception, 2330 [SVM_EXIT_IRET] = iret_interception,
2303 [SVM_EXIT_INVD] = emulate_on_interception, 2331 [SVM_EXIT_INVD] = emulate_on_interception,
2332 [SVM_EXIT_PAUSE] = pause_interception,
2304 [SVM_EXIT_HLT] = halt_interception, 2333 [SVM_EXIT_HLT] = halt_interception,
2305 [SVM_EXIT_INVLPG] = invlpg_interception, 2334 [SVM_EXIT_INVLPG] = invlpg_interception,
2306 [SVM_EXIT_INVLPGA] = invlpga_interception, 2335 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2314,26 +2343,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2314 [SVM_EXIT_VMSAVE] = vmsave_interception, 2343 [SVM_EXIT_VMSAVE] = vmsave_interception,
2315 [SVM_EXIT_STGI] = stgi_interception, 2344 [SVM_EXIT_STGI] = stgi_interception,
2316 [SVM_EXIT_CLGI] = clgi_interception, 2345 [SVM_EXIT_CLGI] = clgi_interception,
2317 [SVM_EXIT_SKINIT] = invalid_op_interception, 2346 [SVM_EXIT_SKINIT] = skinit_interception,
2318 [SVM_EXIT_WBINVD] = emulate_on_interception, 2347 [SVM_EXIT_WBINVD] = emulate_on_interception,
2319 [SVM_EXIT_MONITOR] = invalid_op_interception, 2348 [SVM_EXIT_MONITOR] = invalid_op_interception,
2320 [SVM_EXIT_MWAIT] = invalid_op_interception, 2349 [SVM_EXIT_MWAIT] = invalid_op_interception,
2321 [SVM_EXIT_NPF] = pf_interception, 2350 [SVM_EXIT_NPF] = pf_interception,
2322}; 2351};
2323 2352
2324static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2353static int handle_exit(struct kvm_vcpu *vcpu)
2325{ 2354{
2326 struct vcpu_svm *svm = to_svm(vcpu); 2355 struct vcpu_svm *svm = to_svm(vcpu);
2356 struct kvm_run *kvm_run = vcpu->run;
2327 u32 exit_code = svm->vmcb->control.exit_code; 2357 u32 exit_code = svm->vmcb->control.exit_code;
2328 2358
2329 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2359 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2330 2360
2361 if (unlikely(svm->nested.exit_required)) {
2362 nested_svm_vmexit(svm);
2363 svm->nested.exit_required = false;
2364
2365 return 1;
2366 }
2367
2331 if (is_nested(svm)) { 2368 if (is_nested(svm)) {
2332 int vmexit; 2369 int vmexit;
2333 2370
2334 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2371 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2335 exit_code, svm->vmcb->control.exit_info_1, 2372 svm->vmcb->control.exit_info_1,
2336 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2373 svm->vmcb->control.exit_info_2,
2374 svm->vmcb->control.exit_int_info,
2375 svm->vmcb->control.exit_int_info_err);
2337 2376
2338 vmexit = nested_svm_exit_special(svm); 2377 vmexit = nested_svm_exit_special(svm);
2339 2378
@@ -2383,7 +2422,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2383 return 0; 2422 return 0;
2384 } 2423 }
2385 2424
2386 return svm_exit_handlers[exit_code](svm, kvm_run); 2425 return svm_exit_handlers[exit_code](svm);
2387} 2426}
2388 2427
2389static void reload_tss(struct kvm_vcpu *vcpu) 2428static void reload_tss(struct kvm_vcpu *vcpu)
@@ -2460,20 +2499,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2460 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2499 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2461} 2500}
2462 2501
2502static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2503{
2504 struct vcpu_svm *svm = to_svm(vcpu);
2505
2506 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2507}
2508
2509static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2510{
2511 struct vcpu_svm *svm = to_svm(vcpu);
2512
2513 if (masked) {
2514 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2515 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2516 } else {
2517 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2518 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
2519 }
2520}
2521
2463static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2522static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2464{ 2523{
2465 struct vcpu_svm *svm = to_svm(vcpu); 2524 struct vcpu_svm *svm = to_svm(vcpu);
2466 struct vmcb *vmcb = svm->vmcb; 2525 struct vmcb *vmcb = svm->vmcb;
2467 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2526 int ret;
2468 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2527
2469 gif_set(svm) && 2528 if (!gif_set(svm) ||
2470 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2529 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2530 return 0;
2531
2532 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2533
2534 if (is_nested(svm))
2535 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2536
2537 return ret;
2471} 2538}
2472 2539
2473static void enable_irq_window(struct kvm_vcpu *vcpu) 2540static void enable_irq_window(struct kvm_vcpu *vcpu)
2474{ 2541{
2475 struct vcpu_svm *svm = to_svm(vcpu); 2542 struct vcpu_svm *svm = to_svm(vcpu);
2476 nsvm_printk("Trying to open IRQ window\n");
2477 2543
2478 nested_svm_intr(svm); 2544 nested_svm_intr(svm);
2479 2545
@@ -2498,7 +2564,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2498 /* Something prevents NMI from been injected. Single step over 2564 /* Something prevents NMI from been injected. Single step over
2499 possible problem (IRET or exception injection or interrupt 2565 possible problem (IRET or exception injection or interrupt
2500 shadow) */ 2566 shadow) */
2501 vcpu->arch.singlestep = true; 2567 svm->nmi_singlestep = true;
2502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2568 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2503 update_db_intercept(vcpu); 2569 update_db_intercept(vcpu);
2504} 2570}
@@ -2588,13 +2654,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2588#define R "e" 2654#define R "e"
2589#endif 2655#endif
2590 2656
2591static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2657static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2592{ 2658{
2593 struct vcpu_svm *svm = to_svm(vcpu); 2659 struct vcpu_svm *svm = to_svm(vcpu);
2594 u16 fs_selector; 2660 u16 fs_selector;
2595 u16 gs_selector; 2661 u16 gs_selector;
2596 u16 ldt_selector; 2662 u16 ldt_selector;
2597 2663
2664 /*
2665 * A vmexit emulation is required before the vcpu can be executed
2666 * again.
2667 */
2668 if (unlikely(svm->nested.exit_required))
2669 return;
2670
2598 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2671 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2599 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2672 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2600 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2673 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2893,6 +2966,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2893 .queue_exception = svm_queue_exception, 2966 .queue_exception = svm_queue_exception,
2894 .interrupt_allowed = svm_interrupt_allowed, 2967 .interrupt_allowed = svm_interrupt_allowed,
2895 .nmi_allowed = svm_nmi_allowed, 2968 .nmi_allowed = svm_nmi_allowed,
2969 .get_nmi_mask = svm_get_nmi_mask,
2970 .set_nmi_mask = svm_set_nmi_mask,
2896 .enable_nmi_window = enable_nmi_window, 2971 .enable_nmi_window = enable_nmi_window,
2897 .enable_irq_window = enable_irq_window, 2972 .enable_irq_window = enable_irq_window,
2898 .update_cr8_intercept = update_cr8_intercept, 2973 .update_cr8_intercept = update_cr8_intercept,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..816e0449db0b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -349,6 +349,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 349 __entry->coalesced ? " (coalesced)" : "")
350); 350);
351 351
352/*
353 * Tracepoint for nested VMRUN
354 */
355TRACE_EVENT(kvm_nested_vmrun,
356 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
357 __u32 event_inj, bool npt),
358 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
359
360 TP_STRUCT__entry(
361 __field( __u64, rip )
362 __field( __u64, vmcb )
363 __field( __u64, nested_rip )
364 __field( __u32, int_ctl )
365 __field( __u32, event_inj )
366 __field( bool, npt )
367 ),
368
369 TP_fast_assign(
370 __entry->rip = rip;
371 __entry->vmcb = vmcb;
372 __entry->nested_rip = nested_rip;
373 __entry->int_ctl = int_ctl;
374 __entry->event_inj = event_inj;
375 __entry->npt = npt;
376 ),
377
378 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
379 "event_inj: 0x%08x npt: %s\n",
380 __entry->rip, __entry->vmcb, __entry->nested_rip,
381 __entry->int_ctl, __entry->event_inj,
382 __entry->npt ? "on" : "off")
383);
384
385/*
386 * Tracepoint for #VMEXIT while nested
387 */
388TRACE_EVENT(kvm_nested_vmexit,
389 TP_PROTO(__u64 rip, __u32 exit_code,
390 __u64 exit_info1, __u64 exit_info2,
391 __u32 exit_int_info, __u32 exit_int_info_err),
392 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
393 exit_int_info, exit_int_info_err),
394
395 TP_STRUCT__entry(
396 __field( __u64, rip )
397 __field( __u32, exit_code )
398 __field( __u64, exit_info1 )
399 __field( __u64, exit_info2 )
400 __field( __u32, exit_int_info )
401 __field( __u32, exit_int_info_err )
402 ),
403
404 TP_fast_assign(
405 __entry->rip = rip;
406 __entry->exit_code = exit_code;
407 __entry->exit_info1 = exit_info1;
408 __entry->exit_info2 = exit_info2;
409 __entry->exit_int_info = exit_int_info;
410 __entry->exit_int_info_err = exit_int_info_err;
411 ),
412 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
413 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
414 __entry->rip,
415 ftrace_print_symbols_seq(p, __entry->exit_code,
416 kvm_x86_ops->exit_reasons_str),
417 __entry->exit_info1, __entry->exit_info2,
418 __entry->exit_int_info, __entry->exit_int_info_err)
419);
420
421/*
422 * Tracepoint for #VMEXIT reinjected to the guest
423 */
424TRACE_EVENT(kvm_nested_vmexit_inject,
425 TP_PROTO(__u32 exit_code,
426 __u64 exit_info1, __u64 exit_info2,
427 __u32 exit_int_info, __u32 exit_int_info_err),
428 TP_ARGS(exit_code, exit_info1, exit_info2,
429 exit_int_info, exit_int_info_err),
430
431 TP_STRUCT__entry(
432 __field( __u32, exit_code )
433 __field( __u64, exit_info1 )
434 __field( __u64, exit_info2 )
435 __field( __u32, exit_int_info )
436 __field( __u32, exit_int_info_err )
437 ),
438
439 TP_fast_assign(
440 __entry->exit_code = exit_code;
441 __entry->exit_info1 = exit_info1;
442 __entry->exit_info2 = exit_info2;
443 __entry->exit_int_info = exit_int_info;
444 __entry->exit_int_info_err = exit_int_info_err;
445 ),
446
447 TP_printk("reason: %s ext_inf1: 0x%016llx "
448 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
449 ftrace_print_symbols_seq(p, __entry->exit_code,
450 kvm_x86_ops->exit_reasons_str),
451 __entry->exit_info1, __entry->exit_info2,
452 __entry->exit_int_info, __entry->exit_int_info_err)
453);
454
455/*
456 * Tracepoint for nested #vmexit because of interrupt pending
457 */
458TRACE_EVENT(kvm_nested_intr_vmexit,
459 TP_PROTO(__u64 rip),
460 TP_ARGS(rip),
461
462 TP_STRUCT__entry(
463 __field( __u64, rip )
464 ),
465
466 TP_fast_assign(
467 __entry->rip = rip
468 ),
469
470 TP_printk("rip: 0x%016llx\n", __entry->rip)
471);
472
473/*
474 * Tracepoint for nested #vmexit because of interrupt pending
475 */
476TRACE_EVENT(kvm_invlpga,
477 TP_PROTO(__u64 rip, int asid, u64 address),
478 TP_ARGS(rip, asid, address),
479
480 TP_STRUCT__entry(
481 __field( __u64, rip )
482 __field( int, asid )
483 __field( __u64, address )
484 ),
485
486 TP_fast_assign(
487 __entry->rip = rip;
488 __entry->asid = asid;
489 __entry->address = address;
490 ),
491
492 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
493 __entry->rip, __entry->asid, __entry->address)
494);
495
496/*
497 * Tracepoint for nested #vmexit because of interrupt pending
498 */
499TRACE_EVENT(kvm_skinit,
500 TP_PROTO(__u64 rip, __u32 slb),
501 TP_ARGS(rip, slb),
502
503 TP_STRUCT__entry(
504 __field( __u64, rip )
505 __field( __u32, slb )
506 ),
507
508 TP_fast_assign(
509 __entry->rip = rip;
510 __entry->slb = slb;
511 ),
512
513 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
514 __entry->rip, __entry->slb)
515);
516
352#endif /* _TRACE_KVM_H */ 517#endif /* _TRACE_KVM_H */
353 518
354/* This part must be outside protection */ 519/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..d4918d6fc924 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -61,12 +61,37 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 63
64/*
65 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
66 * ple_gap: upper bound on the amount of time between two successive
67 * executions of PAUSE in a loop. Also indicate if ple enabled.
68 * According to test, this time is usually small than 41 cycles.
69 * ple_window: upper bound on the amount of time a guest is allowed to execute
70 * in a PAUSE loop. Tests indicate that most spinlocks are held for
71 * less than 2^12 cycles
72 * Time is measured based on a counter that runs at the same rate as the TSC,
73 * refer SDM volume 3b section 21.6.13 & 22.1.3.
74 */
75#define KVM_VMX_DEFAULT_PLE_GAP 41
76#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
77static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
78module_param(ple_gap, int, S_IRUGO);
79
80static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
81module_param(ple_window, int, S_IRUGO);
82
64struct vmcs { 83struct vmcs {
65 u32 revision_id; 84 u32 revision_id;
66 u32 abort; 85 u32 abort;
67 char data[0]; 86 char data[0];
68}; 87};
69 88
89struct shared_msr_entry {
90 unsigned index;
91 u64 data;
92 u64 mask;
93};
94
70struct vcpu_vmx { 95struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 96 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 97 struct list_head local_vcpus_link;
@@ -74,13 +99,12 @@ struct vcpu_vmx {
74 int launched; 99 int launched;
75 u8 fail; 100 u8 fail;
76 u32 idt_vectoring_info; 101 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 102 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 103 int nmsrs;
80 int save_nmsrs; 104 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 106 u64 msr_host_kernel_gs_base;
107 u64 msr_guest_kernel_gs_base;
84#endif 108#endif
85 struct vmcs *vmcs; 109 struct vmcs *vmcs;
86 struct { 110 struct {
@@ -88,7 +112,6 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 112 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 113 int gs_ldt_reload_needed;
90 int fs_reload_needed; 114 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 115 } host_state;
93 struct { 116 struct {
94 int vm86_active; 117 int vm86_active;
@@ -107,7 +130,6 @@ struct vcpu_vmx {
107 } rmode; 130 } rmode;
108 int vpid; 131 int vpid;
109 bool emulation_required; 132 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 133
112 /* Support for vnmi-less CPUs */ 134 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 135 int soft_vnmi_blocked;
@@ -176,6 +198,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 198 VMX_SEGMENT_FIELD(LDTR),
177}; 199};
178 200
201static u64 host_efer;
202
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 203static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 204
181/* 205/*
@@ -184,28 +208,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 208 */
185static const u32 vmx_msr_index[] = { 209static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 210#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 211 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 212#endif
189 MSR_EFER, MSR_K6_STAR, 213 MSR_EFER, MSR_K6_STAR,
190}; 214};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 215#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 216
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 217static inline int is_page_fault(u32 intr_info)
210{ 218{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 219 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -320,6 +328,12 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 328 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 329}
322 330
331static inline int cpu_has_vmx_ple(void)
332{
333 return vmcs_config.cpu_based_2nd_exec_ctrl &
334 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
335}
336
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 337static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 338{
325 return flexpriority_enabled && 339 return flexpriority_enabled &&
@@ -348,7 +362,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 362 int i;
349 363
350 for (i = 0; i < vmx->nmsrs; ++i) 364 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 365 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 366 return i;
353 return -1; 367 return -1;
354} 368}
@@ -379,7 +393,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 393 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 394}
381 395
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 396static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 397{
384 int i; 398 int i;
385 399
@@ -570,17 +584,12 @@ static void reload_tss(void)
570 load_TR_desc(); 584 load_TR_desc();
571} 585}
572 586
573static void load_transition_efer(struct vcpu_vmx *vmx) 587static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 588{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 589 u64 guest_efer;
578 u64 ignore_bits; 590 u64 ignore_bits;
579 591
580 if (efer_offset < 0) 592 guest_efer = vmx->vcpu.arch.shadow_efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 593
585 /* 594 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 595 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -593,27 +602,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 602 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 603 ignore_bits &= ~(u64)EFER_SCE;
595#endif 604#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 605 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 606 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 607 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 608 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 609 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 610}
613 611
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 612static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 613{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 614 struct vcpu_vmx *vmx = to_vmx(vcpu);
615 int i;
617 616
618 if (vmx->host_state.loaded) 617 if (vmx->host_state.loaded)
619 return; 618 return;
@@ -650,13 +649,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 649#endif
651 650
652#ifdef CONFIG_X86_64 651#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 652 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 653 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 654 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 655 }
657#endif 656#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 657 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 658 kvm_set_shared_msr(vmx->guest_msrs[i].index,
659 vmx->guest_msrs[i].data,
660 vmx->guest_msrs[i].mask);
660} 661}
661 662
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 663static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +685,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 685 local_irq_restore(flags);
685 } 686 }
686 reload_tss(); 687 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 688#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 689 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 690 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
691 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
692 }
693#endif
690} 694}
691 695
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 696static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -877,19 +881,14 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
877/* 881/*
878 * Swap MSR entry in host/guest MSR entry array. 882 * Swap MSR entry in host/guest MSR entry array.
879 */ 883 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 884static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 885{
883 struct kvm_msr_entry tmp; 886 struct shared_msr_entry tmp;
884 887
885 tmp = vmx->guest_msrs[to]; 888 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 889 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 890 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 891}
892#endif
893 892
894/* 893/*
895 * Set up the vmcs to automatically save and restore system 894 * Set up the vmcs to automatically save and restore system
@@ -898,15 +897,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 897 */
899static void setup_msrs(struct vcpu_vmx *vmx) 898static void setup_msrs(struct vcpu_vmx *vmx)
900{ 899{
901 int save_nmsrs; 900 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 901 unsigned long *msr_bitmap;
903 902
904 vmx_load_host_state(vmx); 903 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 904 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 905#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 906 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 907 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 908 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 909 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,9 +913,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 913 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 914 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 915 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
920 if (index >= 0)
921 move_msr_up(vmx, index, save_nmsrs++);
922 /* 916 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 917 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 918 * if efer.sce is enabled.
@@ -928,13 +922,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
928 move_msr_up(vmx, index, save_nmsrs++); 922 move_msr_up(vmx, index, save_nmsrs++);
929 } 923 }
930#endif 924#endif
931 vmx->save_nmsrs = save_nmsrs; 925 index = __find_msr_index(vmx, MSR_EFER);
926 if (index >= 0 && update_transition_efer(vmx, index))
927 move_msr_up(vmx, index, save_nmsrs++);
932 928
933#ifdef CONFIG_X86_64 929 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 930
939 if (cpu_has_vmx_msr_bitmap()) { 931 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 932 if (is_long_mode(&vmx->vcpu))
@@ -976,7 +968,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 968static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 969{
978 u64 data; 970 u64 data;
979 struct kvm_msr_entry *msr; 971 struct shared_msr_entry *msr;
980 972
981 if (!pdata) { 973 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 974 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +983,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 983 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 984 data = vmcs_readl(GUEST_GS_BASE);
993 break; 985 break;
986 case MSR_KERNEL_GS_BASE:
987 vmx_load_host_state(to_vmx(vcpu));
988 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
989 break;
990#endif
994 case MSR_EFER: 991 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 992 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 993 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 994 data = guest_read_tsc();
999 break; 995 break;
@@ -1007,6 +1003,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1003 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1004 break;
1009 default: 1005 default:
1006 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1007 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1008 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1009 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1025,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1025static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1026{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1027 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1028 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1029 u64 host_tsc;
1033 int ret = 0; 1030 int ret = 0;
1034 1031
@@ -1044,6 +1041,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1041 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1042 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1043 break;
1044 case MSR_KERNEL_GS_BASE:
1045 vmx_load_host_state(vmx);
1046 vmx->msr_guest_kernel_gs_base = data;
1047 break;
1047#endif 1048#endif
1048 case MSR_IA32_SYSENTER_CS: 1049 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1050 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1097,30 +1098,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1098 }
1098} 1099}
1099 1100
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1101static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1102{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1104 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1105 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1106 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1107
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1108 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1109}
1125 1110
1126static __init int cpu_has_kvm_support(void) 1111static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1124,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1124 /* locked but not enabled */
1140} 1125}
1141 1126
1142static void hardware_enable(void *garbage) 1127static int hardware_enable(void *garbage)
1143{ 1128{
1144 int cpu = raw_smp_processor_id(); 1129 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1130 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1131 u64 old;
1147 1132
1133 if (read_cr4() & X86_CR4_VMXE)
1134 return -EBUSY;
1135
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1136 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1137 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1138 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1147,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1147 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1148 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1149 : "memory", "cc");
1150
1151 ept_sync_global();
1152
1153 return 0;
1162} 1154}
1163 1155
1164static void vmclear_local_vcpus(void) 1156static void vmclear_local_vcpus(void)
@@ -1250,7 +1242,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1242 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1243 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1244 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1245 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1246 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1254 if (adjust_vmx_controls(min2, opt2, 1247 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1248 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1249 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1337,17 @@ static void free_kvm_area(void)
1344{ 1337{
1345 int cpu; 1338 int cpu;
1346 1339
1347 for_each_online_cpu(cpu) 1340 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1341 free_vmcs(per_cpu(vmxarea, cpu));
1342 per_cpu(vmxarea, cpu) = NULL;
1343 }
1349} 1344}
1350 1345
1351static __init int alloc_kvm_area(void) 1346static __init int alloc_kvm_area(void)
1352{ 1347{
1353 int cpu; 1348 int cpu;
1354 1349
1355 for_each_online_cpu(cpu) { 1350 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1351 struct vmcs *vmcs;
1357 1352
1358 vmcs = alloc_vmcs_cpu(cpu); 1353 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1389,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1389 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1390 kvm_disable_largepages();
1396 1391
1392 if (!cpu_has_vmx_ple())
1393 ple_gap = 0;
1394
1397 return alloc_kvm_area(); 1395 return alloc_kvm_area();
1398} 1396}
1399 1397
@@ -1536,8 +1534,16 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1534static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1535{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1536 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1537 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1538
1539 if (!msr)
1540 return;
1540 1541
1542 /*
1543 * Force kernel_gs_base reloading before EFER changes, as control
1544 * of this msr depends on is_long_mode().
1545 */
1546 vmx_load_host_state(to_vmx(vcpu));
1541 vcpu->arch.shadow_efer = efer; 1547 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1548 if (!msr)
1543 return; 1549 return;
@@ -1727,6 +1733,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1733 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1734 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1735 vcpu->kvm->arch.ept_identity_map_addr;
1736 ept_load_pdptrs(vcpu);
1730 } 1737 }
1731 1738
1732 vmx_flush_tlb(vcpu); 1739 vmx_flush_tlb(vcpu);
@@ -2302,13 +2309,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2309 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2310 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2311 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2312 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2313 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2314 enable_unrestricted_guest = 0;
2315 }
2307 if (!enable_unrestricted_guest) 2316 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2317 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2318 if (!ple_gap)
2319 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2320 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2321 }
2311 2322
2323 if (ple_gap) {
2324 vmcs_write32(PLE_GAP, ple_gap);
2325 vmcs_write32(PLE_WINDOW, ple_window);
2326 }
2327
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2328 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2329 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2330 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -2376,10 +2392,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2392 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2393 continue;
2378 data = data_low | ((u64)data_high << 32); 2394 data = data_low | ((u64)data_high << 32);
2379 vmx->host_msrs[j].index = index; 2395 vmx->guest_msrs[j].index = i;
2380 vmx->host_msrs[j].reserved = 0; 2396 vmx->guest_msrs[j].data = 0;
2381 vmx->host_msrs[j].data = data; 2397 vmx->guest_msrs[j].mask = -1ull;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2398 ++vmx->nmsrs;
2384 } 2399 }
2385 2400
@@ -2510,7 +2525,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2525 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2526 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2527
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2528 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2529 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2530 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2531 vmx_set_efer(&vmx->vcpu, 0);
@@ -2627,6 +2642,34 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2627 GUEST_INTR_STATE_NMI)); 2642 GUEST_INTR_STATE_NMI));
2628} 2643}
2629 2644
2645static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2646{
2647 if (!cpu_has_virtual_nmis())
2648 return to_vmx(vcpu)->soft_vnmi_blocked;
2649 else
2650 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2651 GUEST_INTR_STATE_NMI);
2652}
2653
2654static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2655{
2656 struct vcpu_vmx *vmx = to_vmx(vcpu);
2657
2658 if (!cpu_has_virtual_nmis()) {
2659 if (vmx->soft_vnmi_blocked != masked) {
2660 vmx->soft_vnmi_blocked = masked;
2661 vmx->vnmi_blocked_time = 0;
2662 }
2663 } else {
2664 if (masked)
2665 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2666 GUEST_INTR_STATE_NMI);
2667 else
2668 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2669 GUEST_INTR_STATE_NMI);
2670 }
2671}
2672
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2673static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2631{ 2674{
2632 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 2675 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2659,7 +2702,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2702 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2703 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2704 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2705 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2706 return 1;
2664 /* 2707 /*
2665 * Forward all other exceptions that are valid in real mode. 2708 * Forward all other exceptions that are valid in real mode.
@@ -2710,15 +2753,16 @@ static void kvm_machine_check(void)
2710#endif 2753#endif
2711} 2754}
2712 2755
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2756static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2757{
2715 /* already handled by vcpu_run */ 2758 /* already handled by vcpu_run */
2716 return 1; 2759 return 1;
2717} 2760}
2718 2761
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2762static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2763{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2764 struct vcpu_vmx *vmx = to_vmx(vcpu);
2765 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2766 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2767 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2768 u32 vect_info;
@@ -2728,12 +2772,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2772 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2773
2730 if (is_machine_check(intr_info)) 2774 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2775 return handle_machine_check(vcpu);
2732 2776
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2777 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2778 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2779 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2780 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2781 vcpu->run->internal.ndata = 2;
2782 vcpu->run->internal.data[0] = vect_info;
2783 vcpu->run->internal.data[1] = intr_info;
2784 return 0;
2785 }
2737 2786
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2787 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2788 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2793,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2793 }
2745 2794
2746 if (is_invalid_opcode(intr_info)) { 2795 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2796 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2797 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2798 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2799 return 1;
@@ -2803,20 +2852,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2852 return 0;
2804} 2853}
2805 2854
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2855static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2856{
2809 ++vcpu->stat.irq_exits; 2857 ++vcpu->stat.irq_exits;
2810 return 1; 2858 return 1;
2811} 2859}
2812 2860
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2861static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2862{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2863 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2864 return 0;
2817} 2865}
2818 2866
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2867static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2868{
2821 unsigned long exit_qualification; 2869 unsigned long exit_qualification;
2822 int size, in, string; 2870 int size, in, string;
@@ -2827,8 +2875,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2875 string = (exit_qualification & 16) != 0;
2828 2876
2829 if (string) { 2877 if (string) {
2830 if (emulate_instruction(vcpu, 2878 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2879 return 0;
2833 return 1; 2880 return 1;
2834 } 2881 }
@@ -2838,7 +2885,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2885 port = exit_qualification >> 16;
2839 2886
2840 skip_emulated_instruction(vcpu); 2887 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2888 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2889}
2843 2890
2844static void 2891static void
@@ -2852,7 +2899,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2899 hypercall[2] = 0xc1;
2853} 2900}
2854 2901
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2902static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2903{
2857 unsigned long exit_qualification, val; 2904 unsigned long exit_qualification, val;
2858 int cr; 2905 int cr;
@@ -2887,7 +2934,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 2934 return 1;
2888 if (cr8_prev <= cr8) 2935 if (cr8_prev <= cr8)
2889 return 1; 2936 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2937 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 2938 return 0;
2892 } 2939 }
2893 }; 2940 };
@@ -2922,13 +2969,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2922 default: 2969 default:
2923 break; 2970 break;
2924 } 2971 }
2925 kvm_run->exit_reason = 0; 2972 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 2973 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 2974 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 2975 return 0;
2929} 2976}
2930 2977
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2978static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 2979{
2933 unsigned long exit_qualification; 2980 unsigned long exit_qualification;
2934 unsigned long val; 2981 unsigned long val;
@@ -2944,13 +2991,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 2991 * guest debugging itself.
2945 */ 2992 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 2993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 2994 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 2995 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 2996 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 2997 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 2998 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 2999 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3000 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3001 return 0;
2955 } else { 3002 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3003 vcpu->arch.dr7 &= ~DR7_GD;
@@ -3016,13 +3063,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3063 return 1;
3017} 3064}
3018 3065
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3066static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3067{
3021 kvm_emulate_cpuid(vcpu); 3068 kvm_emulate_cpuid(vcpu);
3022 return 1; 3069 return 1;
3023} 3070}
3024 3071
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3072static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3073{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3074 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3075 u64 data;
@@ -3041,7 +3088,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3088 return 1;
3042} 3089}
3043 3090
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3091static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3092{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3093 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3094 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
@@ -3058,14 +3105,12 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3058 return 1; 3105 return 1;
3059} 3106}
3060 3107
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3108static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3109{
3064 return 1; 3110 return 1;
3065} 3111}
3066 3112
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3113static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3114{
3070 u32 cpu_based_vm_exec_control; 3115 u32 cpu_based_vm_exec_control;
3071 3116
@@ -3081,34 +3126,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3126 * possible
3082 */ 3127 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3128 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3129 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3130 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3131 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3132 return 0;
3088 } 3133 }
3089 return 1; 3134 return 1;
3090} 3135}
3091 3136
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3137static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3138{
3094 skip_emulated_instruction(vcpu); 3139 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3140 return kvm_emulate_halt(vcpu);
3096} 3141}
3097 3142
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3143static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3144{
3100 skip_emulated_instruction(vcpu); 3145 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3146 kvm_emulate_hypercall(vcpu);
3102 return 1; 3147 return 1;
3103} 3148}
3104 3149
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3150static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3151{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3152 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3153 return 1;
3109} 3154}
3110 3155
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3156static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3157{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3158 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3159
@@ -3117,14 +3162,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3162 return 1;
3118} 3163}
3119 3164
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3165static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3166{
3122 skip_emulated_instruction(vcpu); 3167 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3168 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3169 return 1;
3125} 3170}
3126 3171
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3172static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3173{
3129 unsigned long exit_qualification; 3174 unsigned long exit_qualification;
3130 enum emulation_result er; 3175 enum emulation_result er;
@@ -3133,7 +3178,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3178 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3179 offset = exit_qualification & 0xffful;
3135 3180
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3181 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3182
3138 if (er != EMULATE_DONE) { 3183 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3184 printk(KERN_ERR
@@ -3144,7 +3189,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3189 return 1;
3145} 3190}
3146 3191
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3192static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3193{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3194 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3195 unsigned long exit_qualification;
@@ -3198,7 +3243,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3243 return 1;
3199} 3244}
3200 3245
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3246static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3247{
3203 unsigned long exit_qualification; 3248 unsigned long exit_qualification;
3204 gpa_t gpa; 3249 gpa_t gpa;
@@ -3219,8 +3264,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3264 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3265 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3266 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3267 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3268 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3269 return 0;
3225 } 3270 }
3226 3271
@@ -3290,7 +3335,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3335 }
3291} 3336}
3292 3337
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3338static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3339{
3295 u64 sptes[4]; 3340 u64 sptes[4];
3296 int nr_sptes, i; 3341 int nr_sptes, i;
@@ -3306,13 +3351,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3351 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3352 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3353
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3354 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3355 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3356
3312 return 0; 3357 return 0;
3313} 3358}
3314 3359
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3360static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3361{
3317 u32 cpu_based_vm_exec_control; 3362 u32 cpu_based_vm_exec_control;
3318 3363
@@ -3325,36 +3370,50 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3370 return 1;
3326} 3371}
3327 3372
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3373static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3374{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3375 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3376 enum emulation_result err = EMULATE_DONE;
3333 3377 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3378
3337 while (!guest_state_valid(vcpu)) { 3379 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3380 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3381
3340 if (err == EMULATE_DO_MMIO) 3382 if (err == EMULATE_DO_MMIO) {
3341 break; 3383 ret = 0;
3384 goto out;
3385 }
3342 3386
3343 if (err != EMULATE_DONE) { 3387 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3388 kvm_report_emulation_failure(vcpu, "emulation failure");
3345 break; 3389 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3390 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3391 vcpu->run->internal.ndata = 0;
3392 ret = 0;
3393 goto out;
3346 } 3394 }
3347 3395
3348 if (signal_pending(current)) 3396 if (signal_pending(current))
3349 break; 3397 goto out;
3350 if (need_resched()) 3398 if (need_resched())
3351 schedule(); 3399 schedule();
3352 } 3400 }
3353 3401
3354 preempt_disable(); 3402 vmx->emulation_required = 0;
3355 local_irq_disable(); 3403out:
3404 return ret;
3405}
3356 3406
3357 vmx->invalid_state_emulation_result = err; 3407/*
3408 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
3409 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
3410 */
3411static int handle_pause(struct kvm_vcpu *vcpu)
3412{
3413 skip_emulated_instruction(vcpu);
3414 kvm_vcpu_on_spin(vcpu);
3415
3416 return 1;
3358} 3417}
3359 3418
3360/* 3419/*
@@ -3362,8 +3421,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3421 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3422 * to be done to userspace and return 0.
3364 */ 3423 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3424static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3425 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3426 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3427 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3452,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3452 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3453 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3454 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3455 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3397}; 3456};
3398 3457
3399static const int kvm_vmx_max_exit_handlers = 3458static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3462,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3462 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3463 * assistance.
3405 */ 3464 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3465static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3466{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3467 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3468 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3470,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3470
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3471 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3472
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3473 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3474 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3475 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3476
3422 /* Access CR3 don't cause VMExit in paging mode, so we need 3477 /* Access CR3 don't cause VMExit in paging mode, so we need
3423 * to sync with guest real CR3. */ 3478 * to sync with guest real CR3. */
@@ -3425,8 +3480,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3480 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3481
3427 if (unlikely(vmx->fail)) { 3482 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3483 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3484 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3485 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3486 return 0;
3432 } 3487 }
@@ -3459,10 +3514,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3514
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3515 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3516 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3517 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3518 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3519 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3520 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3521 }
3467 return 0; 3522 return 0;
3468} 3523}
@@ -3600,23 +3655,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3655#define Q "l"
3601#endif 3656#endif
3602 3657
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3658static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3659{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3660 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3661
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3662 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3663 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3664 vmx->entry_time = ktime_get();
3614 3665
3615 /* Handle invalid guest state instead of entering VMX */ 3666 /* Don't enter VMX if guest state is invalid, let the exit handler
3616 if (vmx->emulation_required && emulate_invalid_guest_state) { 3667 start emulation until we arrive back to a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3668 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3669 return;
3619 }
3620 3670
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3671 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3672 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3775,7 +3825,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3825 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3826 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3827 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3828 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3829 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3830 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3851,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3851 goto uninit_vcpu;
3803 } 3852 }
3804 3853
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3854 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3855 if (!vmx->vmcs)
3811 goto free_msrs; 3856 goto free_msrs;
@@ -3836,8 +3881,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3881free_vmcs:
3837 free_vmcs(vmx->vmcs); 3882 free_vmcs(vmx->vmcs);
3838free_msrs: 3883free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3884 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3885uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3886 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3973,6 +4016,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4016 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4017 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4018 .nmi_allowed = vmx_nmi_allowed,
4019 .get_nmi_mask = vmx_get_nmi_mask,
4020 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4021 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4022 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4023 .update_cr8_intercept = update_cr8_intercept,
@@ -3987,7 +4032,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
3987 4032
3988static int __init vmx_init(void) 4033static int __init vmx_init(void)
3989{ 4034{
3990 int r; 4035 int r, i;
4036
4037 rdmsrl_safe(MSR_EFER, &host_efer);
4038
4039 for (i = 0; i < NR_VMX_MSR; ++i)
4040 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4041
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4042 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4043 if (!vmx_io_bitmap_a)
@@ -4049,8 +4099,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4099 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4100 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4101
4052 ept_sync_global();
4053
4054 return 0; 4102 return 0;
4055 4103
4056out3: 4104out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b9695322f56..9d068966fb2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,13 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
40#include <trace/events/kvm.h> 41#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 42#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 43#define CREATE_TRACE_POINTS
43#include "trace.h" 44#include "trace.h"
44 45
46#include <asm/debugreg.h>
45#include <asm/uaccess.h> 47#include <asm/uaccess.h>
46#include <asm/msr.h> 48#include <asm/msr.h>
47#include <asm/desc.h> 49#include <asm/desc.h>
@@ -87,6 +89,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
87int ignore_msrs = 0; 89int ignore_msrs = 0;
88module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 90module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89 91
92#define KVM_NR_SHARED_MSRS 16
93
94struct kvm_shared_msrs_global {
95 int nr;
96 struct kvm_shared_msr {
97 u32 msr;
98 u64 value;
99 } msrs[KVM_NR_SHARED_MSRS];
100};
101
102struct kvm_shared_msrs {
103 struct user_return_notifier urn;
104 bool registered;
105 u64 current_value[KVM_NR_SHARED_MSRS];
106};
107
108static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
109static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
110
90struct kvm_stats_debugfs_item debugfs_entries[] = { 111struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "pf_fixed", VCPU_STAT(pf_fixed) }, 112 { "pf_fixed", VCPU_STAT(pf_fixed) },
92 { "pf_guest", VCPU_STAT(pf_guest) }, 113 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +144,72 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
123 { NULL } 144 { NULL }
124}; 145};
125 146
147static void kvm_on_user_return(struct user_return_notifier *urn)
148{
149 unsigned slot;
150 struct kvm_shared_msr *global;
151 struct kvm_shared_msrs *locals
152 = container_of(urn, struct kvm_shared_msrs, urn);
153
154 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
155 global = &shared_msrs_global.msrs[slot];
156 if (global->value != locals->current_value[slot]) {
157 wrmsrl(global->msr, global->value);
158 locals->current_value[slot] = global->value;
159 }
160 }
161 locals->registered = false;
162 user_return_notifier_unregister(urn);
163}
164
165void kvm_define_shared_msr(unsigned slot, u32 msr)
166{
167 int cpu;
168 u64 value;
169
170 if (slot >= shared_msrs_global.nr)
171 shared_msrs_global.nr = slot + 1;
172 shared_msrs_global.msrs[slot].msr = msr;
173 rdmsrl_safe(msr, &value);
174 shared_msrs_global.msrs[slot].value = value;
175 for_each_online_cpu(cpu)
176 per_cpu(shared_msrs, cpu).current_value[slot] = value;
177}
178EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
179
180static void kvm_shared_msr_cpu_online(void)
181{
182 unsigned i;
183 struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
184
185 for (i = 0; i < shared_msrs_global.nr; ++i)
186 locals->current_value[i] = shared_msrs_global.msrs[i].value;
187}
188
189void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
190{
191 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
192
193 if (((value ^ smsr->current_value[slot]) & mask) == 0)
194 return;
195 smsr->current_value[slot] = value;
196 wrmsrl(shared_msrs_global.msrs[slot].msr, value);
197 if (!smsr->registered) {
198 smsr->urn.on_user_return = kvm_on_user_return;
199 user_return_notifier_register(&smsr->urn);
200 smsr->registered = true;
201 }
202}
203EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
204
205static void drop_user_return_notifiers(void *ignore)
206{
207 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
208
209 if (smsr->registered)
210 kvm_on_user_return(&smsr->urn);
211}
212
126unsigned long segment_base(u16 selector) 213unsigned long segment_base(u16 selector)
127{ 214{
128 struct descriptor_table gdt; 215 struct descriptor_table gdt;
@@ -484,16 +571,19 @@ static inline u32 bit(int bitno)
484 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 571 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
485 * 572 *
486 * This list is modified at module load time to reflect the 573 * This list is modified at module load time to reflect the
487 * capabilities of the host cpu. 574 * capabilities of the host cpu. This capabilities test skips MSRs that are
575 * kvm-specific. Those are put in the beginning of the list.
488 */ 576 */
577
578#define KVM_SAVE_MSRS_BEGIN 2
489static u32 msrs_to_save[] = { 579static u32 msrs_to_save[] = {
580 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
490 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 581 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
491 MSR_K6_STAR, 582 MSR_K6_STAR,
492#ifdef CONFIG_X86_64 583#ifdef CONFIG_X86_64
493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 584 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
494#endif 585#endif
495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 586 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497}; 587};
498 588
499static unsigned num_msrs_to_save; 589static unsigned num_msrs_to_save;
@@ -677,7 +767,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
677 /* With all the info we got, fill in the values */ 767 /* With all the info we got, fill in the values */
678 768
679 vcpu->hv_clock.system_time = ts.tv_nsec + 769 vcpu->hv_clock.system_time = ts.tv_nsec +
680 (NSEC_PER_SEC * (u64)ts.tv_sec); 770 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
771
681 /* 772 /*
682 * The interface expects us to write an even number signaling that the 773 * The interface expects us to write an even number signaling that the
683 * update is finished. Since the guest won't see the intermediate 774 * update is finished. Since the guest won't see the intermediate
@@ -835,6 +926,38 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
835 return 0; 926 return 0;
836} 927}
837 928
929static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
930{
931 struct kvm *kvm = vcpu->kvm;
932 int lm = is_long_mode(vcpu);
933 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
934 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
935 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
936 : kvm->arch.xen_hvm_config.blob_size_32;
937 u32 page_num = data & ~PAGE_MASK;
938 u64 page_addr = data & PAGE_MASK;
939 u8 *page;
940 int r;
941
942 r = -E2BIG;
943 if (page_num >= blob_size)
944 goto out;
945 r = -ENOMEM;
946 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
947 if (!page)
948 goto out;
949 r = -EFAULT;
950 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
951 goto out_free;
952 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
953 goto out_free;
954 r = 0;
955out_free:
956 kfree(page);
957out:
958 return r;
959}
960
838int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 961int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
839{ 962{
840 switch (msr) { 963 switch (msr) {
@@ -950,6 +1073,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
950 "0x%x data 0x%llx\n", msr, data); 1073 "0x%x data 0x%llx\n", msr, data);
951 break; 1074 break;
952 default: 1075 default:
1076 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1077 return xen_hvm_config(vcpu, data);
953 if (!ignore_msrs) { 1078 if (!ignore_msrs) {
954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1079 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data); 1080 msr, data);
@@ -1224,6 +1349,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1224 case KVM_CAP_PIT2: 1349 case KVM_CAP_PIT2:
1225 case KVM_CAP_PIT_STATE2: 1350 case KVM_CAP_PIT_STATE2:
1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1351 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1352 case KVM_CAP_XEN_HVM:
1353 case KVM_CAP_ADJUST_CLOCK:
1354 case KVM_CAP_VCPU_EVENTS:
1227 r = 1; 1355 r = 1;
1228 break; 1356 break;
1229 case KVM_CAP_COALESCED_MMIO: 1357 case KVM_CAP_COALESCED_MMIO:
@@ -1238,8 +1366,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1238 case KVM_CAP_NR_MEMSLOTS: 1366 case KVM_CAP_NR_MEMSLOTS:
1239 r = KVM_MEMORY_SLOTS; 1367 r = KVM_MEMORY_SLOTS;
1240 break; 1368 break;
1241 case KVM_CAP_PV_MMU: 1369 case KVM_CAP_PV_MMU: /* obsolete */
1242 r = !tdp_enabled; 1370 r = 0;
1243 break; 1371 break;
1244 case KVM_CAP_IOMMU: 1372 case KVM_CAP_IOMMU:
1245 r = iommu_found(); 1373 r = iommu_found();
@@ -1326,6 +1454,12 @@ out:
1326void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1454void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1327{ 1455{
1328 kvm_x86_ops->vcpu_load(vcpu, cpu); 1456 kvm_x86_ops->vcpu_load(vcpu, cpu);
1457 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1458 unsigned long khz = cpufreq_quick_get(cpu);
1459 if (!khz)
1460 khz = tsc_khz;
1461 per_cpu(cpu_tsc_khz, cpu) = khz;
1462 }
1329 kvm_request_guest_time_update(vcpu); 1463 kvm_request_guest_time_update(vcpu);
1330} 1464}
1331 1465
@@ -1692,7 +1826,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1692 unsigned bank_num = mcg_cap & 0xff, bank; 1826 unsigned bank_num = mcg_cap & 0xff, bank;
1693 1827
1694 r = -EINVAL; 1828 r = -EINVAL;
1695 if (!bank_num) 1829 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1696 goto out; 1830 goto out;
1697 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) 1831 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1698 goto out; 1832 goto out;
@@ -1759,6 +1893,61 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1759 return 0; 1893 return 0;
1760} 1894}
1761 1895
1896static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1897 struct kvm_vcpu_events *events)
1898{
1899 vcpu_load(vcpu);
1900
1901 events->exception.injected = vcpu->arch.exception.pending;
1902 events->exception.nr = vcpu->arch.exception.nr;
1903 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1904 events->exception.error_code = vcpu->arch.exception.error_code;
1905
1906 events->interrupt.injected = vcpu->arch.interrupt.pending;
1907 events->interrupt.nr = vcpu->arch.interrupt.nr;
1908 events->interrupt.soft = vcpu->arch.interrupt.soft;
1909
1910 events->nmi.injected = vcpu->arch.nmi_injected;
1911 events->nmi.pending = vcpu->arch.nmi_pending;
1912 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1913
1914 events->sipi_vector = vcpu->arch.sipi_vector;
1915
1916 events->flags = 0;
1917
1918 vcpu_put(vcpu);
1919}
1920
1921static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1922 struct kvm_vcpu_events *events)
1923{
1924 if (events->flags)
1925 return -EINVAL;
1926
1927 vcpu_load(vcpu);
1928
1929 vcpu->arch.exception.pending = events->exception.injected;
1930 vcpu->arch.exception.nr = events->exception.nr;
1931 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
1932 vcpu->arch.exception.error_code = events->exception.error_code;
1933
1934 vcpu->arch.interrupt.pending = events->interrupt.injected;
1935 vcpu->arch.interrupt.nr = events->interrupt.nr;
1936 vcpu->arch.interrupt.soft = events->interrupt.soft;
1937 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
1938 kvm_pic_clear_isr_ack(vcpu->kvm);
1939
1940 vcpu->arch.nmi_injected = events->nmi.injected;
1941 vcpu->arch.nmi_pending = events->nmi.pending;
1942 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
1943
1944 vcpu->arch.sipi_vector = events->sipi_vector;
1945
1946 vcpu_put(vcpu);
1947
1948 return 0;
1949}
1950
1762long kvm_arch_vcpu_ioctl(struct file *filp, 1951long kvm_arch_vcpu_ioctl(struct file *filp,
1763 unsigned int ioctl, unsigned long arg) 1952 unsigned int ioctl, unsigned long arg)
1764{ 1953{
@@ -1769,6 +1958,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1769 1958
1770 switch (ioctl) { 1959 switch (ioctl) {
1771 case KVM_GET_LAPIC: { 1960 case KVM_GET_LAPIC: {
1961 r = -EINVAL;
1962 if (!vcpu->arch.apic)
1963 goto out;
1772 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1964 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1773 1965
1774 r = -ENOMEM; 1966 r = -ENOMEM;
@@ -1784,6 +1976,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1784 break; 1976 break;
1785 } 1977 }
1786 case KVM_SET_LAPIC: { 1978 case KVM_SET_LAPIC: {
1979 r = -EINVAL;
1980 if (!vcpu->arch.apic)
1981 goto out;
1787 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 1982 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1788 r = -ENOMEM; 1983 r = -ENOMEM;
1789 if (!lapic) 1984 if (!lapic)
@@ -1910,6 +2105,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1910 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2105 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1911 break; 2106 break;
1912 } 2107 }
2108 case KVM_GET_VCPU_EVENTS: {
2109 struct kvm_vcpu_events events;
2110
2111 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2112
2113 r = -EFAULT;
2114 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2115 break;
2116 r = 0;
2117 break;
2118 }
2119 case KVM_SET_VCPU_EVENTS: {
2120 struct kvm_vcpu_events events;
2121
2122 r = -EFAULT;
2123 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2124 break;
2125
2126 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2127 break;
2128 }
1913 default: 2129 default:
1914 r = -EINVAL; 2130 r = -EINVAL;
1915 } 2131 }
@@ -2038,9 +2254,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2038 sizeof(struct kvm_pic_state)); 2254 sizeof(struct kvm_pic_state));
2039 break; 2255 break;
2040 case KVM_IRQCHIP_IOAPIC: 2256 case KVM_IRQCHIP_IOAPIC:
2041 memcpy(&chip->chip.ioapic, 2257 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2042 ioapic_irqchip(kvm),
2043 sizeof(struct kvm_ioapic_state));
2044 break; 2258 break;
2045 default: 2259 default:
2046 r = -EINVAL; 2260 r = -EINVAL;
@@ -2070,11 +2284,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2070 spin_unlock(&pic_irqchip(kvm)->lock); 2284 spin_unlock(&pic_irqchip(kvm)->lock);
2071 break; 2285 break;
2072 case KVM_IRQCHIP_IOAPIC: 2286 case KVM_IRQCHIP_IOAPIC:
2073 mutex_lock(&kvm->irq_lock); 2287 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2074 memcpy(ioapic_irqchip(kvm),
2075 &chip->chip.ioapic,
2076 sizeof(struct kvm_ioapic_state));
2077 mutex_unlock(&kvm->irq_lock);
2078 break; 2288 break;
2079 default: 2289 default:
2080 r = -EINVAL; 2290 r = -EINVAL;
@@ -2182,7 +2392,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2182{ 2392{
2183 struct kvm *kvm = filp->private_data; 2393 struct kvm *kvm = filp->private_data;
2184 void __user *argp = (void __user *)arg; 2394 void __user *argp = (void __user *)arg;
2185 int r = -EINVAL; 2395 int r = -ENOTTY;
2186 /* 2396 /*
2187 * This union makes it completely explicit to gcc-3.x 2397 * This union makes it completely explicit to gcc-3.x
2188 * that these two variables' stack usage should be 2398 * that these two variables' stack usage should be
@@ -2244,25 +2454,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2244 if (r) 2454 if (r)
2245 goto out; 2455 goto out;
2246 break; 2456 break;
2247 case KVM_CREATE_IRQCHIP: 2457 case KVM_CREATE_IRQCHIP: {
2458 struct kvm_pic *vpic;
2459
2460 mutex_lock(&kvm->lock);
2461 r = -EEXIST;
2462 if (kvm->arch.vpic)
2463 goto create_irqchip_unlock;
2248 r = -ENOMEM; 2464 r = -ENOMEM;
2249 kvm->arch.vpic = kvm_create_pic(kvm); 2465 vpic = kvm_create_pic(kvm);
2250 if (kvm->arch.vpic) { 2466 if (vpic) {
2251 r = kvm_ioapic_init(kvm); 2467 r = kvm_ioapic_init(kvm);
2252 if (r) { 2468 if (r) {
2253 kfree(kvm->arch.vpic); 2469 kfree(vpic);
2254 kvm->arch.vpic = NULL; 2470 goto create_irqchip_unlock;
2255 goto out;
2256 } 2471 }
2257 } else 2472 } else
2258 goto out; 2473 goto create_irqchip_unlock;
2474 smp_wmb();
2475 kvm->arch.vpic = vpic;
2476 smp_wmb();
2259 r = kvm_setup_default_irq_routing(kvm); 2477 r = kvm_setup_default_irq_routing(kvm);
2260 if (r) { 2478 if (r) {
2479 mutex_lock(&kvm->irq_lock);
2261 kfree(kvm->arch.vpic); 2480 kfree(kvm->arch.vpic);
2262 kfree(kvm->arch.vioapic); 2481 kfree(kvm->arch.vioapic);
2263 goto out; 2482 kvm->arch.vpic = NULL;
2483 kvm->arch.vioapic = NULL;
2484 mutex_unlock(&kvm->irq_lock);
2264 } 2485 }
2486 create_irqchip_unlock:
2487 mutex_unlock(&kvm->lock);
2265 break; 2488 break;
2489 }
2266 case KVM_CREATE_PIT: 2490 case KVM_CREATE_PIT:
2267 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2491 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2268 goto create_pit; 2492 goto create_pit;
@@ -2292,10 +2516,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2292 goto out; 2516 goto out;
2293 if (irqchip_in_kernel(kvm)) { 2517 if (irqchip_in_kernel(kvm)) {
2294 __s32 status; 2518 __s32 status;
2295 mutex_lock(&kvm->irq_lock);
2296 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2519 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2297 irq_event.irq, irq_event.level); 2520 irq_event.irq, irq_event.level);
2298 mutex_unlock(&kvm->irq_lock);
2299 if (ioctl == KVM_IRQ_LINE_STATUS) { 2521 if (ioctl == KVM_IRQ_LINE_STATUS) {
2300 irq_event.status = status; 2522 irq_event.status = status;
2301 if (copy_to_user(argp, &irq_event, 2523 if (copy_to_user(argp, &irq_event,
@@ -2421,6 +2643,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2421 r = 0; 2643 r = 0;
2422 break; 2644 break;
2423 } 2645 }
2646 case KVM_XEN_HVM_CONFIG: {
2647 r = -EFAULT;
2648 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2649 sizeof(struct kvm_xen_hvm_config)))
2650 goto out;
2651 r = -EINVAL;
2652 if (kvm->arch.xen_hvm_config.flags)
2653 goto out;
2654 r = 0;
2655 break;
2656 }
2657 case KVM_SET_CLOCK: {
2658 struct timespec now;
2659 struct kvm_clock_data user_ns;
2660 u64 now_ns;
2661 s64 delta;
2662
2663 r = -EFAULT;
2664 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2665 goto out;
2666
2667 r = -EINVAL;
2668 if (user_ns.flags)
2669 goto out;
2670
2671 r = 0;
2672 ktime_get_ts(&now);
2673 now_ns = timespec_to_ns(&now);
2674 delta = user_ns.clock - now_ns;
2675 kvm->arch.kvmclock_offset = delta;
2676 break;
2677 }
2678 case KVM_GET_CLOCK: {
2679 struct timespec now;
2680 struct kvm_clock_data user_ns;
2681 u64 now_ns;
2682
2683 ktime_get_ts(&now);
2684 now_ns = timespec_to_ns(&now);
2685 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2686 user_ns.flags = 0;
2687
2688 r = -EFAULT;
2689 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2690 goto out;
2691 r = 0;
2692 break;
2693 }
2694
2424 default: 2695 default:
2425 ; 2696 ;
2426 } 2697 }
@@ -2433,7 +2704,8 @@ static void kvm_init_msr_list(void)
2433 u32 dummy[2]; 2704 u32 dummy[2];
2434 unsigned i, j; 2705 unsigned i, j;
2435 2706
2436 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2707 /* skip the first msrs in the list. KVM-specific */
2708 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2437 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2709 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2438 continue; 2710 continue;
2439 if (j < i) 2711 if (j < i)
@@ -2757,13 +3029,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2757} 3029}
2758 3030
2759int emulate_instruction(struct kvm_vcpu *vcpu, 3031int emulate_instruction(struct kvm_vcpu *vcpu,
2760 struct kvm_run *run,
2761 unsigned long cr2, 3032 unsigned long cr2,
2762 u16 error_code, 3033 u16 error_code,
2763 int emulation_type) 3034 int emulation_type)
2764{ 3035{
2765 int r, shadow_mask; 3036 int r, shadow_mask;
2766 struct decode_cache *c; 3037 struct decode_cache *c;
3038 struct kvm_run *run = vcpu->run;
2767 3039
2768 kvm_clear_exception_queue(vcpu); 3040 kvm_clear_exception_queue(vcpu);
2769 vcpu->arch.mmio_fault_cr2 = cr2; 3041 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2783,7 +3055,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2783 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3055 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2784 3056
2785 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3057 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2786 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3058 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2787 vcpu->arch.emulate_ctxt.mode = 3059 vcpu->arch.emulate_ctxt.mode =
2788 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3060 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2789 ? X86EMUL_MODE_REAL : cs_l 3061 ? X86EMUL_MODE_REAL : cs_l
@@ -2861,7 +3133,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2861 return EMULATE_DO_MMIO; 3133 return EMULATE_DO_MMIO;
2862 } 3134 }
2863 3135
2864 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3136 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2865 3137
2866 if (vcpu->mmio_is_write) { 3138 if (vcpu->mmio_is_write) {
2867 vcpu->mmio_needed = 0; 3139 vcpu->mmio_needed = 0;
@@ -2969,8 +3241,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2969 return r; 3241 return r;
2970} 3242}
2971 3243
2972int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3244int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2973 int size, unsigned port)
2974{ 3245{
2975 unsigned long val; 3246 unsigned long val;
2976 3247
@@ -2999,7 +3270,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2999} 3270}
3000EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3271EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3001 3272
3002int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3273int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3003 int size, unsigned long count, int down, 3274 int size, unsigned long count, int down,
3004 gva_t address, int rep, unsigned port) 3275 gva_t address, int rep, unsigned port)
3005{ 3276{
@@ -3072,9 +3343,6 @@ static void bounce_off(void *info)
3072 /* nothing */ 3343 /* nothing */
3073} 3344}
3074 3345
3075static unsigned int ref_freq;
3076static unsigned long tsc_khz_ref;
3077
3078static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3346static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3079 void *data) 3347 void *data)
3080{ 3348{
@@ -3083,14 +3351,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3083 struct kvm_vcpu *vcpu; 3351 struct kvm_vcpu *vcpu;
3084 int i, send_ipi = 0; 3352 int i, send_ipi = 0;
3085 3353
3086 if (!ref_freq)
3087 ref_freq = freq->old;
3088
3089 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3354 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3090 return 0; 3355 return 0;
3091 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3356 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3092 return 0; 3357 return 0;
3093 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3358 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3094 3359
3095 spin_lock(&kvm_lock); 3360 spin_lock(&kvm_lock);
3096 list_for_each_entry(kvm, &vm_list, vm_list) { 3361 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3127,9 +3392,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3127 .notifier_call = kvmclock_cpufreq_notifier 3392 .notifier_call = kvmclock_cpufreq_notifier
3128}; 3393};
3129 3394
3395static void kvm_timer_init(void)
3396{
3397 int cpu;
3398
3399 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3400 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3401 CPUFREQ_TRANSITION_NOTIFIER);
3402 for_each_online_cpu(cpu) {
3403 unsigned long khz = cpufreq_get(cpu);
3404 if (!khz)
3405 khz = tsc_khz;
3406 per_cpu(cpu_tsc_khz, cpu) = khz;
3407 }
3408 } else {
3409 for_each_possible_cpu(cpu)
3410 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3411 }
3412}
3413
3130int kvm_arch_init(void *opaque) 3414int kvm_arch_init(void *opaque)
3131{ 3415{
3132 int r, cpu; 3416 int r;
3133 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3417 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3134 3418
3135 if (kvm_x86_ops) { 3419 if (kvm_x86_ops) {
@@ -3161,13 +3445,7 @@ int kvm_arch_init(void *opaque)
3161 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3445 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3162 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3446 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3163 3447
3164 for_each_possible_cpu(cpu) 3448 kvm_timer_init();
3165 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3166 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3167 tsc_khz_ref = tsc_khz;
3168 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3169 CPUFREQ_TRANSITION_NOTIFIER);
3170 }
3171 3449
3172 return 0; 3450 return 0;
3173 3451
@@ -3295,7 +3573,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3295 unsigned long *rflags) 3573 unsigned long *rflags)
3296{ 3574{
3297 kvm_lmsw(vcpu, msw); 3575 kvm_lmsw(vcpu, msw);
3298 *rflags = kvm_x86_ops->get_rflags(vcpu); 3576 *rflags = kvm_get_rflags(vcpu);
3299} 3577}
3300 3578
3301unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3579unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
@@ -3333,7 +3611,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3333 switch (cr) { 3611 switch (cr) {
3334 case 0: 3612 case 0:
3335 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3613 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3336 *rflags = kvm_x86_ops->get_rflags(vcpu); 3614 *rflags = kvm_get_rflags(vcpu);
3337 break; 3615 break;
3338 case 2: 3616 case 2:
3339 vcpu->arch.cr2 = val; 3617 vcpu->arch.cr2 = val;
@@ -3453,18 +3731,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3453 * 3731 *
3454 * No need to exit to userspace if we already have an interrupt queued. 3732 * No need to exit to userspace if we already have an interrupt queued.
3455 */ 3733 */
3456static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3734static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3457 struct kvm_run *kvm_run)
3458{ 3735{
3459 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 3736 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3460 kvm_run->request_interrupt_window && 3737 vcpu->run->request_interrupt_window &&
3461 kvm_arch_interrupt_allowed(vcpu)); 3738 kvm_arch_interrupt_allowed(vcpu));
3462} 3739}
3463 3740
3464static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3741static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3465 struct kvm_run *kvm_run)
3466{ 3742{
3467 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 3743 struct kvm_run *kvm_run = vcpu->run;
3744
3745 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3468 kvm_run->cr8 = kvm_get_cr8(vcpu); 3746 kvm_run->cr8 = kvm_get_cr8(vcpu);
3469 kvm_run->apic_base = kvm_get_apic_base(vcpu); 3747 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3470 if (irqchip_in_kernel(vcpu->kvm)) 3748 if (irqchip_in_kernel(vcpu->kvm))
@@ -3525,7 +3803,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3525 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3803 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3526} 3804}
3527 3805
3528static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3806static void inject_pending_event(struct kvm_vcpu *vcpu)
3529{ 3807{
3530 /* try to reinject previous events if any */ 3808 /* try to reinject previous events if any */
3531 if (vcpu->arch.exception.pending) { 3809 if (vcpu->arch.exception.pending) {
@@ -3561,11 +3839,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3561 } 3839 }
3562} 3840}
3563 3841
3564static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3842static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3565{ 3843{
3566 int r; 3844 int r;
3567 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 3845 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3568 kvm_run->request_interrupt_window; 3846 vcpu->run->request_interrupt_window;
3569 3847
3570 if (vcpu->requests) 3848 if (vcpu->requests)
3571 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3849 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3586,12 +3864,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3586 kvm_x86_ops->tlb_flush(vcpu); 3864 kvm_x86_ops->tlb_flush(vcpu);
3587 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 3865 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3588 &vcpu->requests)) { 3866 &vcpu->requests)) {
3589 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 3867 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3590 r = 0; 3868 r = 0;
3591 goto out; 3869 goto out;
3592 } 3870 }
3593 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 3871 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3594 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 3872 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3595 r = 0; 3873 r = 0;
3596 goto out; 3874 goto out;
3597 } 3875 }
@@ -3615,7 +3893,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3615 goto out; 3893 goto out;
3616 } 3894 }
3617 3895
3618 inject_pending_event(vcpu, kvm_run); 3896 inject_pending_event(vcpu);
3619 3897
3620 /* enable NMI/IRQ window open exits if needed */ 3898 /* enable NMI/IRQ window open exits if needed */
3621 if (vcpu->arch.nmi_pending) 3899 if (vcpu->arch.nmi_pending)
@@ -3641,16 +3919,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3641 } 3919 }
3642 3920
3643 trace_kvm_entry(vcpu->vcpu_id); 3921 trace_kvm_entry(vcpu->vcpu_id);
3644 kvm_x86_ops->run(vcpu, kvm_run); 3922 kvm_x86_ops->run(vcpu);
3645 3923
3646 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 3924 /*
3647 set_debugreg(current->thread.debugreg0, 0); 3925 * If the guest has used debug registers, at least dr7
3648 set_debugreg(current->thread.debugreg1, 1); 3926 * will be disabled while returning to the host.
3649 set_debugreg(current->thread.debugreg2, 2); 3927 * If we don't have active breakpoints in the host, we don't
3650 set_debugreg(current->thread.debugreg3, 3); 3928 * care about the messed up debug address registers. But if
3651 set_debugreg(current->thread.debugreg6, 6); 3929 * we have some of them active, restore the old state.
3652 set_debugreg(current->thread.debugreg7, 7); 3930 */
3653 } 3931 if (hw_breakpoint_active())
3932 hw_breakpoint_restore();
3654 3933
3655 set_bit(KVM_REQ_KICK, &vcpu->requests); 3934 set_bit(KVM_REQ_KICK, &vcpu->requests);
3656 local_irq_enable(); 3935 local_irq_enable();
@@ -3682,13 +3961,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3682 3961
3683 kvm_lapic_sync_from_vapic(vcpu); 3962 kvm_lapic_sync_from_vapic(vcpu);
3684 3963
3685 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 3964 r = kvm_x86_ops->handle_exit(vcpu);
3686out: 3965out:
3687 return r; 3966 return r;
3688} 3967}
3689 3968
3690 3969
3691static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3970static int __vcpu_run(struct kvm_vcpu *vcpu)
3692{ 3971{
3693 int r; 3972 int r;
3694 3973
@@ -3708,7 +3987,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3708 r = 1; 3987 r = 1;
3709 while (r > 0) { 3988 while (r > 0) {
3710 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 3989 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3711 r = vcpu_enter_guest(vcpu, kvm_run); 3990 r = vcpu_enter_guest(vcpu);
3712 else { 3991 else {
3713 up_read(&vcpu->kvm->slots_lock); 3992 up_read(&vcpu->kvm->slots_lock);
3714 kvm_vcpu_block(vcpu); 3993 kvm_vcpu_block(vcpu);
@@ -3736,14 +4015,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3736 if (kvm_cpu_has_pending_timer(vcpu)) 4015 if (kvm_cpu_has_pending_timer(vcpu))
3737 kvm_inject_pending_timer_irqs(vcpu); 4016 kvm_inject_pending_timer_irqs(vcpu);
3738 4017
3739 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4018 if (dm_request_for_irq_injection(vcpu)) {
3740 r = -EINTR; 4019 r = -EINTR;
3741 kvm_run->exit_reason = KVM_EXIT_INTR; 4020 vcpu->run->exit_reason = KVM_EXIT_INTR;
3742 ++vcpu->stat.request_irq_exits; 4021 ++vcpu->stat.request_irq_exits;
3743 } 4022 }
3744 if (signal_pending(current)) { 4023 if (signal_pending(current)) {
3745 r = -EINTR; 4024 r = -EINTR;
3746 kvm_run->exit_reason = KVM_EXIT_INTR; 4025 vcpu->run->exit_reason = KVM_EXIT_INTR;
3747 ++vcpu->stat.signal_exits; 4026 ++vcpu->stat.signal_exits;
3748 } 4027 }
3749 if (need_resched()) { 4028 if (need_resched()) {
@@ -3754,7 +4033,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3754 } 4033 }
3755 4034
3756 up_read(&vcpu->kvm->slots_lock); 4035 up_read(&vcpu->kvm->slots_lock);
3757 post_kvm_run_save(vcpu, kvm_run); 4036 post_kvm_run_save(vcpu);
3758 4037
3759 vapic_exit(vcpu); 4038 vapic_exit(vcpu);
3760 4039
@@ -3787,15 +4066,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3787 if (r) 4066 if (r)
3788 goto out; 4067 goto out;
3789 } 4068 }
3790#if CONFIG_HAS_IOMEM
3791 if (vcpu->mmio_needed) { 4069 if (vcpu->mmio_needed) {
3792 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4070 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3793 vcpu->mmio_read_completed = 1; 4071 vcpu->mmio_read_completed = 1;
3794 vcpu->mmio_needed = 0; 4072 vcpu->mmio_needed = 0;
3795 4073
3796 down_read(&vcpu->kvm->slots_lock); 4074 down_read(&vcpu->kvm->slots_lock);
3797 r = emulate_instruction(vcpu, kvm_run, 4075 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3798 vcpu->arch.mmio_fault_cr2, 0,
3799 EMULTYPE_NO_DECODE); 4076 EMULTYPE_NO_DECODE);
3800 up_read(&vcpu->kvm->slots_lock); 4077 up_read(&vcpu->kvm->slots_lock);
3801 if (r == EMULATE_DO_MMIO) { 4078 if (r == EMULATE_DO_MMIO) {
@@ -3806,12 +4083,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3806 goto out; 4083 goto out;
3807 } 4084 }
3808 } 4085 }
3809#endif
3810 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4086 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3811 kvm_register_write(vcpu, VCPU_REGS_RAX, 4087 kvm_register_write(vcpu, VCPU_REGS_RAX,
3812 kvm_run->hypercall.ret); 4088 kvm_run->hypercall.ret);
3813 4089
3814 r = __vcpu_run(vcpu, kvm_run); 4090 r = __vcpu_run(vcpu);
3815 4091
3816out: 4092out:
3817 if (vcpu->sigset_active) 4093 if (vcpu->sigset_active)
@@ -3845,13 +4121,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3845#endif 4121#endif
3846 4122
3847 regs->rip = kvm_rip_read(vcpu); 4123 regs->rip = kvm_rip_read(vcpu);
3848 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4124 regs->rflags = kvm_get_rflags(vcpu);
3849
3850 /*
3851 * Don't leak debug flags in case they were set for guest debugging
3852 */
3853 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3854 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3855 4125
3856 vcpu_put(vcpu); 4126 vcpu_put(vcpu);
3857 4127
@@ -3879,12 +4149,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3879 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4149 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3880 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4150 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3881 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4151 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3882
3883#endif 4152#endif
3884 4153
3885 kvm_rip_write(vcpu, regs->rip); 4154 kvm_rip_write(vcpu, regs->rip);
3886 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4155 kvm_set_rflags(vcpu, regs->rflags);
3887
3888 4156
3889 vcpu->arch.exception.pending = false; 4157 vcpu->arch.exception.pending = false;
3890 4158
@@ -4051,7 +4319,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4051 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4319 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4052} 4320}
4053 4321
4054static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 4322static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4055 struct desc_struct *seg_desc) 4323 struct desc_struct *seg_desc)
4056{ 4324{
4057 u32 base_addr = get_desc_base(seg_desc); 4325 u32 base_addr = get_desc_base(seg_desc);
@@ -4103,7 +4371,7 @@ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4103{ 4371{
4104 return (seg != VCPU_SREG_LDTR) && 4372 return (seg != VCPU_SREG_LDTR) &&
4105 (seg != VCPU_SREG_TR) && 4373 (seg != VCPU_SREG_TR) &&
4106 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4374 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4107} 4375}
4108 4376
4109int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4377int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
@@ -4131,7 +4399,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4131{ 4399{
4132 tss->cr3 = vcpu->arch.cr3; 4400 tss->cr3 = vcpu->arch.cr3;
4133 tss->eip = kvm_rip_read(vcpu); 4401 tss->eip = kvm_rip_read(vcpu);
4134 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4402 tss->eflags = kvm_get_rflags(vcpu);
4135 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4403 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4136 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4404 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4137 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4405 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4155,7 +4423,7 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4155 kvm_set_cr3(vcpu, tss->cr3); 4423 kvm_set_cr3(vcpu, tss->cr3);
4156 4424
4157 kvm_rip_write(vcpu, tss->eip); 4425 kvm_rip_write(vcpu, tss->eip);
4158 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4426 kvm_set_rflags(vcpu, tss->eflags | 2);
4159 4427
4160 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4428 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4161 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4429 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4193,7 +4461,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4193 struct tss_segment_16 *tss) 4461 struct tss_segment_16 *tss)
4194{ 4462{
4195 tss->ip = kvm_rip_read(vcpu); 4463 tss->ip = kvm_rip_read(vcpu);
4196 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4464 tss->flag = kvm_get_rflags(vcpu);
4197 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4465 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4198 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4466 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4199 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4467 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4208,14 +4476,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4208 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4476 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4209 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4477 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4210 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4478 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4211 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4212} 4479}
4213 4480
4214static int load_state_from_tss16(struct kvm_vcpu *vcpu, 4481static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4215 struct tss_segment_16 *tss) 4482 struct tss_segment_16 *tss)
4216{ 4483{
4217 kvm_rip_write(vcpu, tss->ip); 4484 kvm_rip_write(vcpu, tss->ip);
4218 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 4485 kvm_set_rflags(vcpu, tss->flag | 2);
4219 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 4486 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4220 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 4487 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4221 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 4488 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4361,8 +4628,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4361 } 4628 }
4362 4629
4363 if (reason == TASK_SWITCH_IRET) { 4630 if (reason == TASK_SWITCH_IRET) {
4364 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4631 u32 eflags = kvm_get_rflags(vcpu);
4365 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4632 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4366 } 4633 }
4367 4634
4368 /* set back link to prev task only if NT bit is set in eflags 4635 /* set back link to prev task only if NT bit is set in eflags
@@ -4370,11 +4637,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4370 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 4637 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4371 old_tss_sel = 0xffff; 4638 old_tss_sel = 0xffff;
4372 4639
4373 /* set back link to prev task only if NT bit is set in eflags
4374 note that old_tss_sel is not used afetr this point */
4375 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4376 old_tss_sel = 0xffff;
4377
4378 if (nseg_desc.type & 8) 4640 if (nseg_desc.type & 8)
4379 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 4641 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4380 old_tss_base, &nseg_desc); 4642 old_tss_base, &nseg_desc);
@@ -4383,8 +4645,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4383 old_tss_base, &nseg_desc); 4645 old_tss_base, &nseg_desc);
4384 4646
4385 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4647 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4386 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4648 u32 eflags = kvm_get_rflags(vcpu);
4387 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 4649 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4388 } 4650 }
4389 4651
4390 if (reason != TASK_SWITCH_IRET) { 4652 if (reason != TASK_SWITCH_IRET) {
@@ -4436,8 +4698,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4436 4698
4437 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 4699 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4438 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 4700 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4439 if (!is_long_mode(vcpu) && is_pae(vcpu)) 4701 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4440 load_pdptrs(vcpu, vcpu->arch.cr3); 4702 load_pdptrs(vcpu, vcpu->arch.cr3);
4703 mmu_reset_needed = 1;
4704 }
4441 4705
4442 if (mmu_reset_needed) 4706 if (mmu_reset_needed)
4443 kvm_mmu_reset_context(vcpu); 4707 kvm_mmu_reset_context(vcpu);
@@ -4478,12 +4742,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4478int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 4742int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4479 struct kvm_guest_debug *dbg) 4743 struct kvm_guest_debug *dbg)
4480{ 4744{
4745 unsigned long rflags;
4481 int i, r; 4746 int i, r;
4482 4747
4483 vcpu_load(vcpu); 4748 vcpu_load(vcpu);
4484 4749
4485 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 4750 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4486 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 4751 r = -EBUSY;
4752 if (vcpu->arch.exception.pending)
4753 goto unlock_out;
4754 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4755 kvm_queue_exception(vcpu, DB_VECTOR);
4756 else
4757 kvm_queue_exception(vcpu, BP_VECTOR);
4758 }
4759
4760 /*
4761 * Read rflags as long as potentially injected trace flags are still
4762 * filtered out.
4763 */
4764 rflags = kvm_get_rflags(vcpu);
4765
4766 vcpu->guest_debug = dbg->control;
4767 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4768 vcpu->guest_debug = 0;
4769
4770 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4487 for (i = 0; i < KVM_NR_DB_REGS; ++i) 4771 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4488 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 4772 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4489 vcpu->arch.switch_db_regs = 4773 vcpu->arch.switch_db_regs =
@@ -4494,13 +4778,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4494 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4778 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4495 } 4779 }
4496 4780
4497 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 4781 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
4782 vcpu->arch.singlestep_cs =
4783 get_segment_selector(vcpu, VCPU_SREG_CS);
4784 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
4785 }
4786
4787 /*
4788 * Trigger an rflags update that will inject or remove the trace
4789 * flags.
4790 */
4791 kvm_set_rflags(vcpu, rflags);
4792
4793 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4498 4794
4499 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 4795 r = 0;
4500 kvm_queue_exception(vcpu, DB_VECTOR);
4501 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4502 kvm_queue_exception(vcpu, BP_VECTOR);
4503 4796
4797unlock_out:
4504 vcpu_put(vcpu); 4798 vcpu_put(vcpu);
4505 4799
4506 return r; 4800 return r;
@@ -4701,14 +4995,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4701 return kvm_x86_ops->vcpu_reset(vcpu); 4995 return kvm_x86_ops->vcpu_reset(vcpu);
4702} 4996}
4703 4997
4704void kvm_arch_hardware_enable(void *garbage) 4998int kvm_arch_hardware_enable(void *garbage)
4705{ 4999{
4706 kvm_x86_ops->hardware_enable(garbage); 5000 /*
5001 * Since this may be called from a hotplug notifcation,
5002 * we can't get the CPU frequency directly.
5003 */
5004 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5005 int cpu = raw_smp_processor_id();
5006 per_cpu(cpu_tsc_khz, cpu) = 0;
5007 }
5008
5009 kvm_shared_msr_cpu_online();
5010
5011 return kvm_x86_ops->hardware_enable(garbage);
4707} 5012}
4708 5013
4709void kvm_arch_hardware_disable(void *garbage) 5014void kvm_arch_hardware_disable(void *garbage)
4710{ 5015{
4711 kvm_x86_ops->hardware_disable(garbage); 5016 kvm_x86_ops->hardware_disable(garbage);
5017 drop_user_return_notifiers(garbage);
4712} 5018}
4713 5019
4714int kvm_arch_hardware_setup(void) 5020int kvm_arch_hardware_setup(void)
@@ -4946,8 +5252,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4946 return kvm_x86_ops->interrupt_allowed(vcpu); 5252 return kvm_x86_ops->interrupt_allowed(vcpu);
4947} 5253}
4948 5254
5255unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5256{
5257 unsigned long rflags;
5258
5259 rflags = kvm_x86_ops->get_rflags(vcpu);
5260 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5261 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5262 return rflags;
5263}
5264EXPORT_SYMBOL_GPL(kvm_get_rflags);
5265
5266void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5267{
5268 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5269 vcpu->arch.singlestep_cs ==
5270 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5271 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5272 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5273 kvm_x86_ops->set_rflags(vcpu, rflags);
5274}
5275EXPORT_SYMBOL_GPL(kvm_set_rflags);
5276
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5277EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5278EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5279EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5280EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5281EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5282EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5283EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5284EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5285EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5286EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5287EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000000..8df89f0a3fe6
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 85f5db95c60f..a2d6472895fb 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,12 +2,25 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
6inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
7quiet_cmd_inat_tables = GEN $@
8 cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
9
10$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
11 $(call cmd,inat_tables)
12
13$(obj)/inat.o: $(obj)/inat-tables.c
14
15clean-files := inat-tables.c
16
5obj-$(CONFIG_SMP) := msr.o 17obj-$(CONFIG_SMP) := msr.o
6 18
7lib-y := delay.o 19lib-y := delay.o
8lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-y += insn.o inat.o
11 24
12obj-y += msr-reg.o msr-reg-export.o 25obj-y += msr-reg.o msr-reg-export.o
13 26
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..cf889d4e076a 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -65,7 +65,7 @@
65 .endm 65 .endm
66 66
67/* Standard copy_to_user with segment limit checking */ 67/* Standard copy_to_user with segment limit checking */
68ENTRY(copy_to_user) 68ENTRY(_copy_to_user)
69 CFI_STARTPROC 69 CFI_STARTPROC
70 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
71 movq %rdi,%rcx 71 movq %rdi,%rcx
@@ -75,10 +75,10 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user) 78ENDPROC(_copy_to_user)
79 79
80/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
81ENTRY(copy_from_user) 81ENTRY(_copy_from_user)
82 CFI_STARTPROC 82 CFI_STARTPROC
83 GET_THREAD_INFO(%rax) 83 GET_THREAD_INFO(%rax)
84 movq %rsi,%rcx 84 movq %rsi,%rcx
@@ -88,7 +88,7 @@ ENTRY(copy_from_user)
88 jae bad_from_user 88 jae bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
90 CFI_ENDPROC 90 CFI_ENDPROC
91ENDPROC(copy_from_user) 91ENDPROC(_copy_from_user)
92 92
93ENTRY(copy_user_generic) 93ENTRY(copy_user_generic)
94 CFI_STARTPROC 94 CFI_STARTPROC
@@ -96,12 +96,6 @@ ENTRY(copy_user_generic)
96 CFI_ENDPROC 96 CFI_ENDPROC
97ENDPROC(copy_user_generic) 97ENDPROC(copy_user_generic)
98 98
99ENTRY(__copy_from_user_inatomic)
100 CFI_STARTPROC
101 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
102 CFI_ENDPROC
103ENDPROC(__copy_from_user_inatomic)
104
105 .section .fixup,"ax" 99 .section .fixup,"ax"
106 /* must zero dest */ 100 /* must zero dest */
107ENTRY(bad_from_user) 101ENTRY(bad_from_user)
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000000..46fc4ee09fc4
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,90 @@
1/*
2 * x86 instruction attribute tables
3 *
4 * Written by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 */
21#include <asm/insn.h>
22
23/* Attribute tables are generated from opcode map */
24#include "inat-tables.c"
25
26/* Attribute search APIs */
27insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
28{
29 return inat_primary_table[opcode];
30}
31
32insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
33 insn_attr_t esc_attr)
34{
35 const insn_attr_t *table;
36 insn_attr_t lpfx_attr;
37 int n, m = 0;
38
39 n = inat_escape_id(esc_attr);
40 if (last_pfx) {
41 lpfx_attr = inat_get_opcode_attribute(last_pfx);
42 m = inat_last_prefix_id(lpfx_attr);
43 }
44 table = inat_escape_tables[n][0];
45 if (!table)
46 return 0;
47 if (inat_has_variant(table[opcode]) && m) {
48 table = inat_escape_tables[n][m];
49 if (!table)
50 return 0;
51 }
52 return table[opcode];
53}
54
55insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
56 insn_attr_t grp_attr)
57{
58 const insn_attr_t *table;
59 insn_attr_t lpfx_attr;
60 int n, m = 0;
61
62 n = inat_group_id(grp_attr);
63 if (last_pfx) {
64 lpfx_attr = inat_get_opcode_attribute(last_pfx);
65 m = inat_last_prefix_id(lpfx_attr);
66 }
67 table = inat_group_tables[n][0];
68 if (!table)
69 return inat_group_common_attribute(grp_attr);
70 if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
71 table = inat_group_tables[n][m];
72 if (!table)
73 return inat_group_common_attribute(grp_attr);
74 }
75 return table[X86_MODRM_REG(modrm)] |
76 inat_group_common_attribute(grp_attr);
77}
78
79insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
80 insn_byte_t vex_p)
81{
82 const insn_attr_t *table;
83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
84 return 0;
85 table = inat_avx_tables[vex_m][vex_p];
86 if (!table)
87 return 0;
88 return table[opcode];
89}
90
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 000000000000..9f33b984d0ef
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,516 @@
1/*
2 * x86 instruction analysis
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004, 2009
19 */
20
21#include <linux/string.h>
22#include <asm/inat.h>
23#include <asm/insn.h>
24
25#define get_next(t, insn) \
26 ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
27
28#define peek_next(t, insn) \
29 ({t r; r = *(t*)insn->next_byte; r; })
30
31#define peek_nbyte_next(t, insn, n) \
32 ({t r; r = *(t*)((insn)->next_byte + n); r; })
33
34/**
35 * insn_init() - initialize struct insn
36 * @insn: &struct insn to be initialized
37 * @kaddr: address (in kernel memory) of instruction (or copy thereof)
38 * @x86_64: !0 for 64-bit kernel or 64-bit app
39 */
40void insn_init(struct insn *insn, const void *kaddr, int x86_64)
41{
42 memset(insn, 0, sizeof(*insn));
43 insn->kaddr = kaddr;
44 insn->next_byte = kaddr;
45 insn->x86_64 = x86_64 ? 1 : 0;
46 insn->opnd_bytes = 4;
47 if (x86_64)
48 insn->addr_bytes = 8;
49 else
50 insn->addr_bytes = 4;
51}
52
53/**
54 * insn_get_prefixes - scan x86 instruction prefix bytes
55 * @insn: &struct insn containing instruction
56 *
57 * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
58 * to point to the (first) opcode. No effect if @insn->prefixes.got
59 * is already set.
60 */
61void insn_get_prefixes(struct insn *insn)
62{
63 struct insn_field *prefixes = &insn->prefixes;
64 insn_attr_t attr;
65 insn_byte_t b, lb;
66 int i, nb;
67
68 if (prefixes->got)
69 return;
70
71 nb = 0;
72 lb = 0;
73 b = peek_next(insn_byte_t, insn);
74 attr = inat_get_opcode_attribute(b);
75 while (inat_is_legacy_prefix(attr)) {
76 /* Skip if same prefix */
77 for (i = 0; i < nb; i++)
78 if (prefixes->bytes[i] == b)
79 goto found;
80 if (nb == 4)
81 /* Invalid instruction */
82 break;
83 prefixes->bytes[nb++] = b;
84 if (inat_is_address_size_prefix(attr)) {
85 /* address size switches 2/4 or 4/8 */
86 if (insn->x86_64)
87 insn->addr_bytes ^= 12;
88 else
89 insn->addr_bytes ^= 6;
90 } else if (inat_is_operand_size_prefix(attr)) {
91 /* oprand size switches 2/4 */
92 insn->opnd_bytes ^= 6;
93 }
94found:
95 prefixes->nbytes++;
96 insn->next_byte++;
97 lb = b;
98 b = peek_next(insn_byte_t, insn);
99 attr = inat_get_opcode_attribute(b);
100 }
101 /* Set the last prefix */
102 if (lb && lb != insn->prefixes.bytes[3]) {
103 if (unlikely(insn->prefixes.bytes[3])) {
104 /* Swap the last prefix */
105 b = insn->prefixes.bytes[3];
106 for (i = 0; i < nb; i++)
107 if (prefixes->bytes[i] == lb)
108 prefixes->bytes[i] = b;
109 }
110 insn->prefixes.bytes[3] = lb;
111 }
112
113 /* Decode REX prefix */
114 if (insn->x86_64) {
115 b = peek_next(insn_byte_t, insn);
116 attr = inat_get_opcode_attribute(b);
117 if (inat_is_rex_prefix(attr)) {
118 insn->rex_prefix.value = b;
119 insn->rex_prefix.nbytes = 1;
120 insn->next_byte++;
121 if (X86_REX_W(b))
122 /* REX.W overrides opnd_size */
123 insn->opnd_bytes = 8;
124 }
125 }
126 insn->rex_prefix.got = 1;
127
128 /* Decode VEX prefix */
129 b = peek_next(insn_byte_t, insn);
130 attr = inat_get_opcode_attribute(b);
131 if (inat_is_vex_prefix(attr)) {
132 insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
133 if (!insn->x86_64) {
134 /*
135 * In 32-bits mode, if the [7:6] bits (mod bits of
136 * ModRM) on the second byte are not 11b, it is
137 * LDS or LES.
138 */
139 if (X86_MODRM_MOD(b2) != 3)
140 goto vex_end;
141 }
142 insn->vex_prefix.bytes[0] = b;
143 insn->vex_prefix.bytes[1] = b2;
144 if (inat_is_vex3_prefix(attr)) {
145 b2 = peek_nbyte_next(insn_byte_t, insn, 2);
146 insn->vex_prefix.bytes[2] = b2;
147 insn->vex_prefix.nbytes = 3;
148 insn->next_byte += 3;
149 if (insn->x86_64 && X86_VEX_W(b2))
150 /* VEX.W overrides opnd_size */
151 insn->opnd_bytes = 8;
152 } else {
153 insn->vex_prefix.nbytes = 2;
154 insn->next_byte += 2;
155 }
156 }
157vex_end:
158 insn->vex_prefix.got = 1;
159
160 prefixes->got = 1;
161 return;
162}
163
164/**
165 * insn_get_opcode - collect opcode(s)
166 * @insn: &struct insn containing instruction
167 *
168 * Populates @insn->opcode, updates @insn->next_byte to point past the
169 * opcode byte(s), and set @insn->attr (except for groups).
170 * If necessary, first collects any preceding (prefix) bytes.
171 * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
172 * is already 1.
173 */
174void insn_get_opcode(struct insn *insn)
175{
176 struct insn_field *opcode = &insn->opcode;
177 insn_byte_t op, pfx;
178 if (opcode->got)
179 return;
180 if (!insn->prefixes.got)
181 insn_get_prefixes(insn);
182
183 /* Get first opcode */
184 op = get_next(insn_byte_t, insn);
185 opcode->bytes[0] = op;
186 opcode->nbytes = 1;
187
188 /* Check if there is VEX prefix or not */
189 if (insn_is_avx(insn)) {
190 insn_byte_t m, p;
191 m = insn_vex_m_bits(insn);
192 p = insn_vex_p_bits(insn);
193 insn->attr = inat_get_avx_attribute(op, m, p);
194 if (!inat_accept_vex(insn->attr))
195 insn->attr = 0; /* This instruction is bad */
196 goto end; /* VEX has only 1 byte for opcode */
197 }
198
199 insn->attr = inat_get_opcode_attribute(op);
200 while (inat_is_escape(insn->attr)) {
201 /* Get escaped opcode */
202 op = get_next(insn_byte_t, insn);
203 opcode->bytes[opcode->nbytes++] = op;
204 pfx = insn_last_prefix(insn);
205 insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
206 }
207 if (inat_must_vex(insn->attr))
208 insn->attr = 0; /* This instruction is bad */
209end:
210 opcode->got = 1;
211}
212
213/**
214 * insn_get_modrm - collect ModRM byte, if any
215 * @insn: &struct insn containing instruction
216 *
217 * Populates @insn->modrm and updates @insn->next_byte to point past the
218 * ModRM byte, if any. If necessary, first collects the preceding bytes
219 * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
220 */
221void insn_get_modrm(struct insn *insn)
222{
223 struct insn_field *modrm = &insn->modrm;
224 insn_byte_t pfx, mod;
225 if (modrm->got)
226 return;
227 if (!insn->opcode.got)
228 insn_get_opcode(insn);
229
230 if (inat_has_modrm(insn->attr)) {
231 mod = get_next(insn_byte_t, insn);
232 modrm->value = mod;
233 modrm->nbytes = 1;
234 if (inat_is_group(insn->attr)) {
235 pfx = insn_last_prefix(insn);
236 insn->attr = inat_get_group_attribute(mod, pfx,
237 insn->attr);
238 }
239 }
240
241 if (insn->x86_64 && inat_is_force64(insn->attr))
242 insn->opnd_bytes = 8;
243 modrm->got = 1;
244}
245
246
247/**
248 * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
249 * @insn: &struct insn containing instruction
250 *
251 * If necessary, first collects the instruction up to and including the
252 * ModRM byte. No effect if @insn->x86_64 is 0.
253 */
254int insn_rip_relative(struct insn *insn)
255{
256 struct insn_field *modrm = &insn->modrm;
257
258 if (!insn->x86_64)
259 return 0;
260 if (!modrm->got)
261 insn_get_modrm(insn);
262 /*
263 * For rip-relative instructions, the mod field (top 2 bits)
264 * is zero and the r/m field (bottom 3 bits) is 0x5.
265 */
266 return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
267}
268
269/**
270 * insn_get_sib() - Get the SIB byte of instruction
271 * @insn: &struct insn containing instruction
272 *
273 * If necessary, first collects the instruction up to and including the
274 * ModRM byte.
275 */
276void insn_get_sib(struct insn *insn)
277{
278 insn_byte_t modrm;
279
280 if (insn->sib.got)
281 return;
282 if (!insn->modrm.got)
283 insn_get_modrm(insn);
284 if (insn->modrm.nbytes) {
285 modrm = (insn_byte_t)insn->modrm.value;
286 if (insn->addr_bytes != 2 &&
287 X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
288 insn->sib.value = get_next(insn_byte_t, insn);
289 insn->sib.nbytes = 1;
290 }
291 }
292 insn->sib.got = 1;
293}
294
295
296/**
297 * insn_get_displacement() - Get the displacement of instruction
298 * @insn: &struct insn containing instruction
299 *
300 * If necessary, first collects the instruction up to and including the
301 * SIB byte.
302 * Displacement value is sign-expanded.
303 */
304void insn_get_displacement(struct insn *insn)
305{
306 insn_byte_t mod, rm, base;
307
308 if (insn->displacement.got)
309 return;
310 if (!insn->sib.got)
311 insn_get_sib(insn);
312 if (insn->modrm.nbytes) {
313 /*
314 * Interpreting the modrm byte:
315 * mod = 00 - no displacement fields (exceptions below)
316 * mod = 01 - 1-byte displacement field
317 * mod = 10 - displacement field is 4 bytes, or 2 bytes if
318 * address size = 2 (0x67 prefix in 32-bit mode)
319 * mod = 11 - no memory operand
320 *
321 * If address size = 2...
322 * mod = 00, r/m = 110 - displacement field is 2 bytes
323 *
324 * If address size != 2...
325 * mod != 11, r/m = 100 - SIB byte exists
326 * mod = 00, SIB base = 101 - displacement field is 4 bytes
327 * mod = 00, r/m = 101 - rip-relative addressing, displacement
328 * field is 4 bytes
329 */
330 mod = X86_MODRM_MOD(insn->modrm.value);
331 rm = X86_MODRM_RM(insn->modrm.value);
332 base = X86_SIB_BASE(insn->sib.value);
333 if (mod == 3)
334 goto out;
335 if (mod == 1) {
336 insn->displacement.value = get_next(char, insn);
337 insn->displacement.nbytes = 1;
338 } else if (insn->addr_bytes == 2) {
339 if ((mod == 0 && rm == 6) || mod == 2) {
340 insn->displacement.value =
341 get_next(short, insn);
342 insn->displacement.nbytes = 2;
343 }
344 } else {
345 if ((mod == 0 && rm == 5) || mod == 2 ||
346 (mod == 0 && base == 5)) {
347 insn->displacement.value = get_next(int, insn);
348 insn->displacement.nbytes = 4;
349 }
350 }
351 }
352out:
353 insn->displacement.got = 1;
354}
355
356/* Decode moffset16/32/64 */
357static void __get_moffset(struct insn *insn)
358{
359 switch (insn->addr_bytes) {
360 case 2:
361 insn->moffset1.value = get_next(short, insn);
362 insn->moffset1.nbytes = 2;
363 break;
364 case 4:
365 insn->moffset1.value = get_next(int, insn);
366 insn->moffset1.nbytes = 4;
367 break;
368 case 8:
369 insn->moffset1.value = get_next(int, insn);
370 insn->moffset1.nbytes = 4;
371 insn->moffset2.value = get_next(int, insn);
372 insn->moffset2.nbytes = 4;
373 break;
374 }
375 insn->moffset1.got = insn->moffset2.got = 1;
376}
377
378/* Decode imm v32(Iz) */
379static void __get_immv32(struct insn *insn)
380{
381 switch (insn->opnd_bytes) {
382 case 2:
383 insn->immediate.value = get_next(short, insn);
384 insn->immediate.nbytes = 2;
385 break;
386 case 4:
387 case 8:
388 insn->immediate.value = get_next(int, insn);
389 insn->immediate.nbytes = 4;
390 break;
391 }
392}
393
394/* Decode imm v64(Iv/Ov) */
395static void __get_immv(struct insn *insn)
396{
397 switch (insn->opnd_bytes) {
398 case 2:
399 insn->immediate1.value = get_next(short, insn);
400 insn->immediate1.nbytes = 2;
401 break;
402 case 4:
403 insn->immediate1.value = get_next(int, insn);
404 insn->immediate1.nbytes = 4;
405 break;
406 case 8:
407 insn->immediate1.value = get_next(int, insn);
408 insn->immediate1.nbytes = 4;
409 insn->immediate2.value = get_next(int, insn);
410 insn->immediate2.nbytes = 4;
411 break;
412 }
413 insn->immediate1.got = insn->immediate2.got = 1;
414}
415
416/* Decode ptr16:16/32(Ap) */
417static void __get_immptr(struct insn *insn)
418{
419 switch (insn->opnd_bytes) {
420 case 2:
421 insn->immediate1.value = get_next(short, insn);
422 insn->immediate1.nbytes = 2;
423 break;
424 case 4:
425 insn->immediate1.value = get_next(int, insn);
426 insn->immediate1.nbytes = 4;
427 break;
428 case 8:
429 /* ptr16:64 is not exist (no segment) */
430 return;
431 }
432 insn->immediate2.value = get_next(unsigned short, insn);
433 insn->immediate2.nbytes = 2;
434 insn->immediate1.got = insn->immediate2.got = 1;
435}
436
437/**
438 * insn_get_immediate() - Get the immediates of instruction
439 * @insn: &struct insn containing instruction
440 *
441 * If necessary, first collects the instruction up to and including the
442 * displacement bytes.
443 * Basically, most of immediates are sign-expanded. Unsigned-value can be
444 * get by bit masking with ((1 << (nbytes * 8)) - 1)
445 */
446void insn_get_immediate(struct insn *insn)
447{
448 if (insn->immediate.got)
449 return;
450 if (!insn->displacement.got)
451 insn_get_displacement(insn);
452
453 if (inat_has_moffset(insn->attr)) {
454 __get_moffset(insn);
455 goto done;
456 }
457
458 if (!inat_has_immediate(insn->attr))
459 /* no immediates */
460 goto done;
461
462 switch (inat_immediate_size(insn->attr)) {
463 case INAT_IMM_BYTE:
464 insn->immediate.value = get_next(char, insn);
465 insn->immediate.nbytes = 1;
466 break;
467 case INAT_IMM_WORD:
468 insn->immediate.value = get_next(short, insn);
469 insn->immediate.nbytes = 2;
470 break;
471 case INAT_IMM_DWORD:
472 insn->immediate.value = get_next(int, insn);
473 insn->immediate.nbytes = 4;
474 break;
475 case INAT_IMM_QWORD:
476 insn->immediate1.value = get_next(int, insn);
477 insn->immediate1.nbytes = 4;
478 insn->immediate2.value = get_next(int, insn);
479 insn->immediate2.nbytes = 4;
480 break;
481 case INAT_IMM_PTR:
482 __get_immptr(insn);
483 break;
484 case INAT_IMM_VWORD32:
485 __get_immv32(insn);
486 break;
487 case INAT_IMM_VWORD:
488 __get_immv(insn);
489 break;
490 default:
491 break;
492 }
493 if (inat_has_second_immediate(insn->attr)) {
494 insn->immediate2.value = get_next(char, insn);
495 insn->immediate2.nbytes = 1;
496 }
497done:
498 insn->immediate.got = 1;
499}
500
501/**
502 * insn_get_length() - Get the length of instruction
503 * @insn: &struct insn containing instruction
504 *
505 * If necessary, first collects the instruction up to and including the
506 * immediates bytes.
507 */
508void insn_get_length(struct insn *insn)
509{
510 if (insn->length)
511 return;
512 if (!insn->immediate.got)
513 insn_get_immediate(insn);
514 insn->length = (unsigned char)((unsigned long)insn->next_byte
515 - (unsigned long)insn->kaddr);
516}
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 33a1e3ca22d8..41628b104b9e 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -71,14 +71,9 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
71} 71}
72EXPORT_SYMBOL(wrmsr_on_cpu); 72EXPORT_SYMBOL(wrmsr_on_cpu);
73 73
74/* rdmsr on a bunch of CPUs 74static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
75 * 75 struct msr *msrs,
76 * @mask: which CPUs 76 void (*msr_func) (void *info))
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{ 77{
83 struct msr_info rv; 78 struct msr_info rv;
84 int this_cpu; 79 int this_cpu;
@@ -92,11 +87,23 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
92 this_cpu = get_cpu(); 87 this_cpu = get_cpu();
93 88
94 if (cpumask_test_cpu(this_cpu, mask)) 89 if (cpumask_test_cpu(this_cpu, mask))
95 __rdmsr_on_cpu(&rv); 90 msr_func(&rv);
96 91
97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); 92 smp_call_function_many(mask, msr_func, &rv, 1);
98 put_cpu(); 93 put_cpu();
99} 94}
95
96/* rdmsr on a bunch of CPUs
97 *
98 * @mask: which CPUs
99 * @msr_no: which MSR
100 * @msrs: array of MSR values
101 *
102 */
103void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
104{
105 __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
106}
100EXPORT_SYMBOL(rdmsr_on_cpus); 107EXPORT_SYMBOL(rdmsr_on_cpus);
101 108
102/* 109/*
@@ -107,24 +114,9 @@ EXPORT_SYMBOL(rdmsr_on_cpus);
107 * @msrs: array of MSR values 114 * @msrs: array of MSR values
108 * 115 *
109 */ 116 */
110void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) 117void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
111{ 118{
112 struct msr_info rv; 119 __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
113 int this_cpu;
114
115 memset(&rv, 0, sizeof(rv));
116
117 rv.off = cpumask_first(mask);
118 rv.msrs = msrs;
119 rv.msr_no = msr_no;
120
121 this_cpu = get_cpu();
122
123 if (cpumask_test_cpu(this_cpu, mask))
124 __wrmsr_on_cpu(&rv);
125
126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
127 put_cpu();
128} 120}
129EXPORT_SYMBOL(wrmsr_on_cpus); 121EXPORT_SYMBOL(wrmsr_on_cpus);
130 122
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462acc..e218d5df85ff 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
874 * data to the requested size using zero bytes. 874 * data to the requested size using zero bytes.
875 */ 875 */
876unsigned long 876unsigned long
877copy_from_user(void *to, const void __user *from, unsigned long n) 877_copy_from_user(void *to, const void __user *from, unsigned long n)
878{ 878{
879 if (access_ok(VERIFY_READ, from, n)) 879 if (access_ok(VERIFY_READ, from, n))
880 n = __copy_from_user(to, from, n); 880 n = __copy_from_user(to, from, n);
@@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
882 memset(to, 0, n); 882 memset(to, 0, n);
883 return n; 883 return n;
884} 884}
885EXPORT_SYMBOL(copy_from_user); 885EXPORT_SYMBOL(_copy_from_user);
886
887void copy_from_user_overflow(void)
888{
889 WARN(1, "Buffer overflow detected!\n");
890}
891EXPORT_SYMBOL(copy_from_user_overflow);
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 000000000000..a793da5e560e
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,893 @@
1# x86 Opcode Maps
2#
3#<Opcode maps>
4# Table: table-name
5# Referrer: escaped-name
6# AVXcode: avx-code
7# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
8# (or)
9# opcode: escape # escaped-name
10# EndTable
11#
12#<group maps>
13# GrpTable: GrpXXX
14# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
15# EndTable
16#
17# AVX Superscripts
18# (VEX): this opcode can accept VEX prefix.
19# (oVEX): this opcode requires VEX prefix.
20# (o128): this opcode only supports 128bit VEX.
21# (o256): this opcode only supports 256bit VEX.
22#
23
24Table: one byte opcode
25Referrer:
26AVXcode:
27# 0x00 - 0x0f
2800: ADD Eb,Gb
2901: ADD Ev,Gv
3002: ADD Gb,Eb
3103: ADD Gv,Ev
3204: ADD AL,Ib
3305: ADD rAX,Iz
3406: PUSH ES (i64)
3507: POP ES (i64)
3608: OR Eb,Gb
3709: OR Ev,Gv
380a: OR Gb,Eb
390b: OR Gv,Ev
400c: OR AL,Ib
410d: OR rAX,Iz
420e: PUSH CS (i64)
430f: escape # 2-byte escape
44# 0x10 - 0x1f
4510: ADC Eb,Gb
4611: ADC Ev,Gv
4712: ADC Gb,Eb
4813: ADC Gv,Ev
4914: ADC AL,Ib
5015: ADC rAX,Iz
5116: PUSH SS (i64)
5217: POP SS (i64)
5318: SBB Eb,Gb
5419: SBB Ev,Gv
551a: SBB Gb,Eb
561b: SBB Gv,Ev
571c: SBB AL,Ib
581d: SBB rAX,Iz
591e: PUSH DS (i64)
601f: POP DS (i64)
61# 0x20 - 0x2f
6220: AND Eb,Gb
6321: AND Ev,Gv
6422: AND Gb,Eb
6523: AND Gv,Ev
6624: AND AL,Ib
6725: AND rAx,Iz
6826: SEG=ES (Prefix)
6927: DAA (i64)
7028: SUB Eb,Gb
7129: SUB Ev,Gv
722a: SUB Gb,Eb
732b: SUB Gv,Ev
742c: SUB AL,Ib
752d: SUB rAX,Iz
762e: SEG=CS (Prefix)
772f: DAS (i64)
78# 0x30 - 0x3f
7930: XOR Eb,Gb
8031: XOR Ev,Gv
8132: XOR Gb,Eb
8233: XOR Gv,Ev
8334: XOR AL,Ib
8435: XOR rAX,Iz
8536: SEG=SS (Prefix)
8637: AAA (i64)
8738: CMP Eb,Gb
8839: CMP Ev,Gv
893a: CMP Gb,Eb
903b: CMP Gv,Ev
913c: CMP AL,Ib
923d: CMP rAX,Iz
933e: SEG=DS (Prefix)
943f: AAS (i64)
95# 0x40 - 0x4f
9640: INC eAX (i64) | REX (o64)
9741: INC eCX (i64) | REX.B (o64)
9842: INC eDX (i64) | REX.X (o64)
9943: INC eBX (i64) | REX.XB (o64)
10044: INC eSP (i64) | REX.R (o64)
10145: INC eBP (i64) | REX.RB (o64)
10246: INC eSI (i64) | REX.RX (o64)
10347: INC eDI (i64) | REX.RXB (o64)
10448: DEC eAX (i64) | REX.W (o64)
10549: DEC eCX (i64) | REX.WB (o64)
1064a: DEC eDX (i64) | REX.WX (o64)
1074b: DEC eBX (i64) | REX.WXB (o64)
1084c: DEC eSP (i64) | REX.WR (o64)
1094d: DEC eBP (i64) | REX.WRB (o64)
1104e: DEC eSI (i64) | REX.WRX (o64)
1114f: DEC eDI (i64) | REX.WRXB (o64)
112# 0x50 - 0x5f
11350: PUSH rAX/r8 (d64)
11451: PUSH rCX/r9 (d64)
11552: PUSH rDX/r10 (d64)
11653: PUSH rBX/r11 (d64)
11754: PUSH rSP/r12 (d64)
11855: PUSH rBP/r13 (d64)
11956: PUSH rSI/r14 (d64)
12057: PUSH rDI/r15 (d64)
12158: POP rAX/r8 (d64)
12259: POP rCX/r9 (d64)
1235a: POP rDX/r10 (d64)
1245b: POP rBX/r11 (d64)
1255c: POP rSP/r12 (d64)
1265d: POP rBP/r13 (d64)
1275e: POP rSI/r14 (d64)
1285f: POP rDI/r15 (d64)
129# 0x60 - 0x6f
13060: PUSHA/PUSHAD (i64)
13161: POPA/POPAD (i64)
13262: BOUND Gv,Ma (i64)
13363: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
13464: SEG=FS (Prefix)
13565: SEG=GS (Prefix)
13666: Operand-Size (Prefix)
13767: Address-Size (Prefix)
13868: PUSH Iz (d64)
13969: IMUL Gv,Ev,Iz
1406a: PUSH Ib (d64)
1416b: IMUL Gv,Ev,Ib
1426c: INS/INSB Yb,DX
1436d: INS/INSW/INSD Yz,DX
1446e: OUTS/OUTSB DX,Xb
1456f: OUTS/OUTSW/OUTSD DX,Xz
146# 0x70 - 0x7f
14770: JO Jb
14871: JNO Jb
14972: JB/JNAE/JC Jb
15073: JNB/JAE/JNC Jb
15174: JZ/JE Jb
15275: JNZ/JNE Jb
15376: JBE/JNA Jb
15477: JNBE/JA Jb
15578: JS Jb
15679: JNS Jb
1577a: JP/JPE Jb
1587b: JNP/JPO Jb
1597c: JL/JNGE Jb
1607d: JNL/JGE Jb
1617e: JLE/JNG Jb
1627f: JNLE/JG Jb
163# 0x80 - 0x8f
16480: Grp1 Eb,Ib (1A)
16581: Grp1 Ev,Iz (1A)
16682: Grp1 Eb,Ib (1A),(i64)
16783: Grp1 Ev,Ib (1A)
16884: TEST Eb,Gb
16985: TEST Ev,Gv
17086: XCHG Eb,Gb
17187: XCHG Ev,Gv
17288: MOV Eb,Gb
17389: MOV Ev,Gv
1748a: MOV Gb,Eb
1758b: MOV Gv,Ev
1768c: MOV Ev,Sw
1778d: LEA Gv,M
1788e: MOV Sw,Ew
1798f: Grp1A (1A) | POP Ev (d64)
180# 0x90 - 0x9f
18190: NOP | PAUSE (F3) | XCHG r8,rAX
18291: XCHG rCX/r9,rAX
18392: XCHG rDX/r10,rAX
18493: XCHG rBX/r11,rAX
18594: XCHG rSP/r12,rAX
18695: XCHG rBP/r13,rAX
18796: XCHG rSI/r14,rAX
18897: XCHG rDI/r15,rAX
18998: CBW/CWDE/CDQE
19099: CWD/CDQ/CQO
1919a: CALLF Ap (i64)
1929b: FWAIT/WAIT
1939c: PUSHF/D/Q Fv (d64)
1949d: POPF/D/Q Fv (d64)
1959e: SAHF
1969f: LAHF
197# 0xa0 - 0xaf
198a0: MOV AL,Ob
199a1: MOV rAX,Ov
200a2: MOV Ob,AL
201a3: MOV Ov,rAX
202a4: MOVS/B Xb,Yb
203a5: MOVS/W/D/Q Xv,Yv
204a6: CMPS/B Xb,Yb
205a7: CMPS/W/D Xv,Yv
206a8: TEST AL,Ib
207a9: TEST rAX,Iz
208aa: STOS/B Yb,AL
209ab: STOS/W/D/Q Yv,rAX
210ac: LODS/B AL,Xb
211ad: LODS/W/D/Q rAX,Xv
212ae: SCAS/B AL,Yb
213af: SCAS/W/D/Q rAX,Xv
214# 0xb0 - 0xbf
215b0: MOV AL/R8L,Ib
216b1: MOV CL/R9L,Ib
217b2: MOV DL/R10L,Ib
218b3: MOV BL/R11L,Ib
219b4: MOV AH/R12L,Ib
220b5: MOV CH/R13L,Ib
221b6: MOV DH/R14L,Ib
222b7: MOV BH/R15L,Ib
223b8: MOV rAX/r8,Iv
224b9: MOV rCX/r9,Iv
225ba: MOV rDX/r10,Iv
226bb: MOV rBX/r11,Iv
227bc: MOV rSP/r12,Iv
228bd: MOV rBP/r13,Iv
229be: MOV rSI/r14,Iv
230bf: MOV rDI/r15,Iv
231# 0xc0 - 0xcf
232c0: Grp2 Eb,Ib (1A)
233c1: Grp2 Ev,Ib (1A)
234c2: RETN Iw (f64)
235c3: RETN
236c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
237c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
238c6: Grp11 Eb,Ib (1A)
239c7: Grp11 Ev,Iz (1A)
240c8: ENTER Iw,Ib
241c9: LEAVE (d64)
242ca: RETF Iw
243cb: RETF
244cc: INT3
245cd: INT Ib
246ce: INTO (i64)
247cf: IRET/D/Q
248# 0xd0 - 0xdf
249d0: Grp2 Eb,1 (1A)
250d1: Grp2 Ev,1 (1A)
251d2: Grp2 Eb,CL (1A)
252d3: Grp2 Ev,CL (1A)
253d4: AAM Ib (i64)
254d5: AAD Ib (i64)
255d6:
256d7: XLAT/XLATB
257d8: ESC
258d9: ESC
259da: ESC
260db: ESC
261dc: ESC
262dd: ESC
263de: ESC
264df: ESC
265# 0xe0 - 0xef
266e0: LOOPNE/LOOPNZ Jb (f64)
267e1: LOOPE/LOOPZ Jb (f64)
268e2: LOOP Jb (f64)
269e3: JrCXZ Jb (f64)
270e4: IN AL,Ib
271e5: IN eAX,Ib
272e6: OUT Ib,AL
273e7: OUT Ib,eAX
274e8: CALL Jz (f64)
275e9: JMP-near Jz (f64)
276ea: JMP-far Ap (i64)
277eb: JMP-short Jb (f64)
278ec: IN AL,DX
279ed: IN eAX,DX
280ee: OUT DX,AL
281ef: OUT DX,eAX
282# 0xf0 - 0xff
283f0: LOCK (Prefix)
284f1:
285f2: REPNE (Prefix)
286f3: REP/REPE (Prefix)
287f4: HLT
288f5: CMC
289f6: Grp3_1 Eb (1A)
290f7: Grp3_2 Ev (1A)
291f8: CLC
292f9: STC
293fa: CLI
294fb: STI
295fc: CLD
296fd: STD
297fe: Grp4 (1A)
298ff: Grp5 (1A)
299EndTable
300
301Table: 2-byte opcode (0x0f)
302Referrer: 2-byte escape
303AVXcode: 1
304# 0x0f 0x00-0x0f
30500: Grp6 (1A)
30601: Grp7 (1A)
30702: LAR Gv,Ew
30803: LSL Gv,Ew
30904:
31005: SYSCALL (o64)
31106: CLTS
31207: SYSRET (o64)
31308: INVD
31409: WBINVD
3150a:
3160b: UD2 (1B)
3170c:
3180d: NOP Ev | GrpP
3190e: FEMMS
320# 3DNow! uses the last imm byte as opcode extension.
3210f: 3DNow! Pq,Qq,Ib
322# 0x0f 0x10-0x1f
32310: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
32411: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
32512: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
32613: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
32714: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
32815: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
32916: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
33017: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
33118: Grp16 (1A)
33219:
3331a:
3341b:
3351c:
3361d:
3371e:
3381f: NOP Ev
339# 0x0f 0x20-0x2f
34020: MOV Rd,Cd
34121: MOV Rd,Dd
34222: MOV Cd,Rd
34323: MOV Dd,Rd
34424:
34525:
34626:
34727:
34828: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
34929: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
3502a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
3512b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
3522c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
3532d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
3542e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
3552f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
356# 0x0f 0x30-0x3f
35730: WRMSR
35831: RDTSC
35932: RDMSR
36033: RDPMC
36134: SYSENTER
36235: SYSEXIT
36336:
36437: GETSEC
36538: escape # 3-byte escape 1
36639:
3673a: escape # 3-byte escape 2
3683b:
3693c:
3703d:
3713e:
3723f:
373# 0x0f 0x40-0x4f
37440: CMOVO Gv,Ev
37541: CMOVNO Gv,Ev
37642: CMOVB/C/NAE Gv,Ev
37743: CMOVAE/NB/NC Gv,Ev
37844: CMOVE/Z Gv,Ev
37945: CMOVNE/NZ Gv,Ev
38046: CMOVBE/NA Gv,Ev
38147: CMOVA/NBE Gv,Ev
38248: CMOVS Gv,Ev
38349: CMOVNS Gv,Ev
3844a: CMOVP/PE Gv,Ev
3854b: CMOVNP/PO Gv,Ev
3864c: CMOVL/NGE Gv,Ev
3874d: CMOVNL/GE Gv,Ev
3884e: CMOVLE/NG Gv,Ev
3894f: CMOVNLE/G Gv,Ev
390# 0x0f 0x50-0x5f
39150: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
39251: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
39352: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
39453: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
39554: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
39655: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
39756: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
39857: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
39958: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
40059: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
4015a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
4025b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
4035c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
4045d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
4055e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
4065f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
407# 0x0f 0x60-0x6f
40860: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
40961: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
41062: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
41163: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
41264: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
41365: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
41466: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
41567: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
41668: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
41769: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
4186a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
4196b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
4206c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
4216d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
4226e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
4236f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
424# 0x0f 0x70-0x7f
42570: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
42671: Grp12 (1A)
42772: Grp13 (1A)
42873: Grp14 (1A)
42974: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
43075: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
43176: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
43277: emms/vzeroupper/vzeroall (VEX)
43378: VMREAD Ed/q,Gd/q
43479: VMWRITE Gd/q,Ed/q
4357a:
4367b:
4377c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
4387d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
4397e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
4407f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
441# 0x0f 0x80-0x8f
44280: JO Jz (f64)
44381: JNO Jz (f64)
44482: JB/JNAE/JC Jz (f64)
44583: JNB/JAE/JNC Jz (f64)
44684: JZ/JE Jz (f64)
44785: JNZ/JNE Jz (f64)
44886: JBE/JNA Jz (f64)
44987: JNBE/JA Jz (f64)
45088: JS Jz (f64)
45189: JNS Jz (f64)
4528a: JP/JPE Jz (f64)
4538b: JNP/JPO Jz (f64)
4548c: JL/JNGE Jz (f64)
4558d: JNL/JGE Jz (f64)
4568e: JLE/JNG Jz (f64)
4578f: JNLE/JG Jz (f64)
458# 0x0f 0x90-0x9f
45990: SETO Eb
46091: SETNO Eb
46192: SETB/C/NAE Eb
46293: SETAE/NB/NC Eb
46394: SETE/Z Eb
46495: SETNE/NZ Eb
46596: SETBE/NA Eb
46697: SETA/NBE Eb
46798: SETS Eb
46899: SETNS Eb
4699a: SETP/PE Eb
4709b: SETNP/PO Eb
4719c: SETL/NGE Eb
4729d: SETNL/GE Eb
4739e: SETLE/NG Eb
4749f: SETNLE/G Eb
475# 0x0f 0xa0-0xaf
476a0: PUSH FS (d64)
477a1: POP FS (d64)
478a2: CPUID
479a3: BT Ev,Gv
480a4: SHLD Ev,Gv,Ib
481a5: SHLD Ev,Gv,CL
482a6: GrpPDLK
483a7: GrpRNG
484a8: PUSH GS (d64)
485a9: POP GS (d64)
486aa: RSM
487ab: BTS Ev,Gv
488ac: SHRD Ev,Gv,Ib
489ad: SHRD Ev,Gv,CL
490ae: Grp15 (1A),(1C)
491af: IMUL Gv,Ev
492# 0x0f 0xb0-0xbf
493b0: CMPXCHG Eb,Gb
494b1: CMPXCHG Ev,Gv
495b2: LSS Gv,Mp
496b3: BTR Ev,Gv
497b4: LFS Gv,Mp
498b5: LGS Gv,Mp
499b6: MOVZX Gv,Eb
500b7: MOVZX Gv,Ew
501b8: JMPE | POPCNT Gv,Ev (F3)
502b9: Grp10 (1A)
503ba: Grp8 Ev,Ib (1A)
504bb: BTC Ev,Gv
505bc: BSF Gv,Ev
506bd: BSR Gv,Ev
507be: MOVSX Gv,Eb
508bf: MOVSX Gv,Ew
509# 0x0f 0xc0-0xcf
510c0: XADD Eb,Gb
511c1: XADD Ev,Gv
512c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
513c3: movnti Md/q,Gd/q
514c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
515c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
516c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
517c7: Grp9 (1A)
518c8: BSWAP RAX/EAX/R8/R8D
519c9: BSWAP RCX/ECX/R9/R9D
520ca: BSWAP RDX/EDX/R10/R10D
521cb: BSWAP RBX/EBX/R11/R11D
522cc: BSWAP RSP/ESP/R12/R12D
523cd: BSWAP RBP/EBP/R13/R13D
524ce: BSWAP RSI/ESI/R14/R14D
525cf: BSWAP RDI/EDI/R15/R15D
526# 0x0f 0xd0-0xdf
527d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
528d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
529d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
530d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
531d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
532d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
533d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
534d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
535d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
536d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
537da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
538db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
539dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
540dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
541de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
542df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
543# 0x0f 0xe0-0xef
544e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
545e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
546e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
547e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
548e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
549e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
550e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
551e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
552e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
553e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
554ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
555eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
556ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
557ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
558ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
559ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
560# 0x0f 0xf0-0xff
561f0: lddqu Vdq,Mdq (F2),(VEX)
562f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
563f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
564f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
565f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
566f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
567f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
568f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
569f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
570f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
571fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
572fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
573fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
574fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
575fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
576ff:
577EndTable
578
579Table: 3-byte opcode 1 (0x0f 0x38)
580Referrer: 3-byte escape 1
581AVXcode: 2
582# 0x0f 0x38 0x00-0x0f
58300: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
58401: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
58502: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
58603: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
58704: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
58805: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
58906: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
59007: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
59108: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
59209: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
5930a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
5940b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
5950c: Vpermilps /r (66),(oVEX)
5960d: Vpermilpd /r (66),(oVEX)
5970e: vtestps /r (66),(oVEX)
5980f: vtestpd /r (66),(oVEX)
599# 0x0f 0x38 0x10-0x1f
60010: pblendvb Vdq,Wdq (66)
60111:
60212:
60313:
60414: blendvps Vdq,Wdq (66)
60515: blendvpd Vdq,Wdq (66)
60616:
60717: ptest Vdq,Wdq (66),(VEX)
60818: vbroadcastss /r (66),(oVEX)
60919: vbroadcastsd /r (66),(oVEX),(o256)
6101a: vbroadcastf128 /r (66),(oVEX),(o256)
6111b:
6121c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
6131d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
6141e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
6151f:
616# 0x0f 0x38 0x20-0x2f
61720: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
61821: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
61922: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
62023: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
62124: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
62225: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
62326:
62427:
62528: pmuldq Vdq,Wdq (66),(VEX),(o128)
62629: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
6272a: movntdqa Vdq,Mdq (66),(VEX),(o128)
6282b: packusdw Vdq,Wdq (66),(VEX),(o128)
6292c: vmaskmovps(ld) /r (66),(oVEX)
6302d: vmaskmovpd(ld) /r (66),(oVEX)
6312e: vmaskmovps(st) /r (66),(oVEX)
6322f: vmaskmovpd(st) /r (66),(oVEX)
633# 0x0f 0x38 0x30-0x3f
63430: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
63531: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
63632: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
63733: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
63834: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
63935: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
64036:
64137: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
64238: pminsb Vdq,Wdq (66),(VEX),(o128)
64339: pminsd Vdq,Wdq (66),(VEX),(o128)
6443a: pminuw Vdq,Wdq (66),(VEX),(o128)
6453b: pminud Vdq,Wdq (66),(VEX),(o128)
6463c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
6473d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
6483e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
6493f: pmaxud Vdq,Wdq (66),(VEX),(o128)
650# 0x0f 0x38 0x40-0x8f
65140: pmulld Vdq,Wdq (66),(VEX),(o128)
65241: phminposuw Vdq,Wdq (66),(VEX),(o128)
65380: INVEPT Gd/q,Mdq (66)
65481: INVPID Gd/q,Mdq (66)
655# 0x0f 0x38 0x90-0xbf (FMA)
65696: vfmaddsub132pd/ps /r (66),(VEX)
65797: vfmsubadd132pd/ps /r (66),(VEX)
65898: vfmadd132pd/ps /r (66),(VEX)
65999: vfmadd132sd/ss /r (66),(VEX),(o128)
6609a: vfmsub132pd/ps /r (66),(VEX)
6619b: vfmsub132sd/ss /r (66),(VEX),(o128)
6629c: vfnmadd132pd/ps /r (66),(VEX)
6639d: vfnmadd132sd/ss /r (66),(VEX),(o128)
6649e: vfnmsub132pd/ps /r (66),(VEX)
6659f: vfnmsub132sd/ss /r (66),(VEX),(o128)
666a6: vfmaddsub213pd/ps /r (66),(VEX)
667a7: vfmsubadd213pd/ps /r (66),(VEX)
668a8: vfmadd213pd/ps /r (66),(VEX)
669a9: vfmadd213sd/ss /r (66),(VEX),(o128)
670aa: vfmsub213pd/ps /r (66),(VEX)
671ab: vfmsub213sd/ss /r (66),(VEX),(o128)
672ac: vfnmadd213pd/ps /r (66),(VEX)
673ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
674ae: vfnmsub213pd/ps /r (66),(VEX)
675af: vfnmsub213sd/ss /r (66),(VEX),(o128)
676b6: vfmaddsub231pd/ps /r (66),(VEX)
677b7: vfmsubadd231pd/ps /r (66),(VEX)
678b8: vfmadd231pd/ps /r (66),(VEX)
679b9: vfmadd231sd/ss /r (66),(VEX),(o128)
680ba: vfmsub231pd/ps /r (66),(VEX)
681bb: vfmsub231sd/ss /r (66),(VEX),(o128)
682bc: vfnmadd231pd/ps /r (66),(VEX)
683bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
684be: vfnmsub231pd/ps /r (66),(VEX)
685bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
686# 0x0f 0x38 0xc0-0xff
687db: aesimc Vdq,Wdq (66),(VEX),(o128)
688dc: aesenc Vdq,Wdq (66),(VEX),(o128)
689dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
690de: aesdec Vdq,Wdq (66),(VEX),(o128)
691df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
692f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
693f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
694EndTable
695
696Table: 3-byte opcode 2 (0x0f 0x3a)
697Referrer: 3-byte escape 2
698AVXcode: 3
699# 0x0f 0x3a 0x00-0xff
70004: vpermilps /r,Ib (66),(oVEX)
70105: vpermilpd /r,Ib (66),(oVEX)
70206: vperm2f128 /r,Ib (66),(oVEX),(o256)
70308: roundps Vdq,Wdq,Ib (66),(VEX)
70409: roundpd Vdq,Wdq,Ib (66),(VEX)
7050a: roundss Vss,Wss,Ib (66),(VEX),(o128)
7060b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
7070c: blendps Vdq,Wdq,Ib (66),(VEX)
7080d: blendpd Vdq,Wdq,Ib (66),(VEX)
7090e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
7100f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
71114: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
71215: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
71316: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
71417: extractps Ed,Vdq,Ib (66),(VEX),(o128)
71518: vinsertf128 /r,Ib (66),(oVEX),(o256)
71619: vextractf128 /r,Ib (66),(oVEX),(o256)
71720: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
71821: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
71922: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
72040: dpps Vdq,Wdq,Ib (66),(VEX)
72141: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
72242: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
72344: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
7244a: vblendvps /r,Ib (66),(oVEX)
7254b: vblendvpd /r,Ib (66),(oVEX)
7264c: vpblendvb /r,Ib (66),(oVEX),(o128)
72760: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
72861: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
72962: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
73063: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
731df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
732EndTable
733
734GrpTable: Grp1
7350: ADD
7361: OR
7372: ADC
7383: SBB
7394: AND
7405: SUB
7416: XOR
7427: CMP
743EndTable
744
745GrpTable: Grp1A
7460: POP
747EndTable
748
749GrpTable: Grp2
7500: ROL
7511: ROR
7522: RCL
7533: RCR
7544: SHL/SAL
7555: SHR
7566:
7577: SAR
758EndTable
759
760GrpTable: Grp3_1
7610: TEST Eb,Ib
7621:
7632: NOT Eb
7643: NEG Eb
7654: MUL AL,Eb
7665: IMUL AL,Eb
7676: DIV AL,Eb
7687: IDIV AL,Eb
769EndTable
770
771GrpTable: Grp3_2
7720: TEST Ev,Iz
7731:
7742: NOT Ev
7753: NEG Ev
7764: MUL rAX,Ev
7775: IMUL rAX,Ev
7786: DIV rAX,Ev
7797: IDIV rAX,Ev
780EndTable
781
782GrpTable: Grp4
7830: INC Eb
7841: DEC Eb
785EndTable
786
787GrpTable: Grp5
7880: INC Ev
7891: DEC Ev
7902: CALLN Ev (f64)
7913: CALLF Ep
7924: JMPN Ev (f64)
7935: JMPF Ep
7946: PUSH Ev (d64)
7957:
796EndTable
797
798GrpTable: Grp6
7990: SLDT Rv/Mw
8001: STR Rv/Mw
8012: LLDT Ew
8023: LTR Ew
8034: VERR Ew
8045: VERW Ew
805EndTable
806
807GrpTable: Grp7
8080: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8091: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
8102: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
8113: LIDT Ms
8124: SMSW Mw/Rv
8135:
8146: LMSW Ew
8157: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
816EndTable
817
818GrpTable: Grp8
8194: BT
8205: BTS
8216: BTR
8227: BTC
823EndTable
824
825GrpTable: Grp9
8261: CMPXCHG8B/16B Mq/Mdq
8276: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
8287: VMPTRST Mq
829EndTable
830
831GrpTable: Grp10
832EndTable
833
834GrpTable: Grp11
8350: MOV
836EndTable
837
838GrpTable: Grp12
8392: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
8404: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
8416: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
842EndTable
843
844GrpTable: Grp13
8452: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
8464: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
8476: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
848EndTable
849
850GrpTable: Grp14
8512: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
8523: psrldq Udq,Ib (66),(11B),(VEX),(o128)
8536: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
8547: pslldq Udq,Ib (66),(11B),(VEX),(o128)
855EndTable
856
857GrpTable: Grp15
8580: fxsave
8591: fxstor
8602: ldmxcsr (VEX)
8613: stmxcsr (VEX)
8624: XSAVE
8635: XRSTOR | lfence (11B)
8646: mfence (11B)
8657: clflush | sfence (11B)
866EndTable
867
868GrpTable: Grp16
8690: prefetch NTA
8701: prefetch T0
8712: prefetch T1
8723: prefetch T2
873EndTable
874
875# AMD's Prefetch Group
876GrpTable: GrpP
8770: PREFETCH
8781: PREFETCHW
879EndTable
880
881GrpTable: GrpPDLK
8820: MONTMUL
8831: XSHA1
8842: XSHA2
885EndTable
886
887GrpTable: GrpRNG
8880: xstore-rng
8891: xcrypt-ecb
8902: xcrypt-cbc
8914: xcrypt-cfb
8925: xcrypt-ofb
893EndTable
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a2..d0474ad2a6e5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
35 35
36 return 0; 36 return 0;
37} 37}
38
39#ifdef CONFIG_X86_64
40/*
41 * Need to defined our own search_extable on X86_64 to work around
42 * a B stepping K8 bug.
43 */
44const struct exception_table_entry *
45search_extable(const struct exception_table_entry *first,
46 const struct exception_table_entry *last,
47 unsigned long value)
48{
49 /* B stepping K8 bug */
50 if ((value >> 32) == 0)
51 value |= 0xffffffffUL << 32;
52
53 while (first <= last) {
54 const struct exception_table_entry *mid;
55 long diff;
56
57 mid = (last - first) / 2 + first;
58 diff = mid->insn - value;
59 if (diff == 0)
60 return mid;
61 else if (diff < 0)
62 first = mid+1;
63 else
64 last = mid-1;
65 }
66 return NULL;
67}
68#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee9028cf0..f62777940dfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
38 * Returns 0 if mmiotrace is disabled, or if the fault is not 38 * Returns 0 if mmiotrace is disabled, or if the fault is not
39 * handled by mmiotrace: 39 * handled by mmiotrace:
40 */ 40 */
41static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 41static inline int __kprobes
42kmmio_fault(struct pt_regs *regs, unsigned long addr)
42{ 43{
43 if (unlikely(is_kmmio_active())) 44 if (unlikely(is_kmmio_active()))
44 if (kmmio_handler(regs, addr) == 1) 45 if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
46 return 0; 47 return 0;
47} 48}
48 49
49static inline int notify_page_fault(struct pt_regs *regs) 50static inline int __kprobes notify_page_fault(struct pt_regs *regs)
50{ 51{
51 int ret = 0; 52 int ret = 0;
52 53
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
240 * 241 *
241 * Handle a fault on the vmalloc or module mapping area 242 * Handle a fault on the vmalloc or module mapping area
242 */ 243 */
243static noinline int vmalloc_fault(unsigned long address) 244static noinline __kprobes int vmalloc_fault(unsigned long address)
244{ 245{
245 unsigned long pgd_paddr; 246 unsigned long pgd_paddr;
246 pmd_t *pmd_k; 247 pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
357 * 358 *
358 * This assumes no large pages in there. 359 * This assumes no large pages in there.
359 */ 360 */
360static noinline int vmalloc_fault(unsigned long address) 361static noinline __kprobes int vmalloc_fault(unsigned long address)
361{ 362{
362 pgd_t *pgd, *pgd_ref; 363 pgd_t *pgd, *pgd_ref;
363 pud_t *pud, *pud_ref; 364 pud_t *pud, *pud_ref;
@@ -658,7 +659,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
658 show_fault_oops(regs, error_code, address); 659 show_fault_oops(regs, error_code, address);
659 660
660 stackend = end_of_stack(tsk); 661 stackend = end_of_stack(tsk);
661 if (*stackend != STACK_END_MAGIC) 662 if (tsk != &init_task && *stackend != STACK_END_MAGIC)
662 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 663 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
663 664
664 tsk->thread.cr2 = address; 665 tsk->thread.cr2 = address;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
860 * There are no security implications to leaving a stale TLB when 861 * There are no security implications to leaving a stale TLB when
861 * increasing the permissions on a page. 862 * increasing the permissions on a page.
862 */ 863 */
863static noinline int 864static noinline __kprobes int
864spurious_fault(unsigned long error_code, unsigned long address) 865spurious_fault(unsigned long error_code, unsigned long address)
865{ 866{
866 pgd_t *pgd; 867 pgd_t *pgd;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..d406c5239019 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
146 use_gbpages = direct_gbpages; 146 use_gbpages = direct_gbpages;
147#endif 147#endif
148 148
149 set_nx();
150 if (nx_enabled)
151 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
152
153 /* Enable PSE if available */ 149 /* Enable PSE if available */
154 if (cpu_has_pse) 150 if (cpu_has_pse)
155 set_in_cr4(X86_CR4_PSE); 151 set_in_cr4(X86_CR4_PSE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..c973f8e2a6cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
412 pkmap_page_table = pte; 412 pkmap_page_table = pte;
413} 413}
414 414
415static void __init add_one_highpage_init(struct page *page, int pfn) 415static void __init add_one_highpage_init(struct page *page)
416{ 416{
417 ClearPageReserved(page); 417 ClearPageReserved(page);
418 init_page_count(page); 418 init_page_count(page);
@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
445 if (!pfn_valid(node_pfn)) 445 if (!pfn_valid(node_pfn))
446 continue; 446 continue;
447 page = pfn_to_page(node_pfn); 447 page = pfn_to_page(node_pfn);
448 add_one_highpage_init(page, node_pfn); 448 add_one_highpage_init(page);
449 } 449 }
450 450
451 return 0; 451 return 0;
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
703} 703}
704 704
705#ifndef CONFIG_NEED_MULTIPLE_NODES 705#ifndef CONFIG_NEED_MULTIPLE_NODES
706void __init initmem_init(unsigned long start_pfn, 706void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
707 unsigned long end_pfn) 707 int acpi, int k8)
708{ 708{
709#ifdef CONFIG_HIGHMEM 709#ifdef CONFIG_HIGHMEM
710 highstart_pfn = highend_pfn = max_pfn; 710 highstart_pfn = highend_pfn = max_pfn;
@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
997const int rodata_test_data = 0xC3; 997const int rodata_test_data = 0xC3;
998EXPORT_SYMBOL_GPL(rodata_test_data); 998EXPORT_SYMBOL_GPL(rodata_test_data);
999 999
1000static int kernel_set_to_readonly; 1000int kernel_set_to_readonly __read_mostly;
1001 1001
1002void set_kernel_text_rw(void) 1002void set_kernel_text_rw(void)
1003{ 1003{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..5198b9bb34ef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
568} 568}
569 569
570#ifndef CONFIG_NUMA 570#ifndef CONFIG_NUMA
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
572 int acpi, int k8)
572{ 573{
573 unsigned long bootmap_size, bootmap; 574 unsigned long bootmap_size, bootmap;
574 575
@@ -694,12 +695,12 @@ void __init mem_init(void)
694const int rodata_test_data = 0xC3; 695const int rodata_test_data = 0xC3;
695EXPORT_SYMBOL_GPL(rodata_test_data); 696EXPORT_SYMBOL_GPL(rodata_test_data);
696 697
697static int kernel_set_to_readonly; 698int kernel_set_to_readonly;
698 699
699void set_kernel_text_rw(void) 700void set_kernel_text_rw(void)
700{ 701{
701 unsigned long start = PFN_ALIGN(_stext); 702 unsigned long start = PFN_ALIGN(_text);
702 unsigned long end = PFN_ALIGN(__start_rodata); 703 unsigned long end = PFN_ALIGN(__stop___ex_table);
703 704
704 if (!kernel_set_to_readonly) 705 if (!kernel_set_to_readonly)
705 return; 706 return;
@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
707 pr_debug("Set kernel text: %lx - %lx for read write\n", 708 pr_debug("Set kernel text: %lx - %lx for read write\n",
708 start, end); 709 start, end);
709 710
711 /*
712 * Make the kernel identity mapping for text RW. Kernel text
713 * mapping will always be RO. Refer to the comment in
714 * static_protections() in pageattr.c
715 */
710 set_memory_rw(start, (end - start) >> PAGE_SHIFT); 716 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
711} 717}
712 718
713void set_kernel_text_ro(void) 719void set_kernel_text_ro(void)
714{ 720{
715 unsigned long start = PFN_ALIGN(_stext); 721 unsigned long start = PFN_ALIGN(_text);
716 unsigned long end = PFN_ALIGN(__start_rodata); 722 unsigned long end = PFN_ALIGN(__stop___ex_table);
717 723
718 if (!kernel_set_to_readonly) 724 if (!kernel_set_to_readonly)
719 return; 725 return;
@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
721 pr_debug("Set kernel text: %lx - %lx for read only\n", 727 pr_debug("Set kernel text: %lx - %lx for read only\n",
722 start, end); 728 start, end);
723 729
730 /*
731 * Set the kernel identity mapping for text RO.
732 */
724 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 733 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
725} 734}
726 735
727void mark_rodata_ro(void) 736void mark_rodata_ro(void)
728{ 737{
729 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 738 unsigned long start = PFN_ALIGN(_text);
730 unsigned long rodata_start = 739 unsigned long rodata_start =
731 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 740 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
741 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
742 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
743 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
744 unsigned long data_start = (unsigned long) &_sdata;
732 745
733 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 746 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
734 (end - start) >> 10); 747 (end - start) >> 10);
@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
751 printk(KERN_INFO "Testing CPA: again\n"); 764 printk(KERN_INFO "Testing CPA: again\n");
752 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 765 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
753#endif 766#endif
767
768 free_init_pages("unused kernel memory",
769 (unsigned long) page_address(virt_to_page(text_end)),
770 (unsigned long)
771 page_address(virt_to_page(rodata_start)));
772 free_init_pages("unused kernel memory",
773 (unsigned long) page_address(virt_to_page(rodata_end)),
774 (unsigned long) page_address(virt_to_page(data_start)));
754} 775}
755 776
756#endif 777#endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 334e63ca7b2b..c246d259822d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,8 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 (unsigned long long)phys_addr, 170 (unsigned long long)phys_addr,
171 (unsigned long long)(phys_addr + size), 171 (unsigned long long)(phys_addr + size),
172 prot_val, new_prot_val); 172 prot_val, new_prot_val);
173 free_memtype(phys_addr, phys_addr + size); 173 goto err_free_memtype;
174 return NULL;
175 } 174 }
176 prot_val = new_prot_val; 175 prot_val = new_prot_val;
177 } 176 }
@@ -197,26 +196,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
197 */ 196 */
198 area = get_vm_area_caller(size, VM_IOREMAP, caller); 197 area = get_vm_area_caller(size, VM_IOREMAP, caller);
199 if (!area) 198 if (!area)
200 return NULL; 199 goto err_free_memtype;
201 area->phys_addr = phys_addr; 200 area->phys_addr = phys_addr;
202 vaddr = (unsigned long) area->addr; 201 vaddr = (unsigned long) area->addr;
203 202
204 if (kernel_map_sync_memtype(phys_addr, size, prot_val)) { 203 if (kernel_map_sync_memtype(phys_addr, size, prot_val))
205 free_memtype(phys_addr, phys_addr + size); 204 goto err_free_area;
206 free_vm_area(area);
207 return NULL;
208 }
209 205
210 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { 206 if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
211 free_memtype(phys_addr, phys_addr + size); 207 goto err_free_area;
212 free_vm_area(area);
213 return NULL;
214 }
215 208
216 ret_addr = (void __iomem *) (vaddr + offset); 209 ret_addr = (void __iomem *) (vaddr + offset);
217 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 210 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
218 211
219 return ret_addr; 212 return ret_addr;
213err_free_area:
214 free_vm_area(area);
215err_free_memtype:
216 free_memtype(phys_addr, phys_addr + size);
217 return NULL;
220} 218}
221 219
222/** 220/**
@@ -283,30 +281,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
283} 281}
284EXPORT_SYMBOL(ioremap_cache); 282EXPORT_SYMBOL(ioremap_cache);
285 283
286static void __iomem *ioremap_default(resource_size_t phys_addr,
287 unsigned long size)
288{
289 unsigned long flags;
290 void __iomem *ret;
291 int err;
292
293 /*
294 * - WB for WB-able memory and no other conflicting mappings
295 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
296 * - Inherit from confliting mappings otherwise
297 */
298 err = reserve_memtype(phys_addr, phys_addr + size,
299 _PAGE_CACHE_WB, &flags);
300 if (err < 0)
301 return NULL;
302
303 ret = __ioremap_caller(phys_addr, size, flags,
304 __builtin_return_address(0));
305
306 free_memtype(phys_addr, phys_addr + size);
307 return ret;
308}
309
310void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, 284void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
311 unsigned long prot_val) 285 unsigned long prot_val)
312{ 286{
@@ -382,7 +356,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
382 if (page_is_ram(start >> PAGE_SHIFT)) 356 if (page_is_ram(start >> PAGE_SHIFT))
383 return __va(phys); 357 return __va(phys);
384 358
385 addr = (void __force *)ioremap_default(start, PAGE_SIZE); 359 addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
386 if (addr) 360 if (addr)
387 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 361 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
388 362
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/k8.h>
26 26
27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29
27static __init int find_northbridge(void) 30static __init int find_northbridge(void)
28{ 31{
29 int num; 32 int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
54 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get boot_cpu_id so can use that to create apicid_to_node
55 * in k8_scan_nodes() 58 * in k8_scan_nodes()
56 */ 59 */
57 /*
58 * Find possible boot-time SMP configuration:
59 */
60#ifdef CONFIG_X86_MPPARSE
61 early_find_smp_config();
62#endif
63#ifdef CONFIG_ACPI
64 /*
65 * Read APIC information from ACPI tables.
66 */
67 early_acpi_boot_init();
68#endif
69#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
70 /* 61 /*
71 * get boot-time SMP configuration: 62 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
76 early_init_lapic_mapping(); 67 early_init_lapic_mapping();
77} 68}
78 69
79int __init k8_scan_nodes(unsigned long start, unsigned long end) 70int __init k8_get_nodes(struct bootnode *physnodes)
80{ 71{
81 unsigned numnodes, cores, bits, apicid_base; 72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81}
82
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
84{
85 unsigned long start = PFN_PHYS(start_pfn);
86 unsigned long end = PFN_PHYS(end_pfn);
87 unsigned numnodes;
82 unsigned long prevbase; 88 unsigned long prevbase;
83 struct bootnode nodes[8]; 89 int i, nb, found = 0;
84 int i, j, nb, found = 0;
85 u32 nodeid, reg; 90 u32 nodeid, reg;
86 91
87 if (!early_pci_allowed()) 92 if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
91 if (nb < 0) 96 if (nb < 0)
92 return nb; 97 return nb;
93 98
94 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 99 pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
95 100
96 reg = read_pci_config(0, nb, 0, 0x60); 101 reg = read_pci_config(0, nb, 0, 0x60);
97 numnodes = ((reg >> 4) & 0xF) + 1; 102 numnodes = ((reg >> 4) & 0xF) + 1;
98 if (numnodes <= 1) 103 if (numnodes <= 1)
99 return -1; 104 return -1;
100 105
101 printk(KERN_INFO "Number of nodes %d\n", numnodes); 106 pr_info("Number of physical nodes %d\n", numnodes);
102 107
103 memset(&nodes, 0, sizeof(nodes));
104 prevbase = 0; 108 prevbase = 0;
105 for (i = 0; i < 8; i++) { 109 for (i = 0; i < 8; i++) {
106 unsigned long base, limit; 110 unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
111 nodeid = limit & 7; 115 nodeid = limit & 7;
112 if ((base & 3) == 0) { 116 if ((base & 3) == 0) {
113 if (i < numnodes) 117 if (i < numnodes)
114 printk("Skipping disabled node %d\n", i); 118 pr_info("Skipping disabled node %d\n", i);
115 continue; 119 continue;
116 } 120 }
117 if (nodeid >= numnodes) { 121 if (nodeid >= numnodes) {
118 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
119 base, limit); 123 base, limit);
120 continue; 124 continue;
121 } 125 }
122 126
123 if (!limit) { 127 if (!limit) {
124 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", 128 pr_info("Skipping node entry %d (base %lx)\n",
125 i, base); 129 i, base);
126 continue; 130 continue;
127 } 131 }
128 if ((base >> 8) & 3 || (limit >> 8) & 3) { 132 if ((base >> 8) & 3 || (limit >> 8) & 3) {
129 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 133 pr_err("Node %d using interleaving mode %lx/%lx\n",
130 nodeid, (base>>8)&3, (limit>>8) & 3); 134 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
131 return -1; 135 return -1;
132 } 136 }
133 if (node_isset(nodeid, node_possible_map)) { 137 if (node_isset(nodeid, nodes_parsed)) {
134 printk(KERN_INFO "Node %d already present. Skipping\n", 138 pr_info("Node %d already present, skipping\n",
135 nodeid); 139 nodeid);
136 continue; 140 continue;
137 } 141 }
138 142
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
141 limit |= (1<<24)-1; 145 limit |= (1<<24)-1;
142 limit++; 146 limit++;
143 147
144 if (limit > max_pfn << PAGE_SHIFT) 148 if (limit > end)
145 limit = max_pfn << PAGE_SHIFT; 149 limit = end;
146 if (limit <= base) 150 if (limit <= base)
147 continue; 151 continue;
148 152
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
154 if (limit > end) 158 if (limit > end)
155 limit = end; 159 limit = end;
156 if (limit == base) { 160 if (limit == base) {
157 printk(KERN_ERR "Empty node %d\n", nodeid); 161 pr_err("Empty node %d\n", nodeid);
158 continue; 162 continue;
159 } 163 }
160 if (limit < base) { 164 if (limit < base) {
161 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 165 pr_err("Node %d bogus settings %lx-%lx.\n",
162 nodeid, base, limit); 166 nodeid, base, limit);
163 continue; 167 continue;
164 } 168 }
165 169
166 /* Could sort here, but pun for now. Should not happen anyroads. */ 170 /* Could sort here, but pun for now. Should not happen anyroads. */
167 if (prevbase > base) { 171 if (prevbase > base) {
168 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 172 pr_err("Node map not sorted %lx,%lx\n",
169 prevbase, base); 173 prevbase, base);
170 return -1; 174 return -1;
171 } 175 }
172 176
173 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 177 pr_info("Node %d MemBase %016lx Limit %016lx\n",
174 nodeid, base, limit); 178 nodeid, base, limit);
175 179
176 found++; 180 found++;
177 181
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
180 184
181 prevbase = base; 185 prevbase = base;
182 186
183 node_set(nodeid, node_possible_map); 187 node_set(nodeid, nodes_parsed);
184 } 188 }
185 189
186 if (!found) 190 if (!found)
187 return -1; 191 return -1;
192 return 0;
193}
194
195int __init k8_scan_nodes(void)
196{
197 unsigned int bits;
198 unsigned int cores;
199 unsigned int apicid_base;
200 int i;
188 201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
189 memnode_shift = compute_hash_shift(nodes, 8, NULL); 204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
190 if (memnode_shift < 0) { 205 if (memnode_shift < 0) {
191 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 206 pr_err("No NUMA node hash function found. Contact maintainer\n");
192 return -1; 207 return -1;
193 } 208 }
194 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 209 pr_info("Using node hash shift of %d\n", memnode_shift);
195 210
196 /* use the coreid bits from early_identify_cpu */ 211 /* use the coreid bits from early_identify_cpu */
197 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
200 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* need to get boot_cpu_id early for system with apicid lifting */
201 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
202 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
203 printk(KERN_INFO "BSP APIC ID: %02x\n", 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
204 boot_cpu_physical_apicid);
205 apicid_base = boot_cpu_physical_apicid; 219 apicid_base = boot_cpu_physical_apicid;
206 } 220 }
207 221
208 for (i = 0; i < 8; i++) { 222 for_each_node_mask(i, node_possible_map) {
209 if (nodes[i].start == nodes[i].end) 223 int j;
210 continue;
211 224
212 e820_register_active_regions(i, 225 e820_register_active_regions(i,
213 nodes[i].start >> PAGE_SHIFT, 226 nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..11a4ad4d6253 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
540 struct die_args *arg = args; 540 struct die_args *arg = args;
541 541
542 if (val == DIE_DEBUG && (arg->err & DR_STEP)) 542 if (val == DIE_DEBUG && (arg->err & DR_STEP))
543 if (post_kmmio_handler(arg->err, arg->regs) == 1) 543 if (post_kmmio_handler(arg->err, arg->regs) == 1) {
544 /*
545 * Reset the BS bit in dr6 (pointed by args->err) to
546 * denote completion of processing
547 */
548 (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
544 return NOTIFY_STOP; 549 return NOTIFY_STOP;
550 }
545 551
546 return NOTIFY_DONE; 552 return NOTIFY_DONE;
547} 553}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..b20760ca7244 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
347 (ulong) node_remap_end_vaddr[nid]); 347 (ulong) node_remap_end_vaddr[nid]);
348} 348}
349 349
350void __init initmem_init(unsigned long start_pfn, 350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 unsigned long end_pfn) 351 int acpi, int k8)
352{ 352{
353 int nid; 353 int nid;
354 long kva_target_pfn; 354 long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..83bbc70d11bb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 239 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 241 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) 242 if (nodedata_phys < start || nodedata_phys >= end) {
243 free_bootmem(nodedata_phys, pgdat_size); 243 /*
244 * only need to free it if it is from other node
245 * bootmem
246 */
247 if (nid != nodeid)
248 free_bootmem(nodedata_phys, pgdat_size);
249 }
244 node_data[nodeid] = NULL; 250 node_data[nodeid] = NULL;
245 return; 251 return;
246 } 252 }
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
306 312
307#ifdef CONFIG_NUMA_EMU 313#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */ 314/* Numa emulation */
315static struct bootnode nodes[MAX_NUMNODES] __initdata;
316static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309static char *cmdline __initdata; 317static char *cmdline __initdata;
310 318
319static int __init setup_physnodes(unsigned long start, unsigned long end,
320 int acpi, int k8)
321{
322 int nr_nodes = 0;
323 int ret = 0;
324 int i;
325
326#ifdef CONFIG_ACPI_NUMA
327 if (acpi)
328 nr_nodes = acpi_get_nodes(physnodes);
329#endif
330#ifdef CONFIG_K8_NUMA
331 if (k8)
332 nr_nodes = k8_get_nodes(physnodes);
333#endif
334 /*
335 * Basic sanity checking on the physical node map: there may be errors
336 * if the SRAT or K8 incorrectly reported the topology or the mem=
337 * kernel parameter is used.
338 */
339 for (i = 0; i < nr_nodes; i++) {
340 if (physnodes[i].start == physnodes[i].end)
341 continue;
342 if (physnodes[i].start > end) {
343 physnodes[i].end = physnodes[i].start;
344 continue;
345 }
346 if (physnodes[i].end < start) {
347 physnodes[i].start = physnodes[i].end;
348 continue;
349 }
350 if (physnodes[i].start < start)
351 physnodes[i].start = start;
352 if (physnodes[i].end > end)
353 physnodes[i].end = end;
354 }
355
356 /*
357 * Remove all nodes that have no memory or were truncated because of the
358 * limited address range.
359 */
360 for (i = 0; i < nr_nodes; i++) {
361 if (physnodes[i].start == physnodes[i].end)
362 continue;
363 physnodes[ret].start = physnodes[i].start;
364 physnodes[ret].end = physnodes[i].end;
365 ret++;
366 }
367
368 /*
369 * If no physical topology was detected, a single node is faked to cover
370 * the entire address space.
371 */
372 if (!ret) {
373 physnodes[ret].start = start;
374 physnodes[ret].end = end;
375 ret = 1;
376 }
377 return ret;
378}
379
311/* 380/*
312 * Setups up nid to range from addr to addr + size. If the end 381 * Setups up nid to range from addr to addr + size. If the end
313 * boundary is greater than max_addr, then max_addr is used instead. 382 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
315 * allocation past addr and -1 otherwise. addr is adjusted to be at 384 * allocation past addr and -1 otherwise. addr is adjusted to be at
316 * the end of the node. 385 * the end of the node.
317 */ 386 */
318static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 387static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319 u64 size, u64 max_addr)
320{ 388{
321 int ret = 0; 389 int ret = 0;
322
323 nodes[nid].start = *addr; 390 nodes[nid].start = *addr;
324 *addr += size; 391 *addr += size;
325 if (*addr >= max_addr) { 392 if (*addr >= max_addr) {
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335} 402}
336 403
337/* 404/*
405 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
406 * to max_addr. The return value is the number of nodes allocated.
407 */
408static int __init split_nodes_interleave(u64 addr, u64 max_addr,
409 int nr_phys_nodes, int nr_nodes)
410{
411 nodemask_t physnode_mask = NODE_MASK_NONE;
412 u64 size;
413 int big;
414 int ret = 0;
415 int i;
416
417 if (nr_nodes <= 0)
418 return -1;
419 if (nr_nodes > MAX_NUMNODES) {
420 pr_info("numa=fake=%d too large, reducing to %d\n",
421 nr_nodes, MAX_NUMNODES);
422 nr_nodes = MAX_NUMNODES;
423 }
424
425 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
426 /*
427 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder.
429 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
431 FAKE_NODE_MIN_SIZE;
432
433 size &= FAKE_NODE_MIN_HASH_MASK;
434 if (!size) {
435 pr_err("Not enough memory for each node. "
436 "NUMA emulation disabled.\n");
437 return -1;
438 }
439
440 for (i = 0; i < nr_phys_nodes; i++)
441 if (physnodes[i].start != physnodes[i].end)
442 node_set(i, physnode_mask);
443
444 /*
445 * Continue to fill physical nodes with fake nodes until there is no
446 * memory left on any of them.
447 */
448 while (nodes_weight(physnode_mask)) {
449 for_each_node_mask(i, physnode_mask) {
450 u64 end = physnodes[i].start + size;
451 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
452
453 if (ret < big)
454 end += FAKE_NODE_MIN_SIZE;
455
456 /*
457 * Continue to add memory to this fake node if its
458 * non-reserved memory is less than the per-node size.
459 */
460 while (end - physnodes[i].start -
461 e820_hole_size(physnodes[i].start, end) < size) {
462 end += FAKE_NODE_MIN_SIZE;
463 if (end > physnodes[i].end) {
464 end = physnodes[i].end;
465 break;
466 }
467 }
468
469 /*
470 * If there won't be at least FAKE_NODE_MIN_SIZE of
471 * non-reserved memory in ZONE_DMA32 for the next node,
472 * this one must extend to the boundary.
473 */
474 if (end < dma32_end && dma32_end - end -
475 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
476 end = dma32_end;
477
478 /*
479 * If there won't be enough non-reserved memory for the
480 * next node, this one must extend to the end of the
481 * physical node.
482 */
483 if (physnodes[i].end - end -
484 e820_hole_size(end, physnodes[i].end) < size)
485 end = physnodes[i].end;
486
487 /*
488 * Avoid allocating more nodes than requested, which can
489 * happen as a result of rounding down each node's size
490 * to FAKE_NODE_MIN_SIZE.
491 */
492 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
493 end = physnodes[i].end;
494
495 if (setup_node_range(ret++, &physnodes[i].start,
496 end - physnodes[i].start,
497 physnodes[i].end) < 0)
498 node_clear(i, physnode_mask);
499 }
500 }
501 return ret;
502}
503
504/*
338 * Splits num_nodes nodes up equally starting at node_start. The return value 505 * Splits num_nodes nodes up equally starting at node_start. The return value
339 * is the number of nodes split up and addr is adjusted to be at the end of the 506 * is the number of nodes split up and addr is adjusted to be at the end of the
340 * last node allocated. 507 * last node allocated.
341 */ 508 */
342static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, 509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
343 u64 max_addr, int node_start,
344 int num_nodes) 510 int num_nodes)
345{ 511{
346 unsigned int big; 512 unsigned int big;
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
388 break; 554 break;
389 } 555 }
390 } 556 }
391 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) 557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
392 break; 558 break;
393 } 559 }
394 return i - node_start + 1; 560 return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
399 * always assigned to a final node and can be asymmetric. Returns the number of 565 * always assigned to a final node and can be asymmetric. Returns the number of
400 * nodes split. 566 * nodes split.
401 */ 567 */
402static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, 568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
403 u64 max_addr, int node_start, u64 size) 569 u64 size)
404{ 570{
405 int i = node_start; 571 int i = node_start;
406 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
407 while (!setup_node_range(i++, nodes, addr, size, max_addr)) 573 while (!setup_node_range(i++, addr, size, max_addr))
408 ; 574 ;
409 return i - node_start; 575 return i - node_start;
410} 576}
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413 * Sets up the system RAM area from start_pfn to last_pfn according to the 579 * Sets up the system RAM area from start_pfn to last_pfn according to the
414 * numa=fake command-line option. 580 * numa=fake command-line option.
415 */ 581 */
416static struct bootnode nodes[MAX_NUMNODES] __initdata; 582static int __init numa_emulation(unsigned long start_pfn,
417 583 unsigned long last_pfn, int acpi, int k8)
418static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419{ 584{
420 u64 size, addr = start_pfn << PAGE_SHIFT; 585 u64 size, addr = start_pfn << PAGE_SHIFT;
421 u64 max_addr = last_pfn << PAGE_SHIFT; 586 u64 max_addr = last_pfn << PAGE_SHIFT;
422 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes;
423 589
424 memset(&nodes, 0, sizeof(nodes)); 590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
425 /* 591 /*
426 * If the numa=fake command-line is just a single number N, split the 592 * If the numa=fake command-line is just a single number N, split the
427 * system RAM into N fake nodes. 593 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
429 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
430 long n = simple_strtol(cmdline, NULL, 0); 596 long n = simple_strtol(cmdline, NULL, 0);
431 597
432 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); 598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
433 if (num_nodes < 0) 600 if (num_nodes < 0)
434 return num_nodes; 601 return num_nodes;
435 goto out; 602 goto out;
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
456 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; 623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
457 if (size) 624 if (size)
458 for (i = 0; i < coeff; i++, num_nodes++) 625 for (i = 0; i < coeff; i++, num_nodes++)
459 if (setup_node_range(num_nodes, nodes, 626 if (setup_node_range(num_nodes, &addr,
460 &addr, size, max_addr) < 0) 627 size, max_addr) < 0)
461 goto done; 628 goto done;
462 if (!*cmdline) 629 if (!*cmdline)
463 break; 630 break;
@@ -473,7 +640,7 @@ done:
473 if (addr < max_addr) { 640 if (addr < max_addr) {
474 if (coeff_flag && coeff < 0) { 641 if (coeff_flag && coeff < 0) {
475 /* Split remaining nodes into num-sized chunks */ 642 /* Split remaining nodes into num-sized chunks */
476 num_nodes += split_nodes_by_size(nodes, &addr, max_addr, 643 num_nodes += split_nodes_by_size(&addr, max_addr,
477 num_nodes, num); 644 num_nodes, num);
478 goto out; 645 goto out;
479 } 646 }
@@ -482,7 +649,7 @@ done:
482 /* Split remaining nodes into coeff chunks */ 649 /* Split remaining nodes into coeff chunks */
483 if (coeff <= 0) 650 if (coeff <= 0)
484 break; 651 break;
485 num_nodes += split_nodes_equally(nodes, &addr, max_addr, 652 num_nodes += split_nodes_equally(&addr, max_addr,
486 num_nodes, coeff); 653 num_nodes, coeff);
487 break; 654 break;
488 case ',': 655 case ',':
@@ -490,8 +657,8 @@ done:
490 break; 657 break;
491 default: 658 default:
492 /* Give one final node */ 659 /* Give one final node */
493 setup_node_range(num_nodes, nodes, &addr, 660 setup_node_range(num_nodes, &addr, max_addr - addr,
494 max_addr - addr, max_addr); 661 max_addr);
495 num_nodes++; 662 num_nodes++;
496 } 663 }
497 } 664 }
@@ -505,14 +672,10 @@ out:
505 } 672 }
506 673
507 /* 674 /*
508 * We need to vacate all active ranges that may have been registered by 675 * We need to vacate all active ranges that may have been registered for
509 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns 676 * the e820 memory map.
510 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511 */ 677 */
512 remove_all_active_ranges(); 678 remove_all_active_ranges();
513#ifdef CONFIG_ACPI_NUMA
514 acpi_numa = -1;
515#endif
516 for_each_node_mask(i, node_possible_map) { 679 for_each_node_mask(i, node_possible_map) {
517 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 680 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518 nodes[i].end >> PAGE_SHIFT); 681 nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
524} 687}
525#endif /* CONFIG_NUMA_EMU */ 688#endif /* CONFIG_NUMA_EMU */
526 689
527void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) 690void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
691 int acpi, int k8)
528{ 692{
529 int i; 693 int i;
530 694
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
532 nodes_clear(node_online_map); 696 nodes_clear(node_online_map);
533 697
534#ifdef CONFIG_NUMA_EMU 698#ifdef CONFIG_NUMA_EMU
535 if (cmdline && !numa_emulation(start_pfn, last_pfn)) 699 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
536 return; 700 return;
537 nodes_clear(node_possible_map); 701 nodes_clear(node_possible_map);
538 nodes_clear(node_online_map); 702 nodes_clear(node_online_map);
539#endif 703#endif
540 704
541#ifdef CONFIG_ACPI_NUMA 705#ifdef CONFIG_ACPI_NUMA
542 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 706 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
543 last_pfn << PAGE_SHIFT)) 707 last_pfn << PAGE_SHIFT))
544 return; 708 return;
545 nodes_clear(node_possible_map); 709 nodes_clear(node_possible_map);
546 nodes_clear(node_online_map); 710 nodes_clear(node_online_map);
547#endif 711#endif
548 712
549#ifdef CONFIG_K8_NUMA 713#ifdef CONFIG_K8_NUMA
550 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 714 if (!numa_off && k8 && !k8_scan_nodes())
551 last_pfn<<PAGE_SHIFT))
552 return; 715 return;
553 nodes_clear(node_possible_map); 716 nodes_clear(node_possible_map);
554 nodes_clear(node_online_map); 717 nodes_clear(node_online_map);
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
601early_param("numa", numa_setup); 764early_param("numa", numa_setup);
602 765
603#ifdef CONFIG_NUMA 766#ifdef CONFIG_NUMA
767
768static __init int find_near_online_node(int node)
769{
770 int n, val;
771 int min_val = INT_MAX;
772 int best_node = -1;
773
774 for_each_online_node(n) {
775 val = node_distance(node, n);
776
777 if (val < min_val) {
778 min_val = val;
779 best_node = n;
780 }
781 }
782
783 return best_node;
784}
785
604/* 786/*
605 * Setup early cpu_to_node. 787 * Setup early cpu_to_node.
606 * 788 *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
632 if (node == NUMA_NO_NODE) 814 if (node == NUMA_NO_NODE)
633 continue; 815 continue;
634 if (!node_online(node)) 816 if (!node_online(node))
635 continue; 817 node = find_near_online_node(node);
636 numa_set_node(cpu, node); 818 numa_set_node(cpu, node);
637 } 819 }
638} 820}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..1d4eb93d333c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 280 pgprot_val(forbidden) |= _PAGE_RW;
281 281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /*
284 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
285 * kernel text mappings for the large page aligned text, rodata sections
286 * will be always read-only. For the kernel identity mappings covering
287 * the holes caused by this alignment can be anything that user asks.
288 *
289 * This will preserve the large page mappings for kernel text/data
290 * at no extra cost.
291 */
292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align))
295 pgprot_val(forbidden) |= _PAGE_RW;
296#endif
297
282 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 298 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
283 299
284 return prot; 300 return prot;
@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
1069 1085
1070int set_memory_x(unsigned long addr, int numpages) 1086int set_memory_x(unsigned long addr, int numpages)
1071{ 1087{
1088 if (!(__supported_pte_mask & _PAGE_NX))
1089 return 0;
1090
1072 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1091 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1073} 1092}
1074EXPORT_SYMBOL(set_memory_x); 1093EXPORT_SYMBOL(set_memory_x);
1075 1094
1076int set_memory_nx(unsigned long addr, int numpages) 1095int set_memory_nx(unsigned long addr, int numpages)
1077{ 1096{
1097 if (!(__supported_pte_mask & _PAGE_NX))
1098 return 0;
1099
1078 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1100 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1079} 1101}
1080EXPORT_SYMBOL(set_memory_nx); 1102EXPORT_SYMBOL(set_memory_nx);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..66b55d6e69ed 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23#include <asm/x86_init.h>
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/fcntl.h> 25#include <asm/fcntl.h>
25#include <asm/e820.h> 26#include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
355 * - _PAGE_CACHE_UC_MINUS 356 * - _PAGE_CACHE_UC_MINUS
356 * - _PAGE_CACHE_UC 357 * - _PAGE_CACHE_UC
357 * 358 *
358 * req_type will have a special case value '-1', when requester want to inherit
359 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
360 *
361 * If new_type is NULL, function will return an error if it cannot reserve the 359 * If new_type is NULL, function will return an error if it cannot reserve the
362 * region with req_type. If new_type is non-NULL, function will return 360 * region with req_type. If new_type is non-NULL, function will return
363 * available type in new_type in case of no error. In case of any error 361 * available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
377 if (!pat_enabled) { 375 if (!pat_enabled) {
378 /* This is identical to page table setting without PAT */ 376 /* This is identical to page table setting without PAT */
379 if (new_type) { 377 if (new_type) {
380 if (req_type == -1) 378 if (req_type == _PAGE_CACHE_WC)
381 *new_type = _PAGE_CACHE_WB;
382 else if (req_type == _PAGE_CACHE_WC)
383 *new_type = _PAGE_CACHE_UC_MINUS; 379 *new_type = _PAGE_CACHE_UC_MINUS;
384 else 380 else
385 *new_type = req_type & _PAGE_CACHE_MASK; 381 *new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
388 } 384 }
389 385
390 /* Low ISA region is always mapped WB in page table. No need to track */ 386 /* Low ISA region is always mapped WB in page table. No need to track */
391 if (is_ISA_range(start, end - 1)) { 387 if (x86_platform.is_untracked_pat_range(start, end)) {
392 if (new_type) 388 if (new_type)
393 *new_type = _PAGE_CACHE_WB; 389 *new_type = _PAGE_CACHE_WB;
394 return 0; 390 return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
499 return 0; 495 return 0;
500 496
501 /* Low ISA region is always mapped WB. No need to track */ 497 /* Low ISA region is always mapped WB. No need to track */
502 if (is_ISA_range(start, end - 1)) 498 if (x86_platform.is_untracked_pat_range(start, end))
503 return 0; 499 return 0;
504 500
505 is_range_ram = pat_pagerange_is_ram(start, end); 501 is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
582 int rettype = _PAGE_CACHE_WB; 578 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry; 579 struct memtype *entry;
584 580
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 581 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
586 return rettype; 582 return rettype;
587 583
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 584 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -1018,8 +1014,10 @@ static const struct file_operations memtype_fops = {
1018 1014
1019static int __init pat_memtype_list_init(void) 1015static int __init pat_memtype_list_init(void)
1020{ 1016{
1021 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, 1017 if (pat_enabled) {
1022 NULL, &memtype_fops); 1018 debugfs_create_file("pat_memtype_list", S_IRUSR,
1019 arch_debugfs_dir, NULL, &memtype_fops);
1020 }
1023 return 0; 1021 return 0;
1024} 1022}
1025 1023
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h>
6 7
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata; 8static int disable_nx __cpuinitdata;
11 9
12/* 10/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
22 if (!str) 20 if (!str)
23 return -EINVAL; 21 return -EINVAL;
24 if (!strncmp(str, "on", 2)) { 22 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0; 23 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) { 24 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1; 25 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 } 26 }
27 x86_configure_nx();
31 return 0; 28 return 0;
32} 29}
33early_param("noexec", noexec_setup); 30early_param("noexec", noexec_setup);
34#endif
35 31
36#ifdef CONFIG_X86_PAE 32void __cpuinit x86_configure_nx(void)
37void __init set_nx(void)
38{ 33{
39 unsigned int v[4], l, h; 34 if (cpu_has_nx && !disable_nx)
40 35 __supported_pte_mask |= _PAGE_NX;
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 36 else
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 37 __supported_pte_mask &= ~_PAGE_NX;
38}
43 39
44 if ((v[3] & (1 << 20)) && !disable_nx) { 40void __init x86_report_nx(void)
45 rdmsr(MSR_EFER, l, h); 41{
46 l |= EFER_NX; 42 if (!cpu_has_nx) {
47 wrmsr(MSR_EFER, l, h); 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
48 nx_enabled = 1; 44 "missing in CPU or disabled in BIOS!\n");
49 __supported_pte_mask |= _PAGE_NX; 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) {
48 printk(KERN_INFO "NX (Execute Disable) protection: "
49 "disabled by kernel command line option\n");
50 } else {
51 printk(KERN_INFO "NX (Execute Disable) protection: "
52 "active\n");
50 } 53 }
51 }
52}
53#else 54#else
54void set_nx(void) 55 /* 32bit non-PAE kernel, NX cannot be used */
55{ 56 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
56} 57 "cannot be enabled: non-PAE kernel!\n");
57#endif 58#endif
58 59 }
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67} 60}
68#endif
69
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381f7b3b..d89075489664 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
136 apicid_to_node[apic_id] = node; 136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed); 137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1; 138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node); 140 pxm, apic_id, node);
141} 141}
142 142
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
170 apicid_to_node[apic_id] = node; 170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed); 171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1; 172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node); 174 pxm, apic_id, node);
175} 175}
176 176
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 290
291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
292 start, end); 292 start, end);
293 e820_register_active_regions(node, start >> PAGE_SHIFT,
294 end >> PAGE_SHIFT);
295 293
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end); 295 update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
338 336
339void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
340 338
339int __init acpi_get_nodes(struct bootnode *physnodes)
340{
341 int i;
342 int ret = 0;
343
344 for_each_node_mask(i, nodes_parsed) {
345 physnodes[ret].start = nodes[i].start;
346 physnodes[ret].end = nodes[i].end;
347 ret++;
348 }
349 return ret;
350}
351
341/* Use the information discovered above to actually set up the nodes. */ 352/* Use the information discovered above to actually set up the nodes. */
342int __init acpi_scan_nodes(unsigned long start, unsigned long end) 353int __init acpi_scan_nodes(unsigned long start, unsigned long end)
343{ 354{
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
350 for (i = 0; i < MAX_NUMNODES; i++) 361 for (i = 0; i < MAX_NUMNODES; i++)
351 cutoff_node(i, start, end); 362 cutoff_node(i, start, end);
352 363
353 if (!nodes_cover_memory(nodes)) {
354 bad_srat();
355 return -1;
356 }
357
358 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 364 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
359 memblk_nodeid); 365 memblk_nodeid);
360 if (memnode_shift < 0) { 366 if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
364 return -1; 370 return -1;
365 } 371 }
366 372
373 for_each_node_mask(i, nodes_parsed)
374 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
375 nodes[i].end >> PAGE_SHIFT);
376 if (!nodes_cover_memory(nodes)) {
377 bad_srat();
378 return -1;
379 }
380
367 /* Account for nodes with cpus and no memory */ 381 /* Account for nodes with cpus and no memory */
368 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 382 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
369 383
@@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
454 for (i = 0; i < num_nodes; i++) 468 for (i = 0; i < num_nodes; i++)
455 if (fake_nodes[i].start != fake_nodes[i].end) 469 if (fake_nodes[i].start != fake_nodes[i].end)
456 node_set(i, nodes_parsed); 470 node_set(i, nodes_parsed);
457 WARN_ON(!nodes_cover_memory(fake_nodes));
458} 471}
459 472
460static int null_slit_node_compare(int a, int b) 473static int null_slit_node_compare(int a, int b)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4
5#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
6
4#include <linux/module.h> 7#include <linux/module.h>
5#include <linux/io.h> 8#include <linux/io.h>
6#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
7 10
8#define MODULE_NAME "testmmiotrace"
9
10static unsigned long mmio_address; 11static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 12module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " 13MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
30static void do_write_test(void __iomem *p) 31static void do_write_test(void __iomem *p)
31{ 32{
32 unsigned int i; 33 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n"); 34 pr_info("write test.\n");
34 mmiotrace_printk("Write test.\n"); 35 mmiotrace_printk("Write test.\n");
35 36
36 for (i = 0; i < 256; i++) 37 for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
47{ 48{
48 unsigned int i; 49 unsigned int i;
49 unsigned errs[3] = { 0 }; 50 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n"); 51 pr_info("read test.\n");
51 mmiotrace_printk("Read test.\n"); 52 mmiotrace_printk("Read test.\n");
52 53
53 for (i = 0; i < 256; i++) 54 for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
68 69
69static void do_read_far_test(void __iomem *p) 70static void do_read_far_test(void __iomem *p)
70{ 71{
71 pr_info(MODULE_NAME ": read far test.\n"); 72 pr_info("read far test.\n");
72 mmiotrace_printk("Read far test.\n"); 73 mmiotrace_printk("Read far test.\n");
73 74
74 ioread32(p + read_far); 75 ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
78{ 79{
79 void __iomem *p = ioremap_nocache(mmio_address, size); 80 void __iomem *p = ioremap_nocache(mmio_address, size);
80 if (!p) { 81 if (!p) {
81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 82 pr_err("could not ioremap, aborting.\n");
82 return; 83 return;
83 } 84 }
84 mmiotrace_printk("ioremap returned %p.\n", p); 85 mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95 96
96 if (mmio_address == 0) { 97 if (mmio_address == 0) {
97 pr_err(MODULE_NAME ": you have to use the module argument " 98 pr_err("you have to use the module argument mmio_address.\n");
98 "mmio_address.\n"); 99 pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
99 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
100 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
101 return -ENXIO; 100 return -ENXIO;
102 } 101 }
103 102
104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " 103 pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
105 "address space, and writing 16 kB of rubbish in there.\n", 104 "and writing 16 kB of rubbish in there.\n",
106 size >> 10, mmio_address); 105 size >> 10, mmio_address);
107 do_test(size); 106 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n"); 107 pr_info("All done.\n");
109 return 0; 108 return 0;
110} 109}
111 110
112static void __exit cleanup(void) 111static void __exit cleanup(void)
113{ 112{
114 pr_debug(MODULE_NAME ": unloaded.\n"); 113 pr_debug("unloaded.\n");
115} 114}
116 115
117module_init(init); 116module_init(init);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..65b58e4b0b8b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
8 8
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
11#include <asm/cache.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/uv/uv.h> 13#include <asm/uv/uv.h>
13 14
@@ -43,7 +44,7 @@ union smp_flush_state {
43 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 }; 46 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp; 48} ____cacheline_internodealigned_in_smp;
48 49
49/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f17667e..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
18#include <asm/mce.h> 18#include <asm/mce.h>
19#include <asm/xcr.h> 19#include <asm/xcr.h>
20#include <asm/suspend.h> 20#include <asm/suspend.h>
21#include <asm/debugreg.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23static struct saved_context saved_context; 24static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
142#endif 143#endif
143 load_TR_desc(); /* This does ltr */ 144 load_TR_desc(); /* This does ltr */
144 load_LDT(&current->active_mm->context); /* This does lldt */ 145 load_LDT(&current->active_mm->context); /* This does lldt */
145
146 /*
147 * Now maybe reload the debug registers
148 */
149 if (current->thread.debugreg7) {
150#ifdef CONFIG_X86_32
151 set_debugreg(current->thread.debugreg0, 0);
152 set_debugreg(current->thread.debugreg1, 1);
153 set_debugreg(current->thread.debugreg2, 2);
154 set_debugreg(current->thread.debugreg3, 3);
155 /* no 4 and 5 */
156 set_debugreg(current->thread.debugreg6, 6);
157 set_debugreg(current->thread.debugreg7, 7);
158#else
159 /* CONFIG_X86_64 */
160 loaddebug(&current->thread, 0);
161 loaddebug(&current->thread, 1);
162 loaddebug(&current->thread, 2);
163 loaddebug(&current->thread, 3);
164 /* no 4 and 5 */
165 loaddebug(&current->thread, 6);
166 loaddebug(&current->thread, 7);
167#endif
168 }
169
170} 146}
171 147
172/** 148/**
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 000000000000..f82082677337
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,31 @@
1PHONY += posttest
2
3ifeq ($(KBUILD_VERBOSE),1)
4 posttest_verbose = -v
5else
6 posttest_verbose =
7endif
8
9ifeq ($(CONFIG_64BIT),y)
10 posttest_64bit = -y
11else
12 posttest_64bit = -n
13endif
14
15distill_awk = $(srctree)/arch/x86/tools/distill.awk
16chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
17
18quiet_cmd_posttest = TEST $@
19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
20
21posttest: $(obj)/test_get_len vmlinux
22 $(call cmd,posttest)
23
24hostprogs-y := test_get_len
25
26# -I needed for generated C source and C source which in the kernel tree.
27HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
28
29# Dependencies are also needed.
30$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
31
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 000000000000..0d13cd9fdcff
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,23 @@
1# GNU objdump version checker
2#
3# Usage:
4# objdump -v | awk -f chkobjdump.awk
5BEGIN {
6 # objdump version 2.19 or later is OK for the test.
7 od_ver = 2;
8 od_sver = 19;
9}
10
11/^GNU/ {
12 split($4, ver, ".");
13 if (ver[1] > od_ver ||
14 (ver[1] == od_ver && ver[2] >= od_sver)) {
15 exit 1;
16 } else {
17 printf("Warning: objdump version %s is older than %d.%d\n",
18 $4, od_ver, od_sver);
19 print("Warning: Skipping posttest.");
20 # Logic is inverted, because we just skip test without error.
21 exit 0;
22 }
23}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644
index 000000000000..c13c0ee48ab4
--- /dev/null
+++ b/arch/x86/tools/distill.awk
@@ -0,0 +1,47 @@
1#!/bin/awk -f
2# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
3# Distills the disassembly as follows:
4# - Removes all lines except the disassembled instructions.
5# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
6# into a single line.
7# - Remove bad(or prefix only) instructions
8
9BEGIN {
10 prev_addr = ""
11 prev_hex = ""
12 prev_mnemonic = ""
13 bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
14 fwait_expr = "^9b "
15 fwait_str="9b\tfwait"
16}
17
18/^ *[0-9a-f]+ <[^>]*>:/ {
19 # Symbol entry
20 printf("%s%s\n", $2, $1)
21}
22
23/^ *[0-9a-f]+:/ {
24 if (split($0, field, "\t") < 3) {
25 # This is a continuation of the same insn.
26 prev_hex = prev_hex field[2]
27 } else {
28 # Skip bad instructions
29 if (match(prev_mnemonic, bad_expr))
30 prev_addr = ""
31 # Split fwait from other f* instructions
32 if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
33 printf "%s\t%s\n", prev_addr, fwait_str
34 sub(fwait_expr, "", prev_hex)
35 }
36 if (prev_addr != "")
37 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
38 prev_addr = field[1]
39 prev_hex = field[2]
40 prev_mnemonic = field[3]
41 }
42}
43
44END {
45 if (prev_addr != "")
46 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
47}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 000000000000..e34e92a28eb6
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,380 @@
1#!/bin/awk -f
2# gen-insn-attr-x86.awk: Instruction attribute table generator
3# Written by Masami Hiramatsu <mhiramat@redhat.com>
4#
5# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
6
7# Awk implementation sanity check
8function check_awk_implement() {
9 if (!match("abc", "[[:lower:]]+"))
10 return "Your awk doesn't support charactor-class."
11 if (sprintf("%x", 0) != "0")
12 return "Your awk has a printf-format problem."
13 return ""
14}
15
16# Clear working vars
17function clear_vars() {
18 delete table
19 delete lptable2
20 delete lptable1
21 delete lptable3
22 eid = -1 # escape id
23 gid = -1 # group id
24 aid = -1 # AVX id
25 tname = ""
26}
27
28BEGIN {
29 # Implementation error checking
30 awkchecked = check_awk_implement()
31 if (awkchecked != "") {
32 print "Error: " awkchecked > "/dev/stderr"
33 print "Please try to use gawk." > "/dev/stderr"
34 exit 1
35 }
36
37 # Setup generating tables
38 print "/* x86 opcode map generated from x86-opcode-map.txt */"
39 print "/* Do not change this code. */\n"
40 ggid = 1
41 geid = 1
42 gaid = 0
43 delete etable
44 delete gtable
45 delete atable
46
47 opnd_expr = "^[[:alpha:]/]"
48 ext_expr = "^\\("
49 sep_expr = "^\\|$"
50 group_expr = "^Grp[[:alnum:]]+"
51
52 imm_expr = "^[IJAO][[:lower:]]"
53 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
54 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
55 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
56 imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
57 imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
58 imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
59 imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
60 imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
61 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
62 imm_flag["Ob"] = "INAT_MOFFSET"
63 imm_flag["Ov"] = "INAT_MOFFSET"
64
65 modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])"
66 force64_expr = "\\([df]64\\)"
67 rex_expr = "^REX(\\.[XRWB]+)*"
68 fpu_expr = "^ESC" # TODO
69
70 lprefix1_expr = "\\(66\\)"
71 lprefix2_expr = "\\(F3\\)"
72 lprefix3_expr = "\\(F2\\)"
73 max_lprefix = 4
74
75 vexok_expr = "\\(VEX\\)"
76 vexonly_expr = "\\(oVEX\\)"
77
78 prefix_expr = "\\(Prefix\\)"
79 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
80 prefix_num["REPNE"] = "INAT_PFX_REPNE"
81 prefix_num["REP/REPE"] = "INAT_PFX_REPE"
82 prefix_num["LOCK"] = "INAT_PFX_LOCK"
83 prefix_num["SEG=CS"] = "INAT_PFX_CS"
84 prefix_num["SEG=DS"] = "INAT_PFX_DS"
85 prefix_num["SEG=ES"] = "INAT_PFX_ES"
86 prefix_num["SEG=FS"] = "INAT_PFX_FS"
87 prefix_num["SEG=GS"] = "INAT_PFX_GS"
88 prefix_num["SEG=SS"] = "INAT_PFX_SS"
89 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
90 prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
91 prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
92
93 clear_vars()
94}
95
96function semantic_error(msg) {
97 print "Semantic error at " NR ": " msg > "/dev/stderr"
98 exit 1
99}
100
101function debug(msg) {
102 print "DEBUG: " msg
103}
104
105function array_size(arr, i,c) {
106 c = 0
107 for (i in arr)
108 c++
109 return c
110}
111
112/^Table:/ {
113 print "/* " $0 " */"
114 if (tname != "")
115 semantic_error("Hit Table: before EndTable:.");
116}
117
118/^Referrer:/ {
119 if (NF != 1) {
120 # escape opcode table
121 ref = ""
122 for (i = 2; i <= NF; i++)
123 ref = ref $i
124 eid = escape[ref]
125 tname = sprintf("inat_escape_table_%d", eid)
126 }
127}
128
129/^AVXcode:/ {
130 if (NF != 1) {
131 # AVX/escape opcode table
132 aid = $2
133 if (gaid <= aid)
134 gaid = aid + 1
135 if (tname == "") # AVX only opcode table
136 tname = sprintf("inat_avx_table_%d", $2)
137 }
138 if (aid == -1 && eid == -1) # primary opcode table
139 tname = "inat_primary_table"
140}
141
142/^GrpTable:/ {
143 print "/* " $0 " */"
144 if (!($2 in group))
145 semantic_error("No group: " $2 )
146 gid = group[$2]
147 tname = "inat_group_table_" gid
148}
149
150function print_table(tbl,name,fmt,n)
151{
152 print "const insn_attr_t " name " = {"
153 for (i = 0; i < n; i++) {
154 id = sprintf(fmt, i)
155 if (tbl[id])
156 print " [" id "] = " tbl[id] ","
157 }
158 print "};"
159}
160
161/^EndTable/ {
162 if (gid != -1) {
163 # print group tables
164 if (array_size(table) != 0) {
165 print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
166 "0x%x", 8)
167 gtable[gid,0] = tname
168 }
169 if (array_size(lptable1) != 0) {
170 print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
171 "0x%x", 8)
172 gtable[gid,1] = tname "_1"
173 }
174 if (array_size(lptable2) != 0) {
175 print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
176 "0x%x", 8)
177 gtable[gid,2] = tname "_2"
178 }
179 if (array_size(lptable3) != 0) {
180 print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
181 "0x%x", 8)
182 gtable[gid,3] = tname "_3"
183 }
184 } else {
185 # print primary/escaped tables
186 if (array_size(table) != 0) {
187 print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
188 "0x%02x", 256)
189 etable[eid,0] = tname
190 if (aid >= 0)
191 atable[aid,0] = tname
192 }
193 if (array_size(lptable1) != 0) {
194 print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
195 "0x%02x", 256)
196 etable[eid,1] = tname "_1"
197 if (aid >= 0)
198 atable[aid,1] = tname "_1"
199 }
200 if (array_size(lptable2) != 0) {
201 print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
202 "0x%02x", 256)
203 etable[eid,2] = tname "_2"
204 if (aid >= 0)
205 atable[aid,2] = tname "_2"
206 }
207 if (array_size(lptable3) != 0) {
208 print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
209 "0x%02x", 256)
210 etable[eid,3] = tname "_3"
211 if (aid >= 0)
212 atable[aid,3] = tname "_3"
213 }
214 }
215 print ""
216 clear_vars()
217}
218
219function add_flags(old,new) {
220 if (old && new)
221 return old " | " new
222 else if (old)
223 return old
224 else
225 return new
226}
227
228# convert operands to flags.
229function convert_operands(opnd, i,imm,mod)
230{
231 imm = null
232 mod = null
233 for (i in opnd) {
234 i = opnd[i]
235 if (match(i, imm_expr) == 1) {
236 if (!imm_flag[i])
237 semantic_error("Unknown imm opnd: " i)
238 if (imm) {
239 if (i != "Ib")
240 semantic_error("Second IMM error")
241 imm = add_flags(imm, "INAT_SCNDIMM")
242 } else
243 imm = imm_flag[i]
244 } else if (match(i, modrm_expr))
245 mod = "INAT_MODRM"
246 }
247 return add_flags(imm, mod)
248}
249
250/^[0-9a-f]+\:/ {
251 if (NR == 1)
252 next
253 # get index
254 idx = "0x" substr($1, 1, index($1,":") - 1)
255 if (idx in table)
256 semantic_error("Redefine " idx " in " tname)
257
258 # check if escaped opcode
259 if ("escape" == $2) {
260 if ($3 != "#")
261 semantic_error("No escaped name")
262 ref = ""
263 for (i = 4; i <= NF; i++)
264 ref = ref $i
265 if (ref in escape)
266 semantic_error("Redefine escape (" ref ")")
267 escape[ref] = geid
268 geid++
269 table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
270 next
271 }
272
273 variant = null
274 # converts
275 i = 2
276 while (i <= NF) {
277 opcode = $(i++)
278 delete opnds
279 ext = null
280 flags = null
281 opnd = null
282 # parse one opcode
283 if (match($i, opnd_expr)) {
284 opnd = $i
285 split($(i++), opnds, ",")
286 flags = convert_operands(opnds)
287 }
288 if (match($i, ext_expr))
289 ext = $(i++)
290 if (match($i, sep_expr))
291 i++
292 else if (i < NF)
293 semantic_error($i " is not a separator")
294
295 # check if group opcode
296 if (match(opcode, group_expr)) {
297 if (!(opcode in group)) {
298 group[opcode] = ggid
299 ggid++
300 }
301 flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
302 }
303 # check force(or default) 64bit
304 if (match(ext, force64_expr))
305 flags = add_flags(flags, "INAT_FORCE64")
306
307 # check REX prefix
308 if (match(opcode, rex_expr))
309 flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
310
311 # check coprocessor escape : TODO
312 if (match(opcode, fpu_expr))
313 flags = add_flags(flags, "INAT_MODRM")
314
315 # check VEX only code
316 if (match(ext, vexonly_expr))
317 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
318
319 # check VEX only code
320 if (match(ext, vexok_expr))
321 flags = add_flags(flags, "INAT_VEXOK")
322
323 # check prefixes
324 if (match(ext, prefix_expr)) {
325 if (!prefix_num[opcode])
326 semantic_error("Unknown prefix: " opcode)
327 flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
328 }
329 if (length(flags) == 0)
330 continue
331 # check if last prefix
332 if (match(ext, lprefix1_expr)) {
333 lptable1[idx] = add_flags(lptable1[idx],flags)
334 variant = "INAT_VARIANT"
335 } else if (match(ext, lprefix2_expr)) {
336 lptable2[idx] = add_flags(lptable2[idx],flags)
337 variant = "INAT_VARIANT"
338 } else if (match(ext, lprefix3_expr)) {
339 lptable3[idx] = add_flags(lptable3[idx],flags)
340 variant = "INAT_VARIANT"
341 } else {
342 table[idx] = add_flags(table[idx],flags)
343 }
344 }
345 if (variant)
346 table[idx] = add_flags(table[idx],variant)
347}
348
349END {
350 if (awkchecked != "")
351 exit 1
352 # print escape opcode map's array
353 print "/* Escape opcode map array */"
354 print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
355 "[INAT_LSTPFX_MAX + 1] = {"
356 for (i = 0; i < geid; i++)
357 for (j = 0; j < max_lprefix; j++)
358 if (etable[i,j])
359 print " ["i"]["j"] = "etable[i,j]","
360 print "};\n"
361 # print group opcode map's array
362 print "/* Group opcode map array */"
363 print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
364 "[INAT_LSTPFX_MAX + 1] = {"
365 for (i = 0; i < ggid; i++)
366 for (j = 0; j < max_lprefix; j++)
367 if (gtable[i,j])
368 print " ["i"]["j"] = "gtable[i,j]","
369 print "};\n"
370 # print AVX opcode map's array
371 print "/* AVX opcode map array */"
372 print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
373 "[INAT_LSTPFX_MAX + 1] = {"
374 for (i = 0; i < gaid; i++)
375 for (j = 0; j < max_lprefix; j++)
376 if (atable[i,j])
377 print " ["i"]["j"] = "atable[i,j]","
378 print "};"
379}
380
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644
index 000000000000..d8214dc03fa7
--- /dev/null
+++ b/arch/x86/tools/test_get_len.c
@@ -0,0 +1,173 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2009
17 */
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include <assert.h>
23#include <unistd.h>
24
25#define unlikely(cond) (cond)
26
27#include <asm/insn.h>
28#include <inat.c>
29#include <insn.c>
30
31/*
32 * Test of instruction analysis in general and insn_get_length() in
33 * particular. See if insn_get_length() and the disassembler agree
34 * on the length of each instruction in an elf disassembly.
35 *
36 * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
37 */
38
39const char *prog;
40static int verbose;
41static int x86_64;
42
43static void usage(void)
44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v] \n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n");
50 exit(1);
51}
52
53static void malformed_line(const char *line, int line_nr)
54{
55 fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
56 exit(3);
57}
58
59static void dump_field(FILE *fp, const char *name, const char *indent,
60 struct insn_field *field)
61{
62 fprintf(fp, "%s.%s = {\n", indent, name);
63 fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
64 indent, field->value, field->bytes[0], field->bytes[1],
65 field->bytes[2], field->bytes[3]);
66 fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
67 field->got, field->nbytes);
68}
69
70static void dump_insn(FILE *fp, struct insn *insn)
71{
72 fprintf(fp, "Instruction = { \n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
76 dump_field(fp, "opcode", "\t", &insn->opcode);
77 dump_field(fp, "modrm", "\t", &insn->modrm);
78 dump_field(fp, "sib", "\t", &insn->sib);
79 dump_field(fp, "displacement", "\t", &insn->displacement);
80 dump_field(fp, "immediate1", "\t", &insn->immediate1);
81 dump_field(fp, "immediate2", "\t", &insn->immediate2);
82 fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
83 insn->attr, insn->opnd_bytes, insn->addr_bytes);
84 fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
85 insn->length, insn->x86_64, insn->kaddr);
86}
87
88static void parse_args(int argc, char **argv)
89{
90 int c;
91 prog = argv[0];
92 while ((c = getopt(argc, argv, "ynv")) != -1) {
93 switch (c) {
94 case 'y':
95 x86_64 = 1;
96 break;
97 case 'n':
98 x86_64 = 0;
99 break;
100 case 'v':
101 verbose = 1;
102 break;
103 default:
104 usage();
105 }
106 }
107}
108
109#define BUFSIZE 256
110
111int main(int argc, char **argv)
112{
113 char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
114 unsigned char insn_buf[16];
115 struct insn insn;
116 int insns = 0, c;
117 int warnings = 0;
118
119 parse_args(argc, argv);
120
121 while (fgets(line, BUFSIZE, stdin)) {
122 char copy[BUFSIZE], *s, *tab1, *tab2;
123 int nb = 0;
124 unsigned int b;
125
126 if (line[0] == '<') {
127 /* Symbol line */
128 strcpy(sym, line);
129 continue;
130 }
131
132 insns++;
133 memset(insn_buf, 0, 16);
134 strcpy(copy, line);
135 tab1 = strchr(copy, '\t');
136 if (!tab1)
137 malformed_line(line, insns);
138 s = tab1 + 1;
139 s += strspn(s, " ");
140 tab2 = strchr(s, '\t');
141 if (!tab2)
142 malformed_line(line, insns);
143 *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
144 while (s < tab2) {
145 if (sscanf(s, "%x", &b) == 1) {
146 insn_buf[nb++] = (unsigned char) b;
147 s += 3;
148 } else
149 break;
150 }
151 /* Decode an instruction */
152 insn_init(&insn, insn_buf, x86_64);
153 insn_get_length(&insn);
154 if (insn.length != nb) {
155 warnings++;
156 fprintf(stderr, "Warning: %s found difference at %s\n",
157 prog, sym);
158 fprintf(stderr, "Warning: %s", line);
159 fprintf(stderr, "Warning: objdump says %d bytes, but "
160 "insn_get_length() says %d\n", nb,
161 insn.length);
162 if (verbose)
163 dump_insn(stderr, &insn);
164 }
165 }
166 if (warnings)
167 fprintf(stderr, "Warning: decoded and checked %d"
168 " instructions with %d warnings\n", insns, warnings);
169 else
170 fprintf(stderr, "Succeed: decoded and checked %d"
171 " instructions\n", insns);
172 return 0;
173}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
393 393
394static ctl_table abi_root_table2[] = { 394static ctl_table abi_root_table2[] = {
395 { 395 {
396 .ctl_name = CTL_ABI,
397 .procname = "abi", 396 .procname = "abi",
398 .mode = 0555, 397 .mode = 0555,
399 .child = abi_table2 398 .child = abi_table2
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 23a4d80fb39e..c462cea8ef09 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -178,6 +178,7 @@ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
178static void xen_cpuid(unsigned int *ax, unsigned int *bx, 178static void xen_cpuid(unsigned int *ax, unsigned int *bx,
179 unsigned int *cx, unsigned int *dx) 179 unsigned int *cx, unsigned int *dx)
180{ 180{
181 unsigned maskebx = ~0;
181 unsigned maskecx = ~0; 182 unsigned maskecx = ~0;
182 unsigned maskedx = ~0; 183 unsigned maskedx = ~0;
183 184
@@ -185,9 +186,16 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
185 * Mask out inconvenient features, to try and disable as many 186 * Mask out inconvenient features, to try and disable as many
186 * unsupported kernel subsystems as possible. 187 * unsupported kernel subsystems as possible.
187 */ 188 */
188 if (*ax == 1) { 189 switch (*ax) {
190 case 1:
189 maskecx = cpuid_leaf1_ecx_mask; 191 maskecx = cpuid_leaf1_ecx_mask;
190 maskedx = cpuid_leaf1_edx_mask; 192 maskedx = cpuid_leaf1_edx_mask;
193 break;
194
195 case 0xb:
196 /* Suppress extended topology stuff */
197 maskebx = 0;
198 break;
191 } 199 }
192 200
193 asm(XEN_EMULATE_PREFIX "cpuid" 201 asm(XEN_EMULATE_PREFIX "cpuid"
@@ -197,6 +205,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
197 "=d" (*dx) 205 "=d" (*dx)
198 : "0" (*ax), "2" (*cx)); 206 : "0" (*ax), "2" (*cx));
199 207
208 *bx &= maskebx;
200 *cx &= maskecx; 209 *cx &= maskecx;
201 *dx &= maskedx; 210 *dx &= maskedx;
202} 211}
@@ -1084,10 +1093,8 @@ asmlinkage void __init xen_start_kernel(void)
1084 1093
1085 __supported_pte_mask |= _PAGE_IOMAP; 1094 __supported_pte_mask |= _PAGE_IOMAP;
1086 1095
1087#ifdef CONFIG_X86_64
1088 /* Work out if we support NX */ 1096 /* Work out if we support NX */
1089 check_efer(); 1097 x86_configure_nx();
1090#endif
1091 1098
1092 xen_setup_features(); 1099 xen_setup_features();
1093 1100