about | summary | refs | log | tree | commit | diff | stats
path: root/arch/x86
diff options
context:
space:
mode:
author: Ingo Molnar <mingo@elte.hu> 2009-06-11 17:31:52 -0400
committer: Ingo Molnar <mingo@elte.hu> 2009-06-11 17:31:52 -0400
commit: 0d5959723e1db3fd7323c198a50c16cecf96c7a9 (patch)
tree: 802b623fff261ebcbbddadf84af5524398364a18 /arch/x86
parent: 62fdac5913f71f8f200bd2c9bd59a02e9a1498e9 (diff)
parent: 512626a04e72aca60effe111fa0333ed0b195d21 (diff)
Merge branch 'linus' into x86/mce3
Conflicts:
    arch/x86/kernel/cpu/mcheck/mce_64.c
    arch/x86/kernel/irq.c

Merge reason: Resolve the conflicts above.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild16
-rw-r--r--arch/x86/Kconfig42
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile19
-rw-r--r--arch/x86/boot/.gitignore2
-rw-r--r--arch/x86/boot/Makefile29
-rw-r--r--arch/x86/boot/a20.c9
-rw-r--r--arch/x86/boot/apm.c76
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/boot.h48
-rw-r--r--arch/x86/boot/compressed/.gitignore3
-rw-r--r--arch/x86/boot/compressed/Makefile54
-rw-r--r--arch/x86/boot/compressed/head_32.S194
-rw-r--r--arch/x86/boot/compressed/head_64.S169
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c97
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S (renamed from arch/x86/boot/compressed/vmlinux_64.lds)29
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/edd.c71
-rw-r--r--arch/x86/boot/header.S30
-rw-r--r--arch/x86/boot/main.c39
-rw-r--r--arch/x86/boot/mca.c27
-rw-r--r--arch/x86/boot/memory.c79
-rw-r--r--arch/x86/boot/regs.c29
-rw-r--r--arch/x86/boot/setup.ld6
-rw-r--r--arch/x86/boot/tty.c52
-rw-r--r--arch/x86/boot/video-bios.c27
-rw-r--r--arch/x86/boot/video-vesa.c137
-rw-r--r--arch/x86/boot/video-vga.c95
-rw-r--r--arch/x86/boot/video.c42
-rw-r--r--arch/x86/boot/video.h14
-rw-r--r--arch/x86/configs/i386_defconfig148
-rw-r--r--arch/x86/configs/x86_64_defconfig150
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/alternative.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h55
-rw-r--r--arch/x86/include/asm/apic.h5
-rw-r--r--arch/x86/include/asm/atomic_32.h236
-rw-r--r--arch/x86/include/asm/boot.h15
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h3
-rw-r--r--arch/x86/include/asm/ds.h82
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h6
-rw-r--r--arch/x86/include/asm/i387.h43
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/irq_vectors.h8
-rw-r--r--arch/x86/include/asm/k8.h13
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h6
-rw-r--r--arch/x86/include/asm/microcode.h25
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/msr.h23
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/numa_64.h10
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h22
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/perf_counter.h100
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h44
-rw-r--r--arch/x86/include/asm/ptrace.h9
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/syscalls.h45
-rw-r--r--arch/x86/include/asm/termios.h1
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h8
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/traps.h5
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h6
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c23
-rw-r--r--arch/x86/kernel/apic/io_apic.c9
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c15
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c417
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c15
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1704
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/ds.c921
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S29
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irqinit.c1
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c329
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_32.c20
-rw-r--r--arch/x86/kernel/process_64.c20
-rw-r--r--arch/x86/kernel/ptrace.c284
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/setup.c22
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/smp.c3
-rw-r--r--arch/x86/kernel/smpboot.c8
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c17
-rw-r--r--arch/x86/kernel/traps.c17
-rw-r--r--arch/x86/kernel/tsc.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S430
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kvm/Kconfig6
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/i8254.c109
-rw-r--r--arch/x86/kvm/i8254.h12
-rw-r--r--arch/x86/kvm/irq.c7
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c251
-rw-r--r--arch/x86/kvm/lapic.h12
-rw-r--r--arch/x86/kvm/mmu.c194
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h16
-rw-r--r--arch/x86/kvm/svm.c415
-rw-r--r--arch/x86/kvm/timer.c46
-rw-r--r--arch/x86/kvm/vmx.c721
-rw-r--r--arch/x86/kvm/x86.c409
-rw-r--r--arch/x86/kvm/x86.h14
-rw-r--r--arch/x86/kvm/x86_emulate.c141
-rw-r--r--arch/x86/lguest/Makefile1
-rw-r--r--arch/x86/lguest/boot.c33
-rw-r--r--arch/x86/lib/Makefile2
-rw-r--r--arch/x86/lib/msr-on-cpu.c97
-rw-r--r--arch/x86/lib/msr.c183
-rw-r--r--arch/x86/mm/dump_pagetables.c7
-rw-r--r--arch/x86/mm/fault.c69
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/init.c78
-rw-r--r--arch/x86/mm/init_32.c61
-rw-r--r--arch/x86/mm/init_64.c47
-rw-r--r--arch/x86/mm/iomap_32.c1
-rw-r--r--arch/x86/mm/kmmio.c104
-rw-r--r--arch/x86/mm/memtest.c17
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa_64.c33
-rw-r--r--arch/x86/mm/pageattr.c14
-rw-r--r--arch/x86/mm/srat_64.c98
-rw-r--r--arch/x86/oprofile/nmi_int.c34
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/pci/mmconfig-shared.c6
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c8
-rw-r--r--arch/x86/xen/enlighten.c65
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/xen-ops.h1
205 files changed, 8485 insertions, 4741 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 000000000000..ad8ec356fb36
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
1
2obj-$(CONFIG_KVM) += kvm/
3
4# Xen paravirtualization support
5obj-$(CONFIG_XEN) += xen/
6
7# lguest paravirtualization support
8obj-$(CONFIG_LGUEST_GUEST) += lguest/
9
10obj-y += kernel/
11obj-y += mm/
12
13obj-y += crypto/
14obj-y += vdso/
15obj-$(CONFIG_IA32_EMULATION) += ia32/
16
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index afd1168eeefb..356d2ec8e2fb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
47 select HAVE_KERNEL_BZIP2 47 select HAVE_KERNEL_BZIP2
48 select HAVE_KERNEL_LZMA 48 select HAVE_KERNEL_LZMA
49 49
50config OUTPUT_FORMAT
51 string
52 default "elf32-i386" if X86_32
53 default "elf64-x86-64" if X86_64
54
50config ARCH_DEFCONFIG 55config ARCH_DEFCONFIG
51 string 56 string
52 default "arch/x86/configs/i386_defconfig" if X86_32 57 default "arch/x86/configs/i386_defconfig" if X86_32
@@ -734,6 +739,7 @@ config X86_UP_IOAPIC
734config X86_LOCAL_APIC 739config X86_LOCAL_APIC
735 def_bool y 740 def_bool y
736 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 741 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
742 select HAVE_PERF_COUNTERS if (!M386 && !M486)
737 743
738config X86_IO_APIC 744config X86_IO_APIC
739 def_bool y 745 def_bool y
@@ -1497,9 +1503,7 @@ config KEXEC_JUMP
1497 1503
1498config PHYSICAL_START 1504config PHYSICAL_START
1499 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) 1505 hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
1500 default "0x1000000" if X86_NUMAQ 1506 default "0x1000000"
1501 default "0x200000" if X86_64
1502 default "0x100000"
1503 ---help--- 1507 ---help---
1504 This gives the physical address where the kernel is loaded. 1508 This gives the physical address where the kernel is loaded.
1505 1509
@@ -1518,15 +1522,15 @@ config PHYSICAL_START
1518 to be specifically compiled to run from a specific memory area 1522 to be specifically compiled to run from a specific memory area
1519 (normally a reserved region) and this option comes handy. 1523 (normally a reserved region) and this option comes handy.
1520 1524
1521 So if you are using bzImage for capturing the crash dump, leave 1525 So if you are using bzImage for capturing the crash dump,
1522 the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. 1526 leave the value here unchanged to 0x1000000 and set
1523 Otherwise if you plan to use vmlinux for capturing the crash dump 1527 CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
1524 change this value to start of the reserved region (Typically 16MB 1528 for capturing the crash dump change this value to start of
1525 0x1000000). In other words, it can be set based on the "X" value as 1529 the reserved region. In other words, it can be set based on
1526 specified in the "crashkernel=YM@XM" command line boot parameter 1530 the "X" value as specified in the "crashkernel=YM@XM"
1527 passed to the panic-ed kernel. Typically this parameter is set as 1531 command line boot parameter passed to the panic-ed
1528 crashkernel=64M@16M. Please take a look at 1532 kernel. Please take a look at Documentation/kdump/kdump.txt
1529 Documentation/kdump/kdump.txt for more details about crash dumps. 1533 for more details about crash dumps.
1530 1534
1531 Usage of bzImage for capturing the crash dump is recommended as 1535 Usage of bzImage for capturing the crash dump is recommended as
1532 one does not have to build two kernels. Same kernel can be used 1536 one does not have to build two kernels. Same kernel can be used
@@ -1539,8 +1543,8 @@ config PHYSICAL_START
1539 Don't change this unless you know what you are doing. 1543 Don't change this unless you know what you are doing.
1540 1544
1541config RELOCATABLE 1545config RELOCATABLE
1542 bool "Build a relocatable kernel (EXPERIMENTAL)" 1546 bool "Build a relocatable kernel"
1543 depends on EXPERIMENTAL 1547 default y
1544 ---help--- 1548 ---help---
1545 This builds a kernel image that retains relocation information 1549 This builds a kernel image that retains relocation information
1546 so it can be loaded someplace besides the default 1MB. 1550 so it can be loaded someplace besides the default 1MB.
@@ -1555,12 +1559,16 @@ config RELOCATABLE
1555 it has been loaded at and the compile time physical address 1559 it has been loaded at and the compile time physical address
1556 (CONFIG_PHYSICAL_START) is ignored. 1560 (CONFIG_PHYSICAL_START) is ignored.
1557 1561
1562# Relocation on x86-32 needs some additional build support
1563config X86_NEED_RELOCS
1564 def_bool y
1565 depends on X86_32 && RELOCATABLE
1566
1558config PHYSICAL_ALIGN 1567config PHYSICAL_ALIGN
1559 hex 1568 hex
1560 prompt "Alignment value to which kernel should be aligned" if X86_32 1569 prompt "Alignment value to which kernel should be aligned" if X86_32
1561 default "0x100000" if X86_32 1570 default "0x1000000"
1562 default "0x200000" if X86_64 1571 range 0x2000 0x1000000
1563 range 0x2000 0x400000
1564 ---help--- 1572 ---help---
1565 This value puts the alignment restrictions on physical address 1573 This value puts the alignment restrictions on physical address
1566 where kernel is loaded and run from. Kernel is compiled for an 1574 where kernel is loaded and run from. Kernel is compiled for an
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e73317f..d105f29bb6bb 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,14 +159,30 @@ config IOMMU_DEBUG
159 options. See Documentation/x86_64/boot-options.txt for more 159 options. See Documentation/x86_64/boot-options.txt for more
160 details. 160 details.
161 161
162config IOMMU_STRESS
163 bool "Enable IOMMU stress-test mode"
164 ---help---
165 This option disables various optimizations in IOMMU related
166 code to do real stress testing of the IOMMU code. This option
167 will cause a performance drop and should only be enabled for
168 testing.
169
162config IOMMU_LEAK 170config IOMMU_LEAK
163 bool "IOMMU leak tracing" 171 bool "IOMMU leak tracing"
164 depends on DEBUG_KERNEL 172 depends on IOMMU_DEBUG && DMA_API_DEBUG
165 depends on IOMMU_DEBUG
166 ---help--- 173 ---help---
167 Add a simple leak tracer to the IOMMU code. This is useful when you 174 Add a simple leak tracer to the IOMMU code. This is useful when you
168 are debugging a buggy device driver that leaks IOMMU mappings. 175 are debugging a buggy device driver that leaks IOMMU mappings.
169 176
177config X86_DS_SELFTEST
178 bool "DS selftest"
179 default y
180 depends on DEBUG_KERNEL
181 depends on X86_DS
182 ---help---
183 Perform Debug Store selftests at boot time.
184 If in doubt, say "N".
185
170config HAVE_MMIOTRACE_SUPPORT 186config HAVE_MMIOTRACE_SUPPORT
171 def_bool y 187 def_bool y
172 188
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8c86b72afdc2..edbd0ca62067 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10core-$(CONFIG_KVM) += arch/x86/kvm/
11
12# BITS is used as extension for files which are available in a 32 bit 10# BITS is used as extension for files which are available in a 32 bit
13# and a 64 bit version to simplify shared Makefiles. 11# and a 64 bit version to simplify shared Makefiles.
14# e.g.: obj-y += foo_$(BITS).o 12# e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
118 116
119libs-y += arch/x86/lib/ 117libs-y += arch/x86/lib/
120 118
121# Sub architecture files that needs linking first 119# See arch/x86/Kbuild for content of core part of the kernel
122core-y += $(fcore-y) 120core-y += arch/x86/
123
124# Xen paravirtualization support
125core-$(CONFIG_XEN) += arch/x86/xen/
126
127# lguest paravirtualization support
128core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
129
130core-y += arch/x86/kernel/
131core-y += arch/x86/mm/
132
133core-y += arch/x86/crypto/
134core-y += arch/x86/vdso/
135core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
136 121
137# drivers-y are linked after core-y 122# drivers-y are linked after core-y
138drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ 123drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bdd..851fe936d242 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
3cpustr.h 3cpustr.h
4mkcpustr 4mkcpustr
5offsets.h 5offsets.h
6voffset.h
7zoffset.h
6setup 8setup
7setup.bin 9setup.bin
8setup.elf 10setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505a..8d16ada25048 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf 26targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
27subdir- := compressed 27subdir- := compressed
28 28
29setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o 29setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o 30setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
31setup-y += printf.o string.o tty.o video.o video-mode.o version.o 31setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
32setup-y += version.o
32setup-$(CONFIG_X86_APM_BOOT) += apm.o 33setup-$(CONFIG_X86_APM_BOOT) += apm.o
33 34
34# The link order of the video-*.o modules can matter. In particular, 35# The link order of the video-*.o modules can matter. In particular,
@@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
86 87
87SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) 88SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
88 89
89sed-offsets := -e 's/^00*/0/' \ 90sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
90 -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
91 91
92quiet_cmd_offsets = OFFSETS $@ 92quiet_cmd_voffset = VOFFSET $@
93 cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ 93 cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
94 94
95$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE 95targets += voffset.h
96 $(call if_changed,offsets) 96$(obj)/voffset.h: vmlinux FORCE
97 $(call if_changed,voffset)
98
99sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
100
101quiet_cmd_zoffset = ZOFFSET $@
102 cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
103
104targets += zoffset.h
105$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
106 $(call if_changed,zoffset)
97 107
98targets += offsets.h
99 108
100AFLAGS_header.o += -I$(obj) 109AFLAGS_header.o += -I$(obj)
101$(obj)/header.o: $(obj)/offsets.h 110$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
102 111
103LDFLAGS_setup.elf := -T 112LDFLAGS_setup.elf := -T
104$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE 113$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c2442..64a31a6d751a 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved 4 * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation 5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
6 * 6 *
7 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
8 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
90 90
91static void enable_a20_bios(void) 91static void enable_a20_bios(void)
92{ 92{
93 asm volatile("pushfl; int $0x15; popfl" 93 struct biosregs ireg;
94 : : "a" ((u16)0x2401)); 94
95 initregs(&ireg);
96 ireg.ax = 0x2401;
97 intcall(0x15, &ireg, NULL);
95} 98}
96 99
97static void enable_a20_kbc(void) 100static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f9..ee274834ea8b 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * Original APM BIOS checking by Stephen Rothwell, May 1994 7 * Original APM BIOS checking by Stephen Rothwell, May 1994
7 * (sfr@canb.auug.org.au) 8 * (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
19 20
20int query_apm_bios(void) 21int query_apm_bios(void)
21{ 22{
22 u16 ax, bx, cx, dx, di; 23 struct biosregs ireg, oreg;
23 u32 ebx, esi;
24 u8 err;
25 24
26 /* APM BIOS installation check */ 25 /* APM BIOS installation check */
27 ax = 0x5300; 26 initregs(&ireg);
28 bx = cx = 0; 27 ireg.ah = 0x53;
29 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" 28 intcall(0x15, &ireg, &oreg);
30 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
31 : : "esi", "edi");
32 29
33 if (err) 30 if (oreg.flags & X86_EFLAGS_CF)
34 return -1; /* No APM BIOS */ 31 return -1; /* No APM BIOS */
35 32
36 if (bx != 0x504d) /* "PM" signature */ 33 if (oreg.bx != 0x504d) /* "PM" signature */
37 return -1; 34 return -1;
38 35
39 if (!(cx & 0x02)) /* 32 bits supported? */ 36 if (!(oreg.cx & 0x02)) /* 32 bits supported? */
40 return -1; 37 return -1;
41 38
42 /* Disconnect first, just in case */ 39 /* Disconnect first, just in case */
43 ax = 0x5304; 40 ireg.al = 0x04;
44 bx = 0; 41 intcall(0x15, &ireg, NULL);
45 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
46 : "+a" (ax), "+b" (bx)
47 : : "ecx", "edx", "esi", "edi");
48
49 /* Paranoia */
50 ebx = esi = 0;
51 cx = dx = di = 0;
52 42
53 /* 32-bit connect */ 43 /* 32-bit connect */
54 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" 44 ireg.al = 0x03;
55 : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), 45 intcall(0x15, &ireg, &oreg);
56 "+S" (esi), "+D" (di), "=m" (err) 46
57 : "a" (0x5303)); 47 boot_params.apm_bios_info.cseg = oreg.ax;
58 48 boot_params.apm_bios_info.offset = oreg.ebx;
59 boot_params.apm_bios_info.cseg = ax; 49 boot_params.apm_bios_info.cseg_16 = oreg.cx;
60 boot_params.apm_bios_info.offset = ebx; 50 boot_params.apm_bios_info.dseg = oreg.dx;
61 boot_params.apm_bios_info.cseg_16 = cx; 51 boot_params.apm_bios_info.cseg_len = oreg.si;
62 boot_params.apm_bios_info.dseg = dx; 52 boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
63 boot_params.apm_bios_info.cseg_len = (u16)esi; 53 boot_params.apm_bios_info.dseg_len = oreg.di;
64 boot_params.apm_bios_info.cseg_16_len = esi >> 16; 54
65 boot_params.apm_bios_info.dseg_len = di; 55 if (oreg.flags & X86_EFLAGS_CF)
66
67 if (err)
68 return -1; 56 return -1;
69 57
70 /* Redo the installation check as the 32-bit connect; 58 /* Redo the installation check as the 32-bit connect;
71 some BIOSes return different flags this way... */ 59 some BIOSes return different flags this way... */
72 60
73 ax = 0x5300; 61 ireg.al = 0x00;
74 bx = cx = 0; 62 intcall(0x15, &ireg, &oreg);
75 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
76 : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
77 : : "esi", "edi");
78 63
79 if (err || bx != 0x504d) { 64 if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
80 /* Failure with 32-bit connect, try to disconect and ignore */ 65 /* Failure with 32-bit connect, try to disconect and ignore */
81 ax = 0x5304; 66 ireg.al = 0x04;
82 bx = 0; 67 intcall(0x15, &ireg, NULL);
83 asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
84 : "+a" (ax), "+b" (bx)
85 : : "ecx", "edx", "esi", "edi");
86 return -1; 68 return -1;
87 } 69 }
88 70
89 boot_params.apm_bios_info.version = ax; 71 boot_params.apm_bios_info.version = oreg.ax;
90 boot_params.apm_bios_info.flags = cx; 72 boot_params.apm_bios_info.flags = oreg.cx;
91 return 0; 73 return 0;
92} 74}
93 75
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 000000000000..507793739ea5
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
13 * touching registers they shouldn't be.
14 */
15
16 .code16
17 .text
18 .globl intcall
19 .type intcall, @function
20intcall:
21 /* Self-modify the INT instruction. Ugly, but works. */
22 cmpb %al, 3f
23 je 1f
24 movb %al, 3f
25 jmp 1f /* Synchronize pipeline */
261:
27 /* Save state */
28 pushfl
29 pushw %fs
30 pushw %gs
31 pushal
32
33 /* Copy input state to stack frame */
34 subw $44, %sp
35 movw %dx, %si
36 movw %sp, %di
37 movw $11, %cx
38 rep; movsd
39
40 /* Pop full state from the stack */
41 popal
42 popw %gs
43 popw %fs
44 popw %es
45 popw %ds
46 popfl
47
48 /* Actual INT */
49 .byte 0xcd /* INT opcode */
503: .byte 0
51
52 /* Push full state to the stack */
53 pushfl
54 pushw %ds
55 pushw %es
56 pushw %fs
57 pushw %gs
58 pushal
59
60 /* Re-establish C environment invariants */
61 cld
62 movzwl %sp, %esp
63 movw %cs, %ax
64 movw %ax, %ds
65 movw %ax, %es
66
67 /* Copy output state from stack frame */
68 movw 68(%esp), %di /* Original %cx == 3rd argument */
69 andw %di, %di
70 jz 4f
71 movw %sp, %si
72 movw $11, %cx
73 rep; movsd
744: addw $44, %sp
75
76 /* Restore state and return */
77 popal
78 popw %gs
79 popw %fs
80 popfl
81 retl
82 .size intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e5..98239d2658f2 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
26#include <asm/setup.h> 27#include <asm/setup.h>
27#include "bitops.h" 28#include "bitops.h"
28#include <asm/cpufeature.h> 29#include <asm/cpufeature.h>
30#include <asm/processor-flags.h>
29 31
30/* Useful macros */ 32/* Useful macros */
31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 33#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
241/* apm.c */ 243/* apm.c */
242int query_apm_bios(void); 244int query_apm_bios(void);
243 245
246/* bioscall.c */
247struct biosregs {
248 union {
249 struct {
250 u32 edi;
251 u32 esi;
252 u32 ebp;
253 u32 _esp;
254 u32 ebx;
255 u32 edx;
256 u32 ecx;
257 u32 eax;
258 u32 _fsgs;
259 u32 _dses;
260 u32 eflags;
261 };
262 struct {
263 u16 di, hdi;
264 u16 si, hsi;
265 u16 bp, hbp;
266 u16 _sp, _hsp;
267 u16 bx, hbx;
268 u16 dx, hdx;
269 u16 cx, hcx;
270 u16 ax, hax;
271 u16 gs, fs;
272 u16 es, ds;
273 u16 flags, hflags;
274 };
275 struct {
276 u8 dil, dih, edi2, edi3;
277 u8 sil, sih, esi2, esi3;
278 u8 bpl, bph, ebp2, ebp3;
279 u8 _spl, _sph, _esp2, _esp3;
280 u8 bl, bh, ebx2, ebx3;
281 u8 dl, dh, edx2, edx3;
282 u8 cl, ch, ecx2, ecx3;
283 u8 al, ah, eax2, eax3;
284 };
285 };
286};
287void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
288
244/* cmdline.c */ 289/* cmdline.c */
245int cmdline_find_option(const char *option, char *buffer, int bufsize); 290int cmdline_find_option(const char *option, char *buffer, int bufsize);
246int cmdline_find_option_bool(const char *option); 291int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
279int vsprintf(char *buf, const char *fmt, va_list args); 324int vsprintf(char *buf, const char *fmt, va_list args);
280int printf(const char *fmt, ...); 325int printf(const char *fmt, ...);
281 326
327/* regs.c */
328void initregs(struct biosregs *regs);
329
282/* string.c */ 330/* string.c */
283int strcmp(const char *str1, const char *str2); 331int strcmp(const char *str1, const char *str2);
284size_t strnlen(const char *s, size_t maxlen); 332size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d01..4a46fab7162e 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
1relocs 1relocs
2vmlinux.bin.all 2vmlinux.bin.all
3vmlinux.relocs 3vmlinux.relocs
4vmlinux.lds
5mkpiggy
6piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f8571..49c8a4c37d7c 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,9 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
19LDFLAGS := -m elf_$(UTS_MACHINE) 19LDFLAGS := -m elf_$(UTS_MACHINE)
20LDFLAGS_vmlinux := -T 20LDFLAGS_vmlinux := -T
21 21
22$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE 22hostprogs-y := mkpiggy
23
24$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
23 $(call if_changed,ld) 25 $(call if_changed,ld)
24 @: 26 @:
25 27
@@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
29 31
30 32
31targets += vmlinux.bin.all vmlinux.relocs relocs 33targets += vmlinux.bin.all vmlinux.relocs relocs
32hostprogs-$(CONFIG_X86_32) += relocs 34hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
33 35
34quiet_cmd_relocs = RELOCS $@ 36quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 37 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
37 $(call if_changed,relocs) 39 $(call if_changed,relocs)
38 40
39vmlinux.bin.all-y := $(obj)/vmlinux.bin 41vmlinux.bin.all-y := $(obj)/vmlinux.bin
40vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs 42vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
41quiet_cmd_relocbin = BUILD $@
42 cmd_relocbin = cat $(filter-out FORCE,$^) > $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin)
45
46ifeq ($(CONFIG_X86_32),y)
47 43
48ifdef CONFIG_RELOCATABLE 44$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
49$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
50 $(call if_changed,gzip)
51$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
52 $(call if_changed,bzip2)
53$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
54 $(call if_changed,lzma)
55else
56$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
57 $(call if_changed,gzip) 45 $(call if_changed,gzip)
58$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE 46$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
59 $(call if_changed,bzip2) 47 $(call if_changed,bzip2)
60$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE 48$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
61 $(call if_changed,lzma) 49 $(call if_changed,lzma)
62endif
63LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
64 50
65else 51suffix-$(CONFIG_KERNEL_GZIP) := gz
52suffix-$(CONFIG_KERNEL_BZIP2) := bz2
53suffix-$(CONFIG_KERNEL_LZMA) := lzma
66 54
67$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE 55quiet_cmd_mkpiggy = MKPIGGY $@
68 $(call if_changed,gzip) 56 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
69$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
70 $(call if_changed,bzip2)
71$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
72 $(call if_changed,lzma)
73
74LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
75endif
76 57
77suffix_$(CONFIG_KERNEL_GZIP) = gz 58targets += piggy.S
78suffix_$(CONFIG_KERNEL_BZIP2) = bz2 59$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
79suffix_$(CONFIG_KERNEL_LZMA) = lzma 60 $(call if_changed,mkpiggy)
80
81$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
82 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e2..75e4f001e706 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
12 * the page directory. [According to comments etc elsewhere on a compressed 12 * the page directory. [According to comments etc elsewhere on a compressed
13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] 13 * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
14 * 14 *
15 * Page 0 is deliberately kept safe, since System Management Mode code in 15 * Page 0 is deliberately kept safe, since System Management Mode code in
16 * laptops may need to access the BIOS data stored there. This is also 16 * laptops may need to access the BIOS data stored there. This is also
17 * useful for future device drivers that either access the BIOS via VM86 17 * useful for future device drivers that either access the BIOS via VM86
18 * mode. 18 * mode.
19 */ 19 */
20 20
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.text 24 .text
25 25
26#include <linux/linkage.h> 26#include <linux/linkage.h>
27#include <asm/segment.h> 27#include <asm/segment.h>
@@ -29,161 +29,151 @@
29#include <asm/boot.h> 29#include <asm/boot.h>
30#include <asm/asm-offsets.h> 30#include <asm/asm-offsets.h>
31 31
32.section ".text.head","ax",@progbits 32 .section ".text.head","ax",@progbits
33ENTRY(startup_32) 33ENTRY(startup_32)
34 cld 34 cld
35 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 35 /*
36 * us to not reload segments */ 36 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
37 testb $(1<<6), BP_loadflags(%esi) 37 * us to not reload segments
38 jnz 1f 38 */
39 testb $(1<<6), BP_loadflags(%esi)
40 jnz 1f
39 41
40 cli 42 cli
41 movl $(__BOOT_DS),%eax 43 movl $__BOOT_DS, %eax
42 movl %eax,%ds 44 movl %eax, %ds
43 movl %eax,%es 45 movl %eax, %es
44 movl %eax,%fs 46 movl %eax, %fs
45 movl %eax,%gs 47 movl %eax, %gs
46 movl %eax,%ss 48 movl %eax, %ss
471: 491:
48 50
49/* Calculate the delta between where we were compiled to run 51/*
52 * Calculate the delta between where we were compiled to run
50 * at and where we were actually loaded at. This can only be done 53 * at and where we were actually loaded at. This can only be done
51 * with a short local call on x86. Nothing else will tell us what 54 * with a short local call on x86. Nothing else will tell us what
52 * address we are running at. The reserved chunk of the real-mode 55 * address we are running at. The reserved chunk of the real-mode
53 * data at 0x1e4 (defined as a scratch field) are used as the stack 56 * data at 0x1e4 (defined as a scratch field) are used as the stack
54 * for this calculation. Only 4 bytes are needed. 57 * for this calculation. Only 4 bytes are needed.
55 */ 58 */
56 leal (0x1e4+4)(%esi), %esp 59 leal (BP_scratch+4)(%esi), %esp
57 call 1f 60 call 1f
581: popl %ebp 611: popl %ebp
59 subl $1b, %ebp 62 subl $1b, %ebp
60 63
61/* %ebp contains the address we are loaded at by the boot loader and %ebx 64/*
65 * %ebp contains the address we are loaded at by the boot loader and %ebx
62 * contains the address where we should move the kernel image temporarily 66 * contains the address where we should move the kernel image temporarily
63 * for safe in-place decompression. 67 * for safe in-place decompression.
64 */ 68 */
65 69
66#ifdef CONFIG_RELOCATABLE 70#ifdef CONFIG_RELOCATABLE
67 movl %ebp, %ebx 71 movl %ebp, %ebx
68 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx 72 movl BP_kernel_alignment(%esi), %eax
69 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx 73 decl %eax
74 addl %eax, %ebx
75 notl %eax
76 andl %eax, %ebx
70#else 77#else
71 movl $LOAD_PHYSICAL_ADDR, %ebx 78 movl $LOAD_PHYSICAL_ADDR, %ebx
72#endif 79#endif
73 80
74 /* Replace the compressed data size with the uncompressed size */ 81 /* Target address to relocate to for decompression */
75 subl input_len(%ebp), %ebx 82 addl $z_extract_offset, %ebx
76 movl output_len(%ebp), %eax 83
77 addl %eax, %ebx 84 /* Set up the stack */
78 /* Add 8 bytes for every 32K input block */ 85 leal boot_stack_end(%ebx), %esp
79 shrl $12, %eax 86
80 addl %eax, %ebx 87 /* Zero EFLAGS */
81 /* Add 32K + 18 bytes of extra slack */ 88 pushl $0
82 addl $(32768 + 18), %ebx 89 popfl
83 /* Align on a 4K boundary */ 90
84 addl $4095, %ebx 91/*
85 andl $~4095, %ebx 92 * Copy the compressed kernel to the end of our buffer
86
87/* Copy the compressed kernel to the end of our buffer
88 * where decompression in place becomes safe. 93 * where decompression in place becomes safe.
89 */ 94 */
90 pushl %esi 95 pushl %esi
91 leal _end(%ebp), %esi 96 leal (_bss-4)(%ebp), %esi
92 leal _end(%ebx), %edi 97 leal (_bss-4)(%ebx), %edi
93 movl $(_end - startup_32), %ecx 98 movl $(_bss - startup_32), %ecx
99 shrl $2, %ecx
94 std 100 std
95 rep 101 rep movsl
96 movsb
97 cld 102 cld
98 popl %esi 103 popl %esi
99
100/* Compute the kernel start address.
101 */
102#ifdef CONFIG_RELOCATABLE
103 addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
104 andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
105#else
106 movl $LOAD_PHYSICAL_ADDR, %ebp
107#endif
108 104
109/* 105/*
110 * Jump to the relocated address. 106 * Jump to the relocated address.
111 */ 107 */
112 leal relocated(%ebx), %eax 108 leal relocated(%ebx), %eax
113 jmp *%eax 109 jmp *%eax
114ENDPROC(startup_32) 110ENDPROC(startup_32)
115 111
116.section ".text" 112 .text
117relocated: 113relocated:
118 114
119/* 115/*
120 * Clear BSS 116 * Clear BSS (stack is currently empty)
121 */
122 xorl %eax,%eax
123 leal _edata(%ebx),%edi
124 leal _end(%ebx), %ecx
125 subl %edi,%ecx
126 cld
127 rep
128 stosb
129
130/*
131 * Setup the stack for the decompressor
132 */ 117 */
133 leal boot_stack_end(%ebx), %esp 118 xorl %eax, %eax
119 leal _bss(%ebx), %edi
120 leal _ebss(%ebx), %ecx
121 subl %edi, %ecx
122 shrl $2, %ecx
123 rep stosl
134 124
135/* 125/*
136 * Do the decompression, and jump to the new kernel.. 126 * Do the decompression, and jump to the new kernel..
137 */ 127 */
138 movl output_len(%ebx), %eax 128 leal z_extract_offset_negative(%ebx), %ebp
139 pushl %eax 129 /* push arguments for decompress_kernel: */
140 # push arguments for decompress_kernel: 130 pushl %ebp /* output address */
141 pushl %ebp # output address 131 pushl $z_input_len /* input_len */
142 movl input_len(%ebx), %eax 132 leal input_data(%ebx), %eax
143 pushl %eax # input_len 133 pushl %eax /* input_data */
144 leal input_data(%ebx), %eax 134 leal boot_heap(%ebx), %eax
145 pushl %eax # input_data 135 pushl %eax /* heap area */
146 leal boot_heap(%ebx), %eax 136 pushl %esi /* real mode pointer */
147 pushl %eax # heap area 137 call decompress_kernel
148 pushl %esi # real mode pointer 138 addl $20, %esp
149 call decompress_kernel
150 addl $20, %esp
151 popl %ecx
152 139
153#if CONFIG_RELOCATABLE 140#if CONFIG_RELOCATABLE
154/* Find the address of the relocations. 141/*
142 * Find the address of the relocations.
155 */ 143 */
156 movl %ebp, %edi 144 leal z_output_len(%ebp), %edi
157 addl %ecx, %edi
158 145
159/* Calculate the delta between where vmlinux was compiled to run 146/*
147 * Calculate the delta between where vmlinux was compiled to run
160 * and where it was actually loaded. 148 * and where it was actually loaded.
161 */ 149 */
162 movl %ebp, %ebx 150 movl %ebp, %ebx
163 subl $LOAD_PHYSICAL_ADDR, %ebx 151 subl $LOAD_PHYSICAL_ADDR, %ebx
164 jz 2f /* Nothing to be done if loaded at compiled addr. */ 152 jz 2f /* Nothing to be done if loaded at compiled addr. */
165/* 153/*
166 * Process relocations. 154 * Process relocations.
167 */ 155 */
168 156
1691: subl $4, %edi 1571: subl $4, %edi
170 movl 0(%edi), %ecx 158 movl (%edi), %ecx
171 testl %ecx, %ecx 159 testl %ecx, %ecx
172 jz 2f 160 jz 2f
173 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) 161 addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
174 jmp 1b 162 jmp 1b
1752: 1632:
176#endif 164#endif
177 165
178/* 166/*
179 * Jump to the decompressed kernel. 167 * Jump to the decompressed kernel.
180 */ 168 */
181 xorl %ebx,%ebx 169 xorl %ebx, %ebx
182 jmp *%ebp 170 jmp *%ebp
183 171
184.bss 172/*
185/* Stack and heap for uncompression */ 173 * Stack and heap for uncompression
186.balign 4 174 */
175 .bss
176 .balign 4
187boot_heap: 177boot_heap:
188 .fill BOOT_HEAP_SIZE, 1, 0 178 .fill BOOT_HEAP_SIZE, 1, 0
189boot_stack: 179boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a82948002..f62c284db9eb 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
21/* 21/*
22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 22 * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
23 */ 23 */
24.code32 24 .code32
25.text 25 .text
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <asm/segment.h> 28#include <asm/segment.h>
@@ -33,12 +33,14 @@
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34#include <asm/asm-offsets.h> 34#include <asm/asm-offsets.h>
35 35
36.section ".text.head" 36 .section ".text.head"
37 .code32 37 .code32
38ENTRY(startup_32) 38ENTRY(startup_32)
39 cld 39 cld
40 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 40 /*
41 * us to not reload segments */ 41 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
42 * us to not reload segments
43 */
42 testb $(1<<6), BP_loadflags(%esi) 44 testb $(1<<6), BP_loadflags(%esi)
43 jnz 1f 45 jnz 1f
44 46
@@ -49,14 +51,15 @@ ENTRY(startup_32)
49 movl %eax, %ss 51 movl %eax, %ss
501: 521:
51 53
52/* Calculate the delta between where we were compiled to run 54/*
55 * Calculate the delta between where we were compiled to run
53 * at and where we were actually loaded at. This can only be done 56 * at and where we were actually loaded at. This can only be done
54 * with a short local call on x86. Nothing else will tell us what 57 * with a short local call on x86. Nothing else will tell us what
55 * address we are running at. The reserved chunk of the real-mode 58 * address we are running at. The reserved chunk of the real-mode
56 * data at 0x1e4 (defined as a scratch field) are used as the stack 59 * data at 0x1e4 (defined as a scratch field) are used as the stack
57 * for this calculation. Only 4 bytes are needed. 60 * for this calculation. Only 4 bytes are needed.
58 */ 61 */
59 leal (0x1e4+4)(%esi), %esp 62 leal (BP_scratch+4)(%esi), %esp
60 call 1f 63 call 1f
611: popl %ebp 641: popl %ebp
62 subl $1b, %ebp 65 subl $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
70 testl %eax, %eax 73 testl %eax, %eax
71 jnz no_longmode 74 jnz no_longmode
72 75
73/* Compute the delta between where we were compiled to run at 76/*
77 * Compute the delta between where we were compiled to run at
74 * and where the code will actually run at. 78 * and where the code will actually run at.
75 */ 79 *
76/* %ebp contains the address we are loaded at by the boot loader and %ebx 80 * %ebp contains the address we are loaded at by the boot loader and %ebx
77 * contains the address where we should move the kernel image temporarily 81 * contains the address where we should move the kernel image temporarily
78 * for safe in-place decompression. 82 * for safe in-place decompression.
79 */ 83 */
80 84
81#ifdef CONFIG_RELOCATABLE 85#ifdef CONFIG_RELOCATABLE
82 movl %ebp, %ebx 86 movl %ebp, %ebx
83 addl $(PMD_PAGE_SIZE -1), %ebx 87 movl BP_kernel_alignment(%esi), %eax
84 andl $PMD_PAGE_MASK, %ebx 88 decl %eax
89 addl %eax, %ebx
90 notl %eax
91 andl %eax, %ebx
85#else 92#else
86 movl $CONFIG_PHYSICAL_START, %ebx 93 movl $LOAD_PHYSICAL_ADDR, %ebx
87#endif 94#endif
88 95
89 /* Replace the compressed data size with the uncompressed size */ 96 /* Target address to relocate to for decompression */
90 subl input_len(%ebp), %ebx 97 addl $z_extract_offset, %ebx
91 movl output_len(%ebp), %eax
92 addl %eax, %ebx
93 /* Add 8 bytes for every 32K input block */
94 shrl $12, %eax
95 addl %eax, %ebx
96 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
97 addl $(32768 + 18 + 4095), %ebx
98 andl $~4095, %ebx
99 98
100/* 99/*
101 * Prepare for entering 64 bit mode 100 * Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
114 /* 113 /*
115 * Build early 4G boot pagetable 114 * Build early 4G boot pagetable
116 */ 115 */
117 /* Initialize Page tables to 0*/ 116 /* Initialize Page tables to 0 */
118 leal pgtable(%ebx), %edi 117 leal pgtable(%ebx), %edi
119 xorl %eax, %eax 118 xorl %eax, %eax
120 movl $((4096*6)/4), %ecx 119 movl $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
155 btsl $_EFER_LME, %eax 154 btsl $_EFER_LME, %eax
156 wrmsr 155 wrmsr
157 156
158 /* Setup for the jump to 64bit mode 157 /*
158 * Setup for the jump to 64bit mode
159 * 159 *
160 * When the jump is performed we will be in long mode but 160 * When the jump is performed we will be in long mode but
161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 161 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
184 184
185#include "../../kernel/verify_cpu_64.S" 185#include "../../kernel/verify_cpu_64.S"
186 186
187 /* Be careful here startup_64 needs to be at a predictable 187 /*
188 * Be careful here startup_64 needs to be at a predictable
188 * address so I can export it in an ELF header. Bootloaders 189 * address so I can export it in an ELF header. Bootloaders
189 * should look at the ELF header to find this address, as 190 * should look at the ELF header to find this address, as
190 * it may change in the future. 191 * it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
192 .code64 193 .code64
193 .org 0x200 194 .org 0x200
194ENTRY(startup_64) 195ENTRY(startup_64)
195 /* We come here either from startup_32 or directly from a 196 /*
197 * We come here either from startup_32 or directly from a
196 * 64bit bootloader. If we come here from a bootloader we depend on 198 * 64bit bootloader. If we come here from a bootloader we depend on
197 * an identity mapped page table being provided that maps our 199 * an identity mapped page table being provided that maps our
198 * entire text+data+bss and hopefully all of memory. 200 * entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
209 movl $0x20, %eax 211 movl $0x20, %eax
210 ltr %ax 212 ltr %ax
211 213
212 /* Compute the decompressed kernel start address. It is where 214 /*
215 * Compute the decompressed kernel start address. It is where
213 * we were loaded at aligned to a 2M boundary. %rbp contains the 216 * we were loaded at aligned to a 2M boundary. %rbp contains the
214 * decompressed kernel start address. 217 * decompressed kernel start address.
215 * 218 *
216 * If it is a relocatable kernel then decompress and run the kernel 219 * If it is a relocatable kernel then decompress and run the kernel
217 * from load address aligned to 2MB addr, otherwise decompress and 220 * from load address aligned to 2MB addr, otherwise decompress and
218 * run the kernel from CONFIG_PHYSICAL_START 221 * run the kernel from LOAD_PHYSICAL_ADDR
222 *
223 * We cannot rely on the calculation done in 32-bit mode, since we
224 * may have been invoked via the 64-bit entry point.
219 */ 225 */
220 226
221 /* Start with the delta to where the kernel will run at. */ 227 /* Start with the delta to where the kernel will run at. */
222#ifdef CONFIG_RELOCATABLE 228#ifdef CONFIG_RELOCATABLE
223 leaq startup_32(%rip) /* - $startup_32 */, %rbp 229 leaq startup_32(%rip) /* - $startup_32 */, %rbp
224 addq $(PMD_PAGE_SIZE - 1), %rbp 230 movl BP_kernel_alignment(%rsi), %eax
225 andq $PMD_PAGE_MASK, %rbp 231 decl %eax
226 movq %rbp, %rbx 232 addq %rax, %rbp
233 notq %rax
234 andq %rax, %rbp
227#else 235#else
228 movq $CONFIG_PHYSICAL_START, %rbp 236 movq $LOAD_PHYSICAL_ADDR, %rbp
229 movq %rbp, %rbx
230#endif 237#endif
231 238
232 /* Replace the compressed data size with the uncompressed size */ 239 /* Target address to relocate to for decompression */
233 movl input_len(%rip), %eax 240 leaq z_extract_offset(%rbp), %rbx
234 subq %rax, %rbx 241
235 movl output_len(%rip), %eax 242 /* Set up the stack */
236 addq %rax, %rbx 243 leaq boot_stack_end(%rbx), %rsp
237 /* Add 8 bytes for every 32K input block */ 244
238 shrq $12, %rax 245 /* Zero EFLAGS */
239 addq %rax, %rbx 246 pushq $0
240 /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ 247 popfq
241 addq $(32768 + 18 + 4095), %rbx 248
242 andq $~4095, %rbx 249/*
243 250 * Copy the compressed kernel to the end of our buffer
244/* Copy the compressed kernel to the end of our buffer
245 * where decompression in place becomes safe. 251 * where decompression in place becomes safe.
246 */ 252 */
247 leaq _end_before_pgt(%rip), %r8 253 pushq %rsi
248 leaq _end_before_pgt(%rbx), %r9 254 leaq (_bss-8)(%rip), %rsi
249 movq $_end_before_pgt /* - $startup_32 */, %rcx 255 leaq (_bss-8)(%rbx), %rdi
2501: subq $8, %r8 256 movq $_bss /* - $startup_32 */, %rcx
251 subq $8, %r9 257 shrq $3, %rcx
252 movq 0(%r8), %rax 258 std
253 movq %rax, 0(%r9) 259 rep movsq
254 subq $8, %rcx 260 cld
255 jnz 1b 261 popq %rsi
256 262
257/* 263/*
258 * Jump to the relocated address. 264 * Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
260 leaq relocated(%rbx), %rax 266 leaq relocated(%rbx), %rax
261 jmp *%rax 267 jmp *%rax
262 268
263.section ".text" 269 .text
264relocated: 270relocated:
265 271
266/* 272/*
267 * Clear BSS 273 * Clear BSS (stack is currently empty)
268 */ 274 */
269 xorq %rax, %rax 275 xorl %eax, %eax
270 leaq _edata(%rbx), %rdi 276 leaq _bss(%rip), %rdi
271 leaq _end_before_pgt(%rbx), %rcx 277 leaq _ebss(%rip), %rcx
272 subq %rdi, %rcx 278 subq %rdi, %rcx
273 cld 279 shrq $3, %rcx
274 rep 280 rep stosq
275 stosb
276
277 /* Setup the stack */
278 leaq boot_stack_end(%rip), %rsp
279
280 /* zero EFLAGS after setting rsp */
281 pushq $0
282 popfq
283 281
284/* 282/*
285 * Do the decompression, and jump to the new kernel.. 283 * Do the decompression, and jump to the new kernel..
286 */ 284 */
287 pushq %rsi # Save the real mode argument 285 pushq %rsi /* Save the real mode argument */
288 movq %rsi, %rdi # real mode address 286 movq %rsi, %rdi /* real mode address */
289 leaq boot_heap(%rip), %rsi # malloc area for uncompression 287 leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
290 leaq input_data(%rip), %rdx # input_data 288 leaq input_data(%rip), %rdx /* input_data */
291 movl input_len(%rip), %eax 289 movl $z_input_len, %ecx /* input_len */
292 movq %rax, %rcx # input_len 290 movq %rbp, %r8 /* output target address */
293 movq %rbp, %r8 # output
294 call decompress_kernel 291 call decompress_kernel
295 popq %rsi 292 popq %rsi
296 293
@@ -311,11 +308,21 @@ gdt:
311 .quad 0x0000000000000000 /* TS continued */ 308 .quad 0x0000000000000000 /* TS continued */
312gdt_end: 309gdt_end:
313 310
314.bss 311/*
315/* Stack and heap for uncompression */ 312 * Stack and heap for uncompression
316.balign 4 313 */
314 .bss
315 .balign 4
317boot_heap: 316boot_heap:
318 .fill BOOT_HEAP_SIZE, 1, 0 317 .fill BOOT_HEAP_SIZE, 1, 0
319boot_stack: 318boot_stack:
320 .fill BOOT_STACK_SIZE, 1, 0 319 .fill BOOT_STACK_SIZE, 1, 0
321boot_stack_end: 320boot_stack_end:
321
322/*
323 * Space for page tables (not in .bss so not zeroed)
324 */
325 .section ".pgtable","a",@nobits
326 .balign 4096
327pgtable:
328 .fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684ff..842b2a36174a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
325 free_mem_ptr = heap; /* Heap */ 325 free_mem_ptr = heap; /* Heap */
326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE; 326 free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
327 327
328 if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
329 error("Destination address inappropriately aligned");
328#ifdef CONFIG_X86_64 330#ifdef CONFIG_X86_64
329 if ((unsigned long)output & (__KERNEL_ALIGN - 1)) 331 if (heap > 0x3fffffffffffUL)
330 error("Destination address not 2M aligned");
331 if ((unsigned long)output >= 0xffffffffffUL)
332 error("Destination address too large"); 332 error("Destination address too large");
333#else 333#else
334 if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
335 error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
336 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) 334 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
337 error("Destination address too large"); 335 error("Destination address too large");
336#endif
338#ifndef CONFIG_RELOCATABLE 337#ifndef CONFIG_RELOCATABLE
339 if ((u32)output != LOAD_PHYSICAL_ADDR) 338 if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
340 error("Wrong destination address"); 339 error("Wrong destination address");
341#endif 340#endif
342#endif
343 341
344 if (!quiet) 342 if (!quiet)
345 putstr("\nDecompressing Linux... "); 343 putstr("\nDecompressing Linux... ");
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 000000000000..bcbd36c41432
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright (C) 2009 Intel Corporation. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License version
7 * 2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * H. Peter Anvin <hpa@linux.intel.com>
20 *
21 * ----------------------------------------------------------------------- */
22
23/*
24 * Compute the desired load offset from a compressed program; outputs
25 * a small assembly wrapper with the appropriate symbols defined.
26 */
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31#include <inttypes.h>
32
/*
 * Decode the four bytes at p as an unsigned 32-bit little-endian value.
 * The bytes are read one at a time, so the result does not depend on the
 * byte order of the host this tool runs on.
 */
static uint32_t getle32(const void *p)
{
	const uint8_t *bytes = p;
	uint32_t value = 0;
	int i;

	/* Fold in the most significant byte (index 3) first, index 0 last */
	for (i = 3; i >= 0; i--)
		value = (value << 8) | bytes[i];

	return value;
}
40
/*
 * Usage: mkpiggy compressed_file
 *
 * Reads the trailing four bytes of compressed_file as the little-endian
 * uncompressed size, takes the file length as the compressed size, derives
 * the decompression offset from the two, and writes an assembly wrapper
 * defining z_input_len, z_output_len, z_extract_offset,
 * z_extract_offset_negative and input_data/input_data_end to stdout.
 *
 * Returns 0 on success and 1 on any usage, input or output error, so that
 * the calling build rule can discard a partial output file.
 */
int main(int argc, char *argv[])
{
	uint32_t olen;
	long ilen;
	unsigned long in_size;
	unsigned long offs;
	FILE *f;
	int retval = 1;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
		return 1;
	}

	/* Get the information for the compressed kernel image first */

	/* Binary mode: the image is raw data, not text */
	f = fopen(argv[1], "rb");
	if (!f) {
		perror(argv[1]);
		return 1;
	}

	/* A file shorter than four bytes cannot carry the size trailer */
	if (fseek(f, -4L, SEEK_END)) {
		perror(argv[1]);
		goto bail;
	}

	if (fread(&olen, sizeof olen, 1, f) != 1) {
		fprintf(stderr, "%s: cannot read uncompressed length\n",
			argv[1]);
		goto bail;
	}

	/* After reading the last four bytes the position is the file size */
	ilen = ftell(f);
	if (ilen < 0) {
		perror(argv[1]);
		goto bail;
	}
	in_size = (unsigned long)ilen;
	olen = getle32(&olen);

	/*
	 * Now we have the input (compressed) and output (uncompressed)
	 * sizes, compute the necessary decompression offset...
	 */

	offs = (olen > in_size) ? olen - in_size : 0;
	offs += olen >> 12;	/* Add 8 bytes for each 32K block */
	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
	offs = (offs+4095) & ~4095UL;	/* Round to a 4K boundary */

	printf(".section \".rodata.compressed\",\"a\",@progbits\n");
	printf(".globl z_input_len\n");
	printf("z_input_len = %lu\n", in_size);
	printf(".globl z_output_len\n");
	printf("z_output_len = %lu\n", (unsigned long)olen);
	printf(".globl z_extract_offset\n");
	printf("z_extract_offset = 0x%lx\n", offs);
	/* z_extract_offset_negative allows simplification of head_32.S */
	printf(".globl z_extract_offset_negative\n");
	printf("z_extract_offset_negative = -0x%lx\n", offs);

	printf(".globl input_data, input_data_end\n");
	printf("input_data:\n");
	printf(".incbin \"%s\"\n", argv[1]);
	printf("input_data_end:\n");

	/* A truncated wrapper must not be mistaken for a good one */
	if (fflush(stdout) || ferror(stdout)) {
		perror("stdout");
		goto bail;
	}

	retval = 0;
bail:
	fclose(f);
	return retval;
}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bce..cc353e1b3ffd 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
1OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") 1OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
2
3#undef i386
4
5#include <asm/page_types.h>
6
7#ifdef CONFIG_X86_64
2OUTPUT_ARCH(i386:x86-64) 8OUTPUT_ARCH(i386:x86-64)
3ENTRY(startup_64) 9ENTRY(startup_64)
10#else
11OUTPUT_ARCH(i386)
12ENTRY(startup_32)
13#endif
14
4SECTIONS 15SECTIONS
5{ 16{
6 /* Be careful parts of head_64.S assume startup_32 is at 17 /* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
33 *(.data.*) 44 *(.data.*)
34 _edata = . ; 45 _edata = . ;
35 } 46 }
47 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
36 .bss : { 48 .bss : {
37 _bss = . ; 49 _bss = . ;
38 *(.bss) 50 *(.bss)
39 *(.bss.*) 51 *(.bss.*)
40 *(COMMON) 52 *(COMMON)
41 . = ALIGN(8); 53 . = ALIGN(8); /* For convenience during zeroing */
42 _end_before_pgt = . ;
43 . = ALIGN(4096);
44 pgtable = . ;
45 . = . + 4096 * 6;
46 _ebss = .; 54 _ebss = .;
47 } 55 }
56#ifdef CONFIG_X86_64
57 . = ALIGN(PAGE_SIZE);
58 .pgtable : {
59 _pgtable = . ;
60 *(.pgtable)
61 _epgtable = . ;
62 }
63#endif
64 _end = .;
48} 65}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c48..000000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
1SECTIONS
2{
3 .rodata.compressed : {
4 input_len = .;
5 LONG(input_data_end - input_data) input_data = .;
6 *(.data)
7 output_len = . - 4;
8 input_data_end = .;
9 }
10}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c40..000000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
1OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
2OUTPUT_ARCH(i386)
3ENTRY(startup_32)
4SECTIONS
5{
6 /* Be careful parts of head_32.S assume startup_32 is at
7 * address 0.
8 */
9 . = 0;
10 .text.head : {
11 _head = . ;
12 *(.text.head)
13 _ehead = . ;
14 }
15 .rodata.compressed : {
16 *(.rodata.compressed)
17 }
18 .text : {
19 _text = .; /* Text */
20 *(.text)
21 *(.text.*)
22 _etext = . ;
23 }
24 .rodata : {
25 _rodata = . ;
26 *(.rodata) /* read-only data */
27 *(.rodata.*)
28 _erodata = . ;
29 }
30 .data : {
31 _data = . ;
32 *(.data)
33 *(.data.*)
34 _edata = . ;
35 }
36 .bss : {
37 _bss = . ;
38 *(.bss)
39 *(.bss.*)
40 *(COMMON)
41 _end = . ;
42 }
43}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca1..c501a5b466f8 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
22 */ 23 */
23static int read_mbr(u8 devno, void *buf) 24static int read_mbr(u8 devno, void *buf)
24{ 25{
25 u16 ax, bx, cx, dx; 26 struct biosregs ireg, oreg;
26 27
27 ax = 0x0201; /* Legacy Read, one sector */ 28 initregs(&ireg);
28 cx = 0x0001; /* Sector 0-0-1 */ 29 ireg.ax = 0x0201; /* Legacy Read, one sector */
29 dx = devno; 30 ireg.cx = 0x0001; /* Sector 0-0-1 */
30 bx = (size_t)buf; 31 ireg.dl = devno;
31 asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" 32 ireg.bx = (size_t)buf;
32 : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
33 : : "esi", "edi", "memory");
34 33
35 return -(u8)ax; /* 0 or -1 */ 34 intcall(0x13, &ireg, &oreg);
35
36 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
36} 37}
37 38
38static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) 39static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
72 73
73static int get_edd_info(u8 devno, struct edd_info *ei) 74static int get_edd_info(u8 devno, struct edd_info *ei)
74{ 75{
75 u16 ax, bx, cx, dx, di; 76 struct biosregs ireg, oreg;
76 77
77 memset(ei, 0, sizeof *ei); 78 memset(ei, 0, sizeof *ei);
78 79
79 /* Check Extensions Present */ 80 /* Check Extensions Present */
80 81
81 ax = 0x4100; 82 initregs(&ireg);
82 bx = EDDMAGIC1; 83 ireg.ah = 0x41;
83 dx = devno; 84 ireg.bx = EDDMAGIC1;
84 asm("pushfl; stc; int $0x13; setc %%al; popfl" 85 ireg.dl = devno;
85 : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) 86 intcall(0x13, &ireg, &oreg);
86 : : "esi", "edi");
87 87
88 if ((u8)ax) 88 if (oreg.eflags & X86_EFLAGS_CF)
89 return -1; /* No extended information */ 89 return -1; /* No extended information */
90 90
91 if (bx != EDDMAGIC2) 91 if (oreg.bx != EDDMAGIC2)
92 return -1; 92 return -1;
93 93
94 ei->device = devno; 94 ei->device = devno;
95 ei->version = ax >> 8; /* EDD version number */ 95 ei->version = oreg.ah; /* EDD version number */
96 ei->interface_support = cx; /* EDD functionality subsets */ 96 ei->interface_support = oreg.cx; /* EDD functionality subsets */
97 97
98 /* Extended Get Device Parameters */ 98 /* Extended Get Device Parameters */
99 99
100 ei->params.length = sizeof(ei->params); 100 ei->params.length = sizeof(ei->params);
101 ax = 0x4800; 101 ireg.ah = 0x48;
102 dx = devno; 102 ireg.si = (size_t)&ei->params;
103 asm("pushfl; int $0x13; popfl" 103 intcall(0x13, &ireg, &oreg);
104 : "+a" (ax), "+d" (dx), "=m" (ei->params)
105 : "S" (&ei->params)
106 : "ebx", "ecx", "edi");
107 104
108 /* Get legacy CHS parameters */ 105 /* Get legacy CHS parameters */
109 106
110 /* Ralf Brown recommends setting ES:DI to 0:0 */ 107 /* Ralf Brown recommends setting ES:DI to 0:0 */
111 ax = 0x0800; 108 ireg.ah = 0x08;
112 dx = devno; 109 ireg.es = 0;
113 di = 0; 110 intcall(0x13, &ireg, &oreg);
114 asm("pushw %%es; " 111
115 "movw %%di,%%es; " 112 if (!(oreg.eflags & X86_EFLAGS_CF)) {
116 "pushfl; stc; int $0x13; setc %%al; popfl; " 113 ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
117 "popw %%es" 114 ei->legacy_max_head = oreg.dh;
118 : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) 115 ei->legacy_sectors_per_track = oreg.cl & 0x3f;
119 : : "esi");
120
121 if ((u8)ax == 0) {
122 ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
123 ei->legacy_max_head = dx >> 8;
124 ei->legacy_sectors_per_track = cx & 0x3f;
125 } 116 }
126 117
127 return 0; 118 return 0;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4c..b31cc54b4641 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
22#include <asm/page_types.h> 22#include <asm/page_types.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include "boot.h" 24#include "boot.h"
25#include "offsets.h" 25#include "voffset.h"
26#include "zoffset.h"
26 27
27BOOTSEG = 0x07C0 /* original address of boot-sector */ 28BOOTSEG = 0x07C0 /* original address of boot-sector */
28SYSSEG = 0x1000 /* historical load address >> 4 */ 29SYSSEG = 0x1000 /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
115 # Part 2 of the header, from the old setup.S 116 # Part 2 of the header, from the old setup.S
116 117
117 .ascii "HdrS" # header signature 118 .ascii "HdrS" # header signature
118 .word 0x0209 # header version number (>= 0x0105) 119 .word 0x020a # header version number (>= 0x0105)
119 # or else old loadlin-1.5 will fail) 120 # or else old loadlin-1.5 will fail)
120 .globl realmode_swtch 121 .globl realmode_swtch
121realmode_swtch: .word 0, 0 # default_switch, SETUPSEG 122realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512
168 # end of setup code can be used by setup 169 # end of setup code can be used by setup
169 # for local heap purposes. 170 # for local heap purposes.
170 171
171pad1: .word 0 172ext_loader_ver:
173 .byte 0 # Extended boot loader version
174ext_loader_type:
175 .byte 0 # Extended boot loader type
176
172cmd_line_ptr: .long 0 # (Header version 0x0202 or later) 177cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
173 # If nonzero, a 32-bit pointer 178 # If nonzero, a 32-bit pointer
174 # to the kernel command line. 179 # to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel: .byte 1
200#else 205#else
201relocatable_kernel: .byte 0 206relocatable_kernel: .byte 0
202#endif 207#endif
203pad2: .byte 0 208min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
204pad3: .word 0 209pad3: .word 0
205 210
206cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, 211cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
212 217
213hardware_subarch_data: .quad 0 218hardware_subarch_data: .quad 0
214 219
215payload_offset: .long input_data 220payload_offset: .long ZO_input_data
216payload_length: .long input_data_end-input_data 221payload_length: .long ZO_z_input_len
217 222
218setup_data: .quad 0 # 64-bit physical pointer to 223setup_data: .quad 0 # 64-bit physical pointer to
219 # single linked list of 224 # single linked list of
220 # struct setup_data 225 # struct setup_data
221 226
227pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
228
229#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
230#define VO_INIT_SIZE (VO__end - VO__text)
231#if ZO_INIT_SIZE > VO_INIT_SIZE
232#define INIT_SIZE ZO_INIT_SIZE
233#else
234#define INIT_SIZE VO_INIT_SIZE
235#endif
236init_size: .long INIT_SIZE # kernel initialization size
237
222# End of setup header ##################################################### 238# End of setup header #####################################################
223 239
224 .section ".inittext", "ax" 240 .section ".entrytext", "ax"
225start_of_setup: 241start_of_setup:
226#ifdef SAFE_RESET_DISK_CONTROLLER 242#ifdef SAFE_RESET_DISK_CONTROLLER
227# Reset the disk controller. 243# Reset the disk controller.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae0..140172b895bd 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
61 */ 62 */
62static void keyboard_set_repeat(void) 63static void keyboard_set_repeat(void)
63{ 64{
64 u16 ax = 0x0305; 65 struct biosregs ireg;
65 u16 bx = 0; 66 initregs(&ireg);
66 asm volatile("int $0x16" 67 ireg.ax = 0x0305;
67 : "+a" (ax), "+b" (bx) 68 intcall(0x16, &ireg, NULL);
68 : : "ecx", "edx", "esi", "edi");
69} 69}
70 70
71/* 71/*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
73 */ 73 */
74static void query_ist(void) 74static void query_ist(void)
75{ 75{
76 struct biosregs ireg, oreg;
77
76 /* Some older BIOSes apparently crash on this call, so filter 78 /* Some older BIOSes apparently crash on this call, so filter
77 it from machines too old to have SpeedStep at all. */ 79 it from machines too old to have SpeedStep at all. */
78 if (cpu.level < 6) 80 if (cpu.level < 6)
79 return; 81 return;
80 82
81 asm("int $0x15" 83 initregs(&ireg);
82 : "=a" (boot_params.ist_info.signature), 84 ireg.ax = 0xe980; /* IST Support */
83 "=b" (boot_params.ist_info.command), 85 ireg.edx = 0x47534943; /* Request value */
84 "=c" (boot_params.ist_info.event), 86 intcall(0x15, &ireg, &oreg);
85 "=d" (boot_params.ist_info.perf_level) 87
86 : "a" (0x0000e980), /* IST Support */ 88 boot_params.ist_info.signature = oreg.eax;
87 "d" (0x47534943)); /* Request value */ 89 boot_params.ist_info.command = oreg.ebx;
90 boot_params.ist_info.event = oreg.ecx;
91 boot_params.ist_info.perf_level = oreg.edx;
88} 92}
89 93
90/* 94/*
@@ -93,13 +97,12 @@ static void query_ist(void)
93static void set_bios_mode(void) 97static void set_bios_mode(void)
94{ 98{
95#ifdef CONFIG_X86_64 99#ifdef CONFIG_X86_64
96 u32 eax, ebx; 100 struct biosregs ireg;
97 101
98 eax = 0xec00; 102 initregs(&ireg);
99 ebx = 2; 103 ireg.ax = 0xec00;
100 asm volatile("int $0x15" 104 ireg.bx = 2;
101 : "+a" (eax), "+b" (ebx) 105 intcall(0x15, &ireg, NULL);
102 : : "ecx", "edx", "esi", "edi");
103#endif 106#endif
104} 107}
105 108
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d696..a95a531148ef 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
16 17
17int query_mca(void) 18int query_mca(void)
18{ 19{
19 u8 err; 20 struct biosregs ireg, oreg;
20 u16 es, bx, len; 21 u16 len;
21 22
22 asm("pushw %%es ; " 23 initregs(&ireg);
23 "int $0x15 ; " 24 ireg.ah = 0xc0;
24 "setc %0 ; " 25 intcall(0x15, &ireg, &oreg);
25 "movw %%es, %1 ; " 26
26 "popw %%es" 27 if (oreg.eflags & X86_EFLAGS_CF)
27 : "=acd" (err), "=acdSD" (es), "=b" (bx)
28 : "a" (0xc000));
29
30 if (err)
31 return -1; /* No MCA present */ 28 return -1; /* No MCA present */
32 29
33 set_fs(es); 30 set_fs(oreg.es);
34 len = rdfs16(bx); 31 len = rdfs16(oreg.bx);
35 32
36 if (len > sizeof(boot_params.sys_desc_table)) 33 if (len > sizeof(boot_params.sys_desc_table))
37 len = sizeof(boot_params.sys_desc_table); 34 len = sizeof(boot_params.sys_desc_table);
38 35
39 copy_from_fs(&boot_params.sys_desc_table, bx, len); 36 copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
40 return 0; 37 return 0;
41} 38}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 74b3d2ba84e9..cae3feb1035e 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,12 +20,16 @@
20static int detect_memory_e820(void) 20static int detect_memory_e820(void)
21{ 21{
22 int count = 0; 22 int count = 0;
23 u32 next = 0; 23 struct biosregs ireg, oreg;
24 u32 size, id, edi;
25 u8 err;
26 struct e820entry *desc = boot_params.e820_map; 24 struct e820entry *desc = boot_params.e820_map;
27 static struct e820entry buf; /* static so it is zeroed */ 25 static struct e820entry buf; /* static so it is zeroed */
28 26
27 initregs(&ireg);
28 ireg.ax = 0xe820;
29 ireg.cx = sizeof buf;
30 ireg.edx = SMAP;
31 ireg.di = (size_t)&buf;
32
29 /* 33 /*
30 * Note: at least one BIOS is known which assumes that the 34 * Note: at least one BIOS is known which assumes that the
31 * buffer pointed to by one e820 call is the same one as 35 * buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
41 */ 45 */
42 46
43 do { 47 do {
44 size = sizeof buf; 48 intcall(0x15, &ireg, &oreg);
45 49 ireg.ebx = oreg.ebx; /* for next iteration... */
46 /* Important: %edx and %esi are clobbered by some BIOSes,
47 so they must be either used for the error output
48 or explicitly marked clobbered. Given that, assume there
49 is something out there clobbering %ebp and %edi, too. */
50 asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
51 : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
52 "=D" (edi), "+m" (buf)
53 : "D" (&buf), "d" (SMAP), "a" (0xe820)
54 : "esi");
55 50
56 /* BIOSes which terminate the chain with CF = 1 as opposed 51 /* BIOSes which terminate the chain with CF = 1 as opposed
57 to %ebx = 0 don't always report the SMAP signature on 52 to %ebx = 0 don't always report the SMAP signature on
58 the final, failing, probe. */ 53 the final, failing, probe. */
59 if (err) 54 if (oreg.eflags & X86_EFLAGS_CF)
60 break; 55 break;
61 56
62 /* Some BIOSes stop returning SMAP in the middle of 57 /* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
64 screwed up the map at that point, we might have a 59 screwed up the map at that point, we might have a
65 partial map, the full map, or complete garbage, so 60 partial map, the full map, or complete garbage, so
66 just return failure. */ 61 just return failure. */
67 if (id != SMAP) { 62 if (oreg.eax != SMAP) {
68 count = 0; 63 count = 0;
69 break; 64 break;
70 } 65 }
71 66
72 *desc++ = buf; 67 *desc++ = buf;
73 count++; 68 count++;
74 } while (next && count < ARRAY_SIZE(boot_params.e820_map)); 69 } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
75 70
76 return boot_params.e820_entries = count; 71 return boot_params.e820_entries = count;
77} 72}
78 73
79static int detect_memory_e801(void) 74static int detect_memory_e801(void)
80{ 75{
81 u16 ax, bx, cx, dx; 76 struct biosregs ireg, oreg;
82 u8 err;
83 77
84 bx = cx = dx = 0; 78 initregs(&ireg);
85 ax = 0xe801; 79 ireg.ax = 0xe801;
86 asm("stc; int $0x15; setc %0" 80 intcall(0x15, &ireg, &oreg);
87 : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
88 81
89 if (err) 82 if (oreg.eflags & X86_EFLAGS_CF)
90 return -1; 83 return -1;
91 84
92 /* Do we really need to do this? */ 85 /* Do we really need to do this? */
93 if (cx || dx) { 86 if (oreg.cx || oreg.dx) {
94 ax = cx; 87 oreg.ax = oreg.cx;
95 bx = dx; 88 oreg.bx = oreg.dx;
96 } 89 }
97 90
98 if (ax > 15*1024) 91 if (oreg.ax > 15*1024) {
99 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
100 93 } else if (oreg.ax == 15*1024) {
101 /* This ignores memory above 16MB if we have a memory hole 94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
102 there. If someone actually finds a machine with a memory 95 } else {
103 hole at 16MB and no support for 0E820h they should probably 96 /*
104 generate a fake e820 map. */ 97 * This ignores memory above 16MB if we have a memory
105 boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; 98 * hole there. If someone actually finds a machine
99 * with a memory hole at 16MB and no support for
100 * 0E820h they should probably generate a fake e820
101 * map.
102 */
103 boot_params.alt_mem_k = oreg.ax;
104 }
106 105
107 return 0; 106 return 0;
108} 107}
109 108
110static int detect_memory_88(void) 109static int detect_memory_88(void)
111{ 110{
112 u16 ax; 111 struct biosregs ireg, oreg;
113 u8 err;
114 112
115 ax = 0x8800; 113 initregs(&ireg);
116 asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); 114 ireg.ah = 0x88;
115 intcall(0x15, &ireg, &oreg);
117 116
118 boot_params.screen_info.ext_mem_k = ax; 117 boot_params.screen_info.ext_mem_k = oreg.ax;
119 118
120 return -err; 119 return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
121} 120}
122 121
123int detect_memory(void) 122int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 000000000000..958019b1cfa5
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
1/* -----------------------------------------------------------------------
2 *
3 * Copyright 2009 Intel Corporation; author H. Peter Anvin
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * Simple helper function for initializing a register set.
13 *
14 * Note that this sets EFLAGS_CF in the input register set; this
15 * makes it easier to catch functions which do nothing but don't
16 * explicitly set CF.
17 */
18
19#include "boot.h"
20
21void initregs(struct biosregs *reg)
22{
23 memset(reg, 0, sizeof *reg);
24 reg->eflags |= X86_EFLAGS_CF;
25 reg->ds = ds();
26 reg->es = ds();
27 reg->fs = fs();
28 reg->gs = gs();
29}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de7969..0f6ec455a2b1 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
15 15
16 . = 497; 16 . = 497;
17 .header : { *(.header) } 17 .header : { *(.header) }
18 .entrytext : { *(.entrytext) }
18 .inittext : { *(.inittext) } 19 .inittext : { *(.inittext) }
19 .initdata : { *(.initdata) } 20 .initdata : { *(.initdata) }
21 __end_init = .;
22
20 .text : { *(.text) } 23 .text : { *(.text) }
21 .text32 : { *(.text32) } 24 .text32 : { *(.text32) }
22 25
@@ -52,4 +55,7 @@ SECTIONS
52 55
53 . = ASSERT(_end <= 0x8000, "Setup too big!"); 56 . = ASSERT(_end <= 0x8000, "Setup too big!");
54 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); 57 . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
58 /* Necessary for the very-old-loader check to work... */
59 . = ASSERT(__end_init <= 5*512, "init sections too big!");
60
55} 61}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f6..01ec69c901c7 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
22 23
23void __attribute__((section(".inittext"))) putchar(int ch) 24void __attribute__((section(".inittext"))) putchar(int ch)
24{ 25{
25 unsigned char c = ch; 26 struct biosregs ireg;
26 27
27 if (c == '\n') 28 if (ch == '\n')
28 putchar('\r'); /* \n -> \r\n */ 29 putchar('\r'); /* \n -> \r\n */
29 30
30 /* int $0x10 is known to have bugs involving touching registers 31 initregs(&ireg);
31 it shouldn't. Be extra conservative... */ 32 ireg.bx = 0x0007;
32 asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" 33 ireg.cx = 0x0001;
33 : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); 34 ireg.ah = 0x0e;
35 ireg.al = ch;
36 intcall(0x10, &ireg, NULL);
34} 37}
35 38
36void __attribute__((section(".inittext"))) puts(const char *str) 39void __attribute__((section(".inittext"))) puts(const char *str)
37{ 40{
38 int n = 0; 41 while (*str)
39 while (*str) {
40 putchar(*str++); 42 putchar(*str++);
41 n++;
42 }
43} 43}
44 44
45/* 45/*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
49 49
50static u8 gettime(void) 50static u8 gettime(void)
51{ 51{
52 u16 ax = 0x0200; 52 struct biosregs ireg, oreg;
53 u16 cx, dx;
54 53
55 asm volatile("int $0x1a" 54 initregs(&ireg);
56 : "+a" (ax), "=c" (cx), "=d" (dx) 55 ireg.ah = 0x02;
57 : : "ebx", "esi", "edi"); 56 intcall(0x1a, &ireg, &oreg);
58 57
59 return dx >> 8; 58 return oreg.dh;
60} 59}
61 60
62/* 61/*
@@ -64,19 +63,24 @@ static u8 gettime(void)
64 */ 63 */
65int getchar(void) 64int getchar(void)
66{ 65{
67 u16 ax = 0; 66 struct biosregs ireg, oreg;
68 asm volatile("int $0x16" : "+a" (ax)); 67
68 initregs(&ireg);
69 /* ireg.ah = 0x00; */
70 intcall(0x16, &ireg, &oreg);
69 71
70 return ax & 0xff; 72 return oreg.al;
71} 73}
72 74
73static int kbd_pending(void) 75static int kbd_pending(void)
74{ 76{
75 u8 pending; 77 struct biosregs ireg, oreg;
76 asm volatile("int $0x16; setnz %0" 78
77 : "=qm" (pending) 79 initregs(&ireg);
78 : "a" (0x0100)); 80 ireg.ah = 0x01;
79 return pending; 81 intcall(0x16, &ireg, &oreg);
82
83 return !(oreg.eflags & X86_EFLAGS_ZF);
80} 84}
81 85
82void kbd_flush(void) 86void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c363..d660be492363 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
29 30
30static int set_bios_mode(u8 mode) 31static int set_bios_mode(u8 mode)
31{ 32{
32 u16 ax; 33 struct biosregs ireg, oreg;
33 u8 new_mode; 34 u8 new_mode;
34 35
35 ax = mode; /* AH=0x00 Set Video Mode */ 36 initregs(&ireg);
36 asm volatile(INT10 37 ireg.al = mode; /* AH=0x00 Set Video Mode */
37 : "+a" (ax) 38 intcall(0x10, &ireg, NULL);
38 : : "ebx", "ecx", "edx", "esi", "edi");
39 39
40 ax = 0x0f00; /* Get Current Video Mode */ 40
41 asm volatile(INT10 41 ireg.ah = 0x0f; /* Get Current Video Mode */
42 : "+a" (ax) 42 intcall(0x10, &ireg, &oreg);
43 : : "ebx", "ecx", "edx", "esi", "edi");
44 43
45 do_restore = 1; /* Assume video contents were lost */ 44 do_restore = 1; /* Assume video contents were lost */
46 new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ 45
46 /* Not all BIOSes are clean with the top bit */
47 new_mode = ireg.al & 0x7f;
47 48
48 if (new_mode == mode) 49 if (new_mode == mode)
49 return 0; /* Mode change OK */ 50 return 0; /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
53 /* Mode setting failed, but we didn't end up where we 54 /* Mode setting failed, but we didn't end up where we
54 started. That's bad. Try to revert to the original 55 started. That's bad. Try to revert to the original
55 video mode. */ 56 video mode. */
56 ax = boot_params.screen_info.orig_video_mode; 57 ireg.ax = boot_params.screen_info.orig_video_mode;
57 asm volatile(INT10 58 intcall(0x10, &ireg, NULL);
58 : "+a" (ax)
59 : : "ebx", "ecx", "edx", "esi", "edi");
60 } 59 }
61#endif 60#endif
62 return -1; 61 return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f69..c700147d6ffb 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
31static int vesa_probe(void) 32static int vesa_probe(void)
32{ 33{
33#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) 34#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
34 u16 ax, cx, di; 35 struct biosregs ireg, oreg;
35 u16 mode; 36 u16 mode;
36 addr_t mode_ptr; 37 addr_t mode_ptr;
37 struct mode_info *mi; 38 struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
39 40
40 video_vesa.modes = GET_HEAP(struct mode_info, 0); 41 video_vesa.modes = GET_HEAP(struct mode_info, 0);
41 42
42 ax = 0x4f00; 43 initregs(&ireg);
43 di = (size_t)&vginfo; 44 ireg.ax = 0x4f00;
44 asm(INT10 45 ireg.di = (size_t)&vginfo;
45 : "+a" (ax), "+D" (di), "=m" (vginfo) 46 intcall(0x10, &ireg, &oreg);
46 : : "ebx", "ecx", "edx", "esi");
47 47
48 if (ax != 0x004f || 48 if (ireg.ax != 0x004f ||
49 vginfo.signature != VESA_MAGIC || 49 vginfo.signature != VESA_MAGIC ||
50 vginfo.version < 0x0102) 50 vginfo.version < 0x0102)
51 return 0; /* Not present */ 51 return 0; /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
65 65
66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 66 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
67 67
68 ax = 0x4f01; 68 ireg.ax = 0x4f01;
69 cx = mode; 69 ireg.cx = mode;
70 di = (size_t)&vminfo; 70 ireg.di = (size_t)&vminfo;
71 asm(INT10 71 intcall(0x10, &ireg, &oreg);
72 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
73 : : "ebx", "edx", "esi");
74 72
75 if (ax != 0x004f) 73 if (ireg.ax != 0x004f)
76 continue; 74 continue;
77 75
78 if ((vminfo.mode_attr & 0x15) == 0x05) { 76 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
111 109
112static int vesa_set_mode(struct mode_info *mode) 110static int vesa_set_mode(struct mode_info *mode)
113{ 111{
114 u16 ax, bx, cx, di; 112 struct biosregs ireg, oreg;
115 int is_graphic; 113 int is_graphic;
116 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; 114 u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
117 115
118 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ 116 memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
119 117
120 ax = 0x4f01; 118 initregs(&ireg);
121 cx = vesa_mode; 119 ireg.ax = 0x4f01;
122 di = (size_t)&vminfo; 120 ireg.cx = vesa_mode;
123 asm(INT10 121 ireg.di = (size_t)&vminfo;
124 : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) 122 intcall(0x10, &ireg, &oreg);
125 : : "ebx", "edx", "esi");
126 123
127 if (ax != 0x004f) 124 if (oreg.ax != 0x004f)
128 return -1; 125 return -1;
129 126
130 if ((vminfo.mode_attr & 0x15) == 0x05) { 127 if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
141 } 138 }
142 139
143 140
144 ax = 0x4f02; 141 initregs(&ireg);
145 bx = vesa_mode; 142 ireg.ax = 0x4f02;
146 di = 0; 143 ireg.bx = vesa_mode;
147 asm volatile(INT10 144 intcall(0x10, &ireg, &oreg);
148 : "+a" (ax), "+b" (bx), "+D" (di)
149 : : "ecx", "edx", "esi");
150 145
151 if (ax != 0x004f) 146 if (oreg.ax != 0x004f)
152 return -1; 147 return -1;
153 148
154 graphic_mode = is_graphic; 149 graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
171/* Switch DAC to 8-bit mode */ 166/* Switch DAC to 8-bit mode */
172static void vesa_dac_set_8bits(void) 167static void vesa_dac_set_8bits(void)
173{ 168{
169 struct biosregs ireg, oreg;
174 u8 dac_size = 6; 170 u8 dac_size = 6;
175 171
176 /* If possible, switch the DAC to 8-bit mode */ 172 /* If possible, switch the DAC to 8-bit mode */
177 if (vginfo.capabilities & 1) { 173 if (vginfo.capabilities & 1) {
178 u16 ax, bx; 174 initregs(&ireg);
179 175 ireg.ax = 0x4f08;
180 ax = 0x4f08; 176 ireg.bh = 0x08;
181 bx = 0x0800; 177 intcall(0x10, &ireg, &oreg);
182 asm volatile(INT10 178 if (oreg.ax == 0x004f)
183 : "+a" (ax), "+b" (bx) 179 dac_size = oreg.bh;
184 : : "ecx", "edx", "esi", "edi");
185
186 if (ax == 0x004f)
187 dac_size = bx >> 8;
188 } 180 }
189 181
190 /* Set the color sizes to the DAC size, and offsets to 0 */ 182 /* Set the color sizes to the DAC size, and offsets to 0 */
191 boot_params.screen_info.red_size = dac_size; 183 boot_params.screen_info.red_size = dac_size;
192 boot_params.screen_info.green_size = dac_size; 184 boot_params.screen_info.green_size = dac_size;
193 boot_params.screen_info.blue_size = dac_size; 185 boot_params.screen_info.blue_size = dac_size;
194 boot_params.screen_info.rsvd_size = dac_size; 186 boot_params.screen_info.rsvd_size = dac_size;
195 187
196 boot_params.screen_info.red_pos = 0; 188 boot_params.screen_info.red_pos = 0;
197 boot_params.screen_info.green_pos = 0; 189 boot_params.screen_info.green_pos = 0;
198 boot_params.screen_info.blue_pos = 0; 190 boot_params.screen_info.blue_pos = 0;
199 boot_params.screen_info.rsvd_pos = 0; 191 boot_params.screen_info.rsvd_pos = 0;
200} 192}
201 193
202/* Save the VESA protected mode info */ 194/* Save the VESA protected mode info */
203static void vesa_store_pm_info(void) 195static void vesa_store_pm_info(void)
204{ 196{
205 u16 ax, bx, di, es; 197 struct biosregs ireg, oreg;
206 198
207 ax = 0x4f0a; 199 initregs(&ireg);
208 bx = di = 0; 200 ireg.ax = 0x4f0a;
209 asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" 201 intcall(0x10, &ireg, &oreg);
210 : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
211 : : "ecx", "esi");
212 202
213 if (ax != 0x004f) 203 if (oreg.ax != 0x004f)
214 return; 204 return;
215 205
216 boot_params.screen_info.vesapm_seg = es; 206 boot_params.screen_info.vesapm_seg = oreg.es;
217 boot_params.screen_info.vesapm_off = di; 207 boot_params.screen_info.vesapm_off = oreg.di;
218} 208}
219 209
220/* 210/*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
252void vesa_store_edid(void) 242void vesa_store_edid(void)
253{ 243{
254#ifdef CONFIG_FIRMWARE_EDID 244#ifdef CONFIG_FIRMWARE_EDID
255 u16 ax, bx, cx, dx, di; 245 struct biosregs ireg, oreg;
256 246
257 /* Apparently used as a nonsense token... */ 247 /* Apparently used as a nonsense token... */
258 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); 248 memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
260 if (vginfo.version < 0x0200) 250 if (vginfo.version < 0x0200)
261 return; /* EDID requires VBE 2.0+ */ 251 return; /* EDID requires VBE 2.0+ */
262 252
263 ax = 0x4f15; /* VBE DDC */ 253 initregs(&ireg);
264 bx = 0x0000; /* Report DDC capabilities */ 254 ireg.ax = 0x4f15; /* VBE DDC */
265 cx = 0; /* Controller 0 */ 255 /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
266 di = 0; /* ES:DI must be 0 by spec */ 256 /* ireg.cx = 0; */ /* Controller 0 */
267 257 ireg.es = 0; /* ES:DI must be 0 by spec */
268 /* Note: The VBE DDC spec is different from the main VESA spec; 258 intcall(0x10, &ireg, &oreg);
269 we genuinely have to assume all registers are destroyed here. */
270
271 asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
272 : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
273 : : "esi", "edx");
274 259
275 if (ax != 0x004f) 260 if (oreg.ax != 0x004f)
276 return; /* No EDID */ 261 return; /* No EDID */
277 262
278 /* BH = time in seconds to transfer EDD information */ 263 /* BH = time in seconds to transfer EDD information */
279 /* BL = DDC level supported */ 264 /* BL = DDC level supported */
280 265
281 ax = 0x4f15; /* VBE DDC */ 266 ireg.ax = 0x4f15; /* VBE DDC */
282 bx = 0x0001; /* Read EDID */ 267 ireg.bx = 0x0001; /* Read EDID */
283 cx = 0; /* Controller 0 */ 268 /* ireg.cx = 0; */ /* Controller 0 */
284 dx = 0; /* EDID block number */ 269 /* ireg.dx = 0; */ /* EDID block number */
285 di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ 270 ireg.es = ds();
286 asm(INT10 271 ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
287 : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), 272 intcall(0x10, &ireg, &oreg);
288 "+c" (cx), "+D" (di)
289 : : "esi");
290#endif /* CONFIG_FIRMWARE_EDID */ 273#endif /* CONFIG_FIRMWARE_EDID */
291} 274}
292 275
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a37768..8f8d827e254d 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
39/* Set basic 80x25 mode */ 40/* Set basic 80x25 mode */
40static u8 vga_set_basic_mode(void) 41static u8 vga_set_basic_mode(void)
41{ 42{
43 struct biosregs ireg, oreg;
42 u16 ax; 44 u16 ax;
43 u8 rows; 45 u8 rows;
44 u8 mode; 46 u8 mode;
45 47
48 initregs(&ireg);
49
46#ifdef CONFIG_VIDEO_400_HACK 50#ifdef CONFIG_VIDEO_400_HACK
47 if (adapter >= ADAPTER_VGA) { 51 if (adapter >= ADAPTER_VGA) {
48 asm volatile(INT10 52 ireg.ax = 0x1202;
49 : : "a" (0x1202), "b" (0x0030) 53 ireg.bx = 0x0030;
50 : "ecx", "edx", "esi", "edi"); 54 intcall(0x10, &ireg, NULL);
51 } 55 }
52#endif 56#endif
53 57
54 ax = 0x0f00; 58 ax = 0x0f00;
55 asm volatile(INT10 59 intcall(0x10, &ireg, &oreg);
56 : "+a" (ax) 60 mode = oreg.al;
57 : : "ebx", "ecx", "edx", "esi", "edi");
58
59 mode = (u8)ax;
60 61
61 set_fs(0); 62 set_fs(0);
62 rows = rdfs8(0x484); /* rows minus one */ 63 rows = rdfs8(0x484); /* rows minus one */
63 64
64#ifndef CONFIG_VIDEO_400_HACK 65#ifndef CONFIG_VIDEO_400_HACK
65 if ((ax == 0x5003 || ax == 0x5007) && 66 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
66 (rows == 0 || rows == 24)) 67 (rows == 0 || rows == 24))
67 return mode; 68 return mode;
68#endif 69#endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
71 mode = 3; 72 mode = 3;
72 73
73 /* Set the mode */ 74 /* Set the mode */
74 ax = mode; 75 ireg.ax = mode; /* AH=0: set mode */
75 asm volatile(INT10 76 intcall(0x10, &ireg, NULL);
76 : "+a" (ax)
77 : : "ebx", "ecx", "edx", "esi", "edi");
78 do_restore = 1; 77 do_restore = 1;
79 return mode; 78 return mode;
80} 79}
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
82static void vga_set_8font(void) 81static void vga_set_8font(void)
83{ 82{
84 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ 83 /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
84 struct biosregs ireg;
85
86 initregs(&ireg);
85 87
86 /* Set 8x8 font */ 88 /* Set 8x8 font */
87 asm volatile(INT10 : : "a" (0x1112), "b" (0)); 89 ireg.ax = 0x1112;
90 /* ireg.bl = 0; */
91 intcall(0x10, &ireg, NULL);
88 92
89 /* Use alternate print screen */ 93 /* Use alternate print screen */
90 asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); 94 ireg.ax = 0x1200;
95 ireg.bl = 0x20;
96 intcall(0x10, &ireg, NULL);
91 97
92 /* Turn off cursor emulation */ 98 /* Turn off cursor emulation */
93 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 99 ireg.ax = 0x1201;
100 ireg.bl = 0x34;
101 intcall(0x10, &ireg, NULL);
94 102
95 /* Cursor is scan lines 6-7 */ 103 /* Cursor is scan lines 6-7 */
96 asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); 104 ireg.ax = 0x0100;
105 ireg.cx = 0x0607;
106 intcall(0x10, &ireg, NULL);
97} 107}
98 108
99static void vga_set_14font(void) 109static void vga_set_14font(void)
100{ 110{
101 /* Set 9x14 font - 80x28 on VGA */ 111 /* Set 9x14 font - 80x28 on VGA */
112 struct biosregs ireg;
113
114 initregs(&ireg);
102 115
103 /* Set 9x14 font */ 116 /* Set 9x14 font */
104 asm volatile(INT10 : : "a" (0x1111), "b" (0)); 117 ireg.ax = 0x1111;
118 /* ireg.bl = 0; */
119 intcall(0x10, &ireg, NULL);
105 120
106 /* Turn off cursor emulation */ 121 /* Turn off cursor emulation */
107 asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); 122 ireg.ax = 0x1201;
123 ireg.bl = 0x34;
124 intcall(0x10, &ireg, NULL);
108 125
109 /* Cursor is scan lines 11-12 */ 126 /* Cursor is scan lines 11-12 */
110 asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); 127 ireg.ax = 0x0100;
128 ireg.cx = 0x0b0c;
129 intcall(0x10, &ireg, NULL);
111} 130}
112 131
113static void vga_set_80x43(void) 132static void vga_set_80x43(void)
114{ 133{
115 /* Set 80x43 mode on VGA (not EGA) */ 134 /* Set 80x43 mode on VGA (not EGA) */
135 struct biosregs ireg;
136
137 initregs(&ireg);
116 138
117 /* Set 350 scans */ 139 /* Set 350 scans */
118 asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); 140 ireg.ax = 0x1201;
141 ireg.bl = 0x30;
142 intcall(0x10, &ireg, NULL);
119 143
120 /* Reset video mode */ 144 /* Reset video mode */
121 asm volatile(INT10 : : "a" (0x0003)); 145 ireg.ax = 0x0003;
146 intcall(0x10, &ireg, NULL);
122 147
123 vga_set_8font(); 148 vga_set_8font();
124} 149}
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
225 */ 250 */
226static int vga_probe(void) 251static int vga_probe(void)
227{ 252{
228 u16 ega_bx;
229
230 static const char *card_name[] = { 253 static const char *card_name[] = {
231 "CGA/MDA/HGC", "EGA", "VGA" 254 "CGA/MDA/HGC", "EGA", "VGA"
232 }; 255 };
@@ -240,26 +263,26 @@ static int vga_probe(void)
240 sizeof(ega_modes)/sizeof(struct mode_info), 263 sizeof(ega_modes)/sizeof(struct mode_info),
241 sizeof(vga_modes)/sizeof(struct mode_info), 264 sizeof(vga_modes)/sizeof(struct mode_info),
242 }; 265 };
243 u8 vga_flag;
244 266
245 asm(INT10 267 struct biosregs ireg, oreg;
246 : "=b" (ega_bx) 268
247 : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ 269 initregs(&ireg);
248 : "ecx", "edx", "esi", "edi"); 270
271 ireg.ax = 0x1200;
272 ireg.bl = 0x10; /* Check EGA/VGA */
273 intcall(0x10, &ireg, &oreg);
249 274
250#ifndef _WAKEUP 275#ifndef _WAKEUP
251 boot_params.screen_info.orig_video_ega_bx = ega_bx; 276 boot_params.screen_info.orig_video_ega_bx = oreg.bx;
252#endif 277#endif
253 278
254 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ 279 /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
255 if ((u8)ega_bx != 0x10) { 280 if (oreg.bl != 0x10) {
256 /* EGA/VGA */ 281 /* EGA/VGA */
257 asm(INT10 282 ireg.ax = 0x1a00;
258 : "=a" (vga_flag) 283 intcall(0x10, &ireg, &oreg);
259 : "a" (0x1a00)
260 : "ebx", "ecx", "edx", "esi", "edi");
261 284
262 if (vga_flag == 0x1a) { 285 if (oreg.al == 0x1a) {
263 adapter = ADAPTER_VGA; 286 adapter = ADAPTER_VGA;
264#ifndef _WAKEUP 287#ifndef _WAKEUP
265 boot_params.screen_info.orig_video_isVGA = 1; 288 boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe9..bad728b76fc2 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
2 * 2 *
3 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright 2007 rPath, Inc. - All Rights Reserved 4 * Copyright 2007 rPath, Inc. - All Rights Reserved
5 * Copyright 2009 Intel Corporation; author H. Peter Anvin
5 * 6 *
6 * This file is part of the Linux kernel, and is made available under 7 * This file is part of the Linux kernel, and is made available under
7 * the terms of the GNU General Public License version 2. 8 * the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
18 19
19static void store_cursor_position(void) 20static void store_cursor_position(void)
20{ 21{
21 u16 curpos; 22 struct biosregs ireg, oreg;
22 u16 ax, bx;
23 23
24 ax = 0x0300; 24 initregs(&ireg);
25 bx = 0; 25 ireg.ah = 0x03;
26 asm(INT10 26 intcall(0x10, &ireg, &oreg);
27 : "=d" (curpos), "+a" (ax), "+b" (bx)
28 : : "ecx", "esi", "edi");
29 27
30 boot_params.screen_info.orig_x = curpos; 28 boot_params.screen_info.orig_x = oreg.dl;
31 boot_params.screen_info.orig_y = curpos >> 8; 29 boot_params.screen_info.orig_y = oreg.dh;
32} 30}
33 31
34static void store_video_mode(void) 32static void store_video_mode(void)
35{ 33{
36 u16 ax, page; 34 struct biosregs ireg, oreg;
37 35
38 /* N.B.: the saving of the video page here is a bit silly, 36 /* N.B.: the saving of the video page here is a bit silly,
39 since we pretty much assume page 0 everywhere. */ 37 since we pretty much assume page 0 everywhere. */
40 ax = 0x0f00; 38 initregs(&ireg);
41 asm(INT10 39 ireg.ah = 0x0f;
42 : "+a" (ax), "=b" (page) 40 intcall(0x10, &ireg, &oreg);
43 : : "ecx", "edx", "esi", "edi");
44 41
45 /* Not all BIOSes are clean with respect to the top bit */ 42 /* Not all BIOSes are clean with respect to the top bit */
46 boot_params.screen_info.orig_video_mode = ax & 0x7f; 43 boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
47 boot_params.screen_info.orig_video_page = page >> 8; 44 boot_params.screen_info.orig_video_page = oreg.bh;
48} 45}
49 46
50/* 47/*
@@ -257,7 +254,7 @@ static void restore_screen(void)
257 int y; 254 int y;
258 addr_t dst = 0; 255 addr_t dst = 0;
259 u16 *src = saved.data; 256 u16 *src = saved.data;
260 u16 ax, bx, dx; 257 struct biosregs ireg;
261 258
262 if (graphic_mode) 259 if (graphic_mode)
263 return; /* Can't restore onto a graphic mode */ 260 return; /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
296 } 293 }
297 294
298 /* Restore cursor position */ 295 /* Restore cursor position */
299 ax = 0x0200; /* Set cursor position */ 296 initregs(&ireg);
300 bx = 0; /* Page number (<< 8) */ 297 ireg.ah = 0x02; /* Set cursor position */
301 dx = (saved.cury << 8)+saved.curx; 298 ireg.dh = saved.cury;
302 asm volatile(INT10 299 ireg.dl = saved.curx;
303 : "+a" (ax), "+b" (bx), "+d" (dx) 300 intcall(0x10, &ireg, NULL);
304 : : "ecx", "esi", "edi");
305} 301}
306#else 302#else
307#define save_screen() ((void)0) 303#define save_screen() ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d14461..5bb174a997fc 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
112extern int do_restore; /* Restore screen contents */ 112extern int do_restore; /* Restore screen contents */
113extern int graphic_mode; /* Graphics mode with linear frame buffer */ 113extern int graphic_mode; /* Graphics mode with linear frame buffer */
114 114
115/*
116 * int $0x10 is notorious for touching registers it shouldn't.
117 * gcc doesn't like %ebp being clobbered, so define it as a push/pop
118 * sequence here.
119 *
120 * A number of systems, including the original PC can clobber %bp in
121 * certain circumstances, like when scrolling. There exists at least
122 * one Trident video card which could clobber DS under a set of
123 * circumstances that we are unlikely to encounter (scrolling when
124 * using an extended graphics mode of more than 800x600 pixels), but
125 * it's cheap insurance to deal with that here.
126 */
127#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
128
129/* Accessing VGA indexed registers */ 115/* Accessing VGA indexed registers */
130static inline u8 in_idx(u16 port, u8 index) 116static inline u8 in_idx(u16 port, u8 index)
131{ 117{
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f2..edb992ebef92 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:50:58 2009 4# Mon May 11 16:21:55 2009
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf32-i386"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
33CONFIG_ARCH_HAS_DEFAULT_IDLE=y 34CONFIG_ARCH_HAS_DEFAULT_IDLE=y
34CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
35CONFIG_HAVE_SETUP_PER_CPU_AREA=y 36CONFIG_HAVE_SETUP_PER_CPU_AREA=y
37CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
36# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set 38# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
37CONFIG_ARCH_HIBERNATION_POSSIBLE=y 39CONFIG_ARCH_HIBERNATION_POSSIBLE=y
38CONFIG_ARCH_SUSPEND_POSSIBLE=y 40CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
40CONFIG_ARCH_POPULATES_NODE_MAP=y 42CONFIG_ARCH_POPULATES_NODE_MAP=y
41# CONFIG_AUDIT_ARCH is not set 43# CONFIG_AUDIT_ARCH is not set
42CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 44CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
45CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
43CONFIG_GENERIC_HARDIRQS=y 46CONFIG_GENERIC_HARDIRQS=y
47CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
44CONFIG_GENERIC_IRQ_PROBE=y 48CONFIG_GENERIC_IRQ_PROBE=y
45CONFIG_GENERIC_PENDING_IRQ=y 49CONFIG_GENERIC_PENDING_IRQ=y
46CONFIG_X86_SMP=y
47CONFIG_USE_GENERIC_SMP_HELPERS=y 50CONFIG_USE_GENERIC_SMP_HELPERS=y
48CONFIG_X86_32_SMP=y 51CONFIG_X86_32_SMP=y
49CONFIG_X86_HT=y 52CONFIG_X86_HT=y
50CONFIG_X86_BIOS_REBOOT=y
51CONFIG_X86_TRAMPOLINE=y 53CONFIG_X86_TRAMPOLINE=y
54CONFIG_X86_32_LAZY_GS=y
52CONFIG_KTIME_SCALAR=y 55CONFIG_KTIME_SCALAR=y
53CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
54 57
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
60CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
61CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
62# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
63CONFIG_SWAP=y 72CONFIG_SWAP=y
64CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
65CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
66CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
67CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
68# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
69CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
113CONFIG_NET_NS=y 123CONFIG_NET_NS=y
114CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
115CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
116CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
117CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
118# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
119CONFIG_UID16=y 133CONFIG_UID16=y
120CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
121CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
122CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
123CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
124CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
125CONFIG_PRINTK=y 140CONFIG_PRINTK=y
126CONFIG_BUG=y 141CONFIG_BUG=y
127CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
128CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
129# CONFIG_COMPAT_BRK is not set
130CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
131CONFIG_FUTEX=y 145CONFIG_FUTEX=y
132CONFIG_ANON_INODES=y
133CONFIG_EPOLL=y 146CONFIG_EPOLL=y
134CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
135CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
139CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
140CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
141CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
142# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
143CONFIG_SLUB=y 157CONFIG_SLUB=y
144# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
154CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
155CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
156CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
157CONFIG_HAVE_GENERIC_DMA_COHERENT=y 173CONFIG_HAVE_GENERIC_DMA_COHERENT=y
158CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
159CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
168CONFIG_BLOCK=y 184CONFIG_BLOCK=y
169# CONFIG_LBD is not set 185# CONFIG_LBD is not set
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 186CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 187# CONFIG_BLK_DEV_INTEGRITY is not set
173 188
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
194CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
195CONFIG_SMP=y 210CONFIG_SMP=y
196CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
197CONFIG_X86_FIND_SMP_CONFIG=y
198CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
213# CONFIG_X86_BIGSMP is not set
214CONFIG_X86_EXTENDED_PLATFORM=y
199# CONFIG_X86_ELAN is not set 215# CONFIG_X86_ELAN is not set
200# CONFIG_X86_GENERICARCH is not set
201# CONFIG_X86_VSMP is not set
202# CONFIG_X86_RDC321X is not set 216# CONFIG_X86_RDC321X is not set
217# CONFIG_X86_32_NON_STANDARD is not set
203CONFIG_SCHED_OMIT_FRAME_POINTER=y 218CONFIG_SCHED_OMIT_FRAME_POINTER=y
204# CONFIG_PARAVIRT_GUEST is not set 219# CONFIG_PARAVIRT_GUEST is not set
205# CONFIG_MEMTEST is not set 220# CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
230# CONFIG_GENERIC_CPU is not set 245# CONFIG_GENERIC_CPU is not set
231CONFIG_X86_GENERIC=y 246CONFIG_X86_GENERIC=y
232CONFIG_X86_CPU=y 247CONFIG_X86_CPU=y
248CONFIG_X86_L1_CACHE_BYTES=64
249CONFIG_X86_INTERNODE_CACHE_BYTES=64
233CONFIG_X86_CMPXCHG=y 250CONFIG_X86_CMPXCHG=y
234CONFIG_X86_L1_CACHE_SHIFT=7 251CONFIG_X86_L1_CACHE_SHIFT=5
235CONFIG_X86_XADD=y 252CONFIG_X86_XADD=y
236# CONFIG_X86_PPRO_FENCE is not set 253# CONFIG_X86_PPRO_FENCE is not set
237CONFIG_X86_WP_WORKS_OK=y 254CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
247CONFIG_CPU_SUP_INTEL=y 264CONFIG_CPU_SUP_INTEL=y
248CONFIG_CPU_SUP_CYRIX_32=y 265CONFIG_CPU_SUP_CYRIX_32=y
249CONFIG_CPU_SUP_AMD=y 266CONFIG_CPU_SUP_AMD=y
250CONFIG_CPU_SUP_CENTAUR_32=y 267CONFIG_CPU_SUP_CENTAUR=y
251CONFIG_CPU_SUP_TRANSMETA_32=y 268CONFIG_CPU_SUP_TRANSMETA_32=y
252CONFIG_CPU_SUP_UMC_32=y 269CONFIG_CPU_SUP_UMC_32=y
253CONFIG_X86_DS=y 270CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
279CONFIG_MICROCODE_OLD_INTERFACE=y 296CONFIG_MICROCODE_OLD_INTERFACE=y
280CONFIG_X86_MSR=y 297CONFIG_X86_MSR=y
281CONFIG_X86_CPUID=y 298CONFIG_X86_CPUID=y
299# CONFIG_X86_CPU_DEBUG is not set
282# CONFIG_NOHIGHMEM is not set 300# CONFIG_NOHIGHMEM is not set
283CONFIG_HIGHMEM4G=y 301CONFIG_HIGHMEM4G=y
284# CONFIG_HIGHMEM64G is not set 302# CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
302CONFIG_BOUNCE=y 320CONFIG_BOUNCE=y
303CONFIG_VIRT_TO_BUS=y 321CONFIG_VIRT_TO_BUS=y
304CONFIG_UNEVICTABLE_LRU=y 322CONFIG_UNEVICTABLE_LRU=y
323CONFIG_HAVE_MLOCK=y
324CONFIG_HAVE_MLOCKED_PAGE_BIT=y
305CONFIG_HIGHPTE=y 325CONFIG_HIGHPTE=y
306CONFIG_X86_CHECK_BIOS_CORRUPTION=y 326CONFIG_X86_CHECK_BIOS_CORRUPTION=y
307CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 327CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
312CONFIG_X86_PAT=y 332CONFIG_X86_PAT=y
313CONFIG_EFI=y 333CONFIG_EFI=y
314CONFIG_SECCOMP=y 334CONFIG_SECCOMP=y
335# CONFIG_CC_STACKPROTECTOR is not set
315# CONFIG_HZ_100 is not set 336# CONFIG_HZ_100 is not set
316# CONFIG_HZ_250 is not set 337# CONFIG_HZ_250 is not set
317# CONFIG_HZ_300 is not set 338# CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
322CONFIG_CRASH_DUMP=y 343CONFIG_CRASH_DUMP=y
323# CONFIG_KEXEC_JUMP is not set 344# CONFIG_KEXEC_JUMP is not set
324CONFIG_PHYSICAL_START=0x1000000 345CONFIG_PHYSICAL_START=0x1000000
325# CONFIG_RELOCATABLE is not set 346CONFIG_RELOCATABLE=y
326CONFIG_PHYSICAL_ALIGN=0x200000 347CONFIG_X86_NEED_RELOCS=y
348CONFIG_PHYSICAL_ALIGN=0x1000000
327CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
328# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
329# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
363CONFIG_ACPI_BLACKLIST_YEAR=0 385CONFIG_ACPI_BLACKLIST_YEAR=0
364# CONFIG_ACPI_DEBUG is not set 386# CONFIG_ACPI_DEBUG is not set
365# CONFIG_ACPI_PCI_SLOT is not set 387# CONFIG_ACPI_PCI_SLOT is not set
366CONFIG_ACPI_SYSTEM=y
367CONFIG_X86_PM_TIMER=y 388CONFIG_X86_PM_TIMER=y
368CONFIG_ACPI_CONTAINER=y 389CONFIG_ACPI_CONTAINER=y
369# CONFIG_ACPI_SBS is not set 390# CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
425CONFIG_PCI_DIRECT=y 446CONFIG_PCI_DIRECT=y
426CONFIG_PCI_MMCONFIG=y 447CONFIG_PCI_MMCONFIG=y
427CONFIG_PCI_DOMAINS=y 448CONFIG_PCI_DOMAINS=y
449# CONFIG_DMAR is not set
428CONFIG_PCIEPORTBUS=y 450CONFIG_PCIEPORTBUS=y
429# CONFIG_HOTPLUG_PCI_PCIE is not set 451# CONFIG_HOTPLUG_PCI_PCIE is not set
430CONFIG_PCIEAER=y 452CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
435# CONFIG_PCI_DEBUG is not set 457# CONFIG_PCI_DEBUG is not set
436# CONFIG_PCI_STUB is not set 458# CONFIG_PCI_STUB is not set
437CONFIG_HT_IRQ=y 459CONFIG_HT_IRQ=y
460# CONFIG_PCI_IOV is not set
438CONFIG_ISA_DMA_API=y 461CONFIG_ISA_DMA_API=y
439# CONFIG_ISA is not set 462# CONFIG_ISA is not set
440# CONFIG_MCA is not set 463# CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
481# 504#
482# Networking options 505# Networking options
483# 506#
484CONFIG_COMPAT_NET_DEV_OPS=y
485CONFIG_PACKET=y 507CONFIG_PACKET=y
486CONFIG_PACKET_MMAP=y 508CONFIG_PACKET_MMAP=y
487CONFIG_UNIX=y 509CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
639# CONFIG_LAPB is not set 661# CONFIG_LAPB is not set
640# CONFIG_ECONET is not set 662# CONFIG_ECONET is not set
641# CONFIG_WAN_ROUTER is not set 663# CONFIG_WAN_ROUTER is not set
664# CONFIG_PHONET is not set
642CONFIG_NET_SCHED=y 665CONFIG_NET_SCHED=y
643 666
644# 667#
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
696# 719#
697# CONFIG_NET_PKTGEN is not set 720# CONFIG_NET_PKTGEN is not set
698# CONFIG_NET_TCPPROBE is not set 721# CONFIG_NET_TCPPROBE is not set
722# CONFIG_NET_DROP_MONITOR is not set
699CONFIG_HAMRADIO=y 723CONFIG_HAMRADIO=y
700 724
701# 725#
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
706# CONFIG_IRDA is not set 730# CONFIG_IRDA is not set
707# CONFIG_BT is not set 731# CONFIG_BT is not set
708# CONFIG_AF_RXRPC is not set 732# CONFIG_AF_RXRPC is not set
709# CONFIG_PHONET is not set
710CONFIG_FIB_RULES=y 733CONFIG_FIB_RULES=y
711CONFIG_WIRELESS=y 734CONFIG_WIRELESS=y
712CONFIG_CFG80211=y 735CONFIG_CFG80211=y
713# CONFIG_CFG80211_REG_DEBUG is not set 736# CONFIG_CFG80211_REG_DEBUG is not set
714CONFIG_NL80211=y
715CONFIG_WIRELESS_OLD_REGULATORY=y 737CONFIG_WIRELESS_OLD_REGULATORY=y
716CONFIG_WIRELESS_EXT=y 738CONFIG_WIRELESS_EXT=y
717CONFIG_WIRELESS_EXT_SYSFS=y 739CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
789# CONFIG_ICS932S401 is not set 811# CONFIG_ICS932S401 is not set
790# CONFIG_ENCLOSURE_SERVICES is not set 812# CONFIG_ENCLOSURE_SERVICES is not set
791# CONFIG_HP_ILO is not set 813# CONFIG_HP_ILO is not set
814# CONFIG_ISL29003 is not set
792# CONFIG_C2PORT is not set 815# CONFIG_C2PORT is not set
793 816
794# 817#
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
842# CONFIG_SCSI_LOWLEVEL is not set 865# CONFIG_SCSI_LOWLEVEL is not set
843# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 866# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
844# CONFIG_SCSI_DH is not set 867# CONFIG_SCSI_DH is not set
868# CONFIG_SCSI_OSD_INITIATOR is not set
845CONFIG_ATA=y 869CONFIG_ATA=y
846# CONFIG_ATA_NONSTANDARD is not set 870# CONFIG_ATA_NONSTANDARD is not set
847CONFIG_ATA_ACPI=y 871CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
940CONFIG_MACINTOSH_DRIVERS=y 964CONFIG_MACINTOSH_DRIVERS=y
941CONFIG_MAC_EMUMOUSEBTN=y 965CONFIG_MAC_EMUMOUSEBTN=y
942CONFIG_NETDEVICES=y 966CONFIG_NETDEVICES=y
967CONFIG_COMPAT_NET_DEV_OPS=y
943# CONFIG_IFB is not set 968# CONFIG_IFB is not set
944# CONFIG_DUMMY is not set 969# CONFIG_DUMMY is not set
945# CONFIG_BONDING is not set 970# CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
977CONFIG_NET_VENDOR_3COM=y 1002CONFIG_NET_VENDOR_3COM=y
978# CONFIG_VORTEX is not set 1003# CONFIG_VORTEX is not set
979# CONFIG_TYPHOON is not set 1004# CONFIG_TYPHOON is not set
1005# CONFIG_ETHOC is not set
1006# CONFIG_DNET is not set
980CONFIG_NET_TULIP=y 1007CONFIG_NET_TULIP=y
981# CONFIG_DE2104X is not set 1008# CONFIG_DE2104X is not set
982# CONFIG_TULIP is not set 1009# CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
1026CONFIG_E1000E=y 1053CONFIG_E1000E=y
1027# CONFIG_IP1000 is not set 1054# CONFIG_IP1000 is not set
1028# CONFIG_IGB is not set 1055# CONFIG_IGB is not set
1056# CONFIG_IGBVF is not set
1029# CONFIG_NS83820 is not set 1057# CONFIG_NS83820 is not set
1030# CONFIG_HAMACHI is not set 1058# CONFIG_HAMACHI is not set
1031# CONFIG_YELLOWFIN is not set 1059# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
1040# CONFIG_QLA3XXX is not set 1068# CONFIG_QLA3XXX is not set
1041# CONFIG_ATL1 is not set 1069# CONFIG_ATL1 is not set
1042# CONFIG_ATL1E is not set 1070# CONFIG_ATL1E is not set
1071# CONFIG_ATL1C is not set
1043# CONFIG_JME is not set 1072# CONFIG_JME is not set
1044CONFIG_NETDEV_10000=y 1073CONFIG_NETDEV_10000=y
1045# CONFIG_CHELSIO_T1 is not set 1074# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1049# CONFIG_IXGBE is not set 1078# CONFIG_IXGBE is not set
1050# CONFIG_IXGB is not set 1079# CONFIG_IXGB is not set
1051# CONFIG_S2IO is not set 1080# CONFIG_S2IO is not set
1081# CONFIG_VXGE is not set
1052# CONFIG_MYRI10GE is not set 1082# CONFIG_MYRI10GE is not set
1053# CONFIG_NETXEN_NIC is not set 1083# CONFIG_NETXEN_NIC is not set
1054# CONFIG_NIU is not set 1084# CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1058# CONFIG_BNX2X is not set 1088# CONFIG_BNX2X is not set
1059# CONFIG_QLGE is not set 1089# CONFIG_QLGE is not set
1060# CONFIG_SFC is not set 1090# CONFIG_SFC is not set
1091# CONFIG_BE2NET is not set
1061CONFIG_TR=y 1092CONFIG_TR=y
1062# CONFIG_IBMOL is not set 1093# CONFIG_IBMOL is not set
1063# CONFIG_IBMLS is not set 1094# CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
1073# CONFIG_LIBERTAS is not set 1104# CONFIG_LIBERTAS is not set
1074# CONFIG_LIBERTAS_THINFIRM is not set 1105# CONFIG_LIBERTAS_THINFIRM is not set
1075# CONFIG_AIRO is not set 1106# CONFIG_AIRO is not set
1076# CONFIG_HERMES is not set
1077# CONFIG_ATMEL is not set 1107# CONFIG_ATMEL is not set
1108# CONFIG_AT76C50X_USB is not set
1078# CONFIG_AIRO_CS is not set 1109# CONFIG_AIRO_CS is not set
1079# CONFIG_PCMCIA_WL3501 is not set 1110# CONFIG_PCMCIA_WL3501 is not set
1080# CONFIG_PRISM54 is not set 1111# CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
1084# CONFIG_RTL8187 is not set 1115# CONFIG_RTL8187 is not set
1085# CONFIG_ADM8211 is not set 1116# CONFIG_ADM8211 is not set
1086# CONFIG_MAC80211_HWSIM is not set 1117# CONFIG_MAC80211_HWSIM is not set
1118# CONFIG_MWL8K is not set
1087# CONFIG_P54_COMMON is not set 1119# CONFIG_P54_COMMON is not set
1088CONFIG_ATH5K=y 1120CONFIG_ATH5K=y
1089# CONFIG_ATH5K_DEBUG is not set 1121# CONFIG_ATH5K_DEBUG is not set
1090# CONFIG_ATH9K is not set 1122# CONFIG_ATH9K is not set
1123# CONFIG_AR9170_USB is not set
1091# CONFIG_IPW2100 is not set 1124# CONFIG_IPW2100 is not set
1092# CONFIG_IPW2200 is not set 1125# CONFIG_IPW2200 is not set
1093# CONFIG_IWLCORE is not set 1126# CONFIG_IWLWIFI is not set
1094# CONFIG_IWLWIFI_LEDS is not set
1095# CONFIG_IWLAGN is not set
1096# CONFIG_IWL3945 is not set
1097# CONFIG_HOSTAP is not set 1127# CONFIG_HOSTAP is not set
1098# CONFIG_B43 is not set 1128# CONFIG_B43 is not set
1099# CONFIG_B43LEGACY is not set 1129# CONFIG_B43LEGACY is not set
1100# CONFIG_ZD1211RW is not set 1130# CONFIG_ZD1211RW is not set
1101# CONFIG_RT2X00 is not set 1131# CONFIG_RT2X00 is not set
1132# CONFIG_HERMES is not set
1102 1133
1103# 1134#
1104# Enable WiMAX (Networking options) to see the WiMAX drivers 1135# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
1209# CONFIG_TABLET_USB_KBTAB is not set 1240# CONFIG_TABLET_USB_KBTAB is not set
1210# CONFIG_TABLET_USB_WACOM is not set 1241# CONFIG_TABLET_USB_WACOM is not set
1211CONFIG_INPUT_TOUCHSCREEN=y 1242CONFIG_INPUT_TOUCHSCREEN=y
1243# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1244# CONFIG_TOUCHSCREEN_AD7879 is not set
1212# CONFIG_TOUCHSCREEN_FUJITSU is not set 1245# CONFIG_TOUCHSCREEN_FUJITSU is not set
1213# CONFIG_TOUCHSCREEN_GUNZE is not set 1246# CONFIG_TOUCHSCREEN_GUNZE is not set
1214# CONFIG_TOUCHSCREEN_ELO is not set 1247# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
1303# CONFIG_LEGACY_PTYS is not set 1336# CONFIG_LEGACY_PTYS is not set
1304# CONFIG_IPMI_HANDLER is not set 1337# CONFIG_IPMI_HANDLER is not set
1305CONFIG_HW_RANDOM=y 1338CONFIG_HW_RANDOM=y
1339# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1306CONFIG_HW_RANDOM_INTEL=y 1340CONFIG_HW_RANDOM_INTEL=y
1307CONFIG_HW_RANDOM_AMD=y 1341CONFIG_HW_RANDOM_AMD=y
1308CONFIG_HW_RANDOM_GEODE=y 1342CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
1390# CONFIG_SENSORS_PCF8574 is not set 1424# CONFIG_SENSORS_PCF8574 is not set
1391# CONFIG_PCF8575 is not set 1425# CONFIG_PCF8575 is not set
1392# CONFIG_SENSORS_PCA9539 is not set 1426# CONFIG_SENSORS_PCA9539 is not set
1393# CONFIG_SENSORS_PCF8591 is not set
1394# CONFIG_SENSORS_MAX6875 is not set 1427# CONFIG_SENSORS_MAX6875 is not set
1395# CONFIG_SENSORS_TSL2550 is not set 1428# CONFIG_SENSORS_TSL2550 is not set
1396# CONFIG_I2C_DEBUG_CORE is not set 1429# CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
1424# CONFIG_SENSORS_ADT7475 is not set 1457# CONFIG_SENSORS_ADT7475 is not set
1425# CONFIG_SENSORS_K8TEMP is not set 1458# CONFIG_SENSORS_K8TEMP is not set
1426# CONFIG_SENSORS_ASB100 is not set 1459# CONFIG_SENSORS_ASB100 is not set
1460# CONFIG_SENSORS_ATK0110 is not set
1427# CONFIG_SENSORS_ATXP1 is not set 1461# CONFIG_SENSORS_ATXP1 is not set
1428# CONFIG_SENSORS_DS1621 is not set 1462# CONFIG_SENSORS_DS1621 is not set
1429# CONFIG_SENSORS_I5K_AMB is not set 1463# CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
1433# CONFIG_SENSORS_FSCHER is not set 1467# CONFIG_SENSORS_FSCHER is not set
1434# CONFIG_SENSORS_FSCPOS is not set 1468# CONFIG_SENSORS_FSCPOS is not set
1435# CONFIG_SENSORS_FSCHMD is not set 1469# CONFIG_SENSORS_FSCHMD is not set
1470# CONFIG_SENSORS_G760A is not set
1436# CONFIG_SENSORS_GL518SM is not set 1471# CONFIG_SENSORS_GL518SM is not set
1437# CONFIG_SENSORS_GL520SM is not set 1472# CONFIG_SENSORS_GL520SM is not set
1438# CONFIG_SENSORS_CORETEMP is not set 1473# CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
1448# CONFIG_SENSORS_LM90 is not set 1483# CONFIG_SENSORS_LM90 is not set
1449# CONFIG_SENSORS_LM92 is not set 1484# CONFIG_SENSORS_LM92 is not set
1450# CONFIG_SENSORS_LM93 is not set 1485# CONFIG_SENSORS_LM93 is not set
1486# CONFIG_SENSORS_LTC4215 is not set
1451# CONFIG_SENSORS_LTC4245 is not set 1487# CONFIG_SENSORS_LTC4245 is not set
1488# CONFIG_SENSORS_LM95241 is not set
1452# CONFIG_SENSORS_MAX1619 is not set 1489# CONFIG_SENSORS_MAX1619 is not set
1453# CONFIG_SENSORS_MAX6650 is not set 1490# CONFIG_SENSORS_MAX6650 is not set
1454# CONFIG_SENSORS_PC87360 is not set 1491# CONFIG_SENSORS_PC87360 is not set
1455# CONFIG_SENSORS_PC87427 is not set 1492# CONFIG_SENSORS_PC87427 is not set
1493# CONFIG_SENSORS_PCF8591 is not set
1456# CONFIG_SENSORS_SIS5595 is not set 1494# CONFIG_SENSORS_SIS5595 is not set
1457# CONFIG_SENSORS_DME1737 is not set 1495# CONFIG_SENSORS_DME1737 is not set
1458# CONFIG_SENSORS_SMSC47M1 is not set 1496# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
1643# CONFIG_FB_3DFX is not set 1681# CONFIG_FB_3DFX is not set
1644# CONFIG_FB_VOODOO1 is not set 1682# CONFIG_FB_VOODOO1 is not set
1645# CONFIG_FB_VT8623 is not set 1683# CONFIG_FB_VT8623 is not set
1646# CONFIG_FB_CYBLA is not set
1647# CONFIG_FB_TRIDENT is not set 1684# CONFIG_FB_TRIDENT is not set
1648# CONFIG_FB_ARK is not set 1685# CONFIG_FB_ARK is not set
1649# CONFIG_FB_PM3 is not set 1686# CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
1652# CONFIG_FB_VIRTUAL is not set 1689# CONFIG_FB_VIRTUAL is not set
1653# CONFIG_FB_METRONOME is not set 1690# CONFIG_FB_METRONOME is not set
1654# CONFIG_FB_MB862XX is not set 1691# CONFIG_FB_MB862XX is not set
1692# CONFIG_FB_BROADSHEET is not set
1655CONFIG_BACKLIGHT_LCD_SUPPORT=y 1693CONFIG_BACKLIGHT_LCD_SUPPORT=y
1656# CONFIG_LCD_CLASS_DEVICE is not set 1694# CONFIG_LCD_CLASS_DEVICE is not set
1657CONFIG_BACKLIGHT_CLASS_DEVICE=y 1695CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
1738# CONFIG_SND_INDIGO is not set 1776# CONFIG_SND_INDIGO is not set
1739# CONFIG_SND_INDIGOIO is not set 1777# CONFIG_SND_INDIGOIO is not set
1740# CONFIG_SND_INDIGODJ is not set 1778# CONFIG_SND_INDIGODJ is not set
1779# CONFIG_SND_INDIGOIOX is not set
1780# CONFIG_SND_INDIGODJX is not set
1741# CONFIG_SND_EMU10K1 is not set 1781# CONFIG_SND_EMU10K1 is not set
1742# CONFIG_SND_EMU10K1X is not set 1782# CONFIG_SND_EMU10K1X is not set
1743# CONFIG_SND_ENS1370 is not set 1783# CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
1811# 1851#
1812# Special HID drivers 1852# Special HID drivers
1813# 1853#
1814CONFIG_HID_COMPAT=y
1815CONFIG_HID_A4TECH=y 1854CONFIG_HID_A4TECH=y
1816CONFIG_HID_APPLE=y 1855CONFIG_HID_APPLE=y
1817CONFIG_HID_BELKIN=y 1856CONFIG_HID_BELKIN=y
1818CONFIG_HID_CHERRY=y 1857CONFIG_HID_CHERRY=y
1819CONFIG_HID_CHICONY=y 1858CONFIG_HID_CHICONY=y
1820CONFIG_HID_CYPRESS=y 1859CONFIG_HID_CYPRESS=y
1860# CONFIG_DRAGONRISE_FF is not set
1821CONFIG_HID_EZKEY=y 1861CONFIG_HID_EZKEY=y
1862CONFIG_HID_KYE=y
1822CONFIG_HID_GYRATION=y 1863CONFIG_HID_GYRATION=y
1864CONFIG_HID_KENSINGTON=y
1823CONFIG_HID_LOGITECH=y 1865CONFIG_HID_LOGITECH=y
1824CONFIG_LOGITECH_FF=y 1866CONFIG_LOGITECH_FF=y
1825# CONFIG_LOGIRUMBLEPAD2_FF is not set 1867# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
1885# CONFIG_USB_TMC is not set 1927# CONFIG_USB_TMC is not set
1886 1928
1887# 1929#
1888# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1930# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1889# 1931#
1890 1932
1891# 1933#
1892# see USB_STORAGE Help for more information 1934# also be needed; see USB_STORAGE Help for more info
1893# 1935#
1894CONFIG_USB_STORAGE=y 1936CONFIG_USB_STORAGE=y
1895# CONFIG_USB_STORAGE_DEBUG is not set 1937# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
1931# CONFIG_USB_LED is not set 1973# CONFIG_USB_LED is not set
1932# CONFIG_USB_CYPRESS_CY7C63 is not set 1974# CONFIG_USB_CYPRESS_CY7C63 is not set
1933# CONFIG_USB_CYTHERM is not set 1975# CONFIG_USB_CYTHERM is not set
1934# CONFIG_USB_PHIDGET is not set
1935# CONFIG_USB_IDMOUSE is not set 1976# CONFIG_USB_IDMOUSE is not set
1936# CONFIG_USB_FTDI_ELAN is not set 1977# CONFIG_USB_FTDI_ELAN is not set
1937# CONFIG_USB_APPLEDISPLAY is not set 1978# CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
1947# 1988#
1948# OTG and related infrastructure 1989# OTG and related infrastructure
1949# 1990#
1991# CONFIG_NOP_USB_XCEIV is not set
1950# CONFIG_UWB is not set 1992# CONFIG_UWB is not set
1951# CONFIG_MMC is not set 1993# CONFIG_MMC is not set
1952# CONFIG_MEMSTICK is not set 1994# CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
1958# 2000#
1959# CONFIG_LEDS_ALIX2 is not set 2001# CONFIG_LEDS_ALIX2 is not set
1960# CONFIG_LEDS_PCA9532 is not set 2002# CONFIG_LEDS_PCA9532 is not set
2003# CONFIG_LEDS_LP5521 is not set
1961# CONFIG_LEDS_CLEVO_MAIL is not set 2004# CONFIG_LEDS_CLEVO_MAIL is not set
1962# CONFIG_LEDS_PCA955X is not set 2005# CONFIG_LEDS_PCA955X is not set
2006# CONFIG_LEDS_BD2802 is not set
1963 2007
1964# 2008#
1965# LED Triggers 2009# LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
1969# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 2013# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1970# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 2014# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1971# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 2015# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
2016
2017#
2018# iptables trigger is under Netfilter config (LED target)
2019#
1972# CONFIG_ACCESSIBILITY is not set 2020# CONFIG_ACCESSIBILITY is not set
1973# CONFIG_INFINIBAND is not set 2021# CONFIG_INFINIBAND is not set
1974CONFIG_EDAC=y 2022CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
2037# DMA Devices 2085# DMA Devices
2038# 2086#
2039# CONFIG_INTEL_IOATDMA is not set 2087# CONFIG_INTEL_IOATDMA is not set
2088# CONFIG_AUXDISPLAY is not set
2040# CONFIG_UIO is not set 2089# CONFIG_UIO is not set
2041# CONFIG_STAGING is not set 2090# CONFIG_STAGING is not set
2042CONFIG_X86_PLATFORM_DEVICES=y 2091CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
2071# 2120#
2072# CONFIG_EXT2_FS is not set 2121# CONFIG_EXT2_FS is not set
2073CONFIG_EXT3_FS=y 2122CONFIG_EXT3_FS=y
2123# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2074CONFIG_EXT3_FS_XATTR=y 2124CONFIG_EXT3_FS_XATTR=y
2075CONFIG_EXT3_FS_POSIX_ACL=y 2125CONFIG_EXT3_FS_POSIX_ACL=y
2076CONFIG_EXT3_FS_SECURITY=y 2126CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y
2101CONFIG_GENERIC_ACL=y 2151CONFIG_GENERIC_ACL=y
2102 2152
2103# 2153#
2154# Caches
2155#
2156# CONFIG_FSCACHE is not set
2157
2158#
2104# CD-ROM/DVD Filesystems 2159# CD-ROM/DVD Filesystems
2105# 2160#
2106CONFIG_ISO9660_FS=y 2161CONFIG_ISO9660_FS=y
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
2151# CONFIG_ROMFS_FS is not set 2206# CONFIG_ROMFS_FS is not set
2152# CONFIG_SYSV_FS is not set 2207# CONFIG_SYSV_FS is not set
2153# CONFIG_UFS_FS is not set 2208# CONFIG_UFS_FS is not set
2209# CONFIG_NILFS2_FS is not set
2154CONFIG_NETWORK_FILESYSTEMS=y 2210CONFIG_NETWORK_FILESYSTEMS=y
2155CONFIG_NFS_FS=y 2211CONFIG_NFS_FS=y
2156CONFIG_NFS_V3=y 2212CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2164CONFIG_NFS_COMMON=y 2220CONFIG_NFS_COMMON=y
2165CONFIG_SUNRPC=y 2221CONFIG_SUNRPC=y
2166CONFIG_SUNRPC_GSS=y 2222CONFIG_SUNRPC_GSS=y
2167# CONFIG_SUNRPC_REGISTER_V4 is not set
2168CONFIG_RPCSEC_GSS_KRB5=y 2223CONFIG_RPCSEC_GSS_KRB5=y
2169# CONFIG_RPCSEC_GSS_SPKM3 is not set 2224# CONFIG_RPCSEC_GSS_SPKM3 is not set
2170# CONFIG_SMB_FS is not set 2225# CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
2251CONFIG_DEBUG_KERNEL=y 2306CONFIG_DEBUG_KERNEL=y
2252# CONFIG_DEBUG_SHIRQ is not set 2307# CONFIG_DEBUG_SHIRQ is not set
2253# CONFIG_DETECT_SOFTLOCKUP is not set 2308# CONFIG_DETECT_SOFTLOCKUP is not set
2309# CONFIG_DETECT_HUNG_TASK is not set
2254# CONFIG_SCHED_DEBUG is not set 2310# CONFIG_SCHED_DEBUG is not set
2255CONFIG_SCHEDSTATS=y 2311CONFIG_SCHEDSTATS=y
2256CONFIG_TIMER_STATS=y 2312CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
2266# CONFIG_LOCK_STAT is not set 2322# CONFIG_LOCK_STAT is not set
2267# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2323# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2268# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2324# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2325CONFIG_STACKTRACE=y
2269# CONFIG_DEBUG_KOBJECT is not set 2326# CONFIG_DEBUG_KOBJECT is not set
2270# CONFIG_DEBUG_HIGHMEM is not set 2327# CONFIG_DEBUG_HIGHMEM is not set
2271CONFIG_DEBUG_BUGVERBOSE=y 2328CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
2289# CONFIG_FAULT_INJECTION is not set 2346# CONFIG_FAULT_INJECTION is not set
2290# CONFIG_LATENCYTOP is not set 2347# CONFIG_LATENCYTOP is not set
2291CONFIG_SYSCTL_SYSCALL_CHECK=y 2348CONFIG_SYSCTL_SYSCALL_CHECK=y
2349# CONFIG_DEBUG_PAGEALLOC is not set
2292CONFIG_USER_STACKTRACE_SUPPORT=y 2350CONFIG_USER_STACKTRACE_SUPPORT=y
2351CONFIG_NOP_TRACER=y
2293CONFIG_HAVE_FUNCTION_TRACER=y 2352CONFIG_HAVE_FUNCTION_TRACER=y
2294CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2353CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2295CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2354CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2296CONFIG_HAVE_DYNAMIC_FTRACE=y 2355CONFIG_HAVE_DYNAMIC_FTRACE=y
2297CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2356CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2298CONFIG_HAVE_HW_BRANCH_TRACER=y 2357CONFIG_HAVE_HW_BRANCH_TRACER=y
2358CONFIG_HAVE_FTRACE_SYSCALLS=y
2359CONFIG_RING_BUFFER=y
2360CONFIG_TRACING=y
2361CONFIG_TRACING_SUPPORT=y
2299 2362
2300# 2363#
2301# Tracers 2364# Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2305# CONFIG_SYSPROF_TRACER is not set 2368# CONFIG_SYSPROF_TRACER is not set
2306# CONFIG_SCHED_TRACER is not set 2369# CONFIG_SCHED_TRACER is not set
2307# CONFIG_CONTEXT_SWITCH_TRACER is not set 2370# CONFIG_CONTEXT_SWITCH_TRACER is not set
2371# CONFIG_EVENT_TRACER is not set
2372# CONFIG_FTRACE_SYSCALLS is not set
2308# CONFIG_BOOT_TRACER is not set 2373# CONFIG_BOOT_TRACER is not set
2309# CONFIG_TRACE_BRANCH_PROFILING is not set 2374# CONFIG_TRACE_BRANCH_PROFILING is not set
2310# CONFIG_POWER_TRACER is not set 2375# CONFIG_POWER_TRACER is not set
2311# CONFIG_STACK_TRACER is not set 2376# CONFIG_STACK_TRACER is not set
2312# CONFIG_HW_BRANCH_TRACER is not set 2377# CONFIG_HW_BRANCH_TRACER is not set
2378# CONFIG_KMEMTRACE is not set
2379# CONFIG_WORKQUEUE_TRACER is not set
2380CONFIG_BLK_DEV_IO_TRACE=y
2381# CONFIG_FTRACE_STARTUP_TEST is not set
2382# CONFIG_MMIOTRACE is not set
2313CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2383CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2314# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2384# CONFIG_DYNAMIC_DEBUG is not set
2385# CONFIG_DMA_API_DEBUG is not set
2315# CONFIG_SAMPLES is not set 2386# CONFIG_SAMPLES is not set
2316CONFIG_HAVE_ARCH_KGDB=y 2387CONFIG_HAVE_ARCH_KGDB=y
2317# CONFIG_KGDB is not set 2388# CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
2321CONFIG_EARLY_PRINTK_DBGP=y 2392CONFIG_EARLY_PRINTK_DBGP=y
2322CONFIG_DEBUG_STACKOVERFLOW=y 2393CONFIG_DEBUG_STACKOVERFLOW=y
2323CONFIG_DEBUG_STACK_USAGE=y 2394CONFIG_DEBUG_STACK_USAGE=y
2324# CONFIG_DEBUG_PAGEALLOC is not set
2325# CONFIG_DEBUG_PER_CPU_MAPS is not set 2395# CONFIG_DEBUG_PER_CPU_MAPS is not set
2326# CONFIG_X86_PTDUMP is not set 2396# CONFIG_X86_PTDUMP is not set
2327CONFIG_DEBUG_RODATA=y 2397CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
2329CONFIG_DEBUG_NX_TEST=m 2399CONFIG_DEBUG_NX_TEST=m
2330# CONFIG_4KSTACKS is not set 2400# CONFIG_4KSTACKS is not set
2331CONFIG_DOUBLEFAULT=y 2401CONFIG_DOUBLEFAULT=y
2332# CONFIG_MMIOTRACE is not set 2402CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2333CONFIG_IO_DELAY_TYPE_0X80=0 2403CONFIG_IO_DELAY_TYPE_0X80=0
2334CONFIG_IO_DELAY_TYPE_0XED=1 2404CONFIG_IO_DELAY_TYPE_0XED=1
2335CONFIG_IO_DELAY_TYPE_UDELAY=2 2405CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2365CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2435CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2366# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2436# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2367# CONFIG_SECURITY_SMACK is not set 2437# CONFIG_SECURITY_SMACK is not set
2438# CONFIG_SECURITY_TOMOYO is not set
2439# CONFIG_IMA is not set
2368CONFIG_CRYPTO=y 2440CONFIG_CRYPTO=y
2369 2441
2370# 2442#
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2380CONFIG_CRYPTO_HASH=y 2452CONFIG_CRYPTO_HASH=y
2381CONFIG_CRYPTO_HASH2=y 2453CONFIG_CRYPTO_HASH2=y
2382CONFIG_CRYPTO_RNG2=y 2454CONFIG_CRYPTO_RNG2=y
2455CONFIG_CRYPTO_PCOMP=y
2383CONFIG_CRYPTO_MANAGER=y 2456CONFIG_CRYPTO_MANAGER=y
2384CONFIG_CRYPTO_MANAGER2=y 2457CONFIG_CRYPTO_MANAGER2=y
2385# CONFIG_CRYPTO_GF128MUL is not set 2458# CONFIG_CRYPTO_GF128MUL is not set
2386# CONFIG_CRYPTO_NULL is not set 2459# CONFIG_CRYPTO_NULL is not set
2460CONFIG_CRYPTO_WORKQUEUE=y
2387# CONFIG_CRYPTO_CRYPTD is not set 2461# CONFIG_CRYPTO_CRYPTD is not set
2388CONFIG_CRYPTO_AUTHENC=y 2462CONFIG_CRYPTO_AUTHENC=y
2389# CONFIG_CRYPTO_TEST is not set 2463# CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
2456# Compression 2530# Compression
2457# 2531#
2458# CONFIG_CRYPTO_DEFLATE is not set 2532# CONFIG_CRYPTO_DEFLATE is not set
2533# CONFIG_CRYPTO_ZLIB is not set
2459# CONFIG_CRYPTO_LZO is not set 2534# CONFIG_CRYPTO_LZO is not set
2460 2535
2461# 2536#
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
2467# CONFIG_CRYPTO_DEV_GEODE is not set 2542# CONFIG_CRYPTO_DEV_GEODE is not set
2468# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2543# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2469CONFIG_HAVE_KVM=y 2544CONFIG_HAVE_KVM=y
2545CONFIG_HAVE_KVM_IRQCHIP=y
2470CONFIG_VIRTUALIZATION=y 2546CONFIG_VIRTUALIZATION=y
2471# CONFIG_KVM is not set 2547# CONFIG_KVM is not set
2472# CONFIG_LGUEST is not set 2548# CONFIG_LGUEST is not set
2473# CONFIG_VIRTIO_PCI is not set 2549# CONFIG_VIRTIO_PCI is not set
2474# CONFIG_VIRTIO_BALLOON is not set 2550# CONFIG_VIRTIO_BALLOON is not set
2551CONFIG_BINARY_PRINTF=y
2475 2552
2476# 2553#
2477# Library routines 2554# Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
2489# CONFIG_LIBCRC32C is not set 2566# CONFIG_LIBCRC32C is not set
2490CONFIG_AUDIT_GENERIC=y 2567CONFIG_AUDIT_GENERIC=y
2491CONFIG_ZLIB_INFLATE=y 2568CONFIG_ZLIB_INFLATE=y
2492CONFIG_PLIST=y 2569CONFIG_DECOMPRESS_GZIP=y
2570CONFIG_DECOMPRESS_BZIP2=y
2571CONFIG_DECOMPRESS_LZMA=y
2493CONFIG_HAS_IOMEM=y 2572CONFIG_HAS_IOMEM=y
2494CONFIG_HAS_IOPORT=y 2573CONFIG_HAS_IOPORT=y
2495CONFIG_HAS_DMA=y 2574CONFIG_HAS_DMA=y
2575CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 27b8ce0f5908..cee1dd2e69b2 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.29-rc4 3# Linux kernel version: 2.6.30-rc2
4# Tue Feb 24 15:44:16 2009 4# Mon May 11 16:22:00 2009
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_OUTPUT_FORMAT="elf64-x86-64"
10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" 11CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
12CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
34CONFIG_ARCH_HAS_DEFAULT_IDLE=y 35CONFIG_ARCH_HAS_DEFAULT_IDLE=y
35CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y 36CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
36CONFIG_HAVE_SETUP_PER_CPU_AREA=y 37CONFIG_HAVE_SETUP_PER_CPU_AREA=y
38CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
37CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y 39CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
38CONFIG_ARCH_HIBERNATION_POSSIBLE=y 40CONFIG_ARCH_HIBERNATION_POSSIBLE=y
39CONFIG_ARCH_SUSPEND_POSSIBLE=y 41CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
41CONFIG_ARCH_POPULATES_NODE_MAP=y 43CONFIG_ARCH_POPULATES_NODE_MAP=y
42CONFIG_AUDIT_ARCH=y 44CONFIG_AUDIT_ARCH=y
43CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y 45CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
46CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
44CONFIG_GENERIC_HARDIRQS=y 47CONFIG_GENERIC_HARDIRQS=y
48CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
45CONFIG_GENERIC_IRQ_PROBE=y 49CONFIG_GENERIC_IRQ_PROBE=y
46CONFIG_GENERIC_PENDING_IRQ=y 50CONFIG_GENERIC_PENDING_IRQ=y
47CONFIG_X86_SMP=y
48CONFIG_USE_GENERIC_SMP_HELPERS=y 51CONFIG_USE_GENERIC_SMP_HELPERS=y
49CONFIG_X86_64_SMP=y 52CONFIG_X86_64_SMP=y
50CONFIG_X86_HT=y 53CONFIG_X86_HT=y
51CONFIG_X86_BIOS_REBOOT=y
52CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
53# CONFIG_KTIME_SCALAR is not set 55# CONFIG_KTIME_SCALAR is not set
54CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
61CONFIG_INIT_ENV_ARG_LIMIT=32 63CONFIG_INIT_ENV_ARG_LIMIT=32
62CONFIG_LOCALVERSION="" 64CONFIG_LOCALVERSION=""
63# CONFIG_LOCALVERSION_AUTO is not set 65# CONFIG_LOCALVERSION_AUTO is not set
66CONFIG_HAVE_KERNEL_GZIP=y
67CONFIG_HAVE_KERNEL_BZIP2=y
68CONFIG_HAVE_KERNEL_LZMA=y
69CONFIG_KERNEL_GZIP=y
70# CONFIG_KERNEL_BZIP2 is not set
71# CONFIG_KERNEL_LZMA is not set
64CONFIG_SWAP=y 72CONFIG_SWAP=y
65CONFIG_SYSVIPC=y 73CONFIG_SYSVIPC=y
66CONFIG_SYSVIPC_SYSCTL=y 74CONFIG_SYSVIPC_SYSCTL=y
67CONFIG_POSIX_MQUEUE=y 75CONFIG_POSIX_MQUEUE=y
76CONFIG_POSIX_MQUEUE_SYSCTL=y
68CONFIG_BSD_PROCESS_ACCT=y 77CONFIG_BSD_PROCESS_ACCT=y
69# CONFIG_BSD_PROCESS_ACCT_V3 is not set 78# CONFIG_BSD_PROCESS_ACCT_V3 is not set
70CONFIG_TASKSTATS=y 79CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
114CONFIG_NET_NS=y 123CONFIG_NET_NS=y
115CONFIG_BLK_DEV_INITRD=y 124CONFIG_BLK_DEV_INITRD=y
116CONFIG_INITRAMFS_SOURCE="" 125CONFIG_INITRAMFS_SOURCE=""
126CONFIG_RD_GZIP=y
127CONFIG_RD_BZIP2=y
128CONFIG_RD_LZMA=y
117CONFIG_CC_OPTIMIZE_FOR_SIZE=y 129CONFIG_CC_OPTIMIZE_FOR_SIZE=y
118CONFIG_SYSCTL=y 130CONFIG_SYSCTL=y
131CONFIG_ANON_INODES=y
119# CONFIG_EMBEDDED is not set 132# CONFIG_EMBEDDED is not set
120CONFIG_UID16=y 133CONFIG_UID16=y
121CONFIG_SYSCTL_SYSCALL=y 134CONFIG_SYSCTL_SYSCALL=y
122CONFIG_KALLSYMS=y 135CONFIG_KALLSYMS=y
123CONFIG_KALLSYMS_ALL=y 136CONFIG_KALLSYMS_ALL=y
124CONFIG_KALLSYMS_EXTRA_PASS=y 137CONFIG_KALLSYMS_EXTRA_PASS=y
138# CONFIG_STRIP_ASM_SYMS is not set
125CONFIG_HOTPLUG=y 139CONFIG_HOTPLUG=y
126CONFIG_PRINTK=y 140CONFIG_PRINTK=y
127CONFIG_BUG=y 141CONFIG_BUG=y
128CONFIG_ELF_CORE=y 142CONFIG_ELF_CORE=y
129CONFIG_PCSPKR_PLATFORM=y 143CONFIG_PCSPKR_PLATFORM=y
130# CONFIG_COMPAT_BRK is not set
131CONFIG_BASE_FULL=y 144CONFIG_BASE_FULL=y
132CONFIG_FUTEX=y 145CONFIG_FUTEX=y
133CONFIG_ANON_INODES=y
134CONFIG_EPOLL=y 146CONFIG_EPOLL=y
135CONFIG_SIGNALFD=y 147CONFIG_SIGNALFD=y
136CONFIG_TIMERFD=y 148CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
140CONFIG_VM_EVENT_COUNTERS=y 152CONFIG_VM_EVENT_COUNTERS=y
141CONFIG_PCI_QUIRKS=y 153CONFIG_PCI_QUIRKS=y
142CONFIG_SLUB_DEBUG=y 154CONFIG_SLUB_DEBUG=y
155# CONFIG_COMPAT_BRK is not set
143# CONFIG_SLAB is not set 156# CONFIG_SLAB is not set
144CONFIG_SLUB=y 157CONFIG_SLUB=y
145# CONFIG_SLOB is not set 158# CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
155CONFIG_HAVE_KPROBES=y 168CONFIG_HAVE_KPROBES=y
156CONFIG_HAVE_KRETPROBES=y 169CONFIG_HAVE_KRETPROBES=y
157CONFIG_HAVE_ARCH_TRACEHOOK=y 170CONFIG_HAVE_ARCH_TRACEHOOK=y
171CONFIG_HAVE_DMA_API_DEBUG=y
172# CONFIG_SLOW_WORK is not set
158# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set 173# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
159CONFIG_SLABINFO=y 174CONFIG_SLABINFO=y
160CONFIG_RT_MUTEXES=y 175CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
167# CONFIG_MODULE_SRCVERSION_ALL is not set 182# CONFIG_MODULE_SRCVERSION_ALL is not set
168CONFIG_STOP_MACHINE=y 183CONFIG_STOP_MACHINE=y
169CONFIG_BLOCK=y 184CONFIG_BLOCK=y
170CONFIG_BLK_DEV_IO_TRACE=y
171CONFIG_BLK_DEV_BSG=y 185CONFIG_BLK_DEV_BSG=y
172# CONFIG_BLK_DEV_INTEGRITY is not set 186# CONFIG_BLK_DEV_INTEGRITY is not set
173CONFIG_BLOCK_COMPAT=y 187CONFIG_BLOCK_COMPAT=y
@@ -195,11 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
195CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 209CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
196CONFIG_SMP=y 210CONFIG_SMP=y
197CONFIG_SPARSE_IRQ=y 211CONFIG_SPARSE_IRQ=y
198CONFIG_X86_FIND_SMP_CONFIG=y
199CONFIG_X86_MPPARSE=y 212CONFIG_X86_MPPARSE=y
200# CONFIG_X86_ELAN is not set 213CONFIG_X86_EXTENDED_PLATFORM=y
201# CONFIG_X86_GENERICARCH is not set
202# CONFIG_X86_VSMP is not set 214# CONFIG_X86_VSMP is not set
215# CONFIG_X86_UV is not set
203CONFIG_SCHED_OMIT_FRAME_POINTER=y 216CONFIG_SCHED_OMIT_FRAME_POINTER=y
204# CONFIG_PARAVIRT_GUEST is not set 217# CONFIG_PARAVIRT_GUEST is not set
205# CONFIG_MEMTEST is not set 218# CONFIG_MEMTEST is not set
@@ -229,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
229# CONFIG_MCORE2 is not set 242# CONFIG_MCORE2 is not set
230CONFIG_GENERIC_CPU=y 243CONFIG_GENERIC_CPU=y
231CONFIG_X86_CPU=y 244CONFIG_X86_CPU=y
232CONFIG_X86_L1_CACHE_BYTES=128 245CONFIG_X86_L1_CACHE_BYTES=64
233CONFIG_X86_INTERNODE_CACHE_BYTES=128 246CONFIG_X86_INTERNODE_CACHE_BYTES=64
234CONFIG_X86_CMPXCHG=y 247CONFIG_X86_CMPXCHG=y
235CONFIG_X86_L1_CACHE_SHIFT=7 248CONFIG_X86_L1_CACHE_SHIFT=6
236CONFIG_X86_WP_WORKS_OK=y 249CONFIG_X86_WP_WORKS_OK=y
237CONFIG_X86_TSC=y 250CONFIG_X86_TSC=y
238CONFIG_X86_CMPXCHG64=y 251CONFIG_X86_CMPXCHG64=y
@@ -241,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
241CONFIG_X86_DEBUGCTLMSR=y 254CONFIG_X86_DEBUGCTLMSR=y
242CONFIG_CPU_SUP_INTEL=y 255CONFIG_CPU_SUP_INTEL=y
243CONFIG_CPU_SUP_AMD=y 256CONFIG_CPU_SUP_AMD=y
244CONFIG_CPU_SUP_CENTAUR_64=y 257CONFIG_CPU_SUP_CENTAUR=y
245CONFIG_X86_DS=y 258CONFIG_X86_DS=y
246CONFIG_X86_PTRACE_BTS=y 259CONFIG_X86_PTRACE_BTS=y
247CONFIG_HPET_TIMER=y 260CONFIG_HPET_TIMER=y
@@ -268,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
268CONFIG_X86_MCE=y 281CONFIG_X86_MCE=y
269CONFIG_X86_MCE_INTEL=y 282CONFIG_X86_MCE_INTEL=y
270CONFIG_X86_MCE_AMD=y 283CONFIG_X86_MCE_AMD=y
284CONFIG_X86_MCE_THRESHOLD=y
271# CONFIG_I8K is not set 285# CONFIG_I8K is not set
272CONFIG_MICROCODE=y 286CONFIG_MICROCODE=y
273CONFIG_MICROCODE_INTEL=y 287CONFIG_MICROCODE_INTEL=y
@@ -275,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
275CONFIG_MICROCODE_OLD_INTERFACE=y 289CONFIG_MICROCODE_OLD_INTERFACE=y
276CONFIG_X86_MSR=y 290CONFIG_X86_MSR=y
277CONFIG_X86_CPUID=y 291CONFIG_X86_CPUID=y
292# CONFIG_X86_CPU_DEBUG is not set
278CONFIG_ARCH_PHYS_ADDR_T_64BIT=y 293CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
279CONFIG_DIRECT_GBPAGES=y 294CONFIG_DIRECT_GBPAGES=y
280CONFIG_NUMA=y 295CONFIG_NUMA=y
@@ -308,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
308CONFIG_BOUNCE=y 323CONFIG_BOUNCE=y
309CONFIG_VIRT_TO_BUS=y 324CONFIG_VIRT_TO_BUS=y
310CONFIG_UNEVICTABLE_LRU=y 325CONFIG_UNEVICTABLE_LRU=y
326CONFIG_HAVE_MLOCK=y
327CONFIG_HAVE_MLOCKED_PAGE_BIT=y
311CONFIG_X86_CHECK_BIOS_CORRUPTION=y 328CONFIG_X86_CHECK_BIOS_CORRUPTION=y
312CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y 329CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
313CONFIG_X86_RESERVE_LOW_64K=y 330CONFIG_X86_RESERVE_LOW_64K=y
@@ -316,6 +333,7 @@ CONFIG_MTRR=y
316CONFIG_X86_PAT=y 333CONFIG_X86_PAT=y
317CONFIG_EFI=y 334CONFIG_EFI=y
318CONFIG_SECCOMP=y 335CONFIG_SECCOMP=y
336# CONFIG_CC_STACKPROTECTOR is not set
319# CONFIG_HZ_100 is not set 337# CONFIG_HZ_100 is not set
320# CONFIG_HZ_250 is not set 338# CONFIG_HZ_250 is not set
321# CONFIG_HZ_300 is not set 339# CONFIG_HZ_300 is not set
@@ -324,9 +342,10 @@ CONFIG_HZ=1000
324CONFIG_SCHED_HRTICK=y 342CONFIG_SCHED_HRTICK=y
325CONFIG_KEXEC=y 343CONFIG_KEXEC=y
326CONFIG_CRASH_DUMP=y 344CONFIG_CRASH_DUMP=y
345# CONFIG_KEXEC_JUMP is not set
327CONFIG_PHYSICAL_START=0x1000000 346CONFIG_PHYSICAL_START=0x1000000
328# CONFIG_RELOCATABLE is not set 347CONFIG_RELOCATABLE=y
329CONFIG_PHYSICAL_ALIGN=0x200000 348CONFIG_PHYSICAL_ALIGN=0x1000000
330CONFIG_HOTPLUG_CPU=y 349CONFIG_HOTPLUG_CPU=y
331# CONFIG_COMPAT_VDSO is not set 350# CONFIG_COMPAT_VDSO is not set
332# CONFIG_CMDLINE_BOOL is not set 351# CONFIG_CMDLINE_BOOL is not set
@@ -369,7 +388,6 @@ CONFIG_ACPI_NUMA=y
369CONFIG_ACPI_BLACKLIST_YEAR=0 388CONFIG_ACPI_BLACKLIST_YEAR=0
370# CONFIG_ACPI_DEBUG is not set 389# CONFIG_ACPI_DEBUG is not set
371# CONFIG_ACPI_PCI_SLOT is not set 390# CONFIG_ACPI_PCI_SLOT is not set
372CONFIG_ACPI_SYSTEM=y
373CONFIG_X86_PM_TIMER=y 391CONFIG_X86_PM_TIMER=y
374CONFIG_ACPI_CONTAINER=y 392CONFIG_ACPI_CONTAINER=y
375# CONFIG_ACPI_SBS is not set 393# CONFIG_ACPI_SBS is not set
@@ -435,6 +453,7 @@ CONFIG_PCI_MSI=y
435# CONFIG_PCI_DEBUG is not set 453# CONFIG_PCI_DEBUG is not set
436# CONFIG_PCI_STUB is not set 454# CONFIG_PCI_STUB is not set
437CONFIG_HT_IRQ=y 455CONFIG_HT_IRQ=y
456# CONFIG_PCI_IOV is not set
438CONFIG_ISA_DMA_API=y 457CONFIG_ISA_DMA_API=y
439CONFIG_K8_NB=y 458CONFIG_K8_NB=y
440CONFIG_PCCARD=y 459CONFIG_PCCARD=y
@@ -480,7 +499,6 @@ CONFIG_NET=y
480# 499#
481# Networking options 500# Networking options
482# 501#
483CONFIG_COMPAT_NET_DEV_OPS=y
484CONFIG_PACKET=y 502CONFIG_PACKET=y
485CONFIG_PACKET_MMAP=y 503CONFIG_PACKET_MMAP=y
486CONFIG_UNIX=y 504CONFIG_UNIX=y
@@ -638,6 +656,7 @@ CONFIG_LLC=y
638# CONFIG_LAPB is not set 656# CONFIG_LAPB is not set
639# CONFIG_ECONET is not set 657# CONFIG_ECONET is not set
640# CONFIG_WAN_ROUTER is not set 658# CONFIG_WAN_ROUTER is not set
659# CONFIG_PHONET is not set
641CONFIG_NET_SCHED=y 660CONFIG_NET_SCHED=y
642 661
643# 662#
@@ -695,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
695# 714#
696# CONFIG_NET_PKTGEN is not set 715# CONFIG_NET_PKTGEN is not set
697# CONFIG_NET_TCPPROBE is not set 716# CONFIG_NET_TCPPROBE is not set
717# CONFIG_NET_DROP_MONITOR is not set
698CONFIG_HAMRADIO=y 718CONFIG_HAMRADIO=y
699 719
700# 720#
@@ -705,12 +725,10 @@ CONFIG_HAMRADIO=y
705# CONFIG_IRDA is not set 725# CONFIG_IRDA is not set
706# CONFIG_BT is not set 726# CONFIG_BT is not set
707# CONFIG_AF_RXRPC is not set 727# CONFIG_AF_RXRPC is not set
708# CONFIG_PHONET is not set
709CONFIG_FIB_RULES=y 728CONFIG_FIB_RULES=y
710CONFIG_WIRELESS=y 729CONFIG_WIRELESS=y
711CONFIG_CFG80211=y 730CONFIG_CFG80211=y
712# CONFIG_CFG80211_REG_DEBUG is not set 731# CONFIG_CFG80211_REG_DEBUG is not set
713CONFIG_NL80211=y
714CONFIG_WIRELESS_OLD_REGULATORY=y 732CONFIG_WIRELESS_OLD_REGULATORY=y
715CONFIG_WIRELESS_EXT=y 733CONFIG_WIRELESS_EXT=y
716CONFIG_WIRELESS_EXT_SYSFS=y 734CONFIG_WIRELESS_EXT_SYSFS=y
@@ -787,9 +805,8 @@ CONFIG_MISC_DEVICES=y
787# CONFIG_TIFM_CORE is not set 805# CONFIG_TIFM_CORE is not set
788# CONFIG_ICS932S401 is not set 806# CONFIG_ICS932S401 is not set
789# CONFIG_ENCLOSURE_SERVICES is not set 807# CONFIG_ENCLOSURE_SERVICES is not set
790# CONFIG_SGI_XP is not set
791# CONFIG_HP_ILO is not set 808# CONFIG_HP_ILO is not set
792# CONFIG_SGI_GRU is not set 809# CONFIG_ISL29003 is not set
793# CONFIG_C2PORT is not set 810# CONFIG_C2PORT is not set
794 811
795# 812#
@@ -843,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
843# CONFIG_SCSI_LOWLEVEL is not set 860# CONFIG_SCSI_LOWLEVEL is not set
844# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 861# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
845# CONFIG_SCSI_DH is not set 862# CONFIG_SCSI_DH is not set
863# CONFIG_SCSI_OSD_INITIATOR is not set
846CONFIG_ATA=y 864CONFIG_ATA=y
847# CONFIG_ATA_NONSTANDARD is not set 865# CONFIG_ATA_NONSTANDARD is not set
848CONFIG_ATA_ACPI=y 866CONFIG_ATA_ACPI=y
@@ -939,6 +957,7 @@ CONFIG_DM_ZERO=y
939CONFIG_MACINTOSH_DRIVERS=y 957CONFIG_MACINTOSH_DRIVERS=y
940CONFIG_MAC_EMUMOUSEBTN=y 958CONFIG_MAC_EMUMOUSEBTN=y
941CONFIG_NETDEVICES=y 959CONFIG_NETDEVICES=y
960CONFIG_COMPAT_NET_DEV_OPS=y
942# CONFIG_IFB is not set 961# CONFIG_IFB is not set
943# CONFIG_DUMMY is not set 962# CONFIG_DUMMY is not set
944# CONFIG_BONDING is not set 963# CONFIG_BONDING is not set
@@ -976,6 +995,8 @@ CONFIG_MII=y
976CONFIG_NET_VENDOR_3COM=y 995CONFIG_NET_VENDOR_3COM=y
977# CONFIG_VORTEX is not set 996# CONFIG_VORTEX is not set
978# CONFIG_TYPHOON is not set 997# CONFIG_TYPHOON is not set
998# CONFIG_ETHOC is not set
999# CONFIG_DNET is not set
979CONFIG_NET_TULIP=y 1000CONFIG_NET_TULIP=y
980# CONFIG_DE2104X is not set 1001# CONFIG_DE2104X is not set
981# CONFIG_TULIP is not set 1002# CONFIG_TULIP is not set
@@ -1025,6 +1046,7 @@ CONFIG_E1000=y
1025# CONFIG_E1000E is not set 1046# CONFIG_E1000E is not set
1026# CONFIG_IP1000 is not set 1047# CONFIG_IP1000 is not set
1027# CONFIG_IGB is not set 1048# CONFIG_IGB is not set
1049# CONFIG_IGBVF is not set
1028# CONFIG_NS83820 is not set 1050# CONFIG_NS83820 is not set
1029# CONFIG_HAMACHI is not set 1051# CONFIG_HAMACHI is not set
1030# CONFIG_YELLOWFIN is not set 1052# CONFIG_YELLOWFIN is not set
@@ -1039,6 +1061,7 @@ CONFIG_TIGON3=y
1039# CONFIG_QLA3XXX is not set 1061# CONFIG_QLA3XXX is not set
1040# CONFIG_ATL1 is not set 1062# CONFIG_ATL1 is not set
1041# CONFIG_ATL1E is not set 1063# CONFIG_ATL1E is not set
1064# CONFIG_ATL1C is not set
1042# CONFIG_JME is not set 1065# CONFIG_JME is not set
1043CONFIG_NETDEV_10000=y 1066CONFIG_NETDEV_10000=y
1044# CONFIG_CHELSIO_T1 is not set 1067# CONFIG_CHELSIO_T1 is not set
@@ -1048,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1048# CONFIG_IXGBE is not set 1071# CONFIG_IXGBE is not set
1049# CONFIG_IXGB is not set 1072# CONFIG_IXGB is not set
1050# CONFIG_S2IO is not set 1073# CONFIG_S2IO is not set
1074# CONFIG_VXGE is not set
1051# CONFIG_MYRI10GE is not set 1075# CONFIG_MYRI10GE is not set
1052# CONFIG_NETXEN_NIC is not set 1076# CONFIG_NETXEN_NIC is not set
1053# CONFIG_NIU is not set 1077# CONFIG_NIU is not set
@@ -1057,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
1057# CONFIG_BNX2X is not set 1081# CONFIG_BNX2X is not set
1058# CONFIG_QLGE is not set 1082# CONFIG_QLGE is not set
1059# CONFIG_SFC is not set 1083# CONFIG_SFC is not set
1084# CONFIG_BE2NET is not set
1060CONFIG_TR=y 1085CONFIG_TR=y
1061# CONFIG_IBMOL is not set 1086# CONFIG_IBMOL is not set
1062# CONFIG_3C359 is not set 1087# CONFIG_3C359 is not set
@@ -1071,8 +1096,8 @@ CONFIG_WLAN_80211=y
1071# CONFIG_LIBERTAS is not set 1096# CONFIG_LIBERTAS is not set
1072# CONFIG_LIBERTAS_THINFIRM is not set 1097# CONFIG_LIBERTAS_THINFIRM is not set
1073# CONFIG_AIRO is not set 1098# CONFIG_AIRO is not set
1074# CONFIG_HERMES is not set
1075# CONFIG_ATMEL is not set 1099# CONFIG_ATMEL is not set
1100# CONFIG_AT76C50X_USB is not set
1076# CONFIG_AIRO_CS is not set 1101# CONFIG_AIRO_CS is not set
1077# CONFIG_PCMCIA_WL3501 is not set 1102# CONFIG_PCMCIA_WL3501 is not set
1078# CONFIG_PRISM54 is not set 1103# CONFIG_PRISM54 is not set
@@ -1082,21 +1107,21 @@ CONFIG_WLAN_80211=y
1082# CONFIG_RTL8187 is not set 1107# CONFIG_RTL8187 is not set
1083# CONFIG_ADM8211 is not set 1108# CONFIG_ADM8211 is not set
1084# CONFIG_MAC80211_HWSIM is not set 1109# CONFIG_MAC80211_HWSIM is not set
1110# CONFIG_MWL8K is not set
1085# CONFIG_P54_COMMON is not set 1111# CONFIG_P54_COMMON is not set
1086CONFIG_ATH5K=y 1112CONFIG_ATH5K=y
1087# CONFIG_ATH5K_DEBUG is not set 1113# CONFIG_ATH5K_DEBUG is not set
1088# CONFIG_ATH9K is not set 1114# CONFIG_ATH9K is not set
1115# CONFIG_AR9170_USB is not set
1089# CONFIG_IPW2100 is not set 1116# CONFIG_IPW2100 is not set
1090# CONFIG_IPW2200 is not set 1117# CONFIG_IPW2200 is not set
1091# CONFIG_IWLCORE is not set 1118# CONFIG_IWLWIFI is not set
1092# CONFIG_IWLWIFI_LEDS is not set
1093# CONFIG_IWLAGN is not set
1094# CONFIG_IWL3945 is not set
1095# CONFIG_HOSTAP is not set 1119# CONFIG_HOSTAP is not set
1096# CONFIG_B43 is not set 1120# CONFIG_B43 is not set
1097# CONFIG_B43LEGACY is not set 1121# CONFIG_B43LEGACY is not set
1098# CONFIG_ZD1211RW is not set 1122# CONFIG_ZD1211RW is not set
1099# CONFIG_RT2X00 is not set 1123# CONFIG_RT2X00 is not set
1124# CONFIG_HERMES is not set
1100 1125
1101# 1126#
1102# Enable WiMAX (Networking options) to see the WiMAX drivers 1127# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1207,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
1207# CONFIG_TABLET_USB_KBTAB is not set 1232# CONFIG_TABLET_USB_KBTAB is not set
1208# CONFIG_TABLET_USB_WACOM is not set 1233# CONFIG_TABLET_USB_WACOM is not set
1209CONFIG_INPUT_TOUCHSCREEN=y 1234CONFIG_INPUT_TOUCHSCREEN=y
1235# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
1236# CONFIG_TOUCHSCREEN_AD7879 is not set
1210# CONFIG_TOUCHSCREEN_FUJITSU is not set 1237# CONFIG_TOUCHSCREEN_FUJITSU is not set
1211# CONFIG_TOUCHSCREEN_GUNZE is not set 1238# CONFIG_TOUCHSCREEN_GUNZE is not set
1212# CONFIG_TOUCHSCREEN_ELO is not set 1239# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1300,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
1300# CONFIG_LEGACY_PTYS is not set 1327# CONFIG_LEGACY_PTYS is not set
1301# CONFIG_IPMI_HANDLER is not set 1328# CONFIG_IPMI_HANDLER is not set
1302CONFIG_HW_RANDOM=y 1329CONFIG_HW_RANDOM=y
1330# CONFIG_HW_RANDOM_TIMERIOMEM is not set
1303# CONFIG_HW_RANDOM_INTEL is not set 1331# CONFIG_HW_RANDOM_INTEL is not set
1304# CONFIG_HW_RANDOM_AMD is not set 1332# CONFIG_HW_RANDOM_AMD is not set
1305CONFIG_NVRAM=y 1333CONFIG_NVRAM=y
@@ -1381,7 +1409,6 @@ CONFIG_I2C_I801=y
1381# CONFIG_SENSORS_PCF8574 is not set 1409# CONFIG_SENSORS_PCF8574 is not set
1382# CONFIG_PCF8575 is not set 1410# CONFIG_PCF8575 is not set
1383# CONFIG_SENSORS_PCA9539 is not set 1411# CONFIG_SENSORS_PCA9539 is not set
1384# CONFIG_SENSORS_PCF8591 is not set
1385# CONFIG_SENSORS_MAX6875 is not set 1412# CONFIG_SENSORS_MAX6875 is not set
1386# CONFIG_SENSORS_TSL2550 is not set 1413# CONFIG_SENSORS_TSL2550 is not set
1387# CONFIG_I2C_DEBUG_CORE is not set 1414# CONFIG_I2C_DEBUG_CORE is not set
@@ -1415,6 +1442,7 @@ CONFIG_HWMON=y
1415# CONFIG_SENSORS_ADT7475 is not set 1442# CONFIG_SENSORS_ADT7475 is not set
1416# CONFIG_SENSORS_K8TEMP is not set 1443# CONFIG_SENSORS_K8TEMP is not set
1417# CONFIG_SENSORS_ASB100 is not set 1444# CONFIG_SENSORS_ASB100 is not set
1445# CONFIG_SENSORS_ATK0110 is not set
1418# CONFIG_SENSORS_ATXP1 is not set 1446# CONFIG_SENSORS_ATXP1 is not set
1419# CONFIG_SENSORS_DS1621 is not set 1447# CONFIG_SENSORS_DS1621 is not set
1420# CONFIG_SENSORS_I5K_AMB is not set 1448# CONFIG_SENSORS_I5K_AMB is not set
@@ -1424,6 +1452,7 @@ CONFIG_HWMON=y
1424# CONFIG_SENSORS_FSCHER is not set 1452# CONFIG_SENSORS_FSCHER is not set
1425# CONFIG_SENSORS_FSCPOS is not set 1453# CONFIG_SENSORS_FSCPOS is not set
1426# CONFIG_SENSORS_FSCHMD is not set 1454# CONFIG_SENSORS_FSCHMD is not set
1455# CONFIG_SENSORS_G760A is not set
1427# CONFIG_SENSORS_GL518SM is not set 1456# CONFIG_SENSORS_GL518SM is not set
1428# CONFIG_SENSORS_GL520SM is not set 1457# CONFIG_SENSORS_GL520SM is not set
1429# CONFIG_SENSORS_CORETEMP is not set 1458# CONFIG_SENSORS_CORETEMP is not set
@@ -1439,11 +1468,14 @@ CONFIG_HWMON=y
1439# CONFIG_SENSORS_LM90 is not set 1468# CONFIG_SENSORS_LM90 is not set
1440# CONFIG_SENSORS_LM92 is not set 1469# CONFIG_SENSORS_LM92 is not set
1441# CONFIG_SENSORS_LM93 is not set 1470# CONFIG_SENSORS_LM93 is not set
1471# CONFIG_SENSORS_LTC4215 is not set
1442# CONFIG_SENSORS_LTC4245 is not set 1472# CONFIG_SENSORS_LTC4245 is not set
1473# CONFIG_SENSORS_LM95241 is not set
1443# CONFIG_SENSORS_MAX1619 is not set 1474# CONFIG_SENSORS_MAX1619 is not set
1444# CONFIG_SENSORS_MAX6650 is not set 1475# CONFIG_SENSORS_MAX6650 is not set
1445# CONFIG_SENSORS_PC87360 is not set 1476# CONFIG_SENSORS_PC87360 is not set
1446# CONFIG_SENSORS_PC87427 is not set 1477# CONFIG_SENSORS_PC87427 is not set
1478# CONFIG_SENSORS_PCF8591 is not set
1447# CONFIG_SENSORS_SIS5595 is not set 1479# CONFIG_SENSORS_SIS5595 is not set
1448# CONFIG_SENSORS_DME1737 is not set 1480# CONFIG_SENSORS_DME1737 is not set
1449# CONFIG_SENSORS_SMSC47M1 is not set 1481# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1634,6 +1666,7 @@ CONFIG_FB_EFI=y
1634# CONFIG_FB_VIRTUAL is not set 1666# CONFIG_FB_VIRTUAL is not set
1635# CONFIG_FB_METRONOME is not set 1667# CONFIG_FB_METRONOME is not set
1636# CONFIG_FB_MB862XX is not set 1668# CONFIG_FB_MB862XX is not set
1669# CONFIG_FB_BROADSHEET is not set
1637CONFIG_BACKLIGHT_LCD_SUPPORT=y 1670CONFIG_BACKLIGHT_LCD_SUPPORT=y
1638# CONFIG_LCD_CLASS_DEVICE is not set 1671# CONFIG_LCD_CLASS_DEVICE is not set
1639CONFIG_BACKLIGHT_CLASS_DEVICE=y 1672CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1719,6 +1752,8 @@ CONFIG_SND_PCI=y
1719# CONFIG_SND_INDIGO is not set 1752# CONFIG_SND_INDIGO is not set
1720# CONFIG_SND_INDIGOIO is not set 1753# CONFIG_SND_INDIGOIO is not set
1721# CONFIG_SND_INDIGODJ is not set 1754# CONFIG_SND_INDIGODJ is not set
1755# CONFIG_SND_INDIGOIOX is not set
1756# CONFIG_SND_INDIGODJX is not set
1722# CONFIG_SND_EMU10K1 is not set 1757# CONFIG_SND_EMU10K1 is not set
1723# CONFIG_SND_EMU10K1X is not set 1758# CONFIG_SND_EMU10K1X is not set
1724# CONFIG_SND_ENS1370 is not set 1759# CONFIG_SND_ENS1370 is not set
@@ -1791,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
1791# 1826#
1792# Special HID drivers 1827# Special HID drivers
1793# 1828#
1794CONFIG_HID_COMPAT=y
1795CONFIG_HID_A4TECH=y 1829CONFIG_HID_A4TECH=y
1796CONFIG_HID_APPLE=y 1830CONFIG_HID_APPLE=y
1797CONFIG_HID_BELKIN=y 1831CONFIG_HID_BELKIN=y
1798CONFIG_HID_CHERRY=y 1832CONFIG_HID_CHERRY=y
1799CONFIG_HID_CHICONY=y 1833CONFIG_HID_CHICONY=y
1800CONFIG_HID_CYPRESS=y 1834CONFIG_HID_CYPRESS=y
1835# CONFIG_DRAGONRISE_FF is not set
1801CONFIG_HID_EZKEY=y 1836CONFIG_HID_EZKEY=y
1837CONFIG_HID_KYE=y
1802CONFIG_HID_GYRATION=y 1838CONFIG_HID_GYRATION=y
1839CONFIG_HID_KENSINGTON=y
1803CONFIG_HID_LOGITECH=y 1840CONFIG_HID_LOGITECH=y
1804CONFIG_LOGITECH_FF=y 1841CONFIG_LOGITECH_FF=y
1805# CONFIG_LOGIRUMBLEPAD2_FF is not set 1842# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1865,11 +1902,11 @@ CONFIG_USB_PRINTER=y
1865# CONFIG_USB_TMC is not set 1902# CONFIG_USB_TMC is not set
1866 1903
1867# 1904#
1868# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; 1905# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
1869# 1906#
1870 1907
1871# 1908#
1872# see USB_STORAGE Help for more information 1909# also be needed; see USB_STORAGE Help for more info
1873# 1910#
1874CONFIG_USB_STORAGE=y 1911CONFIG_USB_STORAGE=y
1875# CONFIG_USB_STORAGE_DEBUG is not set 1912# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1911,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
1911# CONFIG_USB_LED is not set 1948# CONFIG_USB_LED is not set
1912# CONFIG_USB_CYPRESS_CY7C63 is not set 1949# CONFIG_USB_CYPRESS_CY7C63 is not set
1913# CONFIG_USB_CYTHERM is not set 1950# CONFIG_USB_CYTHERM is not set
1914# CONFIG_USB_PHIDGET is not set
1915# CONFIG_USB_IDMOUSE is not set 1951# CONFIG_USB_IDMOUSE is not set
1916# CONFIG_USB_FTDI_ELAN is not set 1952# CONFIG_USB_FTDI_ELAN is not set
1917# CONFIG_USB_APPLEDISPLAY is not set 1953# CONFIG_USB_APPLEDISPLAY is not set
@@ -1927,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
1927# 1963#
1928# OTG and related infrastructure 1964# OTG and related infrastructure
1929# 1965#
1966# CONFIG_NOP_USB_XCEIV is not set
1930# CONFIG_UWB is not set 1967# CONFIG_UWB is not set
1931# CONFIG_MMC is not set 1968# CONFIG_MMC is not set
1932# CONFIG_MEMSTICK is not set 1969# CONFIG_MEMSTICK is not set
@@ -1938,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
1938# 1975#
1939# CONFIG_LEDS_ALIX2 is not set 1976# CONFIG_LEDS_ALIX2 is not set
1940# CONFIG_LEDS_PCA9532 is not set 1977# CONFIG_LEDS_PCA9532 is not set
1978# CONFIG_LEDS_LP5521 is not set
1941# CONFIG_LEDS_CLEVO_MAIL is not set 1979# CONFIG_LEDS_CLEVO_MAIL is not set
1942# CONFIG_LEDS_PCA955X is not set 1980# CONFIG_LEDS_PCA955X is not set
1981# CONFIG_LEDS_BD2802 is not set
1943 1982
1944# 1983#
1945# LED Triggers 1984# LED Triggers
@@ -1949,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
1949# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set 1988# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
1950# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set 1989# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
1951# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set 1990# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
1991
1992#
1993# iptables trigger is under Netfilter config (LED target)
1994#
1952# CONFIG_ACCESSIBILITY is not set 1995# CONFIG_ACCESSIBILITY is not set
1953# CONFIG_INFINIBAND is not set 1996# CONFIG_INFINIBAND is not set
1954CONFIG_EDAC=y 1997CONFIG_EDAC=y
@@ -2017,6 +2060,7 @@ CONFIG_DMADEVICES=y
2017# DMA Devices 2060# DMA Devices
2018# 2061#
2019# CONFIG_INTEL_IOATDMA is not set 2062# CONFIG_INTEL_IOATDMA is not set
2063# CONFIG_AUXDISPLAY is not set
2020# CONFIG_UIO is not set 2064# CONFIG_UIO is not set
2021# CONFIG_STAGING is not set 2065# CONFIG_STAGING is not set
2022CONFIG_X86_PLATFORM_DEVICES=y 2066CONFIG_X86_PLATFORM_DEVICES=y
@@ -2050,6 +2094,7 @@ CONFIG_DMIID=y
2050# 2094#
2051# CONFIG_EXT2_FS is not set 2095# CONFIG_EXT2_FS is not set
2052CONFIG_EXT3_FS=y 2096CONFIG_EXT3_FS=y
2097# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
2053CONFIG_EXT3_FS_XATTR=y 2098CONFIG_EXT3_FS_XATTR=y
2054CONFIG_EXT3_FS_POSIX_ACL=y 2099CONFIG_EXT3_FS_POSIX_ACL=y
2055CONFIG_EXT3_FS_SECURITY=y 2100CONFIG_EXT3_FS_SECURITY=y
@@ -2081,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
2081CONFIG_GENERIC_ACL=y 2126CONFIG_GENERIC_ACL=y
2082 2127
2083# 2128#
2129# Caches
2130#
2131# CONFIG_FSCACHE is not set
2132
2133#
2084# CD-ROM/DVD Filesystems 2134# CD-ROM/DVD Filesystems
2085# 2135#
2086CONFIG_ISO9660_FS=y 2136CONFIG_ISO9660_FS=y
@@ -2131,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
2131# CONFIG_ROMFS_FS is not set 2181# CONFIG_ROMFS_FS is not set
2132# CONFIG_SYSV_FS is not set 2182# CONFIG_SYSV_FS is not set
2133# CONFIG_UFS_FS is not set 2183# CONFIG_UFS_FS is not set
2184# CONFIG_NILFS2_FS is not set
2134CONFIG_NETWORK_FILESYSTEMS=y 2185CONFIG_NETWORK_FILESYSTEMS=y
2135CONFIG_NFS_FS=y 2186CONFIG_NFS_FS=y
2136CONFIG_NFS_V3=y 2187CONFIG_NFS_V3=y
@@ -2144,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
2144CONFIG_NFS_COMMON=y 2195CONFIG_NFS_COMMON=y
2145CONFIG_SUNRPC=y 2196CONFIG_SUNRPC=y
2146CONFIG_SUNRPC_GSS=y 2197CONFIG_SUNRPC_GSS=y
2147# CONFIG_SUNRPC_REGISTER_V4 is not set
2148CONFIG_RPCSEC_GSS_KRB5=y 2198CONFIG_RPCSEC_GSS_KRB5=y
2149# CONFIG_RPCSEC_GSS_SPKM3 is not set 2199# CONFIG_RPCSEC_GSS_SPKM3 is not set
2150# CONFIG_SMB_FS is not set 2200# CONFIG_SMB_FS is not set
@@ -2231,6 +2281,7 @@ CONFIG_DEBUG_FS=y
2231CONFIG_DEBUG_KERNEL=y 2281CONFIG_DEBUG_KERNEL=y
2232# CONFIG_DEBUG_SHIRQ is not set 2282# CONFIG_DEBUG_SHIRQ is not set
2233# CONFIG_DETECT_SOFTLOCKUP is not set 2283# CONFIG_DETECT_SOFTLOCKUP is not set
2284# CONFIG_DETECT_HUNG_TASK is not set
2234# CONFIG_SCHED_DEBUG is not set 2285# CONFIG_SCHED_DEBUG is not set
2235CONFIG_SCHEDSTATS=y 2286CONFIG_SCHEDSTATS=y
2236CONFIG_TIMER_STATS=y 2287CONFIG_TIMER_STATS=y
@@ -2246,6 +2297,7 @@ CONFIG_TIMER_STATS=y
2246# CONFIG_LOCK_STAT is not set 2297# CONFIG_LOCK_STAT is not set
2247# CONFIG_DEBUG_SPINLOCK_SLEEP is not set 2298# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
2248# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set 2299# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
2300CONFIG_STACKTRACE=y
2249# CONFIG_DEBUG_KOBJECT is not set 2301# CONFIG_DEBUG_KOBJECT is not set
2250CONFIG_DEBUG_BUGVERBOSE=y 2302CONFIG_DEBUG_BUGVERBOSE=y
2251# CONFIG_DEBUG_INFO is not set 2303# CONFIG_DEBUG_INFO is not set
@@ -2268,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
2268# CONFIG_FAULT_INJECTION is not set 2320# CONFIG_FAULT_INJECTION is not set
2269# CONFIG_LATENCYTOP is not set 2321# CONFIG_LATENCYTOP is not set
2270CONFIG_SYSCTL_SYSCALL_CHECK=y 2322CONFIG_SYSCTL_SYSCALL_CHECK=y
2323# CONFIG_DEBUG_PAGEALLOC is not set
2271CONFIG_USER_STACKTRACE_SUPPORT=y 2324CONFIG_USER_STACKTRACE_SUPPORT=y
2325CONFIG_NOP_TRACER=y
2272CONFIG_HAVE_FUNCTION_TRACER=y 2326CONFIG_HAVE_FUNCTION_TRACER=y
2273CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y 2327CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
2274CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y 2328CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
2275CONFIG_HAVE_DYNAMIC_FTRACE=y 2329CONFIG_HAVE_DYNAMIC_FTRACE=y
2276CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y 2330CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
2277CONFIG_HAVE_HW_BRANCH_TRACER=y 2331CONFIG_HAVE_HW_BRANCH_TRACER=y
2332CONFIG_HAVE_FTRACE_SYSCALLS=y
2333CONFIG_RING_BUFFER=y
2334CONFIG_TRACING=y
2335CONFIG_TRACING_SUPPORT=y
2278 2336
2279# 2337#
2280# Tracers 2338# Tracers
@@ -2284,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
2284# CONFIG_SYSPROF_TRACER is not set 2342# CONFIG_SYSPROF_TRACER is not set
2285# CONFIG_SCHED_TRACER is not set 2343# CONFIG_SCHED_TRACER is not set
2286# CONFIG_CONTEXT_SWITCH_TRACER is not set 2344# CONFIG_CONTEXT_SWITCH_TRACER is not set
2345# CONFIG_EVENT_TRACER is not set
2346# CONFIG_FTRACE_SYSCALLS is not set
2287# CONFIG_BOOT_TRACER is not set 2347# CONFIG_BOOT_TRACER is not set
2288# CONFIG_TRACE_BRANCH_PROFILING is not set 2348# CONFIG_TRACE_BRANCH_PROFILING is not set
2289# CONFIG_POWER_TRACER is not set 2349# CONFIG_POWER_TRACER is not set
2290# CONFIG_STACK_TRACER is not set 2350# CONFIG_STACK_TRACER is not set
2291# CONFIG_HW_BRANCH_TRACER is not set 2351# CONFIG_HW_BRANCH_TRACER is not set
2352# CONFIG_KMEMTRACE is not set
2353# CONFIG_WORKQUEUE_TRACER is not set
2354CONFIG_BLK_DEV_IO_TRACE=y
2355# CONFIG_FTRACE_STARTUP_TEST is not set
2356# CONFIG_MMIOTRACE is not set
2292CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2357CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2293# CONFIG_DYNAMIC_PRINTK_DEBUG is not set 2358# CONFIG_DYNAMIC_DEBUG is not set
2359# CONFIG_DMA_API_DEBUG is not set
2294# CONFIG_SAMPLES is not set 2360# CONFIG_SAMPLES is not set
2295CONFIG_HAVE_ARCH_KGDB=y 2361CONFIG_HAVE_ARCH_KGDB=y
2296# CONFIG_KGDB is not set 2362# CONFIG_KGDB is not set
@@ -2300,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
2300CONFIG_EARLY_PRINTK_DBGP=y 2366CONFIG_EARLY_PRINTK_DBGP=y
2301CONFIG_DEBUG_STACKOVERFLOW=y 2367CONFIG_DEBUG_STACKOVERFLOW=y
2302CONFIG_DEBUG_STACK_USAGE=y 2368CONFIG_DEBUG_STACK_USAGE=y
2303# CONFIG_DEBUG_PAGEALLOC is not set
2304# CONFIG_DEBUG_PER_CPU_MAPS is not set 2369# CONFIG_DEBUG_PER_CPU_MAPS is not set
2305# CONFIG_X86_PTDUMP is not set 2370# CONFIG_X86_PTDUMP is not set
2306CONFIG_DEBUG_RODATA=y 2371CONFIG_DEBUG_RODATA=y
2307# CONFIG_DEBUG_RODATA_TEST is not set 2372# CONFIG_DEBUG_RODATA_TEST is not set
2308CONFIG_DEBUG_NX_TEST=m 2373CONFIG_DEBUG_NX_TEST=m
2309# CONFIG_IOMMU_DEBUG is not set 2374# CONFIG_IOMMU_DEBUG is not set
2310# CONFIG_MMIOTRACE is not set 2375CONFIG_HAVE_MMIOTRACE_SUPPORT=y
2311CONFIG_IO_DELAY_TYPE_0X80=0 2376CONFIG_IO_DELAY_TYPE_0X80=0
2312CONFIG_IO_DELAY_TYPE_0XED=1 2377CONFIG_IO_DELAY_TYPE_0XED=1
2313CONFIG_IO_DELAY_TYPE_UDELAY=2 2378CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2343,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
2343CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 2408CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
2344# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set 2409# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
2345# CONFIG_SECURITY_SMACK is not set 2410# CONFIG_SECURITY_SMACK is not set
2411# CONFIG_SECURITY_TOMOYO is not set
2412# CONFIG_IMA is not set
2346CONFIG_CRYPTO=y 2413CONFIG_CRYPTO=y
2347 2414
2348# 2415#
@@ -2358,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
2358CONFIG_CRYPTO_HASH=y 2425CONFIG_CRYPTO_HASH=y
2359CONFIG_CRYPTO_HASH2=y 2426CONFIG_CRYPTO_HASH2=y
2360CONFIG_CRYPTO_RNG2=y 2427CONFIG_CRYPTO_RNG2=y
2428CONFIG_CRYPTO_PCOMP=y
2361CONFIG_CRYPTO_MANAGER=y 2429CONFIG_CRYPTO_MANAGER=y
2362CONFIG_CRYPTO_MANAGER2=y 2430CONFIG_CRYPTO_MANAGER2=y
2363# CONFIG_CRYPTO_GF128MUL is not set 2431# CONFIG_CRYPTO_GF128MUL is not set
2364# CONFIG_CRYPTO_NULL is not set 2432# CONFIG_CRYPTO_NULL is not set
2433CONFIG_CRYPTO_WORKQUEUE=y
2365# CONFIG_CRYPTO_CRYPTD is not set 2434# CONFIG_CRYPTO_CRYPTD is not set
2366CONFIG_CRYPTO_AUTHENC=y 2435CONFIG_CRYPTO_AUTHENC=y
2367# CONFIG_CRYPTO_TEST is not set 2436# CONFIG_CRYPTO_TEST is not set
@@ -2413,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
2413# 2482#
2414CONFIG_CRYPTO_AES=y 2483CONFIG_CRYPTO_AES=y
2415# CONFIG_CRYPTO_AES_X86_64 is not set 2484# CONFIG_CRYPTO_AES_X86_64 is not set
2485# CONFIG_CRYPTO_AES_NI_INTEL is not set
2416# CONFIG_CRYPTO_ANUBIS is not set 2486# CONFIG_CRYPTO_ANUBIS is not set
2417CONFIG_CRYPTO_ARC4=y 2487CONFIG_CRYPTO_ARC4=y
2418# CONFIG_CRYPTO_BLOWFISH is not set 2488# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2434,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
2434# Compression 2504# Compression
2435# 2505#
2436# CONFIG_CRYPTO_DEFLATE is not set 2506# CONFIG_CRYPTO_DEFLATE is not set
2507# CONFIG_CRYPTO_ZLIB is not set
2437# CONFIG_CRYPTO_LZO is not set 2508# CONFIG_CRYPTO_LZO is not set
2438 2509
2439# 2510#
@@ -2443,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
2443CONFIG_CRYPTO_HW=y 2514CONFIG_CRYPTO_HW=y
2444# CONFIG_CRYPTO_DEV_HIFN_795X is not set 2515# CONFIG_CRYPTO_DEV_HIFN_795X is not set
2445CONFIG_HAVE_KVM=y 2516CONFIG_HAVE_KVM=y
2517CONFIG_HAVE_KVM_IRQCHIP=y
2446CONFIG_VIRTUALIZATION=y 2518CONFIG_VIRTUALIZATION=y
2447# CONFIG_KVM is not set 2519# CONFIG_KVM is not set
2448# CONFIG_VIRTIO_PCI is not set 2520# CONFIG_VIRTIO_PCI is not set
2449# CONFIG_VIRTIO_BALLOON is not set 2521# CONFIG_VIRTIO_BALLOON is not set
2522CONFIG_BINARY_PRINTF=y
2450 2523
2451# 2524#
2452# Library routines 2525# Library routines
@@ -2463,7 +2536,10 @@ CONFIG_CRC32=y
2463# CONFIG_CRC7 is not set 2536# CONFIG_CRC7 is not set
2464# CONFIG_LIBCRC32C is not set 2537# CONFIG_LIBCRC32C is not set
2465CONFIG_ZLIB_INFLATE=y 2538CONFIG_ZLIB_INFLATE=y
2466CONFIG_PLIST=y 2539CONFIG_DECOMPRESS_GZIP=y
2540CONFIG_DECOMPRESS_BZIP2=y
2541CONFIG_DECOMPRESS_LZMA=y
2467CONFIG_HAS_IOMEM=y 2542CONFIG_HAS_IOMEM=y
2468CONFIG_HAS_IOPORT=y 2543CONFIG_HAS_IOPORT=y
2469CONFIG_HAS_DMA=y 2544CONFIG_HAS_DMA=y
2545CONFIG_NLATTR=y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..e590261ba059 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open
833ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf71..1a37bcdc8606 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h>
6#include <asm/asm.h> 7#include <asm/asm.h>
7 8
8/* 9/*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
74 75
75const unsigned char *const *find_nop_table(void); 76const unsigned char *const *find_nop_table(void);
76 77
78/* alternative assembly primitive: */
79#define ALTERNATIVE(oldinstr, newinstr, feature) \
80 \
81 "661:\n\t" oldinstr "\n662:\n" \
82 ".section .altinstructions,\"a\"\n" \
83 _ASM_ALIGN "\n" \
84 _ASM_PTR "661b\n" /* label */ \
85 _ASM_PTR "663f\n" /* new instruction */ \
86 " .byte " __stringify(feature) "\n" /* feature bit */ \
87 " .byte 662b-661b\n" /* sourcelen */ \
88 " .byte 664f-663f\n" /* replacementlen */ \
89 ".previous\n" \
90 ".section .altinstr_replacement, \"ax\"\n" \
91 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
92 ".previous"
93
77/* 94/*
78 * Alternative instructions for different CPU types or capabilities. 95 * Alternative instructions for different CPU types or capabilities.
79 * 96 *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
87 * without volatile and memory clobber. 104 * without volatile and memory clobber.
88 */ 105 */
89#define alternative(oldinstr, newinstr, feature) \ 106#define alternative(oldinstr, newinstr, feature) \
90 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 107 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
91 ".section .altinstructions,\"a\"\n" \
92 _ASM_ALIGN "\n" \
93 _ASM_PTR "661b\n" /* label */ \
94 _ASM_PTR "663f\n" /* new instruction */ \
95 " .byte %c0\n" /* feature bit */ \
96 " .byte 662b-661b\n" /* sourcelen */ \
97 " .byte 664f-663f\n" /* replacementlen */ \
98 ".previous\n" \
99 ".section .altinstr_replacement,\"ax\"\n" \
100 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
101 ".previous" :: "i" (feature) : "memory")
102 108
103/* 109/*
104 * Alternative inline assembly with input. 110 * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
109 * Best is to use constraints that are fixed size (like (%1) ... "r") 115 * Best is to use constraints that are fixed size (like (%1) ... "r")
110 * If you use variable sized constraints like "m" or "g" in the 116 * If you use variable sized constraints like "m" or "g" in the
111 * replacement make sure to pad to the worst case length. 117 * replacement make sure to pad to the worst case length.
118 * Leaving an unused argument 0 to keep API compatibility.
112 */ 119 */
113#define alternative_input(oldinstr, newinstr, feature, input...) \ 120#define alternative_input(oldinstr, newinstr, feature, input...) \
114 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 121 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
115 ".section .altinstructions,\"a\"\n" \ 122 : : "i" (0), ## input)
116 _ASM_ALIGN "\n" \
117 _ASM_PTR "661b\n" /* label */ \
118 _ASM_PTR "663f\n" /* new instruction */ \
119 " .byte %c0\n" /* feature bit */ \
120 " .byte 662b-661b\n" /* sourcelen */ \
121 " .byte 664f-663f\n" /* replacementlen */ \
122 ".previous\n" \
123 ".section .altinstr_replacement,\"ax\"\n" \
124 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
125 ".previous" :: "i" (feature), ##input)
126 123
127/* Like alternative_input, but with a single output argument */ 124/* Like alternative_input, but with a single output argument */
128#define alternative_io(oldinstr, newinstr, feature, output, input...) \ 125#define alternative_io(oldinstr, newinstr, feature, output, input...) \
129 asm volatile ("661:\n\t" oldinstr "\n662:\n" \ 126 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
130 ".section .altinstructions,\"a\"\n" \ 127 : output : "i" (0), ## input)
131 _ASM_ALIGN "\n" \
132 _ASM_PTR "661b\n" /* label */ \
133 _ASM_PTR "663f\n" /* new instruction */ \
134 " .byte %c[feat]\n" /* feature bit */ \
135 " .byte 662b-661b\n" /* sourcelen */ \
136 " .byte 664f-663f\n" /* replacementlen */ \
137 ".previous\n" \
138 ".section .altinstr_replacement,\"ax\"\n" \
139 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
140 ".previous" : output : [feat] "i" (feature), ##input)
141 128
142/* 129/*
143 * use this macro(s) if you need more than one output parameter 130 * use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329bc..262e02820049 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,8 @@ extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void); 27extern int amd_iommu_init_dma_ops(void);
28extern void amd_iommu_detect(void); 28extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30extern void amd_iommu_flush_all_domains(void);
31extern void amd_iommu_flush_all_devices(void);
30#else 32#else
31static inline int amd_iommu_init(void) { return -ENODEV; } 33static inline int amd_iommu_init(void) { return -ENODEV; }
32static inline void amd_iommu_detect(void) { } 34static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b5..0c878caaa0a2 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ 194#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops 195#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
196 domain for an IOMMU */ 196 domain for an IOMMU */
197extern bool amd_iommu_dump;
198#define DUMP_printk(format, arg...) \
199 do { /* no trailing ';' after while (0): the caller supplies it */ \
200 if (amd_iommu_dump) \
201 printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
202 } while (0)
203
204/*
205 * Make iterating over all IOMMUs easier
206 */
207#define for_each_iommu(iommu) \
208 list_for_each_entry((iommu), &amd_iommu_list, list)
209#define for_each_iommu_safe(iommu, next) \
210 list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
211
212#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
213#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
214#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
215#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
216#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
217#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
197 218
198/* 219/*
199 * This structure contains generic data for IOMMU protection domains 220 * This structure contains generic data for IOMMU protection domains
@@ -210,6 +231,26 @@ struct protection_domain {
210}; 231};
211 232
212/* 233/*
234 * For dynamic growth the aperture size is split into ranges of 128MB of
235 * DMA address space each. This struct represents one such range.
236 */
237struct aperture_range {
238
239 /* address allocation bitmap */
240 unsigned long *bitmap;
241
242 /*
243 * Array of PTE pages for the aperture. In this array we save all the
244 * leaf pages of the domain page table used for the aperture. This way
245 * we don't need to walk the page table to find a specific PTE. We can
246 * just calculate its address in constant time.
247 */
248 u64 *pte_pages[64];
249
250 unsigned long offset;
251};
252
253/*
213 * Data container for a dma_ops specific protection domain 254 * Data container for a dma_ops specific protection domain
214 */ 255 */
215struct dma_ops_domain { 256struct dma_ops_domain {
@@ -222,18 +263,10 @@ struct dma_ops_domain {
222 unsigned long aperture_size; 263 unsigned long aperture_size;
223 264
224 /* address we start to search for free addresses */ 265 /* address we start to search for free addresses */
225 unsigned long next_bit; 266 unsigned long next_address;
226
227 /* address allocation bitmap */
228 unsigned long *bitmap;
229 267
230 /* 268 /* address space relevant data */
231 * Array of PTE pages for the aperture. In this array we save all the 269 struct aperture_range *aperture[APERTURE_MAX_RANGES];
232 * leaf pages of the domain page table used for the aperture. This way
233 * we don't need to walk the page table to find a specific PTE. We can
234 * just calculate its address in constant time.
235 */
236 u64 **pte_pages;
237 270
238 /* This will be set to true when TLB needs to be flushed */ 271 /* This will be set to true when TLB needs to be flushed */
239 bool need_flush; 272 bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3738438a91f5..bb7d47925847 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -402,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
402{ 402{
403 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 403 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
404 404
405 if (APIC_XAPIC(ver)) 405 if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
406 return (x >> 24) & 0xFF; 406 return (x >> 24) & 0xFF;
407 else 407 else
408 return (x >> 24) & 0x0F; 408 return (x >> 24) & 0x0F;
@@ -470,6 +470,9 @@ static inline unsigned int read_apic_id(void)
470extern void default_setup_apic_routing(void); 470extern void default_setup_apic_routing(void);
471 471
472#ifdef CONFIG_X86_32 472#ifdef CONFIG_X86_32
473
474extern struct apic apic_default;
475
473/* 476/*
474 * Set up the logical destination ID. 477 * Set up the logical destination ID.
475 * 478 *
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..aff9f1fcdcd7 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * __atomic64_read - plain read of an atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Reads the counter of @ptr with an ordinary member access; unlike
263 * atomic64_read() it does not use cmpxchg8b. Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 *
298 * Atomically xchgs the value of @ptr to @new_val and returns
299 * the old value. Implemented as a cmpxchg loop: retry until the
300 * value sampled by __atomic64_read() is still current at swap time.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = __atomic64_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = __atomic64_read(ptr);
358 new_val = old_val + delta;
359 /* retry if *ptr changed since it was sampled */
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline unsigned long long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr); /* unsigned negation: adds -delta modulo 2^64 */
368}
369
370static inline unsigned long long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr); /* full 64-bit result, not truncated to long */
373}
374
375static inline unsigned long long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr); /* full 64-bit result, not truncated to long */
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
393 * atomic64_sub - subtract the atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long result = atomic64_sub_return(delta, ptr); /* value after the subtraction */
417
418 return !result;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long sum = atomic64_add_return(delta, ptr); /* value after the addition, viewed as signed */
482
483 return sum < 0;
484}
485
250#include <asm-generic/atomic.h> 486#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc92..418e632d4a80 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
8 8
9#ifdef __KERNEL__ 9#ifdef __KERNEL__
10 10
11#include <asm/page_types.h>
12
11/* Physical address where kernel should be loaded. */ 13/* Physical address where kernel should be loaded. */
12#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ 14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
13 + (CONFIG_PHYSICAL_ALIGN - 1)) \ 15 + (CONFIG_PHYSICAL_ALIGN - 1)) \
14 & ~(CONFIG_PHYSICAL_ALIGN - 1)) 16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
15 17
18/* Minimum kernel alignment, as a power of two */
19#ifdef CONFIG_x86_64
20#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
21#else
22#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1)
23#endif
24#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
25
26#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
27 (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
28#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
29#endif
30
16#ifdef CONFIG_KERNEL_BZIP2 31#ifdef CONFIG_KERNEL_BZIP2
17#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
18#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b6..1724e8de317c 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
50 __u32 ramdisk_size; 50 __u32 ramdisk_size;
51 __u32 bootsect_kludge; 51 __u32 bootsect_kludge;
52 __u16 heap_end_ptr; 52 __u16 heap_end_ptr;
53 __u16 _pad1; 53 __u8 ext_loader_ver;
54 __u8 ext_loader_type;
54 __u32 cmd_line_ptr; 55 __u32 cmd_line_ptr;
55 __u32 initrd_addr_max; 56 __u32 initrd_addr_max;
56 __u32 kernel_alignment; 57 __u32 kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa6..d96c1ee3a95c 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
86 CPU_VALUE_BIT, /* value */ 86 CPU_VALUE_BIT, /* value */
87}; 87};
88 88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) 89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91/*
92 * DisplayFamily_DisplayModel Processor Families/Processor Number Series
93 * -------------------------- ------------------------------------------
94 * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
95 *
96 * 06_01 Pentium Pro
97 * 06_03, 06_05 Pentium II Xeon, Pentium II
98 * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
99 *
100 * 06_09, 060D Pentium M
101 *
102 * 06_0E Core Duo, Core Solo
103 *
104 * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
105 * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
106 * Pentium dual-core
107 * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
108 *
109 * 06_1C Atom
110 *
111 * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
112 * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
113 *
114 * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
115 * Pentium 4, Pentium D
116 */
117
118/* Register processors bits */
119enum cpu_processor_bit {
120 CPU_NONE,
121/* Intel */
122 CPU_INTEL_PENTIUM_BIT,
123 CPU_INTEL_P6_BIT,
124 CPU_INTEL_PENTIUM_M_BIT,
125 CPU_INTEL_CORE_BIT,
126 CPU_INTEL_CORE2_BIT,
127 CPU_INTEL_ATOM_BIT,
128 CPU_INTEL_XEON_P4_BIT,
129 CPU_INTEL_XEON_MP_BIT,
130/* AMD */
131 CPU_AMD_K6_BIT,
132 CPU_AMD_K7_BIT,
133 CPU_AMD_K8_BIT,
134 CPU_AMD_0F_BIT,
135 CPU_AMD_10_BIT,
136 CPU_AMD_11_BIT,
137};
138
139#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
140#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
141#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
142#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
143#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
144#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
145#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
146#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
147
148#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
149#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
150#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
151#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
152#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
153#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
154#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
155#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
156#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
157#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
158#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
159#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
160#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
161#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
162#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
163#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
164#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
165#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
166#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
167
168/* Select all supported Intel CPUs */
169#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
170
171#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
172#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
173#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
174#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
175#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
176#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
177
178#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
179#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
180#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
181#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
182
183/* Select all supported AMD CPUs */
184#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
185
186/* Select all supported CPUs */
187#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
188 90
189#define MAX_CPU_FILES 512 91#define MAX_CPU_FILES 512
190 92
@@ -220,7 +122,6 @@ struct cpu_debug_range {
220 unsigned min; /* Register range min */ 122 unsigned min; /* Register range min */
221 unsigned max; /* Register range max */ 123 unsigned max; /* Register range max */
222 unsigned flag; /* Supported flags */ 124 unsigned flag; /* Supported flags */
223 unsigned model; /* Supported models */
224}; 125};
225 126
226#endif /* _ASM_X86_CPU_DEBUG_H */ 127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 13cc6a503a02..4a28d22d4793 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -94,6 +94,7 @@
94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ 94#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ 95#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ 96#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
97#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
97 98
98/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ 99/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
99#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ 100#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -115,6 +116,8 @@
115#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ 116#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
116#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ 117#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
117#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ 118#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
119#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
120#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
118#define X86_FEATURE_AES (4*32+25) /* AES instructions */ 121#define X86_FEATURE_AES (4*32+25) /* AES instructions */
119#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 122#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
120#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 123#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a8f672ba100c..70dac199b093 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
15 * - buffer allocation (memory accounting) 15 * - buffer allocation (memory accounting)
16 * 16 *
17 * 17 *
18 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008 19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */ 20 */
21 21
22#ifndef _ASM_X86_DS_H 22#ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
83 * The interrupt threshold is independent from the overflow callback 83 * The interrupt threshold is independent from the overflow callback
84 * to allow users to use their own overflow interrupt handling mechanism. 84 * to allow users to use their own overflow interrupt handling mechanism.
85 * 85 *
86 * task: the task to request recording for; 86 * The function might sleep.
87 * NULL for per-cpu recording on the current cpu 87 *
88 * task: the task to request recording for
89 * cpu: the cpu to request recording for
88 * base: the base pointer for the (non-pageable) buffer; 90 * base: the base pointer for the (non-pageable) buffer;
89 * size: the size of the provided buffer in bytes 91 * size: the size of the provided buffer in bytes
90 * ovfl: pointer to a function to be called on buffer overflow; 92 * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
93 * -1 if no interrupt threshold is requested. 95 * -1 if no interrupt threshold is requested.
94 * flags: a bit-mask of the above flags 96 * flags: a bit-mask of the above flags
95 */ 97 */
96extern struct bts_tracer *ds_request_bts(struct task_struct *task, 98extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
97 void *base, size_t size, 99 void *base, size_t size,
98 bts_ovfl_callback_t ovfl, 100 bts_ovfl_callback_t ovfl,
99 size_t th, unsigned int flags); 101 size_t th, unsigned int flags);
100extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, 102extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
101 void *base, size_t size, 103 bts_ovfl_callback_t ovfl,
102 pebs_ovfl_callback_t ovfl, 104 size_t th, unsigned int flags);
103 size_t th, unsigned int flags); 105extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
106 void *base, size_t size,
107 pebs_ovfl_callback_t ovfl,
108 size_t th, unsigned int flags);
109extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
110 void *base, size_t size,
111 pebs_ovfl_callback_t ovfl,
112 size_t th, unsigned int flags);
104 113
105/* 114/*
106 * Release BTS or PEBS resources 115 * Release BTS or PEBS resources
107 * Suspend and resume BTS or PEBS tracing 116 * Suspend and resume BTS or PEBS tracing
108 * 117 *
118 * Must be called with irq's enabled.
119 *
109 * tracer: the tracer handle returned from ds_request_~() 120 * tracer: the tracer handle returned from ds_request_~()
110 */ 121 */
111extern void ds_release_bts(struct bts_tracer *tracer); 122extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
115extern void ds_suspend_pebs(struct pebs_tracer *tracer); 126extern void ds_suspend_pebs(struct pebs_tracer *tracer);
116extern void ds_resume_pebs(struct pebs_tracer *tracer); 127extern void ds_resume_pebs(struct pebs_tracer *tracer);
117 128
129/*
130 * Release BTS or PEBS resources
131 * Suspend and resume BTS or PEBS tracing
132 *
133 * Cpu tracers must call this on the traced cpu.
134 * Task tracers must call ds_release_~_noirq() for themselves.
135 *
136 * May be called with irq's disabled.
137 *
138 * Returns 0 if successful;
139 * -EPERM if the cpu tracer does not trace the current cpu.
140 * -EPERM if the task tracer does not trace itself.
141 *
142 * tracer: the tracer handle returned from ds_request_~()
143 */
144extern int ds_release_bts_noirq(struct bts_tracer *tracer);
145extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
146extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
147extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
148extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
149extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
150
118 151
119/* 152/*
120 * The raw DS buffer state as it is used for BTS and PEBS recording. 153 * The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
170 } lbr; 203 } lbr;
171 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ 204 /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
172 struct { 205 struct {
173 __u64 jiffies; 206 __u64 clock;
174 pid_t pid; 207 pid_t pid;
175 } timestamp; 208 } event;
176 } variant; 209 } variant;
177}; 210};
178 211
@@ -201,8 +234,12 @@ struct bts_trace {
201struct pebs_trace { 234struct pebs_trace {
202 struct ds_trace ds; 235 struct ds_trace ds;
203 236
204 /* the PEBS reset value */ 237 /* the number of valid counters in the below array */
205 unsigned long long reset_value; 238 unsigned int counters;
239
240#define MAX_PEBS_COUNTERS 4
241 /* the counter reset value */
242 unsigned long long counter_reset[MAX_PEBS_COUNTERS];
206}; 243};
207 244
208 245
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
237 * Returns 0 on success; -Eerrno on error 274 * Returns 0 on success; -Eerrno on error
238 * 275 *
239 * tracer: the tracer handle returned from ds_request_pebs() 276 * tracer: the tracer handle returned from ds_request_pebs()
277 * counter: the index of the counter
240 * value: the new counter reset value 278 * value: the new counter reset value
241 */ 279 */
242extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); 280extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
281 unsigned int counter, u64 value);
243 282
244/* 283/*
245 * Initialization 284 * Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
252 */ 291 */
253extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); 292extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
254 293
255/*
256 * Task clone/init and cleanup work
257 */
258extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
259extern void ds_exit_thread(struct task_struct *tsk);
260
261#else /* CONFIG_X86_DS */ 294#else /* CONFIG_X86_DS */
262 295
263struct cpuinfo_x86; 296struct cpuinfo_x86;
264static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} 297static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
265static inline void ds_switch_to(struct task_struct *prev, 298static inline void ds_switch_to(struct task_struct *prev,
266 struct task_struct *next) {} 299 struct task_struct *next) {}
267static inline void ds_copy_thread(struct task_struct *tsk,
268 struct task_struct *father) {}
269static inline void ds_exit_thread(struct task_struct *tsk) {}
270 300
271#endif /* CONFIG_X86_DS */ 301#endif /* CONFIG_X86_DS */
272#endif /* _ASM_X86_DS_H */ 302#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 69f886805ecb..ff8cbfa07851 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,7 +50,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 50BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
51 51
52#ifdef CONFIG_PERF_COUNTERS 52#ifdef CONFIG_PERF_COUNTERS
53BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
54#endif 54#endif
55 55
56#ifdef CONFIG_X86_THERMAL_VECTOR 56#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 922ee7c29693..82e3e8f01043 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1c8f28a63058..ba180d93b08c 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,8 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void);
33
32extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 36extern void reschedule_interrupt(void);
@@ -99,7 +101,11 @@ extern void eisa_set_level_irq(unsigned int irq);
99/* SMP */ 101/* SMP */
100extern void smp_apic_timer_interrupt(struct pt_regs *); 102extern void smp_apic_timer_interrupt(struct pt_regs *);
101extern void smp_spurious_interrupt(struct pt_regs *); 103extern void smp_spurious_interrupt(struct pt_regs *);
104extern void smp_generic_interrupt(struct pt_regs *);
102extern void smp_error_interrupt(struct pt_regs *); 105extern void smp_error_interrupt(struct pt_regs *);
106#ifdef CONFIG_X86_IO_APIC
107extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
108#endif
103#ifdef CONFIG_SMP 109#ifdef CONFIG_SMP
104extern void smp_reschedule_interrupt(struct pt_regs *); 110extern void smp_reschedule_interrupt(struct pt_regs *);
105extern void smp_call_function_interrupt(struct pt_regs *); 111extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e5183982..175adf58dd4f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
67 ".previous\n" 67 ".previous\n"
68 _ASM_EXTABLE(1b, 3b) 68 _ASM_EXTABLE(1b, 3b)
69 : [err] "=r" (err) 69 : [err] "=r" (err)
70#if 0 /* See comment in __save_init_fpu() below. */ 70#if 0 /* See comment in fxsave() below. */
71 : [fx] "r" (fx), "m" (*fx), "0" (0)); 71 : [fx] "r" (fx), "m" (*fx), "0" (0));
72#else 72#else
73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); 73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
75 return err; 75 return err;
76} 76}
77 77
78static inline int restore_fpu_checking(struct task_struct *tsk)
79{
80 if (task_thread_info(tsk)->status & TS_XSAVE)
81 return xrstor_checking(&tsk->thread.xstate->xsave);
82 else
83 return fxrstor_checking(&tsk->thread.xstate->fxsave);
84}
85
86/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception 78/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
87 is pending. Clear the x87 state here by setting it to fixed 79 is pending. Clear the x87 state here by setting it to fixed
88 values. The kernel data segment can be sometimes 0 and sometimes 80 values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
120 ".previous\n" 112 ".previous\n"
121 _ASM_EXTABLE(1b, 3b) 113 _ASM_EXTABLE(1b, 3b)
122 : [err] "=r" (err), "=m" (*fx) 114 : [err] "=r" (err), "=m" (*fx)
123#if 0 /* See comment in __fxsave_clear() below. */ 115#if 0 /* See comment in fxsave() below. */
124 : [fx] "r" (fx), "0" (0)); 116 : [fx] "r" (fx), "0" (0));
125#else 117#else
126 : [fx] "cdaSDb" (fx), "0" (0)); 118 : [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
185 asm volatile("fnclex ; fwait"); 177 asm volatile("fnclex ; fwait");
186} 178}
187 179
188static inline void restore_fpu(struct task_struct *tsk) 180/* perform fxrstor iff the processor has extended states, otherwise frstor */
181static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
189{ 182{
190 if (task_thread_info(tsk)->status & TS_XSAVE) {
191 xrstor_checking(&tsk->thread.xstate->xsave);
192 return;
193 }
194 /* 183 /*
195 * The "nop" is needed to make the instructions the same 184 * The "nop" is needed to make the instructions the same
196 * length. 185 * length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
199 "nop ; frstor %1", 188 "nop ; frstor %1",
200 "fxrstor %1", 189 "fxrstor %1",
201 X86_FEATURE_FXSR, 190 X86_FEATURE_FXSR,
202 "m" (tsk->thread.xstate->fxsave)); 191 "m" (*fx));
192
193 return 0;
203} 194}
204 195
205/* We need a safe address that is cheap to find and that is already 196/* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
262 253
263#endif /* CONFIG_X86_64 */ 254#endif /* CONFIG_X86_64 */
264 255
256static inline int restore_fpu_checking(struct task_struct *tsk)
257{
258 if (task_thread_info(tsk)->status & TS_XSAVE)
259 return xrstor_checking(&tsk->thread.xstate->xsave);
260 else
261 return fxrstor_checking(&tsk->thread.xstate->fxsave);
262}
263
265/* 264/*
266 * Signal frame handlers... 265 * Signal frame handlers...
267 */ 266 */
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
305/* 304/*
306 * Some instructions like VIA's padlock instructions generate a spurious 305 * Some instructions like VIA's padlock instructions generate a spurious
307 * DNA fault but don't modify SSE registers. And these instructions 306 * DNA fault but don't modify SSE registers. And these instructions
308 * get used from interrupt context aswell. To prevent these kernel instructions 307 * get used from interrupt context as well. To prevent these kernel instructions
309 * in interrupt context interact wrongly with other user/kernel fpu usage, we 308 * in interrupt context interacting wrongly with other user/kernel fpu usage, we
310 * should use them only in the context of irq_ts_save/restore() 309 * should use them only in the context of irq_ts_save/restore()
311 */ 310 */
312static inline int irq_ts_save(void) 311static inline int irq_ts_save(void)
313{ 312{
314 /* 313 /*
315 * If we are in process context, we are ok to take a spurious DNA fault. 314 * If in process context and not atomic, we can take a spurious DNA fault.
316 * Otherwise, doing clts() in process context require pre-emption to 315 * Otherwise, doing clts() in process context requires disabling preemption
317 * be disabled or some heavy lifting like kernel_fpu_begin() 316 * or some heavy lifting like kernel_fpu_begin()
318 */ 317 */
319 if (!in_interrupt()) 318 if (!in_atomic())
320 return 0; 319 return 0;
321 320
322 if (read_cr0() & X86_CR0_TS) { 321 if (read_cr0() & X86_CR0_TS) {
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6c..0e9fe1d9d971 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
1#ifndef _ASM_X86_IOMAP_H
2#define _ASM_X86_IOMAP_H
3
1/* 4/*
2 * Copyright © 2008 Ingo Molnar 5 * Copyright © 2008 Ingo Molnar
3 * 6 *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
31 34
32void 35void
33iounmap_atomic(void *kvaddr, enum km_type type); 36iounmap_atomic(void *kvaddr, enum km_type type);
37
38#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 1b35c4357ea8..5b21f0ec3df2 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -104,14 +104,14 @@
104#define LOCAL_TIMER_VECTOR 0xef 104#define LOCAL_TIMER_VECTOR 0xef
105 105
106/* 106/*
107 * Performance monitoring interrupt vector: 107 * Generic system vector for platform specific use
108 */ 108 */
109#define LOCAL_PERF_VECTOR 0xee 109#define GENERIC_INTERRUPT_VECTOR 0xed
110 110
111/* 111/*
112 * Generic system vector for platform specific use 112 * Performance monitoring pending work vector:
113 */ 113 */
114#define GENERIC_INTERRUPT_VECTOR 0xed 114#define LOCAL_PENDING_VECTOR 0xec
115 115
116#define UV_BAU_MESSAGE 0xec 116#define UV_BAU_MESSAGE 0xec
117 117
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24d..c2d1f3b58e5f 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 13extern int k8_scan_nodes(unsigned long start, unsigned long end);
14 14
15#ifdef CONFIG_K8_NB
16static inline struct pci_dev *node_to_k8_nb_misc(int node)
17{
18 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
19}
20#else
21static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{
23 return NULL;
24}
25#endif
26
27
15#endif /* _ASM_X86_K8_H */ 28#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf11704..125be8b19568 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
16#define __KVM_HAVE_MSI 16#define __KVM_HAVE_MSI
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG 18#define __KVM_HAVE_GUEST_DEBUG
19#define __KVM_HAVE_MSIX
19 20
20/* Architectural interrupt line count. */ 21/* Architectural interrupt line count. */
21#define KVM_NR_INTERRUPTS 256 22#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044ff..eabdc1cfab5c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
185 unsigned access:3; 185 unsigned access:3;
186 unsigned invalid:1; 186 unsigned invalid:1;
187 unsigned cr4_pge:1; 187 unsigned cr4_pge:1;
188 unsigned nxe:1;
188 }; 189 };
189}; 190};
190 191
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
212 int multimapped; /* More than one parent_pte? */ 213 int multimapped; /* More than one parent_pte? */
213 int root_count; /* Currently serving as active root */ 214 int root_count; /* Currently serving as active root */
214 bool unsync; 215 bool unsync;
215 bool global;
216 unsigned int unsync_children; 216 unsigned int unsync_children;
217 union { 217 union {
218 u64 *parent_pte; /* !multimapped */ 218 u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
261 union kvm_mmu_page_role base_role; 261 union kvm_mmu_page_role base_role;
262 262
263 u64 *pae_root; 263 u64 *pae_root;
264 u64 rsvd_bits_mask[2][4];
264}; 265};
265 266
266struct kvm_vcpu_arch { 267struct kvm_vcpu_arch {
267 u64 host_tsc; 268 u64 host_tsc;
268 int interrupt_window_open;
269 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
270 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
271 /* 269 /*
272 * rip and regs accesses must go through 270 * rip and regs accesses must go through
273 * kvm_{register,rip}_{read,write} functions. 271 * kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
286 u64 shadow_efer; 284 u64 shadow_efer;
287 u64 apic_base; 285 u64 apic_base;
288 struct kvm_lapic *apic; /* kernel irqchip context */ 286 struct kvm_lapic *apic; /* kernel irqchip context */
287 int32_t apic_arb_prio;
289 int mp_state; 288 int mp_state;
290 int sipi_vector; 289 int sipi_vector;
291 u64 ia32_misc_enable_msr; 290 u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
320 struct kvm_pio_request pio; 319 struct kvm_pio_request pio;
321 void *pio_data; 320 void *pio_data;
322 321
322 u8 event_exit_inst_len;
323
323 struct kvm_queued_exception { 324 struct kvm_queued_exception {
324 bool pending; 325 bool pending;
325 bool has_error_code; 326 bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
329 330
330 struct kvm_queued_interrupt { 331 struct kvm_queued_interrupt {
331 bool pending; 332 bool pending;
333 bool soft;
332 u8 nr; 334 u8 nr;
333 } interrupt; 335 } interrupt;
334 336
335 struct { 337 struct {
336 int active; 338 int vm86_active;
337 u8 save_iopl; 339 u8 save_iopl;
338 struct kvm_save_segment { 340 struct kvm_save_segment {
339 u16 selector; 341 u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
356 unsigned int time_offset; 358 unsigned int time_offset;
357 struct page *time_page; 359 struct page *time_page;
358 360
361 bool singlestep; /* guest is single stepped by KVM */
359 bool nmi_pending; 362 bool nmi_pending;
360 bool nmi_injected; 363 bool nmi_injected;
361 bool nmi_window_open;
362 364
363 struct mtrr_state_type mtrr_state; 365 struct mtrr_state_type mtrr_state;
364 u32 pat; 366 u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
392 */ 394 */
393 struct list_head active_mmu_pages; 395 struct list_head active_mmu_pages;
394 struct list_head assigned_dev_head; 396 struct list_head assigned_dev_head;
395 struct list_head oos_global_pages;
396 struct iommu_domain *iommu_domain; 397 struct iommu_domain *iommu_domain;
398 int iommu_flags;
397 struct kvm_pic *vpic; 399 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 400 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 401 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list; 402 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 int round_robin_prev_vcpu;
404 unsigned int tss_addr; 405 unsigned int tss_addr;
405 struct page *apic_access_page; 406 struct page *apic_access_page;
406 407
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
423 u32 mmu_recycled; 424 u32 mmu_recycled;
424 u32 mmu_cache_miss; 425 u32 mmu_cache_miss;
425 u32 mmu_unsync; 426 u32 mmu_unsync;
426 u32 mmu_unsync_global;
427 u32 remote_tlb_flush; 427 u32 remote_tlb_flush;
428 u32 lpages; 428 u32 lpages;
429}; 429};
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
443 u32 halt_exits; 443 u32 halt_exits;
444 u32 halt_wakeup; 444 u32 halt_wakeup;
445 u32 request_irq_exits; 445 u32 request_irq_exits;
446 u32 request_nmi_exits;
447 u32 irq_exits; 446 u32 irq_exits;
448 u32 host_state_reload; 447 u32 host_state_reload;
449 u32 efer_reload; 448 u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
511 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 510 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
512 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 511 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
513 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 512 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
513 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
514 void (*patch_hypercall)(struct kvm_vcpu *vcpu, 515 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
515 unsigned char *hypercall_addr); 516 unsigned char *hypercall_addr);
516 int (*get_irq)(struct kvm_vcpu *vcpu); 517 void (*set_irq)(struct kvm_vcpu *vcpu);
517 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 518 void (*set_nmi)(struct kvm_vcpu *vcpu);
518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 519 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
519 bool has_error_code, u32 error_code); 520 bool has_error_code, u32 error_code);
520 bool (*exception_injected)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
522 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 523 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 struct kvm_run *run); 524 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 525 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 526 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
526 int (*get_tdp_level)(void); 527 int (*get_tdp_level)(void);
527 int (*get_mt_mask_shift)(void); 528 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528}; 529};
529 530
530extern struct kvm_x86_ops *kvm_x86_ops; 531extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
538void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 539void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
539void kvm_mmu_set_base_ptes(u64 base_pte); 540void kvm_mmu_set_base_ptes(u64 base_pte);
540void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 541void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
541 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); 542 u64 dirty_mask, u64 nx_mask, u64 x_mask);
542 543
543int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 544int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
544void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 545void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
552 const void *val, int bytes); 553 const void *val, int bytes);
553int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, 554int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
554 gpa_t addr, unsigned long *ret); 555 gpa_t addr, unsigned long *ret);
556u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
555 557
556extern bool tdp_enabled; 558extern bool tdp_enabled;
557 559
@@ -563,6 +565,7 @@ enum emulation_result {
563 565
564#define EMULTYPE_NO_DECODE (1 << 0) 566#define EMULTYPE_NO_DECODE (1 << 0)
565#define EMULTYPE_TRAP_UD (1 << 1) 567#define EMULTYPE_TRAP_UD (1 << 1)
568#define EMULTYPE_SKIP (1 << 2)
566int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 569int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
567 unsigned long cr2, u16 error_code, int emulation_type); 570 unsigned long cr2, u16 error_code, int emulation_type);
568void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 571void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
638int kvm_mmu_load(struct kvm_vcpu *vcpu); 641int kvm_mmu_load(struct kvm_vcpu *vcpu);
639void kvm_mmu_unload(struct kvm_vcpu *vcpu); 642void kvm_mmu_unload(struct kvm_vcpu *vcpu);
640void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 643void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
641void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
642 644
643int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 645int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
644 646
@@ -769,6 +771,8 @@ enum {
769#define HF_GIF_MASK (1 << 0) 771#define HF_GIF_MASK (1 << 0)
770#define HF_HIF_MASK (1 << 1) 772#define HF_HIF_MASK (1 << 1)
771#define HF_VINTR_MASK (1 << 2) 773#define HF_VINTR_MASK (1 << 2)
774#define HF_NMI_MASK (1 << 3)
775#define HF_IRET_MASK (1 << 4)
772 776
773/* 777/*
774 * Hardware virtualization extension instructions may fault if a 778 * Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
791#define KVM_ARCH_WANT_MMU_NOTIFIER 795#define KVM_ARCH_WANT_MMU_NOTIFIER
792int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 796int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
793int kvm_age_hva(struct kvm *kvm, unsigned long hva); 797int kvm_age_hva(struct kvm *kvm, unsigned long hva);
798int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
794 799
795#endif /* _ASM_X86_KVM_HOST_H */ 800#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881a..b7ed2c423116 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
143 struct fetch_cache fetch; 143 struct fetch_cache fetch;
144}; 144};
145 145
146#define X86_SHADOW_INT_MOV_SS 1
147#define X86_SHADOW_INT_STI 2
148
146struct x86_emulate_ctxt { 149struct x86_emulate_ctxt {
147 /* Register state before/after emulation. */ 150 /* Register state before/after emulation. */
148 struct kvm_vcpu *vcpu; 151 struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
152 int mode; 155 int mode;
153 u32 cs_base; 156 u32 cs_base;
154 157
158 /* interruptibility state, as a result of execution of STI or MOV SS */
159 int interruptibility;
160
155 /* decode cache */ 161 /* decode cache */
156 struct decode_cache decode; 162 struct decode_cache decode;
157}; 163};
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c1..ef51b501e22a 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
9 9
10struct device; 10struct device;
11 11
12enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
13
12struct microcode_ops { 14struct microcode_ops {
13 int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); 15 enum ucode_state (*request_microcode_user) (int cpu,
14 int (*request_microcode_fw) (int cpu, struct device *device); 16 const void __user *buf, size_t size);
15 17
16 void (*apply_microcode) (int cpu); 18 enum ucode_state (*request_microcode_fw) (int cpu,
19 struct device *device);
17 20
18 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
19 void (*microcode_fini_cpu) (int cpu); 21 void (*microcode_fini_cpu) (int cpu);
22
23 /*
24 * The generic 'microcode_core' part guarantees that
25 * the callbacks below run on a target cpu when they
26 * are being called.
27 * See also the "Synchronization" section in microcode_core.c.
28 */
29 int (*apply_microcode) (int cpu);
30 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
20}; 31};
21 32
22struct ucode_cpu_info { 33struct ucode_cpu_info {
23 struct cpu_signature cpu_sig; 34 struct cpu_signature cpu_sig;
24 int valid; 35 int valid;
25 void *mc; 36 void *mc;
26}; 37};
27extern struct ucode_cpu_info ucode_cpu_info[]; 38extern struct ucode_cpu_info ucode_cpu_info[];
28 39
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c86404695083..1692fb5050e3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
121#define MSR_K8_TOP_MEM1 0xc001001a 121#define MSR_K8_TOP_MEM1 0xc001001a
122#define MSR_K8_TOP_MEM2 0xc001001d 122#define MSR_K8_TOP_MEM2 0xc001001d
123#define MSR_K8_SYSCFG 0xc0010010 123#define MSR_K8_SYSCFG 0xc0010010
124#define MSR_K8_HWCR 0xc0010015
125#define MSR_K8_INT_PENDING_MSG 0xc0010055 124#define MSR_K8_INT_PENDING_MSG 0xc0010055
126/* C1E active bits in int pending message */ 125/* C1E active bits in int pending message */
127#define K8_INTP_C1E_ACTIVE_MASK 0x18000000 126#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf6241807..22603764e7db 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,6 +12,17 @@
12 12
13#include <asm/asm.h> 13#include <asm/asm.h>
14#include <asm/errno.h> 14#include <asm/errno.h>
15#include <asm/cpumask.h>
16
17struct msr {
18 union {
19 struct {
20 u32 l;
21 u32 h;
22 };
23 u64 q;
24 };
25};
15 26
16static inline unsigned long long native_read_tscp(unsigned int *aux) 27static inline unsigned long long native_read_tscp(unsigned int *aux)
17{ 28{
@@ -216,6 +227,8 @@ do { \
216#ifdef CONFIG_SMP 227#ifdef CONFIG_SMP
217int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 228int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
218int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 229int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
230void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
231void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
219int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 232int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
220int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 233int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
221#else /* CONFIG_SMP */ 234#else /* CONFIG_SMP */
@@ -229,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
229 wrmsr(msr_no, l, h); 242 wrmsr(msr_no, l, h);
230 return 0; 243 return 0;
231} 244}
245static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
246 struct msr *msrs)
247{
248 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
249}
250static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
251 struct msr *msrs)
252{
253 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
254}
232static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, 255static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
233 u32 *l, u32 *h) 256 u32 *l, u32 *h)
234{ 257{
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568dff..c97264409934 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
64 * but since they are power of two we could use a 64 * but since they are power of two we could use a
65 * cheaper way --cvg 65 * cheaper way --cvg
66 */ 66 */
67 return nmi_watchdog & 0x3; 67 return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
68} 68}
69#endif 69#endif
70 70
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cbe..c4ae822e415f 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
17extern void numa_init_array(void); 17extern void numa_init_array(void);
18extern int numa_off; 18extern int numa_off;
19 19
20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent;
22
23extern s16 apicid_to_node[MAX_LOCAL_APIC]; 20extern s16 apicid_to_node[MAX_LOCAL_APIC];
24 21
25extern unsigned long numa_free_all_bootmem(void); 22extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
27 unsigned long end); 24 unsigned long end);
28 25
29#ifdef CONFIG_NUMA 26#ifdef CONFIG_NUMA
27/*
28 * Too small node sizes may confuse the VM badly. Usually they
29 * result from BIOS bugs. So dont recognize nodes as standalone
30 * NUMA entities that have less than this amount of RAM listed:
31 */
32#define NODE_MIN_SIZE (4*1024*1024)
33
30extern void __init init_cpu_to_node(void); 34extern void __init init_cpu_to_node(void);
31extern void __cpuinit numa_set_node(int cpu, int node); 35extern void __cpuinit numa_set_node(int cpu, int node);
32extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a7..6f1b7331313f 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
54extern int sysctl_legacy_va_layout; 54extern int sysctl_legacy_va_layout;
55 55
56extern void find_low_pfn_range(void); 56extern void find_low_pfn_range(void);
57extern unsigned long init_memory_mapping(unsigned long start,
58 unsigned long end);
59extern void initmem_init(unsigned long, unsigned long);
60extern void free_initmem(void);
61extern void setup_bootmem_allocator(void); 57extern void setup_bootmem_allocator(void);
62 58
63#endif /* !__ASSEMBLY__ */ 59#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b70248..8d382d3abf38 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,22 +32,14 @@
32 */ 32 */
33#define __PAGE_OFFSET _AC(0xffff880000000000, UL) 33#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
34 34
35#define __PHYSICAL_START CONFIG_PHYSICAL_START 35#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
36#define __KERNEL_ALIGN 0x200000 36 (CONFIG_PHYSICAL_ALIGN - 1)) & \
37 37 ~(CONFIG_PHYSICAL_ALIGN - 1))
38/*
39 * Make sure kernel is aligned to 2MB address. Catching it at compile
40 * time is better. Change your config file and compile the kernel
41 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
42 */
43#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
44#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
45#endif
46 38
47#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) 39#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
48#define __START_KERNEL_map _AC(0xffffffff80000000, UL) 40#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
49 41
50/* See Documentation/x86_64/mm.txt for a description of the memory map. */ 42/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
51#define __PHYSICAL_MASK_SHIFT 46 43#define __PHYSICAL_MASK_SHIFT 46
52#define __VIRTUAL_MASK_SHIFT 48 44#define __VIRTUAL_MASK_SHIFT 48
53 45
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
71 63
72#define vmemmap ((struct page *)VMEMMAP_START) 64#define vmemmap ((struct page *)VMEMMAP_START)
73 65
74extern unsigned long init_memory_mapping(unsigned long start,
75 unsigned long end);
76
77extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
78extern void free_initmem(void);
79
80extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); 66extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
81extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 67extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
82 68
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006ab..6473f5ccff85 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
46extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
47extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
48 48
49extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end);
51
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
53extern void free_initmem(void);
54
49#endif /* !__ASSEMBLY__ */ 55#endif /* !__ASSEMBLY__ */
50 56
51#endif /* _ASM_X86_PAGE_DEFS_H */ 57#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a53da004e08e..4fb37c8a0832 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
56struct tss_struct; 56struct tss_struct;
57struct mm_struct; 57struct mm_struct;
58struct desc_struct; 58struct desc_struct;
59struct task_struct;
59 60
60/* 61/*
61 * Wrapper type for pointers to code which uses the non-standard 62 * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
203 204
204 void (*swapgs)(void); 205 void (*swapgs)(void);
205 206
206 struct pv_lazy_ops lazy_mode; 207 void (*start_context_switch)(struct task_struct *prev);
208 void (*end_context_switch)(struct task_struct *next);
207}; 209};
208 210
209struct pv_irq_ops { 211struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
1399}; 1401};
1400 1402
1401enum paravirt_lazy_mode paravirt_get_lazy_mode(void); 1403enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1402void paravirt_enter_lazy_cpu(void); 1404void paravirt_start_context_switch(struct task_struct *prev);
1403void paravirt_leave_lazy_cpu(void); 1405void paravirt_end_context_switch(struct task_struct *next);
1406
1404void paravirt_enter_lazy_mmu(void); 1407void paravirt_enter_lazy_mmu(void);
1405void paravirt_leave_lazy_mmu(void); 1408void paravirt_leave_lazy_mmu(void);
1406void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1407 1409
1408#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE 1410#define __HAVE_ARCH_START_CONTEXT_SWITCH
1409static inline void arch_enter_lazy_cpu_mode(void) 1411static inline void arch_start_context_switch(struct task_struct *prev)
1410{ 1412{
1411 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); 1413 PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
1412} 1414}
1413 1415
1414static inline void arch_leave_lazy_cpu_mode(void) 1416static inline void arch_end_context_switch(struct task_struct *next)
1415{ 1417{
1416 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); 1418 PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
1417} 1419}
1418 1420
1419void arch_flush_lazy_cpu_mode(void);
1420
1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE 1421#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1422static inline void arch_enter_lazy_mmu_mode(void) 1422static inline void arch_enter_lazy_mmu_mode(void)
1423{ 1423{
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..876ed97147b3
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(void);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(void) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc0..18ef7ebf2631 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
81#define pte_val(x) native_pte_val(x) 81#define pte_val(x) native_pte_val(x)
82#define __pte(x) native_make_pte(x) 82#define __pte(x) native_make_pte(x)
83 83
84#define arch_end_context_switch(prev) do {} while(0)
85
84#endif /* CONFIG_PARAVIRT */ 86#endif /* CONFIG_PARAVIRT */
85 87
86/* 88/*
@@ -503,6 +505,8 @@ static inline int pgd_none(pgd_t pgd)
503 505
504#ifndef __ASSEMBLY__ 506#ifndef __ASSEMBLY__
505 507
508extern int direct_gbpages;
509
506/* local pte updates need not use xchg for locking */ 510/* local pte updates need not use xchg for locking */
507static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) 511static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
508{ 512{
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d5018..abde308fdb0f 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
25 25
26extern void paging_init(void); 26extern void paging_init(void);
27 27
28#endif /* !__ASSEMBLY__ */
29
30#ifndef __ASSEMBLY__
31
32#define pte_ERROR(e) \ 28#define pte_ERROR(e) \
33 printk("%s:%d: bad pte %p(%016lx).\n", \ 29 printk("%s:%d: bad pte %p(%016lx).\n", \
34 __FILE__, __LINE__, &(e), pte_val(e)) 30 __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
135 131
136#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, pte) do { } while (0)
137 133
138extern int direct_gbpages;
139
140/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
141#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
142#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 136#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e0383..766ea16fbbbd 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) 51#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
52#define PGDIR_MASK (~(PGDIR_SIZE - 1)) 52#define PGDIR_MASK (~(PGDIR_SIZE - 1))
53 53
54 54/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 55#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
56#define VMALLOC_START _AC(0xffffc20000000000, UL) 56#define VMALLOC_START _AC(0xffffc90000000000, UL)
57#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) 57#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
58#define VMEMMAP_START _AC(0xffffe20000000000, UL) 58#define VMEMMAP_START _AC(0xffffea0000000000, UL)
59#define MODULES_VADDR _AC(0xffffffffa0000000, UL) 59#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
60#define MODULES_END _AC(0xffffffffff000000, UL) 60#define MODULES_END _AC(0xffffffffff000000, UL)
61#define MODULES_LEN (MODULES_END - MODULES_VADDR) 61#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786d..4d258ad76a0f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
273 273
274extern pteval_t __supported_pte_mask; 274extern pteval_t __supported_pte_mask;
275extern int nx_enabled; 275extern int nx_enabled;
276extern void set_nx(void);
277 276
278#define pgprot_writecombine pgprot_writecombine 277#define pgprot_writecombine pgprot_writecombine
279extern pgprot_t pgprot_writecombine(pgprot_t prot); 278extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fed93fec9764..c7768269b1cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -410,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
410extern unsigned int xstate_size; 410extern unsigned int xstate_size;
411extern void free_thread_xstate(struct task_struct *); 411extern void free_thread_xstate(struct task_struct *);
412extern struct kmem_cache *task_xstate_cachep; 412extern struct kmem_cache *task_xstate_cachep;
413extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
414extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
415extern unsigned short num_cache_leaves;
416 413
417struct thread_struct { 414struct thread_struct {
418 /* Cached TLS descriptors: */ 415 /* Cached TLS descriptors: */
@@ -428,8 +425,12 @@ struct thread_struct {
428 unsigned short fsindex; 425 unsigned short fsindex;
429 unsigned short gsindex; 426 unsigned short gsindex;
430#endif 427#endif
428#ifdef CONFIG_X86_32
431 unsigned long ip; 429 unsigned long ip;
430#endif
431#ifdef CONFIG_X86_64
432 unsigned long fs; 432 unsigned long fs;
433#endif
433 unsigned long gs; 434 unsigned long gs;
434 /* Hardware debugging registers: */ 435 /* Hardware debugging registers: */
435 unsigned long debugreg0; 436 unsigned long debugreg0;
@@ -461,14 +462,8 @@ struct thread_struct {
461 unsigned io_bitmap_max; 462 unsigned io_bitmap_max;
462/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ 463/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
463 unsigned long debugctlmsr; 464 unsigned long debugctlmsr;
464#ifdef CONFIG_X86_DS 465 /* Debug Store context; see asm/ds.h */
465/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
466 struct ds_context *ds_ctx; 466 struct ds_context *ds_ctx;
467#endif /* CONFIG_X86_DS */
468#ifdef CONFIG_X86_PTRACE_BTS
469/* the signal to send on a bts buffer overflow */
470 unsigned int bts_ovfl_signal;
471#endif /* CONFIG_X86_PTRACE_BTS */
472}; 467};
473 468
474static inline unsigned long native_get_debugreg(int regno) 469static inline unsigned long native_get_debugreg(int regno)
@@ -796,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
796 return debugctlmsr; 791 return debugctlmsr;
797} 792}
798 793
794static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
795{
796 u64 debugctlmsr = 0;
797 u32 val1, val2;
798
799#ifndef CONFIG_X86_DEBUGCTLMSR
800 if (boot_cpu_data.x86 < 6)
801 return 0;
802#endif
803 rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
804 debugctlmsr = val1 | ((u64)val2 << 32);
805
806 return debugctlmsr;
807}
808
799static inline void update_debugctlmsr(unsigned long debugctlmsr) 809static inline void update_debugctlmsr(unsigned long debugctlmsr)
800{ 810{
801#ifndef CONFIG_X86_DEBUGCTLMSR 811#ifndef CONFIG_X86_DEBUGCTLMSR
@@ -805,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
805 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); 815 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
806} 816}
807 817
818static inline void update_debugctlmsr_on_cpu(int cpu,
819 unsigned long debugctlmsr)
820{
821#ifndef CONFIG_X86_DEBUGCTLMSR
822 if (boot_cpu_data.x86 < 6)
823 return;
824#endif
825 wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
826 (u32)((u64)debugctlmsr),
827 (u32)((u64)debugctlmsr >> 32));
828}
829
808/* 830/*
809 * from system description table in BIOS. Mostly for MCA use, but 831 * from system description table in BIOS. Mostly for MCA use, but
810 * others may find it useful: 832 * others may find it useful:
@@ -815,6 +837,7 @@ extern unsigned int BIOS_revision;
815 837
816/* Boot loader type from the setup header: */ 838/* Boot loader type from the setup header: */
817extern int bootloader_type; 839extern int bootloader_type;
840extern int bootloader_version;
818 841
819extern char ignore_fpu_irq; 842extern char ignore_fpu_irq;
820 843
@@ -875,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
875 .vm86_info = NULL, \ 898 .vm86_info = NULL, \
876 .sysenter_cs = __KERNEL_CS, \ 899 .sysenter_cs = __KERNEL_CS, \
877 .io_bitmap_ptr = NULL, \ 900 .io_bitmap_ptr = NULL, \
878 .fs = __KERNEL_PERCPU, \
879} 901}
880 902
881/* 903/*
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 624f133943ed..0f0d908349aa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
236extern int do_set_thread_area(struct task_struct *p, int idx, 236extern int do_set_thread_area(struct task_struct *p, int idx,
237 struct user_desc __user *info, int can_allocate); 237 struct user_desc __user *info, int can_allocate);
238 238
239extern void x86_ptrace_untrace(struct task_struct *); 239#ifdef CONFIG_X86_PTRACE_BTS
240extern void x86_ptrace_fork(struct task_struct *child, 240extern void ptrace_bts_untrace(struct task_struct *tsk);
241 unsigned long clone_flags);
242 241
243#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) 242#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
244#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) 243#endif /* CONFIG_X86_PTRACE_BTS */
245 244
246#endif /* __KERNEL__ */ 245#endif /* __KERNEL__ */
247 246
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd58..64cf2d24fad1 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
48#endif 48#endif
49 49
50#ifdef CONFIG_X86_64 50#ifdef CONFIG_X86_64
51#ifdef CONFIG_PARAVIRT
52/* Paravirtualized systems may not have PSE or PGE available */
51#define NEED_PSE 0 53#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE 0 54#define NEED_PGE 0
55#else
56#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
57#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
58#endif
59#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) 60#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) 61#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) 62#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec5..4517d6b93188 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
27#else /* CONFIG_X86_32 */ 27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44 29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ 30# define MAX_PHYSMEM_BITS 46
31#endif 31#endif
32 32
33#endif /* CONFIG_SPARSEMEM */ 33#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3ebf..85574b7c1bc1 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
225#define SVM_EVTINJ_VALID_ERR (1 << 11) 225#define SVM_EVTINJ_VALID_ERR (1 << 11)
226 226
227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK 227#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
228#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
228 229
229#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR 230#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
230#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI 231#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f6904..372b76edd63f 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * syscalls.h - Linux syscall interfaces (arch-specific) 2 * syscalls.h - Linux syscall interfaces (arch-specific)
3 * 3 *
4 * Copyright (c) 2008 Jaswinder Singh 4 * Copyright (c) 2008 Jaswinder Singh Rajput
5 * 5 *
6 * This file is released under the GPLv2. 6 * This file is released under the GPLv2.
7 * See the file COPYING for more details. 7 * See the file COPYING for more details.
@@ -12,50 +12,55 @@
12 12
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/linkage.h> 14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h> 15#include <linux/signal.h>
16#include <linux/types.h>
17 17
18/* Common in X86_32 and X86_64 */ 18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21 21
22/* kernel/process.c */
23int sys_fork(struct pt_regs *);
24int sys_vfork(struct pt_regs *);
25
22/* kernel/ldt.c */ 26/* kernel/ldt.c */
23asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 27asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
24 28
29/* kernel/signal.c */
30long sys_rt_sigreturn(struct pt_regs *);
31
25/* kernel/tls.c */ 32/* kernel/tls.c */
26asmlinkage int sys_set_thread_area(struct user_desc __user *); 33asmlinkage int sys_set_thread_area(struct user_desc __user *);
27asmlinkage int sys_get_thread_area(struct user_desc __user *); 34asmlinkage int sys_get_thread_area(struct user_desc __user *);
28 35
29/* X86_32 only */ 36/* X86_32 only */
30#ifdef CONFIG_X86_32 37#ifdef CONFIG_X86_32
38/* kernel/ioport.c */
39long sys_iopl(struct pt_regs *);
40
31/* kernel/process_32.c */ 41/* kernel/process_32.c */
32int sys_fork(struct pt_regs *);
33int sys_clone(struct pt_regs *); 42int sys_clone(struct pt_regs *);
34int sys_vfork(struct pt_regs *);
35int sys_execve(struct pt_regs *); 43int sys_execve(struct pt_regs *);
36 44
37/* kernel/signal_32.c */ 45/* kernel/signal.c */
38asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 46asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
39asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 47asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
40 struct old_sigaction __user *); 48 struct old_sigaction __user *);
41int sys_sigaltstack(struct pt_regs *); 49int sys_sigaltstack(struct pt_regs *);
42unsigned long sys_sigreturn(struct pt_regs *); 50unsigned long sys_sigreturn(struct pt_regs *);
43long sys_rt_sigreturn(struct pt_regs *);
44
45/* kernel/ioport.c */
46long sys_iopl(struct pt_regs *);
47 51
48/* kernel/sys_i386_32.c */ 52/* kernel/sys_i386_32.c */
53struct mmap_arg_struct;
54struct sel_arg_struct;
55struct oldold_utsname;
56struct old_utsname;
57
49asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, 58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
50 unsigned long, unsigned long, unsigned long); 59 unsigned long, unsigned long, unsigned long);
51struct mmap_arg_struct;
52asmlinkage int old_mmap(struct mmap_arg_struct __user *); 60asmlinkage int old_mmap(struct mmap_arg_struct __user *);
53struct sel_arg_struct;
54asmlinkage int old_select(struct sel_arg_struct __user *); 61asmlinkage int old_select(struct sel_arg_struct __user *);
55asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); 62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
56struct old_utsname;
57asmlinkage int sys_uname(struct old_utsname __user *); 63asmlinkage int sys_uname(struct old_utsname __user *);
58struct oldold_utsname;
59asmlinkage int sys_olduname(struct oldold_utsname __user *); 64asmlinkage int sys_olduname(struct oldold_utsname __user *);
60 65
61/* kernel/vm86_32.c */ 66/* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
65#else /* CONFIG_X86_32 */ 70#else /* CONFIG_X86_32 */
66 71
67/* X86_64 only */ 72/* X86_64 only */
73/* kernel/ioport.c */
74asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
75
68/* kernel/process_64.c */ 76/* kernel/process_64.c */
69asmlinkage long sys_fork(struct pt_regs *);
70asmlinkage long sys_clone(unsigned long, unsigned long, 77asmlinkage long sys_clone(unsigned long, unsigned long,
71 void __user *, void __user *, 78 void __user *, void __user *,
72 struct pt_regs *); 79 struct pt_regs *);
73asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *, 80asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *, 81 char __user * __user *,
76 struct pt_regs *); 82 struct pt_regs *);
77long sys_arch_prctl(int, unsigned long); 83long sys_arch_prctl(int, unsigned long);
78 84
79/* kernel/ioport.c */ 85/* kernel/signal.c */
80asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
81
82/* kernel/signal_64.c */
83asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, 86asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
84 struct pt_regs *); 87 struct pt_regs *);
85long sys_rt_sigreturn(struct pt_regs *);
86 88
87/* kernel/sys_x86_64.c */ 89/* kernel/sys_x86_64.c */
90struct new_utsname;
91
88asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 92asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
89 unsigned long, unsigned long, unsigned long); 93 unsigned long, unsigned long, unsigned long);
90struct new_utsname;
91asmlinkage long sys_uname(struct new_utsname __user *); 94asmlinkage long sys_uname(struct new_utsname __user *);
92 95
93#endif /* CONFIG_X86_32 */ 96#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c49..c4ee8056baca 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); 67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); 68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); 69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 get_user(termios->c_line, &termio->c_line);
70 return copy_from_user(termios->c_cc, termio->c_cc, NCC); 71 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
71} 72}
72 73
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae090..602c769fc98c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 94#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ 95#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ 96#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
97#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ 97#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
98#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
98 99
99#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) 100#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
100#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 101#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
116#define _TIF_FORCED_TF (1 << TIF_FORCED_TF) 117#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
117#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) 118#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
118#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) 119#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
120#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
119#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) 121#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
120 122
121/* work to do in syscall_trace_enter() */ 123/* work to do in syscall_trace_enter() */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 16a5c84b0329..a5ecc9c33e92 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
17 17
18static inline void __native_flush_tlb(void) 18static inline void __native_flush_tlb(void)
19{ 19{
20 write_cr3(read_cr3()); 20 native_write_cr3(native_read_cr3());
21} 21}
22 22
23static inline void __native_flush_tlb_global(void) 23static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
32 */ 32 */
33 raw_local_irq_save(flags); 33 raw_local_irq_save(flags);
34 34
35 cr4 = read_cr4(); 35 cr4 = native_read_cr4();
36 /* clear PGE */ 36 /* clear PGE */
37 write_cr4(cr4 & ~X86_CR4_PGE); 37 native_write_cr4(cr4 & ~X86_CR4_PGE);
38 /* write old PGE again and flush TLBs */ 38 /* write old PGE again and flush TLBs */
39 write_cr4(cr4); 39 native_write_cr4(cr4);
40 40
41 raw_local_irq_restore(flags); 41 raw_local_irq_restore(flags);
42} 42}
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca49..066ef590d7e0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
203void x86_pci_root_bus_res_quirks(struct pci_bus *b); 203void x86_pci_root_bus_res_quirks(struct pci_bus *b);
204 204
205#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
206#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) 206#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
207 (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
207#define smt_capable() (smp_num_siblings > 1) 208#define smt_capable() (smp_num_siblings > 1)
208#endif 209#endif
209 210
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b86..bfd74c032fca 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <asm/debugreg.h> 4#include <asm/debugreg.h>
5#include <asm/siginfo.h> /* TRAP_TRACE, ... */
5 6
6#ifdef CONFIG_X86_32 7#ifdef CONFIG_X86_32
7#define dotraplinkage 8#define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
13asmlinkage void debug(void); 14asmlinkage void debug(void);
14asmlinkage void nmi(void); 15asmlinkage void nmi(void);
15asmlinkage void int3(void); 16asmlinkage void int3(void);
17asmlinkage void xen_debug(void);
18asmlinkage void xen_int3(void);
19asmlinkage void xen_stack_segment(void);
16asmlinkage void overflow(void); 20asmlinkage void overflow(void);
17asmlinkage void bounds(void); 21asmlinkage void bounds(void);
18asmlinkage void invalid_op(void); 22asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
74} 78}
75 79
76extern int panic_on_unrecovered_nmi; 80extern int panic_on_unrecovered_nmi;
77extern int kstack_depth_to_print;
78 81
79void math_error(void __user *); 82void math_error(void __user *);
80void math_emulate(struct math_emu_info *); 83void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..732a30706153 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336
343 345
344#ifdef __KERNEL__ 346#ifdef __KERNEL__
345 347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..900e1617e672 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv) 657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296 658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
661 664
662#ifndef __NO_STUBS 665#ifndef __NO_STUBS
663#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a88..bddd44f2f0ab 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
37#define UV_CPUS_PER_ACT_STATUS 32 37#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3 38#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2 39#define UV_ACT_STATUS_SIZE 2
40#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 40#define UV_ADP_SIZE 32
41#define UV_DISTRIBUTION_SIZE 256 41#define UV_DISTRIBUTION_SIZE 256
42#define UV_SW_ACK_NPENDING 8 42#define UV_SW_ACK_NPENDING 8
43#define UV_NET_ENDPOINT_INTD 0x38 43#define UV_NET_ENDPOINT_INTD 0x38
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062e..341070f7ad5c 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
133struct uv_hub_info_s { 133struct uv_hub_info_s {
134 unsigned long global_mmr_base; 134 unsigned long global_mmr_base;
135 unsigned long gpa_mask; 135 unsigned long gpa_mask;
136 unsigned int gnode_extra;
136 unsigned long gnode_upper; 137 unsigned long gnode_upper;
137 unsigned long lowmem_remap_top; 138 unsigned long lowmem_remap_top;
138 unsigned long lowmem_remap_base; 139 unsigned long lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
159 * p - PNODE (local part of nsids, right shifted 1) 160 * p - PNODE (local part of nsids, right shifted 1)
160 */ 161 */
161#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 162#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
162#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) 163#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
164#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
163 165
164#define UV_LOCAL_MMR_BASE 0xf4000000UL 166#define UV_LOCAL_MMR_BASE 0xf4000000UL
165#define UV_GLOBAL_MMR32_BASE 0xf8000000UL 167#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
173#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) 175#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
174 176
175#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 177#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
176 ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) 178 ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
177 179
178#define UV_APIC_PNODE_SHIFT 6 180#define UV_APIC_PNODE_SHIFT 6
179 181
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b9..11be5ad2e0e9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
247#define EXIT_REASON_MSR_READ 31 247#define EXIT_REASON_MSR_READ 31
248#define EXIT_REASON_MSR_WRITE 32 248#define EXIT_REASON_MSR_WRITE 32
249#define EXIT_REASON_MWAIT_INSTRUCTION 36 249#define EXIT_REASON_MWAIT_INSTRUCTION 36
250#define EXIT_REASON_MCE_DURING_VMENTRY 41
250#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 251#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
251#define EXIT_REASON_APIC_ACCESS 44 252#define EXIT_REASON_APIC_ACCESS 44
252#define EXIT_REASON_EPT_VIOLATION 48 253#define EXIT_REASON_EPT_VIOLATION 48
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 235f5927bb97..4f78bd682125 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,7 @@ obj-y += process.o
44obj-y += i387.o xsave.o 44obj-y += i387.o xsave.o
45obj-y += ptrace.o 45obj-y += ptrace.o
46obj-$(CONFIG_X86_DS) += ds.o 46obj-$(CONFIG_X86_DS) += ds.o
47obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
47obj-$(CONFIG_X86_32) += tls.o 48obj-$(CONFIG_X86_32) += tls.o
48obj-$(CONFIG_IA32_EMULATION) += tls.o 49obj-$(CONFIG_IA32_EMULATION) += tls.o
49obj-y += step.o 50obj-y += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 844e5e25213b..631086159c53 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
985 985
986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 986 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
987 mp_ioapics[idx].apicid = uniq_ioapic_id(id); 987 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
988#ifdef CONFIG_X86_32
989 mp_ioapics[idx].apicver = io_apic_get_version(idx); 988 mp_ioapics[idx].apicver = io_apic_get_version(idx);
990#else 989
991 mp_ioapics[idx].apicver = 0;
992#endif
993 /* 990 /*
994 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 991 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
995 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 992 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9def..167bc16ce0e5 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
9always := wakeup.bin 9always := wakeup.bin
10targets := wakeup.elf wakeup.lds 10targets := wakeup.elf wakeup.lds
11 11
12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o 12wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
13 13
14# The link order of the video-*.o modules can matter. In particular, 14# The link order of the video-*.o modules can matter. In particular,
15# video-vga.o *must* be listed first, followed by video-vesa.o. 15# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 000000000000..f51eb0bb56ce
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 000000000000..6206033ba202
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52..1c60554537c3 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 55static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
56 struct unity_map_entry *e); 56 struct unity_map_entry *e);
57static struct dma_ops_domain *find_protection_domain(u16 devid); 57static struct dma_ops_domain *find_protection_domain(u16 devid);
58static u64* alloc_pte(struct protection_domain *dom,
59 unsigned long address, u64
60 **pte_page, gfp_t gfp);
61static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
62 unsigned long start_page,
63 unsigned int pages);
58 64
65#ifndef BUS_NOTIFY_UNBOUND_DRIVER
66#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
67#endif
59 68
60#ifdef CONFIG_AMD_IOMMU_STATS 69#ifdef CONFIG_AMD_IOMMU_STATS
61 70
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
213{ 222{
214 struct amd_iommu *iommu; 223 struct amd_iommu *iommu;
215 224
216 list_for_each_entry(iommu, &amd_iommu_list, list) 225 for_each_iommu(iommu)
217 iommu_poll_events(iommu); 226 iommu_poll_events(iommu);
218 227
219 return IRQ_HANDLED; 228 return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
440 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 449 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
441 domid, 1, 1); 450 domid, 1, 1);
442 451
443 list_for_each_entry(iommu, &amd_iommu_list, list) { 452 for_each_iommu(iommu) {
444 spin_lock_irqsave(&iommu->lock, flags); 453 spin_lock_irqsave(&iommu->lock, flags);
445 __iommu_queue_command(iommu, &cmd); 454 __iommu_queue_command(iommu, &cmd);
446 __iommu_completion_wait(iommu); 455 __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
449 } 458 }
450} 459}
451 460
461void amd_iommu_flush_all_domains(void)
462{
463 int i;
464
465 for (i = 1; i < MAX_DOMAIN_ID; ++i) {
466 if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
467 continue;
468 iommu_flush_domain(i);
469 }
470}
471
472void amd_iommu_flush_all_devices(void)
473{
474 struct amd_iommu *iommu;
475 int i;
476
477 for (i = 0; i <= amd_iommu_last_bdf; ++i) {
478 if (amd_iommu_pd_table[i] == NULL)
479 continue;
480
481 iommu = amd_iommu_rlookup_table[i];
482 if (!iommu)
483 continue;
484
485 iommu_queue_inv_dev_entry(iommu, i);
486 iommu_completion_wait(iommu);
487 }
488}
489
452/**************************************************************************** 490/****************************************************************************
453 * 491 *
454 * The functions below are used the create the page table mappings for 492 * The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
468 unsigned long phys_addr, 506 unsigned long phys_addr,
469 int prot) 507 int prot)
470{ 508{
471 u64 __pte, *pte, *page; 509 u64 __pte, *pte;
472 510
473 bus_addr = PAGE_ALIGN(bus_addr); 511 bus_addr = PAGE_ALIGN(bus_addr);
474 phys_addr = PAGE_ALIGN(phys_addr); 512 phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
477 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) 515 if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
478 return -EINVAL; 516 return -EINVAL;
479 517
480 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; 518 pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
481
482 if (!IOMMU_PTE_PRESENT(*pte)) {
483 page = (u64 *)get_zeroed_page(GFP_KERNEL);
484 if (!page)
485 return -ENOMEM;
486 *pte = IOMMU_L2_PDE(virt_to_phys(page));
487 }
488
489 pte = IOMMU_PTE_PAGE(*pte);
490 pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
491
492 if (!IOMMU_PTE_PRESENT(*pte)) {
493 page = (u64 *)get_zeroed_page(GFP_KERNEL);
494 if (!page)
495 return -ENOMEM;
496 *pte = IOMMU_L1_PDE(virt_to_phys(page));
497 }
498
499 pte = IOMMU_PTE_PAGE(*pte);
500 pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
501 519
502 if (IOMMU_PTE_PRESENT(*pte)) 520 if (IOMMU_PTE_PRESENT(*pte))
503 return -EBUSY; 521 return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
595 * as allocated in the aperture 613 * as allocated in the aperture
596 */ 614 */
597 if (addr < dma_dom->aperture_size) 615 if (addr < dma_dom->aperture_size)
598 __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); 616 __set_bit(addr >> PAGE_SHIFT,
617 dma_dom->aperture[0]->bitmap);
599 } 618 }
600 619
601 return 0; 620 return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
632 ****************************************************************************/ 651 ****************************************************************************/
633 652
634/* 653/*
635 * The address allocator core function. 654 * The address allocator core functions.
636 * 655 *
637 * called with domain->lock held 656 * called with domain->lock held
638 */ 657 */
658
659/*
660 * This function checks if there is a PTE for a given dma address. If
661 * there is one, it returns the pointer to it.
662 */
663static u64* fetch_pte(struct protection_domain *domain,
664 unsigned long address)
665{
666 u64 *pte;
667
668 pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
669
670 if (!IOMMU_PTE_PRESENT(*pte))
671 return NULL;
672
673 pte = IOMMU_PTE_PAGE(*pte);
674 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
675
676 if (!IOMMU_PTE_PRESENT(*pte))
677 return NULL;
678
679 pte = IOMMU_PTE_PAGE(*pte);
680 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
681
682 return pte;
683}
684
685/*
686 * This function is used to add a new aperture range to an existing
687 * aperture in case of dma_ops domain allocation or address allocation
688 * failure.
689 */
690static int alloc_new_range(struct amd_iommu *iommu,
691 struct dma_ops_domain *dma_dom,
692 bool populate, gfp_t gfp)
693{
694 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
695 int i;
696
697#ifdef CONFIG_IOMMU_STRESS
698 populate = false;
699#endif
700
701 if (index >= APERTURE_MAX_RANGES)
702 return -ENOMEM;
703
704 dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
705 if (!dma_dom->aperture[index])
706 return -ENOMEM;
707
708 dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
709 if (!dma_dom->aperture[index]->bitmap)
710 goto out_free;
711
712 dma_dom->aperture[index]->offset = dma_dom->aperture_size;
713
714 if (populate) {
715 unsigned long address = dma_dom->aperture_size;
716 int i, num_ptes = APERTURE_RANGE_PAGES / 512;
717 u64 *pte, *pte_page;
718
719 for (i = 0; i < num_ptes; ++i) {
720 pte = alloc_pte(&dma_dom->domain, address,
721 &pte_page, gfp);
722 if (!pte)
723 goto out_free;
724
725 dma_dom->aperture[index]->pte_pages[i] = pte_page;
726
727 address += APERTURE_RANGE_SIZE / 64;
728 }
729 }
730
731 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
732
733 /* Intialize the exclusion range if necessary */
734 if (iommu->exclusion_start &&
735 iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
736 iommu->exclusion_start < dma_dom->aperture_size) {
737 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
738 int pages = iommu_num_pages(iommu->exclusion_start,
739 iommu->exclusion_length,
740 PAGE_SIZE);
741 dma_ops_reserve_addresses(dma_dom, startpage, pages);
742 }
743
744 /*
745 * Check for areas already mapped as present in the new aperture
746 * range and mark those pages as reserved in the allocator. Such
747 * mappings may already exist as a result of requested unity
748 * mappings for devices.
749 */
750 for (i = dma_dom->aperture[index]->offset;
751 i < dma_dom->aperture_size;
752 i += PAGE_SIZE) {
753 u64 *pte = fetch_pte(&dma_dom->domain, i);
754 if (!pte || !IOMMU_PTE_PRESENT(*pte))
755 continue;
756
757 dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
758 }
759
760 return 0;
761
762out_free:
763 free_page((unsigned long)dma_dom->aperture[index]->bitmap);
764
765 kfree(dma_dom->aperture[index]);
766 dma_dom->aperture[index] = NULL;
767
768 return -ENOMEM;
769}
770
771static unsigned long dma_ops_area_alloc(struct device *dev,
772 struct dma_ops_domain *dom,
773 unsigned int pages,
774 unsigned long align_mask,
775 u64 dma_mask,
776 unsigned long start)
777{
778 unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
779 int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
780 int i = start >> APERTURE_RANGE_SHIFT;
781 unsigned long boundary_size;
782 unsigned long address = -1;
783 unsigned long limit;
784
785 next_bit >>= PAGE_SHIFT;
786
787 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
788 PAGE_SIZE) >> PAGE_SHIFT;
789
790 for (;i < max_index; ++i) {
791 unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
792
793 if (dom->aperture[i]->offset >= dma_mask)
794 break;
795
796 limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
797 dma_mask >> PAGE_SHIFT);
798
799 address = iommu_area_alloc(dom->aperture[i]->bitmap,
800 limit, next_bit, pages, 0,
801 boundary_size, align_mask);
802 if (address != -1) {
803 address = dom->aperture[i]->offset +
804 (address << PAGE_SHIFT);
805 dom->next_address = address + (pages << PAGE_SHIFT);
806 break;
807 }
808
809 next_bit = 0;
810 }
811
812 return address;
813}
814
639static unsigned long dma_ops_alloc_addresses(struct device *dev, 815static unsigned long dma_ops_alloc_addresses(struct device *dev,
640 struct dma_ops_domain *dom, 816 struct dma_ops_domain *dom,
641 unsigned int pages, 817 unsigned int pages,
642 unsigned long align_mask, 818 unsigned long align_mask,
643 u64 dma_mask) 819 u64 dma_mask)
644{ 820{
645 unsigned long limit;
646 unsigned long address; 821 unsigned long address;
647 unsigned long boundary_size;
648 822
649 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 823#ifdef CONFIG_IOMMU_STRESS
650 PAGE_SIZE) >> PAGE_SHIFT; 824 dom->next_address = 0;
651 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, 825 dom->need_flush = true;
652 dma_mask >> PAGE_SHIFT); 826#endif
653 827
654 if (dom->next_bit >= limit) { 828 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
655 dom->next_bit = 0; 829 dma_mask, dom->next_address);
656 dom->need_flush = true;
657 }
658 830
659 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
660 0 , boundary_size, align_mask);
661 if (address == -1) { 831 if (address == -1) {
662 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 832 dom->next_address = 0;
663 0, boundary_size, align_mask); 833 address = dma_ops_area_alloc(dev, dom, pages, align_mask,
834 dma_mask, 0);
664 dom->need_flush = true; 835 dom->need_flush = true;
665 } 836 }
666 837
667 if (likely(address != -1)) { 838 if (unlikely(address == -1))
668 dom->next_bit = address + pages;
669 address <<= PAGE_SHIFT;
670 } else
671 address = bad_dma_address; 839 address = bad_dma_address;
672 840
673 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 841 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
684 unsigned long address, 852 unsigned long address,
685 unsigned int pages) 853 unsigned int pages)
686{ 854{
687 address >>= PAGE_SHIFT; 855 unsigned i = address >> APERTURE_RANGE_SHIFT;
688 iommu_area_free(dom->bitmap, address, pages); 856 struct aperture_range *range = dom->aperture[i];
689 857
690 if (address >= dom->next_bit) 858 BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
859
860#ifdef CONFIG_IOMMU_STRESS
861 if (i < 4)
862 return;
863#endif
864
865 if (address >= dom->next_address)
691 dom->need_flush = true; 866 dom->need_flush = true;
867
868 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
869
870 iommu_area_free(range->bitmap, address, pages);
871
692} 872}
693 873
694/**************************************************************************** 874/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
736 unsigned long start_page, 916 unsigned long start_page,
737 unsigned int pages) 917 unsigned int pages)
738{ 918{
739 unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; 919 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
740 920
741 if (start_page + pages > last_page) 921 if (start_page + pages > last_page)
742 pages = last_page - start_page; 922 pages = last_page - start_page;
743 923
744 iommu_area_reserve(dom->bitmap, start_page, pages); 924 for (i = start_page; i < start_page + pages; ++i) {
925 int index = i / APERTURE_RANGE_PAGES;
926 int page = i % APERTURE_RANGE_PAGES;
927 __set_bit(page, dom->aperture[index]->bitmap);
928 }
745} 929}
746 930
747static void free_pagetable(struct protection_domain *domain) 931static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
780 */ 964 */
781static void dma_ops_domain_free(struct dma_ops_domain *dom) 965static void dma_ops_domain_free(struct dma_ops_domain *dom)
782{ 966{
967 int i;
968
783 if (!dom) 969 if (!dom)
784 return; 970 return;
785 971
786 free_pagetable(&dom->domain); 972 free_pagetable(&dom->domain);
787 973
788 kfree(dom->pte_pages); 974 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
789 975 if (!dom->aperture[i])
790 kfree(dom->bitmap); 976 continue;
977 free_page((unsigned long)dom->aperture[i]->bitmap);
978 kfree(dom->aperture[i]);
979 }
791 980
792 kfree(dom); 981 kfree(dom);
793} 982}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
797 * It also intializes the page table and the address allocator data 986 * It also intializes the page table and the address allocator data
798 * structures required for the dma_ops interface 987 * structures required for the dma_ops interface
799 */ 988 */
800static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 989static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
801 unsigned order)
802{ 990{
803 struct dma_ops_domain *dma_dom; 991 struct dma_ops_domain *dma_dom;
804 unsigned i, num_pte_pages;
805 u64 *l2_pde;
806 u64 address;
807
808 /*
809 * Currently the DMA aperture must be between 32 MB and 1GB in size
810 */
811 if ((order < 25) || (order > 30))
812 return NULL;
813 992
814 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); 993 dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
815 if (!dma_dom) 994 if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
826 dma_dom->domain.priv = dma_dom; 1005 dma_dom->domain.priv = dma_dom;
827 if (!dma_dom->domain.pt_root) 1006 if (!dma_dom->domain.pt_root)
828 goto free_dma_dom; 1007 goto free_dma_dom;
829 dma_dom->aperture_size = (1ULL << order);
830 dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
831 GFP_KERNEL);
832 if (!dma_dom->bitmap)
833 goto free_dma_dom;
834 /*
835 * mark the first page as allocated so we never return 0 as
836 * a valid dma-address. So we can use 0 as error value
837 */
838 dma_dom->bitmap[0] = 1;
839 dma_dom->next_bit = 0;
840 1008
841 dma_dom->need_flush = false; 1009 dma_dom->need_flush = false;
842 dma_dom->target_dev = 0xffff; 1010 dma_dom->target_dev = 0xffff;
843 1011
844 /* Intialize the exclusion range if necessary */ 1012 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
845 if (iommu->exclusion_start && 1013 goto free_dma_dom;
846 iommu->exclusion_start < dma_dom->aperture_size) {
847 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
848 int pages = iommu_num_pages(iommu->exclusion_start,
849 iommu->exclusion_length,
850 PAGE_SIZE);
851 dma_ops_reserve_addresses(dma_dom, startpage, pages);
852 }
853 1014
854 /* 1015 /*
855 * At the last step, build the page tables so we don't need to 1016 * mark the first page as allocated so we never return 0 as
856 * allocate page table pages in the dma_ops mapping/unmapping 1017 * a valid dma-address. So we can use 0 as error value
857 * path.
858 */ 1018 */
859 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 1019 dma_dom->aperture[0]->bitmap[0] = 1;
860 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 1020 dma_dom->next_address = 0;
861 GFP_KERNEL);
862 if (!dma_dom->pte_pages)
863 goto free_dma_dom;
864
865 l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
866 if (l2_pde == NULL)
867 goto free_dma_dom;
868 1021
869 dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
870
871 for (i = 0; i < num_pte_pages; ++i) {
872 dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
873 if (!dma_dom->pte_pages[i])
874 goto free_dma_dom;
875 address = virt_to_phys(dma_dom->pte_pages[i]);
876 l2_pde[i] = IOMMU_L1_PDE(address);
877 }
878 1022
879 return dma_dom; 1023 return dma_dom;
880 1024
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
983 struct protection_domain *domain; 1127 struct protection_domain *domain;
984 struct dma_ops_domain *dma_domain; 1128 struct dma_ops_domain *dma_domain;
985 struct amd_iommu *iommu; 1129 struct amd_iommu *iommu;
986 int order = amd_iommu_aperture_order;
987 unsigned long flags; 1130 unsigned long flags;
988 1131
989 if (devid > amd_iommu_last_bdf) 1132 if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
1002 "to a non-dma-ops domain\n", dev_name(dev)); 1145 "to a non-dma-ops domain\n", dev_name(dev));
1003 1146
1004 switch (action) { 1147 switch (action) {
1005 case BUS_NOTIFY_BOUND_DRIVER: 1148 case BUS_NOTIFY_UNBOUND_DRIVER:
1006 if (domain)
1007 goto out;
1008 dma_domain = find_protection_domain(devid);
1009 if (!dma_domain)
1010 dma_domain = iommu->default_dom;
1011 attach_device(iommu, &dma_domain->domain, devid);
1012 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
1013 "device %s\n", dma_domain->domain.id, dev_name(dev));
1014 break;
1015 case BUS_NOTIFY_UNBIND_DRIVER:
1016 if (!domain) 1149 if (!domain)
1017 goto out; 1150 goto out;
1018 detach_device(domain, devid); 1151 detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
1022 dma_domain = find_protection_domain(devid); 1155 dma_domain = find_protection_domain(devid);
1023 if (dma_domain) 1156 if (dma_domain)
1024 goto out; 1157 goto out;
1025 dma_domain = dma_ops_domain_alloc(iommu, order); 1158 dma_domain = dma_ops_domain_alloc(iommu);
1026 if (!dma_domain) 1159 if (!dma_domain)
1027 goto out; 1160 goto out;
1028 dma_domain->target_dev = devid; 1161 dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
1133 dma_dom = (*iommu)->default_dom; 1266 dma_dom = (*iommu)->default_dom;
1134 *domain = &dma_dom->domain; 1267 *domain = &dma_dom->domain;
1135 attach_device(*iommu, *domain, *bdf); 1268 attach_device(*iommu, *domain, *bdf);
1136 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 1269 DUMP_printk("Using protection domain %d for device %s\n",
1137 "device %s\n", (*domain)->id, dev_name(dev)); 1270 (*domain)->id, dev_name(dev));
1138 } 1271 }
1139 1272
1140 if (domain_for_device(_bdf) == NULL) 1273 if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
1144} 1277}
1145 1278
1146/* 1279/*
1280 * If the pte_page is not yet allocated this function is called
1281 */
1282static u64* alloc_pte(struct protection_domain *dom,
1283 unsigned long address, u64 **pte_page, gfp_t gfp)
1284{
1285 u64 *pte, *page;
1286
1287 pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
1288
1289 if (!IOMMU_PTE_PRESENT(*pte)) {
1290 page = (u64 *)get_zeroed_page(gfp);
1291 if (!page)
1292 return NULL;
1293 *pte = IOMMU_L2_PDE(virt_to_phys(page));
1294 }
1295
1296 pte = IOMMU_PTE_PAGE(*pte);
1297 pte = &pte[IOMMU_PTE_L1_INDEX(address)];
1298
1299 if (!IOMMU_PTE_PRESENT(*pte)) {
1300 page = (u64 *)get_zeroed_page(gfp);
1301 if (!page)
1302 return NULL;
1303 *pte = IOMMU_L1_PDE(virt_to_phys(page));
1304 }
1305
1306 pte = IOMMU_PTE_PAGE(*pte);
1307
1308 if (pte_page)
1309 *pte_page = pte;
1310
1311 pte = &pte[IOMMU_PTE_L0_INDEX(address)];
1312
1313 return pte;
1314}
1315
1316/*
1317 * This function fetches the PTE for a given address in the aperture
1318 */
1319static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1320 unsigned long address)
1321{
1322 struct aperture_range *aperture;
1323 u64 *pte, *pte_page;
1324
1325 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1326 if (!aperture)
1327 return NULL;
1328
1329 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1330 if (!pte) {
1331 pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
1332 aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
1333 } else
1334 pte += IOMMU_PTE_L0_INDEX(address);
1335
1336 return pte;
1337}
1338
1339/*
1147 * This is the generic map function. It maps one 4kb page at paddr to 1340 * This is the generic map function. It maps one 4kb page at paddr to
1148 * the given address in the DMA address space for the domain. 1341 * the given address in the DMA address space for the domain.
1149 */ 1342 */
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1159 1352
1160 paddr &= PAGE_MASK; 1353 paddr &= PAGE_MASK;
1161 1354
1162 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; 1355 pte = dma_ops_get_pte(dom, address);
1163 pte += IOMMU_PTE_L0_INDEX(address); 1356 if (!pte)
1357 return bad_dma_address;
1164 1358
1165 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1359 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1166 1360
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1185 struct dma_ops_domain *dom, 1379 struct dma_ops_domain *dom,
1186 unsigned long address) 1380 unsigned long address)
1187{ 1381{
1382 struct aperture_range *aperture;
1188 u64 *pte; 1383 u64 *pte;
1189 1384
1190 if (address >= dom->aperture_size) 1385 if (address >= dom->aperture_size)
1191 return; 1386 return;
1192 1387
1193 WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); 1388 aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
1389 if (!aperture)
1390 return;
1391
1392 pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
1393 if (!pte)
1394 return;
1194 1395
1195 pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
1196 pte += IOMMU_PTE_L0_INDEX(address); 1396 pte += IOMMU_PTE_L0_INDEX(address);
1197 1397
1198 WARN_ON(!*pte); 1398 WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
1216 u64 dma_mask) 1416 u64 dma_mask)
1217{ 1417{
1218 dma_addr_t offset = paddr & ~PAGE_MASK; 1418 dma_addr_t offset = paddr & ~PAGE_MASK;
1219 dma_addr_t address, start; 1419 dma_addr_t address, start, ret;
1220 unsigned int pages; 1420 unsigned int pages;
1221 unsigned long align_mask = 0; 1421 unsigned long align_mask = 0;
1222 int i; 1422 int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
1232 if (align) 1432 if (align)
1233 align_mask = (1UL << get_order(size)) - 1; 1433 align_mask = (1UL << get_order(size)) - 1;
1234 1434
1435retry:
1235 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1436 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1236 dma_mask); 1437 dma_mask);
1237 if (unlikely(address == bad_dma_address)) 1438 if (unlikely(address == bad_dma_address)) {
1238 goto out; 1439 /*
1440 * setting next_address here will let the address
1441 * allocator only scan the new allocated range in the
1442 * first run. This is a small optimization.
1443 */
1444 dma_dom->next_address = dma_dom->aperture_size;
1445
1446 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
1447 goto out;
1448
1449 /*
1450 * aperture was sucessfully enlarged by 128 MB, try
1451 * allocation again
1452 */
1453 goto retry;
1454 }
1239 1455
1240 start = address; 1456 start = address;
1241 for (i = 0; i < pages; ++i) { 1457 for (i = 0; i < pages; ++i) {
1242 dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1458 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
1459 if (ret == bad_dma_address)
1460 goto out_unmap;
1461
1243 paddr += PAGE_SIZE; 1462 paddr += PAGE_SIZE;
1244 start += PAGE_SIZE; 1463 start += PAGE_SIZE;
1245 } 1464 }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
1255 1474
1256out: 1475out:
1257 return address; 1476 return address;
1477
1478out_unmap:
1479
1480 for (--i; i >= 0; --i) {
1481 start -= PAGE_SIZE;
1482 dma_ops_domain_unmap(iommu, dma_dom, start);
1483 }
1484
1485 dma_ops_free_addresses(dma_dom, address, pages);
1486
1487 return bad_dma_address;
1258} 1488}
1259 1489
1260/* 1490/*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
1537 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1767 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1538 size, DMA_BIDIRECTIONAL, true, dma_mask); 1768 size, DMA_BIDIRECTIONAL, true, dma_mask);
1539 1769
1540 if (*dma_addr == bad_dma_address) 1770 if (*dma_addr == bad_dma_address) {
1771 spin_unlock_irqrestore(&domain->lock, flags);
1541 goto out_free; 1772 goto out_free;
1773 }
1542 1774
1543 iommu_completion_wait(iommu); 1775 iommu_completion_wait(iommu);
1544 1776
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
1625 struct pci_dev *dev = NULL; 1857 struct pci_dev *dev = NULL;
1626 struct dma_ops_domain *dma_dom; 1858 struct dma_ops_domain *dma_dom;
1627 struct amd_iommu *iommu; 1859 struct amd_iommu *iommu;
1628 int order = amd_iommu_aperture_order;
1629 u16 devid; 1860 u16 devid;
1630 1861
1631 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1862 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
1638 iommu = amd_iommu_rlookup_table[devid]; 1869 iommu = amd_iommu_rlookup_table[devid];
1639 if (!iommu) 1870 if (!iommu)
1640 continue; 1871 continue;
1641 dma_dom = dma_ops_domain_alloc(iommu, order); 1872 dma_dom = dma_ops_domain_alloc(iommu);
1642 if (!dma_dom) 1873 if (!dma_dom)
1643 continue; 1874 continue;
1644 init_unity_mappings_for_device(dma_dom, devid); 1875 init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
1664int __init amd_iommu_init_dma_ops(void) 1895int __init amd_iommu_init_dma_ops(void)
1665{ 1896{
1666 struct amd_iommu *iommu; 1897 struct amd_iommu *iommu;
1667 int order = amd_iommu_aperture_order;
1668 int ret; 1898 int ret;
1669 1899
1670 /* 1900 /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
1672 * found in the system. Devices not assigned to any other 1902 * found in the system. Devices not assigned to any other
1673 * protection domain will be assigned to the default one. 1903 * protection domain will be assigned to the default one.
1674 */ 1904 */
1675 list_for_each_entry(iommu, &amd_iommu_list, list) { 1905 for_each_iommu(iommu) {
1676 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1906 iommu->default_dom = dma_ops_domain_alloc(iommu);
1677 if (iommu->default_dom == NULL) 1907 if (iommu->default_dom == NULL)
1678 return -ENOMEM; 1908 return -ENOMEM;
1679 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 1909 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
1710 1940
1711free_domains: 1941free_domains:
1712 1942
1713 list_for_each_entry(iommu, &amd_iommu_list, list) { 1943 for_each_iommu(iommu) {
1714 if (iommu->default_dom) 1944 if (iommu->default_dom)
1715 dma_ops_domain_free(iommu->default_dom); 1945 dma_ops_domain_free(iommu->default_dom);
1716 } 1946 }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
1842 2072
1843 old_domain = domain_for_device(devid); 2073 old_domain = domain_for_device(devid);
1844 if (old_domain) 2074 if (old_domain)
1845 return -EBUSY; 2075 detach_device(old_domain, devid);
1846 2076
1847 attach_device(iommu, domain, devid); 2077 attach_device(iommu, domain, devid);
1848 2078
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dac..238989ec077d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
115 u64 range_length; 115 u64 range_length;
116} __attribute__((packed)); 116} __attribute__((packed));
117 117
118bool amd_iommu_dump;
119
118static int __initdata amd_iommu_detected; 120static int __initdata amd_iommu_detected;
119 121
120u16 amd_iommu_last_bdf; /* largest PCI device id we have 122u16 amd_iommu_last_bdf; /* largest PCI device id we have
121 to handle */ 123 to handle */
122LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
123 we find in ACPI */ 125 we find in ACPI */
124unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
125bool amd_iommu_isolate = true; /* if true, device isolation is 129bool amd_iommu_isolate = true; /* if true, device isolation is
126 enabled */ 130 enabled */
131#endif
132
127bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
128 134
129LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
175static inline unsigned long tbl_size(int entry_size) 181static inline unsigned long tbl_size(int entry_size)
176{ 182{
177 unsigned shift = PAGE_SHIFT + 183 unsigned shift = PAGE_SHIFT +
178 get_order(amd_iommu_last_bdf * entry_size); 184 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
179 185
180 return 1UL << shift; 186 return 1UL << shift;
181} 187}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
193 * This function set the exclusion range in the IOMMU. DMA accesses to the 199 * This function set the exclusion range in the IOMMU. DMA accesses to the
194 * exclusion range are passed through untranslated 200 * exclusion range are passed through untranslated
195 */ 201 */
196static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 202static void iommu_set_exclusion_range(struct amd_iommu *iommu)
197{ 203{
198 u64 start = iommu->exclusion_start & PAGE_MASK; 204 u64 start = iommu->exclusion_start & PAGE_MASK;
199 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; 205 u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
225} 231}
226 232
227/* Generic functions to enable/disable certain features of the IOMMU. */ 233/* Generic functions to enable/disable certain features of the IOMMU. */
228static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 234static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
229{ 235{
230 u32 ctrl; 236 u32 ctrl;
231 237
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244} 250}
245 251
246/* Function to enable the hardware */ 252/* Function to enable the hardware */
247static void __init iommu_enable(struct amd_iommu *iommu) 253static void iommu_enable(struct amd_iommu *iommu)
248{ 254{
249 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", 255 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
250 dev_name(&iommu->dev->dev), iommu->cap_ptr); 256 dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
252 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 258 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
253} 259}
254 260
255/* Function to enable IOMMU event logging and event interrupts */ 261static void iommu_disable(struct amd_iommu *iommu)
256static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
257{ 262{
258 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); 263 iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
259 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
260} 264}
261 265
262/* 266/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
413{ 417{
414 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 418 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
415 get_order(CMD_BUFFER_SIZE)); 419 get_order(CMD_BUFFER_SIZE));
416 u64 entry;
417 420
418 if (cmd_buf == NULL) 421 if (cmd_buf == NULL)
419 return NULL; 422 return NULL;
420 423
421 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 424 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
422 425
423 entry = (u64)virt_to_phys(cmd_buf); 426 return cmd_buf;
427}
428
429/*
430 * This function writes the command buffer address to the hardware and
431 * enables it.
432 */
433static void iommu_enable_command_buffer(struct amd_iommu *iommu)
434{
435 u64 entry;
436
437 BUG_ON(iommu->cmd_buf == NULL);
438
439 entry = (u64)virt_to_phys(iommu->cmd_buf);
424 entry |= MMIO_CMD_SIZE_512; 440 entry |= MMIO_CMD_SIZE_512;
441
425 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 442 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
426 &entry, sizeof(entry)); 443 &entry, sizeof(entry));
427 444
428 /* set head and tail to zero manually */ 445 /* set head and tail to zero manually */
429 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 446 writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
430 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 447 writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
431 448
432 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); 449 iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
433
434 return cmd_buf;
435} 450}
436 451
437static void __init free_command_buffer(struct amd_iommu *iommu) 452static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
443/* allocates the memory where the IOMMU will log its events to */ 458/* allocates the memory where the IOMMU will log its events to */
444static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) 459static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
445{ 460{
446 u64 entry;
447 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 461 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
448 get_order(EVT_BUFFER_SIZE)); 462 get_order(EVT_BUFFER_SIZE));
449 463
450 if (iommu->evt_buf == NULL) 464 if (iommu->evt_buf == NULL)
451 return NULL; 465 return NULL;
452 466
467 return iommu->evt_buf;
468}
469
470static void iommu_enable_event_buffer(struct amd_iommu *iommu)
471{
472 u64 entry;
473
474 BUG_ON(iommu->evt_buf == NULL);
475
453 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 476 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
477
454 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 478 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
455 &entry, sizeof(entry)); 479 &entry, sizeof(entry));
456 480
457 iommu->evt_buf_size = EVT_BUFFER_SIZE; 481 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
458
459 return iommu->evt_buf;
460} 482}
461 483
462static void __init free_event_buffer(struct amd_iommu *iommu) 484static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
596 p += sizeof(struct ivhd_header); 618 p += sizeof(struct ivhd_header);
597 end += h->length; 619 end += h->length;
598 620
621
599 while (p < end) { 622 while (p < end) {
600 e = (struct ivhd_entry *)p; 623 e = (struct ivhd_entry *)p;
601 switch (e->type) { 624 switch (e->type) {
602 case IVHD_DEV_ALL: 625 case IVHD_DEV_ALL:
626
627 DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
628 " last device %02x:%02x.%x flags: %02x\n",
629 PCI_BUS(iommu->first_device),
630 PCI_SLOT(iommu->first_device),
631 PCI_FUNC(iommu->first_device),
632 PCI_BUS(iommu->last_device),
633 PCI_SLOT(iommu->last_device),
634 PCI_FUNC(iommu->last_device),
635 e->flags);
636
603 for (dev_i = iommu->first_device; 637 for (dev_i = iommu->first_device;
604 dev_i <= iommu->last_device; ++dev_i) 638 dev_i <= iommu->last_device; ++dev_i)
605 set_dev_entry_from_acpi(iommu, dev_i, 639 set_dev_entry_from_acpi(iommu, dev_i,
606 e->flags, 0); 640 e->flags, 0);
607 break; 641 break;
608 case IVHD_DEV_SELECT: 642 case IVHD_DEV_SELECT:
643
644 DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
645 "flags: %02x\n",
646 PCI_BUS(e->devid),
647 PCI_SLOT(e->devid),
648 PCI_FUNC(e->devid),
649 e->flags);
650
609 devid = e->devid; 651 devid = e->devid;
610 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 652 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
611 break; 653 break;
612 case IVHD_DEV_SELECT_RANGE_START: 654 case IVHD_DEV_SELECT_RANGE_START:
655
656 DUMP_printk(" DEV_SELECT_RANGE_START\t "
657 "devid: %02x:%02x.%x flags: %02x\n",
658 PCI_BUS(e->devid),
659 PCI_SLOT(e->devid),
660 PCI_FUNC(e->devid),
661 e->flags);
662
613 devid_start = e->devid; 663 devid_start = e->devid;
614 flags = e->flags; 664 flags = e->flags;
615 ext_flags = 0; 665 ext_flags = 0;
616 alias = false; 666 alias = false;
617 break; 667 break;
618 case IVHD_DEV_ALIAS: 668 case IVHD_DEV_ALIAS:
669
670 DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
671 "flags: %02x devid_to: %02x:%02x.%x\n",
672 PCI_BUS(e->devid),
673 PCI_SLOT(e->devid),
674 PCI_FUNC(e->devid),
675 e->flags,
676 PCI_BUS(e->ext >> 8),
677 PCI_SLOT(e->ext >> 8),
678 PCI_FUNC(e->ext >> 8));
679
619 devid = e->devid; 680 devid = e->devid;
620 devid_to = e->ext >> 8; 681 devid_to = e->ext >> 8;
621 set_dev_entry_from_acpi(iommu, devid, e->flags, 0); 682 set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
622 amd_iommu_alias_table[devid] = devid_to; 683 amd_iommu_alias_table[devid] = devid_to;
623 break; 684 break;
624 case IVHD_DEV_ALIAS_RANGE: 685 case IVHD_DEV_ALIAS_RANGE:
686
687 DUMP_printk(" DEV_ALIAS_RANGE\t\t "
688 "devid: %02x:%02x.%x flags: %02x "
689 "devid_to: %02x:%02x.%x\n",
690 PCI_BUS(e->devid),
691 PCI_SLOT(e->devid),
692 PCI_FUNC(e->devid),
693 e->flags,
694 PCI_BUS(e->ext >> 8),
695 PCI_SLOT(e->ext >> 8),
696 PCI_FUNC(e->ext >> 8));
697
625 devid_start = e->devid; 698 devid_start = e->devid;
626 flags = e->flags; 699 flags = e->flags;
627 devid_to = e->ext >> 8; 700 devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
629 alias = true; 702 alias = true;
630 break; 703 break;
631 case IVHD_DEV_EXT_SELECT: 704 case IVHD_DEV_EXT_SELECT:
705
706 DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
707 "flags: %02x ext: %08x\n",
708 PCI_BUS(e->devid),
709 PCI_SLOT(e->devid),
710 PCI_FUNC(e->devid),
711 e->flags, e->ext);
712
632 devid = e->devid; 713 devid = e->devid;
633 set_dev_entry_from_acpi(iommu, devid, e->flags, 714 set_dev_entry_from_acpi(iommu, devid, e->flags,
634 e->ext); 715 e->ext);
635 break; 716 break;
636 case IVHD_DEV_EXT_SELECT_RANGE: 717 case IVHD_DEV_EXT_SELECT_RANGE:
718
719 DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
720 "%02x:%02x.%x flags: %02x ext: %08x\n",
721 PCI_BUS(e->devid),
722 PCI_SLOT(e->devid),
723 PCI_FUNC(e->devid),
724 e->flags, e->ext);
725
637 devid_start = e->devid; 726 devid_start = e->devid;
638 flags = e->flags; 727 flags = e->flags;
639 ext_flags = e->ext; 728 ext_flags = e->ext;
640 alias = false; 729 alias = false;
641 break; 730 break;
642 case IVHD_DEV_RANGE_END: 731 case IVHD_DEV_RANGE_END:
732
733 DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
734 PCI_BUS(e->devid),
735 PCI_SLOT(e->devid),
736 PCI_FUNC(e->devid));
737
643 devid = e->devid; 738 devid = e->devid;
644 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 739 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
645 if (alias) 740 if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
679{ 774{
680 struct amd_iommu *iommu, *next; 775 struct amd_iommu *iommu, *next;
681 776
682 list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { 777 for_each_iommu_safe(iommu, next) {
683 list_del(&iommu->list); 778 list_del(&iommu->list);
684 free_iommu_one(iommu); 779 free_iommu_one(iommu);
685 kfree(iommu); 780 kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
710 if (!iommu->mmio_base) 805 if (!iommu->mmio_base)
711 return -ENOMEM; 806 return -ENOMEM;
712 807
713 iommu_set_device_table(iommu);
714 iommu->cmd_buf = alloc_command_buffer(iommu); 808 iommu->cmd_buf = alloc_command_buffer(iommu);
715 if (!iommu->cmd_buf) 809 if (!iommu->cmd_buf)
716 return -ENOMEM; 810 return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
746 h = (struct ivhd_header *)p; 840 h = (struct ivhd_header *)p;
747 switch (*p) { 841 switch (*p) {
748 case ACPI_IVHD_TYPE: 842 case ACPI_IVHD_TYPE:
843
844 DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
845 "seg: %d flags: %01x info %04x\n",
846 PCI_BUS(h->devid), PCI_SLOT(h->devid),
847 PCI_FUNC(h->devid), h->cap_ptr,
848 h->pci_seg, h->flags, h->info);
849 DUMP_printk(" mmio-addr: %016llx\n",
850 h->mmio_phys);
851
749 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 852 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
750 if (iommu == NULL) 853 if (iommu == NULL)
751 return -ENOMEM; 854 return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
773 * 876 *
774 ****************************************************************************/ 877 ****************************************************************************/
775 878
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu) 879static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{ 880{
818 int r; 881 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826 882
827 if (pci_enable_msi(iommu->dev)) 883 if (pci_enable_msi(iommu->dev))
828 return 1; 884 return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
837 return 1; 893 return 1;
838 } 894 }
839 895
896 iommu->int_enabled = true;
897 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
898
840 return 0; 899 return 0;
841} 900}
842 901
843static int __init iommu_init_msi(struct amd_iommu *iommu) 902static int iommu_init_msi(struct amd_iommu *iommu)
844{ 903{
845 if (iommu->int_enabled) 904 if (iommu->int_enabled)
846 return 0; 905 return 0;
847 906
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) 907 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu); 908 return iommu_setup_msi(iommu);
852 909
853 return 1; 910 return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
899static int __init init_unity_map_range(struct ivmd_header *m) 956static int __init init_unity_map_range(struct ivmd_header *m)
900{ 957{
901 struct unity_map_entry *e = 0; 958 struct unity_map_entry *e = 0;
959 char *s;
902 960
903 e = kzalloc(sizeof(*e), GFP_KERNEL); 961 e = kzalloc(sizeof(*e), GFP_KERNEL);
904 if (e == NULL) 962 if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
906 964
907 switch (m->type) { 965 switch (m->type) {
908 default: 966 default:
967 kfree(e);
968 return 0;
909 case ACPI_IVMD_TYPE: 969 case ACPI_IVMD_TYPE:
970 s = "IVMD_TYPEi\t\t\t";
910 e->devid_start = e->devid_end = m->devid; 971 e->devid_start = e->devid_end = m->devid;
911 break; 972 break;
912 case ACPI_IVMD_TYPE_ALL: 973 case ACPI_IVMD_TYPE_ALL:
974 s = "IVMD_TYPE_ALL\t\t";
913 e->devid_start = 0; 975 e->devid_start = 0;
914 e->devid_end = amd_iommu_last_bdf; 976 e->devid_end = amd_iommu_last_bdf;
915 break; 977 break;
916 case ACPI_IVMD_TYPE_RANGE: 978 case ACPI_IVMD_TYPE_RANGE:
979 s = "IVMD_TYPE_RANGE\t\t";
917 e->devid_start = m->devid; 980 e->devid_start = m->devid;
918 e->devid_end = m->aux; 981 e->devid_end = m->aux;
919 break; 982 break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
922 e->address_end = e->address_start + PAGE_ALIGN(m->range_length); 985 e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
923 e->prot = m->flags >> 1; 986 e->prot = m->flags >> 1;
924 987
988 DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
989 " range_start: %016llx range_end: %016llx flags: %x\n", s,
990 PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
991 PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
992 PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
993 e->address_start, e->address_end, m->flags);
994
925 list_add_tail(&e->list, &amd_iommu_unity_map); 995 list_add_tail(&e->list, &amd_iommu_unity_map);
926 996
927 return 0; 997 return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
967 * This function finally enables all IOMMUs found in the system after 1037 * This function finally enables all IOMMUs found in the system after
968 * they have been initialized 1038 * they have been initialized
969 */ 1039 */
970static void __init enable_iommus(void) 1040static void enable_iommus(void)
971{ 1041{
972 struct amd_iommu *iommu; 1042 struct amd_iommu *iommu;
973 1043
974 list_for_each_entry(iommu, &amd_iommu_list, list) { 1044 for_each_iommu(iommu) {
1045 iommu_set_device_table(iommu);
1046 iommu_enable_command_buffer(iommu);
1047 iommu_enable_event_buffer(iommu);
975 iommu_set_exclusion_range(iommu); 1048 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu); 1049 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
978 iommu_enable(iommu); 1050 iommu_enable(iommu);
979 } 1051 }
980} 1052}
981 1053
1054static void disable_iommus(void)
1055{
1056 struct amd_iommu *iommu;
1057
1058 for_each_iommu(iommu)
1059 iommu_disable(iommu);
1060}
1061
982/* 1062/*
983 * Suspend/Resume support 1063 * Suspend/Resume support
984 * disable suspend until real resume implemented 1064 * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
986 1066
987static int amd_iommu_resume(struct sys_device *dev) 1067static int amd_iommu_resume(struct sys_device *dev)
988{ 1068{
1069 /*
1070 * Disable IOMMUs before reprogramming the hardware registers.
1071 * IOMMU is still enabled from the resume kernel.
1072 */
1073 disable_iommus();
1074
1075 /* re-load the hardware */
1076 enable_iommus();
1077
1078 /*
1079 * we have to flush after the IOMMUs are enabled because a
1080 * disabled IOMMU will never execute the commands we send
1081 */
1082 amd_iommu_flush_all_domains();
1083 amd_iommu_flush_all_devices();
1084
989 return 0; 1085 return 0;
990} 1086}
991 1087
992static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1088static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
993{ 1089{
994 return -EINVAL; 1090 /* disable IOMMUs to go out of the way for BIOS */
1091 disable_iommus();
1092
1093 return 0;
995} 1094}
996 1095
997static struct sysdev_class amd_iommu_sysdev_class = { 1096static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
1137 1236
1138 enable_iommus(); 1237 enable_iommus();
1139 1238
1140 printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
1141 (1 << (amd_iommu_aperture_order-20)));
1142
1143 printk(KERN_INFO "AMD IOMMU: device isolation "); 1239 printk(KERN_INFO "AMD IOMMU: device isolation ");
1144 if (amd_iommu_isolate) 1240 if (amd_iommu_isolate)
1145 printk("enabled\n"); 1241 printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
1211 * 1307 *
1212 ****************************************************************************/ 1308 ****************************************************************************/
1213 1309
1310static int __init parse_amd_iommu_dump(char *str)
1311{
1312 amd_iommu_dump = true;
1313
1314 return 1;
1315}
1316
1214static int __init parse_amd_iommu_options(char *str) 1317static int __init parse_amd_iommu_options(char *str)
1215{ 1318{
1216 for (; *str; ++str) { 1319 for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
1225 return 1; 1328 return 1;
1226} 1329}
1227 1330
1228static int __init parse_amd_iommu_size_options(char *str) 1331__setup("amd_iommu_dump", parse_amd_iommu_dump);
1229{
1230 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
1231
1232 if ((order > 24) && (order < 31))
1233 amd_iommu_aperture_order = order;
1234
1235 return 1;
1236}
1237
1238__setup("amd_iommu=", parse_amd_iommu_options); 1332__setup("amd_iommu=", parse_amd_iommu_options);
1239__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ee75d2a9b9cd..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
14 * Mikael Pettersson : PM converted to driver model. 14 * Mikael Pettersson : PM converted to driver model.
15 */ 15 */
16 16
17#include <linux/perf_counter.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/mc146818rtc.h> 19#include <linux/mc146818rtc.h>
19#include <linux/acpi_pmtmr.h> 20#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
34#include <linux/smp.h> 35#include <linux/smp.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36 37
38#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 39#include <asm/pgalloc.h>
38#include <asm/atomic.h> 40#include <asm/atomic.h>
39#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -249,7 +251,7 @@ static void native_apic_write_dummy(u32 reg, u32 v)
249 251
250static u32 native_apic_read_dummy(u32 reg) 252static u32 native_apic_read_dummy(u32 reg)
251{ 253{
252 WARN_ON_ONCE((cpu_has_apic || !disable_apic)); 254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
253 return 0; 255 return 0;
254} 256}
255 257
@@ -1187,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
1187 apic_write(APIC_ESR, 0); 1189 apic_write(APIC_ESR, 0);
1188 } 1190 }
1189#endif 1191#endif
1192 perf_counters_lapic_init();
1190 1193
1191 preempt_disable(); 1194 preempt_disable();
1192 1195
@@ -1609,6 +1612,13 @@ void __init init_apic_mappings(void)
1609 new_apicid = read_apic_id(); 1612 new_apicid = read_apic_id();
1610 if (boot_cpu_physical_apicid != new_apicid) { 1613 if (boot_cpu_physical_apicid != new_apicid) {
1611 boot_cpu_physical_apicid = new_apicid; 1614 boot_cpu_physical_apicid = new_apicid;
1615 /*
1616 * yeah -- we lie about apic_version
1617 * in case if apic was disabled via boot option
1618 * but it's not a problem for SMP compiled kernel
1619 * since smp_sanity_check is prepared for such a case
1620 * and disable smp mode
1621 */
1612 apic_version[new_apicid] = 1622 apic_version[new_apicid] =
1613 GET_APIC_VERSION(apic_read(APIC_LVR)); 1623 GET_APIC_VERSION(apic_read(APIC_LVR));
1614 } 1624 }
@@ -2027,7 +2037,7 @@ static int lapic_resume(struct sys_device *dev)
2027 unsigned int l, h; 2037 unsigned int l, h;
2028 unsigned long flags; 2038 unsigned long flags;
2029 int maxlvt; 2039 int maxlvt;
2030 int ret; 2040 int ret = 0;
2031 struct IO_APIC_route_entry **ioapic_entries = NULL; 2041 struct IO_APIC_route_entry **ioapic_entries = NULL;
2032 2042
2033 if (!apic_pm_state.active) 2043 if (!apic_pm_state.active)
@@ -2038,14 +2048,15 @@ static int lapic_resume(struct sys_device *dev)
2038 ioapic_entries = alloc_ioapic_entries(); 2048 ioapic_entries = alloc_ioapic_entries();
2039 if (!ioapic_entries) { 2049 if (!ioapic_entries) {
2040 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2050 WARN(1, "Alloc ioapic_entries in lapic resume failed.");
2041 return -ENOMEM; 2051 ret = -ENOMEM;
2052 goto restore;
2042 } 2053 }
2043 2054
2044 ret = save_IO_APIC_setup(ioapic_entries); 2055 ret = save_IO_APIC_setup(ioapic_entries);
2045 if (ret) { 2056 if (ret) {
2046 WARN(1, "Saving IO-APIC state failed: %d\n", ret); 2057 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2047 free_ioapic_entries(ioapic_entries); 2058 free_ioapic_entries(ioapic_entries);
2048 return ret; 2059 goto restore;
2049 } 2060 }
2050 2061
2051 mask_IO_APIC_setup(ioapic_entries); 2062 mask_IO_APIC_setup(ioapic_entries);
@@ -2097,10 +2108,10 @@ static int lapic_resume(struct sys_device *dev)
2097 restore_IO_APIC_setup(ioapic_entries); 2108 restore_IO_APIC_setup(ioapic_entries);
2098 free_ioapic_entries(ioapic_entries); 2109 free_ioapic_entries(ioapic_entries);
2099 } 2110 }
2100 2111restore:
2101 local_irq_restore(flags); 2112 local_irq_restore(flags);
2102 2113
2103 return 0; 2114 return ret;
2104} 2115}
2105 2116
2106/* 2117/*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac7f3b6ad583..94605e7f6a54 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
59#include <asm/setup.h> 59#include <asm/setup.h>
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h>
62#include <asm/uv/uv_hub.h> 63#include <asm/uv/uv_hub.h>
63#include <asm/uv/uv_irq.h> 64#include <asm/uv/uv_irq.h>
64 65
@@ -176,16 +177,18 @@ int __init arch_early_irq_init(void)
176 struct irq_cfg *cfg; 177 struct irq_cfg *cfg;
177 struct irq_desc *desc; 178 struct irq_desc *desc;
178 int count; 179 int count;
180 int node;
179 int i; 181 int i;
180 182
181 cfg = irq_cfgx; 183 cfg = irq_cfgx;
182 count = ARRAY_SIZE(irq_cfgx); 184 count = ARRAY_SIZE(irq_cfgx);
185 node= cpu_to_node(boot_cpu_id);
183 186
184 for (i = 0; i < count; i++) { 187 for (i = 0; i < count; i++) {
185 desc = irq_to_desc(i); 188 desc = irq_to_desc(i);
186 desc->chip_data = &cfg[i]; 189 desc->chip_data = &cfg[i];
187 alloc_bootmem_cpumask_var(&cfg[i].domain); 190 alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
188 alloc_bootmem_cpumask_var(&cfg[i].old_domain); 191 alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
189 if (i < NR_IRQS_LEGACY) 192 if (i < NR_IRQS_LEGACY)
190 cpumask_setall(cfg[i].domain); 193 cpumask_setall(cfg[i].domain);
191 } 194 }
@@ -4012,6 +4015,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4012 4015
4013 return apic_id; 4016 return apic_id;
4014} 4017}
4018#endif
4015 4019
4016int __init io_apic_get_version(int ioapic) 4020int __init io_apic_get_version(int ioapic)
4017{ 4021{
@@ -4024,7 +4028,6 @@ int __init io_apic_get_version(int ioapic)
4024 4028
4025 return reg_01.bits.version; 4029 return reg_01.bits.version;
4026} 4030}
4027#endif
4028 4031
4029int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) 4032int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4030{ 4033{
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index c4762276c17e..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
104} 104}
105#endif 105#endif
106 106
107static void report_broken_nmi(int cpu, int *prev_nmi_count) 107static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
108{ 108{
109 printk(KERN_CONT "\n"); 109 printk(KERN_CONT "\n");
110 110
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e4..440a8bccd91a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
160extern struct apic apic_bigsmp; 160extern struct apic apic_bigsmp;
161extern struct apic apic_es7000; 161extern struct apic apic_es7000;
162extern struct apic apic_es7000_cluster; 162extern struct apic apic_es7000_cluster;
163extern struct apic apic_default;
164 163
165struct apic *apic = &apic_default; 164struct apic *apic = &apic_default;
166EXPORT_SYMBOL_GPL(apic); 165EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d17..8e4cbb255c38 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
10#include <asm/apic.h> 10#include <asm/apic.h>
11#include <asm/ipi.h> 11#include <asm/ipi.h>
12 12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14 14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 16{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 780a733a5e7a..ef0ae207a7c8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
562 union uvh_node_id_u node_id; 562 union uvh_node_id_u node_id;
563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 563 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 564 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
565 int max_pnode = 0; 565 int gnode_extra, max_pnode = 0;
566 unsigned long mmr_base, present, paddr; 566 unsigned long mmr_base, present, paddr;
567 unsigned short pnode_mask; 567 unsigned short pnode_mask;
568 568
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
574 mmr_base = 574 mmr_base =
575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 575 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
576 ~UV_MMR_ENABLE; 576 ~UV_MMR_ENABLE;
577 pnode_mask = (1 << n_val) - 1;
578 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
579 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
580 gnode_upper = ((unsigned long)gnode_extra << m_val);
581 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
582 n_val, m_val, gnode_upper, gnode_extra);
583
577 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 584 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
578 585
579 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 586 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -610,11 +617,6 @@ void __init uv_system_init(void)
610 } 617 }
611 } 618 }
612 619
613 pnode_mask = (1 << n_val) - 1;
614 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
615 gnode_upper = (((unsigned long)node_id.s.node_id) &
616 ~((1 << n_val) - 1)) << m_val;
617
618 uv_bios_init(); 620 uv_bios_init();
619 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 621 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
620 &sn_coherency_id, &sn_region_size); 622 &sn_coherency_id, &sn_region_size);
@@ -637,6 +639,7 @@ void __init uv_system_init(void)
637 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 639 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
638 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 640 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
639 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 641 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
642 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
640 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 643 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
641 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 644 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
642 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 645 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162f..1a830cbd7015 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
146 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 146 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 147 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
148 OFFSET(BP_version, boot_params, hdr.version); 148 OFFSET(BP_version, boot_params, hdr.version);
149 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
149} 150}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b5..898ecc47e129 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
125 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 125 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 126 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
127 OFFSET(BP_version, boot_params, hdr.version); 127 OFFSET(BP_version, boot_params, hdr.version);
128 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
128 129
129 BLANK(); 130 BLANK();
130 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 131 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 728b3750a3e8..e5b27d8f1b47 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
6#include <asm/processor.h> 6#include <asm/processor.h>
7#include <asm/apic.h> 7#include <asm/apic.h>
8#include <asm/cpu.h> 8#include <asm/cpu.h>
9#include <asm/pci-direct.h>
9 10
10#ifdef CONFIG_X86_64 11#ifdef CONFIG_X86_64
11# include <asm/numa_64.h> 12# include <asm/numa_64.h>
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
351 (c->x86_model == 8 && c->x86_mask >= 8)) 352 (c->x86_model == 8 && c->x86_mask >= 8))
352 set_cpu_cap(c, X86_FEATURE_K6_MTRR); 353 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
353#endif 354#endif
355#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
356 /* check CPU config space for extended APIC ID */
357 if (c->x86 >= 0xf) {
358 unsigned int val;
359 val = read_pci_config(0, 24, 0, 0x68);
360 if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
361 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
362 }
363#endif
354} 364}
355 365
356static void __cpuinit init_amd(struct cpuinfo_x86 *c) 366static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b0517aa2bd3b..3ffdcfa9abdf 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -874,6 +875,7 @@ void __init identify_boot_cpu(void)
874#else 875#else
875 vgetcpu_set_mode(); 876 vgetcpu_set_mode();
876#endif 877#endif
878 init_hw_perf_counters();
877} 879}
878 880
879void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 881void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 2fc4f6bb9ca5..6b2a52dd0403 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
32 32
33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); 33static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); 34static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
35static DEFINE_PER_CPU(unsigned, cpu_modelflag);
36static DEFINE_PER_CPU(int, cpu_priv_count); 35static DEFINE_PER_CPU(int, cpu_priv_count);
37static DEFINE_PER_CPU(unsigned, cpu_model);
38 36
39static DEFINE_MUTEX(cpu_debug_lock); 37static DEFINE_MUTEX(cpu_debug_lock);
40 38
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
80 { "value", CPU_REG_ALL, 1 }, 78 { "value", CPU_REG_ALL, 1 },
81}; 79};
82 80
83/* Intel Registers Range */ 81/* CPU Registers Range */
84static struct cpu_debug_range cpu_intel_range[] = { 82static struct cpu_debug_range cpu_reg_range[] = {
85 { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, 83 { 0x00000000, 0x00000001, CPU_MC, },
86 { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, 84 { 0x00000006, 0x00000007, CPU_MONITOR, },
87 { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, 85 { 0x00000010, 0x00000010, CPU_TIME, },
88 { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, 86 { 0x00000011, 0x00000013, CPU_PMC, },
89 { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, 87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
90 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, 88 { 0x0000001B, 0x0000001B, CPU_APIC, },
91 89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
92 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, 90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
93 { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, 91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
94 { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, 92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
95 { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, 93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
96 94 { 0x00000079, 0x00000079, CPU_BIOS, },
97 { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, 95 { 0x00000088, 0x0000008A, CPU_CACHE, },
98 { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, 96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
99 { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, 97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
100 { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, 98 { 0x000000C1, 0x000000C4, CPU_PMC, },
101 99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
102 { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, 100 { 0x000000E7, 0x000000E8, CPU_PERF, },
103 { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, 101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
104 { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, 102
105 { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, 103 { 0x00000116, 0x0000011E, CPU_CACHE, },
106 104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
107 { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, 105 { 0x00000179, 0x0000017B, CPU_MC, },
108 { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, 106 { 0x00000186, 0x00000189, CPU_PMC, },
109 { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, 107 { 0x00000198, 0x00000199, CPU_PERF, },
110 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, 108 { 0x0000019A, 0x0000019A, CPU_TIME, },
111 109 { 0x0000019B, 0x0000019D, CPU_THERM, },
112 { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, 110 { 0x000001A0, 0x000001A0, CPU_MISC, },
113 { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, 111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
114 { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, 112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
115 { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, 113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
116 { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, 114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
117 115
118 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, 116 { 0x00000200, 0x0000020F, CPU_MTRR, },
119 { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, 117 { 0x00000250, 0x00000250, CPU_MTRR, },
120 { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, 118 { 0x00000258, 0x00000259, CPU_MTRR, },
121 { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, 119 { 0x00000268, 0x0000026F, CPU_MTRR, },
122 { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, 120 { 0x00000277, 0x00000277, CPU_PAT, },
123 { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, 121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
124 { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, 122
125 { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, 123 { 0x00000300, 0x00000311, CPU_PMC, },
126 124 { 0x00000345, 0x00000345, CPU_PMC, },
127 { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, 125 { 0x00000360, 0x00000371, CPU_PMC, },
128 { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, 126 { 0x0000038D, 0x00000390, CPU_PMC, },
129 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, 127 { 0x000003A0, 0x000003BE, CPU_PMC, },
130 { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, 128 { 0x000003C0, 0x000003CD, CPU_PMC, },
131 { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, 129 { 0x000003E0, 0x000003E1, CPU_PMC, },
132 { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, 130 { 0x000003F0, 0x000003F2, CPU_PMC, },
133 { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, 131
134 { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, 132 { 0x00000400, 0x00000417, CPU_MC, },
135 133 { 0x00000480, 0x0000048B, CPU_VMX, },
136 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, 134
137 { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, 135 { 0x00000600, 0x00000600, CPU_DEBUG, },
138 { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, 136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
139 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, 137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
140 { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, 138
141 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, 139 { 0x000107CC, 0x000107D3, CPU_PMC, },
142 140
143 { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, 141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
144 { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, 142 { 0xC0000081, 0xC0000084, CPU_CALL, },
145 { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, 143 { 0xC0000100, 0xC0000102, CPU_BASE, },
146 { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, 144 { 0xC0000103, 0xC0000103, CPU_TIME, },
147 { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, 145
148 { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, 146 { 0xC0010000, 0xC0010007, CPU_PMC, },
149 { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, 147 { 0xC0010010, 0xC0010010, CPU_CONF, },
150 { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, 148 { 0xC0010015, 0xC0010015, CPU_CONF, },
151 { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, 149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
152 { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, 150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
153 { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, 151 { 0xC001001F, 0xC001001F, CPU_CONF, },
154 { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, 152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
155 153 { 0xC0010044, 0xC0010048, CPU_MC, },
156 { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, 154 { 0xC0010050, 0xC0010056, CPU_SMM, },
157 { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, 155 { 0xC0010058, 0xC0010058, CPU_CONF, },
158 { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, 156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
159 { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, 157 { 0xC0010061, 0xC0010068, CPU_SMM, },
160 { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, 158 { 0xC0010069, 0xC001006B, CPU_SMM, },
161 { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, 159 { 0xC0010070, 0xC0010071, CPU_SMM, },
162 { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, 160 { 0xC0010111, 0xC0010113, CPU_SMM, },
163 { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, 161 { 0xC0010114, 0xC0010118, CPU_SVM, },
164 { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, 162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
165 { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, 163 { 0xC0011022, 0xC0011023, CPU_CONF, },
166 { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
167
168 { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
169 { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
170 { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
171
172 { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
173
174 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
175 { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
176 { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
177 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
178}; 164};
179 165
180/* AMD Registers Range */
181static struct cpu_debug_range cpu_amd_range[] = {
182 { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
183 { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
184 { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
185 { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
186 { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
187 { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
188
189 { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
190 { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
191 { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
192 { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
193
194 { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
195 { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
196 { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
197 { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
198 { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
199 { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
200
201 { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
202
203 { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
204 { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
205 { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
206 { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
207
208 { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
209 { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
210 { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
211 { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
212 { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
213 { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
214 { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
215 { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
216 { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
217 { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
218 { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
219 { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
220 { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
221 { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
222 { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
223 { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
224 { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
225 { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
226};
227
228
229/* Intel */
230static int get_intel_modelflag(unsigned model)
231{
232 int flag;
233
234 switch (model) {
235 case 0x0501:
236 case 0x0502:
237 case 0x0504:
238 flag = CPU_INTEL_PENTIUM;
239 break;
240 case 0x0601:
241 case 0x0603:
242 case 0x0605:
243 case 0x0607:
244 case 0x0608:
245 case 0x060A:
246 case 0x060B:
247 flag = CPU_INTEL_P6;
248 break;
249 case 0x0609:
250 case 0x060D:
251 flag = CPU_INTEL_PENTIUM_M;
252 break;
253 case 0x060E:
254 flag = CPU_INTEL_CORE;
255 break;
256 case 0x060F:
257 case 0x0617:
258 flag = CPU_INTEL_CORE2;
259 break;
260 case 0x061C:
261 flag = CPU_INTEL_ATOM;
262 break;
263 case 0x0F00:
264 case 0x0F01:
265 case 0x0F02:
266 case 0x0F03:
267 case 0x0F04:
268 flag = CPU_INTEL_XEON_P4;
269 break;
270 case 0x0F06:
271 flag = CPU_INTEL_XEON_MP;
272 break;
273 default:
274 flag = CPU_NONE;
275 break;
276 }
277
278 return flag;
279}
280
281/* AMD */
282static int get_amd_modelflag(unsigned model)
283{
284 int flag;
285
286 switch (model >> 8) {
287 case 0x6:
288 flag = CPU_AMD_K6;
289 break;
290 case 0x7:
291 flag = CPU_AMD_K7;
292 break;
293 case 0x8:
294 flag = CPU_AMD_K8;
295 break;
296 case 0xf:
297 flag = CPU_AMD_0F;
298 break;
299 case 0x10:
300 flag = CPU_AMD_10;
301 break;
302 case 0x11:
303 flag = CPU_AMD_11;
304 break;
305 default:
306 flag = CPU_NONE;
307 break;
308 }
309
310 return flag;
311}
312
313static int get_cpu_modelflag(unsigned cpu)
314{
315 int flag;
316
317 flag = per_cpu(cpu_model, cpu);
318
319 switch (flag >> 16) {
320 case X86_VENDOR_INTEL:
321 flag = get_intel_modelflag(flag);
322 break;
323 case X86_VENDOR_AMD:
324 flag = get_amd_modelflag(flag & 0xffff);
325 break;
326 default:
327 flag = CPU_NONE;
328 break;
329 }
330
331 return flag;
332}
333
334static int get_cpu_range_count(unsigned cpu)
335{
336 int index;
337
338 switch (per_cpu(cpu_model, cpu) >> 16) {
339 case X86_VENDOR_INTEL:
340 index = ARRAY_SIZE(cpu_intel_range);
341 break;
342 case X86_VENDOR_AMD:
343 index = ARRAY_SIZE(cpu_amd_range);
344 break;
345 default:
346 index = 0;
347 break;
348 }
349
350 return index;
351}
352
353static int is_typeflag_valid(unsigned cpu, unsigned flag) 166static int is_typeflag_valid(unsigned cpu, unsigned flag)
354{ 167{
355 unsigned vendor, modelflag; 168 int i;
356 int i, index;
357 169
358 /* Standard Registers should be always valid */ 170 /* Standard Registers should be always valid */
359 if (flag >= CPU_TSS) 171 if (flag >= CPU_TSS)
360 return 1; 172 return 1;
361 173
362 modelflag = per_cpu(cpu_modelflag, cpu); 174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
363 vendor = per_cpu(cpu_model, cpu) >> 16; 175 if (cpu_reg_range[i].flag == flag)
364 index = get_cpu_range_count(cpu); 176 return 1;
365
366 for (i = 0; i < index; i++) {
367 switch (vendor) {
368 case X86_VENDOR_INTEL:
369 if ((cpu_intel_range[i].model & modelflag) &&
370 (cpu_intel_range[i].flag & flag))
371 return 1;
372 break;
373 case X86_VENDOR_AMD:
374 if ((cpu_amd_range[i].model & modelflag) &&
375 (cpu_amd_range[i].flag & flag))
376 return 1;
377 break;
378 }
379 } 177 }
380 178
381 /* Invalid */ 179 /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
385static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, 183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
386 int index, unsigned flag) 184 int index, unsigned flag)
387{ 185{
388 unsigned modelflag; 186 if (cpu_reg_range[index].flag == flag) {
389 187 *min = cpu_reg_range[index].min;
390 modelflag = per_cpu(cpu_modelflag, cpu); 188 *max = cpu_reg_range[index].max;
391 *max = 0; 189 } else
392 switch (per_cpu(cpu_model, cpu) >> 16) { 190 *max = 0;
393 case X86_VENDOR_INTEL:
394 if ((cpu_intel_range[index].model & modelflag) &&
395 (cpu_intel_range[index].flag & flag)) {
396 *min = cpu_intel_range[index].min;
397 *max = cpu_intel_range[index].max;
398 }
399 break;
400 case X86_VENDOR_AMD:
401 if ((cpu_amd_range[index].model & modelflag) &&
402 (cpu_amd_range[index].flag & flag)) {
403 *min = cpu_amd_range[index].min;
404 *max = cpu_amd_range[index].max;
405 }
406 break;
407 }
408 191
409 return *max; 192 return *max;
410} 193}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
434 unsigned msr, msr_min, msr_max; 217 unsigned msr, msr_min, msr_max;
435 struct cpu_private *priv; 218 struct cpu_private *priv;
436 u32 low, high; 219 u32 low, high;
437 int i, range; 220 int i;
438 221
439 if (seq) { 222 if (seq) {
440 priv = seq->private; 223 priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
446 } 229 }
447 } 230 }
448 231
449 range = get_cpu_range_count(cpu); 232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
450
451 for (i = 0; i < range; i++) {
452 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) 233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
453 continue; 234 continue;
454 235
@@ -800,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
800{ 581{
801 struct dentry *cpu_dentry = NULL; 582 struct dentry *cpu_dentry = NULL;
802 unsigned reg, reg_min, reg_max; 583 unsigned reg, reg_min, reg_max;
803 int i, range, err = 0; 584 int i, err = 0;
804 char reg_dir[12]; 585 char reg_dir[12];
805 u32 low, high; 586 u32 low, high;
806 587
807 range = get_cpu_range_count(cpu); 588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
808
809 for (i = 0; i < range; i++) {
810 if (!get_cpu_range(cpu, &reg_min, &reg_max, i, 589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
811 cpu_base[type].flag)) 590 cpu_base[type].flag))
812 continue; 591 continue;
@@ -862,10 +641,6 @@ static int cpu_init_cpu(void)
862 cpui = &cpu_data(cpu); 641 cpui = &cpu_data(cpu);
863 if (!cpu_has(cpui, X86_FEATURE_MSR)) 642 if (!cpu_has(cpui, X86_FEATURE_MSR))
864 continue; 643 continue;
865 per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
866 (cpui->x86 << 8) |
867 (cpui->x86_model));
868 per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
869 644
870 sprintf(cpu_dir, "cpu%d", cpu); 645 sprintf(cpu_dir, "cpu%d", cpu);
871 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); 646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c839875478..f138c6c389b9 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
220 If in doubt, say N. 220 If in doubt, say N.
221 221
222config X86_E_POWERSAVER 222config X86_E_POWERSAVER
223 tristate "VIA C7 Enhanced PowerSaver" 223 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
224 select CPU_FREQ_TABLE 224 select CPU_FREQ_TABLE
225 depends on X86_32 225 depends on X86_32 && EXPERIMENTAL
226 help 226 help
227 This adds the CPUFreq driver for VIA C7 processors. 227 This adds the CPUFreq driver for VIA C7 processors. However, this driver
228 does not have any safeguards to prevent operating the CPU out of spec
229 and is thus considered dangerous. Please use the regular ACPI cpufreq
230 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
228 231
229 If in doubt, say N. 232 If in doubt, say N.
230 233
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 54b6de2cd947..ae9b503220ca 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
90{ 90{
91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid); 91 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
92 92
93 if (cpu->x86_vendor != X86_VENDOR_INTEL || 93 return cpu_has(cpu, X86_FEATURE_EST);
94 !cpu_has(cpu, X86_FEATURE_EST))
95 return 0;
96
97 return 1;
98} 94}
99 95
100static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) 96static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
@@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void)
550 return -ENOMEM; 546 return -ENOMEM;
551 } 547 }
552 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
553 if (!alloc_cpumask_var_node( 549 if (!zalloc_cpumask_var_node(
554 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, 550 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
555 GFP_KERNEL, cpu_to_node(i))) { 551 GFP_KERNEL, cpu_to_node(i))) {
556 552
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index a8363e5be4ef..d47c775eb0ab 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -322,7 +322,7 @@ static int powernow_acpi_init(void)
322 goto err0; 322 goto err0;
323 } 323 }
324 324
325 if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, 325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) { 326 GFP_KERNEL)) {
327 retval = -ENOMEM; 327 retval = -ENOMEM;
328 goto err05; 328 goto err05;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f6b32d112357..cf52215d9eb1 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -835,7 +835,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
835{ 835{
836 struct cpufreq_frequency_table *powernow_table; 836 struct cpufreq_frequency_table *powernow_table;
837 int ret_val = -ENODEV; 837 int ret_val = -ENODEV;
838 acpi_integer space_id; 838 acpi_integer control, status;
839 839
840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 840 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
841 dprintk("register performance failed: bad ACPI data\n"); 841 dprintk("register performance failed: bad ACPI data\n");
@@ -848,12 +848,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
848 goto err_out; 848 goto err_out;
849 } 849 }
850 850
851 space_id = data->acpi_data.control_register.space_id; 851 control = data->acpi_data.control_register.space_id;
852 if ((space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 852 status = data->acpi_data.status_register.space_id;
853 (space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 853
854 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
855 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
854 dprintk("Invalid control/status registers (%x - %x)\n", 856 dprintk("Invalid control/status registers (%x - %x)\n",
855 data->acpi_data.control_register.space_id, 857 control, status);
856 space_id);
857 goto err_out; 858 goto err_out;
858 } 859 }
859 860
@@ -886,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
886 /* notify BIOS that we exist */ 887 /* notify BIOS that we exist */
887 acpi_processor_notify_smm(THIS_MODULE); 888 acpi_processor_notify_smm(THIS_MODULE);
888 889
889 if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { 890 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
890 printk(KERN_ERR PFX 891 printk(KERN_ERR PFX
891 "unable to alloc powernow_k8_data cpumask\n"); 892 "unable to alloc powernow_k8_data cpumask\n");
892 ret_val = -ENOMEM; 893 ret_val = -ENOMEM;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index c9f1fdc02830..55c831ed71ce 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy,
471 471
472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) 472 if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL)))
473 return -ENOMEM; 473 return -ENOMEM;
474 if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { 474 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) {
475 free_cpumask_var(saved_mask); 475 free_cpumask_var(saved_mask);
476 return -ENOMEM; 476 return -ENOMEM;
477 } 477 }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e102..789efe217e1a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20#include <asm/k8.h>
20 21
21#define LVL_1_INST 1 22#define LVL_1_INST 1
22#define LVL_1_DATA 2 23#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
159 unsigned long can_disable; 160 unsigned long can_disable;
160}; 161};
161 162
162#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
163static struct pci_device_id k8_nb_id[] = {
164 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
165 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
166 {}
167};
168#endif
169
170unsigned short num_cache_leaves; 163unsigned short num_cache_leaves;
171 164
172/* AMD doesn't have CPUID4. Emulate it here to report the same 165/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
207}; 200};
208 201
209static const unsigned short __cpuinitconst assocs[] = { 202static const unsigned short __cpuinitconst assocs[] = {
210 [1] = 1, [2] = 2, [4] = 4, [6] = 8, 203 [1] = 1,
211 [8] = 16, [0xa] = 32, [0xb] = 48, 204 [2] = 2,
205 [4] = 4,
206 [6] = 8,
207 [8] = 16,
208 [0xa] = 32,
209 [0xb] = 48,
212 [0xc] = 64, 210 [0xc] = 64,
213 [0xf] = 0xffff // ?? 211 [0xd] = 96,
212 [0xe] = 128,
213 [0xf] = 0xffff /* fully associative - no way to show this currently */
214}; 214};
215 215
216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; 216static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
271 eax->split.type = types[leaf]; 271 eax->split.type = types[leaf];
272 eax->split.level = levels[leaf]; 272 eax->split.level = levels[leaf];
273 if (leaf == 3) 273 if (leaf == 3)
274 eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; 274 eax->split.num_threads_sharing =
275 current_cpu_data.x86_max_cores - 1;
275 else 276 else
276 eax->split.num_threads_sharing = 0; 277 eax->split.num_threads_sharing = 0;
277 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 278 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
291{ 292{
292 if (index < 3) 293 if (index < 3)
293 return; 294 return;
295
296 if (boot_cpu_data.x86 == 0x11)
297 return;
298
299 /* see erratum #382 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
301 return;
302
294 this_leaf->can_disable = 1; 303 this_leaf->can_disable = 1;
295} 304}
296 305
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
696#define to_object(k) container_of(k, struct _index_kobject, kobj) 705#define to_object(k) container_of(k, struct _index_kobject, kobj)
697#define to_attr(a) container_of(a, struct _cache_attr, attr) 706#define to_attr(a) container_of(a, struct _cache_attr, attr)
698 707
699#ifdef CONFIG_PCI 708static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
700static struct pci_dev *get_k8_northbridge(int node) 709 unsigned int index)
701{
702 struct pci_dev *dev = NULL;
703 int i;
704
705 for (i = 0; i <= node; i++) {
706 do {
707 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
708 if (!dev)
709 break;
710 } while (!pci_match_id(&k8_nb_id[0], dev));
711 if (!dev)
712 break;
713 }
714 return dev;
715}
716#else
717static struct pci_dev *get_k8_northbridge(int node)
718{
719 return NULL;
720}
721#endif
722
723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
724{ 710{
725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 711 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
726 int node = cpu_to_node(cpumask_first(mask)); 712 int node = cpu_to_node(cpu);
727 struct pci_dev *dev = NULL; 713 struct pci_dev *dev = node_to_k8_nb_misc(node);
728 ssize_t ret = 0; 714 unsigned int reg = 0;
729 int i;
730 715
731 if (!this_leaf->can_disable) 716 if (!this_leaf->can_disable)
732 return sprintf(buf, "Feature not enabled\n");
733
734 dev = get_k8_northbridge(node);
735 if (!dev) {
736 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
737 return -EINVAL; 717 return -EINVAL;
738 }
739 718
740 for (i = 0; i < 2; i++) { 719 if (!dev)
741 unsigned int reg; 720 return -EINVAL;
742 721
743 pci_read_config_dword(dev, 0x1BC + i * 4, &reg); 722 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
723 return sprintf(buf, "%x\n", reg);
724}
744 725
745 ret += sprintf(buf, "%sEntry: %d\n", buf, i); 726#define SHOW_CACHE_DISABLE(index) \
746 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", 727static ssize_t \
747 buf, 728show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
748 reg & 0x80000000 ? "Disabled" : "Allowed", 729{ \
749 reg & 0x40000000 ? "Disabled" : "Allowed"); 730 return show_cache_disable(this_leaf, buf, index); \
750 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
751 buf, (reg & 0x30000) >> 16, reg & 0xfff);
752 }
753 return ret;
754} 731}
732SHOW_CACHE_DISABLE(0)
733SHOW_CACHE_DISABLE(1)
755 734
756static ssize_t 735static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 736 const char *buf, size_t count, unsigned int index)
758 size_t count)
759{ 737{
760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); 738 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
761 int node = cpu_to_node(cpumask_first(mask)); 739 int node = cpu_to_node(cpu);
762 struct pci_dev *dev = NULL; 740 struct pci_dev *dev = node_to_k8_nb_misc(node);
763 unsigned int ret, index, val; 741 unsigned long val = 0;
742 unsigned int scrubber = 0;
764 743
765 if (!this_leaf->can_disable) 744 if (!this_leaf->can_disable)
766 return 0;
767
768 if (strlen(buf) > 15)
769 return -EINVAL; 745 return -EINVAL;
770 746
771 ret = sscanf(buf, "%x %x", &index, &val); 747 if (!capable(CAP_SYS_ADMIN))
772 if (ret != 2) 748 return -EPERM;
749
750 if (!dev)
773 return -EINVAL; 751 return -EINVAL;
774 if (index > 1) 752
753 if (strict_strtoul(buf, 10, &val) < 0)
775 return -EINVAL; 754 return -EINVAL;
776 755
777 val |= 0xc0000000; 756 val |= 0xc0000000;
778 dev = get_k8_northbridge(node); 757
779 if (!dev) { 758 pci_read_config_dword(dev, 0x58, &scrubber);
780 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); 759 scrubber &= ~0x1f000000;
781 return -EINVAL; 760 pci_write_config_dword(dev, 0x58, scrubber);
782 }
783 761
784 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); 762 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
785 wbinvd(); 763 wbinvd();
786 pci_write_config_dword(dev, 0x1BC + index * 4, val); 764 pci_write_config_dword(dev, 0x1BC + index * 4, val);
765 return count;
766}
787 767
788 return 1; 768#define STORE_CACHE_DISABLE(index) \
769static ssize_t \
770store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
771 const char *buf, size_t count) \
772{ \
773 return store_cache_disable(this_leaf, buf, count, index); \
789} 774}
775STORE_CACHE_DISABLE(0)
776STORE_CACHE_DISABLE(1)
790 777
791struct _cache_attr { 778struct _cache_attr {
792 struct attribute attr; 779 struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
808define_one_ro(shared_cpu_map); 795define_one_ro(shared_cpu_map);
809define_one_ro(shared_cpu_list); 796define_one_ro(shared_cpu_list);
810 797
811static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); 798static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
799 show_cache_disable_0, store_cache_disable_0);
800static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
801 show_cache_disable_1, store_cache_disable_1);
812 802
813static struct attribute * default_attrs[] = { 803static struct attribute * default_attrs[] = {
814 &type.attr, 804 &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
820 &size.attr, 810 &size.attr,
821 &shared_cpu_map.attr, 811 &shared_cpu_map.attr,
822 &shared_cpu_list.attr, 812 &shared_cpu_list.attr,
823 &cache_disable.attr, 813 &cache_disable_0.attr,
814 &cache_disable_1.attr,
824 NULL 815 NULL
825}; 816};
826 817
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 046087e9808f..f2ef6952c400 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
15#include <asm/hw_irq.h> 15#include <asm/hw_irq.h>
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18#include <asm/apic.h>
19 18
20#include "mce.h" 19#include "mce.h"
21 20
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04f..1d584a18a50d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
808 808
809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 809 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
810 return 0; 810 return 0;
811 rdmsr(MTRRdefType_MSR, def, dummy); 811 rdmsr(MSR_MTRRdefType, def, dummy);
812 def &= 0xff; 812 def &= 0xff;
813 if (def != MTRR_TYPE_UNCACHABLE) 813 if (def != MTRR_TYPE_UNCACHABLE)
814 return 0; 814 return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1003 */ 1003 */
1004 if (!is_cpu(INTEL) || disable_mtrr_trim) 1004 if (!is_cpu(INTEL) || disable_mtrr_trim)
1005 return 0; 1005 return 0;
1006 rdmsr(MTRRdefType_MSR, def, dummy); 1006 rdmsr(MSR_MTRRdefType, def, dummy);
1007 def &= 0xff; 1007 def &= 0xff;
1008 if (def != MTRR_TYPE_UNCACHABLE) 1008 if (def != MTRR_TYPE_UNCACHABLE)
1009 return 0; 1009 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d21d4fb161f7..0543f69f0b27 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
20}; 20};
21 21
22static struct fixed_range_block fixed_range_blocks[] = { 22static struct fixed_range_block fixed_range_blocks[] = {
23 { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ 23 { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
24 { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ 24 { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
25 { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ 25 { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
26 {} 26 {}
27}; 27};
28 28
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
194 194
195 k8_check_syscfg_dram_mod_en(); 195 k8_check_syscfg_dram_mod_en();
196 196
197 rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); 197 rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
198 198
199 for (i = 0; i < 2; i++) 199 for (i = 0; i < 2; i++)
200 rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); 200 rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
201 for (i = 0; i < 8; i++) 201 for (i = 0; i < 8; i++)
202 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 202 rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
203} 203}
204 204
205void mtrr_save_fixed_ranges(void *info) 205void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
310 310
311 vrs = mtrr_state.var_ranges; 311 vrs = mtrr_state.var_ranges;
312 312
313 rdmsr(MTRRcap_MSR, lo, dummy); 313 rdmsr(MSR_MTRRcap, lo, dummy);
314 mtrr_state.have_fixed = (lo >> 8) & 1; 314 mtrr_state.have_fixed = (lo >> 8) & 1;
315 315
316 for (i = 0; i < num_var_ranges; i++) 316 for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
318 if (mtrr_state.have_fixed) 318 if (mtrr_state.have_fixed)
319 get_fixed_ranges(mtrr_state.fixed_ranges); 319 get_fixed_ranges(mtrr_state.fixed_ranges);
320 320
321 rdmsr(MTRRdefType_MSR, lo, dummy); 321 rdmsr(MSR_MTRRdefType, lo, dummy);
322 mtrr_state.def_type = (lo & 0xff); 322 mtrr_state.def_type = (lo & 0xff);
323 mtrr_state.enabled = (lo & 0xc00) >> 10; 323 mtrr_state.enabled = (lo & 0xc00) >> 10;
324 324
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
583 __flush_tlb(); 583 __flush_tlb();
584 584
585 /* Save MTRR state */ 585 /* Save MTRR state */
586 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 586 rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
587 587
588 /* Disable MTRRs, and set the default type to uncached */ 588 /* Disable MTRRs, and set the default type to uncached */
589 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); 589 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
590} 590}
591 591
592static void post_set(void) __releases(set_atomicity_lock) 592static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
595 __flush_tlb(); 595 __flush_tlb();
596 596
597 /* Intel (P6) standard MTRRs */ 597 /* Intel (P6) standard MTRRs */
598 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 598 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
599 599
600 /* Enable caches */ 600 /* Enable caches */
601 write_cr0(read_cr0() & 0xbfffffff); 601 write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
707static int generic_have_wrcomb(void) 707static int generic_have_wrcomb(void)
708{ 708{
709 unsigned long config, dummy; 709 unsigned long config, dummy;
710 rdmsr(MTRRcap_MSR, config, dummy); 710 rdmsr(MSR_MTRRcap, config, dummy);
711 return (config & (1 << 10)); 711 return (config & (1 << 10));
712} 712}
713 713
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c7..8fc248b5aeaf 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
104 unsigned long config = 0, dummy; 104 unsigned long config = 0, dummy;
105 105
106 if (use_intel()) { 106 if (use_intel()) {
107 rdmsr(MTRRcap_MSR, config, dummy); 107 rdmsr(MSR_MTRRcap, config, dummy);
108 } else if (is_cpu(AMD)) 108 } else if (is_cpu(AMD))
109 config = 2; 109 config = 2;
110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) 110 else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347a..7538b767f206 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/stddef.h> 6#include <linux/stddef.h>
7 7
8#define MTRRcap_MSR 0x0fe
9#define MTRRdefType_MSR 0x2ff
10
11#define MTRRfix64K_00000_MSR 0x250
12#define MTRRfix16K_80000_MSR 0x258
13#define MTRRfix16K_A0000_MSR 0x259
14#define MTRRfix4K_C0000_MSR 0x268
15#define MTRRfix4K_C8000_MSR 0x269
16#define MTRRfix4K_D0000_MSR 0x26a
17#define MTRRfix4K_D8000_MSR 0x26b
18#define MTRRfix4K_E0000_MSR 0x26c
19#define MTRRfix4K_E8000_MSR 0x26d
20#define MTRRfix4K_F0000_MSR 0x26e
21#define MTRRfix4K_F8000_MSR 0x26f
22
23#define MTRR_CHANGE_MASK_FIXED 0x01 8#define MTRR_CHANGE_MASK_FIXED 0x01
24#define MTRR_CHANGE_MASK_VARIABLE 0x02 9#define MTRR_CHANGE_MASK_VARIABLE 0x02
25#define MTRR_CHANGE_MASK_DEFTYPE 0x04 10#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685b..1f5fb1588d1f 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
35 35
36 if (use_intel()) 36 if (use_intel())
37 /* Save MTRR state */ 37 /* Save MTRR state */
38 rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 38 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
39 else 39 else
40 /* Cyrix ARRs - everything else were excluded at the top */ 40 /* Cyrix ARRs - everything else were excluded at the top */
41 ctxt->ccr3 = getCx86(CX86_CCR3); 41 ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
46{ 46{
47 if (use_intel()) 47 if (use_intel())
48 /* Disable MTRRs, and set the default type to uncached */ 48 /* Disable MTRRs, and set the default type to uncached */
49 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, 49 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
50 ctxt->deftype_hi); 50 ctxt->deftype_hi);
51 else if (is_cpu(CYRIX)) 51 else if (is_cpu(CYRIX))
52 /* Cyrix ARRs - everything else were excluded at the top */ 52 /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
64 /* Restore MTRRdefType */ 64 /* Restore MTRRdefType */
65 if (use_intel()) 65 if (use_intel())
66 /* Intel (P6) standard MTRRs */ 66 /* Intel (P6) standard MTRRs */
67 mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); 67 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
68 else 68 else
69 /* Cyrix ARRs - everything else was excluded at the top */ 69 /* Cyrix ARRs - everything else was excluded at the top */
70 setCx86(CX86_CCR3, ctxt->ccr3); 70 setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..895c82e78455
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1704 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 int enabled;
35};
36
37/*
38 * struct x86_pmu - generic x86 pmu
39 */
40struct x86_pmu {
41 const char *name;
42 int version;
43 int (*handle_irq)(struct pt_regs *);
44 void (*disable_all)(void);
45 void (*enable_all)(void);
46 void (*enable)(struct hw_perf_counter *, int);
47 void (*disable)(struct hw_perf_counter *, int);
48 unsigned eventsel;
49 unsigned perfctr;
50 u64 (*event_map)(int);
51 u64 (*raw_event)(u64);
52 int max_events;
53 int num_counters;
54 int num_counters_fixed;
55 int counter_bits;
56 u64 counter_mask;
57 u64 max_period;
58 u64 intel_ctrl;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
85
86/*
87 * Generalized hw caching related event table, filled
88 * in on a per model basis. A value of 0 means
89 * 'not supported', -1 means 'event makes no sense on
90 * this CPU', any other value means the raw event
91 * ID.
92 */
93
94#define C(x) PERF_COUNT_HW_CACHE_##x
95
96static u64 __read_mostly hw_cache_event_ids
97 [PERF_COUNT_HW_CACHE_MAX]
98 [PERF_COUNT_HW_CACHE_OP_MAX]
99 [PERF_COUNT_HW_CACHE_RESULT_MAX];
100
101static const u64 nehalem_hw_cache_event_ids
102 [PERF_COUNT_HW_CACHE_MAX]
103 [PERF_COUNT_HW_CACHE_OP_MAX]
104 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
105{
106 [ C(L1D) ] = {
107 [ C(OP_READ) ] = {
108 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
109 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
110 },
111 [ C(OP_WRITE) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
113 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
114 },
115 [ C(OP_PREFETCH) ] = {
116 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
117 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
118 },
119 },
120 [ C(L1I ) ] = {
121 [ C(OP_READ) ] = {
122 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
123 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
124 },
125 [ C(OP_WRITE) ] = {
126 [ C(RESULT_ACCESS) ] = -1,
127 [ C(RESULT_MISS) ] = -1,
128 },
129 [ C(OP_PREFETCH) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0,
131 [ C(RESULT_MISS) ] = 0x0,
132 },
133 },
134 [ C(LL ) ] = {
135 [ C(OP_READ) ] = {
136 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
137 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
138 },
139 [ C(OP_WRITE) ] = {
140 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
141 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
142 },
143 [ C(OP_PREFETCH) ] = {
144 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
145 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
146 },
147 },
148 [ C(DTLB) ] = {
149 [ C(OP_READ) ] = {
150 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
151 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
152 },
153 [ C(OP_WRITE) ] = {
154 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
155 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
156 },
157 [ C(OP_PREFETCH) ] = {
158 [ C(RESULT_ACCESS) ] = 0x0,
159 [ C(RESULT_MISS) ] = 0x0,
160 },
161 },
162 [ C(ITLB) ] = {
163 [ C(OP_READ) ] = {
164 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
165 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
166 },
167 [ C(OP_WRITE) ] = {
168 [ C(RESULT_ACCESS) ] = -1,
169 [ C(RESULT_MISS) ] = -1,
170 },
171 [ C(OP_PREFETCH) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 },
176 [ C(BPU ) ] = {
177 [ C(OP_READ) ] = {
178 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
179 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
180 },
181 [ C(OP_WRITE) ] = {
182 [ C(RESULT_ACCESS) ] = -1,
183 [ C(RESULT_MISS) ] = -1,
184 },
185 [ C(OP_PREFETCH) ] = {
186 [ C(RESULT_ACCESS) ] = -1,
187 [ C(RESULT_MISS) ] = -1,
188 },
189 },
190};
191
192static const u64 core2_hw_cache_event_ids
193 [PERF_COUNT_HW_CACHE_MAX]
194 [PERF_COUNT_HW_CACHE_OP_MAX]
195 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
196{
197 [ C(L1D) ] = {
198 [ C(OP_READ) ] = {
199 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
200 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
201 },
202 [ C(OP_WRITE) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
204 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
205 },
206 [ C(OP_PREFETCH) ] = {
207 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
208 [ C(RESULT_MISS) ] = 0,
209 },
210 },
211 [ C(L1I ) ] = {
212 [ C(OP_READ) ] = {
213 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
214 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
215 },
216 [ C(OP_WRITE) ] = {
217 [ C(RESULT_ACCESS) ] = -1,
218 [ C(RESULT_MISS) ] = -1,
219 },
220 [ C(OP_PREFETCH) ] = {
221 [ C(RESULT_ACCESS) ] = 0,
222 [ C(RESULT_MISS) ] = 0,
223 },
224 },
225 [ C(LL ) ] = {
226 [ C(OP_READ) ] = {
227 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
228 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
229 },
230 [ C(OP_WRITE) ] = {
231 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
232 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
233 },
234 [ C(OP_PREFETCH) ] = {
235 [ C(RESULT_ACCESS) ] = 0,
236 [ C(RESULT_MISS) ] = 0,
237 },
238 },
239 [ C(DTLB) ] = {
240 [ C(OP_READ) ] = {
241 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
242 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
243 },
244 [ C(OP_WRITE) ] = {
245 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
246 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
247 },
248 [ C(OP_PREFETCH) ] = {
249 [ C(RESULT_ACCESS) ] = 0,
250 [ C(RESULT_MISS) ] = 0,
251 },
252 },
253 [ C(ITLB) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
256 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = -1,
260 [ C(RESULT_MISS) ] = -1,
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 },
267 [ C(BPU ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
270 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = -1,
278 [ C(RESULT_MISS) ] = -1,
279 },
280 },
281};
282
283static const u64 atom_hw_cache_event_ids
284 [PERF_COUNT_HW_CACHE_MAX]
285 [PERF_COUNT_HW_CACHE_OP_MAX]
286 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
287{
288 [ C(L1D) ] = {
289 [ C(OP_READ) ] = {
290 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
291 [ C(RESULT_MISS) ] = 0,
292 },
293 [ C(OP_WRITE) ] = {
294 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
295 [ C(RESULT_MISS) ] = 0,
296 },
297 [ C(OP_PREFETCH) ] = {
298 [ C(RESULT_ACCESS) ] = 0x0,
299 [ C(RESULT_MISS) ] = 0,
300 },
301 },
302 [ C(L1I ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
305 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
306 },
307 [ C(OP_WRITE) ] = {
308 [ C(RESULT_ACCESS) ] = -1,
309 [ C(RESULT_MISS) ] = -1,
310 },
311 [ C(OP_PREFETCH) ] = {
312 [ C(RESULT_ACCESS) ] = 0,
313 [ C(RESULT_MISS) ] = 0,
314 },
315 },
316 [ C(LL ) ] = {
317 [ C(OP_READ) ] = {
318 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
319 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
320 },
321 [ C(OP_WRITE) ] = {
322 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
323 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
324 },
325 [ C(OP_PREFETCH) ] = {
326 [ C(RESULT_ACCESS) ] = 0,
327 [ C(RESULT_MISS) ] = 0,
328 },
329 },
330 [ C(DTLB) ] = {
331 [ C(OP_READ) ] = {
332 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
333 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
334 },
335 [ C(OP_WRITE) ] = {
336 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
337 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
338 },
339 [ C(OP_PREFETCH) ] = {
340 [ C(RESULT_ACCESS) ] = 0,
341 [ C(RESULT_MISS) ] = 0,
342 },
343 },
344 [ C(ITLB) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
347 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = -1,
351 [ C(RESULT_MISS) ] = -1,
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 },
358 [ C(BPU ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
361 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = -1,
369 [ C(RESULT_MISS) ] = -1,
370 },
371 },
372};
373
374static u64 intel_pmu_raw_event(u64 event)
375{
376#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
377#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
378#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
379#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
380#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
381
382#define CORE_EVNTSEL_MASK \
383 (CORE_EVNTSEL_EVENT_MASK | \
384 CORE_EVNTSEL_UNIT_MASK | \
385 CORE_EVNTSEL_EDGE_MASK | \
386 CORE_EVNTSEL_INV_MASK | \
387 CORE_EVNTSEL_COUNTER_MASK)
388
389 return event & CORE_EVNTSEL_MASK;
390}
391
392static const u64 amd_0f_hw_cache_event_ids
393 [PERF_COUNT_HW_CACHE_MAX]
394 [PERF_COUNT_HW_CACHE_OP_MAX]
395 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
396{
397 [ C(L1D) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0,
400 [ C(RESULT_MISS) ] = 0,
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0,
404 [ C(RESULT_MISS) ] = 0,
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(L1I ) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
414 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = -1,
418 [ C(RESULT_MISS) ] = -1,
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(LL ) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0,
428 [ C(RESULT_MISS) ] = 0,
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = 0,
432 [ C(RESULT_MISS) ] = 0,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = 0,
436 [ C(RESULT_MISS) ] = 0,
437 },
438 },
439 [ C(DTLB) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0,
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = 0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = 0,
450 [ C(RESULT_MISS) ] = 0,
451 },
452 },
453 [ C(ITLB) ] = {
454 [ C(OP_READ) ] = {
455 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
456 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
457 },
458 [ C(OP_WRITE) ] = {
459 [ C(RESULT_ACCESS) ] = -1,
460 [ C(RESULT_MISS) ] = -1,
461 },
462 [ C(OP_PREFETCH) ] = {
463 [ C(RESULT_ACCESS) ] = -1,
464 [ C(RESULT_MISS) ] = -1,
465 },
466 },
467 [ C(BPU ) ] = {
468 [ C(OP_READ) ] = {
469 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
470 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
471 },
472 [ C(OP_WRITE) ] = {
473 [ C(RESULT_ACCESS) ] = -1,
474 [ C(RESULT_MISS) ] = -1,
475 },
476 [ C(OP_PREFETCH) ] = {
477 [ C(RESULT_ACCESS) ] = -1,
478 [ C(RESULT_MISS) ] = -1,
479 },
480 },
481};
482
483/*
484 * AMD Performance Monitor K7 and later.
485 */
486static const u64 amd_perfmon_event_map[] =
487{
488 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
489 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
490 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
491 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
492 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
493 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
494};
495
496static u64 amd_pmu_event_map(int event)
497{
498 return amd_perfmon_event_map[event];
499}
500
501static u64 amd_pmu_raw_event(u64 event)
502{
503#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
504#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
505#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
506#define K7_EVNTSEL_INV_MASK 0x000800000ULL
507#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
508
509#define K7_EVNTSEL_MASK \
510 (K7_EVNTSEL_EVENT_MASK | \
511 K7_EVNTSEL_UNIT_MASK | \
512 K7_EVNTSEL_EDGE_MASK | \
513 K7_EVNTSEL_INV_MASK | \
514 K7_EVNTSEL_COUNTER_MASK)
515
516 return event & K7_EVNTSEL_MASK;
517}
518
519/*
520 * Propagate counter elapsed time into the generic counter.
521 * Can only be executed on the CPU where the counter is active.
522 * Returns the new raw counter value read from the hardware.
523 */
524static u64
525x86_perf_counter_update(struct perf_counter *counter,
526 struct hw_perf_counter *hwc, int idx)
527{
528 int shift = 64 - x86_pmu.counter_bits;
529 u64 prev_raw_count, new_raw_count;
530 s64 delta;
531
532 /*
533 * Careful: an NMI might modify the previous counter value.
534 *
535 * Our tactic to handle this is to first atomically read and
536 * exchange a new raw count - then add that new-prev delta
537 * count to the generic counter atomically:
538 */
539again:
540 prev_raw_count = atomic64_read(&hwc->prev_count);
541 rdmsrl(hwc->counter_base + idx, new_raw_count);
542
543 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
544 new_raw_count) != prev_raw_count)
545 goto again;
546
547 /*
548 * Now we have the new raw value and have updated the prev
549 * timestamp already. We can now calculate the elapsed delta
550 * (counter-)time and add that to the generic counter.
551 *
552 * Careful, not all hw sign-extends above the physical width
553 * of the count.
554 */
555 delta = (new_raw_count << shift) - (prev_raw_count << shift);
556 delta >>= shift;
557
558 atomic64_add(delta, &counter->count);
559 atomic64_sub(delta, &hwc->period_left);
560
561 return new_raw_count;
562}
563
564static atomic_t active_counters;
565static DEFINE_MUTEX(pmc_reserve_mutex);
566
567static bool reserve_pmc_hardware(void)
568{
569 int i;
570
571 if (nmi_watchdog == NMI_LOCAL_APIC)
572 disable_lapic_nmi_watchdog();
573
574 for (i = 0; i < x86_pmu.num_counters; i++) {
575 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
576 goto perfctr_fail;
577 }
578
579 for (i = 0; i < x86_pmu.num_counters; i++) {
580 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
581 goto eventsel_fail;
582 }
583
584 return true;
585
586eventsel_fail:
587 for (i--; i >= 0; i--)
588 release_evntsel_nmi(x86_pmu.eventsel + i);
589
590 i = x86_pmu.num_counters;
591
592perfctr_fail:
593 for (i--; i >= 0; i--)
594 release_perfctr_nmi(x86_pmu.perfctr + i);
595
596 if (nmi_watchdog == NMI_LOCAL_APIC)
597 enable_lapic_nmi_watchdog();
598
599 return false;
600}
601
602static void release_pmc_hardware(void)
603{
604 int i;
605
606 for (i = 0; i < x86_pmu.num_counters; i++) {
607 release_perfctr_nmi(x86_pmu.perfctr + i);
608 release_evntsel_nmi(x86_pmu.eventsel + i);
609 }
610
611 if (nmi_watchdog == NMI_LOCAL_APIC)
612 enable_lapic_nmi_watchdog();
613}
614
615static void hw_perf_counter_destroy(struct perf_counter *counter)
616{
617 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
618 release_pmc_hardware();
619 mutex_unlock(&pmc_reserve_mutex);
620 }
621}
622
623static inline int x86_pmu_initialized(void)
624{
625 return x86_pmu.handle_irq != NULL;
626}
627
628static inline int
629set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
630{
631 unsigned int cache_type, cache_op, cache_result;
632 u64 config, val;
633
634 config = attr->config;
635
636 cache_type = (config >> 0) & 0xff;
637 if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
638 return -EINVAL;
639
640 cache_op = (config >> 8) & 0xff;
641 if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
642 return -EINVAL;
643
644 cache_result = (config >> 16) & 0xff;
645 if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
646 return -EINVAL;
647
648 val = hw_cache_event_ids[cache_type][cache_op][cache_result];
649
650 if (val == 0)
651 return -ENOENT;
652
653 if (val == -1)
654 return -EINVAL;
655
656 hwc->config |= val;
657
658 return 0;
659}
660
661/*
662 * Setup the hardware configuration for a given attr_type
663 */
664static int __hw_perf_counter_init(struct perf_counter *counter)
665{
666 struct perf_counter_attr *attr = &counter->attr;
667 struct hw_perf_counter *hwc = &counter->hw;
668 int err;
669
670 if (!x86_pmu_initialized())
671 return -ENODEV;
672
673 err = 0;
674 if (!atomic_inc_not_zero(&active_counters)) {
675 mutex_lock(&pmc_reserve_mutex);
676 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
677 err = -EBUSY;
678 else
679 atomic_inc(&active_counters);
680 mutex_unlock(&pmc_reserve_mutex);
681 }
682 if (err)
683 return err;
684
685 /*
686 * Generate PMC IRQs:
687 * (keep 'enabled' bit clear for now)
688 */
689 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
690
691 /*
692 * Count user and OS events unless requested not to.
693 */
694 if (!attr->exclude_user)
695 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
696 if (!attr->exclude_kernel)
697 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
698
699 if (!hwc->sample_period) {
700 hwc->sample_period = x86_pmu.max_period;
701 hwc->last_period = hwc->sample_period;
702 atomic64_set(&hwc->period_left, hwc->sample_period);
703 }
704
705 counter->destroy = hw_perf_counter_destroy;
706
707 /*
708 * Raw event types provide the config in the event structure
709 */
710 if (attr->type == PERF_TYPE_RAW) {
711 hwc->config |= x86_pmu.raw_event(attr->config);
712 return 0;
713 }
714
715 if (attr->type == PERF_TYPE_HW_CACHE)
716 return set_ext_hw_attr(hwc, attr);
717
718 if (attr->config >= x86_pmu.max_events)
719 return -EINVAL;
720 /*
721 * The generic map:
722 */
723 hwc->config |= x86_pmu.event_map(attr->config);
724
725 return 0;
726}
727
728static void intel_pmu_disable_all(void)
729{
730 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
731}
732
733static void amd_pmu_disable_all(void)
734{
735 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
736 int idx;
737
738 if (!cpuc->enabled)
739 return;
740
741 cpuc->enabled = 0;
742 /*
743 * ensure we write the disable before we start disabling the
744 * counters proper, so that amd_pmu_enable_counter() does the
745 * right thing.
746 */
747 barrier();
748
749 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
750 u64 val;
751
752 if (!test_bit(idx, cpuc->active_mask))
753 continue;
754 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
755 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
756 continue;
757 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
758 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
759 }
760}
761
762void hw_perf_disable(void)
763{
764 if (!x86_pmu_initialized())
765 return;
766 return x86_pmu.disable_all();
767}
768
769static void intel_pmu_enable_all(void)
770{
771 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
772}
773
774static void amd_pmu_enable_all(void)
775{
776 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
777 int idx;
778
779 if (cpuc->enabled)
780 return;
781
782 cpuc->enabled = 1;
783 barrier();
784
785 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
786 u64 val;
787
788 if (!test_bit(idx, cpuc->active_mask))
789 continue;
790 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
791 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
792 continue;
793 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
794 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
795 }
796}
797
798void hw_perf_enable(void)
799{
800 if (!x86_pmu_initialized())
801 return;
802 x86_pmu.enable_all();
803}
804
805static inline u64 intel_pmu_get_status(void)
806{
807 u64 status;
808
809 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
810
811 return status;
812}
813
814static inline void intel_pmu_ack_status(u64 ack)
815{
816 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
817}
818
819static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
820{
821 int err;
822 err = checking_wrmsrl(hwc->config_base + idx,
823 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
824}
825
826static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
827{
828 int err;
829 err = checking_wrmsrl(hwc->config_base + idx,
830 hwc->config);
831}
832
833static inline void
834intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
835{
836 int idx = __idx - X86_PMC_IDX_FIXED;
837 u64 ctrl_val, mask;
838 int err;
839
840 mask = 0xfULL << (idx * 4);
841
842 rdmsrl(hwc->config_base, ctrl_val);
843 ctrl_val &= ~mask;
844 err = checking_wrmsrl(hwc->config_base, ctrl_val);
845}
846
847static inline void
848intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
849{
850 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
851 intel_pmu_disable_fixed(hwc, idx);
852 return;
853 }
854
855 x86_pmu_disable_counter(hwc, idx);
856}
857
858static inline void
859amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
860{
861 x86_pmu_disable_counter(hwc, idx);
862}
863
864static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
865
866/*
867 * Set the next IRQ period, based on the hwc->period_left value.
868 * To be called with the counter disabled in hw:
869 */
870static int
871x86_perf_counter_set_period(struct perf_counter *counter,
872 struct hw_perf_counter *hwc, int idx)
873{
874 s64 left = atomic64_read(&hwc->period_left);
875 s64 period = hwc->sample_period;
876 int err, ret = 0;
877
878 /*
879 * If we are way outside a reasonable range then just skip forward:
880 */
881 if (unlikely(left <= -period)) {
882 left = period;
883 atomic64_set(&hwc->period_left, left);
884 hwc->last_period = period;
885 ret = 1;
886 }
887
888 if (unlikely(left <= 0)) {
889 left += period;
890 atomic64_set(&hwc->period_left, left);
891 hwc->last_period = period;
892 ret = 1;
893 }
894 /*
895 * Quirk: certain CPUs don't like it if just 1 event is left:
896 */
897 if (unlikely(left < 2))
898 left = 2;
899
900 if (left > x86_pmu.max_period)
901 left = x86_pmu.max_period;
902
903 per_cpu(prev_left[idx], smp_processor_id()) = left;
904
905 /*
906 * The hw counter starts counting from this counter offset,
907 * mark it to be able to extract future deltas:
908 */
909 atomic64_set(&hwc->prev_count, (u64)-left);
910
911 err = checking_wrmsrl(hwc->counter_base + idx,
912 (u64)(-left) & x86_pmu.counter_mask);
913
914 return ret;
915}
916
917static inline void
918intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
919{
920 int idx = __idx - X86_PMC_IDX_FIXED;
921 u64 ctrl_val, bits, mask;
922 int err;
923
924 /*
925 * Enable IRQ generation (0x8),
926 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
927 * if requested:
928 */
929 bits = 0x8ULL;
930 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
931 bits |= 0x2;
932 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
933 bits |= 0x1;
934 bits <<= (idx * 4);
935 mask = 0xfULL << (idx * 4);
936
937 rdmsrl(hwc->config_base, ctrl_val);
938 ctrl_val &= ~mask;
939 ctrl_val |= bits;
940 err = checking_wrmsrl(hwc->config_base, ctrl_val);
941}
942
943static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
944{
945 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
946 intel_pmu_enable_fixed(hwc, idx);
947 return;
948 }
949
950 x86_pmu_enable_counter(hwc, idx);
951}
952
953static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
954{
955 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
956
957 if (cpuc->enabled)
958 x86_pmu_enable_counter(hwc, idx);
959 else
960 x86_pmu_disable_counter(hwc, idx);
961}
962
963static int
964fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
965{
966 unsigned int event;
967
968 if (!x86_pmu.num_counters_fixed)
969 return -1;
970
971 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
972
973 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
974 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
975 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
976 return X86_PMC_IDX_FIXED_CPU_CYCLES;
977 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
978 return X86_PMC_IDX_FIXED_BUS_CYCLES;
979
980 return -1;
981}
982
983/*
984 * Find a PMC slot for the freshly enabled / scheduled in counter:
985 */
986static int x86_pmu_enable(struct perf_counter *counter)
987{
988 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
989 struct hw_perf_counter *hwc = &counter->hw;
990 int idx;
991
992 idx = fixed_mode_idx(counter, hwc);
993 if (idx >= 0) {
994 /*
995 * Try to get the fixed counter, if that is already taken
996 * then try to get a generic counter:
997 */
998 if (test_and_set_bit(idx, cpuc->used_mask))
999 goto try_generic;
1000
1001 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1002 /*
1003 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
1004 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1005 */
1006 hwc->counter_base =
1007 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1008 hwc->idx = idx;
1009 } else {
1010 idx = hwc->idx;
1011 /* Try to get the previous generic counter again */
1012 if (test_and_set_bit(idx, cpuc->used_mask)) {
1013try_generic:
1014 idx = find_first_zero_bit(cpuc->used_mask,
1015 x86_pmu.num_counters);
1016 if (idx == x86_pmu.num_counters)
1017 return -EAGAIN;
1018
1019 set_bit(idx, cpuc->used_mask);
1020 hwc->idx = idx;
1021 }
1022 hwc->config_base = x86_pmu.eventsel;
1023 hwc->counter_base = x86_pmu.perfctr;
1024 }
1025
1026 perf_counters_lapic_init();
1027
1028 x86_pmu.disable(hwc, idx);
1029
1030 cpuc->counters[idx] = counter;
1031 set_bit(idx, cpuc->active_mask);
1032
1033 x86_perf_counter_set_period(counter, hwc, idx);
1034 x86_pmu.enable(hwc, idx);
1035
1036 return 0;
1037}
1038
1039static void x86_pmu_unthrottle(struct perf_counter *counter)
1040{
1041 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1042 struct hw_perf_counter *hwc = &counter->hw;
1043
1044 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1045 cpuc->counters[hwc->idx] != counter))
1046 return;
1047
1048 x86_pmu.enable(hwc, hwc->idx);
1049}
1050
1051void perf_counter_print_debug(void)
1052{
1053 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1054 struct cpu_hw_counters *cpuc;
1055 unsigned long flags;
1056 int cpu, idx;
1057
1058 if (!x86_pmu.num_counters)
1059 return;
1060
1061 local_irq_save(flags);
1062
1063 cpu = smp_processor_id();
1064 cpuc = &per_cpu(cpu_hw_counters, cpu);
1065
1066 if (x86_pmu.version >= 2) {
1067 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1068 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1069 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1070 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1071
1072 pr_info("\n");
1073 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1074 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1075 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1076 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1077 }
1078 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1079
1080 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1081 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1082 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1083
1084 prev_left = per_cpu(prev_left[idx], cpu);
1085
1086 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1087 cpu, idx, pmc_ctrl);
1088 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
1089 cpu, idx, pmc_count);
1090 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1091 cpu, idx, prev_left);
1092 }
1093 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1094 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1095
1096 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1097 cpu, idx, pmc_count);
1098 }
1099 local_irq_restore(flags);
1100}
1101
1102static void x86_pmu_disable(struct perf_counter *counter)
1103{
1104 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1105 struct hw_perf_counter *hwc = &counter->hw;
1106 int idx = hwc->idx;
1107
1108 /*
1109 * Must be done before we disable, otherwise the nmi handler
1110 * could reenable again:
1111 */
1112 clear_bit(idx, cpuc->active_mask);
1113 x86_pmu.disable(hwc, idx);
1114
1115 /*
1116 * Make sure the cleared pointer becomes visible before we
1117 * (potentially) free the counter:
1118 */
1119 barrier();
1120
1121 /*
1122 * Drain the remaining delta count out of a counter
1123 * that we are disabling:
1124 */
1125 x86_perf_counter_update(counter, hwc, idx);
1126 cpuc->counters[idx] = NULL;
1127 clear_bit(idx, cpuc->used_mask);
1128}
1129
1130/*
1131 * Save and restart an expired counter. Called by NMI contexts,
1132 * so it has to be careful about preempting normal counter ops:
1133 */
1134static int intel_pmu_save_and_restart(struct perf_counter *counter)
1135{
1136 struct hw_perf_counter *hwc = &counter->hw;
1137 int idx = hwc->idx;
1138 int ret;
1139
1140 x86_perf_counter_update(counter, hwc, idx);
1141 ret = x86_perf_counter_set_period(counter, hwc, idx);
1142
1143 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1144 intel_pmu_enable_counter(hwc, idx);
1145
1146 return ret;
1147}
1148
1149static void intel_pmu_reset(void)
1150{
1151 unsigned long flags;
1152 int idx;
1153
1154 if (!x86_pmu.num_counters)
1155 return;
1156
1157 local_irq_save(flags);
1158
1159 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1160
1161 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1162 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1163 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1164 }
1165 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1166 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1167 }
1168
1169 local_irq_restore(flags);
1170}
1171
1172
1173/*
1174 * This handler is triggered by the local APIC, so the APIC IRQ handling
1175 * rules apply:
1176 */
1177static int intel_pmu_handle_irq(struct pt_regs *regs)
1178{
1179 struct perf_sample_data data;
1180 struct cpu_hw_counters *cpuc;
1181 int bit, cpu, loops;
1182 u64 ack, status;
1183
1184 data.regs = regs;
1185 data.addr = 0;
1186
1187 cpu = smp_processor_id();
1188 cpuc = &per_cpu(cpu_hw_counters, cpu);
1189
1190 perf_disable();
1191 status = intel_pmu_get_status();
1192 if (!status) {
1193 perf_enable();
1194 return 0;
1195 }
1196
1197 loops = 0;
1198again:
1199 if (++loops > 100) {
1200 WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
1201 perf_counter_print_debug();
1202 intel_pmu_reset();
1203 perf_enable();
1204 return 1;
1205 }
1206
1207 inc_irq_stat(apic_perf_irqs);
1208 ack = status;
1209 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1210 struct perf_counter *counter = cpuc->counters[bit];
1211
1212 clear_bit(bit, (unsigned long *) &status);
1213 if (!test_bit(bit, cpuc->active_mask))
1214 continue;
1215
1216 if (!intel_pmu_save_and_restart(counter))
1217 continue;
1218
1219 if (perf_counter_overflow(counter, 1, &data))
1220 intel_pmu_disable_counter(&counter->hw, bit);
1221 }
1222
1223 intel_pmu_ack_status(ack);
1224
1225 /*
1226 * Repeat if there is more work to be done:
1227 */
1228 status = intel_pmu_get_status();
1229 if (status)
1230 goto again;
1231
1232 perf_enable();
1233
1234 return 1;
1235}
1236
1237static int amd_pmu_handle_irq(struct pt_regs *regs)
1238{
1239 struct perf_sample_data data;
1240 struct cpu_hw_counters *cpuc;
1241 struct perf_counter *counter;
1242 struct hw_perf_counter *hwc;
1243 int cpu, idx, handled = 0;
1244 u64 val;
1245
1246 data.regs = regs;
1247 data.addr = 0;
1248
1249 cpu = smp_processor_id();
1250 cpuc = &per_cpu(cpu_hw_counters, cpu);
1251
1252 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1253 if (!test_bit(idx, cpuc->active_mask))
1254 continue;
1255
1256 counter = cpuc->counters[idx];
1257 hwc = &counter->hw;
1258
1259 val = x86_perf_counter_update(counter, hwc, idx);
1260 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
1261 continue;
1262
1263 /*
1264 * counter overflow
1265 */
1266 handled = 1;
1267 data.period = counter->hw.last_period;
1268
1269 if (!x86_perf_counter_set_period(counter, hwc, idx))
1270 continue;
1271
1272 if (perf_counter_overflow(counter, 1, &data))
1273 amd_pmu_disable_counter(hwc, idx);
1274 }
1275
1276 if (handled)
1277 inc_irq_stat(apic_perf_irqs);
1278
1279 return handled;
1280}
1281
1282void smp_perf_pending_interrupt(struct pt_regs *regs)
1283{
1284 irq_enter();
1285 ack_APIC_irq();
1286 inc_irq_stat(apic_pending_irqs);
1287 perf_counter_do_pending();
1288 irq_exit();
1289}
1290
1291void set_perf_counter_pending(void)
1292{
1293 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1294}
1295
1296void perf_counters_lapic_init(void)
1297{
1298 if (!x86_pmu_initialized())
1299 return;
1300
1301 /*
1302 * Always use NMI for PMU
1303 */
1304 apic_write(APIC_LVTPC, APIC_DM_NMI);
1305}
1306
1307static int __kprobes
1308perf_counter_nmi_handler(struct notifier_block *self,
1309 unsigned long cmd, void *__args)
1310{
1311 struct die_args *args = __args;
1312 struct pt_regs *regs;
1313
1314 if (!atomic_read(&active_counters))
1315 return NOTIFY_DONE;
1316
1317 switch (cmd) {
1318 case DIE_NMI:
1319 case DIE_NMI_IPI:
1320 break;
1321
1322 default:
1323 return NOTIFY_DONE;
1324 }
1325
1326 regs = args->regs;
1327
1328 apic_write(APIC_LVTPC, APIC_DM_NMI);
1329 /*
1330 * Can't rely on the handled return value to say it was our NMI, two
1331 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
1332 *
1333 * If the first NMI handles both, the latter will be empty and daze
1334 * the CPU.
1335 */
1336 x86_pmu.handle_irq(regs);
1337
1338 return NOTIFY_STOP;
1339}
1340
1341static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
1342 .notifier_call = perf_counter_nmi_handler,
1343 .next = NULL,
1344 .priority = 1
1345};
1346
1347static struct x86_pmu intel_pmu = {
1348 .name = "Intel",
1349 .handle_irq = intel_pmu_handle_irq,
1350 .disable_all = intel_pmu_disable_all,
1351 .enable_all = intel_pmu_enable_all,
1352 .enable = intel_pmu_enable_counter,
1353 .disable = intel_pmu_disable_counter,
1354 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1355 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1356 .event_map = intel_pmu_event_map,
1357 .raw_event = intel_pmu_raw_event,
1358 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1359 /*
1360 * Intel PMCs cannot be accessed sanely above 32 bit width,
1361 * so we install an artificial 1<<31 period regardless of
1362 * the generic counter period:
1363 */
1364 .max_period = (1ULL << 31) - 1,
1365};
1366
1367static struct x86_pmu amd_pmu = {
1368 .name = "AMD",
1369 .handle_irq = amd_pmu_handle_irq,
1370 .disable_all = amd_pmu_disable_all,
1371 .enable_all = amd_pmu_enable_all,
1372 .enable = amd_pmu_enable_counter,
1373 .disable = amd_pmu_disable_counter,
1374 .eventsel = MSR_K7_EVNTSEL0,
1375 .perfctr = MSR_K7_PERFCTR0,
1376 .event_map = amd_pmu_event_map,
1377 .raw_event = amd_pmu_raw_event,
1378 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1379 .num_counters = 4,
1380 .counter_bits = 48,
1381 .counter_mask = (1ULL << 48) - 1,
1382 /* use highest bit to detect overflow */
1383 .max_period = (1ULL << 47) - 1,
1384};
1385
1386static int intel_pmu_init(void)
1387{
1388 union cpuid10_edx edx;
1389 union cpuid10_eax eax;
1390 unsigned int unused;
1391 unsigned int ebx;
1392 int version;
1393
1394 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
1395 return -ENODEV;
1396
1397 /*
1398 * Check whether the Architectural PerfMon supports
1399 * Branch Misses Retired Event or not.
1400 */
1401 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1402 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1403 return -ENODEV;
1404
1405 version = eax.split.version_id;
1406 if (version < 2)
1407 return -ENODEV;
1408
1409 x86_pmu = intel_pmu;
1410 x86_pmu.version = version;
1411 x86_pmu.num_counters = eax.split.num_counters;
1412 x86_pmu.counter_bits = eax.split.bit_width;
1413 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
1414
1415 /*
1416 * Quirk: v2 perfmon does not report fixed-purpose counters, so
1417 * assume at least 3 counters:
1418 */
1419 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
1420
1421 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1422
1423 /*
1424 * Install the hw-cache-events table:
1425 */
1426 switch (boot_cpu_data.x86_model) {
1427 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1428 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1429 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1430 case 29: /* six-core 45 nm xeon "Dunnington" */
1431 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
1432 sizeof(hw_cache_event_ids));
1433
1434 pr_cont("Core2 events, ");
1435 break;
1436 default:
1437 case 26:
1438 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1439 sizeof(hw_cache_event_ids));
1440
1441 pr_cont("Nehalem/Corei7 events, ");
1442 break;
1443 case 28:
1444 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
1445 sizeof(hw_cache_event_ids));
1446
1447 pr_cont("Atom events, ");
1448 break;
1449 }
1450 return 0;
1451}
1452
1453static int amd_pmu_init(void)
1454{
1455 x86_pmu = amd_pmu;
1456
1457 switch (boot_cpu_data.x86) {
1458 case 0x0f:
1459 case 0x10:
1460 case 0x11:
1461 memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
1462 sizeof(hw_cache_event_ids));
1463
1464 pr_cont("AMD Family 0f/10/11 events, ");
1465 break;
1466 }
1467 return 0;
1468}
1469
1470void __init init_hw_perf_counters(void)
1471{
1472 int err;
1473
1474 pr_info("Performance Counters: ");
1475
1476 switch (boot_cpu_data.x86_vendor) {
1477 case X86_VENDOR_INTEL:
1478 err = intel_pmu_init();
1479 break;
1480 case X86_VENDOR_AMD:
1481 err = amd_pmu_init();
1482 break;
1483 default:
1484 return;
1485 }
1486 if (err != 0) {
1487 pr_cont("no PMU driver, software counters only.\n");
1488 return;
1489 }
1490
1491 pr_cont("%s PMU driver.\n", x86_pmu.name);
1492
1493 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1494 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1495 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1496 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1497 }
1498 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1499 perf_max_counters = x86_pmu.num_counters;
1500
1501 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1502 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1503 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1504 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1505 }
1506
1507 perf_counter_mask |=
1508 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1509
1510 perf_counters_lapic_init();
1511 register_die_notifier(&perf_counter_nmi_notifier);
1512
1513 pr_info("... version: %d\n", x86_pmu.version);
1514 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1515 pr_info("... generic counters: %d\n", x86_pmu.num_counters);
1516 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1517 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1518 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
1519 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1520}
1521
1522static inline void x86_pmu_read(struct perf_counter *counter)
1523{
1524 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1525}
1526
1527static const struct pmu pmu = {
1528 .enable = x86_pmu_enable,
1529 .disable = x86_pmu_disable,
1530 .read = x86_pmu_read,
1531 .unthrottle = x86_pmu_unthrottle,
1532};
1533
1534const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1535{
1536 int err;
1537
1538 err = __hw_perf_counter_init(counter);
1539 if (err)
1540 return ERR_PTR(err);
1541
1542 return &pmu;
1543}
1544
1545/*
1546 * callchain support
1547 */
1548
1549static inline
1550void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1551{
1552 if (entry->nr < MAX_STACK_DEPTH)
1553 entry->ip[entry->nr++] = ip;
1554}
1555
1556static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1557static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1558
1559
/* stacktrace_ops callback: symbol warnings are deliberately dropped. */
static void
backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
{
}
1565
/* stacktrace_ops callback: warnings are deliberately dropped. */
static void backtrace_warning(void *data, char *msg)
{
}
1570
/*
 * stacktrace_ops callback invoked with the name of a stack. IRQ stacks
 * are not handled for now, so this always returns -1.
 */
static int backtrace_stack(void *data, char *name)
{
	return -1;
}
1576
/*
 * stacktrace_ops callback: record @addr in the callchain passed via @data,
 * skipping addresses that are flagged as unreliable.
 */
static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	if (!reliable)
		return;

	callchain_store(entry, addr);
}
1584
1585static const struct stacktrace_ops backtrace_ops = {
1586 .warning = backtrace_warning,
1587 .warning_symbol = backtrace_warning_symbol,
1588 .stack = backtrace_stack,
1589 .address = backtrace_address,
1590};
1591
1592static void
1593perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1594{
1595 unsigned long bp;
1596 char *stack;
1597 int nr = entry->nr;
1598
1599 callchain_store(entry, instruction_pointer(regs));
1600
1601 stack = ((char *)regs + sizeof(struct pt_regs));
1602#ifdef CONFIG_FRAME_POINTER
1603 bp = frame_pointer(regs);
1604#else
1605 bp = 0;
1606#endif
1607
1608 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1609
1610 entry->kernel = entry->nr - nr;
1611}
1612
1613
1614struct stack_frame {
1615 const void __user *next_fp;
1616 unsigned long return_address;
1617};
1618
1619static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1620{
1621 int ret;
1622
1623 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1624 return 0;
1625
1626 ret = 1;
1627 pagefault_disable();
1628 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1629 ret = 0;
1630 pagefault_enable();
1631
1632 return ret;
1633}
1634
1635static void
1636perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1637{
1638 struct stack_frame frame;
1639 const void __user *fp;
1640 int nr = entry->nr;
1641
1642 regs = (struct pt_regs *)current->thread.sp0 - 1;
1643 fp = (void __user *)regs->bp;
1644
1645 callchain_store(entry, regs->ip);
1646
1647 while (entry->nr < MAX_STACK_DEPTH) {
1648 frame.next_fp = NULL;
1649 frame.return_address = 0;
1650
1651 if (!copy_stack_frame(fp, &frame))
1652 break;
1653
1654 if ((unsigned long)fp < user_stack_pointer(regs))
1655 break;
1656
1657 callchain_store(entry, frame.return_address);
1658 fp = frame.next_fp;
1659 }
1660
1661 entry->user = entry->nr - nr;
1662}
1663
1664static void
1665perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1666{
1667 int is_user;
1668
1669 if (!regs)
1670 return;
1671
1672 is_user = user_mode(regs);
1673
1674 if (!current || current->pid == 0)
1675 return;
1676
1677 if (is_user && current->state != TASK_RUNNING)
1678 return;
1679
1680 if (!is_user)
1681 perf_callchain_kernel(regs, entry);
1682
1683 if (current->mm)
1684 perf_callchain_user(regs, entry);
1685}
1686
1687struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1688{
1689 struct perf_callchain_entry *entry;
1690
1691 if (in_nmi())
1692 entry = &__get_cpu_var(nmi_entry);
1693 else
1694 entry = &__get_cpu_var(irq_entry);
1695
1696 entry->nr = 0;
1697 entry->hv = 0;
1698 entry->kernel = 0;
1699 entry->user = 0;
1700
1701 perf_do_callchain(regs, entry);
1702
1703 return entry;
1704}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765a..48bfe1386038 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */ 20 */
21 21
22 22#include <linux/kernel.h>
23#include <asm/ds.h>
24
25#include <linux/errno.h>
26#include <linux/string.h> 23#include <linux/string.h>
27#include <linux/slab.h> 24#include <linux/errno.h>
28#include <linux/sched.h> 25#include <linux/sched.h>
26#include <linux/slab.h>
29#include <linux/mm.h> 27#include <linux/mm.h>
30#include <linux/kernel.h> 28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31 31
32#include "ds_selftest.h"
32 33
33/* 34/*
34 * The configuration for a particular DS hardware implementation. 35 * The configuration for a particular DS hardware implementation:
35 */ 36 */
36struct ds_configuration { 37struct ds_configuration {
37 /* the name of the configuration */ 38 /* The name of the configuration: */
38 const char *name; 39 const char *name;
39 /* the size of one pointer-typed field in the DS structure and 40
40 in the BTS and PEBS buffers in bytes; 41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
41 this covers the first 8 DS fields related to buffer management. */ 42 unsigned char sizeof_ptr_field;
42 unsigned char sizeof_field; 43
43 /* the size of a BTS/PEBS record in bytes */ 44 /* The size of a BTS/PEBS record in bytes: */
44 unsigned char sizeof_rec[2]; 45 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed 46
46 * by enum ds_feature */ 47 /* The number of pebs counter reset values in the DS structure. */
47 unsigned long ctl[dsf_ctl_max]; 48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
48}; 52};
49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); 53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
50 58
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) 59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
52 61
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ 62/* BTS and PEBS buffer alignment: */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ 63#define DS_ALIGNMENT (1 << 3)
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56 64
57#define BTS_CONTROL \ 65/* Number of buffer pointers in DS: */
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ 66#define NUM_DS_PTR_FIELDS 8
59 ds_cfg.ctl[dsf_bts_overflow])
60 67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
61 77
62/* 78/*
63 * A BTS or PEBS tracer. 79 * A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
66 * to identify tracers. 82 * to identify tracers.
67 */ 83 */
68struct ds_tracer { 84struct ds_tracer {
69 /* the DS context (partially) owned by this tracer */ 85 /* The DS context (partially) owned by this tracer. */
70 struct ds_context *context; 86 struct ds_context *context;
71 /* the buffer provided on ds_request() and its size in bytes */ 87 /* The buffer provided on ds_request() and its size in bytes. */
72 void *buffer; 88 void *buffer;
73 size_t size; 89 size_t size;
74}; 90};
75 91
76struct bts_tracer { 92struct bts_tracer {
77 /* the common DS part */ 93 /* The common DS part: */
78 struct ds_tracer ds; 94 struct ds_tracer ds;
79 /* the trace including the DS configuration */ 95
80 struct bts_trace trace; 96 /* The trace including the DS configuration: */
81 /* buffer overflow notification function */ 97 struct bts_trace trace;
82 bts_ovfl_callback_t ovfl; 98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
83}; 104};
84 105
85struct pebs_tracer { 106struct pebs_tracer {
86 /* the common DS part */ 107 /* The common DS part: */
87 struct ds_tracer ds; 108 struct ds_tracer ds;
88 /* the trace including the DS configuration */ 109
89 struct pebs_trace trace; 110 /* The trace including the DS configuration: */
90 /* buffer overflow notification function */ 111 struct pebs_trace trace;
91 pebs_ovfl_callback_t ovfl; 112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
92}; 115};
93 116
94/* 117/*
@@ -97,6 +120,7 @@ struct pebs_tracer {
97 * 120 *
98 * The DS configuration consists of the following fields; different 121 * The DS configuration consists of the following fields; different
99 * architetures vary in the size of those fields. 122 * architetures vary in the size of those fields.
123 *
100 * - double-word aligned base linear address of the BTS buffer 124 * - double-word aligned base linear address of the BTS buffer
101 * - write pointer into the BTS buffer 125 * - write pointer into the BTS buffer
102 * - end linear address of the BTS buffer (one byte beyond the end of 126 * - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
135}; 159};
136 160
137enum ds_qualifier { 161enum ds_qualifier {
138 ds_bts = 0, 162 ds_bts = 0,
139 ds_pebs 163 ds_pebs
140}; 164};
141 165
142static inline unsigned long ds_get(const unsigned char *base, 166static inline unsigned long
143 enum ds_qualifier qual, enum ds_field field) 167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
144{ 168{
145 base += (ds_cfg.sizeof_field * (field + (4 * qual))); 169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
146 return *(unsigned long *)base; 170 return *(unsigned long *)base;
147} 171}
148 172
149static inline void ds_set(unsigned char *base, enum ds_qualifier qual, 173static inline void
150 enum ds_field field, unsigned long value) 174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
151{ 176{
152 base += (ds_cfg.sizeof_field * (field + (4 * qual))); 177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
153 (*(unsigned long *)base) = value; 178 (*(unsigned long *)base) = value;
154} 179}
155 180
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
159 */ 184 */
160static DEFINE_SPINLOCK(ds_lock); 185static DEFINE_SPINLOCK(ds_lock);
161 186
162
163/* 187/*
164 * We either support (system-wide) per-cpu or per-thread allocation. 188 * We either support (system-wide) per-cpu or per-thread allocation.
165 * We distinguish the two based on the task_struct pointer, where a 189 * We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
178 */ 202 */
179static atomic_t tracers = ATOMIC_INIT(0); 203static atomic_t tracers = ATOMIC_INIT(0);
180 204
181static inline void get_tracer(struct task_struct *task) 205static inline int get_tracer(struct task_struct *task)
182{ 206{
183 if (task) 207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
184 atomic_inc(&tracers); 215 atomic_inc(&tracers);
185 else 216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
186 atomic_dec(&tracers); 220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
187} 227}
188 228
189static inline void put_tracer(struct task_struct *task) 229static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
194 atomic_inc(&tracers); 234 atomic_inc(&tracers);
195} 235}
196 236
197static inline int check_tracer(struct task_struct *task)
198{
199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
202}
203
204
205/* 237/*
206 * The DS context is either attached to a thread or to a cpu: 238 * The DS context is either attached to a thread or to a cpu:
207 * - in the former case, the thread_struct contains a pointer to the 239 * - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
213 * deallocated when the last user puts the context. 245 * deallocated when the last user puts the context.
214 */ 246 */
215struct ds_context { 247struct ds_context {
216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ 248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
217 unsigned char ds[MAX_SIZEOF_DS]; 249 unsigned char ds[MAX_SIZEOF_DS];
218 /* the owner of the BTS and PEBS configuration, respectively */ 250
219 struct bts_tracer *bts_master; 251 /* The owner of the BTS and PEBS configuration, respectively: */
220 struct pebs_tracer *pebs_master; 252 struct bts_tracer *bts_master;
221 /* use count */ 253 struct pebs_tracer *pebs_master;
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230 254
231static DEFINE_PER_CPU(struct ds_context *, system_context_array); 255 /* Use count: */
256 unsigned long count;
232 257
233#define system_context per_cpu(system_context_array, smp_processor_id()) 258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
234 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context);
235 269
236static inline struct ds_context *ds_get_context(struct task_struct *task) 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
237{ 272{
238 struct ds_context **p_context = 273 struct ds_context **p_context =
239 (task ? &task->thread.ds_ctx : &system_context); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
240 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
242 unsigned long irq;
243 277
244 /* Chances are small that we already have a context. */ 278 /* Chances are small that we already have a context. */
245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); 279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
246 if (!new_context) 280 if (!new_context)
247 return NULL; 281 return NULL;
248 282
249 spin_lock_irqsave(&ds_lock, irq); 283 spin_lock_irq(&ds_lock);
250 284
251 context = *p_context; 285 context = *p_context;
252 if (!context) { 286 if (likely(!context)) {
253 context = new_context; 287 context = new_context;
254 288
255 context->this = p_context; 289 context->this = p_context;
256 context->task = task; 290 context->task = task;
291 context->cpu = cpu;
257 context->count = 0; 292 context->count = 0;
258 293
259 if (task)
260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
261
262 if (!task || (task == current))
263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
264
265 *p_context = context; 294 *p_context = context;
266 } 295 }
267 296
268 context->count++; 297 context->count++;
269 298
270 spin_unlock_irqrestore(&ds_lock, irq); 299 spin_unlock_irq(&ds_lock);
271 300
272 if (context != new_context) 301 if (context != new_context)
273 kfree(new_context); 302 kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
275 return context; 304 return context;
276} 305}
277 306
278static inline void ds_put_context(struct ds_context *context) 307static void ds_put_context(struct ds_context *context)
279{ 308{
309 struct task_struct *task;
280 unsigned long irq; 310 unsigned long irq;
281 311
282 if (!context) 312 if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
291 321
292 *(context->this) = NULL; 322 *(context->this) = NULL;
293 323
294 if (context->task) 324 task = context->task;
295 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); 325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
296 328
297 if (!context->task || (context->task == current)) 329 /*
298 wrmsrl(MSR_IA32_DS_AREA, 0); 330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
299 338
300 spin_unlock_irqrestore(&ds_lock, irq); 339 spin_unlock_irqrestore(&ds_lock, irq);
301 340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
302 kfree(context); 345 kfree(context);
303} 346}
304 347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
361 * the same configuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
305 373
306/* 374/*
307 * Call the tracer's callback on a buffer overflow. 375 * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
332 * The remainder of any partially written record is zeroed out. 400 * The remainder of any partially written record is zeroed out.
333 * 401 *
334 * context: the DS context 402 * context: the DS context
335 * qual: the buffer type 403 * qual: the buffer type
336 * record: the data to write 404 * record: the data to write
337 * size: the size of the data 405 * size: the size of the data
338 */ 406 */
339static int ds_write(struct ds_context *context, enum ds_qualifier qual, 407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size) 408 const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
349 unsigned long write_size, adj_write_size; 417 unsigned long write_size, adj_write_size;
350 418
351 /* 419 /*
352 * write as much as possible without producing an 420 * Write as much as possible without producing an
353 * overflow interrupt. 421 * overflow interrupt.
354 * 422 *
355 * interrupt_threshold must either be 423 * Interrupt_threshold must either be
356 * - bigger than absolute_maximum or 424 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum 425 * - point to a record between buffer_base and absolute_maximum
358 * 426 *
359 * index points to a valid record. 427 * Index points to a valid record.
360 */ 428 */
361 base = ds_get(context->ds, qual, ds_buffer_base); 429 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index); 430 index = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
365 433
366 write_end = min(end, int_th); 434 write_end = min(end, int_th);
367 435
368 /* if we are already beyond the interrupt threshold, 436 /*
369 * we fill the entire buffer */ 437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
370 if (write_end <= index) 440 if (write_end <= index)
371 write_end = end; 441 write_end = end;
372 442
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual]; 454 adj_write_size *= ds_cfg.sizeof_rec[qual];
385 455
386 /* zero out trailing bytes */ 456 /* Zero out trailing bytes. */
387 memset((char *)index + write_size, 0, 457 memset((char *)index + write_size, 0,
388 adj_write_size - write_size); 458 adj_write_size - write_size);
389 index += adj_write_size; 459 index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
410 * Later architectures use 64bit pointers throughout, whereas earlier 480 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode. 481 * architectures use 32bit pointers in 32bit mode.
412 * 482 *
413 * We compute the base address for the first 8 fields based on: 483 * We compute the base address for the fields based on:
414 * - the field size stored in the DS configuration 484 * - the field size stored in the DS configuration
415 * - the relative field position 485 * - the relative field position
416 * 486 *
@@ -431,23 +501,23 @@ enum bts_field {
431 bts_to, 501 bts_to,
432 bts_flags, 502 bts_flags,
433 503
434 bts_qual = bts_from, 504 bts_qual = bts_from,
435 bts_jiffies = bts_to, 505 bts_clock = bts_to,
436 bts_pid = bts_flags, 506 bts_pid = bts_flags,
437 507
438 bts_qual_mask = (bts_qual_max - 1), 508 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask) 509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440}; 510};
441 511
442static inline unsigned long bts_get(const char *base, enum bts_field field) 512static inline unsigned long bts_get(const char *base, enum bts_field field)
443{ 513{
444 base += (ds_cfg.sizeof_field * field); 514 base += (ds_cfg.sizeof_ptr_field * field);
445 return *(unsigned long *)base; 515 return *(unsigned long *)base;
446} 516}
447 517
448static inline void bts_set(char *base, enum bts_field field, unsigned long val) 518static inline void bts_set(char *base, enum bts_field field, unsigned long val)
449{ 519{
450 base += (ds_cfg.sizeof_field * field);; 520 base += (ds_cfg.sizeof_ptr_field * field);;
451 (*(unsigned long *)base) = val; 521 (*(unsigned long *)base) = val;
452} 522}
453 523
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
463 * 533 *
464 * return: bytes read/written on success; -Eerrno, otherwise 534 * return: bytes read/written on success; -Eerrno, otherwise
465 */ 535 */
466static int bts_read(struct bts_tracer *tracer, const void *at, 536static int
467 struct bts_struct *out) 537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
468{ 538{
469 if (!tracer) 539 if (!tracer)
470 return -EINVAL; 540 return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
478 memset(out, 0, sizeof(*out)); 548 memset(out, 0, sizeof(*out));
479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { 549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); 550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); 551 out->variant.event.clock = bts_get(at, bts_clock);
482 out->variant.timestamp.pid = bts_get(at, bts_pid); 552 out->variant.event.pid = bts_get(at, bts_pid);
483 } else { 553 } else {
484 out->qualifier = bts_branch; 554 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from); 555 out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
516 case bts_task_arrives: 586 case bts_task_arrives:
517 case bts_task_departs: 587 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier)); 588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); 589 bts_set(raw, bts_clock, in->variant.event.clock);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid); 590 bts_set(raw, bts_pid, in->variant.event.pid);
521 break; 591 break;
522 default: 592 default:
523 return -EINVAL; 593 return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
555 unsigned int flags) { 625 unsigned int flags) {
556 unsigned long buffer, adj; 626 unsigned long buffer, adj;
557 627
558 /* adjust the buffer address and size to meet alignment 628 /*
629 * Adjust the buffer address and size to meet alignment
559 * constraints: 630 * constraints:
560 * - buffer is double-word aligned 631 * - buffer is double-word aligned
561 * - size is multiple of record size 632 * - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
577 trace->begin = (void *)buffer; 648 trace->begin = (void *)buffer;
578 trace->top = trace->begin; 649 trace->top = trace->begin;
579 trace->end = (void *)(buffer + size); 650 trace->end = (void *)(buffer + size);
580 /* The value for 'no threshold' is -1, which will set the 651 /*
652 * The value for 'no threshold' is -1, which will set the
581 * threshold outside of the buffer, just like we want it. 653 * threshold outside of the buffer, just like we want it.
582 */ 654 */
655 ith *= ds_cfg.sizeof_rec[qual];
583 trace->ith = (void *)(buffer + size - ith); 656 trace->ith = (void *)(buffer + size - ith);
584 657
585 trace->flags = flags; 658 trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
588 661
589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, 662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
590 enum ds_qualifier qual, struct task_struct *task, 663 enum ds_qualifier qual, struct task_struct *task,
591 void *base, size_t size, size_t th, unsigned int flags) 664 int cpu, void *base, size_t size, size_t th)
592{ 665{
593 struct ds_context *context; 666 struct ds_context *context;
594 int error; 667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
595 673
596 error = -EINVAL; 674 error = -EINVAL;
597 if (!base) 675 if (!base)
598 goto out; 676 goto out;
599 677
600 /* we require some space to do alignment adjustments below */ 678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
601 error = -EINVAL; 683 error = -EINVAL;
602 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) 684 if (size < req_size)
603 goto out; 685 goto out;
604 686
605 if (th != (size_t)-1) { 687 if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
614 tracer->size = size; 696 tracer->size = size;
615 697
616 error = -ENOMEM; 698 error = -ENOMEM;
617 context = ds_get_context(task); 699 context = ds_get_context(task, cpu);
618 if (!context) 700 if (!context)
619 goto out; 701 goto out;
620 tracer->context = context; 702 tracer->context = context;
621 703
622 ds_init_ds_trace(trace, qual, base, size, th, flags); 704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
623 708
624 error = 0; 709 error = 0;
625 out: 710 out:
626 return error; 711 return error;
627} 712}
628 713
629struct bts_tracer *ds_request_bts(struct task_struct *task, 714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
630 void *base, size_t size, 715 void *base, size_t size,
631 bts_ovfl_callback_t ovfl, size_t th, 716 bts_ovfl_callback_t ovfl, size_t th,
632 unsigned int flags) 717 unsigned int flags)
633{ 718{
634 struct bts_tracer *tracer; 719 struct bts_tracer *tracer;
635 unsigned long irq;
636 int error; 720 int error;
637 721
722 /* Buffer overflow notification is not yet implemented. */
638 error = -EOPNOTSUPP; 723 error = -EOPNOTSUPP;
639 if (!ds_cfg.ctl[dsf_bts]) 724 if (ovfl)
640 goto out; 725 goto out;
641 726
642 /* buffer overflow notification is not yet implemented */ 727 error = get_tracer(task);
643 error = -EOPNOTSUPP; 728 if (error < 0)
644 if (ovfl)
645 goto out; 729 goto out;
646 730
647 error = -ENOMEM; 731 error = -ENOMEM;
648 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); 732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
649 if (!tracer) 733 if (!tracer)
650 goto out; 734 goto out_put_tracer;
651 tracer->ovfl = ovfl; 735 tracer->ovfl = ovfl;
652 736
737 /* Do some more error checking and acquire a tracing context. */
653 error = ds_request(&tracer->ds, &tracer->trace.ds, 738 error = ds_request(&tracer->ds, &tracer->trace.ds,
654 ds_bts, task, base, size, th, flags); 739 ds_bts, task, cpu, base, size, th);
655 if (error < 0) 740 if (error < 0)
656 goto out_tracer; 741 goto out_tracer;
657 742
658 743 /* Claim the bts part of the tracing context we acquired above. */
659 spin_lock_irqsave(&ds_lock, irq); 744 spin_lock_irq(&ds_lock);
660
661 error = -EPERM;
662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
665 745
666 error = -EPERM; 746 error = -EPERM;
667 if (tracer->ds.context->bts_master) 747 if (tracer->ds.context->bts_master)
668 goto out_put_tracer; 748 goto out_unlock;
669 tracer->ds.context->bts_master = tracer; 749 tracer->ds.context->bts_master = tracer;
670 750
671 spin_unlock_irqrestore(&ds_lock, irq); 751 spin_unlock_irq(&ds_lock);
672 752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
673 760
674 tracer->trace.read = bts_read; 761 tracer->trace.read = bts_read;
675 tracer->trace.write = bts_write; 762 tracer->trace.write = bts_write;
676 763
677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); 764 /* Start tracing. */
678 ds_resume_bts(tracer); 765 ds_resume_bts(tracer);
679 766
680 return tracer; 767 return tracer;
681 768
682 out_put_tracer:
683 put_tracer(task);
684 out_unlock: 769 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq); 770 spin_unlock_irq(&ds_lock);
686 ds_put_context(tracer->ds.context); 771 ds_put_context(tracer->ds.context);
687 out_tracer: 772 out_tracer:
688 kfree(tracer); 773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
689 out: 776 out:
690 return ERR_PTR(error); 777 return ERR_PTR(error);
691} 778}
692 779
693struct pebs_tracer *ds_request_pebs(struct task_struct *task, 780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
694 void *base, size_t size, 781 void *base, size_t size,
695 pebs_ovfl_callback_t ovfl, size_t th, 782 bts_ovfl_callback_t ovfl,
696 unsigned int flags) 783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
697{ 799{
698 struct pebs_tracer *tracer; 800 struct pebs_tracer *tracer;
699 unsigned long irq;
700 int error; 801 int error;
701 802
702 /* buffer overflow notification is not yet implemented */ 803 /* Buffer overflow notification is not yet implemented. */
703 error = -EOPNOTSUPP; 804 error = -EOPNOTSUPP;
704 if (ovfl) 805 if (ovfl)
705 goto out; 806 goto out;
706 807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
707 error = -ENOMEM; 812 error = -ENOMEM;
708 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); 813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
709 if (!tracer) 814 if (!tracer)
710 goto out; 815 goto out_put_tracer;
711 tracer->ovfl = ovfl; 816 tracer->ovfl = ovfl;
712 817
818 /* Do some more error checking and acquire a tracing context. */
713 error = ds_request(&tracer->ds, &tracer->trace.ds, 819 error = ds_request(&tracer->ds, &tracer->trace.ds,
714 ds_pebs, task, base, size, th, flags); 820 ds_pebs, task, cpu, base, size, th);
715 if (error < 0) 821 if (error < 0)
716 goto out_tracer; 822 goto out_tracer;
717 823
718 spin_lock_irqsave(&ds_lock, irq); 824 /* Claim the pebs part of the tracing context we acquired above. */
719 825 spin_lock_irq(&ds_lock);
720 error = -EPERM;
721 if (!check_tracer(task))
722 goto out_unlock;
723 get_tracer(task);
724 826
725 error = -EPERM; 827 error = -EPERM;
726 if (tracer->ds.context->pebs_master) 828 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer; 829 goto out_unlock;
728 tracer->ds.context->pebs_master = tracer; 830 tracer->ds.context->pebs_master = tracer;
729 831
730 spin_unlock_irqrestore(&ds_lock, irq); 832 spin_unlock_irq(&ds_lock);
731 833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
732 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); 839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
733 ds_resume_pebs(tracer); 843 ds_resume_pebs(tracer);
734 844
735 return tracer; 845 return tracer;
736 846
737 out_put_tracer:
738 put_tracer(task);
739 out_unlock: 847 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq); 848 spin_unlock_irq(&ds_lock);
741 ds_put_context(tracer->ds.context); 849 ds_put_context(tracer->ds.context);
742 out_tracer: 850 out_tracer:
743 kfree(tracer); 851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
744 out: 854 out:
745 return ERR_PTR(error); 855 return ERR_PTR(error);
746} 856}
747 857
748void ds_release_bts(struct bts_tracer *tracer) 858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
749{ 862{
750 if (!tracer) 863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
751 return; 864}
752 865
753 ds_suspend_bts(tracer); 866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
754 878
755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); 879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
756 tracer->ds.context->bts_master = NULL; 880 tracer->ds.context->bts_master = NULL;
757 881
758 put_tracer(tracer->ds.context->task); 882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
759 ds_put_context(tracer->ds.context); 886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
760 888
761 kfree(tracer); 889 kfree(tracer);
762} 890}
763 891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
764void ds_suspend_bts(struct bts_tracer *tracer) 945void ds_suspend_bts(struct bts_tracer *tracer)
765{ 946{
766 struct task_struct *task; 947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
767 950
768 if (!tracer) 951 if (!tracer)
769 return; 952 return;
770 953
954 tracer->flags = 0;
955
771 task = tracer->ds.context->task; 956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
772 958
773 if (!task || (task == current)) 959 WARN_ON(!task && irqs_disabled());
774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
775 960
776 if (task) { 961 debugctlmsr = (task ?
777 task->thread.debugctlmsr &= ~BTS_CONTROL; 962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
778 965
779 if (!task->thread.debugctlmsr) 966 if (task)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); 967 update_task_debugctlmsr(task, debugctlmsr);
781 } 968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
782} 970}
783 971
784void ds_resume_bts(struct bts_tracer *tracer) 972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
785{ 973{
786 struct task_struct *task; 974 struct task_struct *task;
787 unsigned long control; 975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
788 977
789 if (!tracer) 978 if (!tracer)
790 return; 979 return 0;
980
981 tracer->flags = 0;
791 982
792 task = tracer->ds.context->task; 983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
793 1011
794 control = ds_cfg.ctl[dsf_bts]; 1012 control = ds_cfg.ctl[dsf_bts];
795 if (!(tracer->trace.ds.flags & BTS_KERNEL)) 1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
797 if (!(tracer->trace.ds.flags & BTS_USER)) 1015 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user]; 1016 control |= ds_cfg.ctl[dsf_bts_user];
799 1017
800 if (task) { 1018 return control;
801 task->thread.debugctlmsr |= control;
802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
803 }
804
805 if (!task || (task == current))
806 update_debugctlmsr(get_debugctlmsr() | control);
807} 1019}
808 1020
809void ds_release_pebs(struct pebs_tracer *tracer) 1021void ds_resume_bts(struct bts_tracer *tracer)
810{ 1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
811 if (!tracer) 1027 if (!tracer)
812 return; 1028 return;
813 1029
814 ds_suspend_pebs(tracer); 1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
815 1089
816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); 1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
817 tracer->ds.context->pebs_master = NULL; 1091 tracer->ds.context->pebs_master = NULL;
818 1092
819 put_tracer(tracer->ds.context->task);
820 ds_put_context(tracer->ds.context); 1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
821 1095
822 kfree(tracer); 1096 kfree(tracer);
823} 1097}
824 1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
825void ds_suspend_pebs(struct pebs_tracer *tracer) 1141void ds_suspend_pebs(struct pebs_tracer *tracer)
826{ 1142{
827 1143
828} 1144}
829 1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
830void ds_resume_pebs(struct pebs_tracer *tracer) 1151void ds_resume_pebs(struct pebs_tracer *tracer)
831{ 1152{
832 1153
833} 1154}
834 1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) 1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
836{ 1162{
837 if (!tracer) 1163 if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
847 return NULL; 1173 return NULL;
848 1174
849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); 1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value = 1176
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); 1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
852 1182
853 return &tracer->trace; 1183 return &tracer->trace;
854} 1184}
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
873 1203
874 tracer->trace.ds.top = tracer->trace.ds.begin; 1204 tracer->trace.ds.top = tracer->trace.ds.begin;
875 1205
876 ds_set(tracer->ds.context->ds, ds_bts, ds_index, 1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
877 (unsigned long)tracer->trace.ds.top); 1207 (unsigned long)tracer->trace.ds.top);
878 1208
879 return 0; 1209 return 0;
880} 1210}
881 1211
882int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) 1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
883{ 1214{
884 if (!tracer) 1215 if (!tracer)
885 return -EINVAL; 1216 return -EINVAL;
886 1217
887 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; 1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
888 1224
889 return 0; 1225 return 0;
890} 1226}
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
894 .ctl[dsf_bts] = (1 << 2) | (1 << 3), 1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
895 .ctl[dsf_bts_kernel] = (1 << 5), 1231 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6), 1232 .ctl[dsf_bts_user] = (1 << 6),
897 1233 .nr_counter_reset = 1,
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
900#ifdef __i386__
901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
902#else
903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
904#endif
905}; 1234};
906static const struct ds_configuration ds_cfg_pentium_m = { 1235static const struct ds_configuration ds_cfg_pentium_m = {
907 .name = "Pentium M", 1236 .name = "Pentium M",
908 .ctl[dsf_bts] = (1 << 6) | (1 << 7), 1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
909 1238 .nr_counter_reset = 1,
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
912#ifdef __i386__
913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
914#else
915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
916#endif
917}; 1239};
918static const struct ds_configuration ds_cfg_core2_atom = { 1240static const struct ds_configuration ds_cfg_core2_atom = {
919 .name = "Core 2/Atom", 1241 .name = "Core 2/Atom",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7), 1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9), 1243 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10), 1244 .ctl[dsf_bts_user] = (1 << 10),
923 1245 .nr_counter_reset = 1,
924 .sizeof_field = 8, 1246};
925 .sizeof_rec[ds_bts] = 8 * 3, 1247static const struct ds_configuration ds_cfg_core_i7 = {
926 .sizeof_rec[ds_pebs] = 8 * 18, 1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
927}; 1253};
928 1254
929static void 1255static void
930ds_configure(const struct ds_configuration *cfg) 1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
931{ 1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
932 memset(&ds_cfg, 0, sizeof(ds_cfg)); 1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
933 ds_cfg = *cfg; 1296 ds_cfg = *cfg;
934 1297
935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); 1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
936 1303
937 if (!cpu_has_bts) { 1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
938 ds_cfg.ctl[dsf_bts] = 0; 1305 ds_cfg.sizeof_rec[ds_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n"); 1306 printk(KERN_INFO "[ds] bts not available\n");
940 } 1307 }
941 if (!cpu_has_pebs) 1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
942 printk(KERN_INFO "[ds] pebs not available\n"); 1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
943 1317
944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); 1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
945} 1319}
946 1320
947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
948{ 1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
949 switch (c->x86) { 1327 switch (c->x86) {
950 case 0x6: 1328 case 0x6:
951 switch (c->x86_model) { 1329 switch (c->x86_model) {
952 case 0x9: 1330 case 0x9:
953 case 0xd: /* Pentium M */ 1331 case 0xd: /* Pentium M */
954 ds_configure(&ds_cfg_pentium_m); 1332 ds_configure(&ds_cfg_pentium_m, c);
955 break; 1333 break;
956 case 0xf: 1334 case 0xf:
957 case 0x17: /* Core2 */ 1335 case 0x17: /* Core2 */
958 case 0x1c: /* Atom */ 1336 case 0x1c: /* Atom */
959 ds_configure(&ds_cfg_core2_atom); 1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
960 break; 1341 break;
961 case 0x1a: /* i7 */
962 default: 1342 default:
963 /* sorry, don't know about them */ 1343 /* Sorry, don't know about them. */
964 break; 1344 break;
965 } 1345 }
966 break; 1346 break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
969 case 0x0: 1349 case 0x0:
970 case 0x1: 1350 case 0x1:
971 case 0x2: /* Netburst */ 1351 case 0x2: /* Netburst */
972 ds_configure(&ds_cfg_netburst); 1352 ds_configure(&ds_cfg_netburst, c);
973 break; 1353 break;
974 default: 1354 default:
975 /* sorry, don't know about them */ 1355 /* Sorry, don't know about them. */
976 break; 1356 break;
977 } 1357 }
978 break; 1358 break;
979 default: 1359 default:
980 /* sorry, don't know about them */ 1360 /* Sorry, don't know about them. */
981 break; 1361 break;
982 } 1362 }
983} 1363}
984 1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
985/* 1386/*
986 * Change the DS configuration from tracing prev to tracing next. 1387 * Change the DS configuration from tracing prev to tracing next.
987 */ 1388 */
988void ds_switch_to(struct task_struct *prev, struct task_struct *next) 1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
989{ 1390{
990 struct ds_context *prev_ctx = prev->thread.ds_ctx; 1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
991 struct ds_context *next_ctx = next->thread.ds_ctx; 1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
992 1397
993 if (prev_ctx) { 1398 if (prev_ctx) {
994 update_debugctlmsr(0); 1399 update_debugctlmsr(0);
995 1400
996 if (prev_ctx->bts_master && 1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
997 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
998 struct bts_struct ts = {
999 .qualifier = bts_task_departs,
1000 .variant.timestamp.jiffies = jiffies_64,
1001 .variant.timestamp.pid = prev->pid
1002 };
1003 bts_write(prev_ctx->bts_master, &ts);
1004 }
1005 } 1402 }
1006 1403
1007 if (next_ctx) { 1404 if (next_ctx) {
1008 if (next_ctx->bts_master && 1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1009 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1010 struct bts_struct ts = {
1011 .qualifier = bts_task_arrives,
1012 .variant.timestamp.jiffies = jiffies_64,
1013 .variant.timestamp.pid = next->pid
1014 };
1015 bts_write(next_ctx->bts_master, &ts);
1016 }
1017 1406
1018 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); 1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1019 } 1408 }
1020 1409
1021 update_debugctlmsr(next->thread.debugctlmsr); 1410 update_debugctlmsr(debugctlmsr);
1022} 1411}
1023 1412
1024void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) 1413static __init int ds_selftest(void)
1025{ 1414{
1026 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); 1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1027 tsk->thread.ds_ctx = NULL; 1416 int error;
1028}
1029 1417
1030void ds_exit_thread(struct task_struct *tsk) 1418 error = ds_selftest_bts();
1031{ 1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1032} 1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 000000000000..6bc7c199ab99
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */

/*
 * State handed to ds_selftest_bts_cpu() for one test run.
 */
struct ds_selftest_bts_conf {
	/* Tracer under test; checked with IS_ERR() before it is used. */
	struct bts_tracer *tracer;

	/* Outcome of the most recent test step; negative means failure. */
	int error;

	/* Suspend/resume implementation the run should exercise. */
	int (*suspend)(struct bts_tracer *);
	int (*resume)(struct bts_tracer *);
};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
59 * We allow top in [begin; end], since its not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151 /* We should meanwhile have enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
/*
 * Adapter that gives ds_suspend_bts() the int-returning signature used by
 * struct ds_selftest_bts_conf. Always reports success.
 */
static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
{
	ds_suspend_bts(tracer);

	return 0;
}
254
/*
 * Adapter that gives ds_resume_bts() the int-returning signature used by
 * struct ds_selftest_bts_conf. Always reports success.
 */
static int ds_resume_bts_wrap(struct bts_tracer *tracer)
{
	ds_resume_bts(tracer);

	return 0;
}
260
/*
 * Adapter with a void (*)(void *) signature around ds_release_bts_noirq();
 * the result of the release is deliberately discarded.
 */
static void ds_release_bts_noirq_wrap(void *tracer)
{
	(void)ds_release_bts_noirq(tracer);
}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
308 /* Try to request cpu tracing while task tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
/*
 * PEBS selftest entry point. No checks are performed; it always reports
 * success.
 */
int ds_selftest_pebs(void)
{
	return 0;
}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 000000000000..2ba8745c6663
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
#ifndef DS_SELFTEST_H
#define DS_SELFTEST_H

/*
 * Selftest entry points. Each returns 0 on success.
 *
 * Without CONFIG_X86_DS_SELFTEST the tests are replaced by inline stubs
 * that always return 0. The include guard keeps those stubs from being
 * defined twice if the header is included more than once.
 */
#ifdef CONFIG_X86_DS_SELFTEST
extern int ds_selftest_bts(void);
extern int ds_selftest_pebs(void);
#else
static inline int ds_selftest_bts(void) { return 0; }
static inline int ds_selftest_pebs(void) { return 0; }
#endif

#endif /* DS_SELFTEST_H */
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b8698..81086c227ab7 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl); 29 unsigned long *sp, unsigned long bp, char *log_lvl);
30 30
31extern unsigned int code_bytes; 31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33 32
34/* The form of the top of the frame on the stack */ 33/* The form of the top of the frame on the stack */
35struct stack_frame { 34struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 006281302925..7271fa33d791 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
617 */ 617 */
618__init void e820_setup_gap(void) 618__init void e820_setup_gap(void)
619{ 619{
620 unsigned long gapstart, gapsize, round; 620 unsigned long gapstart, gapsize;
621 int found; 621 int found;
622 622
623 gapstart = 0x10000000; 623 gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
635#endif 635#endif
636 636
637 /* 637 /*
638 * See how much we want to round up: start off with 638 * e820_reserve_resources_late protect stolen RAM already
639 * rounding to the next 1MB area.
640 */ 639 */
641 round = 0x100000; 640 pci_mem_start = gapstart;
642 while ((gapsize >> 4) > round)
643 round += round;
644 /* Fun with two's complement */
645 pci_mem_start = (gapstart + round) & -round;
646 641
647 printk(KERN_INFO 642 printk(KERN_INFO
648 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 643 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
1371 } 1366 }
1372} 1367}
1373 1368
1369/* How much should we pad RAM ending depending on where it is? */
1370static unsigned long ram_alignment(resource_size_t pos)
1371{
1372 unsigned long mb = pos >> 20;
1373
1374 /* To 64kB in the first megabyte */
1375 if (!mb)
1376 return 64*1024;
1377
1378 /* To 1MB in the first 16MB */
1379 if (mb < 16)
1380 return 1024*1024;
1381
1382 /* To 32MB for anything above that */
1383 return 32*1024*1024;
1384}
1385
1374void __init e820_reserve_resources_late(void) 1386void __init e820_reserve_resources_late(void)
1375{ 1387{
1376 int i; 1388 int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
1382 insert_resource_expand_to_fit(&iomem_resource, res); 1394 insert_resource_expand_to_fit(&iomem_resource, res);
1383 res++; 1395 res++;
1384 } 1396 }
1397
1398 /*
1399 * Try to bump up RAM regions to reasonable boundaries to
1400 * avoid stolen RAM:
1401 */
1402 for (i = 0; i < e820.nr_map; i++) {
1403 struct e820entry *entry = &e820_saved.map[i];
1404 resource_size_t start, end;
1405
1406 if (entry->type != E820_RAM)
1407 continue;
1408 start = entry->addr + entry->size;
1409 end = round_up(start, ram_alignment(start));
1410 if (start == end)
1411 continue;
1412 reserve_region_with_split(&iomem_resource, start,
1413 end - 1, "RAM buffer");
1414 }
1385} 1415}
1386 1416
1387char *__init default_machine_specific_memory_setup(void) 1417char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953dee..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 101static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
101{ 102{
102 u32 d; 103 u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
114 d &= 0xff; 115 d &= 0xff;
115 return d; 116 return d;
116} 117}
118#endif
117 119
118static void __init ati_bugs(int num, int slot, int func) 120static void __init ati_bugs(int num, int slot, int func)
119{ 121{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4234b1235652..de74f0a3e0ed 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
147GLOBAL(return_to_handler) 147GLOBAL(return_to_handler)
148 subq $80, %rsp 148 subq $80, %rsp
149 149
150 /* Save the return values */
150 movq %rax, (%rsp) 151 movq %rax, (%rsp)
151 movq %rcx, 8(%rsp) 152 movq %rdx, 8(%rsp)
152 movq %rdx, 16(%rsp)
153 movq %rsi, 24(%rsp)
154 movq %rdi, 32(%rsp)
155 movq %r8, 40(%rsp)
156 movq %r9, 48(%rsp)
157 movq %r10, 56(%rsp)
158 movq %r11, 64(%rsp)
159 153
160 call ftrace_return_to_handler 154 call ftrace_return_to_handler
161 155
162 movq %rax, 72(%rsp) 156 movq %rax, 72(%rsp)
163 movq 64(%rsp), %r11 157 movq 8(%rsp), %rdx
164 movq 56(%rsp), %r10
165 movq 48(%rsp), %r9
166 movq 40(%rsp), %r8
167 movq 32(%rsp), %rdi
168 movq 24(%rsp), %rsi
169 movq 16(%rsp), %rdx
170 movq 8(%rsp), %rcx
171 movq (%rsp), %rax 158 movq (%rsp), %rax
172 addq $72, %rsp 159 addq $72, %rsp
173 retq 160 retq
@@ -1032,6 +1019,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1032apicinterrupt SPURIOUS_APIC_VECTOR \ 1019apicinterrupt SPURIOUS_APIC_VECTOR \
1033 spurious_interrupt smp_spurious_interrupt 1020 spurious_interrupt smp_spurious_interrupt
1034 1021
1022#ifdef CONFIG_PERF_COUNTERS
1023apicinterrupt LOCAL_PENDING_VECTOR \
1024 perf_pending_interrupt smp_perf_pending_interrupt
1025#endif
1026
1035/* 1027/*
1036 * Exception entry points. 1028 * Exception entry points.
1037 */ 1029 */
@@ -1386,6 +1378,11 @@ END(xen_failsafe_callback)
1386paranoidzeroentry_ist debug do_debug DEBUG_STACK 1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1387paranoidzeroentry_ist int3 do_int3 DEBUG_STACK 1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1388paranoiderrorentry stack_segment do_stack_segment 1380paranoiderrorentry stack_segment do_stack_segment
1381#ifdef CONFIG_XEN
1382zeroentry xen_debug do_debug
1383zeroentry xen_int3 do_int3
1384errorentry xen_stack_segment do_stack_segment
1385#endif
1389errorentry general_protection do_general_protection 1386errorentry general_protection do_general_protection
1390errorentry page_fault do_page_fault 1387errorentry page_fault do_page_fault
1391#ifdef CONFIG_X86_MCE 1388#ifdef CONFIG_X86_MCE
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0cd..dc5ed4bdd88d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
608ENTRY(initial_code) 608ENTRY(initial_code)
609 .long i386_start_kernel 609 .long i386_start_kernel
610 610
611.section .text
612/*
613 * Real beginning of normal "text" segment
614 */
615ENTRY(stext)
616ENTRY(_stext)
617
618/* 611/*
619 * BSS section 612 * BSS section
620 */ 613 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9773395aa758..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -13,6 +13,7 @@
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/mce.h> 15#include <asm/mce.h>
16#include <asm/hw_irq.h>
16 17
17atomic_t irq_err_count; 18atomic_t irq_err_count;
18 19
@@ -62,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
62 for_each_online_cpu(j) 63 for_each_online_cpu(j)
63 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
64 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
65#endif 74#endif
66 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
67 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -175,6 +184,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
175#ifdef CONFIG_X86_LOCAL_APIC 184#ifdef CONFIG_X86_LOCAL_APIC
176 sum += irq_stats(cpu)->apic_timer_irqs; 185 sum += irq_stats(cpu)->apic_timer_irqs;
177 sum += irq_stats(cpu)->irq_spurious_count; 186 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs;
178#endif 189#endif
179 if (generic_interrupt_extension) 190 if (generic_interrupt_extension)
180 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 4a69ec55be3d..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -207,7 +207,6 @@ static void __init apic_intr_init(void)
207 207
208 /* Performance monitoring interrupts: */ 208 /* Performance monitoring interrupts: */
209# ifdef CONFIG_PERF_COUNTERS 209# ifdef CONFIG_PERF_COUNTERS
210 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
211 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 210 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
212# endif 211# endif
213 212
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919e..8d82a77a3f3b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
142 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 142 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
143 gdb_regs32[GDB_CS] = __KERNEL_CS; 143 gdb_regs32[GDB_CS] = __KERNEL_CS;
144 gdb_regs32[GDB_SS] = __KERNEL_DS; 144 gdb_regs32[GDB_SS] = __KERNEL_DS;
145 gdb_regs[GDB_PC] = p->thread.ip; 145 gdb_regs[GDB_PC] = 0;
146 gdb_regs[GDB_R8] = 0; 146 gdb_regs[GDB_R8] = 0;
147 gdb_regs[GDB_R9] = 0; 147 gdb_regs[GDB_R9] = 0;
148 gdb_regs[GDB_R10] = 0; 148 gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b4..a78ecad0c900 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <asm/timer.h>
30 31
31#define MMU_QUEUE_SIZE 1024 32#define MMU_QUEUE_SIZE 1024
32 33
@@ -195,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
195 struct kvm_para_state *state = kvm_para_state(); 196 struct kvm_para_state *state = kvm_para_state();
196 197
197 mmu_queue_flush(state); 198 mmu_queue_flush(state);
198 paravirt_leave_lazy(paravirt_get_lazy_mode()); 199 paravirt_leave_lazy_mmu();
199 state->mode = paravirt_get_lazy_mode(); 200 state->mode = paravirt_get_lazy_mode();
200} 201}
201 202
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
230 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; 231 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
231 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; 232 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
232 } 233 }
234#ifdef CONFIG_X86_IO_APIC
235 no_timer_check = 1;
236#endif
233} 237}
234 238
235void __init kvm_guest_init(void) 239void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c6..366baa179913 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16#include <linux/platform_device.h>
17#include <linux/capability.h>
18#include <linux/miscdevice.h>
19#include <linux/firmware.h> 16#include <linux/firmware.h>
20#include <linux/spinlock.h>
21#include <linux/cpumask.h>
22#include <linux/pci_ids.h> 17#include <linux/pci_ids.h>
23#include <linux/uaccess.h> 18#include <linux/uaccess.h>
24#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
25#include <linux/kernel.h> 20#include <linux/kernel.h>
26#include <linux/module.h> 21#include <linux/module.h>
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/slab.h>
31#include <linux/cpu.h>
32#include <linux/pci.h> 22#include <linux/pci.h>
33#include <linux/fs.h>
34#include <linux/mm.h>
35 23
36#include <asm/microcode.h> 24#include <asm/microcode.h>
37#include <asm/processor.h> 25#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
79#define UCODE_CONTAINER_SECTION_HDR 8 67#define UCODE_CONTAINER_SECTION_HDR 8
80#define UCODE_CONTAINER_HEADER_SIZE 12 68#define UCODE_CONTAINER_HEADER_SIZE 12
81 69
82/* serialize access to the physical write */
83static DEFINE_SPINLOCK(microcode_update_lock);
84
85static struct equiv_cpu_entry *equiv_cpu_table; 70static struct equiv_cpu_entry *equiv_cpu_table;
86 71
87static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
144 return 1; 129 return 1;
145} 130}
146 131
147static void apply_microcode_amd(int cpu) 132static int apply_microcode_amd(int cpu)
148{ 133{
149 unsigned long flags;
150 u32 rev, dummy; 134 u32 rev, dummy;
151 int cpu_num = raw_smp_processor_id(); 135 int cpu_num = raw_smp_processor_id();
152 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 136 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
156 BUG_ON(cpu_num != cpu); 140 BUG_ON(cpu_num != cpu);
157 141
158 if (mc_amd == NULL) 142 if (mc_amd == NULL)
159 return; 143 return 0;
160 144
161 spin_lock_irqsave(&microcode_update_lock, flags);
162 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 145 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
163 /* get patch id after patching */ 146 /* get patch id after patching */
164 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 147 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
165 spin_unlock_irqrestore(&microcode_update_lock, flags);
166 148
167 /* check current patch id and patch's id for match */ 149 /* check current patch id and patch's id for match */
168 if (rev != mc_amd->hdr.patch_id) { 150 if (rev != mc_amd->hdr.patch_id) {
169 printk(KERN_ERR "microcode: CPU%d: update failed " 151 printk(KERN_ERR "microcode: CPU%d: update failed "
170 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
171 return; 153 return -1;
172 } 154 }
173 155
174 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
175 cpu, rev); 157 cpu, rev);
176 158
177 uci->cpu_sig.rev = rev; 159 uci->cpu_sig.rev = rev;
160
161 return 0;
178} 162}
179 163
180static int get_ucode_data(void *to, const u8 *from, size_t n) 164static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
257 241
258static void free_equiv_cpu_table(void) 242static void free_equiv_cpu_table(void)
259{ 243{
260 if (equiv_cpu_table) { 244 vfree(equiv_cpu_table);
261 vfree(equiv_cpu_table); 245 equiv_cpu_table = NULL;
262 equiv_cpu_table = NULL;
263 }
264} 246}
265 247
266static int generic_load_microcode(int cpu, const u8 *data, size_t size) 248static enum ucode_state
249generic_load_microcode(int cpu, const u8 *data, size_t size)
267{ 250{
268 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 251 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
269 const u8 *ucode_ptr = data; 252 const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
272 int new_rev = uci->cpu_sig.rev; 255 int new_rev = uci->cpu_sig.rev;
273 unsigned int leftover; 256 unsigned int leftover;
274 unsigned long offset; 257 unsigned long offset;
258 enum ucode_state state = UCODE_OK;
275 259
276 offset = install_equiv_cpu_table(ucode_ptr); 260 offset = install_equiv_cpu_table(ucode_ptr);
277 if (!offset) { 261 if (!offset) {
278 printk(KERN_ERR "microcode: failed to create " 262 printk(KERN_ERR "microcode: failed to create "
279 "equivalent cpu table\n"); 263 "equivalent cpu table\n");
280 return -EINVAL; 264 return UCODE_ERROR;
281 } 265 }
282 266
283 ucode_ptr += offset; 267 ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
293 277
294 mc_header = (struct microcode_header_amd *)mc; 278 mc_header = (struct microcode_header_amd *)mc;
295 if (get_matching_microcode(cpu, mc, new_rev)) { 279 if (get_matching_microcode(cpu, mc, new_rev)) {
296 if (new_mc) 280 vfree(new_mc);
297 vfree(new_mc);
298 new_rev = mc_header->patch_id; 281 new_rev = mc_header->patch_id;
299 new_mc = mc; 282 new_mc = mc;
300 } else 283 } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
306 289
307 if (new_mc) { 290 if (new_mc) {
308 if (!leftover) { 291 if (!leftover) {
309 if (uci->mc) 292 vfree(uci->mc);
310 vfree(uci->mc);
311 uci->mc = new_mc; 293 uci->mc = new_mc;
312 pr_debug("microcode: CPU%d found a matching microcode " 294 pr_debug("microcode: CPU%d found a matching microcode "
313 "update with version 0x%x (current=0x%x)\n", 295 "update with version 0x%x (current=0x%x)\n",
314 cpu, new_rev, uci->cpu_sig.rev); 296 cpu, new_rev, uci->cpu_sig.rev);
315 } else 297 } else {
316 vfree(new_mc); 298 vfree(new_mc);
317 } 299 state = UCODE_ERROR;
300 }
301 } else
302 state = UCODE_NFOUND;
318 303
319 free_equiv_cpu_table(); 304 free_equiv_cpu_table();
320 305
321 return (int)leftover; 306 return state;
322} 307}
323 308
324static int request_microcode_fw(int cpu, struct device *device) 309static enum ucode_state request_microcode_fw(int cpu, struct device *device)
325{ 310{
326 const char *fw_name = "amd-ucode/microcode_amd.bin"; 311 const char *fw_name = "amd-ucode/microcode_amd.bin";
327 const struct firmware *firmware; 312 const struct firmware *firmware;
328 int ret; 313 enum ucode_state ret;
329
330 /* We should bind the task to the CPU */
331 BUG_ON(cpu != raw_smp_processor_id());
332 314
333 ret = request_firmware(&firmware, fw_name, device); 315 if (request_firmware(&firmware, fw_name, device)) {
334 if (ret) {
335 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
336 return ret; 317 return UCODE_NFOUND;
337 } 318 }
338 319
339 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 320 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
343 return ret; 324 return ret;
344} 325}
345 326
346static int request_microcode_user(int cpu, const void __user *buf, size_t size) 327static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size)
347{ 329{
348 printk(KERN_INFO "microcode: AMD microcode update via " 330 printk(KERN_INFO "microcode: AMD microcode update via "
349 "/dev/cpu/microcode not supported\n"); 331 "/dev/cpu/microcode not supported\n");
350 return -1; 332 return UCODE_ERROR;
351} 333}
352 334
353static void microcode_fini_cpu_amd(int cpu) 335static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d1..9c4461501fcb 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
76#include <linux/firmware.h> 75#include <linux/capability.h>
77#include <linux/smp_lock.h> 76#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 77#include <linux/kernel.h>
83#include <linux/module.h> 78#include <linux/module.h>
84#include <linux/mutex.h> 79#include <linux/mutex.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h> 80#include <linux/cpu.h>
89#include <linux/fs.h> 81#include <linux/fs.h>
90#include <linux/mm.h> 82#include <linux/mm.h>
91 83
92#include <asm/microcode.h> 84#include <asm/microcode.h>
93#include <asm/processor.h> 85#include <asm/processor.h>
94#include <asm/msr.h>
95 86
96MODULE_DESCRIPTION("Microcode Update Driver"); 87MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 88MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
101 92
102static struct microcode_ops *microcode_ops; 93static struct microcode_ops *microcode_ops;
103 94
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 95/*
96 * Synchronization.
97 *
98 * All non cpu-hotplug-callback call sites use:
99 *
100 * - microcode_mutex to synchronize with each other;
101 * - get/put_online_cpus() to synchronize with
102 * the cpu-hotplug-callback call sites.
103 *
104 * We guarantee that only a single cpu is being
105 * updated at any particular moment of time.
106 */
105static DEFINE_MUTEX(microcode_mutex); 107static DEFINE_MUTEX(microcode_mutex);
106 108
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; 109struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info); 110EXPORT_SYMBOL_GPL(ucode_cpu_info);
109 111
112/*
113 * Operations that are run on a target cpu:
114 */
115
116struct cpu_info_ctx {
117 struct cpu_signature *cpu_sig;
118 int err;
119};
120
121static void collect_cpu_info_local(void *arg)
122{
123 struct cpu_info_ctx *ctx = arg;
124
125 ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
126 ctx->cpu_sig);
127}
128
129static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
130{
131 struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
132 int ret;
133
134 ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
135 if (!ret)
136 ret = ctx.err;
137
138 return ret;
139}
140
141static int collect_cpu_info(int cpu)
142{
143 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
144 int ret;
145
146 memset(uci, 0, sizeof(*uci));
147
148 ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
149 if (!ret)
150 uci->valid = 1;
151
152 return ret;
153}
154
155struct apply_microcode_ctx {
156 int err;
157};
158
159static void apply_microcode_local(void *arg)
160{
161 struct apply_microcode_ctx *ctx = arg;
162
163 ctx->err = microcode_ops->apply_microcode(smp_processor_id());
164}
165
166static int apply_microcode_on_target(int cpu)
167{
168 struct apply_microcode_ctx ctx = { .err = 0 };
169 int ret;
170
171 ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
172 if (!ret)
173 ret = ctx.err;
174
175 return ret;
176}
177
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE 178#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size) 179static int do_microcode_update(const void __user *buf, size_t size)
112{ 180{
113 cpumask_t old;
114 int error = 0; 181 int error = 0;
115 int cpu; 182 int cpu;
116 183
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) { 184 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 185 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
186 enum ucode_state ustate;
121 187
122 if (!uci->valid) 188 if (!uci->valid)
123 continue; 189 continue;
124 190
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 191 ustate = microcode_ops->request_microcode_user(cpu, buf, size);
126 error = microcode_ops->request_microcode_user(cpu, buf, size); 192 if (ustate == UCODE_ERROR) {
127 if (error < 0) 193 error = -1;
128 goto out; 194 break;
129 if (!error) 195 } else if (ustate == UCODE_OK)
130 microcode_ops->apply_microcode(cpu); 196 apply_microcode_on_target(cpu);
131 } 197 }
132out: 198
133 set_cpus_allowed_ptr(current, &old);
134 return error; 199 return error;
135} 200}
136 201
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
143static ssize_t microcode_write(struct file *file, const char __user *buf, 208static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos) 209 size_t len, loff_t *ppos)
145{ 210{
146 ssize_t ret; 211 ssize_t ret = -EINVAL;
147 212
148 if ((len >> PAGE_SHIFT) > num_physpages) { 213 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", 214 pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
150 num_physpages); 215 return ret;
151 return -EINVAL;
152 } 216 }
153 217
154 get_online_cpus(); 218 get_online_cpus();
155 mutex_lock(&microcode_mutex); 219 mutex_lock(&microcode_mutex);
156 220
157 ret = do_microcode_update(buf, len); 221 if (do_microcode_update(buf, len) == 0)
158 if (!ret)
159 ret = (ssize_t)len; 222 ret = (ssize_t)len;
160 223
161 mutex_unlock(&microcode_mutex); 224 mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
165} 228}
166 229
167static const struct file_operations microcode_fops = { 230static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE, 231 .owner = THIS_MODULE,
169 .write = microcode_write, 232 .write = microcode_write,
170 .open = microcode_open, 233 .open = microcode_open,
171}; 234};
172 235
173static struct miscdevice microcode_dev = { 236static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR, 237 .minor = MICROCODE_MINOR,
175 .name = "microcode", 238 .name = "microcode",
176 .fops = &microcode_fops, 239 .fops = &microcode_fops,
177}; 240};
178 241
179static int __init microcode_dev_init(void) 242static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
182 245
183 error = misc_register(&microcode_dev); 246 error = misc_register(&microcode_dev);
184 if (error) { 247 if (error) {
185 printk(KERN_ERR 248 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error; 249 return error;
189 } 250 }
190 251
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
205/* fake device for request_firmware */ 266/* fake device for request_firmware */
206static struct platform_device *microcode_pdev; 267static struct platform_device *microcode_pdev;
207 268
208static long reload_for_cpu(void *unused) 269static int reload_for_cpu(int cpu)
209{ 270{
210 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 271 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
211 int err = 0; 272 int err = 0;
212 273
213 mutex_lock(&microcode_mutex); 274 mutex_lock(&microcode_mutex);
214 if (uci->valid) { 275 if (uci->valid) {
215 err = microcode_ops->request_microcode_fw(smp_processor_id(), 276 enum ucode_state ustate;
216 &microcode_pdev->dev); 277
217 if (!err) 278 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
218 microcode_ops->apply_microcode(smp_processor_id()); 279 if (ustate == UCODE_OK)
280 apply_microcode_on_target(cpu);
281 else
282 if (ustate == UCODE_ERROR)
283 err = -EINVAL;
219 } 284 }
220 mutex_unlock(&microcode_mutex); 285 mutex_unlock(&microcode_mutex);
286
221 return err; 287 return err;
222} 288}
223 289
224static ssize_t reload_store(struct sys_device *dev, 290static ssize_t reload_store(struct sys_device *dev,
225 struct sysdev_attribute *attr, 291 struct sysdev_attribute *attr,
226 const char *buf, size_t sz) 292 const char *buf, size_t size)
227{ 293{
228 char *end; 294 unsigned long val;
229 unsigned long val = simple_strtoul(buf, &end, 0);
230 int err = 0;
231 int cpu = dev->id; 295 int cpu = dev->id;
296 int ret = 0;
297 char *end;
232 298
299 val = simple_strtoul(buf, &end, 0);
233 if (end == buf) 300 if (end == buf)
234 return -EINVAL; 301 return -EINVAL;
302
235 if (val == 1) { 303 if (val == 1) {
236 get_online_cpus(); 304 get_online_cpus();
237 if (cpu_online(cpu)) 305 if (cpu_online(cpu))
238 err = work_on_cpu(cpu, reload_for_cpu, NULL); 306 ret = reload_for_cpu(cpu);
239 put_online_cpus(); 307 put_online_cpus();
240 } 308 }
241 if (err) 309
242 return err; 310 if (!ret)
243 return sz; 311 ret = size;
312
313 return ret;
244} 314}
245 315
246static ssize_t version_show(struct sys_device *dev, 316static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
271}; 341};
272 342
273static struct attribute_group mc_attr_group = { 343static struct attribute_group mc_attr_group = {
274 .attrs = mc_default_attrs, 344 .attrs = mc_default_attrs,
275 .name = "microcode", 345 .name = "microcode",
276}; 346};
277 347
278static void __microcode_fini_cpu(int cpu) 348static void microcode_fini_cpu(int cpu)
279{ 349{
280 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
281 351
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
283 uci->valid = 0; 353 uci->valid = 0;
284} 354}
285 355
286static void microcode_fini_cpu(int cpu) 356static enum ucode_state microcode_resume_cpu(int cpu)
287{
288 mutex_lock(&microcode_mutex);
289 __microcode_fini_cpu(cpu);
290 mutex_unlock(&microcode_mutex);
291}
292
293static void collect_cpu_info(int cpu)
294{ 357{
295 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 358 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
296 359
297 memset(uci, 0, sizeof(*uci)); 360 if (!uci->mc)
298 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) 361 return UCODE_NFOUND;
299 uci->valid = 1; 362
363 pr_debug("microcode: CPU%d updated upon resume\n", cpu);
364 apply_microcode_on_target(cpu);
365
366 return UCODE_OK;
300} 367}
301 368
302static int microcode_resume_cpu(int cpu) 369static enum ucode_state microcode_init_cpu(int cpu)
303{ 370{
304 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 371 enum ucode_state ustate;
305 struct cpu_signature nsig;
306 372
307 pr_debug("microcode: CPU%d resumed\n", cpu); 373 if (collect_cpu_info(cpu))
374 return UCODE_ERROR;
308 375
309 if (!uci->mc) 376 /* --dimm. Trigger a delayed update? */
310 return 1; 377 if (system_state != SYSTEM_RUNNING)
378 return UCODE_NFOUND;
311 379
312 /* 380 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
313 * Let's verify that the 'cached' ucode does belong
314 * to this cpu (a bit of paranoia):
315 */
316 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
317 __microcode_fini_cpu(cpu);
318 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
319 cpu);
320 return -1;
321 }
322 381
323 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { 382 if (ustate == UCODE_OK) {
324 __microcode_fini_cpu(cpu); 383 pr_debug("microcode: CPU%d updated upon init\n", cpu);
325 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", 384 apply_microcode_on_target(cpu);
326 cpu);
327 /* Should we look for a new ucode here? */
328 return 1;
329 } 385 }
330 386
331 return 0; 387 return ustate;
332} 388}
333 389
334static long microcode_update_cpu(void *unused) 390static enum ucode_state microcode_update_cpu(int cpu)
335{ 391{
336 struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); 392 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
337 int err = 0; 393 enum ucode_state ustate;
338 394
339 /* 395 if (uci->valid)
340 * Check if the system resume is in progress (uci->valid != NULL), 396 ustate = microcode_resume_cpu(cpu);
341 * otherwise just request a firmware: 397 else
342 */ 398 ustate = microcode_init_cpu(cpu);
343 if (uci->valid) {
344 err = microcode_resume_cpu(smp_processor_id());
345 } else {
346 collect_cpu_info(smp_processor_id());
347 if (uci->valid && system_state == SYSTEM_RUNNING)
348 err = microcode_ops->request_microcode_fw(
349 smp_processor_id(),
350 &microcode_pdev->dev);
351 }
352 if (!err)
353 microcode_ops->apply_microcode(smp_processor_id());
354 return err;
355}
356 399
357static int microcode_init_cpu(int cpu) 400 return ustate;
358{
359 int err;
360 mutex_lock(&microcode_mutex);
361 err = work_on_cpu(cpu, microcode_update_cpu, NULL);
362 mutex_unlock(&microcode_mutex);
363
364 return err;
365} 401}
366 402
367static int mc_sysdev_add(struct sys_device *sys_dev) 403static int mc_sysdev_add(struct sys_device *sys_dev)
368{ 404{
369 int err, cpu = sys_dev->id; 405 int err, cpu = sys_dev->id;
370 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
371 406
372 if (!cpu_online(cpu)) 407 if (!cpu_online(cpu))
373 return 0; 408 return 0;
374 409
375 pr_debug("microcode: CPU%d added\n", cpu); 410 pr_debug("microcode: CPU%d added\n", cpu);
376 memset(uci, 0, sizeof(*uci));
377 411
378 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 412 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
379 if (err) 413 if (err)
380 return err; 414 return err;
381 415
382 err = microcode_init_cpu(cpu); 416 if (microcode_init_cpu(cpu) == UCODE_ERROR)
417 err = -EINVAL;
383 418
384 return err; 419 return err;
385} 420}
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
400static int mc_sysdev_resume(struct sys_device *dev) 435static int mc_sysdev_resume(struct sys_device *dev)
401{ 436{
402 int cpu = dev->id; 437 int cpu = dev->id;
438 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
403 439
404 if (!cpu_online(cpu)) 440 if (!cpu_online(cpu))
405 return 0; 441 return 0;
406 442
407 /* only CPU 0 will apply ucode here */ 443 /*
408 microcode_update_cpu(NULL); 444 * All non-bootup cpus are still disabled,
445 * so only CPU 0 will apply ucode here.
446 *
447 * Moreover, there can be no concurrent
448 * updates from any other places at this point.
449 */
450 WARN_ON(cpu != 0);
451
452 if (uci->valid && uci->mc)
453 microcode_ops->apply_microcode(cpu);
454
409 return 0; 455 return 0;
410} 456}
411 457
412static struct sysdev_driver mc_sysdev_driver = { 458static struct sysdev_driver mc_sysdev_driver = {
413 .add = mc_sysdev_add, 459 .add = mc_sysdev_add,
414 .remove = mc_sysdev_remove, 460 .remove = mc_sysdev_remove,
415 .resume = mc_sysdev_resume, 461 .resume = mc_sysdev_resume,
416}; 462};
417 463
418static __cpuinit int 464static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
425 switch (action) { 471 switch (action) {
426 case CPU_ONLINE: 472 case CPU_ONLINE:
427 case CPU_ONLINE_FROZEN: 473 case CPU_ONLINE_FROZEN:
428 if (microcode_init_cpu(cpu)) 474 microcode_update_cpu(cpu);
429 printk(KERN_ERR "microcode: failed to init CPU%d\n",
430 cpu);
431 case CPU_DOWN_FAILED: 475 case CPU_DOWN_FAILED:
432 case CPU_DOWN_FAILED_FROZEN: 476 case CPU_DOWN_FAILED_FROZEN:
433 pr_debug("microcode: CPU%d added\n", cpu); 477 pr_debug("microcode: CPU%d added\n", cpu);
434 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 478 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
435 printk(KERN_ERR "microcode: Failed to create the sysfs " 479 pr_err("microcode: Failed to create group for CPU%d\n", cpu);
436 "group for CPU%d\n", cpu);
437 break; 480 break;
438 case CPU_DOWN_PREPARE: 481 case CPU_DOWN_PREPARE:
439 case CPU_DOWN_PREPARE_FROZEN: 482 case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
465 microcode_ops = init_amd_microcode(); 508 microcode_ops = init_amd_microcode();
466 509
467 if (!microcode_ops) { 510 if (!microcode_ops) {
468 printk(KERN_ERR "microcode: no support for this CPU vendor\n"); 511 pr_err("microcode: no support for this CPU vendor\n");
469 return -ENODEV; 512 return -ENODEV;
470 } 513 }
471 514
472 error = microcode_dev_init();
473 if (error)
474 return error;
475 microcode_pdev = platform_device_register_simple("microcode", -1, 515 microcode_pdev = platform_device_register_simple("microcode", -1,
476 NULL, 0); 516 NULL, 0);
477 if (IS_ERR(microcode_pdev)) { 517 if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
480 } 520 }
481 521
482 get_online_cpus(); 522 get_online_cpus();
523 mutex_lock(&microcode_mutex);
524
483 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 525 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
526
527 mutex_unlock(&microcode_mutex);
484 put_online_cpus(); 528 put_online_cpus();
529
485 if (error) { 530 if (error) {
486 microcode_dev_exit();
487 platform_device_unregister(microcode_pdev); 531 platform_device_unregister(microcode_pdev);
488 return error; 532 return error;
489 } 533 }
490 534
535 error = microcode_dev_init();
536 if (error)
537 return error;
538
491 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
492 540
493 printk(KERN_INFO 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
494 "Microcode Update Driver: v" MICROCODE_VERSION
495 " <tigran@aivazian.fsnet.co.uk>," 542 " <tigran@aivazian.fsnet.co.uk>,"
496 " Peter Oruba\n"); 543 " Peter Oruba\n");
497 544
498 return 0; 545 return 0;
499} 546}
547module_init(microcode_init);
500 548
501static void __exit microcode_exit(void) 549static void __exit microcode_exit(void)
502{ 550{
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
505 unregister_hotcpu_notifier(&mc_cpu_notifier); 553 unregister_hotcpu_notifier(&mc_cpu_notifier);
506 554
507 get_online_cpus(); 555 get_online_cpus();
556 mutex_lock(&microcode_mutex);
557
508 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 558 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
559
560 mutex_unlock(&microcode_mutex);
509 put_online_cpus(); 561 put_online_cpus();
510 562
511 platform_device_unregister(microcode_pdev); 563 platform_device_unregister(microcode_pdev);
512 564
513 microcode_ops = NULL; 565 microcode_ops = NULL;
514 566
515 printk(KERN_INFO 567 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
516 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
517} 568}
518
519module_init(microcode_init);
520module_exit(microcode_exit); 569module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1ab..0d334ddd0a96 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73#include <linux/platform_device.h>
74#include <linux/capability.h>
75#include <linux/miscdevice.h>
76#include <linux/firmware.h> 73#include <linux/firmware.h>
77#include <linux/smp_lock.h>
78#include <linux/spinlock.h>
79#include <linux/cpumask.h>
80#include <linux/uaccess.h> 74#include <linux/uaccess.h>
81#include <linux/vmalloc.h>
82#include <linux/kernel.h> 75#include <linux/kernel.h>
83#include <linux/module.h> 76#include <linux/module.h>
84#include <linux/mutex.h> 77#include <linux/vmalloc.h>
85#include <linux/sched.h>
86#include <linux/init.h>
87#include <linux/slab.h>
88#include <linux/cpu.h>
89#include <linux/fs.h>
90#include <linux/mm.h>
91 78
92#include <asm/microcode.h> 79#include <asm/microcode.h>
93#include <asm/processor.h> 80#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
150 137
151#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) 138#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
152 139
153/* serialize access to the physical write to MSR 0x79 */
154static DEFINE_SPINLOCK(microcode_update_lock);
155
156static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 140static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
157{ 141{
158 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 142 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
159 unsigned long flags;
160 unsigned int val[2]; 143 unsigned int val[2];
161 144
162 memset(csig, 0, sizeof(*csig)); 145 memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
176 csig->pf = 1 << ((val[1] >> 18) & 7); 159 csig->pf = 1 << ((val[1] >> 18) & 7);
177 } 160 }
178 161
179 /* serialize access to the physical write to MSR 0x79 */
180 spin_lock_irqsave(&microcode_update_lock, flags);
181
182 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 162 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
183 /* see notes above for revision 1.07. Apparent chip bug */ 163 /* see notes above for revision 1.07. Apparent chip bug */
184 sync_core(); 164 sync_core();
185 /* get the current revision from MSR 0x8B */ 165 /* get the current revision from MSR 0x8B */
186 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
187 spin_unlock_irqrestore(&microcode_update_lock, flags);
188 167
189 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
190 csig->sig, csig->pf, csig->rev); 169 cpu_num, csig->sig, csig->pf, csig->rev);
191 170
192 return 0; 171 return 0;
193} 172}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
318 return 0; 297 return 0;
319} 298}
320 299
321static void apply_microcode(int cpu) 300static int apply_microcode(int cpu)
322{ 301{
323 struct microcode_intel *mc_intel; 302 struct microcode_intel *mc_intel;
324 struct ucode_cpu_info *uci; 303 struct ucode_cpu_info *uci;
325 unsigned long flags;
326 unsigned int val[2]; 304 unsigned int val[2];
327 int cpu_num; 305 int cpu_num;
328 306
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
334 BUG_ON(cpu_num != cpu); 312 BUG_ON(cpu_num != cpu);
335 313
336 if (mc_intel == NULL) 314 if (mc_intel == NULL)
337 return; 315 return 0;
338
339 /* serialize access to the physical write to MSR 0x79 */
340 spin_lock_irqsave(&microcode_update_lock, flags);
341 316
342 /* write microcode via MSR 0x79 */ 317 /* write microcode via MSR 0x79 */
343 wrmsr(MSR_IA32_UCODE_WRITE, 318 wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
351 /* get the current revision from MSR 0x8B */ 326 /* get the current revision from MSR 0x8B */
352 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
353 328
354 spin_unlock_irqrestore(&microcode_update_lock, flags);
355 if (val[1] != mc_intel->hdr.rev) { 329 if (val[1] != mc_intel->hdr.rev) {
356 printk(KERN_ERR "microcode: CPU%d update from revision " 330 printk(KERN_ERR "microcode: CPU%d update "
357 "0x%x to 0x%x failed\n", 331 "to revision 0x%x failed\n",
358 cpu_num, uci->cpu_sig.rev, val[1]); 332 cpu_num, mc_intel->hdr.rev);
359 return; 333 return -1;
360 } 334 }
361 printk(KERN_INFO "microcode: CPU%d updated from revision " 335 printk(KERN_INFO "microcode: CPU%d updated to revision "
362 "0x%x to 0x%x, date = %04x-%02x-%02x \n", 336 "0x%x, date = %04x-%02x-%02x \n",
363 cpu_num, uci->cpu_sig.rev, val[1], 337 cpu_num, val[1],
364 mc_intel->hdr.date & 0xffff, 338 mc_intel->hdr.date & 0xffff,
365 mc_intel->hdr.date >> 24, 339 mc_intel->hdr.date >> 24,
366 (mc_intel->hdr.date >> 16) & 0xff); 340 (mc_intel->hdr.date >> 16) & 0xff);
367 341
368 uci->cpu_sig.rev = val[1]; 342 uci->cpu_sig.rev = val[1];
343
344 return 0;
369} 345}
370 346
371static int generic_load_microcode(int cpu, void *data, size_t size, 347static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
372 int (*get_ucode_data)(void *, const void *, size_t)) 348 int (*get_ucode_data)(void *, const void *, size_t))
373{ 349{
374 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 350 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
375 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 351 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
376 int new_rev = uci->cpu_sig.rev; 352 int new_rev = uci->cpu_sig.rev;
377 unsigned int leftover = size; 353 unsigned int leftover = size;
354 enum ucode_state state = UCODE_OK;
378 355
379 while (leftover) { 356 while (leftover) {
380 struct microcode_header_intel mc_header; 357 struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
412 leftover -= mc_size; 389 leftover -= mc_size;
413 } 390 }
414 391
415 if (!new_mc) 392 if (leftover) {
393 if (new_mc)
394 vfree(new_mc);
395 state = UCODE_ERROR;
416 goto out; 396 goto out;
397 }
417 398
418 if (leftover) { 399 if (!new_mc) {
419 vfree(new_mc); 400 state = UCODE_NFOUND;
420 goto out; 401 goto out;
421 } 402 }
422 403
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
427 pr_debug("microcode: CPU%d found a matching microcode update with" 408 pr_debug("microcode: CPU%d found a matching microcode update with"
428 " version 0x%x (current=0x%x)\n", 409 " version 0x%x (current=0x%x)\n",
429 cpu, new_rev, uci->cpu_sig.rev); 410 cpu, new_rev, uci->cpu_sig.rev);
430 411out:
431 out: 412 return state;
432 return (int)leftover;
433} 413}
434 414
435static int get_ucode_fw(void *to, const void *from, size_t n) 415static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
438 return 0; 418 return 0;
439} 419}
440 420
441static int request_microcode_fw(int cpu, struct device *device) 421static enum ucode_state request_microcode_fw(int cpu, struct device *device)
442{ 422{
443 char name[30]; 423 char name[30];
444 struct cpuinfo_x86 *c = &cpu_data(cpu); 424 struct cpuinfo_x86 *c = &cpu_data(cpu);
445 const struct firmware *firmware; 425 const struct firmware *firmware;
446 int ret; 426 enum ucode_state ret;
447 427
448 /* We should bind the task to the CPU */
449 BUG_ON(cpu != raw_smp_processor_id());
450 sprintf(name, "intel-ucode/%02x-%02x-%02x", 428 sprintf(name, "intel-ucode/%02x-%02x-%02x",
451 c->x86, c->x86_model, c->x86_mask); 429 c->x86, c->x86_model, c->x86_mask);
452 ret = request_firmware(&firmware, name, device); 430
453 if (ret) { 431 if (request_firmware(&firmware, name, device)) {
454 pr_debug("microcode: data file %s load failed\n", name); 432 pr_debug("microcode: data file %s load failed\n", name);
455 return ret; 433 return UCODE_NFOUND;
456 } 434 }
457 435
458 ret = generic_load_microcode(cpu, (void *)firmware->data, 436 ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
468 return copy_from_user(to, from, n); 446 return copy_from_user(to, from, n);
469} 447}
470 448
471static int request_microcode_user(int cpu, const void __user *buf, size_t size) 449static enum ucode_state
450request_microcode_user(int cpu, const void __user *buf, size_t size)
472{ 451{
473 /* We should bind the task to the CPU */
474 BUG_ON(cpu != raw_smp_processor_id());
475
476 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); 452 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
477} 453}
478 454
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea336..70ec9b951d76 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
248 248
249static inline void enter_lazy(enum paravirt_lazy_mode mode) 249static inline void enter_lazy(enum paravirt_lazy_mode mode)
250{ 250{
251 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 251 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
252 BUG_ON(preemptible());
253 252
254 __get_cpu_var(paravirt_lazy_mode) = mode; 253 percpu_write(paravirt_lazy_mode, mode);
255} 254}
256 255
257void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 256static void leave_lazy(enum paravirt_lazy_mode mode)
258{ 257{
259 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); 258 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
260 BUG_ON(preemptible());
261 259
262 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; 260 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
263} 261}
264 262
265void paravirt_enter_lazy_mmu(void) 263void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
269 267
270void paravirt_leave_lazy_mmu(void) 268void paravirt_leave_lazy_mmu(void)
271{ 269{
272 paravirt_leave_lazy(PARAVIRT_LAZY_MMU); 270 leave_lazy(PARAVIRT_LAZY_MMU);
273} 271}
274 272
275void paravirt_enter_lazy_cpu(void) 273void paravirt_start_context_switch(struct task_struct *prev)
276{ 274{
275 BUG_ON(preemptible());
276
277 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
278 arch_leave_lazy_mmu_mode();
279 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
280 }
277 enter_lazy(PARAVIRT_LAZY_CPU); 281 enter_lazy(PARAVIRT_LAZY_CPU);
278} 282}
279 283
280void paravirt_leave_lazy_cpu(void) 284void paravirt_end_context_switch(struct task_struct *next)
281{ 285{
282 paravirt_leave_lazy(PARAVIRT_LAZY_CPU); 286 BUG_ON(preemptible());
287
288 leave_lazy(PARAVIRT_LAZY_CPU);
289
290 if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
291 arch_enter_lazy_mmu_mode();
283} 292}
284 293
285enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 294enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
286{ 295{
287 return __get_cpu_var(paravirt_lazy_mode); 296 if (in_interrupt())
297 return PARAVIRT_LAZY_NONE;
298
299 return percpu_read(paravirt_lazy_mode);
288} 300}
289 301
290void arch_flush_lazy_mmu_mode(void) 302void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
292 preempt_disable(); 304 preempt_disable();
293 305
294 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 306 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
295 WARN_ON(preempt_count() == 1);
296 arch_leave_lazy_mmu_mode(); 307 arch_leave_lazy_mmu_mode();
297 arch_enter_lazy_mmu_mode(); 308 arch_enter_lazy_mmu_mode();
298 } 309 }
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
300 preempt_enable(); 311 preempt_enable();
301} 312}
302 313
303void arch_flush_lazy_cpu_mode(void)
304{
305 preempt_disable();
306
307 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
308 WARN_ON(preempt_count() == 1);
309 arch_leave_lazy_cpu_mode();
310 arch_enter_lazy_cpu_mode();
311 }
312
313 preempt_enable();
314}
315
316struct pv_info pv_info = { 314struct pv_info pv_info = {
317 .name = "bare hardware", 315 .name = "bare hardware",
318 .paravirt_enabled = 0, 316 .paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
404 .set_iopl_mask = native_set_iopl_mask, 402 .set_iopl_mask = native_set_iopl_mask,
405 .io_delay = native_io_delay, 403 .io_delay = native_io_delay,
406 404
407 .lazy_mode = { 405 .start_context_switch = paravirt_nop,
408 .enter = paravirt_nop, 406 .end_context_switch = paravirt_nop,
409 .leave = paravirt_nop,
410 },
411}; 407};
412 408
413struct pv_apic_ops pv_apic_ops = { 409struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f3..971a3bec47a8 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
186 186
187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; 187static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
188 188
189/* enable this to stress test the chip's TCE cache */
190#ifdef CONFIG_IOMMU_DEBUG
191static int debugging = 1;
192
193static inline unsigned long verify_bit_range(unsigned long* bitmap,
194 int expected, unsigned long start, unsigned long end)
195{
196 unsigned long idx = start;
197
198 BUG_ON(start >= end);
199
200 while (idx < end) {
201 if (!!test_bit(idx, bitmap) != expected)
202 return idx;
203 ++idx;
204 }
205
206 /* all bits have the expected value */
207 return ~0UL;
208}
209#else /* debugging is disabled */
210static int debugging;
211
212static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 int expected, unsigned long start, unsigned long end)
214{
215 return ~0UL;
216}
217
218#endif /* CONFIG_IOMMU_DEBUG */
219
220static inline int translation_enabled(struct iommu_table *tbl) 189static inline int translation_enabled(struct iommu_table *tbl)
221{ 190{
222 /* only PHBs with translation enabled have an IOMMU table */ 191 /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
228{ 197{
229 unsigned long index; 198 unsigned long index;
230 unsigned long end; 199 unsigned long end;
231 unsigned long badbit;
232 unsigned long flags; 200 unsigned long flags;
233 201
234 index = start_addr >> PAGE_SHIFT; 202 index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
243 211
244 spin_lock_irqsave(&tbl->it_lock, flags); 212 spin_lock_irqsave(&tbl->it_lock, flags);
245 213
246 badbit = verify_bit_range(tbl->it_map, 0, index, end);
247 if (badbit != ~0UL) {
248 if (printk_ratelimit())
249 printk(KERN_ERR "Calgary: entry already allocated at "
250 "0x%lx tbl %p dma 0x%lx npages %u\n",
251 badbit, tbl, start_addr, npages);
252 }
253
254 iommu_area_reserve(tbl->it_map, index, npages); 214 iommu_area_reserve(tbl->it_map, index, npages);
255 215
256 spin_unlock_irqrestore(&tbl->it_lock, flags); 216 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
326 unsigned int npages) 286 unsigned int npages)
327{ 287{
328 unsigned long entry; 288 unsigned long entry;
329 unsigned long badbit;
330 unsigned long badend; 289 unsigned long badend;
331 unsigned long flags; 290 unsigned long flags;
332 291
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
346 305
347 spin_lock_irqsave(&tbl->it_lock, flags); 306 spin_lock_irqsave(&tbl->it_lock, flags);
348 307
349 badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
350 if (badbit != ~0UL) {
351 if (printk_ratelimit())
352 printk(KERN_ERR "Calgary: bit is off at 0x%lx "
353 "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
354 badbit, tbl, dma_addr, entry, npages);
355 }
356
357 iommu_area_free(tbl->it_map, entry, npages); 308 iommu_area_free(tbl->it_map, entry, npages);
358 309
359 spin_unlock_irqrestore(&tbl->it_lock, flags); 310 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
1488 iommu_detected = 1; 1439 iommu_detected = 1;
1489 calgary_detected = 1; 1440 calgary_detected = 1;
1490 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); 1441 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
1491 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1492 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1443 specified_table_size);
1493 debugging ? "enabled" : "disabled");
1494 1444
1495 /* swiotlb for devices that aren't behind the Calgary. */ 1445 /* swiotlb for devices that aren't behind the Calgary. */
1496 if (max_pfn > MAX_DMA32_PFN) 1446 if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035c..cfd9f9063896 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
144} 144}
145 145
146#ifdef CONFIG_IOMMU_LEAK 146#ifdef CONFIG_IOMMU_LEAK
147
148#define SET_LEAK(x) \
149 do { \
150 if (iommu_leak_tab) \
151 iommu_leak_tab[x] = __builtin_return_address(0);\
152 } while (0)
153
154#define CLEAR_LEAK(x) \
155 do { \
156 if (iommu_leak_tab) \
157 iommu_leak_tab[x] = NULL; \
158 } while (0)
159
160/* Debugging aid for drivers that don't free their IOMMU tables */ 147/* Debugging aid for drivers that don't free their IOMMU tables */
161static void **iommu_leak_tab;
162static int leak_trace; 148static int leak_trace;
163static int iommu_leak_pages = 20; 149static int iommu_leak_pages = 20;
164 150
165static void dump_leak(void) 151static void dump_leak(void)
166{ 152{
167 int i;
168 static int dump; 153 static int dump;
169 154
170 if (dump || !iommu_leak_tab) 155 if (dump)
171 return; 156 return;
172 dump = 1; 157 dump = 1;
173 show_stack(NULL, NULL);
174 158
175 /* Very crude. dump some from the end of the table too */ 159 show_stack(NULL, NULL);
176 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", 160 debug_dma_dump_mappings(NULL);
177 iommu_leak_pages);
178 for (i = 0; i < iommu_leak_pages; i += 2) {
179 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
180 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
181 0);
182 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
183 }
184 printk(KERN_DEBUG "\n");
185} 161}
186#else
187# define SET_LEAK(x)
188# define CLEAR_LEAK(x)
189#endif 162#endif
190 163
191static void iommu_full(struct device *dev, size_t size, int dir) 164static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
248 221
249 for (i = 0; i < npages; i++) { 222 for (i = 0; i < npages; i++) {
250 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); 223 iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
251 SET_LEAK(iommu_page + i);
252 phys_mem += PAGE_SIZE; 224 phys_mem += PAGE_SIZE;
253 } 225 }
254 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 226 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
294 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); 266 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
295 for (i = 0; i < npages; i++) { 267 for (i = 0; i < npages; i++) {
296 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 268 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
297 CLEAR_LEAK(iommu_page + i);
298 } 269 }
299 free_iommu(iommu_page, npages); 270 free_iommu(iommu_page, npages);
300} 271}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
377 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); 348 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
378 while (pages--) { 349 while (pages--) {
379 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 350 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
380 SET_LEAK(iommu_page);
381 addr += PAGE_SIZE; 351 addr += PAGE_SIZE;
382 iommu_page++; 352 iommu_page++;
383 } 353 }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
688 658
689 agp_gatt_table = gatt; 659 agp_gatt_table = gatt;
690 660
691 enable_gart_translations();
692
693 error = sysdev_class_register(&gart_sysdev_class); 661 error = sysdev_class_register(&gart_sysdev_class);
694 if (!error) 662 if (!error)
695 error = sysdev_register(&device_gart); 663 error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
801 769
802#ifdef CONFIG_IOMMU_LEAK 770#ifdef CONFIG_IOMMU_LEAK
803 if (leak_trace) { 771 if (leak_trace) {
804 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 772 int ret;
805 get_order(iommu_pages*sizeof(void *))); 773
806 if (!iommu_leak_tab) 774 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret)
807 printk(KERN_DEBUG 776 printk(KERN_DEBUG
808 "PCI-DMA: Cannot allocate leak trace area\n"); 777 "PCI-DMA: Cannot trace all the entries\n");
809 } 778 }
810#endif 779#endif
811 780
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
845 * the pages as Not-Present: 814 * the pages as Not-Present:
846 */ 815 */
847 wbinvd(); 816 wbinvd();
817
818 /*
819 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility
821 * of stale cache entries that can lead to GART PTE
822 * errors.
823 */
824 enable_gart_translations();
848 825
849 /* 826 /*
850 * Try to workaround a bug (thanks to BenH): 827 * Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e268..a1712f2b50f1 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
28 return paddr; 28 return paddr;
29} 29}
30 30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) 31phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
32{ 32{
33 return baddr; 33 return baddr;
34} 34}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e847..3bb2be1649bd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h>
11#include <trace/power.h> 12#include <trace/power.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/syscalls.h>
14#include <asm/idle.h> 16#include <asm/idle.h>
15#include <asm/uaccess.h> 17#include <asm/uaccess.h>
16#include <asm/i387.h> 18#include <asm/i387.h>
19#include <asm/ds.h>
17 20
18unsigned long idle_halt; 21unsigned long idle_halt;
19EXPORT_SYMBOL(idle_halt); 22EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
45 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); 48 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
46 tsk->thread.xstate = NULL; 49 tsk->thread.xstate = NULL;
47 } 50 }
51
52 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
48} 53}
49 54
50void free_thread_info(struct thread_info *ti) 55void free_thread_info(struct thread_info *ti)
@@ -83,8 +88,6 @@ void exit_thread(void)
83 put_cpu(); 88 put_cpu();
84 kfree(bp); 89 kfree(bp);
85 } 90 }
86
87 ds_exit_thread(current);
88} 91}
89 92
90void flush_thread(void) 93void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
613} 616}
614early_param("idle", idle_setup); 617early_param("idle", idle_setup);
615 618
619unsigned long arch_align_stack(unsigned long sp)
620{
621 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
622 sp -= get_random_int() % 8192;
623 return sp & ~0xf;
624}
625
626unsigned long arch_randomize_brk(struct mm_struct *mm)
627{
628 unsigned long range_end = mm->brk + 0x02000000;
629 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
630}
631
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2..59f4524984af 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12#include <stdarg.h>
13
14#include <linux/stackprotector.h> 12#include <linux/stackprotector.h>
15#include <linux/cpu.h> 13#include <linux/cpu.h>
16#include <linux/errno.h> 14#include <linux/errno.h>
@@ -33,7 +31,6 @@
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/kallsyms.h> 32#include <linux/kallsyms.h>
35#include <linux/ptrace.h> 33#include <linux/ptrace.h>
36#include <linux/random.h>
37#include <linux/personality.h> 34#include <linux/personality.h>
38#include <linux/tick.h> 35#include <linux/tick.h>
39#include <linux/percpu.h> 36#include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
290 p->thread.io_bitmap_max = 0; 287 p->thread.io_bitmap_max = 0;
291 } 288 }
292 289
293 ds_copy_thread(p, current); 290 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
291 p->thread.ds_ctx = NULL;
294 292
295 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); 293 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
296 p->thread.debugctlmsr = 0; 294 p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
407 * done before math_state_restore, so the TS bit is up 405 * done before math_state_restore, so the TS bit is up
408 * to date. 406 * to date.
409 */ 407 */
410 arch_leave_lazy_cpu_mode(); 408 arch_end_context_switch(next_p);
411 409
412 /* If the task has used fpu the last 5 timeslices, just do a full 410 /* If the task has used fpu the last 5 timeslices, just do a full
413 * restore of the math state immediately to avoid the trap; the 411 * restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
497 return 0; 495 return 0;
498} 496}
499 497
500unsigned long arch_align_stack(unsigned long sp)
501{
502 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
503 sp -= get_random_int() % 8192;
504 return sp & ~0xf;
505}
506
507unsigned long arch_randomize_brk(struct mm_struct *mm)
508{
509 unsigned long range_end = mm->brk + 0x02000000;
510 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
511}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b1..ebefb5407b9d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <stdarg.h>
18
19#include <linux/stackprotector.h> 17#include <linux/stackprotector.h>
20#include <linux/cpu.h> 18#include <linux/cpu.h>
21#include <linux/errno.h> 19#include <linux/errno.h>
@@ -32,7 +30,6 @@
32#include <linux/delay.h> 30#include <linux/delay.h>
33#include <linux/module.h> 31#include <linux/module.h>
34#include <linux/ptrace.h> 32#include <linux/ptrace.h>
35#include <linux/random.h>
36#include <linux/notifier.h> 33#include <linux/notifier.h>
37#include <linux/kprobes.h> 34#include <linux/kprobes.h>
38#include <linux/kdebug.h> 35#include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
335 goto out; 332 goto out;
336 } 333 }
337 334
338 ds_copy_thread(p, me); 335 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
336 p->thread.ds_ctx = NULL;
339 337
340 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); 338 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
341 p->thread.debugctlmsr = 0; 339 p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
428 * done before math_state_restore, so the TS bit is up 426 * done before math_state_restore, so the TS bit is up
429 * to date. 427 * to date.
430 */ 428 */
431 arch_leave_lazy_cpu_mode(); 429 arch_end_context_switch(next_p);
432 430
433 /* 431 /*
434 * Switch FS and GS. 432 * Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
660 return do_arch_prctl(current, code, addr); 658 return do_arch_prctl(current, code, addr);
661} 659}
662 660
663unsigned long arch_align_stack(unsigned long sp)
664{
665 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
666 sp -= get_random_int() % 8192;
667 return sp & ~0xf;
668}
669
670unsigned long arch_randomize_brk(struct mm_struct *mm)
671{
672 unsigned long range_end = mm->brk + 0x02000000;
673 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
674}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 23b7c8f017e2..09ecbde91c13 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
578} 579}
579 580
580#ifdef CONFIG_X86_PTRACE_BTS 581#ifdef CONFIG_X86_PTRACE_BTS
582/*
583 * A branch trace store context.
584 *
585 * Contexts may only be installed by ptrace_bts_config() and only for
586 * ptraced tasks.
587 *
588 * Contexts are destroyed when the tracee is detached from the tracer.
589 * The actual destruction work requires interrupts enabled, so the
590 * work is deferred and will be scheduled during __ptrace_unlink().
591 *
592 * Contexts hold an additional task_struct reference on the traced
593 * task, as well as a reference on the tracer's mm.
594 *
595 * Ptrace already holds a task_struct for the duration of ptrace operations,
596 * but since destruction is deferred, it may be executed after both
597 * tracer and tracee exited.
598 */
599struct bts_context {
600 /* The branch trace handle. */
601 struct bts_tracer *tracer;
602
603 /* The buffer used to store the branch trace and its size. */
604 void *buffer;
605 unsigned int size;
606
607 /* The mm that paid for the above buffer. */
608 struct mm_struct *mm;
609
610 /* The task this context belongs to. */
611 struct task_struct *task;
612
613 /* The signal to send on a bts buffer overflow. */
614 unsigned int bts_ovfl_signal;
615
616 /* The work struct to destroy a context. */
617 struct work_struct work;
618};
619
620static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
621{
622 void *buffer = NULL;
623 int err = -ENOMEM;
624
625 err = account_locked_memory(current->mm, current->signal->rlim, size);
626 if (err < 0)
627 return err;
628
629 buffer = kzalloc(size, GFP_KERNEL);
630 if (!buffer)
631 goto out_refund;
632
633 context->buffer = buffer;
634 context->size = size;
635 context->mm = get_task_mm(current);
636
637 return 0;
638
639 out_refund:
640 refund_locked_memory(current->mm, size);
641 return err;
642}
643
644static inline void free_bts_buffer(struct bts_context *context)
645{
646 if (!context->buffer)
647 return;
648
649 kfree(context->buffer);
650 context->buffer = NULL;
651
652 refund_locked_memory(context->mm, context->size);
653 context->size = 0;
654
655 mmput(context->mm);
656 context->mm = NULL;
657}
658
659static void free_bts_context_work(struct work_struct *w)
660{
661 struct bts_context *context;
662
663 context = container_of(w, struct bts_context, work);
664
665 ds_release_bts(context->tracer);
666 put_task_struct(context->task);
667 free_bts_buffer(context);
668 kfree(context);
669}
670
671static inline void free_bts_context(struct bts_context *context)
672{
673 INIT_WORK(&context->work, free_bts_context_work);
674 schedule_work(&context->work);
675}
676
677static inline struct bts_context *alloc_bts_context(struct task_struct *task)
678{
679 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
680 if (context) {
681 context->task = task;
682 task->bts = context;
683
684 get_task_struct(task);
685 }
686
687 return context;
688}
689
581static int ptrace_bts_read_record(struct task_struct *child, size_t index, 690static int ptrace_bts_read_record(struct task_struct *child, size_t index,
582 struct bts_struct __user *out) 691 struct bts_struct __user *out)
583{ 692{
693 struct bts_context *context;
584 const struct bts_trace *trace; 694 const struct bts_trace *trace;
585 struct bts_struct bts; 695 struct bts_struct bts;
586 const unsigned char *at; 696 const unsigned char *at;
587 int error; 697 int error;
588 698
589 trace = ds_read_bts(child->bts); 699 context = child->bts;
700 if (!context)
701 return -ESRCH;
702
703 trace = ds_read_bts(context->tracer);
590 if (!trace) 704 if (!trace)
591 return -EPERM; 705 return -ESRCH;
592 706
593 at = trace->ds.top - ((index + 1) * trace->ds.size); 707 at = trace->ds.top - ((index + 1) * trace->ds.size);
594 if ((void *)at < trace->ds.begin) 708 if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
597 if (!trace->read) 711 if (!trace->read)
598 return -EOPNOTSUPP; 712 return -EOPNOTSUPP;
599 713
600 error = trace->read(child->bts, at, &bts); 714 error = trace->read(context->tracer, at, &bts);
601 if (error < 0) 715 if (error < 0)
602 return error; 716 return error;
603 717
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
611 long size, 725 long size,
612 struct bts_struct __user *out) 726 struct bts_struct __user *out)
613{ 727{
728 struct bts_context *context;
614 const struct bts_trace *trace; 729 const struct bts_trace *trace;
615 const unsigned char *at; 730 const unsigned char *at;
616 int error, drained = 0; 731 int error, drained = 0;
617 732
618 trace = ds_read_bts(child->bts); 733 context = child->bts;
734 if (!context)
735 return -ESRCH;
736
737 trace = ds_read_bts(context->tracer);
619 if (!trace) 738 if (!trace)
620 return -EPERM; 739 return -ESRCH;
621 740
622 if (!trace->read) 741 if (!trace->read)
623 return -EOPNOTSUPP; 742 return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
628 for (at = trace->ds.begin; (void *)at < trace->ds.top; 747 for (at = trace->ds.begin; (void *)at < trace->ds.top;
629 out++, drained++, at += trace->ds.size) { 748 out++, drained++, at += trace->ds.size) {
630 struct bts_struct bts; 749 struct bts_struct bts;
631 int error;
632 750
633 error = trace->read(child->bts, at, &bts); 751 error = trace->read(context->tracer, at, &bts);
634 if (error < 0) 752 if (error < 0)
635 return error; 753 return error;
636 754
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
640 758
641 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); 759 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
642 760
643 error = ds_reset_bts(child->bts); 761 error = ds_reset_bts(context->tracer);
644 if (error < 0) 762 if (error < 0)
645 return error; 763 return error;
646 764
647 return drained; 765 return drained;
648} 766}
649 767
650static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
651{
652 child->bts_buffer = alloc_locked_buffer(size);
653 if (!child->bts_buffer)
654 return -ENOMEM;
655
656 child->bts_size = size;
657
658 return 0;
659}
660
661static void ptrace_bts_free_buffer(struct task_struct *child)
662{
663 free_locked_buffer(child->bts_buffer, child->bts_size);
664 child->bts_buffer = NULL;
665 child->bts_size = 0;
666}
667
668static int ptrace_bts_config(struct task_struct *child, 768static int ptrace_bts_config(struct task_struct *child,
669 long cfg_size, 769 long cfg_size,
670 const struct ptrace_bts_config __user *ucfg) 770 const struct ptrace_bts_config __user *ucfg)
671{ 771{
772 struct bts_context *context;
672 struct ptrace_bts_config cfg; 773 struct ptrace_bts_config cfg;
673 unsigned int flags = 0; 774 unsigned int flags = 0;
674 775
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
678 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 779 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
679 return -EFAULT; 780 return -EFAULT;
680 781
681 if (child->bts) { 782 context = child->bts;
682 ds_release_bts(child->bts); 783 if (!context)
683 child->bts = NULL; 784 context = alloc_bts_context(child);
684 } 785 if (!context)
786 return -ENOMEM;
685 787
686 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 788 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
687 if (!cfg.signal) 789 if (!cfg.signal)
688 return -EINVAL; 790 return -EINVAL;
689 791
690 child->thread.bts_ovfl_signal = cfg.signal;
691 return -EOPNOTSUPP; 792 return -EOPNOTSUPP;
793 context->bts_ovfl_signal = cfg.signal;
692 } 794 }
693 795
694 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && 796 ds_release_bts(context->tracer);
695 (cfg.size != child->bts_size)) { 797 context->tracer = NULL;
696 int error;
697 798
698 ptrace_bts_free_buffer(child); 799 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
800 int err;
699 801
700 error = ptrace_bts_allocate_buffer(child, cfg.size); 802 free_bts_buffer(context);
701 if (error < 0) 803 if (!cfg.size)
702 return error; 804 return 0;
805
806 err = alloc_bts_buffer(context, cfg.size);
807 if (err < 0)
808 return err;
703 } 809 }
704 810
705 if (cfg.flags & PTRACE_BTS_O_TRACE) 811 if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
708 if (cfg.flags & PTRACE_BTS_O_SCHED) 814 if (cfg.flags & PTRACE_BTS_O_SCHED)
709 flags |= BTS_TIMESTAMPS; 815 flags |= BTS_TIMESTAMPS;
710 816
711 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, 817 context->tracer =
712 /* ovfl = */ NULL, /* th = */ (size_t)-1, 818 ds_request_bts_task(child, context->buffer, context->size,
713 flags); 819 NULL, (size_t)-1, flags);
714 if (IS_ERR(child->bts)) { 820 if (unlikely(IS_ERR(context->tracer))) {
715 int error = PTR_ERR(child->bts); 821 int error = PTR_ERR(context->tracer);
716
717 ptrace_bts_free_buffer(child);
718 child->bts = NULL;
719 822
823 free_bts_buffer(context);
824 context->tracer = NULL;
720 return error; 825 return error;
721 } 826 }
722 827
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
727 long cfg_size, 832 long cfg_size,
728 struct ptrace_bts_config __user *ucfg) 833 struct ptrace_bts_config __user *ucfg)
729{ 834{
835 struct bts_context *context;
730 const struct bts_trace *trace; 836 const struct bts_trace *trace;
731 struct ptrace_bts_config cfg; 837 struct ptrace_bts_config cfg;
732 838
839 context = child->bts;
840 if (!context)
841 return -ESRCH;
842
733 if (cfg_size < sizeof(cfg)) 843 if (cfg_size < sizeof(cfg))
734 return -EIO; 844 return -EIO;
735 845
736 trace = ds_read_bts(child->bts); 846 trace = ds_read_bts(context->tracer);
737 if (!trace) 847 if (!trace)
738 return -EPERM; 848 return -ESRCH;
739 849
740 memset(&cfg, 0, sizeof(cfg)); 850 memset(&cfg, 0, sizeof(cfg));
741 cfg.size = trace->ds.end - trace->ds.begin; 851 cfg.size = trace->ds.end - trace->ds.begin;
742 cfg.signal = child->thread.bts_ovfl_signal; 852 cfg.signal = context->bts_ovfl_signal;
743 cfg.bts_size = sizeof(struct bts_struct); 853 cfg.bts_size = sizeof(struct bts_struct);
744 854
745 if (cfg.signal) 855 if (cfg.signal)
746 cfg.flags |= PTRACE_BTS_O_SIGNAL; 856 cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
759 869
760static int ptrace_bts_clear(struct task_struct *child) 870static int ptrace_bts_clear(struct task_struct *child)
761{ 871{
872 struct bts_context *context;
762 const struct bts_trace *trace; 873 const struct bts_trace *trace;
763 874
764 trace = ds_read_bts(child->bts); 875 context = child->bts;
876 if (!context)
877 return -ESRCH;
878
879 trace = ds_read_bts(context->tracer);
765 if (!trace) 880 if (!trace)
766 return -EPERM; 881 return -ESRCH;
767 882
768 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); 883 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
769 884
770 return ds_reset_bts(child->bts); 885 return ds_reset_bts(context->tracer);
771} 886}
772 887
773static int ptrace_bts_size(struct task_struct *child) 888static int ptrace_bts_size(struct task_struct *child)
774{ 889{
890 struct bts_context *context;
775 const struct bts_trace *trace; 891 const struct bts_trace *trace;
776 892
777 trace = ds_read_bts(child->bts); 893 context = child->bts;
894 if (!context)
895 return -ESRCH;
896
897 trace = ds_read_bts(context->tracer);
778 if (!trace) 898 if (!trace)
779 return -EPERM; 899 return -ESRCH;
780 900
781 return (trace->ds.top - trace->ds.begin) / trace->ds.size; 901 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
782} 902}
783 903
784static void ptrace_bts_fork(struct task_struct *tsk) 904/*
785{ 905 * Called from __ptrace_unlink() after the child has been moved back
786 tsk->bts = NULL; 906 * to its original parent.
787 tsk->bts_buffer = NULL; 907 */
788 tsk->bts_size = 0; 908void ptrace_bts_untrace(struct task_struct *child)
789 tsk->thread.bts_ovfl_signal = 0;
790}
791
792static void ptrace_bts_untrace(struct task_struct *child)
793{ 909{
794 if (unlikely(child->bts)) { 910 if (unlikely(child->bts)) {
795 ds_release_bts(child->bts); 911 free_bts_context(child->bts);
796 child->bts = NULL; 912 child->bts = NULL;
797
798 /* We cannot update total_vm and locked_vm since
799 child's mm is already gone. But we can reclaim the
800 memory. */
801 kfree(child->bts_buffer);
802 child->bts_buffer = NULL;
803 child->bts_size = 0;
804 } 913 }
805} 914}
806
807static void ptrace_bts_detach(struct task_struct *child)
808{
809 /*
810 * Ptrace_detach() races with ptrace_untrace() in case
811 * the child dies and is reaped by another thread.
812 *
813 * We only do the memory accounting at this point and
814 * leave the buffer deallocation and the bts tracer
815 * release to ptrace_bts_untrace() which will be called
816 * later on with tasklist_lock held.
817 */
818 release_locked_buffer(child->bts_buffer, child->bts_size);
819}
820#else
821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */ 915#endif /* CONFIG_X86_PTRACE_BTS */
825 916
826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
827{
828 ptrace_bts_fork(child);
829}
830
831void x86_ptrace_untrace(struct task_struct *child)
832{
833 ptrace_bts_untrace(child);
834}
835
836/* 917/*
837 * Called by kernel/ptrace.c when detaching.. 918 * Called by kernel/ptrace.c when detaching..
838 * 919 *
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
844#ifdef TIF_SYSCALL_EMU 925#ifdef TIF_SYSCALL_EMU
845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 926 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
846#endif 927#endif
847 ptrace_bts_detach(child);
848} 928}
849 929
850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 930#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f03..af71d06624bf 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494#endif
495
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
497/* Set correct numa_node information for AMD NB functions */
498static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{
500 struct pci_dev *nb_ht;
501 unsigned int devfn;
502 u32 val;
503
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
505 nb_ht = pci_get_slot(dev->bus, devfn);
506 if (!nb_ht)
507 return;
508
509 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7);
511 pci_dev_put(dev);
512}
494 513
514DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
515 quirk_amd_nb_node);
516DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
517 quirk_amd_nb_node);
518DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
519 quirk_amd_nb_node);
520DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
521 quirk_amd_nb_node);
522DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
523 quirk_amd_nb_node);
524DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
525 quirk_amd_nb_node);
526DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
527 quirk_amd_nb_node);
528DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
529 quirk_amd_nb_node);
530DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
531 quirk_amd_nb_node);
495#endif 532#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a0..d2d1ce8170f0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 192 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
193 }, 193 },
194 }, 194 },
195 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
196 .callback = set_bios_reboot,
197 .ident = "Dell OptiPlex 360",
198 .matches = {
199 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
200 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
201 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
202 },
203 },
195 { /* Handle problems with rebooting on Dell 2400's */ 204 { /* Handle problems with rebooting on Dell 2400's */
196 .callback = set_bios_reboot, 205 .callback = set_bios_reboot,
197 .ident = "Dell PowerEdge 2400", 206 .ident = "Dell PowerEdge 2400",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 523bb697120d..d1c636bf31a7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
112#define ARCH_SETUP 112#define ARCH_SETUP
113#endif 113#endif
114 114
115/*
116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
117 * The direct mapping extends to max_pfn_mapped, so that we can directly access
118 * apertures, ACPI and other tables without having to play with fixmaps.
119 */
120unsigned long max_low_pfn_mapped;
121unsigned long max_pfn_mapped;
122
115RESERVE_BRK(dmi_alloc, 65536); 123RESERVE_BRK(dmi_alloc, 65536);
116 124
117unsigned int boot_cpu_id __read_mostly; 125unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
214unsigned long mmu_cr4_features = X86_CR4_PAE; 222unsigned long mmu_cr4_features = X86_CR4_PAE;
215#endif 223#endif
216 224
217/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 225/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
218int bootloader_type; 226int bootloader_type, bootloader_version;
219 227
220/* 228/*
221 * Setup options 229 * Setup options
@@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p)
706#endif 714#endif
707 saved_video_mode = boot_params.hdr.vid_mode; 715 saved_video_mode = boot_params.hdr.vid_mode;
708 bootloader_type = boot_params.hdr.type_of_loader; 716 bootloader_type = boot_params.hdr.type_of_loader;
717 if ((bootloader_type >> 4) == 0xe) {
718 bootloader_type &= 0xf;
719 bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
720 }
721 bootloader_version = bootloader_type & 0xf;
722 bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
709 723
710#ifdef CONFIG_BLK_DEV_RAM 724#ifdef CONFIG_BLK_DEV_RAM
711 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; 725 rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
854 max_low_pfn = max_pfn; 868 max_low_pfn = max_pfn;
855 869
856 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 870 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
871 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
857#endif 872#endif
858 873
859#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 874#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
860 setup_bios_corruption_check(); 875 setup_bios_corruption_check();
861#endif 876#endif
862 877
878 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
879 max_pfn_mapped<<PAGE_SHIFT);
880
863 reserve_brk(); 881 reserve_brk();
864 882
865 /* max_pfn_mapped is updated here */ 883 /* max_pfn_mapped is updated here */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13be36b3..9c3f0823e6aa 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 425 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
426#endif 426#endif
427 427
428#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
429 /*
430 * make sure boot cpu node_number is right, when boot cpu is on the
431 * node that doesn't have mem installed
432 */
433 per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
434#endif
435
428 /* Setup node to cpumask map */ 436 /* Setup node to cpumask map */
429 setup_node_to_cpumask_map(); 437 setup_node_to_cpumask_map();
430 438
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4976888094f0..4c578751e94e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index bf1831aa14fa..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -198,6 +198,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
198{ 198{
199 ack_APIC_irq(); 199 ack_APIC_irq();
200 inc_irq_stat(irq_resched_count); 200 inc_irq_stat(irq_resched_count);
201 /*
202 * KVM uses this interrupt to force a cpu out of guest mode
203 */
201} 204}
202 205
203void smp_call_function_interrupt(struct pt_regs *regs) 206void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d2e8de958156..7c80007ea5f7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
992 */ 992 */
993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && 993 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
994 !cpu_has_apic) { 994 !cpu_has_apic) {
995 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 995 if (!disable_apic) {
996 boot_cpu_physical_apicid); 996 pr_err("BIOS bug, local APIC #%d not detected!...\n",
997 printk(KERN_ERR "... forcing use of dummy APIC emulation." 997 boot_cpu_physical_apicid);
998 pr_err("... forcing use of dummy APIC emulation."
998 "(tell your hw vendor)\n"); 999 "(tell your hw vendor)\n");
1000 }
999 smpboot_clear_io_apic(); 1001 smpboot_clear_io_apic();
1000 arch_disable_smp_support(); 1002 arch_disable_smp_support();
1001 return -1; 1003 return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d1..4aaf7e48394f 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
20 20
21static int save_stack_stack(void *data, char *name) 21static int save_stack_stack(void *data, char *name)
22{ 22{
23 return -1; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void save_stack_address(void *data, unsigned long addr, int reliable)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6d..124d40c575df 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
715 struct bau_desc *adp; 715 struct bau_desc *adp;
716 struct bau_desc *ad2; 716 struct bau_desc *ad2;
717 717
718 adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); 718 /*
719 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
720 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
721 */
722 adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
723 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
719 BUG_ON(!adp); 724 BUG_ON(!adp);
720 725
721 pa = uv_gpa(adp); /* need the real nasid*/ 726 pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
729 (n << UV_DESC_BASE_PNODE_SHIFT | m)); 734 (n << UV_DESC_BASE_PNODE_SHIFT | m));
730 } 735 }
731 736
732 for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { 737 /*
738 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
739 * cpu even though we only use the first one; one descriptor can
740 * describe a broadcast to 256 nodes.
741 */
742 for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
743 i++, ad2++) {
733 memset(ad2, 0, sizeof(struct bau_desc)); 744 memset(ad2, 0, sizeof(struct bau_desc));
734 ad2->header.sw_ack_flag = 1; 745 ad2->header.sw_ack_flag = 1;
735 /* 746 /*
@@ -832,7 +843,7 @@ static int __init uv_bau_init(void)
832 return 0; 843 return 0;
833 844
834 for_each_possible_cpu(cur_cpu) 845 for_each_possible_cpu(cur_cpu)
835 alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 846 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
836 GFP_KERNEL, cpu_to_node(cur_cpu)); 847 GFP_KERNEL, cpu_to_node(cur_cpu));
837 848
838 uv_bau_retry_limit = 1; 849 uv_bau_retry_limit = 1;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f4d683b630ba..1e1e27b7d438 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
839 } 839 }
840 840
841 clts(); /* Allow maths ops (or we recurse) */ 841 clts(); /* Allow maths ops (or we recurse) */
842#ifdef CONFIG_X86_32
843 restore_fpu(tsk);
844#else
845 /* 842 /*
846 * Paranoid restore. send a SIGSEGV if we fail to restore the state. 843 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
847 */ 844 */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
850 force_sig(SIGSEGV, tsk); 847 force_sig(SIGSEGV, tsk);
851 return; 848 return;
852 } 849 }
853#endif 850
854 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 851 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
855 tsk->fpu_counter++; 852 tsk->fpu_counter++;
856} 853}
@@ -945,8 +942,13 @@ void __init trap_init(void)
945#endif 942#endif
946 set_intr_gate(19, &simd_coprocessor_error); 943 set_intr_gate(19, &simd_coprocessor_error);
947 944
945 /* Reserve all the builtin and the syscall vector: */
946 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
947 set_bit(i, used_vectors);
948
948#ifdef CONFIG_IA32_EMULATION 949#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 950 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
951 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 952#endif
951 953
952#ifdef CONFIG_X86_32 954#ifdef CONFIG_X86_32
@@ -963,14 +965,9 @@ void __init trap_init(void)
963 } 965 }
964 966
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 967 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
968 set_bit(SYSCALL_VECTOR, used_vectors);
966#endif 969#endif
967 970
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
973
974 /* 971 /*
975 * Should be a barrier for any external CPU state: 972 * Should be a barrier for any external CPU state:
976 */ 973 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc430..3e1c057e98fe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
384{ 384{
385 u64 tsc1, tsc2, delta, ref1, ref2; 385 u64 tsc1, tsc2, delta, ref1, ref2;
386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 386 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
387 unsigned long flags, latch, ms, fast_calibrate, tsc_khz; 387 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
388 int hpet = is_hpet_enabled(), i, loopmin; 388 int hpet = is_hpet_enabled(), i, loopmin;
389 389
390 tsc_khz = get_hypervisor_tsc_freq(); 390 hv_tsc_khz = get_hypervisor_tsc_freq();
391 if (tsc_khz) { 391 if (hv_tsc_khz) {
392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); 392 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
393 return tsc_khz; 393 return hv_tsc_khz;
394 } 394 }
395 395
396 local_irq_save(flags); 396 local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
710#ifdef CONFIG_X86_64 710#ifdef CONFIG_X86_64
711static cycle_t __vsyscall_fn vread_tsc(void) 711static cycle_t __vsyscall_fn vread_tsc(void)
712{ 712{
713 cycle_t ret = (cycle_t)vget_cycles(); 713 cycle_t ret;
714
715 /*
716 * Surround the RDTSC by barriers, to make sure it's not
717 * speculated to outside the seqlock critical section and
718 * does not cause time warps:
719 */
720 rdtsc_barrier();
721 ret = (cycle_t)vget_cycles();
722 rdtsc_barrier();
714 723
715 return ret >= __vsyscall_gtod_data.clock.cycle_last ? 724 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
716 ret : __vsyscall_gtod_data.clock.cycle_last; 725 ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef9..027b5b498993 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
37
37static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
38static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
39static __cpuinitdata int nr_warps; 40static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 return; 114 return;
114 115
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO 117 pr_info("Skipping synchronization checks as TSC is reliable.\n");
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return; 118 return;
119 } 119 }
120 120
121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
123 123
124 /* 124 /*
125 * Reset it - in case this is a second bootup: 125 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
143 143
144 if (nr_warps) { 144 if (nr_warps) {
145 printk("\n"); 145 printk("\n");
146 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 " turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
195 while (atomic_read(&stop_count) != cpus) 195 while (atomic_read(&stop_count) != cpus)
196 cpu_relax(); 196 cpu_relax();
197} 197}
198#undef NR_LOOPS
199
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c..9c4e62539058 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
287 info->regs.pt.ds = 0; 287 info->regs.pt.ds = 0;
288 info->regs.pt.es = 0; 288 info->regs.pt.es = 0;
289 info->regs.pt.fs = 0; 289 info->regs.pt.fs = 0;
290 290#ifndef CONFIG_X86_32_LAZY_GS
291/* we are clearing gs later just before "jmp resume_userspace", 291 info->regs.pt.gs = 0;
292 * because it is not saved/restored. 292#endif
293 */
294 293
295/* 294/*
296 * The flags register is also special: we cannot trust that the user 295 * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
318 } 317 }
319 318
320/* 319/*
321 * Save old state, set default return value (%ax) to 0 320 * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
322 */ 321 */
323 info->regs32->ax = 0; 322 info->regs32->ax = VM86_SIGNAL;
324 tsk->thread.saved_sp0 = tsk->thread.sp0; 323 tsk->thread.saved_sp0 = tsk->thread.sp0;
325 tsk->thread.saved_fs = info->regs32->fs; 324 tsk->thread.saved_fs = info->regs32->fs;
326 tsk->thread.saved_gs = get_user_gs(info->regs32); 325 tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
343 __asm__ __volatile__( 342 __asm__ __volatile__(
344 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
345 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345#ifdef CONFIG_X86_32_LAZY_GS
346 "mov %2, %%gs\n\t" 346 "mov %2, %%gs\n\t"
347#endif
347 "jmp resume_userspace" 348 "jmp resume_userspace"
348 : /* no outputs */ 349 : /* no outputs */
349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 350 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211e..b263423fbe2a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
462} 462}
463#endif 463#endif
464 464
465static void vmi_enter_lazy_cpu(void) 465static void vmi_start_context_switch(struct task_struct *prev)
466{ 466{
467 paravirt_enter_lazy_cpu(); 467 paravirt_start_context_switch(prev);
468 vmi_ops.set_lazy_mode(2); 468 vmi_ops.set_lazy_mode(2);
469} 469}
470 470
471static void vmi_end_context_switch(struct task_struct *next)
472{
473 vmi_ops.set_lazy_mode(0);
474 paravirt_end_context_switch(next);
475}
476
471static void vmi_enter_lazy_mmu(void) 477static void vmi_enter_lazy_mmu(void)
472{ 478{
473 paravirt_enter_lazy_mmu(); 479 paravirt_enter_lazy_mmu();
474 vmi_ops.set_lazy_mode(1); 480 vmi_ops.set_lazy_mode(1);
475} 481}
476 482
477static void vmi_leave_lazy(void) 483static void vmi_leave_lazy_mmu(void)
478{ 484{
479 paravirt_leave_lazy(paravirt_get_lazy_mode());
480 vmi_ops.set_lazy_mode(0); 485 vmi_ops.set_lazy_mode(0);
486 paravirt_leave_lazy_mmu();
481} 487}
482 488
483static inline int __init check_vmi_rom(struct vrom_header *rom) 489static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
711 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 717 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
712 para_fill(pv_cpu_ops.io_delay, IODelay); 718 para_fill(pv_cpu_ops.io_delay, IODelay);
713 719
714 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, 720 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
715 set_lazy_mode, SetLazyMode); 721 set_lazy_mode, SetLazyMode);
716 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, 722 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
717 set_lazy_mode, SetLazyMode); 723 set_lazy_mode, SetLazyMode);
718 724
719 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, 725 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
720 set_lazy_mode, SetLazyMode); 726 set_lazy_mode, SetLazyMode);
721 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, 727 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
722 set_lazy_mode, SetLazyMode); 728 set_lazy_mode, SetLazyMode);
723 729
724 /* user and kernel flush are just handled with different flags to FlushTLB */ 730 /* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f013..4c85b2e2bb65 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,431 @@
1/*
2 * ld script for the x86 kernel
3 *
4 * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
5 *
6 * Modernisation, unification and other changes and fixes:
7 * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
8 *
9 *
10 * Don't define absolute symbols until and unless you know that symbol
11 * value should remain constant even if kernel image is relocated
12 * at run time. Absolute symbols are not relocated. If symbol value should
13 * change if kernel is relocated, make the symbol section relative and
14 * put it inside the section definition.
15 */
16
1#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
2# include "vmlinux_32.lds.S" 18#define LOAD_OFFSET __PAGE_OFFSET
3#else 19#else
4# include "vmlinux_64.lds.S" 20#define LOAD_OFFSET __START_KERNEL_map
5#endif 21#endif
22
23#include <asm-generic/vmlinux.lds.h>
24#include <asm/asm-offsets.h>
25#include <asm/thread_info.h>
26#include <asm/page_types.h>
27#include <asm/cache.h>
28#include <asm/boot.h>
29
30#undef i386 /* in case the preprocessor is a 32bit one */
31
32OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
33
34#ifdef CONFIG_X86_32
35OUTPUT_ARCH(i386)
36ENTRY(phys_startup_32)
37jiffies = jiffies_64;
38#else
39OUTPUT_ARCH(i386:x86-64)
40ENTRY(phys_startup_64)
41jiffies_64 = jiffies;
42#endif
43
44PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */
47#ifdef CONFIG_X86_64
48 user PT_LOAD FLAGS(7); /* RWE */
49 data.init PT_LOAD FLAGS(7); /* RWE */
50#ifdef CONFIG_SMP
51 percpu PT_LOAD FLAGS(7); /* RWE */
52#endif
53 data.init2 PT_LOAD FLAGS(7); /* RWE */
54#endif
55 note PT_NOTE FLAGS(0); /* ___ */
56}
57
58SECTIONS
59{
60#ifdef CONFIG_X86_32
61 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
62 phys_startup_32 = startup_32 - LOAD_OFFSET;
63#else
64 . = __START_KERNEL;
65 phys_startup_64 = startup_64 - LOAD_OFFSET;
66#endif
67
68 /* Text and read-only data */
69
70 /* bootstrapping code */
71 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
72 _text = .;
73 *(.text.head)
74 } :text = 0x9090
75
76 /* The rest of the text */
77 .text : AT(ADDR(.text) - LOAD_OFFSET) {
78#ifdef CONFIG_X86_32
79 /* not really needed, already page aligned */
80 . = ALIGN(PAGE_SIZE);
81 *(.text.page_aligned)
82#endif
83 . = ALIGN(8);
84 _stext = .;
85 TEXT_TEXT
86 SCHED_TEXT
87 LOCK_TEXT
88 KPROBES_TEXT
89 IRQENTRY_TEXT
90 *(.fixup)
91 *(.gnu.warning)
92 /* End of text section */
93 _etext = .;
94 } :text = 0x9090
95
96 NOTES :text :note
97
98 /* Exception table */
99 . = ALIGN(16);
100 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
101 __start___ex_table = .;
102 *(__ex_table)
103 __stop___ex_table = .;
104 } :text = 0x9090
105
106 RODATA
107
108 /* Data */
109 . = ALIGN(PAGE_SIZE);
110 .data : AT(ADDR(.data) - LOAD_OFFSET) {
111 DATA_DATA
112 CONSTRUCTORS
113
114#ifdef CONFIG_X86_64
115 /* End of data section */
116 _edata = .;
117#endif
118 } :data
119
120#ifdef CONFIG_X86_32
121 /* 32 bit has nosave before _edata */
122 . = ALIGN(PAGE_SIZE);
123 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
124 __nosave_begin = .;
125 *(.data.nosave)
126 . = ALIGN(PAGE_SIZE);
127 __nosave_end = .;
128 }
129#endif
130
131 . = ALIGN(PAGE_SIZE);
132 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
133 *(.data.page_aligned)
134 *(.data.idt)
135 }
136
137#ifdef CONFIG_X86_32
138 . = ALIGN(32);
139#else
140 . = ALIGN(PAGE_SIZE);
141 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
142#endif
143 .data.cacheline_aligned :
144 AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
145 *(.data.cacheline_aligned)
146 }
147
148 /* rarely changed data like cpu maps */
149#ifdef CONFIG_X86_32
150 . = ALIGN(32);
151#else
152 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
153#endif
154 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
155 *(.data.read_mostly)
156
157#ifdef CONFIG_X86_32
158 /* End of data section */
159 _edata = .;
160#endif
161 }
162
163#ifdef CONFIG_X86_64
164
165#define VSYSCALL_ADDR (-10*1024*1024)
166#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
167 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
168#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
169 SIZEOF(.data.read_mostly) + 4095) & ~(4095))
170
171#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
172#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
173
174#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
175#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
176
177 . = VSYSCALL_ADDR;
178 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
179 *(.vsyscall_0)
180 } :user
181
182 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
183
184 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
185 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
186 *(.vsyscall_fn)
187 }
188
189 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
190 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
191 *(.vsyscall_gtod_data)
192 }
193
194 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
195 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
196 *(.vsyscall_clock)
197 }
198 vsyscall_clock = VVIRT(.vsyscall_clock);
199
200
201 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
202 *(.vsyscall_1)
203 }
204 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
205 *(.vsyscall_2)
206 }
207
208 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
209 *(.vgetcpu_mode)
210 }
211 vgetcpu_mode = VVIRT(.vgetcpu_mode);
212
213 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
214 .jiffies : AT(VLOAD(.jiffies)) {
215 *(.jiffies)
216 }
217 jiffies = VVIRT(.jiffies);
218
219 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
220 *(.vsyscall_3)
221 }
222
223 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
224
225#undef VSYSCALL_ADDR
226#undef VSYSCALL_PHYS_ADDR
227#undef VSYSCALL_VIRT_ADDR
228#undef VLOAD_OFFSET
229#undef VLOAD
230#undef VVIRT_OFFSET
231#undef VVIRT
232
233#endif /* CONFIG_X86_64 */
234
235 /* init_task */
236 . = ALIGN(THREAD_SIZE);
237 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
238 *(.data.init_task)
239 }
240#ifdef CONFIG_X86_64
241 :data.init
242#endif
243
244 /*
245 * smp_locks might be freed after init
246 * start/end must be page aligned
247 */
248 . = ALIGN(PAGE_SIZE);
249 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
250 __smp_locks = .;
251 *(.smp_locks)
252 __smp_locks_end = .;
253 . = ALIGN(PAGE_SIZE);
254 }
255
256 /* Init code and data - will be freed after init */
257 . = ALIGN(PAGE_SIZE);
258 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
259 __init_begin = .; /* paired with __init_end */
260 _sinittext = .;
261 INIT_TEXT
262 _einittext = .;
263 }
264
265 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
266 INIT_DATA
267 }
268
269 . = ALIGN(16);
270 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
271 __setup_start = .;
272 *(.init.setup)
273 __setup_end = .;
274 }
275 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
276 __initcall_start = .;
277 INITCALLS
278 __initcall_end = .;
279 }
280
281 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
282 __con_initcall_start = .;
283 *(.con_initcall.init)
284 __con_initcall_end = .;
285 }
286
287 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
288 __x86_cpu_dev_start = .;
289 *(.x86_cpu_dev.init)
290 __x86_cpu_dev_end = .;
291 }
292
293 SECURITY_INIT
294
295 . = ALIGN(8);
296 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
297 __parainstructions = .;
298 *(.parainstructions)
299 __parainstructions_end = .;
300 }
301
302 . = ALIGN(8);
303 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
304 __alt_instructions = .;
305 *(.altinstructions)
306 __alt_instructions_end = .;
307 }
308
309 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
310 *(.altinstr_replacement)
311 }
312
313 /*
314 * .exit.text is discarded at runtime, not link time, to deal with
315 * references from .altinstructions and .eh_frame
316 */
317 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
318 EXIT_TEXT
319 }
320
321 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
322 EXIT_DATA
323 }
324
325#ifdef CONFIG_BLK_DEV_INITRD
326 . = ALIGN(PAGE_SIZE);
327 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
328 __initramfs_start = .;
329 *(.init.ramfs)
330 __initramfs_end = .;
331 }
332#endif
333
334#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
335 /*
336 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
337 * output PHDR, so the next output section - __data_nosave - should
338 * start another section data.init2. Also, pda should be at the head of
339 * percpu area. Preallocate it and define the percpu offset symbol
340 * so that it can be accessed as a percpu variable.
341 */
342 . = ALIGN(PAGE_SIZE);
343 PERCPU_VADDR(0, :percpu)
344#else
345 PERCPU(PAGE_SIZE)
346#endif
347
348 . = ALIGN(PAGE_SIZE);
349
350 /* freed after init ends here */
351 .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
352 __init_end = .;
353 }
354
355#ifdef CONFIG_X86_64
356 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
357 . = ALIGN(PAGE_SIZE);
358 __nosave_begin = .;
359 *(.data.nosave)
360 . = ALIGN(PAGE_SIZE);
361 __nosave_end = .;
362 } :data.init2
363 /* use another section data.init2, see PERCPU_VADDR() above */
364#endif
365
366 /* BSS */
367 . = ALIGN(PAGE_SIZE);
368 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
369 __bss_start = .;
370 *(.bss.page_aligned)
371 *(.bss)
372 . = ALIGN(4);
373 __bss_stop = .;
374 }
375
376 . = ALIGN(PAGE_SIZE);
377 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
378 __brk_base = .;
379 . += 64 * 1024; /* 64k alignment slop space */
380 *(.brk_reservation) /* areas brk users have reserved */
381 __brk_limit = .;
382 }
383
384 .end : AT(ADDR(.end) - LOAD_OFFSET) {
385 _end = .;
386 }
387
388 /* Sections to be discarded */
389 /DISCARD/ : {
390 *(.exitcall.exit)
391 *(.eh_frame)
392 *(.discard)
393 }
394
395 STABS_DEBUG
396 DWARF_DEBUG
397}
398
399
400#ifdef CONFIG_X86_32
401ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
402 "kernel image bigger than KERNEL_IMAGE_SIZE")
403#else
404/*
405 * Per-cpu symbols which need to be offset from __per_cpu_load
406 * for the boot processor.
407 */
408#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
409INIT_PER_CPU(gdt_page);
410INIT_PER_CPU(irq_stack_union);
411
412/*
413 * Build-time check on the image size:
414 */
415ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
416 "kernel image bigger than KERNEL_IMAGE_SIZE")
417
418#ifdef CONFIG_SMP
419ASSERT((per_cpu__irq_stack_union == 0),
420 "irq_stack_union is not at start of per-cpu area");
421#endif
422
423#endif /* CONFIG_X86_32 */
424
425#ifdef CONFIG_KEXEC
426#include <asm/kexec.h>
427
428ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
429 "kexec control code size is too big")
430#endif
431
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f3..000000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
6 * value should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
9 */
10
11#define LOAD_OFFSET __PAGE_OFFSET
12
13#include <asm-generic/vmlinux.lds.h>
14#include <asm/thread_info.h>
15#include <asm/page_types.h>
16#include <asm/cache.h>
17#include <asm/boot.h>
18
19OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
20OUTPUT_ARCH(i386)
21ENTRY(phys_startup_32)
22jiffies = jiffies_64;
23
24PHDRS {
25 text PT_LOAD FLAGS(5); /* R_E */
26 data PT_LOAD FLAGS(7); /* RWE */
27 note PT_NOTE FLAGS(0); /* ___ */
28}
29SECTIONS
30{
31 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
32 phys_startup_32 = startup_32 - LOAD_OFFSET;
33
34 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
35 _text = .; /* Text and read-only data */
36 *(.text.head)
37 } :text = 0x9090
38
39 /* read-only */
40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
42 *(.text.page_aligned)
43 TEXT_TEXT
44 SCHED_TEXT
45 LOCK_TEXT
46 KPROBES_TEXT
47 IRQENTRY_TEXT
48 *(.fixup)
49 *(.gnu.warning)
50 _etext = .; /* End of text section */
51 } :text = 0x9090
52
53 NOTES :text :note
54
55 . = ALIGN(16); /* Exception table */
56 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
57 __start___ex_table = .;
58 *(__ex_table)
59 __stop___ex_table = .;
60 } :text = 0x9090
61
62 RODATA
63
64 /* writeable */
65 . = ALIGN(PAGE_SIZE);
66 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
67 DATA_DATA
68 CONSTRUCTORS
69 } :data
70
71 . = ALIGN(PAGE_SIZE);
72 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
73 __nosave_begin = .;
74 *(.data.nosave)
75 . = ALIGN(PAGE_SIZE);
76 __nosave_end = .;
77 }
78
79 . = ALIGN(PAGE_SIZE);
80 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
81 *(.data.page_aligned)
82 *(.data.idt)
83 }
84
85 . = ALIGN(32);
86 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
87 *(.data.cacheline_aligned)
88 }
89
90 /* rarely changed data like cpu maps */
91 . = ALIGN(32);
92 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
93 *(.data.read_mostly)
94 _edata = .; /* End of data section */
95 }
96
97 . = ALIGN(THREAD_SIZE); /* init_task */
98 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
99 *(.data.init_task)
100 }
101
102 /* might get freed after init */
103 . = ALIGN(PAGE_SIZE);
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
105 __smp_locks = .;
106 *(.smp_locks)
107 __smp_locks_end = .;
108 }
109 /* will be freed after init
110 * Following ALIGN() is required to make sure no other data falls on the
111 * same page where __smp_alt_end is pointing as that page might be freed
112 * after boot. Always make sure that ALIGN() directive is present after
113 * the section which contains __smp_alt_end.
114 */
115 . = ALIGN(PAGE_SIZE);
116
117 /* will be freed after init */
118 . = ALIGN(PAGE_SIZE); /* Init code and data */
119 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
120 __init_begin = .;
121 _sinittext = .;
122 INIT_TEXT
123 _einittext = .;
124 }
125 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
126 INIT_DATA
127 }
128 . = ALIGN(16);
129 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
130 __setup_start = .;
131 *(.init.setup)
132 __setup_end = .;
133 }
134 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
135 __initcall_start = .;
136 INITCALLS
137 __initcall_end = .;
138 }
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
140 __con_initcall_start = .;
141 *(.con_initcall.init)
142 __con_initcall_end = .;
143 }
144 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
145 __x86_cpu_dev_start = .;
146 *(.x86_cpu_dev.init)
147 __x86_cpu_dev_end = .;
148 }
149 SECURITY_INIT
150 . = ALIGN(4);
151 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
152 __alt_instructions = .;
153 *(.altinstructions)
154 __alt_instructions_end = .;
155 }
156 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
157 *(.altinstr_replacement)
158 }
159 . = ALIGN(4);
160 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
161 __parainstructions = .;
162 *(.parainstructions)
163 __parainstructions_end = .;
164 }
165 /* .exit.text is discard at runtime, not link time, to deal with references
166 from .altinstructions and .eh_frame */
167 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
168 EXIT_TEXT
169 }
170 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
171 EXIT_DATA
172 }
173#if defined(CONFIG_BLK_DEV_INITRD)
174 . = ALIGN(PAGE_SIZE);
175 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
176 __initramfs_start = .;
177 *(.init.ramfs)
178 __initramfs_end = .;
179 }
180#endif
181 PERCPU(PAGE_SIZE)
182 . = ALIGN(PAGE_SIZE);
183 /* freed after init ends here */
184
185 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
186 __init_end = .;
187 __bss_start = .; /* BSS */
188 *(.bss.page_aligned)
189 *(.bss)
190 . = ALIGN(4);
191 __bss_stop = .;
192 }
193
194 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
195 . = ALIGN(PAGE_SIZE);
196 __brk_base = . ;
197 . += 64 * 1024 ; /* 64k alignment slop space */
198 *(.brk_reservation) /* areas brk users have reserved */
199 __brk_limit = . ;
200 }
201
202 .end : AT(ADDR(.end) - LOAD_OFFSET) {
203 _end = . ;
204 }
205
206 /* Sections to be discarded */
207 /DISCARD/ : {
208 *(.exitcall.exit)
209 *(.discard)
210 }
211
212 STABS_DEBUG
213
214 DWARF_DEBUG
215}
216
217/*
218 * Build-time check on the image size:
219 */
220ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
221 "kernel image bigger than KERNEL_IMAGE_SIZE")
222
223#ifdef CONFIG_KEXEC
224/* Link time checks */
225#include <asm/kexec.h>
226
227ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
228 "kexec control code size is too big")
229#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b030..000000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
1/* ld script to make x86-64 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 */
4
5#define LOAD_OFFSET __START_KERNEL_map
6
7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
9#include <asm/page_types.h>
10
11#undef i386 /* in case the preprocessor is a 32bit one */
12
13OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
14OUTPUT_ARCH(i386:x86-64)
15ENTRY(phys_startup_64)
16jiffies_64 = jiffies;
17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
26 note PT_NOTE FLAGS(0); /* ___ */
27}
28SECTIONS
29{
30 . = __START_KERNEL;
31 phys_startup_64 = startup_64 - LOAD_OFFSET;
32 .text : AT(ADDR(.text) - LOAD_OFFSET) {
33 _text = .; /* Text and read-only data */
34 /* First the code that has to be first for bootstrapping */
35 *(.text.head)
36 _stext = .;
37 /* Then the rest */
38 TEXT_TEXT
39 SCHED_TEXT
40 LOCK_TEXT
41 KPROBES_TEXT
42 IRQENTRY_TEXT
43 *(.fixup)
44 *(.gnu.warning)
45 _etext = .; /* End of text section */
46 } :text = 0x9090
47
48 NOTES :text :note
49
50 . = ALIGN(16); /* Exception table */
51 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
52 __start___ex_table = .;
53 *(__ex_table)
54 __stop___ex_table = .;
55 } :text = 0x9090
56
57 RODATA
58
59 . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
60 /* Data */
61 .data : AT(ADDR(.data) - LOAD_OFFSET) {
62 DATA_DATA
63 CONSTRUCTORS
64 _edata = .; /* End of data section */
65 } :data
66
67
68 .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
69 . = ALIGN(PAGE_SIZE);
70 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
71 *(.data.cacheline_aligned)
72 }
73 . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
74 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
75 *(.data.read_mostly)
76 }
77
78#define VSYSCALL_ADDR (-10*1024*1024)
79#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
80#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
81
82#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
83#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
84
85#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
86#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
87
88 . = VSYSCALL_ADDR;
89 .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
90 __vsyscall_0 = VSYSCALL_VIRT_ADDR;
91
92 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
93 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
94 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
95 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
96 { *(.vsyscall_gtod_data) }
97 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
98 .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
99 { *(.vsyscall_clock) }
100 vsyscall_clock = VVIRT(.vsyscall_clock);
101
102
103 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
104 { *(.vsyscall_1) }
105 .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
106 { *(.vsyscall_2) }
107
108 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
109 vgetcpu_mode = VVIRT(.vgetcpu_mode);
110
111 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
112 .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
113 jiffies = VVIRT(.jiffies);
114
115 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
116 { *(.vsyscall_3) }
117
118 . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
119
120#undef VSYSCALL_ADDR
121#undef VSYSCALL_PHYS_ADDR
122#undef VSYSCALL_VIRT_ADDR
123#undef VLOAD_OFFSET
124#undef VLOAD
125#undef VVIRT_OFFSET
126#undef VVIRT
127
128 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
129 . = ALIGN(THREAD_SIZE); /* init_task */
130 *(.data.init_task)
131 }:data.init
132
133 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
134 . = ALIGN(PAGE_SIZE);
135 *(.data.page_aligned)
136 }
137
138 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
139 /* might get freed after init */
140 . = ALIGN(PAGE_SIZE);
141 __smp_alt_begin = .;
142 __smp_locks = .;
143 *(.smp_locks)
144 __smp_locks_end = .;
145 . = ALIGN(PAGE_SIZE);
146 __smp_alt_end = .;
147 }
148
149 . = ALIGN(PAGE_SIZE); /* Init code and data */
150 __init_begin = .; /* paired with __init_end */
151 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
152 _sinittext = .;
153 INIT_TEXT
154 _einittext = .;
155 }
156 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
157 __initdata_begin = .;
158 INIT_DATA
159 __initdata_end = .;
160 }
161
162 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
163 . = ALIGN(16);
164 __setup_start = .;
165 *(.init.setup)
166 __setup_end = .;
167 }
168 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
169 __initcall_start = .;
170 INITCALLS
171 __initcall_end = .;
172 }
173 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
174 __con_initcall_start = .;
175 *(.con_initcall.init)
176 __con_initcall_end = .;
177 }
178 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
179 __x86_cpu_dev_start = .;
180 *(.x86_cpu_dev.init)
181 __x86_cpu_dev_end = .;
182 }
183 SECURITY_INIT
184
185 . = ALIGN(8);
186 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
187 __parainstructions = .;
188 *(.parainstructions)
189 __parainstructions_end = .;
190 }
191
192 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
193 . = ALIGN(8);
194 __alt_instructions = .;
195 *(.altinstructions)
196 __alt_instructions_end = .;
197 }
198 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
199 *(.altinstr_replacement)
200 }
201 /* .exit.text is discard at runtime, not link time, to deal with references
202 from .altinstructions and .eh_frame */
203 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
204 EXIT_TEXT
205 }
206 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
207 EXIT_DATA
208 }
209
210#ifdef CONFIG_BLK_DEV_INITRD
211 . = ALIGN(PAGE_SIZE);
212 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
213 __initramfs_start = .;
214 *(.init.ramfs)
215 __initramfs_end = .;
216 }
217#endif
218
219#ifdef CONFIG_SMP
220 /*
221 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
222 * output PHDR, so the next output section - __data_nosave - should
223 * start another section data.init2. Also, pda should be at the head of
224 * percpu area. Preallocate it and define the percpu offset symbol
225 * so that it can be accessed as a percpu variable.
226 */
227 . = ALIGN(PAGE_SIZE);
228 PERCPU_VADDR(0, :percpu)
229#else
230 PERCPU(PAGE_SIZE)
231#endif
232
233 . = ALIGN(PAGE_SIZE);
234 __init_end = .;
235
236 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
237 . = ALIGN(PAGE_SIZE);
238 __nosave_begin = .;
239 *(.data.nosave)
240 . = ALIGN(PAGE_SIZE);
241 __nosave_end = .;
242 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
243
244 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
245 . = ALIGN(PAGE_SIZE);
246 __bss_start = .; /* BSS */
247 *(.bss.page_aligned)
248 *(.bss)
249 __bss_stop = .;
250 }
251
252 .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
253 . = ALIGN(PAGE_SIZE);
254 __brk_base = . ;
255 . += 64 * 1024 ; /* 64k alignment slop space */
256 *(.brk_reservation) /* areas brk users have reserved */
257 __brk_limit = . ;
258 }
259
260 _end = . ;
261
262 /* Sections to be discarded */
263 /DISCARD/ : {
264 *(.exitcall.exit)
265 *(.eh_frame)
266 *(.discard)
267 }
268
269 STABS_DEBUG
270
271 DWARF_DEBUG
272}
273
274 /*
275 * Per-cpu symbols which need to be offset from __per_cpu_load
276 * for the boot processor.
277 */
278#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
279INIT_PER_CPU(gdt_page);
280INIT_PER_CPU(irq_stack_union);
281
282/*
283 * Build-time check on the image size:
284 */
285ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
286 "kernel image bigger than KERNEL_IMAGE_SIZE")
287
288#ifdef CONFIG_SMP
289ASSERT((per_cpu__irq_stack_union == 0),
290 "irq_stack_union is not at start of per-cpu area");
291#endif
292
293#ifdef CONFIG_KEXEC
294#include <asm/kexec.h>
295
296ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
297 "kexec control code size is too big")
298#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc9067..25ee06a80aad 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
132 return; 132 return;
133 } 133 }
134 134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
141 now = vread(); 135 now = vread();
142 rdtsc_barrier();
143
144 base = __vsyscall_gtod_data.clock.cycle_last; 136 base = __vsyscall_gtod_data.clock.cycle_last;
145 mask = __vsyscall_gtod_data.clock.mask; 137 mask = __vsyscall_gtod_data.clock.mask;
146 mult = __vsyscall_gtod_data.clock.mult; 138 mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78cc..8600a09e0c6c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
50 Provides support for KVM on Intel processors equipped with the VT 50 Provides support for KVM on Intel processors equipped with the VT
51 extensions. 51 extensions.
52 52
53 To compile this as a module, choose M here: the module
54 will be called kvm-intel.
55
53config KVM_AMD 56config KVM_AMD
54 tristate "KVM for AMD processors support" 57 tristate "KVM for AMD processors support"
55 depends on KVM 58 depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
57 Provides support for KVM on AMD processors equipped with the AMD-V 60 Provides support for KVM on AMD processors equipped with the AMD-V
58 (SVM) extensions. 61 (SVM) extensions.
59 62
63 To compile this as a module, choose M here: the module
64 will be called kvm-amd.
65
60config KVM_TRACE 66config KVM_TRACE
61 bool "KVM trace support" 67 bool "KVM trace support"
62 depends on KVM && SYSFS 68 depends on KVM && SYSFS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f2..b43c4efafe80 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
15 15
16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ 16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
17 i8254.o 17 i8254.o timer.o
18obj-$(CONFIG_KVM) += kvm.o 18obj-$(CONFIG_KVM) += kvm.o
19kvm-intel-objs = vmx.o 19kvm-intel-objs = vmx.o
20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d3157..4d6f0d293ee2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel)
98 return kvm->arch.vpit->pit_state.channels[channel].gate; 98 return kvm->arch.vpit->pit_state.channels[channel].gate;
99} 99}
100 100
101static s64 __kpit_elapsed(struct kvm *kvm)
102{
103 s64 elapsed;
104 ktime_t remaining;
105 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
106
107 /*
108 * The Counter does not stop when it reaches zero. In
109 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
110 * the highest count, either FFFF hex for binary counting
111 * or 9999 for BCD counting, and continues counting.
112 * Modes 2 and 3 are periodic; the Counter reloads
113 * itself with the initial count and continues counting
114 * from there.
115 */
116 remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
117 elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
118 elapsed = mod_64(elapsed, ps->pit_timer.period);
119
120 return elapsed;
121}
122
123static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
124 int channel)
125{
126 if (channel == 0)
127 return __kpit_elapsed(kvm);
128
129 return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
130}
131
101static int pit_get_count(struct kvm *kvm, int channel) 132static int pit_get_count(struct kvm *kvm, int channel)
102{ 133{
103 struct kvm_kpit_channel_state *c = 134 struct kvm_kpit_channel_state *c =
@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
107 138
108 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 139 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
109 140
110 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 141 t = kpit_elapsed(kvm, c, channel);
111 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 142 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
112 143
113 switch (c->mode) { 144 switch (c->mode) {
@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
137 168
138 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 169 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
139 170
140 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); 171 t = kpit_elapsed(kvm, c, channel);
141 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); 172 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
142 173
143 switch (c->mode) { 174 switch (c->mode) {
@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 224 }
194} 225}
195 226
196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200
201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203
204 if (!pt->reinject)
205 atomic_set(&pt->pending, 1);
206
207 if (vcpu0 && waitqueue_active(&vcpu0->wq))
208 wake_up_interruptible(&vcpu0->wq);
209
210 hrtimer_add_expires_ns(&pt->timer, pt->period);
211 pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
212 if (pt->period)
213 ps->channels[0].count_load_time = ktime_get();
214
215 return (pt->period == 0 ? 0 : 1);
216}
217
218int pit_has_pending_timer(struct kvm_vcpu *vcpu) 227int pit_has_pending_timer(struct kvm_vcpu *vcpu)
219{ 228{
220 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 229 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
235 spin_unlock(&ps->inject_lock); 244 spin_unlock(&ps->inject_lock);
236} 245}
237 246
238static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
239{
240 struct kvm_kpit_state *ps;
241 int restart_timer = 0;
242
243 ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
244
245 restart_timer = __pit_timer_fn(ps);
246
247 if (restart_timer)
248 return HRTIMER_RESTART;
249 else
250 return HRTIMER_NORESTART;
251}
252
253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 247void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
254{ 248{
255 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 249 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 257 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
264} 258}
265 259
266static void destroy_pit_timer(struct kvm_kpit_timer *pt) 260static void destroy_pit_timer(struct kvm_timer *pt)
267{ 261{
268 pr_debug("pit: execute del timer!\n"); 262 pr_debug("pit: execute del timer!\n");
269 hrtimer_cancel(&pt->timer); 263 hrtimer_cancel(&pt->timer);
270} 264}
271 265
266static bool kpit_is_periodic(struct kvm_timer *ktimer)
267{
268 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
269 pit_timer);
270 return ps->is_periodic;
271}
272
273static struct kvm_timer_ops kpit_ops = {
274 .is_periodic = kpit_is_periodic,
275};
276
272static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 277static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
273{ 278{
274 struct kvm_kpit_timer *pt = &ps->pit_timer; 279 struct kvm_timer *pt = &ps->pit_timer;
275 s64 interval; 280 s64 interval;
276 281
277 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 282 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
280 285
281 /* TODO The new value only affected after the retriggered */ 286 /* TODO The new value only affected after the retriggered */
282 hrtimer_cancel(&pt->timer); 287 hrtimer_cancel(&pt->timer);
283 pt->period = (is_period == 0) ? 0 : interval; 288 pt->period = interval;
284 pt->timer.function = pit_timer_fn; 289 ps->is_periodic = is_period;
290
291 pt->timer.function = kvm_timer_fn;
292 pt->t_ops = &kpit_ops;
293 pt->kvm = ps->pit->kvm;
294 pt->vcpu_id = 0;
295
285 atomic_set(&pt->pending, 0); 296 atomic_set(&pt->pending, 0);
286 ps->irq_ack = 1; 297 ps->irq_ack = 1;
287 298
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
298 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 309 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
299 310
300 /* 311 /*
301 * Though spec said the state of 8254 is undefined after power-up, 312 * The largest possible initial count is 0; this is equivalent
302 * seems some tricky OS like Windows XP depends on IRQ0 interrupt 313 * to 2^16 for binary counting and 10^4 for BCD counting.
303 * when booting up.
304 * So here setting initialize rate for it, and not a specific number
305 */ 314 */
306 if (val == 0) 315 if (val == 0)
307 val = 0x10000; 316 val = 0x10000;
308 317
309 ps->channels[channel].count_load_time = ktime_get();
310 ps->channels[channel].count = val; 318 ps->channels[channel].count = val;
311 319
312 if (channel != 0) 320 if (channel != 0) {
321 ps->channels[channel].count_load_time = ktime_get();
313 return; 322 return;
323 }
314 324
315 /* Two types of timer 325 /* Two types of timer
316 * mode 1 is one shot, mode 2 is period, otherwise del timer */ 326 * mode 1 is one shot, mode 2 is period, otherwise del timer */
317 switch (ps->channels[0].mode) { 327 switch (ps->channels[0].mode) {
328 case 0:
318 case 1: 329 case 1:
319 /* FIXME: enhance mode 4 precision */ 330 /* FIXME: enhance mode 4 precision */
320 case 4: 331 case 4:
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d5..bbd863ff60b7 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,15 +3,6 @@
3 3
4#include "iodev.h" 4#include "iodev.h"
5 5
6struct kvm_kpit_timer {
7 struct hrtimer timer;
8 int irq;
9 s64 period; /* unit: ns */
10 s64 scheduled;
11 atomic_t pending;
12 bool reinject;
13};
14
15struct kvm_kpit_channel_state { 6struct kvm_kpit_channel_state {
16 u32 count; /* can be 65536 */ 7 u32 count; /* can be 65536 */
17 u16 latched_count; 8 u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
30 21
31struct kvm_kpit_state { 22struct kvm_kpit_state {
32 struct kvm_kpit_channel_state channels[3]; 23 struct kvm_kpit_channel_state channels[3];
33 struct kvm_kpit_timer pit_timer; 24 struct kvm_timer pit_timer;
25 bool is_periodic;
34 u32 speaker_data_on; 26 u32 speaker_data_on;
35 struct mutex lock; 27 struct mutex lock;
36 struct kvm_pit *pit; 28 struct kvm_pit *pit;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6fb..96dfbb6ad2a9 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
24 24
25#include "irq.h" 25#include "irq.h"
26#include "i8254.h" 26#include "i8254.h"
27#include "x86.h"
27 28
28/* 29/*
29 * check if there are pending timer events 30 * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
48{ 49{
49 struct kvm_pic *s; 50 struct kvm_pic *s;
50 51
52 if (!irqchip_in_kernel(v->kvm))
53 return v->arch.interrupt.pending;
54
51 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ 55 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
52 if (kvm_apic_accept_pic_intr(v)) { 56 if (kvm_apic_accept_pic_intr(v)) {
53 s = pic_irqchip(v->kvm); /* PIC */ 57 s = pic_irqchip(v->kvm); /* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
67 struct kvm_pic *s; 71 struct kvm_pic *s;
68 int vector; 72 int vector;
69 73
74 if (!irqchip_in_kernel(v->kvm))
75 return v->arch.interrupt.nr;
76
70 vector = kvm_get_apic_interrupt(v); /* APIC */ 77 vector = kvm_get_apic_interrupt(v); /* APIC */
71 if (vector == -1) { 78 if (vector == -1) {
72 if (kvm_apic_accept_pic_intr(v)) { 79 if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 000000000000..26bd6ba74e1c
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 atomic_t pending; /* accumulated triggered timers */
6 bool reinject;
7 struct kvm_timer_ops *t_ops;
8 struct kvm *kvm;
9 int vcpu_id;
10};
11
12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *);
14};
15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd69..ae99d83f81a3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,15 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
196} 196}
197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
198 198
199int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) 199static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
200 int vector, int level, int trig_mode);
201
202int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
200{ 203{
201 struct kvm_lapic *apic = vcpu->arch.apic; 204 struct kvm_lapic *apic = vcpu->arch.apic;
202 205
203 if (!apic_test_and_set_irr(vec, apic)) { 206 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
204 /* a new pending irq is set in IRR */ 207 irq->level, irq->trig_mode);
205 if (trig)
206 apic_set_vector(vec, apic->regs + APIC_TMR);
207 else
208 apic_clear_vector(vec, apic->regs + APIC_TMR);
209 kvm_vcpu_kick(apic->vcpu);
210 return 1;
211 }
212 return 0;
213} 208}
214 209
215static inline int apic_find_highest_isr(struct kvm_lapic *apic) 210static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -250,7 +245,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
250 245
251int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) 246int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
252{ 247{
253 return kvm_apic_id(apic) == dest; 248 return dest == 0xff || kvm_apic_id(apic) == dest;
254} 249}
255 250
256int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) 251int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -279,37 +274,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
279 return result; 274 return result;
280} 275}
281 276
282static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 277int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
283 int short_hand, int dest, int dest_mode) 278 int short_hand, int dest, int dest_mode)
284{ 279{
285 int result = 0; 280 int result = 0;
286 struct kvm_lapic *target = vcpu->arch.apic; 281 struct kvm_lapic *target = vcpu->arch.apic;
287 282
288 apic_debug("target %p, source %p, dest 0x%x, " 283 apic_debug("target %p, source %p, dest 0x%x, "
289 "dest_mode 0x%x, short_hand 0x%x", 284 "dest_mode 0x%x, short_hand 0x%x\n",
290 target, source, dest, dest_mode, short_hand); 285 target, source, dest, dest_mode, short_hand);
291 286
292 ASSERT(!target); 287 ASSERT(!target);
293 switch (short_hand) { 288 switch (short_hand) {
294 case APIC_DEST_NOSHORT: 289 case APIC_DEST_NOSHORT:
295 if (dest_mode == 0) { 290 if (dest_mode == 0)
296 /* Physical mode. */ 291 /* Physical mode. */
297 if ((dest == 0xFF) || (dest == kvm_apic_id(target))) 292 result = kvm_apic_match_physical_addr(target, dest);
298 result = 1; 293 else
299 } else
300 /* Logical mode. */ 294 /* Logical mode. */
301 result = kvm_apic_match_logical_addr(target, dest); 295 result = kvm_apic_match_logical_addr(target, dest);
302 break; 296 break;
303 case APIC_DEST_SELF: 297 case APIC_DEST_SELF:
304 if (target == source) 298 result = (target == source);
305 result = 1;
306 break; 299 break;
307 case APIC_DEST_ALLINC: 300 case APIC_DEST_ALLINC:
308 result = 1; 301 result = 1;
309 break; 302 break;
310 case APIC_DEST_ALLBUT: 303 case APIC_DEST_ALLBUT:
311 if (target != source) 304 result = (target != source);
312 result = 1;
313 break; 305 break;
314 default: 306 default:
315 printk(KERN_WARNING "Bad dest shorthand value %x\n", 307 printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -327,20 +319,22 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
327static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 319static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
328 int vector, int level, int trig_mode) 320 int vector, int level, int trig_mode)
329{ 321{
330 int orig_irr, result = 0; 322 int result = 0;
331 struct kvm_vcpu *vcpu = apic->vcpu; 323 struct kvm_vcpu *vcpu = apic->vcpu;
332 324
333 switch (delivery_mode) { 325 switch (delivery_mode) {
334 case APIC_DM_FIXED:
335 case APIC_DM_LOWEST: 326 case APIC_DM_LOWEST:
327 vcpu->arch.apic_arb_prio++;
328 case APIC_DM_FIXED:
336 /* FIXME add logic for vcpu on reset */ 329 /* FIXME add logic for vcpu on reset */
337 if (unlikely(!apic_enabled(apic))) 330 if (unlikely(!apic_enabled(apic)))
338 break; 331 break;
339 332
340 orig_irr = apic_test_and_set_irr(vector, apic); 333 result = !apic_test_and_set_irr(vector, apic);
341 if (orig_irr && trig_mode) { 334 if (!result) {
342 apic_debug("level trig mode repeatedly for vector %d", 335 if (trig_mode)
343 vector); 336 apic_debug("level trig mode repeatedly for "
337 "vector %d", vector);
344 break; 338 break;
345 } 339 }
346 340
@@ -349,10 +343,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
349 apic_set_vector(vector, apic->regs + APIC_TMR); 343 apic_set_vector(vector, apic->regs + APIC_TMR);
350 } else 344 } else
351 apic_clear_vector(vector, apic->regs + APIC_TMR); 345 apic_clear_vector(vector, apic->regs + APIC_TMR);
352
353 kvm_vcpu_kick(vcpu); 346 kvm_vcpu_kick(vcpu);
354
355 result = (orig_irr == 0);
356 break; 347 break;
357 348
358 case APIC_DM_REMRD: 349 case APIC_DM_REMRD:
@@ -364,12 +355,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
364 break; 355 break;
365 356
366 case APIC_DM_NMI: 357 case APIC_DM_NMI:
358 result = 1;
367 kvm_inject_nmi(vcpu); 359 kvm_inject_nmi(vcpu);
368 kvm_vcpu_kick(vcpu); 360 kvm_vcpu_kick(vcpu);
369 break; 361 break;
370 362
371 case APIC_DM_INIT: 363 case APIC_DM_INIT:
372 if (level) { 364 if (level) {
365 result = 1;
373 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 366 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
374 printk(KERN_DEBUG 367 printk(KERN_DEBUG
375 "INIT on a runnable vcpu %d\n", 368 "INIT on a runnable vcpu %d\n",
@@ -386,6 +379,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
386 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 379 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
387 vcpu->vcpu_id, vector); 380 vcpu->vcpu_id, vector);
388 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 381 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
382 result = 1;
389 vcpu->arch.sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
390 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
391 kvm_vcpu_kick(vcpu); 385 kvm_vcpu_kick(vcpu);
@@ -408,43 +402,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
408 return result; 402 return result;
409} 403}
410 404
411static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 405int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
412 unsigned long bitmap)
413{
414 int last;
415 int next;
416 struct kvm_lapic *apic = NULL;
417
418 last = kvm->arch.round_robin_prev_vcpu;
419 next = last;
420
421 do {
422 if (++next == KVM_MAX_VCPUS)
423 next = 0;
424 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
425 continue;
426 apic = kvm->vcpus[next]->arch.apic;
427 if (apic && apic_enabled(apic))
428 break;
429 apic = NULL;
430 } while (next != last);
431 kvm->arch.round_robin_prev_vcpu = next;
432
433 if (!apic)
434 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
435
436 return apic;
437}
438
439struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
440 unsigned long bitmap)
441{ 406{
442 struct kvm_lapic *apic; 407 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
443
444 apic = kvm_apic_round_robin(kvm, vector, bitmap);
445 if (apic)
446 return apic->vcpu;
447 return NULL;
448} 408}
449 409
450static void apic_set_eoi(struct kvm_lapic *apic) 410static void apic_set_eoi(struct kvm_lapic *apic)
@@ -472,47 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
472{ 432{
473 u32 icr_low = apic_get_reg(apic, APIC_ICR); 433 u32 icr_low = apic_get_reg(apic, APIC_ICR);
474 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 434 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
435 struct kvm_lapic_irq irq;
475 436
476 unsigned int dest = GET_APIC_DEST_FIELD(icr_high); 437 irq.vector = icr_low & APIC_VECTOR_MASK;
477 unsigned int short_hand = icr_low & APIC_SHORT_MASK; 438 irq.delivery_mode = icr_low & APIC_MODE_MASK;
478 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; 439 irq.dest_mode = icr_low & APIC_DEST_MASK;
479 unsigned int level = icr_low & APIC_INT_ASSERT; 440 irq.level = icr_low & APIC_INT_ASSERT;
480 unsigned int dest_mode = icr_low & APIC_DEST_MASK; 441 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
481 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 442 irq.shorthand = icr_low & APIC_SHORT_MASK;
482 unsigned int vector = icr_low & APIC_VECTOR_MASK; 443 irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
483
484 struct kvm_vcpu *target;
485 struct kvm_vcpu *vcpu;
486 unsigned long lpr_map = 0;
487 int i;
488 444
489 apic_debug("icr_high 0x%x, icr_low 0x%x, " 445 apic_debug("icr_high 0x%x, icr_low 0x%x, "
490 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 446 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
491 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", 447 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
492 icr_high, icr_low, short_hand, dest, 448 icr_high, icr_low, irq.shorthand, irq.dest_id,
493 trig_mode, level, dest_mode, delivery_mode, vector); 449 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
494 450 irq.vector);
495 for (i = 0; i < KVM_MAX_VCPUS; i++) {
496 vcpu = apic->vcpu->kvm->vcpus[i];
497 if (!vcpu)
498 continue;
499
500 if (vcpu->arch.apic &&
501 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
502 if (delivery_mode == APIC_DM_LOWEST)
503 set_bit(vcpu->vcpu_id, &lpr_map);
504 else
505 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
506 vector, level, trig_mode);
507 }
508 }
509 451
510 if (delivery_mode == APIC_DM_LOWEST) { 452 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
511 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
512 if (target != NULL)
513 __apic_accept_irq(target->arch.apic, delivery_mode,
514 vector, level, trig_mode);
515 }
516} 453}
517 454
518static u32 apic_get_tmcct(struct kvm_lapic *apic) 455static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -527,12 +464,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
527 if (apic_get_reg(apic, APIC_TMICT) == 0) 464 if (apic_get_reg(apic, APIC_TMICT) == 0)
528 return 0; 465 return 0;
529 466
530 remaining = hrtimer_expires_remaining(&apic->timer.dev); 467 remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
531 if (ktime_to_ns(remaining) < 0) 468 if (ktime_to_ns(remaining) < 0)
532 remaining = ktime_set(0, 0); 469 remaining = ktime_set(0, 0);
533 470
534 ns = mod_64(ktime_to_ns(remaining), apic->timer.period); 471 ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
535 tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); 472 tmcct = div64_u64(ns,
473 (APIC_BUS_CYCLE_NS * apic->divide_count));
536 474
537 return tmcct; 475 return tmcct;
538} 476}
@@ -619,25 +557,25 @@ static void update_divide_count(struct kvm_lapic *apic)
619 tdcr = apic_get_reg(apic, APIC_TDCR); 557 tdcr = apic_get_reg(apic, APIC_TDCR);
620 tmp1 = tdcr & 0xf; 558 tmp1 = tdcr & 0xf;
621 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 559 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
622 apic->timer.divide_count = 0x1 << (tmp2 & 0x7); 560 apic->divide_count = 0x1 << (tmp2 & 0x7);
623 561
624 apic_debug("timer divide count is 0x%x\n", 562 apic_debug("timer divide count is 0x%x\n",
625 apic->timer.divide_count); 563 apic->divide_count);
626} 564}
627 565
628static void start_apic_timer(struct kvm_lapic *apic) 566static void start_apic_timer(struct kvm_lapic *apic)
629{ 567{
630 ktime_t now = apic->timer.dev.base->get_time(); 568 ktime_t now = apic->lapic_timer.timer.base->get_time();
631 569
632 apic->timer.period = apic_get_reg(apic, APIC_TMICT) * 570 apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
633 APIC_BUS_CYCLE_NS * apic->timer.divide_count; 571 APIC_BUS_CYCLE_NS * apic->divide_count;
634 atomic_set(&apic->timer.pending, 0); 572 atomic_set(&apic->lapic_timer.pending, 0);
635 573
636 if (!apic->timer.period) 574 if (!apic->lapic_timer.period)
637 return; 575 return;
638 576
639 hrtimer_start(&apic->timer.dev, 577 hrtimer_start(&apic->lapic_timer.timer,
640 ktime_add_ns(now, apic->timer.period), 578 ktime_add_ns(now, apic->lapic_timer.period),
641 HRTIMER_MODE_ABS); 579 HRTIMER_MODE_ABS);
642 580
643 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 581 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -646,9 +584,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
646 "expire @ 0x%016" PRIx64 ".\n", __func__, 584 "expire @ 0x%016" PRIx64 ".\n", __func__,
647 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 585 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
648 apic_get_reg(apic, APIC_TMICT), 586 apic_get_reg(apic, APIC_TMICT),
649 apic->timer.period, 587 apic->lapic_timer.period,
650 ktime_to_ns(ktime_add_ns(now, 588 ktime_to_ns(ktime_add_ns(now,
651 apic->timer.period))); 589 apic->lapic_timer.period)));
652} 590}
653 591
654static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 592static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -730,7 +668,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
730 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 668 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
731 lvt_val | APIC_LVT_MASKED); 669 lvt_val | APIC_LVT_MASKED);
732 } 670 }
733 atomic_set(&apic->timer.pending, 0); 671 atomic_set(&apic->lapic_timer.pending, 0);
734 672
735 } 673 }
736 break; 674 break;
@@ -762,7 +700,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
762 break; 700 break;
763 701
764 case APIC_TMICT: 702 case APIC_TMICT:
765 hrtimer_cancel(&apic->timer.dev); 703 hrtimer_cancel(&apic->lapic_timer.timer);
766 apic_set_reg(apic, APIC_TMICT, val); 704 apic_set_reg(apic, APIC_TMICT, val);
767 start_apic_timer(apic); 705 start_apic_timer(apic);
768 return; 706 return;
@@ -802,7 +740,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
802 if (!vcpu->arch.apic) 740 if (!vcpu->arch.apic)
803 return; 741 return;
804 742
805 hrtimer_cancel(&vcpu->arch.apic->timer.dev); 743 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
806 744
807 if (vcpu->arch.apic->regs_page) 745 if (vcpu->arch.apic->regs_page)
808 __free_page(vcpu->arch.apic->regs_page); 746 __free_page(vcpu->arch.apic->regs_page);
@@ -880,7 +818,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
880 ASSERT(apic != NULL); 818 ASSERT(apic != NULL);
881 819
882 /* Stop the timer in case it's a reset to an active apic */ 820 /* Stop the timer in case it's a reset to an active apic */
883 hrtimer_cancel(&apic->timer.dev); 821 hrtimer_cancel(&apic->lapic_timer.timer);
884 822
885 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 823 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
886 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 824 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -905,11 +843,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
905 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 843 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
906 } 844 }
907 update_divide_count(apic); 845 update_divide_count(apic);
908 atomic_set(&apic->timer.pending, 0); 846 atomic_set(&apic->lapic_timer.pending, 0);
909 if (vcpu->vcpu_id == 0) 847 if (vcpu->vcpu_id == 0)
910 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 848 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
911 apic_update_ppr(apic); 849 apic_update_ppr(apic);
912 850
851 vcpu->arch.apic_arb_prio = 0;
852
913 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 853 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
914 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 854 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
915 vcpu, kvm_apic_id(apic), 855 vcpu, kvm_apic_id(apic),
@@ -917,16 +857,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
917} 857}
918EXPORT_SYMBOL_GPL(kvm_lapic_reset); 858EXPORT_SYMBOL_GPL(kvm_lapic_reset);
919 859
920int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 860bool kvm_apic_present(struct kvm_vcpu *vcpu)
921{ 861{
922 struct kvm_lapic *apic = vcpu->arch.apic; 862 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
923 int ret = 0; 863}
924
925 if (!apic)
926 return 0;
927 ret = apic_enabled(apic);
928 864
929 return ret; 865int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
866{
867 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
930} 868}
931EXPORT_SYMBOL_GPL(kvm_lapic_enabled); 869EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
932 870
@@ -936,22 +874,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
936 *---------------------------------------------------------------------- 874 *----------------------------------------------------------------------
937 */ 875 */
938 876
939/* TODO: make sure __apic_timer_fn runs in current pCPU */ 877static bool lapic_is_periodic(struct kvm_timer *ktimer)
940static int __apic_timer_fn(struct kvm_lapic *apic)
941{ 878{
942 int result = 0; 879 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
943 wait_queue_head_t *q = &apic->vcpu->wq; 880 lapic_timer);
944 881 return apic_lvtt_period(apic);
945 if(!atomic_inc_and_test(&apic->timer.pending))
946 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
947 if (waitqueue_active(q))
948 wake_up_interruptible(q);
949
950 if (apic_lvtt_period(apic)) {
951 result = 1;
952 hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
953 }
954 return result;
955} 882}
956 883
957int apic_has_pending_timer(struct kvm_vcpu *vcpu) 884int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -959,7 +886,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
959 struct kvm_lapic *lapic = vcpu->arch.apic; 886 struct kvm_lapic *lapic = vcpu->arch.apic;
960 887
961 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 888 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
962 return atomic_read(&lapic->timer.pending); 889 return atomic_read(&lapic->lapic_timer.pending);
963 890
964 return 0; 891 return 0;
965} 892}
@@ -986,20 +913,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
986 kvm_apic_local_deliver(apic, APIC_LVT0); 913 kvm_apic_local_deliver(apic, APIC_LVT0);
987} 914}
988 915
989static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) 916static struct kvm_timer_ops lapic_timer_ops = {
990{ 917 .is_periodic = lapic_is_periodic,
991 struct kvm_lapic *apic; 918};
992 int restart_timer = 0;
993
994 apic = container_of(data, struct kvm_lapic, timer.dev);
995
996 restart_timer = __apic_timer_fn(apic);
997
998 if (restart_timer)
999 return HRTIMER_RESTART;
1000 else
1001 return HRTIMER_NORESTART;
1002}
1003 919
1004int kvm_create_lapic(struct kvm_vcpu *vcpu) 920int kvm_create_lapic(struct kvm_vcpu *vcpu)
1005{ 921{
@@ -1024,8 +940,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1024 memset(apic->regs, 0, PAGE_SIZE); 940 memset(apic->regs, 0, PAGE_SIZE);
1025 apic->vcpu = vcpu; 941 apic->vcpu = vcpu;
1026 942
1027 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 943 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1028 apic->timer.dev.function = apic_timer_fn; 944 HRTIMER_MODE_ABS);
945 apic->lapic_timer.timer.function = kvm_timer_fn;
946 apic->lapic_timer.t_ops = &lapic_timer_ops;
947 apic->lapic_timer.kvm = vcpu->kvm;
948 apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
949
1029 apic->base_address = APIC_DEFAULT_PHYS_BASE; 950 apic->base_address = APIC_DEFAULT_PHYS_BASE;
1030 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 951 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
1031 952
@@ -1078,9 +999,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1078{ 999{
1079 struct kvm_lapic *apic = vcpu->arch.apic; 1000 struct kvm_lapic *apic = vcpu->arch.apic;
1080 1001
1081 if (apic && atomic_read(&apic->timer.pending) > 0) { 1002 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
1082 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1003 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1083 atomic_dec(&apic->timer.pending); 1004 atomic_dec(&apic->lapic_timer.pending);
1084 } 1005 }
1085} 1006}
1086 1007
@@ -1106,7 +1027,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1106 MSR_IA32_APICBASE_BASE; 1027 MSR_IA32_APICBASE_BASE;
1107 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1028 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1108 apic_update_ppr(apic); 1029 apic_update_ppr(apic);
1109 hrtimer_cancel(&apic->timer.dev); 1030 hrtimer_cancel(&apic->lapic_timer.timer);
1110 update_divide_count(apic); 1031 update_divide_count(apic);
1111 start_apic_timer(apic); 1032 start_apic_timer(apic);
1112} 1033}
@@ -1119,7 +1040,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1119 if (!apic) 1040 if (!apic)
1120 return; 1041 return;
1121 1042
1122 timer = &apic->timer.dev; 1043 timer = &apic->lapic_timer.timer;
1123 if (hrtimer_cancel(timer)) 1044 if (hrtimer_cancel(timer))
1124 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1045 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1125} 1046}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee71209..a587f8349c46 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
5 6
6#include <linux/kvm_host.h> 7#include <linux/kvm_host.h>
7 8
8struct kvm_lapic { 9struct kvm_lapic {
9 unsigned long base_address; 10 unsigned long base_address;
10 struct kvm_io_device dev; 11 struct kvm_io_device dev;
11 struct { 12 struct kvm_timer lapic_timer;
12 atomic_t pending; 13 u32 divide_count;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 struct hrtimer dev;
16 } timer;
17 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
18 struct page *regs_page; 15 struct page *regs_page;
19 void *regs; 16 void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 31
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 32int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 33int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); 34int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
38 35
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 36u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 37void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 38void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 39int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
40bool kvm_apic_present(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 41int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44 42
45void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 43void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e5728a..5c3d6e81a7dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
126#define PFERR_PRESENT_MASK (1U << 0) 126#define PFERR_PRESENT_MASK (1U << 0)
127#define PFERR_WRITE_MASK (1U << 1) 127#define PFERR_WRITE_MASK (1U << 1)
128#define PFERR_USER_MASK (1U << 2) 128#define PFERR_USER_MASK (1U << 2)
129#define PFERR_RSVD_MASK (1U << 3)
129#define PFERR_FETCH_MASK (1U << 4) 130#define PFERR_FETCH_MASK (1U << 4)
130 131
131#define PT_DIRECTORY_LEVEL 2 132#define PT_DIRECTORY_LEVEL 2
@@ -177,7 +178,11 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
177static u64 __read_mostly shadow_user_mask; 178static u64 __read_mostly shadow_user_mask;
178static u64 __read_mostly shadow_accessed_mask; 179static u64 __read_mostly shadow_accessed_mask;
179static u64 __read_mostly shadow_dirty_mask; 180static u64 __read_mostly shadow_dirty_mask;
180static u64 __read_mostly shadow_mt_mask; 181
182static inline u64 rsvd_bits(int s, int e)
183{
184 return ((1ULL << (e - s + 1)) - 1) << s;
185}
181 186
182void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 187void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
183{ 188{
@@ -193,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
193EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); 198EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
194 199
195void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 200void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
196 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) 201 u64 dirty_mask, u64 nx_mask, u64 x_mask)
197{ 202{
198 shadow_user_mask = user_mask; 203 shadow_user_mask = user_mask;
199 shadow_accessed_mask = accessed_mask; 204 shadow_accessed_mask = accessed_mask;
200 shadow_dirty_mask = dirty_mask; 205 shadow_dirty_mask = dirty_mask;
201 shadow_nx_mask = nx_mask; 206 shadow_nx_mask = nx_mask;
202 shadow_x_mask = x_mask; 207 shadow_x_mask = x_mask;
203 shadow_mt_mask = mt_mask;
204} 208}
205EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 209EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
206 210
@@ -219,11 +223,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
219 return vcpu->arch.shadow_efer & EFER_NX; 223 return vcpu->arch.shadow_efer & EFER_NX;
220} 224}
221 225
222static int is_present_pte(unsigned long pte)
223{
224 return pte & PT_PRESENT_MASK;
225}
226
227static int is_shadow_present_pte(u64 pte) 226static int is_shadow_present_pte(u64 pte)
228{ 227{
229 return pte != shadow_trap_nonpresent_pte 228 return pte != shadow_trap_nonpresent_pte
@@ -1074,18 +1073,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1074 return NULL; 1073 return NULL;
1075} 1074}
1076 1075
1077static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
1078{
1079 list_del(&sp->oos_link);
1080 --kvm->stat.mmu_unsync_global;
1081}
1082
1083static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1076static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1084{ 1077{
1085 WARN_ON(!sp->unsync); 1078 WARN_ON(!sp->unsync);
1086 sp->unsync = 0; 1079 sp->unsync = 0;
1087 if (sp->global)
1088 kvm_unlink_unsync_global(kvm, sp);
1089 --kvm->stat.mmu_unsync; 1080 --kvm->stat.mmu_unsync;
1090} 1081}
1091 1082
@@ -1248,7 +1239,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1248 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); 1239 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1249 sp->gfn = gfn; 1240 sp->gfn = gfn;
1250 sp->role = role; 1241 sp->role = role;
1251 sp->global = 0;
1252 hlist_add_head(&sp->hash_link, bucket); 1242 hlist_add_head(&sp->hash_link, bucket);
1253 if (!direct) { 1243 if (!direct) {
1254 if (rmap_write_protect(vcpu->kvm, gfn)) 1244 if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1616,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1616 return mtrr_state->def_type; 1606 return mtrr_state->def_type;
1617} 1607}
1618 1608
1619static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) 1609u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1620{ 1610{
1621 u8 mtrr; 1611 u8 mtrr;
1622 1612
@@ -1626,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1626 mtrr = MTRR_TYPE_WRBACK; 1616 mtrr = MTRR_TYPE_WRBACK;
1627 return mtrr; 1617 return mtrr;
1628} 1618}
1619EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1629 1620
1630static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1621static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1631{ 1622{
@@ -1646,11 +1637,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 ++vcpu->kvm->stat.mmu_unsync; 1637 ++vcpu->kvm->stat.mmu_unsync;
1647 sp->unsync = 1; 1638 sp->unsync = 1;
1648 1639
1649 if (sp->global) { 1640 kvm_mmu_mark_parents_unsync(vcpu, sp);
1650 list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
1651 ++vcpu->kvm->stat.mmu_unsync_global;
1652 } else
1653 kvm_mmu_mark_parents_unsync(vcpu, sp);
1654 1641
1655 mmu_convert_notrap(sp); 1642 mmu_convert_notrap(sp);
1656 return 0; 1643 return 0;
@@ -1677,21 +1664,11 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1677static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1664static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1678 unsigned pte_access, int user_fault, 1665 unsigned pte_access, int user_fault,
1679 int write_fault, int dirty, int largepage, 1666 int write_fault, int dirty, int largepage,
1680 int global, gfn_t gfn, pfn_t pfn, bool speculative, 1667 gfn_t gfn, pfn_t pfn, bool speculative,
1681 bool can_unsync) 1668 bool can_unsync)
1682{ 1669{
1683 u64 spte; 1670 u64 spte;
1684 int ret = 0; 1671 int ret = 0;
1685 u64 mt_mask = shadow_mt_mask;
1686 struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
1687
1688 if (!global && sp->global) {
1689 sp->global = 0;
1690 if (sp->unsync) {
1691 kvm_unlink_unsync_global(vcpu->kvm, sp);
1692 kvm_mmu_mark_parents_unsync(vcpu, sp);
1693 }
1694 }
1695 1672
1696 /* 1673 /*
1697 * We don't set the accessed bit, since we sometimes want to see 1674 * We don't set the accessed bit, since we sometimes want to see
@@ -1711,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1711 spte |= shadow_user_mask; 1688 spte |= shadow_user_mask;
1712 if (largepage) 1689 if (largepage)
1713 spte |= PT_PAGE_SIZE_MASK; 1690 spte |= PT_PAGE_SIZE_MASK;
1714 if (mt_mask) { 1691 if (tdp_enabled)
1715 if (!kvm_is_mmio_pfn(pfn)) { 1692 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1716 mt_mask = get_memory_type(vcpu, gfn) << 1693 kvm_is_mmio_pfn(pfn));
1717 kvm_x86_ops->get_mt_mask_shift();
1718 mt_mask |= VMX_EPT_IGMT_BIT;
1719 } else
1720 mt_mask = MTRR_TYPE_UNCACHABLE <<
1721 kvm_x86_ops->get_mt_mask_shift();
1722 spte |= mt_mask;
1723 }
1724 1694
1725 spte |= (u64)pfn << PAGE_SHIFT; 1695 spte |= (u64)pfn << PAGE_SHIFT;
1726 1696
@@ -1765,8 +1735,8 @@ set_pte:
1765static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1735static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1766 unsigned pt_access, unsigned pte_access, 1736 unsigned pt_access, unsigned pte_access,
1767 int user_fault, int write_fault, int dirty, 1737 int user_fault, int write_fault, int dirty,
1768 int *ptwrite, int largepage, int global, 1738 int *ptwrite, int largepage, gfn_t gfn,
1769 gfn_t gfn, pfn_t pfn, bool speculative) 1739 pfn_t pfn, bool speculative)
1770{ 1740{
1771 int was_rmapped = 0; 1741 int was_rmapped = 0;
1772 int was_writeble = is_writeble_pte(*shadow_pte); 1742 int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1795,7 +1765,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1795 was_rmapped = 1; 1765 was_rmapped = 1;
1796 } 1766 }
1797 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1767 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1798 dirty, largepage, global, gfn, pfn, speculative, true)) { 1768 dirty, largepage, gfn, pfn, speculative, true)) {
1799 if (write_fault) 1769 if (write_fault)
1800 *ptwrite = 1; 1770 *ptwrite = 1;
1801 kvm_x86_ops->tlb_flush(vcpu); 1771 kvm_x86_ops->tlb_flush(vcpu);
@@ -1843,7 +1813,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1843 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { 1813 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1844 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1814 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1845 0, write, 1, &pt_write, 1815 0, write, 1, &pt_write,
1846 largepage, 0, gfn, pfn, false); 1816 largepage, gfn, pfn, false);
1847 ++vcpu->stat.pf_fixed; 1817 ++vcpu->stat.pf_fixed;
1848 break; 1818 break;
1849 } 1819 }
@@ -1942,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
1942 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 1912 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1943} 1913}
1944 1914
1945static void mmu_alloc_roots(struct kvm_vcpu *vcpu) 1915static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
1916{
1917 int ret = 0;
1918
1919 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
1920 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1921 ret = 1;
1922 }
1923
1924 return ret;
1925}
1926
1927static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1946{ 1928{
1947 int i; 1929 int i;
1948 gfn_t root_gfn; 1930 gfn_t root_gfn;
@@ -1957,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1957 ASSERT(!VALID_PAGE(root)); 1939 ASSERT(!VALID_PAGE(root));
1958 if (tdp_enabled) 1940 if (tdp_enabled)
1959 direct = 1; 1941 direct = 1;
1942 if (mmu_check_root(vcpu, root_gfn))
1943 return 1;
1960 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1944 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1961 PT64_ROOT_LEVEL, direct, 1945 PT64_ROOT_LEVEL, direct,
1962 ACC_ALL, NULL); 1946 ACC_ALL, NULL);
1963 root = __pa(sp->spt); 1947 root = __pa(sp->spt);
1964 ++sp->root_count; 1948 ++sp->root_count;
1965 vcpu->arch.mmu.root_hpa = root; 1949 vcpu->arch.mmu.root_hpa = root;
1966 return; 1950 return 0;
1967 } 1951 }
1968 direct = !is_paging(vcpu); 1952 direct = !is_paging(vcpu);
1969 if (tdp_enabled) 1953 if (tdp_enabled)
@@ -1980,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1980 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 1964 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1981 } else if (vcpu->arch.mmu.root_level == 0) 1965 } else if (vcpu->arch.mmu.root_level == 0)
1982 root_gfn = 0; 1966 root_gfn = 0;
1967 if (mmu_check_root(vcpu, root_gfn))
1968 return 1;
1983 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1969 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1984 PT32_ROOT_LEVEL, direct, 1970 PT32_ROOT_LEVEL, direct,
1985 ACC_ALL, NULL); 1971 ACC_ALL, NULL);
@@ -1988,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1988 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 1974 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1989 } 1975 }
1990 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1976 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1977 return 0;
1991} 1978}
1992 1979
1993static void mmu_sync_roots(struct kvm_vcpu *vcpu) 1980static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2006,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2006 for (i = 0; i < 4; ++i) { 1993 for (i = 0; i < 4; ++i) {
2007 hpa_t root = vcpu->arch.mmu.pae_root[i]; 1994 hpa_t root = vcpu->arch.mmu.pae_root[i];
2008 1995
2009 if (root) { 1996 if (root && VALID_PAGE(root)) {
2010 root &= PT64_BASE_ADDR_MASK; 1997 root &= PT64_BASE_ADDR_MASK;
2011 sp = page_header(root); 1998 sp = page_header(root);
2012 mmu_sync_children(vcpu, sp); 1999 mmu_sync_children(vcpu, sp);
@@ -2014,15 +2001,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2014 } 2001 }
2015} 2002}
2016 2003
2017static void mmu_sync_global(struct kvm_vcpu *vcpu)
2018{
2019 struct kvm *kvm = vcpu->kvm;
2020 struct kvm_mmu_page *sp, *n;
2021
2022 list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
2023 kvm_sync_page(vcpu, sp);
2024}
2025
2026void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2004void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2027{ 2005{
2028 spin_lock(&vcpu->kvm->mmu_lock); 2006 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2030,13 +2008,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2030 spin_unlock(&vcpu->kvm->mmu_lock); 2008 spin_unlock(&vcpu->kvm->mmu_lock);
2031} 2009}
2032 2010
2033void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
2034{
2035 spin_lock(&vcpu->kvm->mmu_lock);
2036 mmu_sync_global(vcpu);
2037 spin_unlock(&vcpu->kvm->mmu_lock);
2038}
2039
2040static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2011static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
2041{ 2012{
2042 return vaddr; 2013 return vaddr;
@@ -2151,6 +2122,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
2151 nonpaging_free(vcpu); 2122 nonpaging_free(vcpu);
2152} 2123}
2153 2124
2125static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2126{
2127 int bit7;
2128
2129 bit7 = (gpte >> 7) & 1;
2130 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
2131}
2132
2154#define PTTYPE 64 2133#define PTTYPE 64
2155#include "paging_tmpl.h" 2134#include "paging_tmpl.h"
2156#undef PTTYPE 2135#undef PTTYPE
@@ -2159,6 +2138,59 @@ static void paging_free(struct kvm_vcpu *vcpu)
2159#include "paging_tmpl.h" 2138#include "paging_tmpl.h"
2160#undef PTTYPE 2139#undef PTTYPE
2161 2140
2141static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2142{
2143 struct kvm_mmu *context = &vcpu->arch.mmu;
2144 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2145 u64 exb_bit_rsvd = 0;
2146
2147 if (!is_nx(vcpu))
2148 exb_bit_rsvd = rsvd_bits(63, 63);
2149 switch (level) {
2150 case PT32_ROOT_LEVEL:
2151 /* no rsvd bits for 2 level 4K page table entries */
2152 context->rsvd_bits_mask[0][1] = 0;
2153 context->rsvd_bits_mask[0][0] = 0;
2154 if (is_cpuid_PSE36())
2155 /* 36bits PSE 4MB page */
2156 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2157 else
2158 /* 32 bits PSE 4MB page */
2159 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2160 context->rsvd_bits_mask[1][0] = ~0ull;
2161 break;
2162 case PT32E_ROOT_LEVEL:
2163 context->rsvd_bits_mask[0][2] =
2164 rsvd_bits(maxphyaddr, 63) |
2165 rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
2166 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2167 rsvd_bits(maxphyaddr, 62); /* PDE */
2168 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2169 rsvd_bits(maxphyaddr, 62); /* PTE */
2170 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2171 rsvd_bits(maxphyaddr, 62) |
2172 rsvd_bits(13, 20); /* large page */
2173 context->rsvd_bits_mask[1][0] = ~0ull;
2174 break;
2175 case PT64_ROOT_LEVEL:
2176 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2177 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2178 context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2179 rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2180 context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2181 rsvd_bits(maxphyaddr, 51);
2182 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2183 rsvd_bits(maxphyaddr, 51);
2184 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2185 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
2186 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2187 rsvd_bits(maxphyaddr, 51) |
2188 rsvd_bits(13, 20); /* large page */
2189 context->rsvd_bits_mask[1][0] = ~0ull;
2190 break;
2191 }
2192}
2193
2162static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2194static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2163{ 2195{
2164 struct kvm_mmu *context = &vcpu->arch.mmu; 2196 struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2211,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2179 2211
2180static int paging64_init_context(struct kvm_vcpu *vcpu) 2212static int paging64_init_context(struct kvm_vcpu *vcpu)
2181{ 2213{
2214 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2182 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); 2215 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2183} 2216}
2184 2217
@@ -2186,6 +2219,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2186{ 2219{
2187 struct kvm_mmu *context = &vcpu->arch.mmu; 2220 struct kvm_mmu *context = &vcpu->arch.mmu;
2188 2221
2222 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2189 context->new_cr3 = paging_new_cr3; 2223 context->new_cr3 = paging_new_cr3;
2190 context->page_fault = paging32_page_fault; 2224 context->page_fault = paging32_page_fault;
2191 context->gva_to_gpa = paging32_gva_to_gpa; 2225 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2235,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2201 2235
2202static int paging32E_init_context(struct kvm_vcpu *vcpu) 2236static int paging32E_init_context(struct kvm_vcpu *vcpu)
2203{ 2237{
2238 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2204 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 2239 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2205} 2240}
2206 2241
@@ -2221,12 +2256,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2221 context->gva_to_gpa = nonpaging_gva_to_gpa; 2256 context->gva_to_gpa = nonpaging_gva_to_gpa;
2222 context->root_level = 0; 2257 context->root_level = 0;
2223 } else if (is_long_mode(vcpu)) { 2258 } else if (is_long_mode(vcpu)) {
2259 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2224 context->gva_to_gpa = paging64_gva_to_gpa; 2260 context->gva_to_gpa = paging64_gva_to_gpa;
2225 context->root_level = PT64_ROOT_LEVEL; 2261 context->root_level = PT64_ROOT_LEVEL;
2226 } else if (is_pae(vcpu)) { 2262 } else if (is_pae(vcpu)) {
2263 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2227 context->gva_to_gpa = paging64_gva_to_gpa; 2264 context->gva_to_gpa = paging64_gva_to_gpa;
2228 context->root_level = PT32E_ROOT_LEVEL; 2265 context->root_level = PT32E_ROOT_LEVEL;
2229 } else { 2266 } else {
2267 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2230 context->gva_to_gpa = paging32_gva_to_gpa; 2268 context->gva_to_gpa = paging32_gva_to_gpa;
2231 context->root_level = PT32_ROOT_LEVEL; 2269 context->root_level = PT32_ROOT_LEVEL;
2232 } 2270 }
@@ -2290,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2290 goto out; 2328 goto out;
2291 spin_lock(&vcpu->kvm->mmu_lock); 2329 spin_lock(&vcpu->kvm->mmu_lock);
2292 kvm_mmu_free_some_pages(vcpu); 2330 kvm_mmu_free_some_pages(vcpu);
2293 mmu_alloc_roots(vcpu); 2331 r = mmu_alloc_roots(vcpu);
2294 mmu_sync_roots(vcpu); 2332 mmu_sync_roots(vcpu);
2295 spin_unlock(&vcpu->kvm->mmu_lock); 2333 spin_unlock(&vcpu->kvm->mmu_lock);
2334 if (r)
2335 goto out;
2296 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2336 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2297 kvm_mmu_flush_tlb(vcpu); 2337 kvm_mmu_flush_tlb(vcpu);
2298out: 2338out:
@@ -2638,14 +2678,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2638 2678
2639static void free_mmu_pages(struct kvm_vcpu *vcpu) 2679static void free_mmu_pages(struct kvm_vcpu *vcpu)
2640{ 2680{
2641 struct kvm_mmu_page *sp;
2642
2643 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2644 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
2645 struct kvm_mmu_page, link);
2646 kvm_mmu_zap_page(vcpu->kvm, sp);
2647 cond_resched();
2648 }
2649 free_page((unsigned long)vcpu->arch.mmu.pae_root); 2681 free_page((unsigned long)vcpu->arch.mmu.pae_root);
2650} 2682}
2651 2683
@@ -2710,7 +2742,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2710{ 2742{
2711 struct kvm_mmu_page *sp; 2743 struct kvm_mmu_page *sp;
2712 2744
2713 spin_lock(&kvm->mmu_lock);
2714 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2745 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2715 int i; 2746 int i;
2716 u64 *pt; 2747 u64 *pt;
@@ -2725,7 +2756,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2725 pt[i] &= ~PT_WRITABLE_MASK; 2756 pt[i] &= ~PT_WRITABLE_MASK;
2726 } 2757 }
2727 kvm_flush_remote_tlbs(kvm); 2758 kvm_flush_remote_tlbs(kvm);
2728 spin_unlock(&kvm->mmu_lock);
2729} 2759}
2730 2760
2731void kvm_mmu_zap_all(struct kvm *kvm) 2761void kvm_mmu_zap_all(struct kvm *kvm)
@@ -3007,11 +3037,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3007 " in nonleaf level: levels %d gva %lx" 3037 " in nonleaf level: levels %d gva %lx"
3008 " level %d pte %llx\n", audit_msg, 3038 " level %d pte %llx\n", audit_msg,
3009 vcpu->arch.mmu.root_level, va, level, ent); 3039 vcpu->arch.mmu.root_level, va, level, ent);
3010 3040 else
3011 audit_mappings_page(vcpu, ent, va, level - 1); 3041 audit_mappings_page(vcpu, ent, va, level - 1);
3012 } else { 3042 } else {
3013 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3043 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3014 hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; 3044 gfn_t gfn = gpa >> PAGE_SHIFT;
3045 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3046 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3015 3047
3016 if (is_shadow_present_pte(ent) 3048 if (is_shadow_present_pte(ent)
3017 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3049 && (ent & PT64_BASE_ADDR_MASK) != hpa)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62b..3494a2fb136e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
75 return vcpu->arch.cr0 & X86_CR0_PG; 75 return vcpu->arch.cr0 & X86_CR0_PG;
76} 76}
77 77
78static inline int is_present_pte(unsigned long pte)
79{
80 return pte & PT_PRESENT_MASK;
81}
82
78#endif 83#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c561..258e4591e1ca 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
123 gfn_t table_gfn; 123 gfn_t table_gfn;
124 unsigned index, pt_access, pte_access; 124 unsigned index, pt_access, pte_access;
125 gpa_t pte_gpa; 125 gpa_t pte_gpa;
126 int rsvd_fault = 0;
126 127
127 pgprintk("%s: addr %lx\n", __func__, addr); 128 pgprintk("%s: addr %lx\n", __func__, addr);
128walk: 129walk:
@@ -157,6 +158,10 @@ walk:
157 if (!is_present_pte(pte)) 158 if (!is_present_pte(pte))
158 goto not_present; 159 goto not_present;
159 160
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
162 if (rsvd_fault)
163 goto access_error;
164
160 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writeble_pte(pte))
161 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
162 goto access_error; 167 goto access_error;
@@ -209,7 +214,6 @@ walk:
209 if (ret) 214 if (ret)
210 goto walk; 215 goto walk;
211 pte |= PT_DIRTY_MASK; 216 pte |= PT_DIRTY_MASK;
212 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
213 walker->ptes[walker->level - 1] = pte; 217 walker->ptes[walker->level - 1] = pte;
214 } 218 }
215 219
@@ -233,6 +237,8 @@ err:
233 walker->error_code |= PFERR_USER_MASK; 237 walker->error_code |= PFERR_USER_MASK;
234 if (fetch_fault) 238 if (fetch_fault)
235 walker->error_code |= PFERR_FETCH_MASK; 239 walker->error_code |= PFERR_FETCH_MASK;
240 if (rsvd_fault)
241 walker->error_code |= PFERR_RSVD_MASK;
236 return 0; 242 return 0;
237} 243}
238 244
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
262 kvm_get_pfn(pfn); 268 kvm_get_pfn(pfn);
263 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
264 gpte & PT_DIRTY_MASK, NULL, largepage, 270 gpte & PT_DIRTY_MASK, NULL, largepage,
265 gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), 271 gpte_to_gfn(gpte), pfn, true);
266 pfn, true);
267} 272}
268 273
269/* 274/*
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
297 user_fault, write_fault, 302 user_fault, write_fault,
298 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
299 ptwrite, largepage, 304 ptwrite, largepage,
300 gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
301 gw->gfn, pfn, false); 305 gw->gfn, pfn, false);
302 break; 306 break;
303 } 307 }
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 return r; 384 return r;
381 385
382 /* 386 /*
383 * Look up the shadow pte for the faulting address. 387 * Look up the guest pte for the faulting address.
384 */ 388 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 389 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault); 390 fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
586 nr_present++; 590 nr_present++;
587 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 591 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
588 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 592 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
589 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, 593 is_dirty_pte(gpte), 0, gfn,
590 spte_to_pfn(sp->spt[i]), true, false); 594 spte_to_pfn(sp->spt[i]), true, false);
591 } 595 }
592 596
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6e..71510e07e69e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h"
22 23
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -69,7 +70,6 @@ module_param(npt, int, S_IRUGO);
69static int nested = 0; 70static int nested = 0;
70module_param(nested, int, S_IRUGO); 71module_param(nested, int, S_IRUGO);
71 72
72static void kvm_reput_irq(struct vcpu_svm *svm);
73static void svm_flush_tlb(struct kvm_vcpu *vcpu); 73static void svm_flush_tlb(struct kvm_vcpu *vcpu);
74 74
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); 75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -132,24 +132,6 @@ static inline u32 svm_has(u32 feat)
132 return svm_features & feat; 132 return svm_features & feat;
133} 133}
134 134
135static inline u8 pop_irq(struct kvm_vcpu *vcpu)
136{
137 int word_index = __ffs(vcpu->arch.irq_summary);
138 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
139 int irq = word_index * BITS_PER_LONG + bit_index;
140
141 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
142 if (!vcpu->arch.irq_pending[word_index])
143 clear_bit(word_index, &vcpu->arch.irq_summary);
144 return irq;
145}
146
147static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
148{
149 set_bit(irq, vcpu->arch.irq_pending);
150 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
151}
152
153static inline void clgi(void) 135static inline void clgi(void)
154{ 136{
155 asm volatile (__ex(SVM_CLGI)); 137 asm volatile (__ex(SVM_CLGI));
@@ -214,17 +196,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
214 svm->vmcb->control.event_inj_err = error_code; 196 svm->vmcb->control.event_inj_err = error_code;
215} 197}
216 198
217static bool svm_exception_injected(struct kvm_vcpu *vcpu) 199static int is_external_interrupt(u32 info)
200{
201 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
202 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
203}
204
205static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
218{ 206{
219 struct vcpu_svm *svm = to_svm(vcpu); 207 struct vcpu_svm *svm = to_svm(vcpu);
208 u32 ret = 0;
220 209
221 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); 210 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
211 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
212 return ret & mask;
222} 213}
223 214
224static int is_external_interrupt(u32 info) 215static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
225{ 216{
226 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 217 struct vcpu_svm *svm = to_svm(vcpu);
227 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); 218
219 if (mask == 0)
220 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
221 else
222 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
223
228} 224}
229 225
230static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 226static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -232,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
232 struct vcpu_svm *svm = to_svm(vcpu); 228 struct vcpu_svm *svm = to_svm(vcpu);
233 229
234 if (!svm->next_rip) { 230 if (!svm->next_rip) {
235 printk(KERN_DEBUG "%s: NOP\n", __func__); 231 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
232 EMULATE_DONE)
233 printk(KERN_DEBUG "%s: NOP\n", __func__);
236 return; 234 return;
237 } 235 }
238 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -240,9 +238,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
240 __func__, kvm_rip_read(vcpu), svm->next_rip); 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
241 239
242 kvm_rip_write(vcpu, svm->next_rip); 240 kvm_rip_write(vcpu, svm->next_rip);
243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm_set_interrupt_shadow(vcpu, 0);
244
245 vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
246} 242}
247 243
248static int has_svm(void) 244static int has_svm(void)
@@ -830,6 +826,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
830 if (!var->unusable) 826 if (!var->unusable)
831 var->type |= 0x1; 827 var->type |= 0x1;
832 break; 828 break;
829 case VCPU_SREG_SS:
830 /* On AMD CPUs sometimes the DB bit in the segment
831 * descriptor is left as 1, although the whole segment has
832 * been made unusable. Clear it here to pass an Intel VMX
833 * entry check when cross vendor migrating.
834 */
835 if (var->unusable)
836 var->db = 0;
837 break;
833 } 838 }
834} 839}
835 840
@@ -960,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
960 965
961} 966}
962 967
963static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 968static void update_db_intercept(struct kvm_vcpu *vcpu)
964{ 969{
965 int old_debug = vcpu->guest_debug;
966 struct vcpu_svm *svm = to_svm(vcpu); 970 struct vcpu_svm *svm = to_svm(vcpu);
967 971
968 vcpu->guest_debug = dbg->control;
969
970 svm->vmcb->control.intercept_exceptions &= 972 svm->vmcb->control.intercept_exceptions &=
971 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 973 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
974
975 if (vcpu->arch.singlestep)
976 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
977
972 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 978 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
973 if (vcpu->guest_debug & 979 if (vcpu->guest_debug &
974 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 980 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -979,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
979 1 << BP_VECTOR; 985 1 << BP_VECTOR;
980 } else 986 } else
981 vcpu->guest_debug = 0; 987 vcpu->guest_debug = 0;
988}
989
990static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
991{
992 int old_debug = vcpu->guest_debug;
993 struct vcpu_svm *svm = to_svm(vcpu);
994
995 vcpu->guest_debug = dbg->control;
996
997 update_db_intercept(vcpu);
982 998
983 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 999 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
984 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1000 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -993,16 +1009,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
993 return 0; 1009 return 0;
994} 1010}
995 1011
996static int svm_get_irq(struct kvm_vcpu *vcpu)
997{
998 struct vcpu_svm *svm = to_svm(vcpu);
999 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1000
1001 if (is_external_interrupt(exit_int_info))
1002 return exit_int_info & SVM_EVTINJ_VEC_MASK;
1003 return -1;
1004}
1005
1006static void load_host_msrs(struct kvm_vcpu *vcpu) 1012static void load_host_msrs(struct kvm_vcpu *vcpu)
1007{ 1013{
1008#ifdef CONFIG_X86_64 1014#ifdef CONFIG_X86_64
@@ -1107,17 +1113,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1107 1113
1108static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1114static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1109{ 1115{
1110 u32 exit_int_info = svm->vmcb->control.exit_int_info;
1111 struct kvm *kvm = svm->vcpu.kvm;
1112 u64 fault_address; 1116 u64 fault_address;
1113 u32 error_code; 1117 u32 error_code;
1114 bool event_injection = false;
1115
1116 if (!irqchip_in_kernel(kvm) &&
1117 is_external_interrupt(exit_int_info)) {
1118 event_injection = true;
1119 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1120 }
1121 1118
1122 fault_address = svm->vmcb->control.exit_info_2; 1119 fault_address = svm->vmcb->control.exit_info_2;
1123 error_code = svm->vmcb->control.exit_info_1; 1120 error_code = svm->vmcb->control.exit_info_1;
@@ -1137,23 +1134,40 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1137 */ 1134 */
1138 if (npt_enabled) 1135 if (npt_enabled)
1139 svm_flush_tlb(&svm->vcpu); 1136 svm_flush_tlb(&svm->vcpu);
1140 1137 else {
1141 if (!npt_enabled && event_injection) 1138 if (kvm_event_needs_reinjection(&svm->vcpu))
1142 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1139 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1140 }
1143 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1144} 1142}
1145 1143
1146static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1144static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1147{ 1145{
1148 if (!(svm->vcpu.guest_debug & 1146 if (!(svm->vcpu.guest_debug &
1149 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 1147 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1148 !svm->vcpu.arch.singlestep) {
1150 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1149 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1151 return 1; 1150 return 1;
1152 } 1151 }
1153 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1152
1154 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1153 if (svm->vcpu.arch.singlestep) {
1155 kvm_run->debug.arch.exception = DB_VECTOR; 1154 svm->vcpu.arch.singlestep = false;
1156 return 0; 1155 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1156 svm->vmcb->save.rflags &=
1157 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1158 update_db_intercept(&svm->vcpu);
1159 }
1160
1161 if (svm->vcpu.guest_debug &
1162 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
1163 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1164 kvm_run->debug.arch.pc =
1165 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1166 kvm_run->debug.arch.exception = DB_VECTOR;
1167 return 0;
1168 }
1169
1170 return 1;
1157} 1171}
1158 1172
1159static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1173static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1842,17 +1856,51 @@ static int task_switch_interception(struct vcpu_svm *svm,
1842 struct kvm_run *kvm_run) 1856 struct kvm_run *kvm_run)
1843{ 1857{
1844 u16 tss_selector; 1858 u16 tss_selector;
1859 int reason;
1860 int int_type = svm->vmcb->control.exit_int_info &
1861 SVM_EXITINTINFO_TYPE_MASK;
1862 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
1863 uint32_t type =
1864 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
1865 uint32_t idt_v =
1866 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
1845 1867
1846 tss_selector = (u16)svm->vmcb->control.exit_info_1; 1868 tss_selector = (u16)svm->vmcb->control.exit_info_1;
1869
1847 if (svm->vmcb->control.exit_info_2 & 1870 if (svm->vmcb->control.exit_info_2 &
1848 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 1871 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1849 return kvm_task_switch(&svm->vcpu, tss_selector, 1872 reason = TASK_SWITCH_IRET;
1850 TASK_SWITCH_IRET); 1873 else if (svm->vmcb->control.exit_info_2 &
1851 if (svm->vmcb->control.exit_info_2 & 1874 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1852 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 1875 reason = TASK_SWITCH_JMP;
1853 return kvm_task_switch(&svm->vcpu, tss_selector, 1876 else if (idt_v)
1854 TASK_SWITCH_JMP); 1877 reason = TASK_SWITCH_GATE;
1855 return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); 1878 else
1879 reason = TASK_SWITCH_CALL;
1880
1881 if (reason == TASK_SWITCH_GATE) {
1882 switch (type) {
1883 case SVM_EXITINTINFO_TYPE_NMI:
1884 svm->vcpu.arch.nmi_injected = false;
1885 break;
1886 case SVM_EXITINTINFO_TYPE_EXEPT:
1887 kvm_clear_exception_queue(&svm->vcpu);
1888 break;
1889 case SVM_EXITINTINFO_TYPE_INTR:
1890 kvm_clear_interrupt_queue(&svm->vcpu);
1891 break;
1892 default:
1893 break;
1894 }
1895 }
1896
1897 if (reason != TASK_SWITCH_GATE ||
1898 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
1899 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
1900 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
1901 skip_emulated_instruction(&svm->vcpu);
1902
1903 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
1856} 1904}
1857 1905
1858static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1906static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1862,6 +1910,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1862 return 1; 1910 return 1;
1863} 1911}
1864 1912
1913static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1914{
1915 ++svm->vcpu.stat.nmi_window_exits;
1916 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
1917 svm->vcpu.arch.hflags |= HF_IRET_MASK;
1918 return 1;
1919}
1920
1865static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1921static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1866{ 1922{
1867 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 1923 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1879,8 +1935,14 @@ static int emulate_on_interception(struct vcpu_svm *svm,
1879 1935
1880static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1936static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1881{ 1937{
1938 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
1939 /* instruction emulation calls kvm_set_cr8() */
1882 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 1940 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1883 if (irqchip_in_kernel(svm->vcpu.kvm)) 1941 if (irqchip_in_kernel(svm->vcpu.kvm)) {
1942 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
1943 return 1;
1944 }
1945 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
1884 return 1; 1946 return 1;
1885 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1947 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1886 return 0; 1948 return 0;
@@ -2090,8 +2152,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2090 * If the user space waits to inject interrupts, exit as soon as 2152 * If the user space waits to inject interrupts, exit as soon as
2091 * possible 2153 * possible
2092 */ 2154 */
2093 if (kvm_run->request_interrupt_window && 2155 if (!irqchip_in_kernel(svm->vcpu.kvm) &&
2094 !svm->vcpu.arch.irq_summary) { 2156 kvm_run->request_interrupt_window &&
2157 !kvm_cpu_has_interrupt(&svm->vcpu)) {
2095 ++svm->vcpu.stat.irq_window_exits; 2158 ++svm->vcpu.stat.irq_window_exits;
2096 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2159 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2097 return 0; 2160 return 0;
@@ -2134,6 +2197,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2134 [SVM_EXIT_VINTR] = interrupt_window_interception, 2197 [SVM_EXIT_VINTR] = interrupt_window_interception,
2135 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ 2198 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2136 [SVM_EXIT_CPUID] = cpuid_interception, 2199 [SVM_EXIT_CPUID] = cpuid_interception,
2200 [SVM_EXIT_IRET] = iret_interception,
2137 [SVM_EXIT_INVD] = emulate_on_interception, 2201 [SVM_EXIT_INVD] = emulate_on_interception,
2138 [SVM_EXIT_HLT] = halt_interception, 2202 [SVM_EXIT_HLT] = halt_interception,
2139 [SVM_EXIT_INVLPG] = invlpg_interception, 2203 [SVM_EXIT_INVLPG] = invlpg_interception,
@@ -2194,7 +2258,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2194 } 2258 }
2195 } 2259 }
2196 2260
2197 kvm_reput_irq(svm);
2198 2261
2199 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2262 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2200 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2263 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2205,7 +2268,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2205 2268
2206 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 2269 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2207 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 2270 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2208 exit_code != SVM_EXIT_NPF) 2271 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
2209 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 2272 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2210 "exit_code 0x%x\n", 2273 "exit_code 0x%x\n",
2211 __func__, svm->vmcb->control.exit_int_info, 2274 __func__, svm->vmcb->control.exit_int_info,
@@ -2242,6 +2305,15 @@ static void pre_svm_run(struct vcpu_svm *svm)
2242 new_asid(svm, svm_data); 2305 new_asid(svm, svm_data);
2243} 2306}
2244 2307
2308static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2309{
2310 struct vcpu_svm *svm = to_svm(vcpu);
2311
2312 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2313 vcpu->arch.hflags |= HF_NMI_MASK;
2314 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
2315 ++vcpu->stat.nmi_injections;
2316}
2245 2317
2246static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) 2318static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2247{ 2319{
@@ -2257,134 +2329,71 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2257 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 2329 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2258} 2330}
2259 2331
2260static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) 2332static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
2261{ 2333{
2262 struct vcpu_svm *svm = to_svm(vcpu); 2334 struct vcpu_svm *svm = to_svm(vcpu);
2263 2335
2264 nested_svm_intr(svm); 2336 svm->vmcb->control.event_inj = nr |
2265 2337 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2266 svm_inject_irq(svm, irq);
2267} 2338}
2268 2339
2269static void update_cr8_intercept(struct kvm_vcpu *vcpu) 2340static void svm_set_irq(struct kvm_vcpu *vcpu)
2270{ 2341{
2271 struct vcpu_svm *svm = to_svm(vcpu); 2342 struct vcpu_svm *svm = to_svm(vcpu);
2272 struct vmcb *vmcb = svm->vmcb;
2273 int max_irr, tpr;
2274 2343
2275 if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) 2344 nested_svm_intr(svm);
2276 return;
2277 2345
2278 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2346 svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
2347}
2279 2348
2280 max_irr = kvm_lapic_find_highest_irr(vcpu); 2349static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2281 if (max_irr == -1) 2350{
2282 return; 2351 struct vcpu_svm *svm = to_svm(vcpu);
2283 2352
2284 tpr = kvm_lapic_get_cr8(vcpu) << 4; 2353 if (irr == -1)
2354 return;
2285 2355
2286 if (tpr >= (max_irr & 0xf0)) 2356 if (tpr >= irr)
2287 vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 2357 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
2288} 2358}
2289 2359
2290static void svm_intr_assist(struct kvm_vcpu *vcpu) 2360static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2291{ 2361{
2292 struct vcpu_svm *svm = to_svm(vcpu); 2362 struct vcpu_svm *svm = to_svm(vcpu);
2293 struct vmcb *vmcb = svm->vmcb; 2363 struct vmcb *vmcb = svm->vmcb;
2294 int intr_vector = -1; 2364 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2295 2365 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2296 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
2297 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
2298 intr_vector = vmcb->control.exit_int_info &
2299 SVM_EVTINJ_VEC_MASK;
2300 vmcb->control.exit_int_info = 0;
2301 svm_inject_irq(svm, intr_vector);
2302 goto out;
2303 }
2304
2305 if (vmcb->control.int_ctl & V_IRQ_MASK)
2306 goto out;
2307
2308 if (!kvm_cpu_has_interrupt(vcpu))
2309 goto out;
2310
2311 if (nested_svm_intr(svm))
2312 goto out;
2313
2314 if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
2315 goto out;
2316
2317 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
2318 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
2319 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
2320 /* unable to deliver irq, set pending irq */
2321 svm_set_vintr(svm);
2322 svm_inject_irq(svm, 0x0);
2323 goto out;
2324 }
2325 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
2326 intr_vector = kvm_cpu_get_interrupt(vcpu);
2327 svm_inject_irq(svm, intr_vector);
2328out:
2329 update_cr8_intercept(vcpu);
2330} 2366}
2331 2367
2332static void kvm_reput_irq(struct vcpu_svm *svm) 2368static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2333{ 2369{
2334 struct vmcb_control_area *control = &svm->vmcb->control; 2370 struct vcpu_svm *svm = to_svm(vcpu);
2335 2371 struct vmcb *vmcb = svm->vmcb;
2336 if ((control->int_ctl & V_IRQ_MASK) 2372 return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2337 && !irqchip_in_kernel(svm->vcpu.kvm)) { 2373 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2338 control->int_ctl &= ~V_IRQ_MASK; 2374 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2339 push_irq(&svm->vcpu, control->int_vector);
2340 }
2341
2342 svm->vcpu.arch.interrupt_window_open =
2343 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2344 (svm->vcpu.arch.hflags & HF_GIF_MASK);
2345} 2375}
2346 2376
2347static void svm_do_inject_vector(struct vcpu_svm *svm) 2377static void enable_irq_window(struct kvm_vcpu *vcpu)
2348{ 2378{
2349 struct kvm_vcpu *vcpu = &svm->vcpu; 2379 svm_set_vintr(to_svm(vcpu));
2350 int word_index = __ffs(vcpu->arch.irq_summary); 2380 svm_inject_irq(to_svm(vcpu), 0x0);
2351 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2352 int irq = word_index * BITS_PER_LONG + bit_index;
2353
2354 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2355 if (!vcpu->arch.irq_pending[word_index])
2356 clear_bit(word_index, &vcpu->arch.irq_summary);
2357 svm_inject_irq(svm, irq);
2358} 2381}
2359 2382
2360static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2383static void enable_nmi_window(struct kvm_vcpu *vcpu)
2361 struct kvm_run *kvm_run)
2362{ 2384{
2363 struct vcpu_svm *svm = to_svm(vcpu); 2385 struct vcpu_svm *svm = to_svm(vcpu);
2364 struct vmcb_control_area *control = &svm->vmcb->control;
2365
2366 if (nested_svm_intr(svm))
2367 return;
2368 2386
2369 svm->vcpu.arch.interrupt_window_open = 2387 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
2370 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 2388 == HF_NMI_MASK)
2371 (svm->vmcb->save.rflags & X86_EFLAGS_IF) && 2389 return; /* IRET will cause a vm exit */
2372 (svm->vcpu.arch.hflags & HF_GIF_MASK));
2373 2390
2374 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) 2391 /* Something prevents NMI from been injected. Single step over
2375 /* 2392 possible problem (IRET or exception injection or interrupt
2376 * If interrupts enabled, and not blocked by sti or mov ss. Good. 2393 shadow) */
2377 */ 2394 vcpu->arch.singlestep = true;
2378 svm_do_inject_vector(svm); 2395 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2379 2396 update_db_intercept(vcpu);
2380 /*
2381 * Interrupts blocked. Wait for unblock.
2382 */
2383 if (!svm->vcpu.arch.interrupt_window_open &&
2384 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
2385 svm_set_vintr(svm);
2386 else
2387 svm_clear_vintr(svm);
2388} 2397}
2389 2398
2390static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2399static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2407,7 +2416,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2407 2416
2408 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2417 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2409 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2418 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2410 kvm_lapic_set_tpr(vcpu, cr8); 2419 kvm_set_cr8(vcpu, cr8);
2411 } 2420 }
2412} 2421}
2413 2422
@@ -2416,14 +2425,54 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2416 struct vcpu_svm *svm = to_svm(vcpu); 2425 struct vcpu_svm *svm = to_svm(vcpu);
2417 u64 cr8; 2426 u64 cr8;
2418 2427
2419 if (!irqchip_in_kernel(vcpu->kvm))
2420 return;
2421
2422 cr8 = kvm_get_cr8(vcpu); 2428 cr8 = kvm_get_cr8(vcpu);
2423 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2429 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2424 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2430 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
2425} 2431}
2426 2432
2433static void svm_complete_interrupts(struct vcpu_svm *svm)
2434{
2435 u8 vector;
2436 int type;
2437 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2438
2439 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2440 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
2441
2442 svm->vcpu.arch.nmi_injected = false;
2443 kvm_clear_exception_queue(&svm->vcpu);
2444 kvm_clear_interrupt_queue(&svm->vcpu);
2445
2446 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
2447 return;
2448
2449 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
2450 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
2451
2452 switch (type) {
2453 case SVM_EXITINTINFO_TYPE_NMI:
2454 svm->vcpu.arch.nmi_injected = true;
2455 break;
2456 case SVM_EXITINTINFO_TYPE_EXEPT:
2457 /* In case of software exception do not reinject an exception
2458 vector, but re-execute the instruction instead */
2459 if (kvm_exception_is_soft(vector))
2460 break;
2461 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2462 u32 err = svm->vmcb->control.exit_int_info_err;
2463 kvm_queue_exception_e(&svm->vcpu, vector, err);
2464
2465 } else
2466 kvm_queue_exception(&svm->vcpu, vector);
2467 break;
2468 case SVM_EXITINTINFO_TYPE_INTR:
2469 kvm_queue_interrupt(&svm->vcpu, vector, false);
2470 break;
2471 default:
2472 break;
2473 }
2474}
2475
2427#ifdef CONFIG_X86_64 2476#ifdef CONFIG_X86_64
2428#define R "r" 2477#define R "r"
2429#else 2478#else
@@ -2552,6 +2601,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2552 sync_cr8_to_lapic(vcpu); 2601 sync_cr8_to_lapic(vcpu);
2553 2602
2554 svm->next_rip = 0; 2603 svm->next_rip = 0;
2604
2605 svm_complete_interrupts(svm);
2555} 2606}
2556 2607
2557#undef R 2608#undef R
@@ -2617,7 +2668,7 @@ static int get_npt_level(void)
2617#endif 2668#endif
2618} 2669}
2619 2670
2620static int svm_get_mt_mask_shift(void) 2671static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2621{ 2672{
2622 return 0; 2673 return 0;
2623} 2674}
@@ -2667,17 +2718,21 @@ static struct kvm_x86_ops svm_x86_ops = {
2667 .run = svm_vcpu_run, 2718 .run = svm_vcpu_run,
2668 .handle_exit = handle_exit, 2719 .handle_exit = handle_exit,
2669 .skip_emulated_instruction = skip_emulated_instruction, 2720 .skip_emulated_instruction = skip_emulated_instruction,
2721 .set_interrupt_shadow = svm_set_interrupt_shadow,
2722 .get_interrupt_shadow = svm_get_interrupt_shadow,
2670 .patch_hypercall = svm_patch_hypercall, 2723 .patch_hypercall = svm_patch_hypercall,
2671 .get_irq = svm_get_irq,
2672 .set_irq = svm_set_irq, 2724 .set_irq = svm_set_irq,
2725 .set_nmi = svm_inject_nmi,
2673 .queue_exception = svm_queue_exception, 2726 .queue_exception = svm_queue_exception,
2674 .exception_injected = svm_exception_injected, 2727 .interrupt_allowed = svm_interrupt_allowed,
2675 .inject_pending_irq = svm_intr_assist, 2728 .nmi_allowed = svm_nmi_allowed,
2676 .inject_pending_vectors = do_interrupt_requests, 2729 .enable_nmi_window = enable_nmi_window,
2730 .enable_irq_window = enable_irq_window,
2731 .update_cr8_intercept = update_cr8_intercept,
2677 2732
2678 .set_tss_addr = svm_set_tss_addr, 2733 .set_tss_addr = svm_set_tss_addr,
2679 .get_tdp_level = get_npt_level, 2734 .get_tdp_level = get_npt_level,
2680 .get_mt_mask_shift = svm_get_mt_mask_shift, 2735 .get_mt_mask = svm_get_mt_mask,
2681}; 2736};
2682 2737
2683static int __init svm_init(void) 2738static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 000000000000..86dbac072d0c
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
1#include <linux/kvm_host.h>
2#include <linux/kvm.h>
3#include <linux/hrtimer.h>
4#include <asm/atomic.h>
5#include "kvm_timer.h"
6
7static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
8{
9 int restart_timer = 0;
10 wait_queue_head_t *q = &vcpu->wq;
11
12 /* FIXME: this code should not know anything about vcpus */
13 if (!atomic_inc_and_test(&ktimer->pending))
14 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
15
16 if (!ktimer->reinject)
17 atomic_set(&ktimer->pending, 1);
18
19 if (waitqueue_active(q))
20 wake_up_interruptible(q);
21
22 if (ktimer->t_ops->is_periodic(ktimer)) {
23 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
24 restart_timer = 1;
25 }
26
27 return restart_timer;
28}
29
30enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
31{
32 int restart_timer;
33 struct kvm_vcpu *vcpu;
34 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
35
36 vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
37 if (!vcpu)
38 return HRTIMER_NORESTART;
39
40 restart_timer = __kvm_timer_fn(vcpu, ktimer);
41 if (restart_timer)
42 return HRTIMER_RESTART;
43 else
44 return HRTIMER_NORESTART;
45}
46
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716f..32d6ae8fb60e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,26 +32,27 @@
32#include <asm/desc.h> 32#include <asm/desc.h>
33#include <asm/vmx.h> 33#include <asm/vmx.h>
34#include <asm/virtext.h> 34#include <asm/virtext.h>
35#include <asm/mce.h>
35 36
36#define __ex(x) __kvm_handle_fault_on_reboot(x) 37#define __ex(x) __kvm_handle_fault_on_reboot(x)
37 38
38MODULE_AUTHOR("Qumranet"); 39MODULE_AUTHOR("Qumranet");
39MODULE_LICENSE("GPL"); 40MODULE_LICENSE("GPL");
40 41
41static int bypass_guest_pf = 1; 42static int __read_mostly bypass_guest_pf = 1;
42module_param(bypass_guest_pf, bool, 0); 43module_param(bypass_guest_pf, bool, S_IRUGO);
43 44
44static int enable_vpid = 1; 45static int __read_mostly enable_vpid = 1;
45module_param(enable_vpid, bool, 0); 46module_param_named(vpid, enable_vpid, bool, 0444);
46 47
47static int flexpriority_enabled = 1; 48static int __read_mostly flexpriority_enabled = 1;
48module_param(flexpriority_enabled, bool, 0); 49module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
49 50
50static int enable_ept = 1; 51static int __read_mostly enable_ept = 1;
51module_param(enable_ept, bool, 0); 52module_param_named(ept, enable_ept, bool, S_IRUGO);
52 53
53static int emulate_invalid_guest_state = 0; 54static int __read_mostly emulate_invalid_guest_state = 0;
54module_param(emulate_invalid_guest_state, bool, 0); 55module_param(emulate_invalid_guest_state, bool, S_IRUGO);
55 56
56struct vmcs { 57struct vmcs {
57 u32 revision_id; 58 u32 revision_id;
@@ -97,6 +98,7 @@ struct vcpu_vmx {
97 int soft_vnmi_blocked; 98 int soft_vnmi_blocked;
98 ktime_t entry_time; 99 ktime_t entry_time;
99 s64 vnmi_blocked_time; 100 s64 vnmi_blocked_time;
101 u32 exit_reason;
100}; 102};
101 103
102static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 104static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -111,9 +113,10 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
111static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 113static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
112static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 114static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
113 115
114static struct page *vmx_io_bitmap_a; 116static unsigned long *vmx_io_bitmap_a;
115static struct page *vmx_io_bitmap_b; 117static unsigned long *vmx_io_bitmap_b;
116static struct page *vmx_msr_bitmap; 118static unsigned long *vmx_msr_bitmap_legacy;
119static unsigned long *vmx_msr_bitmap_longmode;
117 120
118static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 121static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
119static DEFINE_SPINLOCK(vmx_vpid_lock); 122static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -213,70 +216,78 @@ static inline int is_external_interrupt(u32 intr_info)
213 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 216 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
214} 217}
215 218
219static inline int is_machine_check(u32 intr_info)
220{
221 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
222 INTR_INFO_VALID_MASK)) ==
223 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
224}
225
216static inline int cpu_has_vmx_msr_bitmap(void) 226static inline int cpu_has_vmx_msr_bitmap(void)
217{ 227{
218 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); 228 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
219} 229}
220 230
221static inline int cpu_has_vmx_tpr_shadow(void) 231static inline int cpu_has_vmx_tpr_shadow(void)
222{ 232{
223 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); 233 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
224} 234}
225 235
226static inline int vm_need_tpr_shadow(struct kvm *kvm) 236static inline int vm_need_tpr_shadow(struct kvm *kvm)
227{ 237{
228 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 238 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
229} 239}
230 240
231static inline int cpu_has_secondary_exec_ctrls(void) 241static inline int cpu_has_secondary_exec_ctrls(void)
232{ 242{
233 return (vmcs_config.cpu_based_exec_ctrl & 243 return vmcs_config.cpu_based_exec_ctrl &
234 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); 244 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
235} 245}
236 246
237static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 247static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
238{ 248{
239 return flexpriority_enabled 249 return vmcs_config.cpu_based_2nd_exec_ctrl &
240 && (vmcs_config.cpu_based_2nd_exec_ctrl & 250 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
241 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 251}
252
253static inline bool cpu_has_vmx_flexpriority(void)
254{
255 return cpu_has_vmx_tpr_shadow() &&
256 cpu_has_vmx_virtualize_apic_accesses();
242} 257}
243 258
244static inline int cpu_has_vmx_invept_individual_addr(void) 259static inline int cpu_has_vmx_invept_individual_addr(void)
245{ 260{
246 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT)); 261 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
247} 262}
248 263
249static inline int cpu_has_vmx_invept_context(void) 264static inline int cpu_has_vmx_invept_context(void)
250{ 265{
251 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT)); 266 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
252} 267}
253 268
254static inline int cpu_has_vmx_invept_global(void) 269static inline int cpu_has_vmx_invept_global(void)
255{ 270{
256 return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT)); 271 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
257} 272}
258 273
259static inline int cpu_has_vmx_ept(void) 274static inline int cpu_has_vmx_ept(void)
260{ 275{
261 return (vmcs_config.cpu_based_2nd_exec_ctrl & 276 return vmcs_config.cpu_based_2nd_exec_ctrl &
262 SECONDARY_EXEC_ENABLE_EPT); 277 SECONDARY_EXEC_ENABLE_EPT;
263}
264
265static inline int vm_need_ept(void)
266{
267 return (cpu_has_vmx_ept() && enable_ept);
268} 278}
269 279
270static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
271{ 281{
272 return ((cpu_has_vmx_virtualize_apic_accesses()) && 282 return flexpriority_enabled &&
273 (irqchip_in_kernel(kvm))); 283 (cpu_has_vmx_virtualize_apic_accesses()) &&
284 (irqchip_in_kernel(kvm));
274} 285}
275 286
276static inline int cpu_has_vmx_vpid(void) 287static inline int cpu_has_vmx_vpid(void)
277{ 288{
278 return (vmcs_config.cpu_based_2nd_exec_ctrl & 289 return vmcs_config.cpu_based_2nd_exec_ctrl &
279 SECONDARY_EXEC_ENABLE_VPID); 290 SECONDARY_EXEC_ENABLE_VPID;
280} 291}
281 292
282static inline int cpu_has_virtual_nmis(void) 293static inline int cpu_has_virtual_nmis(void)
@@ -284,6 +295,11 @@ static inline int cpu_has_virtual_nmis(void)
284 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 295 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
285} 296}
286 297
298static inline bool report_flexpriority(void)
299{
300 return flexpriority_enabled;
301}
302
287static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 303static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
288{ 304{
289 int i; 305 int i;
@@ -381,7 +397,7 @@ static inline void ept_sync_global(void)
381 397
382static inline void ept_sync_context(u64 eptp) 398static inline void ept_sync_context(u64 eptp)
383{ 399{
384 if (vm_need_ept()) { 400 if (enable_ept) {
385 if (cpu_has_vmx_invept_context()) 401 if (cpu_has_vmx_invept_context())
386 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 402 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
387 else 403 else
@@ -391,7 +407,7 @@ static inline void ept_sync_context(u64 eptp)
391 407
392static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) 408static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
393{ 409{
394 if (vm_need_ept()) { 410 if (enable_ept) {
395 if (cpu_has_vmx_invept_individual_addr()) 411 if (cpu_has_vmx_invept_individual_addr())
396 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, 412 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
397 eptp, gpa); 413 eptp, gpa);
@@ -478,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
478{ 494{
479 u32 eb; 495 u32 eb;
480 496
481 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); 497 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
482 if (!vcpu->fpu_active) 498 if (!vcpu->fpu_active)
483 eb |= 1u << NM_VECTOR; 499 eb |= 1u << NM_VECTOR;
484 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 500 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -488,9 +504,9 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
488 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 504 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
489 eb |= 1u << BP_VECTOR; 505 eb |= 1u << BP_VECTOR;
490 } 506 }
491 if (vcpu->arch.rmode.active) 507 if (vcpu->arch.rmode.vm86_active)
492 eb = ~0; 508 eb = ~0;
493 if (vm_need_ept()) 509 if (enable_ept)
494 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 510 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
495 vmcs_write32(EXCEPTION_BITMAP, eb); 511 vmcs_write32(EXCEPTION_BITMAP, eb);
496} 512}
@@ -724,29 +740,50 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
724 740
725static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
726{ 742{
727 if (vcpu->arch.rmode.active) 743 if (vcpu->arch.rmode.vm86_active)
728 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 744 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
729 vmcs_writel(GUEST_RFLAGS, rflags); 745 vmcs_writel(GUEST_RFLAGS, rflags);
730} 746}
731 747
748static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
749{
750 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
751 int ret = 0;
752
753 if (interruptibility & GUEST_INTR_STATE_STI)
754 ret |= X86_SHADOW_INT_STI;
755 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
756 ret |= X86_SHADOW_INT_MOV_SS;
757
758 return ret & mask;
759}
760
761static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
762{
763 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
764 u32 interruptibility = interruptibility_old;
765
766 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
767
768 if (mask & X86_SHADOW_INT_MOV_SS)
769 interruptibility |= GUEST_INTR_STATE_MOV_SS;
770 if (mask & X86_SHADOW_INT_STI)
771 interruptibility |= GUEST_INTR_STATE_STI;
772
773 if ((interruptibility != interruptibility_old))
774 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
775}
776
732static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 777static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733{ 778{
734 unsigned long rip; 779 unsigned long rip;
735 u32 interruptibility;
736 780
737 rip = kvm_rip_read(vcpu); 781 rip = kvm_rip_read(vcpu);
738 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 782 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
739 kvm_rip_write(vcpu, rip); 783 kvm_rip_write(vcpu, rip);
740 784
741 /* 785 /* skipping an emulated instruction also counts */
742 * We emulated an instruction, so temporary interrupt blocking 786 vmx_set_interrupt_shadow(vcpu, 0);
743 * should be removed, if set.
744 */
745 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
746 if (interruptibility & 3)
747 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
748 interruptibility & ~3);
749 vcpu->arch.interrupt_window_open = 1;
750} 787}
751 788
752static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 789static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -760,7 +797,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
760 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 797 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
761 } 798 }
762 799
763 if (vcpu->arch.rmode.active) { 800 if (vcpu->arch.rmode.vm86_active) {
764 vmx->rmode.irq.pending = true; 801 vmx->rmode.irq.pending = true;
765 vmx->rmode.irq.vector = nr; 802 vmx->rmode.irq.vector = nr;
766 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 803 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -773,8 +810,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
773 return; 810 return;
774 } 811 }
775 812
776 if (nr == BP_VECTOR || nr == OF_VECTOR) { 813 if (kvm_exception_is_soft(nr)) {
777 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 814 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
815 vmx->vcpu.arch.event_exit_inst_len);
778 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 816 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
779 } else 817 } else
780 intr_info |= INTR_TYPE_HARD_EXCEPTION; 818 intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -782,11 +820,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
782 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 820 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
783} 821}
784 822
785static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
786{
787 return false;
788}
789
790/* 823/*
791 * Swap MSR entry in host/guest MSR entry array. 824 * Swap MSR entry in host/guest MSR entry array.
792 */ 825 */
@@ -812,6 +845,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
812static void setup_msrs(struct vcpu_vmx *vmx) 845static void setup_msrs(struct vcpu_vmx *vmx)
813{ 846{
814 int save_nmsrs; 847 int save_nmsrs;
848 unsigned long *msr_bitmap;
815 849
816 vmx_load_host_state(vmx); 850 vmx_load_host_state(vmx);
817 save_nmsrs = 0; 851 save_nmsrs = 0;
@@ -847,6 +881,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
847 __find_msr_index(vmx, MSR_KERNEL_GS_BASE); 881 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
848#endif 882#endif
849 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); 883 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
884
885 if (cpu_has_vmx_msr_bitmap()) {
886 if (is_long_mode(&vmx->vcpu))
887 msr_bitmap = vmx_msr_bitmap_longmode;
888 else
889 msr_bitmap = vmx_msr_bitmap_legacy;
890
891 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
892 }
850} 893}
851 894
852/* 895/*
@@ -1034,13 +1077,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1034 return 0; 1077 return 0;
1035} 1078}
1036 1079
1037static int vmx_get_irq(struct kvm_vcpu *vcpu)
1038{
1039 if (!vcpu->arch.interrupt.pending)
1040 return -1;
1041 return vcpu->arch.interrupt.nr;
1042}
1043
1044static __init int cpu_has_kvm_support(void) 1080static __init int cpu_has_kvm_support(void)
1045{ 1081{
1046 return cpu_has_vmx(); 1082 return cpu_has_vmx();
@@ -1294,6 +1330,18 @@ static __init int hardware_setup(void)
1294 if (boot_cpu_has(X86_FEATURE_NX)) 1330 if (boot_cpu_has(X86_FEATURE_NX))
1295 kvm_enable_efer_bits(EFER_NX); 1331 kvm_enable_efer_bits(EFER_NX);
1296 1332
1333 if (!cpu_has_vmx_vpid())
1334 enable_vpid = 0;
1335
1336 if (!cpu_has_vmx_ept())
1337 enable_ept = 0;
1338
1339 if (!cpu_has_vmx_flexpriority())
1340 flexpriority_enabled = 0;
1341
1342 if (!cpu_has_vmx_tpr_shadow())
1343 kvm_x86_ops->update_cr8_intercept = NULL;
1344
1297 return alloc_kvm_area(); 1345 return alloc_kvm_area();
1298} 1346}
1299 1347
@@ -1324,7 +1372,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1324 struct vcpu_vmx *vmx = to_vmx(vcpu); 1372 struct vcpu_vmx *vmx = to_vmx(vcpu);
1325 1373
1326 vmx->emulation_required = 1; 1374 vmx->emulation_required = 1;
1327 vcpu->arch.rmode.active = 0; 1375 vcpu->arch.rmode.vm86_active = 0;
1328 1376
1329 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1377 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1330 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); 1378 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1386,7 +1434,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1386 struct vcpu_vmx *vmx = to_vmx(vcpu); 1434 struct vcpu_vmx *vmx = to_vmx(vcpu);
1387 1435
1388 vmx->emulation_required = 1; 1436 vmx->emulation_required = 1;
1389 vcpu->arch.rmode.active = 1; 1437 vcpu->arch.rmode.vm86_active = 1;
1390 1438
1391 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1439 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1392 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1440 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1485,7 +1533,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1485static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1533static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1486{ 1534{
1487 vpid_sync_vcpu_all(to_vmx(vcpu)); 1535 vpid_sync_vcpu_all(to_vmx(vcpu));
1488 if (vm_need_ept()) 1536 if (enable_ept)
1489 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1537 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1490} 1538}
1491 1539
@@ -1555,10 +1603,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1555 1603
1556 vmx_fpu_deactivate(vcpu); 1604 vmx_fpu_deactivate(vcpu);
1557 1605
1558 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) 1606 if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
1559 enter_pmode(vcpu); 1607 enter_pmode(vcpu);
1560 1608
1561 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) 1609 if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
1562 enter_rmode(vcpu); 1610 enter_rmode(vcpu);
1563 1611
1564#ifdef CONFIG_X86_64 1612#ifdef CONFIG_X86_64
@@ -1570,7 +1618,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1570 } 1618 }
1571#endif 1619#endif
1572 1620
1573 if (vm_need_ept()) 1621 if (enable_ept)
1574 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1622 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1575 1623
1576 vmcs_writel(CR0_READ_SHADOW, cr0); 1624 vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1599,7 +1647,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1599 u64 eptp; 1647 u64 eptp;
1600 1648
1601 guest_cr3 = cr3; 1649 guest_cr3 = cr3;
1602 if (vm_need_ept()) { 1650 if (enable_ept) {
1603 eptp = construct_eptp(cr3); 1651 eptp = construct_eptp(cr3);
1604 vmcs_write64(EPT_POINTER, eptp); 1652 vmcs_write64(EPT_POINTER, eptp);
1605 ept_sync_context(eptp); 1653 ept_sync_context(eptp);
@@ -1616,11 +1664,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1616 1664
1617static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1618{ 1666{
1619 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ? 1667 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
1620 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1668 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1621 1669
1622 vcpu->arch.cr4 = cr4; 1670 vcpu->arch.cr4 = cr4;
1623 if (vm_need_ept()) 1671 if (enable_ept)
1624 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1672 ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1625 1673
1626 vmcs_writel(CR4_READ_SHADOW, cr4); 1674 vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1699,7 +1747,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1699 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1700 u32 ar; 1748 u32 ar;
1701 1749
1702 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { 1750 if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
1703 vcpu->arch.rmode.tr.selector = var->selector; 1751 vcpu->arch.rmode.tr.selector = var->selector;
1704 vcpu->arch.rmode.tr.base = var->base; 1752 vcpu->arch.rmode.tr.base = var->base;
1705 vcpu->arch.rmode.tr.limit = var->limit; 1753 vcpu->arch.rmode.tr.limit = var->limit;
@@ -1709,7 +1757,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1709 vmcs_writel(sf->base, var->base); 1757 vmcs_writel(sf->base, var->base);
1710 vmcs_write32(sf->limit, var->limit); 1758 vmcs_write32(sf->limit, var->limit);
1711 vmcs_write16(sf->selector, var->selector); 1759 vmcs_write16(sf->selector, var->selector);
1712 if (vcpu->arch.rmode.active && var->s) { 1760 if (vcpu->arch.rmode.vm86_active && var->s) {
1713 /* 1761 /*
1714 * Hack real-mode segments into vm86 compatibility. 1762 * Hack real-mode segments into vm86 compatibility.
1715 */ 1763 */
@@ -1982,7 +2030,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
1982 pfn_t identity_map_pfn; 2030 pfn_t identity_map_pfn;
1983 u32 tmp; 2031 u32 tmp;
1984 2032
1985 if (!vm_need_ept()) 2033 if (!enable_ept)
1986 return 1; 2034 return 1;
1987 if (unlikely(!kvm->arch.ept_identity_pagetable)) { 2035 if (unlikely(!kvm->arch.ept_identity_pagetable)) {
1988 printk(KERN_ERR "EPT: identity-mapping pagetable " 2036 printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2071,7 +2119,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2071 int vpid; 2119 int vpid;
2072 2120
2073 vmx->vpid = 0; 2121 vmx->vpid = 0;
2074 if (!enable_vpid || !cpu_has_vmx_vpid()) 2122 if (!enable_vpid)
2075 return; 2123 return;
2076 spin_lock(&vmx_vpid_lock); 2124 spin_lock(&vmx_vpid_lock);
2077 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 2125 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
@@ -2082,9 +2130,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2082 spin_unlock(&vmx_vpid_lock); 2130 spin_unlock(&vmx_vpid_lock);
2083} 2131}
2084 2132
2085static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 2133static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2086{ 2134{
2087 void *va; 2135 int f = sizeof(unsigned long);
2088 2136
2089 if (!cpu_has_vmx_msr_bitmap()) 2137 if (!cpu_has_vmx_msr_bitmap())
2090 return; 2138 return;
@@ -2094,16 +2142,21 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
2094 * have the write-low and read-high bitmap offsets the wrong way round. 2142 * have the write-low and read-high bitmap offsets the wrong way round.
2095 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 2143 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2096 */ 2144 */
2097 va = kmap(msr_bitmap);
2098 if (msr <= 0x1fff) { 2145 if (msr <= 0x1fff) {
2099 __clear_bit(msr, va + 0x000); /* read-low */ 2146 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2100 __clear_bit(msr, va + 0x800); /* write-low */ 2147 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2101 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 2148 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2102 msr &= 0x1fff; 2149 msr &= 0x1fff;
2103 __clear_bit(msr, va + 0x400); /* read-high */ 2150 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2104 __clear_bit(msr, va + 0xc00); /* write-high */ 2151 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2105 } 2152 }
2106 kunmap(msr_bitmap); 2153}
2154
2155static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2156{
2157 if (!longmode_only)
2158 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2159 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2107} 2160}
2108 2161
2109/* 2162/*
@@ -2121,11 +2174,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2121 u32 exec_control; 2174 u32 exec_control;
2122 2175
2123 /* I/O */ 2176 /* I/O */
2124 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 2177 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2125 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 2178 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2126 2179
2127 if (cpu_has_vmx_msr_bitmap()) 2180 if (cpu_has_vmx_msr_bitmap())
2128 vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); 2181 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2129 2182
2130 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 2183 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2131 2184
@@ -2141,7 +2194,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2141 CPU_BASED_CR8_LOAD_EXITING; 2194 CPU_BASED_CR8_LOAD_EXITING;
2142#endif 2195#endif
2143 } 2196 }
2144 if (!vm_need_ept()) 2197 if (!enable_ept)
2145 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2198 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2146 CPU_BASED_CR3_LOAD_EXITING | 2199 CPU_BASED_CR3_LOAD_EXITING |
2147 CPU_BASED_INVLPG_EXITING; 2200 CPU_BASED_INVLPG_EXITING;
@@ -2154,7 +2207,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2154 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2207 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2155 if (vmx->vpid == 0) 2208 if (vmx->vpid == 0)
2156 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2209 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2157 if (!vm_need_ept()) 2210 if (!enable_ept)
2158 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2211 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2159 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2212 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2160 } 2213 }
@@ -2273,7 +2326,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2273 goto out; 2326 goto out;
2274 } 2327 }
2275 2328
2276 vmx->vcpu.arch.rmode.active = 0; 2329 vmx->vcpu.arch.rmode.vm86_active = 0;
2277 2330
2278 vmx->soft_vnmi_blocked = 0; 2331 vmx->soft_vnmi_blocked = 0;
2279 2332
@@ -2402,14 +2455,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2402 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2455 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2403} 2456}
2404 2457
2405static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 2458static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2406{ 2459{
2407 struct vcpu_vmx *vmx = to_vmx(vcpu); 2460 struct vcpu_vmx *vmx = to_vmx(vcpu);
2461 uint32_t intr;
2462 int irq = vcpu->arch.interrupt.nr;
2408 2463
2409 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2464 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2410 2465
2411 ++vcpu->stat.irq_injections; 2466 ++vcpu->stat.irq_injections;
2412 if (vcpu->arch.rmode.active) { 2467 if (vcpu->arch.rmode.vm86_active) {
2413 vmx->rmode.irq.pending = true; 2468 vmx->rmode.irq.pending = true;
2414 vmx->rmode.irq.vector = irq; 2469 vmx->rmode.irq.vector = irq;
2415 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2470 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2419,8 +2474,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2419 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); 2474 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2420 return; 2475 return;
2421 } 2476 }
2422 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2477 intr = irq | INTR_INFO_VALID_MASK;
2423 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2478 if (vcpu->arch.interrupt.soft) {
2479 intr |= INTR_TYPE_SOFT_INTR;
2480 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2481 vmx->vcpu.arch.event_exit_inst_len);
2482 } else
2483 intr |= INTR_TYPE_EXT_INTR;
2484 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2424} 2485}
2425 2486
2426static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 2487static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2441,7 +2502,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2441 } 2502 }
2442 2503
2443 ++vcpu->stat.nmi_injections; 2504 ++vcpu->stat.nmi_injections;
2444 if (vcpu->arch.rmode.active) { 2505 if (vcpu->arch.rmode.vm86_active) {
2445 vmx->rmode.irq.pending = true; 2506 vmx->rmode.irq.pending = true;
2446 vmx->rmode.irq.vector = NMI_VECTOR; 2507 vmx->rmode.irq.vector = NMI_VECTOR;
2447 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2508 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2456,76 +2517,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2456 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2517 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2457} 2518}
2458 2519
2459static void vmx_update_window_states(struct kvm_vcpu *vcpu) 2520static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2460{ 2521{
2461 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2462
2463 vcpu->arch.nmi_window_open =
2464 !(guest_intr & (GUEST_INTR_STATE_STI |
2465 GUEST_INTR_STATE_MOV_SS |
2466 GUEST_INTR_STATE_NMI));
2467 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 2522 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2468 vcpu->arch.nmi_window_open = 0; 2523 return 0;
2469
2470 vcpu->arch.interrupt_window_open =
2471 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2472 !(guest_intr & (GUEST_INTR_STATE_STI |
2473 GUEST_INTR_STATE_MOV_SS)));
2474}
2475
2476static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2477{
2478 int word_index = __ffs(vcpu->arch.irq_summary);
2479 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
2480 int irq = word_index * BITS_PER_LONG + bit_index;
2481 2524
2482 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2525 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2483 if (!vcpu->arch.irq_pending[word_index]) 2526 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2484 clear_bit(word_index, &vcpu->arch.irq_summary); 2527 GUEST_INTR_STATE_NMI));
2485 kvm_queue_interrupt(vcpu, irq);
2486} 2528}
2487 2529
2488static void do_interrupt_requests(struct kvm_vcpu *vcpu, 2530static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2489 struct kvm_run *kvm_run)
2490{ 2531{
2491 vmx_update_window_states(vcpu); 2532 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2492 2533 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2493 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 2534 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2494 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2495 GUEST_INTR_STATE_STI |
2496 GUEST_INTR_STATE_MOV_SS);
2497
2498 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2499 if (vcpu->arch.interrupt.pending) {
2500 enable_nmi_window(vcpu);
2501 } else if (vcpu->arch.nmi_window_open) {
2502 vcpu->arch.nmi_pending = false;
2503 vcpu->arch.nmi_injected = true;
2504 } else {
2505 enable_nmi_window(vcpu);
2506 return;
2507 }
2508 }
2509 if (vcpu->arch.nmi_injected) {
2510 vmx_inject_nmi(vcpu);
2511 if (vcpu->arch.nmi_pending)
2512 enable_nmi_window(vcpu);
2513 else if (vcpu->arch.irq_summary
2514 || kvm_run->request_interrupt_window)
2515 enable_irq_window(vcpu);
2516 return;
2517 }
2518
2519 if (vcpu->arch.interrupt_window_open) {
2520 if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2521 kvm_do_inject_irq(vcpu);
2522
2523 if (vcpu->arch.interrupt.pending)
2524 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2525 }
2526 if (!vcpu->arch.interrupt_window_open &&
2527 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
2528 enable_irq_window(vcpu);
2529} 2535}
2530 2536
2531static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 2537static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2585,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2585 return 0; 2591 return 0;
2586} 2592}
2587 2593
2594/*
2595 * Trigger machine check on the host. We assume all the MSRs are already set up
2596 * by the CPU and that we still run on the same CPU as the MCE occurred on.
2597 * We pass a fake environment to the machine check handler because we want
2598 * the guest to be always treated like user space, no matter what context
2599 * it used internally.
2600 */
2601static void kvm_machine_check(void)
2602{
2603#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2604 struct pt_regs regs = {
2605 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2606 .flags = X86_EFLAGS_IF,
2607 };
2608
2609 do_machine_check(&regs, 0);
2610#endif
2611}
2612
2613static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2614{
2615 /* already handled by vcpu_run */
2616 return 1;
2617}
2618
2588static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2619static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2589{ 2620{
2590 struct vcpu_vmx *vmx = to_vmx(vcpu); 2621 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2596,17 +2627,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2596 vect_info = vmx->idt_vectoring_info; 2627 vect_info = vmx->idt_vectoring_info;
2597 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2628 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2598 2629
2630 if (is_machine_check(intr_info))
2631 return handle_machine_check(vcpu, kvm_run);
2632
2599 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2633 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2600 !is_page_fault(intr_info)) 2634 !is_page_fault(intr_info))
2601 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2635 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2602 "intr info 0x%x\n", __func__, vect_info, intr_info); 2636 "intr info 0x%x\n", __func__, vect_info, intr_info);
2603 2637
2604 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
2605 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
2606 set_bit(irq, vcpu->arch.irq_pending);
2607 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
2608 }
2609
2610 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2638 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2611 return 1; /* already handled by vmx_vcpu_run() */ 2639 return 1; /* already handled by vmx_vcpu_run() */
2612 2640
@@ -2628,17 +2656,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2628 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2656 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2629 if (is_page_fault(intr_info)) { 2657 if (is_page_fault(intr_info)) {
2630 /* EPT won't cause page fault directly */ 2658 /* EPT won't cause page fault directly */
2631 if (vm_need_ept()) 2659 if (enable_ept)
2632 BUG(); 2660 BUG();
2633 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2661 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2634 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2662 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2635 (u32)((u64)cr2 >> 32), handler); 2663 (u32)((u64)cr2 >> 32), handler);
2636 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) 2664 if (kvm_event_needs_reinjection(vcpu))
2637 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2665 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2638 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2666 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2639 } 2667 }
2640 2668
2641 if (vcpu->arch.rmode.active && 2669 if (vcpu->arch.rmode.vm86_active &&
2642 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 2670 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2643 error_code)) { 2671 error_code)) {
2644 if (vcpu->arch.halt_request) { 2672 if (vcpu->arch.halt_request) {
@@ -2753,13 +2781,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2753 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); 2781 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2754 skip_emulated_instruction(vcpu); 2782 skip_emulated_instruction(vcpu);
2755 return 1; 2783 return 1;
2756 case 8: 2784 case 8: {
2757 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); 2785 u8 cr8_prev = kvm_get_cr8(vcpu);
2758 skip_emulated_instruction(vcpu); 2786 u8 cr8 = kvm_register_read(vcpu, reg);
2759 if (irqchip_in_kernel(vcpu->kvm)) 2787 kvm_set_cr8(vcpu, cr8);
2760 return 1; 2788 skip_emulated_instruction(vcpu);
2761 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2789 if (irqchip_in_kernel(vcpu->kvm))
2762 return 0; 2790 return 1;
2791 if (cr8_prev <= cr8)
2792 return 1;
2793 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2794 return 0;
2795 }
2763 }; 2796 };
2764 break; 2797 break;
2765 case 2: /* clts */ 2798 case 2: /* clts */
@@ -2957,8 +2990,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2957 * If the user space waits to inject interrupts, exit as soon as 2990 * If the user space waits to inject interrupts, exit as soon as
2958 * possible 2991 * possible
2959 */ 2992 */
2960 if (kvm_run->request_interrupt_window && 2993 if (!irqchip_in_kernel(vcpu->kvm) &&
2961 !vcpu->arch.irq_summary) { 2994 kvm_run->request_interrupt_window &&
2995 !kvm_cpu_has_interrupt(vcpu)) {
2962 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2996 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2963 return 0; 2997 return 0;
2964 } 2998 }
@@ -2980,7 +3014,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2980 3014
2981static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3015static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2982{ 3016{
2983 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3017 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2984 3018
2985 kvm_mmu_invlpg(vcpu, exit_qualification); 3019 kvm_mmu_invlpg(vcpu, exit_qualification);
2986 skip_emulated_instruction(vcpu); 3020 skip_emulated_instruction(vcpu);
@@ -2996,11 +3030,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2996 3030
2997static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3031static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2998{ 3032{
2999 u64 exit_qualification; 3033 unsigned long exit_qualification;
3000 enum emulation_result er; 3034 enum emulation_result er;
3001 unsigned long offset; 3035 unsigned long offset;
3002 3036
3003 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3037 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3004 offset = exit_qualification & 0xffful; 3038 offset = exit_qualification & 0xffful;
3005 3039
3006 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3040 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3019,22 +3053,41 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3019 struct vcpu_vmx *vmx = to_vmx(vcpu); 3053 struct vcpu_vmx *vmx = to_vmx(vcpu);
3020 unsigned long exit_qualification; 3054 unsigned long exit_qualification;
3021 u16 tss_selector; 3055 u16 tss_selector;
3022 int reason; 3056 int reason, type, idt_v;
3057
3058 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3059 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3023 3060
3024 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3061 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3025 3062
3026 reason = (u32)exit_qualification >> 30; 3063 reason = (u32)exit_qualification >> 30;
3027 if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && 3064 if (reason == TASK_SWITCH_GATE && idt_v) {
3028 (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 3065 switch (type) {
3029 (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) 3066 case INTR_TYPE_NMI_INTR:
3030 == INTR_TYPE_NMI_INTR) { 3067 vcpu->arch.nmi_injected = false;
3031 vcpu->arch.nmi_injected = false; 3068 if (cpu_has_virtual_nmis())
3032 if (cpu_has_virtual_nmis()) 3069 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3033 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3070 GUEST_INTR_STATE_NMI);
3034 GUEST_INTR_STATE_NMI); 3071 break;
3072 case INTR_TYPE_EXT_INTR:
3073 case INTR_TYPE_SOFT_INTR:
3074 kvm_clear_interrupt_queue(vcpu);
3075 break;
3076 case INTR_TYPE_HARD_EXCEPTION:
3077 case INTR_TYPE_SOFT_EXCEPTION:
3078 kvm_clear_exception_queue(vcpu);
3079 break;
3080 default:
3081 break;
3082 }
3035 } 3083 }
3036 tss_selector = exit_qualification; 3084 tss_selector = exit_qualification;
3037 3085
3086 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3087 type != INTR_TYPE_EXT_INTR &&
3088 type != INTR_TYPE_NMI_INTR))
3089 skip_emulated_instruction(vcpu);
3090
3038 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3091 if (!kvm_task_switch(vcpu, tss_selector, reason))
3039 return 0; 3092 return 0;
3040 3093
@@ -3051,11 +3104,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3051 3104
3052static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3105static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053{ 3106{
3054 u64 exit_qualification; 3107 unsigned long exit_qualification;
3055 gpa_t gpa; 3108 gpa_t gpa;
3056 int gla_validity; 3109 int gla_validity;
3057 3110
3058 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 3111 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3059 3112
3060 if (exit_qualification & (1 << 6)) { 3113 if (exit_qualification & (1 << 6)) {
3061 printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 3114 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3067,7 +3120,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3067 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 3120 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3068 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 3121 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3069 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 3122 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3070 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS)); 3123 vmcs_readl(GUEST_LINEAR_ADDRESS));
3071 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3124 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3072 (long unsigned int)exit_qualification); 3125 (long unsigned int)exit_qualification);
3073 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3126 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -3150,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3150 [EXIT_REASON_WBINVD] = handle_wbinvd, 3203 [EXIT_REASON_WBINVD] = handle_wbinvd,
3151 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3204 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3152 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3205 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3206 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3153}; 3207};
3154 3208
3155static const int kvm_vmx_max_exit_handlers = 3209static const int kvm_vmx_max_exit_handlers =
@@ -3159,10 +3213,10 @@ static const int kvm_vmx_max_exit_handlers =
3159 * The guest has exited. See if we can fix it or if we need userspace 3213 * The guest has exited. See if we can fix it or if we need userspace
3160 * assistance. 3214 * assistance.
3161 */ 3215 */
3162static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3216static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3163{ 3217{
3164 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
3165 struct vcpu_vmx *vmx = to_vmx(vcpu); 3218 struct vcpu_vmx *vmx = to_vmx(vcpu);
3219 u32 exit_reason = vmx->exit_reason;
3166 u32 vectoring_info = vmx->idt_vectoring_info; 3220 u32 vectoring_info = vmx->idt_vectoring_info;
3167 3221
3168 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3222 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3178,7 +3232,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3178 3232
3179 /* Access CR3 don't cause VMExit in paging mode, so we need 3233 /* Access CR3 don't cause VMExit in paging mode, so we need
3180 * to sync with guest real CR3. */ 3234 * to sync with guest real CR3. */
3181 if (vm_need_ept() && is_paging(vcpu)) { 3235 if (enable_ept && is_paging(vcpu)) {
3182 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3236 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3183 ept_load_pdptrs(vcpu); 3237 ept_load_pdptrs(vcpu);
3184 } 3238 }
@@ -3199,9 +3253,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3199 __func__, vectoring_info, exit_reason); 3253 __func__, vectoring_info, exit_reason);
3200 3254
3201 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { 3255 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3202 if (vcpu->arch.interrupt_window_open) { 3256 if (vmx_interrupt_allowed(vcpu)) {
3203 vmx->soft_vnmi_blocked = 0; 3257 vmx->soft_vnmi_blocked = 0;
3204 vcpu->arch.nmi_window_open = 1;
3205 } else if (vmx->vnmi_blocked_time > 1000000000LL && 3258 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3206 vcpu->arch.nmi_pending) { 3259 vcpu->arch.nmi_pending) {
3207 /* 3260 /*
@@ -3214,7 +3267,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3214 "state on VCPU %d after 1 s timeout\n", 3267 "state on VCPU %d after 1 s timeout\n",
3215 __func__, vcpu->vcpu_id); 3268 __func__, vcpu->vcpu_id);
3216 vmx->soft_vnmi_blocked = 0; 3269 vmx->soft_vnmi_blocked = 0;
3217 vmx->vcpu.arch.nmi_window_open = 1;
3218 } 3270 }
3219 } 3271 }
3220 3272
@@ -3228,122 +3280,107 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3228 return 0; 3280 return 0;
3229} 3281}
3230 3282
3231static void update_tpr_threshold(struct kvm_vcpu *vcpu) 3283static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3232{ 3284{
3233 int max_irr, tpr; 3285 if (irr == -1 || tpr < irr) {
3234
3235 if (!vm_need_tpr_shadow(vcpu->kvm))
3236 return;
3237
3238 if (!kvm_lapic_enabled(vcpu) ||
3239 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
3240 vmcs_write32(TPR_THRESHOLD, 0); 3286 vmcs_write32(TPR_THRESHOLD, 0);
3241 return; 3287 return;
3242 } 3288 }
3243 3289
3244 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; 3290 vmcs_write32(TPR_THRESHOLD, irr);
3245 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
3246} 3291}
3247 3292
3248static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3293static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3249{ 3294{
3250 u32 exit_intr_info; 3295 u32 exit_intr_info;
3251 u32 idt_vectoring_info; 3296 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3252 bool unblock_nmi; 3297 bool unblock_nmi;
3253 u8 vector; 3298 u8 vector;
3254 int type; 3299 int type;
3255 bool idtv_info_valid; 3300 bool idtv_info_valid;
3256 u32 error;
3257 3301
3258 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3302 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3303
3304 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3305
3306 /* Handle machine checks before interrupts are enabled */
3307 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3308 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3309 && is_machine_check(exit_intr_info)))
3310 kvm_machine_check();
3311
3312 /* We need to handle NMIs before interrupts are enabled */
3313 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3314 (exit_intr_info & INTR_INFO_VALID_MASK)) {
3315 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3316 asm("int $2");
3317 }
3318
3319 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3320
3259 if (cpu_has_virtual_nmis()) { 3321 if (cpu_has_virtual_nmis()) {
3260 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3322 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3261 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 3323 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3262 /* 3324 /*
3263 * SDM 3: 25.7.1.2 3325 * SDM 3: 27.7.1.2 (September 2008)
3264 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3326 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3265 * a guest IRET fault. 3327 * a guest IRET fault.
3328 * SDM 3: 23.2.2 (September 2008)
3329 * Bit 12 is undefined in any of the following cases:
3330 * If the VM exit sets the valid bit in the IDT-vectoring
3331 * information field.
3332 * If the VM exit is due to a double fault.
3266 */ 3333 */
3267 if (unblock_nmi && vector != DF_VECTOR) 3334 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3335 vector != DF_VECTOR && !idtv_info_valid)
3268 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3336 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3269 GUEST_INTR_STATE_NMI); 3337 GUEST_INTR_STATE_NMI);
3270 } else if (unlikely(vmx->soft_vnmi_blocked)) 3338 } else if (unlikely(vmx->soft_vnmi_blocked))
3271 vmx->vnmi_blocked_time += 3339 vmx->vnmi_blocked_time +=
3272 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3340 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3273 3341
3274 idt_vectoring_info = vmx->idt_vectoring_info; 3342 vmx->vcpu.arch.nmi_injected = false;
3275 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3343 kvm_clear_exception_queue(&vmx->vcpu);
3344 kvm_clear_interrupt_queue(&vmx->vcpu);
3345
3346 if (!idtv_info_valid)
3347 return;
3348
3276 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3349 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3277 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3350 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3278 if (vmx->vcpu.arch.nmi_injected) { 3351
3352 switch (type) {
3353 case INTR_TYPE_NMI_INTR:
3354 vmx->vcpu.arch.nmi_injected = true;
3279 /* 3355 /*
3280 * SDM 3: 25.7.1.2 3356 * SDM 3: 27.7.1.2 (September 2008)
3281 * Clear bit "block by NMI" before VM entry if a NMI delivery 3357 * Clear bit "block by NMI" before VM entry if a NMI
3282 * faulted. 3358 * delivery faulted.
3283 */ 3359 */
3284 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) 3360 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3285 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 3361 GUEST_INTR_STATE_NMI);
3286 GUEST_INTR_STATE_NMI); 3362 break;
3287 else 3363 case INTR_TYPE_SOFT_EXCEPTION:
3288 vmx->vcpu.arch.nmi_injected = false; 3364 vmx->vcpu.arch.event_exit_inst_len =
3289 } 3365 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3290 kvm_clear_exception_queue(&vmx->vcpu); 3366 /* fall through */
3291 if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION || 3367 case INTR_TYPE_HARD_EXCEPTION:
3292 type == INTR_TYPE_SOFT_EXCEPTION)) {
3293 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3368 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3294 error = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3369 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3295 kvm_queue_exception_e(&vmx->vcpu, vector, error); 3370 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3296 } else 3371 } else
3297 kvm_queue_exception(&vmx->vcpu, vector); 3372 kvm_queue_exception(&vmx->vcpu, vector);
3298 vmx->idt_vectoring_info = 0; 3373 break;
3299 } 3374 case INTR_TYPE_SOFT_INTR:
3300 kvm_clear_interrupt_queue(&vmx->vcpu); 3375 vmx->vcpu.arch.event_exit_inst_len =
3301 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { 3376 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3302 kvm_queue_interrupt(&vmx->vcpu, vector); 3377 /* fall through */
3303 vmx->idt_vectoring_info = 0; 3378 case INTR_TYPE_EXT_INTR:
3304 } 3379 kvm_queue_interrupt(&vmx->vcpu, vector,
3305} 3380 type == INTR_TYPE_SOFT_INTR);
3306 3381 break;
3307static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3382 default:
3308{ 3383 break;
3309 update_tpr_threshold(vcpu);
3310
3311 vmx_update_window_states(vcpu);
3312
3313 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3314 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3315 GUEST_INTR_STATE_STI |
3316 GUEST_INTR_STATE_MOV_SS);
3317
3318 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3319 if (vcpu->arch.interrupt.pending) {
3320 enable_nmi_window(vcpu);
3321 } else if (vcpu->arch.nmi_window_open) {
3322 vcpu->arch.nmi_pending = false;
3323 vcpu->arch.nmi_injected = true;
3324 } else {
3325 enable_nmi_window(vcpu);
3326 return;
3327 }
3328 }
3329 if (vcpu->arch.nmi_injected) {
3330 vmx_inject_nmi(vcpu);
3331 if (vcpu->arch.nmi_pending)
3332 enable_nmi_window(vcpu);
3333 else if (kvm_cpu_has_interrupt(vcpu))
3334 enable_irq_window(vcpu);
3335 return;
3336 }
3337 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3338 if (vcpu->arch.interrupt_window_open)
3339 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3340 else
3341 enable_irq_window(vcpu);
3342 }
3343 if (vcpu->arch.interrupt.pending) {
3344 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3345 if (kvm_cpu_has_interrupt(vcpu))
3346 enable_irq_window(vcpu);
3347 } 3384 }
3348} 3385}
3349 3386
@@ -3381,7 +3418,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3381static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3418static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3382{ 3419{
3383 struct vcpu_vmx *vmx = to_vmx(vcpu); 3420 struct vcpu_vmx *vmx = to_vmx(vcpu);
3384 u32 intr_info;
3385 3421
3386 /* Record the guest's net vcpu time for enforced NMI injections. */ 3422 /* Record the guest's net vcpu time for enforced NMI injections. */
3387 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3423 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3505,20 +3541,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3505 if (vmx->rmode.irq.pending) 3541 if (vmx->rmode.irq.pending)
3506 fixup_rmode_irq(vmx); 3542 fixup_rmode_irq(vmx);
3507 3543
3508 vmx_update_window_states(vcpu);
3509
3510 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3544 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3511 vmx->launched = 1; 3545 vmx->launched = 1;
3512 3546
3513 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3514
3515 /* We need to handle NMIs before interrupts are enabled */
3516 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3517 (intr_info & INTR_INFO_VALID_MASK)) {
3518 KVMTRACE_0D(NMI, vcpu, handler);
3519 asm("int $2");
3520 }
3521
3522 vmx_complete_interrupts(vmx); 3547 vmx_complete_interrupts(vmx);
3523} 3548}
3524 3549
@@ -3593,7 +3618,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3593 if (alloc_apic_access_page(kvm) != 0) 3618 if (alloc_apic_access_page(kvm) != 0)
3594 goto free_vmcs; 3619 goto free_vmcs;
3595 3620
3596 if (vm_need_ept()) 3621 if (enable_ept)
3597 if (alloc_identity_pagetable(kvm) != 0) 3622 if (alloc_identity_pagetable(kvm) != 0)
3598 goto free_vmcs; 3623 goto free_vmcs;
3599 3624
@@ -3631,9 +3656,32 @@ static int get_ept_level(void)
3631 return VMX_EPT_DEFAULT_GAW + 1; 3656 return VMX_EPT_DEFAULT_GAW + 1;
3632} 3657}
3633 3658
3634static int vmx_get_mt_mask_shift(void) 3659static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3635{ 3660{
3636 return VMX_EPT_MT_EPTE_SHIFT; 3661 u64 ret;
3662
3663 /* For VT-d and EPT combination
3664 * 1. MMIO: always map as UC
3665 * 2. EPT with VT-d:
3666 * a. VT-d without snooping control feature: can't guarantee the
3667 * result, try to trust guest.
3668 * b. VT-d with snooping control feature: snooping control feature of
3669 * VT-d engine can guarantee the cache correctness. Just set it
3670 * to WB to keep consistent with host. So the same as item 3.
3671 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
3672 * consistent with host MTRR
3673 */
3674 if (is_mmio)
3675 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
3676 else if (vcpu->kvm->arch.iommu_domain &&
3677 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
3678 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
3679 VMX_EPT_MT_EPTE_SHIFT;
3680 else
3681 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3682 | VMX_EPT_IGMT_BIT;
3683
3684 return ret;
3637} 3685}
3638 3686
3639static struct kvm_x86_ops vmx_x86_ops = { 3687static struct kvm_x86_ops vmx_x86_ops = {
@@ -3644,7 +3692,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3644 .check_processor_compatibility = vmx_check_processor_compat, 3692 .check_processor_compatibility = vmx_check_processor_compat,
3645 .hardware_enable = hardware_enable, 3693 .hardware_enable = hardware_enable,
3646 .hardware_disable = hardware_disable, 3694 .hardware_disable = hardware_disable,
3647 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, 3695 .cpu_has_accelerated_tpr = report_flexpriority,
3648 3696
3649 .vcpu_create = vmx_create_vcpu, 3697 .vcpu_create = vmx_create_vcpu,
3650 .vcpu_free = vmx_free_vcpu, 3698 .vcpu_free = vmx_free_vcpu,
@@ -3678,78 +3726,82 @@ static struct kvm_x86_ops vmx_x86_ops = {
3678 .tlb_flush = vmx_flush_tlb, 3726 .tlb_flush = vmx_flush_tlb,
3679 3727
3680 .run = vmx_vcpu_run, 3728 .run = vmx_vcpu_run,
3681 .handle_exit = kvm_handle_exit, 3729 .handle_exit = vmx_handle_exit,
3682 .skip_emulated_instruction = skip_emulated_instruction, 3730 .skip_emulated_instruction = skip_emulated_instruction,
3731 .set_interrupt_shadow = vmx_set_interrupt_shadow,
3732 .get_interrupt_shadow = vmx_get_interrupt_shadow,
3683 .patch_hypercall = vmx_patch_hypercall, 3733 .patch_hypercall = vmx_patch_hypercall,
3684 .get_irq = vmx_get_irq,
3685 .set_irq = vmx_inject_irq, 3734 .set_irq = vmx_inject_irq,
3735 .set_nmi = vmx_inject_nmi,
3686 .queue_exception = vmx_queue_exception, 3736 .queue_exception = vmx_queue_exception,
3687 .exception_injected = vmx_exception_injected, 3737 .interrupt_allowed = vmx_interrupt_allowed,
3688 .inject_pending_irq = vmx_intr_assist, 3738 .nmi_allowed = vmx_nmi_allowed,
3689 .inject_pending_vectors = do_interrupt_requests, 3739 .enable_nmi_window = enable_nmi_window,
3740 .enable_irq_window = enable_irq_window,
3741 .update_cr8_intercept = update_cr8_intercept,
3690 3742
3691 .set_tss_addr = vmx_set_tss_addr, 3743 .set_tss_addr = vmx_set_tss_addr,
3692 .get_tdp_level = get_ept_level, 3744 .get_tdp_level = get_ept_level,
3693 .get_mt_mask_shift = vmx_get_mt_mask_shift, 3745 .get_mt_mask = vmx_get_mt_mask,
3694}; 3746};
3695 3747
3696static int __init vmx_init(void) 3748static int __init vmx_init(void)
3697{ 3749{
3698 void *va;
3699 int r; 3750 int r;
3700 3751
3701 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3752 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3702 if (!vmx_io_bitmap_a) 3753 if (!vmx_io_bitmap_a)
3703 return -ENOMEM; 3754 return -ENOMEM;
3704 3755
3705 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3756 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
3706 if (!vmx_io_bitmap_b) { 3757 if (!vmx_io_bitmap_b) {
3707 r = -ENOMEM; 3758 r = -ENOMEM;
3708 goto out; 3759 goto out;
3709 } 3760 }
3710 3761
3711 vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 3762 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3712 if (!vmx_msr_bitmap) { 3763 if (!vmx_msr_bitmap_legacy) {
3713 r = -ENOMEM; 3764 r = -ENOMEM;
3714 goto out1; 3765 goto out1;
3715 } 3766 }
3716 3767
3768 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3769 if (!vmx_msr_bitmap_longmode) {
3770 r = -ENOMEM;
3771 goto out2;
3772 }
3773
3717 /* 3774 /*
3718 * Allow direct access to the PC debug port (it is often used for I/O 3775 * Allow direct access to the PC debug port (it is often used for I/O
3719 * delays, but the vmexits simply slow things down). 3776 * delays, but the vmexits simply slow things down).
3720 */ 3777 */
3721 va = kmap(vmx_io_bitmap_a); 3778 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
3722 memset(va, 0xff, PAGE_SIZE); 3779 clear_bit(0x80, vmx_io_bitmap_a);
3723 clear_bit(0x80, va);
3724 kunmap(vmx_io_bitmap_a);
3725 3780
3726 va = kmap(vmx_io_bitmap_b); 3781 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3727 memset(va, 0xff, PAGE_SIZE);
3728 kunmap(vmx_io_bitmap_b);
3729 3782
3730 va = kmap(vmx_msr_bitmap); 3783 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3731 memset(va, 0xff, PAGE_SIZE); 3784 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3732 kunmap(vmx_msr_bitmap);
3733 3785
3734 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 3786 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3735 3787
3736 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 3788 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3737 if (r) 3789 if (r)
3738 goto out2; 3790 goto out3;
3739 3791
3740 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); 3792 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3741 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); 3793 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3742 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); 3794 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3743 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3795 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3744 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3796 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3797 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3745 3798
3746 if (vm_need_ept()) { 3799 if (enable_ept) {
3747 bypass_guest_pf = 0; 3800 bypass_guest_pf = 0;
3748 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | 3801 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3749 VMX_EPT_WRITABLE_MASK); 3802 VMX_EPT_WRITABLE_MASK);
3750 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 3803 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3751 VMX_EPT_EXECUTABLE_MASK, 3804 VMX_EPT_EXECUTABLE_MASK);
3752 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3753 kvm_enable_tdp(); 3805 kvm_enable_tdp();
3754 } else 3806 } else
3755 kvm_disable_tdp(); 3807 kvm_disable_tdp();
@@ -3761,20 +3813,23 @@ static int __init vmx_init(void)
3761 3813
3762 return 0; 3814 return 0;
3763 3815
3816out3:
3817 free_page((unsigned long)vmx_msr_bitmap_longmode);
3764out2: 3818out2:
3765 __free_page(vmx_msr_bitmap); 3819 free_page((unsigned long)vmx_msr_bitmap_legacy);
3766out1: 3820out1:
3767 __free_page(vmx_io_bitmap_b); 3821 free_page((unsigned long)vmx_io_bitmap_b);
3768out: 3822out:
3769 __free_page(vmx_io_bitmap_a); 3823 free_page((unsigned long)vmx_io_bitmap_a);
3770 return r; 3824 return r;
3771} 3825}
3772 3826
3773static void __exit vmx_exit(void) 3827static void __exit vmx_exit(void)
3774{ 3828{
3775 __free_page(vmx_msr_bitmap); 3829 free_page((unsigned long)vmx_msr_bitmap_legacy);
3776 __free_page(vmx_io_bitmap_b); 3830 free_page((unsigned long)vmx_msr_bitmap_longmode);
3777 __free_page(vmx_io_bitmap_a); 3831 free_page((unsigned long)vmx_io_bitmap_b);
3832 free_page((unsigned long)vmx_io_bitmap_a);
3778 3833
3779 kvm_exit(); 3834 kvm_exit();
3780} 3835}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e917e794..249540f98513 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 91 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
92 { "hypercalls", VCPU_STAT(hypercalls) }, 92 { "hypercalls", VCPU_STAT(hypercalls) },
93 { "request_irq", VCPU_STAT(request_irq_exits) }, 93 { "request_irq", VCPU_STAT(request_irq_exits) },
94 { "request_nmi", VCPU_STAT(request_nmi_exits) },
95 { "irq_exits", VCPU_STAT(irq_exits) }, 94 { "irq_exits", VCPU_STAT(irq_exits) },
96 { "host_state_reload", VCPU_STAT(host_state_reload) }, 95 { "host_state_reload", VCPU_STAT(host_state_reload) },
97 { "efer_reload", VCPU_STAT(efer_reload) }, 96 { "efer_reload", VCPU_STAT(efer_reload) },
@@ -108,7 +107,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
108 { "mmu_recycled", VM_STAT(mmu_recycled) }, 107 { "mmu_recycled", VM_STAT(mmu_recycled) },
109 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 108 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
110 { "mmu_unsync", VM_STAT(mmu_unsync) }, 109 { "mmu_unsync", VM_STAT(mmu_unsync) },
111 { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
112 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 110 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
113 { "largepages", VM_STAT(lpages) }, 111 { "largepages", VM_STAT(lpages) },
114 { NULL } 112 { NULL }
@@ -234,7 +232,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
234 goto out; 232 goto out;
235 } 233 }
236 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
237 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 235 if (is_present_pte(pdpte[i]) &&
236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
238 ret = 0; 237 ret = 0;
239 goto out; 238 goto out;
240 } 239 }
@@ -321,7 +320,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
321 kvm_x86_ops->set_cr0(vcpu, cr0); 320 kvm_x86_ops->set_cr0(vcpu, cr0);
322 vcpu->arch.cr0 = cr0; 321 vcpu->arch.cr0 = cr0;
323 322
324 kvm_mmu_sync_global(vcpu);
325 kvm_mmu_reset_context(vcpu); 323 kvm_mmu_reset_context(vcpu);
326 return; 324 return;
327} 325}
@@ -370,7 +368,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
370 kvm_x86_ops->set_cr4(vcpu, cr4); 368 kvm_x86_ops->set_cr4(vcpu, cr4);
371 vcpu->arch.cr4 = cr4; 369 vcpu->arch.cr4 = cr4;
372 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; 370 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
373 kvm_mmu_sync_global(vcpu);
374 kvm_mmu_reset_context(vcpu); 371 kvm_mmu_reset_context(vcpu);
375} 372}
376EXPORT_SYMBOL_GPL(kvm_set_cr4); 373EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -523,6 +520,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
523 efer |= vcpu->arch.shadow_efer & EFER_LMA; 520 efer |= vcpu->arch.shadow_efer & EFER_LMA;
524 521
525 vcpu->arch.shadow_efer = efer; 522 vcpu->arch.shadow_efer = efer;
523
524 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
525 kvm_mmu_reset_context(vcpu);
526} 526}
527 527
528void kvm_enable_efer_bits(u64 mask) 528void kvm_enable_efer_bits(u64 mask)
@@ -630,14 +630,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
630 unsigned long flags; 630 unsigned long flags;
631 struct kvm_vcpu_arch *vcpu = &v->arch; 631 struct kvm_vcpu_arch *vcpu = &v->arch;
632 void *shared_kaddr; 632 void *shared_kaddr;
633 unsigned long this_tsc_khz;
633 634
634 if ((!vcpu->time_page)) 635 if ((!vcpu->time_page))
635 return; 636 return;
636 637
637 if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) { 638 this_tsc_khz = get_cpu_var(cpu_tsc_khz);
638 kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock); 639 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
639 vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz); 640 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
641 vcpu->hv_clock_tsc_khz = this_tsc_khz;
640 } 642 }
643 put_cpu_var(cpu_tsc_khz);
641 644
642 /* Keep irq disabled to prevent changes to the clock */ 645 /* Keep irq disabled to prevent changes to the clock */
643 local_irq_save(flags); 646 local_irq_save(flags);
@@ -893,6 +896,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
893 case MSR_IA32_LASTINTFROMIP: 896 case MSR_IA32_LASTINTFROMIP:
894 case MSR_IA32_LASTINTTOIP: 897 case MSR_IA32_LASTINTTOIP:
895 case MSR_VM_HSAVE_PA: 898 case MSR_VM_HSAVE_PA:
899 case MSR_P6_EVNTSEL0:
900 case MSR_P6_EVNTSEL1:
896 data = 0; 901 data = 0;
897 break; 902 break;
898 case MSR_MTRRcap: 903 case MSR_MTRRcap:
@@ -1024,6 +1029,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1024 case KVM_CAP_SYNC_MMU: 1029 case KVM_CAP_SYNC_MMU:
1025 case KVM_CAP_REINJECT_CONTROL: 1030 case KVM_CAP_REINJECT_CONTROL:
1026 case KVM_CAP_IRQ_INJECT_STATUS: 1031 case KVM_CAP_IRQ_INJECT_STATUS:
1032 case KVM_CAP_ASSIGN_DEV_IRQ:
1027 r = 1; 1033 r = 1;
1028 break; 1034 break;
1029 case KVM_CAP_COALESCED_MMIO: 1035 case KVM_CAP_COALESCED_MMIO:
@@ -1241,41 +1247,53 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1241 entry->flags = 0; 1247 entry->flags = 0;
1242} 1248}
1243 1249
1250#define F(x) bit(X86_FEATURE_##x)
1251
1244static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 1252static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1245 u32 index, int *nent, int maxnent) 1253 u32 index, int *nent, int maxnent)
1246{ 1254{
1247 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | 1255 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1248 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1249 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1250 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1251 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1252 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
1253 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1254 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
1255 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
1256 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
1257 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
1258 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
1259 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
1260 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
1261 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
1262 bit(X86_FEATURE_PGE) |
1263 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
1264 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
1265 bit(X86_FEATURE_SYSCALL) |
1266 (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
1267#ifdef CONFIG_X86_64 1256#ifdef CONFIG_X86_64
1268 bit(X86_FEATURE_LM) | 1257 unsigned f_lm = F(LM);
1258#else
1259 unsigned f_lm = 0;
1269#endif 1260#endif
1270 bit(X86_FEATURE_FXSR_OPT) | 1261
1271 bit(X86_FEATURE_MMXEXT) | 1262 /* cpuid 1.edx */
1272 bit(X86_FEATURE_3DNOWEXT) | 1263 const u32 kvm_supported_word0_x86_features =
1273 bit(X86_FEATURE_3DNOW); 1264 F(FPU) | F(VME) | F(DE) | F(PSE) |
1274 const u32 kvm_supported_word3_x86_features = 1265 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1275 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); 1266 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1267 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1268 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1269 0 /* Reserved, DS, ACPI */ | F(MMX) |
1270 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1271 0 /* HTT, TM, Reserved, PBE */;
1272 /* cpuid 0x80000001.edx */
1273 const u32 kvm_supported_word1_x86_features =
1274 F(FPU) | F(VME) | F(DE) | F(PSE) |
1275 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1276 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1277 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1278 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1279 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1280 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1281 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1282 /* cpuid 1.ecx */
1283 const u32 kvm_supported_word4_x86_features =
1284 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1285 0 /* DS-CPL, VMX, SMX, EST */ |
1286 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1287 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1288 0 /* Reserved, DCA */ | F(XMM4_1) |
1289 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
1290 0 /* Reserved, XSAVE, OSXSAVE */;
1291 /* cpuid 0x80000001.ecx */
1276 const u32 kvm_supported_word6_x86_features = 1292 const u32 kvm_supported_word6_x86_features =
1277 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) | 1293 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1278 bit(X86_FEATURE_SVM); 1294 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1295 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1296 0 /* SKINIT */ | 0 /* WDT */;
1279 1297
1280 /* all calls to cpuid_count() should be made on the same cpu */ 1298 /* all calls to cpuid_count() should be made on the same cpu */
1281 get_cpu(); 1299 get_cpu();
@@ -1288,7 +1306,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1288 break; 1306 break;
1289 case 1: 1307 case 1:
1290 entry->edx &= kvm_supported_word0_x86_features; 1308 entry->edx &= kvm_supported_word0_x86_features;
1291 entry->ecx &= kvm_supported_word3_x86_features; 1309 entry->ecx &= kvm_supported_word4_x86_features;
1292 break; 1310 break;
1293 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1311 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1294 * may return different values. This forces us to get_cpu() before 1312 * may return different values. This forces us to get_cpu() before
@@ -1350,6 +1368,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1350 put_cpu(); 1368 put_cpu();
1351} 1369}
1352 1370
1371#undef F
1372
1353static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 1373static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1354 struct kvm_cpuid_entry2 __user *entries) 1374 struct kvm_cpuid_entry2 __user *entries)
1355{ 1375{
@@ -1421,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1421 return -ENXIO; 1441 return -ENXIO;
1422 vcpu_load(vcpu); 1442 vcpu_load(vcpu);
1423 1443
1424 set_bit(irq->irq, vcpu->arch.irq_pending); 1444 kvm_queue_interrupt(vcpu, irq->irq, false);
1425 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1426 1445
1427 vcpu_put(vcpu); 1446 vcpu_put(vcpu);
1428 1447
@@ -1584,8 +1603,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1584 r = -EINVAL; 1603 r = -EINVAL;
1585 } 1604 }
1586out: 1605out:
1587 if (lapic) 1606 kfree(lapic);
1588 kfree(lapic);
1589 return r; 1607 return r;
1590} 1608}
1591 1609
@@ -1606,10 +1624,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1606 return -EINVAL; 1624 return -EINVAL;
1607 1625
1608 down_write(&kvm->slots_lock); 1626 down_write(&kvm->slots_lock);
1627 spin_lock(&kvm->mmu_lock);
1609 1628
1610 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 1629 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1611 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 1630 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1612 1631
1632 spin_unlock(&kvm->mmu_lock);
1613 up_write(&kvm->slots_lock); 1633 up_write(&kvm->slots_lock);
1614 return 0; 1634 return 0;
1615} 1635}
@@ -1785,7 +1805,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1785 1805
1786 /* If nothing is dirty, don't bother messing with page tables. */ 1806 /* If nothing is dirty, don't bother messing with page tables. */
1787 if (is_dirty) { 1807 if (is_dirty) {
1808 spin_lock(&kvm->mmu_lock);
1788 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1809 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1810 spin_unlock(&kvm->mmu_lock);
1789 kvm_flush_remote_tlbs(kvm); 1811 kvm_flush_remote_tlbs(kvm);
1790 memslot = &kvm->memslots[log->slot]; 1812 memslot = &kvm->memslots[log->slot];
1791 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1813 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -2360,7 +2382,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2360 u16 error_code, 2382 u16 error_code,
2361 int emulation_type) 2383 int emulation_type)
2362{ 2384{
2363 int r; 2385 int r, shadow_mask;
2364 struct decode_cache *c; 2386 struct decode_cache *c;
2365 2387
2366 kvm_clear_exception_queue(vcpu); 2388 kvm_clear_exception_queue(vcpu);
@@ -2408,7 +2430,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2408 } 2430 }
2409 } 2431 }
2410 2432
2433 if (emulation_type & EMULTYPE_SKIP) {
2434 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2435 return EMULATE_DONE;
2436 }
2437
2411 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2438 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2439 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2440
2441 if (r == 0)
2442 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2412 2443
2413 if (vcpu->arch.pio.string) 2444 if (vcpu->arch.pio.string)
2414 return EMULATE_DO_MMIO; 2445 return EMULATE_DO_MMIO;
@@ -2761,7 +2792,7 @@ int kvm_arch_init(void *opaque)
2761 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 2792 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2762 kvm_mmu_set_base_ptes(PT_PRESENT_MASK); 2793 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2763 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 2794 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2764 PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); 2795 PT_DIRTY_MASK, PT64_NX_MASK, 0);
2765 2796
2766 for_each_possible_cpu(cpu) 2797 for_each_possible_cpu(cpu)
2767 per_cpu(cpu_tsc_khz, cpu) = tsc_khz; 2798 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
@@ -3012,6 +3043,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3012 return best; 3043 return best;
3013} 3044}
3014 3045
3046int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3047{
3048 struct kvm_cpuid_entry2 *best;
3049
3050 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3051 if (best)
3052 return best->eax & 0xff;
3053 return 36;
3054}
3055
3015void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 3056void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3016{ 3057{
3017 u32 function, index; 3058 u32 function, index;
@@ -3048,10 +3089,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3048static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 3089static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3049 struct kvm_run *kvm_run) 3090 struct kvm_run *kvm_run)
3050{ 3091{
3051 return (!vcpu->arch.irq_summary && 3092 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3052 kvm_run->request_interrupt_window && 3093 kvm_run->request_interrupt_window &&
3053 vcpu->arch.interrupt_window_open && 3094 kvm_arch_interrupt_allowed(vcpu));
3054 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
3055} 3095}
3056 3096
3057static void post_kvm_run_save(struct kvm_vcpu *vcpu, 3097static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3064,8 +3104,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3064 kvm_run->ready_for_interrupt_injection = 1; 3104 kvm_run->ready_for_interrupt_injection = 1;
3065 else 3105 else
3066 kvm_run->ready_for_interrupt_injection = 3106 kvm_run->ready_for_interrupt_injection =
3067 (vcpu->arch.interrupt_window_open && 3107 kvm_arch_interrupt_allowed(vcpu) &&
3068 vcpu->arch.irq_summary == 0); 3108 !kvm_cpu_has_interrupt(vcpu) &&
3109 !kvm_event_needs_reinjection(vcpu);
3069} 3110}
3070 3111
3071static void vapic_enter(struct kvm_vcpu *vcpu) 3112static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3094,9 +3135,63 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
3094 up_read(&vcpu->kvm->slots_lock); 3135 up_read(&vcpu->kvm->slots_lock);
3095} 3136}
3096 3137
3138static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3139{
3140 int max_irr, tpr;
3141
3142 if (!kvm_x86_ops->update_cr8_intercept)
3143 return;
3144
3145 if (!vcpu->arch.apic->vapic_addr)
3146 max_irr = kvm_lapic_find_highest_irr(vcpu);
3147 else
3148 max_irr = -1;
3149
3150 if (max_irr != -1)
3151 max_irr >>= 4;
3152
3153 tpr = kvm_lapic_get_cr8(vcpu);
3154
3155 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3156}
3157
3158static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3159{
3160 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3161 kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3162
3163 /* try to reinject previous events if any */
3164 if (vcpu->arch.nmi_injected) {
3165 kvm_x86_ops->set_nmi(vcpu);
3166 return;
3167 }
3168
3169 if (vcpu->arch.interrupt.pending) {
3170 kvm_x86_ops->set_irq(vcpu);
3171 return;
3172 }
3173
3174 /* try to inject new event if pending */
3175 if (vcpu->arch.nmi_pending) {
3176 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3177 vcpu->arch.nmi_pending = false;
3178 vcpu->arch.nmi_injected = true;
3179 kvm_x86_ops->set_nmi(vcpu);
3180 }
3181 } else if (kvm_cpu_has_interrupt(vcpu)) {
3182 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3183 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3184 false);
3185 kvm_x86_ops->set_irq(vcpu);
3186 }
3187 }
3188}
3189
3097static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3190static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3098{ 3191{
3099 int r; 3192 int r;
3193 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3194 kvm_run->request_interrupt_window;
3100 3195
3101 if (vcpu->requests) 3196 if (vcpu->requests)
3102 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 3197 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3128,9 +3223,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3128 } 3223 }
3129 } 3224 }
3130 3225
3131 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3132 kvm_inject_pending_timer_irqs(vcpu);
3133
3134 preempt_disable(); 3226 preempt_disable();
3135 3227
3136 kvm_x86_ops->prepare_guest_switch(vcpu); 3228 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3138,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3138 3230
3139 local_irq_disable(); 3231 local_irq_disable();
3140 3232
3233 clear_bit(KVM_REQ_KICK, &vcpu->requests);
3234 smp_mb__after_clear_bit();
3235
3141 if (vcpu->requests || need_resched() || signal_pending(current)) { 3236 if (vcpu->requests || need_resched() || signal_pending(current)) {
3142 local_irq_enable(); 3237 local_irq_enable();
3143 preempt_enable(); 3238 preempt_enable();
@@ -3145,21 +3240,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3145 goto out; 3240 goto out;
3146 } 3241 }
3147 3242
3148 vcpu->guest_mode = 1;
3149 /*
3150 * Make sure that guest_mode assignment won't happen after
3151 * testing the pending IRQ vector bitmap.
3152 */
3153 smp_wmb();
3154
3155 if (vcpu->arch.exception.pending) 3243 if (vcpu->arch.exception.pending)
3156 __queue_exception(vcpu); 3244 __queue_exception(vcpu);
3157 else if (irqchip_in_kernel(vcpu->kvm))
3158 kvm_x86_ops->inject_pending_irq(vcpu);
3159 else 3245 else
3160 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 3246 inject_pending_irq(vcpu, kvm_run);
3161 3247
3162 kvm_lapic_sync_to_vapic(vcpu); 3248 /* enable NMI/IRQ window open exits if needed */
3249 if (vcpu->arch.nmi_pending)
3250 kvm_x86_ops->enable_nmi_window(vcpu);
3251 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3252 kvm_x86_ops->enable_irq_window(vcpu);
3253
3254 if (kvm_lapic_enabled(vcpu)) {
3255 update_cr8_intercept(vcpu);
3256 kvm_lapic_sync_to_vapic(vcpu);
3257 }
3163 3258
3164 up_read(&vcpu->kvm->slots_lock); 3259 up_read(&vcpu->kvm->slots_lock);
3165 3260
@@ -3193,7 +3288,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3193 set_debugreg(vcpu->arch.host_dr6, 6); 3288 set_debugreg(vcpu->arch.host_dr6, 6);
3194 set_debugreg(vcpu->arch.host_dr7, 7); 3289 set_debugreg(vcpu->arch.host_dr7, 7);
3195 3290
3196 vcpu->guest_mode = 0; 3291 set_bit(KVM_REQ_KICK, &vcpu->requests);
3197 local_irq_enable(); 3292 local_irq_enable();
3198 3293
3199 ++vcpu->stat.exits; 3294 ++vcpu->stat.exits;
@@ -3220,8 +3315,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3220 profile_hit(KVM_PROFILING, (void *)rip); 3315 profile_hit(KVM_PROFILING, (void *)rip);
3221 } 3316 }
3222 3317
3223 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
3224 vcpu->arch.exception.pending = false;
3225 3318
3226 kvm_lapic_sync_from_vapic(vcpu); 3319 kvm_lapic_sync_from_vapic(vcpu);
3227 3320
@@ -3230,6 +3323,7 @@ out:
3230 return r; 3323 return r;
3231} 3324}
3232 3325
3326
3233static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3327static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3234{ 3328{
3235 int r; 3329 int r;
@@ -3256,29 +3350,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3256 kvm_vcpu_block(vcpu); 3350 kvm_vcpu_block(vcpu);
3257 down_read(&vcpu->kvm->slots_lock); 3351 down_read(&vcpu->kvm->slots_lock);
3258 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 3352 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3259 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3353 {
3354 switch(vcpu->arch.mp_state) {
3355 case KVM_MP_STATE_HALTED:
3260 vcpu->arch.mp_state = 3356 vcpu->arch.mp_state =
3261 KVM_MP_STATE_RUNNABLE; 3357 KVM_MP_STATE_RUNNABLE;
3262 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) 3358 case KVM_MP_STATE_RUNNABLE:
3263 r = -EINTR; 3359 break;
3360 case KVM_MP_STATE_SIPI_RECEIVED:
3361 default:
3362 r = -EINTR;
3363 break;
3364 }
3365 }
3264 } 3366 }
3265 3367
3266 if (r > 0) { 3368 if (r <= 0)
3267 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 3369 break;
3268 r = -EINTR; 3370
3269 kvm_run->exit_reason = KVM_EXIT_INTR; 3371 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3270 ++vcpu->stat.request_irq_exits; 3372 if (kvm_cpu_has_pending_timer(vcpu))
3271 } 3373 kvm_inject_pending_timer_irqs(vcpu);
3272 if (signal_pending(current)) { 3374
3273 r = -EINTR; 3375 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3274 kvm_run->exit_reason = KVM_EXIT_INTR; 3376 r = -EINTR;
3275 ++vcpu->stat.signal_exits; 3377 kvm_run->exit_reason = KVM_EXIT_INTR;
3276 } 3378 ++vcpu->stat.request_irq_exits;
3277 if (need_resched()) { 3379 }
3278 up_read(&vcpu->kvm->slots_lock); 3380 if (signal_pending(current)) {
3279 kvm_resched(vcpu); 3381 r = -EINTR;
3280 down_read(&vcpu->kvm->slots_lock); 3382 kvm_run->exit_reason = KVM_EXIT_INTR;
3281 } 3383 ++vcpu->stat.signal_exits;
3384 }
3385 if (need_resched()) {
3386 up_read(&vcpu->kvm->slots_lock);
3387 kvm_resched(vcpu);
3388 down_read(&vcpu->kvm->slots_lock);
3282 } 3389 }
3283 } 3390 }
3284 3391
@@ -3442,7 +3549,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3442 struct kvm_sregs *sregs) 3549 struct kvm_sregs *sregs)
3443{ 3550{
3444 struct descriptor_table dt; 3551 struct descriptor_table dt;
3445 int pending_vec;
3446 3552
3447 vcpu_load(vcpu); 3553 vcpu_load(vcpu);
3448 3554
@@ -3472,16 +3578,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3472 sregs->efer = vcpu->arch.shadow_efer; 3578 sregs->efer = vcpu->arch.shadow_efer;
3473 sregs->apic_base = kvm_get_apic_base(vcpu); 3579 sregs->apic_base = kvm_get_apic_base(vcpu);
3474 3580
3475 if (irqchip_in_kernel(vcpu->kvm)) { 3581 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3476 memset(sregs->interrupt_bitmap, 0, 3582
3477 sizeof sregs->interrupt_bitmap); 3583 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3478 pending_vec = kvm_x86_ops->get_irq(vcpu); 3584 set_bit(vcpu->arch.interrupt.nr,
3479 if (pending_vec >= 0) 3585 (unsigned long *)sregs->interrupt_bitmap);
3480 set_bit(pending_vec,
3481 (unsigned long *)sregs->interrupt_bitmap);
3482 } else
3483 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
3484 sizeof sregs->interrupt_bitmap);
3485 3586
3486 vcpu_put(vcpu); 3587 vcpu_put(vcpu);
3487 3588
@@ -3688,7 +3789,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3688 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); 3789 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3689 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); 3790 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3690 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 3791 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3691 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3692} 3792}
3693 3793
3694static int load_state_from_tss32(struct kvm_vcpu *vcpu, 3794static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3785,8 +3885,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3785} 3885}
3786 3886
3787static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3887static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3788 u32 old_tss_base, 3888 u16 old_tss_sel, u32 old_tss_base,
3789 struct desc_struct *nseg_desc) 3889 struct desc_struct *nseg_desc)
3790{ 3890{
3791 struct tss_segment_16 tss_segment_16; 3891 struct tss_segment_16 tss_segment_16;
3792 int ret = 0; 3892 int ret = 0;
@@ -3805,6 +3905,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3805 &tss_segment_16, sizeof tss_segment_16)) 3905 &tss_segment_16, sizeof tss_segment_16))
3806 goto out; 3906 goto out;
3807 3907
3908 if (old_tss_sel != 0xffff) {
3909 tss_segment_16.prev_task_link = old_tss_sel;
3910
3911 if (kvm_write_guest(vcpu->kvm,
3912 get_tss_base_addr(vcpu, nseg_desc),
3913 &tss_segment_16.prev_task_link,
3914 sizeof tss_segment_16.prev_task_link))
3915 goto out;
3916 }
3917
3808 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3918 if (load_state_from_tss16(vcpu, &tss_segment_16))
3809 goto out; 3919 goto out;
3810 3920
@@ -3814,7 +3924,7 @@ out:
3814} 3924}
3815 3925
3816static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3926static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3817 u32 old_tss_base, 3927 u16 old_tss_sel, u32 old_tss_base,
3818 struct desc_struct *nseg_desc) 3928 struct desc_struct *nseg_desc)
3819{ 3929{
3820 struct tss_segment_32 tss_segment_32; 3930 struct tss_segment_32 tss_segment_32;
@@ -3834,6 +3944,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3834 &tss_segment_32, sizeof tss_segment_32)) 3944 &tss_segment_32, sizeof tss_segment_32))
3835 goto out; 3945 goto out;
3836 3946
3947 if (old_tss_sel != 0xffff) {
3948 tss_segment_32.prev_task_link = old_tss_sel;
3949
3950 if (kvm_write_guest(vcpu->kvm,
3951 get_tss_base_addr(vcpu, nseg_desc),
3952 &tss_segment_32.prev_task_link,
3953 sizeof tss_segment_32.prev_task_link))
3954 goto out;
3955 }
3956
3837 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3957 if (load_state_from_tss32(vcpu, &tss_segment_32))
3838 goto out; 3958 goto out;
3839 3959
@@ -3887,14 +4007,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3887 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 4007 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3888 } 4008 }
3889 4009
3890 kvm_x86_ops->skip_emulated_instruction(vcpu); 4010 /* set back link to prev task only if NT bit is set in eflags
4011 note that old_tss_sel is not used after this point */
4012 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4013 old_tss_sel = 0xffff;
4014
4015 /* set back link to prev task only if NT bit is set in eflags
4016 note that old_tss_sel is not used after this point */
4017 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4018 old_tss_sel = 0xffff;
3891 4019
3892 if (nseg_desc.type & 8) 4020 if (nseg_desc.type & 8)
3893 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 4021 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
3894 &nseg_desc); 4022 old_tss_base, &nseg_desc);
3895 else 4023 else
3896 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, 4024 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
3897 &nseg_desc); 4025 old_tss_base, &nseg_desc);
3898 4026
3899 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 4027 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3900 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 4028 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
@@ -3920,7 +4048,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3920 struct kvm_sregs *sregs) 4048 struct kvm_sregs *sregs)
3921{ 4049{
3922 int mmu_reset_needed = 0; 4050 int mmu_reset_needed = 0;
3923 int i, pending_vec, max_bits; 4051 int pending_vec, max_bits;
3924 struct descriptor_table dt; 4052 struct descriptor_table dt;
3925 4053
3926 vcpu_load(vcpu); 4054 vcpu_load(vcpu);
@@ -3934,7 +4062,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3934 4062
3935 vcpu->arch.cr2 = sregs->cr2; 4063 vcpu->arch.cr2 = sregs->cr2;
3936 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4064 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
3937 vcpu->arch.cr3 = sregs->cr3; 4065
4066 down_read(&vcpu->kvm->slots_lock);
4067 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4068 vcpu->arch.cr3 = sregs->cr3;
4069 else
4070 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4071 up_read(&vcpu->kvm->slots_lock);
3938 4072
3939 kvm_set_cr8(vcpu, sregs->cr8); 4073 kvm_set_cr8(vcpu, sregs->cr8);
3940 4074
@@ -3956,25 +4090,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3956 if (mmu_reset_needed) 4090 if (mmu_reset_needed)
3957 kvm_mmu_reset_context(vcpu); 4091 kvm_mmu_reset_context(vcpu);
3958 4092
3959 if (!irqchip_in_kernel(vcpu->kvm)) { 4093 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3960 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, 4094 pending_vec = find_first_bit(
3961 sizeof vcpu->arch.irq_pending); 4095 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
3962 vcpu->arch.irq_summary = 0; 4096 if (pending_vec < max_bits) {
3963 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) 4097 kvm_queue_interrupt(vcpu, pending_vec, false);
3964 if (vcpu->arch.irq_pending[i]) 4098 pr_debug("Set back pending irq %d\n", pending_vec);
3965 __set_bit(i, &vcpu->arch.irq_summary); 4099 if (irqchip_in_kernel(vcpu->kvm))
3966 } else { 4100 kvm_pic_clear_isr_ack(vcpu->kvm);
3967 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
3968 pending_vec = find_first_bit(
3969 (const unsigned long *)sregs->interrupt_bitmap,
3970 max_bits);
3971 /* Only pending external irq is handled here */
3972 if (pending_vec < max_bits) {
3973 kvm_x86_ops->set_irq(vcpu, pending_vec);
3974 pr_debug("Set back pending irq %d\n",
3975 pending_vec);
3976 }
3977 kvm_pic_clear_isr_ack(vcpu->kvm);
3978 } 4101 }
3979 4102
3980 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 4103 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -4308,7 +4431,6 @@ struct kvm *kvm_arch_create_vm(void)
4308 return ERR_PTR(-ENOMEM); 4431 return ERR_PTR(-ENOMEM);
4309 4432
4310 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4433 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4311 INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
4312 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 4434 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4313 4435
4314 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 4436 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4411,12 +4533,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4411 } 4533 }
4412 } 4534 }
4413 4535
4536 spin_lock(&kvm->mmu_lock);
4414 if (!kvm->arch.n_requested_mmu_pages) { 4537 if (!kvm->arch.n_requested_mmu_pages) {
4415 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 4538 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4416 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 4539 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4417 } 4540 }
4418 4541
4419 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4542 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4543 spin_unlock(&kvm->mmu_lock);
4420 kvm_flush_remote_tlbs(kvm); 4544 kvm_flush_remote_tlbs(kvm);
4421 4545
4422 return 0; 4546 return 0;
@@ -4425,6 +4549,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4425void kvm_arch_flush_shadow(struct kvm *kvm) 4549void kvm_arch_flush_shadow(struct kvm *kvm)
4426{ 4550{
4427 kvm_mmu_zap_all(kvm); 4551 kvm_mmu_zap_all(kvm);
4552 kvm_reload_remote_mmus(kvm);
4428} 4553}
4429 4554
4430int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4555int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -4434,28 +4559,24 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4434 || vcpu->arch.nmi_pending; 4559 || vcpu->arch.nmi_pending;
4435} 4560}
4436 4561
4437static void vcpu_kick_intr(void *info)
4438{
4439#ifdef DEBUG
4440 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
4441 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
4442#endif
4443}
4444
4445void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4562void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4446{ 4563{
4447 int ipi_pcpu = vcpu->cpu; 4564 int me;
4448 int cpu = get_cpu(); 4565 int cpu = vcpu->cpu;
4449 4566
4450 if (waitqueue_active(&vcpu->wq)) { 4567 if (waitqueue_active(&vcpu->wq)) {
4451 wake_up_interruptible(&vcpu->wq); 4568 wake_up_interruptible(&vcpu->wq);
4452 ++vcpu->stat.halt_wakeup; 4569 ++vcpu->stat.halt_wakeup;
4453 } 4570 }
4454 /* 4571
4455 * We may be called synchronously with irqs disabled in guest mode, 4572 me = get_cpu();
4456 * So need not to call smp_call_function_single() in that case. 4573 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4457 */ 4574 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4458 if (vcpu->guest_mode && vcpu->cpu != cpu) 4575 smp_send_reschedule(cpu);
4459 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
4460 put_cpu(); 4576 put_cpu();
4461} 4577}
4578
4579int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4580{
4581 return kvm_x86_ops->interrupt_allowed(vcpu);
4582}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a7384..4c8e10af78e8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
8 vcpu->arch.exception.pending = false; 8 vcpu->arch.exception.pending = false;
9} 9}
10 10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) 11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
12 bool soft)
12{ 13{
13 vcpu->arch.interrupt.pending = true; 14 vcpu->arch.interrupt.pending = true;
15 vcpu->arch.interrupt.soft = soft;
14 vcpu->arch.interrupt.nr = vector; 16 vcpu->arch.interrupt.nr = vector;
15} 17}
16 18
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
19 vcpu->arch.interrupt.pending = false; 21 vcpu->arch.interrupt.pending = false;
20} 22}
21 23
24static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
25{
26 return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
27 vcpu->arch.nmi_injected;
28}
29
30static inline bool kvm_exception_is_soft(unsigned int nr)
31{
32 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
33}
22#endif 34#endif
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d2083..c1b6c232e02b 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
59#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */ 61#define SrcOne (7<<4) /* Implied '1' */
62#define SrcMask (7<<4) 62#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
63#define SrcMask (0xf<<4)
63/* Generic ModRM decode. */ 64/* Generic ModRM decode. */
64#define ModRM (1<<7) 65#define ModRM (1<<8)
65/* Destination is only written; never read. */ 66/* Destination is only written; never read. */
66#define Mov (1<<8) 67#define Mov (1<<9)
67#define BitOp (1<<9) 68#define BitOp (1<<10)
68#define MemAbs (1<<10) /* Memory operand is absolute displacement */ 69#define MemAbs (1<<11) /* Memory operand is absolute displacement */
69#define String (1<<12) /* String instruction (rep capable) */ 70#define String (1<<12) /* String instruction (rep capable) */
70#define Stack (1<<13) /* Stack instruction (push/pop) */ 71#define Stack (1<<13) /* Stack instruction (push/pop) */
71#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 72#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
@@ -76,6 +77,7 @@
76#define Src2CL (1<<29) 77#define Src2CL (1<<29)
77#define Src2ImmByte (2<<29) 78#define Src2ImmByte (2<<29)
78#define Src2One (3<<29) 79#define Src2One (3<<29)
80#define Src2Imm16 (4<<29)
79#define Src2Mask (7<<29) 81#define Src2Mask (7<<29)
80 82
81enum { 83enum {
@@ -135,11 +137,11 @@ static u32 opcode_table[256] = {
135 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 137 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
136 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 138 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
137 /* 0x70 - 0x77 */ 139 /* 0x70 - 0x77 */
138 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 140 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
139 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 141 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
140 /* 0x78 - 0x7F */ 142 /* 0x78 - 0x7F */
141 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 143 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
142 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 144 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
143 /* 0x80 - 0x87 */ 145 /* 0x80 - 0x87 */
144 Group | Group1_80, Group | Group1_81, 146 Group | Group1_80, Group | Group1_81,
145 Group | Group1_82, Group | Group1_83, 147 Group | Group1_82, Group | Group1_83,
@@ -153,7 +155,8 @@ static u32 opcode_table[256] = {
153 /* 0x90 - 0x97 */ 155 /* 0x90 - 0x97 */
154 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 156 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
155 /* 0x98 - 0x9F */ 157 /* 0x98 - 0x9F */
156 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 158 0, 0, SrcImm | Src2Imm16, 0,
159 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
157 /* 0xA0 - 0xA7 */ 160 /* 0xA0 - 0xA7 */
158 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 161 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
159 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 162 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
@@ -178,7 +181,8 @@ static u32 opcode_table[256] = {
178 0, ImplicitOps | Stack, 0, 0, 181 0, ImplicitOps | Stack, 0, 0,
179 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 182 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
180 /* 0xC8 - 0xCF */ 183 /* 0xC8 - 0xCF */
181 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0, 184 0, 0, 0, ImplicitOps | Stack,
185 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
182 /* 0xD0 - 0xD7 */ 186 /* 0xD0 - 0xD7 */
183 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 187 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
184 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 188 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -187,11 +191,11 @@ static u32 opcode_table[256] = {
187 0, 0, 0, 0, 0, 0, 0, 0, 191 0, 0, 0, 0, 0, 0, 0, 0,
188 /* 0xE0 - 0xE7 */ 192 /* 0xE0 - 0xE7 */
189 0, 0, 0, 0, 193 0, 0, 0, 0,
190 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 194 ByteOp | SrcImmUByte, SrcImmUByte,
191 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 195 ByteOp | SrcImmUByte, SrcImmUByte,
192 /* 0xE8 - 0xEF */ 196 /* 0xE8 - 0xEF */
193 ImplicitOps | Stack, SrcImm | ImplicitOps, 197 SrcImm | Stack, SrcImm | ImplicitOps,
194 ImplicitOps, SrcImmByte | ImplicitOps, 198 SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
195 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 199 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
196 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 200 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
197 /* 0xF0 - 0xF7 */ 201 /* 0xF0 - 0xF7 */
@@ -230,10 +234,8 @@ static u32 twobyte_table[256] = {
230 /* 0x70 - 0x7F */ 234 /* 0x70 - 0x7F */
231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
232 /* 0x80 - 0x8F */ 236 /* 0x80 - 0x8F */
233 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 237 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
234 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 238 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
235 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
236 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
237 /* 0x90 - 0x9F */ 239 /* 0x90 - 0x9F */
238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 /* 0xA0 - 0xA7 */ 241 /* 0xA0 - 0xA7 */
@@ -1044,10 +1046,14 @@ done_prefixes:
1044 } 1046 }
1045 break; 1047 break;
1046 case SrcImmByte: 1048 case SrcImmByte:
1049 case SrcImmUByte:
1047 c->src.type = OP_IMM; 1050 c->src.type = OP_IMM;
1048 c->src.ptr = (unsigned long *)c->eip; 1051 c->src.ptr = (unsigned long *)c->eip;
1049 c->src.bytes = 1; 1052 c->src.bytes = 1;
1050 c->src.val = insn_fetch(s8, 1, c->eip); 1053 if ((c->d & SrcMask) == SrcImmByte)
1054 c->src.val = insn_fetch(s8, 1, c->eip);
1055 else
1056 c->src.val = insn_fetch(u8, 1, c->eip);
1051 break; 1057 break;
1052 case SrcOne: 1058 case SrcOne:
1053 c->src.bytes = 1; 1059 c->src.bytes = 1;
@@ -1072,6 +1078,12 @@ done_prefixes:
1072 c->src2.bytes = 1; 1078 c->src2.bytes = 1;
1073 c->src2.val = insn_fetch(u8, 1, c->eip); 1079 c->src2.val = insn_fetch(u8, 1, c->eip);
1074 break; 1080 break;
1081 case Src2Imm16:
1082 c->src2.type = OP_IMM;
1083 c->src2.ptr = (unsigned long *)c->eip;
1084 c->src2.bytes = 2;
1085 c->src2.val = insn_fetch(u16, 2, c->eip);
1086 break;
1075 case Src2One: 1087 case Src2One:
1076 c->src2.bytes = 1; 1088 c->src2.bytes = 1;
1077 c->src2.val = 1; 1089 c->src2.val = 1;
@@ -1349,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1349 return 0; 1361 return 0;
1350} 1362}
1351 1363
1364void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1365{
1366 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
1367 /*
1368 * an sti; sti; sequence only disable interrupts for the first
1369 * instruction. So, if the last instruction, be it emulated or
1370 * not, left the system with the INT_STI flag enabled, it
1371 * means that the last instruction is an sti. We should not
1372 * leave the flag on in this case. The same goes for mov ss
1373 */
1374 if (!(int_shadow & mask))
1375 ctxt->interruptibility = mask;
1376}
1377
1352int 1378int
1353x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1379x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1354{ 1380{
@@ -1360,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1360 int io_dir_in; 1386 int io_dir_in;
1361 int rc = 0; 1387 int rc = 0;
1362 1388
1389 ctxt->interruptibility = 0;
1390
1363 /* Shadow copy of register state. Committed on successful emulation. 1391 /* Shadow copy of register state. Committed on successful emulation.
1364 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't 1392 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1365 * modify them. 1393 * modify them.
@@ -1531,13 +1559,10 @@ special_insn:
1531 return -1; 1559 return -1;
1532 } 1560 }
1533 return 0; 1561 return 0;
1534 case 0x70 ... 0x7f: /* jcc (short) */ { 1562 case 0x70 ... 0x7f: /* jcc (short) */
1535 int rel = insn_fetch(s8, 1, c->eip);
1536
1537 if (test_cc(c->b, ctxt->eflags)) 1563 if (test_cc(c->b, ctxt->eflags))
1538 jmp_rel(c, rel); 1564 jmp_rel(c, c->src.val);
1539 break; 1565 break;
1540 }
1541 case 0x80 ... 0x83: /* Grp1 */ 1566 case 0x80 ... 0x83: /* Grp1 */
1542 switch (c->modrm_reg) { 1567 switch (c->modrm_reg) {
1543 case 0: 1568 case 0:
@@ -1609,6 +1634,9 @@ special_insn:
1609 int err; 1634 int err;
1610 1635
1611 sel = c->src.val; 1636 sel = c->src.val;
1637 if (c->modrm_reg == VCPU_SREG_SS)
1638 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1639
1612 if (c->modrm_reg <= 5) { 1640 if (c->modrm_reg <= 5) {
1613 type_bits = (c->modrm_reg == 1) ? 9 : 1; 1641 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1614 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 1642 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1769,59 +1797,32 @@ special_insn:
1769 break; 1797 break;
1770 case 0xe4: /* inb */ 1798 case 0xe4: /* inb */
1771 case 0xe5: /* in */ 1799 case 0xe5: /* in */
1772 port = insn_fetch(u8, 1, c->eip); 1800 port = c->src.val;
1773 io_dir_in = 1; 1801 io_dir_in = 1;
1774 goto do_io; 1802 goto do_io;
1775 case 0xe6: /* outb */ 1803 case 0xe6: /* outb */
1776 case 0xe7: /* out */ 1804 case 0xe7: /* out */
1777 port = insn_fetch(u8, 1, c->eip); 1805 port = c->src.val;
1778 io_dir_in = 0; 1806 io_dir_in = 0;
1779 goto do_io; 1807 goto do_io;
1780 case 0xe8: /* call (near) */ { 1808 case 0xe8: /* call (near) */ {
1781 long int rel; 1809 long int rel = c->src.val;
1782 switch (c->op_bytes) {
1783 case 2:
1784 rel = insn_fetch(s16, 2, c->eip);
1785 break;
1786 case 4:
1787 rel = insn_fetch(s32, 4, c->eip);
1788 break;
1789 default:
1790 DPRINTF("Call: Invalid op_bytes\n");
1791 goto cannot_emulate;
1792 }
1793 c->src.val = (unsigned long) c->eip; 1810 c->src.val = (unsigned long) c->eip;
1794 jmp_rel(c, rel); 1811 jmp_rel(c, rel);
1795 c->op_bytes = c->ad_bytes;
1796 emulate_push(ctxt); 1812 emulate_push(ctxt);
1797 break; 1813 break;
1798 } 1814 }
1799 case 0xe9: /* jmp rel */ 1815 case 0xe9: /* jmp rel */
1800 goto jmp; 1816 goto jmp;
1801 case 0xea: /* jmp far */ { 1817 case 0xea: /* jmp far */
1802 uint32_t eip; 1818 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
1803 uint16_t sel; 1819 VCPU_SREG_CS) < 0) {
1804
1805 switch (c->op_bytes) {
1806 case 2:
1807 eip = insn_fetch(u16, 2, c->eip);
1808 break;
1809 case 4:
1810 eip = insn_fetch(u32, 4, c->eip);
1811 break;
1812 default:
1813 DPRINTF("jmp far: Invalid op_bytes\n");
1814 goto cannot_emulate;
1815 }
1816 sel = insn_fetch(u16, 2, c->eip);
1817 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1818 DPRINTF("jmp far: Failed to load CS descriptor\n"); 1820 DPRINTF("jmp far: Failed to load CS descriptor\n");
1819 goto cannot_emulate; 1821 goto cannot_emulate;
1820 } 1822 }
1821 1823
1822 c->eip = eip; 1824 c->eip = c->src.val;
1823 break; 1825 break;
1824 }
1825 case 0xeb: 1826 case 0xeb:
1826 jmp: /* jmp rel short */ 1827 jmp: /* jmp rel short */
1827 jmp_rel(c, c->src.val); 1828 jmp_rel(c, c->src.val);
@@ -1865,6 +1866,7 @@ special_insn:
1865 c->dst.type = OP_NONE; /* Disable writeback. */ 1866 c->dst.type = OP_NONE; /* Disable writeback. */
1866 break; 1867 break;
1867 case 0xfb: /* sti */ 1868 case 0xfb: /* sti */
1869 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
1868 ctxt->eflags |= X86_EFLAGS_IF; 1870 ctxt->eflags |= X86_EFLAGS_IF;
1869 c->dst.type = OP_NONE; /* Disable writeback. */ 1871 c->dst.type = OP_NONE; /* Disable writeback. */
1870 break; 1872 break;
@@ -2039,28 +2041,11 @@ twobyte_insn:
2039 if (!test_cc(c->b, ctxt->eflags)) 2041 if (!test_cc(c->b, ctxt->eflags))
2040 c->dst.type = OP_NONE; /* no writeback */ 2042 c->dst.type = OP_NONE; /* no writeback */
2041 break; 2043 break;
2042 case 0x80 ... 0x8f: /* jnz rel, etc*/ { 2044 case 0x80 ... 0x8f: /* jnz rel, etc*/
2043 long int rel;
2044
2045 switch (c->op_bytes) {
2046 case 2:
2047 rel = insn_fetch(s16, 2, c->eip);
2048 break;
2049 case 4:
2050 rel = insn_fetch(s32, 4, c->eip);
2051 break;
2052 case 8:
2053 rel = insn_fetch(s64, 8, c->eip);
2054 break;
2055 default:
2056 DPRINTF("jnz: Invalid op_bytes\n");
2057 goto cannot_emulate;
2058 }
2059 if (test_cc(c->b, ctxt->eflags)) 2045 if (test_cc(c->b, ctxt->eflags))
2060 jmp_rel(c, rel); 2046 jmp_rel(c, c->src.val);
2061 c->dst.type = OP_NONE; 2047 c->dst.type = OP_NONE;
2062 break; 2048 break;
2063 }
2064 case 0xa3: 2049 case 0xa3:
2065 bt: /* bt */ 2050 bt: /* bt */
2066 c->dst.type = OP_NONE; 2051 c->dst.type = OP_NONE;
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 27f0c9ed7f60..94e0e54056a9 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1 +1,2 @@
1obj-y := i386_head.o boot.o 1obj-y := i386_head.o boot.o
2CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 45acbcf25683..4e0c26559395 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
67#include <asm/mce.h> 67#include <asm/mce.h>
68#include <asm/io.h> 68#include <asm/io.h>
69#include <asm/i387.h> 69#include <asm/i387.h>
70#include <asm/stackprotector.h>
70#include <asm/reboot.h> /* for struct machine_ops */ 71#include <asm/reboot.h> /* for struct machine_ops */
71 72
72/*G:010 Welcome to the Guest! 73/*G:010 Welcome to the Guest!
@@ -166,10 +167,16 @@ static void lazy_hcall3(unsigned long call,
166 167
167/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
168 * issue the do-nothing hypercall to flush any stored calls. */ 169 * issue the do-nothing hypercall to flush any stored calls. */
169static void lguest_leave_lazy_mode(void) 170static void lguest_leave_lazy_mmu_mode(void)
170{ 171{
171 paravirt_leave_lazy(paravirt_get_lazy_mode());
172 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 172 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
173 paravirt_leave_lazy_mmu();
174}
175
176static void lguest_end_context_switch(struct task_struct *next)
177{
178 kvm_hypercall0(LHCALL_FLUSH_ASYNC);
179 paravirt_end_context_switch(next);
173} 180}
174 181
175/*G:033 182/*G:033
@@ -1053,8 +1060,8 @@ __init void lguest_init(void)
1053 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; 1060 pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
1054 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; 1061 pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
1055 pv_cpu_ops.wbinvd = lguest_wbinvd; 1062 pv_cpu_ops.wbinvd = lguest_wbinvd;
1056 pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; 1063 pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
1057 pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1064 pv_cpu_ops.end_context_switch = lguest_end_context_switch;
1058 1065
1059 /* pagetable management */ 1066 /* pagetable management */
1060 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1067 pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1067,7 +1074,7 @@ __init void lguest_init(void)
1067 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1074 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1068 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1075 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1069 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1076 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
1070 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; 1077 pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
1071 pv_mmu_ops.pte_update = lguest_pte_update; 1078 pv_mmu_ops.pte_update = lguest_pte_update;
1072 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1079 pv_mmu_ops.pte_update_defer = lguest_pte_update;
1073 1080
@@ -1088,13 +1095,21 @@ __init void lguest_init(void)
1088 * lguest_init() where the rest of the fairly chaotic boot setup 1095 * lguest_init() where the rest of the fairly chaotic boot setup
1089 * occurs. */ 1096 * occurs. */
1090 1097
1098 /* The stack protector is a weird thing where gcc places a canary
1099 * value on the stack and then checks it on return. This file is
1100 * compiled with -fno-stack-protector it, so we got this far without
1101 * problems. The value of the canary is kept at offset 20 from the
1102 * %gs register, so we need to set that up before calling C functions
1103 * in other files. */
1104 setup_stack_canary_segment(0);
1105 /* We could just call load_stack_canary_segment(), but we might as
1106 * call switch_to_new_gdt() which loads the whole table and sets up
1107 * the per-cpu segment descriptor register %fs as well. */
1108 switch_to_new_gdt(0);
1109
1091 /* As described in head_32.S, we map the first 128M of memory. */ 1110 /* As described in head_32.S, we map the first 128M of memory. */
1092 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1111 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1093 1112
1094 /* Load the %fs segment register (the per-cpu segment register) with
1095 * the normal data segment to get through booting. */
1096 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1097
1098 /* The Host<->Guest Switcher lives at the top of our address space, and 1113 /* The Host<->Guest Switcher lives at the top of our address space, and
1099 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1114 * the Host told us how big it is when we made LGUEST_INIT hypercall:
1100 * it put the answer in lguest_data.reserve_mem */ 1115 * it put the answer in lguest_data.reserve_mem */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66c..f9d35632666b 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr-on-cpu.o 5obj-$(CONFIG_SMP) := msr.o
6 6
7lib-y := delay.o 7lib-y := delay.o
8lib-y += thunk_$(BITS).o 8lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
deleted file mode 100644
index 321cf720dbb6..000000000000
--- a/arch/x86/lib/msr-on-cpu.c
+++ /dev/null
@@ -1,97 +0,0 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 u32 l, h;
9 int err;
10};
11
12static void __rdmsr_on_cpu(void *info)
13{
14 struct msr_info *rv = info;
15
16 rdmsr(rv->msr_no, rv->l, rv->h);
17}
18
19static void __wrmsr_on_cpu(void *info)
20{
21 struct msr_info *rv = info;
22
23 wrmsr(rv->msr_no, rv->l, rv->h);
24}
25
26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{
28 int err;
29 struct msr_info rv;
30
31 rv.msr_no = msr_no;
32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 *l = rv.l;
34 *h = rv.h;
35
36 return err;
37}
38
39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
55{
56 struct msr_info *rv = info;
57
58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
59}
60
61static void __wrmsr_safe_on_cpu(void *info)
62{
63 struct msr_info *rv = info;
64
65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
66}
67
68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
69{
70 int err;
71 struct msr_info rv;
72
73 rv.msr_no = msr_no;
74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
75 *l = rv.l;
76 *h = rv.h;
77
78 return err ? err : rv.err;
79}
80
81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
82{
83 int err;
84 struct msr_info rv;
85
86 rv.msr_no = msr_no;
87 rv.l = l;
88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
92}
93
94EXPORT_SYMBOL(rdmsr_on_cpu);
95EXPORT_SYMBOL(wrmsr_on_cpu);
96EXPORT_SYMBOL(rdmsr_safe_on_cpu);
97EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 000000000000..1440b9c0547e
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,183 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6struct msr_info {
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 preempt_disable();
93 /*
94 * FIXME: handle the CPU we're executing on separately for now until
95 * smp_call_function_many has been fixed to not skip it.
96 */
97 this_cpu = raw_smp_processor_id();
98 smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
99
100 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
101 preempt_enable();
102}
103EXPORT_SYMBOL(rdmsr_on_cpus);
104
105/*
106 * wrmsr on a bunch of CPUs
107 *
108 * @mask: which CPUs
109 * @msr_no: which MSR
110 * @msrs: array of MSR values
111 *
112 */
113void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
114{
115 struct msr_info rv;
116 int this_cpu;
117
118 memset(&rv, 0, sizeof(rv));
119
120 rv.off = cpumask_first(mask);
121 rv.msrs = msrs;
122 rv.msr_no = msr_no;
123
124 preempt_disable();
125 /*
126 * FIXME: handle the CPU we're executing on separately for now until
127 * smp_call_function_many has been fixed to not skip it.
128 */
129 this_cpu = raw_smp_processor_id();
130 smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
131
132 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
133 preempt_enable();
134}
135EXPORT_SYMBOL(wrmsr_on_cpus);
136
137/* These "safe" variants are slower and should be used when the target MSR
138 may not actually exist. */
139static void __rdmsr_safe_on_cpu(void *info)
140{
141 struct msr_info *rv = info;
142
143 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
144}
145
146static void __wrmsr_safe_on_cpu(void *info)
147{
148 struct msr_info *rv = info;
149
150 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
151}
152
153int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
154{
155 int err;
156 struct msr_info rv;
157
158 memset(&rv, 0, sizeof(rv));
159
160 rv.msr_no = msr_no;
161 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
162 *l = rv.reg.l;
163 *h = rv.reg.h;
164
165 return err ? err : rv.err;
166}
167EXPORT_SYMBOL(rdmsr_safe_on_cpu);
168
169int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
170{
171 int err;
172 struct msr_info rv;
173
174 memset(&rv, 0, sizeof(rv));
175
176 rv.msr_no = msr_no;
177 rv.reg.l = l;
178 rv.reg.h = h;
179 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
180
181 return err ? err : rv.err;
182}
183EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb40..a725b7f760ae 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
161 st->current_address >= st->marker[1].start_address) { 161 st->current_address >= st->marker[1].start_address) {
162 const char *unit = units; 162 const char *unit = units;
163 unsigned long delta; 163 unsigned long delta;
164 int width = sizeof(unsigned long) * 2;
164 165
165 /* 166 /*
166 * Now print the actual finished series 167 * Now print the actual finished series
167 */ 168 */
168 seq_printf(m, "0x%p-0x%p ", 169 seq_printf(m, "0x%0*lx-0x%0*lx ",
169 (void *)st->start_address, 170 width, st->start_address,
170 (void *)st->current_address); 171 width, st->current_address);
171 172
172 delta = (st->current_address - st->start_address) >> 10; 173 delta = (st->current_address - st->start_address) >> 10;
173 while (!(delta & 1023) && unit[1]) { 174 while (!(delta & 1023) && unit[1]) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..c6acc6326374 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,17 @@
3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5 */ 5 */
6#include <linux/interrupt.h> 6#include <linux/magic.h> /* STACK_END_MAGIC */
7#include <linux/mmiotrace.h> 7#include <linux/sched.h> /* test_thread_flag(), ... */
8#include <linux/bootmem.h> 8#include <linux/kdebug.h> /* oops_begin/end, ... */
9#include <linux/compiler.h> 9#include <linux/module.h> /* search_exception_table */
10#include <linux/highmem.h> 10#include <linux/bootmem.h> /* max_low_pfn */
11#include <linux/kprobes.h> 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/uaccess.h> 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/vmalloc.h> 13#include <linux/perf_counter.h> /* perf_swcounter_event */
14#include <linux/vt_kern.h> 14
15#include <linux/signal.h> 15#include <asm/traps.h> /* dotraplinkage, ... */
16#include <linux/kernel.h> 16#include <asm/pgalloc.h> /* pgd_*(), ... */
17#include <linux/ptrace.h>
18#include <linux/string.h>
19#include <linux/module.h>
20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
32
33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
37#include <asm/proto.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40 17
41/* 18/*
42 * Page fault error code bits: 19 * Page fault error code bits:
@@ -225,12 +202,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
225 if (!pmd_present(*pmd_k)) 202 if (!pmd_present(*pmd_k))
226 return NULL; 203 return NULL;
227 204
228 if (!pmd_present(*pmd)) { 205 if (!pmd_present(*pmd))
229 set_pmd(pmd, *pmd_k); 206 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode(); 207 else
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 208 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234 209
235 return pmd_k; 210 return pmd_k;
236} 211}
@@ -538,8 +513,6 @@ bad:
538static int is_errata93(struct pt_regs *regs, unsigned long address) 513static int is_errata93(struct pt_regs *regs, unsigned long address)
539{ 514{
540#ifdef CONFIG_X86_64 515#ifdef CONFIG_X86_64
541 static int once;
542
543 if (address != regs->ip) 516 if (address != regs->ip)
544 return 0; 517 return 0;
545 518
@@ -549,10 +522,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
549 address |= 0xffffffffUL << 32; 522 address |= 0xffffffffUL << 32;
550 if ((address >= (u64)_stext && address <= (u64)_etext) || 523 if ((address >= (u64)_stext && address <= (u64)_etext) ||
551 (address >= MODULES_VADDR && address <= MODULES_END)) { 524 (address >= MODULES_VADDR && address <= MODULES_END)) {
552 if (!once) { 525 printk_once(errata93_warning);
553 printk(errata93_warning);
554 once = 1;
555 }
556 regs->ip = address; 526 regs->ip = address;
557 return 1; 527 return 1;
558 } 528 }
@@ -1044,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1014 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1015 pgtable_bad(regs, error_code, address);
1046 1016
1017 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1018
1047 /* 1019 /*
1048 * If we're in an interrupt, have no user context or are running 1020 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1021 * in an atomic region then we must not take the fault:
@@ -1137,10 +1109,15 @@ good_area:
1137 return; 1109 return;
1138 } 1110 }
1139 1111
1140 if (fault & VM_FAULT_MAJOR) 1112 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1113 tsk->maj_flt++;
1142 else 1114 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1115 regs, address);
1116 } else {
1143 tsk->min_flt++; 1117 tsk->min_flt++;
1118 perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1119 regs, address);
1120 }
1144 1121
1145 check_v8086_mode(regs, address, tsk); 1122 check_v8086_mode(regs, address, tsk);
1146 1123
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a4..58f621e81919 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 45 BUG_ON(!pte_none(*(kmap_pte-idx)));
46 set_pte(kmap_pte-idx, mk_pte(page, prot)); 46 set_pte(kmap_pte-idx, mk_pte(page, prot));
47 arch_flush_lazy_mmu_mode();
48 47
49 return (void *)vaddr; 48 return (void *)vaddr;
50} 49}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
74#endif 73#endif
75 } 74 }
76 75
77 arch_flush_lazy_mmu_mode();
78 pagefault_enable(); 76 pagefault_enable();
79} 77}
80 78
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ae4f7b5d7104..34c1bfb64f1c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
1#include <linux/initrd.h>
1#include <linux/ioport.h> 2#include <linux/ioport.h>
2#include <linux/swap.h> 3#include <linux/swap.h>
3 4
@@ -10,6 +11,9 @@
10#include <asm/setup.h> 11#include <asm/setup.h>
11#include <asm/system.h> 12#include <asm/system.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
14#include <asm/tlb.h>
15
16DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
13 17
14unsigned long __initdata e820_table_start; 18unsigned long __initdata e820_table_start;
15unsigned long __meminitdata e820_table_end; 19unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
23#endif 27#endif
24; 28;
25 29
30int nx_enabled;
31
32#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
33static int disable_nx __cpuinitdata;
34
35/*
36 * noexec = on|off
37 *
38 * Control non-executable mappings for processes.
39 *
40 * on Enable
41 * off Disable
42 */
43static int __init noexec_setup(char *str)
44{
45 if (!str)
46 return -EINVAL;
47 if (!strncmp(str, "on", 2)) {
48 __supported_pte_mask |= _PAGE_NX;
49 disable_nx = 0;
50 } else if (!strncmp(str, "off", 3)) {
51 disable_nx = 1;
52 __supported_pte_mask &= ~_PAGE_NX;
53 }
54 return 0;
55}
56early_param("noexec", noexec_setup);
57#endif
58
59#ifdef CONFIG_X86_PAE
60static void __init set_nx(void)
61{
62 unsigned int v[4], l, h;
63
64 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
65 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
66
67 if ((v[3] & (1 << 20)) && !disable_nx) {
68 rdmsr(MSR_EFER, l, h);
69 l |= EFER_NX;
70 wrmsr(MSR_EFER, l, h);
71 nx_enabled = 1;
72 __supported_pte_mask |= _PAGE_NX;
73 }
74 }
75}
76#else
77static inline void set_nx(void)
78{
79}
80#endif
81
82#ifdef CONFIG_X86_64
83void __cpuinit check_efer(void)
84{
85 unsigned long efer;
86
87 rdmsrl(MSR_EFER, efer);
88 if (!(efer & EFER_NX) || disable_nx)
89 __supported_pte_mask &= ~_PAGE_NX;
90}
91#endif
92
26static void __init find_early_table_space(unsigned long end, int use_pse, 93static void __init find_early_table_space(unsigned long end, int use_pse,
27 int use_gbpages) 94 int use_gbpages)
28{ 95{
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
66 */ 133 */
67#ifdef CONFIG_X86_32 134#ifdef CONFIG_X86_32
68 start = 0x7000; 135 start = 0x7000;
69 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 136#else
70 tables, PAGE_SIZE);
71#else /* CONFIG_X86_64 */
72 start = 0x8000; 137 start = 0x8000;
73 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
74#endif 138#endif
139 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
140 tables, PAGE_SIZE);
75 if (e820_table_start == -1UL) 141 if (e820_table_start == -1UL)
76 panic("Cannot find space for the kernel page tables"); 142 panic("Cannot find space for the kernel page tables");
77 143
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
159 use_gbpages = direct_gbpages; 225 use_gbpages = direct_gbpages;
160#endif 226#endif
161 227
162#ifdef CONFIG_X86_32
163#ifdef CONFIG_X86_PAE
164 set_nx(); 228 set_nx();
165 if (nx_enabled) 229 if (nx_enabled)
166 printk(KERN_INFO "NX (Execute Disable) protection: active\n"); 230 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
167#endif
168 231
169 /* Enable PSE if available */ 232 /* Enable PSE if available */
170 if (cpu_has_pse) 233 if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
175 set_in_cr4(X86_CR4_PGE); 238 set_in_cr4(X86_CR4_PGE);
176 __supported_pte_mask |= _PAGE_GLOBAL; 239 __supported_pte_mask |= _PAGE_GLOBAL;
177 } 240 }
178#endif
179 241
180 if (use_gbpages) 242 if (use_gbpages)
181 page_size_mask |= 1 << PG_LEVEL_1G; 243 page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f5..949708d7a481 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/page_types.h>
52#include <asm/init.h> 53#include <asm/init.h>
53 54
54unsigned long max_low_pfn_mapped;
55unsigned long max_pfn_mapped;
56
57DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
58unsigned long highstart_pfn, highend_pfn; 55unsigned long highstart_pfn, highend_pfn;
59 56
60static noinline int do_test_wp_bit(void); 57static noinline int do_test_wp_bit(void);
@@ -587,61 +584,9 @@ void zap_low_mappings(void)
587 flush_tlb_all(); 584 flush_tlb_all();
588} 585}
589 586
590int nx_enabled;
591
592pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 587pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
593EXPORT_SYMBOL_GPL(__supported_pte_mask); 588EXPORT_SYMBOL_GPL(__supported_pte_mask);
594 589
595#ifdef CONFIG_X86_PAE
596
597static int disable_nx __initdata;
598
599/*
600 * noexec = on|off
601 *
602 * Control non executable mappings.
603 *
604 * on Enable
605 * off Disable
606 */
607static int __init noexec_setup(char *str)
608{
609 if (!str || !strcmp(str, "on")) {
610 if (cpu_has_nx) {
611 __supported_pte_mask |= _PAGE_NX;
612 disable_nx = 0;
613 }
614 } else {
615 if (!strcmp(str, "off")) {
616 disable_nx = 1;
617 __supported_pte_mask &= ~_PAGE_NX;
618 } else {
619 return -EINVAL;
620 }
621 }
622
623 return 0;
624}
625early_param("noexec", noexec_setup);
626
627void __init set_nx(void)
628{
629 unsigned int v[4], l, h;
630
631 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
632 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
633
634 if ((v[3] & (1 << 20)) && !disable_nx) {
635 rdmsr(MSR_EFER, l, h);
636 l |= EFER_NX;
637 wrmsr(MSR_EFER, l, h);
638 nx_enabled = 1;
639 __supported_pte_mask |= _PAGE_NX;
640 }
641 }
642}
643#endif
644
645/* user-defined highmem size */ 590/* user-defined highmem size */
646static unsigned int highmem_pages = -1; 591static unsigned int highmem_pages = -1;
647 592
@@ -761,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
761 highstart_pfn = highend_pfn = max_pfn; 706 highstart_pfn = highend_pfn = max_pfn;
762 if (max_pfn > max_low_pfn) 707 if (max_pfn > max_low_pfn)
763 highstart_pfn = max_low_pfn; 708 highstart_pfn = max_low_pfn;
764 memory_present(0, 0, highend_pfn);
765 e820_register_active_regions(0, 0, highend_pfn); 709 e820_register_active_regions(0, 0, highend_pfn);
710 sparse_memory_present_with_active_regions(0);
766 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 711 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
767 pages_to_mb(highend_pfn - highstart_pfn)); 712 pages_to_mb(highend_pfn - highstart_pfn));
768 num_physpages = highend_pfn; 713 num_physpages = highend_pfn;
769 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 714 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
770#else 715#else
771 memory_present(0, 0, max_low_pfn);
772 e820_register_active_regions(0, 0, max_low_pfn); 716 e820_register_active_regions(0, 0, max_low_pfn);
717 sparse_memory_present_with_active_regions(0);
773 num_physpages = max_low_pfn; 718 num_physpages = max_low_pfn;
774 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 719 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
775#endif 720#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df6..52bb9519bb86 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h> 51#include <asm/init.h>
52 52
53/*
54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
55 * The direct mapping extends to max_pfn_mapped, so that we can directly access
56 * apertures, ACPI and other tables without having to play with fixmaps.
57 */
58unsigned long max_low_pfn_mapped;
59unsigned long max_pfn_mapped;
60
61static unsigned long dma_reserve __initdata; 53static unsigned long dma_reserve __initdata;
62 54
63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
64
65static int __init parse_direct_gbpages_off(char *arg) 55static int __init parse_direct_gbpages_off(char *arg)
66{ 56{
67 direct_gbpages = 0; 57 direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 75pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
86EXPORT_SYMBOL_GPL(__supported_pte_mask); 76EXPORT_SYMBOL_GPL(__supported_pte_mask);
87 77
88static int disable_nx __cpuinitdata;
89
90/*
91 * noexec=on|off
92 * Control non-executable mappings for 64-bit processes.
93 *
94 * on Enable (default)
95 * off Disable
96 */
97static int __init nonx_setup(char *str)
98{
99 if (!str)
100 return -EINVAL;
101 if (!strncmp(str, "on", 2)) {
102 __supported_pte_mask |= _PAGE_NX;
103 disable_nx = 0;
104 } else if (!strncmp(str, "off", 3)) {
105 disable_nx = 1;
106 __supported_pte_mask &= ~_PAGE_NX;
107 }
108 return 0;
109}
110early_param("noexec", nonx_setup);
111
112void __cpuinit check_efer(void)
113{
114 unsigned long efer;
115
116 rdmsrl(MSR_EFER, efer);
117 if (!(efer & EFER_NX) || disable_nx)
118 __supported_pte_mask &= ~_PAGE_NX;
119}
120
121int force_personality32; 78int force_personality32;
122 79
123/* 80/*
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
628 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 585 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
629 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 586 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
630} 587}
588#endif
631 589
632void __init paging_init(void) 590void __init paging_init(void)
633{ 591{
@@ -638,11 +596,10 @@ void __init paging_init(void)
638 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 596 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
639 max_zone_pfns[ZONE_NORMAL] = max_pfn; 597 max_zone_pfns[ZONE_NORMAL] = max_pfn;
640 598
641 memory_present(0, 0, max_pfn); 599 sparse_memory_present_with_active_regions(MAX_NUMNODES);
642 sparse_init(); 600 sparse_init();
643 free_area_init_nodes(max_zone_pfns); 601 free_area_init_nodes(max_zone_pfns);
644} 602}
645#endif
646 603
647/* 604/*
648 * Memory hotplug specific functions 605 * Memory hotplug specific functions
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d39..fe6f84ca121e 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 82 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
83 kpte_clear_flush(kmap_pte-idx, vaddr); 83 kpte_clear_flush(kmap_pte-idx, vaddr);
84 84
85 arch_flush_lazy_mmu_mode();
86 pagefault_enable(); 85 pagefault_enable();
87} 86}
88EXPORT_SYMBOL_GPL(iounmap_atomic); 87EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 50dc802a1c46..16ccbd77917f 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
32 struct list_head list; 32 struct list_head list;
33 struct kmmio_fault_page *release_next; 33 struct kmmio_fault_page *release_next;
34 unsigned long page; /* location of the fault page */ 34 unsigned long page; /* location of the fault page */
35 bool old_presence; /* page presence prior to arming */ 35 pteval_t old_presence; /* page presence prior to arming */
36 bool armed; 36 bool armed;
37 37
38 /* 38 /*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
97static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) 97static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
98{ 98{
99 struct list_head *head; 99 struct list_head *head;
100 struct kmmio_fault_page *p; 100 struct kmmio_fault_page *f;
101 101
102 page &= PAGE_MASK; 102 page &= PAGE_MASK;
103 head = kmmio_page_list(page); 103 head = kmmio_page_list(page);
104 list_for_each_entry_rcu(p, head, list) { 104 list_for_each_entry_rcu(f, head, list) {
105 if (p->page == page) 105 if (f->page == page)
106 return p; 106 return f;
107 } 107 }
108 return NULL; 108 return NULL;
109} 109}
110 110
111static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) 111static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
112{ 112{
113 pmdval_t v = pmd_val(*pmd); 113 pmdval_t v = pmd_val(*pmd);
114 *old = !!(v & _PAGE_PRESENT); 114 if (clear) {
115 v &= ~_PAGE_PRESENT; 115 *old = v & _PAGE_PRESENT;
116 if (present) 116 v &= ~_PAGE_PRESENT;
117 v |= _PAGE_PRESENT; 117 } else /* presume this has been called with clear==true previously */
118 v |= *old;
118 set_pmd(pmd, __pmd(v)); 119 set_pmd(pmd, __pmd(v));
119} 120}
120 121
121static void set_pte_presence(pte_t *pte, bool present, bool *old) 122static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
122{ 123{
123 pteval_t v = pte_val(*pte); 124 pteval_t v = pte_val(*pte);
124 *old = !!(v & _PAGE_PRESENT); 125 if (clear) {
125 v &= ~_PAGE_PRESENT; 126 *old = v & _PAGE_PRESENT;
126 if (present) 127 v &= ~_PAGE_PRESENT;
127 v |= _PAGE_PRESENT; 128 } else /* presume this has been called with clear==true previously */
129 v |= *old;
128 set_pte_atomic(pte, __pte(v)); 130 set_pte_atomic(pte, __pte(v));
129} 131}
130 132
131static int set_page_presence(unsigned long addr, bool present, bool *old) 133static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
132{ 134{
133 unsigned int level; 135 unsigned int level;
134 pte_t *pte = lookup_address(addr, &level); 136 pte_t *pte = lookup_address(f->page, &level);
135 137
136 if (!pte) { 138 if (!pte) {
137 pr_err("kmmio: no pte for page 0x%08lx\n", addr); 139 pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
138 return -1; 140 return -1;
139 } 141 }
140 142
141 switch (level) { 143 switch (level) {
142 case PG_LEVEL_2M: 144 case PG_LEVEL_2M:
143 set_pmd_presence((pmd_t *)pte, present, old); 145 clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
144 break; 146 break;
145 case PG_LEVEL_4K: 147 case PG_LEVEL_4K:
146 set_pte_presence(pte, present, old); 148 clear_pte_presence(pte, clear, &f->old_presence);
147 break; 149 break;
148 default: 150 default:
149 pr_err("kmmio: unexpected page level 0x%x.\n", level); 151 pr_err("kmmio: unexpected page level 0x%x.\n", level);
150 return -1; 152 return -1;
151 } 153 }
152 154
153 __flush_tlb_one(addr); 155 __flush_tlb_one(f->page);
154 return 0; 156 return 0;
155} 157}
156 158
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
171 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); 173 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
172 if (f->armed) { 174 if (f->armed) {
173 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", 175 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
174 f->page, f->count, f->old_presence); 176 f->page, f->count, !!f->old_presence);
175 } 177 }
176 ret = set_page_presence(f->page, false, &f->old_presence); 178 ret = clear_page_presence(f, true);
177 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); 179 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
178 f->armed = true; 180 f->armed = true;
179 return ret; 181 return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
182/** Restore the given page to saved presence state. */ 184/** Restore the given page to saved presence state. */
183static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) 185static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
184{ 186{
185 bool tmp; 187 int ret = clear_page_presence(f, false);
186 int ret = set_page_presence(f->page, f->old_presence, &tmp);
187 WARN_ONCE(ret < 0, 188 WARN_ONCE(ret < 0,
188 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); 189 KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
189 f->armed = false; 190 f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
310 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); 311 struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
311 312
312 if (!ctx->active) { 313 if (!ctx->active) {
313 pr_debug("kmmio: spurious debug trap on CPU %d.\n", 314 /*
315 * debug traps without an active context are due to either
316 * something external causing them (f.e. using a debugger while
317 * mmio tracing enabled), or erroneous behaviour
318 */
319 pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
314 smp_processor_id()); 320 smp_processor_id());
315 goto out; 321 goto out;
316 } 322 }
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
439 head, 445 head,
440 struct kmmio_delayed_release, 446 struct kmmio_delayed_release,
441 rcu); 447 rcu);
442 struct kmmio_fault_page *p = dr->release_list; 448 struct kmmio_fault_page *f = dr->release_list;
443 while (p) { 449 while (f) {
444 struct kmmio_fault_page *next = p->release_next; 450 struct kmmio_fault_page *next = f->release_next;
445 BUG_ON(p->count); 451 BUG_ON(f->count);
446 kfree(p); 452 kfree(f);
447 p = next; 453 f = next;
448 } 454 }
449 kfree(dr); 455 kfree(dr);
450} 456}
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
453{ 459{
454 struct kmmio_delayed_release *dr = 460 struct kmmio_delayed_release *dr =
455 container_of(head, struct kmmio_delayed_release, rcu); 461 container_of(head, struct kmmio_delayed_release, rcu);
456 struct kmmio_fault_page *p = dr->release_list; 462 struct kmmio_fault_page *f = dr->release_list;
457 struct kmmio_fault_page **prevp = &dr->release_list; 463 struct kmmio_fault_page **prevp = &dr->release_list;
458 unsigned long flags; 464 unsigned long flags;
459 465
460 spin_lock_irqsave(&kmmio_lock, flags); 466 spin_lock_irqsave(&kmmio_lock, flags);
461 while (p) { 467 while (f) {
462 if (!p->count) { 468 if (!f->count) {
463 list_del_rcu(&p->list); 469 list_del_rcu(&f->list);
464 prevp = &p->release_next; 470 prevp = &f->release_next;
465 } else { 471 } else {
466 *prevp = p->release_next; 472 *prevp = f->release_next;
467 } 473 }
468 p = p->release_next; 474 f = f->release_next;
469 } 475 }
470 spin_unlock_irqrestore(&kmmio_lock, flags); 476 spin_unlock_irqrestore(&kmmio_lock, flags);
471 477
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
528} 534}
529EXPORT_SYMBOL(unregister_kmmio_probe); 535EXPORT_SYMBOL(unregister_kmmio_probe);
530 536
531static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, 537static int
532 void *args) 538kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
533{ 539{
534 struct die_args *arg = args; 540 struct die_args *arg = args;
535 541
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
544 .notifier_call = kmmio_die_notifier 550 .notifier_call = kmmio_die_notifier
545}; 551};
546 552
547static int __init init_kmmio(void) 553int kmmio_init(void)
548{ 554{
549 int i; 555 int i;
556
550 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) 557 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
551 INIT_LIST_HEAD(&kmmio_page_table[i]); 558 INIT_LIST_HEAD(&kmmio_page_table[i]);
559
552 return register_die_notifier(&nb_die); 560 return register_die_notifier(&nb_die);
553} 561}
554fs_initcall(init_kmmio); /* should be before device_initcall() */ 562
563void kmmio_cleanup(void)
564{
565 int i;
566
567 unregister_die_notifier(&nb_die);
568 for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
569 WARN_ONCE(!list_empty(&kmmio_page_table[i]),
570 KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
571 }
572}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be06217..18d244f70205 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,22 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
40 40
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{ 42{
43 u64 i, count; 43 u64 *p, *start, *end;
44 u64 *start;
45 u64 start_bad, last_bad; 44 u64 start_bad, last_bad;
46 u64 start_phys_aligned; 45 u64 start_phys_aligned;
47 size_t incr; 46 const size_t incr = sizeof(pattern);
48 47
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 48 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
53 start_bad = 0; 51 start_bad = 0;
54 last_bad = 0; 52 last_bad = 0;
55 53
56 for (i = 0; i < count; i++) 54 for (p = start; p < end; p++)
57 start[i] = pattern; 55 *p = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 56
59 if (*start == pattern) 57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
60 continue; 59 continue;
61 if (start_phys_aligned == last_bad + incr) { 60 if (start_phys_aligned == last_bad + incr) {
62 last_bad += incr; 61 last_bad += incr;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c9342ed8b402..132772a8ec57 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
451 451
452 if (nommiotrace) 452 if (nommiotrace)
453 pr_info(NAME "MMIO tracing disabled.\n"); 453 pr_info(NAME "MMIO tracing disabled.\n");
454 kmmio_init();
454 enter_uniprocessor(); 455 enter_uniprocessor();
455 spin_lock_irq(&trace_lock); 456 spin_lock_irq(&trace_lock);
456 atomic_inc(&mmiotrace_enabled); 457 atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
473 474
474 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 475 clear_trace_list(); /* guarantees: no more kmmio callbacks */
475 leave_uniprocessor(); 476 leave_uniprocessor();
477 kmmio_cleanup();
476 pr_info(NAME "disabled.\n"); 478 pr_info(NAME "disabled.\n");
477out: 479out:
478 mutex_unlock(&mmiotrace_mutex); 480 mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029dc..459913beac71 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
179} 179}
180 180
181/* Initialize bootmem allocator for a node */ 181/* Initialize bootmem allocator for a node */
182void __init setup_node_bootmem(int nodeid, unsigned long start, 182void __init
183 unsigned long end) 183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 184{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
186 unsigned long bootmap_start, nodedata_phys; 187 unsigned long bootmap_start, nodedata_phys;
187 void *bootmap; 188 void *bootmap;
188 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
189 int nid; 189 int nid;
190 190
191 if (!end) 191 if (!end)
192 return; 192 return;
193 193
194 /*
195 * Don't confuse VM with a node that doesn't have the
196 * minimum amount of memory:
197 */
198 if (end && (end - start) < NODE_MIN_SIZE)
199 return;
200
194 start = roundup(start, ZONE_ALIGN); 201 start = roundup(start, ZONE_ALIGN);
195 202
196 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
272 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 279 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
273 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 280 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
274 281
275#ifdef CONFIG_ACPI_NUMA
276 srat_reserve_add_area(nodeid);
277#endif
278 node_set_online(nodeid); 282 node_set_online(nodeid);
279} 283}
280 284
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
578 return pages; 582 return pages;
579} 583}
580 584
581void __init paging_init(void)
582{
583 unsigned long max_zone_pfns[MAX_NR_ZONES];
584
585 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
586 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
587 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
588 max_zone_pfns[ZONE_NORMAL] = max_pfn;
589
590 sparse_memory_present_with_active_regions(MAX_NUMNODES);
591 sparse_init();
592
593 free_area_init_nodes(max_zone_pfns);
594}
595
596static __init int numa_setup(char *opt) 585static __init int numa_setup(char *opt)
597{ 586{
598 if (!opt) 587 if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
606#ifdef CONFIG_ACPI_NUMA 595#ifdef CONFIG_ACPI_NUMA
607 if (!strncmp(opt, "noacpi", 6)) 596 if (!strncmp(opt, "noacpi", 6))
608 acpi_numa = -1; 597 acpi_numa = -1;
609 if (!strncmp(opt, "hotadd=", 7))
610 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
611#endif 598#endif
612 return 0; 599 return 0;
613} 600}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed088c5..6ce9518fe2ac 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
839 839
840 vm_unmap_aliases(); 840 vm_unmap_aliases();
841 841
842 /*
843 * If we're called with lazy mmu updates enabled, the
844 * in-memory pte state may be stale. Flush pending updates to
845 * bring them up to date.
846 */
847 arch_flush_lazy_mmu_mode();
848
849 cpa.vaddr = addr; 842 cpa.vaddr = addr;
850 cpa.pages = pages; 843 cpa.pages = pages;
851 cpa.numpages = numpages; 844 cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
890 } else 883 } else
891 cpa_flush_all(cache); 884 cpa_flush_all(cache);
892 885
893 /*
894 * If we've been called with lazy mmu updates enabled, then
895 * make sure that everything gets flushed out before we
896 * return.
897 */
898 arch_flush_lazy_mmu_mode();
899
900out: 886out:
901 return ret; 887 return ret;
902} 888}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baaf..2dfcbf9df2ae 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata; 31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata; 32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES]; 33static struct bootnode nodes_add[MAX_NUMNODES];
34static int found_add_area __initdata;
35int hotadd_percent __initdata = 0;
36 34
37static int num_node_memblks __initdata; 35static int num_node_memblks __initdata;
38static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; 36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
39static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; 37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
40 38
41/* Too small nodes confuse the VM badly. Usually they result
42 from BIOS bugs. */
43#define NODE_MIN_SIZE (4*1024*1024)
44
45static __init int setup_node(int pxm) 39static __init int setup_node(int pxm)
46{ 40{
47 return acpi_map_pxm_to_node(pxm); 41 return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
66{ 60{
67 struct bootnode *nd = &nodes[i]; 61 struct bootnode *nd = &nodes[i];
68 62
69 if (found_add_area)
70 return;
71
72 if (nd->start < start) { 63 if (nd->start < start) {
73 nd->start = start; 64 nd->start = start;
74 if (nd->end < nd->start) 65 if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
86 int i; 77 int i;
87 printk(KERN_ERR "SRAT: SRAT not used.\n"); 78 printk(KERN_ERR "SRAT: SRAT not used.\n");
88 acpi_numa = -1; 79 acpi_numa = -1;
89 found_add_area = 0;
90 for (i = 0; i < MAX_LOCAL_APIC; i++) 80 for (i = 0; i < MAX_LOCAL_APIC; i++)
91 apicid_to_node[i] = NUMA_NO_NODE; 81 apicid_to_node[i] = NUMA_NO_NODE;
92 for (i = 0; i < MAX_NUMNODES; i++) 82 for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
182 pxm, apic_id, node); 172 pxm, apic_id, node);
183} 173}
184 174
185static int update_end_of_memory(unsigned long end) {return -1;}
186static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
187#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 175#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
188static inline int save_add_info(void) {return 1;} 176static inline int save_add_info(void) {return 1;}
189#else 177#else
190static inline int save_add_info(void) {return 0;} 178static inline int save_add_info(void) {return 0;}
191#endif 179#endif
192/* 180/*
193 * Update nodes_add and decide if to include add are in the zone. 181 * Update nodes_add[]
194 * Both SPARSE and RESERVE need nodes_add information. 182 * This code supports one contiguous hot add area per node
195 * This code supports one contiguous hot add area per node.
196 */ 183 */
197static int __init 184static void __init
198reserve_hotadd(int node, unsigned long start, unsigned long end) 185update_nodes_add(int node, unsigned long start, unsigned long end)
199{ 186{
200 unsigned long s_pfn = start >> PAGE_SHIFT; 187 unsigned long s_pfn = start >> PAGE_SHIFT;
201 unsigned long e_pfn = end >> PAGE_SHIFT; 188 unsigned long e_pfn = end >> PAGE_SHIFT;
202 int ret = 0, changed = 0; 189 int changed = 0;
203 struct bootnode *nd = &nodes_add[node]; 190 struct bootnode *nd = &nodes_add[node];
204 191
205 /* I had some trouble with strange memory hotadd regions breaking 192 /* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
210 mistakes */ 197 mistakes */
211 if ((signed long)(end - start) < NODE_MIN_SIZE) { 198 if ((signed long)(end - start) < NODE_MIN_SIZE) {
212 printk(KERN_ERR "SRAT: Hotplug area too small\n"); 199 printk(KERN_ERR "SRAT: Hotplug area too small\n");
213 return -1; 200 return;
214 } 201 }
215 202
216 /* This check might be a bit too strict, but I'm keeping it for now. */ 203 /* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
218 printk(KERN_ERR 205 printk(KERN_ERR
219 "SRAT: Hotplug area %lu -> %lu has existing memory\n", 206 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
220 s_pfn, e_pfn); 207 s_pfn, e_pfn);
221 return -1; 208 return;
222 }
223
224 if (!hotadd_enough_memory(&nodes_add[node])) {
225 printk(KERN_ERR "SRAT: Hotplug area too large\n");
226 return -1;
227 } 209 }
228 210
229 /* Looks good */ 211 /* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
245 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 227 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
246 } 228 }
247 229
248 ret = update_end_of_memory(nd->end);
249
250 if (changed) 230 if (changed)
251 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); 231 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
252 return ret; 232 nd->start, nd->end);
253} 233}
254 234
255/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 235/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
310 start, end); 290 start, end);
311 e820_register_active_regions(node, start >> PAGE_SHIFT, 291 e820_register_active_regions(node, start >> PAGE_SHIFT,
312 end >> PAGE_SHIFT); 292 end >> PAGE_SHIFT);
313 push_node_boundaries(node, nd->start >> PAGE_SHIFT,
314 nd->end >> PAGE_SHIFT);
315 293
316 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 294 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
317 (reserve_hotadd(node, start, end) < 0)) { 295 update_nodes_add(node, start, end);
318 /* Ignore hotadd region. Undo damage */ 296 /* restore nodes[node] */
319 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
320 *nd = oldnode; 297 *nd = oldnode;
321 if ((nd->start | nd->end) == 0) 298 if ((nd->start | nd->end) == 0)
322 node_clear(node, nodes_parsed); 299 node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
345 pxmram = 0; 322 pxmram = 0;
346 } 323 }
347 324
348 e820ram = max_pfn - absent_pages_in_range(0, max_pfn); 325 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
349 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 326 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
350 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 327 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
351 printk(KERN_ERR 328 printk(KERN_ERR
352 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", 329 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
353 (pxmram << PAGE_SHIFT) >> 20, 330 (pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
357 return 1; 334 return 1;
358} 335}
359 336
360static void __init unparse_node(int node)
361{
362 int i;
363 node_clear(node, nodes_parsed);
364 node_clear(node, cpu_nodes_parsed);
365 for (i = 0; i < MAX_LOCAL_APIC; i++) {
366 if (apicid_to_node[i] == node)
367 apicid_to_node[i] = NUMA_NO_NODE;
368 }
369}
370
371void __init acpi_numa_arch_fixup(void) {} 337void __init acpi_numa_arch_fixup(void) {}
372 338
373/* Use the information discovered above to actually set up the nodes. */ 339/* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
379 return -1; 345 return -1;
380 346
381 /* First clean up the node list */ 347 /* First clean up the node list */
382 for (i = 0; i < MAX_NUMNODES; i++) { 348 for (i = 0; i < MAX_NUMNODES; i++)
383 cutoff_node(i, start, end); 349 cutoff_node(i, start, end);
384 /*
385 * don't confuse VM with a node that doesn't have the
386 * minimum memory.
387 */
388 if (nodes[i].end &&
389 (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
390 unparse_node(i);
391 node_set_offline(i);
392 }
393 }
394 350
395 if (!nodes_cover_memory(nodes)) { 351 if (!nodes_cover_memory(nodes)) {
396 bad_srat(); 352 bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
423 379
424 if (node == NUMA_NO_NODE) 380 if (node == NUMA_NO_NODE)
425 continue; 381 continue;
426 if (!node_isset(node, node_possible_map)) 382 if (!node_online(node))
427 numa_clear_node(i); 383 numa_clear_node(i);
428 } 384 }
429 numa_init_array(); 385 numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
510} 466}
511#endif /* CONFIG_NUMA_EMU */ 467#endif /* CONFIG_NUMA_EMU */
512 468
513void __init srat_reserve_add_area(int nodeid)
514{
515 if (found_add_area && nodes_add[nodeid].end) {
516 u64 total_mb;
517
518 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
519 "for node %d at %Lx-%Lx\n",
520 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
521 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
522 >> PAGE_SHIFT;
523 total_mb *= sizeof(struct page);
524 total_mb >>= 20;
525 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
526 "pre-allocated memory.\n", (unsigned long long)total_mb);
527 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
528 nodes_add[nodeid].end - nodes_add[nodeid].start,
529 BOOTMEM_DEFAULT);
530 }
531}
532
533int __node_distance(int a, int b) 469int __node_distance(int a, int b)
534{ 470{
535 int index; 471 int index;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..b07dd8d0b321 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
@@ -356,14 +357,11 @@ static void exit_sysfs(void)
356#define exit_sysfs() do { } while (0) 357#define exit_sysfs() do { } while (0)
357#endif /* CONFIG_PM */ 358#endif /* CONFIG_PM */
358 359
359static int p4force;
360module_param(p4force, int, 0);
361
362static int __init p4_init(char **cpu_type) 360static int __init p4_init(char **cpu_type)
363{ 361{
364 __u8 cpu_model = boot_cpu_data.x86_model; 362 __u8 cpu_model = boot_cpu_data.x86_model;
365 363
366 if (!p4force && (cpu_model > 6 || cpu_model == 5)) 364 if (cpu_model > 6 || cpu_model == 5)
367 return 0; 365 return 0;
368 366
369#ifndef CONFIG_SMP 367#ifndef CONFIG_SMP
@@ -389,10 +387,25 @@ static int __init p4_init(char **cpu_type)
389 return 0; 387 return 0;
390} 388}
391 389
390static int force_arch_perfmon;
391static int force_cpu_type(const char *str, struct kernel_param *kp)
392{
393 if (!strcmp(str, "archperfmon")) {
394 force_arch_perfmon = 1;
395 printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
396 }
397
398 return 0;
399}
400module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
401
392static int __init ppro_init(char **cpu_type) 402static int __init ppro_init(char **cpu_type)
393{ 403{
394 __u8 cpu_model = boot_cpu_data.x86_model; 404 __u8 cpu_model = boot_cpu_data.x86_model;
395 405
406 if (force_arch_perfmon && cpu_has_arch_perfmon)
407 return 0;
408
396 switch (cpu_model) { 409 switch (cpu_model) {
397 case 0 ... 2: 410 case 0 ... 2:
398 *cpu_type = "i386/ppro"; 411 *cpu_type = "i386/ppro";
@@ -414,6 +427,13 @@ static int __init ppro_init(char **cpu_type)
414 case 15: case 23: 427 case 15: case 23:
415 *cpu_type = "i386/core_2"; 428 *cpu_type = "i386/core_2";
416 break; 429 break;
430 case 26:
431 arch_perfmon_setup_counters();
432 *cpu_type = "i386/core_i7";
433 break;
434 case 28:
435 *cpu_type = "i386/atom";
436 break;
417 default: 437 default:
418 /* Unknown */ 438 /* Unknown */
419 return 0; 439 return 0;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 5fa10bb9604f..8766b0e216c5 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -375,7 +375,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
375 if (!fixmem32) 375 if (!fixmem32)
376 return AE_OK; 376 return AE_OK;
377 if ((mcfg_res->start >= fixmem32->address) && 377 if ((mcfg_res->start >= fixmem32->address) &&
378 (mcfg_res->end <= (fixmem32->address + 378 (mcfg_res->end < (fixmem32->address +
379 fixmem32->address_length))) { 379 fixmem32->address_length))) {
380 mcfg_res->flags = 1; 380 mcfg_res->flags = 1;
381 return AE_CTRL_TERMINATE; 381 return AE_CTRL_TERMINATE;
@@ -392,7 +392,7 @@ static acpi_status __init check_mcfg_resource(struct acpi_resource *res,
392 return AE_OK; 392 return AE_OK;
393 393
394 if ((mcfg_res->start >= address.minimum) && 394 if ((mcfg_res->start >= address.minimum) &&
395 (mcfg_res->end <= (address.minimum + address.address_length))) { 395 (mcfg_res->end < (address.minimum + address.address_length))) {
396 mcfg_res->flags = 1; 396 mcfg_res->flags = 1;
397 return AE_CTRL_TERMINATE; 397 return AE_CTRL_TERMINATE;
398 } 398 }
@@ -418,7 +418,7 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
418 struct resource mcfg_res; 418 struct resource mcfg_res;
419 419
420 mcfg_res.start = start; 420 mcfg_res.start = start;
421 mcfg_res.end = end; 421 mcfg_res.end = end - 1;
422 mcfg_res.flags = 0; 422 mcfg_res.flags = 0;
423 423
424 acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL); 424 acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL);
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab56..58bc00f68b12 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
338 } 338 }
339 } 339 }
340 340
341 current->mm->context.vdso = (void *)addr;
342
341 if (compat_uses_vma || !compat) { 343 if (compat_uses_vma || !compat) {
342 /* 344 /*
343 * MAYWRITE to allow gdb to COW and set breakpoints 345 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
358 goto up_fail; 360 goto up_fail;
359 } 361 }
360 362
361 current->mm->context.vdso = (void *)addr;
362 current_thread_info()->sysenter_return = 363 current_thread_info()->sysenter_return =
363 VDSO32_SYMBOL(addr, SYSENTER_RETURN); 364 VDSO32_SYMBOL(addr, SYSENTER_RETURN);
364 365
365 up_fail: 366 up_fail:
367 if (ret)
368 current->mm->context.vdso = NULL;
369
366 up_write(&mm->mmap_sem); 370 up_write(&mm->mmap_sem);
367 371
368 return ret; 372 return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098b..21e1aeb9f3ea 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/random.h> 10#include <linux/random.h>
11#include <linux/elf.h>
11#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
12#include <asm/vgtod.h> 13#include <asm/vgtod.h>
13#include <asm/proto.h> 14#include <asm/proto.h>
@@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
115 goto up_fail; 116 goto up_fail;
116 } 117 }
117 118
119 current->mm->context.vdso = (void *)addr;
120
118 ret = install_special_mapping(mm, addr, vdso_size, 121 ret = install_special_mapping(mm, addr, vdso_size,
119 VM_READ|VM_EXEC| 122 VM_READ|VM_EXEC|
120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 123 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
121 VM_ALWAYSDUMP, 124 VM_ALWAYSDUMP,
122 vdso_pages); 125 vdso_pages);
123 if (ret) 126 if (ret) {
127 current->mm->context.vdso = NULL;
124 goto up_fail; 128 goto up_fail;
129 }
125 130
126 current->mm->context.vdso = (void *)addr;
127up_fail: 131up_fail:
128 up_write(&mm->mmap_sem); 132 up_write(&mm->mmap_sem);
129 return ret; 133 return ret;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee80..0a1700a2be9c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/start_kernel.h> 21#include <linux/start_kernel.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/kprobes.h>
23#include <linux/bootmem.h> 24#include <linux/bootmem.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mm.h> 26#include <linux/mm.h>
@@ -44,6 +45,7 @@
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/proto.h> 46#include <asm/proto.h>
46#include <asm/msr-index.h> 47#include <asm/msr-index.h>
48#include <asm/traps.h>
47#include <asm/setup.h> 49#include <asm/setup.h>
48#include <asm/desc.h> 50#include <asm/desc.h>
49#include <asm/pgtable.h> 51#include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
240 return HYPERVISOR_get_debugreg(reg); 242 return HYPERVISOR_get_debugreg(reg);
241} 243}
242 244
243void xen_leave_lazy(void) 245static void xen_end_context_switch(struct task_struct *next)
244{ 246{
245 paravirt_leave_lazy(paravirt_get_lazy_mode());
246 xen_mc_flush(); 247 xen_mc_flush();
248 paravirt_end_context_switch(next);
247} 249}
248 250
249static unsigned long xen_store_tr(void) 251static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
428static int cvt_gate_to_trap(int vector, const gate_desc *val, 430static int cvt_gate_to_trap(int vector, const gate_desc *val,
429 struct trap_info *info) 431 struct trap_info *info)
430{ 432{
433 unsigned long addr;
434
431 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) 435 if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
432 return 0; 436 return 0;
433 437
434 info->vector = vector; 438 info->vector = vector;
435 info->address = gate_offset(*val); 439
440 addr = gate_offset(*val);
441#ifdef CONFIG_X86_64
442 /*
443 * Look for known traps using IST, and substitute them
444 * appropriately. The debugger ones are the only ones we care
445 * about. Xen will handle faults like double_fault and
446 * machine_check, so we should never see them. Warn if
447 * there's an unexpected IST-using fault handler.
448 */
449 if (addr == (unsigned long)debug)
450 addr = (unsigned long)xen_debug;
451 else if (addr == (unsigned long)int3)
452 addr = (unsigned long)xen_int3;
453 else if (addr == (unsigned long)stack_segment)
454 addr = (unsigned long)xen_stack_segment;
455 else if (addr == (unsigned long)double_fault ||
456 addr == (unsigned long)nmi) {
457 /* Don't need to handle these */
458 return 0;
459#ifdef CONFIG_X86_MCE
460 } else if (addr == (unsigned long)machine_check) {
461 return 0;
462#endif
463 } else {
464 /* Some other trap using IST? */
465 if (WARN_ON(val->ist != 0))
466 return 0;
467 }
468#endif /* CONFIG_X86_64 */
469 info->address = addr;
470
436 info->cs = gate_segment(*val); 471 info->cs = gate_segment(*val);
437 info->flags = val->dpl; 472 info->flags = val->dpl;
438 /* interrupt gates clear IF */ 473 /* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
623 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU);
624} 659}
625 660
661static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
662
663static unsigned long xen_read_cr0(void)
664{
665 unsigned long cr0 = percpu_read(xen_cr0_value);
666
667 if (unlikely(cr0 == 0)) {
668 cr0 = native_read_cr0();
669 percpu_write(xen_cr0_value, cr0);
670 }
671
672 return cr0;
673}
674
626static void xen_write_cr0(unsigned long cr0) 675static void xen_write_cr0(unsigned long cr0)
627{ 676{
628 struct multicall_space mcs; 677 struct multicall_space mcs;
629 678
679 percpu_write(xen_cr0_value, cr0);
680
630 /* Only pay attention to cr0.TS; everything else is 681 /* Only pay attention to cr0.TS; everything else is
631 ignored. */ 682 ignored. */
632 mcs = xen_mc_entry(0); 683 mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
812 863
813 .clts = xen_clts, 864 .clts = xen_clts,
814 865
815 .read_cr0 = native_read_cr0, 866 .read_cr0 = xen_read_cr0,
816 .write_cr0 = xen_write_cr0, 867 .write_cr0 = xen_write_cr0,
817 868
818 .read_cr4 = native_read_cr4, 869 .read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
860 /* Xen takes care of %gs when switching to usermode for us */ 911 /* Xen takes care of %gs when switching to usermode for us */
861 .swapgs = paravirt_nop, 912 .swapgs = paravirt_nop,
862 913
863 .lazy_mode = { 914 .start_context_switch = paravirt_start_context_switch,
864 .enter = paravirt_enter_lazy_cpu, 915 .end_context_switch = xen_end_context_switch,
865 .leave = xen_leave_lazy,
866 },
867}; 916};
868 917
869static const struct pv_apic_ops xen_apic_ops __initdata = { 918static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fba55b1a4021..4ceb28581652 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 452void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
453 pte_t *ptep, pte_t pteval) 453 pte_t *ptep, pte_t pteval)
454{ 454{
455 /* updates to init_mm may be done without lock */
456 if (mm == &init_mm)
457 preempt_disable();
458
459 ADD_STATS(set_pte_at, 1); 455 ADD_STATS(set_pte_at, 1);
460// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 456// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
461 ADD_STATS(set_pte_at_current, mm == current->mm); 457 ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
476 } 472 }
477 xen_set_pte(ptep, pteval); 473 xen_set_pte(ptep, pteval);
478 474
479out: 475out: return;
480 if (mm == &init_mm)
481 preempt_enable();
482} 476}
483 477
484pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 478pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
1152 1146
1153 /* If this cpu still has a stale cr3 reference, then make sure 1147 /* If this cpu still has a stale cr3 reference, then make sure
1154 it has been flushed. */ 1148 it has been flushed. */
1155 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { 1149 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1156 load_cr3(swapper_pg_dir); 1150 load_cr3(swapper_pg_dir);
1157 arch_flush_lazy_cpu_mode();
1158 }
1159} 1151}
1160 1152
1161static void xen_drop_mm_ref(struct mm_struct *mm) 1153static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1168 load_cr3(swapper_pg_dir); 1160 load_cr3(swapper_pg_dir);
1169 else 1161 else
1170 leave_mm(smp_processor_id()); 1162 leave_mm(smp_processor_id());
1171 arch_flush_lazy_cpu_mode();
1172 } 1163 }
1173 1164
1174 /* Get the "official" set of cpus referring to our pagetable. */ 1165 /* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
1876 xen_mark_init_mm_pinned(); 1867 xen_mark_init_mm_pinned();
1877} 1868}
1878 1869
1870static void xen_leave_lazy_mmu(void)
1871{
1872 preempt_disable();
1873 xen_mc_flush();
1874 paravirt_leave_lazy_mmu();
1875 preempt_enable();
1876}
1877
1879const struct pv_mmu_ops xen_mmu_ops __initdata = { 1878const struct pv_mmu_ops xen_mmu_ops __initdata = {
1880 .pagetable_setup_start = xen_pagetable_setup_start, 1879 .pagetable_setup_start = xen_pagetable_setup_start,
1881 .pagetable_setup_done = xen_pagetable_setup_done, 1880 .pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
1949 1948
1950 .lazy_mode = { 1949 .lazy_mode = {
1951 .enter = paravirt_enter_lazy_mmu, 1950 .enter = paravirt_enter_lazy_mmu,
1952 .leave = xen_leave_lazy, 1951 .leave = xen_leave_lazy_mmu,
1953 }, 1952 },
1954 1953
1955 .set_fixmap = xen_set_fixmap, 1954 .set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a2..ad0047f47cd4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
61 * - xen_start_info 61 * - xen_start_info
62 * See comment above "struct start_info" in <xen/interface/xen.h> 62 * See comment above "struct start_info" in <xen/interface/xen.h>
63 */ 63 */
64 e820_add_region(__pa(xen_start_info->mfn_list), 64 reserve_early(__pa(xen_start_info->mfn_list),
65 xen_start_info->pt_base - xen_start_info->mfn_list, 65 __pa(xen_start_info->pt_base),
66 E820_RESERVED); 66 "XEN START INFO");
67 67
68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 68 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
69 69
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ca6596b05d53..22494fd4c9b5 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 30void xen_ident_map_ISA(void);
31void xen_reserve_top(void); 31void xen_reserve_top(void);
32 32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void); 33void xen_post_allocator_init(void);
35 34
36char * __init xen_memory_setup(void); 35char * __init xen_memory_setup(void);